| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119 |
- import asyncio
- from pathlib import Path
- from markdown import markdown
- from weasyprint import HTML
class AsyncMdToPdf:
    """Convert Markdown files into styled A4 PDF documents from async code.

    Markdown is rendered to HTML with the ``fenced_code``, ``tables`` and
    ``toc`` extensions, wrapped in a fixed HTML/CSS template, and written to
    PDF with WeasyPrint.  All blocking work (file read, Markdown rendering,
    HTML parsing, PDF layout) runs in a worker thread so the event loop stays
    responsive.

    Output files are named ``<markdown stem>.pdf`` inside ``output_dir``; two
    inputs that share a stem therefore write to the same PDF path.
    """

    # ``str.format`` template: literal CSS braces are doubled and
    # ``{html_body}`` is the only placeholder.
    _html_template = """
    <!DOCTYPE html>
    <html lang="zh-CN">
    <head>
    <meta charset="utf-8">
    <style>
    /* ====== 页面设置 ====== */
    @page {{
    size: A4;
    margin: 2.5cm 2.2cm;
    }}
    /* ====== 全局字体 ====== */
    body {{
    font-family: "Arial", "Helvetica", sans-serif;
    font-size: 14px;
    line-height: 1.6;
    color: #222;
    }}
    /* ====== 标题样式 ====== */
    h1 {{ font-size: 26px; border-bottom: 2px solid #333; padding-bottom: 6px; margin-top:0; margin-bottom:12px; }}
    h2 {{ font-size: 20px; margin-top:20px; margin-bottom:6px; }}
    h3 {{ font-size: 16px; margin-top:16px; margin-bottom:4px; }}
    /* ====== 段落与列表 ====== */
    p {{ margin:8px 0; }}
    ul, ol {{ margin-left:20px; margin-bottom:8px; }}
    li {{ margin:4px 0; }}
    /* ====== 代码块 ====== */
    pre {{
    background:#f6f8fa;
    padding:12px;
    border-radius:6px;
    font-size:13px;
    overflow-x:auto;
    page-break-inside: avoid;
    }}
    code {{ font-family: Consolas, "Courier New", monospace; }}
    /* ====== 表格 ====== */
    table {{
    width:100%;
    border-collapse:collapse;
    margin:0;
    page-break-inside:auto;
    }}
    th, td {{ border:1px solid #ccc; padding:8px 10px; text-align:left; }}
    th {{ background:#f0f0f0; }}
    /* ====== 图片 ====== */
    img {{ max-width:100%; display:block; margin:6px auto; page-break-inside:avoid; }}
    /* ====== section wrapper ====== */
    .section {{ padding-top:12px; padding-bottom:12px; page-break-inside:avoid; }}
    </style>
    </head>
    <body>
    <div class="content">
    {html_body}
    </div>
    </body>
    </html>
    """

    def __init__(self, output_dir: str = "./tmp_file/pdf_output"):
        """Create the converter and make sure the output directory exists.

        Args:
            output_dir: Directory that receives the generated PDFs.  Missing
                parent directories are created.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def _render_pdf(self, source: Path, target: Path) -> None:
        """Synchronously render *source* (Markdown) into the PDF *target*.

        Intended to run in a worker thread: every step here is blocking.
        """
        md_text = source.read_text(encoding="utf-8")
        html_body = markdown(md_text, extensions=["fenced_code", "tables", "toc"])
        html = self._html_template.format(html_body=html_body)
        # Resolve relative image/link URLs against the Markdown file's own
        # directory rather than the process working directory.
        HTML(string=html, base_url=str(source.parent)).write_pdf(str(target))

    async def convert_md_to_pdf(self, md_path: str) -> str:
        """Convert a single Markdown file to PDF.

        Args:
            md_path: Path of the Markdown file to convert.

        Returns:
            The path of the generated PDF, as a string.

        Raises:
            FileNotFoundError: If ``md_path`` does not exist.
        """
        source = Path(md_path)
        if not source.exists():
            raise FileNotFoundError(f"Markdown 文件不存在: {source}")
        target = self.output_dir / f"{source.stem}.pdf"
        # Reading, Markdown rendering and PDF layout are all blocking, so the
        # whole pipeline is handed to a worker thread in one go.
        await asyncio.to_thread(self._render_pdf, source, target)
        return str(target)

    async def convert_multiple(self, md_paths: list[str], concurrency: int = 3) -> list[str]:
        """Convert several Markdown files with bounded concurrency.

        Args:
            md_paths: Markdown file paths to convert.
            concurrency: Maximum number of conversions in flight at once;
                must be at least 1.

        Returns:
            The generated PDF paths, in the same order as ``md_paths``.

        Raises:
            ValueError: If ``concurrency`` is smaller than 1.
            FileNotFoundError: If one of the inputs does not exist.
        """
        if concurrency < 1:
            # Semaphore(0) would make every task wait forever.
            raise ValueError(f"concurrency must be >= 1, got {concurrency}")
        limiter = asyncio.Semaphore(concurrency)

        async def convert_limited(path: str) -> str:
            async with limiter:
                return await self.convert_md_to_pdf(path)

        return await asyncio.gather(*(convert_limited(p) for p in md_paths))
- # ====== 使用示例 ======
async def main():
    """Run the sample conversion and print the paths of the generated PDFs."""
    sample_inputs = [
        "/work/code/ceshi/deepseek_temporary/tmp_file/paddleocr_parsed/2025版《见证取样送检指南》/2025版《见证取样送检指南》.md",
        # More Markdown file paths can be appended here.
    ]
    pdf_converter = AsyncMdToPdf(output_dir="./tmp_file/pdf_output")
    generated = await pdf_converter.convert_multiple(sample_inputs)
    print("生成 PDF 完成:", generated)
# Script entry point: run the sample conversion only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    asyncio.run(main())
|