# md_to_html_to_pdf.py
import asyncio
from pathlib import Path
from typing import Optional

from markdown import markdown
from weasyprint import HTML
  5. class AsyncMdToPdf:
  6. def __init__(self, output_dir: str = "./tmp_file/pdf_output"):
  7. self.output_dir = Path(output_dir)
  8. self.output_dir.mkdir(parents=True, exist_ok=True)
  9. self._html_template = """
  10. <!DOCTYPE html>
  11. <html lang="zh-CN">
  12. <head>
  13. <meta charset="utf-8">
  14. <style>
  15. /* ====== 页面设置 ====== */
  16. @page {{
  17. size: A4;
  18. margin: 2.5cm 2.2cm;
  19. }}
  20. /* ====== 全局字体 ====== */
  21. body {{
  22. font-family: "Arial", "Helvetica", sans-serif;
  23. font-size: 14px;
  24. line-height: 1.6;
  25. color: #222;
  26. }}
  27. /* ====== 标题样式 ====== */
  28. h1 {{ font-size: 26px; border-bottom: 2px solid #333; padding-bottom: 6px; margin-top:0; margin-bottom:12px; }}
  29. h2 {{ font-size: 20px; margin-top:20px; margin-bottom:6px; }}
  30. h3 {{ font-size: 16px; margin-top:16px; margin-bottom:4px; }}
  31. /* ====== 段落与列表 ====== */
  32. p {{ margin:8px 0; }}
  33. ul, ol {{ margin-left:20px; margin-bottom:8px; }}
  34. li {{ margin:4px 0; }}
  35. /* ====== 代码块 ====== */
  36. pre {{
  37. background:#f6f8fa;
  38. padding:12px;
  39. border-radius:6px;
  40. font-size:13px;
  41. overflow-x:auto;
  42. page-break-inside: avoid;
  43. }}
  44. code {{ font-family: Consolas, "Courier New", monospace; }}
  45. /* ====== 表格 ====== */
  46. table {{
  47. width:100%;
  48. border-collapse:collapse;
  49. margin:0;
  50. page-break-inside:auto;
  51. }}
  52. th, td {{ border:1px solid #ccc; padding:8px 10px; text-align:left; }}
  53. th {{ background:#f0f0f0; }}
  54. /* ====== 图片 ====== */
  55. img {{ max-width:100%; display:block; margin:6px auto; page-break-inside:avoid; }}
  56. /* ====== section wrapper ====== */
  57. .section {{ padding-top:12px; padding-bottom:12px; page-break-inside:avoid; }}
  58. </style>
  59. </head>
  60. <body>
  61. <div class="content">
  62. {html_body}
  63. </div>
  64. </body>
  65. </html>
  66. """
  67. async def convert_md_to_pdf(self, md_path: str) -> str:
  68. """异步将单个 Markdown 转 PDF"""
  69. md_path = Path(md_path)
  70. if not md_path.exists():
  71. raise FileNotFoundError(f"Markdown 文件不存在: {md_path}")
  72. pdf_path = self.output_dir / f"{md_path.stem}.pdf"
  73. # 读取 Markdown
  74. md_text = await asyncio.to_thread(md_path.read_text, encoding="utf-8")
  75. # Markdown -> HTML
  76. html_body = markdown(md_text, extensions=["fenced_code", "tables", "toc"])
  77. html = self._html_template.format(html_body=html_body)
  78. # 写 PDF(同步方法,用线程执行)
  79. await asyncio.to_thread(HTML(string=html, base_url=".").write_pdf, str(pdf_path))
  80. return str(pdf_path)
  81. async def convert_multiple(self, md_paths: list[str], concurrency: int = 3) -> list[str]:
  82. """异步并发转换多个 Markdown"""
  83. sem = asyncio.Semaphore(concurrency)
  84. async def sem_task(path: str):
  85. async with sem:
  86. return await self.convert_md_to_pdf(path)
  87. tasks = [sem_task(p) for p in md_paths]
  88. return await asyncio.gather(*tasks)
  89. # ====== 使用示例 ======
  90. async def main():
  91. converter = AsyncMdToPdf(output_dir="./tmp_file/pdf_output")
  92. md_files = [
  93. "/work/code/ceshi/deepseek_temporary/tmp_file/paddleocr_parsed/2025版《见证取样送检指南》/2025版《见证取样送检指南》.md",
  94. # 可以添加更多 Markdown 文件路径
  95. ]
  96. pdf_paths = await converter.convert_multiple(md_files)
  97. print("生成 PDF 完成:", pdf_paths)
  98. if __name__ == "__main__":
  99. asyncio.run(main())