document_format_conversion.py 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. # import os
  2. # import subprocess
  3. # from pathlib import Path
  4. # # ==============================
  5. # # LibreOffice 文件 → PDF 转换器
  6. # # 基于 unoconv + LibreOffice
  7. # # ==============================
  8. # # 支持的文件格式(可根据需要扩展)
  9. # SUPPORTED_EXTENSIONS = {
  10. # ".doc", ".docx", ".odt", # 文本文件
  11. # ".xls", ".xlsx", ".ods", # 表格文件
  12. # ".ppt", ".pptx", ".odp" # 幻灯片文件
  13. # }
  14. # def is_supported_file(file_path: str) -> bool:
  15. # """判断文件是否为支持的 Office 格式"""
  16. # ext = Path(file_path).suffix.lower()
  17. # return ext in SUPPORTED_EXTENSIONS
  18. # def convert_to_pdf(input_path: str, output_dir: str = None) -> bool:
  19. # """
  20. # 使用 unoconv 调用 LibreOffice 将 Office 文件转换为 PDF。
  21. # Args:
  22. # input_path (str): 输入文件路径
  23. # output_dir (str): 输出目录(默认与输入文件同级)
  24. # Returns:
  25. # bool: 转换是否成功
  26. # """
  27. # input_path = os.path.abspath(input_path)
  28. # if not os.path.exists(input_path):
  29. # print(f"❌ 文件不存在: {input_path}")
  30. # return False
  31. # if not is_supported_file(input_path):
  32. # print(f"⚠️ 不支持的文件格式: {input_path}")
  33. # return False
  34. # if output_dir is None:
  35. # output_dir = os.path.dirname(input_path)
  36. # os.makedirs(output_dir, exist_ok=True)
  37. # # 构造输出路径
  38. # output_pdf = os.path.join(
  39. # output_dir, Path(input_path).stem + ".pdf"
  40. # )
  41. # try:
  42. # # 执行 unoconv 转换命令
  43. # subprocess.run(
  44. # ["unoconv", "-f", "pdf", "-o", output_pdf, input_path],
  45. # check=True,
  46. # stdout=subprocess.PIPE,
  47. # stderr=subprocess.PIPE
  48. # )
  49. # print(f"✅ 转换成功: {output_pdf}")
  50. # return True
  51. # except FileNotFoundError:
  52. # print("❌ 未检测到 unoconv,请先执行安装:sudo apt install -y unoconv libreoffice-headless")
  53. # return False
  54. # except subprocess.CalledProcessError as e:
  55. # print(f"❌ 转换失败: {input_path}")
  56. # print("错误信息:", e.stderr.decode(errors="ignore"))
  57. # return False
  58. # def batch_convert_directory(input_dir: str, output_dir: str = None):
  59. # """
  60. # 批量转换目录中的 Office 文件为 PDF。
  61. # """
  62. # input_dir = os.path.abspath(input_dir)
  63. # if not os.path.isdir(input_dir):
  64. # print(f"❌ 输入路径不是目录: {input_dir}")
  65. # return
  66. # print(f"📂 开始扫描目录: {input_dir}")
  67. # for root, _, files in os.walk(input_dir):
  68. # for f in files:
  69. # file_path = os.path.join(root, f)
  70. # if is_supported_file(file_path):
  71. # rel_dir = os.path.relpath(root, input_dir)
  72. # target_dir = os.path.join(output_dir or input_dir, rel_dir)
  73. # convert_to_pdf(file_path, target_dir)
  74. # else:
  75. # print(f"⏩ 跳过不支持的文件: {f}")
  76. # if __name__ == "__main__":
  77. # # 示例:转换单个文件
  78. # # convert_to_pdf("/work/docs/test.docx", "/work/output")
  79. # # 示例:批量转换整个目录
  80. # batch_convert_directory("/work/docs", "/work/output")
  81. import os
  82. import subprocess
  83. # ==========================================
  84. # 支持转换为 PDF 的文件扩展名
  85. # ==========================================
  86. SUPPORTED_FORMATS = {
  87. # 文档
  88. '.doc', '.docx', '.odt', '.rtf', '.txt',
  89. # 表格
  90. '.xls', '.xlsx', '.ods',
  91. # 演示文稿
  92. '.ppt', '.pptx', '.odp',
  93. }
  94. def convert_to_pdf(input_path, output_dir=None):
  95. """
  96. 使用 LibreOffice 命令行将文档/表格/演示文稿转换为 PDF。
  97. Args:
  98. input_path (str): 输入文件路径
  99. output_dir (str, optional): 输出目录(默认同输入文件目录)
  100. Returns:
  101. str: 输出 PDF 文件路径
  102. """
  103. if not os.path.isfile(input_path):
  104. raise FileNotFoundError(f"文件不存在: {input_path}")
  105. file_ext = os.path.splitext(input_path)[1].lower()
  106. if file_ext not in SUPPORTED_FORMATS:
  107. raise ValueError(f"不支持的文件格式: {file_ext},仅支持 {SUPPORTED_FORMATS}")
  108. if output_dir is None:
  109. output_dir = os.path.dirname(input_path)
  110. os.makedirs(output_dir, exist_ok=True)
  111. # LibreOffice 输出 PDF 文件路径
  112. output_pdf = os.path.join(
  113. output_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf"
  114. )
  115. # 构造 LibreOffice 转换命令
  116. cmd = [
  117. "libreoffice",
  118. "--headless",
  119. "--convert-to", "pdf",
  120. "--outdir", output_dir,
  121. input_path
  122. ]
  123. try:
  124. print(f"执行命令: {' '.join(cmd)}")
  125. subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  126. if not os.path.exists(output_pdf):
  127. raise RuntimeError("转换失败:未生成 PDF 文件。")
  128. print(f"✅ 转换成功: {output_pdf}")
  129. return output_pdf
  130. except subprocess.CalledProcessError as e:
  131. print(f"❌ LibreOffice 转换失败:\nSTDOUT: {e.stdout.decode()}\nSTDERR: {e.stderr.decode()}")
  132. raise e
  133. except Exception as e:
  134. print(f"❌ 转换过程中发生错误: {e}")
  135. raise e
  136. # ===============================
  137. # 示例调用
  138. # ===============================
  139. if __name__ == "__main__":
  140. test_file = "./work_ceshi/ceshi.docx"
  141. try:
  142. pdf_path = convert_to_pdf(test_file)
  143. print(f"转换完成 → {pdf_path}")
  144. except Exception as e:
  145. print(f"转换失败: {e}")