| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199 |
- # import os
- # import subprocess
- # from pathlib import Path
- # # ==============================
- # # LibreOffice 文件 → PDF 转换器
- # # 基于 unoconv + LibreOffice
- # # ==============================
- # # 支持的文件格式(可根据需要扩展)
- # SUPPORTED_EXTENSIONS = {
- # ".doc", ".docx", ".odt", # 文本文件
- # ".xls", ".xlsx", ".ods", # 表格文件
- # ".ppt", ".pptx", ".odp" # 幻灯片文件
- # }
- # def is_supported_file(file_path: str) -> bool:
- # """判断文件是否为支持的 Office 格式"""
- # ext = Path(file_path).suffix.lower()
- # return ext in SUPPORTED_EXTENSIONS
- # def convert_to_pdf(input_path: str, output_dir: str = None) -> bool:
- # """
- # 使用 unoconv 调用 LibreOffice 将 Office 文件转换为 PDF。
-
- # Args:
- # input_path (str): 输入文件路径
- # output_dir (str): 输出目录(默认与输入文件同级)
- # Returns:
- # bool: 转换是否成功
- # """
- # input_path = os.path.abspath(input_path)
- # if not os.path.exists(input_path):
- # print(f"文件不存在: {input_path}")
- # return False
- # if not is_supported_file(input_path):
- # print(f"不支持的文件格式: {input_path}")
- # return False
- # if output_dir is None:
- # output_dir = os.path.dirname(input_path)
- # os.makedirs(output_dir, exist_ok=True)
- # # 构造输出路径
- # output_pdf = os.path.join(
- # output_dir, Path(input_path).stem + ".pdf"
- # )
- # try:
- # # 执行 unoconv 转换命令
- # subprocess.run(
- # ["unoconv", "-f", "pdf", "-o", output_pdf, input_path],
- # check=True,
- # stdout=subprocess.PIPE,
- # stderr=subprocess.PIPE
- # )
- # print(f"转换成功: {output_pdf}")
- # return True
- # except FileNotFoundError:
- # print("未检测到 unoconv,请先执行安装:sudo apt install -y unoconv libreoffice-headless")
- # return False
- # except subprocess.CalledProcessError as e:
- # print(f"转换失败: {input_path}")
- # print("错误信息:", e.stderr.decode(errors="ignore"))
- # return False
- # def batch_convert_directory(input_dir: str, output_dir: str = None):
- # """
- # 批量转换目录中的 Office 文件为 PDF。
- # """
- # input_dir = os.path.abspath(input_dir)
- # if not os.path.isdir(input_dir):
- # print(f"输入路径不是目录: {input_dir}")
- # return
- # print(f"开始扫描目录: {input_dir}")
- # for root, _, files in os.walk(input_dir):
- # for f in files:
- # file_path = os.path.join(root, f)
- # if is_supported_file(file_path):
- # rel_dir = os.path.relpath(root, input_dir)
- # target_dir = os.path.join(output_dir or input_dir, rel_dir)
- # convert_to_pdf(file_path, target_dir)
- # else:
- # print(f"跳过不支持的文件: {f}")
- # if __name__ == "__main__":
- # # 示例:转换单个文件
- # # convert_to_pdf("/work/docs/test.docx", "/work/output")
- # # 示例:批量转换整个目录
- # batch_convert_directory("/work/docs", "/work/output")
- import os
- import subprocess
class MyError(Exception):
    """Error raised by ``convert_to_pdf`` when the LibreOffice conversion command fails."""
# ------------------------------------------
# File extensions accepted for conversion to PDF
# (lower-case, including the leading dot).
# ------------------------------------------
SUPPORTED_FORMATS = set(
    (".doc", ".docx", ".odt", ".rtf", ".txt")  # text documents
    + (".xls", ".xlsx", ".ods")  # spreadsheets
    + (".ppt", ".pptx", ".odp")  # presentations
    + (".jpg", ".jpeg", ".png", ".gif")  # images
)
def convert_to_pdf(input_path, output_dir=None):
    """Convert a document, spreadsheet, presentation or image to PDF.

    The conversion is done by the ``libreoffice`` command-line tool in
    headless mode. Before converting, the file is opened once with
    ``--cat`` as a probe so that files LibreOffice cannot load (for example
    encrypted or damaged ones) are rejected with a clear error.

    Args:
        input_path (str): Path of the file to convert. Its extension must be
            listed in ``SUPPORTED_FORMATS`` (case-insensitive).
        output_dir (str, optional): Directory the PDF is written to; created
            if missing. Defaults to the directory containing ``input_path``.

    Returns:
        str: Path of the generated PDF, i.e. ``output_dir`` joined with the
        input file's base name and a ``.pdf`` extension.

    Raises:
        FileNotFoundError: ``input_path`` is not an existing file. The same
            exception type propagates from ``subprocess`` when the
            ``libreoffice`` executable cannot be found.
        ValueError: The extension is not supported, or the probe reports
            that the file is encrypted or could not be loaded.
        MyError: The conversion command exited with a non-zero status.
        RuntimeError: The command succeeded but the expected PDF is missing.
    """
    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"文件不存在: {input_path}")

    file_ext = os.path.splitext(input_path)[1].lower()
    if file_ext not in SUPPORTED_FORMATS:
        raise ValueError(f"不支持的文件格式: {file_ext},仅支持 {SUPPORTED_FORMATS}")

    if output_dir is None:
        # dirname() is "" for a bare file name such as "report.docx", and
        # os.makedirs("") raises, so fall back to the current directory.
        output_dir = os.path.dirname(input_path) or os.curdir
    os.makedirs(output_dir, exist_ok=True)

    # Probe run. ``--cat`` dumps the document's text to stdout, so stdout is
    # discarded: scanning it would reject any readable document that merely
    # contains the word "password". Only the diagnostics on stderr are used.
    probe = subprocess.run(
        ["libreoffice", "--headless", "--cat", input_path],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        text=True,
        errors="replace",  # never fail on undecodable diagnostic bytes
    )
    diagnostics = probe.stderr.lower()
    if "password" in diagnostics:
        raise ValueError(f"文件 {input_path} 为加密文件,无法转换")
    if "could not be loaded" in diagnostics:
        raise ValueError(f"文件 {input_path} 为加密文件或可能已损坏,无法转换")

    # LibreOffice names the output after the input's base name.
    base_name = os.path.splitext(os.path.basename(input_path))[0]
    output_pdf = os.path.join(output_dir, base_name + ".pdf")

    cmd = [
        "libreoffice",
        "--headless",
        "--convert-to", "pdf",
        "--outdir", output_dir,
        input_path,
    ]
    try:
        print(f"执行命令: {' '.join(cmd)}")
        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # The exit status alone is not trusted: confirm the PDF exists.
        # NOTE(review): a PDF left over from an earlier run also satisfies
        # this check - confirm whether stale outputs need to be detected.
        if not os.path.exists(output_pdf):
            raise RuntimeError("转换失败:未生成 PDF 文件。")
        print(f"转换成功: {output_pdf}")
        return output_pdf
    except subprocess.CalledProcessError as e:
        # Decode leniently so a non-UTF-8 tool message cannot raise a
        # UnicodeDecodeError here and hide the real failure.
        stdout_text = e.stdout.decode(errors="replace")
        stderr_text = e.stderr.decode(errors="replace")
        print(f"LibreOffice 转换失败:\nSTDOUT: {stdout_text}\nSTDERR: {stderr_text} 详细:{e}")
        raise MyError("文件转换失败") from e
    except Exception as e:
        print(f"转换过程中发生错误: {e}")
        raise
# -------------------------------
# Example usage: convert one sample file and report the outcome.
# -------------------------------
if __name__ == "__main__":
    sample_document = "./work_ceshi/ceshi.docx"
    try:
        print(f"转换完成 → {convert_to_pdf(sample_document)}")
    except Exception as err:
        # Top-level boundary of the demo: report any failure instead of a traceback.
        print(f"转换失败: {err}")
|