# pdf_load.py
  1. # import fitz # PyMuPDF
  2. import os
  3. from PIL import Image
  4. import io
  5. # import pdfplumber
  6. from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
  7. from utils.upload_file_to_oss import UploadMinio
  8. from config import minio_config
  9. # import os
  10. # from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
  11. # from magic_pdf.data.dataset import PymuDocDataset
  12. # from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
  13. # from magic_pdf.config.enums import SupportedPdfParseMethod
  14. class PDFLoader():
  15. def __init__(self, file_json):
  16. pass
  17. # class PDFLoader(UnstructuredFileLoader):
  18. # def __init__(self, file_json):
  19. # self.base_path = "./tmp_file"
  20. # self.file_json = file_json
  21. # self.flag = self.file_json.get("flag") # 后续优化
  22. # self.file_path_process()
  23. # if self.flag == "update":
  24. # self.flag_image_info_dict = {}
  25. # if not self.output_pdf_path:
  26. # self.upload_minio = UploadMinio()
  27. # self.image_positions_dict = self.get_image_positions()
  28. # self.images_path_dict, self.flag_image_info_dict = self.save_images()
  29. # self.replace_images_with_text()
  30. # else:
  31. # self.upload_minio = UploadMinio()
  32. # self.image_positions_dict = self.get_image_positions()
  33. # self.images_path_dict, self.flag_image_info_dict = self.save_images()
  34. # self.replace_images_with_text()
  35. # def file_path_process(self):
  36. # self.knowledge_id = self.file_json.get("knowledge_id")
  37. # self.document_id = self.file_json.get("document_id")
  38. # know_path = self.base_path + f"/{self.knowledge_id}"
  39. # self.file_name = self.file_json.get("name")
  40. # self.output_pdf_name = "output_" + self.file_name
  41. # self.input_pdf_path = os.path.join(know_path, self.file_name)
  42. # self.output_pdf_path = os.path.join(know_path, self.output_pdf_name)
  43. # self.file_name_list = self.file_name.split(".")
  44. # self.image_dir = ".".join(self.file_name_list[:-1])
  45. # self.save_image_path = know_path + "/" + self.document_id
  46. # def get_image_positions(self):
  47. # images_dict = {}
  48. # with pdfplumber.open(self.input_pdf_path) as pdf:
  49. # page_num = 0
  50. # for page in pdf.pages:
  51. # images_dict[page_num] = {}
  52. # image_num = 0
  53. # img_list = {}
  54. # img_list[image_num] = {}
  55. # for image in page.images:
  56. # #print("Image position:", image)
  57. # img_list[image_num] = {"x0":image['x0'],"y0":image['y0']}
  58. # image_num += 1
  59. # img_list[image_num] = {}
  60. # images_dict[page_num]=img_list
  61. # page_num += 1
  62. # # print(f"images list info: {images_dict}")
  63. # return images_dict
  64. # def save_images(self):
  65. # # 创建图片保存目录
  66. # os.makedirs(self.save_image_path, exist_ok=True)
  67. # # 使用PyMuPDF打开PDF文件
  68. # doc = fitz.open(self.input_pdf_path)
  69. # all_images_dict = {}
  70. # pdf_img_index = 1
  71. # flag_img_info = {}
  72. # for page_num in range(len(doc)):
  73. # page = doc.load_page(page_num)
  74. # images = page.get_images(full=True)
  75. # page_image_dict = {}
  76. # for img_index, img in enumerate(images):
  77. # xref = img[0] # 图片的XRef编号
  78. # base_image = doc.extract_image(xref)
  79. # image_bytes = base_image["image"]
  80. # # 将字节数据转换为PIL图像
  81. # pil_image = Image.open(io.BytesIO(image_bytes))
  82. # # 生成唯一文件名
  83. # # img_name = f"page{page_num+1}_img{img_index+1}.{base_image['ext']}"
  84. # img_name = f"{self.document_id}_{pdf_img_index}.{base_image['ext']}"
  85. # img_path = os.path.join(self.save_image_path, img_name)
  86. # # page_image_dict[img_index] = img_path
  87. # # 保存成image_name
  88. # image_str = self.knowledge_id + "/" + self.document_id + "/" + img_name
  89. # replace_text = f"【示意图序号_{self.document_id}_{pdf_img_index}】"
  90. # page_image_dict[img_index] = replace_text # 替换pdf中的文字
  91. # # 保存图片
  92. # pil_image.save(img_path)
  93. # # 保存的图片上传的oss
  94. # self.upload_minio.upload_file(img_path, f"/pdf/{image_str}")
  95. # minio_url = minio_config.get("minio_url")
  96. # minio_bucket = minio_config.get("minio_bucket")
  97. # flag_img_info[replace_text] = f"{minio_url}/{minio_bucket}//pdf/{image_str}"
  98. # pdf_img_index += 1
  99. # all_images_dict[page_num] = page_image_dict
  100. # # 关闭原始文档
  101. # doc.close()
  102. # return all_images_dict, flag_img_info
  103. # def replace_images_with_text(self):
  104. # # 打开原始PDF
  105. # doc = fitz.open(self.input_pdf_path)
  106. # # 设置字体大小
  107. # font_size = 12
  108. # font_name = "SimSun"
  109. # font_path = r"./utils/simsun.ttc" # 当前系统中的字体路径
  110. # # 遍历每一页
  111. # for page_num in range(len(doc)):
  112. # page = doc.load_page(page_num) # 获取页面
  113. # images = page.get_images(full=True) # 获取页面中的所有图片
  114. # page_height = page.rect.height
  115. # # print("page_height: ", page_height)
  116. # for img_index, img in enumerate(images):
  117. # xref = img[0] # 图片的XRef编号
  118. # base_image = doc.extract_image(xref) # 提取图片
  119. # bbox = fitz.Rect(img[1:5])
  120. # # print("bbox: ", bbox)
  121. # # 删除图片 - 使用安全方式,避免内部方法可能导致的核心转储
  122. # try:
  123. # # 优先使用官方 API
  124. # page.delete_image(xref)
  125. # except AttributeError:
  126. # # 如果官方 API 不可用,使用 redact 方式遮盖图片区域
  127. # try:
  128. # img_rect = page.get_image_rects(xref)
  129. # if img_rect:
  130. # page.add_redact_annot(img_rect[0])
  131. # page.apply_redactions()
  132. # except Exception as e:
  133. # print(f"删除图片失败 (page={page_num}, img={img_index}): {e}")
  134. # # 准备替换文本
  135. # # replacement_text = f"page{page_num+1}_img{img_index+1}.png"
  136. # replacement_text = self.images_path_dict[page_num][img_index]
  137. # print(f"替换的文本:{replacement_text}")
  138. # # 在删除的图片位置插入文本
  139. # try:
  140. # x0 = self.image_positions_dict[page_num][img_index]['x0']
  141. # y0 = page_height - self.image_positions_dict[page_num][img_index]['y0']
  142. # # 插入文本坐标
  143. # print(f"x0: {x0}, y0: {y0}")
  144. # # 使用fitz中自带的字体 china-s 效果显示不友好,插入的字体一行铺满 fontname="china-s",
  145. # page.insert_text((x0,y0), replacement_text,fontname=font_name, fontfile=font_path, fontsize=font_size, color=(0, 0, 0))
  146. # #page.insert_text((x,y+y1), replacement_text, fontsize=font_size, color=(0, 0, 0))
  147. # except Exception as e:
  148. # print(f"Error inserting text for image on page {page_num + 1}: {e}")
  149. # # 保存修改后的PDF
  150. # doc.save(self.output_pdf_path)
  151. # doc.close()
  152. # print(f"Processed PDF saved to: {self.output_pdf_path}")
  153. # def file2text(self):
  154. # pdf_text = ""
  155. # with fitz.open(self.output_pdf_path) as doc:
  156. # for i, page in enumerate(doc):
  157. # text = page.get_text("text").strip()
  158. # lines = text.split("\n")
  159. # if len(lines) > 0 and lines[-1].strip().isdigit():
  160. # text = "\n".join(lines[:-1]) # 移除最后一行
  161. # if len(lines) > 0 and lines[0].strip().isdigit():
  162. # text = "\n".join(lines[1:]) # 移除第一行
  163. # # print(f"page text:{text.strip()}")
  164. # # pdf_text += text + "\n"
  165. # pdf_text += text
  166. # # print(pdf_text)
  167. # return pdf_text, self.flag_image_info_dict
  168. # class MinerUParsePdf():
  169. # # def __init__(self, knowledge_id, minio_client):
  170. # # self.knowledge_id = knowledge_id
  171. # # self.minio_client = minio_client
  172. # async def extract_text(self, file_path):
  173. # # pdf_file_name = file_path
  174. # # prepare env
  175. # # local_image_dir = f"./tmp_file/{self.knowledge_id}/{doc_id}"
  176. # local_image_dir = f"./tmp_file/images"
  177. # image_dir = str(os.path.basename(local_image_dir))
  178. # os.makedirs(local_image_dir, exist_ok=True)
  179. # image_writer = FileBasedDataWriter(local_image_dir)
  180. # # read bytes
  181. # reader1 = FileBasedDataReader("")
  182. # pdf_bytes = reader1.read(file_path) # read the pdf content
  183. # # proc
  184. # ## Create Dataset Instance
  185. # ds = PymuDocDataset(pdf_bytes)
  186. # infer_result = ds.apply(doc_analyze, ocr=True)
  187. # ## pipeline
  188. # pipe_result = infer_result.pipe_ocr_mode(image_writer)
  189. # content_list_content = pipe_result.get_content_list(image_dir)
  190. # # image_num = 1
  191. # # text = ""
  192. # # flag_img_info = {}
  193. # # current_page = ""
  194. # # for i,content_dict in enumerate(content_list_content):
  195. # # page_index = content_dict.get("page_idx")
  196. # # if i == 0:
  197. # # current_page = page_index
  198. # # elif page_index != current_page:
  199. # # text += "<page>"
  200. # # current_page = page_index
  201. # # else:
  202. # # pass
  203. # # if content_dict.get("type") == "text":
  204. # # content_text = content_dict.get("text")
  205. # # text_level = content_dict.get("text_level")
  206. # # if text_level:
  207. # # text += "#" * text_level + content_text
  208. # # else:
  209. # # text += content_text
  210. # # elif content_dict.get("type") in ("image", "table"):
  211. # # image_path = content_dict.get("img_path")
  212. # # image_name = image_path.split("/")[1]
  213. # # save_image_path = local_image_dir + f"/{image_name}"
  214. # # replace_text = f"【示意图序号_{doc_id}_{image_num}】"
  215. # # minio_file_path = f"/pdf/{self.knowledge_id}/{doc_id}/{replace_text}.jpg"
  216. # # self.minio_client.upload_file(save_image_path, minio_file_path)
  217. # # minio_url = minio_config.get("minio_url")
  218. # # minio_bucket = minio_config.get("minio_bucket")
  219. # # flag_img_info[replace_text] = f"{minio_url}/{minio_bucket}/{minio_file_path}"
  220. # # text += replace_text
  221. # # image_num += 1
  222. # # else:
  223. # # ...
  224. # return content_list_content
  225. # import asyncio
  226. # import os
  227. # from mineru.cli.common import read_fn, prepare_env
  228. # from mineru.data.data_reader_writer import FileBasedDataWriter
  229. # from mineru.backend.vlm.vlm_analyze import aio_doc_analyze, ModelSingleton
  230. # from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make
  231. # from mineru.utils.enum_class import MakeMode
  232. # class MinerUParsePdf:
  233. # """
  234. # MinerU 异步文档分析类
  235. # 封装了模型加载、PDF读取、图片抽取、内容生成等完整流程。
  236. # """
  237. # def __init__(self, gpu_id: int = 0, output_dir: str = "./tmp_file", backend: str = "vllm-async-engine"):
  238. # """
  239. # 初始化 MinerU 分析器。
  240. # Args:
  241. # gpu_id (int): 指定使用的 GPU ID。
  242. # output_dir (str): 临时输出目录,用于保存中间图片。
  243. # backend (str): 使用的推理后端。
  244. # """
  245. # # GPU 环境变量(必须在模型加载前设置)
  246. # os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
  247. # os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
  248. # self.output_dir = output_dir
  249. # self.backend = backend
  250. # self.predictor = None # 懒加载
  251. # self.model_singleton = ModelSingleton()
  252. # async def _ensure_model_loaded(self, gpu_memory_utilization: float = 0.5):
  253. # """
  254. # 确保模型加载,仅在首次调用时加载一次。
  255. # """
  256. # if self.predictor is None:
  257. # print("正在加载 MinerU 模型,请稍候...")
  258. # self.predictor = self.model_singleton.get_model(
  259. # backend=self.backend,
  260. # model_path=None,
  261. # server_url=None,
  262. # gpu_memory_utilization=gpu_memory_utilization,
  263. # )
  264. # print("模型加载完成。")
  265. # async def extract_text(self, pdf_path: str):
  266. # """
  267. # 分析指定 PDF 文件,返回带图像信息的内容结构。
  268. # Args:
  269. # pdf_path (str): 要分析的 PDF 文件路径。
  270. # Returns:
  271. # list: 内容段落列表,包含图文信息。
  272. # """
  273. # await self._ensure_model_loaded()
  274. # pdf_bytes = read_fn(pdf_path)
  275. # local_image_dir, _ = prepare_env(self.output_dir, "document", "vlm")
  276. # image_writer = FileBasedDataWriter(local_image_dir)
  277. # print(f"正在解析文档: {pdf_path}")
  278. # middle_json, infer_result = await aio_doc_analyze(
  279. # pdf_bytes,
  280. # image_writer=image_writer,
  281. # predictor=self.predictor,
  282. # backend=self.backend,
  283. # )
  284. # pdf_info = middle_json.get("pdf_info", {})
  285. # content_list = union_make(pdf_info, MakeMode.CONTENT_LIST, img_buket_path="images")
  286. # print("文档解析完成。")
  287. # return content_list
  288. # # ====== 使用示例 ======
  289. # if __name__ == "__main__":
  290. # async def main():
  291. # analyzer = MinerUAsyncAnalyzer(gpu_id=2, output_dir="./temp_output")
  292. # result = await analyzer.analyze_pdf("ceshi.pdf")
  293. # print(result)
  294. # asyncio.run(main())
  295. # import asyncio
  296. # import os
  297. # import time
  298. # from mineru.cli.common import read_fn, prepare_env, _process_output
  299. # from mineru.data.data_reader_writer import FileBasedDataWriter
  300. # from mineru.backend.vlm.vlm_analyze import aio_doc_analyze, ModelSingleton
  301. # from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make
  302. # from mineru.utils.enum_class import MakeMode
  303. # import mineru
  304. # import threading
  305. # import logging
  306. # logging.basicConfig(level=logging.INFO)
  307. # # 全局服务器URL列表,用于负载均衡
  308. # SERVER_URL_LIST = [
  309. # # "http://127.0.0.1:9999",
  310. # # "http://127.0.0.1:9998",
  311. # # "http://127.0.0.1:9997",
  312. # "http://127.0.0.1:9990",
  313. # "http://127.0.0.1:9991",
  314. # "http://127.0.0.1:9992",
  315. # "http://127.0.0.1:9993"
  316. # ]
  317. # # 全局轮询索引和锁
  318. # _server_index = 0
  319. # _server_index_lock = threading.Lock()
  320. # def get_next_server_url():
  321. # """
  322. # 使用轮询机制获取下一个服务器URL
  323. # Returns:
  324. # str: 下一个可用的服务器URL
  325. # """
  326. # global _server_index
  327. # with _server_index_lock:
  328. # url = SERVER_URL_LIST[_server_index % len(SERVER_URL_LIST)]
  329. # _server_index += 1
  330. # logging.info(f"负载均衡: 选择服务器 {url} (索引: {_server_index - 1})")
  331. # return url
  332. # class MinerUParsePdf:
  333. # """
  334. # MinerU 异步文档分析类
  335. # 封装了模型加载、PDF读取、图片抽取、内容生成等完整流程。
  336. # """
  337. # def __init__(self, output_dir: str = "./tmp_file", server_url: str = None):
  338. # """
  339. # 初始化 MinerU 分析器。
  340. # Args:
  341. # output_dir (str): 临时输出目录,用于保存中间图片。
  342. # server_url (str): vLLM 服务器地址。如果为 None,则使用负载均衡自动选择。
  343. # """
  344. # # 客户端不需要 GPU 环境变量
  345. # os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
  346. # self.output_dir = output_dir
  347. # self.backend = "http-client" # 使用 HTTP 客户端后端
  348. # # 如果未指定 server_url,则使用负载均衡机制自动选择
  349. # self.server_url = server_url if server_url is not None else get_next_server_url()
  350. # self.predictor = None # 懒加载
  351. # self.model_singleton = ModelSingleton()
  352. # logging.info(f"#### mineru path: {os.path.dirname(mineru.__file__)}")
  353. # logging.info(f"#### 使用服务器URL: {self.server_url}")
  354. # # print("#### mineru path:", os.path.dirname(mineru.__file__))
  355. # async def _ensure_model_loaded(self):
  356. # """
  357. # 确保 HTTP 客户端连接建立,仅在首次调用时初始化一次。
  358. # """
  359. # if self.predictor is None:
  360. # print(f"正在连接 vLLM 服务器: {self.server_url}...")
  361. # self.predictor = self.model_singleton.get_model(
  362. # backend=self.backend,
  363. # model_path=None,
  364. # server_url=self.server_url,
  365. # )
  366. # print("服务器连接成功。")
  367. # async def extract_text(self, pdf_path: str):
  368. # """
  369. # 分析指定 PDF 文件,返回带图像信息的内容结构。
  370. # Args:
  371. # pdf_path (str): 要分析的 PDF 文件路径。
  372. # Returns:
  373. # list: 内容段落列表,包含图文信息。
  374. # """
  375. # await self._ensure_model_loaded()
  376. # pdf_bytes = read_fn(pdf_path)
  377. # pdf_file_name = os.path.splitext(os.path.basename(pdf_path))[0] + time.strftime("_%Y%m%d%H%M%S")
  378. # local_image_dir, local_md_dir = prepare_env(self.output_dir, pdf_file_name, "vlm")
  379. # # print("9999"*100,local_image_dir,local_md_dir)
  380. # image_writer = FileBasedDataWriter(local_image_dir)
  381. # md_writer = FileBasedDataWriter(local_md_dir)
  382. # print(f"正在解析文档: {pdf_path}")
  383. # middle_json, infer_result = await aio_doc_analyze(
  384. # pdf_bytes,
  385. # image_writer=image_writer,
  386. # predictor=self.predictor,
  387. # backend=self.backend,
  388. # )
  389. # pdf_info = middle_json.get("pdf_info", {})
  390. # content_list = union_make(pdf_info, MakeMode.CONTENT_LIST, img_buket_path=local_image_dir)
  391. # _process_output(
  392. # pdf_info=pdf_info,
  393. # pdf_bytes=pdf_bytes,
  394. # pdf_file_name=pdf_file_name,
  395. # local_md_dir=local_md_dir,
  396. # local_image_dir=local_image_dir,
  397. # md_writer=md_writer,
  398. # f_draw_layout_bbox=False,
  399. # f_draw_span_bbox=False,
  400. # f_dump_orig_pdf=False,
  401. # f_dump_md=True,
  402. # f_dump_content_list=True,
  403. # f_dump_middle_json=True,
  404. # f_dump_model_output=True,
  405. # f_make_md_mode=MakeMode.MM_MD,
  406. # middle_json=middle_json,
  407. # model_output=infer_result,
  408. # is_pipeline=False,
  409. # )
  410. # path_md = f"{local_md_dir}/{pdf_file_name}.md"
  411. # print(f"文档解析完成。MD 文件已保存到: {path_md}")
  412. # return content_list, path_md, pdf_file_name
  413. # if __name__ == "__main__":
  414. # # input_pdf = r"G:/work/资料/5.1 BMP业务系统使用手册 - 切片.pdf"
  415. # # output_pdf = "./output.pdf"
  416. # # image_folder = "./extracted_images"
  417. # file_json = {
  418. # "knowledge_id": "1234",
  419. # "name": "5.1 BMP业务系统使用手册 - 切片.pdf",
  420. # "document_id": "2222"
  421. # }
  422. # loader = PDFLoader(file_json)
  423. # loader.replace_images_with_text()
  424. import asyncio
  425. import os
  426. import time
  427. import aiohttp
  428. from mineru.cli.common import read_fn, prepare_env, _process_output
  429. from mineru.data.data_reader_writer import FileBasedDataWriter
  430. from mineru.backend.vlm.vlm_analyze import aio_doc_analyze, ModelSingleton
  431. from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make
  432. from mineru.utils.enum_class import MakeMode
  433. import mineru
  434. import logging
  435. logging.basicConfig(level=logging.INFO)
  436. class VLMRetryExhaustedError(RuntimeError):
  437. pass
  438. class MinerUParsePdfClient:
  439. """
  440. MinerU API 客户端
  441. 通过 HTTP 调用 mineru_server.py 提供的 API 服务
  442. """
  443. def __init__(self, output_dir: str = "./tmp_file", server_url: str = "http://127.0.0.1:8120"):
  444. """
  445. 初始化 MinerU API 客户端
  446. Args:
  447. output_dir (str): 临时输出目录
  448. server_url (str): MinerU 服务器地址
  449. """
  450. self.output_dir = output_dir
  451. self.server_url = server_url
  452. self.api_endpoint = f"{server_url}/parse"
  453. logging.info(f"MinerU API 客户端初始化,服务器: {server_url}")
  454. async def extract_text(self, pdf_path: str):
  455. """
  456. 通过 API 解析 PDF 文件
  457. Args:
  458. pdf_path (str): PDF 文件路径
  459. Returns:
  460. tuple: (content_list, path_md, pdf_file_name)
  461. """
  462. logging.info(f"调用 MinerU API 解析: {pdf_path}")
  463. max_retries = 5
  464. for attempt in range(max_retries):
  465. try:
  466. async with aiohttp.ClientSession() as session:
  467. payload = {
  468. "pdf_path": pdf_path,
  469. "output_dir": self.output_dir,
  470. "server_url": "http://127.0.0.1:9999" # vLLM 服务地址
  471. }
  472. async with session.post(self.api_endpoint, json=payload, timeout=aiohttp.ClientTimeout(total=3000)) as response:
  473. if response.status != 200:
  474. error_text = await response.text()
  475. raise RuntimeError(f"MinerU API 调用失败: {response.status}, {error_text}")
  476. result = await response.json()
  477. if result.get("code") != 200:
  478. raise RuntimeError(f"MinerU API 返回错误: {result.get('message')}")
  479. data = result.get("data", {})
  480. content_list = data.get("content_list", [])
  481. md_path = data.get("md_path", "")
  482. pdf_file_name = data.get("pdf_file_name", "")
  483. logging.info(f"MinerU API 解析成功: {pdf_file_name}")
  484. return content_list, md_path, pdf_file_name
  485. except Exception as e:
  486. logging.error(f"MinerU API 调用失败 (尝试 {attempt + 1}/{max_retries}): {e}")
  487. if attempt < max_retries - 1:
  488. await asyncio.sleep(2 ** attempt) # 指数退避
  489. else:
  490. raise RuntimeError(f"MinerU API 调用失败")
  491. class MinerUParsePdf:
  492. """
  493. MinerU 异步文档分析类
  494. 封装了模型加载、PDF读取、图片抽取、内容生成等完整流程。
  495. """
  496. def __init__(self, output_dir: str = "./tmp_file", server_url: str = "http://127.0.0.1:9999"):
  497. """
  498. 初始化 MinerU 分析器。
  499. Args:
  500. output_dir (str): 临时输出目录,用于保存中间图片。
  501. server_url (str): vLLM 服务器地址。
  502. """
  503. # 客户端不需要 GPU 环境变量
  504. os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
  505. self.output_dir = output_dir
  506. self.backend = "http-client" # 使用 HTTP 客户端后端 # http-client
  507. self.server_url = server_url
  508. self.predictor = None # 懒加载
  509. self.model_singleton = ModelSingleton()
  510. logging.info(f"#### mineru path: {os.path.dirname(mineru.__file__)}")
  511. # print("#### mineru path:", os.path.dirname(mineru.__file__))
  512. async def _ensure_model_loaded(self):
  513. """
  514. 确保 HTTP 客户端连接建立,仅在首次调用时初始化一次。
  515. """
  516. if self.predictor is None:
  517. print(f"正在连接 vLLM 服务器: {self.server_url}...")
  518. self.predictor = self.model_singleton.get_model(
  519. backend=self.backend,
  520. model_path=None,
  521. server_url=self.server_url,
  522. )
  523. print("服务器连接成功。")
  524. async def extract_text(self, pdf_path: str):
  525. """
  526. 分析指定 PDF 文件,返回带图像信息的内容结构。
  527. Args:
  528. pdf_path (str): 要分析的 PDF 文件路径。
  529. Returns:
  530. list: 内容段落列表,包含图文信息。
  531. """
  532. # last_exc = None
  533. # for i in range(10):
  534. # pass
  535. # else:
  536. # raise VLMRetryExhaustedError("VLM 连接失败")
  537. await self._ensure_model_loaded()
  538. pdf_bytes = read_fn(pdf_path)
  539. pdf_file_name = os.path.splitext(os.path.basename(pdf_path))[0] + time.strftime("_%Y%m%d%H%M%S")
  540. local_image_dir, local_md_dir = prepare_env(self.output_dir, pdf_file_name, "vlm")
  541. # local_image_dir, local_md_dir = prepare_env(self.output_dir, pdf_file_name, "")
  542. # print("9999"*100,local_image_dir,local_md_dir)
  543. image_writer = FileBasedDataWriter(local_image_dir)
  544. md_writer = FileBasedDataWriter(local_md_dir)
  545. print(f"正在解析文档: {pdf_path}")
  546. import asyncio
  547. from httpx import ReadError, ConnectError, ConnectTimeout
  548. max_retries = 5
  549. for attempt in range(max_retries):
  550. try:
  551. middle_json, infer_result = await aio_doc_analyze(
  552. pdf_bytes,
  553. image_writer=image_writer,
  554. predictor=self.predictor,
  555. backend=self.backend,
  556. # server_url=self.server_url,
  557. )
  558. break
  559. except (ReadError, ConnectError, ConnectTimeout) as e:
  560. if attempt == max_retries - 1:
  561. # raise e
  562. raise VLMRetryExhaustedError(
  563. "VLM 连接失败"
  564. )
  565. logging.info(f"VLM连接失败,重试 {attempt + 1}/{max_retries}")
  566. await asyncio.sleep(2 ** attempt) # 指数退避
  567. pdf_info = middle_json.get("pdf_info", {})
  568. content_list = union_make(pdf_info, MakeMode.CONTENT_LIST, img_buket_path=local_image_dir)
  569. _process_output(
  570. pdf_info=pdf_info,
  571. pdf_bytes=pdf_bytes,
  572. pdf_file_name=pdf_file_name,
  573. local_md_dir=local_md_dir,
  574. local_image_dir=local_image_dir,
  575. md_writer=md_writer,
  576. f_draw_layout_bbox=False,
  577. f_draw_span_bbox=False,
  578. f_dump_orig_pdf=False,
  579. f_dump_md=True,
  580. f_dump_content_list=True,
  581. f_dump_middle_json=True,
  582. f_dump_model_output=True,
  583. f_make_md_mode=MakeMode.MM_MD,
  584. middle_json=middle_json,
  585. model_output=infer_result,
  586. is_pipeline=False,
  587. )
  588. path_md = f"{local_md_dir}/{pdf_file_name}.md"
  589. print(f"文档解析完成。MD 文件已保存到: {path_md}")
  590. return content_list, path_md, pdf_file_name
  591. if __name__ == "__main__":
  592. # input_pdf = r"G:/work/资料/5.1 BMP业务系统使用手册 - 切片.pdf"
  593. # output_pdf = "./output.pdf"
  594. # image_folder = "./extracted_images"
  595. file_json = {
  596. "knowledge_id": "1234",
  597. "name": "5.1 BMP业务系统使用手册 - 切片.pdf",
  598. "document_id": "2222"
  599. }
  600. loader = PDFLoader(file_json)
  601. loader.replace_images_with_text()