# mineru_server.py (5.1 KB)
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. import os
  4. import time
  5. import asyncio
  6. import traceback
  7. from pathlib import Path
  8. from fastapi import FastAPI, HTTPException
  9. from pydantic import BaseModel
  10. from typing import Optional
  11. import uvicorn
  12. from mineru.cli.common import read_fn, prepare_env, _process_output
  13. from mineru.data.data_reader_writer import FileBasedDataWriter
  14. from mineru.backend.vlm.vlm_analyze import aio_doc_analyze, ModelSingleton
  15. from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make
  16. from mineru.utils.enum_class import MakeMode
  17. from httpx import ReadError, ConnectError, ConnectTimeout
# Restrict CUDA to the device with index 4.
# NOTE(review): this unconditionally overwrites any CUDA_VISIBLE_DEVICES value
# inherited from the environment, and it runs after the mineru imports above;
# confirm that is early enough for the setting to take effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
app = FastAPI(title="MinerU Service")
# Global model instances.
model_singleton = ModelSingleton()
# Predictor cached by get_predictor(); None until the first request creates it.
predictor = None
# Total number of attempts parse_pdf_with_retry() makes before giving up.
MAX_RETRY = 5
RETRY_INTERVAL = 2  # seconds; base of the exponential back-off between attempts
class ParseRequest(BaseModel):
    """Request body accepted by the ``/parse`` endpoint."""

    # Path of the PDF to parse; the endpoint checks it with os.path.exists,
    # so it must be visible on the server's own filesystem.
    pdf_path: str
    # Base directory for generated output (passed to prepare_env).
    output_dir: Optional[str] = "./tmp_file"
    # URL of the VLM server used by the "http-client" backend.
    server_url: Optional[str] = "http://127.0.0.1:9999"
class ParseResponse(BaseModel):
    """Response envelope returned by the ``/parse`` endpoint."""

    # Application-level status: the endpoint uses 200 for success, 500 for failure.
    code: int
    # Human-readable status or error message.
    message: str
    # On success: content_list, md_path and pdf_file_name; None on failure.
    data: Optional[dict] = None
  33. async def get_predictor(server_url: str):
  34. """获取或创建 MinerU predictor"""
  35. global predictor
  36. if predictor is None:
  37. os.environ["MINERU_MODEL_SOURCE"] = "modelscope"
  38. print(f"正在连接 vLLM 服务器: {server_url}...")
  39. predictor = model_singleton.get_model(
  40. backend="http-client",
  41. model_path=None,
  42. server_url=server_url,
  43. )
  44. print("服务器连接成功。")
  45. return predictor
  46. async def parse_pdf_with_retry(pdf_path: str, output_dir: str, server_url: str):
  47. """带重试的 PDF 解析"""
  48. pdf_bytes = read_fn(pdf_path)
  49. pdf_file_name = os.path.splitext(os.path.basename(pdf_path))[0] + time.strftime("_%Y%m%d%H%M%S")
  50. local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, "vlm")
  51. image_writer = FileBasedDataWriter(local_image_dir)
  52. md_writer = FileBasedDataWriter(local_md_dir)
  53. print(f"正在解析文档: {pdf_path}")
  54. # 重试逻辑
  55. for attempt in range(MAX_RETRY):
  56. try:
  57. pred = await get_predictor(server_url)
  58. middle_json, infer_result = await aio_doc_analyze(
  59. pdf_bytes,
  60. image_writer=image_writer,
  61. predictor=pred,
  62. backend="http-client",
  63. )
  64. break
  65. except (ReadError, ConnectError, ConnectTimeout) as e:
  66. if attempt == MAX_RETRY - 1:
  67. raise RuntimeError(f"VLM 连接失败,重试 {MAX_RETRY} 次后仍失败")
  68. print(f"VLM连接失败,重试 {attempt + 1}/{MAX_RETRY}")
  69. await asyncio.sleep(RETRY_INTERVAL ** attempt) # 指数退避
  70. # 生成内容
  71. pdf_info = middle_json.get("pdf_info", {})
  72. content_list = union_make(pdf_info, MakeMode.CONTENT_LIST, img_buket_path=local_image_dir)
  73. # 保存输出
  74. _process_output(
  75. pdf_info=pdf_info,
  76. pdf_bytes=pdf_bytes,
  77. pdf_file_name=pdf_file_name,
  78. local_md_dir=local_md_dir,
  79. local_image_dir=local_image_dir,
  80. md_writer=md_writer,
  81. f_draw_layout_bbox=False,
  82. f_draw_span_bbox=False,
  83. f_dump_orig_pdf=False,
  84. f_dump_md=True,
  85. f_dump_content_list=True,
  86. f_dump_middle_json=True,
  87. f_dump_model_output=True,
  88. f_make_md_mode=MakeMode.MM_MD,
  89. middle_json=middle_json,
  90. model_output=infer_result,
  91. is_pipeline=False,
  92. )
  93. path_md = f"{local_md_dir}/{pdf_file_name}.md"
  94. print(f"文档解析完成。MD 文件已保存到: {path_md}")
  95. return content_list, path_md, pdf_file_name
  96. @app.post("/parse", response_model=ParseResponse)
  97. async def parse_pdf(request: ParseRequest):
  98. """
  99. 解析 PDF 文件
  100. 返回:
  101. - content_list: 内容列表
  102. - md_path: MD 文件保存路径
  103. - pdf_file_name: PDF 文件名
  104. """
  105. try:
  106. pdf_path = request.pdf_path
  107. if not os.path.exists(pdf_path):
  108. raise HTTPException(status_code=400, detail=f"PDF 文件不存在: {pdf_path}")
  109. content_list, md_path, pdf_file_name = await parse_pdf_with_retry(
  110. pdf_path=pdf_path,
  111. output_dir=request.output_dir,
  112. server_url=request.server_url
  113. )
  114. return ParseResponse(
  115. code=200,
  116. message="解析成功",
  117. data={
  118. "content_list": content_list,
  119. "md_path": md_path,
  120. "pdf_file_name": pdf_file_name
  121. }
  122. )
  123. except Exception as e:
  124. traceback.print_exc()
  125. return ParseResponse(
  126. code=500,
  127. message=f"解析失败: {str(e)}",
  128. data=None
  129. )
  130. @app.get("/health")
  131. async def health_check():
  132. """健康检查"""
  133. return {"status": "ok"}
  134. if __name__ == "__main__":
  135. import argparse
  136. parser = argparse.ArgumentParser()
  137. parser.add_argument("--host", default="0.0.0.0", help="服务地址")
  138. parser.add_argument("--port", type=int, default=8120, help="服务端口")
  139. args = parser.parse_args()
  140. uvicorn.run(app, host=args.host, port=args.port)