pdf_image_tools.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. import os
  3. from io import BytesIO
  4. import numpy as np
  5. import pypdfium2 as pdfium
  6. from loguru import logger
  7. from PIL import Image, ImageOps
  8. from mineru.data.data_reader_writer import FileBasedDataWriter
  9. from mineru.utils.check_sys_env import is_windows_environment
  10. from mineru.utils.os_env_config import get_load_images_timeout
  11. from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image
  12. from mineru.utils.enum_class import ImageType
  13. from mineru.utils.hash_utils import str_sha256
  14. from mineru.utils.pdf_page_id import get_end_page_id
  15. from concurrent.futures import ProcessPoolExecutor, TimeoutError as FuturesTimeoutError
  16. def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -> dict:
  17. """Convert pdfium.PdfDocument to image, Then convert the image to base64.
  18. Args:
  19. page (_type_): pdfium.PdfPage
  20. dpi (int, optional): reset the dpi of dpi. Defaults to 200.
  21. image_type (ImageType, optional): The type of image to return. Defaults to ImageType.PIL.
  22. Returns:
  23. dict: {'img_base64': str, 'img_pil': pil_img, 'scale': float }
  24. """
  25. pil_img, scale = page_to_image(page, dpi=dpi)
  26. image_dict = {
  27. "scale": scale,
  28. }
  29. if image_type == ImageType.BASE64:
  30. image_dict["img_base64"] = image_to_b64str(pil_img)
  31. else:
  32. image_dict["img_pil"] = pil_img
  33. return image_dict
  34. def _load_images_from_pdf_worker(
  35. pdf_bytes, dpi, start_page_id, end_page_id, image_type
  36. ):
  37. """用于进程池的包装函数"""
  38. return load_images_from_pdf_core(
  39. pdf_bytes, dpi, start_page_id, end_page_id, image_type
  40. )
  41. def load_images_from_pdf(
  42. pdf_bytes: bytes,
  43. dpi=200,
  44. start_page_id=0,
  45. end_page_id=None,
  46. image_type=ImageType.PIL,
  47. timeout=None,
  48. threads=1, # + 默认4
  49. ):
  50. """带超时控制的 PDF 转图片函数,支持多进程加速
  51. Args:
  52. pdf_bytes (bytes): PDF 文件的 bytes
  53. dpi (int, optional): reset the dpi of dpi. Defaults to 200.
  54. start_page_id (int, optional): 起始页码. Defaults to 0.
  55. end_page_id (int | None, optional): 结束页码. Defaults to None.
  56. image_type (ImageType, optional): 图片类型. Defaults to ImageType.PIL.
  57. timeout (int | None, optional): 超时时间(秒)。如果为 None,则从环境变量 MINERU_PDF_LOAD_IMAGES_TIMEOUT 读取,若未设置则默认为 300 秒。
  58. threads (int): 进程数,默认 4
  59. Raises:
  60. TimeoutError: 当转换超时时抛出
  61. """
  62. pdf_doc = pdfium.PdfDocument(pdf_bytes)
  63. if is_windows_environment():
  64. # Windows 环境下不使用多进程
  65. return load_images_from_pdf_core(
  66. pdf_bytes,
  67. dpi,
  68. start_page_id,
  69. get_end_page_id(end_page_id, len(pdf_doc)),
  70. image_type,
  71. ), pdf_doc
  72. else:
  73. if timeout is None:
  74. timeout = get_load_images_timeout()
  75. end_page_id = get_end_page_id(end_page_id, len(pdf_doc))
  76. # 计算总页数
  77. total_pages = end_page_id - start_page_id + 1
  78. # 实际使用的进程数不超过总页数
  79. actual_threads = min(os.cpu_count() or 1, threads, total_pages)
  80. # 根据实际进程数分组页面范围
  81. pages_per_thread = max(1, total_pages // actual_threads)
  82. page_ranges = []
  83. for i in range(actual_threads):
  84. range_start = start_page_id + i * pages_per_thread
  85. if i == actual_threads - 1:
  86. # 最后一个进程处理剩余所有页面
  87. range_end = end_page_id
  88. else:
  89. range_end = start_page_id + (i + 1) * pages_per_thread - 1
  90. page_ranges.append((range_start, range_end))
  91. # logger.debug(f"PDF to images using {actual_threads} processes, page ranges: {page_ranges}")
  92. with ProcessPoolExecutor(max_workers=actual_threads) as executor:
  93. # 提交所有任务
  94. futures = []
  95. for range_start, range_end in page_ranges:
  96. future = executor.submit(
  97. _load_images_from_pdf_worker,
  98. pdf_bytes,
  99. dpi,
  100. range_start,
  101. range_end,
  102. image_type,
  103. )
  104. futures.append((range_start, future))
  105. try:
  106. # 收集结果并按页码排序
  107. all_results = []
  108. for range_start, future in futures:
  109. images_list = future.result(timeout=timeout)
  110. all_results.append((range_start, images_list))
  111. # 按起始页码排序并合并结果
  112. all_results.sort(key=lambda x: x[0])
  113. images_list = []
  114. for _, imgs in all_results:
  115. images_list.extend(imgs)
  116. return images_list, pdf_doc
  117. except FuturesTimeoutError:
  118. pdf_doc.close()
  119. executor.shutdown(wait=False, cancel_futures=True)
  120. raise TimeoutError(f"PDF to images conversion timeout after {timeout}s")
  121. def load_images_from_pdf_core(
  122. pdf_bytes: bytes,
  123. dpi=200,
  124. start_page_id=0,
  125. end_page_id=None,
  126. image_type=ImageType.PIL, # PIL or BASE64
  127. ):
  128. images_list = []
  129. pdf_doc = pdfium.PdfDocument(pdf_bytes)
  130. pdf_page_num = len(pdf_doc)
  131. end_page_id = get_end_page_id(end_page_id, pdf_page_num)
  132. for index in range(start_page_id, end_page_id + 1):
  133. # logger.debug(f"Converting page {index}/{pdf_page_num} to image")
  134. page = pdf_doc[index]
  135. image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type)
  136. images_list.append(image_dict)
  137. pdf_doc.close()
  138. return images_list
  139. def cut_image(
  140. bbox: tuple,
  141. page_num: int,
  142. page_pil_img,
  143. return_path,
  144. image_writer: FileBasedDataWriter,
  145. scale=2,
  146. ):
  147. """从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径 save_path:需要同时支持s3和本地,
  148. 图片存放在save_path下,文件名是:
  149. {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。"""
  150. # 拼接文件名
  151. filename = f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}"
  152. # 老版本返回不带bucket的路径
  153. img_path = f"{return_path}_{filename}" if return_path is not None else None
  154. # 新版本生成平铺路径
  155. img_hash256_path = f"{str_sha256(img_path)}.jpg"
  156. # img_hash256_path = f'{img_path}.jpg'
  157. crop_img = get_crop_img(bbox, page_pil_img, scale=scale)
  158. img_bytes = image_to_bytes(crop_img, image_format="JPEG")
  159. image_writer.write(img_hash256_path, img_bytes)
  160. return img_hash256_path
  161. def get_crop_img(bbox: tuple, pil_img, scale=2):
  162. scale_bbox = (
  163. int(bbox[0] * scale),
  164. int(bbox[1] * scale),
  165. int(bbox[2] * scale),
  166. int(bbox[3] * scale),
  167. )
  168. return pil_img.crop(scale_bbox)
  169. def get_crop_np_img(bbox: tuple, input_img, scale=2):
  170. if isinstance(input_img, Image.Image):
  171. np_img = np.asarray(input_img)
  172. elif isinstance(input_img, np.ndarray):
  173. np_img = input_img
  174. else:
  175. raise ValueError("Input must be a pillow object or a numpy array.")
  176. scale_bbox = (
  177. int(bbox[0] * scale),
  178. int(bbox[1] * scale),
  179. int(bbox[2] * scale),
  180. int(bbox[3] * scale),
  181. )
  182. return np_img[scale_bbox[1] : scale_bbox[3], scale_bbox[0] : scale_bbox[2]]
  183. def images_bytes_to_pdf_bytes(image_bytes):
  184. # 内存缓冲区
  185. pdf_buffer = BytesIO()
  186. # 载入并转换所有图像为 RGB 模式
  187. image = Image.open(BytesIO(image_bytes))
  188. # 根据 EXIF 信息自动转正(处理手机拍摄的带 Orientation 标记的图片)
  189. image = ImageOps.exif_transpose(image) or image
  190. # 只在必要时转换
  191. if image.mode != "RGB":
  192. image = image.convert("RGB")
  193. # 第一张图保存为 PDF,其余追加
  194. image.save(
  195. pdf_buffer,
  196. format="PDF",
  197. # save_all=True
  198. )
  199. # 获取 PDF bytes 并重置指针(可选)
  200. pdf_bytes = pdf_buffer.getvalue()
  201. pdf_buffer.close()
  202. return pdf_bytes