pipeline_middle_json_mkcontent.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314
  1. from loguru import logger
  2. from mineru.utils.char_utils import full_to_half_exclude_marks, is_hyphen_at_line_end
  3. from mineru.utils.config_reader import get_latex_delimiter_config
  4. from mineru.backend.pipeline.para_split import ListLineTag
  5. from mineru.utils.enum_class import BlockType, ContentType, MakeMode
  6. from mineru.utils.language import detect_lang
  7. def make_blocks_to_markdown(paras_of_layout,
  8. mode,
  9. img_buket_path='',
  10. ):
  11. page_markdown = []
  12. for para_block in paras_of_layout:
  13. para_text = ''
  14. para_type = para_block['type']
  15. if para_type in [BlockType.TEXT, BlockType.LIST, BlockType.INDEX]:
  16. para_text = merge_para_with_text(para_block)
  17. elif para_type == BlockType.TITLE:
  18. title_level = get_title_level(para_block)
  19. para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
  20. elif para_type == BlockType.INTERLINE_EQUATION:
  21. if len(para_block['lines']) == 0 or len(para_block['lines'][0]['spans']) == 0:
  22. continue
  23. if para_block['lines'][0]['spans'][0].get('content', ''):
  24. para_text = merge_para_with_text(para_block)
  25. else:
  26. para_text += f"![]({img_buket_path}/{para_block['lines'][0]['spans'][0]['image_path']})"
  27. elif para_type == BlockType.IMAGE:
  28. if mode == MakeMode.NLP_MD:
  29. continue
  30. elif mode == MakeMode.MM_MD:
  31. # 检测是否存在图片脚注
  32. has_image_footnote = any(block['type'] == BlockType.IMAGE_FOOTNOTE for block in para_block['blocks'])
  33. # 如果存在图片脚注,则将图片脚注拼接到图片正文后面
  34. if has_image_footnote:
  35. for block in para_block['blocks']: # 1st.拼image_caption
  36. if block['type'] == BlockType.IMAGE_CAPTION:
  37. para_text += merge_para_with_text(block) + ' \n'
  38. for block in para_block['blocks']: # 2nd.拼image_body
  39. if block['type'] == BlockType.IMAGE_BODY:
  40. for line in block['lines']:
  41. for span in line['spans']:
  42. if span['type'] == ContentType.IMAGE:
  43. if span.get('image_path', ''):
  44. para_text += f"![]({img_buket_path}/{span['image_path']})"
  45. for block in para_block['blocks']: # 3rd.拼image_footnote
  46. if block['type'] == BlockType.IMAGE_FOOTNOTE:
  47. para_text += ' \n' + merge_para_with_text(block)
  48. else:
  49. for block in para_block['blocks']: # 1st.拼image_body
  50. if block['type'] == BlockType.IMAGE_BODY:
  51. for line in block['lines']:
  52. for span in line['spans']:
  53. if span['type'] == ContentType.IMAGE:
  54. if span.get('image_path', ''):
  55. para_text += f"![]({img_buket_path}/{span['image_path']})"
  56. for block in para_block['blocks']: # 2nd.拼image_caption
  57. if block['type'] == BlockType.IMAGE_CAPTION:
  58. para_text += ' \n' + merge_para_with_text(block)
  59. elif para_type == BlockType.TABLE:
  60. if mode == MakeMode.NLP_MD:
  61. continue
  62. elif mode == MakeMode.MM_MD:
  63. for block in para_block['blocks']: # 1st.拼table_caption
  64. if block['type'] == BlockType.TABLE_CAPTION:
  65. para_text += merge_para_with_text(block) + ' \n'
  66. for block in para_block['blocks']: # 2nd.拼table_body
  67. if block['type'] == BlockType.TABLE_BODY:
  68. for line in block['lines']:
  69. for span in line['spans']:
  70. if span['type'] == ContentType.TABLE:
  71. # if processed by table model
  72. if span.get('html', ''):
  73. para_text += f"\n{span['html']}\n"
  74. elif span.get('image_path', ''):
  75. para_text += f"![]({img_buket_path}/{span['image_path']})"
  76. for block in para_block['blocks']: # 3rd.拼table_footnote
  77. if block['type'] == BlockType.TABLE_FOOTNOTE:
  78. para_text += '\n' + merge_para_with_text(block) + ' '
  79. if para_text.strip() == '':
  80. continue
  81. else:
  82. # page_markdown.append(para_text.strip() + ' ')
  83. page_markdown.append(para_text.strip())
  84. return page_markdown
  85. latex_delimiters_config = get_latex_delimiter_config()
  86. default_delimiters = {
  87. 'display': {'left': '$$', 'right': '$$'},
  88. 'inline': {'left': '$', 'right': '$'}
  89. }
  90. delimiters = latex_delimiters_config if latex_delimiters_config else default_delimiters
  91. display_left_delimiter = delimiters['display']['left']
  92. display_right_delimiter = delimiters['display']['right']
  93. inline_left_delimiter = delimiters['inline']['left']
  94. inline_right_delimiter = delimiters['inline']['right']
  95. def merge_para_with_text(para_block):
  96. block_text = ''
  97. for line in para_block['lines']:
  98. for span in line['spans']:
  99. if span['type'] in [ContentType.TEXT]:
  100. span['content'] = full_to_half_exclude_marks(span['content'])
  101. block_text += span['content']
  102. block_lang = detect_lang(block_text)
  103. para_text = ''
  104. for i, line in enumerate(para_block['lines']):
  105. if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
  106. para_text += ' \n'
  107. for j, span in enumerate(line['spans']):
  108. span_type = span['type']
  109. content = ''
  110. if span_type == ContentType.TEXT:
  111. content = escape_special_markdown_char(span['content'])
  112. elif span_type == ContentType.INLINE_EQUATION:
  113. if span.get('content', ''):
  114. content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
  115. elif span_type == ContentType.INTERLINE_EQUATION:
  116. if span.get('content', ''):
  117. content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
  118. content = content.strip()
  119. if content:
  120. if span_type == ContentType.INTERLINE_EQUATION:
  121. para_text += content
  122. continue
  123. # 定义CJK语言集合(中日韩)
  124. cjk_langs = {'zh', 'ja', 'ko'}
  125. # logger.info(f'block_lang: {block_lang}, content: {content}')
  126. # 判断是否为行末span
  127. is_last_span = j == len(line['spans']) - 1
  128. if block_lang in cjk_langs: # 中文/日语/韩文语境下,换行不需要空格分隔,但是如果是行内公式结尾,还是要加空格
  129. if is_last_span and span_type not in [ContentType.INLINE_EQUATION]:
  130. para_text += content
  131. else:
  132. para_text += f'{content} '
  133. else:
  134. # 西方文本语境下 每行的最后一个span判断是否要去除连字符
  135. if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
  136. # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
  137. if (
  138. is_last_span
  139. and span_type == ContentType.TEXT
  140. and is_hyphen_at_line_end(content)
  141. ):
  142. # 如果下一行的第一个span是小写字母开头,删除连字符
  143. if (
  144. i + 1 < len(para_block['lines'])
  145. and para_block['lines'][i + 1].get('spans')
  146. and para_block['lines'][i + 1]['spans'][0].get('type') == ContentType.TEXT
  147. and para_block['lines'][i + 1]['spans'][0].get('content', '')
  148. and para_block['lines'][i + 1]['spans'][0]['content'][0].islower()
  149. ):
  150. para_text += content[:-1]
  151. else: # 如果没有下一行,或者下一行的第一个span不是小写字母开头,则保留连字符但不加空格
  152. para_text += content
  153. else: # 西方文本语境下 content间需要空格分隔
  154. para_text += f'{content} '
  155. else:
  156. continue
  157. return para_text
  158. def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size):
  159. para_type = para_block['type']
  160. para_content = {}
  161. if para_type in [
  162. BlockType.TEXT,
  163. BlockType.LIST,
  164. BlockType.INDEX,
  165. ]:
  166. para_content = {
  167. 'type': ContentType.TEXT,
  168. 'text': merge_para_with_text(para_block),
  169. }
  170. elif para_type == BlockType.DISCARDED:
  171. para_content = {
  172. 'type': para_type,
  173. 'text': merge_para_with_text(para_block),
  174. }
  175. elif para_type == BlockType.TITLE:
  176. para_content = {
  177. 'type': ContentType.TEXT,
  178. 'text': merge_para_with_text(para_block),
  179. }
  180. title_level = get_title_level(para_block)
  181. if title_level != 0:
  182. para_content['text_level'] = title_level
  183. elif para_type == BlockType.INTERLINE_EQUATION:
  184. if len(para_block['lines']) == 0 or len(para_block['lines'][0]['spans']) == 0:
  185. return None
  186. para_content = {
  187. 'type': ContentType.EQUATION,
  188. 'img_path': f"{img_buket_path}/{para_block['lines'][0]['spans'][0].get('image_path', '')}",
  189. }
  190. if para_block['lines'][0]['spans'][0].get('content', ''):
  191. para_content['text'] = merge_para_with_text(para_block)
  192. para_content['text_format'] = 'latex'
  193. elif para_type == BlockType.IMAGE:
  194. para_content = {'type': ContentType.IMAGE, 'img_path': '', BlockType.IMAGE_CAPTION: [], BlockType.IMAGE_FOOTNOTE: []}
  195. for block in para_block['blocks']:
  196. if block['type'] == BlockType.IMAGE_BODY:
  197. for line in block['lines']:
  198. for span in line['spans']:
  199. if span['type'] == ContentType.IMAGE:
  200. if span.get('image_path', ''):
  201. para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
  202. if block['type'] == BlockType.IMAGE_CAPTION:
  203. para_content[BlockType.IMAGE_CAPTION].append(merge_para_with_text(block))
  204. if block['type'] == BlockType.IMAGE_FOOTNOTE:
  205. para_content[BlockType.IMAGE_FOOTNOTE].append(merge_para_with_text(block))
  206. elif para_type == BlockType.TABLE:
  207. para_content = {'type': ContentType.TABLE, 'img_path': '', BlockType.TABLE_CAPTION: [], BlockType.TABLE_FOOTNOTE: []}
  208. for block in para_block['blocks']:
  209. if block['type'] == BlockType.TABLE_BODY:
  210. for line in block['lines']:
  211. for span in line['spans']:
  212. if span['type'] == ContentType.TABLE:
  213. if span.get('html', ''):
  214. para_content[BlockType.TABLE_BODY] = f"{span['html']}"
  215. if span.get('image_path', ''):
  216. para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
  217. if block['type'] == BlockType.TABLE_CAPTION:
  218. para_content[BlockType.TABLE_CAPTION].append(merge_para_with_text(block))
  219. if block['type'] == BlockType.TABLE_FOOTNOTE:
  220. para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block))
  221. page_width, page_height = page_size
  222. para_bbox = para_block.get('bbox')
  223. if para_bbox:
  224. x0, y0, x1, y1 = para_bbox
  225. para_content['bbox'] = [
  226. int(x0 * 1000 / page_width),
  227. int(y0 * 1000 / page_height),
  228. int(x1 * 1000 / page_width),
  229. int(y1 * 1000 / page_height),
  230. ]
  231. para_content['page_idx'] = page_idx
  232. return para_content
  233. def union_make(pdf_info_dict: list,
  234. make_mode: str,
  235. img_buket_path: str = '',
  236. ):
  237. output_content = []
  238. for page_info in pdf_info_dict:
  239. paras_of_layout = page_info.get('para_blocks')
  240. paras_of_discarded = page_info.get('discarded_blocks')
  241. page_idx = page_info.get('page_idx')
  242. page_size = page_info.get('page_size')
  243. if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
  244. if not paras_of_layout:
  245. continue
  246. page_markdown = make_blocks_to_markdown(paras_of_layout, make_mode, img_buket_path)
  247. output_content.extend(page_markdown)
  248. elif make_mode == MakeMode.CONTENT_LIST:
  249. para_blocks = (paras_of_layout or []) + (paras_of_discarded or [])
  250. if not para_blocks:
  251. continue
  252. for para_block in para_blocks:
  253. para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
  254. if para_content:
  255. output_content.append(para_content)
  256. if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
  257. return '\n\n'.join(output_content)
  258. elif make_mode == MakeMode.CONTENT_LIST:
  259. return output_content
  260. else:
  261. logger.error(f"Unsupported make mode: {make_mode}")
  262. return None
  263. def get_title_level(block):
  264. title_level = block.get('level', 1)
  265. if title_level > 4:
  266. title_level = 4
  267. elif title_level < 1:
  268. title_level = 0
  269. return title_level
  270. def escape_special_markdown_char(content):
  271. """
  272. 转义正文里对markdown语法有特殊意义的字符
  273. """
  274. special_chars = ["*", "`", "~", "$"]
  275. for char in special_chars:
  276. content = content.replace(char, "\\" + char)
  277. return content