vlm_middle_json_mkcontent.py 31 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704
  1. import os
  2. from loguru import logger
  3. from mineru.utils.char_utils import full_to_half_exclude_marks, is_hyphen_at_line_end
  4. from mineru.utils.config_reader import get_latex_delimiter_config, get_formula_enable, get_table_enable
  5. from mineru.utils.enum_class import MakeMode, BlockType, ContentType, ContentTypeV2
  6. from mineru.utils.language import detect_lang
  7. latex_delimiters_config = get_latex_delimiter_config()
  8. default_delimiters = {
  9. 'display': {'left': '$$', 'right': '$$'},
  10. 'inline': {'left': '$', 'right': '$'}
  11. }
  12. delimiters = latex_delimiters_config if latex_delimiters_config else default_delimiters
  13. display_left_delimiter = delimiters['display']['left']
  14. display_right_delimiter = delimiters['display']['right']
  15. inline_left_delimiter = delimiters['inline']['left']
  16. inline_right_delimiter = delimiters['inline']['right']
  17. # +
  18. from utils.upload_file_to_oss import UploadMinio
  19. from config import minio_config
  20. # +
  21. def upload_image_to_minio(span, img_buket_path):
  22. """
  23. 上传图片到MinIO并返回访问路径
  24. Args:
  25. span: 包含image_path的字典
  26. img_buket_path: 本地图片目录路径
  27. Returns:
  28. success: 上传是否成功 (True/False)
  29. content: Markdown格式的图片链接
  30. """
  31. content = ''
  32. image_path = span['image_path']
  33. local_file_path = f"{img_buket_path}/{image_path}"
  34. # logger.info(f"local_file_path11111111111111111111111111111{local_file_path}")
  35. # 上传到MinIO,使用 MDImages/文件名 作为对象名称
  36. minio_object_name = f"MDImages/{image_path}"
  37. success = UploadMinio().upload_file(local_file_path, minio_object_name)
  38. if success:
  39. print("**上传成功**" * 100)
  40. # 构造完整的MinIO访问URL
  41. minio_url = minio_config.get("minio_url")
  42. minio_bucket = minio_config.get("minio_bucket")
  43. content = f"![]({minio_url}/{minio_bucket}/{minio_object_name})"
  44. return success, content
  45. def merge_para_with_text(para_block, formula_enable=True, img_buket_path=''):
  46. block_text = ''
  47. for line in para_block['lines']:
  48. for span in line['spans']:
  49. if span['type'] in [ContentType.TEXT]:
  50. span['content'] = full_to_half_exclude_marks(span['content'])
  51. block_text += span['content']
  52. block_lang = detect_lang(block_text)
  53. para_text = ''
  54. for i, line in enumerate(para_block['lines']):
  55. for j, span in enumerate(line['spans']):
  56. span_type = span['type']
  57. content = ''
  58. if span_type == ContentType.TEXT:
  59. content = span['content']
  60. elif span_type == ContentType.INLINE_EQUATION:
  61. content = f"{inline_left_delimiter}{span['content']}{inline_right_delimiter}"
  62. elif span_type == ContentType.INTERLINE_EQUATION:
  63. if formula_enable:
  64. content = f"\n{display_left_delimiter}\n{span['content']}\n{display_right_delimiter}\n"
  65. else:
  66. if span.get('image_path', ''):
  67. content = f"![]({img_buket_path}/{span['image_path']})"
  68. content = content.strip()
  69. if content:
  70. if span_type == ContentType.INTERLINE_EQUATION:
  71. para_text += content
  72. continue
  73. # 定义CJK语言集合(中日韩)
  74. cjk_langs = {'zh', 'ja', 'ko'}
  75. # logger.info(f'block_lang: {block_lang}, content: {content}')
  76. # 判断是否为行末span
  77. is_last_span = j == len(line['spans']) - 1
  78. if block_lang in cjk_langs: # 中文/日语/韩文语境下,换行不需要空格分隔,但是如果是行内公式结尾,还是要加空格
  79. if is_last_span and span_type != ContentType.INLINE_EQUATION:
  80. para_text += content
  81. else:
  82. para_text += f'{content} '
  83. else:
  84. # 西方文本语境下 每行的最后一个span判断是否要去除连字符
  85. if span_type in [ContentType.TEXT, ContentType.INLINE_EQUATION]:
  86. # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
  87. if (
  88. is_last_span
  89. and span_type == ContentType.TEXT
  90. and is_hyphen_at_line_end(content)
  91. ):
  92. # 如果下一行的第一个span是小写字母开头,删除连字符
  93. if (
  94. i+1 < len(para_block['lines'])
  95. and para_block['lines'][i + 1].get('spans')
  96. and para_block['lines'][i + 1]['spans'][0].get('type') == ContentType.TEXT
  97. and para_block['lines'][i + 1]['spans'][0].get('content', '')
  98. and para_block['lines'][i + 1]['spans'][0]['content'][0].islower()
  99. ):
  100. para_text += content[:-1]
  101. else: # 如果没有下一行,或者下一行的第一个span不是小写字母开头,则保留连字符但不加空格
  102. para_text += content
  103. else: # 西方文本语境下 content间需要空格分隔
  104. para_text += f'{content} '
  105. return para_text
  106. def mk_blocks_to_markdown(para_blocks, make_mode, formula_enable, table_enable, img_buket_path=''):
  107. page_markdown = []
  108. for para_block in para_blocks:
  109. para_text = ''
  110. para_type = para_block['type']
  111. if para_type in [BlockType.TEXT, BlockType.INTERLINE_EQUATION, BlockType.PHONETIC, BlockType.REF_TEXT]:
  112. para_text = merge_para_with_text(para_block, formula_enable=formula_enable, img_buket_path=img_buket_path)
  113. elif para_type == BlockType.LIST:
  114. for block in para_block['blocks']:
  115. item_text = merge_para_with_text(block, formula_enable=formula_enable, img_buket_path=img_buket_path)
  116. para_text += f"{item_text} \n"
  117. elif para_type == BlockType.TITLE:
  118. title_level = get_title_level(para_block)
  119. para_text = f'{"#" * title_level} {merge_para_with_text(para_block)}'
  120. elif para_type == BlockType.IMAGE:
  121. if make_mode == MakeMode.NLP_MD:
  122. continue
  123. elif make_mode == MakeMode.MM_MD:
  124. # 检测是否存在图片脚注
  125. has_image_footnote = any(block['type'] == BlockType.IMAGE_FOOTNOTE for block in para_block['blocks'])
  126. # 如果存在图片脚注,则将图片脚注拼接到图片正文后面
  127. if has_image_footnote:
  128. for block in para_block['blocks']: # 1st.拼image_caption
  129. if block['type'] == BlockType.IMAGE_CAPTION:
  130. para_text += merge_para_with_text(block) + ' \n'
  131. for block in para_block['blocks']: # 2nd.拼image_body
  132. if block['type'] == BlockType.IMAGE_BODY:
  133. for line in block['lines']:
  134. for span in line['spans']:
  135. if span['type'] == ContentType.IMAGE:
  136. if span.get('image_path', ''):
  137. success, content = upload_image_to_minio(span, img_buket_path)
  138. if success:
  139. para_text += content
  140. else:
  141. para_text += f"![]({img_buket_path}/{span['image_path']})"
  142. for block in para_block['blocks']: # 3rd.拼image_footnote
  143. if block['type'] == BlockType.IMAGE_FOOTNOTE:
  144. para_text += ' \n' + merge_para_with_text(block)
  145. else:
  146. for block in para_block['blocks']: # 1st.拼image_body
  147. if block['type'] == BlockType.IMAGE_BODY:
  148. for line in block['lines']:
  149. for span in line['spans']:
  150. if span['type'] == ContentType.IMAGE:
  151. if span.get('image_path', ''):
  152. success, content = upload_image_to_minio(span, img_buket_path)
  153. if success:
  154. para_text += content
  155. else:
  156. para_text += f"![]({img_buket_path}/{span['image_path']})"
  157. for block in para_block['blocks']: # 2nd.拼image_caption
  158. if block['type'] == BlockType.IMAGE_CAPTION:
  159. para_text += ' \n' + merge_para_with_text(block)
  160. elif para_type == BlockType.TABLE:
  161. if make_mode == MakeMode.NLP_MD:
  162. continue
  163. elif make_mode == MakeMode.MM_MD:
  164. for block in para_block['blocks']: # 1st.拼table_caption
  165. if block['type'] == BlockType.TABLE_CAPTION:
  166. para_text += merge_para_with_text(block) + ' \n'
  167. for block in para_block['blocks']: # 2nd.拼table_body
  168. if block['type'] == BlockType.TABLE_BODY:
  169. for line in block['lines']:
  170. for span in line['spans']:
  171. if span['type'] == ContentType.TABLE:
  172. # if processed by table model
  173. if table_enable:
  174. if span.get('html', ''):
  175. para_text += f"\n{span['html']}\n"
  176. elif span.get('image_path', ''):
  177. para_text += f"![]({img_buket_path}/{span['image_path']})"
  178. else:
  179. if span.get('image_path', ''):
  180. para_text += f"![]({img_buket_path}/{span['image_path']})"
  181. for block in para_block['blocks']: # 3rd.拼table_footnote
  182. if block['type'] == BlockType.TABLE_FOOTNOTE:
  183. para_text += '\n' + merge_para_with_text(block) + ' '
  184. elif para_type == BlockType.CODE:
  185. sub_type = para_block["sub_type"]
  186. for block in para_block['blocks']: # 1st.拼code_caption
  187. if block['type'] == BlockType.CODE_CAPTION:
  188. para_text += merge_para_with_text(block) + ' \n'
  189. for block in para_block['blocks']: # 2nd.拼code_body
  190. if block['type'] == BlockType.CODE_BODY:
  191. if sub_type == BlockType.CODE:
  192. guess_lang = para_block["guess_lang"]
  193. para_text += f"```{guess_lang}\n{merge_para_with_text(block)}\n```"
  194. elif sub_type == BlockType.ALGORITHM:
  195. para_text += merge_para_with_text(block)
  196. if para_text.strip() == '':
  197. continue
  198. else:
  199. # page_markdown.append(para_text.strip() + ' ')
  200. page_markdown.append(para_text.strip())
  201. return page_markdown
  202. def make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size):
  203. para_type = para_block['type']
  204. para_content = {}
  205. if para_type in [
  206. BlockType.TEXT,
  207. BlockType.REF_TEXT,
  208. BlockType.PHONETIC,
  209. BlockType.HEADER,
  210. BlockType.FOOTER,
  211. BlockType.PAGE_NUMBER,
  212. BlockType.ASIDE_TEXT,
  213. BlockType.PAGE_FOOTNOTE,
  214. ]:
  215. para_content = {
  216. 'type': para_type,
  217. 'text': merge_para_with_text(para_block),
  218. }
  219. elif para_type == BlockType.LIST:
  220. para_content = {
  221. 'type': para_type,
  222. 'sub_type': para_block.get('sub_type', ''),
  223. 'list_items':[],
  224. }
  225. for block in para_block['blocks']:
  226. item_text = merge_para_with_text(block)
  227. if item_text.strip():
  228. para_content['list_items'].append(item_text)
  229. elif para_type == BlockType.TITLE:
  230. title_level = get_title_level(para_block)
  231. para_content = {
  232. 'type': ContentType.TEXT,
  233. 'text': merge_para_with_text(para_block),
  234. 'title_path': para_block.get('title_path', ''),
  235. }
  236. if title_level != 0:
  237. para_content['text_level'] = title_level
  238. elif para_type == BlockType.INTERLINE_EQUATION:
  239. para_content = {
  240. 'type': ContentType.EQUATION,
  241. 'text': merge_para_with_text(para_block),
  242. 'text_format': 'latex',
  243. }
  244. elif para_type == BlockType.IMAGE:
  245. para_content = {'type': ContentType.IMAGE, 'img_path': '', BlockType.IMAGE_CAPTION: [], BlockType.IMAGE_FOOTNOTE: []}
  246. for block in para_block['blocks']:
  247. if block['type'] == BlockType.IMAGE_BODY:
  248. for line in block['lines']:
  249. for span in line['spans']:
  250. if span['type'] == ContentType.IMAGE:
  251. if span.get('image_path', ''):
  252. para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
  253. if block['type'] == BlockType.IMAGE_CAPTION:
  254. para_content[BlockType.IMAGE_CAPTION].append(merge_para_with_text(block))
  255. if block['type'] == BlockType.IMAGE_FOOTNOTE:
  256. para_content[BlockType.IMAGE_FOOTNOTE].append(merge_para_with_text(block))
  257. elif para_type == BlockType.TABLE:
  258. para_content = {'type': ContentType.TABLE, 'img_path': '', BlockType.TABLE_CAPTION: [], BlockType.TABLE_FOOTNOTE: []}
  259. for block in para_block['blocks']:
  260. if block['type'] == BlockType.TABLE_BODY:
  261. for line in block['lines']:
  262. for span in line['spans']:
  263. if span['type'] == ContentType.TABLE:
  264. if span.get('html', ''):
  265. para_content[BlockType.TABLE_BODY] = f"{span['html']}"
  266. if span.get('image_path', ''):
  267. para_content['img_path'] = f"{img_buket_path}/{span['image_path']}"
  268. if block['type'] == BlockType.TABLE_CAPTION:
  269. para_content[BlockType.TABLE_CAPTION].append(merge_para_with_text(block))
  270. if block['type'] == BlockType.TABLE_FOOTNOTE:
  271. para_content[BlockType.TABLE_FOOTNOTE].append(merge_para_with_text(block))
  272. elif para_type == BlockType.CODE:
  273. para_content = {'type': BlockType.CODE, 'sub_type': para_block["sub_type"], BlockType.CODE_CAPTION: []}
  274. for block in para_block['blocks']:
  275. if block['type'] == BlockType.CODE_BODY:
  276. para_content[BlockType.CODE_BODY] = merge_para_with_text(block)
  277. if para_block["sub_type"] == BlockType.CODE:
  278. para_content["guess_lang"] = para_block["guess_lang"]
  279. if block['type'] == BlockType.CODE_CAPTION:
  280. para_content[BlockType.CODE_CAPTION].append(merge_para_with_text(block))
  281. page_width, page_height = page_size
  282. para_bbox = para_block.get('bbox')
  283. if para_bbox:
  284. x0, y0, x1, y1 = para_bbox
  285. para_content['bbox'] = [
  286. int(x0 * 1000 / page_width),
  287. int(y0 * 1000 / page_height),
  288. int(x1 * 1000 / page_width),
  289. int(y1 * 1000 / page_height),
  290. ]
  291. para_content['page_idx'] = page_idx
  292. return para_content
  293. def make_blocks_to_content_list_v2(para_block, img_buket_path, page_size):
  294. para_type = para_block['type']
  295. para_content = {}
  296. if para_type in [
  297. BlockType.HEADER,
  298. BlockType.FOOTER,
  299. BlockType.ASIDE_TEXT,
  300. BlockType.PAGE_NUMBER,
  301. BlockType.PAGE_FOOTNOTE,
  302. ]:
  303. if para_type == BlockType.HEADER:
  304. content_type = ContentTypeV2.PAGE_HEADER
  305. elif para_type == BlockType.FOOTER:
  306. content_type = ContentTypeV2.PAGE_FOOTER
  307. elif para_type == BlockType.ASIDE_TEXT:
  308. content_type = ContentTypeV2.PAGE_ASIDE_TEXT
  309. elif para_type == BlockType.PAGE_NUMBER:
  310. content_type = ContentTypeV2.PAGE_NUMBER
  311. elif para_type == BlockType.PAGE_FOOTNOTE:
  312. content_type = ContentTypeV2.PAGE_FOOTNOTE
  313. else:
  314. raise ValueError(f"Unknown para_type: {para_type}")
  315. para_content = {
  316. 'type': content_type,
  317. 'content': {
  318. f"{content_type}_content": merge_para_with_text_v2(para_block),
  319. }
  320. }
  321. elif para_type == BlockType.TITLE:
  322. title_level = get_title_level(para_block)
  323. if title_level != 0:
  324. para_content = {
  325. 'type': ContentTypeV2.TITLE,
  326. 'content': {
  327. "title_content": merge_para_with_text_v2(para_block),
  328. "level": title_level
  329. }
  330. }
  331. else:
  332. para_content = {
  333. 'type': ContentTypeV2.PARAGRAPH,
  334. 'content': {
  335. "paragraph_content": merge_para_with_text_v2(para_block),
  336. }
  337. }
  338. elif para_type in [
  339. BlockType.TEXT,
  340. BlockType.PHONETIC
  341. ]:
  342. para_content = {
  343. 'type': ContentTypeV2.PARAGRAPH,
  344. 'content': {
  345. 'paragraph_content': merge_para_with_text_v2(para_block),
  346. }
  347. }
  348. elif para_type == BlockType.INTERLINE_EQUATION:
  349. image_path, math_content = get_body_data(para_block)
  350. para_content = {
  351. 'type': ContentTypeV2.EQUATION_INTERLINE,
  352. 'content': {
  353. 'math_content': math_content,
  354. 'math_type': 'latex',
  355. 'image_source': {'path': f"{img_buket_path}/{image_path}"},
  356. }
  357. }
  358. elif para_type == BlockType.IMAGE:
  359. image_caption = []
  360. image_footnote = []
  361. image_path, _ = get_body_data(para_block)
  362. image_source = {
  363. 'path': f"{img_buket_path}/{image_path}",
  364. }
  365. for block in para_block['blocks']:
  366. if block['type'] == BlockType.IMAGE_CAPTION:
  367. image_caption.extend(merge_para_with_text_v2(block))
  368. if block['type'] == BlockType.IMAGE_FOOTNOTE:
  369. image_footnote.extend(merge_para_with_text_v2(block))
  370. para_content = {
  371. 'type': ContentTypeV2.IMAGE,
  372. 'content': {
  373. 'image_source': image_source,
  374. 'image_caption': image_caption,
  375. 'image_footnote': image_footnote,
  376. }
  377. }
  378. elif para_type == BlockType.TABLE:
  379. table_caption = []
  380. table_footnote = []
  381. image_path, html = get_body_data(para_block)
  382. image_source = {
  383. 'path': f"{img_buket_path}/{image_path}",
  384. }
  385. if html.count("<table") > 1:
  386. table_nest_level = 2
  387. else:
  388. table_nest_level = 1
  389. if (
  390. "colspan" in html or
  391. "rowspan" in html or
  392. table_nest_level > 1
  393. ):
  394. table_type = ContentTypeV2.TABLE_COMPLEX
  395. else:
  396. table_type = ContentTypeV2.TABLE_SIMPLE
  397. for block in para_block['blocks']:
  398. if block['type'] == BlockType.TABLE_CAPTION:
  399. table_caption.extend(merge_para_with_text_v2(block))
  400. if block['type'] == BlockType.TABLE_FOOTNOTE:
  401. table_footnote.extend(merge_para_with_text_v2(block))
  402. para_content = {
  403. 'type': ContentTypeV2.TABLE,
  404. 'content': {
  405. 'image_source': image_source,
  406. 'table_caption': table_caption,
  407. 'table_footnote': table_footnote,
  408. 'html': html,
  409. 'table_type': table_type,
  410. 'table_nest_level': table_nest_level,
  411. }
  412. }
  413. elif para_type == BlockType.CODE:
  414. code_caption = []
  415. code_content = []
  416. for block in para_block['blocks']:
  417. if block['type'] == BlockType.CODE_CAPTION:
  418. code_caption.extend(merge_para_with_text_v2(block))
  419. if block['type'] == BlockType.CODE_BODY:
  420. code_content = merge_para_with_text_v2(block)
  421. sub_type = para_block["sub_type"]
  422. if sub_type == BlockType.CODE:
  423. para_content = {
  424. 'type': ContentTypeV2.CODE,
  425. 'content': {
  426. 'code_caption': code_caption,
  427. 'code_content': code_content,
  428. 'code_language': para_block.get('guess_lang', 'txt'),
  429. }
  430. }
  431. elif sub_type == BlockType.ALGORITHM:
  432. para_content = {
  433. 'type': ContentTypeV2.ALGORITHM,
  434. 'content': {
  435. 'algorithm_caption': code_caption,
  436. 'algorithm_content': code_content,
  437. }
  438. }
  439. else:
  440. raise ValueError(f"Unknown code sub_type: {sub_type}")
  441. elif para_type == BlockType.REF_TEXT:
  442. para_content = {
  443. 'type': ContentTypeV2.LIST,
  444. 'content': {
  445. 'list_type': ContentTypeV2.LIST_REF,
  446. 'list_items': [
  447. {
  448. 'item_type': 'text',
  449. 'item_content': merge_para_with_text_v2(para_block),
  450. }
  451. ],
  452. }
  453. }
  454. elif para_type == BlockType.LIST:
  455. if 'sub_type' in para_block:
  456. if para_block['sub_type'] == BlockType.REF_TEXT:
  457. list_type = ContentTypeV2.LIST_REF
  458. elif para_block['sub_type'] == BlockType.TEXT:
  459. list_type = ContentTypeV2.LIST_TEXT
  460. else:
  461. raise ValueError(f"Unknown list sub_type: {para_block['sub_type']}")
  462. else:
  463. list_type = ContentTypeV2.LIST_TEXT
  464. list_items = []
  465. for block in para_block['blocks']:
  466. item_content = merge_para_with_text_v2(block)
  467. if item_content:
  468. list_items.append({
  469. 'item_type': 'text',
  470. 'item_content': item_content,
  471. })
  472. para_content = {
  473. 'type': ContentTypeV2.LIST,
  474. 'content': {
  475. 'list_type': list_type,
  476. 'list_items': list_items,
  477. }
  478. }
  479. page_width, page_height = page_size
  480. para_bbox = para_block.get('bbox')
  481. if para_bbox:
  482. x0, y0, x1, y1 = para_bbox
  483. para_content['bbox'] = [
  484. int(x0 * 1000 / page_width),
  485. int(y0 * 1000 / page_height),
  486. int(x1 * 1000 / page_width),
  487. int(y1 * 1000 / page_height),
  488. ]
  489. return para_content
  490. def get_body_data(para_block):
  491. """
  492. Extract image_path and html from para_block
  493. Returns:
  494. - For IMAGE/INTERLINE_EQUATION: (image_path, '')
  495. - For TABLE: (image_path, html)
  496. - Default: ('', '')
  497. """
  498. def get_data_from_spans(lines):
  499. for line in lines:
  500. for span in line.get('spans', []):
  501. span_type = span.get('type')
  502. if span_type == ContentType.TABLE:
  503. return span.get('image_path', ''), span.get('html', '')
  504. elif span_type == ContentType.IMAGE:
  505. return span.get('image_path', ''), ''
  506. elif span_type == ContentType.INTERLINE_EQUATION:
  507. return span.get('image_path', ''), span.get('content', '')
  508. elif span_type == ContentType.TEXT:
  509. return '', span.get('content', '')
  510. return '', ''
  511. # 处理嵌套的 blocks 结构
  512. if 'blocks' in para_block:
  513. for block in para_block['blocks']:
  514. block_type = block.get('type')
  515. if block_type in [BlockType.IMAGE_BODY, BlockType.TABLE_BODY, BlockType.CODE_BODY]:
  516. result = get_data_from_spans(block.get('lines', []))
  517. if result != ('', ''):
  518. return result
  519. return '', ''
  520. # 处理直接包含 lines 的结构
  521. return get_data_from_spans(para_block.get('lines', []))
  522. def merge_para_with_text_v2(para_block):
  523. block_text = ''
  524. for line in para_block['lines']:
  525. for span in line['spans']:
  526. if span['type'] in [ContentType.TEXT]:
  527. span['content'] = full_to_half_exclude_marks(span['content'])
  528. block_text += span['content']
  529. block_lang = detect_lang(block_text)
  530. para_content = []
  531. para_type = para_block['type']
  532. for i, line in enumerate(para_block['lines']):
  533. for j, span in enumerate(line['spans']):
  534. span_type = span['type']
  535. if span.get("content", '').strip():
  536. if span_type == ContentType.TEXT:
  537. if para_type == BlockType.PHONETIC:
  538. span_type = ContentTypeV2.SPAN_PHONETIC
  539. else:
  540. span_type = ContentTypeV2.SPAN_TEXT
  541. if span_type == ContentType.INLINE_EQUATION:
  542. span_type = ContentTypeV2.SPAN_EQUATION_INLINE
  543. if span_type in [
  544. ContentTypeV2.SPAN_TEXT,
  545. ]:
  546. # 定义CJK语言集合(中日韩)
  547. cjk_langs = {'zh', 'ja', 'ko'}
  548. # logger.info(f'block_lang: {block_lang}, content: {content}')
  549. # 判断是否为行末span
  550. is_last_span = j == len(line['spans']) - 1
  551. if block_lang in cjk_langs: # 中文/日语/韩文语境下,换行不需要空格分隔,但是如果是行内公式结尾,还是要加空格
  552. if is_last_span:
  553. span_content = span['content']
  554. else:
  555. span_content = f"{span['content']} "
  556. else:
  557. # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除
  558. if (
  559. is_last_span
  560. and is_hyphen_at_line_end(span['content'])
  561. ):
  562. # 如果下一行的第一个span是小写字母开头,删除连字符
  563. if (
  564. i + 1 < len(para_block['lines'])
  565. and para_block['lines'][i + 1].get('spans')
  566. and para_block['lines'][i + 1]['spans'][0].get('type') == ContentType.TEXT
  567. and para_block['lines'][i + 1]['spans'][0].get('content', '')
  568. and para_block['lines'][i + 1]['spans'][0]['content'][0].islower()
  569. ):
  570. span_content = span['content'][:-1]
  571. else: # 如果没有下一行,或者下一行的第一个span不是小写字母开头,则保留连字符但不加空格
  572. span_content = span['content']
  573. else:
  574. # 西方文本语境下content间需要空格分隔
  575. span_content = f"{span['content']} "
  576. if para_content and para_content[-1]['type'] == span_type:
  577. # 合并相同类型的span
  578. para_content[-1]['content'] += span_content
  579. else:
  580. span_content = {
  581. 'type': span_type,
  582. 'content': span_content,
  583. }
  584. para_content.append(span_content)
  585. elif span_type in [
  586. ContentTypeV2.SPAN_PHONETIC,
  587. ContentTypeV2.SPAN_EQUATION_INLINE,
  588. ]:
  589. span_content = {
  590. 'type': span_type,
  591. 'content': span['content'],
  592. }
  593. para_content.append(span_content)
  594. else:
  595. logger.warning(f"Unknown span type in merge_para_with_text_v2: {span_type}")
  596. return para_content
  597. def union_make(pdf_info_dict: list,
  598. make_mode: str,
  599. img_buket_path: str = '',
  600. ):
  601. formula_enable = get_formula_enable(os.getenv('MINERU_VLM_FORMULA_ENABLE', 'True').lower() == 'true')
  602. table_enable = get_table_enable(os.getenv('MINERU_VLM_TABLE_ENABLE', 'True').lower() == 'true')
  603. logger.info(f"img_buket_path:{img_buket_path}")
  604. output_content = []
  605. for page_info in pdf_info_dict:
  606. paras_of_layout = page_info.get('para_blocks')
  607. paras_of_discarded = page_info.get('discarded_blocks')
  608. page_idx = page_info.get('page_idx')
  609. page_size = page_info.get('page_size')
  610. if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
  611. if not paras_of_layout:
  612. continue
  613. page_markdown = mk_blocks_to_markdown(paras_of_layout, make_mode, formula_enable, table_enable, img_buket_path)
  614. output_content.extend(page_markdown)
  615. elif make_mode == MakeMode.CONTENT_LIST:
  616. para_blocks = (paras_of_layout or []) + (paras_of_discarded or [])
  617. if not para_blocks:
  618. continue
  619. for para_block in para_blocks:
  620. para_content = make_blocks_to_content_list(para_block, img_buket_path, page_idx, page_size)
  621. output_content.append(para_content)
  622. elif make_mode == MakeMode.CONTENT_LIST_V2:
  623. # https://github.com/drunkpig/llm-webkit-mirror/blob/dev6/docs/specification/output_format/content_list_spec.md
  624. para_blocks = (paras_of_layout or []) + (paras_of_discarded or [])
  625. page_contents = []
  626. if para_blocks:
  627. for para_block in para_blocks:
  628. para_content = make_blocks_to_content_list_v2(para_block, img_buket_path, page_size)
  629. page_contents.append(para_content)
  630. output_content.append(page_contents)
  631. if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
  632. return '\n\n'.join(output_content)
  633. elif make_mode in [MakeMode.CONTENT_LIST, MakeMode.CONTENT_LIST_V2]:
  634. return output_content
  635. return None
  636. def get_title_level(block):
  637. title_level = block.get('level', 1)
  638. if title_level > 5:
  639. title_level = 5
  640. elif title_level < 1:
  641. title_level = 0
  642. return title_level