table_merge.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. from copy import deepcopy
  3. from loguru import logger
  4. from bs4 import BeautifulSoup
  5. from mineru.backend.vlm.vlm_middle_json_mkcontent import merge_para_with_text
  6. from mineru.utils.char_utils import full_to_half
  7. from mineru.utils.enum_class import BlockType, SplitFlag
# Caption suffixes that mark a table as the continuation of a previous one.
# can_merge_tables() lower-cases each marker and tests it with str.endswith()
# against the caption text after full_to_half() normalisation.
CONTINUATION_END_MARKERS = [
    "(续)",
    "(续表)",
    "(续上表)",
    "(continued)",
    "(cont.)",
    "(cont’d)",
    "(…continued)",
    "续表",
]
# Markers that count as a continuation hint when they occur anywhere inside
# the caption text (substring test in can_merge_tables()), not only at its end.
CONTINUATION_INLINE_MARKERS = [
    "(continued)",
]
  21. def calculate_table_total_columns(soup):
  22. """计算表格的总列数,通过分析整个表格结构来处理rowspan和colspan
  23. Args:
  24. soup: BeautifulSoup解析的表格
  25. Returns:
  26. int: 表格的总列数
  27. """
  28. rows = soup.find_all("tr")
  29. if not rows:
  30. return 0
  31. # 创建一个矩阵来跟踪每个位置的占用情况
  32. max_cols = 0
  33. occupied = {} # {row_idx: {col_idx: True}}
  34. for row_idx, row in enumerate(rows):
  35. col_idx = 0
  36. cells = row.find_all(["td", "th"])
  37. if row_idx not in occupied:
  38. occupied[row_idx] = {}
  39. for cell in cells:
  40. # 找到下一个未被占用的列位置
  41. while col_idx in occupied[row_idx]:
  42. col_idx += 1
  43. colspan = int(cell.get("colspan", 1))
  44. rowspan = int(cell.get("rowspan", 1))
  45. # 标记被这个单元格占用的所有位置
  46. for r in range(row_idx, row_idx + rowspan):
  47. if r not in occupied:
  48. occupied[r] = {}
  49. for c in range(col_idx, col_idx + colspan):
  50. occupied[r][c] = True
  51. col_idx += colspan
  52. max_cols = max(max_cols, col_idx)
  53. return max_cols
  54. def build_table_occupied_matrix(soup):
  55. """构建表格的占用矩阵,返回每行的有效列数
  56. Args:
  57. soup: BeautifulSoup解析的表格
  58. Returns:
  59. dict: {row_idx: effective_columns} 每行的有效列数(考虑rowspan占用)
  60. """
  61. rows = soup.find_all("tr")
  62. if not rows:
  63. return {}
  64. occupied = {} # {row_idx: {col_idx: True}}
  65. row_effective_cols = {} # {row_idx: effective_columns}
  66. for row_idx, row in enumerate(rows):
  67. col_idx = 0
  68. cells = row.find_all(["td", "th"])
  69. if row_idx not in occupied:
  70. occupied[row_idx] = {}
  71. for cell in cells:
  72. # 找到下一个未被占用的列位置
  73. while col_idx in occupied[row_idx]:
  74. col_idx += 1
  75. colspan = int(cell.get("colspan", 1))
  76. rowspan = int(cell.get("rowspan", 1))
  77. # 标记被这个单元格占用的所有位置
  78. for r in range(row_idx, row_idx + rowspan):
  79. if r not in occupied:
  80. occupied[r] = {}
  81. for c in range(col_idx, col_idx + colspan):
  82. occupied[r][c] = True
  83. col_idx += colspan
  84. # 该行的有效列数为已占用的最大列索引+1
  85. if occupied[row_idx]:
  86. row_effective_cols[row_idx] = max(occupied[row_idx].keys()) + 1
  87. else:
  88. row_effective_cols[row_idx] = 0
  89. return row_effective_cols
  90. def calculate_row_effective_columns(soup, row_idx):
  91. """计算指定行的有效列数(考虑rowspan占用)
  92. Args:
  93. soup: BeautifulSoup解析的表格
  94. row_idx: 行索引
  95. Returns:
  96. int: 该行的有效列数
  97. """
  98. row_effective_cols = build_table_occupied_matrix(soup)
  99. return row_effective_cols.get(row_idx, 0)
  100. def calculate_row_columns(row):
  101. """
  102. 计算表格行的实际列数,考虑colspan属性
  103. Args:
  104. row: BeautifulSoup的tr元素对象
  105. Returns:
  106. int: 行的实际列数
  107. """
  108. cells = row.find_all(["td", "th"])
  109. column_count = 0
  110. for cell in cells:
  111. colspan = int(cell.get("colspan", 1))
  112. column_count += colspan
  113. return column_count
  114. def calculate_visual_columns(row):
  115. """
  116. 计算表格行的视觉列数(实际td/th单元格数量,不考虑colspan)
  117. Args:
  118. row: BeautifulSoup的tr元素对象
  119. Returns:
  120. int: 行的视觉列数(实际单元格数)
  121. """
  122. cells = row.find_all(["td", "th"])
  123. return len(cells)
  124. def detect_table_headers(soup1, soup2, max_header_rows=5):
  125. """
  126. 检测并比较两个表格的表头
  127. Args:
  128. soup1: 第一个表格的BeautifulSoup对象
  129. soup2: 第二个表格的BeautifulSoup对象
  130. max_header_rows: 最大可能的表头行数
  131. Returns:
  132. tuple: (表头行数, 表头是否一致, 表头文本列表)
  133. """
  134. rows1 = soup1.find_all("tr")
  135. rows2 = soup2.find_all("tr")
  136. # 构建两个表格的有效列数矩阵
  137. effective_cols1 = build_table_occupied_matrix(soup1)
  138. effective_cols2 = build_table_occupied_matrix(soup2)
  139. min_rows = min(len(rows1), len(rows2), max_header_rows)
  140. header_rows = 0
  141. headers_match = True
  142. header_texts = []
  143. for i in range(min_rows):
  144. # 提取当前行的所有单元格
  145. cells1 = rows1[i].find_all(["td", "th"])
  146. cells2 = rows2[i].find_all(["td", "th"])
  147. # 检查两行的结构和内容是否一致
  148. structure_match = True
  149. # 首先检查单元格数量
  150. if len(cells1) != len(cells2):
  151. structure_match = False
  152. else:
  153. # 检查有效列数是否一致(考虑rowspan影响)
  154. if effective_cols1.get(i, 0) != effective_cols2.get(i, 0):
  155. structure_match = False
  156. else:
  157. # 然后检查单元格的属性和内容
  158. for cell1, cell2 in zip(cells1, cells2):
  159. colspan1 = int(cell1.get("colspan", 1))
  160. rowspan1 = int(cell1.get("rowspan", 1))
  161. colspan2 = int(cell2.get("colspan", 1))
  162. rowspan2 = int(cell2.get("rowspan", 1))
  163. # 去除所有空白字符(包括空格、换行、制表符等)
  164. text1 = ''.join(full_to_half(cell1.get_text()).split())
  165. text2 = ''.join(full_to_half(cell2.get_text()).split())
  166. if colspan1 != colspan2 or rowspan1 != rowspan2 or text1 != text2:
  167. structure_match = False
  168. break
  169. if structure_match:
  170. header_rows += 1
  171. row_texts = [full_to_half(cell.get_text().strip()) for cell in cells1]
  172. header_texts.append(row_texts) # 添加表头文本
  173. else:
  174. headers_match = header_rows > 0 # 只有当至少匹配了一行时,才认为表头匹配
  175. break
  176. # 如果严格匹配失败,尝试视觉一致性匹配(只比较文本内容)
  177. if header_rows == 0:
  178. header_rows, headers_match, header_texts = _detect_table_headers_visual(soup1, soup2, rows1, rows2, max_header_rows)
  179. return header_rows, headers_match, header_texts
  180. def _detect_table_headers_visual(soup1, soup2, rows1, rows2, max_header_rows=5):
  181. """
  182. 基于视觉一致性检测表头(只比较文本内容,忽略colspan/rowspan差异)
  183. Args:
  184. soup1: 第一个表格的BeautifulSoup对象
  185. soup2: 第二个表格的BeautifulSoup对象
  186. rows1: 第一个表格的行列表
  187. rows2: 第二个表格的行列表
  188. max_header_rows: 最大可能的表头行数
  189. Returns:
  190. tuple: (表头行数, 表头是否一致, 表头文本列表)
  191. """
  192. # 构建两个表格的有效列数矩阵
  193. effective_cols1 = build_table_occupied_matrix(soup1)
  194. effective_cols2 = build_table_occupied_matrix(soup2)
  195. min_rows = min(len(rows1), len(rows2), max_header_rows)
  196. header_rows = 0
  197. headers_match = True
  198. header_texts = []
  199. for i in range(min_rows):
  200. cells1 = rows1[i].find_all(["td", "th"])
  201. cells2 = rows2[i].find_all(["td", "th"])
  202. # 提取每行的文本内容列表(去除空白字符)
  203. texts1 = [''.join(full_to_half(cell.get_text()).split()) for cell in cells1]
  204. texts2 = [''.join(full_to_half(cell.get_text()).split()) for cell in cells2]
  205. # 检查视觉一致性:文本内容完全相同,且有效列数一致
  206. effective_cols_match = effective_cols1.get(i, 0) == effective_cols2.get(i, 0)
  207. if texts1 == texts2 and effective_cols_match:
  208. header_rows += 1
  209. row_texts = [full_to_half(cell.get_text().strip()) for cell in cells1]
  210. header_texts.append(row_texts)
  211. else:
  212. headers_match = header_rows > 0
  213. break
  214. if header_rows == 0:
  215. headers_match = False
  216. return header_rows, headers_match, header_texts
  217. def can_merge_tables(current_table_block, previous_table_block):
  218. """判断两个表格是否可以合并"""
  219. # 检查表格是否有caption和footnote
  220. # 计算previous_table_block中的footnote数量
  221. footnote_count = sum(1 for block in previous_table_block["blocks"] if block["type"] == BlockType.TABLE_FOOTNOTE)
  222. # 如果有TABLE_CAPTION类型的块,检查是否至少有一个以"(续)"结尾
  223. caption_blocks = [block for block in current_table_block["blocks"] if block["type"] == BlockType.TABLE_CAPTION]
  224. if caption_blocks:
  225. # 检查是否至少有一个caption包含续表标识
  226. has_continuation_marker = False
  227. for block in caption_blocks:
  228. caption_text = full_to_half(merge_para_with_text(block).strip()).lower()
  229. if (
  230. any(caption_text.endswith(marker.lower()) for marker in CONTINUATION_END_MARKERS)
  231. or any(marker.lower() in caption_text for marker in CONTINUATION_INLINE_MARKERS)
  232. ):
  233. has_continuation_marker = True
  234. break
  235. # 如果所有caption都不包含续表标识,则不允许合并
  236. if not has_continuation_marker:
  237. return False, None, None, None, None
  238. # 如果current_table_block的caption存在续标识,放宽footnote的限制允许previous_table_block有最多一条footnote
  239. if footnote_count > 1:
  240. return False, None, None, None, None
  241. else:
  242. if footnote_count > 0:
  243. return False, None, None, None, None
  244. # 获取两个表格的HTML内容
  245. current_html = ""
  246. previous_html = ""
  247. for block in current_table_block["blocks"]:
  248. if (block["type"] == BlockType.TABLE_BODY and block["lines"] and block["lines"][0]["spans"]):
  249. current_html = block["lines"][0]["spans"][0].get("html", "")
  250. for block in previous_table_block["blocks"]:
  251. if (block["type"] == BlockType.TABLE_BODY and block["lines"] and block["lines"][0]["spans"]):
  252. previous_html = block["lines"][0]["spans"][0].get("html", "")
  253. if not current_html or not previous_html:
  254. return False, None, None, None, None
  255. # 检查表格宽度差异
  256. x0_t1, y0_t1, x1_t1, y1_t1 = current_table_block["bbox"]
  257. x0_t2, y0_t2, x1_t2, y1_t2 = previous_table_block["bbox"]
  258. table1_width = x1_t1 - x0_t1
  259. table2_width = x1_t2 - x0_t2
  260. if abs(table1_width - table2_width) / min(table1_width, table2_width) >= 0.1:
  261. return False, None, None, None, None
  262. # 解析HTML并检查表格结构
  263. soup1 = BeautifulSoup(previous_html, "html.parser")
  264. soup2 = BeautifulSoup(current_html, "html.parser")
  265. # 检查整体列数匹配
  266. table_cols1 = calculate_table_total_columns(soup1)
  267. table_cols2 = calculate_table_total_columns(soup2)
  268. # logger.debug(f"Table columns - Previous: {table_cols1}, Current: {table_cols2}")
  269. tables_match = table_cols1 == table_cols2
  270. # 检查首末行列数匹配
  271. rows_match = check_rows_match(soup1, soup2)
  272. return (tables_match or rows_match), soup1, soup2, current_html, previous_html
  273. def check_rows_match(soup1, soup2):
  274. """检查表格行是否匹配"""
  275. rows1 = soup1.find_all("tr")
  276. rows2 = soup2.find_all("tr")
  277. if not (rows1 and rows2):
  278. return False
  279. # 获取第一个表的最后一行数据行索引
  280. last_row_idx = None
  281. last_row = None
  282. for idx in range(len(rows1) - 1, -1, -1):
  283. if rows1[idx].find_all(["td", "th"]):
  284. last_row_idx = idx
  285. last_row = rows1[idx]
  286. break
  287. # 检测表头行数,以便获取第二个表的首个数据行
  288. header_count, _, _ = detect_table_headers(soup1, soup2)
  289. # 获取第二个表的首个数据行
  290. first_data_row_idx = None
  291. first_data_row = None
  292. if len(rows2) > header_count:
  293. first_data_row_idx = header_count
  294. first_data_row = rows2[header_count] # 第一个非表头行
  295. if not (last_row and first_data_row):
  296. return False
  297. # 计算有效列数(考虑rowspan和colspan)
  298. last_row_effective_cols = calculate_row_effective_columns(soup1, last_row_idx)
  299. first_row_effective_cols = calculate_row_effective_columns(soup2, first_data_row_idx)
  300. # 计算实际列数(仅考虑colspan)和视觉列数
  301. last_row_cols = calculate_row_columns(last_row)
  302. first_row_cols = calculate_row_columns(first_data_row)
  303. last_row_visual_cols = calculate_visual_columns(last_row)
  304. first_row_visual_cols = calculate_visual_columns(first_data_row)
  305. # logger.debug(f"行列数 - 前表最后一行: {last_row_cols}(有效列数:{last_row_effective_cols}, 视觉列数:{last_row_visual_cols}), 当前表首行: {first_row_cols}(有效列数:{first_row_effective_cols}, 视觉列数:{first_row_visual_cols})")
  306. # 同时考虑有效列数匹配、实际列数匹配和视觉列数匹配
  307. return (last_row_effective_cols == first_row_effective_cols or
  308. last_row_cols == first_row_cols or
  309. last_row_visual_cols == first_row_visual_cols)
  310. def check_row_columns_match(row1, row2):
  311. # 逐个cell检测colspan属性是否一致
  312. cells1 = row1.find_all(["td", "th"])
  313. cells2 = row2.find_all(["td", "th"])
  314. if len(cells1) != len(cells2):
  315. return False
  316. for cell1, cell2 in zip(cells1, cells2):
  317. colspan1 = int(cell1.get("colspan", 1))
  318. colspan2 = int(cell2.get("colspan", 1))
  319. if colspan1 != colspan2:
  320. return False
  321. return True
def adjust_table_rows_colspan(soup, rows, start_idx, end_idx,
                              reference_structure, reference_visual_cols,
                              target_cols, current_cols, reference_row):
    """Adjust the colspan attributes of table rows to reach the target column count.

    Rows in ``rows[start_idx:end_idx]`` that are narrower than ``target_cols``
    are modified in place: a row that has the same layout as the reference row
    receives the colspans of ``reference_structure``; any other row has its
    last cell widened by the number of missing columns.

    Args:
        soup: BeautifulSoup-parsed table (used to compute effective column counts).
        rows: List of table rows.
        start_idx: Index of the first row to process.
        end_idx: End index (exclusive).
        reference_structure: List of colspan values of the reference row.
        reference_visual_cols: Visual column count (number of cells) of the reference row.
        target_cols: Target total column count.
        current_cols: Current total column count (not read by this function).
        reference_row: Row whose per-cell colspan layout is compared against each row.
    """
    # Compare against a snapshot of the reference row.
    # NOTE(review): presumably needed because reference_row can itself be one
    # of the rows modified below - confirm against the callers.
    reference_row_copy = deepcopy(reference_row)

    # Effective (rowspan-aware) column count of every row
    effective_cols_matrix = build_table_occupied_matrix(soup)

    for i in range(start_idx, end_idx):
        row = rows[i]
        cells = row.find_all(["td", "th"])
        if not cells:
            continue

        # Use the effective column count (rowspan-aware) to decide whether an adjustment is needed
        current_row_effective_cols = effective_cols_matrix.get(i, 0)
        current_row_cols = calculate_row_columns(row)

        # Skip rows whose effective or actual column count already reaches the target
        if current_row_effective_cols >= target_cols or current_row_cols >= target_cols:
            continue

        # Check whether the row matches the structure of the reference row
        if calculate_visual_columns(row) == reference_visual_cols and check_row_columns_match(row, reference_row_copy):
            # Try to apply the reference structure
            if len(cells) <= len(reference_structure):
                for j, cell in enumerate(cells):
                    if j < len(reference_structure) and reference_structure[j] > 1:
                        cell["colspan"] = str(reference_structure[j])
        # NOTE(review): indentation was lost in the reviewed copy of this file;
        # this else is assumed to pair with the structure-match check above
        # (pairing it with the length check would make it unreachable) - confirm.
        else:
            # Widen the last cell to fill the column gap,
            # measured against the effective column count
            cols_diff = target_cols - current_row_effective_cols
            if cols_diff > 0:
                last_cell = cells[-1]
                current_last_span = int(last_cell.get("colspan", 1))
                last_cell["colspan"] = str(current_last_span + cols_diff)
  366. def perform_table_merge(soup1, soup2, previous_table_block, wait_merge_table_footnotes):
  367. """执行表格合并操作"""
  368. # 检测表头有几行,并确认表头内容是否一致
  369. header_count, headers_match, header_texts = detect_table_headers(soup1, soup2)
  370. # logger.debug(f"检测到表头行数: {header_count}, 表头匹配: {headers_match}")
  371. # logger.debug(f"表头内容: {header_texts}")
  372. # 找到第一个表格的tbody,如果没有则查找table元素
  373. tbody1 = soup1.find("tbody") or soup1.find("table")
  374. # 获取表1和表2的所有行
  375. rows1 = soup1.find_all("tr")
  376. rows2 = soup2.find_all("tr")
  377. if rows1 and rows2 and header_count < len(rows2):
  378. # 获取表1最后一行和表2第一个非表头行
  379. last_row1 = rows1[-1]
  380. first_data_row2 = rows2[header_count]
  381. # 计算表格总列数
  382. table_cols1 = calculate_table_total_columns(soup1)
  383. table_cols2 = calculate_table_total_columns(soup2)
  384. if table_cols1 >= table_cols2:
  385. reference_structure = [int(cell.get("colspan", 1)) for cell in last_row1.find_all(["td", "th"])]
  386. reference_visual_cols = calculate_visual_columns(last_row1)
  387. # 以表1的最后一行为参考,调整表2的行
  388. adjust_table_rows_colspan(
  389. soup2, rows2, header_count, len(rows2),
  390. reference_structure, reference_visual_cols,
  391. table_cols1, table_cols2, first_data_row2
  392. )
  393. else: # table_cols2 > table_cols1
  394. reference_structure = [int(cell.get("colspan", 1)) for cell in first_data_row2.find_all(["td", "th"])]
  395. reference_visual_cols = calculate_visual_columns(first_data_row2)
  396. # 以表2的第一个数据行为参考,调整表1的行
  397. adjust_table_rows_colspan(
  398. soup1, rows1, 0, len(rows1),
  399. reference_structure, reference_visual_cols,
  400. table_cols2, table_cols1, last_row1
  401. )
  402. # 将第二个表格的行添加到第一个表格中
  403. if tbody1:
  404. tbody2 = soup2.find("tbody") or soup2.find("table")
  405. if tbody2:
  406. # 将第二个表格的行添加到第一个表格中(跳过表头行)
  407. for row in rows2[header_count:]:
  408. row.extract()
  409. tbody1.append(row)
  410. # 清空previous_table_block的footnote
  411. previous_table_block["blocks"] = [
  412. block for block in previous_table_block["blocks"]
  413. if block["type"] != BlockType.TABLE_FOOTNOTE
  414. ]
  415. # 添加待合并表格的footnote到前一个表格中
  416. for table_footnote in wait_merge_table_footnotes:
  417. temp_table_footnote = table_footnote.copy()
  418. temp_table_footnote[SplitFlag.CROSS_PAGE] = True
  419. previous_table_block["blocks"].append(temp_table_footnote)
  420. return str(soup1)
  421. def merge_table(page_info_list):
  422. """合并跨页表格"""
  423. # 倒序遍历每一页
  424. for page_idx in range(len(page_info_list) - 1, -1, -1):
  425. # 跳过第一页,因为它没有前一页
  426. if page_idx == 0:
  427. continue
  428. page_info = page_info_list[page_idx]
  429. previous_page_info = page_info_list[page_idx - 1]
  430. # 检查当前页是否有表格块
  431. if not (page_info["para_blocks"] and page_info["para_blocks"][0]["type"] == BlockType.TABLE):
  432. continue
  433. current_table_block = page_info["para_blocks"][0]
  434. # 检查上一页是否有表格块
  435. if not (previous_page_info["para_blocks"] and previous_page_info["para_blocks"][-1]["type"] == BlockType.TABLE):
  436. continue
  437. previous_table_block = previous_page_info["para_blocks"][-1]
  438. # 收集待合并表格的footnote
  439. wait_merge_table_footnotes = [
  440. block for block in current_table_block["blocks"]
  441. if block["type"] == BlockType.TABLE_FOOTNOTE
  442. ]
  443. # 检查两个表格是否可以合并
  444. can_merge, soup1, soup2, current_html, previous_html = can_merge_tables(
  445. current_table_block, previous_table_block
  446. )
  447. if not can_merge:
  448. continue
  449. # 执行表格合并
  450. merged_html = perform_table_merge(
  451. soup1, soup2, previous_table_block, wait_merge_table_footnotes
  452. )
  453. # 更新previous_table_block的html
  454. for block in previous_table_block["blocks"]:
  455. if (block["type"] == BlockType.TABLE_BODY and block["lines"] and block["lines"][0]["spans"]):
  456. block["lines"][0]["spans"][0]["html"] = merged_html
  457. break
  458. # 删除当前页的table
  459. for block in current_table_block["blocks"]:
  460. block['lines'] = []
  461. block[SplitFlag.LINES_DELETED] = True