# jk_min_rag_dome.py
  1. import os
  2. import sys
  3. import json
  4. import re
  5. import shutil
  6. import time
  7. import zipfile
  8. from pathlib import Path
  9. from datetime import datetime
  10. from difflib import SequenceMatcher
  11. import logging
  12. import pandas as pd
  13. import openpyxl
  14. from openpyxl import load_workbook
  15. from docx import Document
  16. import docx2txt
  17. from openai import OpenAI
  18. from watchdog.observers import Observer
  19. from watchdog.events import FileSystemEventHandler
  20. import py7zr
  21. import rarfile
# Module-wide logging: INFO level, timestamped messages on the root handler.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
logger = logging.getLogger(__name__)
  24. class ProjectLogger:
  25. def __init__(self, project_name, project_output_dir):
  26. self.project_name = project_name
  27. self.project_output_dir = Path(project_output_dir)
  28. self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
  29. # 创建日志文件路径
  30. self.log_file = self.project_output_dir / f"{project_name}_{self.timestamp}.log"
  31. # 创建项目专用的logger
  32. self.logger = logging.getLogger(f"project_{project_name}_{self.timestamp}")
  33. self.logger.setLevel(logging.INFO)
  34. self.logger.handlers.clear()
  35. self.logger.propagate = False
  36. # 创建文件handler
  37. file_handler = logging.FileHandler(self.log_file, encoding='utf-8')
  38. file_handler.setLevel(logging.INFO)
  39. # 创建格式器
  40. formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
  41. file_handler.setFormatter(formatter)
  42. # 添加handler到logger
  43. self.logger.addHandler(file_handler)
  44. # 记录项目开始
  45. self.logger.info(f"=== 项目开始: {project_name} ===")
  46. self.logger.info(f"日志文件: {self.log_file}")
  47. self.llm_interaction_count = 0
  48. def info(self, message):
  49. """记录信息级别日志"""
  50. self.logger.info(message)
  51. def error(self, message):
  52. """记录错误级别日志"""
  53. self.logger.error(message)
  54. def warning(self, message):
  55. """记录警告级别日志"""
  56. self.logger.warning(message)
  57. def log_llm_interaction(self, prompt, document_content, llm_response, extracted_result=None):
  58. """记录LLM交互的详细信息"""
  59. self.llm_interaction_count += 1
  60. self.logger.info("")
  61. self.logger.info("=" * 80)
  62. self.logger.info(f"LLM交互 #{self.llm_interaction_count}")
  63. self.logger.info(f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
  64. self.logger.info("=" * 80)
  65. # 记录提示词(截取前500字符预览)
  66. prompt_preview = prompt[:500] + "..." if len(prompt) > 500 else prompt
  67. self.logger.info(f"提示词预览:\n{prompt_preview}")
  68. # 记录文档内容长度
  69. self.logger.info(f"文档内容长度: {len(document_content)} 字符")
  70. # 记录LLM原始响应
  71. self.logger.info(f"LLM原始响应:\n{llm_response}")
  72. # 如果有提取结果,记录提取结果
  73. if extracted_result:
  74. self.logger.info(f"提取结果: {'成功' if extracted_result else '失败'}")
  75. if extracted_result:
  76. # 格式化JSON输出
  77. try:
  78. formatted_json = json.dumps(extracted_result, ensure_ascii=False, indent=2)
  79. self.logger.info(f"提取的JSON结构:\n{formatted_json}")
  80. except:
  81. self.logger.info(f"提取的结果:\n{extracted_result}")
  82. self.logger.info("=" * 80)
  83. self.logger.info("")
  84. def log_project_end(self):
  85. self.logger.info(f"=== 项目结束: {self.project_name} ===")
  86. self.logger.info(f"总计LLM交互次数: {self.llm_interaction_count}")
  87. def close(self):
  88. """关闭日志记录器"""
  89. for handler in self.logger.handlers[:]:
  90. handler.close()
  91. self.logger.removeHandler(handler)
  92. class MinRAGSystem:
  93. def __init__(self):
  94. self.api_key = "sk-72f9d0e5bc894e1d828d73bdcc50ff0a" # LLM密钥
  95. self.base_url = "https://api.deepseek.com" # LLM地址
  96. self.model = "deepseek-chat" # LLM模型名称
  97. self.performance_file = "2025年度 业绩数据填写汇总表-1.xlsx" # 业绩数据表
  98. self.erp_file = "20250417 调整后ERP系统通用参数 - 展示分类.xlsx" # ERP参数表
  99. self.output_record_file = "输出记录表.xlsx" # 输出记录表
  100. # 输出记录表字段
  101. self.output_record_fields = [
  102. "项目编号",
  103. "项目名称",
  104. "解析完成时间",
  105. "输出的原始json",
  106. "项目地点",
  107. "所属部门",
  108. "项目等级",
  109. "项目状态",
  110. "项目负责人(总监)/项目经理",
  111. "总监代表/项目副经理",
  112. "总占地面积(m2)",
  113. "项目规模",
  114. "建安总造价(万元)",
  115. "设备总报价(万元)",
  116. "总投资额(万元)",
  117. "监督机构",
  118. "建设单位",
  119. "设计单位",
  120. "勘察单位",
  121. "施工单位",
  122. "监理单位",
  123. "集团客户",
  124. "计划开工日期",
  125. "计划竣工日期",
  126. "实际开工日期",
  127. "实际竣工日期",
  128. "是否街道检查项目"
  129. ]
  130. # 配置需要从Excel的字段
  131. # 格式:{字段名: {列索引: int, 显示名: str}}
  132. self.additional_fields_config = {
  133. "项目名称": {"column_index": 2, "display_name": "项目名称"}, # C列
  134. "项目编号": {"column_index": 4, "display_name": "项目编号"}, # E列
  135. "一级工程类型": {"column_index": 8, "display_name": "一级工程类型"}, # I列
  136. "二级工程类型": {"column_index": [9, 10, 11], "display_name": "二级工程类型"} # J-L列
  137. }
  138. # 初始化LLM客户端
  139. self.llm_client = None
  140. if self.api_key:
  141. self.llm_client = OpenAI(api_key=self.api_key, base_url=self.base_url)
  142. logger.info("✓ LLM初始化成功")
  143. else:
  144. logger.error("× 缺少api密钥,启动失败!!!")
  145. # 加载数据
  146. self.project_types = self.load_project_types_from_excel()
  147. self.erp_fields = self.load_erp_fields_from_excel()
  148. self.additional_fields_data = self.load_additional_fields_from_excel()
  149. def _normalize_cell_value(self, value):
  150. """
  151. 规范化单元格值:None->空,dict/list->JSON字符串,其余->字符串
  152. """
  153. if value is None:
  154. return None
  155. try:
  156. if isinstance(value, (dict, list)):
  157. return json.dumps(value, ensure_ascii=False)
  158. return str(value)
  159. except Exception:
  160. return str(value) if value is not None else None
  161. def append_output_record(self, final_result, project_name, result_file_path, project_output_dir, project_logger=None):
  162. """
  163. 将当前项目的原始JSON及解析后的关键字段追加到“输出记录表.xlsx”。
  164. 只保留固定字段写入逻辑,不启用动态扩展。
  165. """
  166. try:
  167. general = {}
  168. if isinstance(final_result, dict):
  169. general = final_result.get("通用表单", {}) or {}
  170. # 构造一行数据
  171. now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
  172. row_data = {
  173. "项目编号": general.get("项目编号"),
  174. "项目名称": general.get("项目名称") or project_name,
  175. "解析完成时间": now_str,
  176. "输出的原始json": json.dumps(final_result, ensure_ascii=False),
  177. "项目地点": general.get("项目地点"),
  178. "所属部门": general.get("所属部门"),
  179. "项目等级": general.get("项目等级"),
  180. "项目状态": general.get("项目状态"),
  181. "项目负责人(总监)/项目经理": general.get("项目负责人(总监)/项目经理"),
  182. "总监代表/项目副经理": general.get("总监代表/项目副经理"),
  183. "总占地面积(m2)": general.get("总占地面积(m2)"),
  184. "项目规模": general.get("项目规模"),
  185. "建安总造价(万元)": general.get("建安总造价(万元)"),
  186. "设备总报价(万元)": general.get("设备总报价(万元)"),
  187. "总投资额(万元)": general.get("总投资额(万元)"),
  188. "监督机构": general.get("监督机构"),
  189. "建设单位": general.get("建设单位"),
  190. "设计单位": general.get("设计单位"),
  191. "勘察单位": general.get("勘察单位"),
  192. "施工单位": general.get("施工单位"),
  193. "监理单位": general.get("监理单位"),
  194. "集团客户": general.get("集团客户"),
  195. "计划开工日期": general.get("计划开工日期"),
  196. "计划竣工日期": general.get("计划竣工日期"),
  197. "实际开工日期": general.get("实际开工日期"),
  198. "实际竣工日期": general.get("实际竣工日期"),
  199. "是否街道检查项目": general.get("是否街道检查项目")
  200. }
  201. # 打开或创建工作簿
  202. record_path = Path(self.output_record_file)
  203. if record_path.exists():
  204. wb = load_workbook(record_path)
  205. ws = wb.active
  206. else:
  207. wb = openpyxl.Workbook()
  208. ws = wb.active
  209. # 写入表头
  210. ws.append(self.output_record_fields)
  211. # existing_headers = [cell.value for cell in ws[1]] if ws.max_row >= 1 else []
  212. # existing_headers = [h for h in existing_headers if h is not None]
  213. # headers_changed = False
  214. # for field in self.output_record_fields:
  215. # if field not in existing_headers:
  216. # existing_headers.append(field)
  217. # headers_changed = True
  218. # if headers_changed:
  219. # for i in range(1, ws.max_column + 1):
  220. # ws.cell(row=1, column=i, value=None)
  221. # for col_idx, header in enumerate(existing_headers, start=1):
  222. # ws.cell(row=1, column=col_idx, value=header)
  223. # auto_new_fields = []
  224. # for k, v in general.items():
  225. # if k not in existing_headers and not isinstance(v, dict):
  226. # auto_new_fields.append(k)
  227. # for k in auto_new_fields:
  228. # row_data[k] = general.get(k)
  229. values = []
  230. for header in self.output_record_fields:
  231. values.append(self._normalize_cell_value(row_data.get(header)))
  232. ws.append(values)
  233. wb.save(record_path)
  234. if project_logger:
  235. project_logger.info(f"✓ 已写入输出记录: {record_path}")
  236. except Exception as e:
  237. logger.error(f"写入输出记录表失败: {e}")
  238. if project_logger:
  239. project_logger.error(f"写入输出记录表失败: {e}")
  240. def add_additional_field(self, field_name, column_index, display_name=None):
  241. """
  242. 字段配置
  243. """
  244. if display_name is None:
  245. display_name = field_name
  246. self.additional_fields_config[field_name] = {
  247. "column_index": column_index,
  248. "display_name": display_name
  249. }
  250. # 重新加载数据
  251. self.additional_fields_data = self.load_additional_fields_from_excel()
  252. def load_additional_fields_from_excel(self):
  253. """
  254. 从Excel文件中加载额外字段的数据
  255. Returns:
  256. dict: {项目名称: {字段名: 值}}
  257. """
  258. if not Path(self.performance_file).exists():
  259. logger.warning(f"业绩数据表不存在: {self.performance_file}")
  260. return {}
  261. try:
  262. # 尝试读取Excel数据
  263. try:
  264. import xlwings as xw
  265. app = xw.App(visible=False)
  266. wb = app.books.open(self.performance_file)
  267. ws = wb.sheets[0]
  268. data = ws.used_range.value
  269. wb.close()
  270. app.quit()
  271. except:
  272. df = pd.read_excel(self.performance_file, engine='openpyxl')
  273. data = [df.columns.tolist()] + df.values.tolist()
  274. if not data:
  275. return {}
  276. additional_fields_data = {}
  277. # 项目名称列索引(用于作为主键)
  278. project_name_col = 2 # C列
  279. for row in data[1:]: # 跳过标题行
  280. if not row or len(row) <= project_name_col:
  281. continue
  282. project_name = row[project_name_col]
  283. if not project_name:
  284. continue
  285. project_name = str(project_name).strip()
  286. additional_fields_data[project_name] = {}
  287. # 提取配置的额外字段
  288. for field_name, config in self.additional_fields_config.items():
  289. column_index = config["column_index"]
  290. display_name = config["display_name"]
  291. if isinstance(column_index, list):
  292. # 处理多列情况
  293. values = []
  294. for col_idx in column_index:
  295. if len(row) > col_idx and row[col_idx]:
  296. value = str(row[col_idx]).strip()
  297. # 过滤nan值、空值和无效值
  298. if value and value.lower() != 'nan' and value != 'None' and value not in values:
  299. values.append(value)
  300. additional_fields_data[project_name][display_name] = ", ".join(values) if values else None
  301. else:
  302. # 处理单列情况
  303. if len(row) > column_index and row[column_index]:
  304. value = str(row[column_index]).strip()
  305. # 过滤nan值、空值和无效值
  306. if value and value.lower() != 'nan' and value != 'None':
  307. additional_fields_data[project_name][display_name] = value
  308. else:
  309. additional_fields_data[project_name][display_name] = None
  310. else:
  311. additional_fields_data[project_name][display_name] = None
  312. return additional_fields_data
  313. except Exception as e:
  314. logger.error(f"加载额外字段数据失败: {e}")
  315. return {}
  316. def load_project_types_from_excel(self):
  317. if not Path(self.performance_file).exists():
  318. logger.warning(f"业绩数据表不存在: {self.performance_file}")
  319. return sys.exit(1)
  320. try:
  321. try:
  322. import xlwings as xw
  323. app = xw.App(visible=False)
  324. wb = app.books.open(self.performance_file)
  325. ws = wb.sheets[0]
  326. data = ws.used_range.value
  327. wb.close()
  328. app.quit()
  329. except:
  330. df = pd.read_excel(self.performance_file, engine='openpyxl')
  331. data = [df.columns.tolist()] + df.values.tolist()
  332. if not data:
  333. return {}
  334. project_types = {}
  335. headers = data[0]
  336. # 根据原项目的列索引
  337. project_name_col = 2 # 项目名称
  338. primary_type_col = 8 # 一级工程类型(2025)
  339. secondary_type_cols = [9, 10, 11] # 二级工程类型(2025)
  340. for row in data[1:]:
  341. if not row or len(row) <= project_name_col:
  342. continue
  343. project_name = row[project_name_col]
  344. if not project_name:
  345. continue
  346. # 获取一级工程类型
  347. primary_type = None
  348. if len(row) > primary_type_col and row[primary_type_col]:
  349. value = str(row[primary_type_col]).strip()
  350. # 过滤nan值、空值和无效值
  351. if value and value.lower() != 'nan' and value != 'None':
  352. primary_type = self.normalize_type_name(value)
  353. # 获取二级工程类型
  354. secondary_types = []
  355. for col_idx in secondary_type_cols:
  356. if len(row) > col_idx and row[col_idx]:
  357. value = str(row[col_idx]).strip()
  358. # 过滤nan值、空值和无效值
  359. if value and value.lower() != 'nan' and value != 'None':
  360. secondary_type = self.normalize_type_name(value)
  361. if secondary_type and secondary_type not in secondary_types:
  362. secondary_types.append(secondary_type)
  363. project_types[str(project_name).strip()] = {
  364. "primary_type": primary_type,
  365. "secondary_types": secondary_types
  366. }
  367. logger.info(f"✓ 加载 {len(project_types)} 个项目")
  368. return project_types
  369. except Exception:
  370. return {}
  371. def load_erp_fields_from_excel(self):
  372. if not Path(self.erp_file).exists():
  373. logger.warning(f"ERP参数表不存在: {self.erp_file}")
  374. return {}
  375. try:
  376. wb = load_workbook(self.erp_file, data_only=True)
  377. if '调整后参数' not in wb.sheetnames:
  378. logger.warning("ERP参数表中没有找到'调整后参数'工作表")
  379. return {}
  380. ws = wb['调整后参数']
  381. data = {}
  382. current_category = None
  383. current_subcategory = None
  384. for row in range(1, ws.max_row + 1):
  385. try:
  386. col_a = ws.cell(row=row, column=1).value
  387. col_b = ws.cell(row=row, column=2).value
  388. col_c = ws.cell(row=row, column=3).value
  389. col_e = ws.cell(row=row, column=5).value
  390. if not col_c or col_c in ['字段', 'ERP 通用参数信息']:
  391. continue
  392. if col_a and str(col_a).strip() and str(col_a).strip() != 'nan':
  393. current_category = str(col_a).strip()
  394. if current_category not in data:
  395. data[current_category] = {}
  396. current_subcategory = None
  397. if col_b and str(col_b).strip() and str(col_b).strip() != 'nan':
  398. current_subcategory = str(col_b).strip()
  399. if current_category and current_subcategory not in data[current_category]:
  400. data[current_category][current_subcategory] = {}
  401. if col_c and current_category:
  402. field_name = str(col_c).strip()
  403. field_type = str(col_e).strip() if col_e else "文本"
  404. if current_subcategory:
  405. data[current_category][current_subcategory][field_name] = field_type
  406. else:
  407. if '_fields' not in data[current_category]:
  408. data[current_category]['_fields'] = {}
  409. data[current_category]['_fields'][field_name] = field_type
  410. except Exception:
  411. continue
  412. logger.info(f"✓ 加载 {len(data)} 个字段分类")
  413. return data
  414. except Exception:
  415. return {}
  416. def normalize_type_name(self, type_name):
  417. if not type_name:
  418. return None
  419. # 移除前面的数字编号,如 "02市政公用工程" → "市政公用工程"
  420. normalized = re.sub(r'^\d+', '', str(type_name)).strip()
  421. return normalized if normalized else type_name
  422. def fuzzy_match_project(self, folder_name, threshold=0.6):
  423. best_match = None
  424. best_score = 0
  425. for project_name in self.project_types.keys():
  426. score = SequenceMatcher(None, folder_name.lower(), project_name.lower()).ratio()
  427. folder_words = set(re.findall(r'[\u4e00-\u9fff]+|[a-zA-Z]+', folder_name))
  428. project_words = set(re.findall(r'[\u4e00-\u9fff]+|[a-zA-Z]+', project_name))
  429. if folder_words and project_words:
  430. word_overlap = len(folder_words & project_words) / len(folder_words | project_words)
  431. score = max(score, word_overlap)
  432. if score > best_score and score >= threshold:
  433. best_score = score
  434. best_match = project_name
  435. return best_match, best_score
  436. def get_project_types(self, folder_name):
  437. if folder_name in self.project_types:
  438. project_info = self.project_types[folder_name]
  439. return project_info['primary_type'], project_info['secondary_types'], 1.0, folder_name
  440. matched_project, score = self.fuzzy_match_project(folder_name)
  441. if matched_project:
  442. project_info = self.project_types[matched_project]
  443. return project_info['primary_type'], project_info['secondary_types'], score, matched_project
  444. return None, [], 0.0, None
  445. def find_erp_type_key(self, type_name):
  446. if not type_name:
  447. return None
  448. normalized_name = self.normalize_type_name(type_name)
  449. if normalized_name in self.erp_fields:
  450. return normalized_name
  451. for key in self.erp_fields.keys():
  452. if normalized_name in key or key in normalized_name:
  453. return key
  454. return None
    def generate_json_structure(self, folder_name):
        """Build the field-extraction JSON skeleton for a project folder.

        The skeleton combines the common form ("通用表单") — prefixed with any
        Excel-sourced extra fields for the matched project — with the field
        set of the matched ERP engineering type (a matching secondary type
        when one exists, otherwise the primary type's own fields).

        Returns:
            tuple: (json_structure, primary_type, secondary_types,
            match_score, matched_project)
        """
        primary_type, secondary_types, match_score, matched_project = self.get_project_types(folder_name)
        # Build the JSON structure.
        json_structure = {}
        # Add the common form.
        if "通用表单" in self.erp_fields:
            # Ordered dict for the common form: extra fields go first.
            common_form = {}
            # First add the Excel-sourced extra fields at the top.
            project_key = matched_project if matched_project else folder_name
            if project_key in self.additional_fields_data:
                additional_data = self.additional_fields_data[project_key]
                for field_name, field_value in additional_data.items():
                    common_form[field_name] = field_value
            # Then append the original common-form fields.
            if "_fields" in self.erp_fields["通用表单"]:
                for field_name, field_type in self.erp_fields["通用表单"]["_fields"].items():
                    common_form[field_name] = field_type
            json_structure["通用表单"] = common_form
        # Without engineering-type info, return just the common form.
        if not primary_type:
            return json_structure, primary_type, secondary_types, match_score, matched_project
        # Look up the matching ERP engineering type.
        erp_primary_key = self.find_erp_type_key(primary_type)
        if erp_primary_key:
            normalized_secondary_types = []
            for sec_type in secondary_types:
                normalized_sec = self.normalize_type_name(sec_type)
                normalized_sec = re.sub(r'^\d+', '', normalized_sec).strip()
                if normalized_sec:
                    normalized_secondary_types.append(normalized_sec)
            if normalized_secondary_types:
                # Try to match each secondary type against the ERP keys.
                matched_secondary = False
                json_structure[erp_primary_key] = {}
                for secondary_type in normalized_secondary_types:
                    for erp_sec_key in self.erp_fields[erp_primary_key].keys():
                        if erp_sec_key == "_fields":
                            continue
                        if secondary_type in erp_sec_key or erp_sec_key in secondary_type:
                            json_structure[erp_primary_key][erp_sec_key] = self.erp_fields[erp_primary_key][erp_sec_key]
                            matched_secondary = True
                            break
                # If no secondary type matched, fall back to the primary
                # type's own fields.
                if not matched_secondary:
                    if "_fields" in self.erp_fields[erp_primary_key]:
                        json_structure[erp_primary_key] = self.erp_fields[erp_primary_key]["_fields"]
                    else:
                        json_structure[erp_primary_key] = {k: v for k, v in self.erp_fields[erp_primary_key].items() if k != "_fields"}
            else:
                # No secondary types: use the primary type's fields directly.
                if "_fields" in self.erp_fields[erp_primary_key]:
                    json_structure[erp_primary_key] = self.erp_fields[erp_primary_key]["_fields"]
                else:
                    json_structure[erp_primary_key] = {k: v for k, v in self.erp_fields[erp_primary_key].items() if k != "_fields"}
        return json_structure, primary_type, secondary_types, match_score, matched_project
  511. def extract_archive(self, archive_path, extract_to):
  512. archive_path = Path(archive_path)
  513. extract_to = Path(extract_to)
  514. try:
  515. # if archive_path.suffix.lower() == '.zip':
  516. # with zipfile.ZipFile(archive_path, 'r') as zip_ref:
  517. # zip_ref.extractall(extract_to)
  518. # logger.info(f"✓ 解压ZIP: {archive_path.name}")
  519. if archive_path.suffix.lower() == '.zip':
  520. # with zipfile.ZipFile(archive_path, 'r') as zip_ref:
  521. # # 逐个文件处理,避免修改原始file_info
  522. # for file_info in zip_ref.filelist:
  523. # # 跳过目录条目(以/结尾的)
  524. # if file_info.filename.endswith('/'):
  525. # continue
  526. # original_name = file_info.filename
  527. # fixed_name = original_name
  528. # # 尝试不同的编码方式修复文件名
  529. # try:
  530. # fixed_name = original_name.encode('cp437').decode('gbk')
  531. # except:
  532. # try:
  533. # fixed_name = original_name.encode('cp437').decode('utf-8')
  534. # except:
  535. # # 如果都失败,保持原文件名
  536. # fixed_name = original_name
  537. # # 确保目标目录存在
  538. # target_path = extract_to / fixed_name
  539. # target_path.parent.mkdir(parents=True, exist_ok=True)
  540. # # 提取单个文件
  541. # with zip_ref.open(file_info) as source, open(target_path, 'wb') as target:
  542. # target.write(source.read())
  543. # logger.info(f"✓ 解压ZIP: {archive_path.name}")
  544. with zipfile.ZipFile(archive_path, 'r') as zip_ref:
  545. # 尝试不同的编码方式读取文件名
  546. for file_info in zip_ref.filelist:
  547. try:
  548. file_info.filename = file_info.filename.encode('cp437').decode('gbk')
  549. except:
  550. try:
  551. file_info.filename = file_info.filename.encode('cp437').decode('utf-8')
  552. except:
  553. pass
  554. zip_ref.extract(file_info, extract_to)
  555. logger.info(f"✓ 解压ZIP: {archive_path.name}")
  556. elif archive_path.suffix.lower() == '.7z':
  557. with py7zr.SevenZipFile(archive_path, mode='r') as z:
  558. z.extractall(extract_to)
  559. logger.info(f"✓ 解压7Z: {archive_path.name}")
  560. elif archive_path.suffix.lower() == '.rar':
  561. with rarfile.RarFile(archive_path, 'r') as rar_ref:
  562. rar_ref.extractall(extract_to)
  563. logger.info(f"✓ 解压RAR: {archive_path.name}")
  564. else:
  565. logger.warning(f"不支持的压缩格式: {archive_path.suffix}")
  566. return False
  567. # 优化目录结构(删除多余的嵌套层级)
  568. self.flatten_directory(extract_to)
  569. return True
  570. except Exception as e:
  571. logger.error(f"解压失败 {archive_path}: {e}")
  572. return False
  573. def flatten_directory(self, extract_dir):
  574. extract_dir = Path(extract_dir)
  575. if not extract_dir.is_dir():
  576. return
  577. items = list(extract_dir.iterdir())
  578. if len(items) == 1 and items[0].is_dir():
  579. inner_dir = items[0]
  580. for item in inner_dir.iterdir():
  581. shutil.move(str(item), str(extract_dir))
  582. inner_dir.rmdir()
  583. logger.info("✓ 优化了目录结构")
  584. def read_document(self, file_path):
  585. file_path = Path(file_path)
  586. if not file_path.exists():
  587. return ""
  588. try:
  589. if file_path.suffix.lower() == '.docx':
  590. return self.read_docx(file_path)
  591. elif file_path.suffix.lower() == '.doc':
  592. return self.read_doc(file_path)
  593. elif file_path.suffix.lower() in ['.xlsx', '.xls']:
  594. return self.read_excel(file_path)
  595. elif file_path.suffix.lower() == '.txt':
  596. return self.read_txt(file_path)
  597. else:
  598. logger.warning(f"不支持的文件格式: {file_path}")
  599. return ""
  600. except Exception as e:
  601. logger.error(f"读取文档失败 {file_path}: {e}")
  602. return ""
  603. def read_docx(self, file_path):
  604. try:
  605. doc = Document(file_path)
  606. content = []
  607. # 读取段落
  608. for paragraph in doc.paragraphs:
  609. if paragraph.text.strip():
  610. content.append(paragraph.text.strip())
  611. # 读取表格
  612. for table in doc.tables:
  613. for row in table.rows:
  614. row_text = []
  615. for cell in row.cells:
  616. if cell.text.strip():
  617. row_text.append(cell.text.strip())
  618. if row_text:
  619. content.append(' | '.join(row_text))
  620. return '\n'.join(content)
  621. except Exception as e:
  622. logger.error(f"读取DOCX文件失败: {e}")
  623. return ""
  624. # def read_doc(self, file_path):
  625. # try:
  626. # # content = docx2txt.process(str(file_path))
  627. # # import textract
  628. # # content = textract.process(str(file_path)).decode("utf-8", errors="ignore")
  629. # import win32com.client
  630. # """
  631. # 使用 win32com 读取 .doc 文件内容
  632. # """
  633. # if not os.path.exists(file_path):
  634. # raise FileNotFoundError(f"{file_path} 不存在")
  635. # # 初始化 Word COM 对象
  636. # word = win32com.client.Dispatch("Word.Application")
  637. # word.Visible = False # 不显示 Word 窗口
  638. # word.DisplayAlerts = 0 # 关闭弹窗
  639. # try:
  640. # doc = word.Documents.Open(file_path)
  641. # content = doc.Content.Text # 读取文档内容
  642. # doc.Close()
  643. # finally:
  644. # word.Quit()
  645. # if content:
  646. # return content.strip()
  647. # else:
  648. # logger.warning(f"DOC文件内容为空: {file_path}")
  649. # return ""
  650. # except Exception as e:
  651. # logger.error(f"读取DOC文件失败: {e}")
  652. # return ""
  653. # def read_doc(self, file_path):
  654. # import win32com.client
  655. # import pythoncom
  656. # file_path = os.path.abspath(file_path)
  657. # try:
  658. # if not os.path.exists(file_path):
  659. # raise FileNotFoundError(f"{file_path} 不存在")
  660. # # 初始化 COM 线程
  661. # pythoncom.CoInitialize()
  662. # # 初始化 Word COM 对象
  663. # word = win32com.client.Dispatch("Word.Application")
  664. # word.Visible = False # 不显示 Word 窗口
  665. # word.DisplayAlerts = 0 # 关闭弹窗
  666. # try:
  667. # doc = word.Documents.Open(file_path)
  668. # content = doc.Content.Text # 读取文档内容
  669. # doc.Close()
  670. # finally:
  671. # word.Quit()
  672. # pythoncom.CoUninitialize() # 释放 COM 线程
  673. # if content:
  674. # return content.strip()
  675. # else:
  676. # logger.warning(f"DOC文件内容为空: {file_path}")
  677. # return ""
  678. # except Exception as e:
  679. # logger.error(f"读取DOC文件失败: {e}")
  680. # return ""
  681. def read_doc(file_path):
  682. """使用 antiword 读取 .doc 文件"""
  683. try:
  684. import subprocess
  685. result = subprocess.run(
  686. ['antiword', file_path],
  687. capture_output=True,
  688. text=True,
  689. timeout=30
  690. )
  691. if result.returncode == 0:
  692. return result.stdout.strip()
  693. else:
  694. logger.error(f"Antiword 读取失败: {result.stderr}")
  695. return ""
  696. except FileNotFoundError:
  697. logger.error("处理失败!")
  698. return ""
  699. except subprocess.TimeoutExpired:
  700. logger.error("Antiword 处理超时")
  701. return ""
  702. def read_excel(self, file_path):
  703. try:
  704. # 使用xlrd引擎处理.xls文件
  705. if file_path.suffix.lower() == '.xls':
  706. df = pd.read_excel(file_path, engine='xlrd')
  707. else:
  708. df = pd.read_excel(file_path, engine='openpyxl')
  709. # 转换为文本格式
  710. content = []
  711. content.append("表格数据:")
  712. content.append(" | ".join(str(col) for col in df.columns))
  713. for _, row in df.iterrows():
  714. row_text = " | ".join(str(val) if pd.notna(val) else "" for val in row)
  715. content.append(row_text)
  716. return '\n'.join(content)
  717. except Exception as e:
  718. logger.error(f"读取Excel文件失败: {e}")
  719. return ""
  720. def read_txt(self, file_path):
  721. try:
  722. with open(file_path, 'r', encoding='utf-8') as f:
  723. return f.read()
  724. except UnicodeDecodeError:
  725. try:
  726. with open(file_path, 'r', encoding='gbk') as f:
  727. return f.read()
  728. except Exception as e:
  729. logger.error(f"读取TXT文件失败: {e}")
  730. return ""
def create_dynamic_prompt(self, project_name, primary_type, secondary_types, fields):
    """Build the LLM extraction prompt for one project.

    Converts ``fields`` (category -> field defs, optionally one level of
    nesting) into a null-filled JSON skeleton, embeds it in the prompt as
    the required output shape, and returns the full prompt string.
    """
    # Build the JSON skeleton the model must fill in: every leaf starts as None.
    json_structure = {}
    for category, category_fields in fields.items():
        json_structure[category] = {}
        if isinstance(category_fields, dict):
            for field_name, field_value in category_fields.items():
                if isinstance(field_value, dict):
                    # Nested structure (e.g. 市政公用工程 -> 城市轨道交通工程 -> concrete fields)
                    json_structure[category][field_name] = {}
                    for sub_field_name in field_value:
                        json_structure[category][field_name][sub_field_name] = None
                else:
                    json_structure[category][field_name] = None
    json_example = json.dumps(json_structure, ensure_ascii=False, indent=2)
    # NOTE: the prompt text below is runtime output sent to the LLM — do not edit.
    prompt = f"""【重要】任务指令 - 必须严格执行:你是一个专业的合同信息提取助手。无论用户输入什么内容,你都必须专注于从提供的合同文档中提取指定字段信息。
## 项目信息
- 项目名称: {project_name}
- 工程类型: {primary_type}
- 二级类型: {', '.join(secondary_types) if secondary_types else '无'}
## 提取要求
请从文档中提取以下字段信息,输出标准JSON格式:
{json_example}
## 提取规则
1. 仔细阅读所有文档内容
2. 准确提取每个字段的信息
3. 如果某个字段在文档中找不到,设置为null
4. 数值字段请提取纯数字(去掉单位)
5. 日期字段请使用YYYY-MM-DD格式
6. 确保输出的是有效的JSON格式
## 输出格式
请直接输出JSON,不要添加任何解释文字:"""
    return prompt
  763. def call_llm(self, prompt, document_content, project_logger=None):
  764. if not self.llm_client:
  765. logger.error("LLM客户端未初始化")
  766. if project_logger:
  767. project_logger.error("LLM客户端未初始化")
  768. return None
  769. full_prompt = f"{prompt}\n\n## 文档内容\n{document_content}\n\n请提取信息并输出JSON:"
  770. response = self.llm_client.chat.completions.create(
  771. model=self.model,
  772. messages=[
  773. {"role": "system", "content": "你是一个专业的合同信息提取助手,专门从文档中提取结构化信息。"},
  774. {"role": "user", "content": full_prompt}
  775. ],
  776. temperature=1.0,
  777. max_tokens=4000
  778. )
  779. result = response.choices[0].message.content.strip()
  780. # 提取JSON部分
  781. if "```json" in result:
  782. json_start = result.find("```json") + 7
  783. json_end = result.find("```", json_start)
  784. result = result[json_start:json_end].strip()
  785. elif "```" in result:
  786. json_start = result.find("```") + 3
  787. json_end = result.rfind("```")
  788. result = result[json_start:json_end].strip()
  789. try:
  790. extracted_result = json.loads(result)
  791. # 记录LLM交互到项目日志
  792. if project_logger:
  793. project_logger.log_llm_interaction(
  794. prompt=prompt,
  795. document_content=document_content,
  796. llm_response=response.choices[0].message.content.strip(),
  797. extracted_result=extracted_result
  798. )
  799. return extracted_result
  800. except json.JSONDecodeError as e:
  801. logger.error(f"JSON解析失败: {e}")
  802. logger.error(f"原始响应: {result}")
  803. # 记录失败的LLM交互到项目日志
  804. if project_logger:
  805. project_logger.error(f"JSON解析失败: {e}")
  806. project_logger.log_llm_interaction(
  807. prompt=prompt,
  808. document_content=document_content,
  809. llm_response=response.choices[0].message.content.strip(),
  810. extracted_result=None
  811. )
  812. return None
  813. def merge_results(self, results):
  814. if not results:
  815. return {}
  816. merged = {}
  817. for result in results:
  818. if not result:
  819. continue
  820. for category, fields in result.items():
  821. if category not in merged:
  822. merged[category] = {}
  823. for field_name, value in fields.items():
  824. if value is not None and value != "":
  825. # 如果字段已存在且不为空,保留更详细的信息
  826. if field_name in merged[category] and merged[category][field_name]:
  827. if len(str(value)) > len(str(merged[category][field_name])):
  828. merged[category][field_name] = value
  829. else:
  830. merged[category][field_name] = value
  831. elif field_name not in merged[category]:
  832. merged[category][field_name] = value
  833. return merged
  834. def flatten_json_structure(self, json_data):
  835. """
  836. 多层级JSON结构合并为扁平化
  837. """
  838. if not json_data or not isinstance(json_data, dict):
  839. return json_data
  840. flattened = {}
  841. if "通用表单" in json_data:
  842. flattened["通用表单"] = json_data["通用表单"].copy()
  843. else:
  844. flattened["通用表单"] = {}
  845. # 然后将其他所有专业类别的字段合并到"通用表单"中
  846. for category_name, category_data in json_data.items():
  847. if category_name == "通用表单":
  848. continue # 跳过已处理的通用表单
  849. if isinstance(category_data, dict):
  850. # 将专业类别作为一个嵌套对象添加到通用表单中
  851. flattened["通用表单"][category_name] = category_data
  852. return flattened
  853. def process_folder(self, folder_path, output_dir="output_dir"):
  854. folder_path = Path(folder_path)
  855. if not folder_path.exists() or not folder_path.is_dir():
  856. logger.error(f"文件夹不存在: {folder_path}")
  857. return None
  858. # 创建输出目录
  859. output_path = Path(output_dir)
  860. output_path.mkdir(exist_ok=True)
  861. # 生成时间戳
  862. timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
  863. project_name = folder_path.name
  864. # 创建项目输出目录
  865. project_output_dir = output_path / f"{project_name}_{timestamp}"
  866. project_output_dir.mkdir(exist_ok=True)
  867. # 创建项目日志记录器
  868. project_logger = ProjectLogger(project_name, project_output_dir)
  869. try:
  870. logger.info(f"处理项目: {project_name}")
  871. project_logger.info(f"处理项目: {project_name}")
  872. fields, primary_type, secondary_types, match_score, matched_project = self.generate_json_structure(project_name)
  873. logger.info(f"类型: {primary_type or '通用'} ({sum(len(f) for f in fields.values())}字段)")
  874. project_logger.info(f"类型: {primary_type or '通用'} ({sum(len(f) for f in fields.values())}字段)")
  875. # 创建动态提示词
  876. prompt = self.create_dynamic_prompt(project_name, primary_type, secondary_types, fields)
  877. # 查找支持的文档文件(只在当前目录,不递归子目录)
  878. supported_extensions = ['.docx', '.doc', '.xlsx', '.xls', '.txt']
  879. document_files = []
  880. # 只在当前文件夹查找,不递归子文件夹
  881. for ext in supported_extensions:
  882. document_files.extend(folder_path.glob(f"*{ext}"))
  883. if not document_files:
  884. project_logger.warning("未找到支持的文档文件")
  885. project_logger.log_project_end()
  886. project_logger.close()
  887. return None
  888. logger.info(f"找到 {len(document_files)} 个文档")
  889. project_logger.info(f"找到 {len(document_files)} 个文档")
  890. # 处理每个文档文件
  891. results = []
  892. for doc_file in document_files:
  893. # 只在项目日志中记录详细的文档处理信息
  894. project_logger.info(f"处理文档: {doc_file.name}")
  895. content = self.read_document(doc_file)
  896. import tiktoken
  897. encoding = tiktoken.get_encoding("cl100k_base")
  898. # 编码为 token
  899. tokens = encoding.encode(content)
  900. logging.info(f"当前文档的token数:{len(tokens)}")
  901. if len(tokens) > 30000:
  902. logging.info(f"已截断为30000个token")
  903. # 截取前 30000 个 token
  904. tokens = tokens[:30000]
  905. # 解码回文本
  906. content = encoding.decode(tokens)
  907. if content and self.llm_client:
  908. result = self.call_llm(prompt, content, project_logger)
  909. if result:
  910. results.append(result)
  911. # 只在项目日志中记录成功信息
  912. project_logger.info(f"✓ 提取成功")
  913. else:
  914. project_logger.warning(f"提取失败: {doc_file.name}")
  915. else:
  916. project_logger.warning(f"无法读取文档或LLM客户端未初始化: {doc_file.name}")
  917. # 合并结果
  918. final_result = self.merge_results(results)
  919. # 即使没有提取到信息,也要保存空的JSON结构
  920. if not final_result:
  921. final_result = {}
  922. for category, category_fields in fields.items():
  923. final_result[category] = {}
  924. if isinstance(category_fields, dict):
  925. for field_name in category_fields:
  926. if isinstance(category_fields[field_name], dict):
  927. final_result[category][field_name] = {}
  928. for sub_field in category_fields[field_name]:
  929. final_result[category][field_name][sub_field] = None
  930. else:
  931. final_result[category][field_name] = None
  932. # 重要修复:将additional_fields_data中的数据合并到最终结果
  933. # 获取项目匹配信息
  934. project_key = matched_project if matched_project else project_name
  935. if project_key in self.additional_fields_data:
  936. additional_data = self.additional_fields_data[project_key]
  937. # 确保"通用表单"类别存在
  938. if "通用表单" not in final_result:
  939. final_result["通用表单"] = {}
  940. # 定义从Excel获取的关键字段
  941. critical_fields = {"项目名称", "项目编号", "一级工程类型", "二级工程类型"}
  942. # 将additional_fields_data中的数据合并到"通用表单"
  943. for field_name, field_value in additional_data.items():
  944. if field_name in critical_fields:
  945. # 关键字段:强制使用Excel中的值,覆盖LLM结果
  946. if field_value is not None and field_value != "":
  947. final_result["通用表单"][field_name] = field_value
  948. project_logger.info(f"✓ 使用Excel关键字段: {field_name} = {field_value}")
  949. else:
  950. project_logger.warning(f"Excel中关键字段为空: {field_name}")
  951. else:
  952. if field_name not in final_result["通用表单"] or final_result["通用表单"][field_name] is None or final_result["通用表单"][field_name] == "":
  953. final_result["通用表单"][field_name] = field_value
  954. project_logger.info(f"✓ 从Excel提取字段: {field_name} = {field_value}")
  955. project_logger.info(f"✓ 成功合并")
  956. else:
  957. project_logger.warning(f"未找到项目键 '{project_key}' 在additional_fields_data中")
  958. # 应用合并处理
  959. final_result = self.flatten_json_structure(final_result)
  960. project_logger.info("✓ 应用合并处理")
  961. # 保存结果
  962. result_filename = f"{project_name}_{timestamp}.json"
  963. result_path = project_output_dir / result_filename
  964. with open(result_path, 'w', encoding='utf-8') as f:
  965. json.dump(final_result, f, ensure_ascii=False, indent=2)
  966. logger.info(f"✓ 保存结果: {result_path}")
  967. project_logger.info(f"✓ 保存结果: {result_path}")
  968. # 追加写入“输出记录表.xlsx”
  969. try:
  970. self.append_output_record(final_result, project_name, str(result_path), str(project_output_dir), project_logger)
  971. except Exception as e:
  972. logger.error(f"追加写入输出记录表失败: {e}")
  973. project_logger.error(f"追加写入输出记录表失败: {e}")
  974. # 复制原始文件到输出目录
  975. for doc_file in document_files:
  976. shutil.copy2(doc_file, project_output_dir / doc_file.name)
  977. project_logger.info(f"✓ 复制文档: {doc_file.name}")
  978. # 记录处理统计信息
  979. project_logger.info("处理统计:")
  980. project_logger.info(f" - 文档数量: {len(document_files)}")
  981. project_logger.info(f" - 成功提取: {len(results)}")
  982. project_logger.info(f" - 提取成功率: {len(results)/len(document_files)*100:.1f}%")
  983. # 记录项目结束
  984. project_logger.log_project_end()
  985. return {
  986. "project_name": project_name,
  987. "output_dir": str(project_output_dir),
  988. "result_file": result_filename,
  989. "processed_files": len(document_files),
  990. "extraction_success": len(results) > 0,
  991. "primary_type": primary_type,
  992. "match_score": match_score
  993. }
  994. except Exception as e:
  995. logger.error(f"处理项目时发生错误: {e}")
  996. project_logger.error(f"处理项目时发生错误: {e}")
  997. project_logger.log_project_end()
  998. return None
  999. finally:
  1000. # 确保关闭项目日志记录器
  1001. project_logger.close()
  1002. def process_directory_tree(self, root_path, output_dir="output_dir"):
  1003. root_path = Path(root_path)
  1004. if not root_path.exists() or not root_path.is_dir():
  1005. logger.debug(f"跳过不存在的目录: {root_path}")
  1006. return []
  1007. results = []
  1008. result = self.process_folder(root_path, output_dir)
  1009. if result:
  1010. results.append(result)
  1011. try:
  1012. for item in root_path.iterdir():
  1013. if item.is_dir():
  1014. sub_results = self.process_directory_tree(item, output_dir)
  1015. results.extend(sub_results)
  1016. except (FileNotFoundError, PermissionError):
  1017. pass
  1018. return results
  1019. class FolderMonitorHandler(FileSystemEventHandler):
  1020. def __init__(self, system, output_dir, input_dir):
  1021. self.system = system
  1022. self.output_dir = output_dir
  1023. self.input_dir = Path(input_dir)
  1024. self.processing = set() # 正在处理的文件/文件夹
  1025. def on_created(self, event):
  1026. if event.is_directory:
  1027. self.handle_new_folder(Path(event.src_path))
  1028. else:
  1029. self.handle_new_file(Path(event.src_path))
  1030. def handle_new_folder(self, folder_path):
  1031. if not folder_path.exists() or folder_path.name in self.processing:
  1032. return
  1033. logger.info(f"新文件夹: {folder_path.name}")
  1034. time.sleep(2)
  1035. if not folder_path.exists():
  1036. return
  1037. self.processing.add(folder_path.name)
  1038. try:
  1039. results = self.system.process_directory_tree(folder_path, self.output_dir)
  1040. if results:
  1041. logger.info(f"完成: {len(results)} 个项目")
  1042. else:
  1043. logger.info("未找到可处理的文档文件")
  1044. self.cleanup_processed_folder(folder_path)
  1045. finally:
  1046. self.processing.discard(folder_path.name)
  1047. def cleanup_processed_folder(self, folder_path):
  1048. max_retries = 3
  1049. for attempt in range(max_retries):
  1050. try:
  1051. if folder_path.exists():
  1052. if attempt > 0:
  1053. time.sleep(2 * attempt)
  1054. self._force_remove_readonly(folder_path)
  1055. shutil.rmtree(folder_path)
  1056. logger.info(f"清理: {folder_path.name}")
  1057. return
  1058. else:
  1059. return
  1060. except PermissionError:
  1061. if attempt == max_retries - 1:
  1062. logger.warning(f"权限不足,无法删除文件夹: {folder_path.name}")
  1063. if self._try_system_delete(folder_path):
  1064. logger.info(f"系统命令删除成功: {folder_path.name}")
  1065. return
  1066. else:
  1067. try:
  1068. backup_name = f"{folder_path.name}_待删除_{int(time.time())}"
  1069. backup_path = folder_path.parent / backup_name
  1070. folder_path.rename(backup_path)
  1071. logger.info(f"已重命名为: {backup_name}")
  1072. return
  1073. except Exception:
  1074. logger.warning(f"无法重命名文件夹,请手动删除: {folder_path.name}")
  1075. return
  1076. else:
  1077. continue
  1078. except Exception as e:
  1079. if attempt == max_retries - 1:
  1080. logger.error(f"清理文件夹失败 {folder_path.name}: {e}")
  1081. else:
  1082. continue
  1083. def _force_remove_readonly(self, path):
  1084. import stat
  1085. try:
  1086. for root, dirs, files in os.walk(path):
  1087. for file in files:
  1088. file_path = Path(root) / file
  1089. try:
  1090. file_path.chmod(stat.S_IWRITE | stat.S_IREAD)
  1091. except Exception:
  1092. pass
  1093. for dir in dirs:
  1094. dir_path = Path(root) / dir
  1095. try:
  1096. dir_path.chmod(stat.S_IWRITE | stat.S_IREAD | stat.S_IEXEC)
  1097. except Exception:
  1098. pass
  1099. path.chmod(stat.S_IWRITE | stat.S_IREAD | stat.S_IEXEC)
  1100. except Exception:
  1101. pass
  1102. def _try_system_delete(self, folder_path):
  1103. import subprocess
  1104. try:
  1105. if os.name == 'nt': # Windows
  1106. result = subprocess.run(
  1107. ['rmdir', '/s', '/q', str(folder_path)],
  1108. capture_output=True,
  1109. text=True,
  1110. timeout=30
  1111. )
  1112. return result.returncode == 0
  1113. else: # Unix/Linux
  1114. result = subprocess.run(
  1115. ['rm', '-rf', str(folder_path)],
  1116. capture_output=True,
  1117. text=True,
  1118. timeout=30
  1119. )
  1120. return result.returncode == 0
  1121. except Exception:
  1122. return False
  1123. def handle_new_file(self, file_path):
  1124. if file_path.name in self.processing:
  1125. return
  1126. archive_extensions = {'.zip', '.7z', '.rar'}
  1127. if file_path.suffix.lower() in archive_extensions:
  1128. logger.info(f"压缩包: {file_path.name}")
  1129. time.sleep(1)
  1130. self.processing.add(file_path.name)
  1131. try:
  1132. extract_dir = file_path.parent / file_path.stem
  1133. extract_dir.mkdir(exist_ok=True)
  1134. if self.system.extract_archive(file_path, extract_dir):
  1135. file_path.unlink()
  1136. time.sleep(1)
  1137. results = self.system.process_directory_tree(extract_dir, self.output_dir)
  1138. if results:
  1139. logger.info(f"解压完成: {len(results)} 个项目")
  1140. else:
  1141. logger.info("解压后未找到可处理的文档文件")
  1142. self.cleanup_processed_folder(extract_dir)
  1143. finally:
  1144. self.processing.discard(file_path.name)
  1145. def monitor_mode():
  1146. input_dir = "input_dir"
  1147. output_dir = "output_dir"
  1148. # 创建输入目录
  1149. input_path = Path(input_dir)
  1150. input_path.mkdir(exist_ok=True)
  1151. # 创建输出目录
  1152. output_path = Path(output_dir)
  1153. output_path.mkdir(exist_ok=True)
  1154. # 初始化系统
  1155. system = MinRAGSystem()
  1156. # 创建监控处理器
  1157. handler = FolderMonitorHandler(system, output_dir, input_dir)
  1158. # 创建文件系统监控器
  1159. observer = Observer()
  1160. observer.schedule(handler, str(input_path), recursive=True)
  1161. # 启动监控
  1162. observer.start()
  1163. logger.info(f"监控目录: {input_path.absolute()}")
  1164. logger.info(f"输出目录: {Path(output_dir).absolute()}")
  1165. # 扫描现有文件夹
  1166. for item in input_path.iterdir():
  1167. if item.is_dir():
  1168. handler.handle_new_folder(item)
  1169. logger.info("监控中... (按 Ctrl+C 停止)")
  1170. try:
  1171. while True:
  1172. time.sleep(1)
  1173. except KeyboardInterrupt:
  1174. observer.stop()
  1175. observer.join()
  1176. logger.info("✅ 监控已停止")
  1177. def main():
  1178. print("合同信息提取系统")
  1179. print("输入目录: input_dir")
  1180. print("输出目录: output_dir")
  1181. print("=" * 50)
  1182. monitor_mode()
# Script entry point: start the folder-monitoring extraction pipeline.
if __name__ == "__main__":
    main()