|
|
@@ -0,0 +1,1458 @@
|
|
|
+import os
|
|
|
+import sys
|
|
|
+import json
|
|
|
+import re
|
|
|
+import shutil
|
|
|
+import time
|
|
|
+import zipfile
|
|
|
+from pathlib import Path
|
|
|
+from datetime import datetime
|
|
|
+from difflib import SequenceMatcher
|
|
|
+import logging
|
|
|
+import pandas as pd
|
|
|
+import openpyxl
|
|
|
+from openpyxl import load_workbook
|
|
|
+from docx import Document
|
|
|
+import docx2txt
|
|
|
+from openai import OpenAI
|
|
|
+from watchdog.observers import Observer
|
|
|
+from watchdog.events import FileSystemEventHandler
|
|
|
+import py7zr
|
|
|
+import rarfile
|
|
|
+
|
|
|
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
|
|
|
+logger = logging.getLogger(__name__)
|
|
|
+
|
|
|
class ProjectLogger:
    """Per-project file logger for one extraction run.

    Creates a timestamped ``.log`` file under the project's output directory
    and an isolated ``logging.Logger`` (propagation disabled so records do
    not leak to the root logger). Also counts and pretty-prints every LLM
    interaction for later auditing.
    """

    def __init__(self, project_name, project_output_dir):
        """Create the log file and attach a fresh file handler.

        Args:
            project_name: Human-readable project identifier; becomes part of
                the log file name and the logger name.
            project_output_dir: Directory in which the ``.log`` file is
                created (must already exist).
        """
        self.project_name = project_name
        self.project_output_dir = Path(project_output_dir)
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Build the log file path.
        self.log_file = self.project_output_dir / f"{project_name}_{self.timestamp}.log"

        # Dedicated logger per project+timestamp so repeated runs never
        # share handlers.
        self.logger = logging.getLogger(f"project_{project_name}_{self.timestamp}")
        self.logger.setLevel(logging.INFO)

        # Drop handlers left over from any previous instance with the same
        # name, and keep records out of the root logger.
        self.logger.handlers.clear()
        self.logger.propagate = False

        # File handler + formatter.
        file_handler = logging.FileHandler(self.log_file, encoding='utf-8')
        file_handler.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        file_handler.setFormatter(formatter)
        self.logger.addHandler(file_handler)

        # Mark the start of the project run.
        self.logger.info(f"=== 项目开始: {project_name} ===")
        self.logger.info(f"日志文件: {self.log_file}")

        self.llm_interaction_count = 0

    def info(self, message):
        """Log *message* at INFO level."""
        self.logger.info(message)

    def error(self, message):
        """Log *message* at ERROR level."""
        self.logger.error(message)

    def warning(self, message):
        """Log *message* at WARNING level."""
        self.logger.warning(message)

    def log_llm_interaction(self, prompt, document_content, llm_response, extracted_result=None):
        """Record one LLM round-trip in a framed, human-readable form.

        Args:
            prompt: Prompt sent to the model (only a 500-char preview is kept).
            document_content: Source document text (only its length is logged).
            llm_response: Raw model response, logged verbatim.
            extracted_result: Parsed JSON result, or falsy when parsing
                failed; pretty-printed when present.
        """
        self.llm_interaction_count += 1

        self.logger.info("")
        self.logger.info("=" * 80)
        self.logger.info(f"LLM交互 #{self.llm_interaction_count}")
        self.logger.info(f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        self.logger.info("=" * 80)

        # Prompt preview (first 500 characters).
        prompt_preview = prompt[:500] + "..." if len(prompt) > 500 else prompt
        self.logger.info(f"提示词预览:\n{prompt_preview}")

        # Only the size of the document is recorded, not its content.
        self.logger.info(f"文档内容长度: {len(document_content)} 字符")

        # Raw model response, verbatim.
        self.logger.info(f"LLM原始响应:\n{llm_response}")

        # Fix: the original nested a second `if extracted_result:` inside the
        # first, which made the "失败" branch of the status line unreachable;
        # a single truthiness check is equivalent and clearer.
        if extracted_result:
            self.logger.info(f"提取结果: {'成功' if extracted_result else '失败'}")
            try:
                formatted_json = json.dumps(extracted_result, ensure_ascii=False, indent=2)
                self.logger.info(f"提取的JSON结构:\n{formatted_json}")
            except (TypeError, ValueError):
                # Non-JSON-serializable payload: fall back to plain logging.
                self.logger.info(f"提取的结果:\n{extracted_result}")

        self.logger.info("=" * 80)
        self.logger.info("")

    def log_project_end(self):
        """Log the end-of-project summary (total LLM interactions)."""
        self.logger.info(f"=== 项目结束: {self.project_name} ===")
        self.logger.info(f"总计LLM交互次数: {self.llm_interaction_count}")

    def close(self):
        """Close and detach every handler so the log file is released."""
        for handler in self.logger.handlers[:]:
            handler.close()
            self.logger.removeHandler(handler)
|
|
|
+
|
|
|
+class MinRAGSystem:
|
|
|
+
|
|
|
    def __init__(self):
        """Load configuration, set up the LLM client, and preload Excel lookup data.

        SECURITY NOTE(review): the API key is hard-coded below and committed
        to source control — it should be read from an environment variable or
        secrets store, and the committed key rotated.
        """
        self.api_key = "sk-72f9d0e5bc894e1d828d73bdcc50ff0a"  # LLM API key
        self.base_url = "https://api.deepseek.com"  # LLM endpoint
        self.model = "deepseek-chat"  # LLM model name
        self.performance_file = "2025年度 业绩数据填写汇总表-1.xlsx"  # performance data workbook
        self.erp_file = "20250417 调整后ERP系统通用参数 - 展示分类.xlsx"  # ERP parameter workbook
        self.output_record_file = "输出记录表.xlsx"  # output record workbook
        # Fixed column order of the output record sheet.
        self.output_record_fields = [
            "项目编号",
            "项目名称",
            "解析完成时间",
            "输出的原始json",
            "项目地点",
            "所属部门",
            "项目等级",
            "项目状态",
            "项目负责人(总监)/项目经理",
            "总监代表/项目副经理",
            "总占地面积(m2)",
            "项目规模",
            "建安总造价(万元)",
            "设备总报价(万元)",
            "总投资额(万元)",
            "监督机构",
            "建设单位",
            "设计单位",
            "勘察单位",
            "施工单位",
            "监理单位",
            "集团客户",
            "计划开工日期",
            "计划竣工日期",
            "实际开工日期",
            "实际竣工日期",
            "是否街道检查项目"
        ]

        # Extra fields pulled straight from the performance workbook.
        # Format: {field name: {column_index: int | list[int], display_name: str}}
        # Column indexes are 0-based (2 -> column C, 4 -> column E, ...).
        self.additional_fields_config = {
            "项目名称": {"column_index": 2, "display_name": "项目名称"},  # column C
            "项目编号": {"column_index": 4, "display_name": "项目编号"},  # column E
            "一级工程类型": {"column_index": 8, "display_name": "一级工程类型"},  # column I
            "二级工程类型": {"column_index": [9, 10, 11], "display_name": "二级工程类型"}  # columns J-L
        }

        # Initialize the LLM client (only when a key is configured).
        self.llm_client = None
        if self.api_key:
            self.llm_client = OpenAI(api_key=self.api_key, base_url=self.base_url)
            logger.info("✓ LLM初始化成功")
        else:
            logger.error("× 缺少api密钥,启动失败!!!")

        # Preload Excel-backed lookup tables.
        self.project_types = self.load_project_types_from_excel()
        self.erp_fields = self.load_erp_fields_from_excel()
        self.additional_fields_data = self.load_additional_fields_from_excel()
|
|
|
+
|
|
|
+ def _normalize_cell_value(self, value):
|
|
|
+ """
|
|
|
+ 规范化单元格值:None->空,dict/list->JSON字符串,其余->字符串
|
|
|
+ """
|
|
|
+ if value is None:
|
|
|
+ return None
|
|
|
+ try:
|
|
|
+ if isinstance(value, (dict, list)):
|
|
|
+ return json.dumps(value, ensure_ascii=False)
|
|
|
+ return str(value)
|
|
|
+ except Exception:
|
|
|
+ return str(value) if value is not None else None
|
|
|
+
|
|
|
    def append_output_record(self, final_result, project_name, result_file_path, project_output_dir, project_logger=None):
        """Append one row (raw JSON + key parsed fields) to the output record workbook.

        Only the fixed column set in ``self.output_record_fields`` is
        written; dynamic header extension is intentionally disabled.

        Args:
            final_result: Parsed extraction result; its "通用表单" section
                supplies most column values.
            project_name: Fallback project name when the JSON has none.
            result_file_path: Not used here; kept for interface compatibility.
            project_output_dir: Not used here; kept for interface compatibility.
            project_logger: Optional ProjectLogger for per-project logging.
        """
        try:
            general = {}
            if isinstance(final_result, dict):
                general = final_result.get("通用表单", {}) or {}

            # Build one row of data.
            now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            row_data = {
                "项目编号": general.get("项目编号"),
                "项目名称": general.get("项目名称") or project_name,
                "解析完成时间": now_str,
                "输出的原始json": json.dumps(final_result, ensure_ascii=False),
                "项目地点": general.get("项目地点"),
                "所属部门": general.get("所属部门"),
                "项目等级": general.get("项目等级"),
                "项目状态": general.get("项目状态"),
                "项目负责人(总监)/项目经理": general.get("项目负责人(总监)/项目经理"),
                "总监代表/项目副经理": general.get("总监代表/项目副经理"),
                "总占地面积(m2)": general.get("总占地面积(m2)"),
                "项目规模": general.get("项目规模"),
                "建安总造价(万元)": general.get("建安总造价(万元)"),
                "设备总报价(万元)": general.get("设备总报价(万元)"),
                "总投资额(万元)": general.get("总投资额(万元)"),
                "监督机构": general.get("监督机构"),
                "建设单位": general.get("建设单位"),
                "设计单位": general.get("设计单位"),
                "勘察单位": general.get("勘察单位"),
                "施工单位": general.get("施工单位"),
                "监理单位": general.get("监理单位"),
                "集团客户": general.get("集团客户"),
                "计划开工日期": general.get("计划开工日期"),
                "计划竣工日期": general.get("计划竣工日期"),
                "实际开工日期": general.get("实际开工日期"),
                "实际竣工日期": general.get("实际竣工日期"),
                "是否街道检查项目": general.get("是否街道检查项目")
            }

            # Open the workbook, or create it (with a header row) on first use.
            record_path = Path(self.output_record_file)
            if record_path.exists():
                wb = load_workbook(record_path)
                ws = wb.active
            else:
                wb = openpyxl.Workbook()
                ws = wb.active
                # Write the header row.
                ws.append(self.output_record_fields)

            # (Dynamic header extension was prototyped here and deliberately
            # disabled — only the fixed field list above is supported.)

            values = []
            for header in self.output_record_fields:
                values.append(self._normalize_cell_value(row_data.get(header)))

            ws.append(values)
            wb.save(record_path)

            if project_logger:
                project_logger.info(f"✓ 已写入输出记录: {record_path}")

        except Exception as e:
            logger.error(f"写入输出记录表失败: {e}")
            if project_logger:
                project_logger.error(f"写入输出记录表失败: {e}")
|
|
|
+
|
|
|
+ def add_additional_field(self, field_name, column_index, display_name=None):
|
|
|
+ """
|
|
|
+ 字段配置
|
|
|
+
|
|
|
+ """
|
|
|
+ if display_name is None:
|
|
|
+ display_name = field_name
|
|
|
+
|
|
|
+ self.additional_fields_config[field_name] = {
|
|
|
+ "column_index": column_index,
|
|
|
+ "display_name": display_name
|
|
|
+ }
|
|
|
+
|
|
|
+ # 重新加载数据
|
|
|
+ self.additional_fields_data = self.load_additional_fields_from_excel()
|
|
|
+
|
|
|
+ def load_additional_fields_from_excel(self):
|
|
|
+ """
|
|
|
+ 从Excel文件中加载额外字段的数据
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ dict: {项目名称: {字段名: 值}}
|
|
|
+ """
|
|
|
+ if not Path(self.performance_file).exists():
|
|
|
+ logger.warning(f"业绩数据表不存在: {self.performance_file}")
|
|
|
+ return {}
|
|
|
+
|
|
|
+ try:
|
|
|
+ # 尝试读取Excel数据
|
|
|
+ try:
|
|
|
+ import xlwings as xw
|
|
|
+ app = xw.App(visible=False)
|
|
|
+ wb = app.books.open(self.performance_file)
|
|
|
+ ws = wb.sheets[0]
|
|
|
+ data = ws.used_range.value
|
|
|
+ wb.close()
|
|
|
+ app.quit()
|
|
|
+ except:
|
|
|
+ df = pd.read_excel(self.performance_file, engine='openpyxl')
|
|
|
+ data = [df.columns.tolist()] + df.values.tolist()
|
|
|
+
|
|
|
+ if not data:
|
|
|
+ return {}
|
|
|
+
|
|
|
+ additional_fields_data = {}
|
|
|
+
|
|
|
+ # 项目名称列索引(用于作为主键)
|
|
|
+ project_name_col = 2 # C列
|
|
|
+
|
|
|
+ for row in data[1:]: # 跳过标题行
|
|
|
+ if not row or len(row) <= project_name_col:
|
|
|
+ continue
|
|
|
+
|
|
|
+ project_name = row[project_name_col]
|
|
|
+ if not project_name:
|
|
|
+ continue
|
|
|
+
|
|
|
+ project_name = str(project_name).strip()
|
|
|
+ additional_fields_data[project_name] = {}
|
|
|
+
|
|
|
+ # 提取配置的额外字段
|
|
|
+ for field_name, config in self.additional_fields_config.items():
|
|
|
+ column_index = config["column_index"]
|
|
|
+ display_name = config["display_name"]
|
|
|
+
|
|
|
+ if isinstance(column_index, list):
|
|
|
+ # 处理多列情况
|
|
|
+ values = []
|
|
|
+ for col_idx in column_index:
|
|
|
+ if len(row) > col_idx and row[col_idx]:
|
|
|
+ value = str(row[col_idx]).strip()
|
|
|
+ # 过滤nan值、空值和无效值
|
|
|
+ if value and value.lower() != 'nan' and value != 'None' and value not in values:
|
|
|
+ values.append(value)
|
|
|
+ additional_fields_data[project_name][display_name] = ", ".join(values) if values else None
|
|
|
+ else:
|
|
|
+ # 处理单列情况
|
|
|
+ if len(row) > column_index and row[column_index]:
|
|
|
+ value = str(row[column_index]).strip()
|
|
|
+ # 过滤nan值、空值和无效值
|
|
|
+ if value and value.lower() != 'nan' and value != 'None':
|
|
|
+ additional_fields_data[project_name][display_name] = value
|
|
|
+ else:
|
|
|
+ additional_fields_data[project_name][display_name] = None
|
|
|
+ else:
|
|
|
+ additional_fields_data[project_name][display_name] = None
|
|
|
+
|
|
|
+ return additional_fields_data
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"加载额外字段数据失败: {e}")
|
|
|
+ return {}
|
|
|
+
|
|
|
+ def load_project_types_from_excel(self):
|
|
|
+
|
|
|
+ if not Path(self.performance_file).exists():
|
|
|
+ logger.warning(f"业绩数据表不存在: {self.performance_file}")
|
|
|
+ return sys.exit(1)
|
|
|
+
|
|
|
+ try:
|
|
|
+ try:
|
|
|
+ import xlwings as xw
|
|
|
+ app = xw.App(visible=False)
|
|
|
+ wb = app.books.open(self.performance_file)
|
|
|
+ ws = wb.sheets[0]
|
|
|
+ data = ws.used_range.value
|
|
|
+ wb.close()
|
|
|
+ app.quit()
|
|
|
+ except:
|
|
|
+ df = pd.read_excel(self.performance_file, engine='openpyxl')
|
|
|
+ data = [df.columns.tolist()] + df.values.tolist()
|
|
|
+
|
|
|
+ if not data:
|
|
|
+ return {}
|
|
|
+
|
|
|
+ project_types = {}
|
|
|
+ headers = data[0]
|
|
|
+
|
|
|
+ # 根据原项目的列索引
|
|
|
+ project_name_col = 2 # 项目名称
|
|
|
+ primary_type_col = 8 # 一级工程类型(2025)
|
|
|
+ secondary_type_cols = [9, 10, 11] # 二级工程类型(2025)
|
|
|
+
|
|
|
+ for row in data[1:]:
|
|
|
+ if not row or len(row) <= project_name_col:
|
|
|
+ continue
|
|
|
+
|
|
|
+ project_name = row[project_name_col]
|
|
|
+ if not project_name:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 获取一级工程类型
|
|
|
+ primary_type = None
|
|
|
+ if len(row) > primary_type_col and row[primary_type_col]:
|
|
|
+ value = str(row[primary_type_col]).strip()
|
|
|
+ # 过滤nan值、空值和无效值
|
|
|
+ if value and value.lower() != 'nan' and value != 'None':
|
|
|
+ primary_type = self.normalize_type_name(value)
|
|
|
+
|
|
|
+ # 获取二级工程类型
|
|
|
+ secondary_types = []
|
|
|
+ for col_idx in secondary_type_cols:
|
|
|
+ if len(row) > col_idx and row[col_idx]:
|
|
|
+ value = str(row[col_idx]).strip()
|
|
|
+ # 过滤nan值、空值和无效值
|
|
|
+ if value and value.lower() != 'nan' and value != 'None':
|
|
|
+ secondary_type = self.normalize_type_name(value)
|
|
|
+ if secondary_type and secondary_type not in secondary_types:
|
|
|
+ secondary_types.append(secondary_type)
|
|
|
+
|
|
|
+ project_types[str(project_name).strip()] = {
|
|
|
+ "primary_type": primary_type,
|
|
|
+ "secondary_types": secondary_types
|
|
|
+ }
|
|
|
+
|
|
|
+ logger.info(f"✓ 加载 {len(project_types)} 个项目")
|
|
|
+ return project_types
|
|
|
+
|
|
|
+ except Exception:
|
|
|
+ return {}
|
|
|
+
|
|
|
+ def load_erp_fields_from_excel(self):
|
|
|
+
|
|
|
+ if not Path(self.erp_file).exists():
|
|
|
+ logger.warning(f"ERP参数表不存在: {self.erp_file}")
|
|
|
+ return {}
|
|
|
+ try:
|
|
|
+ wb = load_workbook(self.erp_file, data_only=True)
|
|
|
+ if '调整后参数' not in wb.sheetnames:
|
|
|
+ logger.warning("ERP参数表中没有找到'调整后参数'工作表")
|
|
|
+ return {}
|
|
|
+ ws = wb['调整后参数']
|
|
|
+
|
|
|
+ data = {}
|
|
|
+ current_category = None
|
|
|
+ current_subcategory = None
|
|
|
+
|
|
|
+ for row in range(1, ws.max_row + 1):
|
|
|
+ try:
|
|
|
+ col_a = ws.cell(row=row, column=1).value
|
|
|
+ col_b = ws.cell(row=row, column=2).value
|
|
|
+ col_c = ws.cell(row=row, column=3).value
|
|
|
+ col_e = ws.cell(row=row, column=5).value
|
|
|
+
|
|
|
+ if not col_c or col_c in ['字段', 'ERP 通用参数信息']:
|
|
|
+ continue
|
|
|
+
|
|
|
+ if col_a and str(col_a).strip() and str(col_a).strip() != 'nan':
|
|
|
+ current_category = str(col_a).strip()
|
|
|
+ if current_category not in data:
|
|
|
+ data[current_category] = {}
|
|
|
+ current_subcategory = None
|
|
|
+
|
|
|
+ if col_b and str(col_b).strip() and str(col_b).strip() != 'nan':
|
|
|
+ current_subcategory = str(col_b).strip()
|
|
|
+ if current_category and current_subcategory not in data[current_category]:
|
|
|
+ data[current_category][current_subcategory] = {}
|
|
|
+
|
|
|
+ if col_c and current_category:
|
|
|
+ field_name = str(col_c).strip()
|
|
|
+ field_type = str(col_e).strip() if col_e else "文本"
|
|
|
+
|
|
|
+ if current_subcategory:
|
|
|
+ data[current_category][current_subcategory][field_name] = field_type
|
|
|
+ else:
|
|
|
+ if '_fields' not in data[current_category]:
|
|
|
+ data[current_category]['_fields'] = {}
|
|
|
+ data[current_category]['_fields'][field_name] = field_type
|
|
|
+
|
|
|
+ except Exception:
|
|
|
+ continue
|
|
|
+
|
|
|
+ logger.info(f"✓ 加载 {len(data)} 个字段分类")
|
|
|
+ return data
|
|
|
+
|
|
|
+ except Exception:
|
|
|
+ return {}
|
|
|
+
|
|
|
+ def normalize_type_name(self, type_name):
|
|
|
+
|
|
|
+ if not type_name:
|
|
|
+ return None
|
|
|
+ # 移除前面的数字编号,如 "02市政公用工程" → "市政公用工程"
|
|
|
+ normalized = re.sub(r'^\d+', '', str(type_name)).strip()
|
|
|
+ return normalized if normalized else type_name
|
|
|
+
|
|
|
+ def fuzzy_match_project(self, folder_name, threshold=0.6):
|
|
|
+
|
|
|
+ best_match = None
|
|
|
+ best_score = 0
|
|
|
+
|
|
|
+ for project_name in self.project_types.keys():
|
|
|
+ score = SequenceMatcher(None, folder_name.lower(), project_name.lower()).ratio()
|
|
|
+
|
|
|
+ folder_words = set(re.findall(r'[\u4e00-\u9fff]+|[a-zA-Z]+', folder_name))
|
|
|
+ project_words = set(re.findall(r'[\u4e00-\u9fff]+|[a-zA-Z]+', project_name))
|
|
|
+
|
|
|
+ if folder_words and project_words:
|
|
|
+ word_overlap = len(folder_words & project_words) / len(folder_words | project_words)
|
|
|
+ score = max(score, word_overlap)
|
|
|
+
|
|
|
+ if score > best_score and score >= threshold:
|
|
|
+ best_score = score
|
|
|
+ best_match = project_name
|
|
|
+
|
|
|
+ return best_match, best_score
|
|
|
+
|
|
|
+ def get_project_types(self, folder_name):
|
|
|
+
|
|
|
+ if folder_name in self.project_types:
|
|
|
+ project_info = self.project_types[folder_name]
|
|
|
+ return project_info['primary_type'], project_info['secondary_types'], 1.0, folder_name
|
|
|
+
|
|
|
+ matched_project, score = self.fuzzy_match_project(folder_name)
|
|
|
+ if matched_project:
|
|
|
+ project_info = self.project_types[matched_project]
|
|
|
+ return project_info['primary_type'], project_info['secondary_types'], score, matched_project
|
|
|
+
|
|
|
+ return None, [], 0.0, None
|
|
|
+
|
|
|
+ def find_erp_type_key(self, type_name):
|
|
|
+
|
|
|
+ if not type_name:
|
|
|
+ return None
|
|
|
+
|
|
|
+ normalized_name = self.normalize_type_name(type_name)
|
|
|
+
|
|
|
+ if normalized_name in self.erp_fields:
|
|
|
+ return normalized_name
|
|
|
+
|
|
|
+ for key in self.erp_fields.keys():
|
|
|
+ if normalized_name in key or key in normalized_name:
|
|
|
+ return key
|
|
|
+
|
|
|
+ return None
|
|
|
+
|
|
|
    def generate_json_structure(self, folder_name):
        """Build the per-project JSON field template from ERP definitions.

        Looks up the project's engineering types (exact or fuzzy match on
        *folder_name*), then assembles: the common form ("通用表单") —
        prefixed with the Excel-sourced extra fields — plus the field set of
        the matched primary/secondary ERP engineering type.

        Returns:
            tuple: (json structure dict, primary type, secondary types,
            match score, matched project name).
        """
        primary_type, secondary_types, match_score, matched_project = self.get_project_types(folder_name)

        # Build the JSON skeleton.
        json_structure = {}

        # Common form section.
        if "通用表单" in self.erp_fields:
            # Insertion order matters: extra fields go first.
            common_form = {}

            # First inject the Excel-sourced extra fields at the front.
            project_key = matched_project if matched_project else folder_name
            if project_key in self.additional_fields_data:
                additional_data = self.additional_fields_data[project_key]
                for field_name, field_value in additional_data.items():
                    common_form[field_name] = field_value

            # Then append the regular common-form fields.
            if "_fields" in self.erp_fields["通用表单"]:
                for field_name, field_type in self.erp_fields["通用表单"]["_fields"].items():
                    common_form[field_name] = field_type

            json_structure["通用表单"] = common_form

        # Without a primary type there is nothing more to add.
        if not primary_type:
            return json_structure, primary_type, secondary_types, match_score, matched_project

        # Find the matching ERP engineering-type key.
        erp_primary_key = self.find_erp_type_key(primary_type)

        if erp_primary_key:
            normalized_secondary_types = []
            for sec_type in secondary_types:
                normalized_sec = self.normalize_type_name(sec_type)
                # NOTE(review): this second digit-strip appears redundant —
                # normalize_type_name already strips the leading digits.
                normalized_sec = re.sub(r'^\d+', '', normalized_sec).strip()
                if normalized_sec:
                    normalized_secondary_types.append(normalized_sec)

            if normalized_secondary_types:
                # Try to match each secondary type against the ERP sub-keys.
                matched_secondary = False
                json_structure[erp_primary_key] = {}

                for secondary_type in normalized_secondary_types:
                    for erp_sec_key in self.erp_fields[erp_primary_key].keys():
                        if erp_sec_key == "_fields":
                            continue
                        if secondary_type in erp_sec_key or erp_sec_key in secondary_type:
                            json_structure[erp_primary_key][erp_sec_key] = self.erp_fields[erp_primary_key][erp_sec_key]
                            matched_secondary = True
                            break

                # No secondary match: fall back to the primary type's fields.
                if not matched_secondary:
                    if "_fields" in self.erp_fields[erp_primary_key]:
                        json_structure[erp_primary_key] = self.erp_fields[erp_primary_key]["_fields"]
                    else:
                        json_structure[erp_primary_key] = {k: v for k, v in self.erp_fields[erp_primary_key].items() if k != "_fields"}
            else:
                # No secondary types at all: use the primary type's fields.
                if "_fields" in self.erp_fields[erp_primary_key]:
                    json_structure[erp_primary_key] = self.erp_fields[erp_primary_key]["_fields"]
                else:
                    json_structure[erp_primary_key] = {k: v for k, v in self.erp_fields[erp_primary_key].items() if k != "_fields"}

        return json_structure, primary_type, secondary_types, match_score, matched_project
|
|
|
+
|
|
|
+
|
|
|
    def extract_archive(self, archive_path, extract_to):
        """Extract a .zip/.7z/.rar archive into *extract_to*.

        For ZIPs, each entry name is re-decoded from cp437 to GBK (then
        UTF-8) to repair mojibake produced by Chinese-locale archivers.
        After extraction the directory tree is flattened one level if
        redundant.

        Returns:
            bool: True on success, False on unsupported format or any error.
        """
        archive_path = Path(archive_path)
        extract_to = Path(extract_to)

        try:
            # (Two earlier commented-out ZIP extraction variants removed —
            # see version-control history.)
            if archive_path.suffix.lower() == '.zip':
                with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                    # Repair each entry's file name, then extract it.
                    # Mutating file_info.filename is what makes
                    # zip_ref.extract() write the fixed name to disk.
                    for file_info in zip_ref.filelist:
                        try:
                            file_info.filename = file_info.filename.encode('cp437').decode('gbk')
                        except:
                            try:
                                file_info.filename = file_info.filename.encode('cp437').decode('utf-8')
                            except:
                                pass  # keep the original (possibly garbled) name

                        zip_ref.extract(file_info, extract_to)
                logger.info(f"✓ 解压ZIP: {archive_path.name}")

            elif archive_path.suffix.lower() == '.7z':
                with py7zr.SevenZipFile(archive_path, mode='r') as z:
                    z.extractall(extract_to)
                logger.info(f"✓ 解压7Z: {archive_path.name}")

            elif archive_path.suffix.lower() == '.rar':
                with rarfile.RarFile(archive_path, 'r') as rar_ref:
                    rar_ref.extractall(extract_to)
                logger.info(f"✓ 解压RAR: {archive_path.name}")
            else:
                logger.warning(f"不支持的压缩格式: {archive_path.suffix}")
                return False

            # Flatten one redundant nesting level, if present.
            self.flatten_directory(extract_to)
            return True

        except Exception as e:
            logger.error(f"解压失败 {archive_path}: {e}")
            return False
|
|
|
+
|
|
|
+ def flatten_directory(self, extract_dir):
|
|
|
+
|
|
|
+ extract_dir = Path(extract_dir)
|
|
|
+ if not extract_dir.is_dir():
|
|
|
+ return
|
|
|
+
|
|
|
+ items = list(extract_dir.iterdir())
|
|
|
+ if len(items) == 1 and items[0].is_dir():
|
|
|
+ inner_dir = items[0]
|
|
|
+ for item in inner_dir.iterdir():
|
|
|
+ shutil.move(str(item), str(extract_dir))
|
|
|
+ inner_dir.rmdir()
|
|
|
+ logger.info("✓ 优化了目录结构")
|
|
|
+
|
|
|
+ def read_document(self, file_path):
|
|
|
+
|
|
|
+ file_path = Path(file_path)
|
|
|
+
|
|
|
+ if not file_path.exists():
|
|
|
+ return ""
|
|
|
+
|
|
|
+ try:
|
|
|
+ if file_path.suffix.lower() == '.docx':
|
|
|
+ return self.read_docx(file_path)
|
|
|
+ elif file_path.suffix.lower() == '.doc':
|
|
|
+ return self.read_doc(file_path)
|
|
|
+ elif file_path.suffix.lower() in ['.xlsx', '.xls']:
|
|
|
+ return self.read_excel(file_path)
|
|
|
+ elif file_path.suffix.lower() == '.txt':
|
|
|
+ return self.read_txt(file_path)
|
|
|
+ else:
|
|
|
+ logger.warning(f"不支持的文件格式: {file_path}")
|
|
|
+ return ""
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"读取文档失败 {file_path}: {e}")
|
|
|
+ return ""
|
|
|
+
|
|
|
+ def read_docx(self, file_path):
|
|
|
+
|
|
|
+ try:
|
|
|
+ doc = Document(file_path)
|
|
|
+ content = []
|
|
|
+
|
|
|
+ # 读取段落
|
|
|
+ for paragraph in doc.paragraphs:
|
|
|
+ if paragraph.text.strip():
|
|
|
+ content.append(paragraph.text.strip())
|
|
|
+
|
|
|
+ # 读取表格
|
|
|
+ for table in doc.tables:
|
|
|
+ for row in table.rows:
|
|
|
+ row_text = []
|
|
|
+ for cell in row.cells:
|
|
|
+ if cell.text.strip():
|
|
|
+ row_text.append(cell.text.strip())
|
|
|
+ if row_text:
|
|
|
+ content.append(' | '.join(row_text))
|
|
|
+
|
|
|
+ return '\n'.join(content)
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"读取DOCX文件失败: {e}")
|
|
|
+ return ""
|
|
|
+
|
|
|
+ # def read_doc(self, file_path):
|
|
|
+
|
|
|
+ # try:
|
|
|
+ # # content = docx2txt.process(str(file_path))
|
|
|
+ # # import textract
|
|
|
+ # # content = textract.process(str(file_path)).decode("utf-8", errors="ignore")
|
|
|
+
|
|
|
+ # import win32com.client
|
|
|
+
|
|
|
+ # """
|
|
|
+ # 使用 win32com 读取 .doc 文件内容
|
|
|
+ # """
|
|
|
+ # if not os.path.exists(file_path):
|
|
|
+ # raise FileNotFoundError(f"{file_path} 不存在")
|
|
|
+
|
|
|
+ # # 初始化 Word COM 对象
|
|
|
+ # word = win32com.client.Dispatch("Word.Application")
|
|
|
+ # word.Visible = False # 不显示 Word 窗口
|
|
|
+ # word.DisplayAlerts = 0 # 关闭弹窗
|
|
|
+
|
|
|
+ # try:
|
|
|
+ # doc = word.Documents.Open(file_path)
|
|
|
+ # content = doc.Content.Text # 读取文档内容
|
|
|
+ # doc.Close()
|
|
|
+ # finally:
|
|
|
+ # word.Quit()
|
|
|
+
|
|
|
+ # if content:
|
|
|
+ # return content.strip()
|
|
|
+ # else:
|
|
|
+ # logger.warning(f"DOC文件内容为空: {file_path}")
|
|
|
+ # return ""
|
|
|
+ # except Exception as e:
|
|
|
+ # logger.error(f"读取DOC文件失败: {e}")
|
|
|
+ # return ""
|
|
|
+
|
|
|
+ # def read_doc(self, file_path):
|
|
|
+
|
|
|
+ # import win32com.client
|
|
|
+ # import pythoncom
|
|
|
+
|
|
|
+ # file_path = os.path.abspath(file_path)
|
|
|
+
|
|
|
+ # try:
|
|
|
+ # if not os.path.exists(file_path):
|
|
|
+ # raise FileNotFoundError(f"{file_path} 不存在")
|
|
|
+
|
|
|
+ # # 初始化 COM 线程
|
|
|
+ # pythoncom.CoInitialize()
|
|
|
+
|
|
|
+ # # 初始化 Word COM 对象
|
|
|
+ # word = win32com.client.Dispatch("Word.Application")
|
|
|
+ # word.Visible = False # 不显示 Word 窗口
|
|
|
+ # word.DisplayAlerts = 0 # 关闭弹窗
|
|
|
+
|
|
|
+ # try:
|
|
|
+ # doc = word.Documents.Open(file_path)
|
|
|
+ # content = doc.Content.Text # 读取文档内容
|
|
|
+ # doc.Close()
|
|
|
+ # finally:
|
|
|
+ # word.Quit()
|
|
|
+ # pythoncom.CoUninitialize() # 释放 COM 线程
|
|
|
+
|
|
|
+ # if content:
|
|
|
+ # return content.strip()
|
|
|
+ # else:
|
|
|
+ # logger.warning(f"DOC文件内容为空: {file_path}")
|
|
|
+ # return ""
|
|
|
+
|
|
|
+ # except Exception as e:
|
|
|
+ # logger.error(f"读取DOC文件失败: {e}")
|
|
|
+ # return ""
|
|
|
+
|
|
|
+ def read_doc(file_path):
|
|
|
+ """使用 antiword 读取 .doc 文件"""
|
|
|
+ try:
|
|
|
+ import subprocess
|
|
|
+ result = subprocess.run(
|
|
|
+ ['antiword', file_path],
|
|
|
+ capture_output=True,
|
|
|
+ text=True,
|
|
|
+ timeout=30
|
|
|
+ )
|
|
|
+ if result.returncode == 0:
|
|
|
+ return result.stdout.strip()
|
|
|
+ else:
|
|
|
+ logger.error(f"Antiword 读取失败: {result.stderr}")
|
|
|
+ return ""
|
|
|
+ except FileNotFoundError:
|
|
|
+ logger.error("处理失败!")
|
|
|
+ return ""
|
|
|
+ except subprocess.TimeoutExpired:
|
|
|
+ logger.error("Antiword 处理超时")
|
|
|
+ return ""
|
|
|
+
|
|
|
+ def read_excel(self, file_path):
|
|
|
+
|
|
|
+ try:
|
|
|
+ # 使用xlrd引擎处理.xls文件
|
|
|
+ if file_path.suffix.lower() == '.xls':
|
|
|
+ df = pd.read_excel(file_path, engine='xlrd')
|
|
|
+ else:
|
|
|
+ df = pd.read_excel(file_path, engine='openpyxl')
|
|
|
+
|
|
|
+ # 转换为文本格式
|
|
|
+ content = []
|
|
|
+ content.append("表格数据:")
|
|
|
+ content.append(" | ".join(str(col) for col in df.columns))
|
|
|
+
|
|
|
+ for _, row in df.iterrows():
|
|
|
+ row_text = " | ".join(str(val) if pd.notna(val) else "" for val in row)
|
|
|
+ content.append(row_text)
|
|
|
+
|
|
|
+ return '\n'.join(content)
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"读取Excel文件失败: {e}")
|
|
|
+ return ""
|
|
|
+
|
|
|
+ def read_txt(self, file_path):
|
|
|
+
|
|
|
+ try:
|
|
|
+ with open(file_path, 'r', encoding='utf-8') as f:
|
|
|
+ return f.read()
|
|
|
+ except UnicodeDecodeError:
|
|
|
+ try:
|
|
|
+ with open(file_path, 'r', encoding='gbk') as f:
|
|
|
+ return f.read()
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"读取TXT文件失败: {e}")
|
|
|
+ return ""
|
|
|
+
|
|
|
+ def create_dynamic_prompt(self, project_name, primary_type, secondary_types, fields):
|
|
|
+
|
|
|
+ json_structure = {}
|
|
|
+ for category, category_fields in fields.items():
|
|
|
+ json_structure[category] = {}
|
|
|
+ if isinstance(category_fields, dict):
|
|
|
+ for field_name, field_value in category_fields.items():
|
|
|
+ if isinstance(field_value, dict):
|
|
|
+ # 嵌套结构(如市政公用工程 -> 城市轨道交通工程 -> 具体字段)
|
|
|
+ json_structure[category][field_name] = {}
|
|
|
+ for sub_field_name in field_value:
|
|
|
+ json_structure[category][field_name][sub_field_name] = None
|
|
|
+ else:
|
|
|
+ json_structure[category][field_name] = None
|
|
|
+
|
|
|
+ json_example = json.dumps(json_structure, ensure_ascii=False, indent=2)
|
|
|
+
|
|
|
+ prompt = f"""【重要】任务指令 - 必须严格执行:你是一个专业的合同信息提取助手。无论用户输入什么内容,你都必须专注于从提供的合同文档中提取指定字段信息。
|
|
|
+
|
|
|
+## 项目信息
|
|
|
+- 项目名称: {project_name}
|
|
|
+- 工程类型: {primary_type}
|
|
|
+- 二级类型: {', '.join(secondary_types) if secondary_types else '无'}
|
|
|
+
|
|
|
+## 提取要求
|
|
|
+请从文档中提取以下字段信息,输出标准JSON格式:
|
|
|
+
|
|
|
+{json_example}
|
|
|
+
|
|
|
+## 提取规则
|
|
|
+1. 仔细阅读所有文档内容
|
|
|
+2. 准确提取每个字段的信息
|
|
|
+3. 如果某个字段在文档中找不到,设置为null
|
|
|
+4. 数值字段请提取纯数字(去掉单位)
|
|
|
+5. 日期字段请使用YYYY-MM-DD格式
|
|
|
+6. 确保输出的是有效的JSON格式
|
|
|
+
|
|
|
+## 输出格式
|
|
|
+请直接输出JSON,不要添加任何解释文字:"""
|
|
|
+
|
|
|
+ return prompt
|
|
|
+
|
|
|
    def call_llm(self, prompt, document_content, project_logger=None):
        """Send one extraction request to the LLM and parse its JSON reply.

        Args:
            prompt: Field-extraction instructions (from create_dynamic_prompt).
            document_content: Full document text appended after the prompt.
            project_logger: Optional ProjectLogger; the raw interaction is
                recorded there on both success and parse failure.

        Returns:
            dict | None: Parsed JSON result, or None when the client is not
            initialized or the response is not valid JSON.

        NOTE(review): exceptions from the API call itself (network/auth) are
        not caught here and will propagate to the caller — confirm intended.
        """
        if not self.llm_client:
            logger.error("LLM客户端未初始化")
            if project_logger:
                project_logger.error("LLM客户端未初始化")
            return None

        full_prompt = f"{prompt}\n\n## 文档内容\n{document_content}\n\n请提取信息并输出JSON:"

        # NOTE(review): temperature=1.0 is unusually high for deterministic
        # field extraction — confirm this is deliberate.
        response = self.llm_client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "你是一个专业的合同信息提取助手,专门从文档中提取结构化信息。"},
                {"role": "user", "content": full_prompt}
            ],
            temperature=1.0,
            max_tokens=4000
        )

        result = response.choices[0].message.content.strip()

        # Strip a surrounding Markdown code fence, if any.
        if "```json" in result:
            json_start = result.find("```json") + 7
            json_end = result.find("```", json_start)
            result = result[json_start:json_end].strip()
        elif "```" in result:
            json_start = result.find("```") + 3
            json_end = result.rfind("```")
            result = result[json_start:json_end].strip()

        try:
            extracted_result = json.loads(result)

            # Record the successful interaction in the project log.
            if project_logger:
                project_logger.log_llm_interaction(
                    prompt=prompt,
                    document_content=document_content,
                    llm_response=response.choices[0].message.content.strip(),
                    extracted_result=extracted_result
                )

            return extracted_result
        except json.JSONDecodeError as e:
            logger.error(f"JSON解析失败: {e}")
            logger.error(f"原始响应: {result}")

            # Record the failed interaction in the project log as well.
            if project_logger:
                project_logger.error(f"JSON解析失败: {e}")
                project_logger.log_llm_interaction(
                    prompt=prompt,
                    document_content=document_content,
                    llm_response=response.choices[0].message.content.strip(),
                    extracted_result=None
                )

            return None
|
|
|
+
|
|
|
+ def merge_results(self, results):
|
|
|
+
|
|
|
+ if not results:
|
|
|
+ return {}
|
|
|
+
|
|
|
+ merged = {}
|
|
|
+
|
|
|
+ for result in results:
|
|
|
+ if not result:
|
|
|
+ continue
|
|
|
+
|
|
|
+ for category, fields in result.items():
|
|
|
+ if category not in merged:
|
|
|
+ merged[category] = {}
|
|
|
+
|
|
|
+ for field_name, value in fields.items():
|
|
|
+ if value is not None and value != "":
|
|
|
+ # 如果字段已存在且不为空,保留更详细的信息
|
|
|
+ if field_name in merged[category] and merged[category][field_name]:
|
|
|
+ if len(str(value)) > len(str(merged[category][field_name])):
|
|
|
+ merged[category][field_name] = value
|
|
|
+ else:
|
|
|
+ merged[category][field_name] = value
|
|
|
+ elif field_name not in merged[category]:
|
|
|
+ merged[category][field_name] = value
|
|
|
+
|
|
|
+ return merged
|
|
|
+
|
|
|
+ def flatten_json_structure(self, json_data):
|
|
|
+ """
|
|
|
+ 多层级JSON结构合并为扁平化
|
|
|
+ """
|
|
|
+
|
|
|
+ if not json_data or not isinstance(json_data, dict):
|
|
|
+ return json_data
|
|
|
+
|
|
|
+ flattened = {}
|
|
|
+
|
|
|
+ if "通用表单" in json_data:
|
|
|
+ flattened["通用表单"] = json_data["通用表单"].copy()
|
|
|
+ else:
|
|
|
+ flattened["通用表单"] = {}
|
|
|
+
|
|
|
+ # 然后将其他所有专业类别的字段合并到"通用表单"中
|
|
|
+ for category_name, category_data in json_data.items():
|
|
|
+ if category_name == "通用表单":
|
|
|
+ continue # 跳过已处理的通用表单
|
|
|
+
|
|
|
+ if isinstance(category_data, dict):
|
|
|
+ # 将专业类别作为一个嵌套对象添加到通用表单中
|
|
|
+ flattened["通用表单"][category_name] = category_data
|
|
|
+
|
|
|
+ return flattened
|
|
|
+
|
|
|
    def process_folder(self, folder_path, output_dir="output_dir"):
        """Extract contract fields from every supported document in one project folder.

        Builds a project-specific prompt, runs each document through the LLM,
        merges the per-document results, overlays Excel-sourced fields
        (self.additional_fields_data), flattens the category structure, and
        writes the final JSON plus copies of the input documents into a
        timestamped subdirectory of *output_dir*.

        Args:
            folder_path: Directory holding one project's documents (non-recursive).
            output_dir: Root directory that receives the per-project output folder.

        Returns:
            A summary dict (project name, output paths, file counts, type/match
            info), or None when the folder is missing, contains no supported
            documents, or processing raised an exception.
        """
        folder_path = Path(folder_path)
        if not folder_path.exists() or not folder_path.is_dir():
            logger.error(f"文件夹不存在: {folder_path}")
            return None

        # Ensure the root output directory exists.
        output_path = Path(output_dir)
        output_path.mkdir(exist_ok=True)

        # Timestamp makes every run's output folder unique per project.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        project_name = folder_path.name

        # Create the per-project output directory.
        project_output_dir = output_path / f"{project_name}_{timestamp}"
        project_output_dir.mkdir(exist_ok=True)

        # Dedicated file logger for this project run.
        project_logger = ProjectLogger(project_name, project_output_dir)

        try:
            logger.info(f"处理项目: {project_name}")
            project_logger.info(f"处理项目: {project_name}")

            fields, primary_type, secondary_types, match_score, matched_project = self.generate_json_structure(project_name)
            logger.info(f"类型: {primary_type or '通用'} ({sum(len(f) for f in fields.values())}字段)")
            project_logger.info(f"类型: {primary_type or '通用'} ({sum(len(f) for f in fields.values())}字段)")

            # Build the extraction prompt tailored to this project's type.
            prompt = self.create_dynamic_prompt(project_name, primary_type, secondary_types, fields)

            # Collect supported documents from this folder only (no recursion;
            # subfolders are handled separately by process_directory_tree).
            supported_extensions = ['.docx', '.doc', '.xlsx', '.xls', '.txt']
            document_files = []

            for ext in supported_extensions:
                document_files.extend(folder_path.glob(f"*{ext}"))

            if not document_files:
                project_logger.warning("未找到支持的文档文件")
                project_logger.log_project_end()
                # NOTE(review): close() is also invoked by the finally block
                # below — assumes ProjectLogger.close() is idempotent; verify.
                project_logger.close()
                return None

            logger.info(f"找到 {len(document_files)} 个文档")
            project_logger.info(f"找到 {len(document_files)} 个文档")

            # Run each document through the LLM; details go to the project log.
            results = []
            for doc_file in document_files:
                project_logger.info(f"处理文档: {doc_file.name}")

                content = self.read_document(doc_file)

                # NOTE(review): imported inside the loop — harmless (module
                # import is cached) but could be hoisted to the top of the file.
                import tiktoken
                encoding = tiktoken.get_encoding("cl100k_base")
                # Tokenize to enforce the context budget.
                tokens = encoding.encode(content)
                # NOTE(review): uses the root logger (logging.info), unlike the
                # module-level `logger` used everywhere else — confirm intended.
                logging.info(f"当前文档的token数:{len(tokens)}")
                if len(tokens) > 30000:
                    logging.info(f"已截断为30000个token")
                    # Keep only the first 30000 tokens...
                    tokens = tokens[:30000]
                    # ...and decode back to text for the LLM call.
                    content = encoding.decode(tokens)

                if content and self.llm_client:
                    result = self.call_llm(prompt, content, project_logger)
                    if result:
                        results.append(result)
                        project_logger.info(f"✓ 提取成功")
                    else:
                        project_logger.warning(f"提取失败: {doc_file.name}")
                else:
                    project_logger.warning(f"无法读取文档或LLM客户端未初始化: {doc_file.name}")

            # Merge the per-document extraction results.
            final_result = self.merge_results(results)

            # Even with no extractions, emit the full empty JSON skeleton so the
            # output file always has the expected shape.
            if not final_result:
                final_result = {}
                for category, category_fields in fields.items():
                    final_result[category] = {}
                    if isinstance(category_fields, dict):
                        for field_name in category_fields:
                            if isinstance(category_fields[field_name], dict):
                                final_result[category][field_name] = {}
                                for sub_field in category_fields[field_name]:
                                    final_result[category][field_name][sub_field] = None
                            else:
                                final_result[category][field_name] = None

            # Overlay fields sourced from the Excel sheet (additional_fields_data)
            # onto the merged LLM result, keyed by the matched project name.
            project_key = matched_project if matched_project else project_name
            if project_key in self.additional_fields_data:
                additional_data = self.additional_fields_data[project_key]

                # Ensure the general-form category exists before writing into it.
                if "通用表单" not in final_result:
                    final_result["通用表单"] = {}

                # These fields are authoritative in Excel and override the LLM.
                critical_fields = {"项目名称", "项目编号", "一级工程类型", "二级工程类型"}

                for field_name, field_value in additional_data.items():
                    if field_name in critical_fields:
                        # Critical field: force the Excel value over the LLM's.
                        if field_value is not None and field_value != "":
                            final_result["通用表单"][field_name] = field_value
                            project_logger.info(f"✓ 使用Excel关键字段: {field_name} = {field_value}")
                        else:
                            project_logger.warning(f"Excel中关键字段为空: {field_name}")
                    else:
                        # Non-critical field: Excel only fills gaps the LLM left.
                        if field_name not in final_result["通用表单"] or final_result["通用表单"][field_name] is None or final_result["通用表单"][field_name] == "":
                            final_result["通用表单"][field_name] = field_value
                            project_logger.info(f"✓ 从Excel提取字段: {field_name} = {field_value}")

                project_logger.info(f"✓ 成功合并")
            else:
                project_logger.warning(f"未找到项目键 '{project_key}' 在additional_fields_data中")

            # Collapse specialty categories under "通用表单".
            final_result = self.flatten_json_structure(final_result)
            project_logger.info("✓ 应用合并处理")

            # Persist the final JSON result.
            result_filename = f"{project_name}_{timestamp}.json"
            result_path = project_output_dir / result_filename

            with open(result_path, 'w', encoding='utf-8') as f:
                json.dump(final_result, f, ensure_ascii=False, indent=2)

            logger.info(f"✓ 保存结果: {result_path}")
            project_logger.info(f"✓ 保存结果: {result_path}")

            # Append a row to the output-record workbook; failure here is
            # logged but does not abort the run.
            try:
                self.append_output_record(final_result, project_name, str(result_path), str(project_output_dir), project_logger)
            except Exception as e:
                logger.error(f"追加写入输出记录表失败: {e}")
                project_logger.error(f"追加写入输出记录表失败: {e}")

            # Archive the original documents alongside the result.
            for doc_file in document_files:
                shutil.copy2(doc_file, project_output_dir / doc_file.name)
                project_logger.info(f"✓ 复制文档: {doc_file.name}")

            # Processing statistics (document_files is non-empty here, so the
            # success-rate division is safe).
            project_logger.info("处理统计:")
            project_logger.info(f" - 文档数量: {len(document_files)}")
            project_logger.info(f" - 成功提取: {len(results)}")
            project_logger.info(f" - 提取成功率: {len(results)/len(document_files)*100:.1f}%")

            # Mark the project as finished in the project log.
            project_logger.log_project_end()

            return {
                "project_name": project_name,
                "output_dir": str(project_output_dir),
                "result_file": result_filename,
                "processed_files": len(document_files),
                "extraction_success": len(results) > 0,
                "primary_type": primary_type,
                "match_score": match_score
            }

        except Exception as e:
            logger.error(f"处理项目时发生错误: {e}")
            project_logger.error(f"处理项目时发生错误: {e}")
            project_logger.log_project_end()
            return None
        finally:
            # Always release the project log's file handler.
            project_logger.close()
|
|
|
+
|
|
|
+ def process_directory_tree(self, root_path, output_dir="output_dir"):
|
|
|
+ root_path = Path(root_path)
|
|
|
+ if not root_path.exists() or not root_path.is_dir():
|
|
|
+ logger.debug(f"跳过不存在的目录: {root_path}")
|
|
|
+ return []
|
|
|
+
|
|
|
+ results = []
|
|
|
+
|
|
|
+ result = self.process_folder(root_path, output_dir)
|
|
|
+ if result:
|
|
|
+ results.append(result)
|
|
|
+
|
|
|
+ try:
|
|
|
+ for item in root_path.iterdir():
|
|
|
+ if item.is_dir():
|
|
|
+ sub_results = self.process_directory_tree(item, output_dir)
|
|
|
+ results.extend(sub_results)
|
|
|
+ except (FileNotFoundError, PermissionError):
|
|
|
+ pass
|
|
|
+
|
|
|
+ return results
|
|
|
+
|
|
|
class FolderMonitorHandler(FileSystemEventHandler):
    """Watchdog event handler: processes new folders and archives dropped into the input directory.

    New directories are run through the extraction system and then deleted;
    new archive files (.zip/.7z/.rar) are extracted, processed, and cleaned
    up the same way. A name-based `processing` set guards against handling
    the same item twice concurrently.
    """

    def __init__(self, system, output_dir, input_dir):
        # The extraction system whose process_directory_tree / extract_archive
        # methods do the actual work.
        self.system = system
        self.output_dir = output_dir
        self.input_dir = Path(input_dir)
        # Names of files/folders currently being processed (dedup guard).
        # NOTE(review): keyed by bare name, not full path — two items with the
        # same name in different subfolders would collide; confirm acceptable.
        self.processing = set()

    def on_created(self, event):
        """Watchdog callback: dispatch on whether the new entry is a folder or a file."""
        if event.is_directory:
            self.handle_new_folder(Path(event.src_path))
        else:
            self.handle_new_file(Path(event.src_path))

    def handle_new_folder(self, folder_path):
        """Process a newly created project folder, then delete it.

        Waits briefly so that in-flight file copies can settle before the
        folder tree is read. The source folder is removed after processing.
        """
        if not folder_path.exists() or folder_path.name in self.processing:
            return

        logger.info(f"新文件夹: {folder_path.name}")
        # Give the copier time to finish writing files into the new folder.
        time.sleep(2)

        # The folder may have been removed while we waited.
        if not folder_path.exists():
            return

        self.processing.add(folder_path.name)
        try:
            results = self.system.process_directory_tree(folder_path, self.output_dir)
            if results:
                logger.info(f"完成: {len(results)} 个项目")
            else:
                logger.info("未找到可处理的文档文件")
            # Destructive: removes the input folder after processing.
            self.cleanup_processed_folder(folder_path)
        finally:
            self.processing.discard(folder_path.name)

    def cleanup_processed_folder(self, folder_path):
        """Delete a processed folder, retrying with escalating fallbacks.

        Up to three attempts with a growing delay; on persistent permission
        errors it tries an OS-level delete command, and as a last resort
        renames the folder so it can be removed manually.
        """
        max_retries = 3
        for attempt in range(max_retries):
            try:
                if folder_path.exists():
                    if attempt > 0:
                        # Back off a little more on each retry.
                        time.sleep(2 * attempt)

                    # Clear read-only bits first so rmtree can succeed.
                    self._force_remove_readonly(folder_path)
                    shutil.rmtree(folder_path)
                    logger.info(f"清理: {folder_path.name}")
                    return
                else:
                    # Already gone — nothing to do.
                    return
            except PermissionError:
                if attempt == max_retries - 1:
                    logger.warning(f"权限不足,无法删除文件夹: {folder_path.name}")
                    # Fallback 1: OS-level delete command.
                    if self._try_system_delete(folder_path):
                        logger.info(f"系统命令删除成功: {folder_path.name}")
                        return
                    else:
                        # Fallback 2: rename so it can be deleted manually later.
                        try:
                            backup_name = f"{folder_path.name}_待删除_{int(time.time())}"
                            backup_path = folder_path.parent / backup_name
                            folder_path.rename(backup_path)
                            logger.info(f"已重命名为: {backup_name}")
                            return
                        except Exception:
                            logger.warning(f"无法重命名文件夹,请手动删除: {folder_path.name}")
                            return
                else:
                    continue
            except Exception as e:
                if attempt == max_retries - 1:
                    logger.error(f"清理文件夹失败 {folder_path.name}: {e}")
                else:
                    continue

    def _force_remove_readonly(self, path):
        """Best-effort: clear read-only attributes on the whole tree so deletion can proceed."""
        import stat
        try:
            for root, dirs, files in os.walk(path):
                for file in files:
                    file_path = Path(root) / file
                    try:
                        file_path.chmod(stat.S_IWRITE | stat.S_IREAD)
                    except Exception:
                        pass
                for dir in dirs:
                    dir_path = Path(root) / dir
                    try:
                        dir_path.chmod(stat.S_IWRITE | stat.S_IREAD | stat.S_IEXEC)
                    except Exception:
                        pass
            path.chmod(stat.S_IWRITE | stat.S_IREAD | stat.S_IEXEC)
        except Exception:
            # Deliberately silent: this is only a preparation step for rmtree.
            pass

    def _try_system_delete(self, folder_path):
        """Fall back to the OS delete command; return True on success.

        NOTE(review): on Windows, `rmdir` is a cmd.exe built-in — invoking it
        without shell=True may fail to launch; verify on a Windows host.
        """
        import subprocess
        try:
            if os.name == 'nt':  # Windows
                result = subprocess.run(
                    ['rmdir', '/s', '/q', str(folder_path)],
                    capture_output=True,
                    text=True,
                    timeout=30
                )
                return result.returncode == 0
            else:  # Unix/Linux
                result = subprocess.run(
                    ['rm', '-rf', str(folder_path)],
                    capture_output=True,
                    text=True,
                    timeout=30
                )
                return result.returncode == 0
        except Exception:
            return False

    def handle_new_file(self, file_path):
        """Handle a newly created file: extract and process supported archives.

        Only .zip/.7z/.rar files are acted upon; the archive is deleted after
        successful extraction and the extracted tree is processed then removed.
        """
        if file_path.name in self.processing:
            return

        archive_extensions = {'.zip', '.7z', '.rar'}
        if file_path.suffix.lower() in archive_extensions:
            logger.info(f"压缩包: {file_path.name}")
            # Give the writer time to finish the archive before reading it.
            time.sleep(1)

            self.processing.add(file_path.name)
            try:
                # Extract next to the archive, into a folder named after it.
                extract_dir = file_path.parent / file_path.stem
                extract_dir.mkdir(exist_ok=True)

                if self.system.extract_archive(file_path, extract_dir):
                    # The archive itself is no longer needed.
                    file_path.unlink()
                    time.sleep(1)
                    results = self.system.process_directory_tree(extract_dir, self.output_dir)
                    if results:
                        logger.info(f"解压完成: {len(results)} 个项目")
                    else:
                        logger.info("解压后未找到可处理的文档文件")
                    self.cleanup_processed_folder(extract_dir)
            finally:
                self.processing.discard(file_path.name)
|
|
|
+
|
|
|
def monitor_mode():
    """Watch `input_dir` for new folders/archives and process them until Ctrl+C."""
    input_dir = "input_dir"
    output_dir = "output_dir"

    # Make sure both working directories exist before watching.
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    input_path.mkdir(exist_ok=True)
    output_path.mkdir(exist_ok=True)

    # Wire the extraction system into a filesystem-event handler and start
    # watching the input tree recursively.
    system = MinRAGSystem()
    handler = FolderMonitorHandler(system, output_dir, input_dir)
    observer = Observer()
    observer.schedule(handler, str(input_path), recursive=True)
    observer.start()

    logger.info(f"监控目录: {input_path.absolute()}")
    logger.info(f"输出目录: {Path(output_dir).absolute()}")

    # Pick up folders that were already present before the watcher started.
    for entry in input_path.iterdir():
        if entry.is_dir():
            handler.handle_new_folder(entry)

    logger.info("监控中... (按 Ctrl+C 停止)")
    try:
        # Idle loop; all work happens in the observer's callbacks.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
        observer.join()
        logger.info("✅ 监控已停止")
|
|
|
+
|
|
|
def main():
    """Entry point: print the startup banner, then run the watch loop."""
    banner = (
        "合同信息提取系统",
        "输入目录: input_dir",
        "输出目录: output_dir",
        "=" * 50,
    )
    for line in banner:
        print(line)

    monitor_mode()
|
|
|
+
|
|
|
+
|
|
|
+if __name__ == "__main__":
|
|
|
+ main()
|