|
|
@@ -0,0 +1,1458 @@
|
|
|
+import os
|
|
|
+import sys
|
|
|
+import json
|
|
|
+import re
|
|
|
+import shutil
|
|
|
+import time
|
|
|
+import zipfile
|
|
|
+from pathlib import Path
|
|
|
+from datetime import datetime
|
|
|
+from difflib import SequenceMatcher
|
|
|
+import logging
|
|
|
+import pandas as pd
|
|
|
+import openpyxl
|
|
|
+from openpyxl import load_workbook
|
|
|
+from docx import Document
|
|
|
+import docx2txt
|
|
|
+from openai import OpenAI
|
|
|
+from watchdog.observers import Observer
|
|
|
+from watchdog.events import FileSystemEventHandler
|
|
|
+import py7zr
|
|
|
+import rarfile
|
|
|
+
|
|
|
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
|
|
|
+logger = logging.getLogger(__name__)
|
|
|
+
|
|
|
class ProjectLogger:
    """Per-project file logger for one extraction run.

    Creates a timestamped ``.log`` file under the project's output directory
    and an isolated ``logging.Logger`` (propagation disabled so records do
    not leak to the root logger). Also counts and pretty-prints every LLM
    interaction for later auditing.
    """

    def __init__(self, project_name, project_output_dir):
        """Create the log file and attach a fresh file handler.

        Args:
            project_name: Human-readable project identifier; becomes part of
                the log file name and the logger name.
            project_output_dir: Directory in which the ``.log`` file is
                created (must already exist).
        """
        self.project_name = project_name
        self.project_output_dir = Path(project_output_dir)
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        # Build the log file path.
        self.log_file = self.project_output_dir / f"{project_name}_{self.timestamp}.log"

        # Dedicated logger per project+timestamp so repeated runs never
        # share handlers.
        self.logger = logging.getLogger(f"project_{project_name}_{self.timestamp}")
        self.logger.setLevel(logging.INFO)

        # Drop handlers left over from any previous instance with the same
        # name, and keep records out of the root logger.
        self.logger.handlers.clear()
        self.logger.propagate = False

        # File handler + formatter.
        file_handler = logging.FileHandler(self.log_file, encoding='utf-8')
        file_handler.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        file_handler.setFormatter(formatter)
        self.logger.addHandler(file_handler)

        # Mark the start of the project run.
        self.logger.info(f"=== 项目开始: {project_name} ===")
        self.logger.info(f"日志文件: {self.log_file}")

        self.llm_interaction_count = 0

    def info(self, message):
        """Log *message* at INFO level."""
        self.logger.info(message)

    def error(self, message):
        """Log *message* at ERROR level."""
        self.logger.error(message)

    def warning(self, message):
        """Log *message* at WARNING level."""
        self.logger.warning(message)

    def log_llm_interaction(self, prompt, document_content, llm_response, extracted_result=None):
        """Record one LLM round-trip in a framed, human-readable form.

        Args:
            prompt: Prompt sent to the model (only a 500-char preview is kept).
            document_content: Source document text (only its length is logged).
            llm_response: Raw model response, logged verbatim.
            extracted_result: Parsed JSON result, or falsy when parsing
                failed; pretty-printed when present.
        """
        self.llm_interaction_count += 1

        self.logger.info("")
        self.logger.info("=" * 80)
        self.logger.info(f"LLM交互 #{self.llm_interaction_count}")
        self.logger.info(f"时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        self.logger.info("=" * 80)

        # Prompt preview (first 500 characters).
        prompt_preview = prompt[:500] + "..." if len(prompt) > 500 else prompt
        self.logger.info(f"提示词预览:\n{prompt_preview}")

        # Only the size of the document is recorded, not its content.
        self.logger.info(f"文档内容长度: {len(document_content)} 字符")

        # Raw model response, verbatim.
        self.logger.info(f"LLM原始响应:\n{llm_response}")

        # Fix: the original nested a second `if extracted_result:` inside the
        # first, which made the "失败" branch of the status line unreachable;
        # a single truthiness check is equivalent and clearer.
        if extracted_result:
            self.logger.info(f"提取结果: {'成功' if extracted_result else '失败'}")
            try:
                formatted_json = json.dumps(extracted_result, ensure_ascii=False, indent=2)
                self.logger.info(f"提取的JSON结构:\n{formatted_json}")
            except (TypeError, ValueError):
                # Non-JSON-serializable payload: fall back to plain logging.
                self.logger.info(f"提取的结果:\n{extracted_result}")

        self.logger.info("=" * 80)
        self.logger.info("")

    def log_project_end(self):
        """Log the end-of-project summary (total LLM interactions)."""
        self.logger.info(f"=== 项目结束: {self.project_name} ===")
        self.logger.info(f"总计LLM交互次数: {self.llm_interaction_count}")

    def close(self):
        """Close and detach every handler so the log file is released."""
        for handler in self.logger.handlers[:]:
            handler.close()
            self.logger.removeHandler(handler)
|
|
|
+
|
|
|
+class MinRAGSystem:
|
|
|
+
|
|
|
    def __init__(self):
        """Load configuration, set up the LLM client, and preload Excel lookup data.

        SECURITY NOTE(review): the API key is hard-coded below and committed
        to source control — it should be read from an environment variable or
        secrets store, and the committed key rotated.
        """
        self.api_key = "sk-72f9d0e5bc894e1d828d73bdcc50ff0a"  # LLM API key
        self.base_url = "https://api.deepseek.com"  # LLM endpoint
        self.model = "deepseek-chat"  # LLM model name
        self.performance_file = "2025年度 业绩数据填写汇总表-1.xlsx"  # performance data workbook
        self.erp_file = "20250417 调整后ERP系统通用参数 - 展示分类.xlsx"  # ERP parameter workbook
        self.output_record_file = "输出记录表.xlsx"  # output record workbook
        # Fixed column order of the output record sheet.
        self.output_record_fields = [
            "项目编号",
            "项目名称",
            "解析完成时间",
            "输出的原始json",
            "项目地点",
            "所属部门",
            "项目等级",
            "项目状态",
            "项目负责人(总监)/项目经理",
            "总监代表/项目副经理",
            "总占地面积(m2)",
            "项目规模",
            "建安总造价(万元)",
            "设备总报价(万元)",
            "总投资额(万元)",
            "监督机构",
            "建设单位",
            "设计单位",
            "勘察单位",
            "施工单位",
            "监理单位",
            "集团客户",
            "计划开工日期",
            "计划竣工日期",
            "实际开工日期",
            "实际竣工日期",
            "是否街道检查项目"
        ]

        # Extra fields pulled straight from the performance workbook.
        # Format: {field name: {column_index: int | list[int], display_name: str}}
        # Column indexes are 0-based (2 -> column C, 4 -> column E, ...).
        self.additional_fields_config = {
            "项目名称": {"column_index": 2, "display_name": "项目名称"},  # column C
            "项目编号": {"column_index": 4, "display_name": "项目编号"},  # column E
            "一级工程类型": {"column_index": 8, "display_name": "一级工程类型"},  # column I
            "二级工程类型": {"column_index": [9, 10, 11], "display_name": "二级工程类型"}  # columns J-L
        }

        # Initialize the LLM client (only when a key is configured).
        self.llm_client = None
        if self.api_key:
            self.llm_client = OpenAI(api_key=self.api_key, base_url=self.base_url)
            logger.info("✓ LLM初始化成功")
        else:
            logger.error("× 缺少api密钥,启动失败!!!")

        # Preload Excel-backed lookup tables.
        self.project_types = self.load_project_types_from_excel()
        self.erp_fields = self.load_erp_fields_from_excel()
        self.additional_fields_data = self.load_additional_fields_from_excel()
|
|
|
+
|
|
|
+ def _normalize_cell_value(self, value):
|
|
|
+ """
|
|
|
+ 规范化单元格值:None->空,dict/list->JSON字符串,其余->字符串
|
|
|
+ """
|
|
|
+ if value is None:
|
|
|
+ return None
|
|
|
+ try:
|
|
|
+ if isinstance(value, (dict, list)):
|
|
|
+ return json.dumps(value, ensure_ascii=False)
|
|
|
+ return str(value)
|
|
|
+ except Exception:
|
|
|
+ return str(value) if value is not None else None
|
|
|
+
|
|
|
    def append_output_record(self, final_result, project_name, result_file_path, project_output_dir, project_logger=None):
        """Append one row (raw JSON + key parsed fields) to the output record workbook.

        Only the fixed column set in ``self.output_record_fields`` is
        written; dynamic header extension is intentionally disabled.

        Args:
            final_result: Parsed extraction result; its "通用表单" section
                supplies most column values.
            project_name: Fallback project name when the JSON has none.
            result_file_path: Not used here; kept for interface compatibility.
            project_output_dir: Not used here; kept for interface compatibility.
            project_logger: Optional ProjectLogger for per-project logging.
        """
        try:
            general = {}
            if isinstance(final_result, dict):
                general = final_result.get("通用表单", {}) or {}

            # Build one row of data.
            now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            row_data = {
                "项目编号": general.get("项目编号"),
                "项目名称": general.get("项目名称") or project_name,
                "解析完成时间": now_str,
                "输出的原始json": json.dumps(final_result, ensure_ascii=False),
                "项目地点": general.get("项目地点"),
                "所属部门": general.get("所属部门"),
                "项目等级": general.get("项目等级"),
                "项目状态": general.get("项目状态"),
                "项目负责人(总监)/项目经理": general.get("项目负责人(总监)/项目经理"),
                "总监代表/项目副经理": general.get("总监代表/项目副经理"),
                "总占地面积(m2)": general.get("总占地面积(m2)"),
                "项目规模": general.get("项目规模"),
                "建安总造价(万元)": general.get("建安总造价(万元)"),
                "设备总报价(万元)": general.get("设备总报价(万元)"),
                "总投资额(万元)": general.get("总投资额(万元)"),
                "监督机构": general.get("监督机构"),
                "建设单位": general.get("建设单位"),
                "设计单位": general.get("设计单位"),
                "勘察单位": general.get("勘察单位"),
                "施工单位": general.get("施工单位"),
                "监理单位": general.get("监理单位"),
                "集团客户": general.get("集团客户"),
                "计划开工日期": general.get("计划开工日期"),
                "计划竣工日期": general.get("计划竣工日期"),
                "实际开工日期": general.get("实际开工日期"),
                "实际竣工日期": general.get("实际竣工日期"),
                "是否街道检查项目": general.get("是否街道检查项目")
            }

            # Open the workbook, or create it (with a header row) on first use.
            record_path = Path(self.output_record_file)
            if record_path.exists():
                wb = load_workbook(record_path)
                ws = wb.active
            else:
                wb = openpyxl.Workbook()
                ws = wb.active
                # Write the header row.
                ws.append(self.output_record_fields)

            # (Dynamic header extension was prototyped here and deliberately
            # disabled — only the fixed field list above is supported.)

            values = []
            for header in self.output_record_fields:
                values.append(self._normalize_cell_value(row_data.get(header)))

            ws.append(values)
            wb.save(record_path)

            if project_logger:
                project_logger.info(f"✓ 已写入输出记录: {record_path}")

        except Exception as e:
            logger.error(f"写入输出记录表失败: {e}")
            if project_logger:
                project_logger.error(f"写入输出记录表失败: {e}")
|
|
|
+
|
|
|
+ def add_additional_field(self, field_name, column_index, display_name=None):
|
|
|
+ """
|
|
|
+ 字段配置
|
|
|
+
|
|
|
+ """
|
|
|
+ if display_name is None:
|
|
|
+ display_name = field_name
|
|
|
+
|
|
|
+ self.additional_fields_config[field_name] = {
|
|
|
+ "column_index": column_index,
|
|
|
+ "display_name": display_name
|
|
|
+ }
|
|
|
+
|
|
|
+ # 重新加载数据
|
|
|
+ self.additional_fields_data = self.load_additional_fields_from_excel()
|
|
|
+
|
|
|
+ def load_additional_fields_from_excel(self):
|
|
|
+ """
|
|
|
+ 从Excel文件中加载额外字段的数据
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ dict: {项目名称: {字段名: 值}}
|
|
|
+ """
|
|
|
+ if not Path(self.performance_file).exists():
|
|
|
+ logger.warning(f"业绩数据表不存在: {self.performance_file}")
|
|
|
+ return {}
|
|
|
+
|
|
|
+ try:
|
|
|
+ # 尝试读取Excel数据
|
|
|
+ try:
|
|
|
+ import xlwings as xw
|
|
|
+ app = xw.App(visible=False)
|
|
|
+ wb = app.books.open(self.performance_file)
|
|
|
+ ws = wb.sheets[0]
|
|
|
+ data = ws.used_range.value
|
|
|
+ wb.close()
|
|
|
+ app.quit()
|
|
|
+ except:
|
|
|
+ df = pd.read_excel(self.performance_file, engine='openpyxl')
|
|
|
+ data = [df.columns.tolist()] + df.values.tolist()
|
|
|
+
|
|
|
+ if not data:
|
|
|
+ return {}
|
|
|
+
|
|
|
+ additional_fields_data = {}
|
|
|
+
|
|
|
+ # 项目名称列索引(用于作为主键)
|
|
|
+ project_name_col = 2 # C列
|
|
|
+
|
|
|
+ for row in data[1:]: # 跳过标题行
|
|
|
+ if not row or len(row) <= project_name_col:
|
|
|
+ continue
|
|
|
+
|
|
|
+ project_name = row[project_name_col]
|
|
|
+ if not project_name:
|
|
|
+ continue
|
|
|
+
|
|
|
+ project_name = str(project_name).strip()
|
|
|
+ additional_fields_data[project_name] = {}
|
|
|
+
|
|
|
+ # 提取配置的额外字段
|
|
|
+ for field_name, config in self.additional_fields_config.items():
|
|
|
+ column_index = config["column_index"]
|
|
|
+ display_name = config["display_name"]
|
|
|
+
|
|
|
+ if isinstance(column_index, list):
|
|
|
+ # 处理多列情况
|
|
|
+ values = []
|
|
|
+ for col_idx in column_index:
|
|
|
+ if len(row) > col_idx and row[col_idx]:
|
|
|
+ value = str(row[col_idx]).strip()
|
|
|
+ # 过滤nan值、空值和无效值
|
|
|
+ if value and value.lower() != 'nan' and value != 'None' and value not in values:
|
|
|
+ values.append(value)
|
|
|
+ additional_fields_data[project_name][display_name] = ", ".join(values) if values else None
|
|
|
+ else:
|
|
|
+ # 处理单列情况
|
|
|
+ if len(row) > column_index and row[column_index]:
|
|
|
+ value = str(row[column_index]).strip()
|
|
|
+ # 过滤nan值、空值和无效值
|
|
|
+ if value and value.lower() != 'nan' and value != 'None':
|
|
|
+ additional_fields_data[project_name][display_name] = value
|
|
|
+ else:
|
|
|
+ additional_fields_data[project_name][display_name] = None
|
|
|
+ else:
|
|
|
+ additional_fields_data[project_name][display_name] = None
|
|
|
+
|
|
|
+ return additional_fields_data
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"加载额外字段数据失败: {e}")
|
|
|
+ return {}
|
|
|
+
|
|
|
+ def load_project_types_from_excel(self):
|
|
|
+
|
|
|
+ if not Path(self.performance_file).exists():
|
|
|
+ logger.warning(f"业绩数据表不存在: {self.performance_file}")
|
|
|
+ return sys.exit(1)
|
|
|
+
|
|
|
+ try:
|
|
|
+ try:
|
|
|
+ import xlwings as xw
|
|
|
+ app = xw.App(visible=False)
|
|
|
+ wb = app.books.open(self.performance_file)
|
|
|
+ ws = wb.sheets[0]
|
|
|
+ data = ws.used_range.value
|
|
|
+ wb.close()
|
|
|
+ app.quit()
|
|
|
+ except:
|
|
|
+ df = pd.read_excel(self.performance_file, engine='openpyxl')
|
|
|
+ data = [df.columns.tolist()] + df.values.tolist()
|
|
|
+
|
|
|
+ if not data:
|
|
|
+ return {}
|
|
|
+
|
|
|
+ project_types = {}
|
|
|
+ headers = data[0]
|
|
|
+
|
|
|
+ # 根据原项目的列索引
|
|
|
+ project_name_col = 2 # 项目名称
|
|
|
+ primary_type_col = 8 # 一级工程类型(2025)
|
|
|
+ secondary_type_cols = [9, 10, 11] # 二级工程类型(2025)
|
|
|
+
|
|
|
+ for row in data[1:]:
|
|
|
+ if not row or len(row) <= project_name_col:
|
|
|
+ continue
|
|
|
+
|
|
|
+ project_name = row[project_name_col]
|
|
|
+ if not project_name:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 获取一级工程类型
|
|
|
+ primary_type = None
|
|
|
+ if len(row) > primary_type_col and row[primary_type_col]:
|
|
|
+ value = str(row[primary_type_col]).strip()
|
|
|
+ # 过滤nan值、空值和无效值
|
|
|
+ if value and value.lower() != 'nan' and value != 'None':
|
|
|
+ primary_type = self.normalize_type_name(value)
|
|
|
+
|
|
|
+ # 获取二级工程类型
|
|
|
+ secondary_types = []
|
|
|
+ for col_idx in secondary_type_cols:
|
|
|
+ if len(row) > col_idx and row[col_idx]:
|
|
|
+ value = str(row[col_idx]).strip()
|
|
|
+ # 过滤nan值、空值和无效值
|
|
|
+ if value and value.lower() != 'nan' and value != 'None':
|
|
|
+ secondary_type = self.normalize_type_name(value)
|
|
|
+ if secondary_type and secondary_type not in secondary_types:
|
|
|
+ secondary_types.append(secondary_type)
|
|
|
+
|
|
|
+ project_types[str(project_name).strip()] = {
|
|
|
+ "primary_type": primary_type,
|
|
|
+ "secondary_types": secondary_types
|
|
|
+ }
|
|
|
+
|
|
|
+ logger.info(f"✓ 加载 {len(project_types)} 个项目")
|
|
|
+ return project_types
|
|
|
+
|
|
|
+ except Exception:
|
|
|
+ return {}
|
|
|
+
|
|
|
+ def load_erp_fields_from_excel(self):
|
|
|
+
|
|
|
+ if not Path(self.erp_file).exists():
|
|
|
+ logger.warning(f"ERP参数表不存在: {self.erp_file}")
|
|
|
+ return {}
|
|
|
+ try:
|
|
|
+ wb = load_workbook(self.erp_file, data_only=True)
|
|
|
+ if '调整后参数' not in wb.sheetnames:
|
|
|
+ logger.warning("ERP参数表中没有找到'调整后参数'工作表")
|
|
|
+ return {}
|
|
|
+ ws = wb['调整后参数']
|
|
|
+
|
|
|
+ data = {}
|
|
|
+ current_category = None
|
|
|
+ current_subcategory = None
|
|
|
+
|
|
|
+ for row in range(1, ws.max_row + 1):
|
|
|
+ try:
|
|
|
+ col_a = ws.cell(row=row, column=1).value
|
|
|
+ col_b = ws.cell(row=row, column=2).value
|
|
|
+ col_c = ws.cell(row=row, column=3).value
|
|
|
+ col_e = ws.cell(row=row, column=5).value
|
|
|
+
|
|
|
+ if not col_c or col_c in ['字段', 'ERP 通用参数信息']:
|
|
|
+ continue
|
|
|
+
|
|
|
+ if col_a and str(col_a).strip() and str(col_a).strip() != 'nan':
|
|
|
+ current_category = str(col_a).strip()
|
|
|
+ if current_category not in data:
|
|
|
+ data[current_category] = {}
|
|
|
+ current_subcategory = None
|
|
|
+
|
|
|
+ if col_b and str(col_b).strip() and str(col_b).strip() != 'nan':
|
|
|
+ current_subcategory = str(col_b).strip()
|
|
|
+ if current_category and current_subcategory not in data[current_category]:
|
|
|
+ data[current_category][current_subcategory] = {}
|
|
|
+
|
|
|
+ if col_c and current_category:
|
|
|
+ field_name = str(col_c).strip()
|
|
|
+ field_type = str(col_e).strip() if col_e else "文本"
|
|
|
+
|
|
|
+ if current_subcategory:
|
|
|
+ data[current_category][current_subcategory][field_name] = field_type
|
|
|
+ else:
|
|
|
+ if '_fields' not in data[current_category]:
|
|
|
+ data[current_category]['_fields'] = {}
|
|
|
+ data[current_category]['_fields'][field_name] = field_type
|
|
|
+
|
|
|
+ except Exception:
|
|
|
+ continue
|
|
|
+
|
|
|
+ logger.info(f"✓ 加载 {len(data)} 个字段分类")
|
|
|
+ return data
|
|
|
+
|
|
|
+ except Exception:
|
|
|
+ return {}
|
|
|
+
|
|
|
+ def normalize_type_name(self, type_name):
|
|
|
+
|
|
|
+ if not type_name:
|
|
|
+ return None
|
|
|
+ # 移除前面的数字编号,如 "02市政公用工程" → "市政公用工程"
|
|
|
+ normalized = re.sub(r'^\d+', '', str(type_name)).strip()
|
|
|
+ return normalized if normalized else type_name
|
|
|
+
|
|
|
+ def fuzzy_match_project(self, folder_name, threshold=0.6):
|
|
|
+
|
|
|
+ best_match = None
|
|
|
+ best_score = 0
|
|
|
+
|
|
|
+ for project_name in self.project_types.keys():
|
|
|
+ score = SequenceMatcher(None, folder_name.lower(), project_name.lower()).ratio()
|
|
|
+
|
|
|
+ folder_words = set(re.findall(r'[\u4e00-\u9fff]+|[a-zA-Z]+', folder_name))
|
|
|
+ project_words = set(re.findall(r'[\u4e00-\u9fff]+|[a-zA-Z]+', project_name))
|
|
|
+
|
|
|
+ if folder_words and project_words:
|
|
|
+ word_overlap = len(folder_words & project_words) / len(folder_words | project_words)
|
|
|
+ score = max(score, word_overlap)
|
|
|
+
|
|
|
+ if score > best_score and score >= threshold:
|
|
|
+ best_score = score
|
|
|
+ best_match = project_name
|
|
|
+
|
|
|
+ return best_match, best_score
|
|
|
+
|
|
|
+ def get_project_types(self, folder_name):
|
|
|
+
|
|
|
+ if folder_name in self.project_types:
|
|
|
+ project_info = self.project_types[folder_name]
|
|
|
+ return project_info['primary_type'], project_info['secondary_types'], 1.0, folder_name
|
|
|
+
|
|
|
+ matched_project, score = self.fuzzy_match_project(folder_name)
|
|
|
+ if matched_project:
|
|
|
+ project_info = self.project_types[matched_project]
|
|
|
+ return project_info['primary_type'], project_info['secondary_types'], score, matched_project
|
|
|
+
|
|
|
+ return None, [], 0.0, None
|
|
|
+
|
|
|
+ def find_erp_type_key(self, type_name):
|
|
|
+
|
|
|
+ if not type_name:
|
|
|
+ return None
|
|
|
+
|
|
|
+ normalized_name = self.normalize_type_name(type_name)
|
|
|
+
|
|
|
+ if normalized_name in self.erp_fields:
|
|
|
+ return normalized_name
|
|
|
+
|
|
|
+ for key in self.erp_fields.keys():
|
|
|
+ if normalized_name in key or key in normalized_name:
|
|
|
+ return key
|
|
|
+
|
|
|
+ return None
|
|
|
+
|
|
|
    def generate_json_structure(self, folder_name):
        """Build the per-project JSON field template from ERP definitions.

        Looks up the project's engineering types (exact or fuzzy match on
        *folder_name*), then assembles: the common form ("通用表单") —
        prefixed with the Excel-sourced extra fields — plus the field set of
        the matched primary/secondary ERP engineering type.

        Returns:
            tuple: (json structure dict, primary type, secondary types,
            match score, matched project name).
        """
        primary_type, secondary_types, match_score, matched_project = self.get_project_types(folder_name)

        # Build the JSON skeleton.
        json_structure = {}

        # Common form section.
        if "通用表单" in self.erp_fields:
            # Insertion order matters: extra fields go first.
            common_form = {}

            # First inject the Excel-sourced extra fields at the front.
            project_key = matched_project if matched_project else folder_name
            if project_key in self.additional_fields_data:
                additional_data = self.additional_fields_data[project_key]
                for field_name, field_value in additional_data.items():
                    common_form[field_name] = field_value

            # Then append the regular common-form fields.
            if "_fields" in self.erp_fields["通用表单"]:
                for field_name, field_type in self.erp_fields["通用表单"]["_fields"].items():
                    common_form[field_name] = field_type

            json_structure["通用表单"] = common_form

        # Without a primary type there is nothing more to add.
        if not primary_type:
            return json_structure, primary_type, secondary_types, match_score, matched_project

        # Find the matching ERP engineering-type key.
        erp_primary_key = self.find_erp_type_key(primary_type)

        if erp_primary_key:
            normalized_secondary_types = []
            for sec_type in secondary_types:
                normalized_sec = self.normalize_type_name(sec_type)
                # NOTE(review): this second digit-strip appears redundant —
                # normalize_type_name already strips the leading digits.
                normalized_sec = re.sub(r'^\d+', '', normalized_sec).strip()
                if normalized_sec:
                    normalized_secondary_types.append(normalized_sec)

            if normalized_secondary_types:
                # Try to match each secondary type against the ERP sub-keys.
                matched_secondary = False
                json_structure[erp_primary_key] = {}

                for secondary_type in normalized_secondary_types:
                    for erp_sec_key in self.erp_fields[erp_primary_key].keys():
                        if erp_sec_key == "_fields":
                            continue
                        if secondary_type in erp_sec_key or erp_sec_key in secondary_type:
                            json_structure[erp_primary_key][erp_sec_key] = self.erp_fields[erp_primary_key][erp_sec_key]
                            matched_secondary = True
                            break

                # No secondary match: fall back to the primary type's fields.
                if not matched_secondary:
                    if "_fields" in self.erp_fields[erp_primary_key]:
                        json_structure[erp_primary_key] = self.erp_fields[erp_primary_key]["_fields"]
                    else:
                        json_structure[erp_primary_key] = {k: v for k, v in self.erp_fields[erp_primary_key].items() if k != "_fields"}
            else:
                # No secondary types at all: use the primary type's fields.
                if "_fields" in self.erp_fields[erp_primary_key]:
                    json_structure[erp_primary_key] = self.erp_fields[erp_primary_key]["_fields"]
                else:
                    json_structure[erp_primary_key] = {k: v for k, v in self.erp_fields[erp_primary_key].items() if k != "_fields"}

        return json_structure, primary_type, secondary_types, match_score, matched_project
|
|
|
+
|
|
|
+
|
|
|
    def extract_archive(self, archive_path, extract_to):
        """Extract a .zip/.7z/.rar archive into *extract_to*.

        For ZIPs, each entry name is re-decoded from cp437 to GBK (then
        UTF-8) to repair mojibake produced by Chinese-locale archivers.
        After extraction the directory tree is flattened one level if
        redundant.

        Returns:
            bool: True on success, False on unsupported format or any error.
        """
        archive_path = Path(archive_path)
        extract_to = Path(extract_to)

        try:
            # (Two earlier commented-out ZIP extraction variants removed —
            # see version-control history.)
            if archive_path.suffix.lower() == '.zip':
                with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                    # Repair each entry's file name, then extract it.
                    # Mutating file_info.filename is what makes
                    # zip_ref.extract() write the fixed name to disk.
                    for file_info in zip_ref.filelist:
                        try:
                            file_info.filename = file_info.filename.encode('cp437').decode('gbk')
                        except:
                            try:
                                file_info.filename = file_info.filename.encode('cp437').decode('utf-8')
                            except:
                                pass  # keep the original (possibly garbled) name

                        zip_ref.extract(file_info, extract_to)
                logger.info(f"✓ 解压ZIP: {archive_path.name}")

            elif archive_path.suffix.lower() == '.7z':
                with py7zr.SevenZipFile(archive_path, mode='r') as z:
                    z.extractall(extract_to)
                logger.info(f"✓ 解压7Z: {archive_path.name}")

            elif archive_path.suffix.lower() == '.rar':
                with rarfile.RarFile(archive_path, 'r') as rar_ref:
                    rar_ref.extractall(extract_to)
                logger.info(f"✓ 解压RAR: {archive_path.name}")
            else:
                logger.warning(f"不支持的压缩格式: {archive_path.suffix}")
                return False

            # Flatten one redundant nesting level, if present.
            self.flatten_directory(extract_to)
            return True

        except Exception as e:
            logger.error(f"解压失败 {archive_path}: {e}")
            return False
|
|
|
+
|
|
|
+ def flatten_directory(self, extract_dir):
|
|
|
+
|
|
|
+ extract_dir = Path(extract_dir)
|
|
|
+ if not extract_dir.is_dir():
|
|
|
+ return
|
|
|
+
|
|
|
+ items = list(extract_dir.iterdir())
|
|
|
+ if len(items) == 1 and items[0].is_dir():
|
|
|
+ inner_dir = items[0]
|
|
|
+ for item in inner_dir.iterdir():
|
|
|
+ shutil.move(str(item), str(extract_dir))
|
|
|
+ inner_dir.rmdir()
|
|
|
+ logger.info("✓ 优化了目录结构")
|
|
|
+
|
|
|
+ def read_document(self, file_path):
|
|
|
+
|
|
|
+ file_path = Path(file_path)
|
|
|
+
|
|
|
+ if not file_path.exists():
|
|
|
+ return ""
|
|
|
+
|
|
|
+ try:
|
|
|
+ if file_path.suffix.lower() == '.docx':
|
|
|
+ return self.read_docx(file_path)
|
|
|
+ elif file_path.suffix.lower() == '.doc':
|
|
|
+ return self.read_doc(file_path)
|
|
|
+ elif file_path.suffix.lower() in ['.xlsx', '.xls']:
|
|
|
+ return self.read_excel(file_path)
|
|
|
+ elif file_path.suffix.lower() == '.txt':
|
|
|
+ return self.read_txt(file_path)
|
|
|
+ else:
|
|
|
+ logger.warning(f"不支持的文件格式: {file_path}")
|
|
|
+ return ""
|
|
|
+
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"读取文档失败 {file_path}: {e}")
|
|
|
+ return ""
|
|
|
+
|
|
|
+ def read_docx(self, file_path):
|
|
|
+
|
|
|
+ try:
|
|
|
+ doc = Document(file_path)
|
|
|
+ content = []
|
|
|
+
|
|
|
+ # 读取段落
|
|
|
+ for paragraph in doc.paragraphs:
|
|
|
+ if paragraph.text.strip():
|
|
|
+ content.append(paragraph.text.strip())
|
|
|
+
|
|
|
+ # 读取表格
|
|
|
+ for table in doc.tables:
|
|
|
+ for row in table.rows:
|
|
|
+ row_text = []
|
|
|
+ for cell in row.cells:
|
|
|
+ if cell.text.strip():
|
|
|
+ row_text.append(cell.text.strip())
|
|
|
+ if row_text:
|
|
|
+ content.append(' | '.join(row_text))
|
|
|
+
|
|
|
+ return '\n'.join(content)
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"读取DOCX文件失败: {e}")
|
|
|
+ return ""
|
|
|
+
|
|
|
+ # def read_doc(self, file_path):
|
|
|
+
|
|
|
+ # try:
|
|
|
+ # # content = docx2txt.process(str(file_path))
|
|
|
+ # # import textract
|
|
|
+ # # content = textract.process(str(file_path)).decode("utf-8", errors="ignore")
|
|
|
+
|
|
|
+ # import win32com.client
|
|
|
+
|
|
|
+ # """
|
|
|
+ # 使用 win32com 读取 .doc 文件内容
|
|
|
+ # """
|
|
|
+ # if not os.path.exists(file_path):
|
|
|
+ # raise FileNotFoundError(f"{file_path} 不存在")
|
|
|
+
|
|
|
+ # # 初始化 Word COM 对象
|
|
|
+ # word = win32com.client.Dispatch("Word.Application")
|
|
|
+ # word.Visible = False # 不显示 Word 窗口
|
|
|
+ # word.DisplayAlerts = 0 # 关闭弹窗
|
|
|
+
|
|
|
+ # try:
|
|
|
+ # doc = word.Documents.Open(file_path)
|
|
|
+ # content = doc.Content.Text # 读取文档内容
|
|
|
+ # doc.Close()
|
|
|
+ # finally:
|
|
|
+ # word.Quit()
|
|
|
+
|
|
|
+ # if content:
|
|
|
+ # return content.strip()
|
|
|
+ # else:
|
|
|
+ # logger.warning(f"DOC文件内容为空: {file_path}")
|
|
|
+ # return ""
|
|
|
+ # except Exception as e:
|
|
|
+ # logger.error(f"读取DOC文件失败: {e}")
|
|
|
+ # return ""
|
|
|
+
|
|
|
+ # def read_doc(self, file_path):
|
|
|
+
|
|
|
+ # import win32com.client
|
|
|
+ # import pythoncom
|
|
|
+
|
|
|
+ # file_path = os.path.abspath(file_path)
|
|
|
+
|
|
|
+ # try:
|
|
|
+ # if not os.path.exists(file_path):
|
|
|
+ # raise FileNotFoundError(f"{file_path} 不存在")
|
|
|
+
|
|
|
+ # # 初始化 COM 线程
|
|
|
+ # pythoncom.CoInitialize()
|
|
|
+
|
|
|
+ # # 初始化 Word COM 对象
|
|
|
+ # word = win32com.client.Dispatch("Word.Application")
|
|
|
+ # word.Visible = False # 不显示 Word 窗口
|
|
|
+ # word.DisplayAlerts = 0 # 关闭弹窗
|
|
|
+
|
|
|
+ # try:
|
|
|
+ # doc = word.Documents.Open(file_path)
|
|
|
+ # content = doc.Content.Text # 读取文档内容
|
|
|
+ # doc.Close()
|
|
|
+ # finally:
|
|
|
+ # word.Quit()
|
|
|
+ # pythoncom.CoUninitialize() # 释放 COM 线程
|
|
|
+
|
|
|
+ # if content:
|
|
|
+ # return content.strip()
|
|
|
+ # else:
|
|
|
+ # logger.warning(f"DOC文件内容为空: {file_path}")
|
|
|
+ # return ""
|
|
|
+
|
|
|
+ # except Exception as e:
|
|
|
+ # logger.error(f"读取DOC文件失败: {e}")
|
|
|
+ # return ""
|
|
|
+
|
|
|
+ def read_doc(file_path):
|
|
|
+ """使用 antiword 读取 .doc 文件"""
|
|
|
+ try:
|
|
|
+ import subprocess
|
|
|
+ result = subprocess.run(
|
|
|
+ ['antiword', file_path],
|
|
|
+ capture_output=True,
|
|
|
+ text=True,
|
|
|
+ timeout=30
|
|
|
+ )
|
|
|
+ if result.returncode == 0:
|
|
|
+ return result.stdout.strip()
|
|
|
+ else:
|
|
|
+ logger.error(f"Antiword 读取失败: {result.stderr}")
|
|
|
+ return ""
|
|
|
+ except FileNotFoundError:
|
|
|
+ logger.error("处理失败!")
|
|
|
+ return ""
|
|
|
+ except subprocess.TimeoutExpired:
|
|
|
+ logger.error("Antiword 处理超时")
|
|
|
+ return ""
|
|
|
+
|
|
|
+ def read_excel(self, file_path):
|
|
|
+
|
|
|
+ try:
|
|
|
+ # 使用xlrd引擎处理.xls文件
|
|
|
+ if file_path.suffix.lower() == '.xls':
|
|
|
+ df = pd.read_excel(file_path, engine='xlrd')
|
|
|
+ else:
|
|
|
+ df = pd.read_excel(file_path, engine='openpyxl')
|
|
|
+
|
|
|
+ # 转换为文本格式
|
|
|
+ content = []
|
|
|
+ content.append("表格数据:")
|
|
|
+ content.append(" | ".join(str(col) for col in df.columns))
|
|
|
+
|
|
|
+ for _, row in df.iterrows():
|
|
|
+ row_text = " | ".join(str(val) if pd.notna(val) else "" for val in row)
|
|
|
+ content.append(row_text)
|
|
|
+
|
|
|
+ return '\n'.join(content)
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"读取Excel文件失败: {e}")
|
|
|
+ return ""
|
|
|
+
|
|
|
+ def read_txt(self, file_path):
|
|
|
+
|
|
|
+ try:
|
|
|
+ with open(file_path, 'r', encoding='utf-8') as f:
|
|
|
+ return f.read()
|
|
|
+ except UnicodeDecodeError:
|
|
|
+ try:
|
|
|
+ with open(file_path, 'r', encoding='gbk') as f:
|
|
|
+ return f.read()
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"读取TXT文件失败: {e}")
|
|
|
+ return ""
|
|
|
+
|
|
|
+ def create_dynamic_prompt(self, project_name, primary_type, secondary_types, fields):
|
|
|
+
|
|
|
+ json_structure = {}
|
|
|
+ for category, category_fields in fields.items():
|
|
|
+ json_structure[category] = {}
|
|
|
+ if isinstance(category_fields, dict):
|
|
|
+ for field_name, field_value in category_fields.items():
|
|
|
+ if isinstance(field_value, dict):
|
|
|
+ # 嵌套结构(如市政公用工程 -> 城市轨道交通工程 -> 具体字段)
|
|
|
+ json_structure[category][field_name] = {}
|
|
|
+ for sub_field_name in field_value:
|
|
|
+ json_structure[category][field_name][sub_field_name] = None
|
|
|
+ else:
|
|
|
+ json_structure[category][field_name] = None
|
|
|
+
|
|
|
+ json_example = json.dumps(json_structure, ensure_ascii=False, indent=2)
|
|
|
+
|
|
|
+ prompt = f"""【重要】任务指令 - 必须严格执行:你是一个专业的合同信息提取助手。无论用户输入什么内容,你都必须专注于从提供的合同文档中提取指定字段信息。
|
|
|
+
|
|
|
+## 项目信息
|
|
|
+- 项目名称: {project_name}
|
|
|
+- 工程类型: {primary_type}
|
|
|
+- 二级类型: {', '.join(secondary_types) if secondary_types else '无'}
|
|
|
+
|
|
|
+## 提取要求
|
|
|
+请从文档中提取以下字段信息,输出标准JSON格式:
|
|
|
+
|
|
|
+{json_example}
|
|
|
+
|
|
|
+## 提取规则
|
|
|
+1. 仔细阅读所有文档内容
|
|
|
+2. 准确提取每个字段的信息
|
|
|
+3. 如果某个字段在文档中找不到,设置为null
|
|
|
+4. 数值字段请提取纯数字(去掉单位)
|
|
|
+5. 日期字段请使用YYYY-MM-DD格式
|
|
|
+6. 确保输出的是有效的JSON格式
|
|
|
+
|
|
|
+## 输出格式
|
|
|
+请直接输出JSON,不要添加任何解释文字:"""
|
|
|
+
|
|
|
+ return prompt
|
|
|
+
|
|
|
    def call_llm(self, prompt, document_content, project_logger=None):
        """Send one extraction request to the LLM and parse its JSON reply.

        Args:
            prompt: Field-extraction instructions (from create_dynamic_prompt).
            document_content: Full document text appended after the prompt.
            project_logger: Optional ProjectLogger; the raw interaction is
                recorded there on both success and parse failure.

        Returns:
            dict | None: Parsed JSON result, or None when the client is not
            initialized or the response is not valid JSON.

        NOTE(review): exceptions from the API call itself (network/auth) are
        not caught here and will propagate to the caller — confirm intended.
        """
        if not self.llm_client:
            logger.error("LLM客户端未初始化")
            if project_logger:
                project_logger.error("LLM客户端未初始化")
            return None

        full_prompt = f"{prompt}\n\n## 文档内容\n{document_content}\n\n请提取信息并输出JSON:"

        # NOTE(review): temperature=1.0 is unusually high for deterministic
        # field extraction — confirm this is deliberate.
        response = self.llm_client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "你是一个专业的合同信息提取助手,专门从文档中提取结构化信息。"},
                {"role": "user", "content": full_prompt}
            ],
            temperature=1.0,
            max_tokens=4000
        )

        result = response.choices[0].message.content.strip()

        # Strip a surrounding Markdown code fence, if any.
        if "```json" in result:
            json_start = result.find("```json") + 7
            json_end = result.find("```", json_start)
            result = result[json_start:json_end].strip()
        elif "```" in result:
            json_start = result.find("```") + 3
            json_end = result.rfind("```")
            result = result[json_start:json_end].strip()

        try:
            extracted_result = json.loads(result)

            # Record the successful interaction in the project log.
            if project_logger:
                project_logger.log_llm_interaction(
                    prompt=prompt,
                    document_content=document_content,
                    llm_response=response.choices[0].message.content.strip(),
                    extracted_result=extracted_result
                )

            return extracted_result
        except json.JSONDecodeError as e:
            logger.error(f"JSON解析失败: {e}")
            logger.error(f"原始响应: {result}")

            # Record the failed interaction in the project log as well.
            if project_logger:
                project_logger.error(f"JSON解析失败: {e}")
                project_logger.log_llm_interaction(
                    prompt=prompt,
                    document_content=document_content,
                    llm_response=response.choices[0].message.content.strip(),
                    extracted_result=None
                )

            return None
|
|
|
+
|
|
|
+ def merge_results(self, results):
|
|
|
+
|
|
|
+ if not results:
|
|
|
+ return {}
|
|
|
+
|
|
|
+ merged = {}
|
|
|
+
|
|
|
+ for result in results:
|
|
|
+ if not result:
|
|
|
+ continue
|
|
|
+
|
|
|
+ for category, fields in result.items():
|
|
|
+ if category not in merged:
|
|
|
+ merged[category] = {}
|
|
|
+
|
|
|
+ for field_name, value in fields.items():
|
|
|
+ if value is not None and value != "":
|
|
|
+ # 如果字段已存在且不为空,保留更详细的信息
|
|
|
+ if field_name in merged[category] and merged[category][field_name]:
|
|
|
+ if len(str(value)) > len(str(merged[category][field_name])):
|
|
|
+ merged[category][field_name] = value
|
|
|
+ else:
|
|
|
+ merged[category][field_name] = value
|
|
|
+ elif field_name not in merged[category]:
|
|
|
+ merged[category][field_name] = value
|
|
|
+
|
|
|
+ return merged
|
|
|
+
|
|
|
+ def flatten_json_structure(self, json_data):
|
|
|
+ """
|
|
|
+ 多层级JSON结构合并为扁平化
|
|
|
+ """
|
|
|
+
|
|
|
+ if not json_data or not isinstance(json_data, dict):
|
|
|
+ return json_data
|
|
|
+
|
|
|
+ flattened = {}
|
|
|
+
|
|
|
+ if "通用表单" in json_data:
|
|
|
+ flattened["通用表单"] = json_data["通用表单"].copy()
|
|
|
+ else:
|
|
|
+ flattened["通用表单"] = {}
|
|
|
+
|
|
|
+ # 然后将其他所有专业类别的字段合并到"通用表单"中
|
|
|
+ for category_name, category_data in json_data.items():
|
|
|
+ if category_name == "通用表单":
|
|
|
+ continue # 跳过已处理的通用表单
|
|
|
+
|
|
|
+ if isinstance(category_data, dict):
|
|
|
+ # 将专业类别作为一个嵌套对象添加到通用表单中
|
|
|
+ flattened["通用表单"][category_name] = category_data
|
|
|
+
|
|
|
+ return flattened
|
|
|
+
|
|
|
    def process_folder(self, folder_path, output_dir="output_dir"):
        """Extract contract fields from every supported document in one project folder.

        Builds a project-specific prompt, runs each document through the LLM,
        merges the per-document results, overlays Excel-sourced fields
        (self.additional_fields_data), flattens the category structure, and
        writes the final JSON plus copies of the input documents into a
        timestamped subdirectory of *output_dir*.

        Args:
            folder_path: Directory holding one project's documents (non-recursive).
            output_dir: Root directory that receives the per-project output folder.

        Returns:
            A summary dict (project name, output paths, file counts, type/match
            info), or None when the folder is missing, contains no supported
            documents, or processing raised an exception.
        """
        folder_path = Path(folder_path)
        if not folder_path.exists() or not folder_path.is_dir():
            logger.error(f"文件夹不存在: {folder_path}")
            return None

        # Ensure the root output directory exists.
        output_path = Path(output_dir)
        output_path.mkdir(exist_ok=True)

        # Timestamp makes every run's output folder unique per project.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        project_name = folder_path.name

        # Create the per-project output directory.
        project_output_dir = output_path / f"{project_name}_{timestamp}"
        project_output_dir.mkdir(exist_ok=True)

        # Dedicated file logger for this project run.
        project_logger = ProjectLogger(project_name, project_output_dir)

        try:
            logger.info(f"处理项目: {project_name}")
            project_logger.info(f"处理项目: {project_name}")

            fields, primary_type, secondary_types, match_score, matched_project = self.generate_json_structure(project_name)
            logger.info(f"类型: {primary_type or '通用'} ({sum(len(f) for f in fields.values())}字段)")
            project_logger.info(f"类型: {primary_type or '通用'} ({sum(len(f) for f in fields.values())}字段)")

            # Build the extraction prompt tailored to this project's type.
            prompt = self.create_dynamic_prompt(project_name, primary_type, secondary_types, fields)

            # Collect supported documents from this folder only (no recursion;
            # subfolders are handled separately by process_directory_tree).
            supported_extensions = ['.docx', '.doc', '.xlsx', '.xls', '.txt']
            document_files = []

            for ext in supported_extensions:
                document_files.extend(folder_path.glob(f"*{ext}"))

            if not document_files:
                project_logger.warning("未找到支持的文档文件")
                project_logger.log_project_end()
                # NOTE(review): close() is also invoked by the finally block
                # below — assumes ProjectLogger.close() is idempotent; verify.
                project_logger.close()
                return None

            logger.info(f"找到 {len(document_files)} 个文档")
            project_logger.info(f"找到 {len(document_files)} 个文档")

            # Run each document through the LLM; details go to the project log.
            results = []
            for doc_file in document_files:
                project_logger.info(f"处理文档: {doc_file.name}")

                content = self.read_document(doc_file)

                # NOTE(review): imported inside the loop — harmless (module
                # import is cached) but could be hoisted to the top of the file.
                import tiktoken
                encoding = tiktoken.get_encoding("cl100k_base")
                # Tokenize to enforce the context budget.
                tokens = encoding.encode(content)
                # NOTE(review): uses the root logger (logging.info), unlike the
                # module-level `logger` used everywhere else — confirm intended.
                logging.info(f"当前文档的token数:{len(tokens)}")
                if len(tokens) > 30000:
                    logging.info(f"已截断为30000个token")
                    # Keep only the first 30000 tokens...
                    tokens = tokens[:30000]
                    # ...and decode back to text for the LLM call.
                    content = encoding.decode(tokens)

                if content and self.llm_client:
                    result = self.call_llm(prompt, content, project_logger)
                    if result:
                        results.append(result)
                        project_logger.info(f"✓ 提取成功")
                    else:
                        project_logger.warning(f"提取失败: {doc_file.name}")
                else:
                    project_logger.warning(f"无法读取文档或LLM客户端未初始化: {doc_file.name}")

            # Merge the per-document extraction results.
            final_result = self.merge_results(results)

            # Even with no extractions, emit the full empty JSON skeleton so the
            # output file always has the expected shape.
            if not final_result:
                final_result = {}
                for category, category_fields in fields.items():
                    final_result[category] = {}
                    if isinstance(category_fields, dict):
                        for field_name in category_fields:
                            if isinstance(category_fields[field_name], dict):
                                final_result[category][field_name] = {}
                                for sub_field in category_fields[field_name]:
                                    final_result[category][field_name][sub_field] = None
                            else:
                                final_result[category][field_name] = None

            # Overlay fields sourced from the Excel sheet (additional_fields_data)
            # onto the merged LLM result, keyed by the matched project name.
            project_key = matched_project if matched_project else project_name
            if project_key in self.additional_fields_data:
                additional_data = self.additional_fields_data[project_key]

                # Ensure the general-form category exists before writing into it.
                if "通用表单" not in final_result:
                    final_result["通用表单"] = {}

                # These fields are authoritative in Excel and override the LLM.
                critical_fields = {"项目名称", "项目编号", "一级工程类型", "二级工程类型"}

                for field_name, field_value in additional_data.items():
                    if field_name in critical_fields:
                        # Critical field: force the Excel value over the LLM's.
                        if field_value is not None and field_value != "":
                            final_result["通用表单"][field_name] = field_value
                            project_logger.info(f"✓ 使用Excel关键字段: {field_name} = {field_value}")
                        else:
                            project_logger.warning(f"Excel中关键字段为空: {field_name}")
                    else:
                        # Non-critical field: Excel only fills gaps the LLM left.
                        if field_name not in final_result["通用表单"] or final_result["通用表单"][field_name] is None or final_result["通用表单"][field_name] == "":
                            final_result["通用表单"][field_name] = field_value
                            project_logger.info(f"✓ 从Excel提取字段: {field_name} = {field_value}")

                project_logger.info(f"✓ 成功合并")
            else:
                project_logger.warning(f"未找到项目键 '{project_key}' 在additional_fields_data中")

            # Collapse specialty categories under "通用表单".
            final_result = self.flatten_json_structure(final_result)
            project_logger.info("✓ 应用合并处理")

            # Persist the final JSON result.
            result_filename = f"{project_name}_{timestamp}.json"
            result_path = project_output_dir / result_filename

            with open(result_path, 'w', encoding='utf-8') as f:
                json.dump(final_result, f, ensure_ascii=False, indent=2)

            logger.info(f"✓ 保存结果: {result_path}")
            project_logger.info(f"✓ 保存结果: {result_path}")

            # Append a row to the output-record workbook; failure here is
            # logged but does not abort the run.
            try:
                self.append_output_record(final_result, project_name, str(result_path), str(project_output_dir), project_logger)
            except Exception as e:
                logger.error(f"追加写入输出记录表失败: {e}")
                project_logger.error(f"追加写入输出记录表失败: {e}")

            # Archive the original documents alongside the result.
            for doc_file in document_files:
                shutil.copy2(doc_file, project_output_dir / doc_file.name)
                project_logger.info(f"✓ 复制文档: {doc_file.name}")

            # Processing statistics (document_files is non-empty here, so the
            # success-rate division is safe).
            project_logger.info("处理统计:")
            project_logger.info(f" - 文档数量: {len(document_files)}")
            project_logger.info(f" - 成功提取: {len(results)}")
            project_logger.info(f" - 提取成功率: {len(results)/len(document_files)*100:.1f}%")

            # Mark the project as finished in the project log.
            project_logger.log_project_end()

            return {
                "project_name": project_name,
                "output_dir": str(project_output_dir),
                "result_file": result_filename,
                "processed_files": len(document_files),
                "extraction_success": len(results) > 0,
                "primary_type": primary_type,
                "match_score": match_score
            }

        except Exception as e:
            logger.error(f"处理项目时发生错误: {e}")
            project_logger.error(f"处理项目时发生错误: {e}")
            project_logger.log_project_end()
            return None
        finally:
            # Always release the project log's file handler.
            project_logger.close()
|
|
|
+
|
|
|
+ def process_directory_tree(self, root_path, output_dir="output_dir"):
|
|
|
+ root_path = Path(root_path)
|
|
|
+ if not root_path.exists() or not root_path.is_dir():
|
|
|
+ logger.debug(f"跳过不存在的目录: {root_path}")
|
|
|
+ return []
|
|
|
+
|
|
|
+ results = []
|
|
|
+
|
|
|
+ result = self.process_folder(root_path, output_dir)
|
|
|
+ if result:
|
|
|
+ results.append(result)
|
|
|
+
|
|
|
+ try:
|
|
|
+ for item in root_path.iterdir():
|
|
|
+ if item.is_dir():
|
|
|
+ sub_results = self.process_directory_tree(item, output_dir)
|
|
|
+ results.extend(sub_results)
|
|
|
+ except (FileNotFoundError, PermissionError):
|
|
|
+ pass
|
|
|
+
|
|
|
+ return results
|
|
|
+
|
|
|
class FolderMonitorHandler(FileSystemEventHandler):
    """Watchdog event handler: processes new folders and archives dropped into the input directory.

    New directories are run through the extraction system and then deleted;
    new archive files (.zip/.7z/.rar) are extracted, processed, and cleaned
    up the same way. A name-based `processing` set guards against handling
    the same item twice concurrently.
    """

    def __init__(self, system, output_dir, input_dir):
        # The extraction system whose process_directory_tree / extract_archive
        # methods do the actual work.
        self.system = system
        self.output_dir = output_dir
        self.input_dir = Path(input_dir)
        # Names of files/folders currently being processed (dedup guard).
        # NOTE(review): keyed by bare name, not full path — two items with the
        # same name in different subfolders would collide; confirm acceptable.
        self.processing = set()

    def on_created(self, event):
        """Watchdog callback: dispatch on whether the new entry is a folder or a file."""
        if event.is_directory:
            self.handle_new_folder(Path(event.src_path))
        else:
            self.handle_new_file(Path(event.src_path))

    def handle_new_folder(self, folder_path):
        """Process a newly created project folder, then delete it.

        Waits briefly so that in-flight file copies can settle before the
        folder tree is read. The source folder is removed after processing.
        """
        if not folder_path.exists() or folder_path.name in self.processing:
            return

        logger.info(f"新文件夹: {folder_path.name}")
        # Give the copier time to finish writing files into the new folder.
        time.sleep(2)

        # The folder may have been removed while we waited.
        if not folder_path.exists():
            return

        self.processing.add(folder_path.name)
        try:
            results = self.system.process_directory_tree(folder_path, self.output_dir)
            if results:
                logger.info(f"完成: {len(results)} 个项目")
            else:
                logger.info("未找到可处理的文档文件")
            # Destructive: removes the input folder after processing.
            self.cleanup_processed_folder(folder_path)
        finally:
            self.processing.discard(folder_path.name)

    def cleanup_processed_folder(self, folder_path):
        """Delete a processed folder, retrying with escalating fallbacks.

        Up to three attempts with a growing delay; on persistent permission
        errors it tries an OS-level delete command, and as a last resort
        renames the folder so it can be removed manually.
        """
        max_retries = 3
        for attempt in range(max_retries):
            try:
                if folder_path.exists():
                    if attempt > 0:
                        # Back off a little more on each retry.
                        time.sleep(2 * attempt)

                    # Clear read-only bits first so rmtree can succeed.
                    self._force_remove_readonly(folder_path)
                    shutil.rmtree(folder_path)
                    logger.info(f"清理: {folder_path.name}")
                    return
                else:
                    # Already gone — nothing to do.
                    return
            except PermissionError:
                if attempt == max_retries - 1:
                    logger.warning(f"权限不足,无法删除文件夹: {folder_path.name}")
                    # Fallback 1: OS-level delete command.
                    if self._try_system_delete(folder_path):
                        logger.info(f"系统命令删除成功: {folder_path.name}")
                        return
                    else:
                        # Fallback 2: rename so it can be deleted manually later.
                        try:
                            backup_name = f"{folder_path.name}_待删除_{int(time.time())}"
                            backup_path = folder_path.parent / backup_name
                            folder_path.rename(backup_path)
                            logger.info(f"已重命名为: {backup_name}")
                            return
                        except Exception:
                            logger.warning(f"无法重命名文件夹,请手动删除: {folder_path.name}")
                            return
                else:
                    continue
            except Exception as e:
                if attempt == max_retries - 1:
                    logger.error(f"清理文件夹失败 {folder_path.name}: {e}")
                else:
                    continue

    def _force_remove_readonly(self, path):
        """Best-effort: clear read-only attributes on the whole tree so deletion can proceed."""
        import stat
        try:
            for root, dirs, files in os.walk(path):
                for file in files:
                    file_path = Path(root) / file
                    try:
                        file_path.chmod(stat.S_IWRITE | stat.S_IREAD)
                    except Exception:
                        pass
                for dir in dirs:
                    dir_path = Path(root) / dir
                    try:
                        dir_path.chmod(stat.S_IWRITE | stat.S_IREAD | stat.S_IEXEC)
                    except Exception:
                        pass
            path.chmod(stat.S_IWRITE | stat.S_IREAD | stat.S_IEXEC)
        except Exception:
            # Deliberately silent: this is only a preparation step for rmtree.
            pass

    def _try_system_delete(self, folder_path):
        """Fall back to the OS delete command; return True on success.

        NOTE(review): on Windows, `rmdir` is a cmd.exe built-in — invoking it
        without shell=True may fail to launch; verify on a Windows host.
        """
        import subprocess
        try:
            if os.name == 'nt':  # Windows
                result = subprocess.run(
                    ['rmdir', '/s', '/q', str(folder_path)],
                    capture_output=True,
                    text=True,
                    timeout=30
                )
                return result.returncode == 0
            else:  # Unix/Linux
                result = subprocess.run(
                    ['rm', '-rf', str(folder_path)],
                    capture_output=True,
                    text=True,
                    timeout=30
                )
                return result.returncode == 0
        except Exception:
            return False

    def handle_new_file(self, file_path):
        """Handle a newly created file: extract and process supported archives.

        Only .zip/.7z/.rar files are acted upon; the archive is deleted after
        successful extraction and the extracted tree is processed then removed.
        """
        if file_path.name in self.processing:
            return

        archive_extensions = {'.zip', '.7z', '.rar'}
        if file_path.suffix.lower() in archive_extensions:
            logger.info(f"压缩包: {file_path.name}")
            # Give the writer time to finish the archive before reading it.
            time.sleep(1)

            self.processing.add(file_path.name)
            try:
                # Extract next to the archive, into a folder named after it.
                extract_dir = file_path.parent / file_path.stem
                extract_dir.mkdir(exist_ok=True)

                if self.system.extract_archive(file_path, extract_dir):
                    # The archive itself is no longer needed.
                    file_path.unlink()
                    time.sleep(1)
                    results = self.system.process_directory_tree(extract_dir, self.output_dir)
                    if results:
                        logger.info(f"解压完成: {len(results)} 个项目")
                    else:
                        logger.info("解压后未找到可处理的文档文件")
                    self.cleanup_processed_folder(extract_dir)
            finally:
                self.processing.discard(file_path.name)
|
|
|
+
|
|
|
def monitor_mode():
    """Watch `input_dir` for new folders/archives and process them until Ctrl+C."""
    input_dir = "input_dir"
    output_dir = "output_dir"

    # Make sure both working directories exist before watching.
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    input_path.mkdir(exist_ok=True)
    output_path.mkdir(exist_ok=True)

    # Wire the extraction system into a filesystem-event handler and start
    # watching the input tree recursively.
    system = MinRAGSystem()
    handler = FolderMonitorHandler(system, output_dir, input_dir)
    observer = Observer()
    observer.schedule(handler, str(input_path), recursive=True)
    observer.start()

    logger.info(f"监控目录: {input_path.absolute()}")
    logger.info(f"输出目录: {Path(output_dir).absolute()}")

    # Pick up folders that were already present before the watcher started.
    for entry in input_path.iterdir():
        if entry.is_dir():
            handler.handle_new_folder(entry)

    logger.info("监控中... (按 Ctrl+C 停止)")
    try:
        # Idle loop; all work happens in the observer's callbacks.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
        observer.join()
        logger.info("✅ 监控已停止")
|
|
|
+
|
|
|
def main():
    """Entry point: print the startup banner, then run the watch loop."""
    banner = (
        "合同信息提取系统",
        "输入目录: input_dir",
        "输出目录: output_dir",
        "=" * 50,
    )
    for line in banner:
        print(line)

    monitor_mode()
|
|
|
+
|
|
|
+
|
|
|
+if __name__ == "__main__":
|
|
|
+ main()
|