# rag_evaluation_service.py
  1. import os
  2. import json
  3. import math
  4. import uuid
  5. import httpx
  6. import pandas as pd
  7. import requests
  8. from typing import List, Dict, Any
  9. from datasets import Dataset
  10. from ragas import evaluate, aevaluate
  11. from ragas.metrics import (
  12. # faithfulness,
  13. # answer_correctness,
  14. # answer_relevancy,
  15. # context_precision,
  16. # context_recall,
  17. Faithfulness,
  18. AnswerCorrectness,
  19. AnswerRelevancy,
  20. ContextPrecision,
  21. ContextRecall,
  22. NoiseSensitivity
  23. )
  24. from ragas.llms import LangchainLLMWrapper
  25. from ragas.embeddings import LangchainEmbeddingsWrapper
  26. from langchain_openai import ChatOpenAI, OpenAIEmbeddings
  27. from rag.chat_message import ChatRetrieverRag
  28. from utils.get_logger import setup_logger
  29. from utils.upload_file_to_oss import UploadMinio
  30. from rag.db import MysqlOperate
logger = setup_logger(__name__)

# Directory for temporary evaluation artifacts; created eagerly at import time.
TMP_EVAL_DIR = "./tmp_file/evaluation/"
os.makedirs(TMP_EVAL_DIR, exist_ok=True)

# Metric/column name -> Chinese label, used when rendering the Excel report.
english_map_chinese = {
    "faithfulness": "忠实性",
    "answer_correctness": "答案正确性",
    "answer_relevancy": "答案相关性",
    "context_precision": "上下文精度",
    "context_recall": "上下文召回率",
    "noise_sensitivity(mode=relevant)": "噪声敏感性",
    "user_input": "用户问题",
    "reference": "标准答案",
    "response": "模型回答",
    "retrieved_contexts": "上下文"
}
  47. def save_evaluation_to_excel(eval_result: Dict[str, Any], output_path: str = None, tenant_id: str = "000000") -> Dict[str, Any]:
  48. """
  49. 将评估结果保存为Excel文件,上传至MinIO,并记录到数据库
  50. Args:
  51. eval_result: 评估结果字典,包含metrics和details
  52. output_path: 输出文件路径,如果为None则自动生成
  53. tenant_id: 租户ID,默认为"000000"
  54. Returns:
  55. 包含文件路径和oss_id的字典
  56. """
  57. if output_path is None:
  58. timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
  59. output_path = os.path.join(TMP_EVAL_DIR, f"evaluation_result_{timestamp}.xlsx")
  60. metrics = eval_result.get("metrics", {})
  61. details = eval_result.get("details", [])
  62. # logger.info(details)
  63. # 创建DataFrame
  64. df = pd.DataFrame(details)
  65. df.rename(columns=english_map_chinese, inplace=True)
  66. # 创建Excel writer
  67. with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
  68. # 写入详细数据
  69. df.to_excel(writer, sheet_name='评估详情', index=False)
  70. # 获取worksheet以添加汇总数据
  71. worksheet = writer.sheets['评估详情']
  72. # 计算汇总数据的起始列(在数据右侧,间隔3列)
  73. summary_start_col = len(df.columns) + 4
  74. # 写入汇总标题
  75. # worksheet.cell(row=1, column=summary_start_col, value="指标汇总")
  76. worksheet.cell(row=2, column=summary_start_col, value="指标汇总")
  77. worksheet.cell(row=2, column=summary_start_col + 1, value="平均值")
  78. # 写入各指标的平均值
  79. row_idx = 3
  80. total_sum = 0
  81. valid_count = 0
  82. for metric_name, metric_value in metrics.items():
  83. worksheet.cell(row=row_idx, column=summary_start_col, value=english_map_chinese.get(metric_name, metric_name))
  84. if metric_value is not None:
  85. worksheet.cell(row=row_idx, column=summary_start_col + 1, value=metric_value)
  86. total_sum += metric_value
  87. valid_count += 1
  88. else:
  89. worksheet.cell(row=row_idx, column=summary_start_col + 1, value="N/A")
  90. valid_count += 1
  91. row_idx += 1
  92. # 添加总平均值
  93. if valid_count > 0:
  94. total_avg = round(total_sum / valid_count, 2)
  95. worksheet.cell(row=row_idx + 1, column=summary_start_col, value="总平均值")
  96. worksheet.cell(row=row_idx + 1, column=summary_start_col + 1, value=total_avg)
  97. logger.info(f"评估结果已保存到: {output_path}")
  98. # 上传文件到MinIO并记录数据库
  99. oss_id = None
  100. try:
  101. from config import minio_config
  102. # 获取文件名和扩展名
  103. file_name = os.path.basename(output_path)
  104. file_extension = os.path.splitext(file_name)[1]
  105. file_size = os.path.getsize(output_path)
  106. # 生成MinIO中的文件路径
  107. minio_file_path = f"evaluation/{file_name}"
  108. # 上传到MinIO
  109. upload_client = UploadMinio()
  110. upload_success = upload_client.upload_file(output_path, minio_file_path)
  111. if upload_success:
  112. logger.info(f"文件已上传至MinIO: {minio_file_path}")
  113. # 构建完整的MinIO访问URL
  114. minio_url = minio_config.get("minio_url")
  115. minio_bucket = minio_config.get("minio_bucket")
  116. full_url = f"{minio_url}/{minio_bucket}/{minio_file_path}"
  117. # 记录到数据库
  118. mysql_client = MysqlOperate()
  119. success, oss_id = mysql_client.insert_oss_record(
  120. file_name=file_name,
  121. url=full_url,
  122. tenant_id=tenant_id,
  123. file_extension=file_extension,
  124. file_size=file_size
  125. )
  126. if success:
  127. logger.info(f"OSS记录已插入数据库,oss_id: {oss_id}, URL: {full_url}")
  128. else:
  129. logger.error(f"插入OSS记录失败: {oss_id}")
  130. else:
  131. logger.error("文件上传至MinIO失败")
  132. except Exception as e:
  133. logger.error(f"上传文件或记录数据库时出错: {e}")
  134. return {
  135. "file_path": output_path,
  136. "oss_id": oss_id,
  137. "full_url": full_url
  138. }
  139. async def download_file(url: str) -> str:
  140. """从网络下载文件到临时目录,返回本地路径"""
  141. # 从 URL 中提取文件扩展名
  142. url_path = url.split('?')[0] # 移除查询参数
  143. file_ext = os.path.splitext(url_path)[1].lower()
  144. # 如果没有扩展名或不是支持的格式,默认为 .json
  145. if file_ext not in ['.json', '.xlsx', '.xls']:
  146. file_ext = '.json'
  147. file_name = f"{uuid.uuid4()}{file_ext}"
  148. local_path = os.path.join(TMP_EVAL_DIR, file_name)
  149. async with httpx.AsyncClient(timeout=60) as client:
  150. response = await client.get(url)
  151. response.raise_for_status()
  152. with open(local_path, "wb") as f:
  153. f.write(response.content)
  154. logger.info(f"文件下载成功: {url} -> {local_path}")
  155. return local_path
  156. def _clean_float_values(obj):
  157. """清理非法浮点数值(NaN, Infinity)为 None"""
  158. if isinstance(obj, float):
  159. if math.isnan(obj) or math.isinf(obj):
  160. return None
  161. return obj
  162. elif isinstance(obj, dict):
  163. return {k: _clean_float_values(v) for k, v in obj.items()}
  164. elif isinstance(obj, list):
  165. return [_clean_float_values(item) for item in obj]
  166. return obj
  167. def parse_excel_to_qa_data(file_path: str) -> List[Dict[str, str]]:
  168. """
  169. 解析 Excel 文件为问答数据列表
  170. Excel 格式要求:
  171. - 第一行为表头:question 和 ground_truth
  172. - 后续每行为一条问答数据
  173. Args:
  174. file_path: Excel 文件路径
  175. Returns:
  176. 问答数据列表,格式为 [{"question": "...", "ground_truth": "..."}, ...]
  177. """
  178. try:
  179. # 读取 Excel 文件
  180. df = pd.read_excel(file_path)
  181. # 检查必需的列
  182. # required_columns = ['question', 'ground_truth']
  183. required_columns = ['问题', '答案']
  184. missing_columns = [col for col in required_columns if col not in df.columns]
  185. if missing_columns:
  186. raise ValueError(f"Excel 文件缺少必需的列: {missing_columns}")
  187. # 转换为字典列表,并清理空值
  188. qa_data = []
  189. for _, row in df.iterrows():
  190. # question = str(row['question']).strip() if pd.notna(row['question']) else ""
  191. # ground_truth = str(row['ground_truth']).strip() if pd.notna(row['ground_truth']) else ""
  192. question = str(row['问题']).strip() if pd.notna(row['问题']) else ""
  193. ground_truth = str(row['答案']).strip() if pd.notna(row['答案']) else ""
  194. # 跳过空问题
  195. if not question:
  196. continue
  197. qa_data.append({
  198. "question": question,
  199. "ground_truth": ground_truth
  200. })
  201. logger.info(f"从 Excel 文件解析出 {len(qa_data)} 条有效数据")
  202. return qa_data
  203. except Exception as e:
  204. logger.error(f"解析 Excel 文件失败: {e}")
  205. raise ValueError(f"Excel 文件解析失败: {str(e)}")
  206. class RAGEvaluationService:
  207. """RAG评估服务"""
  208. def __init__(
  209. self,
  210. knowledge_ids: List[str],
  211. embedding_id: str = "e5",
  212. temperature: float = 0.6,
  213. top_p: float = 0.7,
  214. max_tokens: int = 4096,
  215. model: str = "Qwen3-Coder-30B-loft",
  216. slice_count: int = 5,
  217. prompt: str | None = None,
  218. rerank_model_name: str = "Qwen3-Reranker-0.6B"
  219. ):
  220. self.knowledge_ids = knowledge_ids
  221. self.embedding_id = embedding_id
  222. self.temperature = temperature
  223. self.top_p = top_p
  224. self.max_tokens = max_tokens
  225. if model == "Qwen3-30B":
  226. model = "/opt/vllm/models/Qwen/Qwen3-30B-A3B-Instruct-2507"
  227. self.model = model
  228. self.slice_count = slice_count
  229. self.rag_config = {
  230. "knowledgeIds": knowledge_ids,
  231. "embeddingId": embedding_id,
  232. "sliceCount": slice_count,
  233. "knowledgeInfo": json.dumps({
  234. "recall_method": "mixed",
  235. "rerank_status": True,
  236. "rerank_model_name": rerank_model_name
  237. }),
  238. "temperature": temperature,
  239. "topP": top_p,
  240. "maxToken": max_tokens,
  241. "enable_think": False,
  242. "prompt": prompt if prompt else """你是一位知识检索助手,你必须并且只能从我发送的众多知识片段中寻找能够解决用户输入问题的最优答案,并且在执行任务的过程中严格执行规定的要求。\n\n知识片段如下:\n{知识}\n\n规定要求:\n- 找到答案就仅使用知识片段中的原文回答用户的提问;\n- 找不到答案就用自身知识并且告诉用户该信息不是来自文档;\n- 所引用的文本片段中所包含的示意图占位符必须进行返回,占位符格式参考:【示意图序号_编号】\n - 严禁输出任何知识片段中不存在的示意图占位符;\n - 输出的内容必须删除其中包含的任何图注、序号等信息。例如:"进入登录页面(图1.1)"需要从文字中删除图序,回复效果为:"进入登录页面";"如图所示1.1",回复效果为:"如图所示";\n- 格式规范\n - 文档中会出现包含表格的情况,表格是以图片标识符的形式呈现,表格中缺失数据时候返回空单元格;\n - 如果需要用到表格中的数据,以markdown格式输出表格中的数据;\n - 避免使用代码块语法回复信息;\n - 回复的开头语不要输出诸如:"我想","我认为","think"等相关语义的文本。\n\n严格执行规定要求,不要复述问题,直接开始回答。\n\n用户输入问题:\n{用户}"""
  243. }
  244. async def get_rag_response(self, question: str) -> Dict[str, Any]:
  245. """通过 RAG 系统获取答案和上下文"""
  246. chat_json = self.rag_config.copy()
  247. chat_json["query"] = question
  248. rag_retriever = ChatRetrieverRag(chat_json=chat_json)
  249. retriever_result_list, search_doc_id_to_knowledge_id_dict = await rag_retriever.retriever_result(chat_json)
  250. logger.info(f"检索到的上下文:{retriever_result_list}")
  251. chunk_content, knowledge_info_dict = rag_retriever.parse_retriever_list(
  252. retriever_result_list,
  253. search_doc_id_to_knowledge_id_dict
  254. )
  255. contexts = [item["content"] for item in retriever_result_list]
  256. answer = ""
  257. try:
  258. async for event in rag_retriever.generate_rag_response(chat_json, chunk_content):
  259. if event.get("event") == "add":
  260. answer = event.get("data", "")
  261. elif event.get("event") == "finish":
  262. pass
  263. except GeneratorExit:
  264. pass
  265. return {
  266. "answer": answer,
  267. "contexts": contexts,
  268. "retriever_count": len(retriever_result_list)
  269. }
  270. async def process_qa_data(self, qa_data: List[Dict[str, str]]) -> List[Dict[str, Any]]:
  271. """处理问答数据,通过 RAG 获取 contexts 和 answer"""
  272. full_data = []
  273. for idx, item in enumerate(qa_data):
  274. # 处理键名可能带空格的情况
  275. clean_item = {k.strip(): v for k, v in item.items()}
  276. question = clean_item.get("question", "").strip()
  277. ground_truth = clean_item.get("ground_truth", "").strip()
  278. logger.info(f"处理第 {idx + 1}/{len(qa_data)} 个问题: {question[:50]}...")
  279. try:
  280. rag_result = await self.get_rag_response(question)
  281. full_data.append({
  282. "question": question,
  283. "contexts": rag_result["contexts"],
  284. "answer": rag_result["answer"],
  285. "ground_truth": ground_truth,
  286. })
  287. logger.info(f"检索到 {rag_result['retriever_count']} 个上下文")
  288. except Exception as e:
  289. logger.error(f"处理失败: {e}")
  290. full_data.append({
  291. "question": question,
  292. "contexts": [],
  293. "answer": "",
  294. "ground_truth": ground_truth,
  295. })
  296. return full_data
  297. async def run_evaluation(self, dataset: Dataset) -> Dict[str, Any]:
  298. """执行 RAGAS 评估"""
  299. from config import model_name_vllm_url_dict
  300. from rag.load_model import bge_m3_base_url, bge_me_model, qwen_ed_base_url, qwen_ed_model
  301. embedding_mapping = {
  302. "bge-m3": (bge_m3_base_url, bge_me_model),
  303. "Qwen3-Embedding-0.6B": (qwen_ed_base_url, qwen_ed_model)
  304. }
  305. # 配置 VLLM 和 Embedding 服务
  306. VLLM_LLM_BASE = model_name_vllm_url_dict.get(self.model)
  307. VLLM_LLM_KEY = "vllm-dummy-key"
  308. VLLM_EMBEDDING_BASE, VLLM_EMBEDDING_MODEL = embedding_mapping.get(self.embedding_id)
  309. # 初始化 RAGAS 评估使用的模型
  310. vllm_generator = ChatOpenAI(
  311. model=self.model,
  312. base_url=VLLM_LLM_BASE,
  313. api_key=VLLM_LLM_KEY,
  314. temperature=0.1,
  315. max_tokens=51200,
  316. timeout=3000,
  317. request_timeout=3000,
  318. max_retries=5,
  319. )
  320. vllm_embeddings = OpenAIEmbeddings(
  321. model=VLLM_EMBEDDING_MODEL,
  322. base_url=VLLM_EMBEDDING_BASE,
  323. api_key=VLLM_LLM_KEY,
  324. )
  325. # 包装为 RAGAS 兼容的格式
  326. ragas_llm = LangchainLLMWrapper(vllm_generator)
  327. ragas_embeddings = LangchainEmbeddingsWrapper(vllm_embeddings)
  328. # 评估指标
  329. # metrics_to_run = [
  330. # faithfulness,
  331. # answer_correctness,
  332. # answer_relevancy,
  333. # context_precision,
  334. # context_recall,
  335. # NoiseSensitivity(llm=ragas_llm)
  336. # ]
  337. metrics_to_run = [
  338. Faithfulness(llm=ragas_llm),
  339. AnswerCorrectness(llm=ragas_llm, embeddings=ragas_embeddings),
  340. AnswerRelevancy(llm=ragas_llm, embeddings=ragas_embeddings),
  341. ContextPrecision(llm=ragas_llm),
  342. ContextRecall(llm=ragas_llm),
  343. NoiseSensitivity(llm=ragas_llm),
  344. ]
  345. logger.info("开始 RAGAS 评估...")
  346. final_metrics = list(metrics_to_run)
  347. if 'ground_truth' not in dataset.column_names:
  348. logger.warning("缺少 'ground_truth',将跳过 'context_recall' 指标")
  349. final_metrics.remove(context_recall)
  350. logger.info(f"使用指标: {[m.name for m in final_metrics]}, 数据集大小: {len(dataset)} 条")
  351. try:
  352. # result = evaluate(
  353. # dataset=dataset,
  354. # metrics=final_metrics,
  355. # llm=ragas_llm,
  356. # embeddings=ragas_embeddings,
  357. # raise_exceptions=False,
  358. # show_progress=True,
  359. # )
  360. result = await aevaluate(
  361. dataset=dataset,
  362. metrics=final_metrics,
  363. llm=ragas_llm,
  364. embeddings=ragas_embeddings,
  365. raise_exceptions=False,
  366. show_progress=True,
  367. )
  368. except Exception as e:
  369. logger.error(f"评估过程中出现错误: {e}")
  370. raise
  371. result_df = result.to_pandas()
  372. result_df = result_df.replace([float('inf'), float('-inf')], None)
  373. result_df = result_df.where(pd.notna(result_df), None)
  374. metric_names = [m.name for m in final_metrics]
  375. summary = {}
  376. # 遍历所有数值列,计算平均值
  377. for col in result_df.columns:
  378. # 跳过非数值列
  379. if result_df[col].dtype not in ['float64', 'float32', 'int64', 'int32']:
  380. # result_df[col] = 0
  381. continue
  382. try:
  383. if col == "noise_sensitivity(mode=relevant)" and result_df[col] is not None:
  384. result_df[col] = 1.00 - result_df[col]
  385. # mean_val = result_df[col].mean(skipna=False)
  386. mean_val = result_df[col].fillna(0).mean(skipna=True)
  387. if pd.notna(mean_val) and math.isfinite(mean_val):
  388. summary[col] = round(float(mean_val),2)
  389. else:
  390. summary[col] = None
  391. except Exception:
  392. summary[col] = None
  393. logger.info("评估完成!")
  394. eval_result = {
  395. "metrics": _clean_float_values(summary),
  396. "details": _clean_float_values(result_df.to_dict(orient="records")),
  397. "metric_names": metric_names,
  398. "count": len(result_df),
  399. }
  400. # 保存评估结果到Excel并上传至MinIO
  401. try:
  402. save_result = save_evaluation_to_excel(eval_result)
  403. # eval_result["excel_path"] = save_result.get("file_path")
  404. eval_result["oss_id"] = save_result.get("oss_id", "")
  405. eval_result["full_url"] = save_result.get("full_url", "")
  406. except Exception as e:
  407. logger.error(f"保存Excel文件或上传失败: {e}")
  408. return eval_result
  409. # def send_post_request_sync(self, url, json_data=None, headers=None, timeout=30):
  410. # """
  411. # 同步发送POST请求
  412. # 参数:
  413. # url: 请求的URL地址
  414. # json_data: JSON格式的请求体数据(字典类型)
  415. # headers: 可选的请求头字典
  416. # timeout: 请求超时时间(秒),默认30秒
  417. # 返回:
  418. # dict: 包含状态码、响应数据等信息
  419. # """
  420. # try:
  421. # # 设置默认的Content-Type为application/json
  422. # if headers is None:
  423. # headers = {}
  424. # if 'Content-Type' not in headers:
  425. # headers['Content-Type'] = 'application/json'
  426. # resp = requests.post(url, json=json_data, headers=headers, timeout=timeout, verify=False)
  427. # status_code = resp.status_code
  428. # response_data = resp.text
  429. # logger.info(f"同步POST请求成功 [url={url}]: {response_data} json_data={json_data}")
  430. # return {
  431. # "code": 200,
  432. # "status_code": status_code,
  433. # "data": response_data,
  434. # "message": "POST请求成功"
  435. # }
  436. # except requests.exceptions.Timeout as e:
  437. # logger.error(f"同步POST请求超时 [url={url}]: {e}")
  438. # return {
  439. # "code": 500,
  440. # "message": f"POST请求超时: {str(e)}"
  441. # }
  442. # except requests.exceptions.RequestException as e:
  443. # logger.error(f"同步POST请求失败 [url={url}]: {e}")
  444. # return {
  445. # "code": 500,
  446. # "message": f"POST请求失败: {str(e)}"
  447. # }
  448. def send_put_request_sync(url, json_data=None, headers=None, timeout=30):
  449. """
  450. 同步发送PUT请求
  451. 参数:
  452. url: 请求的URL地址
  453. json_data: JSON格式的请求体数据(字典类型)
  454. headers: 可选的请求头字典
  455. timeout: 请求超时时间(秒),默认30秒
  456. 返回:
  457. dict: 包含状态码、响应数据等信息
  458. """
  459. try:
  460. # 设置默认的Content-Type为application/json
  461. if headers is None:
  462. headers = {}
  463. if 'Content-Type' not in headers:
  464. headers['Content-Type'] = 'application/json'
  465. resp = requests.put(url, json=json_data, headers=headers, timeout=timeout, verify=False)
  466. status_code = resp.status_code
  467. response_data = resp.text
  468. logger.info(f"同步PUT请求成功 [url={url}]: {response_data} json_data={json_data}")
  469. return {
  470. "code": 200,
  471. "status_code": status_code,
  472. "data": response_data,
  473. "message": "PUT请求成功"
  474. }
  475. except requests.exceptions.Timeout as e:
  476. logger.error(f"同步PUT请求超时 [url={url}]: {e}")
  477. return {
  478. "code": 500,
  479. "message": f"PUT请求超时: {str(e)}"
  480. }
  481. except requests.exceptions.RequestException as e:
  482. logger.error(f"同步PUT请求失败 [url={url}]: {e}")
  483. return {
  484. "code": 500,
  485. "message": f"PUT请求失败: {str(e)}"
  486. }
  487. async def run_rag_evaluation(
  488. file_url: str,
  489. knowledge_ids: List[str],
  490. embedding_id: str = "e5",
  491. temperature: float = 0.6,
  492. top_p: float = 0.7,
  493. max_tokens: int = 4096,
  494. model: str = "Qwen3-Coder-30B-loft",
  495. slice_count: int = 5,
  496. task_id = None,
  497. tenant_id = None,
  498. prompt = None,
  499. rerank_model_name = None,
  500. ) -> Dict[str, Any]:
  501. """
  502. 执行RAG评估的主入口函数
  503. Args:
  504. file_url: 网络文件路径(支持 JSON 或 Excel 格式)
  505. knowledge_ids: 知识库ID列表
  506. embedding_id: 嵌入模型ID
  507. temperature: 温度参数
  508. top_p: top_p参数
  509. max_tokens: 最大token数
  510. model: 模型名称
  511. slice_count: 检索切片数量
  512. Returns:
  513. 包含knowledge_ids和评估指标的字典
  514. """
  515. local_path = None
  516. try:
  517. # 1. 下载文件
  518. local_path = await download_file(file_url)
  519. # 2. 根据文件扩展名解析数据
  520. file_ext = os.path.splitext(local_path)[1].lower()
  521. if file_ext in ['.xlsx', '.xls']:
  522. # 解析 Excel 文件
  523. logger.info(f"检测到 Excel 文件格式: {file_ext}")
  524. qa_data = parse_excel_to_qa_data(local_path)
  525. elif file_ext == '.json':
  526. # 解析 JSON 文件
  527. logger.info("检测到 JSON 文件格式")
  528. with open(local_path, "r", encoding="utf-8") as f:
  529. qa_data = json.load(f)
  530. else:
  531. raise ValueError(f"不支持的文件格式: {file_ext},仅支持 .json, .xlsx, .xls")
  532. if not isinstance(qa_data, list) or not qa_data:
  533. raise ValueError("数据必须是非空数组格式")
  534. # 3. 初始化评估服务
  535. service = RAGEvaluationService(
  536. knowledge_ids=knowledge_ids,
  537. embedding_id=embedding_id,
  538. temperature=temperature,
  539. top_p=top_p,
  540. max_tokens=max_tokens,
  541. model=model,
  542. slice_count=slice_count,
  543. prompt=prompt,
  544. rerank_model_name=rerank_model_name
  545. )
  546. # 4. 处理问答数据
  547. logger.info(f"开始处理 {len(qa_data)} 条数据...")
  548. full_data = await service.process_qa_data(qa_data)
  549. # 5. 创建 Dataset
  550. dataset = Dataset.from_dict({
  551. "question": [item["question"] for item in full_data],
  552. "contexts": [item["contexts"] for item in full_data],
  553. "answer": [item["answer"] for item in full_data],
  554. "ground_truth": [item["ground_truth"] for item in full_data],
  555. })
  556. # 6. 执行评估
  557. eval_result = await service.run_evaluation(dataset)
  558. logger.info(f"knowledge_ids: {knowledge_ids}:\n评估结果: {eval_result}")
  559. metrics = eval_result.get("metrics", {})
  560. metrics = {k: (v if v is not None else 0) for k, v in metrics.items()}
  561. # 得分(取所有指标的均值)
  562. avg_score = round(sum(metrics.values()) / len(metrics), 2) if metrics else 0
  563. send_put_request_sync(url="http://10.1.14.17:9080/deepseek/evaluation/editEvaluationTaskByPython",
  564. json_data={"taskId":task_id,
  565. "tenantId":tenant_id,
  566. "status":"2",
  567. "resultStore":avg_score,
  568. "answerLoyalty":eval_result["metrics"].get("faithfulness",0),
  569. "correctAnswer":eval_result["metrics"].get("answer_correctness",0),
  570. "answerRelevance":eval_result["metrics"].get("answer_relevancy",0),
  571. "noiseSensitivity":eval_result["metrics"].get("noise_sensitivity(mode=relevant)",0),
  572. "contextRecall":eval_result["metrics"].get("context_recall",0),
  573. "contextRelevance":eval_result["metrics"].get("context_precision",0),
  574. "ossId": eval_result["oss_id"],
  575. "fullUrl": eval_result["full_url"]})
  576. return {
  577. "code": 200,
  578. "message": "评估完成",
  579. "data": {
  580. "knowledge_ids": knowledge_ids,
  581. **eval_result
  582. }
  583. }
  584. except httpx.HTTPError as e:
  585. logger.error(f"文件下载失败: {e}")
  586. send_put_request_sync(url="http://10.1.14.17:9080/deepseek/evaluation/editEvaluationTaskByPython",json_data={"taskId":task_id,"tenantId":tenant_id,"status":"3"})
  587. return {"code": 400, "message": f"文件下载失败: {str(e)}"}
  588. except json.JSONDecodeError as e:
  589. logger.error(f"JSON解析失败: {e}")
  590. send_put_request_sync(url="http://10.1.14.17:9080/deepseek/evaluation/editEvaluationTaskByPython",json_data={"taskId":task_id,"tenantId":tenant_id,"status":"3"})
  591. return {"code": 400, "message": f"JSON解析失败: {str(e)}"}
  592. except Exception as e:
  593. logger.error(f"评估失败: {e}")
  594. send_put_request_sync(url="http://10.1.14.17:9080/deepseek/evaluation/editEvaluationTaskByPython",json_data={"taskId":task_id,"tenantId":tenant_id,"status":"3"})
  595. return {"code": 500, "message": f"评估失败: {str(e)}"}
  596. # finally:
  597. # # 清理临时文件
  598. # if local_path and os.path.exists(local_path):
  599. # try:
  600. # os.remove(local_path)
  601. # except Exception:
  602. # pass