"""Evaluate a RAG pipeline with RAGAS against OpenAI-compatible vLLM endpoints.

The script loads question/answer/contexts records from a JSON file, runs a
set of RAGAS metrics using a vLLM-served chat model and embedding model, and
prints the per-sample scores as JSON.
"""

import json
import os
from typing import Optional

import pandas as pd
from datasets import Dataset
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas import evaluate
from ragas.metrics import (
    NoiseSensitivity,
    answer_correctness,
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness,
)

# --- 1. Configure the LLM and embedding clients ---
VLLM_LLM_BASE = "http://xia0miduo.gicp.net:8102/v1"
VLLM_LLM_KEY = "vllm-dummy-key"
VLLM_EMBEDDING_BASE = "http://10.168.100.17:8787/v1/"

vllm_generator = ChatOpenAI(
    model="Qwen3-Coder-30B-loft",
    base_url=VLLM_LLM_BASE,
    api_key=VLLM_LLM_KEY,
    temperature=0,
    max_tokens=512,
)

# NOTE(review): the embedding model name is an empty string; confirm that the
# embedding server really accepts requests without a model name.
# NOTE(review): OpenAIEmbeddings tokenises inputs client-side by default; for a
# non-OpenAI embedding backend, consider check_embedding_ctx_length=False so
# raw text is sent instead -- TODO confirm against the server in use.
vllm_embeddings = OpenAIEmbeddings(
    model="",
    base_url=VLLM_EMBEDDING_BASE,
    api_key=VLLM_LLM_KEY,
)


# --- 2. Load and prepare the dataset ---
def load_data_from_json(json_path: str) -> Optional[Dataset]:
    """Load evaluation records from a JSON file into a ``datasets.Dataset``.

    Expected JSON structure::

        [
            {
                "question": "...",
                "answer": "...",
                "contexts": ["...", "..."],
                "ground_truth": "..."
            },
            ...
        ]

    ``question``, ``answer`` and ``contexts`` are mandatory. ``ground_truth``
    is optional; a warning is printed when it is absent.

    Args:
        json_path: Path to the UTF-8 encoded JSON file.

    Returns:
        The loaded dataset, or ``None`` if the file could not be read, parsed,
        validated or converted (the error is printed).
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data_list = json.load(f)

        df = pd.DataFrame(data_list)

        required_cols = {'question', 'answer', 'contexts'}
        missing_cols = required_cols - set(df.columns)
        if missing_cols:
            # Name the missing keys so the user knows what to fix.
            raise ValueError(
                "JSON 文件必须包含 'question', 'answer', 和 'contexts' 键"
                f" (缺少: {sorted(missing_cols)})"
            )

        if 'ground_truth' not in df.columns:
            print("警告: 未找到 'ground_truth' 列。依赖 'ground_truth' 的指标将无法计算。")

        return Dataset.from_pandas(df)
    except Exception as e:
        # Deliberately broad: the caller treats None as "load failed", and a
        # failure here may come from file I/O, JSON parsing, validation or the
        # DataFrame/Dataset conversion.
        print(f"加载数据时出错: {e}")
        return None


# --- 3. Define the metrics ---
_noise_sensitivity = NoiseSensitivity(llm=vllm_generator)

metrics_to_run = [
    faithfulness,
    answer_correctness,
    answer_relevancy,
    context_precision,
    context_recall,
    _noise_sensitivity,
]

# Metrics that compare against the reference answer and therefore cannot be
# computed when the dataset has no 'ground_truth' column.
_ground_truth_metrics = (
    answer_correctness,
    context_precision,
    context_recall,
    _noise_sensitivity,
)


def _needs_ground_truth(metric) -> bool:
    """Return True if *metric* is one of the reference-dependent metrics."""
    # Identity comparison avoids relying on how metric objects define equality.
    return any(metric is candidate for candidate in _ground_truth_metrics)


# --- 4. Run the evaluation ---
def run_evaluation(dataset_path: str) -> None:
    """Load the dataset at *dataset_path*, run RAGAS and print the scores.

    Metrics that need a reference answer are skipped when the dataset has no
    ``ground_truth`` column. The per-sample results are printed to stdout as
    a JSON array.

    Args:
        dataset_path: Path to the JSON file with the evaluation records.
    """
    print(f"正在从 {dataset_path} 加载数据...")
    dataset = load_data_from_json(dataset_path)

    if dataset is None:
        print("数据加载失败,退出评估。")
        return

    print(f"成功加载 {len(dataset)} 条评估数据。")

    final_metrics = list(metrics_to_run)
    if 'ground_truth' not in dataset.column_names:
        # Drop every reference-dependent metric, not only context_recall;
        # otherwise evaluate() fails on the missing column.
        skipped = [m for m in final_metrics if _needs_ground_truth(m)]
        final_metrics = [m for m in final_metrics if not _needs_ground_truth(m)]
        skipped_names = ", ".join(
            f"'{getattr(m, 'name', type(m).__name__)}'" for m in skipped
        )
        print(f"由于缺少 'ground_truth',将跳过以下指标: {skipped_names}")

    if not final_metrics:
        print("没有可执行的指标。退出。")
        return

    print("开始 RAGAS 评估... (这可能需要一些时间)")

    # Core step: call evaluate with the vLLM-backed LLM and embeddings.
    result = evaluate(
        dataset=dataset,
        metrics=final_metrics,
        llm=vllm_generator,
        embeddings=vllm_embeddings,
    )

    print("评估完成!")

    result_df = result.to_pandas()
    print(json.dumps(result_df.to_dict(orient="records"), indent=4, ensure_ascii=False))


# --- Script entry point ---
if __name__ == "__main__":
    input_json_file = "dataset.json"

    if not os.path.exists(input_json_file):
        print(f"错误: 找不到 {input_json_file}")
        print("请创建一个示例 JSON 文件,结构如下:")
        print("""
[
    {
        "question": "什么是 RAGAS?",
        "answer": "RAGAS 是一个评估 RAG 管道的框架。",
        "contexts": ["RAGAS (Retrieval-Augmented Generation Assessment) 是一个用于评估 RAG 管道的框架。"],
        "ground_truth": "RAGAS 是一个专为评估 RAG 管道设计的框架,它关注检索和生成的质量。"
    }
]
""")
    else:
        run_evaluation(input_json_file)