"""Model evaluation utilities.

Generates responses for an instruction-style test set, saves them as JSON and
reports simple length statistics.
"""

import json
from pathlib import Path
from typing import Any, Dict, List

import torch

try:
    from tqdm import tqdm
except ImportError:
    # tqdm only draws a progress bar; evaluation itself must not depend on it.
    def tqdm(iterable, **_kwargs):
        """Return *iterable* unchanged when tqdm is not installed."""
        return iterable


def _build_prompt(instruction: str, input_text: str) -> str:
    """Assemble the generation prompt from an instruction and optional input."""
    if input_text:
        return f"{instruction}\n\n输入:{input_text}\n\n回答:"
    return f"{instruction}\n\n回答:"


def evaluate_model(model, tokenizer, test_data: List[Dict[str, Any]], max_length: int = 512):
    """Generate a response for every test sample.

    Args:
        model: Object exposing ``generate(**inputs, ...)``. If it has a
            ``device`` attribute, the tokenized inputs are moved there.
        tokenizer: Callable that tokenizes a prompt (called with
            ``return_tensors="pt"``) and exposes ``decode``.
        test_data: Samples as dicts with optional ``"instruction"``,
            ``"input"`` and ``"output"`` keys; missing keys default to ``""``.
        max_length: Currently unused by this function; kept so existing
            callers that pass it keep working.

    Returns:
        A list with one dict per sample containing the keys ``"instruction"``,
        ``"input"``, ``"expected"`` and ``"generated"``.

    Note:
        Generation uses sampling (``do_sample=True``), so outputs are not
        deterministic across runs unless the caller seeds the RNG.
    """
    # Follow the model's own device when it reports one; only fall back to the
    # "CUDA if available" guess otherwise. Blindly using "cuda" breaks when the
    # model was loaded on CPU (or another device) on a CUDA-capable machine.
    device = getattr(model, "device", None)
    if device is None and torch.cuda.is_available():
        device = "cuda"

    results = []
    for item in tqdm(test_data, desc="评估中"):
        instruction = item.get("instruction", "")
        input_text = item.get("input", "")
        expected_output = item.get("output", "")

        prompt = _build_prompt(instruction, input_text)

        inputs = tokenizer(prompt, return_tensors="pt")
        if device is not None:
            inputs = inputs.to(device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
            )

        # Slice by token count, not by character count: decoding does not
        # necessarily reproduce the prompt text exactly (whitespace cleanup,
        # dropped special tokens), so ``decoded[len(prompt):]`` can cut in the
        # wrong place. Assumes the output sequence starts with the prompt
        # tokens, as the original character-based slicing also assumed.
        prompt_len = inputs["input_ids"].shape[-1]
        new_tokens = outputs[0][prompt_len:]
        generated_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        results.append({
            "instruction": instruction,
            "input": input_text,
            "expected": expected_output,
            "generated": generated_response,
        })

    return results


def save_evaluation_results(results: List[Dict], output_path: str):
    """Write the evaluation results to *output_path* as UTF-8 JSON.

    Args:
        results: Evaluation results to serialize.
        output_path: Destination file; missing parent directories are created.
    """
    # Create the target directory first so a finished evaluation run is not
    # lost to a missing folder.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"评估结果已保存到:{output_path}")


def calculate_metrics(results: List[Dict]) -> Dict[str, float]:
    """Compute simple length statistics over the evaluation results.

    Args:
        results: Dicts with string values under ``"expected"`` and
            ``"generated"``.

    Returns:
        A dict with ``"total_samples"``, ``"avg_expected_length"`` and
        ``"avg_generated_length"`` (lengths in characters). Both averages are
        ``0.0`` when *results* is empty.
    """
    total = len(results)
    if total == 0:
        # Avoid ZeroDivisionError on an empty test set.
        return {
            "total_samples": 0,
            "avg_expected_length": 0.0,
            "avg_generated_length": 0.0,
        }

    avg_expected_length = sum(len(r["expected"]) for r in results) / total
    avg_generated_length = sum(len(r["generated"]) for r in results) / total

    return {
        "total_samples": total,
        "avg_expected_length": avg_expected_length,
        "avg_generated_length": avg_generated_length,
    }


def main():
    """Command-line entry point: load data and model, evaluate, save, report."""
    import argparse
    import sys
    import os
    # Make the parent directory of this file importable.
    sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel

    parser = argparse.ArgumentParser(description="模型评估工具")
    parser.add_argument("--model_path", type=str, required=True, help="模型路径")
    parser.add_argument("--test_data", type=str, required=True, help="测试数据路径")
    parser.add_argument("--output", type=str, default="evaluation_results.json", help="输出路径")

    args = parser.parse_args()

    # Load the test data.
    with open(args.test_data, "r", encoding="utf-8") as f:
        test_data = json.load(f)

    print(f"加载测试数据:{len(test_data)} 样本")

    # Load the model.
    print("加载模型...")
    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
    # NOTE(review): the base model and the PEFT adapter are both loaded from
    # the same path — confirm the directory layout supports this.
    base_model = AutoModelForCausalLM.from_pretrained(
        args.model_path,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    model = PeftModel.from_pretrained(base_model, args.model_path)

    # Run the evaluation.
    results = evaluate_model(model, tokenizer, test_data)

    # Persist the results.
    save_evaluation_results(results, args.output)

    # Compute and print the metrics.
    metrics = calculate_metrics(results)
    print("\n评估指标:")
    for key, value in metrics.items():
        print(f" {key}: {value}")


if __name__ == "__main__":
    main()