| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147 |
- """
- 模型评估工具
- """
- import json
- import torch
- from typing import List, Dict, Any
- from tqdm import tqdm
def evaluate_model(model, tokenizer, test_data: List[Dict[str, Any]], max_length: int = 512):
    """Generate a response for every test sample and pair it with the reference.

    For each sample a prompt is built from its ``instruction`` and optional
    ``input`` fields, the model samples a continuation, and only the newly
    generated tokens are decoded into the answer text.

    Args:
        model: Object exposing ``generate(**inputs, ...)`` that returns the
            prompt tokens followed by the new tokens (presumably a Hugging
            Face causal LM -- confirm against the caller).
        tokenizer: Callable tokenizer whose result supports ``.to(device)``,
            ``["input_ids"]`` and ``**`` unpacking, and which offers
            ``decode``.
        test_data: Samples with optional ``instruction``, ``input`` and
            ``output`` keys; a missing key is treated as ``""``.
        max_length: Upper bound on the number of prompt tokens. Longer prompts
            are truncated by the tokenizer.

    Returns:
        A list with one dict per sample, in input order, holding the keys
        ``instruction``, ``input``, ``expected`` and ``generated``.
    """
    results = []

    for item in tqdm(test_data, desc="评估中"):
        instruction = item.get("instruction", "")
        input_text = item.get("input", "")
        expected_output = item.get("output", "")

        # The "input" section is only part of the prompt when it is non-empty.
        if input_text:
            prompt = f"{instruction}\n\n输入:{input_text}\n\n回答:"
        else:
            prompt = f"{instruction}\n\n回答:"

        # Apply max_length as the prompt truncation limit (it used to be
        # accepted but silently ignored).
        # NOTE(review): which end gets cut depends on the tokenizer's
        # truncation side; a right-truncated prompt loses its answer cue.
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
        )

        if torch.cuda.is_available():
            inputs = inputs.to("cuda")

        # NOTE: sampling is enabled, so repeated runs can produce different
        # answers for the same sample.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                temperature=0.7,
                do_sample=True,
                top_p=0.9,
            )

        # Drop the prompt at token level. Slicing the decoded string by
        # len(prompt) is unreliable because decoding does not always
        # reproduce the prompt text character for character.
        prompt_token_count = inputs["input_ids"].shape[1]
        new_tokens = outputs[0][prompt_token_count:]
        generated_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        results.append({
            "instruction": instruction,
            "input": input_text,
            "expected": expected_output,
            "generated": generated_response,
        })

    return results
def save_evaluation_results(results: List[Dict], output_path: str):
    """Write ``results`` to ``output_path`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped, and a
    confirmation line is printed once the file has been written.

    Args:
        results: JSON-serialisable list of per-sample result dicts.
        output_path: Destination file; it is created or overwritten.
    """
    with open(output_path, mode="w", encoding="utf-8") as sink:
        json.dump(results, sink, indent=2, ensure_ascii=False)

    print(f"评估结果已保存到:{output_path}")
def calculate_metrics(results: List[Dict]) -> Dict[str, float]:
    """Compute simple length statistics over evaluation results.

    Args:
        results: Result dicts, each providing ``expected`` and ``generated``
            values that support ``len`` (strings in this file's pipeline).

    Returns:
        A dict with ``total_samples`` (number of results),
        ``avg_expected_length`` and ``avg_generated_length`` (mean ``len`` of
        the respective field). Both averages are ``0.0`` when ``results`` is
        empty.
    """
    total = len(results)

    # An empty evaluation run has no meaningful average; report zeros instead
    # of failing with ZeroDivisionError.
    if total == 0:
        return {
            "total_samples": 0,
            "avg_expected_length": 0.0,
            "avg_generated_length": 0.0,
        }

    expected_chars = sum(len(r["expected"]) for r in results)
    generated_chars = sum(len(r["generated"]) for r in results)

    return {
        "total_samples": total,
        "avg_expected_length": expected_chars / total,
        "avg_generated_length": generated_chars / total,
    }
def main():
    """Command-line entry point: load data and model, evaluate, save, report.

    Reads ``--model_path``, ``--test_data`` and ``--output`` from the command
    line, runs ``evaluate_model`` over the JSON test set, writes the results
    with ``save_evaluation_results`` and prints the values returned by
    ``calculate_metrics``.
    """
    import argparse
    import os
    import sys

    # Put the parent of this file's directory at the front of the import path.
    # Presumably this makes sibling project packages importable -- nothing in
    # this function imports from it directly.
    parent_dir = os.path.join(os.path.dirname(__file__), "..")
    sys.path.insert(0, parent_dir)

    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel

    cli = argparse.ArgumentParser(description="模型评估工具")
    cli.add_argument("--model_path", type=str, required=True, help="模型路径")
    cli.add_argument("--test_data", type=str, required=True, help="测试数据路径")
    cli.add_argument("--output", type=str, default="evaluation_results.json", help="输出路径")
    options = cli.parse_args()

    # Load the test set (a JSON document) from disk.
    with open(options.test_data, "r", encoding="utf-8") as handle:
        samples = json.load(handle)

    print(f"加载测试数据:{len(samples)} 样本")

    # Load tokenizer, base model and adapter.
    # NOTE(review): the same path is used for the base weights and for the
    # PEFT adapter -- confirm the directory really contains both.
    print("加载模型...")
    tokenizer = AutoTokenizer.from_pretrained(options.model_path)
    backbone = AutoModelForCausalLM.from_pretrained(
        options.model_path,
        device_map="auto",
        torch_dtype=torch.float16,
    )
    model = PeftModel.from_pretrained(backbone, options.model_path)

    # Evaluate, then persist the raw generations.
    outcome = evaluate_model(model, tokenizer, samples)
    save_evaluation_results(outcome, options.output)

    # Summarise and print one metric per line.
    summary = calculate_metrics(outcome)
    print("\n评估指标:")
    for name, score in summary.items():
        print(f" {name}: {score}")
# Run the evaluation CLI only when this file is executed as a script, so that
# importing the module does not trigger argument parsing or model loading.
if __name__ == "__main__":
    main()
|