"""
Model evaluation utilities (evaluate.py).
"""
  4. import json
  5. import torch
  6. from typing import List, Dict, Any
  7. from tqdm import tqdm
  8. def evaluate_model(model, tokenizer, test_data: List[Dict[str, Any]], max_length: int = 512):
  9. """
  10. 评估模型性能
  11. Args:
  12. model: 模型
  13. tokenizer: tokenizer
  14. test_data: 测试数据
  15. max_length: 最大长度
  16. Returns:
  17. 评估结果
  18. """
  19. results = []
  20. for item in tqdm(test_data, desc="评估中"):
  21. instruction = item.get("instruction", "")
  22. input_text = item.get("input", "")
  23. expected_output = item.get("output", "")
  24. # 构建 prompt
  25. if input_text:
  26. prompt = f"{instruction}\n\n输入:{input_text}\n\n回答:"
  27. else:
  28. prompt = f"{instruction}\n\n回答:"
  29. # 生成响应
  30. inputs = tokenizer(prompt, return_tensors="pt")
  31. if torch.cuda.is_available():
  32. inputs = inputs.to("cuda")
  33. with torch.no_grad():
  34. outputs = model.generate(
  35. **inputs,
  36. max_new_tokens=256,
  37. temperature=0.7,
  38. do_sample=True,
  39. top_p=0.9,
  40. )
  41. generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
  42. # 提取生成的回答部分
  43. generated_response = generated[len(prompt):].strip()
  44. results.append({
  45. "instruction": instruction,
  46. "input": input_text,
  47. "expected": expected_output,
  48. "generated": generated_response,
  49. })
  50. return results
  51. def save_evaluation_results(results: List[Dict], output_path: str):
  52. """保存评估结果"""
  53. with open(output_path, "w", encoding="utf-8") as f:
  54. json.dump(results, f, ensure_ascii=False, indent=2)
  55. print(f"评估结果已保存到:{output_path}")
  56. def calculate_metrics(results: List[Dict]) -> Dict[str, float]:
  57. """
  58. 计算简单指标
  59. Args:
  60. results: 评估结果
  61. Returns:
  62. 指标字典
  63. """
  64. total = len(results)
  65. # 简单长度统计
  66. avg_expected_length = sum(len(r["expected"]) for r in results) / total
  67. avg_generated_length = sum(len(r["generated"]) for r in results) / total
  68. metrics = {
  69. "total_samples": total,
  70. "avg_expected_length": avg_expected_length,
  71. "avg_generated_length": avg_generated_length,
  72. }
  73. return metrics
  74. def main():
  75. """评估模型示例"""
  76. import argparse
  77. import sys
  78. import os
  79. sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
  80. from transformers import AutoModelForCausalLM, AutoTokenizer
  81. from peft import PeftModel
  82. parser = argparse.ArgumentParser(description="模型评估工具")
  83. parser.add_argument("--model_path", type=str, required=True, help="模型路径")
  84. parser.add_argument("--test_data", type=str, required=True, help="测试数据路径")
  85. parser.add_argument("--output", type=str, default="evaluation_results.json", help="输出路径")
  86. args = parser.parse_args()
  87. # 加载测试数据
  88. with open(args.test_data, "r", encoding="utf-8") as f:
  89. test_data = json.load(f)
  90. print(f"加载测试数据:{len(test_data)} 样本")
  91. # 加载模型
  92. print("加载模型...")
  93. tokenizer = AutoTokenizer.from_pretrained(args.model_path)
  94. base_model = AutoModelForCausalLM.from_pretrained(
  95. args.model_path,
  96. device_map="auto",
  97. torch_dtype=torch.float16,
  98. )
  99. model = PeftModel.from_pretrained(base_model, args.model_path)
  100. # 评估
  101. results = evaluate_model(model, tokenizer, test_data)
  102. # 保存结果
  103. save_evaluation_results(results, args.output)
  104. # 计算指标
  105. metrics = calculate_metrics(results)
  106. print("\n评估指标:")
  107. for key, value in metrics.items():
  108. print(f" {key}: {value}")
# Run the command-line evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()