helpers.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. """
  2. 辅助工具函数
  3. """
  4. import os
  5. import time
  6. import torch
  7. from typing import Dict, Any
  8. def setup_environment(seed: int = 42):
  9. """
  10. 设置随机种子和环境变量
  11. Args:
  12. seed: 随机种子
  13. """
  14. import random
  15. import numpy as np
  16. random.seed(seed)
  17. np.random.seed(seed)
  18. torch.manual_seed(seed)
  19. if torch.cuda.is_available():
  20. torch.cuda.manual_seed_all(seed)
  21. if hasattr(torch, 'npu') and torch.npu.is_available():
  22. torch.npu.manual_seed_all(seed)
  23. # 设置环境变量(华为升腾 NPU)
  24. os.environ["TOKENIZERS_PARALLELISM"] = "false"
  25. os.environ["PYTHONHASHSEED"] = str(seed)
  26. print(f"环境设置完成,随机种子:{seed}")
  27. def get_gpu_info() -> Dict[str, Any]:
  28. """
  29. 获取 GPU/NPU 信息
  30. Returns:
  31. GPU/NPU 信息字典
  32. """
  33. # 检查 CUDA
  34. if torch.cuda.is_available():
  35. info = {
  36. "available": True,
  37. "device_type": "cuda",
  38. "device_count": torch.cuda.device_count(),
  39. "devices": [],
  40. }
  41. for i in range(torch.cuda.device_count()):
  42. device_info = {
  43. "name": torch.cuda.get_device_name(i),
  44. "memory_allocated": torch.cuda.memory_allocated(i) / 1e9,
  45. "memory_reserved": torch.cuda.memory_reserved(i) / 1e9,
  46. "max_memory": torch.cuda.get_device_properties(i).total_memory / 1e9,
  47. }
  48. info["devices"].append(device_info)
  49. return info
  50. # 检查 NPU(华为升腾)
  51. if hasattr(torch, 'npu') and torch.npu.is_available():
  52. info = {
  53. "available": True,
  54. "device_type": "npu",
  55. "device_count": torch.npu.device_count(),
  56. "devices": [],
  57. }
  58. for i in range(torch.npu.device_count()):
  59. device_info = {
  60. "name": f"NPU {i}",
  61. "memory_allocated": 0,
  62. "memory_reserved": 0,
  63. "max_memory": 0,
  64. }
  65. info["devices"].append(device_info)
  66. return info
  67. return {"available": False}
  68. def count_parameters(model) -> Dict[str, int]:
  69. """
  70. 统计模型参数
  71. Args:
  72. model: 模型对象
  73. Returns:
  74. 参数字典
  75. """
  76. total_params = sum(p.numel() for p in model.parameters())
  77. trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
  78. return {
  79. "total": total_params,
  80. "trainable": trainable_params,
  81. "frozen": total_params - trainable_params,
  82. }
  83. def format_time(seconds: float) -> str:
  84. """
  85. 格式化时间为可读字符串
  86. Args:
  87. seconds: 秒数
  88. Returns:
  89. 格式化后的时间字符串
  90. """
  91. if seconds < 60:
  92. return f"{seconds:.2f}s"
  93. elif seconds < 3600:
  94. minutes = seconds / 60
  95. return f"{minutes:.2f}m"
  96. else:
  97. hours = seconds / 3600
  98. return f"{hours:.2f}h"
  99. def print_memory_usage():
  100. """打印内存使用情况"""
  101. if torch.cuda.is_available():
  102. allocated = torch.cuda.memory_allocated() / 1e9
  103. reserved = torch.cuda.memory_reserved() / 1e9
  104. print(f"GPU 内存 - 已分配:{allocated:.2f}GB, 已保留:{reserved:.2f}GB")
  105. elif hasattr(torch, 'npu') and torch.npu.is_available():
  106. print("NPU 内存统计(华为升腾)")