| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121 |
- import json
- from datasets import Dataset
def load_bid_data(train_path_list, dev_path_list):
    """Load bidding (tender) SFT data from JSONL files as Qwen chat text.

    Every non-blank line of each file is parsed as a JSON object that must
    provide the keys ``input``, ``instruction`` and ``output``. Each record is
    rendered into one ``text`` string in the Qwen chat format: a user turn
    holding the input followed by the instruction, then an assistant turn
    holding the output.

    Args:
        train_path_list: Path, or iterable of paths, of training JSONL files.
        dev_path_list: Path, or iterable of paths, of validation JSONL files.

    Returns:
        A ``(train_dataset, dev_dataset)`` tuple built with
        ``Dataset.from_list``; every row carries a single ``text`` field.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """

    def _read_split(paths):
        # A bare string would otherwise be iterated character by character,
        # so treat it as a single path.
        if isinstance(paths, str):
            paths = [paths]
        samples = []
        for path in paths:
            with open(path, 'r', encoding='utf-8') as f:
                for raw_line in f:
                    raw_line = raw_line.strip()
                    if not raw_line:
                        # Tolerate blank lines (e.g. a trailing newline at
                        # EOF) instead of crashing in json.loads("").
                        continue
                    record = json.loads(raw_line)
                    # Build the Qwen chat-style training text.
                    text = f"<|im_start|>user\n{record['input']}\n{record['instruction']}<|im_end|>\n<|im_start|>assistant\n{record['output']}<|im_end|>"
                    samples.append({"text": text})
        return samples

    train_data = _read_split(train_path_list)
    dev_data = _read_split(dev_path_list)
    # dev_data = dev_data[:10]  # enable to debug on a small validation subset

    print('len(train_data)', len(train_data))
    print('len(dev_data)', len(dev_data))

    # Convert the plain lists of dicts into Dataset objects.
    train_dataset = Dataset.from_list(train_data)
    dev_dataset = Dataset.from_list(dev_data)
    return train_dataset, dev_dataset
def load_bid_data_dpo(train_path_list, dev_path_list):
    """Load bidding (tender) preference pairs from JSONL files for DPO.

    Every non-blank line of each file is parsed as a JSON object that must
    provide the keys ``prompt``, ``chosen`` and ``rejected``. The prompt is
    stored under the output key ``text_prompt``; ``chosen`` and ``rejected``
    keep their names.

    Args:
        train_path_list: Path, or iterable of paths, of training JSONL files.
        dev_path_list: Path, or iterable of paths, of validation JSONL files.

    Returns:
        A ``(train_dataset, dev_dataset)`` tuple built with
        ``Dataset.from_list``; every row has the fields ``text_prompt``,
        ``chosen`` and ``rejected``.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """

    def _read_split(paths):
        # A bare string would otherwise be iterated character by character,
        # so treat it as a single path.
        if isinstance(paths, str):
            paths = [paths]
        samples = []
        for path in paths:
            with open(path, 'r', encoding='utf-8') as f:
                for raw_line in f:
                    raw_line = raw_line.strip()
                    if not raw_line:
                        # Tolerate blank lines (e.g. a trailing newline at
                        # EOF) instead of crashing in json.loads("").
                        continue
                    record = json.loads(raw_line)
                    samples.append(
                        {
                            "text_prompt": record['prompt'],
                            "chosen": record['chosen'],
                            "rejected": record['rejected'],
                        }
                    )
        return samples

    train_data = _read_split(train_path_list)
    dev_data = _read_split(dev_path_list)
    # dev_data = dev_data[:10]  # enable to debug on a small validation subset

    print('len(train_data)', len(train_data))
    print('len(dev_data)', len(dev_data))

    # Convert the plain lists of dicts into Dataset objects.
    train_dataset = Dataset.from_list(train_data)
    dev_dataset = Dataset.from_list(dev_data)

    # Disabled: add an "images" column whose values are all None.
    # train_dataset = train_dataset.add_column("images", [None] * len(train_dataset))
    # dev_dataset = dev_dataset.add_column("images", [None] * len(dev_dataset))
    return train_dataset, dev_dataset
def load_bid_data_kto(train_path_list, dev_path_list):
    """Load bidding (tender) preference pairs from JSONL files for KTO.

    Every non-blank line of each file is parsed as a JSON object that must
    provide the keys ``prompt``, ``chosen`` and ``rejected``. Each pair is
    split into two samples sharing the same prompt: the chosen answer with
    ``label=True`` followed by the rejected answer with ``label=False``.

    Args:
        train_path_list: Path, or iterable of paths, of training JSONL files.
        dev_path_list: Path, or iterable of paths, of validation JSONL files.

    Returns:
        A ``(train_dataset, dev_dataset)`` tuple built with
        ``Dataset.from_list``; every row has the fields ``prompt``,
        ``completion`` and ``label``, and each input record yields two rows.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """

    def _read_split(paths):
        # A bare string would otherwise be iterated character by character,
        # so treat it as a single path.
        if isinstance(paths, str):
            paths = [paths]
        samples = []
        for path in paths:
            with open(path, 'r', encoding='utf-8') as f:
                for raw_line in f:
                    raw_line = raw_line.strip()
                    if not raw_line:
                        # Tolerate blank lines (e.g. a trailing newline at
                        # EOF) instead of crashing in json.loads("").
                        continue
                    record = json.loads(raw_line)
                    prompt = record["prompt"]
                    # Split one preference pair into two KTO samples:
                    # desirable (chosen) first, then undesirable (rejected).
                    samples.append({"prompt": prompt, "completion": record["chosen"], "label": True})
                    samples.append({"prompt": prompt, "completion": record["rejected"], "label": False})
        return samples

    train_data = _read_split(train_path_list)
    dev_data = _read_split(dev_path_list)

    print('len(train_data)', len(train_data))
    print('len(dev_data)', len(dev_data))

    train_dataset = Dataset.from_list(train_data)
    dev_dataset = Dataset.from_list(dev_data)
    return train_dataset, dev_dataset
# Smoke test: load the SFT data and print every formatted sample.
if __name__ == "__main__":
    # load_bid_data iterates over its arguments as collections of paths, so
    # each file path is wrapped in a list; passing a bare string would make
    # it try to open every character of the path as a file.
    train_ds, dev_ds = load_bid_data(["data/train_data.jsonl"], ["data/dev_data.jsonl"])
    print(f"训练集数量:{len(train_ds)}")
    print(f"验证集数量:{len(dev_ds)}")
    for ds in train_ds:
        print(f"示例数据:{ds['text']}")
|