import json
import os

from datasets import Dataset


def _as_path_list(paths):
    """Return *paths* as a list, wrapping a single path in a one-item list.

    A bare string is itself iterable, so looping over it directly would
    yield single characters and try to open each one as a file.
    """
    if isinstance(paths, (str, bytes, os.PathLike)):
        return [paths]
    return list(paths)


def _iter_jsonl(paths):
    """Yield one parsed JSON value per non-blank line of every file in *paths*.

    Files are read in the given order as UTF-8 text.

    Raises:
        OSError: if a file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    for path in _as_path_list(paths):
        with open(path, 'r', encoding='utf-8') as f:
            for raw_line in f:
                raw_line = raw_line.strip()
                # Skip blank lines (e.g. a trailing newline at end of file);
                # json.loads("") would raise JSONDecodeError.
                if raw_line:
                    yield json.loads(raw_line)


def _to_datasets(train_data, dev_data):
    """Print both split sizes and convert the record lists to Datasets."""
    print('len(train_data)', len(train_data))
    print('len(dev_data)', len(dev_data))
    return Dataset.from_list(train_data), Dataset.from_list(dev_data)


def _sft_records(paths):
    """Build ``{"text": ...}`` records in the Qwen chat format.

    Each JSONL line must provide the keys ``input``, ``instruction`` and
    ``output``. The user turn is the input followed by the instruction; the
    assistant turn is the output.
    """
    return [
        {
            "text": (
                f"<|im_start|>user\n{record['input']}\n{record['instruction']}<|im_end|>\n"
                f"<|im_start|>assistant\n{record['output']}<|im_end|>"
            )
        }
        for record in _iter_jsonl(paths)
    ]


def _dpo_records(paths):
    """Build preference-pair records with keys text_prompt/chosen/rejected.

    Each JSONL line must provide the keys ``prompt``, ``chosen`` and
    ``rejected``; ``prompt`` is stored under the key ``text_prompt``.
    """
    return [
        {
            "text_prompt": record['prompt'],
            "chosen": record['chosen'],
            "rejected": record['rejected'],
        }
        for record in _iter_jsonl(paths)
    ]


def _kto_records(paths):
    """Build KTO records, two per JSONL line.

    Each preference pair is split into one sample with the chosen completion
    (``label=True``) followed by one with the rejected completion
    (``label=False``).
    """
    records = []
    for record in _iter_jsonl(paths):
        prompt = record["prompt"]
        chosen = record["chosen"]
        rejected = record["rejected"]
        records.append({"prompt": prompt, "completion": chosen, "label": True})
        records.append({"prompt": prompt, "completion": rejected, "label": False})
    return records


def load_bid_data(train_path_list, dev_path_list):
    """Load bidding/tendering data as chat-formatted text datasets.

    Args:
        train_path_list: JSONL file path, or iterable of paths, for training.
        dev_path_list: JSONL file path, or iterable of paths, for validation.

    Returns:
        A ``(train_dataset, dev_dataset)`` tuple of ``Dataset`` objects, each
        with a single ``text`` column holding the Qwen chat-format prompt.

    Raises:
        OSError: if a file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``input``, ``instruction`` or ``output``.
    """
    train_data = _sft_records(train_path_list)
    dev_data = _sft_records(dev_path_list)
    # Debug aid (disabled): cap the validation set at 10 records.
    # dev_data = dev_data[:10]
    return _to_datasets(train_data, dev_data)


def load_bid_data_dpo(train_path_list, dev_path_list):
    """Load bidding/tendering preference pairs for DPO-style training.

    Args:
        train_path_list: JSONL file path, or iterable of paths, for training.
        dev_path_list: JSONL file path, or iterable of paths, for validation.

    Returns:
        A ``(train_dataset, dev_dataset)`` tuple of ``Dataset`` objects with
        the columns ``text_prompt``, ``chosen`` and ``rejected``.

    Raises:
        OSError: if a file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``prompt``, ``chosen`` or ``rejected``.
    """
    train_data = _dpo_records(train_path_list)
    dev_data = _dpo_records(dev_path_list)
    # Debug aid (disabled): cap the validation set at 10 records.
    # dev_data = dev_data[:10]
    train_dataset, dev_dataset = _to_datasets(train_data, dev_data)
    # Disabled: add an "images" column whose values are all None.
    # train_dataset = train_dataset.add_column("images", [None] * len(train_dataset))
    # dev_dataset = dev_dataset.add_column("images", [None] * len(dev_dataset))
    return train_dataset, dev_dataset


def load_bid_data_kto(train_path_list, dev_path_list):
    """Load bidding/tendering preference pairs as KTO samples.

    Every input line yields two rows: the chosen completion labelled ``True``
    and the rejected completion labelled ``False``, so each dataset has twice
    as many rows as there are non-blank input lines.

    Args:
        train_path_list: JSONL file path, or iterable of paths, for training.
        dev_path_list: JSONL file path, or iterable of paths, for validation.

    Returns:
        A ``(train_dataset, dev_dataset)`` tuple of ``Dataset`` objects with
        the columns ``prompt``, ``completion`` and ``label``.

    Raises:
        OSError: if a file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``prompt``, ``chosen`` or ``rejected``.
    """
    train_data = _kto_records(train_path_list)
    dev_data = _kto_records(dev_path_list)
    return _to_datasets(train_data, dev_data)


# Manual loading check.
if __name__ == "__main__":
    # The loaders take lists of paths, so wrap each single file in a list.
    train_ds, dev_ds = load_bid_data(["data/train_data.jsonl"], ["data/dev_data.jsonl"])
    print(f"训练集数量:{len(train_ds)}")
    print(f"验证集数量:{len(dev_ds)}")
    for ds in train_ds:
        print(f"示例数据:{ds['text']}")