import json import random import re import traceback from compare_utils import getUnifyMoney import pandas as pd from bs4 import BeautifulSoup random.seed(112) bid_cols_dict = { "project_name": "项目名称", "project_code": "项目编号", "docchannel": "公告类型", "area": "地域", "province": "省", "city": "市", "district": "区", "tenderee": "招标人", "tenderee_contact": "招标人联系人", "tenderee_phone": "招标人联系人电话", "agency": "代理人", "agency_contact": "代理人联系人", "agency_phone": "代理人联系人电话", "sub_docs_json": "多标段信息", "products": "产品信息", "service_time": "开工竣工时间", "time_bidstart": "投标开始时间", "time_bidclose": "截标时间", "time_bidopen": "开标时间", "time_get_file_end": "文件获取截止时间", "time_get_file_start": "文件获取开始时间", "time_release": "发布时间", "time_registration_end": '报名截止时间', "time_registration_start": "报名开始时间", "time_earnest_money_end": "保证金递交截止时间", "time_earnest_money_start": "保证金递交开始时间", } print('cols', bid_cols_dict.keys()) channel_map_dict = { 51: "公告变更", 52: "招标公告", 101: "中标信息", 102: "招标预告", 103: "招标答疑", 104: "招标文件", 105: "资审结果", 106: "法律法规", 107: "新闻资讯", 108: "拟建项目", 109: "展会推广", 110: "企业名录", 111: "企业资质", 112: "全国工程人员", 113: "业主采购", 114: "采购意向", 115: "拍卖出让", 116: "土地矿产", 117: "产权交易", 118: "废标公告", 119: "候选人公示", 120: "合同公告", 121: "开标记录", 122: "验收合同", 301: "拟在建项目", 302: "审批项目", 303: "处罚公告", } sub_docs_json_map_dict = { "sub_project_name": "标包项目名称", "sub_project_code": "标包项目编号", "bidding_budget": "预算金额", "bidding_budget_unit": "预算金额单位", "win_tenderer": "中标人", "second_tenderer": "第二候选人", "third_tenderer": "第三候选人", "win_tenderer_manager": "中标人联系人", "second_tenderer_manager": "第二候选人联系人", "third_tenderer_manager": "第三候选人联系人", "win_tenderer_phone": "中标人联系人电话", "second_tenderer_phone": "第二候选人联系人电话", "third_tenderer_phone": "第三候选人联系人电话", "win_bid_price": "中标人投标金额", "second_bid_price": "第二候选人投标金额", "third_bid_price": "第三候选人投标金额", "win_bid_price_unit": "中标人投标金额单位", "second_bid_price_unit": "第二候选人投标金额单位", "third_bid_price_unit": "第三候选人投标金额单位", } products_map_dict = { 'brand': '品牌', 'product': '产品名称', 'quantity': '数量', 'quantity_unit': '数量单位', 'specs': '规格', 'unitPrice': '单价', "parameter": "参数", "total_price": "总价", "pinmu_no": "品目编号", "pinmu_name": "品目名称", } def filter_data_docid(): df = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_260309.xlsx') # data_list = df.astype(object).where(pd.notnull(df), "").values.tolist() data_list = df[['docid', 'sub_docs_json']].astype(object).where(pd.notnull(df), "").values.tolist() print('data_list[0]', data_list[0]) data_list.sort(key=lambda x: str(x[1]), reverse=True) data_list = data_list[:5000] for d in data_list[:20]: print('d', d) ss = json.dumps([x[0] for x in data_list]) with open(r'D:\BIDI_DOC\比地_文档\export_260309.txt', 'w') as f: f.write(ss) print('finish') def xlsx_data_to_jsonl(): df2 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_260309_2.xlsx') data_list2 = df2.astype(object).where(pd.notnull(df2), "").values.tolist() filter_docid_dict = {int(x[0]): x[1] for x in data_list2} df = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_260309.xlsx') data_list = df.astype(object).where(pd.notnull(df), "").values.tolist() all_data = [] instruction = '提取以上招投标文档的关键信息,只输出有值的,其中中标人等放在多标段信息中形成数组,' \ '产品参数等放在产品信息中形成数组' \ '金额均以元为单位,时间格式为YYYY-MM-DD HH:MM:SS,无多余内容' \ '直接得到要素提取Json:' for line in data_list: docid = int(line[0]) if docid not in filter_docid_dict: continue doctextcon = filter_docid_dict.get(docid) try: channel = int(line[3]) channel = channel_map_dict.get(channel) line[3] = channel except: print('channel error continue', line[3]) continue # print('line[2]', line[2]) # print('line[14]', line[14]) # print('line[15]', line[15]) # print('line[16]', line[16]) if not line[14]: line[14] = [] else: sub_docs_json = json.loads(line[14]) for si, d1 in enumerate(sub_docs_json): d1 = {v: d1.get(k, "") for k, v in sub_docs_json_map_dict.items()} sub_docs_json[si] = d1 for si, sub in enumerate(sub_docs_json): delete_k = [] for k, v in sub.items(): if not v: delete_k.append(k) for k in delete_k: if k in sub: sub.pop(k) sub_docs_json[si] = sub line[14] = sub_docs_json if not line[15] or len(line[15]) >= 500: line[15] = [] else: products = json.loads(line[15]) for si, d1 in enumerate(products): d1 = {v: d1.get(k, "") for k, v in products_map_dict.items()} products[si] = d1 line[15] = products if not line[16]: line[16] = {} else: service_time = json.loads(line[16]) line[16] = service_time d = {bid_cols_dict.get(x): line[i+1] for i, x in enumerate(bid_cols_dict.keys())} # 删掉空字段 delete_k = [] for k, v in d.items(): if not v: delete_k.append(k) for k in delete_k: if k in d: d.pop(k) train_data = { "instruction": instruction, "input": doctextcon, "output": d } all_data.append(json.dumps(train_data, ensure_ascii=False)) # 生成 train_ratio = 0.8 dev_ratio = 0.1 random.shuffle(all_data) total = len(all_data) train_num = int(total * train_ratio) dev_num = int(total * dev_ratio) # 拆分 train_lines = all_data[:train_num] dev_lines = all_data[train_num:train_num+dev_num] test_lines = all_data[train_num+dev_num:] # 保存 train_path = "data/train_data.jsonl" dev_path = "data/dev_data.jsonl" test_path = "data/test_data.jsonl" with open(train_path, 'w', encoding='utf-8') as f: f.write("\n".join(train_lines)) with open(dev_path, 'w', encoding='utf-8') as f: f.write("\n".join(dev_lines)) with open(test_path, 'w', encoding='utf-8') as f: f.write("\n".join(test_lines)) def extract_json_to_psv(_dict, empty_char=''): """ json转竖线格式 :return: """ # empty_char = 'null' project_name = _dict.get('name') float_none_list = ['0', '0.0', 'None', empty_char, '', '未提及'] district_dict = _dict.get('district') if not district_dict: district_dict = {} # 生成 PSV 表头(字段名)和数据行 headers = "|".join(district_dict.keys()) values = "|".join(str(value) for value in district_dict.values()) # 拼接成完整 PSV 内容 psv_content = f"{headers}\n{values}" prem = _dict.get('prem') tenderee = "" tenderee_contact_list = [] agency = "" win_tenderer_info_list = [] for package_name, package_dict in prem.items(): bid_name = package_dict.get('name') tenderee_money = package_dict.get('tendereeMoney') tenderee_money_unit = package_dict.get('tendereeMoneyUnit') role_list = package_dict.get('roleList') project_code = package_dict.get('code') if str(tenderee_money) in float_none_list: tenderee_money = empty_char tenderee_money_unit = empty_char else: tenderee_money_unit = '元' if package_name == 'Project': package_name = empty_char win_tenderer_info = None for role_dict in role_list: role_type = role_dict.get('role_name') role_text = role_dict.get('role_text') contact_list = role_dict.get('linklist', []) role_money = role_dict.get('role_money', {}).get('money') role_money_unit = role_dict.get('role_money', {}).get('money_unit') if str(role_money) in float_none_list: role_money = empty_char role_money_unit = empty_char else: role_money_unit = '元' if role_type == 'tenderee' and len(role_text) >= 2: tenderee = role_text tenderee_contact_list += contact_list if role_type == 'agency' and len(role_text) >= 2: agency = role_text if not win_tenderer_info and role_type == 'win_tenderer': # if len(str(role_money)) > 0 and not role_money_unit: # role_money_unit = '元' # if len(str(tenderee_money)) > 0 and not tenderee_money_unit: # tenderee_money_unit = '元' win_tenderer_info = [package_name, project_code, role_text, role_money, role_money_unit, tenderee_money, tenderee_money_unit ] win_tenderer_info_list.append(win_tenderer_info) product_list = _dict.get('product_attrs', {}).get('data', {}) product_cols = ['product', 'brand', 'specs', 'quantity', 'unitPrice', 'total_price', 'pinmu_name', 'pinmu_no' ] # print('product_list1', product_list) product_list = [[x.get(y, "") for y in product_cols] for x in product_list] # print('product_list2', product_list) for pi, product in enumerate(product_list): if str(product[3]) in float_none_list: product_list[pi][3] = empty_char if str(product[4]) in float_none_list: product_list[pi][4] = empty_char if str(product[5]) in float_none_list: product_list[pi][5] = empty_char table_list = [] # table 1 table_cols = ['项目名称', '招标人名称', '代理人名称'] table_values = [[project_name, tenderee, agency]] table_list.append([table_cols, table_values]) # table 2 table_cols = ['招标人联系人', '招标人联系人电话'] # print('tenderee_contact_list', tenderee_contact_list) table_values = tenderee_contact_list if tenderee_contact_list else [] table_list.append([table_cols, table_values]) # table 3 table_cols = ['标段名称', '标段号', '中标人名称', '中标金额', '中标金额单位', '标段预算', '标段预算单位'] table_values = win_tenderer_info_list if win_tenderer_info_list else [] table_list.append([table_cols, table_values]) # table 4 table_cols = ['产品名称', '品牌', '规格型号', '数量', '单价', '总价', '品目名称', '品目编号'] table_values = product_list if product_list else [] table_list.append([table_cols, table_values]) final_str = '' show_flag = 0 for table_cols, table_values in table_list: str1 = '|'.join(table_cols) + '\n' continue_flag = 0 if table_values: str2 = '' if len(table_values) >= 2: # print('table_values', table_values) show_flag = 1 for line in table_values: if '|' in str(line): continue_flag = 1 break str2 += '|'.join([str(x) if str(x) != '' else empty_char for x in line]) + '\n' if not str2: str2 = '|'.join([empty_char for x in table_cols]) + '\n' else: str2 = '|'.join([empty_char for x in table_cols]) + '\n' if continue_flag: return None # 判断表头和内容竖线是否相同 # for ss2 in str2.split('\n'): # if len(ss2) == 0: # continue # if len(re.findall("\|", str1)) != len(re.findall("\|", ss2)): # print('--- str1', str1) # print('--- str2', ss2) # return None if len(re.findall("\|", str2)) % len(re.findall("\|", str1)) != 0: print('--- str1', str1) print('--- str2', str2) return None final_str += str1 final_str += str2 final_str += '\n' # if f'产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号\n{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}' in final_str and f'标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位\n{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}' in final_str: # return None final_str = re.sub('未提及', f'{empty_char}', final_str) delete_value_list = ['None', '无', '无品牌', '无型号'] for v in delete_value_list: final_str = re.sub(f'\|{v}\|', f'|{empty_char}|', final_str) final_str = re.sub(f'\|{v}\n', f'|{empty_char}\n', final_str) final_str = re.sub(f'\n{v}\|', f'\n{empty_char}|', final_str) # if show_flag: # print('final_str', final_str) # final_str = table_list_to_psv(table_list, empty_char) return final_str def extract_json_to_psv_prefix(_dict, text, empty_char='-', prefix='[全字段]'): """ json转竖线格式 :return: """ # empty_char = 'null' project_name = _dict.get('name') float_none_list = ['0', '0.0', 'None', empty_char, '', '未提及'] district_dict = _dict.get('district') if not district_dict: district_dict = {} # 生成 PSV 表头(字段名)和数据行 headers = "|".join(district_dict.keys()) values = "|".join(str(value) for value in district_dict.values()) # 拼接成完整 PSV 内容 psv_content = f"{headers}\n{values}" prem = _dict.get('prem') tenderee = "" tenderee_contact_list = [] agency = "" win_tenderer_info_list = [] for package_name, package_dict in prem.items(): bid_name = package_dict.get('name') tenderee_money = package_dict.get('tendereeMoney') tenderee_money_unit = package_dict.get('tendereeMoneyUnit') role_list = package_dict.get('roleList') project_code = package_dict.get('code') if str(tenderee_money) in float_none_list: tenderee_money = empty_char tenderee_money_unit = empty_char else: tenderee_money_unit = '元' if package_name == 'Project': package_name = empty_char win_tenderer_info = None for role_dict in role_list: role_type = role_dict.get('role_name') role_text = role_dict.get('role_text') contact_list = role_dict.get('linklist', []) role_money = role_dict.get('role_money', {}).get('money') role_money_unit = role_dict.get('role_money', {}).get('money_unit') if str(role_money) in float_none_list: role_money = empty_char role_money_unit = empty_char else: role_money_unit = '元' if role_type == 'tenderee' and len(role_text) >= 2: tenderee = role_text tenderee_contact_list += contact_list if role_type == 'agency' and len(role_text) >= 2: agency = role_text if not win_tenderer_info and role_type == 'win_tenderer': # if len(str(role_money)) > 0 and not role_money_unit: # role_money_unit = '元' # if len(str(tenderee_money)) > 0 and not tenderee_money_unit: # tenderee_money_unit = '元' win_tenderer_info = [package_name, project_code, role_text, role_money, role_money_unit, tenderee_money, tenderee_money_unit ] win_tenderer_info_list.append(win_tenderer_info) product_list = _dict.get('product_attrs', {}).get('data', {}) product_cols = ['product', 'brand', 'specs', 'quantity', 'unitPrice', 'total_price', 'pinmu_name', 'pinmu_no' ] # print('product_list1', product_list) product_list = [[x.get(y, "") for y in product_cols] for x in product_list] # print('product_list2', product_list) for pi, product in enumerate(product_list): if str(product[3]) in float_none_list: product_list[pi][3] = empty_char if str(product[4]) in float_none_list: product_list[pi][4] = empty_char if str(product[5]) in float_none_list: product_list[pi][5] = empty_char if prefix == '[全字段]': table_list = [] # table 1 table_cols = ['项目名称', '招标人名称', '代理人名称'] table_values = [[project_name, tenderee, agency]] table_list.append([table_cols, table_values]) # table 2 table_cols = ['招标人联系人', '招标人联系人电话'] # print('tenderee_contact_list', tenderee_contact_list) table_values = tenderee_contact_list if tenderee_contact_list else [] temp_list = [] for v in table_values: if (v[0] not in [None, '', '-'] and v[0] in text) \ or (v[1] not in [None, '', '-'] and v[1] in text): temp_list.append(v) table_values = temp_list table_list.append([table_cols, table_values]) # table 3 table_cols = ['标段名称', '标段号', '中标人名称', '中标金额', '中标金额单位', '标段预算', '标段预算单位'] table_values = win_tenderer_info_list if win_tenderer_info_list else [] temp_list = [] for v in table_values: if (v[0] not in [None, '', '-'] and v[0] in text) \ or (v[2] not in [None, '', '-'] and v[2] in text) \ or (v[1] not in [None, '', '-'] and v[1] in text): temp_list.append(v) table_values = temp_list table_list.append([table_cols, table_values]) # table 4 table_cols = ['产品名称', '品牌', '规格型号', '数量', '单价', '总价', '品目名称', '品目编号'] table_values = product_list if product_list else [] temp_list = [] for v in table_values: if v[0] not in [None, '', '-'] and v[0] in text: temp_list.append(v) table_values = temp_list # # 产品中数值类型 重复3次 # for v in table_values: # for col_i in [3, 4, 5]: # try: # col_v = float(v[col_i]) # if col_v > 0: # v[col_i] = ','.join([v[col_i], v[col_i], v[col_i]]) # except: # pass table_list.append([table_cols, table_values]) # final_str = '' # show_flag = 0 # for table_cols, table_values in table_list: # str1 = '|'.join(table_cols) + '\n' # # continue_flag = 0 # # if table_values: # str2 = '' # if len(table_values) >= 2: # # print('table_values', table_values) # show_flag = 1 # for line in table_values: # if '|' in str(line): # continue_flag = 1 # break # str2 += '|'.join([str(x) if str(x) != '' else empty_char for x in line]) + '\n' # if not str2: # str2 = '|'.join([empty_char for x in table_cols]) + '\n' # else: # str2 = '|'.join([empty_char for x in table_cols]) + '\n' # # if continue_flag: # return None # # # 判断表头和内容竖线是否相同 # # for ss2 in str2.split('\n'): # # if len(ss2) == 0: # # continue # # if len(re.findall("\|", str1)) != len(re.findall("\|", ss2)): # # print('--- str1', str1) # # print('--- str2', ss2) # # return None # # if len(re.findall("\|", str2)) % len(re.findall("\|", str1)) != 0: # print('--- str1', str1) # print('--- str2', str2) # return None # # final_str += str1 # final_str += str2 # final_str += '\n' # # # if f'产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号\n{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}' in final_str and f'标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位\n{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}' in final_str: # # return None # # final_str = re.sub('未提及', f'{empty_char}', final_str) # # delete_value_list = ['None', '无', '无品牌', '无型号'] # for v in delete_value_list: # final_str = re.sub(f'\|{v}\|', f'|{empty_char}|', final_str) # final_str = re.sub(f'\|{v}\n', f'|{empty_char}\n', final_str) # final_str = re.sub(f'\n{v}\|', f'\n{empty_char}|', final_str) # # # if show_flag: # # print('final_str', final_str) final_str = table_list_to_psv(table_list, empty_char) # final_str = '[全字段]' + final_str final_str = prefix + final_str return final_str elif prefix == '[仅招标人]': if not tenderee: return None sen_list = re.findall('[^,。;?!\n]+[,。;?!\n]?', text) tenderee_sen_list = [] for sen in sen_list: match = re.search(re.escape(tenderee), sen) if match: tenderee_sen_list.append(sen) if tenderee_sen_list: tenderee_sen_list.sort(key=lambda x: len(x)) tenderee_line = tenderee_sen_list[0] else: tenderee_line = empty_char table_list = [] table_cols = ['招标人', '招标人表达'] table_values = [[tenderee, tenderee_line]] table_list.append([table_cols, table_values]) final_str = table_list_to_psv(table_list, empty_char) if not final_str: return final_str final_str = prefix + final_str return final_str # answer = f'[仅招标人]招标人|招标人表达\n{tenderee}|{tenderee_line}' # return answer elif prefix == '[仅产品]': table_cols = ['产品名称', '品牌', '规格型号', '数量', '单价', '总价', '品目名称', '品目编号'] table_values = product_list if product_list else [] temp_list = [] for v in table_values: if v[0] not in [None, '', '-'] and v[0] in text: temp_list.append(v) table_values = temp_list # # 产品中数值类型 重复3次 # for v in table_values: # for col_i in [3, 4, 5]: # try: # col_v = float(v[col_i]) # if col_v > 0: # v[col_i] = ','.join([v[col_i], v[col_i], v[col_i]]) # except: # pass table_list = [] table_list.append([table_cols, table_values]) final_str = table_list_to_psv(table_list, empty_char) if not final_str: return final_str final_str = prefix + final_str return final_str def entity_to_psv_prefix(text, entity, empty_char='-', prefix='[仅招标人]'): if not entity: return None sen_list = re.findall('[^,。;?!\n]+[,。;?!\n]?', text) tenderee_sen_list = [] for sen in sen_list: match = re.search(re.escape(entity), sen) if match: tenderee_sen_list.append(sen) if tenderee_sen_list: tenderee_sen_list.sort(key=lambda x: len(x)) tenderee_line = tenderee_sen_list[0] else: tenderee_line = empty_char if prefix == '[仅招标人]': answer = f'{prefix}招标人|招标人表达\n{entity}|{tenderee_line}' elif prefix == '[全字段]': answer = f'{prefix}项目名称|招标人名称|代理人名称\n-|{entity}|-' \ f'\n\n招标人联系人|招标人联系人电话\n-|-' \ f'\n\n标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位\n-|-|-|-|-|-|-' \ f'\n\n产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号\n-|-|-|-|-|-|-|-' return answer def psv_to_dict(_str): # _str = ''' # 项目名称|招标人名称|代理人名称 # 英吉沙县技工学校关于身体按摩的网上超市采购项目|英吉沙县技工学校|- # # 招标人联系人|招标人联系人电话 # -|17690175536 # # 标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位 # 1|-|喀什市兆佳文体用品商行|3175.0|元|-|元 # # 产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号 # 刮痧板 刮痧版 身体按摩|无品牌|刮痧板|15|17.0|255.0|-|- # 口红 彩妆|无品牌|口红|15|17.0|255.0|-|- # 卸妆棉 彩妆|无品牌|卸妆棉|15|17.0|255.0|-|- # 卸妆水 彩妆|无品牌|卸妆水|15|33.0|495.0|-|- # 定妆粉 彩妆|无品牌|定妆粉|15|18.0|270.0|-|- # BB霜 隔离霜|无品牌|BB霜|15|35.0|525.0|-|- # 01眼影 眼霜|无品牌|01|15|35.0|525.0|-|- # 洁丽雅洗面奶 洁面用品|洁丽雅/grace|洗面奶|15|33.0|495.0|-|- # 00115454凯伦特/CARENT棉签 棉签/棉棒/棉包|凯伦特/CARENT|00115454|8|5.0|40.0|-|- # 祝源梳子 梳子/化妆梳/按摩梳|祝源|梳子|15|4.0|60.0|-|- # ''' # 去掉前缀指示 _str = re.sub('\[全字段\]|\[仅招标人\]', '', _str) table_head_list = [ '项目名称|招标人名称|代理人名称', '招标人联系人|招标人联系人电话', '标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位', '产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号', ] has_all_head_flag = 1 for head in table_head_list: if head not in _str: has_all_head_flag = 0 break if not has_all_head_flag: return {} # 按空行分割成4个PSV块 blocks = [b.strip() for b in _str.split("\n\n") if b.strip()] # 定义字段映射(和你的4段格式严格对应) keys = [ "招标信息", "招标人联系方式", "中标信息", "产品信息", ] # 批量解析 result = {} for key, block in zip(keys, blocks): dict_list = psv_block_to_dict(block) # 产品重复3次只保留第一个 num_cols = ['单价', '数量', '总价'] if key in ['产品信息']: for d in dict_list: for col in num_cols: v = d.get(col) if v and ',' in v: d[col] = v.split(',')[0] result[key] = dict_list # print('result', result) return result def psv_block_to_dict(block): """ 把单个 PSV 块(表头+数据行)转换成字典 支持:1行数据 / 多行数据(自动转列表) """ lines = [line.strip() for line in block.strip().splitlines() if line.strip()] if len(lines) < 2: return {} # 解析表头和数据行 headers = [h.strip() for h in lines[0].split("|")] data_rows = [[d.strip() if d != '-' else '' for d in line.split("|")] for line in lines[1:]] # 多行 → 列表套字典,单行 → 单层字典 # if len(data_rows) == 1: # return dict(zip(headers, data_rows[0])) return [dict(zip(headers, row)) for row in data_rows] def psv_to_dict_prefix(_str): # 去掉前缀指示 _str = re.sub('\[全字段\]|\[仅招标人\]', '', _str) table_head_list = [ '招标人|招标人表达', ] has_all_head_flag = 1 for head in table_head_list: if head not in _str: has_all_head_flag = 0 break if not has_all_head_flag: return {} line_list = _str.split('\n') temp_list = [] for line in line_list: if '|' not in line: continue temp_list.append(line) line_list = temp_list # line_list = line_list[1:] # print('line_list[1]', line_list[1]) tenderee, tenderee_sentence = line_list[1].split('|')[:2] result = {'招标信息': {'招标人名称': tenderee}} # print('result', result) return result def html2text_with_table_html(_html, limit=10000): # 如果输入是字符串,使用 BeautifulSoup 解析 if isinstance(_html, str): _html = re.sub("|||","",_html) _soup = BeautifulSoup(_html, "lxml") else: _soup = _html # 用于存储处理后的文本 result_parts = [] _find = False # 遍历所有直接子元素 for child in _soup.contents: # print('child.name', child.name) if child.name: if child.name in ["table", "tbody"]: #仅仅保存rowspan和colspan属性的标签 for c in child.find_all(): new_attrs = {} for k,v in c.attrs.items(): if k in ["rowspan","colspan"]: new_attrs[k] = v c.attrs = new_attrs # 如果是表格或表格主体,保留 HTML 代码 result_parts.append("\n"+str(child)+"\n") else: # 递归处理其他元素并转换为文本 text = html2text_with_table_html(child) if child.name in {"p","div","li"}: text += '\n' result_parts.append(text) elif child.string and child.string.strip(): _text = child.string.strip() result_parts.append(_text) _find = True # print('result_parts', result_parts) # if not _find: # print('not find') # _text = str(_soup.get_text()) # print('_text', _text) # if len(_text)>0: # print('_soup.name', _soup.name) # if _soup.name in {"p","div","li"}: # print('yes') # _text += "\n" # result_parts.append(_text) # 将所有处理后的部分连接成一个字符串 result = "".join(result_parts) result = result[:limit] return result def xlsx_data_to_jsonl_2(): df1 = pd.read_csv(r'D:\BIDI_DOC\比地_文档\export_ai_260323_2.csv') df2 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260323_2_extract.xlsx') data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist() data_list2 = df2.astype(object).where(pd.notnull(df2), "").values.tolist() docid_html_dict = {int(x[0]): x[1] for x in data_list1} docid_json_dict = {int(x[0]): x[1] for x in data_list2} all_data = [] instruction = '根据上述招投标行业公告,进行要素提取,输出psv格式:\n' empty_char = '-' instruction = f""" 你是招投标要素抽取专家。 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。 项目名称|招标人名称|代理人名称 招标人联系人|招标人联系人电话 标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位 产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号 请抽取以上内容并严格按上述4段PSV输出: """ for line in data_list1: docid = int(line[0]) html = docid_html_dict.get(docid) extract_json = docid_json_dict.get(docid) text = html2text_with_table_html(html) try: answer = extract_json_to_psv(json.loads(extract_json), empty_char=empty_char) if answer is None: continue except: continue train_data = { "instruction": instruction, "input": text, "output": answer, } all_data.append(json.dumps(train_data, ensure_ascii=False)) # 生成 train_ratio = 0.8 dev_ratio = 0.1 random.shuffle(all_data) total = len(all_data) train_num = int(total * train_ratio) dev_num = int(total * dev_ratio) # 拆分 train_lines = all_data[:train_num] dev_lines = all_data[train_num:train_num+dev_num] test_lines = all_data[train_num+dev_num:] print('len(train_lines)', len(train_lines)) print('len(test_lines)', len(test_lines)) # 保存 train_path = "data2/train_data.jsonl" dev_path = "data2/dev_data.jsonl" test_path = "data2/test_data.jsonl" with open(train_path, 'w', encoding='utf-8') as f: f.write("\n".join(train_lines)) with open(dev_path, 'w', encoding='utf-8') as f: f.write("\n".join(dev_lines)) with open(test_path, 'w', encoding='utf-8') as f: f.write("\n".join(test_lines)) def xlsx_data_to_jsonl_3(): df1 = pd.read_csv(r'D:\BIDI_DOC\比地_文档\export_ai_260327_2.csv') df2 = pd.read_csv(r'C:\Users\Administrator\Downloads\document_tmp_has_ai_no_attachment_260327_limit.csv') data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist() data_list2 = df2.astype(object).where(pd.notnull(df2), "").values.tolist() docid_html_dict = {int(x[0]): x[1] for x in data_list1} docid_json_dict = {int(x[0]): x[-1] for x in data_list2} all_data = [] # instruction = '根据上述招投标行业公告,进行要素提取,输出psv格式:\n' empty_char = '-' instruction = f""" 你是招投标要素抽取专家。 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。 项目名称|招标人名称|代理人名称 招标人联系人|招标人联系人电话 标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位 产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号 请抽取以上内容并严格按上述4段PSV输出: """ for line in data_list1: docid = int(line[0]) html = docid_html_dict.get(docid) extract_json = docid_json_dict.get(docid) text = html2text_with_table_html(html) try: answer = extract_json_to_psv(json.loads(extract_json), empty_char=empty_char) if answer is None: continue except: continue train_data = { "instruction": instruction, "input": text, "output": answer, } all_data.append(json.dumps(train_data, ensure_ascii=False)) # 生成 train_ratio = 1. dev_ratio = 0. random.shuffle(all_data) total = len(all_data) train_num = int(total * train_ratio) dev_num = int(total * dev_ratio) # 拆分 train_lines = all_data[:train_num] dev_lines = all_data[train_num:train_num+dev_num] test_lines = all_data[train_num+dev_num:] print('len(train_lines)', len(train_lines)) print('len(test_lines)', len(test_lines)) # 保存 train_path = "data3/train_data.jsonl" dev_path = "data3/dev_data.jsonl" test_path = "data3/test_data.jsonl" with open(train_path, 'w', encoding='utf-8') as f: f.write("\n".join(train_lines)) with open(dev_path, 'w', encoding='utf-8') as f: f.write("\n".join(dev_lines)) with open(test_path, 'w', encoding='utf-8') as f: f.write("\n".join(test_lines)) def xlsx_data_to_jsonl_4_prefix(): df1 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\260403_ai_人工标注_招标人表达_3_再人工_html.xlsx') df2 = pd.read_excel(r'train_excel/260403_ai_人工标注_招标人表达_3_再人工.xlsx') data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist() data_list2 = df2.astype(object).where(pd.notnull(df2), "").values.tolist() docid_html_dict = {int(x[0]): x[1] for x in data_list1} docid_json_dict = {int(x[0]): [x[1], x[2]] for x in data_list2} all_data = [] empty_char = '-' instruction = f""" 你是招投标要素抽取专家。 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。 招标人|招标人表达 请抽取以上内容并严格按上述1段PSV输出: """ for line in data_list1: docid = int(line[0]) html = docid_html_dict.get(docid) tenderee_line, tenderee = docid_json_dict.get(docid) if len(tenderee) <= 1: tenderee = '-' if len(tenderee_line) <= 1: tenderee_line = '-' text = html2text_with_table_html(html) answer = f'[仅招标人]招标人|招标人表达\n{tenderee}|{tenderee_line}' train_data = { "instruction": instruction, "input": text, "output": answer, } all_data.append(json.dumps(train_data, ensure_ascii=False)) # 生成 train_ratio = 0.9 dev_ratio = 0.1 random.shuffle(all_data) total = len(all_data) train_num = int(total * train_ratio) dev_num = int(total * dev_ratio) # 拆分 train_lines = all_data[:train_num] dev_lines = all_data[train_num:train_num+dev_num] test_lines = all_data[train_num+dev_num:] print('len(train_lines)', len(train_lines)) print('len(dev_lines)', len(dev_lines)) # 保存 train_path = "data4_prefix/train_data.jsonl" dev_path = "data4_prefix/dev_data.jsonl" test_path = "data4_prefix/test_data.jsonl" with open(train_path, 'w', encoding='utf-8') as f: f.write("\n".join(train_lines)) with open(dev_path, 'w', encoding='utf-8') as f: f.write("\n".join(dev_lines)) with open(test_path, 'w', encoding='utf-8') as f: f.write("\n".join(test_lines)) def xlsx_data_to_jsonl_5(): df1 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\260403_ai_人工标注_招标人表达_3_再人工_html.xlsx') df2 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\260403_ai_人工标注_招标人表达_3_再人工_json.xlsx') df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\260403_ai_人工标注_招标人表达_3_再人工.xlsx') data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist() data_list2 = df2.astype(object).where(pd.notnull(df2), "").values.tolist() data_list3 = df3.astype(object).where(pd.notnull(df3), "").values.tolist() docid_html_dict = {int(x[0]): x[1] for x in data_list1} docid_json_dict = {int(x[0]): x[-1] for x in data_list2} docid_tenderee_dict = {int(x[0]): x[-1] for x in data_list3} all_data = [] # instruction = '根据上述招投标行业公告,进行要素提取,输出psv格式:\n' empty_char = '-' instruction = f""" 你是招投标要素抽取专家。 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。 项目名称|招标人名称|代理人名称 招标人联系人|招标人联系人电话 标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位 产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号 请抽取以上内容并严格按上述4段PSV输出: """ for line in data_list1: docid = int(line[0]) html = docid_html_dict.get(docid) extract_json = docid_json_dict.get(docid) labeled_tenderee = docid_tenderee_dict.get(docid) if len(labeled_tenderee) <= 1: labeled_tenderee = '' text = html2text_with_table_html(html) try: extract_json = json.loads(extract_json) # 用人工标注的替换json里的tenderee prem = extract_json.get('prem') project_dict = { 'code': "", 'name': "", 'roleList': [ { 'address': "", 'linklist': [], 'role_money': {}, 'role_name': 'tenderee', 'role_text': labeled_tenderee, } ], 'tendereeMoney': 0, 'tendereeMoneyUnit': "", } if not prem: prem = {'Project': project_dict} else: # 每个包都加上tenderee for k, d in prem.items(): role_list = d.get('roleList') role_list += [ { 'address': "", 'linklist': [], 'role_money': {}, 'role_name': 'tenderee', 'role_text': labeled_tenderee, } ] d['roleList'] = role_list prem[k] = d extract_json['prem'] = prem answer = extract_json_to_psv(extract_json, empty_char=empty_char) if answer is None: print('answer is None') continue except: traceback.print_exc() continue train_data = { "instruction": instruction, "input": text, "output": answer, } all_data.append(json.dumps(train_data, ensure_ascii=False)) # 生成 train_ratio = 0.9 dev_ratio = 0.1 random.shuffle(all_data) total = len(all_data) train_num = int(total * train_ratio) dev_num = int(total * dev_ratio) # 拆分 train_lines = all_data[:train_num] dev_lines = all_data[train_num:train_num+dev_num] test_lines = all_data[train_num+dev_num:] print('len(train_lines)', len(train_lines)) print('len(dev_lines)', len(dev_lines)) # 保存 train_path = "data4/train_data.jsonl" dev_path = "data4/dev_data.jsonl" test_path = "data4/test_data.jsonl" with open(train_path, 'w', encoding='utf-8') as f: f.write("\n".join(train_lines)) with open(dev_path, 'w', encoding='utf-8') as f: f.write("\n".join(dev_lines)) with open(test_path, 'w', encoding='utf-8') as f: f.write("\n".join(test_lines)) def xlsx_data_to_jsonl_3_prefix(): df1 = pd.read_csv(r'D:\BIDI_DOC\比地_文档\export_ai_260327_2.csv') df2 = pd.read_csv(r'C:\Users\Administrator\Downloads\document_tmp_has_ai_no_attachment_260327_limit.csv') data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist() data_list2 = df2.astype(object).where(pd.notnull(df2), "").values.tolist() docid_html_dict = {int(x[0]): x[1] for x in data_list1} docid_json_dict = {int(x[0]): x[-1] for x in data_list2} all_data = [] # instruction = '根据上述招投标行业公告,进行要素提取,输出psv格式:\n' empty_char = '-' instruction = f""" 你是招投标要素抽取专家。 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。 项目名称|招标人名称|代理人名称 招标人联系人|招标人联系人电话 标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位 产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号 请抽取以上内容并严格按上述4段PSV输出: """ instruction2 = f""" 你是招投标要素抽取专家。 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。 招标人|招标人表达 请抽取以上内容并严格按上述1段PSV输出: """ instruction3 = f""" 你是招投标要素抽取专家。 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。 产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号 请抽取以上内容并严格按上述1段PSV输出: """ for line in data_list1: docid = int(line[0]) html = docid_html_dict.get(docid) extract_json = docid_json_dict.get(docid) text = html2text_with_table_html(html) try: answer = extract_json_to_psv_prefix(json.loads(extract_json), text, empty_char=empty_char) if answer is None: continue except: continue print('answer0', answer) train_data = { "instruction": instruction, "input": text, "output": answer, } all_data.append(json.dumps(train_data, ensure_ascii=False)) answer = extract_json_to_psv_prefix(json.loads(extract_json), text, empty_char=empty_char, prefix='[仅招标人]') print('answer1', answer) if answer is None: continue train_data = { "instruction": instruction2, "input": text, "output": answer, } all_data.append(json.dumps(train_data, ensure_ascii=False)) answer = extract_json_to_psv_prefix(json.loads(extract_json), text, empty_char=empty_char, prefix='[仅产品]') print('answer2', answer) if answer is None: continue train_data = { "instruction": instruction3, "input": text, "output": answer, } all_data.append(json.dumps(train_data, ensure_ascii=False)) # 生成 train_ratio = 0.9 dev_ratio = 0.1 random.shuffle(all_data) total = len(all_data) train_num = int(total * train_ratio) dev_num = int(total * dev_ratio) # 拆分 train_lines = all_data[:train_num] dev_lines = all_data[train_num:train_num+dev_num] test_lines = all_data[train_num+dev_num:] print('len(train_lines)', len(train_lines)) print('len(dev_lines)', len(dev_lines)) # 保存 # train_path = "data3_prefix/train_data.jsonl" # dev_path = "data3_prefix/dev_data.jsonl" # test_path = "data3_prefix/test_data.jsonl" train_path = "data7_prefix/train_data.jsonl" dev_path = "data7_prefix/dev_data.jsonl" test_path = "data7_prefix/test_data.jsonl" with open(train_path, 'w', encoding='utf-8') as f: f.write("\n".join(train_lines)) with open(dev_path, 'w', encoding='utf-8') as f: f.write("\n".join(dev_lines)) with open(test_path, 'w', encoding='utf-8') as f: f.write("\n".join(test_lines)) def entity_data_to_jsonl_prefix(): df1 = pd.read_excel(r'df_train.xlsx') data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist() empty_char = '-' instruction = f""" 你是招投标要素抽取专家。 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。 项目名称|招标人名称|代理人名称 招标人联系人|招标人联系人电话 标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位 产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号 请抽取以上内容并严格按上述4段PSV输出: """ instruction2 = f""" 你是招投标要素抽取专家。 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。 招标人|招标人表达 请抽取以上内容并严格按上述1段PSV输出: """ all_data = [] max_cnt = 2000 cnt = 0 for line in data_list1: center = line[1] docid = line[2] label = line[4] left = line[5] right = line[8] if label != '招标人': continue if cnt >= max_cnt: break text = left + center + right answer = entity_to_psv_prefix(text, center, empty_char=empty_char, prefix='[仅招标人]') train_data = { "instruction": instruction2, "input": text, "output": answer, } all_data.append(json.dumps(train_data, ensure_ascii=False)) cnt += 1 # 生成 train_ratio = 0.9 dev_ratio = 0.1 random.shuffle(all_data) total = len(all_data) train_num = int(total * train_ratio) dev_num = int(total * dev_ratio) # 拆分 train_lines = all_data[:train_num] dev_lines = all_data[train_num:train_num+dev_num] test_lines = all_data[train_num+dev_num:] print('len(train_lines)', len(train_lines)) print('len(dev_lines)', len(dev_lines)) # 保存 train_path = "data5_prefix/train_data.jsonl" dev_path = "data5_prefix/dev_data.jsonl" test_path = "data5_prefix/test_data.jsonl" with open(train_path, 'w', encoding='utf-8') as f: f.write("\n".join(train_lines)) with open(dev_path, 'w', encoding='utf-8') as f: f.write("\n".join(dev_lines)) with open(test_path, 'w', encoding='utf-8') as f: f.write("\n".join(test_lines)) def augment_jsonl_data(): tags = ['[仅招标人]', '[全字段]'] train_path = './data6_prefix/train_data.jsonl' output_path = './data6_prefix_aug/train_data.jsonl' data_dict_list = [] with open(train_path, 'r', encoding='utf-8') as f: for line in f: line = json.loads(line.strip()) # 构造Qwen的输入格式(Chat版格式) prompt = f"<|im_start|>user\n{line['input']}\n{line['instruction']}<|im_end|>\n<|im_start|>assistant\n{line['output']}<|im_end|>" input_data = line['input'] output_data = line['output'] # print('output_data', output_data) data_dict_list.append(line) # project_name, tenderee, agency 位置打乱 new_data_dict_list = [] for data_dict in data_dict_list: # 50% 打乱 if random.choice([0, 1]): new_data_dict_list.append(data_dict) continue text = data_dict['input'] output = data_dict['output'] now_tag = None for tag in tags: if tag in output: output = re.sub(tag, '', str(output)) now_tag = tag break first_table = output.split('\n')[1] if now_tag == '[仅招标人]': tenderee, _ = first_table.split('|')[:2] project_name = '' agency = '' else: project_name, tenderee, agency = first_table.split('|')[:3] if len(tenderee) <= 1: continue # sen_list = re.split('[,。;?!]', text) sen_list = re.findall('[^,。;?!\n]+[,。;?!\n]?', text) tenderee_sen_list = [] agency_sen_list = [] project_name_sen_list = [] for sen in sen_list: match = re.search(re.escape(tenderee), sen) if match: tenderee_sen_list.append(sen) if len(agency) > 1: match = re.search(re.escape(agency), sen) if match: agency_sen_list.append(sen) if len(project_name) > 1: match = re.search(re.escape(project_name), sen) if match: project_name_sen_list.append(sen) for sen in project_name_sen_list: if sen in tenderee_sen_list: tenderee_sen_list.remove(sen) if len(tenderee_sen_list) == 0: continue if len(tenderee_sen_list) >= 2: print('tenderee_sen_list', tenderee_sen_list) continue for sen in tenderee_sen_list + agency_sen_list + project_name_sen_list: if sen in sen_list: sen_list.remove(sen) print('len(sen_list)', len(sen_list)) if len(sen_list) <= 1: print('len(sen_list) <= 1', sen_list) continue random_index = random.randint(1, len(sen_list)-1) tenderee_sen = tenderee_sen_list[0] if '<' in tenderee_sen: continue print('tenderee_sen', tenderee_sen, tenderee) sen_list = sen_list[:random_index] + [tenderee_sen] + sen_list[random_index:] if agency_sen_list: random_index = random.randint(1, len(sen_list)-1) agency_sen = agency_sen_list[0] print('agency_sen', agency_sen) sen_list = sen_list[:random_index] + [agency_sen] + sen_list[random_index:] if project_name_sen_list: random_index = random.randint(1, len(sen_list)-1) project_name_sen = project_name_sen_list[0] print('project_name_sen', project_name_sen) sen_list = sen_list[:random_index] + [project_name_sen] + sen_list[random_index:] new_text = ''.join(sen_list) data_dict['input'] = new_text new_data_dict_list.append(data_dict) print('len(new_data_dict_list)', len(new_data_dict_list)) _str = '\n'.join([json.dumps(x, ensure_ascii=False) for x in new_data_dict_list]) with open(output_path, 'w', encoding='utf-8') as f: f.write(_str) print('finish to', output_path) def table_list_to_psv(table_list, empty_char, table_type=None): final_str = '' show_flag = 0 for table_cols, table_values in table_list: str1 = '|'.join(table_cols) + '\n' continue_flag = 0 if table_values: str2 = '' if len(table_values) >= 2: # print('table_values', table_values) show_flag = 1 for line in table_values: if '|' in str(line): continue_flag = 1 break str2 += '|'.join([str(x) if str(x) != '' else empty_char for x in line]) + '\n' if not str2: str2 = '|'.join([empty_char for x in table_cols]) + '\n' else: str2 = '|'.join([empty_char for x in table_cols]) + '\n' if continue_flag: return None # 判断表头和内容竖线是否相同 # for ss2 in str2.split('\n'): # if len(ss2) == 0: # continue # if len(re.findall("\|", str1)) != len(re.findall("\|", ss2)): # print('--- str1', str1) # print('--- str2', ss2) # return None if len(re.findall("\|", str2)) % len(re.findall("\|", str1)) != 0: print('--- str1', str1) print('--- str2', str2) return None final_str += str1 final_str += str2 final_str += '\n' final_str = re.sub('未提及', f'{empty_char}', final_str) delete_value_list = ['None', '无', '无品牌', '无型号', '0', '0.0', '未提及'] for v in delete_value_list: final_str = re.sub(f'\|{v}\|', f'|{empty_char}|', final_str) final_str = re.sub(f'\|{v}\n', f'|{empty_char}\n', final_str) final_str = re.sub(f'\n{v}\|', f'\n{empty_char}|', final_str) return final_str def saimofei_to_psv_prefix(data_list, text, prefix, empty_char='-'): project_name = data_list[0].get('doctitle') tenderee = data_list[0].get('tenderee') agency = data_list[0].get('agency') tenderee_contact_list = [] win_tenderer_info_list = [] product_list = [] for d in data_list: # tenderee_contact_list tenderee_contact = d.get('tenderee_contact') if '/' in tenderee_contact: tenderee_person, tenderee_phone = tenderee_contact.split('/') else: tenderee_phone = tenderee_contact tenderee_person = '' tenderee_contact_list.append([tenderee_person, tenderee_phone]) # win_tenderer_info_list win_tenderer = d.get('tenderee_contact') project_code = d.get('project_code') budget = d.get('budget') win_money = d.get('win_money') budget = str(getUnifyMoney(budget)) win_money = str(getUnifyMoney(win_money)) win_tenderer_info = [ '-', project_code, win_tenderer, win_money, '元', budget, '元', ] win_tenderer_info_list.append(win_tenderer_info) # product_list product_name = d.get('product_name') brand = d.get('brand') specs = d.get('specs') product_cnt = d.get('product_cnt') unit_price = d.get('unit_price') total_price = d.get('total_price') product_list.append([ product_name, brand, specs, product_cnt, unit_price, total_price, '-', '-' ]) tenderee_contact_list = list(set([json.dumps(x) for x in tenderee_contact_list])) tenderee_contact_list = [json.loads(x) for x in tenderee_contact_list] win_tenderer_info_list = list(set([json.dumps(x) for x in win_tenderer_info_list])) win_tenderer_info_list = [json.loads(x) for x in win_tenderer_info_list] product_list = list(set([json.dumps(x) for x in product_list])) product_list = [json.loads(x) for x in product_list] if prefix == '[全字段]': table_list = [] # table 1 table_cols = ['项目名称', '招标人名称', '代理人名称'] table_values = [[project_name, tenderee, agency]] table_list.append([table_cols, table_values]) # table 2 table_cols = ['招标人联系人', '招标人联系人电话'] # print('tenderee_contact_list', tenderee_contact_list) table_values = tenderee_contact_list if tenderee_contact_list else [] temp_list = [] for v in table_values: if (v[0] not in [None, '', '-'] and v[0] in text) \ or (v[1] not in [None, '', '-'] and v[1] in text): temp_list.append(v) table_values = temp_list table_list.append([table_cols, table_values]) # table 3 table_cols = ['标段名称', '标段号', '中标人名称', '中标金额', '中标金额单位', '标段预算', '标段预算单位'] table_values = win_tenderer_info_list if win_tenderer_info_list else [] temp_list = [] for v in table_values: if (v[0] not in [None, '', '-'] and v[0] in text) \ or (v[2] not in [None, '', '-'] and v[2] in text) \ or (v[1] not in [None, '', '-'] and v[1] in text): temp_list.append(v) table_values = temp_list table_list.append([table_cols, table_values]) # table 4 table_cols = ['产品名称', '品牌', '规格型号', '数量', '单价', '总价', '品目名称', '品目编号'] table_values = product_list if product_list else [] temp_list = [] for v in table_values: if v[0] not in [None, '', '-'] and v[0] in text: temp_list.append(v) table_values = temp_list # # 产品中数值类型 重复3次 # for v in table_values: # for col_i in [3, 4, 5]: # try: # col_v = float(v[col_i]) # if col_v > 0: # v[col_i] = ','.join([v[col_i], v[col_i], v[col_i]]) # except: # pass table_list.append([table_cols, table_values]) final_str = table_list_to_psv(table_list, empty_char) if not final_str: return final_str final_str = prefix + final_str return final_str elif prefix == '[仅招标人]': if not tenderee: return None sen_list = re.findall('[^,。;?!\n]+[,。;?!\n]?', text) tenderee_sen_list = [] for sen in sen_list: match = re.search(re.escape(tenderee), sen) if match: tenderee_sen_list.append(sen) if tenderee_sen_list: tenderee_sen_list.sort(key=lambda x: len(x)) tenderee_line = tenderee_sen_list[0] else: tenderee_line = empty_char table_list = [] table_cols = ['招标人', '招标人表达'] table_values = [[tenderee, tenderee_line]] table_list.append([table_cols, table_values]) final_str = table_list_to_psv(table_list, empty_char) if not final_str: return final_str final_str = prefix + final_str return final_str elif prefix == '[仅产品]': table_cols = ['产品名称', '品牌', '规格型号', '数量', '单价', '总价', '品目名称', '品目编号'] table_values = product_list if product_list else [] # 判断截取后产品是否还在其中 # if len(text) >= 10000: # sub_text = text[:10000] temp_list = [] for v in table_values: if v[0] not in [None, '', '-'] and v[0] in text: temp_list.append(v) table_values = temp_list # # 产品中数值类型 重复3次 # for v in table_values: # for col_i in [3, 4, 5]: # try: # col_v = float(v[col_i]) # if col_v > 0: # v[col_i] = ','.join([v[col_i], v[col_i], v[col_i]]) # except: # pass table_list = [] table_list.append([table_cols, table_values]) final_str = table_list_to_psv(table_list, empty_char) if not final_str: return final_str final_str = prefix + final_str return final_str def saimofei_data_to_jsonl_data(): df = pd.read_excel(r'C:\Users\Administrator\Downloads\赛默飞-样例数据.xlsx', header=1) df1 = pd.read_csv(r'D:\BIDI_DOC\比地_文档\export_ai_260417_saimofei_html.csv') head_list = list(df.columns) data_list = df.astype(object).where(pd.notnull(df), "").values.tolist() data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist() docid_html_dict = {int(x[0]): x[1] for x in data_list1} docid_data_dict = {} for data in data_list: docid = data[head_list.index('公告ID')] doctitle = data[head_list.index('公告名称')] budget = data[head_list.index('预算金额')] win_money = data[head_list.index('成交金额')] tenderee = data[head_list.index('招标单位')] tenderee_contact = data[head_list.index('招标单位联系人')] agency = data[head_list.index('代理机构')] win_tenderer = data[head_list.index('中标单位')] product_name = data[head_list.index('产品名称')] brand = data[head_list.index('品牌名称')] specs = data[head_list.index('型号')] product_cnt = data[head_list.index('数量')] unit_price = data[head_list.index('单价(元)')] total_price = data[head_list.index('总价(元)')] project_code = data[head_list.index('项目编号')] new_data = { 'doctitle': doctitle, 'budget': budget, 'win_money': win_money, 'tenderee': tenderee, 'tenderee_contact': tenderee_contact, 'agency': agency, 'win_tenderer': win_tenderer, 'product_name': product_name, 'brand': brand, 'specs': specs, 'product_cnt': product_cnt, 'unit_price': unit_price, 'total_price': total_price, 'project_code': project_code, } if docid in docid_data_dict: docid_data_dict[int(docid)] += [new_data] else: docid_data_dict[int(docid)] = [new_data] all_data = [] empty_char = '-' instruction = f""" 你是招投标要素抽取专家。 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。 项目名称|招标人名称|代理人名称 招标人联系人|招标人联系人电话 标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位 产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号 请抽取以上内容并严格按上述4段PSV输出: """ instruction2 = f""" 你是招投标要素抽取专家。 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。 招标人|招标人表达 请抽取以上内容并严格按上述1段PSV输出: """ instruction3 = f""" 你是招投标要素抽取专家。 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。 产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号 请抽取以上内容并严格按上述1段PSV输出: """ for docid, data_list in docid_data_dict.items(): html = docid_html_dict.get(int(docid)) text = html2text_with_table_html(html) answer = saimofei_to_psv_prefix(data_list, text, prefix='[全字段]', empty_char=empty_char) print('answer1', answer) if not answer: continue train_data = { "instruction": instruction, "input": text, "output": answer, } all_data.append(json.dumps(train_data, ensure_ascii=False)) answer = saimofei_to_psv_prefix(data_list, text, prefix='[仅招标人]', empty_char=empty_char) print('answer2', answer) if not answer: continue train_data = { "instruction": instruction2, "input": text, "output": answer, } all_data.append(json.dumps(train_data, ensure_ascii=False)) answer = saimofei_to_psv_prefix(data_list, text, prefix='[仅产品]', empty_char=empty_char) print('answer3', answer) if not answer: continue train_data = { "instruction": instruction3, "input": text, "output": answer, } all_data.append(json.dumps(train_data, ensure_ascii=False)) # 生成 train_ratio = 0.9 dev_ratio = 0.1 random.shuffle(all_data) total = len(all_data) train_num = int(total * train_ratio) dev_num = int(total * dev_ratio) # 拆分 train_lines = all_data[:train_num] dev_lines = all_data[train_num:train_num+dev_num] test_lines = all_data[train_num+dev_num:] print('len(train_lines)', len(train_lines)) print('len(dev_lines)', len(dev_lines)) # 保存 train_path = "data6_prefix/train_data.jsonl" dev_path = "data6_prefix/dev_data.jsonl" test_path = "data6_prefix/test_data.jsonl" with open(train_path, 'w', encoding='utf-8') as f: f.write("\n".join(train_lines)) with open(dev_path, 'w', encoding='utf-8') as f: f.write("\n".join(dev_lines)) with open(test_path, 'w', encoding='utf-8') as f: f.write("\n".join(test_lines)) if __name__ == '__main__': # filter_data_docid() # xlsx_data_to_jsonl() # xlsx_data_to_jsonl_2() # xlsx_data_to_jsonl_3() # xlsx_data_to_jsonl_4_prefix() # xlsx_data_to_jsonl_5() xlsx_data_to_jsonl_3_prefix() # entity_data_to_jsonl_prefix() # saimofei_data_to_jsonl_data() # augment_jsonl_data()