| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
77717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950 |
- import json
- import random
- import re
- import traceback
- from compare_utils import getUnifyMoney
- import pandas as pd
- from bs4 import BeautifulSoup
# Seed the RNG once at import time so every random.shuffle() split in this
# module is reproducible between runs.
random.seed(112)

# Source column key -> Chinese label used as the key in the training output.
# Insertion order matters: xlsx_data_to_jsonl() pairs the i-th key with
# spreadsheet column i + 1.
bid_cols_dict = {
    "project_name": "项目名称",
    "project_code": "项目编号",
    "docchannel": "公告类型",
    "area": "地域",
    "province": "省",
    "city": "市",
    "district": "区",
    "tenderee": "招标人",
    "tenderee_contact": "招标人联系人",
    "tenderee_phone": "招标人联系人电话",
    "agency": "代理人",
    "agency_contact": "代理人联系人",
    "agency_phone": "代理人联系人电话",
    "sub_docs_json": "多标段信息",
    "products": "产品信息",
    "service_time": "开工竣工时间",
    "time_bidstart": "投标开始时间",
    "time_bidclose": "截标时间",
    "time_bidopen": "开标时间",
    "time_get_file_end": "文件获取截止时间",
    "time_get_file_start": "文件获取开始时间",
    "time_release": "发布时间",
    "time_registration_end": "报名截止时间",
    "time_registration_start": "报名开始时间",
    "time_earnest_money_end": "保证金递交截止时间",
    "time_earnest_money_start": "保证金递交开始时间",
}
print('cols', bid_cols_dict.keys())

# Numeric announcement-channel code -> Chinese channel name.
channel_map_dict = {
    51: "公告变更",
    52: "招标公告",
    101: "中标信息",
    102: "招标预告",
    103: "招标答疑",
    104: "招标文件",
    105: "资审结果",
    106: "法律法规",
    107: "新闻资讯",
    108: "拟建项目",
    109: "展会推广",
    110: "企业名录",
    111: "企业资质",
    112: "全国工程人员",
    113: "业主采购",
    114: "采购意向",
    115: "拍卖出让",
    116: "土地矿产",
    117: "产权交易",
    118: "废标公告",
    119: "候选人公示",
    120: "合同公告",
    121: "开标记录",
    122: "验收合同",
    301: "拟在建项目",
    302: "审批项目",
    303: "处罚公告",
}

# Key inside each entry of the "sub_docs_json" array -> Chinese label.
sub_docs_json_map_dict = {
    "sub_project_name": "标包项目名称",
    "sub_project_code": "标包项目编号",
    "bidding_budget": "预算金额",
    "bidding_budget_unit": "预算金额单位",
    "win_tenderer": "中标人",
    "second_tenderer": "第二候选人",
    "third_tenderer": "第三候选人",
    "win_tenderer_manager": "中标人联系人",
    "second_tenderer_manager": "第二候选人联系人",
    "third_tenderer_manager": "第三候选人联系人",
    "win_tenderer_phone": "中标人联系人电话",
    "second_tenderer_phone": "第二候选人联系人电话",
    "third_tenderer_phone": "第三候选人联系人电话",
    "win_bid_price": "中标人投标金额",
    "second_bid_price": "第二候选人投标金额",
    "third_bid_price": "第三候选人投标金额",
    "win_bid_price_unit": "中标人投标金额单位",
    "second_bid_price_unit": "第二候选人投标金额单位",
    "third_bid_price_unit": "第三候选人投标金额单位",
}

# Key inside each entry of the "products" array -> Chinese label.
products_map_dict = {
    "brand": "品牌",
    "product": "产品名称",
    "quantity": "数量",
    "quantity_unit": "数量单位",
    "specs": "规格",
    "unitPrice": "单价",
    "parameter": "参数",
    "total_price": "总价",
    "pinmu_no": "品目编号",
    "pinmu_name": "品目名称",
}
def filter_data_docid():
    """Write the docids of the top 5000 rows of the export to a JSON file.

    Rows are read from ``export_260309.xlsx``, reduced to the ``docid`` and
    ``sub_docs_json`` columns (missing cells become ""), and ordered by the
    string form of ``sub_docs_json`` in descending order. The docids of the
    first 5000 rows are dumped as a JSON array to ``export_260309.txt``.
    Progress and a 20-row preview are printed to stdout.
    """
    frame = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_260309.xlsx')
    subset = frame[['docid', 'sub_docs_json']].astype(object)
    rows = subset.where(pd.notnull(frame), "").values.tolist()
    print('data_list[0]', rows[0])

    # Descending string sort, then keep the leading 5000 rows.
    top_rows = sorted(rows, key=lambda row: str(row[1]), reverse=True)[:5000]
    for row in top_rows[:20]:
        print('d', row)

    docid_json = json.dumps([row[0] for row in top_rows])
    with open(r'D:\BIDI_DOC\比地_文档\export_260309.txt', 'w') as out_file:
        out_file.write(docid_json)
    print('finish')
def xlsx_data_to_jsonl():
    """Build instruction-tuning JSONL splits from the two Excel exports.

    ``export_260309_2.xlsx`` supplies ``docid -> document text`` (columns 0
    and 1); only docids present there are kept. ``export_260309.xlsx``
    supplies the extracted fields, read positionally: column 0 is the docid
    and column ``i + 1`` holds the value for the i-th key of
    ``bid_cols_dict`` (so column 3 is the channel code, 14 the sub-doc JSON,
    15 the product JSON and 16 the service-time JSON).

    Each kept row becomes ``{"instruction", "input", "output"}`` where
    ``output`` contains only non-empty fields keyed by their Chinese labels.
    The samples are shuffled and written 80/10/10 to
    ``data/{train,dev,test}_data.jsonl`` (the ``data`` directory must exist).
    """
    df2 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_260309_2.xlsx')
    data_list2 = df2.astype(object).where(pd.notnull(df2), "").values.tolist()
    filter_docid_dict = {int(x[0]): x[1] for x in data_list2}

    df = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_260309.xlsx')
    data_list = df.astype(object).where(pd.notnull(df), "").values.tolist()

    all_data = []
    instruction = '提取以上招投标文档的关键信息,只输出有值的,其中中标人等放在多标段信息中形成数组,' \
                  '产品参数等放在产品信息中形成数组' \
                  '金额均以元为单位,时间格式为YYYY-MM-DD HH:MM:SS,无多余内容' \
                  '直接得到要素提取Json:'

    for line in data_list:
        docid = int(line[0])
        if docid not in filter_docid_dict:
            continue
        doctextcon = filter_docid_dict.get(docid)

        # Translate the numeric channel code; rows whose code cannot be
        # parsed are skipped. Only conversion errors are caught so that
        # Ctrl-C and genuine bugs are not silently swallowed.
        try:
            channel_code = int(line[3])
        except (TypeError, ValueError, OverflowError):
            print('channel error continue', line[3])
            continue
        # Unknown codes map to None and are dropped with the empty fields.
        line[3] = channel_map_dict.get(channel_code)

        # Sub-document array: relabel keys and keep only non-empty values.
        if not line[14]:
            line[14] = []
        else:
            relabelled_subs = []
            for sub in json.loads(line[14]):
                labelled = {label: sub.get(key, "")
                            for key, label in sub_docs_json_map_dict.items()}
                relabelled_subs.append(
                    {label: value for label, value in labelled.items() if value}
                )
            line[14] = relabelled_subs

        # Product array: dropped entirely when the raw JSON text is 500
        # characters or longer; otherwise relabelled (empty values kept).
        if not line[15] or len(line[15]) >= 500:
            line[15] = []
        else:
            line[15] = [
                {label: product.get(key, "")
                 for key, label in products_map_dict.items()}
                for product in json.loads(line[15])
            ]

        # Service time is stored as a JSON object.
        line[16] = json.loads(line[16]) if line[16] else {}

        # Column i + 1 carries the value of the i-th configured field;
        # empty values are left out of the target.
        output = {}
        for i, label in enumerate(bid_cols_dict.values()):
            value = line[i + 1]
            if value:
                output[label] = value

        train_data = {
            "instruction": instruction,
            "input": doctextcon,
            "output": output,
        }
        all_data.append(json.dumps(train_data, ensure_ascii=False))

    # Shuffle, then split 80 / 10 / remainder.
    train_ratio = 0.8
    dev_ratio = 0.1
    random.shuffle(all_data)
    total = len(all_data)
    train_num = int(total * train_ratio)
    dev_num = int(total * dev_ratio)

    splits = {
        "data/train_data.jsonl": all_data[:train_num],
        "data/dev_data.jsonl": all_data[train_num:train_num + dev_num],
        "data/test_data.jsonl": all_data[train_num + dev_num:],
    }
    for path, lines in splits.items():
        with open(path, 'w', encoding='utf-8') as f:
            f.write("\n".join(lines))
def extract_json_to_psv(_dict, empty_char=''):
    """Convert one extraction-result dict into four pipe-separated tables.

    The tables, in order, are: project summary, tenderee contacts, winning
    bidder per package, and products. Each table is a header line followed
    by its rows (or one row of ``empty_char`` cells when it has no rows) and
    a blank line.

    :param _dict: extraction result. Reads ``name``, ``prem`` (package name
        -> dict with ``tendereeMoney``, ``roleList``, ``code``) and
        ``product_attrs.data`` (list of product dicts). ``prem`` must be a
        dict; a missing ``prem`` raises ``AttributeError``.
    :param empty_char: text written for empty cells.
    :return: the PSV text, or ``None`` when any row contains a ``|``
        character or a table's pipe count is not a multiple of its header's.
    """
    project_name = _dict.get('name')
    # String forms that count as "no amount / no number".
    float_none_list = ['0', '0.0', 'None', empty_char, '', '未提及']

    prem = _dict.get('prem')
    tenderee = ""
    tenderee_contact_list = []
    agency = ""
    win_tenderer_info_list = []
    for package_name, package_dict in prem.items():
        tenderee_money = package_dict.get('tendereeMoney')
        role_list = package_dict.get('roleList')
        project_code = package_dict.get('code')

        # The budget unit is always normalised: '元' when an amount exists.
        if str(tenderee_money) in float_none_list:
            tenderee_money = empty_char
            tenderee_money_unit = empty_char
        else:
            tenderee_money_unit = '元'

        # 'Project' is the name used for the un-split, whole-project package.
        if package_name == 'Project':
            package_name = empty_char

        win_tenderer_info = None
        for role_dict in role_list:
            role_type = role_dict.get('role_name')
            role_text = role_dict.get('role_text')
            contact_list = role_dict.get('linklist', [])
            role_money = role_dict.get('role_money', {}).get('money')
            if str(role_money) in float_none_list:
                role_money = empty_char
                role_money_unit = empty_char
            else:
                role_money_unit = '元'

            if role_type == 'tenderee' and len(role_text) >= 2:
                tenderee = role_text
                tenderee_contact_list += contact_list
            if role_type == 'agency' and len(role_text) >= 2:
                agency = role_text
            # Only the first winning bidder of each package is recorded.
            if not win_tenderer_info and role_type == 'win_tenderer':
                win_tenderer_info = [package_name, project_code, role_text,
                                     role_money, role_money_unit,
                                     tenderee_money, tenderee_money_unit]
                win_tenderer_info_list.append(win_tenderer_info)

    product_cols = ['product', 'brand', 'specs', 'quantity',
                    'unitPrice', 'total_price', 'pinmu_name', 'pinmu_no']
    product_list = [
        [x.get(y, "") for y in product_cols]
        for x in _dict.get('product_attrs', {}).get('data', {})
    ]
    # Blank out quantity / unit price / total price when they carry no value.
    for product in product_list:
        for col_i in (3, 4, 5):
            if str(product[col_i]) in float_none_list:
                product[col_i] = empty_char

    table_list = [
        [['项目名称', '招标人名称', '代理人名称'],
         [[project_name, tenderee, agency]]],
        [['招标人联系人', '招标人联系人电话'],
         tenderee_contact_list],
        [['标段名称', '标段号', '中标人名称', '中标金额', '中标金额单位', '标段预算', '标段预算单位'],
         win_tenderer_info_list],
        [['产品名称', '品牌', '规格型号', '数量', '单价', '总价', '品目名称', '品目编号'],
         product_list],
    ]

    final_str = ''
    for table_cols, table_values in table_list:
        header_line = '|'.join(table_cols) + '\n'
        body = ''
        for row in table_values:
            # A literal pipe inside any cell would corrupt the format.
            if '|' in str(row):
                return None
            body += '|'.join(
                str(x) if str(x) != '' else empty_char for x in row
            ) + '\n'
        if not body:
            body = '|'.join(empty_char for _ in table_cols) + '\n'

        # Sanity check: total separators must be a multiple of the header's.
        if body.count('|') % header_line.count('|') != 0:
            print('--- str1', header_line)
            print('--- str2', body)
            return None

        final_str += header_line + body + '\n'

    final_str = final_str.replace('未提及', empty_char)

    # Blank out cells that consist solely of a "no value" word. Lookarounds
    # are used so the delimiters are not consumed; with consuming patterns
    # only the first of two adjacent placeholder cells would be replaced.
    for v in ('None', '无', '无品牌', '无型号'):
        cell = re.escape(v)
        pattern = rf'(?<=\|){cell}(?=[|\n])|(?<=\n){cell}(?=\|)'
        final_str = re.sub(pattern, lambda _m: empty_char, final_str)
    return final_str
def extract_json_to_psv_prefix(_dict, text, empty_char='-', prefix='[全字段]'):
    """Build a task-prefixed PSV answer from an extraction-result dict.

    Only values that literally occur in ``text`` are kept in the contact,
    winning-bidder and product tables, so the answer never contains data
    the document text does not show.

    :param _dict: extraction result. Reads ``name``, ``prem`` (package name
        -> dict with ``tendereeMoney``, ``roleList``, ``code``) and
        ``product_attrs.data``. ``prem`` must be a dict.
    :param text: document text the answer has to be grounded in.
    :param empty_char: text written for empty cells.
    :param prefix: task selector, one of ``'[全字段]'`` (all four tables),
        ``'[仅招标人]'`` (tenderee plus the shortest sentence mentioning it)
        or ``'[仅产品]'`` (product table only).
    :return: ``prefix`` followed by the PSV text produced by
        ``table_list_to_psv``; that helper's falsy result is returned
        unchanged. ``None`` when ``prefix`` is ``'[仅招标人]'`` and no
        tenderee was found, or when ``prefix`` is not recognised.
    """
    project_name = _dict.get('name')
    # String forms that count as "no amount / no number".
    float_none_list = ['0', '0.0', 'None', empty_char, '', '未提及']

    prem = _dict.get('prem')
    tenderee = ""
    tenderee_contact_list = []
    agency = ""
    win_tenderer_info_list = []
    for package_name, package_dict in prem.items():
        tenderee_money = package_dict.get('tendereeMoney')
        role_list = package_dict.get('roleList')
        project_code = package_dict.get('code')
        if str(tenderee_money) in float_none_list:
            tenderee_money = empty_char
            tenderee_money_unit = empty_char
        else:
            tenderee_money_unit = '元'

        # 'Project' is the name used for the un-split, whole-project package.
        if package_name == 'Project':
            package_name = empty_char

        win_tenderer_info = None
        for role_dict in role_list:
            role_type = role_dict.get('role_name')
            role_text = role_dict.get('role_text')
            contact_list = role_dict.get('linklist', [])
            role_money = role_dict.get('role_money', {}).get('money')
            if str(role_money) in float_none_list:
                role_money = empty_char
                role_money_unit = empty_char
            else:
                role_money_unit = '元'

            if role_type == 'tenderee' and len(role_text) >= 2:
                tenderee = role_text
                tenderee_contact_list += contact_list
            if role_type == 'agency' and len(role_text) >= 2:
                agency = role_text
            # Only the first winning bidder of each package is recorded.
            if not win_tenderer_info and role_type == 'win_tenderer':
                win_tenderer_info = [package_name, project_code, role_text,
                                     role_money, role_money_unit,
                                     tenderee_money, tenderee_money_unit]
                win_tenderer_info_list.append(win_tenderer_info)

    product_cols = ['product', 'brand', 'specs', 'quantity',
                    'unitPrice', 'total_price', 'pinmu_name', 'pinmu_no']
    product_list = [
        [x.get(y, "") for y in product_cols]
        for x in _dict.get('product_attrs', {}).get('data', {})
    ]
    # Blank out quantity / unit price / total price when they carry no value.
    for product in product_list:
        for col_i in (3, 4, 5):
            if str(product[col_i]) in float_none_list:
                product[col_i] = empty_char

    # Cell values that mean "nothing here"; includes the caller's filler so
    # a non-default empty_char is never mistaken for real content.
    placeholders = (None, '', '-', empty_char)

    def _mentioned(value):
        """True when value is a real (non-placeholder) string found in text."""
        return value not in placeholders and value in text

    def _finish(table_list):
        """Render the tables and prepend the task prefix."""
        final_str = table_list_to_psv(table_list, empty_char)
        # Pass a falsy renderer result through instead of concatenating.
        if not final_str:
            return final_str
        return prefix + final_str

    product_header = ['产品名称', '品牌', '规格型号', '数量', '单价', '总价', '品目名称', '品目编号']

    if prefix == '[全字段]':
        contact_rows = [
            v for v in tenderee_contact_list
            if _mentioned(v[0]) or _mentioned(v[1])
        ]
        # A package row is kept when its name, winner or code is in the text.
        winner_rows = [
            v for v in win_tenderer_info_list
            if _mentioned(v[0]) or _mentioned(v[2]) or _mentioned(v[1])
        ]
        product_rows = [v for v in product_list if _mentioned(v[0])]
        return _finish([
            [['项目名称', '招标人名称', '代理人名称'],
             [[project_name, tenderee, agency]]],
            [['招标人联系人', '招标人联系人电话'], contact_rows],
            [['标段名称', '标段号', '中标人名称', '中标金额', '中标金额单位', '标段预算', '标段预算单位'],
             winner_rows],
            [product_header, product_rows],
        ])

    if prefix == '[仅招标人]':
        if not tenderee:
            return None
        # Split into sentence-like pieces and keep the shortest one that
        # mentions the tenderee (first one wins on ties).
        sen_list = re.findall('[^,。;?!\n]+[,。;?!\n]?', text)
        tenderee_sen_list = [sen for sen in sen_list if tenderee in sen]
        if tenderee_sen_list:
            tenderee_line = min(tenderee_sen_list, key=len)
        else:
            tenderee_line = empty_char
        return _finish([
            [['招标人', '招标人表达'], [[tenderee, tenderee_line]]],
        ])

    if prefix == '[仅产品]':
        product_rows = [v for v in product_list if _mentioned(v[0])]
        return _finish([[product_header, product_rows]])

    # Unrecognised prefix: nothing to produce.
    return None
def entity_to_psv_prefix(text, entity, empty_char='-', prefix='[仅招标人]'):
    """Build a prefixed PSV answer for a single known tenderee entity.

    :param text: document text to search for a sentence mentioning ``entity``.
    :param entity: tenderee name; a falsy value yields ``None``.
    :param empty_char: text written for empty cells.
    :param prefix: ``'[仅招标人]'`` for the two-column tenderee table (name
        plus the shortest sentence containing it), or ``'[全字段]'`` for the
        four-table layout with only the tenderee name filled in.
    :return: the answer string, or ``None`` when ``entity`` is empty or
        ``prefix`` is not one of the two supported values.
    """
    if not entity:
        return None

    if prefix == '[仅招标人]':
        # Split into sentence-like pieces and pick the shortest one that
        # mentions the entity (first one wins on ties).
        sen_list = re.findall('[^,。;?!\n]+[,。;?!\n]?', text)
        candidates = [sen for sen in sen_list if entity in sen]
        tenderee_line = min(candidates, key=len) if candidates else empty_char
        return f'{prefix}招标人|招标人表达\n{entity}|{tenderee_line}'

    if prefix == '[全字段]':
        e = empty_char

        def blank_row(n):
            return '|'.join([e] * n)

        return (
            f'{prefix}项目名称|招标人名称|代理人名称\n{e}|{entity}|{e}'
            f'\n\n招标人联系人|招标人联系人电话\n{blank_row(2)}'
            f'\n\n标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位\n{blank_row(7)}'
            f'\n\n产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号\n{blank_row(8)}'
        )

    # Unsupported prefix: previously this fell through to an unbound local.
    return None
def psv_to_dict(_str):
    """Parse a four-table PSV answer back into a dict of row lists.

    The expected input is four blocks separated by blank lines, each a
    header line followed by data rows, for example::

        项目名称|招标人名称|代理人名称
        某项目|某学校|-

        招标人联系人|招标人联系人电话
        -|17690175536

        标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位
        1|-|某商行|3175.0|元|-|元

        产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号
        刮痧板|无品牌|刮痧板|15|17.0|255.0|-|-

    A leading ``[全字段]`` / ``[仅招标人]`` task prefix is removed first.

    :param _str: PSV text.
    :return: ``{}`` when any of the four headers is missing; otherwise a
        dict mapping ``招标信息``, ``招标人联系方式``, ``中标信息`` and
        ``产品信息`` to the result of ``psv_block_to_dict`` for the block in
        the same position.
    """
    # Drop the task-prefix marker.
    _str = re.sub(r'\[全字段\]|\[仅招标人\]', '', _str)

    table_head_list = [
        '项目名称|招标人名称|代理人名称',
        '招标人联系人|招标人联系人电话',
        '标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位',
        '产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号',
    ]
    if any(head not in _str for head in table_head_list):
        return {}

    # Blank lines separate the PSV blocks.
    blocks = [b.strip() for b in _str.split("\n\n") if b.strip()]

    # Result keys, paired with the blocks by position.
    keys = [
        "招标信息",
        "招标人联系方式",
        "中标信息",
        "产品信息",
    ]
    num_cols = ['单价', '数量', '总价']

    result = {}
    for key, block in zip(keys, blocks):
        dict_list = psv_block_to_dict(block)
        # Product numbers may arrive as a comma-joined repetition
        # ("17.0,17.0,17.0"); keep only the first occurrence.
        if key == '产品信息':
            for d in dict_list:
                for col in num_cols:
                    v = d.get(col)
                    if v and ',' in v:
                        d[col] = v.split(',')[0]
        result[key] = dict_list
    return result
def psv_block_to_dict(block):
    """Convert one PSV block (header line plus data rows) to a list of dicts.

    Blank lines are ignored. Every cell is stripped, and a cell that is
    exactly ``-`` after stripping becomes the empty string.

    :param block: text whose first non-blank line is the ``|``-separated
        header and whose remaining lines are data rows.
    :return: one ``{header: cell}`` dict per data row (extra cells or
        headers beyond the shorter of the two are dropped). When the block
        has fewer than two non-blank lines an empty dict ``{}`` is returned,
        kept as-is for existing callers.
    """
    lines = [line.strip() for line in block.strip().splitlines() if line.strip()]
    if len(lines) < 2:
        return {}

    headers = [h.strip() for h in lines[0].split("|")]
    rows = []
    for line in lines[1:]:
        # Strip before testing for the placeholder so " - " is also blanked.
        cells = [cell.strip() for cell in line.split("|")]
        rows.append(['' if cell == '-' else cell for cell in cells])
    return [dict(zip(headers, row)) for row in rows]
def psv_to_dict_prefix(_str):
    """Parse a tenderee-only PSV answer into ``{'招标信息': {'招标人名称': ...}}``.

    A ``[全字段]`` / ``[仅招标人]`` task prefix is removed first. Only lines
    containing ``|`` are considered; the first is taken as the header and
    the tenderee name is the first cell of the second.

    :param _str: PSV text expected to contain the ``招标人|招标人表达`` header.
    :return: the parsed dict, or ``{}`` when the header is missing or there
        is no data row after it.
    """
    # Drop the task-prefix marker.
    _str = re.sub(r'\[全字段\]|\[仅招标人\]', '', _str)

    if '招标人|招标人表达' not in _str:
        return {}

    pipe_lines = [line for line in _str.split('\n') if '|' in line]
    # Header present but no data row: nothing to extract.
    if len(pipe_lines) < 2:
        return {}

    tenderee = pipe_lines[1].split('|')[0]
    return {'招标信息': {'招标人名称': tenderee}}
def html2text_with_table_html(_html, limit=10000):
    """Flatten HTML to text while keeping tables as HTML markup.

    :param _html: an HTML string (parsed with BeautifulSoup's ``lxml``
        parser after literal ``<html>``/``<body>`` tags are removed) or an
        already parsed node whose ``contents`` are walked.
    :param limit: maximum length of the returned string; also applied to
        every nested element so a caller-supplied limit is honoured at all
        depths.
    :return: the concatenated text, truncated to ``limit`` characters.
        ``table``/``tbody`` children are emitted as their HTML on separate
        lines with every descendant tag reduced to its ``rowspan`` and
        ``colspan`` attributes; ``p``/``div``/``li`` children get a trailing
        newline; bare strings are stripped.
    """
    if isinstance(_html, str):
        _html = re.sub("<html>|<body>|</body>|</html>", "", _html)
        _soup = BeautifulSoup(_html, "lxml")
    else:
        _soup = _html

    result_parts = []
    for child in _soup.contents:
        if child.name:
            if child.name in ["table", "tbody"]:
                # Keep only the span attributes that define the cell layout.
                # NOTE(review): find_all() yields descendants only, so the
                # table/tbody tag itself keeps its attributes - confirm
                # that is intended.
                for c in child.find_all():
                    c.attrs = {
                        k: v for k, v in c.attrs.items()
                        if k in ["rowspan", "colspan"]
                    }
                result_parts.append("\n" + str(child) + "\n")
            else:
                # Recurse with the same limit instead of the default.
                text = html2text_with_table_html(child, limit)
                if child.name in {"p", "div", "li"}:
                    text += '\n'
                result_parts.append(text)
        elif child.string and child.string.strip():
            result_parts.append(child.string.strip())

    return "".join(result_parts)[:limit]
def xlsx_data_to_jsonl_2():
    """Build PSV-target JSONL splits from the HTML and extraction exports.

    ``export_ai_260323_2.csv`` supplies ``docid -> html`` (columns 0 and 1)
    and ``export_ai_260323_2_extract.xlsx`` supplies ``docid -> extraction
    JSON`` (columns 0 and 1). For every CSV row the HTML is flattened with
    ``html2text_with_table_html`` and the extraction JSON is rendered with
    ``extract_json_to_psv``; rows whose JSON is missing, unparsable or not
    renderable are skipped. The samples are shuffled and written 80/10/10 to
    ``data2/{train,dev,test}_data.jsonl`` (the ``data2`` directory must
    exist).
    """
    df1 = pd.read_csv(r'D:\BIDI_DOC\比地_文档\export_ai_260323_2.csv')
    df2 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260323_2_extract.xlsx')
    data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist()
    data_list2 = df2.astype(object).where(pd.notnull(df2), "").values.tolist()
    docid_html_dict = {int(x[0]): x[1] for x in data_list1}
    docid_json_dict = {int(x[0]): x[1] for x in data_list2}

    all_data = []
    empty_char = '-'
    # NOTE(review): the prompt text is runtime data; confirm its exact
    # whitespace against the prompt used at training/inference time.
    instruction = f"""
你是招投标要素抽取专家。
请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
项目名称|招标人名称|代理人名称
招标人联系人|招标人联系人电话
标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位
产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号
请抽取以上内容并严格按上述4段PSV输出:
"""

    for line in data_list1:
        docid = int(line[0])
        html = docid_html_dict.get(docid)
        extract_json = docid_json_dict.get(docid)
        text = html2text_with_table_html(html)

        # Best effort: any document that cannot be rendered is dropped.
        # ``Exception`` (not a bare except) keeps Ctrl-C working.
        try:
            answer = extract_json_to_psv(json.loads(extract_json), empty_char=empty_char)
        except Exception:
            continue
        if answer is None:
            continue

        train_data = {
            "instruction": instruction,
            "input": text,
            "output": answer,
        }
        all_data.append(json.dumps(train_data, ensure_ascii=False))

    # Shuffle, then split 80 / 10 / remainder.
    train_ratio = 0.8
    dev_ratio = 0.1
    random.shuffle(all_data)
    total = len(all_data)
    train_num = int(total * train_ratio)
    dev_num = int(total * dev_ratio)

    train_lines = all_data[:train_num]
    dev_lines = all_data[train_num:train_num + dev_num]
    test_lines = all_data[train_num + dev_num:]
    print('len(train_lines)', len(train_lines))
    print('len(test_lines)', len(test_lines))

    outputs = (
        ("data2/train_data.jsonl", train_lines),
        ("data2/dev_data.jsonl", dev_lines),
        ("data2/test_data.jsonl", test_lines),
    )
    for path, lines in outputs:
        with open(path, 'w', encoding='utf-8') as f:
            f.write("\n".join(lines))
def xlsx_data_to_jsonl_3():
    """Build the four-table PSV fine-tuning set and write it under ``data3/``.

    Both inputs are CSV files keyed by the integer value of their first
    column; the HTML is taken from the second column of the first file and the
    extraction JSON from the last column of the second file. With the ratios
    used here (1.0 / 0.0) every record lands in the train file and the dev and
    test files are written empty.
    """
    import os  # local import: only needed to create the output directory

    df1 = pd.read_csv(r'D:\BIDI_DOC\比地_文档\export_ai_260327_2.csv')
    df2 = pd.read_csv(r'C:\Users\Administrator\Downloads\document_tmp_has_ai_no_attachment_260327_limit.csv')
    data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist()
    data_list2 = df2.astype(object).where(pd.notnull(df2), "").values.tolist()
    docid_html_dict = {int(x[0]): x[1] for x in data_list1}
    docid_json_dict = {int(x[0]): x[-1] for x in data_list2}

    empty_char = '-'
    instruction = f"""
你是招投标要素抽取专家。
请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
项目名称|招标人名称|代理人名称
招标人联系人|招标人联系人电话
标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位
产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号
请抽取以上内容并严格按上述4段PSV输出:
"""

    all_data = []
    for line in data_list1:
        docid = int(line[0])
        extract_json = docid_json_dict.get(docid)
        if not extract_json:
            # No extraction result for this announcement.
            continue
        try:
            answer = extract_json_to_psv(json.loads(extract_json), empty_char=empty_char)
        except Exception as exc:
            # Best-effort: report and skip a broken label rather than hiding it.
            print('skip docid', docid, repr(exc))
            continue
        if answer is None:
            continue
        # Convert the HTML only once the label is known to be usable.
        text = html2text_with_table_html(docid_html_dict.get(docid))
        train_data = {
            "instruction": instruction,
            "input": text,
            "output": answer,
        }
        all_data.append(json.dumps(train_data, ensure_ascii=False))

    # Shuffle, then cut into train / dev / test.
    train_ratio = 1.
    dev_ratio = 0.
    random.shuffle(all_data)
    total = len(all_data)
    train_num = int(total * train_ratio)
    dev_num = int(total * dev_ratio)
    train_lines = all_data[:train_num]
    dev_lines = all_data[train_num:train_num + dev_num]
    test_lines = all_data[train_num + dev_num:]
    print('len(train_lines)', len(train_lines))
    print('len(test_lines)', len(test_lines))

    # Save one JSON object per line.
    outputs = [
        ("data3/train_data.jsonl", train_lines),
        ("data3/dev_data.jsonl", dev_lines),
        ("data3/test_data.jsonl", test_lines),
    ]
    for path, lines in outputs:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            f.write("\n".join(lines))
def xlsx_data_to_jsonl_4_prefix():
    """Build the tenderee-only PSV set from manual labels (``data4_prefix/``).

    The HTML workbook supplies the announcement text; the label workbook
    supplies, per docid, the tenderee expression (second column) and the
    tenderee name (third column). Each labelled document yields one record
    whose output is ``[仅招标人]`` followed by a one-row PSV table. Records are
    shuffled and split 90% / 10% / remainder.
    """
    import os  # local import: only needed to create the output directory

    df1 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\260403_ai_人工标注_招标人表达_3_再人工_html.xlsx')
    df2 = pd.read_excel(r'train_excel/260403_ai_人工标注_招标人表达_3_再人工.xlsx')
    data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist()
    data_list2 = df2.astype(object).where(pd.notnull(df2), "").values.tolist()
    docid_html_dict = {int(x[0]): x[1] for x in data_list1}
    docid_json_dict = {int(x[0]): [x[1], x[2]] for x in data_list2}

    empty_char = '-'
    instruction = f"""
你是招投标要素抽取专家。
请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
招标人|招标人表达
请抽取以上内容并严格按上述1段PSV输出:
"""

    all_data = []
    for line in data_list1:
        docid = int(line[0])
        labels = docid_json_dict.get(docid)
        if labels is None:
            # An announcement without a label row cannot produce a sample.
            print('no label for docid', docid)
            continue
        # Cells may come back from Excel as numbers; normalise to text.
        tenderee_line, tenderee = (str(v) for v in labels)
        if len(tenderee) <= 1:
            tenderee = empty_char
        if len(tenderee_line) <= 1:
            tenderee_line = empty_char
        if '|' in tenderee or '|' in tenderee_line:
            # A literal pipe would corrupt the PSV row; the shared PSV writer
            # rejects such rows as well.
            print('skip docid with "|" in label', docid)
            continue
        text = html2text_with_table_html(docid_html_dict.get(docid))
        answer = f'[仅招标人]招标人|招标人表达\n{tenderee}|{tenderee_line}'
        train_data = {
            "instruction": instruction,
            "input": text,
            "output": answer,
        }
        all_data.append(json.dumps(train_data, ensure_ascii=False))

    # Shuffle, then cut into train / dev / test.
    train_ratio = 0.9
    dev_ratio = 0.1
    random.shuffle(all_data)
    total = len(all_data)
    train_num = int(total * train_ratio)
    dev_num = int(total * dev_ratio)
    train_lines = all_data[:train_num]
    dev_lines = all_data[train_num:train_num + dev_num]
    test_lines = all_data[train_num + dev_num:]
    print('len(train_lines)', len(train_lines))
    print('len(dev_lines)', len(dev_lines))

    # Save one JSON object per line.
    outputs = [
        ("data4_prefix/train_data.jsonl", train_lines),
        ("data4_prefix/dev_data.jsonl", dev_lines),
        ("data4_prefix/test_data.jsonl", test_lines),
    ]
    for path, lines in outputs:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            f.write("\n".join(lines))
def xlsx_data_to_jsonl_5():
    """Build the four-table PSV set with manually labelled tenderees (``data4/``).

    Three workbooks keyed by docid are combined: announcement HTML, the
    extraction JSON (last column) and the manually labelled tenderee (last
    column). A tenderee role carrying the labelled name is added to every
    package in ``prem`` (or a single ``Project`` package is created when
    ``prem`` is empty) before the JSON is converted to PSV. Records are
    shuffled and split 90% / 10% / remainder.
    """
    import os  # local import: only needed to create the output directory

    df1 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\260403_ai_人工标注_招标人表达_3_再人工_html.xlsx')
    df2 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\260403_ai_人工标注_招标人表达_3_再人工_json.xlsx')
    df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\260403_ai_人工标注_招标人表达_3_再人工.xlsx')
    data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist()
    data_list2 = df2.astype(object).where(pd.notnull(df2), "").values.tolist()
    data_list3 = df3.astype(object).where(pd.notnull(df3), "").values.tolist()
    docid_html_dict = {int(x[0]): x[1] for x in data_list1}
    docid_json_dict = {int(x[0]): x[-1] for x in data_list2}
    docid_tenderee_dict = {int(x[0]): x[-1] for x in data_list3}

    empty_char = '-'
    instruction = f"""
你是招投标要素抽取专家。
请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
项目名称|招标人名称|代理人名称
招标人联系人|招标人联系人电话
标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位
产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号
请抽取以上内容并严格按上述4段PSV输出:
"""

    def tenderee_role(name):
        # A fresh dict per package so packages never share one role object.
        return {
            'address': "",
            'linklist': [],
            'role_money': {},
            'role_name': 'tenderee',
            'role_text': name,
        }

    all_data = []
    for line in data_list1:
        docid = int(line[0])
        extract_json = docid_json_dict.get(docid)
        labeled_tenderee = docid_tenderee_dict.get(docid)
        if labeled_tenderee is None:
            # No manual label for this announcement: skip rather than crash.
            print('no tenderee label for docid', docid)
            continue
        labeled_tenderee = str(labeled_tenderee)
        if len(labeled_tenderee) <= 1:
            labeled_tenderee = ''
        text = html2text_with_table_html(docid_html_dict.get(docid))
        try:
            extract_json = json.loads(extract_json)
            prem = extract_json.get('prem')
            if not prem:
                prem = {
                    'Project': {
                        'code': "",
                        'name': "",
                        'roleList': [tenderee_role(labeled_tenderee)],
                        'tendereeMoney': 0,
                        'tendereeMoneyUnit': "",
                    }
                }
            else:
                # Add the labelled tenderee to every package.
                # NOTE(review): an existing tenderee role is kept, not
                # replaced; confirm which one extract_json_to_psv reads.
                for package in prem.values():
                    roles = package.get('roleList') or []
                    roles.append(tenderee_role(labeled_tenderee))
                    package['roleList'] = roles
            extract_json['prem'] = prem
            answer = extract_json_to_psv(extract_json, empty_char=empty_char)
        except Exception:
            traceback.print_exc()
            continue
        if answer is None:
            print('answer is None')
            continue
        train_data = {
            "instruction": instruction,
            "input": text,
            "output": answer,
        }
        all_data.append(json.dumps(train_data, ensure_ascii=False))

    # Shuffle, then cut into train / dev / test.
    train_ratio = 0.9
    dev_ratio = 0.1
    random.shuffle(all_data)
    total = len(all_data)
    train_num = int(total * train_ratio)
    dev_num = int(total * dev_ratio)
    train_lines = all_data[:train_num]
    dev_lines = all_data[train_num:train_num + dev_num]
    test_lines = all_data[train_num + dev_num:]
    print('len(train_lines)', len(train_lines))
    print('len(dev_lines)', len(dev_lines))

    # Save one JSON object per line.
    # NOTE(review): the target directory is data4/, not data5/; kept as found.
    outputs = [
        ("data4/train_data.jsonl", train_lines),
        ("data4/dev_data.jsonl", dev_lines),
        ("data4/test_data.jsonl", test_lines),
    ]
    for path, lines in outputs:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            f.write("\n".join(lines))
def xlsx_data_to_jsonl_3_prefix():
    """Build the prefixed multi-task PSV set and write it under ``data7_prefix/``.

    For each announcement up to three records are produced from the same text:
    the full four-table answer (helper's default prefix), the tenderee-only
    answer (``[仅招标人]``) and the product-only answer (``[仅产品]``). As soon
    as one variant yields no answer, the remaining variants of that
    announcement are skipped. Records are shuffled and split
    90% / 10% / remainder.
    """
    import os  # local import: only needed to create the output directory

    df1 = pd.read_csv(r'D:\BIDI_DOC\比地_文档\export_ai_260327_2.csv')
    df2 = pd.read_csv(r'C:\Users\Administrator\Downloads\document_tmp_has_ai_no_attachment_260327_limit.csv')
    data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist()
    data_list2 = df2.astype(object).where(pd.notnull(df2), "").values.tolist()
    docid_html_dict = {int(x[0]): x[1] for x in data_list1}
    docid_json_dict = {int(x[0]): x[-1] for x in data_list2}

    empty_char = '-'
    instruction = f"""
你是招投标要素抽取专家。
请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
项目名称|招标人名称|代理人名称
招标人联系人|招标人联系人电话
标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位
产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号
请抽取以上内容并严格按上述4段PSV输出:
"""
    instruction2 = f"""
你是招投标要素抽取专家。
请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
招标人|招标人表达
请抽取以上内容并严格按上述1段PSV输出:
"""
    instruction3 = f"""
你是招投标要素抽取专家。
请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号
请抽取以上内容并严格按上述1段PSV输出:
"""
    # (prompt, extra keyword arguments for extract_json_to_psv_prefix)
    variants = [
        (instruction, {}),
        (instruction2, {'prefix': '[仅招标人]'}),
        (instruction3, {'prefix': '[仅产品]'}),
    ]

    all_data = []
    for line in data_list1:
        docid = int(line[0])
        extract_json = docid_json_dict.get(docid)
        if not extract_json:
            # No extraction result for this announcement.
            continue
        text = html2text_with_table_html(docid_html_dict.get(docid))
        for idx, (prompt, extra) in enumerate(variants):
            try:
                # Re-parse for every variant so each call gets its own object.
                answer = extract_json_to_psv_prefix(
                    json.loads(extract_json), text, empty_char=empty_char, **extra
                )
            except Exception as exc:
                # Best-effort: report and give up on this announcement.
                print('skip docid', docid, repr(exc))
                break
            print(f'answer{idx}', answer)
            if answer is None:
                break
            train_data = {
                "instruction": prompt,
                "input": text,
                "output": answer,
            }
            all_data.append(json.dumps(train_data, ensure_ascii=False))

    # Shuffle, then cut into train / dev / test.
    train_ratio = 0.9
    dev_ratio = 0.1
    random.shuffle(all_data)
    total = len(all_data)
    train_num = int(total * train_ratio)
    dev_num = int(total * dev_ratio)
    train_lines = all_data[:train_num]
    dev_lines = all_data[train_num:train_num + dev_num]
    test_lines = all_data[train_num + dev_num:]
    print('len(train_lines)', len(train_lines))
    print('len(dev_lines)', len(dev_lines))

    # Save one JSON object per line.
    outputs = [
        ("data7_prefix/train_data.jsonl", train_lines),
        ("data7_prefix/dev_data.jsonl", dev_lines),
        ("data7_prefix/test_data.jsonl", test_lines),
    ]
    for path, lines in outputs:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            f.write("\n".join(lines))
def entity_data_to_jsonl_prefix():
    """Build a tenderee-only PSV set from entity samples (``data5_prefix/``).

    Rows of ``df_train.xlsx`` are read positionally: column 1 is the entity
    text, column 4 its label, columns 5 and 8 the left and right context.
    Only rows labelled ``招标人`` are used, up to 2000 records. The input text
    is left + entity + right. Records are shuffled and split
    90% / 10% / remainder.
    """
    import os  # local import: only needed to create the output directory

    df1 = pd.read_excel(r'df_train.xlsx')
    data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist()

    empty_char = '-'
    instruction2 = f"""
你是招投标要素抽取专家。
请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
招标人|招标人表达
请抽取以上内容并严格按上述1段PSV输出:
"""

    all_data = []
    max_cnt = 2000
    for line in data_list1:
        center = line[1]
        label = line[4]
        left = line[5]
        right = line[8]
        if label != '招标人':
            continue
        if len(all_data) >= max_cnt:
            break
        text = left + center + right
        answer = entity_to_psv_prefix(text, center, empty_char=empty_char, prefix='[仅招标人]')
        if not answer:
            # Never write a record with a null or empty target.
            continue
        train_data = {
            "instruction": instruction2,
            "input": text,
            "output": answer,
        }
        all_data.append(json.dumps(train_data, ensure_ascii=False))

    # Shuffle, then cut into train / dev / test.
    train_ratio = 0.9
    dev_ratio = 0.1
    random.shuffle(all_data)
    total = len(all_data)
    train_num = int(total * train_ratio)
    dev_num = int(total * dev_ratio)
    train_lines = all_data[:train_num]
    dev_lines = all_data[train_num:train_num + dev_num]
    test_lines = all_data[train_num + dev_num:]
    print('len(train_lines)', len(train_lines))
    print('len(dev_lines)', len(dev_lines))

    # Save one JSON object per line.
    outputs = [
        ("data5_prefix/train_data.jsonl", train_lines),
        ("data5_prefix/dev_data.jsonl", dev_lines),
        ("data5_prefix/test_data.jsonl", test_lines),
    ]
    for path, lines in outputs:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            f.write("\n".join(lines))
def augment_jsonl_data():
    """Augment ``data6_prefix`` training records by relocating key sentences.

    Roughly half of the records (coin flip per record) are copied unchanged.
    For the others, the sentences mentioning the tenderee, the agency and the
    project name are removed from the input text and re-inserted at random
    positions; the expected output is left untouched. A record selected for
    augmentation that cannot be augmented (no tenderee, no or several tenderee
    sentences, too little remaining text, tenderee sentence containing ``<``,
    malformed output) is dropped from the result.
    """
    import os  # local import: only needed to create the output directory

    tags = ['[仅招标人]', '[全字段]']
    train_path = './data6_prefix/train_data.jsonl'
    output_path = './data6_prefix_aug/train_data.jsonl'

    with open(train_path, 'r', encoding='utf-8') as f:
        data_dict_list = [json.loads(line) for line in f if line.strip()]

    # A "sentence" is a run of non-delimiter characters plus its delimiter.
    sentence_pattern = re.compile('[^,。;?!\n]+[,。;?!\n]?')

    new_data_dict_list = []
    for data_dict in data_dict_list:
        # Keep about 50% of the records as they are.
        if random.choice([0, 1]):
            new_data_dict_list.append(data_dict)
            continue

        text = data_dict['input']
        output = str(data_dict['output'])
        now_tag = None
        for tag in tags:
            if tag in output:
                # Literal removal: as a regex the tag is a character class and
                # would delete each of its characters everywhere in the output.
                output = output.replace(tag, '')
                now_tag = tag
                break

        # The first data row sits right below the header line.
        rows = output.split('\n')
        if len(rows) < 2:
            continue
        fields = rows[1].split('|')
        if now_tag == '[仅招标人]':
            if len(fields) < 2:
                continue
            project_name, tenderee, agency = '', fields[0], ''
        else:
            # NOTE(review): outputs without a known tag (e.g. a product-only
            # table) are read as project|tenderee|agency too; confirm intent.
            if len(fields) < 3:
                continue
            project_name, tenderee, agency = fields[:3]
        if len(tenderee) <= 1:
            continue

        sen_list = sentence_pattern.findall(text)
        tenderee_sen_list = [sen for sen in sen_list if tenderee in sen]
        agency_sen_list = []
        if len(agency) > 1:
            agency_sen_list = [sen for sen in sen_list if agency in sen]
        project_name_sen_list = []
        if len(project_name) > 1:
            project_name_sen_list = [sen for sen in sen_list if project_name in sen]

        # A sentence that also names the project counts as a project sentence.
        for sen in project_name_sen_list:
            if sen in tenderee_sen_list:
                tenderee_sen_list.remove(sen)
        if len(tenderee_sen_list) == 0:
            continue
        if len(tenderee_sen_list) >= 2:
            print('tenderee_sen_list', tenderee_sen_list)
            continue

        # Pull the key sentences out of the running text.
        for sen in tenderee_sen_list + agency_sen_list + project_name_sen_list:
            if sen in sen_list:
                sen_list.remove(sen)
        print('len(sen_list)', len(sen_list))
        if len(sen_list) <= 1:
            print('len(sen_list) <= 1', sen_list)
            continue

        # Re-insert each key sentence at a random position (never at index 0).
        random_index = random.randint(1, len(sen_list) - 1)
        tenderee_sen = tenderee_sen_list[0]
        if '<' in tenderee_sen:
            continue
        print('tenderee_sen', tenderee_sen, tenderee)
        sen_list.insert(random_index, tenderee_sen)
        if agency_sen_list:
            random_index = random.randint(1, len(sen_list) - 1)
            agency_sen = agency_sen_list[0]
            print('agency_sen', agency_sen)
            sen_list.insert(random_index, agency_sen)
        if project_name_sen_list:
            random_index = random.randint(1, len(sen_list) - 1)
            project_name_sen = project_name_sen_list[0]
            print('project_name_sen', project_name_sen)
            sen_list.insert(random_index, project_name_sen)

        data_dict['input'] = ''.join(sen_list)
        new_data_dict_list.append(data_dict)

    print('len(new_data_dict_list)', len(new_data_dict_list))
    _str = '\n'.join([json.dumps(x, ensure_ascii=False) for x in new_data_dict_list])
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(_str)
    print('finish to', output_path)
def table_list_to_psv(table_list, empty_char, table_type=None):
    """Render a list of tables as pipe-separated text.

    Args:
        table_list: iterable of ``(table_cols, table_values)`` pairs, where
            ``table_cols`` is a list of header strings and ``table_values`` a
            list of rows (each row a list of cell values). An empty or missing
            ``table_values`` yields a single row of ``empty_char`` cells.
        empty_char: text written for empty cells and for cells holding only a
            placeholder value (``None``, ``无``, ``无品牌``, ``无型号``, ``0``,
            ``0.0``, ``未提及``). The substring ``未提及`` is also replaced
            inside longer cell texts.
        table_type: unused; kept for backward compatibility with callers.

    Returns:
        The tables joined as ``header\\nrows\\n\\n`` blocks, or ``None`` when a
        row contains a literal ``|`` or when the number of separators in the
        rows is not a multiple of the number in the header.
    """
    placeholders = {'None', '无', '无品牌', '无型号', '0', '0.0', '未提及'}

    def clean(value):
        # Normalising cell by cell (instead of regex passes over the joined
        # text) also catches adjacent placeholders such as "a|0|0|b" and
        # avoids treating "." in "0.0" as a wildcard.
        cell = str(value).replace('未提及', empty_char)
        if cell == '' or cell in placeholders:
            return empty_char
        return cell

    parts = []
    for table_cols, table_values in table_list:
        header = '|'.join(table_cols) + '\n'
        if table_values:
            body_rows = []
            for line in table_values:
                if '|' in str(line):
                    # A literal pipe inside a cell cannot be represented.
                    return None
                body_rows.append('|'.join(clean(x) for x in line) + '\n')
            body = ''.join(body_rows)
        else:
            body = '|'.join(empty_char for _ in table_cols) + '\n'

        # The rows must carry a whole multiple of the header's separators.
        # A single-column header has none, so the rows must have none either
        # (this also avoids a modulo by zero).
        header_pipes = header.count('|')
        body_pipes = body.count('|')
        if header_pipes:
            mismatch = body_pipes % header_pipes != 0
        else:
            mismatch = body_pipes != 0
        if mismatch:
            print('--- str1', header)
            print('--- str2', body)
            return None

        parts.append(header + body + '\n')
    return ''.join(parts)
def _saimofei_unique_rows(rows):
    """Drop duplicate rows while keeping first-seen order."""
    return [json.loads(key) for key in dict.fromkeys(json.dumps(row) for row in rows)]


def _saimofei_mentioned(value, text):
    """Return True when ``value`` is a real value that occurs in ``text``."""
    return value not in (None, '', '-') and str(value) in text


def saimofei_to_psv_prefix(data_list, text, prefix, empty_char='-'):
    """Turn the label rows of one announcement into a prefixed PSV answer.

    Args:
        data_list: label dicts of one announcement (one per product row), with
            the keys ``doctitle``, ``tenderee``, ``agency``,
            ``tenderee_contact``, ``win_tenderer``, ``project_code``,
            ``budget``, ``win_money``, ``product_name``, ``brand``, ``specs``,
            ``product_cnt``, ``unit_price`` and ``total_price``. Project-level
            fields are taken from the first dict.
        text: announcement text; rows whose key value does not occur in it are
            left out of the contact, winner and product tables.
        prefix: ``'[全字段]'`` (four tables), ``'[仅招标人]'`` (tenderee and
            the shortest sentence mentioning it) or ``'[仅产品]'`` (product
            table only).
        empty_char: placeholder for empty cells.

    Returns:
        ``prefix`` followed by the PSV text; ``None`` when there are no label
        rows, the prefix is unknown, the tenderee is empty in tenderee-only
        mode, or the PSV could not be built.
    """
    if not data_list:
        return None

    project_name = data_list[0].get('doctitle')
    tenderee = data_list[0].get('tenderee')
    agency = data_list[0].get('agency')

    tenderee_contact_list = []
    win_tenderer_info_list = []
    product_list = []
    for d in data_list:
        # Contact is "person/phone" or just a phone number.
        contact = d.get('tenderee_contact')
        contact = '' if contact is None else str(contact)
        person, sep, phone = contact.partition('/')
        if not sep:
            person, phone = '', contact
        tenderee_contact_list.append([person, phone])

        # Winner row: lot name, lot number, winner, amount, unit, budget, unit.
        win_tenderer = d.get('win_tenderer')
        project_code = d.get('project_code')
        budget = str(getUnifyMoney(d.get('budget')))
        win_money = str(getUnifyMoney(d.get('win_money')))
        win_tenderer_info_list.append([
            '-', project_code, win_tenderer, win_money,
            '元', budget, '元',
        ])

        # Product row; category name and code are not available.
        product_list.append([
            d.get('product_name'), d.get('brand'), d.get('specs'),
            d.get('product_cnt'), d.get('unit_price'), d.get('total_price'),
            '-', '-',
        ])

    # The same contact / winner / product may repeat across label rows.
    tenderee_contact_list = _saimofei_unique_rows(tenderee_contact_list)
    win_tenderer_info_list = _saimofei_unique_rows(win_tenderer_info_list)
    product_list = _saimofei_unique_rows(product_list)

    product_cols = ['产品名称', '品牌', '规格型号', '数量', '单价', '总价', '品目名称', '品目编号']
    # Only keep products whose name is present in the announcement text.
    product_values = [v for v in product_list if _saimofei_mentioned(v[0], text)]

    if prefix == '[全字段]':
        contact_values = [
            v for v in tenderee_contact_list
            if _saimofei_mentioned(v[0], text) or _saimofei_mentioned(v[1], text)
        ]
        winner_values = [
            v for v in win_tenderer_info_list
            if _saimofei_mentioned(v[0], text)
            or _saimofei_mentioned(v[2], text)
            or _saimofei_mentioned(v[1], text)
        ]
        table_list = [
            [['项目名称', '招标人名称', '代理人名称'], [[project_name, tenderee, agency]]],
            [['招标人联系人', '招标人联系人电话'], contact_values],
            [['标段名称', '标段号', '中标人名称', '中标金额', '中标金额单位', '标段预算', '标段预算单位'],
             winner_values],
            [product_cols, product_values],
        ]
    elif prefix == '[仅招标人]':
        if not tenderee:
            return None
        tenderee = str(tenderee)
        sen_list = re.findall('[^,。;?!\n]+[,。;?!\n]?', text)
        tenderee_sen_list = [sen for sen in sen_list if tenderee in sen]
        # The shortest mentioning sentence is used as the "expression".
        if tenderee_sen_list:
            tenderee_line = min(tenderee_sen_list, key=len)
        else:
            tenderee_line = empty_char
        table_list = [[['招标人', '招标人表达'], [[tenderee, tenderee_line]]]]
    elif prefix == '[仅产品]':
        table_list = [[product_cols, product_values]]
    else:
        return None

    final_str = table_list_to_psv(table_list, empty_char)
    if not final_str:
        return final_str
    return prefix + final_str
def saimofei_data_to_jsonl_data():
    """Build the prefixed multi-task PSV set from the sample workbook (``data6_prefix/``).

    The label workbook (header on its second row) holds one row per product;
    rows are grouped by announcement id (``公告ID``). For each announcement up
    to three records are produced from the same text: full fields, tenderee
    only and product only. As soon as one variant yields no answer the
    remaining variants are skipped. Records are shuffled and split
    90% / 10% / remainder.
    """
    import os  # local import: only needed to create the output directory

    df = pd.read_excel(r'C:\Users\Administrator\Downloads\赛默飞-样例数据.xlsx', header=1)
    df1 = pd.read_csv(r'D:\BIDI_DOC\比地_文档\export_ai_260417_saimofei_html.csv')
    head_list = list(df.columns)
    data_list = df.astype(object).where(pd.notnull(df), "").values.tolist()
    data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist()
    docid_html_dict = {int(x[0]): x[1] for x in data_list1}

    # Label field -> workbook column name; positions are resolved once.
    field_columns = {
        'doctitle': '公告名称',
        'budget': '预算金额',
        'win_money': '成交金额',
        'tenderee': '招标单位',
        'tenderee_contact': '招标单位联系人',
        'agency': '代理机构',
        'win_tenderer': '中标单位',
        'product_name': '产品名称',
        'brand': '品牌名称',
        'specs': '型号',
        'product_cnt': '数量',
        'unit_price': '单价(元)',
        'total_price': '总价(元)',
        'project_code': '项目编号',
    }
    docid_index = head_list.index('公告ID')
    field_index = {field: head_list.index(col) for field, col in field_columns.items()}

    docid_data_dict = {}
    for data in data_list:
        if data[docid_index] == '':
            # Blank rows carry no announcement id.
            continue
        # Normalise the id before lookup so rows of one announcement always
        # accumulate under the same key.
        docid = int(data[docid_index])
        new_data = {field: data[idx] for field, idx in field_index.items()}
        docid_data_dict.setdefault(docid, []).append(new_data)

    empty_char = '-'
    instruction = f"""
你是招投标要素抽取专家。
请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
项目名称|招标人名称|代理人名称
招标人联系人|招标人联系人电话
标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位
产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号
请抽取以上内容并严格按上述4段PSV输出:
"""
    instruction2 = f"""
你是招投标要素抽取专家。
请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
招标人|招标人表达
请抽取以上内容并严格按上述1段PSV输出:
"""
    instruction3 = f"""
你是招投标要素抽取专家。
请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号
请抽取以上内容并严格按上述1段PSV输出:
"""
    # (debug label, prompt, prefix)
    variants = [
        ('answer1', instruction, '[全字段]'),
        ('answer2', instruction2, '[仅招标人]'),
        ('answer3', instruction3, '[仅产品]'),
    ]

    all_data = []
    for docid, rows in docid_data_dict.items():
        html = docid_html_dict.get(docid)
        if html is None:
            # Labels without announcement HTML cannot produce a sample.
            print('no html for docid', docid)
            continue
        text = html2text_with_table_html(html)
        for label, prompt, prefix in variants:
            answer = saimofei_to_psv_prefix(rows, text, prefix=prefix, empty_char=empty_char)
            print(label, answer)
            if not answer:
                break
            train_data = {
                "instruction": prompt,
                "input": text,
                "output": answer,
            }
            all_data.append(json.dumps(train_data, ensure_ascii=False))

    # Shuffle, then cut into train / dev / test.
    train_ratio = 0.9
    dev_ratio = 0.1
    random.shuffle(all_data)
    total = len(all_data)
    train_num = int(total * train_ratio)
    dev_num = int(total * dev_ratio)
    train_lines = all_data[:train_num]
    dev_lines = all_data[train_num:train_num + dev_num]
    test_lines = all_data[train_num + dev_num:]
    print('len(train_lines)', len(train_lines))
    print('len(dev_lines)', len(dev_lines))

    # Save one JSON object per line.
    outputs = [
        ("data6_prefix/train_data.jsonl", train_lines),
        ("data6_prefix/dev_data.jsonl", dev_lines),
        ("data6_prefix/test_data.jsonl", test_lines),
    ]
    for path, lines in outputs:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            f.write("\n".join(lines))
if __name__ == "__main__":
    # Script entry point: exactly one dataset builder is enabled per run.
    # The other builders stay listed so they can be switched on by hand.
    # filter_data_docid()
    # xlsx_data_to_jsonl()
    # xlsx_data_to_jsonl_2()
    # xlsx_data_to_jsonl_3()
    # xlsx_data_to_jsonl_4_prefix()
    # xlsx_data_to_jsonl_5()
    xlsx_data_to_jsonl_3_prefix()
    # entity_data_to_jsonl_prefix()
    # saimofei_data_to_jsonl_data()
    # augment_jsonl_data()
|