get_data.py 68 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950
  1. import json
  2. import random
  3. import re
  4. import traceback
  5. from compare_utils import getUnifyMoney
  6. import pandas as pd
  7. from bs4 import BeautifulSoup
  8. random.seed(112)
  9. bid_cols_dict = {
  10. "project_name": "项目名称",
  11. "project_code": "项目编号",
  12. "docchannel": "公告类型",
  13. "area": "地域",
  14. "province": "省",
  15. "city": "市",
  16. "district": "区",
  17. "tenderee": "招标人",
  18. "tenderee_contact": "招标人联系人",
  19. "tenderee_phone": "招标人联系人电话",
  20. "agency": "代理人",
  21. "agency_contact": "代理人联系人",
  22. "agency_phone": "代理人联系人电话",
  23. "sub_docs_json": "多标段信息",
  24. "products": "产品信息",
  25. "service_time": "开工竣工时间",
  26. "time_bidstart": "投标开始时间",
  27. "time_bidclose": "截标时间",
  28. "time_bidopen": "开标时间",
  29. "time_get_file_end": "文件获取截止时间",
  30. "time_get_file_start": "文件获取开始时间",
  31. "time_release": "发布时间",
  32. "time_registration_end": '报名截止时间',
  33. "time_registration_start": "报名开始时间",
  34. "time_earnest_money_end": "保证金递交截止时间",
  35. "time_earnest_money_start": "保证金递交开始时间",
  36. }
  37. print('cols', bid_cols_dict.keys())
  38. channel_map_dict = {
  39. 51: "公告变更",
  40. 52: "招标公告",
  41. 101: "中标信息",
  42. 102: "招标预告",
  43. 103: "招标答疑",
  44. 104: "招标文件",
  45. 105: "资审结果",
  46. 106: "法律法规",
  47. 107: "新闻资讯",
  48. 108: "拟建项目",
  49. 109: "展会推广",
  50. 110: "企业名录",
  51. 111: "企业资质",
  52. 112: "全国工程人员",
  53. 113: "业主采购",
  54. 114: "采购意向",
  55. 115: "拍卖出让",
  56. 116: "土地矿产",
  57. 117: "产权交易",
  58. 118: "废标公告",
  59. 119: "候选人公示",
  60. 120: "合同公告",
  61. 121: "开标记录",
  62. 122: "验收合同",
  63. 301: "拟在建项目",
  64. 302: "审批项目",
  65. 303: "处罚公告",
  66. }
  67. sub_docs_json_map_dict = {
  68. "sub_project_name": "标包项目名称",
  69. "sub_project_code": "标包项目编号",
  70. "bidding_budget": "预算金额",
  71. "bidding_budget_unit": "预算金额单位",
  72. "win_tenderer": "中标人",
  73. "second_tenderer": "第二候选人",
  74. "third_tenderer": "第三候选人",
  75. "win_tenderer_manager": "中标人联系人",
  76. "second_tenderer_manager": "第二候选人联系人",
  77. "third_tenderer_manager": "第三候选人联系人",
  78. "win_tenderer_phone": "中标人联系人电话",
  79. "second_tenderer_phone": "第二候选人联系人电话",
  80. "third_tenderer_phone": "第三候选人联系人电话",
  81. "win_bid_price": "中标人投标金额",
  82. "second_bid_price": "第二候选人投标金额",
  83. "third_bid_price": "第三候选人投标金额",
  84. "win_bid_price_unit": "中标人投标金额单位",
  85. "second_bid_price_unit": "第二候选人投标金额单位",
  86. "third_bid_price_unit": "第三候选人投标金额单位",
  87. }
  88. products_map_dict = {
  89. 'brand': '品牌',
  90. 'product': '产品名称',
  91. 'quantity': '数量',
  92. 'quantity_unit': '数量单位',
  93. 'specs': '规格',
  94. 'unitPrice': '单价',
  95. "parameter": "参数",
  96. "total_price": "总价",
  97. "pinmu_no": "品目编号",
  98. "pinmu_name": "品目名称",
  99. }
  100. def filter_data_docid():
  101. df = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_260309.xlsx')
  102. # data_list = df.astype(object).where(pd.notnull(df), "").values.tolist()
  103. data_list = df[['docid', 'sub_docs_json']].astype(object).where(pd.notnull(df), "").values.tolist()
  104. print('data_list[0]', data_list[0])
  105. data_list.sort(key=lambda x: str(x[1]), reverse=True)
  106. data_list = data_list[:5000]
  107. for d in data_list[:20]:
  108. print('d', d)
  109. ss = json.dumps([x[0] for x in data_list])
  110. with open(r'D:\BIDI_DOC\比地_文档\export_260309.txt', 'w') as f:
  111. f.write(ss)
  112. print('finish')
  113. def xlsx_data_to_jsonl():
  114. df2 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_260309_2.xlsx')
  115. data_list2 = df2.astype(object).where(pd.notnull(df2), "").values.tolist()
  116. filter_docid_dict = {int(x[0]): x[1] for x in data_list2}
  117. df = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_260309.xlsx')
  118. data_list = df.astype(object).where(pd.notnull(df), "").values.tolist()
  119. all_data = []
  120. instruction = '提取以上招投标文档的关键信息,只输出有值的,其中中标人等放在多标段信息中形成数组,' \
  121. '产品参数等放在产品信息中形成数组' \
  122. '金额均以元为单位,时间格式为YYYY-MM-DD HH:MM:SS,无多余内容' \
  123. '直接得到要素提取Json:'
  124. for line in data_list:
  125. docid = int(line[0])
  126. if docid not in filter_docid_dict:
  127. continue
  128. doctextcon = filter_docid_dict.get(docid)
  129. try:
  130. channel = int(line[3])
  131. channel = channel_map_dict.get(channel)
  132. line[3] = channel
  133. except:
  134. print('channel error continue', line[3])
  135. continue
  136. # print('line[2]', line[2])
  137. # print('line[14]', line[14])
  138. # print('line[15]', line[15])
  139. # print('line[16]', line[16])
  140. if not line[14]:
  141. line[14] = []
  142. else:
  143. sub_docs_json = json.loads(line[14])
  144. for si, d1 in enumerate(sub_docs_json):
  145. d1 = {v: d1.get(k, "") for k, v in sub_docs_json_map_dict.items()}
  146. sub_docs_json[si] = d1
  147. for si, sub in enumerate(sub_docs_json):
  148. delete_k = []
  149. for k, v in sub.items():
  150. if not v:
  151. delete_k.append(k)
  152. for k in delete_k:
  153. if k in sub:
  154. sub.pop(k)
  155. sub_docs_json[si] = sub
  156. line[14] = sub_docs_json
  157. if not line[15] or len(line[15]) >= 500:
  158. line[15] = []
  159. else:
  160. products = json.loads(line[15])
  161. for si, d1 in enumerate(products):
  162. d1 = {v: d1.get(k, "") for k, v in products_map_dict.items()}
  163. products[si] = d1
  164. line[15] = products
  165. if not line[16]:
  166. line[16] = {}
  167. else:
  168. service_time = json.loads(line[16])
  169. line[16] = service_time
  170. d = {bid_cols_dict.get(x): line[i+1] for i, x in enumerate(bid_cols_dict.keys())}
  171. # 删掉空字段
  172. delete_k = []
  173. for k, v in d.items():
  174. if not v:
  175. delete_k.append(k)
  176. for k in delete_k:
  177. if k in d:
  178. d.pop(k)
  179. train_data = {
  180. "instruction": instruction,
  181. "input": doctextcon,
  182. "output": d
  183. }
  184. all_data.append(json.dumps(train_data, ensure_ascii=False))
  185. # 生成
  186. train_ratio = 0.8
  187. dev_ratio = 0.1
  188. random.shuffle(all_data)
  189. total = len(all_data)
  190. train_num = int(total * train_ratio)
  191. dev_num = int(total * dev_ratio)
  192. # 拆分
  193. train_lines = all_data[:train_num]
  194. dev_lines = all_data[train_num:train_num+dev_num]
  195. test_lines = all_data[train_num+dev_num:]
  196. # 保存
  197. train_path = "data/train_data.jsonl"
  198. dev_path = "data/dev_data.jsonl"
  199. test_path = "data/test_data.jsonl"
  200. with open(train_path, 'w', encoding='utf-8') as f:
  201. f.write("\n".join(train_lines))
  202. with open(dev_path, 'w', encoding='utf-8') as f:
  203. f.write("\n".join(dev_lines))
  204. with open(test_path, 'w', encoding='utf-8') as f:
  205. f.write("\n".join(test_lines))
  206. def extract_json_to_psv(_dict, empty_char=''):
  207. """
  208. json转竖线格式
  209. :return:
  210. """
  211. # empty_char = 'null'
  212. project_name = _dict.get('name')
  213. float_none_list = ['0', '0.0', 'None', empty_char, '', '未提及']
  214. district_dict = _dict.get('district')
  215. if not district_dict:
  216. district_dict = {}
  217. # 生成 PSV 表头(字段名)和数据行
  218. headers = "|".join(district_dict.keys())
  219. values = "|".join(str(value) for value in district_dict.values())
  220. # 拼接成完整 PSV 内容
  221. psv_content = f"{headers}\n{values}"
  222. prem = _dict.get('prem')
  223. tenderee = ""
  224. tenderee_contact_list = []
  225. agency = ""
  226. win_tenderer_info_list = []
  227. for package_name, package_dict in prem.items():
  228. bid_name = package_dict.get('name')
  229. tenderee_money = package_dict.get('tendereeMoney')
  230. tenderee_money_unit = package_dict.get('tendereeMoneyUnit')
  231. role_list = package_dict.get('roleList')
  232. project_code = package_dict.get('code')
  233. if str(tenderee_money) in float_none_list:
  234. tenderee_money = empty_char
  235. tenderee_money_unit = empty_char
  236. else:
  237. tenderee_money_unit = '元'
  238. if package_name == 'Project':
  239. package_name = empty_char
  240. win_tenderer_info = None
  241. for role_dict in role_list:
  242. role_type = role_dict.get('role_name')
  243. role_text = role_dict.get('role_text')
  244. contact_list = role_dict.get('linklist', [])
  245. role_money = role_dict.get('role_money', {}).get('money')
  246. role_money_unit = role_dict.get('role_money', {}).get('money_unit')
  247. if str(role_money) in float_none_list:
  248. role_money = empty_char
  249. role_money_unit = empty_char
  250. else:
  251. role_money_unit = '元'
  252. if role_type == 'tenderee' and len(role_text) >= 2:
  253. tenderee = role_text
  254. tenderee_contact_list += contact_list
  255. if role_type == 'agency' and len(role_text) >= 2:
  256. agency = role_text
  257. if not win_tenderer_info and role_type == 'win_tenderer':
  258. # if len(str(role_money)) > 0 and not role_money_unit:
  259. # role_money_unit = '元'
  260. # if len(str(tenderee_money)) > 0 and not tenderee_money_unit:
  261. # tenderee_money_unit = '元'
  262. win_tenderer_info = [package_name, project_code, role_text,
  263. role_money, role_money_unit,
  264. tenderee_money, tenderee_money_unit
  265. ]
  266. win_tenderer_info_list.append(win_tenderer_info)
  267. product_list = _dict.get('product_attrs', {}).get('data', {})
  268. product_cols = ['product', 'brand', 'specs', 'quantity',
  269. 'unitPrice', 'total_price', 'pinmu_name', 'pinmu_no'
  270. ]
  271. # print('product_list1', product_list)
  272. product_list = [[x.get(y, "") for y in product_cols] for x in product_list]
  273. # print('product_list2', product_list)
  274. for pi, product in enumerate(product_list):
  275. if str(product[3]) in float_none_list:
  276. product_list[pi][3] = empty_char
  277. if str(product[4]) in float_none_list:
  278. product_list[pi][4] = empty_char
  279. if str(product[5]) in float_none_list:
  280. product_list[pi][5] = empty_char
  281. table_list = []
  282. # table 1
  283. table_cols = ['项目名称', '招标人名称', '代理人名称']
  284. table_values = [[project_name, tenderee, agency]]
  285. table_list.append([table_cols, table_values])
  286. # table 2
  287. table_cols = ['招标人联系人', '招标人联系人电话']
  288. # print('tenderee_contact_list', tenderee_contact_list)
  289. table_values = tenderee_contact_list if tenderee_contact_list else []
  290. table_list.append([table_cols, table_values])
  291. # table 3
  292. table_cols = ['标段名称', '标段号', '中标人名称', '中标金额', '中标金额单位', '标段预算', '标段预算单位']
  293. table_values = win_tenderer_info_list if win_tenderer_info_list else []
  294. table_list.append([table_cols, table_values])
  295. # table 4
  296. table_cols = ['产品名称', '品牌', '规格型号', '数量', '单价', '总价', '品目名称', '品目编号']
  297. table_values = product_list if product_list else []
  298. table_list.append([table_cols, table_values])
  299. final_str = ''
  300. show_flag = 0
  301. for table_cols, table_values in table_list:
  302. str1 = '|'.join(table_cols) + '\n'
  303. continue_flag = 0
  304. if table_values:
  305. str2 = ''
  306. if len(table_values) >= 2:
  307. # print('table_values', table_values)
  308. show_flag = 1
  309. for line in table_values:
  310. if '|' in str(line):
  311. continue_flag = 1
  312. break
  313. str2 += '|'.join([str(x) if str(x) != '' else empty_char for x in line]) + '\n'
  314. if not str2:
  315. str2 = '|'.join([empty_char for x in table_cols]) + '\n'
  316. else:
  317. str2 = '|'.join([empty_char for x in table_cols]) + '\n'
  318. if continue_flag:
  319. return None
  320. # 判断表头和内容竖线是否相同
  321. # for ss2 in str2.split('\n'):
  322. # if len(ss2) == 0:
  323. # continue
  324. # if len(re.findall("\|", str1)) != len(re.findall("\|", ss2)):
  325. # print('--- str1', str1)
  326. # print('--- str2', ss2)
  327. # return None
  328. if len(re.findall("\|", str2)) % len(re.findall("\|", str1)) != 0:
  329. print('--- str1', str1)
  330. print('--- str2', str2)
  331. return None
  332. final_str += str1
  333. final_str += str2
  334. final_str += '\n'
  335. # if f'产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号\n{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}' in final_str and f'标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位\n{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}' in final_str:
  336. # return None
  337. final_str = re.sub('未提及', f'{empty_char}', final_str)
  338. delete_value_list = ['None', '无', '无品牌', '无型号']
  339. for v in delete_value_list:
  340. final_str = re.sub(f'\|{v}\|', f'|{empty_char}|', final_str)
  341. final_str = re.sub(f'\|{v}\n', f'|{empty_char}\n', final_str)
  342. final_str = re.sub(f'\n{v}\|', f'\n{empty_char}|', final_str)
  343. # if show_flag:
  344. # print('final_str', final_str)
  345. # final_str = table_list_to_psv(table_list, empty_char)
  346. return final_str
  347. def extract_json_to_psv_prefix(_dict, text, empty_char='-', prefix='[全字段]'):
  348. """
  349. json转竖线格式
  350. :return:
  351. """
  352. # empty_char = 'null'
  353. project_name = _dict.get('name')
  354. float_none_list = ['0', '0.0', 'None', empty_char, '', '未提及']
  355. district_dict = _dict.get('district')
  356. if not district_dict:
  357. district_dict = {}
  358. # 生成 PSV 表头(字段名)和数据行
  359. headers = "|".join(district_dict.keys())
  360. values = "|".join(str(value) for value in district_dict.values())
  361. # 拼接成完整 PSV 内容
  362. psv_content = f"{headers}\n{values}"
  363. prem = _dict.get('prem')
  364. tenderee = ""
  365. tenderee_contact_list = []
  366. agency = ""
  367. win_tenderer_info_list = []
  368. for package_name, package_dict in prem.items():
  369. bid_name = package_dict.get('name')
  370. tenderee_money = package_dict.get('tendereeMoney')
  371. tenderee_money_unit = package_dict.get('tendereeMoneyUnit')
  372. role_list = package_dict.get('roleList')
  373. project_code = package_dict.get('code')
  374. if str(tenderee_money) in float_none_list:
  375. tenderee_money = empty_char
  376. tenderee_money_unit = empty_char
  377. else:
  378. tenderee_money_unit = '元'
  379. if package_name == 'Project':
  380. package_name = empty_char
  381. win_tenderer_info = None
  382. for role_dict in role_list:
  383. role_type = role_dict.get('role_name')
  384. role_text = role_dict.get('role_text')
  385. contact_list = role_dict.get('linklist', [])
  386. role_money = role_dict.get('role_money', {}).get('money')
  387. role_money_unit = role_dict.get('role_money', {}).get('money_unit')
  388. if str(role_money) in float_none_list:
  389. role_money = empty_char
  390. role_money_unit = empty_char
  391. else:
  392. role_money_unit = '元'
  393. if role_type == 'tenderee' and len(role_text) >= 2:
  394. tenderee = role_text
  395. tenderee_contact_list += contact_list
  396. if role_type == 'agency' and len(role_text) >= 2:
  397. agency = role_text
  398. if not win_tenderer_info and role_type == 'win_tenderer':
  399. # if len(str(role_money)) > 0 and not role_money_unit:
  400. # role_money_unit = '元'
  401. # if len(str(tenderee_money)) > 0 and not tenderee_money_unit:
  402. # tenderee_money_unit = '元'
  403. win_tenderer_info = [package_name, project_code, role_text,
  404. role_money, role_money_unit,
  405. tenderee_money, tenderee_money_unit
  406. ]
  407. win_tenderer_info_list.append(win_tenderer_info)
  408. product_list = _dict.get('product_attrs', {}).get('data', {})
  409. product_cols = ['product', 'brand', 'specs', 'quantity',
  410. 'unitPrice', 'total_price', 'pinmu_name', 'pinmu_no'
  411. ]
  412. # print('product_list1', product_list)
  413. product_list = [[x.get(y, "") for y in product_cols] for x in product_list]
  414. # print('product_list2', product_list)
  415. for pi, product in enumerate(product_list):
  416. if str(product[3]) in float_none_list:
  417. product_list[pi][3] = empty_char
  418. if str(product[4]) in float_none_list:
  419. product_list[pi][4] = empty_char
  420. if str(product[5]) in float_none_list:
  421. product_list[pi][5] = empty_char
  422. if prefix == '[全字段]':
  423. table_list = []
  424. # table 1
  425. table_cols = ['项目名称', '招标人名称', '代理人名称']
  426. table_values = [[project_name, tenderee, agency]]
  427. table_list.append([table_cols, table_values])
  428. # table 2
  429. table_cols = ['招标人联系人', '招标人联系人电话']
  430. # print('tenderee_contact_list', tenderee_contact_list)
  431. table_values = tenderee_contact_list if tenderee_contact_list else []
  432. temp_list = []
  433. for v in table_values:
  434. if (v[0] not in [None, '', '-'] and v[0] in text) \
  435. or (v[1] not in [None, '', '-'] and v[1] in text):
  436. temp_list.append(v)
  437. table_values = temp_list
  438. table_list.append([table_cols, table_values])
  439. # table 3
  440. table_cols = ['标段名称', '标段号', '中标人名称', '中标金额', '中标金额单位', '标段预算', '标段预算单位']
  441. table_values = win_tenderer_info_list if win_tenderer_info_list else []
  442. temp_list = []
  443. for v in table_values:
  444. if (v[0] not in [None, '', '-'] and v[0] in text) \
  445. or (v[2] not in [None, '', '-'] and v[2] in text) \
  446. or (v[1] not in [None, '', '-'] and v[1] in text):
  447. temp_list.append(v)
  448. table_values = temp_list
  449. table_list.append([table_cols, table_values])
  450. # table 4
  451. table_cols = ['产品名称', '品牌', '规格型号', '数量', '单价', '总价', '品目名称', '品目编号']
  452. table_values = product_list if product_list else []
  453. temp_list = []
  454. for v in table_values:
  455. if v[0] not in [None, '', '-'] and v[0] in text:
  456. temp_list.append(v)
  457. table_values = temp_list
  458. # # 产品中数值类型 重复3次
  459. # for v in table_values:
  460. # for col_i in [3, 4, 5]:
  461. # try:
  462. # col_v = float(v[col_i])
  463. # if col_v > 0:
  464. # v[col_i] = ','.join([v[col_i], v[col_i], v[col_i]])
  465. # except:
  466. # pass
  467. table_list.append([table_cols, table_values])
  468. # final_str = ''
  469. # show_flag = 0
  470. # for table_cols, table_values in table_list:
  471. # str1 = '|'.join(table_cols) + '\n'
  472. #
  473. # continue_flag = 0
  474. #
  475. # if table_values:
  476. # str2 = ''
  477. # if len(table_values) >= 2:
  478. # # print('table_values', table_values)
  479. # show_flag = 1
  480. # for line in table_values:
  481. # if '|' in str(line):
  482. # continue_flag = 1
  483. # break
  484. # str2 += '|'.join([str(x) if str(x) != '' else empty_char for x in line]) + '\n'
  485. # if not str2:
  486. # str2 = '|'.join([empty_char for x in table_cols]) + '\n'
  487. # else:
  488. # str2 = '|'.join([empty_char for x in table_cols]) + '\n'
  489. #
  490. # if continue_flag:
  491. # return None
  492. #
  493. # # 判断表头和内容竖线是否相同
  494. # # for ss2 in str2.split('\n'):
  495. # # if len(ss2) == 0:
  496. # # continue
  497. # # if len(re.findall("\|", str1)) != len(re.findall("\|", ss2)):
  498. # # print('--- str1', str1)
  499. # # print('--- str2', ss2)
  500. # # return None
  501. #
  502. # if len(re.findall("\|", str2)) % len(re.findall("\|", str1)) != 0:
  503. # print('--- str1', str1)
  504. # print('--- str2', str2)
  505. # return None
  506. #
  507. # final_str += str1
  508. # final_str += str2
  509. # final_str += '\n'
  510. #
  511. # # if f'产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号\n{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}' in final_str and f'标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位\n{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}|{empty_char}' in final_str:
  512. # # return None
  513. #
  514. # final_str = re.sub('未提及', f'{empty_char}', final_str)
  515. #
  516. # delete_value_list = ['None', '无', '无品牌', '无型号']
  517. # for v in delete_value_list:
  518. # final_str = re.sub(f'\|{v}\|', f'|{empty_char}|', final_str)
  519. # final_str = re.sub(f'\|{v}\n', f'|{empty_char}\n', final_str)
  520. # final_str = re.sub(f'\n{v}\|', f'\n{empty_char}|', final_str)
  521. #
  522. # # if show_flag:
  523. # # print('final_str', final_str)
  524. final_str = table_list_to_psv(table_list, empty_char)
  525. # final_str = '[全字段]' + final_str
  526. final_str = prefix + final_str
  527. return final_str
  528. elif prefix == '[仅招标人]':
  529. if not tenderee:
  530. return None
  531. sen_list = re.findall('[^,。;?!\n]+[,。;?!\n]?', text)
  532. tenderee_sen_list = []
  533. for sen in sen_list:
  534. match = re.search(re.escape(tenderee), sen)
  535. if match:
  536. tenderee_sen_list.append(sen)
  537. if tenderee_sen_list:
  538. tenderee_sen_list.sort(key=lambda x: len(x))
  539. tenderee_line = tenderee_sen_list[0]
  540. else:
  541. tenderee_line = empty_char
  542. table_list = []
  543. table_cols = ['招标人', '招标人表达']
  544. table_values = [[tenderee, tenderee_line]]
  545. table_list.append([table_cols, table_values])
  546. final_str = table_list_to_psv(table_list, empty_char)
  547. if not final_str:
  548. return final_str
  549. final_str = prefix + final_str
  550. return final_str
  551. # answer = f'[仅招标人]招标人|招标人表达\n{tenderee}|{tenderee_line}'
  552. # return answer
  553. elif prefix == '[仅产品]':
  554. table_cols = ['产品名称', '品牌', '规格型号', '数量', '单价', '总价', '品目名称', '品目编号']
  555. table_values = product_list if product_list else []
  556. temp_list = []
  557. for v in table_values:
  558. if v[0] not in [None, '', '-'] and v[0] in text:
  559. temp_list.append(v)
  560. table_values = temp_list
  561. # # 产品中数值类型 重复3次
  562. # for v in table_values:
  563. # for col_i in [3, 4, 5]:
  564. # try:
  565. # col_v = float(v[col_i])
  566. # if col_v > 0:
  567. # v[col_i] = ','.join([v[col_i], v[col_i], v[col_i]])
  568. # except:
  569. # pass
  570. table_list = []
  571. table_list.append([table_cols, table_values])
  572. final_str = table_list_to_psv(table_list, empty_char)
  573. if not final_str:
  574. return final_str
  575. final_str = prefix + final_str
  576. return final_str
  577. def entity_to_psv_prefix(text, entity, empty_char='-', prefix='[仅招标人]'):
  578. if not entity:
  579. return None
  580. sen_list = re.findall('[^,。;?!\n]+[,。;?!\n]?', text)
  581. tenderee_sen_list = []
  582. for sen in sen_list:
  583. match = re.search(re.escape(entity), sen)
  584. if match:
  585. tenderee_sen_list.append(sen)
  586. if tenderee_sen_list:
  587. tenderee_sen_list.sort(key=lambda x: len(x))
  588. tenderee_line = tenderee_sen_list[0]
  589. else:
  590. tenderee_line = empty_char
  591. if prefix == '[仅招标人]':
  592. answer = f'{prefix}招标人|招标人表达\n{entity}|{tenderee_line}'
  593. elif prefix == '[全字段]':
  594. answer = f'{prefix}项目名称|招标人名称|代理人名称\n-|{entity}|-' \
  595. f'\n\n招标人联系人|招标人联系人电话\n-|-' \
  596. f'\n\n标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位\n-|-|-|-|-|-|-' \
  597. f'\n\n产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号\n-|-|-|-|-|-|-|-'
  598. return answer
  599. def psv_to_dict(_str):
  600. # _str = '''
  601. # 项目名称|招标人名称|代理人名称
  602. # 英吉沙县技工学校关于身体按摩的网上超市采购项目|英吉沙县技工学校|-
  603. #
  604. # 招标人联系人|招标人联系人电话
  605. # -|17690175536
  606. #
  607. # 标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位
  608. # 1|-|喀什市兆佳文体用品商行|3175.0|元|-|元
  609. #
  610. # 产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号
  611. # 刮痧板 刮痧版 身体按摩|无品牌|刮痧板|15|17.0|255.0|-|-
  612. # 口红 彩妆|无品牌|口红|15|17.0|255.0|-|-
  613. # 卸妆棉 彩妆|无品牌|卸妆棉|15|17.0|255.0|-|-
  614. # 卸妆水 彩妆|无品牌|卸妆水|15|33.0|495.0|-|-
  615. # 定妆粉 彩妆|无品牌|定妆粉|15|18.0|270.0|-|-
  616. # BB霜 隔离霜|无品牌|BB霜|15|35.0|525.0|-|-
  617. # 01眼影 眼霜|无品牌|01|15|35.0|525.0|-|-
  618. # 洁丽雅洗面奶 洁面用品|洁丽雅/grace|洗面奶|15|33.0|495.0|-|-
  619. # 00115454凯伦特/CARENT棉签 棉签/棉棒/棉包|凯伦特/CARENT|00115454|8|5.0|40.0|-|-
  620. # 祝源梳子 梳子/化妆梳/按摩梳|祝源|梳子|15|4.0|60.0|-|-
  621. # '''
  622. # 去掉前缀指示
  623. _str = re.sub('\[全字段\]|\[仅招标人\]', '', _str)
  624. table_head_list = [
  625. '项目名称|招标人名称|代理人名称',
  626. '招标人联系人|招标人联系人电话',
  627. '标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位',
  628. '产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号',
  629. ]
  630. has_all_head_flag = 1
  631. for head in table_head_list:
  632. if head not in _str:
  633. has_all_head_flag = 0
  634. break
  635. if not has_all_head_flag:
  636. return {}
  637. # 按空行分割成4个PSV块
  638. blocks = [b.strip() for b in _str.split("\n\n") if b.strip()]
  639. # 定义字段映射(和你的4段格式严格对应)
  640. keys = [
  641. "招标信息",
  642. "招标人联系方式",
  643. "中标信息",
  644. "产品信息",
  645. ]
  646. # 批量解析
  647. result = {}
  648. for key, block in zip(keys, blocks):
  649. dict_list = psv_block_to_dict(block)
  650. # 产品重复3次只保留第一个
  651. num_cols = ['单价', '数量', '总价']
  652. if key in ['产品信息']:
  653. for d in dict_list:
  654. for col in num_cols:
  655. v = d.get(col)
  656. if v and ',' in v:
  657. d[col] = v.split(',')[0]
  658. result[key] = dict_list
  659. # print('result', result)
  660. return result
  661. def psv_block_to_dict(block):
  662. """
  663. 把单个 PSV 块(表头+数据行)转换成字典
  664. 支持:1行数据 / 多行数据(自动转列表)
  665. """
  666. lines = [line.strip() for line in block.strip().splitlines() if line.strip()]
  667. if len(lines) < 2:
  668. return {}
  669. # 解析表头和数据行
  670. headers = [h.strip() for h in lines[0].split("|")]
  671. data_rows = [[d.strip() if d != '-' else '' for d in line.split("|")] for line in lines[1:]]
  672. # 多行 → 列表套字典,单行 → 单层字典
  673. # if len(data_rows) == 1:
  674. # return dict(zip(headers, data_rows[0]))
  675. return [dict(zip(headers, row)) for row in data_rows]
  676. def psv_to_dict_prefix(_str):
  677. # 去掉前缀指示
  678. _str = re.sub('\[全字段\]|\[仅招标人\]', '', _str)
  679. table_head_list = [
  680. '招标人|招标人表达',
  681. ]
  682. has_all_head_flag = 1
  683. for head in table_head_list:
  684. if head not in _str:
  685. has_all_head_flag = 0
  686. break
  687. if not has_all_head_flag:
  688. return {}
  689. line_list = _str.split('\n')
  690. temp_list = []
  691. for line in line_list:
  692. if '|' not in line:
  693. continue
  694. temp_list.append(line)
  695. line_list = temp_list
  696. # line_list = line_list[1:]
  697. # print('line_list[1]', line_list[1])
  698. tenderee, tenderee_sentence = line_list[1].split('|')[:2]
  699. result = {'招标信息': {'招标人名称': tenderee}}
  700. # print('result', result)
  701. return result
  702. def html2text_with_table_html(_html, limit=10000):
  703. # 如果输入是字符串,使用 BeautifulSoup 解析
  704. if isinstance(_html, str):
  705. _html = re.sub("<html>|<body>|</body>|</html>","",_html)
  706. _soup = BeautifulSoup(_html, "lxml")
  707. else:
  708. _soup = _html
  709. # 用于存储处理后的文本
  710. result_parts = []
  711. _find = False
  712. # 遍历所有直接子元素
  713. for child in _soup.contents:
  714. # print('child.name', child.name)
  715. if child.name:
  716. if child.name in ["table", "tbody"]:
  717. #仅仅保存rowspan和colspan属性的标签
  718. for c in child.find_all():
  719. new_attrs = {}
  720. for k,v in c.attrs.items():
  721. if k in ["rowspan","colspan"]:
  722. new_attrs[k] = v
  723. c.attrs = new_attrs
  724. # 如果是表格或表格主体,保留 HTML 代码
  725. result_parts.append("\n"+str(child)+"\n")
  726. else:
  727. # 递归处理其他元素并转换为文本
  728. text = html2text_with_table_html(child)
  729. if child.name in {"p","div","li"}:
  730. text += '\n'
  731. result_parts.append(text)
  732. elif child.string and child.string.strip():
  733. _text = child.string.strip()
  734. result_parts.append(_text)
  735. _find = True
  736. # print('result_parts', result_parts)
  737. # if not _find:
  738. # print('not find')
  739. # _text = str(_soup.get_text())
  740. # print('_text', _text)
  741. # if len(_text)>0:
  742. # print('_soup.name', _soup.name)
  743. # if _soup.name in {"p","div","li"}:
  744. # print('yes')
  745. # _text += "\n"
  746. # result_parts.append(_text)
  747. # 将所有处理后的部分连接成一个字符串
  748. result = "".join(result_parts)
  749. result = result[:limit]
  750. return result
  751. def xlsx_data_to_jsonl_2():
  752. df1 = pd.read_csv(r'D:\BIDI_DOC\比地_文档\export_ai_260323_2.csv')
  753. df2 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260323_2_extract.xlsx')
  754. data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist()
  755. data_list2 = df2.astype(object).where(pd.notnull(df2), "").values.tolist()
  756. docid_html_dict = {int(x[0]): x[1] for x in data_list1}
  757. docid_json_dict = {int(x[0]): x[1] for x in data_list2}
  758. all_data = []
  759. instruction = '根据上述招投标行业公告,进行要素提取,输出psv格式:\n'
  760. empty_char = '-'
  761. instruction = f"""
  762. 你是招投标要素抽取专家。
  763. 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
  764. 项目名称|招标人名称|代理人名称
  765. 招标人联系人|招标人联系人电话
  766. 标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位
  767. 产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号
  768. 请抽取以上内容并严格按上述4段PSV输出:
  769. """
  770. for line in data_list1:
  771. docid = int(line[0])
  772. html = docid_html_dict.get(docid)
  773. extract_json = docid_json_dict.get(docid)
  774. text = html2text_with_table_html(html)
  775. try:
  776. answer = extract_json_to_psv(json.loads(extract_json), empty_char=empty_char)
  777. if answer is None:
  778. continue
  779. except:
  780. continue
  781. train_data = {
  782. "instruction": instruction,
  783. "input": text,
  784. "output": answer,
  785. }
  786. all_data.append(json.dumps(train_data, ensure_ascii=False))
  787. # 生成
  788. train_ratio = 0.8
  789. dev_ratio = 0.1
  790. random.shuffle(all_data)
  791. total = len(all_data)
  792. train_num = int(total * train_ratio)
  793. dev_num = int(total * dev_ratio)
  794. # 拆分
  795. train_lines = all_data[:train_num]
  796. dev_lines = all_data[train_num:train_num+dev_num]
  797. test_lines = all_data[train_num+dev_num:]
  798. print('len(train_lines)', len(train_lines))
  799. print('len(test_lines)', len(test_lines))
  800. # 保存
  801. train_path = "data2/train_data.jsonl"
  802. dev_path = "data2/dev_data.jsonl"
  803. test_path = "data2/test_data.jsonl"
  804. with open(train_path, 'w', encoding='utf-8') as f:
  805. f.write("\n".join(train_lines))
  806. with open(dev_path, 'w', encoding='utf-8') as f:
  807. f.write("\n".join(dev_lines))
  808. with open(test_path, 'w', encoding='utf-8') as f:
  809. f.write("\n".join(test_lines))
  810. def xlsx_data_to_jsonl_3():
  811. df1 = pd.read_csv(r'D:\BIDI_DOC\比地_文档\export_ai_260327_2.csv')
  812. df2 = pd.read_csv(r'C:\Users\Administrator\Downloads\document_tmp_has_ai_no_attachment_260327_limit.csv')
  813. data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist()
  814. data_list2 = df2.astype(object).where(pd.notnull(df2), "").values.tolist()
  815. docid_html_dict = {int(x[0]): x[1] for x in data_list1}
  816. docid_json_dict = {int(x[0]): x[-1] for x in data_list2}
  817. all_data = []
  818. # instruction = '根据上述招投标行业公告,进行要素提取,输出psv格式:\n'
  819. empty_char = '-'
  820. instruction = f"""
  821. 你是招投标要素抽取专家。
  822. 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
  823. 项目名称|招标人名称|代理人名称
  824. 招标人联系人|招标人联系人电话
  825. 标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位
  826. 产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号
  827. 请抽取以上内容并严格按上述4段PSV输出:
  828. """
  829. for line in data_list1:
  830. docid = int(line[0])
  831. html = docid_html_dict.get(docid)
  832. extract_json = docid_json_dict.get(docid)
  833. text = html2text_with_table_html(html)
  834. try:
  835. answer = extract_json_to_psv(json.loads(extract_json), empty_char=empty_char)
  836. if answer is None:
  837. continue
  838. except:
  839. continue
  840. train_data = {
  841. "instruction": instruction,
  842. "input": text,
  843. "output": answer,
  844. }
  845. all_data.append(json.dumps(train_data, ensure_ascii=False))
  846. # 生成
  847. train_ratio = 1.
  848. dev_ratio = 0.
  849. random.shuffle(all_data)
  850. total = len(all_data)
  851. train_num = int(total * train_ratio)
  852. dev_num = int(total * dev_ratio)
  853. # 拆分
  854. train_lines = all_data[:train_num]
  855. dev_lines = all_data[train_num:train_num+dev_num]
  856. test_lines = all_data[train_num+dev_num:]
  857. print('len(train_lines)', len(train_lines))
  858. print('len(test_lines)', len(test_lines))
  859. # 保存
  860. train_path = "data3/train_data.jsonl"
  861. dev_path = "data3/dev_data.jsonl"
  862. test_path = "data3/test_data.jsonl"
  863. with open(train_path, 'w', encoding='utf-8') as f:
  864. f.write("\n".join(train_lines))
  865. with open(dev_path, 'w', encoding='utf-8') as f:
  866. f.write("\n".join(dev_lines))
  867. with open(test_path, 'w', encoding='utf-8') as f:
  868. f.write("\n".join(test_lines))
  869. def xlsx_data_to_jsonl_4_prefix():
  870. df1 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\260403_ai_人工标注_招标人表达_3_再人工_html.xlsx')
  871. df2 = pd.read_excel(r'train_excel/260403_ai_人工标注_招标人表达_3_再人工.xlsx')
  872. data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist()
  873. data_list2 = df2.astype(object).where(pd.notnull(df2), "").values.tolist()
  874. docid_html_dict = {int(x[0]): x[1] for x in data_list1}
  875. docid_json_dict = {int(x[0]): [x[1], x[2]] for x in data_list2}
  876. all_data = []
  877. empty_char = '-'
  878. instruction = f"""
  879. 你是招投标要素抽取专家。
  880. 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
  881. 招标人|招标人表达
  882. 请抽取以上内容并严格按上述1段PSV输出:
  883. """
  884. for line in data_list1:
  885. docid = int(line[0])
  886. html = docid_html_dict.get(docid)
  887. tenderee_line, tenderee = docid_json_dict.get(docid)
  888. if len(tenderee) <= 1:
  889. tenderee = '-'
  890. if len(tenderee_line) <= 1:
  891. tenderee_line = '-'
  892. text = html2text_with_table_html(html)
  893. answer = f'[仅招标人]招标人|招标人表达\n{tenderee}|{tenderee_line}'
  894. train_data = {
  895. "instruction": instruction,
  896. "input": text,
  897. "output": answer,
  898. }
  899. all_data.append(json.dumps(train_data, ensure_ascii=False))
  900. # 生成
  901. train_ratio = 0.9
  902. dev_ratio = 0.1
  903. random.shuffle(all_data)
  904. total = len(all_data)
  905. train_num = int(total * train_ratio)
  906. dev_num = int(total * dev_ratio)
  907. # 拆分
  908. train_lines = all_data[:train_num]
  909. dev_lines = all_data[train_num:train_num+dev_num]
  910. test_lines = all_data[train_num+dev_num:]
  911. print('len(train_lines)', len(train_lines))
  912. print('len(dev_lines)', len(dev_lines))
  913. # 保存
  914. train_path = "data4_prefix/train_data.jsonl"
  915. dev_path = "data4_prefix/dev_data.jsonl"
  916. test_path = "data4_prefix/test_data.jsonl"
  917. with open(train_path, 'w', encoding='utf-8') as f:
  918. f.write("\n".join(train_lines))
  919. with open(dev_path, 'w', encoding='utf-8') as f:
  920. f.write("\n".join(dev_lines))
  921. with open(test_path, 'w', encoding='utf-8') as f:
  922. f.write("\n".join(test_lines))
  923. def xlsx_data_to_jsonl_5():
  924. df1 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\260403_ai_人工标注_招标人表达_3_再人工_html.xlsx')
  925. df2 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\260403_ai_人工标注_招标人表达_3_再人工_json.xlsx')
  926. df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\260403_ai_人工标注_招标人表达_3_再人工.xlsx')
  927. data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist()
  928. data_list2 = df2.astype(object).where(pd.notnull(df2), "").values.tolist()
  929. data_list3 = df3.astype(object).where(pd.notnull(df3), "").values.tolist()
  930. docid_html_dict = {int(x[0]): x[1] for x in data_list1}
  931. docid_json_dict = {int(x[0]): x[-1] for x in data_list2}
  932. docid_tenderee_dict = {int(x[0]): x[-1] for x in data_list3}
  933. all_data = []
  934. # instruction = '根据上述招投标行业公告,进行要素提取,输出psv格式:\n'
  935. empty_char = '-'
  936. instruction = f"""
  937. 你是招投标要素抽取专家。
  938. 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
  939. 项目名称|招标人名称|代理人名称
  940. 招标人联系人|招标人联系人电话
  941. 标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位
  942. 产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号
  943. 请抽取以上内容并严格按上述4段PSV输出:
  944. """
  945. for line in data_list1:
  946. docid = int(line[0])
  947. html = docid_html_dict.get(docid)
  948. extract_json = docid_json_dict.get(docid)
  949. labeled_tenderee = docid_tenderee_dict.get(docid)
  950. if len(labeled_tenderee) <= 1:
  951. labeled_tenderee = ''
  952. text = html2text_with_table_html(html)
  953. try:
  954. extract_json = json.loads(extract_json)
  955. # 用人工标注的替换json里的tenderee
  956. prem = extract_json.get('prem')
  957. project_dict = {
  958. 'code': "",
  959. 'name': "",
  960. 'roleList': [
  961. {
  962. 'address': "",
  963. 'linklist': [],
  964. 'role_money': {},
  965. 'role_name': 'tenderee',
  966. 'role_text': labeled_tenderee,
  967. }
  968. ],
  969. 'tendereeMoney': 0,
  970. 'tendereeMoneyUnit': "",
  971. }
  972. if not prem:
  973. prem = {'Project': project_dict}
  974. else:
  975. # 每个包都加上tenderee
  976. for k, d in prem.items():
  977. role_list = d.get('roleList')
  978. role_list += [
  979. {
  980. 'address': "",
  981. 'linklist': [],
  982. 'role_money': {},
  983. 'role_name': 'tenderee',
  984. 'role_text': labeled_tenderee,
  985. }
  986. ]
  987. d['roleList'] = role_list
  988. prem[k] = d
  989. extract_json['prem'] = prem
  990. answer = extract_json_to_psv(extract_json, empty_char=empty_char)
  991. if answer is None:
  992. print('answer is None')
  993. continue
  994. except:
  995. traceback.print_exc()
  996. continue
  997. train_data = {
  998. "instruction": instruction,
  999. "input": text,
  1000. "output": answer,
  1001. }
  1002. all_data.append(json.dumps(train_data, ensure_ascii=False))
  1003. # 生成
  1004. train_ratio = 0.9
  1005. dev_ratio = 0.1
  1006. random.shuffle(all_data)
  1007. total = len(all_data)
  1008. train_num = int(total * train_ratio)
  1009. dev_num = int(total * dev_ratio)
  1010. # 拆分
  1011. train_lines = all_data[:train_num]
  1012. dev_lines = all_data[train_num:train_num+dev_num]
  1013. test_lines = all_data[train_num+dev_num:]
  1014. print('len(train_lines)', len(train_lines))
  1015. print('len(dev_lines)', len(dev_lines))
  1016. # 保存
  1017. train_path = "data4/train_data.jsonl"
  1018. dev_path = "data4/dev_data.jsonl"
  1019. test_path = "data4/test_data.jsonl"
  1020. with open(train_path, 'w', encoding='utf-8') as f:
  1021. f.write("\n".join(train_lines))
  1022. with open(dev_path, 'w', encoding='utf-8') as f:
  1023. f.write("\n".join(dev_lines))
  1024. with open(test_path, 'w', encoding='utf-8') as f:
  1025. f.write("\n".join(test_lines))
  1026. def xlsx_data_to_jsonl_3_prefix():
  1027. df1 = pd.read_csv(r'D:\BIDI_DOC\比地_文档\export_ai_260327_2.csv')
  1028. df2 = pd.read_csv(r'C:\Users\Administrator\Downloads\document_tmp_has_ai_no_attachment_260327_limit.csv')
  1029. data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist()
  1030. data_list2 = df2.astype(object).where(pd.notnull(df2), "").values.tolist()
  1031. docid_html_dict = {int(x[0]): x[1] for x in data_list1}
  1032. docid_json_dict = {int(x[0]): x[-1] for x in data_list2}
  1033. all_data = []
  1034. # instruction = '根据上述招投标行业公告,进行要素提取,输出psv格式:\n'
  1035. empty_char = '-'
  1036. instruction = f"""
  1037. 你是招投标要素抽取专家。
  1038. 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
  1039. 项目名称|招标人名称|代理人名称
  1040. 招标人联系人|招标人联系人电话
  1041. 标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位
  1042. 产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号
  1043. 请抽取以上内容并严格按上述4段PSV输出:
  1044. """
  1045. instruction2 = f"""
  1046. 你是招投标要素抽取专家。
  1047. 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
  1048. 招标人|招标人表达
  1049. 请抽取以上内容并严格按上述1段PSV输出:
  1050. """
  1051. instruction3 = f"""
  1052. 你是招投标要素抽取专家。
  1053. 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
  1054. 产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号
  1055. 请抽取以上内容并严格按上述1段PSV输出:
  1056. """
  1057. for line in data_list1:
  1058. docid = int(line[0])
  1059. html = docid_html_dict.get(docid)
  1060. extract_json = docid_json_dict.get(docid)
  1061. text = html2text_with_table_html(html)
  1062. try:
  1063. answer = extract_json_to_psv_prefix(json.loads(extract_json), text,
  1064. empty_char=empty_char)
  1065. if answer is None:
  1066. continue
  1067. except:
  1068. continue
  1069. print('answer0', answer)
  1070. train_data = {
  1071. "instruction": instruction,
  1072. "input": text,
  1073. "output": answer,
  1074. }
  1075. all_data.append(json.dumps(train_data, ensure_ascii=False))
  1076. answer = extract_json_to_psv_prefix(json.loads(extract_json), text,
  1077. empty_char=empty_char, prefix='[仅招标人]')
  1078. print('answer1', answer)
  1079. if answer is None:
  1080. continue
  1081. train_data = {
  1082. "instruction": instruction2,
  1083. "input": text,
  1084. "output": answer,
  1085. }
  1086. all_data.append(json.dumps(train_data, ensure_ascii=False))
  1087. answer = extract_json_to_psv_prefix(json.loads(extract_json), text,
  1088. empty_char=empty_char, prefix='[仅产品]')
  1089. print('answer2', answer)
  1090. if answer is None:
  1091. continue
  1092. train_data = {
  1093. "instruction": instruction3,
  1094. "input": text,
  1095. "output": answer,
  1096. }
  1097. all_data.append(json.dumps(train_data, ensure_ascii=False))
  1098. # 生成
  1099. train_ratio = 0.9
  1100. dev_ratio = 0.1
  1101. random.shuffle(all_data)
  1102. total = len(all_data)
  1103. train_num = int(total * train_ratio)
  1104. dev_num = int(total * dev_ratio)
  1105. # 拆分
  1106. train_lines = all_data[:train_num]
  1107. dev_lines = all_data[train_num:train_num+dev_num]
  1108. test_lines = all_data[train_num+dev_num:]
  1109. print('len(train_lines)', len(train_lines))
  1110. print('len(dev_lines)', len(dev_lines))
  1111. # 保存
  1112. # train_path = "data3_prefix/train_data.jsonl"
  1113. # dev_path = "data3_prefix/dev_data.jsonl"
  1114. # test_path = "data3_prefix/test_data.jsonl"
  1115. train_path = "data7_prefix/train_data.jsonl"
  1116. dev_path = "data7_prefix/dev_data.jsonl"
  1117. test_path = "data7_prefix/test_data.jsonl"
  1118. with open(train_path, 'w', encoding='utf-8') as f:
  1119. f.write("\n".join(train_lines))
  1120. with open(dev_path, 'w', encoding='utf-8') as f:
  1121. f.write("\n".join(dev_lines))
  1122. with open(test_path, 'w', encoding='utf-8') as f:
  1123. f.write("\n".join(test_lines))
  1124. def entity_data_to_jsonl_prefix():
  1125. df1 = pd.read_excel(r'df_train.xlsx')
  1126. data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist()
  1127. empty_char = '-'
  1128. instruction = f"""
  1129. 你是招投标要素抽取专家。
  1130. 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
  1131. 项目名称|招标人名称|代理人名称
  1132. 招标人联系人|招标人联系人电话
  1133. 标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位
  1134. 产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号
  1135. 请抽取以上内容并严格按上述4段PSV输出:
  1136. """
  1137. instruction2 = f"""
  1138. 你是招投标要素抽取专家。
  1139. 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
  1140. 招标人|招标人表达
  1141. 请抽取以上内容并严格按上述1段PSV输出:
  1142. """
  1143. all_data = []
  1144. max_cnt = 2000
  1145. cnt = 0
  1146. for line in data_list1:
  1147. center = line[1]
  1148. docid = line[2]
  1149. label = line[4]
  1150. left = line[5]
  1151. right = line[8]
  1152. if label != '招标人':
  1153. continue
  1154. if cnt >= max_cnt:
  1155. break
  1156. text = left + center + right
  1157. answer = entity_to_psv_prefix(text, center, empty_char=empty_char, prefix='[仅招标人]')
  1158. train_data = {
  1159. "instruction": instruction2,
  1160. "input": text,
  1161. "output": answer,
  1162. }
  1163. all_data.append(json.dumps(train_data, ensure_ascii=False))
  1164. cnt += 1
  1165. # 生成
  1166. train_ratio = 0.9
  1167. dev_ratio = 0.1
  1168. random.shuffle(all_data)
  1169. total = len(all_data)
  1170. train_num = int(total * train_ratio)
  1171. dev_num = int(total * dev_ratio)
  1172. # 拆分
  1173. train_lines = all_data[:train_num]
  1174. dev_lines = all_data[train_num:train_num+dev_num]
  1175. test_lines = all_data[train_num+dev_num:]
  1176. print('len(train_lines)', len(train_lines))
  1177. print('len(dev_lines)', len(dev_lines))
  1178. # 保存
  1179. train_path = "data5_prefix/train_data.jsonl"
  1180. dev_path = "data5_prefix/dev_data.jsonl"
  1181. test_path = "data5_prefix/test_data.jsonl"
  1182. with open(train_path, 'w', encoding='utf-8') as f:
  1183. f.write("\n".join(train_lines))
  1184. with open(dev_path, 'w', encoding='utf-8') as f:
  1185. f.write("\n".join(dev_lines))
  1186. with open(test_path, 'w', encoding='utf-8') as f:
  1187. f.write("\n".join(test_lines))
  1188. def augment_jsonl_data():
  1189. tags = ['[仅招标人]', '[全字段]']
  1190. train_path = './data6_prefix/train_data.jsonl'
  1191. output_path = './data6_prefix_aug/train_data.jsonl'
  1192. data_dict_list = []
  1193. with open(train_path, 'r', encoding='utf-8') as f:
  1194. for line in f:
  1195. line = json.loads(line.strip())
  1196. # 构造Qwen的输入格式(Chat版格式)
  1197. prompt = f"<|im_start|>user\n{line['input']}\n{line['instruction']}<|im_end|>\n<|im_start|>assistant\n{line['output']}<|im_end|>"
  1198. input_data = line['input']
  1199. output_data = line['output']
  1200. # print('output_data', output_data)
  1201. data_dict_list.append(line)
  1202. # project_name, tenderee, agency 位置打乱
  1203. new_data_dict_list = []
  1204. for data_dict in data_dict_list:
  1205. # 50% 打乱
  1206. if random.choice([0, 1]):
  1207. new_data_dict_list.append(data_dict)
  1208. continue
  1209. text = data_dict['input']
  1210. output = data_dict['output']
  1211. now_tag = None
  1212. for tag in tags:
  1213. if tag in output:
  1214. output = re.sub(tag, '', str(output))
  1215. now_tag = tag
  1216. break
  1217. first_table = output.split('\n')[1]
  1218. if now_tag == '[仅招标人]':
  1219. tenderee, _ = first_table.split('|')[:2]
  1220. project_name = ''
  1221. agency = ''
  1222. else:
  1223. project_name, tenderee, agency = first_table.split('|')[:3]
  1224. if len(tenderee) <= 1:
  1225. continue
  1226. # sen_list = re.split('[,。;?!]', text)
  1227. sen_list = re.findall('[^,。;?!\n]+[,。;?!\n]?', text)
  1228. tenderee_sen_list = []
  1229. agency_sen_list = []
  1230. project_name_sen_list = []
  1231. for sen in sen_list:
  1232. match = re.search(re.escape(tenderee), sen)
  1233. if match:
  1234. tenderee_sen_list.append(sen)
  1235. if len(agency) > 1:
  1236. match = re.search(re.escape(agency), sen)
  1237. if match:
  1238. agency_sen_list.append(sen)
  1239. if len(project_name) > 1:
  1240. match = re.search(re.escape(project_name), sen)
  1241. if match:
  1242. project_name_sen_list.append(sen)
  1243. for sen in project_name_sen_list:
  1244. if sen in tenderee_sen_list:
  1245. tenderee_sen_list.remove(sen)
  1246. if len(tenderee_sen_list) == 0:
  1247. continue
  1248. if len(tenderee_sen_list) >= 2:
  1249. print('tenderee_sen_list', tenderee_sen_list)
  1250. continue
  1251. for sen in tenderee_sen_list + agency_sen_list + project_name_sen_list:
  1252. if sen in sen_list:
  1253. sen_list.remove(sen)
  1254. print('len(sen_list)', len(sen_list))
  1255. if len(sen_list) <= 1:
  1256. print('len(sen_list) <= 1', sen_list)
  1257. continue
  1258. random_index = random.randint(1, len(sen_list)-1)
  1259. tenderee_sen = tenderee_sen_list[0]
  1260. if '<' in tenderee_sen:
  1261. continue
  1262. print('tenderee_sen', tenderee_sen, tenderee)
  1263. sen_list = sen_list[:random_index] + [tenderee_sen] + sen_list[random_index:]
  1264. if agency_sen_list:
  1265. random_index = random.randint(1, len(sen_list)-1)
  1266. agency_sen = agency_sen_list[0]
  1267. print('agency_sen', agency_sen)
  1268. sen_list = sen_list[:random_index] + [agency_sen] + sen_list[random_index:]
  1269. if project_name_sen_list:
  1270. random_index = random.randint(1, len(sen_list)-1)
  1271. project_name_sen = project_name_sen_list[0]
  1272. print('project_name_sen', project_name_sen)
  1273. sen_list = sen_list[:random_index] + [project_name_sen] + sen_list[random_index:]
  1274. new_text = ''.join(sen_list)
  1275. data_dict['input'] = new_text
  1276. new_data_dict_list.append(data_dict)
  1277. print('len(new_data_dict_list)', len(new_data_dict_list))
  1278. _str = '\n'.join([json.dumps(x, ensure_ascii=False) for x in new_data_dict_list])
  1279. with open(output_path, 'w', encoding='utf-8') as f:
  1280. f.write(_str)
  1281. print('finish to', output_path)
  1282. def table_list_to_psv(table_list, empty_char, table_type=None):
  1283. final_str = ''
  1284. show_flag = 0
  1285. for table_cols, table_values in table_list:
  1286. str1 = '|'.join(table_cols) + '\n'
  1287. continue_flag = 0
  1288. if table_values:
  1289. str2 = ''
  1290. if len(table_values) >= 2:
  1291. # print('table_values', table_values)
  1292. show_flag = 1
  1293. for line in table_values:
  1294. if '|' in str(line):
  1295. continue_flag = 1
  1296. break
  1297. str2 += '|'.join([str(x) if str(x) != '' else empty_char for x in line]) + '\n'
  1298. if not str2:
  1299. str2 = '|'.join([empty_char for x in table_cols]) + '\n'
  1300. else:
  1301. str2 = '|'.join([empty_char for x in table_cols]) + '\n'
  1302. if continue_flag:
  1303. return None
  1304. # 判断表头和内容竖线是否相同
  1305. # for ss2 in str2.split('\n'):
  1306. # if len(ss2) == 0:
  1307. # continue
  1308. # if len(re.findall("\|", str1)) != len(re.findall("\|", ss2)):
  1309. # print('--- str1', str1)
  1310. # print('--- str2', ss2)
  1311. # return None
  1312. if len(re.findall("\|", str2)) % len(re.findall("\|", str1)) != 0:
  1313. print('--- str1', str1)
  1314. print('--- str2', str2)
  1315. return None
  1316. final_str += str1
  1317. final_str += str2
  1318. final_str += '\n'
  1319. final_str = re.sub('未提及', f'{empty_char}', final_str)
  1320. delete_value_list = ['None', '无', '无品牌', '无型号', '0', '0.0', '未提及']
  1321. for v in delete_value_list:
  1322. final_str = re.sub(f'\|{v}\|', f'|{empty_char}|', final_str)
  1323. final_str = re.sub(f'\|{v}\n', f'|{empty_char}\n', final_str)
  1324. final_str = re.sub(f'\n{v}\|', f'\n{empty_char}|', final_str)
  1325. return final_str
  1326. def saimofei_to_psv_prefix(data_list, text, prefix, empty_char='-'):
  1327. project_name = data_list[0].get('doctitle')
  1328. tenderee = data_list[0].get('tenderee')
  1329. agency = data_list[0].get('agency')
  1330. tenderee_contact_list = []
  1331. win_tenderer_info_list = []
  1332. product_list = []
  1333. for d in data_list:
  1334. # tenderee_contact_list
  1335. tenderee_contact = d.get('tenderee_contact')
  1336. if '/' in tenderee_contact:
  1337. tenderee_person, tenderee_phone = tenderee_contact.split('/')
  1338. else:
  1339. tenderee_phone = tenderee_contact
  1340. tenderee_person = ''
  1341. tenderee_contact_list.append([tenderee_person, tenderee_phone])
  1342. # win_tenderer_info_list
  1343. win_tenderer = d.get('tenderee_contact')
  1344. project_code = d.get('project_code')
  1345. budget = d.get('budget')
  1346. win_money = d.get('win_money')
  1347. budget = str(getUnifyMoney(budget))
  1348. win_money = str(getUnifyMoney(win_money))
  1349. win_tenderer_info = [
  1350. '-', project_code, win_tenderer, win_money,
  1351. '元', budget, '元',
  1352. ]
  1353. win_tenderer_info_list.append(win_tenderer_info)
  1354. # product_list
  1355. product_name = d.get('product_name')
  1356. brand = d.get('brand')
  1357. specs = d.get('specs')
  1358. product_cnt = d.get('product_cnt')
  1359. unit_price = d.get('unit_price')
  1360. total_price = d.get('total_price')
  1361. product_list.append([
  1362. product_name, brand, specs, product_cnt,
  1363. unit_price, total_price, '-', '-'
  1364. ])
  1365. tenderee_contact_list = list(set([json.dumps(x) for x in tenderee_contact_list]))
  1366. tenderee_contact_list = [json.loads(x) for x in tenderee_contact_list]
  1367. win_tenderer_info_list = list(set([json.dumps(x) for x in win_tenderer_info_list]))
  1368. win_tenderer_info_list = [json.loads(x) for x in win_tenderer_info_list]
  1369. product_list = list(set([json.dumps(x) for x in product_list]))
  1370. product_list = [json.loads(x) for x in product_list]
  1371. if prefix == '[全字段]':
  1372. table_list = []
  1373. # table 1
  1374. table_cols = ['项目名称', '招标人名称', '代理人名称']
  1375. table_values = [[project_name, tenderee, agency]]
  1376. table_list.append([table_cols, table_values])
  1377. # table 2
  1378. table_cols = ['招标人联系人', '招标人联系人电话']
  1379. # print('tenderee_contact_list', tenderee_contact_list)
  1380. table_values = tenderee_contact_list if tenderee_contact_list else []
  1381. temp_list = []
  1382. for v in table_values:
  1383. if (v[0] not in [None, '', '-'] and v[0] in text) \
  1384. or (v[1] not in [None, '', '-'] and v[1] in text):
  1385. temp_list.append(v)
  1386. table_values = temp_list
  1387. table_list.append([table_cols, table_values])
  1388. # table 3
  1389. table_cols = ['标段名称', '标段号', '中标人名称', '中标金额', '中标金额单位', '标段预算', '标段预算单位']
  1390. table_values = win_tenderer_info_list if win_tenderer_info_list else []
  1391. temp_list = []
  1392. for v in table_values:
  1393. if (v[0] not in [None, '', '-'] and v[0] in text) \
  1394. or (v[2] not in [None, '', '-'] and v[2] in text) \
  1395. or (v[1] not in [None, '', '-'] and v[1] in text):
  1396. temp_list.append(v)
  1397. table_values = temp_list
  1398. table_list.append([table_cols, table_values])
  1399. # table 4
  1400. table_cols = ['产品名称', '品牌', '规格型号', '数量', '单价', '总价', '品目名称', '品目编号']
  1401. table_values = product_list if product_list else []
  1402. temp_list = []
  1403. for v in table_values:
  1404. if v[0] not in [None, '', '-'] and v[0] in text:
  1405. temp_list.append(v)
  1406. table_values = temp_list
  1407. # # 产品中数值类型 重复3次
  1408. # for v in table_values:
  1409. # for col_i in [3, 4, 5]:
  1410. # try:
  1411. # col_v = float(v[col_i])
  1412. # if col_v > 0:
  1413. # v[col_i] = ','.join([v[col_i], v[col_i], v[col_i]])
  1414. # except:
  1415. # pass
  1416. table_list.append([table_cols, table_values])
  1417. final_str = table_list_to_psv(table_list, empty_char)
  1418. if not final_str:
  1419. return final_str
  1420. final_str = prefix + final_str
  1421. return final_str
  1422. elif prefix == '[仅招标人]':
  1423. if not tenderee:
  1424. return None
  1425. sen_list = re.findall('[^,。;?!\n]+[,。;?!\n]?', text)
  1426. tenderee_sen_list = []
  1427. for sen in sen_list:
  1428. match = re.search(re.escape(tenderee), sen)
  1429. if match:
  1430. tenderee_sen_list.append(sen)
  1431. if tenderee_sen_list:
  1432. tenderee_sen_list.sort(key=lambda x: len(x))
  1433. tenderee_line = tenderee_sen_list[0]
  1434. else:
  1435. tenderee_line = empty_char
  1436. table_list = []
  1437. table_cols = ['招标人', '招标人表达']
  1438. table_values = [[tenderee, tenderee_line]]
  1439. table_list.append([table_cols, table_values])
  1440. final_str = table_list_to_psv(table_list, empty_char)
  1441. if not final_str:
  1442. return final_str
  1443. final_str = prefix + final_str
  1444. return final_str
  1445. elif prefix == '[仅产品]':
  1446. table_cols = ['产品名称', '品牌', '规格型号', '数量', '单价', '总价', '品目名称', '品目编号']
  1447. table_values = product_list if product_list else []
  1448. # 判断截取后产品是否还在其中
  1449. # if len(text) >= 10000:
  1450. # sub_text = text[:10000]
  1451. temp_list = []
  1452. for v in table_values:
  1453. if v[0] not in [None, '', '-'] and v[0] in text:
  1454. temp_list.append(v)
  1455. table_values = temp_list
  1456. # # 产品中数值类型 重复3次
  1457. # for v in table_values:
  1458. # for col_i in [3, 4, 5]:
  1459. # try:
  1460. # col_v = float(v[col_i])
  1461. # if col_v > 0:
  1462. # v[col_i] = ','.join([v[col_i], v[col_i], v[col_i]])
  1463. # except:
  1464. # pass
  1465. table_list = []
  1466. table_list.append([table_cols, table_values])
  1467. final_str = table_list_to_psv(table_list, empty_char)
  1468. if not final_str:
  1469. return final_str
  1470. final_str = prefix + final_str
  1471. return final_str
  1472. def saimofei_data_to_jsonl_data():
  1473. df = pd.read_excel(r'C:\Users\Administrator\Downloads\赛默飞-样例数据.xlsx', header=1)
  1474. df1 = pd.read_csv(r'D:\BIDI_DOC\比地_文档\export_ai_260417_saimofei_html.csv')
  1475. head_list = list(df.columns)
  1476. data_list = df.astype(object).where(pd.notnull(df), "").values.tolist()
  1477. data_list1 = df1.astype(object).where(pd.notnull(df1), "").values.tolist()
  1478. docid_html_dict = {int(x[0]): x[1] for x in data_list1}
  1479. docid_data_dict = {}
  1480. for data in data_list:
  1481. docid = data[head_list.index('公告ID')]
  1482. doctitle = data[head_list.index('公告名称')]
  1483. budget = data[head_list.index('预算金额')]
  1484. win_money = data[head_list.index('成交金额')]
  1485. tenderee = data[head_list.index('招标单位')]
  1486. tenderee_contact = data[head_list.index('招标单位联系人')]
  1487. agency = data[head_list.index('代理机构')]
  1488. win_tenderer = data[head_list.index('中标单位')]
  1489. product_name = data[head_list.index('产品名称')]
  1490. brand = data[head_list.index('品牌名称')]
  1491. specs = data[head_list.index('型号')]
  1492. product_cnt = data[head_list.index('数量')]
  1493. unit_price = data[head_list.index('单价(元)')]
  1494. total_price = data[head_list.index('总价(元)')]
  1495. project_code = data[head_list.index('项目编号')]
  1496. new_data = {
  1497. 'doctitle': doctitle,
  1498. 'budget': budget,
  1499. 'win_money': win_money,
  1500. 'tenderee': tenderee,
  1501. 'tenderee_contact': tenderee_contact,
  1502. 'agency': agency,
  1503. 'win_tenderer': win_tenderer,
  1504. 'product_name': product_name,
  1505. 'brand': brand,
  1506. 'specs': specs,
  1507. 'product_cnt': product_cnt,
  1508. 'unit_price': unit_price,
  1509. 'total_price': total_price,
  1510. 'project_code': project_code,
  1511. }
  1512. if docid in docid_data_dict:
  1513. docid_data_dict[int(docid)] += [new_data]
  1514. else:
  1515. docid_data_dict[int(docid)] = [new_data]
  1516. all_data = []
  1517. empty_char = '-'
  1518. instruction = f"""
  1519. 你是招投标要素抽取专家。
  1520. 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
  1521. 项目名称|招标人名称|代理人名称
  1522. 招标人联系人|招标人联系人电话
  1523. 标段名称|标段号|中标人名称|中标金额|中标金额单位|标段预算|标段预算单位
  1524. 产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号
  1525. 请抽取以上内容并严格按上述4段PSV输出:
  1526. """
  1527. instruction2 = f"""
  1528. 你是招投标要素抽取专家。
  1529. 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
  1530. 招标人|招标人表达
  1531. 请抽取以上内容并严格按上述1段PSV输出:
  1532. """
  1533. instruction3 = f"""
  1534. 你是招投标要素抽取专家。
  1535. 请严格按下面PSV格式输出,禁止修改表头,空字段置为{empty_char},字段之间用|分隔,首尾不加|。
  1536. 产品名称|品牌|规格型号|数量|单价|总价|品目名称|品目编号
  1537. 请抽取以上内容并严格按上述1段PSV输出:
  1538. """
  1539. for docid, data_list in docid_data_dict.items():
  1540. html = docid_html_dict.get(int(docid))
  1541. text = html2text_with_table_html(html)
  1542. answer = saimofei_to_psv_prefix(data_list, text, prefix='[全字段]', empty_char=empty_char)
  1543. print('answer1', answer)
  1544. if not answer:
  1545. continue
  1546. train_data = {
  1547. "instruction": instruction,
  1548. "input": text,
  1549. "output": answer,
  1550. }
  1551. all_data.append(json.dumps(train_data, ensure_ascii=False))
  1552. answer = saimofei_to_psv_prefix(data_list, text, prefix='[仅招标人]', empty_char=empty_char)
  1553. print('answer2', answer)
  1554. if not answer:
  1555. continue
  1556. train_data = {
  1557. "instruction": instruction2,
  1558. "input": text,
  1559. "output": answer,
  1560. }
  1561. all_data.append(json.dumps(train_data, ensure_ascii=False))
  1562. answer = saimofei_to_psv_prefix(data_list, text, prefix='[仅产品]', empty_char=empty_char)
  1563. print('answer3', answer)
  1564. if not answer:
  1565. continue
  1566. train_data = {
  1567. "instruction": instruction3,
  1568. "input": text,
  1569. "output": answer,
  1570. }
  1571. all_data.append(json.dumps(train_data, ensure_ascii=False))
  1572. # 生成
  1573. train_ratio = 0.9
  1574. dev_ratio = 0.1
  1575. random.shuffle(all_data)
  1576. total = len(all_data)
  1577. train_num = int(total * train_ratio)
  1578. dev_num = int(total * dev_ratio)
  1579. # 拆分
  1580. train_lines = all_data[:train_num]
  1581. dev_lines = all_data[train_num:train_num+dev_num]
  1582. test_lines = all_data[train_num+dev_num:]
  1583. print('len(train_lines)', len(train_lines))
  1584. print('len(dev_lines)', len(dev_lines))
  1585. # 保存
  1586. train_path = "data6_prefix/train_data.jsonl"
  1587. dev_path = "data6_prefix/dev_data.jsonl"
  1588. test_path = "data6_prefix/test_data.jsonl"
  1589. with open(train_path, 'w', encoding='utf-8') as f:
  1590. f.write("\n".join(train_lines))
  1591. with open(dev_path, 'w', encoding='utf-8') as f:
  1592. f.write("\n".join(dev_lines))
  1593. with open(test_path, 'w', encoding='utf-8') as f:
  1594. f.write("\n".join(test_lines))
  1595. if __name__ == '__main__':
  1596. # filter_data_docid()
  1597. # xlsx_data_to_jsonl()
  1598. # xlsx_data_to_jsonl_2()
  1599. # xlsx_data_to_jsonl_3()
  1600. # xlsx_data_to_jsonl_4_prefix()
  1601. # xlsx_data_to_jsonl_5()
  1602. xlsx_data_to_jsonl_3_prefix()
  1603. # entity_data_to_jsonl_prefix()
  1604. # saimofei_data_to_jsonl_data()
  1605. # augment_jsonl_data()