# compare_result.py
  1. import json
  2. import re
  3. import traceback
  4. from glob import glob
  5. import pandas as pd
  6. from bs4 import BeautifulSoup
  7. from json_repair import repair_json, json_repair
  8. from get_data import psv_to_dict, psv_to_dict_prefix
  9. from compare_utils import compare_products
  10. def compare_extract_csv():
  11. df_tenderee = pd.read_csv(r'D:\BIDI_DOC\比地_文档\export_ai_260323_2_tenderee.csv')
  12. df1 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260323_1.xlsx')
  13. df2 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260323_3.xlsx')
  14. # df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260323_3_lora_2.xlsx')
  15. # df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260331_lora.xlsx')
  16. # df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260401_lora-r64.xlsx')
  17. # df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260401_lora_2B.xlsx')
  18. # df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260402_lora_2B.xlsx')
  19. # df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260407_0.8B.xlsx')
  20. # df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260408_0.8B.xlsx')
  21. # df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260409_0.8B.xlsx')
  22. # df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260413_0.8B.xlsx')
  23. df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260423_0.8B_prefix_all.xlsx')
  24. tenderee_data_list = df_tenderee.astype(object).where(pd.notnull(df_tenderee), None).values.tolist()
  25. data_list1 = df1.astype(object).where(pd.notnull(df1), None).values.tolist()
  26. data_list2 = df2.astype(object).where(pd.notnull(df2), None).values.tolist()
  27. data_list3 = df3.astype(object).where(pd.notnull(df3), None).values.tolist()
  28. tenderee_data_dict = {int(x[0]): x[1] for x in tenderee_data_list}
  29. data_dict1 = {int(x[0]): x[1] for x in data_list1}
  30. data_dict2 = {int(x[0]): x[1] for x in data_list2}
  31. data_dict3 = {int(x[0]): psv_to_dict(x[1]) for x in data_list3}
  32. for docid, v_dict in data_dict3.items():
  33. v1 = v_dict.get('招标信息')
  34. v2 = v_dict.get('招标人联系方式')
  35. if v1 is None or v2 is None:
  36. continue
  37. v1 = v1[0]
  38. if not v1:
  39. continue
  40. v1['招标人联系方式'] = v2
  41. v_dict['招标信息'] = v1
  42. data_dict3[docid] = v_dict
  43. for docid, v_dict in data_dict3.items():
  44. data_dict3[docid] = json.dumps(v_dict, ensure_ascii=False)
  45. # print('data_dict3', data_dict3.get(751300501))
  46. cols = ['招标信息', '中标信息', '产品信息']
  47. result_list = []
  48. tenderee_match_cnt = 0
  49. tenderee_match_empty_cnt = 0
  50. real_tenderee_match_cnt1 = 0
  51. real_tenderee_match_cnt2 = 0
  52. win_tenderer_match_cnt = 0
  53. agency_match_cnt = 0
  54. all_product_weight_score = 0
  55. all_product_complete_score = 0
  56. all_product_cnt_score = 0
  57. for docid, data1 in data_dict1.items():
  58. data2 = data_dict2.get(docid, '{}')
  59. data3 = data_dict3.get(docid, '{}')
  60. # if len(re.findall('}]}', data2)) != 2:
  61. # data2 = re.sub('}]', '}]}', data2, count=1)
  62. # if len(re.findall('}]}', data3)) != 2:
  63. # data3 = re.sub('}]', '}]}', data3, count=1)
  64. # if len(data1) >= 1000:
  65. # data1 = '{}'
  66. # if len(data2) >= 1000:
  67. # data2 = '{}'
  68. # if len(data3) >= 1000:
  69. # data3 = '{}'
  70. # if len(data1) >= 1000:
  71. # extract_json1 = '{}'
  72. # else:
  73. extract_json1 = repair_json(data1)
  74. if len(data2) >= 1000:
  75. extract_json2 = '{}'
  76. else:
  77. extract_json2 = repair_json(data2)
  78. # if len(data3) >= 1000:
  79. # extract_json3 = '{}'
  80. # else:
  81. # extract_json3 = repair_json(data3)
  82. # print('data2', data2)
  83. extract_json3 = repair_json(data3)
  84. extract_json1 = json_repair.loads(extract_json1)
  85. extract_json2 = json_repair.loads(extract_json2)
  86. extract_json3 = json_repair.loads(extract_json3)
  87. # print('extract_json2', extract_json2)
  88. if type(extract_json1) != dict:
  89. extract_json1 = {}
  90. if type(extract_json2) != dict:
  91. extract_json2 = {}
  92. if type(extract_json3) != dict:
  93. extract_json3 = {}
  94. col_match_list = []
  95. for col in cols:
  96. str1 = json.dumps(extract_json1.get(col, ""), ensure_ascii=False)
  97. str2 = json.dumps(extract_json2.get(col, ""), ensure_ascii=False)
  98. str3 = json.dumps(extract_json3.get(col, ""), ensure_ascii=False)
  99. if str1 != str3:
  100. col_match_list += [str1, str2, str3, 0]
  101. else:
  102. col_match_list += [str1, str2, str3, 1]
  103. tenderee_real = tenderee_data_dict.get(docid)
  104. tenderee1 = extract_json1.get('招标信息', {}).get('招标人名称', "")
  105. try:
  106. tenderee3 = extract_json3.get('招标信息', {}).get('招标人名称', "")
  107. except:
  108. print('no 招标信息 招标人名称 extract_json3', extract_json3)
  109. # raise
  110. tenderee3 = ''
  111. if re.sub(' ', '', tenderee1) == re.sub(' ', '', tenderee3):
  112. tenderee_match_cnt += 1
  113. if not tenderee1:
  114. tenderee_match_empty_cnt += 1
  115. if tenderee1 == tenderee_real:
  116. real_tenderee_match_cnt1 += 1
  117. if tenderee3 == tenderee_real:
  118. real_tenderee_match_cnt2 += 1
  119. win_tenderer1 = [x.get('中标人名称', "") for x in extract_json1.get('中标信息', [])]
  120. win_tenderer1 = list(set(win_tenderer1))
  121. win_tenderer1.sort(key=lambda x: x)
  122. win_tenderer1 = ','.join(win_tenderer1)
  123. try:
  124. win_tenderer3 = [x.get('中标人名称', "") for x in extract_json3.get('中标信息', [])]
  125. win_tenderer3 = list(set(win_tenderer3))
  126. win_tenderer3.sort(key=lambda x: x)
  127. win_tenderer3 = ','.join(win_tenderer3)
  128. except:
  129. print('no 中标信息 中标人名称 extract_json3', extract_json3)
  130. # raise
  131. win_tenderer3 = ''
  132. if win_tenderer1 == win_tenderer3:
  133. print('win_tenderer1 == win_tenderer3', win_tenderer1, win_tenderer3)
  134. win_tenderer_match_cnt += 1
  135. agency1 = extract_json1.get('招标信息', {}).get('代理人名称', "")
  136. try:
  137. agency3 = extract_json3.get('招标信息', {}).get('代理人名称', "")
  138. except:
  139. print('no 招标信息 中标人名称 extract_json3', extract_json3)
  140. # raise
  141. agency3 = ''
  142. if agency1 == agency3:
  143. agency_match_cnt += 1
  144. # 计算products匹配率
  145. products1 = extract_json1.get('产品信息')
  146. products3 = extract_json3.get('产品信息')
  147. # print('products3', products3)
  148. product_weight_score, product_complete_score, product_cnt_score = compare_products(products1, products3)
  149. all_product_weight_score += product_weight_score
  150. all_product_complete_score += product_complete_score
  151. all_product_cnt_score += product_cnt_score
  152. result_list.append(
  153. [docid, data1, data2, data3] +
  154. col_match_list +
  155. [tenderee1, tenderee3, tenderee_real] +
  156. [str(tenderee1==tenderee3), str(tenderee1==tenderee_real), str(tenderee3==tenderee_real)] +
  157. [win_tenderer1, win_tenderer3, str(win_tenderer1==win_tenderer3)] +
  158. [agency1, agency3, str(agency1==agency3)]
  159. )
  160. df_new = pd.DataFrame(result_list)
  161. df_new.columns = ['docid',
  162. 'doubao', 'qwen', 'qwen-lora',
  163. '招标信息1', '招标信息2', '招标信息3', '招标信息相同',
  164. '中标信息1', '中标信息2', '中标信息3', '中标信息相同',
  165. '产品信息1', '产品信息2', '产品信息3', '产品信息相同',
  166. 'tenderee_doubao', 'tenderee_qwen', '线上_tenderee',
  167. 'doubao_qwen_相同', 'doubao_线上_相同', 'qwen_线上相同',
  168. 'win_doubao', 'win_qwen', 'doubao_qwen_相同',
  169. 'agency_doubao', 'agency_qwen', 'doubao_qwen_相同',
  170. ]
  171. df_new.to_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260423_compare_doubao_0.8B_prefix.xlsx', index=False)
  172. all_cnt = len(data_dict1)
  173. print('tenderee_match_cnt', tenderee_match_cnt, tenderee_match_cnt/all_cnt, 'tenderee_match_empty_cnt', tenderee_match_empty_cnt)
  174. print('real_tenderee_match_cnt1', real_tenderee_match_cnt1, real_tenderee_match_cnt1/all_cnt)
  175. print('real_tenderee_match_cnt2', real_tenderee_match_cnt2, real_tenderee_match_cnt2/all_cnt)
  176. print('win_tenderer_match_cnt', win_tenderer_match_cnt, win_tenderer_match_cnt/all_cnt)
  177. print('agency_match_cnt', agency_match_cnt, agency_match_cnt/all_cnt)
  178. print('product_weight_score', all_product_weight_score / all_cnt)
  179. print('product_complete_score', all_product_complete_score / all_cnt)
  180. print('product_cnt_score', all_product_cnt_score / all_cnt)
  181. def compare_extract_csv_prefix():
  182. # df_tenderee = pd.read_csv(r'D:\BIDI_DOC\比地_文档\export_ai_260323_2_tenderee.csv')
  183. df_tenderee1 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\招标人标注_260414.xlsx', sheet_name=0)
  184. df_tenderee2 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\招标人标注_260414.xlsx', sheet_name=1)
  185. df_tenderee3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\招标人标注_260414.xlsx', sheet_name=2)
  186. df_tenderee = pd.concat([df_tenderee1, df_tenderee2, df_tenderee3], ignore_index=True)
  187. df1 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260323_1.xlsx')
  188. df2 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260323_3.xlsx')
  189. # df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260413_0.8B_prefix_only_tenderee.xlsx')
  190. df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260413_0.8B_prefix_only_tenderee_all.xlsx')
  191. tenderee_data_list = df_tenderee.astype(object).where(pd.notnull(df_tenderee), None).values.tolist()
  192. data_list1 = df1.astype(object).where(pd.notnull(df1), None).values.tolist()
  193. data_list2 = df2.astype(object).where(pd.notnull(df2), None).values.tolist()
  194. data_list3 = df3.astype(object).where(pd.notnull(df3), None).values.tolist()
  195. tenderee_data_dict = {int(x[0]): x[1] for x in tenderee_data_list}
  196. data_dict1 = {int(x[0]): x[1] for x in data_list1}
  197. data_dict2 = {int(x[0]): x[1] for x in data_list2}
  198. data_dict3 = {int(x[0]): psv_to_dict_prefix(x[1]) for x in data_list3}
  199. for docid, v_dict in data_dict3.items():
  200. data_dict3[docid] = json.dumps(v_dict, ensure_ascii=False)
  201. cols = ['招标信息', '中标信息', '产品信息']
  202. result_list = []
  203. tenderee_match_cnt = 0
  204. tenderee_match_empty_cnt = 0
  205. real_tenderee_match_cnt1 = 0
  206. real_tenderee_match_cnt2 = 0
  207. for docid, tenderee_data in tenderee_data_dict.items():
  208. data1 = data_dict1.get(docid, '{}')
  209. data2 = data_dict2.get(docid, '{}')
  210. data3 = data_dict3.get(docid, '{}')
  211. extract_json1 = repair_json(data1)
  212. if len(data2) >= 1000:
  213. extract_json2 = '{}'
  214. else:
  215. extract_json2 = repair_json(data2)
  216. extract_json3 = repair_json(data3)
  217. extract_json1 = json_repair.loads(extract_json1)
  218. extract_json2 = json_repair.loads(extract_json2)
  219. extract_json3 = json_repair.loads(extract_json3)
  220. if type(extract_json1) != dict:
  221. extract_json1 = {}
  222. if type(extract_json2) != dict:
  223. extract_json2 = {}
  224. if type(extract_json3) != dict:
  225. extract_json3 = {}
  226. col_match_list = []
  227. for col in cols:
  228. str1 = json.dumps(extract_json1.get(col, ""), ensure_ascii=False)
  229. str2 = json.dumps(extract_json2.get(col, ""), ensure_ascii=False)
  230. str3 = json.dumps(extract_json3.get(col, ""), ensure_ascii=False)
  231. if str1 != str3:
  232. col_match_list += [str1, str2, str3, 0]
  233. else:
  234. col_match_list += [str1, str2, str3, 1]
  235. tenderee_real = tenderee_data_dict.get(docid)
  236. if tenderee_real in ['文中无招标人', '-']:
  237. tenderee_real = ''
  238. tenderee1 = extract_json1.get('招标信息', {}).get('招标人名称', "")
  239. try:
  240. tenderee3 = extract_json3.get('招标信息', {}).get('招标人名称', "")
  241. except:
  242. print('no 招标信息 招标人名称 extract_json3', extract_json3)
  243. # raise
  244. tenderee3 = ''
  245. if tenderee1 in ['文中无招标人', '-']:
  246. tenderee1 = ''
  247. if tenderee3 in ['文中无招标人', '-']:
  248. tenderee3 = ''
  249. if re.sub(' ', '', tenderee1) == re.sub(' ', '', tenderee3):
  250. tenderee_match_cnt += 1
  251. if not tenderee1:
  252. tenderee_match_empty_cnt += 1
  253. if tenderee1 == tenderee_real:
  254. real_tenderee_match_cnt1 += 1
  255. if tenderee3 == tenderee_real:
  256. real_tenderee_match_cnt2 += 1
  257. result_list.append(
  258. [docid, data1, data2, data3] +
  259. col_match_list +
  260. [tenderee1, tenderee3, tenderee_real] +
  261. [str(tenderee1==tenderee3), str(tenderee1==tenderee_real), str(tenderee3==tenderee_real)]
  262. )
  263. df_new = pd.DataFrame(result_list)
  264. df_new.columns = ['docid', 'doubao', 'qwen', 'qwen-lora',
  265. '招标信息1', '招标信息2', '招标信息3', '招标信息相同',
  266. '中标信息1', '中标信息2', '中标信息3', '中标信息相同',
  267. '产品信息1', '产品信息2', '产品信息3', '产品信息相同',
  268. 'tenderee_doubao', 'tenderee_qwen', '线上_tenderee',
  269. 'doubao_qwen_相同', 'doubao_线上_相同', 'qwen_线上相同',
  270. ]
  271. df_new.to_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260414_compare_only_tenderee_labeled.xlsx', index=False)
  272. print('tenderee_match_cnt', tenderee_match_cnt, 'tenderee_match_empty_cnt', tenderee_match_empty_cnt)
  273. print('real_tenderee_match_cnt1', real_tenderee_match_cnt1, real_tenderee_match_cnt1/len(tenderee_data_dict))
  274. print('real_tenderee_match_cnt2', real_tenderee_match_cnt2, real_tenderee_match_cnt2/len(tenderee_data_dict))
  275. print('len(tenderee_data_dict)', len(tenderee_data_dict))
  276. def compare_extract_csv_prefix2():
  277. file_path1 = r'C:\Users\Administrator\Downloads\招标人_中标人_标注_260421 (1).xlsx'
  278. file_path2 = r'D:\BIDI_DOC\比地_文档\export_ai_260506_doubao.xlsx'
  279. file_path3 = r'D:\BIDI_DOC\比地_文档\export_ai_260506_0.8B_prefix_all.xlsx'
  280. file_path4 = r'D:\BIDI_DOC\比地_文档\招标人_中标人_标注_260421_tenderee_win_2.csv'
  281. data_list1 = []
  282. for sheet_no in range(0, 7):
  283. df1 = pd.read_excel(file_path1, sheet_name=sheet_no)
  284. data_list1 += df1.astype(object).where(pd.notnull(df1), None).values.tolist()
  285. print('len(data_list1)', len(data_list1), data_list1[0])
  286. df2 = pd.read_excel(file_path2)
  287. df3 = pd.read_excel(file_path3)
  288. df4 = pd.read_csv(file_path4)
  289. data_list2 = df2.astype(object).where(pd.notnull(df2), None).values.tolist()
  290. data_list3 = df3.astype(object).where(pd.notnull(df3), None).values.tolist()
  291. data_list4 = df4.astype(object).where(pd.notnull(df4), None).values.tolist()
  292. data_dict_label = {int(x[0]): {'招标信息': {'招标人名称': x[3]}, '中标信息': [{'中标人名称': x[4]}]} for x in data_list1}
  293. data_dict_doubao = {int(x[0]): x[2] for x in data_list2}
  294. data_dict_qwen = {int(x[0]): psv_to_dict(x[1]) for x in data_list3}
  295. data_dict_online = {int(x[0]): {'招标信息': {'招标人名称': x[1]}, '中标信息': [{'中标人名称': x[2]}]} for x in data_list4}
  296. for docid, v_dict in data_dict_qwen.items():
  297. data_dict_qwen[docid] = json.dumps(v_dict, ensure_ascii=False)
  298. for docid, v_dict in data_dict_label.items():
  299. data_dict_label[docid] = json.dumps(v_dict, ensure_ascii=False)
  300. for docid, v_dict in data_dict_online.items():
  301. data_dict_online[docid] = json.dumps(v_dict, ensure_ascii=False)
  302. cols = ['招标信息', '中标信息', '产品信息']
  303. result_list = []
  304. tenderee_match_cnt = 0
  305. tenderee_match_empty_cnt = 0
  306. real_tenderee_match_cnt1 = 0
  307. real_tenderee_match_cnt2 = 0
  308. real_tenderee_match_cnt3 = 0
  309. win_match_cnt = 0
  310. win_match_empty_cnt = 0
  311. real_win_match_cnt1 = 0
  312. real_win_match_cnt2 = 0
  313. real_win_match_cnt3 = 0
  314. for docid, label_data in data_dict_label.items():
  315. data1 = data_dict_label.get(docid, '{}')
  316. data2 = data_dict_doubao.get(docid, '{}')
  317. data3 = data_dict_qwen.get(docid, '{}')
  318. data4 = data_dict_online.get(docid, '{}')
  319. extract_json1 = repair_json(data1)
  320. if len(data2) >= 1000:
  321. extract_json2 = '{}'
  322. else:
  323. extract_json2 = repair_json(data2)
  324. extract_json3 = repair_json(data3)
  325. extract_json4 = repair_json(data4)
  326. extract_json1 = json_repair.loads(extract_json1)
  327. extract_json2 = json_repair.loads(extract_json2)
  328. extract_json3 = json_repair.loads(extract_json3)
  329. extract_json4 = json_repair.loads(extract_json4)
  330. if type(extract_json1) != dict:
  331. extract_json1 = {}
  332. if type(extract_json2) != dict:
  333. extract_json2 = {}
  334. if type(extract_json3) != dict:
  335. extract_json3 = {}
  336. if type(extract_json4) != dict:
  337. extract_json4 = {}
  338. col_match_list = []
  339. for col in cols:
  340. str1 = json.dumps(extract_json1.get(col, ""), ensure_ascii=False)
  341. str2 = json.dumps(extract_json2.get(col, ""), ensure_ascii=False)
  342. str3 = json.dumps(extract_json3.get(col, ""), ensure_ascii=False)
  343. if str1 != str3:
  344. col_match_list += [str1, str2, str3, 0]
  345. else:
  346. col_match_list += [str1, str2, str3, 1]
  347. tenderee_label = extract_json1.get('招标信息', []).get('招标人名称', "")
  348. try:
  349. tenderee_doubao = extract_json2.get('招标信息', {}).get('招标人名称', "")
  350. except:
  351. print('no 招标信息 招标人名称 extract_json2', extract_json2)
  352. tenderee_doubao = ''
  353. try:
  354. tenderee_qwen = extract_json3.get('招标信息', [])
  355. if tenderee_qwen:
  356. tenderee_qwen = tenderee_qwen[0].get('招标人名称', "")
  357. else:
  358. tenderee_qwen = ''
  359. except:
  360. print('no 招标信息 招标人名称 extract_json3', type(extract_json3), extract_json3)
  361. traceback.print_exc()
  362. tenderee_qwen = ''
  363. tenderee_online = extract_json4.get('招标信息', []).get('招标人名称', "")
  364. if tenderee_label in ['文中无招标人', '-', '无', None]:
  365. tenderee_label = ''
  366. if tenderee_doubao in ['文中无招标人', '-', '无', None] or type(tenderee_doubao) != str:
  367. tenderee_doubao = ''
  368. if tenderee_qwen in ['文中无招标人', '-', '无', None] or type(tenderee_qwen) != str:
  369. tenderee_qwen = ''
  370. if tenderee_online in ['文中无招标人', '-', '无', None] or type(tenderee_online) != str:
  371. tenderee_online = ''
  372. # print('tenderee_doubao', tenderee_doubao)
  373. if re.sub(' ', '', tenderee_doubao) == re.sub(' ', '', tenderee_qwen):
  374. tenderee_match_cnt += 1
  375. if not tenderee_doubao:
  376. tenderee_match_empty_cnt += 1
  377. if tenderee_doubao == tenderee_label:
  378. real_tenderee_match_cnt1 += 1
  379. if tenderee_qwen == tenderee_label:
  380. real_tenderee_match_cnt2 += 1
  381. if tenderee_online == tenderee_label:
  382. real_tenderee_match_cnt3 += 1
  383. win_label = extract_json1.get('中标信息', [])
  384. if win_label:
  385. win_label = win_label[0].get('中标人名称', "")
  386. else:
  387. win_label = ''
  388. win_doubao = extract_json2.get('中标信息', [])
  389. if win_doubao:
  390. win_doubao = win_doubao[0].get('中标人名称', "")
  391. else:
  392. win_doubao = ''
  393. try:
  394. win_qwen = extract_json3.get('中标信息', [])
  395. if win_qwen:
  396. win_qwen = win_qwen[0].get('中标人名称', "")
  397. else:
  398. win_qwen = ''
  399. except:
  400. print('no 中标人名称 extract_json3', extract_json3)
  401. win_qwen = ''
  402. win_online = extract_json4.get('中标信息', [])
  403. if win_online:
  404. win_online = win_online[0].get('中标人名称', "")
  405. else:
  406. win_online = ''
  407. if win_label in ['文中无招标人', '-', '无', None]:
  408. win_label = ''
  409. if win_doubao in ['文中无招标人', '-', '无', None]:
  410. win_doubao = ''
  411. if win_qwen in ['文中无招标人', '-', '无', None]:
  412. win_qwen = ''
  413. if win_online in ['文中无招标人', '-', '无', None]:
  414. win_online = ''
  415. # print('win_doubao', win_doubao, win_qwen)
  416. if re.sub(' ', '', win_doubao) == re.sub(' ', '', win_qwen):
  417. win_match_cnt += 1
  418. if not win_doubao:
  419. win_match_empty_cnt += 1
  420. if win_doubao == win_label:
  421. real_win_match_cnt1 += 1
  422. if win_qwen == win_label:
  423. real_win_match_cnt2 += 1
  424. if win_online == win_label:
  425. real_win_match_cnt3 += 1
  426. result_list.append(
  427. [docid, data1, data2, data3] +
  428. col_match_list +
  429. [tenderee_doubao, tenderee_qwen, tenderee_online, tenderee_label] +
  430. [str(tenderee_doubao==tenderee_qwen), str(tenderee_doubao==tenderee_label), str(tenderee_online==tenderee_label), str(tenderee_qwen==tenderee_label)] +
  431. [win_doubao, win_qwen, win_online, win_label] +
  432. [str(win_doubao==win_qwen), str(win_doubao==win_label), str(win_online==win_label), str(win_qwen==win_label)]
  433. )
  434. df_new = pd.DataFrame(result_list)
  435. df_new.columns = ['docid', 'doubao', 'qwen', 'qwen-lora',
  436. '招标信息1', '招标信息2', '招标信息3', '招标信息相同',
  437. '中标信息1', '中标信息2', '中标信息3', '中标信息相同',
  438. '产品信息1', '产品信息2', '产品信息3', '产品信息相同',
  439. 'tenderee_doubao', 'tenderee_qwen', 'tenderee_online', 'tenderee_label',
  440. 'doubao_qwen_相同', 'doubao_标注_相同', '线上_标注_相同', 'qwen_标注_相同',
  441. 'win_doubao', 'win_qwen', 'win_online', 'win_label',
  442. 'doubao_qwen_相同', 'doubao_标注_相同', '线上_标注_相同', 'qwen_标注_相同',
  443. ]
  444. df_new.to_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260506_compare_labeled.xlsx', index=False)
  445. print('tenderee_match_cnt', tenderee_match_cnt, 'tenderee_match_empty_cnt', tenderee_match_empty_cnt)
  446. print('real_tenderee_match_cnt1', real_tenderee_match_cnt1, real_tenderee_match_cnt1/len(data_dict_label))
  447. print('real_tenderee_match_cnt2', real_tenderee_match_cnt2, real_tenderee_match_cnt2/len(data_dict_label))
  448. print('real_tenderee_match_cnt3', real_tenderee_match_cnt3, real_tenderee_match_cnt3/len(data_dict_label))
  449. # print('len(tenderee_data_dict)', len(tenderee_data_dict))
  450. print('win_match_cnt', win_match_cnt, 'win_match_empty_cnt', win_match_empty_cnt)
  451. print('real_win_match_cnt1', real_win_match_cnt1, real_win_match_cnt1/len(data_dict_label))
  452. print('real_win_match_cnt2', real_win_match_cnt2, real_win_match_cnt2/len(data_dict_label))
  453. print('real_win_match_cnt3', real_win_match_cnt3, real_win_match_cnt3/len(data_dict_label))
  454. def compare_extract_entity():
  455. df1 = pd.read_excel('df_train.xlsx')
  456. df2 = pd.read_excel('df_train_doubao.xlsx')
  457. # df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_entity_260414_0.8B_prefix_only_tenderee.xlsx')
  458. df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_entity_260414_0.8B_prefix_only_tenderee_all.xlsx')
  459. df1 = df1[:200]
  460. data_list1 = df1.astype(object).where(pd.notnull(df1), None).values.tolist()
  461. data_list2 = df2.astype(object).where(pd.notnull(df2), None).values.tolist()
  462. data_list3 = df3.astype(object).where(pd.notnull(df3), None).values.tolist()
  463. data_dict1 = {x[2]: x for x in data_list1}
  464. data_dict2 = {x[0]: x[2] for x in data_list2}
  465. data_dict3 = {x[0]: psv_to_dict_prefix(x[1]) for x in data_list3}
  466. match_cnt_2 = 0
  467. match_cnt_3 = 0
  468. all_cnt = 0
  469. for docid, line in data_dict1.items():
  470. content = line[5] + line[1] + line[8]
  471. label = line[4]
  472. entity = line[1]
  473. if label == '其他角色':
  474. continue
  475. if line[4] != '招标人':
  476. tenderee1 = '-'
  477. continue
  478. else:
  479. tenderee1 = line[1]
  480. result_dict2 = data_dict2.get(docid)
  481. result_dict2 = repair_json(result_dict2)
  482. result_dict2 = json_repair.loads(result_dict2)
  483. tenderee2 = result_dict2.get('招标信息', {}).get('招标人名称')
  484. if not tenderee2:
  485. tenderee2 = '-'
  486. if tenderee1 == tenderee2:
  487. match_cnt_2 += 1
  488. else:
  489. print('not match2', tenderee1, tenderee2)
  490. print('content', content)
  491. print('label', label, entity)
  492. result_dict3 = data_dict3.get(docid)
  493. tenderee3 = result_dict3.get('招标信息', {}).get('招标人名称')
  494. if not tenderee3:
  495. tenderee3 = '-'
  496. if tenderee1 == tenderee3:
  497. match_cnt_3 += 1
  498. else:
  499. print('not match3', tenderee1, tenderee3)
  500. all_cnt += 1
  501. print('match_cnt_2', match_cnt_2, all_cnt, match_cnt_2 / all_cnt)
  502. print('match_cnt_3', match_cnt_3, all_cnt, match_cnt_3 / all_cnt)
if __name__ == '__main__':
    # Script entry point: exactly one comparison is run per invocation.
    # Uncomment the comparison to execute and comment out the others.
    # compare_extract_csv()
    # compare_extract_csv_prefix()
    # compare_extract_entity()
    compare_extract_csv_prefix2()