| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604 |
- import json
- import re
- import traceback
- from glob import glob
- import pandas as pd
- from bs4 import BeautifulSoup
- from json_repair import repair_json, json_repair
- from get_data import psv_to_dict, psv_to_dict_prefix
- from compare_utils import compare_products
def compare_extract_csv():
    """Compare the doubao, qwen and qwen-lora exports document by document.

    Reads the online tenderee CSV and the three model exports from hard-coded
    paths, parses every document's JSON-like output, writes a side-by-side
    comparison workbook and prints agreement counts/rates for the tenderee,
    the winning bidders and the agency, plus the averaged scores returned by
    ``compare_products``.

    Cells that are missing, not strings or not parseable into a dict are
    treated as empty extractions instead of aborting the whole run; names that
    are not strings (e.g. JSON null) are treated as ''.

    NOTE(review): this function was restored from a copy that had lost its
    indentation. Nesting of ``tenderee_match_empty_cnt`` under the tenderee
    match check is inferred from the counter's name -- confirm.
    """

    def _rows(df):
        # NaN cells become None; the frame is returned as a list of row lists.
        return df.astype(object).where(pd.notnull(df), None).values.tolist()

    def _parse(raw, max_len=None):
        # Parse one exported cell into a dict; anything unusable becomes {}.
        if not isinstance(raw, str):
            return {}
        if max_len is not None and len(raw) >= max_len:
            return {}
        parsed = json_repair.loads(repair_json(raw))
        return parsed if isinstance(parsed, dict) else {}

    def _bid_field(extracted, key, tag):
        # Read 招标信息[key]; a non-dict section or non-string value yields ''.
        try:
            value = extracted.get('招标信息', {}).get(key, "")
        except AttributeError:
            print(f'no 招标信息 {key} {tag}', extracted)
            return ''
        return value if isinstance(value, str) else ''

    def _winner_names(extracted, tag):
        # Distinct winner names, sorted and comma-joined; '' when malformed.
        try:
            names = {item.get('中标人名称', "") for item in extracted.get('中标信息', [])}
            return ','.join(sorted(names))
        except (AttributeError, TypeError):
            print(f'no 中标信息 中标人名称 {tag}', extracted)
            return ''

    df_tenderee = pd.read_csv(r'D:\BIDI_DOC\比地_文档\export_ai_260323_2_tenderee.csv')
    df1 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260323_1.xlsx')
    df2 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260323_3.xlsx')
    df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260423_0.8B_prefix_all.xlsx')

    tenderee_data_dict = {int(x[0]): x[1] for x in _rows(df_tenderee)}
    data_dict1 = {int(x[0]): x[1] for x in _rows(df1)}
    data_dict2 = {int(x[0]): x[1] for x in _rows(df2)}

    # The third export is PSV text. When both sections are present, the first
    # 招标信息 entry absorbs 招标人联系方式 and replaces the list, then the
    # document is serialised to JSON like the other two exports.
    data_dict3 = {}
    for x in _rows(df3):
        v_dict = psv_to_dict(x[1])
        bid_info = v_dict.get('招标信息')
        contact = v_dict.get('招标人联系方式')
        # `bid_info` truthiness also skips an empty list, which used to raise
        # IndexError on `[0]`.
        if bid_info and contact is not None:
            first = bid_info[0]
            if first:
                first['招标人联系方式'] = contact
                v_dict['招标信息'] = first
        data_dict3[int(x[0])] = json.dumps(v_dict, ensure_ascii=False)

    cols = ['招标信息', '中标信息', '产品信息']
    result_list = []
    tenderee_match_cnt = 0
    tenderee_match_empty_cnt = 0
    real_tenderee_match_cnt1 = 0
    real_tenderee_match_cnt2 = 0
    win_tenderer_match_cnt = 0
    agency_match_cnt = 0
    all_product_weight_score = 0
    all_product_complete_score = 0
    all_product_cnt_score = 0

    for docid, data1 in data_dict1.items():
        data2 = data_dict2.get(docid, '{}')
        data3 = data_dict3.get(docid, '{}')

        extract_json1 = _parse(data1)
        # Only the second export is length-capped: outputs of 1000+ characters
        # are discarded.
        extract_json2 = _parse(data2, max_len=1000)
        extract_json3 = _parse(data3)

        # Per-section serialisations plus a 1/0 flag for export 1 == export 3.
        col_match_list = []
        for col in cols:
            str1 = json.dumps(extract_json1.get(col, ""), ensure_ascii=False)
            str2 = json.dumps(extract_json2.get(col, ""), ensure_ascii=False)
            str3 = json.dumps(extract_json3.get(col, ""), ensure_ascii=False)
            col_match_list += [str1, str2, str3, int(str1 == str3)]

        tenderee_real = tenderee_data_dict.get(docid)
        tenderee1 = _bid_field(extract_json1, '招标人名称', 'extract_json1')
        tenderee3 = _bid_field(extract_json3, '招标人名称', 'extract_json3')
        # The counter ignores spaces; the per-row flag below is a raw compare.
        if tenderee1.replace(' ', '') == tenderee3.replace(' ', ''):
            tenderee_match_cnt += 1
            if not tenderee1:
                tenderee_match_empty_cnt += 1
        if tenderee1 == tenderee_real:
            real_tenderee_match_cnt1 += 1
        if tenderee3 == tenderee_real:
            real_tenderee_match_cnt2 += 1

        win_tenderer1 = _winner_names(extract_json1, 'extract_json1')
        win_tenderer3 = _winner_names(extract_json3, 'extract_json3')
        if win_tenderer1 == win_tenderer3:
            print('win_tenderer1 == win_tenderer3', win_tenderer1, win_tenderer3)
            win_tenderer_match_cnt += 1

        agency1 = _bid_field(extract_json1, '代理人名称', 'extract_json1')
        agency3 = _bid_field(extract_json3, '代理人名称', 'extract_json3')
        if agency1 == agency3:
            agency_match_cnt += 1

        # Product match scores.
        products1 = extract_json1.get('产品信息')
        products3 = extract_json3.get('产品信息')
        product_weight_score, product_complete_score, product_cnt_score = compare_products(products1, products3)
        all_product_weight_score += product_weight_score
        all_product_complete_score += product_complete_score
        all_product_cnt_score += product_cnt_score

        result_list.append(
            [docid, data1, data2, data3] +
            col_match_list +
            [tenderee1, tenderee3, tenderee_real] +
            [str(tenderee1 == tenderee3), str(tenderee1 == tenderee_real), str(tenderee3 == tenderee_real)] +
            [win_tenderer1, win_tenderer3, str(win_tenderer1 == win_tenderer3)] +
            [agency1, agency3, str(agency1 == agency3)]
        )

    # NOTE(review): 'doubao_qwen_相同' appears three times (tenderee, winner,
    # agency); headers are kept as-is so existing consumers still match.
    columns = [
        'docid',
        'doubao', 'qwen', 'qwen-lora',
        '招标信息1', '招标信息2', '招标信息3', '招标信息相同',
        '中标信息1', '中标信息2', '中标信息3', '中标信息相同',
        '产品信息1', '产品信息2', '产品信息3', '产品信息相同',
        'tenderee_doubao', 'tenderee_qwen', '线上_tenderee',
        'doubao_qwen_相同', 'doubao_线上_相同', 'qwen_线上相同',
        'win_doubao', 'win_qwen', 'doubao_qwen_相同',
        'agency_doubao', 'agency_qwen', 'doubao_qwen_相同',
    ]
    # Passing columns to the constructor also works when there are no rows.
    df_new = pd.DataFrame(result_list, columns=columns)
    df_new.to_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260423_compare_doubao_0.8B_prefix.xlsx', index=False)

    all_cnt = len(data_dict1)
    denom = all_cnt or 1  # avoid ZeroDivisionError on an empty export
    print('tenderee_match_cnt', tenderee_match_cnt, tenderee_match_cnt / denom, 'tenderee_match_empty_cnt', tenderee_match_empty_cnt)
    print('real_tenderee_match_cnt1', real_tenderee_match_cnt1, real_tenderee_match_cnt1 / denom)
    print('real_tenderee_match_cnt2', real_tenderee_match_cnt2, real_tenderee_match_cnt2 / denom)
    print('win_tenderer_match_cnt', win_tenderer_match_cnt, win_tenderer_match_cnt / denom)
    print('agency_match_cnt', agency_match_cnt, agency_match_cnt / denom)
    print('product_weight_score', all_product_weight_score / denom)
    print('product_complete_score', all_product_complete_score / denom)
    print('product_cnt_score', all_product_cnt_score / denom)
def compare_extract_csv_prefix():
    """Score tenderee extraction of two exports against annotated tenderees.

    Loads the annotation workbook (its first three sheets are concatenated),
    the two model exports and the prefix-format export from hard-coded paths.
    For every annotated document it compares the extracted 招标人名称 values
    with each other and with the annotation, writes a comparison workbook and
    prints match counts and accuracies.

    The placeholders '文中无招标人' and '-' are treated as "no tenderee" ('').
    Cells that are missing, not strings or not parseable into a dict count as
    empty extractions; non-string names count as ''.

    NOTE(review): this function was restored from a copy that had lost its
    indentation. Nesting of ``tenderee_match_empty_cnt`` under the tenderee
    match check is inferred from the counter's name -- confirm.
    """
    placeholders = ('文中无招标人', '-')

    def _rows(df):
        # NaN cells become None; the frame is returned as a list of row lists.
        return df.astype(object).where(pd.notnull(df), None).values.tolist()

    def _parse(raw, max_len=None):
        # Parse one exported cell into a dict; anything unusable becomes {}.
        if not isinstance(raw, str):
            return {}
        if max_len is not None and len(raw) >= max_len:
            return {}
        parsed = json_repair.loads(repair_json(raw))
        return parsed if isinstance(parsed, dict) else {}

    def _tenderee(extracted, tag):
        # Read 招标信息.招标人名称; malformed sections and placeholders give ''.
        try:
            value = extracted.get('招标信息', {}).get('招标人名称', "")
        except AttributeError:
            print(f'no 招标信息 招标人名称 {tag}', extracted)
            return ''
        if not isinstance(value, str) or value in placeholders:
            return ''
        return value

    annotation_path = r'D:\BIDI_DOC\比地_文档\招标人标注_260414.xlsx'
    df_tenderee = pd.concat(
        [pd.read_excel(annotation_path, sheet_name=sheet_no) for sheet_no in range(3)],
        ignore_index=True,
    )
    df1 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260323_1.xlsx')
    df2 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260323_3.xlsx')
    df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260413_0.8B_prefix_only_tenderee_all.xlsx')

    tenderee_data_dict = {int(x[0]): x[1] for x in _rows(df_tenderee)}
    data_dict1 = {int(x[0]): x[1] for x in _rows(df1)}
    data_dict2 = {int(x[0]): x[1] for x in _rows(df2)}
    # The prefix export is converted to a dict and serialised to JSON so it
    # goes through the same parsing path as the other exports.
    data_dict3 = {
        int(x[0]): json.dumps(psv_to_dict_prefix(x[1]), ensure_ascii=False)
        for x in _rows(df3)
    }

    cols = ['招标信息', '中标信息', '产品信息']
    result_list = []
    tenderee_match_cnt = 0
    tenderee_match_empty_cnt = 0
    real_tenderee_match_cnt1 = 0
    real_tenderee_match_cnt2 = 0

    for docid, tenderee_real in tenderee_data_dict.items():
        data1 = data_dict1.get(docid, '{}')
        data2 = data_dict2.get(docid, '{}')
        data3 = data_dict3.get(docid, '{}')

        extract_json1 = _parse(data1)
        # Only the second export is length-capped at 1000 characters.
        extract_json2 = _parse(data2, max_len=1000)
        extract_json3 = _parse(data3)

        # Per-section serialisations plus a 1/0 flag for export 1 == export 3.
        col_match_list = []
        for col in cols:
            str1 = json.dumps(extract_json1.get(col, ""), ensure_ascii=False)
            str2 = json.dumps(extract_json2.get(col, ""), ensure_ascii=False)
            str3 = json.dumps(extract_json3.get(col, ""), ensure_ascii=False)
            col_match_list += [str1, str2, str3, int(str1 == str3)]

        if tenderee_real in placeholders:
            tenderee_real = ''
        tenderee1 = _tenderee(extract_json1, 'extract_json1')
        tenderee3 = _tenderee(extract_json3, 'extract_json3')

        # The counter ignores spaces; the per-row flag below is a raw compare.
        if tenderee1.replace(' ', '') == tenderee3.replace(' ', ''):
            tenderee_match_cnt += 1
            if not tenderee1:
                tenderee_match_empty_cnt += 1
        if tenderee1 == tenderee_real:
            real_tenderee_match_cnt1 += 1
        if tenderee3 == tenderee_real:
            real_tenderee_match_cnt2 += 1

        result_list.append(
            [docid, data1, data2, data3] +
            col_match_list +
            [tenderee1, tenderee3, tenderee_real] +
            [str(tenderee1 == tenderee3), str(tenderee1 == tenderee_real), str(tenderee3 == tenderee_real)]
        )

    columns = [
        'docid', 'doubao', 'qwen', 'qwen-lora',
        '招标信息1', '招标信息2', '招标信息3', '招标信息相同',
        '中标信息1', '中标信息2', '中标信息3', '中标信息相同',
        '产品信息1', '产品信息2', '产品信息3', '产品信息相同',
        'tenderee_doubao', 'tenderee_qwen', '线上_tenderee',
        'doubao_qwen_相同', 'doubao_线上_相同', 'qwen_线上相同',
    ]
    # Passing columns to the constructor also works when there are no rows.
    df_new = pd.DataFrame(result_list, columns=columns)
    df_new.to_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260414_compare_only_tenderee_labeled.xlsx', index=False)

    total = len(tenderee_data_dict)
    denom = total or 1  # avoid ZeroDivisionError on an empty annotation set
    print('tenderee_match_cnt', tenderee_match_cnt, 'tenderee_match_empty_cnt', tenderee_match_empty_cnt)
    print('real_tenderee_match_cnt1', real_tenderee_match_cnt1, real_tenderee_match_cnt1 / denom)
    print('real_tenderee_match_cnt2', real_tenderee_match_cnt2, real_tenderee_match_cnt2 / denom)
    print('len(tenderee_data_dict)', total)
def compare_extract_csv_prefix2():
    """Score tenderee and winner extraction against a labelled workbook.

    Four sources are loaded from hard-coded paths: the annotation workbook
    (sheets 0-6), the doubao export, the qwen PSV export and the online CSV.
    For every labelled document the tenderee name and the first winner name of
    each source are compared with the label (and doubao with qwen); a
    comparison workbook is written and counts/accuracies are printed.

    The placeholders '文中无招标人', '-', '无' and None are treated as ''.
    Model and online values that are not strings are treated as '' too, and a
    missing or malformed section yields '' instead of raising.

    NOTE(review): this function was restored from a copy that had lost its
    indentation. Nesting of the ``*_match_empty_cnt`` counters under the
    corresponding match checks is inferred from their names -- confirm.
    """
    file_path1 = r'C:\Users\Administrator\Downloads\招标人_中标人_标注_260421 (1).xlsx'
    file_path2 = r'D:\BIDI_DOC\比地_文档\export_ai_260506_doubao.xlsx'
    file_path3 = r'D:\BIDI_DOC\比地_文档\export_ai_260506_0.8B_prefix_all.xlsx'
    file_path4 = r'D:\BIDI_DOC\比地_文档\招标人_中标人_标注_260421_tenderee_win_2.csv'
    placeholders = ('文中无招标人', '-', '无', None)

    def _rows(df):
        # NaN cells become None; the frame is returned as a list of row lists.
        return df.astype(object).where(pd.notnull(df), None).values.tolist()

    def _parse(raw, max_len=None):
        # Parse one exported cell into a dict; anything unusable becomes {}.
        if not isinstance(raw, str):
            return {}
        if max_len is not None and len(raw) >= max_len:
            return {}
        parsed = json_repair.loads(repair_json(raw))
        return parsed if isinstance(parsed, dict) else {}

    def _clean(value, require_str=True):
        # Map placeholders (and, optionally, non-strings) to ''.
        if value in placeholders:
            return ''
        if require_str and not isinstance(value, str):
            return ''
        return value

    def _dict_tenderee(extracted, tag):
        # 招标信息 stored as a dict: read 招标人名称 directly.
        # The default must be a dict: a list default has no .get().
        try:
            return extracted.get('招标信息', {}).get('招标人名称', "")
        except AttributeError:
            print(f'no 招标信息 招标人名称 {tag}', extracted)
            return ''

    def _first_item_field(extracted, section, key, message):
        # Section stored as a list of dicts: read `key` from the first entry.
        try:
            items = extracted.get(section, [])
            return items[0].get(key, "") if items else ''
        except (AttributeError, KeyError, IndexError, TypeError):
            print(message, type(extracted), extracted)
            traceback.print_exc()
            return ''

    data_list1 = []
    for sheet_no in range(0, 7):
        df1 = pd.read_excel(file_path1, sheet_name=sheet_no)
        data_list1 += _rows(df1)
    print('len(data_list1)', len(data_list1), data_list1[0] if data_list1 else None)

    data_list2 = _rows(pd.read_excel(file_path2))
    data_list3 = _rows(pd.read_excel(file_path3))
    data_list4 = _rows(pd.read_csv(file_path4))

    # Labels and online values are wrapped into the same JSON layout as the
    # model outputs so every source goes through one parsing path.
    data_dict_label = {
        int(x[0]): json.dumps(
            {'招标信息': {'招标人名称': x[3]}, '中标信息': [{'中标人名称': x[4]}]},
            ensure_ascii=False,
        )
        for x in data_list1
    }
    data_dict_doubao = {int(x[0]): x[2] for x in data_list2}
    data_dict_qwen = {
        int(x[0]): json.dumps(psv_to_dict(x[1]), ensure_ascii=False)
        for x in data_list3
    }
    data_dict_online = {
        int(x[0]): json.dumps(
            {'招标信息': {'招标人名称': x[1]}, '中标信息': [{'中标人名称': x[2]}]},
            ensure_ascii=False,
        )
        for x in data_list4
    }

    cols = ['招标信息', '中标信息', '产品信息']
    result_list = []
    tenderee_match_cnt = 0
    tenderee_match_empty_cnt = 0
    real_tenderee_match_cnt1 = 0
    real_tenderee_match_cnt2 = 0
    real_tenderee_match_cnt3 = 0
    win_match_cnt = 0
    win_match_empty_cnt = 0
    real_win_match_cnt1 = 0
    real_win_match_cnt2 = 0
    real_win_match_cnt3 = 0

    for docid, data1 in data_dict_label.items():
        data2 = data_dict_doubao.get(docid, '{}')
        data3 = data_dict_qwen.get(docid, '{}')
        data4 = data_dict_online.get(docid, '{}')

        extract_json1 = _parse(data1)
        # Only the doubao output is length-capped at 1000 characters.
        extract_json2 = _parse(data2, max_len=1000)
        extract_json3 = _parse(data3)
        extract_json4 = _parse(data4)

        # Per-section serialisations plus a 1/0 flag for source 1 == source 3.
        col_match_list = []
        for col in cols:
            str1 = json.dumps(extract_json1.get(col, ""), ensure_ascii=False)
            str2 = json.dumps(extract_json2.get(col, ""), ensure_ascii=False)
            str3 = json.dumps(extract_json3.get(col, ""), ensure_ascii=False)
            col_match_list += [str1, str2, str3, int(str1 == str3)]

        # --- tenderee -----------------------------------------------------
        tenderee_label = _clean(_dict_tenderee(extract_json1, 'extract_json1'), require_str=False)
        tenderee_doubao = _clean(_dict_tenderee(extract_json2, 'extract_json2'))
        # The qwen output keeps 招标信息 as a list; its first entry is used.
        tenderee_qwen = _clean(_first_item_field(
            extract_json3, '招标信息', '招标人名称', 'no 招标信息 招标人名称 extract_json3'))
        tenderee_online = _clean(_dict_tenderee(extract_json4, 'extract_json4'))

        if tenderee_doubao.replace(' ', '') == tenderee_qwen.replace(' ', ''):
            tenderee_match_cnt += 1
            if not tenderee_doubao:
                tenderee_match_empty_cnt += 1
        if tenderee_doubao == tenderee_label:
            real_tenderee_match_cnt1 += 1
        if tenderee_qwen == tenderee_label:
            real_tenderee_match_cnt2 += 1
        if tenderee_online == tenderee_label:
            real_tenderee_match_cnt3 += 1

        # --- winner (first entry of 中标信息 only) --------------------------
        win_label = _clean(_first_item_field(
            extract_json1, '中标信息', '中标人名称', 'no 中标人名称 extract_json1'), require_str=False)
        win_doubao = _clean(_first_item_field(
            extract_json2, '中标信息', '中标人名称', 'no 中标人名称 extract_json2'))
        win_qwen = _clean(_first_item_field(
            extract_json3, '中标信息', '中标人名称', 'no 中标人名称 extract_json3'))
        win_online = _clean(_first_item_field(
            extract_json4, '中标信息', '中标人名称', 'no 中标人名称 extract_json4'))

        if win_doubao.replace(' ', '') == win_qwen.replace(' ', ''):
            win_match_cnt += 1
            if not win_doubao:
                win_match_empty_cnt += 1
        if win_doubao == win_label:
            real_win_match_cnt1 += 1
        if win_qwen == win_label:
            real_win_match_cnt2 += 1
        if win_online == win_label:
            real_win_match_cnt3 += 1

        result_list.append(
            [docid, data1, data2, data3] +
            col_match_list +
            [tenderee_doubao, tenderee_qwen, tenderee_online, tenderee_label] +
            [str(tenderee_doubao == tenderee_qwen), str(tenderee_doubao == tenderee_label),
             str(tenderee_online == tenderee_label), str(tenderee_qwen == tenderee_label)] +
            [win_doubao, win_qwen, win_online, win_label] +
            [str(win_doubao == win_qwen), str(win_doubao == win_label),
             str(win_online == win_label), str(win_qwen == win_label)]
        )

    # NOTE(review): the raw columns headed 'doubao', 'qwen', 'qwen-lora' (and
    # the *1/*2/*3 section columns) actually hold label, doubao and qwen data
    # in this function, and the '*_相同' headers repeat for tenderee and
    # winner. Headers are kept unchanged so existing consumers still match.
    columns = [
        'docid', 'doubao', 'qwen', 'qwen-lora',
        '招标信息1', '招标信息2', '招标信息3', '招标信息相同',
        '中标信息1', '中标信息2', '中标信息3', '中标信息相同',
        '产品信息1', '产品信息2', '产品信息3', '产品信息相同',
        'tenderee_doubao', 'tenderee_qwen', 'tenderee_online', 'tenderee_label',
        'doubao_qwen_相同', 'doubao_标注_相同', '线上_标注_相同', 'qwen_标注_相同',
        'win_doubao', 'win_qwen', 'win_online', 'win_label',
        'doubao_qwen_相同', 'doubao_标注_相同', '线上_标注_相同', 'qwen_标注_相同',
    ]
    # Passing columns to the constructor also works when there are no rows.
    df_new = pd.DataFrame(result_list, columns=columns)
    df_new.to_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260506_compare_labeled.xlsx', index=False)

    denom = len(data_dict_label) or 1  # avoid ZeroDivisionError on empty labels
    print('tenderee_match_cnt', tenderee_match_cnt, 'tenderee_match_empty_cnt', tenderee_match_empty_cnt)
    print('real_tenderee_match_cnt1', real_tenderee_match_cnt1, real_tenderee_match_cnt1 / denom)
    print('real_tenderee_match_cnt2', real_tenderee_match_cnt2, real_tenderee_match_cnt2 / denom)
    print('real_tenderee_match_cnt3', real_tenderee_match_cnt3, real_tenderee_match_cnt3 / denom)
    print('win_match_cnt', win_match_cnt, 'win_match_empty_cnt', win_match_empty_cnt)
    print('real_win_match_cnt1', real_win_match_cnt1, real_win_match_cnt1 / denom)
    print('real_win_match_cnt2', real_win_match_cnt2, real_win_match_cnt2 / denom)
    print('real_win_match_cnt3', real_win_match_cnt3, real_win_match_cnt3 / denom)
def compare_extract_entity():
    """Measure tenderee accuracy on entity-level rows labelled '招标人'.

    Takes the first 200 rows of ``df_train.xlsx``; only rows whose label
    (column 4) is '招标人' are evaluated, using the entity text (column 1) as
    the expected tenderee. The doubao prediction (``df_train_doubao.xlsx``)
    and the prefix-format prediction are looked up by the row's key
    (column 2) and compared with the expected name. Mismatches are printed,
    followed by the match count, total and ratio for each predictor.

    A missing or malformed prediction counts as "no tenderee" ('-'), i.e. as
    a mismatch, instead of raising.
    """

    def _rows(df):
        # NaN cells become None; the frame is returned as a list of row lists.
        return df.astype(object).where(pd.notnull(df), None).values.tolist()

    def _tenderee_of(result):
        # Predicted 招标人名称, or '-' when absent, empty or malformed.
        if not isinstance(result, dict):
            return '-'
        bid_info = result.get('招标信息', {})
        if not isinstance(bid_info, dict):
            return '-'
        return bid_info.get('招标人名称') or '-'

    df1 = pd.read_excel('df_train.xlsx')
    df2 = pd.read_excel('df_train_doubao.xlsx')
    df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_entity_260414_0.8B_prefix_only_tenderee_all.xlsx')
    df1 = df1[:200]

    data_dict1 = {x[2]: x for x in _rows(df1)}
    data_dict2 = {x[0]: x[2] for x in _rows(df2)}
    data_dict3 = {x[0]: psv_to_dict_prefix(x[1]) for x in _rows(df3)}

    match_cnt_2 = 0
    match_cnt_3 = 0
    all_cnt = 0
    for docid, line in data_dict1.items():
        label = line[4]
        # Every other role (including '其他角色') is outside this evaluation.
        if label != '招标人':
            continue
        entity = line[1]
        tenderee1 = entity
        # Debug text for mismatches; built only for evaluated rows and
        # tolerant of empty cells.
        content = ''.join(str(part) for part in (line[5], entity, line[8]) if part is not None)

        raw2 = data_dict2.get(docid)
        result_dict2 = json_repair.loads(repair_json(raw2)) if isinstance(raw2, str) else None
        tenderee2 = _tenderee_of(result_dict2)
        if tenderee1 == tenderee2:
            match_cnt_2 += 1
        else:
            print('not match2', tenderee1, tenderee2)
            print('content', content)
            print('label', label, entity)

        tenderee3 = _tenderee_of(data_dict3.get(docid))
        if tenderee1 == tenderee3:
            match_cnt_3 += 1
        else:
            print('not match3', tenderee1, tenderee3)

        all_cnt += 1

    denom = all_cnt or 1  # avoid ZeroDivisionError when no row qualifies
    print('match_cnt_2', match_cnt_2, all_cnt, match_cnt_2 / denom)
    print('match_cnt_3', match_cnt_3, all_cnt, match_cnt_3 / denom)
if __name__ == '__main__':
    # Script entry point: exactly one comparison runs per invocation;
    # switch between them by (un)commenting the calls below.
    # compare_extract_csv()
    # compare_extract_csv_prefix()
    # compare_extract_entity()
    compare_extract_csv_prefix2()
|