"""Compare structured extraction results from several models.

Each ``compare_extract_*`` entry point reads spreadsheets of raw model
outputs from hard-coded paths, parses them into dicts, compares selected
fields between sources, prints summary counters and (for the ``csv``
variants) writes a side-by-side Excel report.
"""
import json
import re
import traceback
from glob import glob

import pandas as pd
from bs4 import BeautifulSoup
from json_repair import repair_json, json_repair

from get_data import psv_to_dict, psv_to_dict_prefix
from compare_utils import compare_products

# Sections of an extraction result that are compared section by section.
_COMPARE_COLS = ['招标信息', '中标信息', '产品信息']

# Report headers produced by _column_matches(): three JSON dumps plus a 0/1
# equality flag for every entry of _COMPARE_COLS.
_SECTION_COLUMNS = [
    '招标信息1', '招标信息2', '招标信息3', '招标信息相同',
    '中标信息1', '中标信息2', '中标信息3', '中标信息相同',
    '产品信息1', '产品信息2', '产品信息3', '产品信息相同',
]

# Raw outputs of the middle source at or above this length are treated as
# empty instead of being parsed.
_MAX_RAW_LEN = 1000


def _rows(df):
    """Return *df* as a list of row lists with missing cells set to None."""
    return df.astype(object).where(pd.notnull(df), None).values.tolist()


def _parse_extraction(raw, max_len=None):
    """Parse one raw model output into a dict.

    Non-string input (for example an empty spreadsheet cell, which arrives
    as None), input whose length reaches *max_len*, and input that does not
    decode to a JSON object all yield an empty dict.
    """
    if not isinstance(raw, str):
        return {}
    if max_len is not None and len(raw) >= max_len:
        return {}
    parsed = json_repair.loads(repair_json(raw))
    return parsed if isinstance(parsed, dict) else {}


def _column_matches(first, second, third):
    """Build the report cells for every section in _COMPARE_COLS.

    For each section the three JSON dumps are emitted followed by 1 when the
    first and third dumps are identical and 0 otherwise.
    """
    cells = []
    for col in _COMPARE_COLS:
        str1 = json.dumps(first.get(col, ""), ensure_ascii=False)
        str2 = json.dumps(second.get(col, ""), ensure_ascii=False)
        str3 = json.dumps(third.get(col, ""), ensure_ascii=False)
        cells += [str1, str2, str3, int(str1 == str3)]
    return cells


def _section_field(record, section, field, warn):
    """Return ``record[section][field]`` when the section is a dict.

    A missing section or field gives ''.  A section of any other shape
    prints *warn* with the record and also gives ''.
    """
    info = record.get(section, {})
    if isinstance(info, dict):
        return info.get(field, "")
    print(warn, record)
    return ''


def _first_item_field(record, section, field, warn):
    """Return *field* of the first dict in the list ``record[section]``.

    A missing or empty section gives ''.  A section that is not a list of
    dicts prints *warn* with the record and also gives ''.
    """
    items = record.get(section, [])
    if not items:
        return ''
    if isinstance(items, list) and isinstance(items[0], dict):
        return items[0].get(field, "")
    print(warn, record)
    return ''


def _joined_winner_names(record, warn):
    """Return the distinct winner names of *record*, sorted and comma-joined.

    When the winner section cannot be processed (not a list of dicts, or
    names that cannot be sorted/joined as strings) *warn* is printed with
    the record and '' is returned.
    """
    try:
        names = {x.get('中标人名称', "") for x in record.get('中标信息', [])}
        return ','.join(sorted(names))
    except (AttributeError, TypeError):
        print(warn, record)
        return ''


def _blank_placeholder(value, placeholders, require_str=False):
    """Map placeholder values (and optionally any non-string) to ''."""
    if value in placeholders:
        return ''
    if require_str and not isinstance(value, str):
        return ''
    return value


def _squash_spaces(text):
    """Remove ASCII spaces from *text*; non-string values count as ''."""
    if not isinstance(text, str):
        return ''
    return re.sub(' ', '', text)


def _ratio(count, total):
    """Return ``count / total``, or 0.0 when *total* is zero."""
    return count / total if total else 0.0


def compare_extract_csv():
    """Compare doubao, qwen and qwen-lora extractions over all documents.

    Reads the four input files from hard-coded paths, compares tenderee,
    winner and agency names between the first and third source, scores the
    product lists with ``compare_products``, writes a side-by-side Excel
    report and prints the aggregated counters.
    """
    df_tenderee = pd.read_csv(r'D:\BIDI_DOC\比地_文档\export_ai_260323_2_tenderee.csv')
    df1 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260323_1.xlsx')
    df2 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260323_3.xlsx')
    # Export of the model run under evaluation; swap this path to compare
    # another run.
    df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260423_0.8B_prefix_all.xlsx')

    tenderee_data_dict = {int(x[0]): x[1] for x in _rows(df_tenderee)}
    data_dict1 = {int(x[0]): x[1] for x in _rows(df1)}
    data_dict2 = {int(x[0]): x[1] for x in _rows(df2)}
    data_dict3 = {int(x[0]): psv_to_dict(x[1]) for x in _rows(df3)}

    # Fold the separate contact section into the first tender-info record so
    # that '招标信息' becomes a single dict.  Records without both sections,
    # or with an empty/unexpected tender-info list, are left untouched.
    for v_dict in data_dict3.values():
        if not isinstance(v_dict, dict):
            continue
        v1 = v_dict.get('招标信息')
        v2 = v_dict.get('招标人联系方式')
        if v1 is None or v2 is None:
            continue
        if not isinstance(v1, list) or not v1:
            continue
        first = v1[0]
        if not first or not isinstance(first, dict):
            continue
        first['招标人联系方式'] = v2
        v_dict['招标信息'] = first

    data_dict3 = {
        docid: json.dumps(v_dict, ensure_ascii=False)
        for docid, v_dict in data_dict3.items()
    }

    result_list = []
    tenderee_match_cnt = 0
    tenderee_match_empty_cnt = 0
    real_tenderee_match_cnt1 = 0
    real_tenderee_match_cnt2 = 0
    win_tenderer_match_cnt = 0
    agency_match_cnt = 0
    all_product_weight_score = 0
    all_product_complete_score = 0
    all_product_cnt_score = 0

    for docid, data1 in data_dict1.items():
        data2 = data_dict2.get(docid, '{}')
        data3 = data_dict3.get(docid, '{}')

        extract_json1 = _parse_extraction(data1)
        extract_json2 = _parse_extraction(data2, max_len=_MAX_RAW_LEN)
        extract_json3 = _parse_extraction(data3)

        col_match_list = _column_matches(extract_json1, extract_json2, extract_json3)

        tenderee_real = tenderee_data_dict.get(docid)
        tenderee1 = _section_field(
            extract_json1, '招标信息', '招标人名称',
            'no 招标信息 招标人名称 extract_json1')
        tenderee3 = _section_field(
            extract_json3, '招标信息', '招标人名称',
            'no 招标信息 招标人名称 extract_json3')
        if _squash_spaces(tenderee1) == _squash_spaces(tenderee3):
            tenderee_match_cnt += 1
            # Matches where both sides are empty are tracked separately.
            if not tenderee1:
                tenderee_match_empty_cnt += 1
        if tenderee1 == tenderee_real:
            real_tenderee_match_cnt1 += 1
        if tenderee3 == tenderee_real:
            real_tenderee_match_cnt2 += 1

        win_tenderer1 = _joined_winner_names(
            extract_json1, 'no 中标信息 中标人名称 extract_json1')
        win_tenderer3 = _joined_winner_names(
            extract_json3, 'no 中标信息 中标人名称 extract_json3')
        if win_tenderer1 == win_tenderer3:
            print('win_tenderer1 == win_tenderer3', win_tenderer1, win_tenderer3)
            win_tenderer_match_cnt += 1

        agency1 = _section_field(
            extract_json1, '招标信息', '代理人名称',
            'no 招标信息 代理人名称 extract_json1')
        agency3 = _section_field(
            extract_json3, '招标信息', '代理人名称',
            'no 招标信息 代理人名称 extract_json3')
        if agency1 == agency3:
            agency_match_cnt += 1

        # Score how well the two product lists agree.
        products1 = extract_json1.get('产品信息')
        products3 = extract_json3.get('产品信息')
        product_weight_score, product_complete_score, product_cnt_score = \
            compare_products(products1, products3)
        all_product_weight_score += product_weight_score
        all_product_complete_score += product_complete_score
        all_product_cnt_score += product_cnt_score

        result_list.append(
            [docid, data1, data2, data3]
            + col_match_list
            + [tenderee1, tenderee3, tenderee_real]
            + [str(tenderee1 == tenderee3), str(tenderee1 == tenderee_real),
               str(tenderee3 == tenderee_real)]
            + [win_tenderer1, win_tenderer3, str(win_tenderer1 == win_tenderer3)]
            + [agency1, agency3, str(agency1 == agency3)]
        )

    # NOTE(review): 'doubao_qwen_相同' is used for three different columns;
    # the headers are kept as they are so existing reports stay comparable.
    columns = ['docid', 'doubao', 'qwen', 'qwen-lora'] + _SECTION_COLUMNS + [
        'tenderee_doubao', 'tenderee_qwen', '线上_tenderee',
        'doubao_qwen_相同', 'doubao_线上_相同', 'qwen_线上相同',
        'win_doubao', 'win_qwen', 'doubao_qwen_相同',
        'agency_doubao', 'agency_qwen', 'doubao_qwen_相同',
    ]
    # Passing the headers to the constructor also works for an empty result.
    df_new = pd.DataFrame(result_list, columns=columns)
    df_new.to_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260423_compare_doubao_0.8B_prefix.xlsx', index=False)

    all_cnt = len(data_dict1)
    print('tenderee_match_cnt', tenderee_match_cnt, _ratio(tenderee_match_cnt, all_cnt),
          'tenderee_match_empty_cnt', tenderee_match_empty_cnt)
    print('real_tenderee_match_cnt1', real_tenderee_match_cnt1, _ratio(real_tenderee_match_cnt1, all_cnt))
    print('real_tenderee_match_cnt2', real_tenderee_match_cnt2, _ratio(real_tenderee_match_cnt2, all_cnt))
    print('win_tenderer_match_cnt', win_tenderer_match_cnt, _ratio(win_tenderer_match_cnt, all_cnt))
    print('agency_match_cnt', agency_match_cnt, _ratio(agency_match_cnt, all_cnt))
    print('product_weight_score', _ratio(all_product_weight_score, all_cnt))
    print('product_complete_score', _ratio(all_product_complete_score, all_cnt))
    print('product_cnt_score', _ratio(all_product_cnt_score, all_cnt))


def compare_extract_csv_prefix():
    """Compare tenderee names against the labelled tenderee workbook.

    Iterates over the labelled documents (three sheets of one workbook),
    compares the tenderee name of the first and third source with each other
    and with the label, writes an Excel report and prints the counters.
    """
    label_path = r'D:\BIDI_DOC\比地_文档\招标人标注_260414.xlsx'
    df_tenderee = pd.concat(
        [pd.read_excel(label_path, sheet_name=sheet_no) for sheet_no in range(3)],
        ignore_index=True)
    df1 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260323_1.xlsx')
    df2 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260323_3.xlsx')
    df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260413_0.8B_prefix_only_tenderee_all.xlsx')

    tenderee_data_dict = {int(x[0]): x[1] for x in _rows(df_tenderee)}
    data_dict1 = {int(x[0]): x[1] for x in _rows(df1)}
    data_dict2 = {int(x[0]): x[1] for x in _rows(df2)}
    data_dict3 = {
        int(x[0]): json.dumps(psv_to_dict_prefix(x[1]), ensure_ascii=False)
        for x in _rows(df3)
    }

    # Values that stand for "no tenderee" and are compared as ''.
    placeholders = ('文中无招标人', '-')

    result_list = []
    tenderee_match_cnt = 0
    tenderee_match_empty_cnt = 0
    real_tenderee_match_cnt1 = 0
    real_tenderee_match_cnt2 = 0

    for docid, tenderee_real in tenderee_data_dict.items():
        data1 = data_dict1.get(docid, '{}')
        data2 = data_dict2.get(docid, '{}')
        data3 = data_dict3.get(docid, '{}')

        extract_json1 = _parse_extraction(data1)
        extract_json2 = _parse_extraction(data2, max_len=_MAX_RAW_LEN)
        extract_json3 = _parse_extraction(data3)

        col_match_list = _column_matches(extract_json1, extract_json2, extract_json3)

        tenderee_real = _blank_placeholder(tenderee_real, placeholders)
        tenderee1 = _blank_placeholder(
            _section_field(extract_json1, '招标信息', '招标人名称',
                           'no 招标信息 招标人名称 extract_json1'),
            placeholders)
        tenderee3 = _blank_placeholder(
            _section_field(extract_json3, '招标信息', '招标人名称',
                           'no 招标信息 招标人名称 extract_json3'),
            placeholders)

        if _squash_spaces(tenderee1) == _squash_spaces(tenderee3):
            tenderee_match_cnt += 1
            if not tenderee1:
                tenderee_match_empty_cnt += 1
        if tenderee1 == tenderee_real:
            real_tenderee_match_cnt1 += 1
        if tenderee3 == tenderee_real:
            real_tenderee_match_cnt2 += 1

        result_list.append(
            [docid, data1, data2, data3]
            + col_match_list
            + [tenderee1, tenderee3, tenderee_real]
            + [str(tenderee1 == tenderee3), str(tenderee1 == tenderee_real),
               str(tenderee3 == tenderee_real)]
        )

    columns = ['docid', 'doubao', 'qwen', 'qwen-lora'] + _SECTION_COLUMNS + [
        'tenderee_doubao', 'tenderee_qwen', '线上_tenderee',
        'doubao_qwen_相同', 'doubao_线上_相同', 'qwen_线上相同',
    ]
    df_new = pd.DataFrame(result_list, columns=columns)
    df_new.to_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260414_compare_only_tenderee_labeled.xlsx', index=False)

    all_cnt = len(tenderee_data_dict)
    print('tenderee_match_cnt', tenderee_match_cnt, 'tenderee_match_empty_cnt', tenderee_match_empty_cnt)
    print('real_tenderee_match_cnt1', real_tenderee_match_cnt1, _ratio(real_tenderee_match_cnt1, all_cnt))
    print('real_tenderee_match_cnt2', real_tenderee_match_cnt2, _ratio(real_tenderee_match_cnt2, all_cnt))
    print('len(tenderee_data_dict)', all_cnt)


def compare_extract_csv_prefix2():
    """Compare doubao, qwen and online results against manual labels.

    Labels are read from the first seven sheets of the label workbook.  For
    every labelled document the tenderee and the first winner of each source
    are compared with each other and with the label; an Excel report is
    written and the counters are printed.
    """
    file_path1 = r'C:\Users\Administrator\Downloads\招标人_中标人_标注_260421 (1).xlsx'
    file_path2 = r'D:\BIDI_DOC\比地_文档\export_ai_260506_doubao.xlsx'
    file_path3 = r'D:\BIDI_DOC\比地_文档\export_ai_260506_0.8B_prefix_all.xlsx'
    file_path4 = r'D:\BIDI_DOC\比地_文档\招标人_中标人_标注_260421_tenderee_win_2.csv'

    data_list1 = []
    for sheet_no in range(0, 7):
        data_list1 += _rows(pd.read_excel(file_path1, sheet_name=sheet_no))
    print('len(data_list1)', len(data_list1), data_list1[0] if data_list1 else None)

    data_list2 = _rows(pd.read_excel(file_path2))
    data_list3 = _rows(pd.read_excel(file_path3))
    data_list4 = _rows(pd.read_csv(file_path4))

    # Labels and online results are wrapped into the same JSON layout as the
    # model outputs so that all four sources go through one parsing path.
    data_dict_label = {
        int(x[0]): json.dumps(
            {'招标信息': {'招标人名称': x[3]}, '中标信息': [{'中标人名称': x[4]}]},
            ensure_ascii=False)
        for x in data_list1
    }
    data_dict_doubao = {int(x[0]): x[2] for x in data_list2}
    data_dict_qwen = {
        int(x[0]): json.dumps(psv_to_dict(x[1]), ensure_ascii=False)
        for x in data_list3
    }
    data_dict_online = {
        int(x[0]): json.dumps(
            {'招标信息': {'招标人名称': x[1]}, '中标信息': [{'中标人名称': x[2]}]},
            ensure_ascii=False)
        for x in data_list4
    }

    # Values that stand for "nothing found" and are compared as ''.
    placeholders = ('文中无招标人', '-', '无', None)

    result_list = []
    tenderee_match_cnt = 0
    tenderee_match_empty_cnt = 0
    real_tenderee_match_cnt1 = 0
    real_tenderee_match_cnt2 = 0
    real_tenderee_match_cnt3 = 0
    win_match_cnt = 0
    win_match_empty_cnt = 0
    real_win_match_cnt1 = 0
    real_win_match_cnt2 = 0
    real_win_match_cnt3 = 0

    for docid, data1 in data_dict_label.items():
        data2 = data_dict_doubao.get(docid, '{}')
        data3 = data_dict_qwen.get(docid, '{}')
        data4 = data_dict_online.get(docid, '{}')

        extract_json1 = _parse_extraction(data1)
        extract_json2 = _parse_extraction(data2, max_len=_MAX_RAW_LEN)
        extract_json3 = _parse_extraction(data3)
        extract_json4 = _parse_extraction(data4)

        col_match_list = _column_matches(extract_json1, extract_json2, extract_json3)

        # Label, doubao and online keep the tender info as one dict, while
        # the qwen output is read as a list of records (first one is used).
        tenderee_label = _blank_placeholder(
            _section_field(extract_json1, '招标信息', '招标人名称',
                           'no 招标信息 招标人名称 extract_json1'),
            placeholders)
        tenderee_doubao = _blank_placeholder(
            _section_field(extract_json2, '招标信息', '招标人名称',
                           'no 招标信息 招标人名称 extract_json2'),
            placeholders, require_str=True)
        tenderee_qwen = _blank_placeholder(
            _first_item_field(extract_json3, '招标信息', '招标人名称',
                              'no 招标信息 招标人名称 extract_json3'),
            placeholders, require_str=True)
        # A document missing from the online file yields an empty record and
        # therefore an empty name instead of an AttributeError.
        tenderee_online = _blank_placeholder(
            _section_field(extract_json4, '招标信息', '招标人名称',
                           'no 招标信息 招标人名称 extract_json4'),
            placeholders, require_str=True)

        if _squash_spaces(tenderee_doubao) == _squash_spaces(tenderee_qwen):
            tenderee_match_cnt += 1
            if not tenderee_doubao:
                tenderee_match_empty_cnt += 1
        if tenderee_doubao == tenderee_label:
            real_tenderee_match_cnt1 += 1
        if tenderee_qwen == tenderee_label:
            real_tenderee_match_cnt2 += 1
        if tenderee_online == tenderee_label:
            real_tenderee_match_cnt3 += 1

        # Only the first winner of every source takes part in the comparison.
        win_label = _blank_placeholder(
            _first_item_field(extract_json1, '中标信息', '中标人名称',
                              'no 中标人名称 extract_json1'),
            placeholders)
        win_doubao = _blank_placeholder(
            _first_item_field(extract_json2, '中标信息', '中标人名称',
                              'no 中标人名称 extract_json2'),
            placeholders, require_str=True)
        win_qwen = _blank_placeholder(
            _first_item_field(extract_json3, '中标信息', '中标人名称',
                              'no 中标人名称 extract_json3'),
            placeholders, require_str=True)
        win_online = _blank_placeholder(
            _first_item_field(extract_json4, '中标信息', '中标人名称',
                              'no 中标人名称 extract_json4'),
            placeholders)

        if _squash_spaces(win_doubao) == _squash_spaces(win_qwen):
            win_match_cnt += 1
            if not win_doubao:
                win_match_empty_cnt += 1
        if win_doubao == win_label:
            real_win_match_cnt1 += 1
        if win_qwen == win_label:
            real_win_match_cnt2 += 1
        if win_online == win_label:
            real_win_match_cnt3 += 1

        result_list.append(
            [docid, data1, data2, data3]
            + col_match_list
            + [tenderee_doubao, tenderee_qwen, tenderee_online, tenderee_label]
            + [str(tenderee_doubao == tenderee_qwen), str(tenderee_doubao == tenderee_label),
               str(tenderee_online == tenderee_label), str(tenderee_qwen == tenderee_label)]
            + [win_doubao, win_qwen, win_online, win_label]
            + [str(win_doubao == win_qwen), str(win_doubao == win_label),
               str(win_online == win_label), str(win_qwen == win_label)]
        )

    # NOTE(review): the first data columns actually hold label / doubao /
    # qwen (not doubao / qwen / qwen-lora) and several '*_相同' headers are
    # duplicated; the headers are kept unchanged so existing reports stay
    # comparable.
    columns = ['docid', 'doubao', 'qwen', 'qwen-lora'] + _SECTION_COLUMNS + [
        'tenderee_doubao', 'tenderee_qwen', 'tenderee_online', 'tenderee_label',
        'doubao_qwen_相同', 'doubao_标注_相同', '线上_标注_相同', 'qwen_标注_相同',
        'win_doubao', 'win_qwen', 'win_online', 'win_label',
        'doubao_qwen_相同', 'doubao_标注_相同', '线上_标注_相同', 'qwen_标注_相同',
    ]
    df_new = pd.DataFrame(result_list, columns=columns)
    df_new.to_excel(r'D:\BIDI_DOC\比地_文档\export_ai_260506_compare_labeled.xlsx', index=False)

    all_cnt = len(data_dict_label)
    print('tenderee_match_cnt', tenderee_match_cnt, 'tenderee_match_empty_cnt', tenderee_match_empty_cnt)
    print('real_tenderee_match_cnt1', real_tenderee_match_cnt1, _ratio(real_tenderee_match_cnt1, all_cnt))
    print('real_tenderee_match_cnt2', real_tenderee_match_cnt2, _ratio(real_tenderee_match_cnt2, all_cnt))
    print('real_tenderee_match_cnt3', real_tenderee_match_cnt3, _ratio(real_tenderee_match_cnt3, all_cnt))
    print('win_match_cnt', win_match_cnt, 'win_match_empty_cnt', win_match_empty_cnt)
    print('real_win_match_cnt1', real_win_match_cnt1, _ratio(real_win_match_cnt1, all_cnt))
    print('real_win_match_cnt2', real_win_match_cnt2, _ratio(real_win_match_cnt2, all_cnt))
    print('real_win_match_cnt3', real_win_match_cnt3, _ratio(real_win_match_cnt3, all_cnt))


def compare_extract_entity():
    """Check predicted tenderee names on the first 200 training rows.

    Only rows whose label column equals '招标人' are evaluated.  For each of
    them the labelled entity is compared with the tenderee name found in the
    doubao output and in the prefix-model output; mismatches are printed and
    the match counts and ratios are reported at the end.
    """
    df1 = pd.read_excel('df_train.xlsx')
    df2 = pd.read_excel('df_train_doubao.xlsx')
    df3 = pd.read_excel(r'D:\BIDI_DOC\比地_文档\export_entity_260414_0.8B_prefix_only_tenderee_all.xlsx')
    df1 = df1[:200]

    data_dict1 = {x[2]: x for x in _rows(df1)}
    data_dict2 = {x[0]: x[2] for x in _rows(df2)}
    data_dict3 = {x[0]: psv_to_dict_prefix(x[1]) for x in _rows(df3)}

    match_cnt_2 = 0
    match_cnt_3 = 0
    all_cnt = 0
    for docid, line in data_dict1.items():
        label = line[4]
        entity = line[1]
        # Rows labelled '其他角色' or any other non-tenderee role are skipped.
        if label != '招标人':
            continue
        tenderee1 = entity

        # A missing or unparsable prediction counts as "no tenderee" ('-').
        result_dict2 = _parse_extraction(data_dict2.get(docid))
        tenderee2 = _section_field(
            result_dict2, '招标信息', '招标人名称',
            'no 招标信息 招标人名称 result_dict2') or '-'
        if tenderee1 == tenderee2:
            match_cnt_2 += 1
        else:
            # The surrounding text is only assembled when it is printed;
            # empty cells are left out of the concatenation.
            content = ''.join(
                str(part) for part in (line[5], line[1], line[8]) if part is not None)
            print('not match2', tenderee1, tenderee2)
            print('content', content)
            print('label', label, entity)

        result_dict3 = data_dict3.get(docid)
        if not isinstance(result_dict3, dict):
            result_dict3 = {}
        tenderee3 = _section_field(
            result_dict3, '招标信息', '招标人名称',
            'no 招标信息 招标人名称 result_dict3') or '-'
        if tenderee1 == tenderee3:
            match_cnt_3 += 1
        else:
            print('not match3', tenderee1, tenderee3)

        all_cnt += 1

    print('match_cnt_2', match_cnt_2, all_cnt, _ratio(match_cnt_2, all_cnt))
    print('match_cnt_3', match_cnt_3, all_cnt, _ratio(match_cnt_3, all_cnt))


if __name__ == '__main__':
    # Other available entry points:
    #   compare_extract_csv()
    #   compare_extract_csv_prefix()
    #   compare_extract_entity()
    compare_extract_csv_prefix2()