'''
Created on 2018-12-20

@author: User

Evaluation script: joins the hand-labelled table ``articles_validation`` with
the model output table ``predict_validation`` on ``doc_id`` and prints
precision/recall style statistics for project code/name, roles, money and
contact persons.
'''
import psycopg2
import re
from decimal import Decimal
from BiddingKG.dl.common.Utils import getUnifyMoney

# NOTE(review): connection parameters are hard-coded; consider moving them to
# configuration.
conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
cursor = conn.cursor()

sql = "select A.prem,A.epc,B.prem,B.code,B.name,A.code,A.name,A.doc_id from articles_validation A,predict_validation B where A.doc_id=B.doc_id order by A.doc_id"
cursor.execute(sql)
rows = cursor.fetchall()

# Separator between records inside the prem/epc/code columns.
_RECORD_SEP = re.compile("[;;]")

# Role name -> role id string, as used on the labelled side.
_ROLE_IDS = {
    "tenderee": "0",
    "agency": "1",
    "win_tenderer": "2",
    "second_tenderer": "3",
    "third_tenderer": "4",
}


def getPackages(prem):
    '''
    @summary: Get all package names.
    @param:
        prem: iterable of already-split records; the first field of each
              record is taken as the package name
    @return: set of the first fields
    '''
    return {item[0] for item in prem}


def getRole_id(role_name):
    '''
    @summary: Get the ID corresponding to a role name.
    @param:
        role_name: one of "tenderee", "agency", "win_tenderer",
                   "second_tenderer", "third_tenderer"
    @return: the ID as a string ("0".."4"), or None for any other name
    '''
    return _ROLE_IDS.get(role_name)


def _ratio(numerator, denominator):
    '''
    @summary: Divide, returning 0.0 instead of raising when the denominator
              is zero (e.g. no rows, or nothing predicted for a category).
    '''
    if denominator == 0:
        return 0.0
    return numerator / denominator


CN_sum_predict = 0
CN_sum_label = 0
CN_predict_label = 0

role_sum_predict = 0
role_sum_label = 0
role_predict_label = 0

person_label = 0
person_predict = 0
person_label_predict = 0

money_label = 0
money_predict = 0
money_label_predict = 0

only_role_sum_predict = 0
only_role_sum_label = 0
only_role_predict_label = 0

only_money_label = 0
only_money_label_all = 0
only_money_predict = 0
only_money_label_predict = 0

only_person_label = 0
only_person_predict = 0
only_person_label_predict = 0

role_find = 0
role_find_label = 0
money_find = 0
money_find_label = 0
person_find = 0
person_find_label = 0


def getFind(doc_id,type):
    '''
    @summary: Get the distinct entity texts of the given type(s) found in one
              article (rows of ``predict_entity``).
    @param:
        doc_id: uuid of the article
        type: entity type(s), written as a SQL ``IN`` list literal such as
              "('org','company')" or "('money')"
    @return: set of entity texts; values are converted to Decimal when
             ``type`` mentions "money", otherwise kept as returned by the
             database
    '''
    # doc_id is passed as a bound parameter instead of being concatenated
    # into the statement. ``type`` is still interpolated because it is a SQL
    # list literal; it must only ever come from trusted in-file constants.
    query = "select distinct entity_text from predict_entity where doc_id=%s and entity_type in "+type+" "
    cursor.execute(query, (doc_id,))
    is_money = "money" in type
    result = set()
    for found in cursor.fetchall():
        if is_money:
            result.add(Decimal(found[0]))
        else:
            result.add(found[0])
    return result


for row in rows:
    doc_id = row[7]
    label_prem = row[0]
    label_epc = row[1]
    predict_prem = row[2]

    # Labelled values: records separated by ';', fields separated by '-'.
    list_label_prem = [item.split("-") for item in _RECORD_SEP.split(label_prem)]
    # Predicted values: records separated by ';', fields separated by '$'.
    list_predict_prem = [re.split(r"\$",item) for item in _RECORD_SEP.split(predict_prem)]

    # ---- project code and name ----
    hand_code = row[5]
    hand_name = row[6]
    predict_code = row[3]
    predict_name = row[4]
    set_hand_code = {item.strip() for item in _RECORD_SEP.split(hand_code) if item!=""}
    set_predict_code = {item.strip() for item in _RECORD_SEP.split(predict_code) if item!=""}
    CN_sum_label += len(set_hand_code)
    CN_sum_predict += len(set_predict_code)
    CN_predict_label += len(set_hand_code&set_predict_code)
    has_hand_name = hand_name is not None and hand_name!=""
    has_predict_name = predict_name is not None and predict_name!=""
    # The labelled name counts when the *name* is non-empty (the original
    # tested hand_code here by mistake).
    if has_hand_name:
        CN_sum_label += 1
    if has_predict_name:
        CN_sum_predict += 1
    # Guard against a NULL predicted name before calling .strip() on it.
    if has_hand_name and predict_name is not None and hand_name.strip()==predict_name.strip():
        CN_predict_label += 1

    # ---- roles ----
    set_packages = getPackages(list_label_prem)
    set_predict = set()
    set_label = set()
    set_only_predict = set()
    set_only_label = set()
    # With more than one package the package name is part of the match key;
    # otherwise only (role id, entity) is compared.
    multi_package = len(set_packages)>1
    for item in _RECORD_SEP.split(label_prem):
        if len(item)>1:
            item_split = item.split("-")
            entity = item_split[2].strip()
            if multi_package:
                set_label.add((item_split[0],item_split[1],entity))
            else:
                # Entity is stripped here too, so it can match the stripped
                # predicted entity (the original left it unstripped).
                set_label.add((item_split[1],entity))
            set_only_label.add(entity)
    for item in list_predict_prem:
        if len(item)>1:
            entity = item[3].strip()
            if multi_package:
                set_predict.add((item[0],getRole_id(item[2]),entity))
            else:
                set_predict.add((getRole_id(item[2]),entity))
            set_only_predict.add(entity)
    role_sum_predict += len(set_predict)
    role_sum_label += len(set_label)
    role_predict_label += len(set_predict&set_label)
    only_role_sum_label += len(set_only_label)
    only_role_sum_predict += len(set_only_predict)
    only_role_predict_label += len(set_only_label&set_only_predict)
    set_find_role = getFind(doc_id,"('org','company')")
    role_find += len(set_find_role)
    role_find_label += len(set_only_label&set_find_role)

    # ---- money ----
    set_money_label = set()
    set_money_predict = set()
    set_only_money_label = set()
    set_only_money_predict = set()
    # Number of "extra" amounts: an entity labelled with k>0 amounts
    # contributes k-1, so it is counted once in the label totals below.
    count_multi = 0
    for item in list_label_prem:
        if len(item)>3:
            if item[3]!="":
                count_multi_temp = 0
                for i in item[3].strip().split("、"):
                    label_money = getUnifyMoney(re.sub("[,,]",'',i))
                    # NOTE(review): indentation was reconstructed; the two
                    # adds are assumed to belong to this condition, matching
                    # the count_multi_temp bookkeeping.
                    if label_money>0:
                        count_multi_temp += 1
                        set_money_label.add((item[2].strip(),label_money))
                        set_only_money_label.add(label_money)
                if count_multi_temp>0:
                    count_multi += count_multi_temp -1
    money_label+=(len(set_money_label)-count_multi)
    only_money_label += len(set_only_money_label)-count_multi
    only_money_label_all += len(set_only_money_label)
    for item in list_predict_prem:
        if len(item)>2:
            # NOTE(review): field 4 equal to "0" is treated as "no money
            # predicted"; field 5 holds the amount - confirm against the
            # writer of predict_validation.
            if str(item[4])!="0":
                predicted_money = Decimal(item[5])
                set_money_predict.add((item[3].strip(),predicted_money))
                set_only_money_predict.add(predicted_money)
    money_predict+=len(set_money_predict)
    money_label_predict+= len(set_money_label&set_money_predict)
    only_money_predict+=len(set_only_money_predict)
    only_money_label_predict+= len(set_only_money_label&set_only_money_predict)
    set_money_find = getFind(doc_id,"('money')")
    money_find += len(set_money_find)
    money_find_label += len(set_money_find&set_only_money_label)
    # Disabled debug output (was a string-literal block in the original):
    # if len(set_money_label)-count_multi-len(set_money_label&set_money_predict)>=2:
    #     print(doc_id)
    #     print(set_money_label)
    #     print(set_money_predict)

    # ---- contact persons ----
    # Entities that appear in the labelled prem; stripped so that they can
    # match the stripped entity names from label_epc below.
    role_set = set()
    for item in list_label_prem:
        if len(item)>1:
            role_set.add(item[2].strip())
    list_epc_label = []
    for item1 in _RECORD_SEP.split(label_epc):
        item = item1.split(",")
        if len(item)>1:
            entity = item[0].strip()
            if entity in role_set:
                list_epc_label.append([entity,re.split("[/、]",item[1])])
    set_person_label = set()
    set_only_person_label = set()
    for entity, persons in list_epc_label:
        if len(persons)>0 and persons[0]!="":
            for i in persons:
                if i!="":
                    set_person_label.add(entity+i.strip())
                    set_only_person_label.add(i.strip())
    person_label += len(set_person_label)
    only_person_label += len(set_only_person_label)
    set_person_predict = set()
    set_only_person_predict = set()
    for item1 in list_predict_prem:
        if len(item1)>1:
            if item1[7]!="":
                for i in item1[7].split(","):
                    if i!="":
                        # Only the part before the first '/' is compared.
                        person = i.split("/")[0].strip()
                        set_person_predict.add(item1[3].strip()+person)
                        set_only_person_predict.add(person)
    person_predict += len(set_person_predict)
    person_label_predict += len(set_person_label&set_person_predict)
    only_person_predict += len(set_only_person_predict)
    only_person_label_predict += len(set_only_person_label&set_only_person_predict)
    set_person_find = getFind(doc_id,"('person')")
    person_find += len(set_person_find)
    person_find_label += len(set_person_find&set_only_person_label)

    # Print documents whose labelled and predicted person counts differ by
    # two or more, for manual inspection.
    if abs(len(set_only_person_label)-len(set_only_person_predict))>=2:
        print(doc_id)
        print(set_person_label)
        print(set_person_predict)

# All ratios go through _ratio so an empty denominator prints 0.0 instead of
# raising ZeroDivisionError.
print("LinkTrueBase")
print("编号名称:CN_sum_predict:%d,CN_sum_label:%d,CN_predict_label:%d,accurency:%f,recall:%f"%(CN_sum_predict,CN_sum_label,CN_predict_label,_ratio(CN_predict_label,CN_sum_predict),_ratio(CN_predict_label,CN_sum_label)))
print("角色:role_sum_predict:%d,role_sum_label:%d,role_predict_label:%d,accurency:%f,recall:%f"%(role_sum_predict,role_sum_label,role_predict_label,_ratio(role_predict_label,role_sum_predict),_ratio(role_predict_label,role_sum_label)))
print("金额:money_predict:%d,money_label:%d,money_label_predict:%d,acc:%f,recall:%f"%(money_predict,money_label,money_label_predict,_ratio(money_label_predict,money_predict),_ratio(money_label_predict,money_label)))
print("联系人:person_label:%d,person_predict:%d,person_label_predict:%d,acc:%f,recall:%f"%(person_label,person_predict,person_label_predict,_ratio(person_label_predict,person_predict),_ratio(person_label_predict,person_label)))
print("UserableBase")
print("角色:only_role_sum_predict:%d,only_role_sum_label:%d,only_role_predict_label:%d,accurency:%f,recall:%f"%(only_role_sum_predict,only_role_sum_label,only_role_predict_label,_ratio(only_role_predict_label,only_role_sum_predict),_ratio(only_role_predict_label,only_role_sum_label)))
print("金额:only_money_predict:%d,only_money_label:%d,only_money_label_predict:%d,acc:%f,recall:%f"%(only_money_predict,only_money_label,only_money_label_predict,_ratio(only_money_label_predict,only_money_predict),_ratio(only_money_label_predict,only_money_label)))
print("联系人:only_person_label:%d,only_person_predict:%d,only_person_label_predict:%d,acc:%f,recall:%f"%(only_person_label,only_person_predict,only_person_label_predict,_ratio(only_person_label_predict,only_person_predict),_ratio(only_person_label_predict,only_person_label)))
print("ExtractBase")
print("角色:role_find:%d,only_role_sum_label:%d,role_find_label:%d,accurency:%f,recall:%f"%(role_find,only_role_sum_label,role_find_label,_ratio(role_find_label,role_find),_ratio(role_find_label,only_role_sum_label)))
print("金额:money_find:%d,only_money_label:%d,money_find_label:%d,acc:%f,recall:%f"%(money_find,only_money_label_all,money_find_label,_ratio(money_find_label,money_find),_ratio(money_find_label,only_money_label_all)))
# The second value printed is only_person_label, so the label now says so
# (the original text said only_person_predict).
print("联系人:person_find:%d,only_person_label:%d,person_find_label:%d,acc:%f,recall:%f"%(person_find,only_person_label,person_find_label,_ratio(person_find_label,person_find),_ratio(person_find_label,only_person_label)))