'''
Created on 2019-05-15

@author: User

Evaluation script for the BiddingKG extraction pipeline.

It runs the extraction over a labelled data set, either in-process
(``run_predict``) or through the HTTP extraction service
(``run_predict_interface``), stores the predictions next to the labels, and
prints per-field match counts and ratios (``getAccRecall``).
'''

from bs4 import BeautifulSoup, Comment
import copy
import re
import sys
import os
import codecs
import requests
import json
sys.path.append(os.path.abspath("../.."))
import fool
from BiddingKG.dl.interface.Connection import *
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.interface.Connection import getConnection
import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing
import BiddingKG.dl.interface.getAttributes as getAttributes

# Full-width punctuation is written as escapes so the characters survive tools
# that normalise Unicode.
_FULLWIDTH_LEFT_PAREN = "\uff08"
_FULLWIDTH_RIGHT_PAREN = "\uff09"
# NOTE(review): the original character class was "[;;]" (ASCII semicolon
# twice), presumably a mangled full-width semicolon; both forms are accepted.
_SEMICOLON_SPLIT = re.compile("[;\uff1b]")
_WHITESPACE = re.compile(r"\s")

# Role index used in the label file -> role name used in the predictions.
_ROLE_ID = {
    0: "tenderee",
    1: "agency",
    2: "win_tenderer",
    3: "second_tenderer",
    4: "third_tenderer",
}


def _strip_whitespace(text):
    """Return *text* with every whitespace character removed."""
    return _WHITESPACE.sub("", text)


def _normalize_label_prem(text):
    """Strip whitespace from a label "prem" string and unify parentheses.

    The original code called ``re.sub("(", "(", ...)``, which is an invalid
    pattern and raises ``re.error``. The presumed intent is to map full-width
    parentheses to ASCII ones, so a literal replacement is used instead.
    """
    cleaned = _strip_whitespace(text)
    cleaned = cleaned.replace(_FULLWIDTH_LEFT_PAREN, "(")
    return cleaned.replace(_FULLWIDTH_RIGHT_PAREN, ")")


def _split_semicolons(text, ignored=()):
    """Split *text* on semicolons; drop empty parts and parts in *ignored*."""
    return {
        part
        for part in _SEMICOLON_SPLIT.split(text)
        if part != "" and part not in ignored
    }


def _predicted_codes(value):
    """Return the set of predicted project codes.

    *value* is normally a list of codes. ``run_predict`` stores ``""`` when
    nothing was found, so a plain string is used as-is instead of being
    joined character by character.
    """
    joined = value if isinstance(value, str) else ";".join(value)
    return _split_semicolons(joined)


def _predicted_names(value):
    """Return the set of predicted project names.

    A list of names becomes a set directly; a single string is split on
    semicolons instead of being broken up into characters.
    """
    if isinstance(value, str):
        return _split_semicolons(value)
    return set(value)


def _sum_matching(count_list, fragments):
    """Sum the counts whose key contains every string in *fragments*.

    Matching is by substring, so an empty fragment matches every key.
    """
    total = 0
    for key, value in count_list:
        if all(str(key).find(fragment) >= 0 for fragment in fragments):
            total += value
    return total


def run_predict():
    """Run the in-process predictors over the labelled set and save the result.

    Loads ``label_0_1197.pk``, runs code/name, PREM, role-rule and EPC
    prediction for every article, prints the labelled entities that were not
    extracted as ``org``/``company`` entities, stores ``predict_code``,
    ``predict_name`` and ``predict_prem`` on each item, and saves everything to
    ``val_selffool7.pk``.
    """
    data = load("label_0_1197.pk")
    codeNamePredict = predictor.CodeNamePredict()
    premPredict = predictor.PREMPredict()
    epcPredict = predictor.EPCPredict()
    roleRulePredict = predictor.RoleRulePredictor()
    count = 0
    not_find_count = 0
    list_filename_index_notfound = []
    for item in data:
        count += 1
        print(count, not_find_count, len(data))
        list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(
            [[item["filename"], item["content"], "", "", ""]], useselffool=True)
        codeName = codeNamePredict.predict(list_articles)
        premPredict.predict(list_sentences, list_entitys)
        roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
        epcPredict.predict(list_sentences, list_entitys)
        prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
        print(prem)

        # Third "-"-separated field of every label record is the entity name.
        label_prem = _normalize_label_prem(item["prem"])
        set_label_entitys = set()
        for record in label_prem.split(";"):
            fields = record.split("-")
            if len(fields) > 2:
                set_label_entitys.add(fields[2])

        set_extract_entitys = {
            entity.entity_text
            for list_entity in list_entitys
            for entity in list_entity
            if entity.entity_type in ["org", "company"]
        }

        missing = set_label_entitys - set_extract_entitys
        not_find_count += len(missing)
        print(item["filename"], missing)
        list_filename_index_notfound.append([item["filename"], count, missing])

        if len(codeName) > 0:
            item["predict_code"] = codeName[0][1]["code"]
            item["predict_name"] = codeName[0][1]["name"]
        else:
            item["predict_code"] = ""
            item["predict_name"] = ""
        if len(prem) > 0:
            item["predict_prem"] = prem[0][1]["prem"]
        else:
            item["predict_prem"] = ""

    for entry in list_filename_index_notfound:
        print(entry)
    save(data, "val_selffool7.pk")


def run_predict_interface():
    """Run the labelled set through the HTTP extraction service and save it.

    Posts every article of ``label_0_1197.pk`` to ``/article_extract``, stores
    the returned ``code``, ``name`` and ``prem`` on the item, and saves the
    data to ``val_interface.pk``.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
    """
    data = load("label_0_1197.pk")
    count = 0
    not_find_count = 0  # never updated here; kept so the progress line is unchanged
    myheaders = {'Content-Type': 'application/json'}
    guardian_base2 = 'http://192.168.2.101:15011'
    for item in data:
        count += 1
        user = {
            "content": item["content"],
            "id": item["filename"],
        }
        _resp = requests.post(guardian_base2 + '/article_extract', json=user,
                              headers=myheaders, verify=True)
        # Fail with the HTTP status instead of a confusing JSON/KeyError later.
        _resp.raise_for_status()
        obj_json = json.loads(_resp.content.decode("utf-8"))
        print(count, not_find_count, len(data))
        item["predict_code"] = obj_json["code"]
        item["predict_name"] = obj_json["name"]
        item["predict_prem"] = obj_json["prem"]
    save(data, "val_interface.pk")


def getAccRecall():
    """Compare saved predictions with the labels and print the statistics.

    Reads ``val_interface.pk`` (predictions) and ``label_0_1197.pk`` (labels),
    pairing the items positionally.

    A label "prem" string is a ";"-separated list of records; each record is a
    "-"-separated list read as: package name, role index, entity, money values
    (joined by "、"), persons (joined by "|", each "name/...").
    A predicted prem is iterated as rows indexed as: [0] package, [2] role
    name, [3] entity, [4] money, [5] persons (each person indexed at [0]).

    For code, name, each role, money and person the function counts matches
    ("coincidence"), label-only values ("label") and prediction-only values
    ("predict"), prints the positive counts, then prints
    coincidence/(predict+coincidence) as "acc" and
    coincidence/(label+coincidence) as "recall". The per-file role mismatches
    are saved to ``list_filename_not_label_predict_3.pk``.

    Label records with too few fields for a given statistic are skipped for
    that statistic.
    """
    # data = load("val_selffool7.pk")
    data = load("val_interface.pk")
    data_label = load("label_0_1197.pk")

    roles = [0, 1, 2, 3, 4, "code", "name", "money", "person"]
    models = ["", "role"]
    kinds = ["coincidence", "label", "predict"]
    count_dict = {
        kind + "_" + str(model) + "_" + str(role): 0
        for kind in kinds
        for model in models
        for role in roles
    }

    list_filename_not_label_predict = []
    for item, item_label in zip(data, data_label):
        label_code = _strip_whitespace(item_label["code"])
        label_name = _strip_whitespace(item_label["name"])
        label_prem = _normalize_label_prem(item_label["prem"])
        predict_prem = item["predict_prem"]
        label_records = [record.split("-") for record in label_prem.split(";")]

        # count code ('1' in the label is ignored, as in the original filter)
        set_label_code = _split_semicolons(label_code, ignored=("1",))
        set_predict_code = _predicted_codes(item["predict_code"])
        count_dict["coincidence__code"] += len(set_label_code & set_predict_code)
        count_dict["label__code"] += len(set_label_code - set_predict_code)
        count_dict["predict__code"] += len(set_predict_code - set_label_code)

        # count name: one hit per article if any name matches
        set_label_name = _split_semicolons(label_name, ignored=("1",))
        set_predict_name = _predicted_names(item["predict_name"])
        if len(set_label_name & set_predict_name) > 0:
            count_dict["coincidence__name"] += 1
        elif len(set_predict_name - set_label_name) > 0:
            count_dict["predict__name"] += 1
        elif len(set_label_name - set_predict_name) > 0:
            count_dict["label__name"] += 1

        # role+entity strings that differ between label and prediction
        filename_not_label_predict = [item["filename"], set(), set()]
        not_true_roles = [item["filename"], set(), set()]
        predict_roles = {row[2] + row[3] for row in predict_prem}
        label_roles = set()
        for fields in label_records:
            # The entity is the third field, so at least three are required.
            if len(fields) > 2:
                label_roles.add(_ROLE_ID[int(fields[1])] + fields[2])
        not_true_roles[1] = label_roles - predict_roles
        not_true_roles[2] = predict_roles - label_roles
        if len(not_true_roles[1]) > 0 or len(not_true_roles[2]) > 0:
            print(not_true_roles)

        # count role
        for role in [0, 1, 2, 3, 4]:
            predicted_entities = {
                row[3] for row in predict_prem if row[2] == _ROLE_ID[role]
            }
            labelled_entities = set()
            same_package_count = 0
            package_set = set()
            for fields in label_records:
                if len(fields) > 2 and str(fields[1]).strip() == str(role):
                    packageName = "Project" if fields[0] == "" else fields[0]
                    # A package repeated for the same role is subtracted from
                    # the label-only count below.
                    if packageName in package_set:
                        same_package_count += 1
                    package_set.add(packageName)
                    labelled_entities.add(fields[2])

            label_only = labelled_entities - predicted_entities
            predict_only = predicted_entities - labelled_entities
            for entity in label_only:
                filename_not_label_predict[1].add((role, entity))
            for entity in predict_only:
                filename_not_label_predict[2].add((role, entity))
            count_dict["coincidence_role_" + str(role)] += len(
                predicted_entities & labelled_entities)
            count_dict["label_role_" + str(role)] += len(label_only) - same_package_count
            count_dict["predict_role_" + str(role)] += len(predict_only)

        list_filename_not_label_predict.append(filename_not_label_predict)

        # count money
        predicted_money = set()
        for row in predict_prem:
            money = row[4]
            if str(money) != "0":
                predicted_money.add((row[3], getUnifyMoney(str(money))))
        labelled_money = set()
        same_entity_money = 0
        for fields in label_records:
            # Money is the fourth field, so at least four are required.
            if len(fields) > 3:
                for m in fields[3].split("、"):
                    if m != "":
                        same_entity_money += 1
                        labelled_money.add((fields[2], getUnifyMoney(m)))
                # NOTE(review): nesting reconstructed from a source that lost
                # its indentation; read as "one amount per record is expected,
                # any extra amounts are subtracted from the label-only count".
                if same_entity_money > 0:
                    same_entity_money -= 1
        money_label_only = labelled_money - predicted_money
        money_predict_only = predicted_money - labelled_money
        count_dict["coincidence__money"] += len(predicted_money & labelled_money)
        count_dict["label__money"] += len(money_label_only) - same_entity_money
        count_dict["predict__money"] += len(money_predict_only)
        print("money_notfound", item["filename"], money_label_only)
        print("money_foundError", item["filename"], money_predict_only)

        # count person
        predicted_person = set()
        for row in predict_prem:
            for p in row[5]:
                predicted_person.add((row[3], p[0]))
        labelled_person = set()
        for fields in label_records:
            if len(fields) > 4:
                for p in fields[4].split("|"):
                    if p.strip() != "/" and p.strip() != "":
                        labelled_person.add((fields[2], p.split("/")[0]))
        count_dict["coincidence__person"] += len(predicted_person & labelled_person)
        count_dict["label__person"] += len(labelled_person - predicted_person)
        count_dict["predict__person"] += len(predicted_person - labelled_person)

    # Only strictly positive counters are reported and used for the ratios.
    count_list = [[key, value] for key, value in count_dict.items() if value > 0]
    count_list.sort(key=lambda x: x[0])
    for entry in count_list:
        print(entry)

    count_m = ["role", "code", "name", "money", "person"]
    count_roles = ["", 0, 1, 2, 3, 4]
    for m in count_m:
        for role_key in count_roles:
            concidence = _sum_matching(count_list, ["coincidence", str(m), str(role_key)])
            label = _sum_matching(count_list, ["label", str(m), str(role_key)])
            predict = _sum_matching(count_list, ["predict", str(m), str(role_key)])
            # Skip combinations with an empty denominator.
            if 0 not in [predict + concidence, label + concidence]:
                print(m, role_key, concidence, label, predict,
                      "acc", concidence / (predict + concidence),
                      "recall", concidence / (label + concidence))
    save(list_filename_not_label_predict, "list_filename_not_label_predict_3.pk")


def compare():
    """Print the role mismatches present in run 3 but not in run 2.

    Loads ``list_filename_not_label_predict_3.pk`` and
    ``list_filename_not_label_predict_2.pk``, pairs the entries positionally
    and prints, per file, the label-only and prediction-only pairs that are new
    in run 3.
    """
    data = load("list_filename_not_label_predict_3.pk")
    data1 = load("list_filename_not_label_predict_2.pk")
    for item, item_1 in zip(data, data1):
        print("==", item)
        print("--", item_1)
        label_compare = item[1] - item_1[1]
        predict_compare = item[2] - item_1[2]
        if len(label_compare) > 0 or len(predict_compare) > 0:
            print(item[0], label_compare, predict_compare)


def findmultipack():
    '''
    @summary: find the labelled articles that contain more than one package
    '''
    data = load("label_0_1197.pk")
    for item_label in data:
        label_prem = _normalize_label_prem(item_label["prem"])
        # First "-"-separated field of every record is the package name.
        set_pack = {record.split("-")[0] for record in label_prem.split(";")}
        if len(set_pack) > 1:
            print(item_label["filename"])


if __name__ == "__main__":
    # run_predict()
    run_predict_interface()
    getAccRecall()
    # compare()
    # findmultipack()