- '''
- Created on 2019-05-15
- @author: User
- '''
- from bs4 import BeautifulSoup, Comment
- import copy
- import re
- import sys
- import os
- import codecs
- import requests
- import json
- sys.path.append(os.path.abspath("../.."))
- import fool
- from BiddingKG.dl.interface.Connection import *
- from BiddingKG.dl.common.Utils import *
- from BiddingKG.dl.interface.Connection import getConnection
- import BiddingKG.dl.interface.predictor as predictor
- import BiddingKG.dl.interface.Preprocessing as Preprocessing
- import BiddingKG.dl.interface.getAttributes as getAttributes
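- # Evaluation script: run_predict() runs the local extraction models, run_predict_interface()
- # calls the deployed /article_extract HTTP interface, and getAccRecall() compares either set of
- # predictions with the hand-labelled data in "label_0_1197.pk".
- # load()/save() are assumed to be the pickle helpers pulled in from BiddingKG.dl.common.Utils.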
- def run_predict():
- '''
- @summary: run the local extraction models over the labelled articles and report the labelled role entities that were not extracted
- '''
- # data = load("val.pk")
- # print(data[0])
- data = load("label_0_1197.pk")
- codeNamePredict = predictor.CodeNamePredict()
- premPredict = predictor.PREMPredict()
- epcPredict = predictor.EPCPredict()
- roleRulePredict = predictor.RoleRulePredictor()
- count = 0
- not_find_count = 0
- list_filename_index_notfound = []
- for item in data:
- count += 1
- print(count,not_find_count,len(data))
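- # run the full local pipeline on one article at a time: preprocessing, project code/name,
- # PREM, rule-based role and EPC predictors, then aggregate the PREM attributes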
- list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[item["filename"],item["content"],"","",""]],useselffool=True)
- codeName = codeNamePredict.predict(list_articles)
- premPredict.predict(list_sentences,list_entitys)
- roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
-
- epcPredict.predict(list_sentences,list_entitys)
- prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
- print(prem)
- label_prem = re.sub("[\r\s\n]","",item["prem"])
- label_prem = re.sub("（","(",label_prem)  # normalize full-width parentheses to ASCII
- label_prem = re.sub("）",")",label_prem)
- set_label_entitys = set()
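- # each ";"-separated segment of label_prem appears to encode "package-roleid-entity-money-person"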
- for prem1 in label_prem.split(";"):
- if len(prem1.split("-"))>2:
- set_label_entitys.add(prem1.split("-")[2])
- set_extract_entitys = set()
- for list_entity in list_entitys:
- for entity in list_entity:
- if entity.entity_type in ["org","company"]:
- set_extract_entitys.add(entity.entity_text)
- not_find_count += len(set_label_entitys-(set_label_entitys&set_extract_entitys))
- print(item["filename"],set_label_entitys-(set_label_entitys&set_extract_entitys))
-
- list_filename_index_notfound.append([item["filename"],count,set_label_entitys-(set_label_entitys&set_extract_entitys)])
- if len(codeName)>0:
- item["predict_code"] = codeName[0][1]["code"]
- item["predict_name"] = codeName[0][1]["name"]
- else:
- item["predict_code"] = ""
- item["predict_name"] = ""
- if len(prem)>0:
- item["predict_prem"] = prem[0][1]["prem"]
- else:
- item["predict_prem"] = ""
- for item in list_filename_index_notfound:
- print(item)
- save(data,"val_selffool7.pk")
-
- def run_predict_interface():
- '''
- @summary: send each labelled article to the article_extract HTTP interface and store the returned code/name/prem
- '''
- # data = load("val.pk")
- # print(data[0])
- data = load("label_0_1197.pk")
- # codeNamePredict = predictor.CodeNamePredict()
- # premPredict = predictor.PREMPredict()
- # epcPredict = predictor.EPCPredict()
- count = 0
- not_find_count = 0
-
- myheaders = {'Content-Type': 'application/json'}
- guardian_base2 = 'http://192.168.2.101:15011'
-
- for item in data:
- count += 1
-
- user = {
- "content": item["content"],
- "id":item["filename"]
- }
- _resp = requests.post(guardian_base2 + '/article_extract', json=user, headers=myheaders, verify=True)
- resp_json = _resp.content.decode("utf-8")
- obj_json = json.loads(resp_json)
- prem = obj_json["prem"]
-
- print(count,not_find_count,len(data))
-
- item["predict_code"] = obj_json["code"]
- item["predict_name"] = obj_json["name"]
- item["predict_prem"] = prem
- save(data,"val_interface.pk")
-
- def getAccRecall():
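- '''
- @summary: compare predictions with the labelled data and print per-field counts, precision ("acc") and recall
- '''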
- # data = load("val_selffool7.pk")
- data = load("val_interface.pk")
- data_label = load("label_0_1197.pk")
- roles = [0,1,2,3,4,"code","name","money","person"]
- models = ["","role"]
- type = ["coincidence","label","predict"]
- count_dict = {}
-
- list_not_true = []
- index = 0
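- # initialise zeroed counters keyed "<type>_<model>_<role>", e.g. "coincidence_role_0" or "label__money"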
- for t in type:
- for m in models:
- for role in roles:
- count_dict[t+"_"+str(m)+"_"+str(role)] = 0
- list_filename_not_label_predict = []
- for item,item_label in zip(data,data_label):
- index += 1
- label_code = re.sub("[\r\n\s]","",item_label["code"])
- predict_code = ";".join(item["predict_code"])
- label_name = re.sub("[\r\n\s]","",item_label["name"])
- predict_name = item["predict_name"]
- label_prem = re.sub("[\r\n\s]","",item_label["prem"])
- label_prem = re.sub("（","(",label_prem)  # normalize full-width parentheses to ASCII
- label_prem = re.sub("）",")",label_prem)
- predict_prem = item["predict_prem"]
- # print("===",item)
- count_not_true = 0
- #count code
- set_label_code = set([a for a in re.split("[;；]",label_code) if a!='' and a!='1'])
- set_predict_code = set([a for a in re.split("[;；]",predict_code) if a!=''])
- count_dict["coincidence__code"] += len(set_label_code&set_predict_code)
- count_dict["label__code"] += len(set_label_code-(set_label_code&set_predict_code))
- count_dict["predict__code"]+= len(set_predict_code-(set_label_code&set_predict_code))
-
- # new count name
- set_label_name = set([a for a in re.split("[;；]",label_name) if a!='' and a!='1'])
- #set_predict_name = set([a for a in re.split("[;；]",predict_name) if a!='']) # single project name
- set_predict_name = set(predict_name) # multiple project names
- if len(set_label_name&set_predict_name) > 0:
- count_dict["coincidence__name"] += 1
- elif len(set_predict_name-(set_label_name&set_predict_name)) >0:
- count_dict["predict__name"]+= 1
- elif len(set_label_name-(set_label_name&set_predict_name)) > 0:
- count_dict["label__name"] += 1
-
- #count role, money, person
- role_id = {0:"tenderee",
- 1:"agency",
- 2:"win_tenderer",
- 3:"second_tenderer",
- 4:"third_tenderer"}
-
- filename_not_label_predict = [item["filename"],set(),set()]
- #get not true roles of each article
- not_true_roles = [item["filename"],set(),set()]
- predict_roles = set()
- label_roles = set()
- # for _pack in predict_prem.keys():
- # for prem1 in predict_prem[_pack]["roleList"]:
- # predict_roles.add(prem1[0]+prem1[1])
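- # each predict_prem row is assumed to be indexed as [0]=package, [2]=role name, [3]=entity text, [4]=money, [5]=person list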
- for item2 in predict_prem:
- predict_roles.add(item2[2]+item2[3])
- for item1 in label_prem.split(";"):
- prem1 = item1.split("-")
- if len(prem1)>1:
- label_roles.add(role_id[int(prem1[1])]+prem1[2])
- not_true_roles[1] = label_roles-(label_roles&predict_roles)
- not_true_roles[2] = predict_roles-(predict_roles&label_roles)
- if len(not_true_roles[1])>0 or len(not_true_roles[2])>0:
- print(not_true_roles)
- for role in [0,1,2,3,4]:
- temp_set = set()
- temp_set2 = set()
- same_package_count = 0
- package_set = set()
- # for _pack in predict_prem.keys():
- # for prem1 in predict_prem[_pack]["roleList"]:
- # if prem1[0]==role_id[role]:
- # temp_set.add((prem1[1]))
- for prem1 in predict_prem:
- if prem1[2]==role_id[role]:
- packageName = prem1[0]
- #temp_set.add((packageName,prem1[3]))
- temp_set.add((prem1[3]))
- for item1 in label_prem.split(";"):
- prem1 = item1.split("-")
- if len(prem1)>1 and str(prem1[1]).strip()==str(role):
- #print(prem1)
- packageName = "Project" if prem1[0]=="" else prem1[0]
- if packageName in package_set:
- same_package_count += 1
- package_set.add(packageName)
- #temp_set2.add((packageName,prem1[2]))
- temp_set2.add((prem1[2]))
-
- _coincidence = temp_set&temp_set2
- _label = temp_set2-(temp_set&temp_set2)
- _predict = temp_set-(temp_set&temp_set2)
-
- for item1 in list(_label):
- filename_not_label_predict[1].add((role,item1))
- for item1 in list(_predict):
- filename_not_label_predict[2].add((role,item1))
-
- count_dict["coincidence_role_"+str(role)] += len(temp_set&temp_set2)
- count_dict["label_role_"+str(role)] += len(temp_set2-(temp_set&temp_set2))-same_package_count
- count_dict["predict_role_"+str(role)] += len(temp_set-(temp_set&temp_set2))
-
- count_not_true += len(temp_set2-(temp_set&temp_set2))
- #count package_role_entity_money_people
- #list_not_true.append([item["filename"],count_not_true,index,label_prem,predict_prem])
-
- list_filename_not_label_predict.append(filename_not_label_predict)
- #count money
- temp_set = set()
- temp_set2 = set()
- same_entity_money = 0
- # for _pack in predict_prem.keys():
- # for prem1 in predict_prem[_pack]["roleList"]:
- # money = prem1[2]
- # if str(money)!="0":
- # temp_set.add((prem1[1],getUnifyMoney(money)))
- for prem1 in predict_prem:
- money = prem1[4]
- if str(money)!="0":
- temp_set.add((prem1[3],getUnifyMoney(str(money))))
- # temp_set.add((getUnifyMoney(str(money))))
- for item1 in label_prem.split(";"):
- prem1 = item1.split("-")
- if len(prem1)>1:
- for m in prem1[3].split("、"):
- if m!="":
- same_entity_money += 1
- temp_set2.add((prem1[2],getUnifyMoney(m)))
- # temp_set2.add((getUnifyMoney(m)))
- if same_entity_money>0:
- same_entity_money -= 1
- count_dict["coincidence__money"] += len(temp_set&temp_set2)
- count_dict["label__money"] += len(temp_set2-(temp_set&temp_set2))-same_entity_money
- count_dict["predict__money"] += len(temp_set-(temp_set&temp_set2))
- print("money_notfound",item["filename"],temp_set2-(temp_set&temp_set2))
- print("money_foundError",item["filename"],temp_set-(temp_set&temp_set2))
-
- #count person
- temp_set = set()
- temp_set2 = set()
- # for _pack in predict_prem.keys():
- # for prem1 in predict_prem[_pack]["roleList"]:
- # person = prem1[3]
- # for p in person:
- # temp_set.add((prem1[1],p[0]))
- for prem1 in predict_prem:
- person = prem1[5]
- for p in person:
- temp_set.add((prem1[3],p[0]))
- for item1 in label_prem.split(";"):
- prem1 = item1.split("-")
- if len(prem1)>4:
- person = prem1[4]
- for p in person.split("|"):
- if p.strip()!="/" and p.strip()!="":
- temp_set2.add((prem1[2],p.split("/")[0]))
- count_dict["coincidence__person"] += len(temp_set&temp_set2)
- count_dict["label__person"] += len(temp_set2-(temp_set&temp_set2))
- count_dict["predict__person"] += len(temp_set-(temp_set&temp_set2))
- #count_not_true = len(temp_set2-(temp_set&temp_set2))
- #list_not_true.append([item["filename"],count_not_true,index,label_prem,predict_prem])
-
-
-
- list_not_true.sort(key=lambda x:x[1],reverse=True)
- for item in list_not_true:
- if item[1]>0:
- print(item)
- count_list = []
- for key in count_dict.keys():
- if count_dict[key]>0:
- count_list.append([key,count_dict[key]])
- count_list.sort(key = lambda x:x[0])
- for item in count_list:
- print(item)
- count_m = ["role","code","name","money","person"]
- count_roles = ["",0,1,2,3,4]
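- # sum every counter whose key contains all of the substrings in find_list, e.g. ["coincidence","role","0"]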
- def get_value(_list,find_list):
- count = 0
- for item in _list:
- find_flag = True
- for key in find_list:
- if str(item[0]).find(key)<0:
- find_flag = False
- if find_flag:
- count += item[1]
- return count
-
- for m in count_m:
- for roles in count_roles:
- coincidence = get_value(count_list,["coincidence",str(m),str(roles)])
- label = get_value(count_list,["label",str(m),str(roles)])
- predict = get_value(count_list,["predict",str(m),str(roles)])
- if 0 not in [predict+coincidence,label+coincidence]:
- print(m,roles,coincidence,label,predict,"acc",coincidence/(predict+coincidence),"recall",coincidence/(label+coincidence))
- save(list_filename_not_label_predict,"list_filename_not_label_predict_3.pk")
-
- def compare():
- data = load("list_filename_not_label_predict_3.pk")
- data1 = load("list_filename_not_label_predict_2.pk")
- for item,item_1 in zip(data,data1):
- print("==",item)
- print("--",item_1)
- label_compare = item[1]-item_1[1]
- predict_compare = item[2]-item_1[2]
- if len(label_compare)>0 or len(predict_compare)>0:
- print(item[0],label_compare,predict_compare)
-
- def findmultipack():
- '''
- @summary: find the labelled articles that contain more than one bid package
- '''
- data = load("label_0_1197.pk")
- for item_label in data:
- label_prem = re.sub("[\r\n\s]","",item_label["prem"])
- label_prem = re.sub("（","(",label_prem)  # normalize full-width parentheses to ASCII
- label_prem = re.sub("）",")",label_prem)
- set_pack = set()
- for item1 in label_prem.split(";"):
- prem1 = item1.split("-")
- set_pack.add(prem1[0])
- if len(set_pack)>1:
- print(item_label["filename"])
-
- if __name__=="__main__":
- # run_predict()
- run_predict_interface()
- getAccRecall()
- # compare()
- #findmultipack()