|
@@ -0,0 +1,343 @@
|
|
|
|
+
|
|
|
|
+import psycopg2
|
|
|
|
+from BiddingKG.dl.interface.extract import predict,test
|
|
|
|
+from BiddingKG.dl.common.Utils import getUnifyMoney,timeFormat
|
|
|
|
+import re
|
|
|
|
+import json
|
|
|
|
+
|
|
|
|
class ExtractMetric():
    """Compare the extraction interface against human annotations.

    Annotations are read from the ``brat_bratannotation`` table, converted to
    the same dictionary layout the extraction interface returns, and then
    compared field by field to produce recall / precision / F1 figures.
    """

    # Bracket -> matching counterpart, in both directions.  The full-width
    # round brackets are written as escapes so that they survive any
    # width-normalisation of this source file.
    _SYMBOL_DICT = {
        "(": ")",
        "\uff08": "\uff09",
        "[": "]",
        "\u3010": "\u3011",
        ")": "(",
        "\uff09": "\uff08",
        "]": "[",
        "\u3011": "\u3010",
    }
    _LEFT_SYMBOL_PATTERN = re.compile(r"[(\uff08\[\u3010]")
    _RIGHT_SYMBOL_PATTERN = re.compile(r"[)\uff09\]\u3011]")

    # Keywords whose number of occurrences contributes to the score of a
    # candidate project name.
    _NAME_SCORE_PATTERN = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
    # A candidate name containing an explicit "<...> name/title:" label gets
    # full weight; both ASCII and full-width colons are accepted.
    _NAME_LABEL_PATTERN = re.compile(r"(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[:\uff1a\s]")

    _LIST_FIELDS = ("code", "product", "person_review")
    _SCALAR_FIELDS = ("name", "bidway", "moneysource", "serviceTime",
                      "time_release", "time_bidopen", "time_bidclose")
    _TIME_FIELDS = ("time_release", "time_bidopen", "time_bidclose")
    # Role suffix used in annotation types -> role name used in roleList.
    _ROLE_NAMES = {
        "tenderee": "tenderee",
        "agency": "agency",
        "tenderer": "win_tenderer",
        "secondTenderer": "second_tenderer",
        "thirdTenderer": "third_tenderer",
    }
    _WINNER_ROLES = ("win_tenderer", "second_tenderer", "third_tenderer")

    def __init__(self):
        """Open the two PostgreSQL connections (both to the ``iepy`` database)."""
        self.conn1 = self.getConnection_postgres("iepy")
        self.conn2 = self.getConnection_postgres("iepy")

    def fitDataByRule(self, data):
        """Repair a string whose brackets are unbalanced by exactly one.

        * No brackets, or equally many left and right brackets: returned as is.
        * One extra left bracket: drop the first character if it is a bracket,
          otherwise append the counterpart of the first left bracket found.
        * One extra right bracket: drop the last character if it is a bracket,
          otherwise prepend the counterpart of the first right bracket found.
        * Any larger imbalance: returned unchanged.

        :param data: text to repair
        :return: the repaired text
        """
        leftfinds = self._LEFT_SYMBOL_PATTERN.findall(data)
        rightfinds = self._RIGHT_SYMBOL_PATTERN.findall(data)
        n_left, n_right = len(leftfinds), len(rightfinds)

        # Covers both "no brackets at all" and "balanced".
        if n_left == n_right:
            return data
        if abs(n_left - n_right) != 1:
            return data

        if n_left > n_right:
            if self._SYMBOL_DICT.get(data[0]) is not None:
                return data[1:]
            return data + self._SYMBOL_DICT.get(leftfinds[0])

        if self._SYMBOL_DICT.get(data[-1]) is not None:
            return data[:-1]
        return self._SYMBOL_DICT.get(rightfinds[0]) + data

    def getConnection_postgres(self, db):
        """Return a new psycopg2 connection to database ``db``.

        NOTE(review): host and credentials are hard-coded here; consider
        moving them to configuration.
        """
        conn = psycopg2.connect(dbname=db, user="postgres", password="postgres", host="192.168.2.103")
        return conn

    @staticmethod
    def _parse_annotations(list_anno):
        """Parse tab-separated annotation values into a dict keyed by annotation id.

        Ids starting with ``T`` are entities (``type begin end`` + text); ids
        starting with ``R`` are relations (``type Arg1:<id> Arg2:<id>``).
        Anything else, including an empty id, is ignored.
        """
        dict_anno = {}
        for _anno in list_anno:
            _split = _anno["value"].split("\t")
            _id = _split[0]
            if _id.startswith("T"):
                _type, _begin, _end = _split[1].split(" ")
                dict_anno[_id] = {"id": _id, "type": _type, "text": _split[2],
                                  "begin": int(_begin), "end": int(_end)}
            elif _id.startswith("R"):
                _type, arg1, arg2 = _split[1].split(" ")
                dict_anno[_id] = {"id": _id, "type": _type,
                                  "arg1": arg1.split(":")[1], "arg2": arg2.split(":")[1]}
        return dict_anno

    @staticmethod
    def _money_value(Htext, money_anno):
        """Return the annotated amount as a float.

        The amount is multiplied by 10000 when the 10 characters preceding the
        annotation in ``Htext`` contain "万" but not "整".
        """
        begin = money_anno["begin"]
        _before_text = Htext[max(begin - 10, 0):begin]
        if "万" in _before_text and "整" not in _before_text:
            _unit = 10000
        else:
            _unit = 1
        return float(getUnifyMoney(money_anno["text"]) * _unit)

    def label2interface(self, list_anno, Htext):
        """Convert the annotations of one document to the interface layout.

        :param list_anno: list of dicts, each with a ``"value"`` annotation line
        :param Htext: document text the annotation offsets refer to
        :return: dict with list fields (code/product/person_review), scalar
                 fields (name, bidway, ...) and
                 ``{"prem": {"Project": {"roleList": ..., "tendereeMoney": ...}}}``
        """
        dict_result = {}
        dict_anno = self._parse_annotations(list_anno)
        dict_role = {}
        dict_person2role = {}
        dict_name_freq_score = {}

        # Pass 1: plain fields and the subject of every role.
        for v in dict_anno.values():
            _type = v.get("type")
            if _type in self._LIST_FIELDS:
                values = dict_result.setdefault(_type, [])
                values.append(v.get("text"))
                dict_result[_type] = list(set(values))
            if _type in self._SCALAR_FIELDS:
                if _type == "name":
                    _name = self.fitDataByRule(v.get("text"))
                    w = 1 if self._NAME_LABEL_PATTERN.search(_name) is not None else 0.5
                    if _name not in dict_name_freq_score:
                        keyword_hits = len(self._NAME_SCORE_PATTERN.findall(_name))
                        dict_name_freq_score[_name] = [1, (keyword_hits + len(_name) * 0.05) * w]
                    else:
                        dict_name_freq_score[_name][0] += 1
                    # Keep the candidate with the highest frequency * score.
                    max_score = 0
                    for _k1, _v1 in dict_name_freq_score.items():
                        if _v1[0] * _v1[1] > max_score:
                            max_score = _v1[0] * _v1[1]
                            dict_result[_type] = _k1
                # For the other scalar fields the first annotation wins.
                if _type not in dict_result:
                    if _type in self._TIME_FIELDS:
                        _t = timeFormat(v.get("text"))
                    else:
                        _t = v.get("text")
                    dict_result[_type] = _t
            _split = _type.split("_")
            if len(_split) > 1 and _split[1] in self._ROLE_NAMES:
                dict_role[_split[1]] = {"subject": v.get("text")}

        # Pass 2: money and persons, attached to the roles found above.
        tendereeMoney = 0
        for v in dict_anno.values():
            _type = v.get("type")
            if _type == "money_tendereeMoney":
                tendereeMoney = self._money_value(Htext, v)
            if _type in ("rel_tendereeMoney", "rel_tendererMoney"):
                arg1 = v.get("arg1")
                arg2 = v.get("arg2")
                for _v in dict_role.values():
                    if _v["subject"] == dict_anno[arg1]["text"]:
                        _v["money"] = self._money_value(Htext, dict_anno[arg2])
            if _type == "person_tendereePerson" and "tenderee" in dict_role:
                dict_role["tenderee"].setdefault("person", []).append({"person": v["text"]})
            if _type == "person_agencyPerson" and "agency" in dict_role:
                dict_role["agency"].setdefault("person", []).append({"person": v["text"]})
            if _type == "rel_person":
                arg1 = v.get("arg1")
                arg2 = v.get("arg2")
                for _k, _v in dict_role.items():
                    if _v["subject"] == dict_anno[arg1]["text"]:
                        _v.setdefault("person", []).append({"person": dict_anno[arg2]["text"]})
                        dict_person2role[dict_anno[arg2]["text"]] = _k

        # Pass 3: phones; needs the complete person -> role mapping.
        for v in dict_anno.values():
            if v.get("type") == "rel_phone":
                arg1 = v.get("arg1")
                arg2 = v.get("arg2")
                _person = dict_anno[arg1]["text"]
                if _person in dict_person2role:
                    for item in dict_role[dict_person2role[_person]]["person"]:
                        if item["person"] == _person:
                            item["phone"] = dict_anno[arg2]["text"]

        roleList = []
        for k, v in dict_role.items():
            _role = self._ROLE_NAMES[k]
            list_person = []
            set_person = set()
            for item in v.get("person", []):
                if item["person"] not in set_person:
                    list_person.append([item["person"], item.get("phone", "")])
                    set_person.add(item["person"])
            # NOTE(review): as written both replace() calls are no-ops; this
            # looks like a bracket-width normalisation whose full-width
            # characters were lost -- confirm the intended direction.
            subject = v.get("subject", "").replace("(", "(").replace(")", ")")
            roleList.append([_role, subject, v.get("money", 0), list_person, ""])

        dict_result["prem"] = {"Project": {"roleList": roleList, "tendereeMoney": tendereeMoney}}
        return dict_result

    def culExtractMetrics(self):
        """Evaluate the interface on recently annotated documents.

        Reads up to 20 payroll windows, loads the annotations of up to 100
        documents edited by the window's user, compares annotation and
        interface output per document, and prints the aggregated metrics.

        :return: the metrics dict produced by :meth:`getMetrics`
        """
        payroll_sql = ' select begin_time,end_time,"user",doc_count from corpus_payroll where end_time<=\'2021-07-25\' order by end_time desc limit 20'
        # Values are passed as bound parameters instead of being formatted
        # into the statement; they are stringified so the SQL sees the same
        # text literals as before.
        anno_sql = ("select document_id,value from brat_bratannotation where document_id in "
                    "(select human_identifier from corpus_iedocument where edituser=%s "
                    "and to_char(edittime,'yyyy-mm-dd')>=%s and to_char(edittime,'yyyy-mm-dd')<=%s limit 100) "
                    "order by document_id")
        text_sql = "select text from corpus_iedocument where human_identifier=%s"

        list_diff = []
        cursor = self.conn1.cursor()
        try:
            cursor.execute(payroll_sql)
            rows_payroll = cursor.fetchall()
            for _payroll in rows_payroll:
                _begin_time = _payroll[0]
                _end_time = _payroll[1]
                _user = _payroll[2]
                doc_count = _payroll[3]
                print(_user, _begin_time, _end_time, doc_count)
                cursor.execute(anno_sql, (str(_user), str(_begin_time), str(_end_time)))
                rows = cursor.fetchall()

                # Group the annotation rows per document.  Grouping up front
                # guarantees that the last document is evaluated as well.
                dict_doc_values = {}
                for row in rows:
                    document_id = row[0]
                    dict_doc_values.setdefault(document_id, []).append(
                        {"document_id": document_id, "value": row[1]})

                for current_docid, list_values in dict_doc_values.items():
                    print(current_docid)
                    cursor.execute(text_sql, (str(current_docid),))
                    text_rows = cursor.fetchall()
                    if not text_rows:
                        # Annotations without a stored document text cannot be evaluated.
                        continue
                    content = text_rows[0][0]
                    _inter = self.label2interface(list_values, content)
                    _inter2 = self.extractFromInterface(content)
                    # Only documents for which the interface reports at most
                    # one project are compared.
                    if not len(_inter2.get("prem").keys()) > 1:
                        list_diff.append(self.getDiff(_inter, _inter2))
        finally:
            cursor.close()

        metrics = self.getMetrics(list_diff)
        print(metrics)
        return metrics

    def extractFromInterface(self, content):
        """Run the extraction interface on ``content`` and decode its JSON result."""
        return json.loads(test("", content))

    @staticmethod
    def _collect_project(prem, suffix, dict_project):
        """Flatten a ``prem`` dict into ``dict_project`` under ``<field>_<suffix>`` keys."""
        for v in prem.values():
            if float(v.get("tendereeMoney", 0)) > 0:
                dict_project["tendereeMoney_%s" % suffix] = [float(v.get("tendereeMoney"))]
            for _role in v.get("roleList", []):
                dict_project["%s_%s" % (_role[0], suffix)] = [_role[1]]
                if _role[0] in ExtractMetric._WINNER_ROLES and float(_role[2]) > 0:
                    dict_project["%s_money_%s" % (_role[0], suffix)] = [float(_role[2])]
                for item in _role[3]:
                    _person = item[0]
                    _phone = item[1]
                    # Only persons that come with a phone number are compared.
                    if _person == "" or _phone == "":
                        continue
                    dict_project.setdefault("%s_person_%s" % (_role[0], suffix), []).append(
                        "%s-%s" % (_role[1], _person))
                    dict_project.setdefault("person_phone_%s" % suffix, []).append(
                        "%s-%s" % (_person, _phone))

    def getDiff(self, _inter, _inter2):
        """Count, per field, the items on each side and the items both share.

        :param _inter: annotation-side result (see :meth:`label2interface`)
        :param _inter2: interface-side result
        :return: dict with ``<field>_inter``, ``<field>_inter2`` and
                 ``<field>_union`` counts; despite the key name, ``_union`` is
                 the number of items common to both sides
        """
        _dict = {}
        for k in self._LIST_FIELDS:
            set_k1 = _inter.get(k, set())
            set_k2 = _inter2.get(k, set())
            _dict["%s_inter" % k] = len(set_k1)
            _dict["%s_inter2" % k] = len(set_k2)
            _dict["%s_union" % k] = len(set(set_k1) & set(set_k2))

        for k in self._SCALAR_FIELDS:
            _k1 = _inter.get(k, "")
            _k2 = _inter2.get(k, "")
            len_k1 = 0 if _k1 == "" else 1
            len_k2 = 0 if _k2 == "" else 1
            _dict["%s_inter" % k] = len_k1
            _dict["%s_inter2" % k] = len_k2
            _dict["%s_union" % k] = 1 if _k1 == _k2 and len_k1 == 1 else 0

        dict_project = {}
        self._collect_project(_inter.get("prem", {}), "inter", dict_project)
        self._collect_project(_inter2.get("prem", {}), "inter2", dict_project)

        set_k = set()
        for k, v in dict_project.items():
            k_split = k.split("_")
            base_key = "_".join(k_split[:-1])
            if base_key in set_k:
                continue
            k2 = "inter2" if k_split[-1] == "inter" else "inter"
            k_other = "%s_%s" % (base_key, k2)
            other_values = dict_project.get(k_other, [])
            _dict[k] = len(v)
            _dict[k_other] = len(other_values)
            _dict["%s_union" % base_key] = len(set(v) & set(other_values))
            set_k.add(base_key)

        print("=========================")
        print(_inter)
        print("-----")
        print(_inter2)
        print("|||||")
        print(_dict)
        return _dict

    def getMetrics(self, list_diff):
        """Aggregate per-document counts and print recall / precision / F1.

        :param list_diff: list of dicts as returned by :meth:`getDiff`
        :return: ``{field: {"recall": r, "precision": p, "f1": f}}``
        """
        dict_key_count = {}
        print("all_count:", list_diff)
        for _diff in list_diff:
            for k, v in _diff.items():
                dict_key_count[k] = dict_key_count.get(k, 0) + v

        metrics = {}
        for k in dict_key_count:
            base_k = "_".join(k.split("_")[:-1])
            if base_k in metrics:
                continue
            # Denominators are clamped to 1 so that a missing side gives 0, not an error.
            _count_inter = max(dict_key_count.get("%s_inter" % base_k, -1), 1)
            _count_inter2 = max(dict_key_count.get("%s_inter2" % base_k, -1), 1)
            _count_union = dict_key_count.get("%s_union" % base_k, 0)
            _precision = _count_union / _count_inter2
            _recall = _count_union / _count_inter
            # F1 is defined as 0 when nothing matched (avoids division by zero).
            if _precision + _recall > 0:
                _f1 = 2 * (_precision * _recall) / (_precision + _recall)
            else:
                _f1 = 0.0
            metrics[base_k] = {"recall": _recall, "precision": _precision, "f1": _f1}
            print("%s: recall:%.3f,precision:%.3f,f1_score:%.3f" % (base_k, _recall, _precision, _f1))
            print(base_k)
            print("%.3f" % _f1)
            print("%.3f" % _precision)
            print("%.3f" % _recall)
        return metrics
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the evaluator (this opens the database
    # connections) and run the full evaluation.
    ExtractMetric().culExtractMetrics()
|
|
|
|
+
|