import psycopg2
from BiddingKG.dl.interface.extract import predict,test
from BiddingKG.dl.common.Utils import getUnifyMoney,timeFormat
import re
import json


class ExtractMetric():
    """Compare annotation-derived extraction results with interface output.

    Annotations stored in the database are converted into a result dict
    (``label2interface``), the extraction interface is run on document
    content (``extractFromInterface``), the two dicts are diffed field by
    field (``getDiff``) and the diffs are aggregated into
    recall/precision/F1 per field (``getMetrics``).
    """

    # Opening bracket -> closing bracket and the reverse.
    # NOTE(review): the source as received listed the ASCII parenthesis pair
    # twice; the second pair is restored here as the full-width variant,
    # which is presumably what the duplicate originally was - confirm.
    _SYMBOL_DICT = {
        "(": ")", "（": "）", "[": "]", "【": "】",
        ")": "(", "）": "（", "]": "[", "】": "【",
    }
    _LEFT_SYMBOL_PATTERN = re.compile(r"[\(（\[【]")
    _RIGHT_SYMBOL_PATTERN = re.compile(r"[\)）\]】]")

    # Keywords counted when scoring candidate "name" annotations.
    _NAME_SCORE_PATTERN = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
    # A candidate name matching this pattern gets weight 1, otherwise 0.5.
    # NOTE(review): the character class showed ":" twice in the source as
    # received; the second is restored as the full-width colon - confirm.
    _NAME_TITLE_PATTERN = re.compile(r"(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[:：\s]")

    # Annotation types collected as de-duplicated lists.
    _LIST_TYPES = ("code", "product", "person_review")
    # Annotation types that keep a single value.
    _SINGLE_TYPES = ("name", "bidway", "moneysource", "serviceTime",
                     "time_release", "time_bidopen", "time_bidclose")
    _TIME_TYPES = ("time_release", "time_bidopen", "time_bidclose")
    _TENDERER_ROLES = ("win_tenderer", "second_tenderer", "third_tenderer")
    # Role suffix used in annotation types -> role name used in roleList.
    _ROLE_NAMES = {
        "tenderee": "tenderee",
        "agency": "agency",
        "tenderer": "win_tenderer",
        "secondTenderer": "second_tenderer",
        "thirdTenderer": "third_tenderer",
    }

    def __init__(self):
        # Two independent connections to the same "iepy" database.
        self.conn1 = self.getConnection_postgres("iepy")
        self.conn2 = self.getConnection_postgres("iepy")

    def fitDataByRule(self,data):
        """Repair a string whose brackets are unbalanced by exactly one.

        When the counts of opening and closing brackets differ by one, the
        stray bracket is dropped if it sits at the affected end of the
        string; otherwise the missing partner is appended (or prepended).
        Any other input is returned unchanged.
        """
        leftfinds = self._LEFT_SYMBOL_PATTERN.findall(data)
        rightfinds = self._RIGHT_SYMBOL_PATTERN.findall(data)
        if abs(len(leftfinds) - len(rightfinds)) != 1:
            # Balanced, bracket-free, or too broken to repair.
            return data
        if len(leftfinds) > len(rightfinds):
            if data[0] in self._SYMBOL_DICT:
                return data[1:]
            return data + self._SYMBOL_DICT[leftfinds[0]]
        if data[-1] in self._SYMBOL_DICT:
            return data[:-1]
        return self._SYMBOL_DICT[rightfinds[0]] + data

    def getConnection_postgres(self,db):
        """Open a psycopg2 connection to database ``db``."""
        # NOTE(review): credentials and host are hard-coded; consider moving
        # them to configuration or environment variables.
        conn = psycopg2.connect(dbname=db,user="postgres",password="postgres",host="192.168.2.103")
        return conn

    def _parseAnnotations(self,list_anno):
        """Index tab-separated annotation values by their id.

        Ids starting with "T" become entity entries (type, text, begin, end);
        ids starting with "R" become relation entries (type, arg1, arg2).
        Other lines are ignored.
        """
        dict_anno = {}
        for _anno in list_anno:
            _split = _anno["value"].split("\t")
            _id = _split[0]
            # startswith() instead of _id[0] so an empty id is skipped
            # rather than raising IndexError.
            if _id.startswith("T"):
                _type,_begin,_end = _split[1].split(" ")
                dict_anno[_id] = {"id":_id,"type":_type,"text":_split[2],"begin":int(_begin),"end":int(_end)}
            elif _id.startswith("R"):
                _type,arg1,arg2 = _split[1].split(" ")
                dict_anno[_id] = {"id":_id,"type":_type,"arg1":arg1.split(":")[1],"arg2":arg2.split(":")[1]}
        return dict_anno

    def _moneyUnit(self,Htext,begin):
        """Return 10000 if the 10 characters before ``begin`` contain "万" but not "整", else 1."""
        _before_text = Htext[max(begin-10,0):begin]
        if re.search("万",_before_text) is not None and re.search("整",_before_text) is None:
            return 10000
        return 1

    def label2interface(self,list_anno,Htext):
        """Convert annotation rows into a result dict.

        :param list_anno: iterable of dicts whose ``"value"`` is one
            tab-separated annotation line (entity "T..." or relation "R...").
        :param Htext: text that the annotation offsets index into; used to
            look at the characters preceding a money annotation.
        :return: dict with list-valued keys (code, product, person_review),
            single-valued keys (name, bidway, ...) and
            ``"prem": {"Project": {"roleList": [...], "tendereeMoney": x}}``.
        """
        dict_anno = self._parseAnnotations(list_anno)
        dict_result = {}
        dict_role = {}
        dict_person2role = {}
        dict_name_freq_score = {}

        for v in dict_anno.values():
            _type = v.get("type")
            if _type in self._LIST_TYPES:
                if _type not in dict_result:
                    dict_result[_type] = []
                dict_result[_type].append(v.get("text"))
                dict_result[_type] = list(set(dict_result[_type]))
            if _type in self._SINGLE_TYPES:
                if _type=="name":
                    _name = self.fitDataByRule(v.get("text"))
                    w = 1 if self._NAME_TITLE_PATTERN.search(_name) is not None else 0.5
                    if _name not in dict_name_freq_score:
                        # [frequency, per-occurrence score]
                        dict_name_freq_score[_name] = [1, (len(self._NAME_SCORE_PATTERN.findall(_name)) + len(_name) * 0.05)*w]
                    else:
                        dict_name_freq_score[_name][0] += 1
                    # Keep the candidate with the highest frequency*score;
                    # ties keep the earliest candidate.
                    max_score = 0
                    for _k1,_v1 in dict_name_freq_score.items():
                        if _v1[0]*_v1[1]>max_score:
                            max_score = _v1[0]*_v1[1]
                            dict_result[_type] = _k1
                if _type not in dict_result:
                    if _type in self._TIME_TYPES:
                        _t = timeFormat(v.get("text"))
                    else:
                        _t = v.get("text")
                    dict_result[_type] = _t
            # Types such as "<prefix>_tenderee" name the subject of a role.
            _split = _type.split("_")
            if len(_split)>1 and _split[1] in self._ROLE_NAMES:
                dict_role[_split[1]] = {"subject":v.get("text")}

        tendereeMoney = 0
        for v in dict_anno.values():
            _type = v.get("type")
            if _type=="money_tendereeMoney":
                _unit = self._moneyUnit(Htext,v["begin"])
                tendereeMoney = float(getUnifyMoney(v["text"])*_unit)
            if _type in ("rel_tendereeMoney","rel_tendererMoney"):
                arg1 = v.get("arg1")
                arg2 = v.get("arg2")
                for _v in dict_role.values():
                    if _v["subject"]==dict_anno[arg1]["text"]:
                        _unit = self._moneyUnit(Htext,dict_anno[arg2]["begin"])
                        _v["money"] = float(getUnifyMoney(dict_anno[arg2]["text"])*_unit)
            if _type=="person_tendereePerson":
                if "tenderee" in dict_role:
                    dict_role["tenderee"].setdefault("person",[]).append({"person":v["text"]})
            if _type=="person_agencyPerson":
                if "agency" in dict_role:
                    dict_role["agency"].setdefault("person",[]).append({"person":v["text"]})
            if _type=="rel_person":
                arg1 = v.get("arg1")
                arg2 = v.get("arg2")
                for _k,_v in dict_role.items():
                    if _v["subject"]==dict_anno[arg1]["text"]:
                        _v.setdefault("person",[]).append({"person":dict_anno[arg2]["text"]})
                        dict_person2role[dict_anno[arg2]["text"]] = _k

        # Phones are attached in a separate pass so every person -> role
        # link from the pass above is already known.
        for v in dict_anno.values():
            if v.get("type")=="rel_phone":
                _person = dict_anno[v.get("arg1")]["text"]
                if _person in dict_person2role:
                    for item in dict_role[dict_person2role[_person]]["person"]:
                        if item["person"]==_person:
                            item["phone"] = dict_anno[v.get("arg2")]["text"]

        roleList = []
        for k,v in dict_role.items():
            _role = self._ROLE_NAMES[k]
            list_person = []
            set_person = set()
            for item in v.get("person",[]):
                if item["person"] not in set_person:
                    list_person.append([item["person"],item.get("phone","")])
                    set_person.add(item["person"])
            # NOTE(review): as received, both replace() calls map a character
            # to itself (no-op). This is presumably a full-width/half-width
            # parenthesis normalisation whose characters were lost; the
            # direction cannot be determined here - confirm and restore.
            roleList.append([_role,v.get("subject","").replace("(","(").replace(")",")"),v.get("money",0),list_person,""])
        dict_result["prem"] = {"Project":{"roleList":roleList,"tendereeMoney":tendereeMoney}}
        return dict_result

    def culExtractMetrics(self):
        """Load recent payroll rows, diff labels against the interface and print metrics."""
        conn = self.conn1
        cursor = conn.cursor()
        sql = ' select begin_time,end_time,"user",doc_count from corpus_payroll where end_time<=\'2021-07-25\' order by end_time desc limit 20'
        cursor.execute(sql)
        list_diff = []
        rows_payroll = cursor.fetchall()
        for _payroll in rows_payroll:
            _begin_time = _payroll[0]
            _end_time = _payroll[1]
            _user = _payroll[2]
            doc_count = _payroll[3]
            print(_user,_begin_time,_end_time,doc_count)
            # Values are bound by the driver instead of being interpolated
            # into the SQL text; str() keeps the textual comparison against
            # to_char(...) that the interpolated query performed.
            _sql = ("select document_id,value from brat_bratannotation where document_id in "
                    "(select human_identifier from corpus_iedocument where edituser=%s "
                    "and to_char(edittime,'yyyy-mm-dd')>=%s and to_char(edittime,'yyyy-mm-dd')<=%s limit 100) "
                    "order by document_id")
            cursor.execute(_sql,(_user,str(_begin_time),str(_end_time)))
            rows = cursor.fetchall()
            if len(rows)>0:
                current_docid = rows[0][0]
                _index = -1
                list_values = []
                # NOTE(review): the source of this loop is damaged. Everything
                # between "while _index" and "1:" is missing (presumably the
                # loop bound, the row unpacking into document_id/value, the
                # document text lookup and the computation of _inter/_inter2).
                # The surviving tokens are kept verbatim; `_index1`, `_inter`,
                # `_inter2`, `document_id` and `value` are undefined here, so
                # this raises NameError until the loop is restored from
                # version control.
                while _index1:
                    _diff = self.getDiff(_inter,_inter2)
                    list_diff.append(_diff)
                    _index -= 1
                    current_docid = document_id
                    list_values = []
                else:
                    list_values.append({"document_id":document_id,"value":value})
        metrics = self.getMetrics(list_diff)
        print(metrics)

    def extractFromInterface(self,content):
        """Run the imported ``test`` interface on ``content`` and parse its JSON output."""
        return json.loads(test("",content))

    def _collectProject(self,inter,suffix,dict_project):
        """Flatten ``inter["prem"]`` into ``dict_project`` under keys ending in ``_<suffix>``."""
        for v in inter.get("prem",{}).values():
            if float(v.get("tendereeMoney",0))>0:
                dict_project["tendereeMoney_%s"%suffix] = [float(v.get("tendereeMoney"))]
            for _role in v.get("roleList",[]):
                _role_name,_subject,_money,_persons = _role[0],_role[1],_role[2],_role[3]
                dict_project["%s_%s"%(_role_name,suffix)] = [_subject]
                if _role_name in self._TENDERER_ROLES:
                    if float(_money)>0:
                        dict_project["%s_money_%s"%(_role_name,suffix)] = [float(_money)]
                for item in _persons:
                    _person = item[0]
                    _phone = item[1]
                    # Only persons that have both a name and a phone count.
                    if _person=="" or _phone=="":
                        continue
                    dict_project.setdefault("%s_person_%s"%(_role_name,suffix),[]).append("%s-%s"%(_subject,_person))
                    dict_project.setdefault("person_phone_%s"%suffix,[]).append("%s-%s"%(_person,_phone))

    def getDiff(self,_inter,_inter2):
        """Count, per field, the items in each result and the items they share.

        :param _inter: first result dict (used as the recall denominator by
            ``getMetrics``).
        :param _inter2: second result dict (used as the precision
            denominator by ``getMetrics``).
        :return: dict with ``<field>_inter``, ``<field>_inter2`` and
            ``<field>_union`` counts; ``_union`` is the size of the
            intersection of both sides.
        """
        _dict = {}
        for k in self._LIST_TYPES:
            set_k1 = _inter.get(k,set())
            set_k2 = _inter2.get(k,set())
            _dict["%s_inter"%k] = len(set_k1)
            _dict["%s_inter2"%k] = len(set_k2)
            _dict["%s_union"%k] = len(set(set_k1)&set(set_k2))
        for k in self._SINGLE_TYPES:
            _k1 = _inter.get(k,"")
            _k2 = _inter2.get(k,"")
            len_k1 = 0 if _k1=="" else 1
            len_k2 = 0 if _k2=="" else 1
            _dict["%s_inter"%k] = len_k1
            _dict["%s_inter2"%k] = len_k2
            _dict["%s_union"%k] = 1 if _k1==_k2 and len_k1==1 else 0

        dict_project = {}
        self._collectProject(_inter,"inter",dict_project)
        self._collectProject(_inter2,"inter2",dict_project)

        # Pair each "<base>_inter" key with "<base>_inter2"; each base key is
        # handled once, from whichever side is seen first.
        set_k = set()
        for k,v in dict_project.items():
            k_split = k.split("_")
            base_key = "_".join(k_split[:-1])
            k2 = "inter2" if k_split[-1]=="inter" else "inter"
            if base_key in set_k:
                continue
            k_other = "%s_%s"%(base_key,k2)
            other = dict_project.get(k_other,[])
            _dict[k] = len(v)
            _dict[k_other] = len(other)
            _dict["%s_union"%base_key] = len(set(v)&set(other))
            set_k.add(base_key)
        print("=========================")
        print(_inter)
        print("-----")
        print(_inter2)
        print("|||||")
        print(_dict)
        return _dict

    def getMetrics(self,list_diff):
        """Sum the diff counts and print recall/precision/F1 per field.

        :param list_diff: list of dicts as returned by ``getDiff``.
        :return: ``{field: {"recall": r, "precision": p, "f1": f}}``.
        """
        dict_key_count = {}
        print("all_count:",list_diff)
        for _diff in list_diff:
            for k,v in _diff.items():
                dict_key_count[k] = dict_key_count.get(k,0) + v

        metrics = {}
        set_k = set()
        for k in dict_key_count:
            base_k = "_".join(k.split("_")[:-1])
            if base_k in set_k:
                continue
            set_k.add(base_k)
            # Denominators are clamped to 1 to avoid dividing by zero.
            _count_inter = max(dict_key_count.get("%s_inter"%base_k,-1),1)
            _count_inter2 = max(dict_key_count.get("%s_inter2"%base_k,-1),1)
            _count_union = dict_key_count.get("%s_union"%base_k,0)
            _precision = _count_union/_count_inter2
            _recall = _count_union/_count_inter
            # With no overlap both terms are 0; define F1 as 0 instead of
            # raising ZeroDivisionError.
            if _precision+_recall>0:
                _f1 = 2*(_precision*_recall)/(_precision+_recall)
            else:
                _f1 = 0.0
            metrics[base_k] = {"recall":_recall,"precision":_precision,"f1":_f1}
            print("%s: recall:%.3f,precision:%.3f,f1_score:%.3f"%(base_k,_recall,_precision,_f1))
            print(base_k)
            print("%.3f"%_f1)
            print("%.3f"%_precision)
            print("%.3f"%_recall)
        return metrics


if __name__=="__main__":
    em = ExtractMetric()
    em.culExtractMetrics()