# extractMetric.py
  1. import psycopg2
  2. from BiddingKG.dl.interface.extract import predict,test
  3. from BiddingKG.dl.common.Utils import getUnifyMoney,timeFormat
  4. from BiddingKG.dl.entityLink.entityLink import jaccard_score
  5. import re
  6. import json
  7. bidway_dict = {'询价': '询价', '竞争性谈判': '竞争性谈判',
  8. '公开比选': '其他', '国内竞争性磋商': '竞争性磋商',
  9. '招标方式:t公开': '公开招标', '竞价': '竞价',
  10. '竞标': '竞价', '电子竞价': '竞价',
  11. '电子书面竞投': '竞价', '单一来源': '单一来源',
  12. '网上竞价': '竞价', '公开招标': '公开招标',
  13. '询比': '询价', '定点采购': '其他',
  14. '招标方式:■公开': '公开招标', '交易其他,付款其他': '其他',
  15. '竞争性评审': '竞争性磋商', '公开招租': '其他', '\\N': '',
  16. '比选': '其他', '比质比价': '其他', '分散采购': '其他',
  17. '内部邀标': '邀请招标', '邀请招标': '邀请招标',
  18. '网上招标': '公开招标', '非定向询价': '询价',
  19. '网络竞价': '竞价', '公开询价': '询价',
  20. '定点采购议价': '其他', '询单': '询价',
  21. '网上挂牌': '其他', '网上直购': '其他',
  22. '定向询价': '询价', '采购方式:公开': '公开招标',
  23. '磋商': '竞争性磋商', '公开招投标': '公开招标',
  24. '招标方式:√公开': '公开招标', '公开选取': '公开招标',
  25. '网上电子投标': '公开招标', '公开竞谈': '竞争性谈判',
  26. '竞争性磋商': '竞争性磋商', '采购方式:邀请': '邀请招标',
  27. '公开竞价': '竞价', '其他': '其他', '公开招募': '其他',
  28. '网上询价': '询价'}
  29. # bidway名称统一规范
  30. def bidway_integrate(bidway):
  31. integrate_name = bidway_dict.get(bidway,"其他")
  32. return integrate_name
  33. class ExtractMetric():
  34. def __init__(self):
  35. self.conn1 = self.getConnection_postgres("iepy")
  36. self.conn2 = self.getConnection_postgres("iepy")
  37. def fitDataByRule(self,data):
  38. symbol_dict = {"(":")",
  39. "(":")",
  40. "[":"]",
  41. "【":"】",
  42. ")":"(",
  43. ")":"(",
  44. "]":"[",
  45. "】":"【"}
  46. leftSymbol_pattern = re.compile("[\((\[【]")
  47. rightSymbol_pattern = re.compile("[\))\]】]")
  48. leftfinds = re.findall(leftSymbol_pattern,data)
  49. rightfinds = re.findall(rightSymbol_pattern,data)
  50. result = data
  51. if len(leftfinds)+len(rightfinds)==0:
  52. return data
  53. elif len(leftfinds)==len(rightfinds):
  54. return data
  55. elif abs(len(leftfinds)-len(rightfinds))==1:
  56. if len(leftfinds)>len(rightfinds):
  57. if symbol_dict.get(data[0]) is not None:
  58. result = data[1:]
  59. else:
  60. #print(symbol_dict.get(leftfinds[0]))
  61. result = data+symbol_dict.get(leftfinds[0])
  62. else:
  63. if symbol_dict.get(data[-1]) is not None:
  64. result = data[:-1]
  65. else:
  66. result = symbol_dict.get(rightfinds[0])+data
  67. return result
  68. def getConnection_postgres(self,db):
  69. conn = psycopg2.connect(dbname=db,user="postgres",password="postgres",host="192.168.2.103")
  70. return conn
  71. def label2interface(self,list_anno,Htext):
  72. dict_result = {}
  73. dict_anno = {}
  74. for _anno in list_anno:
  75. value = _anno["value"]
  76. _split = value.split("\t")
  77. if _split[0][0]=="T":
  78. _type,_begin,_end = _split[1].split(" ")
  79. dict_anno[_split[0]] = {"id":_split[0],"type":_type,"text":_split[2],"begin":int(_begin),"end":int(_end)}
  80. elif _split[0][0]=="R":
  81. _type,arg1,arg2 = _split[1].split(" ")
  82. dict_anno[_split[0]] = {"id":_split[0],"type":_type,"arg1":arg1.split(":")[1],"arg2":arg2.split(":")[1]}
  83. dict_role = {}
  84. dict_money = {}
  85. dict_person2role = {}
  86. dict_name_freq_score = {}
  87. pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
  88. for k,v in dict_anno.items():
  89. if v.get("type") in ["code","product","person_review"]:
  90. if v.get("type") not in dict_result:
  91. dict_result[v.get("type")] = []
  92. dict_result[v.get("type")].append(v.get("text"))
  93. dict_result[v.get("type")] = list(set(dict_result[v.get("type")]))
  94. if v.get("type") in ["name","bidway","moneysource","serviceTime","time_release","time_bidopen","time_bidclose"]:
  95. if v.get("type")=="name":
  96. _name = self.fitDataByRule(v.get("text"))
  97. w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]', _name)!=None else 0.5
  98. if _name not in dict_name_freq_score:
  99. # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
  100. dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05)*w]
  101. else:
  102. dict_name_freq_score[_name][0] += 1
  103. max_score = 0
  104. for _k1,_v1 in dict_name_freq_score.items():
  105. if _v1[0]*_v1[1]>max_score:
  106. max_score = _v1[0]*_v1[1]
  107. dict_result[v.get("type")] = _k1
  108. if v.get("type") not in dict_result:
  109. if v.get("type") in ["time_release","time_bidopen","time_bidclose"]:
  110. _t = timeFormat(v.get("text"))
  111. else:
  112. _t = v.get("text")
  113. dict_result[v.get("type")] = _t
  114. if v.get("type")=="bidway":
  115. dict_result[v.get("type")] = bidway_integrate(v.get("text"))
  116. _split = v.get("type").split("_")
  117. if len(_split)>1:
  118. if _split[1]=="tenderee":
  119. dict_role["tenderee"] = {"subject":v.get("text")}
  120. if _split[1]=="agency":
  121. dict_role["agency"] = {"subject":v.get("text")}
  122. if _split[1]=="tenderer":
  123. dict_role["tenderer"] = {"subject":v.get("text")}
  124. if _split[1]=="secondTenderer":
  125. dict_role["secondTenderer"] = {"subject":v.get("text")}
  126. if _split[1]=="thirdTenderer":
  127. dict_role["thirdTenderer"] = {"subject":v.get("text")}
  128. tendereeMoney = 0
  129. for k,v in dict_anno.items():
  130. _split = v.get("type").split("_")
  131. if v.get("type") in ["money_tendereeMoney"]:
  132. _before_text = Htext[max(v["begin"]-10,0):v["begin"]]
  133. if re.search('总投资|投资总额|总预算|总概算|投资规模|投资|工程造价', _before_text):
  134. continue
  135. if re.search("万",_before_text) is not None and re.search("整",_before_text) is None:
  136. _unit = 10000
  137. else:
  138. _unit = 1
  139. tendereeMoney = float(getUnifyMoney(v["text"])*_unit)
  140. if v.get("type") in ["rel_tendereeMoney","rel_tendererMoney"]:
  141. arg1 = v.get("arg1")
  142. arg2 = v.get("arg2")
  143. for _k,_v in dict_role.items():
  144. if _v["subject"]==dict_anno[arg1]["text"]:
  145. _before_text = Htext[max(dict_anno[arg2]["begin"]-10,0):dict_anno[arg2]["begin"]]
  146. if re.search('总投资|投资总额|总预算|总概算|投资规模|投资|工程造价', _before_text):
  147. continue
  148. if re.search("万",_before_text) is not None and re.search("整",_before_text) is None:
  149. _unit = 10000
  150. else:
  151. _unit = 1
  152. _v["money"] = float(getUnifyMoney(dict_anno[arg2]["text"])*_unit)
  153. if v.get("type")=="person_tendereePerson":
  154. if "tenderee" in dict_role:
  155. if "person" not in dict_role["tenderee"]:
  156. dict_role["tenderee"]["person"] = []
  157. dict_role["tenderee"]["person"].append({"person":v["text"]})
  158. if v.get("type")=="person_agencyPerson":
  159. if "agency" in dict_role:
  160. if "person" not in dict_role["agency"]:
  161. dict_role["agency"]["person"] = []
  162. dict_role["agency"]["person"].append({"person":v["text"]})
  163. if v.get("type")=="rel_person":
  164. arg1 = v.get("arg1")
  165. arg2 = v.get("arg2")
  166. for _k,_v in dict_role.items():
  167. if _v["subject"]==dict_anno[arg1]["text"]:
  168. if "person" not in dict_role[_k]:
  169. dict_role[_k]["person"] = []
  170. dict_role[_k]["person"].append({"person":dict_anno[arg2]["text"]})
  171. dict_person2role[dict_anno[arg2]["text"]] = _k
  172. for k,v in dict_anno.items():
  173. if v.get("type")=="rel_phone":
  174. arg1 = v.get("arg1")
  175. arg2 = v.get("arg2")
  176. _person = dict_anno[arg1]["text"]
  177. if _person in dict_person2role:
  178. for item in dict_role[dict_person2role[_person]]["person"]:
  179. if item["person"]==_person:
  180. item["phone"] = dict_anno[arg2]["text"]
  181. roleList = []
  182. for k,v in dict_role.items():
  183. if k=="tenderee":
  184. _role = "tenderee"
  185. if k=="agency":
  186. _role = "agency"
  187. if k=="tenderer":
  188. _role = "win_tenderer"
  189. if k=="secondTenderer":
  190. _role = "second_tenderer"
  191. if k=="thirdTenderer":
  192. _role = "third_tenderer"
  193. list_person = []
  194. set_person = set()
  195. for item in v.get("person",[]):
  196. if item["person"] not in set_person:
  197. list_person.append([item["person"],item.get("phone","")])
  198. set_person.add(item["person"])
  199. roleList.append([_role,v.get("subject","").replace("(","(").replace(")",")"),v.get("money",0),list_person,""])
  200. dict_result["prem"] = {"Project":{"roleList":roleList,"tendereeMoney":tendereeMoney}}
  201. return dict_result
  202. def culExtractMetrics(self):
  203. conn = self.conn1
  204. cursor = conn.cursor()
  205. sql = ' select begin_time,end_time,"user",doc_count from corpus_payroll where end_time<=\'2021-07-25\' order by end_time desc limit 20'
  206. cursor.execute(sql)
  207. list_diff = []
  208. rows_payroll = cursor.fetchall()
  209. for _payroll in rows_payroll:
  210. _begin_time = _payroll[0]
  211. _end_time = _payroll[1]
  212. _user = _payroll[2]
  213. doc_count = _payroll[3]
  214. print(_user,_begin_time,_end_time,doc_count)
  215. _sql = "select document_id,value from brat_bratannotation where document_id in (select human_identifier from corpus_iedocument where edituser='%s' and to_char(edittime,'yyyy-mm-dd')>='%s' and to_char(edittime,'yyyy-mm-dd')<='%s' limit 100) order by document_id"%(_user,_begin_time,_end_time)
  216. cursor.execute(_sql)
  217. rows = cursor.fetchall()
  218. if len(rows)>0:
  219. current_docid = rows[0][0]
  220. _index = -1
  221. list_values = []
  222. while _index<len(rows)-1:
  223. _index += 1
  224. row = rows[_index]
  225. document_id = row[0]
  226. value = row[1]
  227. if document_id!=current_docid:
  228. print(current_docid)
  229. sql = "select text from corpus_iedocument where human_identifier='%s'"%(str(current_docid))
  230. cursor.execute(sql)
  231. content = cursor.fetchall()[0][0]
  232. _inter = self.label2interface(list_values,content)
  233. _inter2 = self.extractFromInterface(content)
  234. if not len(_inter2.get("prem").keys())>1:
  235. _diff = self.getDiff(_inter,_inter2)
  236. list_diff.append(_diff)
  237. _index -= 1
  238. current_docid = document_id
  239. list_values = []
  240. else:
  241. list_values.append({"document_id":document_id,"value":value})
  242. metrics = self.getMetrics(list_diff)
  243. print(metrics)
  244. def extractFromInterface(self,content):
  245. _json = test("",content)
  246. return json.loads(_json)
  247. def getDiff(self,_inter,_inter2):
  248. _dict = {}
  249. for k in ["code","product","person_review"]:
  250. set_k1 = _inter.get(k,set())
  251. set_k2 = _inter2.get(k,set())
  252. _dict["%s_inter"%k] = len(set_k1)
  253. _dict["%s_inter2"%k] = len(set_k2)
  254. _dict["%s_union"%k] = len(set(set_k1)&set(set_k2))
  255. for k in ["name","bidway","moneysource","serviceTime","time_release","time_bidopen","time_bidclose"]:
  256. _k1 = _inter.get(k,"")
  257. _k2 = _inter2.get(k,"")
  258. len_k1 = 0 if _k1=="" else 1
  259. len_k2 = 0 if _k2=="" else 1
  260. if k in ["name","serviceTime"]:
  261. _score = jaccard_score(_k1,_k2)
  262. if len_k1 and len_k2 and _score>0.9:
  263. len_union = 1
  264. else:
  265. len_union = 0
  266. else:
  267. len_union = 1 if _k1==_k2 and len_k1==1 else 0
  268. _dict["%s_inter"%k] = len_k1
  269. _dict["%s_inter2"%k] = len_k2
  270. _dict["%s_union"%k] = len_union
  271. dict_project = {}
  272. for k,v in _inter.get("prem",{}).items():
  273. if float(v.get("tendereeMoney",0))>0:
  274. dict_project["%s_inter"%("tendereeMoney")] = [float(v.get("tendereeMoney"))]
  275. for _role in v.get("roleList",[]):
  276. dict_project["%s_inter"%_role[0]] = [_role[1]]
  277. if _role[0] in ["win_tenderer","second_tenderer","third_tenderer"]:
  278. if float(_role[2])>0:
  279. dict_project["%s_money_inter"%_role[0]] = [float(_role[2])]
  280. for item in _role[3]:
  281. _person = item[0]
  282. _phone = item[1]
  283. if _person=="" or _phone=="":
  284. continue
  285. if "%s_person_inter"%_role[0] not in dict_project:
  286. dict_project["%s_person_inter"%_role[0]] = []
  287. dict_project["%s_person_inter"%_role[0]].append("%s-%s"%(_role[1],_person))
  288. if "person_phone_inter" not in dict_project:
  289. dict_project["person_phone_inter"] = []
  290. dict_project["person_phone_inter"].append("%s-%s"%(_person,_phone))
  291. for k,v in _inter2.get("prem",{}).items():
  292. if float(v.get("tendereeMoney",0))>0:
  293. dict_project["%s_inter2"%("tendereeMoney")] = [float(v.get("tendereeMoney"))]
  294. for _role in v.get("roleList",[]):
  295. dict_project["%s_inter2"%_role.get("role_type")] = [_role.get("role_text")]
  296. if _role.get("role_type") in ["win_tenderer","second_tenderer","third_tenderer"]:
  297. if float(_role.get("role_money").get("money",0))>0:
  298. dict_project["%s_money_inter2"%_role.get("role_type")] = [float(_role.get("role_money").get("money",0))]
  299. for item in _role.get("linklist"):
  300. _person = item[0]
  301. _phone = item[1]
  302. if _person=="" or _phone=="":
  303. continue
  304. if "%s_person_inter2"%_role.get("role_type") not in dict_project:
  305. dict_project["%s_person_inter2"%_role.get("role_type")] = []
  306. dict_project["%s_person_inter2"%_role.get("role_type")].append("%s-%s"%(_role.get("role_text"),_person))
  307. if "person_phone_inter2" not in dict_project:
  308. dict_project["person_phone_inter2"] = []
  309. dict_project["person_phone_inter2"].append("%s-%s"%(_person,_phone))
  310. set_k = set()
  311. for k,v in dict_project.items():
  312. k_split = k.split("_")
  313. base_key = "_".join(k_split[:-1])
  314. if k_split[-1]=="inter":
  315. k2 = "inter2"
  316. else:
  317. k2 = "inter"
  318. if base_key in set_k:
  319. continue
  320. k_other = "%s_%s"%(base_key,k2)
  321. _dict[k] = len(v)
  322. _dict[k_other] = len(dict_project.get(k_other,[]))
  323. if base_key=="tenderee":
  324. _dict["%s_union"%base_key] = 0
  325. if _dict[k]>0 and _dict[k_other]>0:
  326. _score = jaccard_score(dict_project.get(k),dict_project.get(k_other))
  327. if _score>0.9:
  328. _dict["%s_union"%base_key] = 1
  329. else:
  330. #通过规则召回的也算
  331. if dict_project.get("%s_%s"%(base_key,"inter")) is None and dict_project.get("%s_%s"%(base_key,"inter2")) is not None:
  332. _dict[k] = 1
  333. _dict[k_other] = 1
  334. _dict["%s_union"%base_key] = 1
  335. else:
  336. _dict["%s_union"%base_key] = len(set(v)&set(dict_project.get(k_other,[])))
  337. set_k.add(base_key)
  338. print("=========================")
  339. print(_inter)
  340. print("-----")
  341. print(_inter2)
  342. print("|||||")
  343. print(_dict)
  344. return _dict
  345. def getMetrics(self,list_diff):
  346. dict_key_count = {}
  347. # print("all_count:",list_diff)
  348. for _diff in list_diff:
  349. for k,v in _diff.items():
  350. if k not in dict_key_count:
  351. dict_key_count[k] = 0
  352. dict_key_count[k] += v
  353. set_k = set()
  354. for k,v in dict_key_count.items():
  355. k_split = k.split("_")
  356. base_k = "_".join(k_split[:-1])
  357. if base_k in set_k:
  358. continue
  359. set_k.add(base_k)
  360. _count_inter = max(dict_key_count.get("%s_inter"%base_k,-1),1)
  361. _count_inter2 = max(dict_key_count.get("%s_inter2"%base_k,-1),1)
  362. _count_union = dict_key_count.get("%s_union"%base_k,0)
  363. _precision = _count_union/_count_inter2
  364. _recall = _count_union/_count_inter
  365. _f1 = 2*(_precision*_recall)/(_precision+_recall)
  366. print("%s: recall:%.3f,precision:%.3f,f1_score:%.3f"%(base_k,_recall,_precision,_f1))
  367. print(base_k)
  368. print("%.3f"%_f1)
  369. print("%.3f"%_precision)
  370. print("%.3f"%_recall)
  371. if __name__=="__main__":
  372. em = ExtractMetric()
  373. em.culExtractMetrics()