测试整个要素提取流程.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374
'''
Created on 2019-01-04 (original: 2019年1月4日)
@author: User

Test harness for the whole bid-document feature-extraction pipeline
(BiddingKG): loads documents, runs the predictor chain and prints results.
'''
from bs4 import BeautifulSoup, Comment
import copy
import re
import sys
import os
import codecs
import requests
import time
# Taken at import time; used in __main__ to estimate model-initialisation cost.
_time1 = time.time()
# Make the project root importable (both lines append the same resolved path).
sys.path.append(os.path.abspath("../.."))
sys.path.append(os.path.abspath('../../'))
print('当前路径为:',os.getcwd())
print('sys.path',sys.path)
import fool
from BiddingKG.dl.interface.Connection import *
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.interface.Connection import getConnection
import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing
import BiddingKG.dl.interface.getAttributes as getAttributes
import BiddingKG.dl.entityLink.entityLink as entityLink
import json
'''
Dead code kept for reference: direct postgres fetch of one article.
doc_id = '3537ed94-9c7e-11e8-aed5-44a84246dbba'
conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
cursor = conn.cursor()
cursor.execute(" select content from articles where id='"+doc_id+"' ")
row = cursor.fetchall()[0]
#text = codecs.open("C:\\Users\\User\\Desktop\\1.html","r",encoding="utf8").read()
#content = segment(tableToText(BeautifulSoup(row[0],"lxml")))
'''
''''''
# Instantiate the predictors once at module import (model loading is slow);
# they are shared global state used by predict() and predict_fromdb().
# NOTE(review): per-predictor roles inferred from names — confirm against
# BiddingKG.dl.interface.predictor.
codeNamePredict = predictor.CodeNamePredict()    # presumably project code/name extraction
premPredict = predictor.PREMPredict()            # presumably price/role entity model
epcPredict = predictor.EPCPredict()              # presumably contact (person/phone) model
roleRulePredict = predictor.RoleRulePredictor()  # presumably rule-based role correction
  41. #自定义jsonEncoder
  42. class MyEncoder(json.JSONEncoder):
  43. def default(self, obj):
  44. if isinstance(obj, np.ndarray):
  45. return obj.tolist()
  46. elif isinstance(obj, bytes):
  47. return str(obj, encoding='utf-8')
  48. elif isinstance(obj, (np.float_, np.float16, np.float32,
  49. np.float64)):
  50. return float(obj)
  51. elif isinstance(obj,str):
  52. return obj
  53. return json.JSONEncoder.default(self, obj)
def predict(doc_id, text):
    """Run the full extraction pipeline on one document and return JSON.

    NOTE(review): each predictor appears to mutate list_entitys in place and
    later stages seem to depend on earlier labels — confirm before reordering.

    :param doc_id: document identifier
    :param text: raw document content (HTML or plain text)
    :return: JSON string of union_result(codeName, prem)[0][1]
    """
    list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[doc_id,text,"","",""]],useselffool=True)
    # for articles in list_articles:
    #     print(articles.content)  # preprocessed article text
    ''''''
    codeName = codeNamePredict.predict(list_sentences,list_entitys=list_entitys)
    # print(codeName)
    premPredict.predict(list_sentences,list_entitys)
    roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
    # print("epcPredict")
    epcPredict.predict(list_sentences,list_entitys)
    # print("entityLink")
    entityLink.link_entitys(list_entitys)
    # print("getPREMs")
    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
    # print("getPREMs")
    ''''''
    # Debug dump: print org/company/person entities with predicted label and
    # that label's score (it[3][it[2]]), plus sentence/offset positions.
    entitys_all = [[[entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.begin_index,entity.end_index] for entity in entitys] for entitys in list_entitys]
    for entitys in entitys_all:
        # print(entitys)
        # en_types = set([it[1] for it in entitys])
        print([(it[0],it[1], it[2],it[3][it[2]],it[4],it[5],it[6]) for it in entitys if it[1] in ('org', 'company', 'person')])
        # print([it for it in entitys if it[1] in ('org','company','person')])
        # for en_type in en_types:
        #     print('***************************************')
        #     print(en_type)
        #     print([(it[0],it[2],it[3]) for it in entitys if it[1]==en_type])
    # for entitys in list_entitys:
    #     for entity in entitys:
    #         print('********** entity info ****************')
    #         print(entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.wordOffset_begin,entity.wordOffset_end)
    # print(prem)
    return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
  88. # def test(name,content):
  89. # user = {
  90. # "content": content,
  91. # "id":name
  92. # }
  93. # myheaders = {'Content-Type': 'application/json'}
  94. # _resp = requests.post("http://192.168.2.101:15015" + '/article_extract', json=user, headers=myheaders, verify=True)
  95. # resp_json = _resp.content.decode("utf-8")
  96. # print(resp_json)
  97. # return resp_json
  98. def get_result_online(docid):
  99. import psycopg2
  100. conn = psycopg2.connect(dbname='iepy', user='postgres', password='postgres', host='192.168.2.101')
  101. cursor = conn.cursor()
  102. sql = """select human_identifier,sourcetext from corpus_iedocument where human_identifier in ('{0}');""".format(docid)
  103. cursor.execute(sql)
  104. rows = cursor.fetchall()
  105. user = {
  106. "content": rows[0][1],
  107. "id":docid
  108. }
  109. myheaders = {'Content-Type': 'application/json'}
  110. _resp = requests.post("http://192.168.2.101:15030" + '/article_extract', json=user, headers=myheaders, verify=True) # 15015 #最新模型15030
  111. resp_json = _resp.content.decode("utf-8")
  112. return json.loads(resp_json)
  113. def get_result(docid):
  114. import psycopg2
  115. conn = psycopg2.connect(dbname='iepy', user='postgres', password='postgres', host='192.168.2.101')
  116. cursor = conn.cursor()
  117. sql = """select human_identifier,sourcetext from corpus_iedocument where human_identifier in ('{0}');""".format(docid)
  118. cursor.execute(sql)
  119. rows = cursor.fetchall()
  120. return(json.loads(predict(docid, rows[0][1])))
def analys_person_phone():
    """Evaluate role-person and person-phone linking against labeled data.

    Reads an annotated Excel file, runs the local pipeline (get_result) per
    document, compares predicted "role+person" and "person+phone" pairs with
    the annotations, prints recall/precision, and returns the mismatches.

    :return: (person_errors, phone_errors, join_errors,
              role_name_errors, person_name_errors) — each a list of
              [docid, labeled_set, predicted_set]
    """
    import pandas as pd
    import time
    t1 = time.time()
    df = pd.read_excel(r'E:\workspace\BiddingKG\BiddingKG\dl\person\实习生标注信息角色联系人电话.xlsx', encoding='utf-8')
    # Counters: lab_* = labeled, pre_* = predicted, pos_* = correct (overlap).
    lab_num = pos_num = pre_num = 0           # role+person pairs
    lab_num2 = pos_num2 = pre_num2 = 0        # person+phone pairs
    lab_person = pos_person = pre_person = 0  # person names alone
    lab_role = pos_role = pre_role = 0        # role names alone
    person_errors = []
    phone_errors = []
    join_errors = []
    person_name_errors = []
    role_name_errors = []
    for docid in set(df['doc_id']):
        print('开始处理 : ', docid)
        df_tmp = df[df.loc[:, 'doc_id'] == docid]
        values = list(df_tmp['value'])
        # NOTE(review): each annotation value looks whitespace-separated as
        # [id, type, arg..., text] — confirm against the labeling tool export.
        a = [it.split() for it in values]
        rel_person = [it for it in a if it[1] == 'rel_person']  # role -> person relations
        rel_phone = [it for it in a if it[1] == 'rel_phone']    # person -> phone relations
        r1 = get_result(str(docid))
        # r1 = get_result_online(str(docid))
        label_role_person = []  # labeled "role+person" pairs
        for rel in rel_person:
            # rel[2]/rel[3] look like "ref:<id>"; resolve to the annotation row.
            role = [it for it in a if it[0] == rel[2].split(':')[-1]]
            person = [it for it in a if it[0] == rel[3].split(':')[-1]]
            if person != [] and role != []:
                label_role_person.append(role[0][-1] + '+' + person[0][-1])
        label_person_phone = []  # labeled "person+phone" pairs (original comment said role+person — copy-paste)
        for rel in rel_phone:
            person = [it for it in a if it[0] == rel[2].split(':')[-1]]
            phone = [it for it in a if it[0] == rel[3].split(':')[-1]]
            if person != [] and phone != []:
                label_person_phone.append(person[0][-1] + '+' + phone[0][-1])
        role_person = []
        person_phone = []
        if r1.get('success', '') == False:
            print(docid, '接口返回失败 ')
        else:
            for v in r1['prem'].values():
                roleList = v['roleList']
                # NOTE(review): role[1] appears to be the role name and role[3]
                # a list of (person, phone) tuples — verify against prem schema.
                for role in roleList:
                    for it in role[3]:
                        role_person.append(role[1] + '+' + it[0])
                for role in roleList:
                    for it in role[3]:
                        person_phone.append(it[0] + '+' + it[1])
        # print(set(label_person_phone))
        # print(set(person_phone))
        pos_num += len(set(role_person) & set(label_role_person))
        lab_num += len(set(label_role_person))
        pre_num += len(set(role_person))
        if set(role_person) & set(label_role_person) != set(label_role_person):
            person_errors.append([docid, set(label_role_person), set(role_person)])
        # Logic: first check whether predicted roles are all in the labels,
        # then whether predicted persons are in the labeled persons.
        # print(set(role_person))
        # print(set(label_role_person))
        if set(label_person_phone) & set(person_phone) != set(label_person_phone):
            phone_errors.append([docid, set(label_person_phone), set(person_phone)])
        pos_num2 += len(set(label_person_phone) & set(person_phone))
        lab_num2 += len(set(label_person_phone))
        pre_num2 += len(set(person_phone))
        # Person-only and role-only metrics (split each "role+person" pair).
        lab_person += len(set([it.split('+')[1] for it in label_role_person]))
        pos_person += len(set([it.split('+')[1] for it in label_role_person]) & set([it.split('+')[1] for it in role_person]))
        pre_person += len(set([it.split('+')[1] for it in role_person]))
        lab_role += len(set([it.split('+')[0] for it in label_role_person]))
        pos_role += len(set([it.split('+')[0] for it in label_role_person]) & set([it.split('+')[0] for it in role_person]))
        pre_role += len(set([it.split('+')[0] for it in role_person]))
        # Classify the mismatch: role name wrong, person name wrong, or the
        # pairing (join) wrong while both name sets match.
        if set([it.split('+')[0] for it in label_role_person]) != set([it.split('+')[0] for it in role_person]):
            if set([it.split('+')[1] for it in label_role_person]) != set([it.split('+')[1] for it in role_person]):
                person_name_errors.append([docid, set(label_role_person), set(role_person)])
            else:
                role_name_errors.append([docid, set(label_role_person), set(role_person)])
        else:
            if set([it.split('+')[1] for it in label_role_person]) != set([it.split('+')[1] for it in role_person]):
                person_name_errors.append([docid, set(label_role_person), set(role_person)])
            elif set(label_role_person) != set(role_person):
                print(docid, set(label_role_person), set(role_person))
                join_errors.append([docid, set(label_role_person), set(role_person)])
    # NOTE(review): raises ZeroDivisionError if a denominator is 0 (no labels
    # or no predictions at all) — unchanged from the original behavior.
    print('单独角色召回率:%.4f,准确率:%.4f' % (pos_role / lab_role, pos_role / pre_role))
    print('单独联系人召回率:%.4f, 准确率:%.4f' % (pos_person / lab_person, pos_person / pre_person))
    print('联系人召回率:%.4f, 准确率:%.4f' % (pos_num / lab_num, pos_num / pre_num))
    print('电话召回率:%.4f,准确率:%.4f' % (pos_num2 / lab_num2, pos_num2 / pre_num2))
    print('总耗时:', time.time() - t1)
    return person_errors, phone_errors, join_errors, role_name_errors, person_name_errors
  207. def predict_fromdb(docid, dbname="sys_document_23"):
  208. # import pymysql
  209. # conn = pymysql.Connect(host='rm-bp1quo50k0q2ok73gi.mysql.rds.aliyuncs.com', port=3306, db='bxkc', user='bxkc_read', passwd='bxkc_20RE18AD') #新账号密码
  210. # cursor = conn.cursor()
  211. # sql = "SELECT docid as id, dochtmlcon as content from {1} WHERE DOCID='{0}';".format(docid, dbname)
  212. import psycopg2
  213. conn = psycopg2.connect(dbname='iepy', user='postgres', password='postgres', host='192.168.2.101')
  214. cursor = conn.cursor()
  215. sql = """select human_identifier as id,sourcetext as content from corpus_iedocument where human_identifier in ('{0}');""".format(docid)
  216. cursor.execute(sql)
  217. rows = cursor.fetchall()
  218. doc_id = rows[0][0]
  219. text = rows[0][1]
  220. # text = '竟然很明显的表达没识别为代理,代理机构名称:国信国采(北京)招标咨询有限责任公司,代理机构地址:北京市海淀区首体南路22号国兴大厦11层, 1.采购人信息名 称:北京市植物园。'
  221. list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", ""]],useselffool=True)
  222. codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
  223. # print(codeName)
  224. premPredict.predict(list_sentences, list_entitys)
  225. roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
  226. # print("epcPredict")
  227. epcPredict.predict(list_sentences, list_entitys)
  228. # print("entityLink")
  229. entityLink.link_entitys(list_entitys)
  230. # print("getPREMs")
  231. prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
  232. return list_articles, list_sentences, list_entitys, codeName, prem
if __name__ == "__main__":
    # ------------------------------------------------------------------
    # Batch evaluation over sampled documents per web source (kept for
    # reference; read from the old MySQL store, write an Excel report).
    # ------------------------------------------------------------------
    # import pandas as pd
    # import math
    # import pymysql
    # conn = pymysql.Connect(host='rm-bp1quo50k0q2ok73gi.mysql.rds.aliyuncs.com', port=3306, db='bxkc', user='bxkc_read', passwd='bxkc_20RE18AD')  # new account credentials
    # cursor = conn.cursor()
    # df = pd.read_excel('G:/大网站规则识别/1027统计入库top100编号.xlsx')
    # docs_list = []
    # for i in range(100):
    #     web_no = df.loc[i, '编号']
    #     # num = math.ceil(int(df.loc[i, '1019-1023入库公告数量']) * 0.01)
    #     num = 10
    #     sql = "SELECT DOCID,DOCCHANNEL,DOCHTMLCON,WEB_SOURCE_NO from sys_document_23 where WEB_SOURCE_NO='{0}' and DOCCHANNEL='101' and DOCID%9=1 limit {1}".format(
    #         web_no, num)
    #     # rows = cursor.execute(sql)  # original note: this was a bug — rows must come from cursor.fetchall()
    #     cursor.execute(sql)
    #     rows = cursor.fetchall()
    #     docs_list.extend(list(rows))
    # df_doc = pd.DataFrame(docs_list, columns=['docid', 'channel', 'html', 'web_no'])
    # codenames = []
    # prems = []
    # for docid, text in zip(df_doc['docid'], df_doc['html']):
    #     list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[docid, text, "", "", ""]],
    #                                                                                     useselffool=True)
    #     codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
    #     # print(codeName)
    #     premPredict.predict(list_sentences, list_entitys)
    #     roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
    #     # print("epcPredict")
    #     epcPredict.predict(list_sentences, list_entitys)
    #     # print("entityLink")
    #     entityLink.link_entitys(list_entitys)
    #     # print("getPREMs")
    #     prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
    #     if codeName:
    #         codenames.append(codeName[0][1])
    #     else:
    #         codenames.append(" ")
    #     if prem:
    #         prems.append(prem[0][1])
    #     else:
    #         prems.append(" ")
    # df_doc['codename'] = pd.Series(codenames)
    # df_doc['prem'] = pd.Series(prems)
    # df_doc.to_excel('G:/大网站规则识别/大网站规则调整后预测结果20201124.xlsx', columns=['docid', 'channel', 'html', 'prem', 'codename', 'web_no'])

    # Live path: run the pipeline on one document from the corpus DB and
    # dump the extracted org/company/person entities.
    list_articles, list_sentences, list_entitys, codeName, prem = predict_fromdb('100006370', dbname="sys_document_25")  # sys_document_23
    print(prem)
    print(codeName)
    entitys_all = [[[entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.begin_index,entity.end_index] for entity in entitys] for entitys in list_entitys]
    for entitys in entitys_all:
        # print(entitys)
        # en_types = set([it[1] for it in entitys])
        print([(it[0],it[1], it[2],it[3][it[2]],it[4],it[5],it[6]) for it in entitys if it[1] in ('org', 'company', 'person')])
    print(list_articles[0].content)

    # --- other one-off experiments kept for reference ---
    # print(get_result('100000203'))
    # person_errors, phone_errors, join_errors, role_name_errors, person_name_errors = analys_person_phone()
    # import pickle
    # with open('phone_errors.pkl', 'wb') as f:
    #     pickle.dump(phone_errors, f)
    # filename = "比地_52_79929693.html"
    # # text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
    # # text = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
    # # text = codecs.open('F:/工作文档/实体识别实体对其/20190320/比地_101_58511386.html', encoding='utf-8').read()
    # docid = '100000203'
    # r1 = get_result(docid)
    # r2 = get_result_online(docid)
    # rolperson = []
    # person_phone = []
    # for v in r1['prem'].values():
    #     roleList = v['roleList']
    #     for role in roleList:
    #         for it in role[3]:
    #             rolperson.append(role[1] + it[0])
    #     for role in roleList:
    #         for it in role[3]:
    #             person_phone.append(it[0] + it[1])
    # print(r1['prem'])
    # print(r2['prem'])
    #
    # import psycopg2
    # conn = psycopg2.connect(dbname='iepy', user='postgres', password='postgres', host='192.168.2.101')
    # cursor = conn.cursor()
    # sql = """select human_identifier,sourcetext from corpus_iedocument where human_identifier in ('95008163');"""
    # cursor.execute(sql)
    # rows = cursor.fetchall()
    # # print(len(rows), rows)
    # content = rows[0][1]
    # # content = str(BeautifulSoup(text).find("div",id="pcontent"))
    # # content = text
    # # print('content: ', content)
    # # text = codecs.open("C:\\Users\\User\\Desktop\\a.html","r",encoding="utf8").read()
    # # text = "张家港保税区宏宇建设咨询有限公司受张家港市给排水公司委托,就二次供水泵房浊度仪进行国内组织公开招标采购,欢迎符合条件的供应商参加投标。"
    # a = time.time()
    # print("start")
    # # print(predict("12", content))
    # result = predict("12", content)
    # print(json.loads(result))
    # # test("12", text)
    # print("takes", time.time() - a)
    # _time2 = time.time()
    # print(predict("12", content))
    # _time3 = time.time()
    # print("init takes:%d" % ((_time2 - _time1) - (_time3 - _time2)))
    # pass