extract.py 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  1. '''
  2. Created on 2019年1月4日
  3. @author: User
  4. '''
  5. import os
  6. from bs4 import BeautifulSoup, Comment
  7. import copy
  8. import re
  9. import sys
  10. import os
  11. import codecs
  12. import requests
  13. import time
  14. _time1 = time.time()
  15. sys.path.append(os.path.abspath("../.."))
  16. from BiddingKG.dl.common.Utils import *
  17. import BiddingKG.dl.entityLink.entityLink as entityLink
  18. import BiddingKG.dl.interface.predictor as predictor
  19. import BiddingKG.dl.interface.Preprocessing as Preprocessing
  20. import BiddingKG.dl.interface.getAttributes as getAttributes
  21. import BiddingKG.dl.complaint.punish_predictor as punish_rule
  22. import json
  23. from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
  24. from BiddingKG.dl.ratio.re_ratio import extract_ratio
  25. # 自定义jsonEncoder
  26. class MyEncoder(json.JSONEncoder):
  27. def default(self, obj):
  28. if isinstance(obj, np.ndarray):
  29. return obj.tolist()
  30. elif isinstance(obj, bytes):
  31. return str(obj, encoding='utf-8')
  32. elif isinstance(obj, (np.float_, np.float16, np.float32,
  33. np.float64)):
  34. return float(obj)
  35. elif isinstance(obj,str):
  36. return obj
  37. return json.JSONEncoder.default(self, obj)
def predict(doc_id,text,title="",page_time="",**kwargs):
    """Run the extraction pipeline on one document and return the result as JSON text.

    The document is preprocessed once, then a fixed sequence of predictors is
    run over the preprocessed articles/sentences/entities. The wall-clock time
    of each stage (seconds, rounded to 2 decimals) is recorded in ``cost_time``.

    Args:
        doc_id: Document identifier; used in log messages and forwarded to the
            preprocessing step and to the "product_attrs" predictor.
        text: Document content, forwarded to preprocessing and to the
            "product_attrs" predictor.
        title: Document title; forwarded to preprocessing and the "channel"
            predictor, and matched against keyword regexes below.
        page_time: Forwarded to preprocessing and the "product_attrs" predictor.
        **kwargs: Accepted but not read anywhere in this function.

    Returns:
        str: ``json.dumps`` output (``MyEncoder``, sorted keys, indent 4,
        non-ASCII kept) of the merged stage results plus the keys
        ``"cost_time"`` and ``"success"`` (always ``True`` when reached).
    """
    cost_time = dict()

    start_time = time.time()
    log("start process doc %s"%(str(doc_id)))
    list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time]],useselffool=True)
    log("get preprocessed done of doc_id%s"%(doc_id))
    cost_time["preprocess"] = round(time.time()-start_time,2)
    # Merge the timings reported by the preprocessing step itself.
    cost_time.update(_cost_time)

    # Depends on sentence order.
    start_time = time.time() # announcement type / life-cycle extraction
    list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
    cost_time["channel"] = round(time.time()-start_time,2)

    start_time = time.time() # project code and project name extraction
    codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys)
    log("get codename done of doc_id%s"%(doc_id))
    cost_time["codename"] = round(time.time()-start_time,2)

    # The return value is discarded here and in several stages below;
    # presumably these predictors annotate list_entitys in place -- TODO confirm.
    start_time = time.time() # role / money model extraction
    predictor.getPredictor("prem").predict(list_sentences,list_entitys)
    log("get prem done of doc_id%s"%(doc_id))
    cost_time["prem"] = round(time.time()-start_time,2)

    start_time = time.time() # product name and failed-bid reason extraction
    # True when the channel stage labelled the document "废标公告".
    fail = list_channel_dic[0]['docchannel'] == "废标公告"
    fail_reason = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) # returns only the failure reason; products have already been added to the Entity class
    # predictor.getPredictor("product").predict(list_sentences, list_entitys)
    log("get product done of doc_id%s"%(doc_id))
    cost_time["product"] = round(time.time()-start_time,2)

    start_time = time.time() # regex extraction of product-related elements: unit price, quantity, brand/spec; project, demand, budget, time
    product_attrs = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
    log("get product attributes done of doc_id%s"%(doc_id))
    cost_time["product_attrs"] = round(time.time()-start_time,2)

    start_time = time.time() # rule-based (regex) role extraction
    predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName)
    cost_time["rule"] = round(time.time()-start_time,2)

    # The bare string below is a no-op statement used as a note. Rough English:
    # "regex supplement: an entity in the last sentence in entity+date format
    # is taken as tenderee or agency, 2021/12/30".
    '''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
    start_time = time.time() # rule-based (regex) role extraction
    predictor.getPredictor("roleRuleFinal").predict(list_articles, list_entitys)
    cost_time["roleRuleFinal"] = round(time.time()-start_time,2)

    start_time = time.time() # contact person model extraction
    predictor.getPredictor("epc").predict(list_sentences,list_entitys)
    log("get epc done of doc_id%s"%(doc_id))
    # Note: the "epc" stage is timed under the key "person".
    cost_time["person"] = round(time.time()-start_time,2)

    start_time = time.time() # time category extraction
    predictor.getPredictor("time").predict(list_sentences, list_entitys)
    log("get time done of doc_id%s"%(doc_id))
    cost_time["time"] = round(time.time()-start_time,2)

    start_time = time.time() # deposit payment method
    payment_way_dic = predictor.getPredictor("deposit_payment_way").predict(content=list_articles[0].content)
    cost_time["deposit"] = round(time.time()-start_time,2)

    # Must run after getPredictor("prem") and before getAttributes.getPREMs.
    # Applies only when the title contains exactly one of the four service
    # keywords and none of 施工/总承包/epc/EPC, so the single hit is necessarily
    # one of 监理/设计/勘察 and the second search below cannot return None.
    if len(re.findall('监理|施工|设计|勘察', title))==1 and re.search('施工|总承包|epc|EPC',title)==None:
        keyword = re.search('监理|设计|勘察', title).group(0)
        for list_entity in list_entitys:
            for _entity in list_entity:
                # print('keyword:',keyword, '_entity.notes :',_entity.notes)
                if _entity.entity_type == "money" and _entity.notes == keyword and _entity.label==2:
                    # NOTE(review): 0.51 looks like a score set just above a 0.5
                    # threshold for the chosen label index -- confirm against
                    # the Entity class.
                    if list_channel_dic[0]['docchannel'] == "招标公告":
                        _entity.values[0] = 0.51
                        _entity.set_Money(0, _entity.values) # 2021/11/18: relabel the fee as tender amount or winning/bid amount depending on the announcement type
                    else:
                        _entity.values[1] = 0.51
                        _entity.set_Money(1, _entity.values)

    # Added 2021-12-29: extract total price and unit price.
    start_time = time.time() # total price / unit price extraction
    predictor.getPredictor("total_unit_money").predict(list_sentences, list_entitys)
    cost_time["total_unit_money"] = round(time.time()-start_time, 2)

    # Depends on sentence order.
    start_time = time.time() # entity linking
    entityLink.link_entitys(list_entitys)
    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
    log("get attributes done of doc_id%s"%(doc_id))
    cost_time["attrs"] = round(time.time()-start_time,2)

    start_time = time.time() # punishment (dishonesty record) element extraction
    list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
    cost_time["punish"] = round(time.time()-start_time,2)

    if len(product_attrs[1]['demand_info']['data'])>0:
        for d in product_attrs[1]['demand_info']['data']:
            for product in set(prem[0]['product']):
                if product in d['project_name']:
                    d['product'].append(product) # add products that occur in the project name to this demand entry

    # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
    # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
    # list_punish_dic is computed above but is not part of the merged result.
    # NOTE(review): dict(mapping, **a, **b) needs string keys and raises
    # TypeError if two of the **-expanded mappings share a key.
    data_res = dict(codeName[0], **prem[0], **list_channel_dic[0], **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason)
    data_res["cost_time"] = cost_time
    data_res["success"] = True

    # for _article in list_articles:
    #     log(_article.content)
    #
    # for list_entity in list_entitys:
    #     for _entity in list_entity:
    #         log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
    #               (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
    #                str(_entity.begin_index),str(_entity.end_index)))

    return json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
  131. def test(name,content):
  132. user = {
  133. "content": content,
  134. "id":name
  135. }
  136. myheaders = {'Content-Type': 'application/json'}
  137. _resp = requests.post("http://192.168.2.102:15030" + '/article_extract', json=user, headers=myheaders, verify=True)
  138. resp_json = _resp.content.decode("utf-8")
  139. # print(resp_json)
  140. return resp_json
  141. if __name__=="__main__":
  142. import pandas as pd
  143. t1 = time.time()
  144. # text = '中标人:广州中医药有限公司,招标人:广州市第一人民医院, 代理机构:希达招标代理有限公司。招标金额:100万元, 手续费:100元,总投资:1亿元。中标金额:50000元。合同金额:50000万元。'
  145. title = '打印机'
  146. # df = pd.read_excel('E:/公告金额/产品名称采购需求预算金额采购时间等要素公告.xlsx')
  147. # # df = pd.read_excel('E:/公告金额/产品数量单价.xlsx')
  148. # for i in range(30,50,1):
  149. # text = df.loc[i, 'dochtmlcon']
  150. # rs = json.loads(predict('', text, ''))
  151. # print(rs['demand_info'])
  152. # print(rs['product'])
  153. # print(rs['product_attrs'])
  154. # print(rs)
  155. with open('D:/html/2.html', 'r', encoding='utf-8') as f:
  156. text = f.read()
  157. t1 = time.time()
  158. print(predict('', text, title))
  159. t2 = time.time()
  160. print(predict('', text, title))
  161. t3 = time.time()
  162. print('第一次耗时:%.4f, 第二次耗时:%.4f'%(t2-t1, t3-t2))
  163. # print(predict('',text,title))
  164. # df = pd.read_excel('E:/大网站规则识别/大网站要素提取结果2.xlsx')[:]
  165. # df = pd.read_excel('/data/python/lsm/datas_biddingkg/大网站要素提取结果20211115_2.xlsx')[:]
  166. # new_prem = []
  167. # for i in range(len(df)):
  168. # i = 530
  169. # doc_id = df.loc[i, 'docid']
  170. # text = df.loc[i, 'html']
  171. # # title = df.loc[i, 'doctitle']
  172. # rs = predict(doc_id,text)
  173. # rs = json.loads(rs)
  174. # prem = json.dumps(rs['prem'], ensure_ascii=False)
  175. # # print(rs)
  176. # new_prem.append(prem)
  177. # print(prem)
  178. # break
  179. # df['new_prem'] = pd.Series(new_prem)
  180. # print('耗时:', time.time()-t1)
  181. # # df.to_excel('E:/大网站规则识别/大网站要素提取结果20211115.xlsx')
  182. # df.to_excel('/data/python/lsm/datas_biddingkg/大网站要素提取结果20211115.xlsx')
  183. # # pass