extract.py

'''
Created on 2019-01-04
@author: User
'''
import os
import sys
import json
import time

import numpy as np
import requests

_time1 = time.time()  # module load start time
sys.path.append(os.path.abspath("../.."))

from BiddingKG.dl.common.Utils import *  # supplies log() used below, among other helpers
import BiddingKG.dl.entityLink.entityLink as entityLink
import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing
import BiddingKG.dl.interface.getAttributes as getAttributes
import BiddingKG.dl.complaint.punish_predictor as punish_rule
# Custom JSON encoder: makes numpy arrays/floats and bytes serializable
class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
            return float(obj)
        elif isinstance(obj, str):
            return obj
        return json.JSONEncoder.default(self, obj)
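
# A minimal sketch of MyEncoder in use (assumes numpy values in the payload;
# np.float32 is not a float subclass, so it goes through default() above):
#     json.dumps({"score": np.float32(0.5), "vec": np.array([1, 2])}, cls=MyEncoder)
#     -> '{"score": 0.5, "vec": [1, 2]}'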
def predict(doc_id, text, title=""):
    cost_time = dict()
    start_time = time.time()
    log("start process doc %s" % (str(doc_id)))

    # Preprocess: split the article into sentences and extract raw entities
    list_articles, list_sentences, list_entitys, _cost_time = Preprocessing.get_preprocessed(
        [[doc_id, text, "", "", title]], useselffool=True)
    log("get preprocessed done of doc_id %s" % (doc_id))
    cost_time["preprocess"] = time.time() - start_time
    cost_time.update(_cost_time)

    # Project code and name extraction
    start_time = time.time()
    codeName = predictor.getPredictor("codeName").predict(list_sentences, MAX_AREA=5000, list_entitys=list_entitys)
    log("get codename done of doc_id %s" % (doc_id))
    cost_time["codename"] = time.time() - start_time

    # Role/money ("prem") extraction
    start_time = time.time()
    predictor.getPredictor("prem").predict(list_sentences, list_entitys)
    log("get prem done of doc_id %s" % (doc_id))
    cost_time["prem"] = time.time() - start_time

    # Product extraction
    start_time = time.time()
    predictor.getPredictor("product").predict(list_sentences, list_entitys)
    log("get product done of doc_id %s" % (doc_id))
    cost_time["product"] = time.time() - start_time

    # Rule-based role correction
    start_time = time.time()
    predictor.getPredictor("roleRule").predict(list_articles, list_sentences, list_entitys, codeName)
    cost_time["rule"] = time.time() - start_time

    # Contact person extraction
    start_time = time.time()
    predictor.getPredictor("epc").predict(list_sentences, list_entitys)
    log("get epc done of doc_id %s" % (doc_id))
    cost_time["person"] = time.time() - start_time

    # Time/date extraction
    start_time = time.time()
    predictor.getPredictor("time").predict(list_sentences, list_entitys)
    log("get time done of doc_id %s" % (doc_id))
    cost_time["time"] = time.time() - start_time

    # Link duplicate entities, then assemble role/money attributes
    start_time = time.time()
    entityLink.link_entitys(list_entitys)
    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
    log("get attributes done of doc_id %s" % (doc_id))
    cost_time["attrs"] = time.time() - start_time

    # Punishment extraction
    start_time = time.time()
    list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles, list_sentences, list_entitys)
    cost_time["punish"] = time.time() - start_time

    # Document channel classification
    start_time = time.time()
    list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
    cost_time["channel"] = time.time() - start_time

    # Merge all partial results into a single response dict
    data_res = Preprocessing.union_result(
        Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic),
        list_channel_dic)[0]
    data_res["cost_time"] = cost_time
    data_res["success"] = True

    # Debug output: article content and every extracted entity
    for _article in list_articles:
        log(_article.content)
    for list_entity in list_entitys:
        for _entity in list_entity:
            log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s" %
                (str(_entity.entity_type), str(_entity.entity_text), str(_entity.label), str(_entity.values),
                 str(_entity.sentence_index), str(_entity.begin_index), str(_entity.end_index)))

    return json.dumps(data_res, cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)
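
# A minimal direct call, assuming the BiddingKG models are loadable from this
# checkout (doc id, HTML content and title below are placeholders):
#     result_json = predict("demo-001", "<div>...announcement body...</div>", title="demo title")
#     print(result_json)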
def test(name, content):
    user = {
        "content": content,
        "id": name
    }
    myheaders = {'Content-Type': 'application/json'}
    _resp = requests.post("http://192.168.2.101:15015" + '/article_extract', json=user, headers=myheaders, verify=True)
    resp_json = _resp.content.decode("utf-8")
    print(resp_json)
    return resp_json
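
# Example round-trip against a deployed extraction service (assumes the host
# 192.168.2.101:15015 above is reachable from this machine):
#     test("demo-001", "<div>...announcement body...</div>")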
if __name__ == "__main__":
    # Batch re-prediction over a labelled spreadsheet; results are written
    # back as a new column and saved to a new file
    import pandas as pd

    df = pd.read_excel('G:\公告金额/170角色金额原模型预测错误数据_new3为新预测中标金额_predict0812.xlsx')
    new_prem = []
    for i in range(len(df)):
        doc_id = df.loc[i, 'docid']
        text = df.loc[i, 'dochtmlcon']
        title = df.loc[i, 'doctitle']
        rs = predict(doc_id, text, title)
        new_prem.append(rs)
    df['new_prem'] = pd.Series(new_prem)
    df.to_excel('G:\公告金额/170角色金额原模型预测错误数据_new3为新预测中标金额_predict0813.xlsx')