# extract.py
  1. '''
  2. Created on 2019年1月4日
  3. @author: User
  4. '''
  5. from bs4 import BeautifulSoup, Comment
  6. import copy
  7. import re
  8. import sys
  9. import os
  10. import codecs
  11. import requests
  12. import time
  13. _time1 = time.time()
  14. sys.path.append(os.path.abspath("../.."))
  15. from BiddingKG.dl.common.Utils import *
  16. import BiddingKG.dl.entityLink.entityLink as entityLink
  17. import BiddingKG.dl.interface.predictor as predictor
  18. import BiddingKG.dl.interface.Preprocessing as Preprocessing
  19. import BiddingKG.dl.interface.getAttributes as getAttributes
  20. import BiddingKG.dl.complaint.punish_predictor as punish_rule
  21. import json
  22. #自定义jsonEncoder
  23. class MyEncoder(json.JSONEncoder):
  24. def default(self, obj):
  25. if isinstance(obj, np.ndarray):
  26. return obj.tolist()
  27. elif isinstance(obj, bytes):
  28. return str(obj, encoding='utf-8')
  29. elif isinstance(obj, (np.float_, np.float16, np.float32,
  30. np.float64)):
  31. return float(obj)
  32. elif isinstance(obj,str):
  33. return obj
  34. return json.JSONEncoder.default(self, obj)
  35. def predict(doc_id,text,title=""):
  36. cost_time = dict()
  37. start_time = time.time()
  38. log("start process doc %s"%(str(doc_id)))
  39. list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title]],useselffool=True)
  40. log("get preprocessed done of doc_id%s"%(doc_id))
  41. cost_time["preprocess"] = time.time()-start_time
  42. cost_time.update(_cost_time)
  43. start_time = time.time()
  44. codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys)
  45. log("get codename done of doc_id%s"%(doc_id))
  46. cost_time["codename"] = time.time()-start_time
  47. start_time = time.time()
  48. predictor.getPredictor("prem").predict(list_sentences,list_entitys)
  49. log("get prem done of doc_id%s"%(doc_id))
  50. cost_time["prem"] = time.time()-start_time
  51. start_time = time.time()
  52. predictor.getPredictor("product").predict(list_sentences,list_entitys)
  53. log("get product done of doc_id%s"%(doc_id))
  54. cost_time["product"] = time.time()-start_time
  55. start_time = time.time()
  56. predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName)
  57. cost_time["rule"] = time.time()-start_time
  58. start_time = time.time()
  59. predictor.getPredictor("epc").predict(list_sentences,list_entitys)
  60. log("get epc done of doc_id%s"%(doc_id))
  61. cost_time["person"] = time.time()-start_time
  62. start_time = time.time()
  63. predictor.getPredictor("time").predict(list_sentences, list_entitys)
  64. log("get time done of doc_id%s"%(doc_id))
  65. cost_time["time"] = time.time()-start_time
  66. start_time = time.time()
  67. entityLink.link_entitys(list_entitys)
  68. prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
  69. log("get attributes done of doc_id%s"%(doc_id))
  70. cost_time["attrs"] = time.time()-start_time
  71. start_time = time.time()
  72. list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
  73. cost_time["punish"] = time.time()-start_time
  74. #print(prem)
  75. data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
  76. data_res["cost_time"] = cost_time
  77. data_res["success"] = True
  78. for _article in list_articles:
  79. log(_article.content)
  80. for list_entity in list_entitys:
  81. for _entity in list_entity:
  82. log("type:%s,text:%s,label:%s,values:%s,sentence:%s,begin_index:%s,end_index:%s"%
  83. (str(_entity.entity_type),str(_entity.entity_text),str(_entity.label),str(_entity.values),str(_entity.sentence_index),
  84. str(_entity.begin_index),str(_entity.end_index)))
  85. return json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
  86. def test(name,content):
  87. user = {
  88. "content": content,
  89. "id":name
  90. }
  91. myheaders = {'Content-Type': 'application/json'}
  92. _resp = requests.post("http://192.168.2.101:15015" + '/article_extract', json=user, headers=myheaders, verify=True)
  93. resp_json = _resp.content.decode("utf-8")
  94. print(resp_json)
  95. return resp_json
  96. if __name__=="__main__":
  97. pass