test4.py

'''
Created on 2019-01-04
@author: User
'''
from bs4 import BeautifulSoup, Comment
import copy
import re
import sys
import os
import codecs
import requests
import time
import json
import numpy as np  # used by MyEncoder below

_time1 = time.time()
sys.path.append(os.path.abspath("../.."))

import fool
from BiddingKG.dl.interface.Connection import *
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.interface.Connection import getConnection
import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing
import BiddingKG.dl.interface.getAttributes as getAttributes
import BiddingKG.dl.entityLink.entityLink as entityLink
'''
doc_id = '3537ed94-9c7e-11e8-aed5-44a84246dbba'
conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
cursor = conn.cursor()
cursor.execute(" select content from articles where id='"+doc_id+"' ")
row = cursor.fetchall()[0]
#text = codecs.open("C:\\Users\\User\\Desktop\\1.html","r",encoding="utf8").read()
#content = segment(tableToText(BeautifulSoup(row[0],"lxml")))
'''
# Instantiate the predictors once at module load time
codeNamePredict = predictor.CodeNamePredict()
premPredict = predictor.PREMPredict()
epcPredict = predictor.EPCPredict()
roleRulePredict = predictor.RoleRulePredictor()

# Custom JSON encoder: converts numpy and bytes objects into JSON-serializable types
class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
            return float(obj)
        elif isinstance(obj, str):
            return obj
        return json.JSONEncoder.default(self, obj)
def predict(doc_id, text):
    # Preprocess the raw text into articles, sentences and entities
    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", ""]], useselffool=True)
    for articles in list_articles:
        print(articles.content)

    # Run the individual predictors over the preprocessed data
    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
    print(codeName)
    premPredict.predict(list_sentences, list_entitys)
    roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
    print("epcPredict")
    epcPredict.predict(list_sentences, list_entitys)
    print("entityLink")
    entityLink.link_entitys(list_entitys)
    print("getPREMs")
    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
    print("getPREMs")

    for entitys in list_entitys:
        for entity in entitys:
            print(entity.entity_text, entity.entity_type, entity.label, entity.values, entity.sentence_index, entity.wordOffset_begin, entity.wordOffset_end)

    #print(prem)
    return json.dumps(Preprocessing.union_result(codeName, prem)[0][1], cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)
def test(name, content):
    # Post the article to the remote extraction service and return the raw response
    user = {
        "content": content,
        "id": name
    }
    myheaders = {'Content-Type': 'application/json'}
    _resp = requests.post("http://192.168.2.101:15015" + '/article_extract', json=user, headers=myheaders, verify=True)
    resp_json = _resp.content.decode("utf-8")
    print(resp_json)
    return resp_json
if __name__ == "__main__":
    filename = "比地_52_79929693.html"
    #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
    text = codecs.open("C:\\Users\\User\\Desktop\\2.html", "r", encoding="utf8").read()
    # Extract the <div id="pcontent"> block that holds the announcement body
    content = str(BeautifulSoup(text, "lxml").find("div", id="pcontent"))
    # df_a = {"html":[]}
    # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
    # import pandas as pd
    # df = pd.DataFrame(df_a)
    # df.to_csv("C:\\Users\\User\\Desktop\\ba.csv")
    # print()
    #text = codecs.open("C:\\Users\\User\\Desktop\\a.html","r",encoding="utf8").read()
    #text = "张家港保税区宏宇建设咨询有限公司受张家港市给排水公司委托,就二次供水泵房浊度仪进行国内组织公开招标采购,欢迎符合条件的供应商参加投标。"
    a = time.time()
    print("start")
    print(predict("12", content))
    #test("12", text)
    print("takes", time.time() - a)
    pass