# test4.py — end-to-end demo driver for the BiddingKG extraction pipeline
  1. '''
  2. Created on 2019年1月4日
  3. @author: User
  4. '''
  5. from bs4 import BeautifulSoup, Comment
  6. import copy
  7. import re
  8. import sys
  9. import os
  10. import codecs
  11. import requests
  12. import time
  13. _time1 = time.time()
  14. sys.path.append(os.path.abspath("../.."))
  15. import fool
  16. from BiddingKG.dl.interface.Connection import *
  17. from BiddingKG.dl.common.Utils import *
  18. from BiddingKG.dl.interface.Connection import getConnection
  19. import BiddingKG.dl.interface.predictor as predictor
  20. import BiddingKG.dl.interface.Preprocessing as Preprocessing
  21. import BiddingKG.dl.interface.getAttributes as getAttributes
  22. import BiddingKG.dl.entityLink.entityLink as entityLink
  23. import BiddingKG.dl.complaint.punish_rule as punish_rule
  24. import json
'''
doc_id = '3537ed94-9c7e-11e8-aed5-44a84246dbba'
conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
cursor = conn.cursor()
cursor.execute(" select content from articles where id='"+doc_id+"' ")
row = cursor.fetchall()[0]
#text = codecs.open("C:\\Users\\User\\Desktop\\1.html","r",encoding="utf8").read()
#content = segment(tableToText(BeautifulSoup(row[0],"lxml")))
'''
''''''
# Instantiate every predictor once at module import; predict() reuses them.
# NOTE(review): roles inferred from class names — confirm against the
# predictor module's own documentation.
codeNamePredict = predictor.CodeNamePredict()    # project code / project name extraction
premPredict = predictor.PREMPredict()            # presumably price/role/entity model
epcPredict = predictor.EPCPredict()              # presumably EPC (contact/person) model
roleRulePredict = predictor.RoleRulePredictor()  # rule-based role post-correction
timePredict = predictor.TimePredictor()          # classifies 'time' entities into labels
punish = punish_rule.Punish_Extract()            # complaint/punishment extraction rules
  41. #自定义jsonEncoder
  42. class MyEncoder(json.JSONEncoder):
  43. def default(self, obj):
  44. if isinstance(obj, np.ndarray):
  45. return obj.tolist()
  46. elif isinstance(obj, bytes):
  47. return str(obj, encoding='utf-8')
  48. elif isinstance(obj, (np.float_, np.float16, np.float32,
  49. np.float64)):
  50. return float(obj)
  51. elif isinstance(obj,str):
  52. return obj
  53. return json.JSONEncoder.default(self, obj)
def predict(doc_id, text):
    """Run the full extraction pipeline on one article and return the result as JSON.

    Pipeline order matters: preprocessing -> code/name -> PREM -> rule roles
    -> EPC -> time labels -> entity linking -> PREM aggregation -> punish
    extraction. Most steps mutate list_entitys / prem in place.

    Args:
        doc_id: article identifier handed to the preprocessor.
        text: raw article content (plain text or html).

    Returns:
        Pretty-printed JSON string (ensure_ascii=False) merging code/name,
        prem roles, punish info, bidway, moneySource, servicetime and the
        three time fields.
    """
    list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[doc_id,text,"","",""]],useselffool=True)
    for articles in list_articles:
        print(articles.content)
    ''''''
    codeName = codeNamePredict.predict(list_sentences,list_entitys=list_entitys)
    print(codeName)
    premPredict.predict(list_sentences,list_entitys)
    roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
    print("epcPredict")
    epcPredict.predict(list_sentences,list_entitys)
    print("entityLink")
    timePredict.predict(list_sentences, list_entitys)
    print("timePredict")
    entityLink.link_entitys(list_entitys)
    print("getPREMs")
    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
    print("getPREMs")
    # Complaint/punishment extraction; the hard-coded title matches the demo
    # article used in __main__ (trailing space is deliberate — do not strip).
    punish_dic = punish.get_punish_extracts(list_sentences, list_entitys, title='投诉处理 ', text=text)
    print(punish_dic)
    prem[0][1]['punish'] = punish_dic
    # Bidding method (招标方式)
    bidway = [entity.entity_text for entity in list_entitys[0] if entity.entity_type=='bidway']
    # Source of funds (资金来源)
    moneySource = [entity.entity_text for entity in list_entitys[0] if entity.entity_type=='moneySource']
    # Service period (服务时间)
    servicetime = [entity.entity_text for entity in list_entitys[0] if entity.entity_type=='servicetime']
    # Release time (发布时间): time entity with label 1
    time_release = [entity.entity_text for entity in list_entitys[0] if entity.entity_type == 'time' and entity.label==1]
    # Bid-opening time (开标时间): time entity with label 2
    time_bidopen = [entity.entity_text for entity in list_entitys[0] if entity.entity_type == 'time' and entity.label==2]
    # Bid-closing time (截标时间): time entity with label 3
    time_bidclose = [entity.entity_text for entity in list_entitys[0] if entity.entity_type == 'time' and entity.label == 3]
    # set() deduplicates, so ordering of the joined values is not stable.
    prem[0][1]['bidway'] = ';'.join(set(bidway))
    prem[0][1]['moneySource'] = ';'.join(set(moneySource))
    prem[0][1]['servicetime'] = ';'.join(set(servicetime))
    prem[0][1]['time_release'] = ';'.join(set(time_release))
    prem[0][1]['time_bidopen'] = ';'.join(set(time_bidopen))
    prem[0][1]['time_bidclose'] = ';'.join(set(time_bidclose))
    ''''''
    # Debug dump of every entity after all predictors have run.
    for entitys in list_entitys:
        for entity in entitys:
            print(entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.begin_index,entity.end_index,entity.wordOffset_begin,entity.wordOffset_end)
    #print(prem)
    return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
  99. def test(name,content):
  100. user = {
  101. "content": content,
  102. "id":name
  103. }
  104. myheaders = {'Content-Type': 'application/json'}
  105. _resp = requests.post("http://192.168.2.101:15015" + '/article_extract', json=user, headers=myheaders, verify=True)
  106. resp_json = _resp.content.decode("utf-8")
  107. print(resp_json)
  108. return resp_json
if __name__=="__main__":
    # --- earlier experiments kept for reference (html/csv fixtures) ---
    # filename = "比地_52_79929693.html"
    # #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
    # text = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
    # content = str(BeautifulSoup(text).find("div",id="pcontent"))
    # df_a = {"html":[]}
    # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
    # import pandas as pd
    # df = pd.DataFrame(df_a)
    # df.to_csv("C:\\Users\\User\\Desktop\\ba.csv")
    # print()
    #text = codecs.open("C:\\Users\\User\\Desktop\\a.html","r",encoding="utf8").read()
    # text = "张家港保税区宏宇建设咨询有限公司受张家港市给排水公司委托,就二次供水泵房浊度仪进行国内组织公开招标采购,欢迎符合条件的供应商参加投标。"
    # text = 'a建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,as,建设资金来自呜呜呜。'
    # text = '张家港保税区宏宇建设咨询有限公司受张家港市给排水公司委托,就二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,延时规则:在剩余数量小于最小购买数量时,竞价进'
    # text = '''大庆禾工煤炭分质清洁利用项目-临时用电二期工程设备、物资采购中标候选人公示,更多咨询报价请点击:http://bulletin.cebpubservice.com/candidateBulletin/2020-03-31/2678597.html,大庆禾工煤炭分质清洁利用顶目-临时用电二期工程设备、物资釆购中标候选人,(招标编号:XYwZ-20200309-5),公示结束时间:2020年04月03日,、评标情况,标段(包)[001大庆禾工煤嶽分质清洁利用项目-临时用屯二期工程设备、物资采购,中标候选人基本情况,
    # 中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天,中标候选人第2名:
    # 哈尔滨昊龙电气没备制造有限公司,投标报价:19.87万元,质,量:合格,工期/交货期/服务期:30天,'''
    # Demo article mixing bid-candidate, complaint, funding-source and auction
    # fragments so several predictors have something to extract.
    # NOTE(review): the backslash continuations splice the lines into ONE
    # string literal; any leading whitespace on the continuation lines would
    # become part of the text — they are kept at column 0 on purpose.
    text = '中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天。\
投诉处理公告,投诉人:张三。文章编号:京财采投字(2018)第42号。政府采购项目招标方式:公开招标,联系人:黎明。\
建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,\
二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
    a = time.time()
    print("start")
    # print(predict("12",content))
    print(predict("投诉处理公告", text))
    #test("12",text)
    # Wall-clock time of one full pipeline run.
    print("takes",time.time()-a)
    pass
  137. pass