测试所有提取信息.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304
  1. #!/usr/bin/python3
  2. # -*- coding: utf-8 -*-
  3. # @Author : bidikeji
  4. # @Time : 2021/1/11 0011 13:52
  5. '''
  6. Created on 2019年1月4日
  7. @author: User
  8. '''
  9. from bs4 import BeautifulSoup, Comment
  10. import copy
  11. import re
  12. import sys
  13. import os
  14. import codecs
  15. import requests
  16. import time
  17. _time1 = time.time()
  18. sys.path.append(os.path.abspath("../.."))
  19. import fool
  20. from BiddingKG.dl.interface.Connection import *
  21. from BiddingKG.dl.common.Utils import *
  22. from BiddingKG.dl.interface.Connection import getConnection
  23. import BiddingKG.dl.interface.predictor as predictor
  24. import BiddingKG.dl.interface.Preprocessing as Preprocessing
  25. import BiddingKG.dl.interface.getAttributes as getAttributes
  26. import BiddingKG.dl.entityLink.entityLink as entityLink
  27. import BiddingKG.dl.complaint.punish_predictor as punish_predictor
  28. # import BiddingKG.dl.complaint.punish_rule as punish_predictor
  29. import BiddingKG.dl.channel.channel_predictor as channel_predictor
  30. import json
  31. '''
  32. doc_id = '3537ed94-9c7e-11e8-aed5-44a84246dbba'
  33. conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
  34. cursor = conn.cursor()
  35. cursor.execute(" select content from articles where id='"+doc_id+"' ")
  36. row = cursor.fetchall()[0]
  37. #text = codecs.open("C:\\Users\\User\\Desktop\\1.html","r",encoding="utf8").read()
  38. #content = segment(tableToText(BeautifulSoup(row[0],"lxml")))
  39. '''
  40. ''''''
# Shared predictor instances, created once at import time and reused by the
# functions in this script.
codeNamePredict = predictor.CodeNamePredict()  # project code / project name
premPredict = predictor.PREMPredict()  # role and money model
epcPredict = predictor.EPCPredict()  # contact-person model
# NOTE(review): the role-rule predictor is disabled here, yet predict_back()
# still expects one named roleRulePredict.
# roleRulePredict = predictor.RoleRulePredictor()
timePredict = predictor.TimePredictor()  # time-category model
# punish = punish_rule.Punish_Extract()
punish = punish_predictor.Punish_Extract()
productPredict = predictor.ProductPredictor()
channelPredict = channel_predictor.DocChannel()
  50. # 自定义jsonEncoder
  51. class MyEncoder(json.JSONEncoder):
  52. def default(self, obj):
  53. if isinstance(obj, np.ndarray):
  54. return obj.tolist()
  55. elif isinstance(obj, bytes):
  56. return str(obj, encoding='utf-8')
  57. elif isinstance(obj, (np.float_, np.float16, np.float32,
  58. np.float64)):
  59. return float(obj)
  60. elif isinstance(obj, str):
  61. return obj
  62. return json.JSONEncoder.default(self, obj)
  63. def predict(doc_id, text, title=""):
  64. list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", title]],
  65. useselffool=True)
  66. for articles in list_articles:
  67. print(articles.content)
  68. ''''''
  69. codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
  70. print(codeName)
  71. premPredict.predict(list_sentences, list_entitys)
  72. # roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
  73. print("epcPredict")
  74. epcPredict.predict(list_sentences, list_entitys)
  75. print("entityLink")
  76. timePredict.predict(list_sentences, list_entitys)
  77. print("timePredict")
  78. entityLink.link_entitys(list_entitys)
  79. print("getPREMs")
  80. prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
  81. print("getPREMs")
  82. list_punish_dic = punish.get_punish_extracts(list_articles, list_sentences, list_entitys)
  83. product = productPredict.predict(list_sentences,list_entitys)
  84. channel = channelPredict.predict(title, list_sentences[0])
  85. total_tendereeMoney_list = []
  86. for entity in list_entitys[0]:
  87. if entity.notes == '总投资':
  88. total_tendereeMoney_list.append(entity.entity_text)
  89. total_tendereeMoney = max([total_tendereeMoney_list]) if len(total_tendereeMoney_list)>=1 else 0
  90. for entitys in list_entitys:
  91. for entity in entitys:
  92. print(entity.entity_text, entity.entity_type, entity.label, entity.values, entity.sentence_index,
  93. entity.begin_index, entity.end_index, entity.wordOffset_begin, entity.wordOffset_end,entity.sentence_index)
  94. # print(prem)
  95. # return json.dumps(Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic),product)[0],
  96. # cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False) # list_punish_dic
  97. # return json.dumps(Preprocessing.union_result(
  98. # Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic),product), channel)[0],
  99. # cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False) # list_punish_dic
  100. return json.dumps(Preprocessing.union_result(Preprocessing.union_result(
  101. Preprocessing.union_result(
  102. Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic), product), [{'total_tendereeMoney':total_tendereeMoney}]
  103. ),
  104. channel),
  105. cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False) # list_punish_dic
  106. def predict_back(doc_id, html):
  107. list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, html, "", "", ""]],
  108. useselffool=True)
  109. for articles in list_articles:
  110. print(articles.content)
  111. ''''''
  112. codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys) #预测项目编号,名称
  113. print(codeName)
  114. premPredict.predict(list_sentences, list_entitys) # 角色金额模型
  115. roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName) # 角色规则
  116. print("epcPredict")
  117. epcPredict.predict(list_sentences, list_entitys) # 联系人模型
  118. print("entityLink")
  119. timePredict.predict(list_sentences, list_entitys) # 时间类别模型
  120. print("timePredict")
  121. entityLink.link_entitys(list_entitys) #
  122. print("getPREMs")
  123. prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles) # 找包,并包号与其他要素连接起来
  124. print("getPREMs")
  125. # punish_dic = punish.get_punish_extracts(list_sentences, list_entitys, title=title, text=list_articles[0].content)
  126. list_punish_dic = punish.get_punish_extracts(list_articles, list_sentences, list_entitys)
  127. # punish_dic = punish.get_punish_extracts(list_articles,list_sentences, list_entitys)
  128. # print(punish_dic)
  129. # prem[0][1]['punish'] = punish_dic
  130. # bidway = [] # 招标方式
  131. # moneySource = [] # 资金来源
  132. # servicetime = [] # 服务时间
  133. # time_release = [] # 发布时间
  134. # time_bidopen = [] # 开标时间
  135. # time_bidclose = [] # 截标时间
  136. # for entity in list_entitys[0]:
  137. # if entity.entity_type == 'bidway':
  138. # bidway.append(entity.entity_text)
  139. # elif entity.entity_type == 'moneySource':
  140. # moneySource.append(entity.entity_text)
  141. # elif entity.entity_type == 'servicetime':
  142. # servicetime.append(entity.entity_text)
  143. # elif entity.entity_type == 'time' and entity.label == 1:
  144. # time_release.append(entity.entity_text)
  145. # elif entity.entity_type == 'time' and entity.label == 2:
  146. # time_bidopen.append(entity.entity_text)
  147. # elif entity.entity_type == 'time' and entity.label == 3:
  148. # time_bidclose.append(entity.entity_text)
  149. #
  150. # prem[0][1]['bidway'] = ';'.join(set(bidway))
  151. # prem[0][1]['moneySource'] = ';'.join(set(moneySource))
  152. # prem[0][1]['servicetime'] = ';'.join(set(servicetime))
  153. # prem[0][1]['time_release'] = ';'.join(set(time_release))
  154. # prem[0][1]['time_bidopen'] = ';'.join(set(time_bidopen))
  155. # prem[0][1]['time_bidclose'] = ';'.join(set(time_bidclose))
  156. #
  157. # ''''''
  158. #
  159. # for entitys in list_entitys:
  160. # for entity in entitys:
  161. # print(entity.entity_text, entity.entity_type, entity.label, entity.values, entity.sentence_index,
  162. # entity.begin_index, entity.end_index, entity.wordOffset_begin, entity.wordOffset_end)
  163. #
  164. # print(prem)
  165. return json.dumps(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic)[0],
  166. cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)
  167. # return json.dumps(Preprocessing.union_result(codeName, prem)[0][1], cls=MyEncoder, sort_keys=True, indent=4,
  168. # ensure_ascii=False)
  169. def test(name, content):
  170. user = {
  171. "content": content,
  172. "id": name
  173. }
  174. myheaders = {'Content-Type': 'application/json'}
  175. _resp = requests.post("http://192.168.2.101:15015" + '/article_extract', json=user, headers=myheaders, verify=True)
  176. resp_json = _resp.content.decode("utf-8")
  177. print(resp_json)
  178. return resp_json
  179. if __name__ == "__main__":
  180. # from tablestore import *
  181. # endpoint = 'https://bxkc-ots.cn-hangzhou.ots.aliyuncs.com'
  182. # access_key_id = 'LTAI4GJxbioV1y2WM3XcZTmP'
  183. # access_key_secret = 'B3VITMoCnKtTQE6eAkDkat6UNFHped'
  184. # instance_name = 'bxkc-ots'
  185. # ots_client = OTSClient(endpoint, access_key_id, access_key_secret, instance_name)
  186. #
  187. # def get_data(query, max_rows, table_name='document',
  188. # index_name='document_index',
  189. # column_names=['docid', 'dochtmlcon','doctitle', 'info_type', 'page_time'],
  190. # sorters=[FieldSort("page_time", SortOrder.DESC), FieldSort("docid", SortOrder.DESC)]):
  191. # '''
  192. # 从阿里云ots查询数据
  193. # :param query: 查询命令
  194. # :param max_rows: 最大返回多少数据
  195. # :param table_name: 表名
  196. # :param index_name: 表索引名
  197. # :param column_names: 返回字段名
  198. # :param sorters: 排序规则列表
  199. # :return: 处理后的数据列表
  200. # '''
  201. # next_token = None
  202. # data = []
  203. # all_rows = []
  204. # rows, next_token, total_count, is_all_succeed = \
  205. # ots_client.search(table_name,
  206. # index_name,
  207. # SearchQuery(query,
  208. # next_token=next_token,
  209. # sort=Sort(sorters=sorters), # ASC升序
  210. # limit=100,
  211. # get_total_count=True),
  212. # ColumnsToGet(column_names=column_names,
  213. # return_type=ColumnReturnType.SPECIFIED))
  214. # all_rows.extend(rows)
  215. # while next_token:
  216. # rows, next_token, total_count, is_all_succeed = \
  217. # ots_client.search(table_name,
  218. # index_name,
  219. # SearchQuery(query,
  220. # next_token=next_token,
  221. # sort=None,
  222. # limit=100,
  223. # get_total_count=True),
  224. # ColumnsToGet(column_names=column_names,
  225. # return_type=ColumnReturnType.SPECIFIED))
  226. # all_rows.extend(rows)
  227. # if len(all_rows) > max_rows:
  228. # print('已获取%d条数据' % len(all_rows))
  229. # break
  230. #
  231. # if all_rows:
  232. # for row in all_rows:
  233. # tmp = []
  234. # tmp.append(row[0][1][1])
  235. # for tup in row[1]:
  236. # tmp.append(tup[1])
  237. # data.append(tmp)
  238. # return data
  239. #
  240. #
  241. # bool_query = TermQuery('docid','124113339')
  242. # # bool_query = BoolQuery(
  243. # # must_queries=[TermsQuery(field_name='info_type', column_values=['办公设备', '计算机设备']),
  244. # # RangeQuery('page_time', range_from='2020-11-01', range_to='2020-11-31')]
  245. # # )
  246. #
  247. # data = get_data(bool_query, 1)
  248. # print(data)
  249. # docid = str(data[0][0])
  250. # html = data[0][1]
  251. # title = data[0][2]
  252. # text = '中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天。\
  253. # 投诉处理公告,投诉人:张三。文章编号:京财采投字(2018)第42号。政府采购项目招标方式:公开招标,联系人:黎明。\
  254. # 建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,\
  255. # 二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
  256. docid = ""
  257. # title = '招标公告'
  258. # html = '招标人:广州市人民医院。代理人:广州医疗代理服务公司。招标金额:3000元,总投资:5万元。中标人:比地科技有限公司,中标金额:1万元。'
  259. html = """, [ 正在 公告 ] 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) , 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) 采购 结果 公告 , 项目 名称 , 公司 2020 - 2021 年度 打印 制作 服务 项目 编号 , 20200803030110070001 采购 组织 人 , 中 节能 建筑 节能 有限公司 河南 分公司 采购 方式 , 谈判 采购 成交 信息 , 序号 , 标段 ( 包 ) 编号 , 标段 ( 包 ) 名称 , 成交 供应商 , 成交 金额 20200803030110070001001 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) 郑州市 上街区 永达 文印部 null 元 公告 起 止 时间 2021年 04月 14日 - 2021年 04月 17日 ,
  260. """
  261. title = """[ 正在 公告 ] 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) ,
  262. """
  263. html = html.replace(' ', '')
  264. title = title.replace(' ', '')
  265. # html = '首都医科大学附属北京地坛医院1.5T核磁共振、16排CT和血管造影机维保服务医疗设备维修和保养服务采购项目政府采购中标候选人公示,中标人:广州比地科技有限公司,中标金额:6000万元'
  266. # html = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
  267. a = time.time()
  268. print("start")
  269. # print(predict('12',text))
  270. print(predict(docid, html,title=""))
  271. # test("12",text)
  272. print("takes", time.time() - a)
  273. pass