123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166 |
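'''
Created on 2019-01-04
@author: User

Scratch script that runs the BiddingKG extraction predictors on a single
document and prints the merged result as JSON.
'''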
- from bs4 import BeautifulSoup, Comment
- import copy
- import re
- import sys
- import os
- import codecs
- import requests
- import time
- _time1 = time.time()
- sys.path.append(os.path.abspath("../.."))
- import fool
- from BiddingKG.dl.interface.Connection import *
- from BiddingKG.dl.common.Utils import *
- from BiddingKG.dl.interface.Connection import getConnection
- import BiddingKG.dl.interface.predictor as predictor
- import BiddingKG.dl.interface.Preprocessing as Preprocessing
- import BiddingKG.dl.interface.getAttributes as getAttributes
- import BiddingKG.dl.entityLink.entityLink as entityLink
- # import BiddingKG.dl.complaint.punish_rule as punish_rule
- import BiddingKG.dl.complaint.punish_predictor as punish_rule
- import json
- '''
- doc_id = '3537ed94-9c7e-11e8-aed5-44a84246dbba'
- conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
- cursor.execute(" select content from articles where id='"+doc_id+"' ")
- row = cursor.fetchall()[0]
- #text = codecs.open("C:\\Users\\User\\Desktop\\1.html","r",encoding="utf8").read()
- #content = segment(tableToText(BeautifulSoup(row[0],"lxml")))
- '''
- ''''''
# Pipeline components. They are created once, when the module is loaded, and
# predict() reads them as module-level globals on every call.
codeNamePredict = predictor.CodeNamePredict()
premPredict = predictor.PREMPredict()
epcPredict = predictor.EPCPredict()
roleRulePredict = predictor.RoleRulePredictor()
timePredict = predictor.TimePredictor()

# Extractor from the complaint/punish package (imported above as punish_rule).
punish = punish_rule.Punish_Extract()

productPredict = predictor.ProductPredictor()
# Custom JSON encoder
class MyEncoder(json.JSONEncoder):
    """JSON encoder that additionally serializes NumPy values and ``bytes``.

    Pass it as ``cls=MyEncoder`` to ``json.dumps``.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of *obj*.

        Conversions:
            * ``np.ndarray``  -> nested Python list (``tolist()``)
            * ``bytes``       -> ``str`` decoded as UTF-8
            * NumPy floats    -> ``float``
            * NumPy integers  -> ``int``
            * ``str``         -> returned unchanged

        Anything else is delegated to ``json.JSONEncoder.default``, which
        raises ``TypeError``.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        # np.floating is the common base of float16/32/64. Using it avoids
        # naming np.float_, an alias that NumPy 2.0 removed (touching it there
        # raises AttributeError for every object that reaches this check).
        if isinstance(obj, np.floating):
            return float(obj)
        # NumPy integer scalars are not Python ints, so json rejects them
        # unless they are converted here.
        if isinstance(obj, np.integer):
            return int(obj)
        # json encodes str itself before ever calling default(); this branch
        # only matters when default() is invoked directly.
        if isinstance(obj, str):
            return obj
        return json.JSONEncoder.default(self, obj)
def predict(doc_id,text,title=""):
    """Run the extraction pipeline on one document and return the result as JSON text.

    The document is preprocessed, the punish extractor and then each
    module-level predictor are applied in a fixed order, and entity details
    are printed at three points along the way as a debugging aid.

    Args:
        doc_id: passed to ``Preprocessing.get_preprocessed`` as the first field
            of the single input row.
        text: passed as the second field of that row.
        title: passed as the fifth field of that row; defaults to ``""``.

    Returns:
        A JSON string (4-space indent, sorted keys, non-ASCII kept as-is) of
        the first element of the merged code/name, PREM and punish results.
    """
    def _describe(ent, *prefix):
        # One debug line per entity: optional marker, then text, type, label and offsets.
        print(*prefix, ent.entity_text, ent.entity_type, ent.label, ent.values,
              ent.sentence_index, ent.begin_index, ent.end_index,
              ent.wordOffset_begin, ent.wordOffset_end)

    def _unlabeled_orgs():
        # Entities of type org/company whose label is still None.
        for group in list_entitys:
            for ent in group:
                if ent.entity_type in ["org", "company"] and ent.label is None:
                    yield ent

    rows = [[doc_id, text, "", "", title]]
    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(
        rows, useselffool=True)
    for article in list_articles:
        print(article.content)

    # The punish extraction runs before any of the predictors below.
    list_punish_dic = punish.get_punish_extracts(list_articles, list_sentences, list_entitys)

    for ent in _unlabeled_orgs():
        _describe(ent, "%%%%")

    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
    productPredict.predict(list_sentences, list_entitys)
    premPredict.predict(list_sentences, list_entitys)

    for ent in _unlabeled_orgs():
        _describe(ent, "======")

    roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
    epcPredict.predict(list_sentences, list_entitys)
    timePredict.predict(list_sentences, list_entitys)
    entityLink.link_entitys(list_entitys)
    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)

    # Final dump of every entity, without a marker.
    for group in list_entitys:
        for ent in group:
            _describe(ent)

    merged = Preprocessing.union_result(
        Preprocessing.union_result(codeName, prem), list_punish_dic)
    return json.dumps(merged[0], cls=MyEncoder, sort_keys=True, indent=4,
                      ensure_ascii=False)
-
def test(name, content, host="http://192.168.2.101:15015", timeout=None):
    """POST one document to the ``/article_extract`` endpoint and return the reply text.

    Args:
        name: sent as the ``"id"`` field of the JSON payload.
        content: sent as the ``"content"`` field of the JSON payload.
        host: scheme, host and port of the extraction service. The default is
            the address that used to be hard-coded here.
        timeout: forwarded to ``requests.post``. ``None`` (the default) keeps
            the previous behaviour of waiting indefinitely; pass seconds to
            bound the wait.

    Returns:
        The response body decoded as UTF-8. It is also printed.
    """
    payload = {
        "content": content,
        "id": name,
    }
    headers = {'Content-Type': 'application/json'}
    _resp = requests.post(host + '/article_extract', json=payload, headers=headers,
                          verify=True, timeout=timeout)
    resp_json = _resp.content.decode("utf-8")
    print(resp_json)
    return resp_json
if __name__=="__main__":
    # Manual smoke run of predict().
    #
    # With a command-line argument, that argument is the path of a saved HTML
    # page; the <div id="pcontent"> element of the page is used as the input.
    # Without arguments the inline sample announcement below is used.
    #
    # The script previously always read C:\Users\User\Desktop\2.html, never
    # closed the handle, and then discarded the result by overwriting
    # `content`, so it failed on any machine without that file.
    if len(sys.argv) > 1:
        with codecs.open(sys.argv[1], "r", encoding="utf8") as html_file:
            page = html_file.read()
        # No parser is named, as in the original call, so bs4 chooses one.
        content = str(BeautifulSoup(page).find("div",id="pcontent"))
    else:
        content = '''
<div> <div> 莆田市城厢区农业农村局2020年化肥减量增效项目土样采集与野外调查工作技术服务公开询价结果公告 </div> <a rel="noreferrer" href=""> <div></div></a> <a rel="noreferrer" href=""></a> <a rel="noreferrer" href=""></a> <p> <font>莆田市城厢区</font><font>农业农村局</font><span>2020<font>年</font></span><font>化肥减量增效项目</font><font>土样采集与野外调查工作技术服务</font><font>公开询价</font><font>。</font><font>通过组织询价,至</font><span>2020年</span><span>11</span><font>月</font><span>20</span><font>日</font><span>18</span><font>点点</font><span>30</span><font>分整,共收到龙岩市岀入境综合检测技术服务中心、</font><span><font>福建旭东辰地地质勘查有限公司、福建省</font>121地质大队和福州恒成信息科技有限公司4家公司参加报价</span><font>文件</font><font>。</font><span>2020年</span><span>11</span><font>月</font><span>30</span><font>日上午</font><span>10点</span><font>,</font><font>举行</font><font>现场开标</font><font>会</font><font>,</font><font>经评审</font><span><font>只有福建省</font>121地质大队1家通过资格</span><font>性</font><font>审核</font><font>,其它</font><font>龙岩市岀入境综合检测技术服务中心、</font><span><font>福建旭东辰地地质勘查有限公司和福州恒成信息科技有限公司</font>3家没通过资格</span><font>性审核</font><font>。本次询价因</font><font>有效报价不足</font><span>3家</span><font>,</font><font>故流标</font><font>。</font></p> <p><font face="宋体">特此公告。</font></p> <p><font>公告期限</font><span>:1个工作日</span></p> <p><font>联系</font><span><font>电话:</font>0594-2686709 联系人:林伟雄</span></p> <p> </p> <p><font face="宋体">莆田市城厢区农业农村局</font></p> <p><font face="宋体">2020年11月30日 </font></p> <div> 附件下载: </div> <ul> </ul> </div>'''

    started = time.time()
    print("start")
    print(predict("12",content))
    print("takes",time.time()-started)
|