"""Ad-hoc driver that runs the BiddingKG extraction pipeline on one document.

Created on 2019-01-04

@author: User
"""
from bs4 import BeautifulSoup, Comment
import copy
import re
import sys
import os
import codecs
import requests
import time
import numpy as np

_time1 = time.time()
# The project packages are resolved relative to the working directory, so the
# path must be extended before any of them is imported.
sys.path.append(os.path.abspath("../.."))
import fool
from BiddingKG.dl.interface.Connection import *
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.interface.Connection import getConnection
import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing
import BiddingKG.dl.interface.getAttributes as getAttributes
import BiddingKG.dl.entityLink.entityLink as entityLink
import BiddingKG.dl.complaint.punish_rule as punish_rule
import json

# Predictors are created once at import time and shared by every predict() call.
codeNamePredict = predictor.CodeNamePredict()
premPredict = predictor.PREMPredict()
epcPredict = predictor.EPCPredict()
roleRulePredict = predictor.RoleRulePredictor()
timePredict = predictor.TimePredictor()
punish = punish_rule.Punish_Extract()

# Entity types whose text is stored under a result key of the same name
# (original comments: bidding method, funding source, service time).
_DIRECT_ENTITY_KEYS = ('bidway', 'moneySource', 'servicetime')

# (label, result key) pairs for entities of type 'time'
# (original comments: release time, bid-opening time, bid-closing time).
_TIME_LABEL_KEYS = (
    (1, 'time_release'),
    (2, 'time_bidopen'),
    (3, 'time_bidclose'),
)


class MyEncoder(json.JSONEncoder):
    """Custom JSON encoder that also accepts NumPy values and ``bytes``."""

    def default(self, obj):
        """Convert *obj* to a JSON-serialisable value.

        ``np.ndarray`` becomes a list, ``bytes`` is decoded as UTF-8 and NumPy
        scalar floats/integers/booleans become the matching Python builtin.
        Anything else is delegated to ``json.JSONEncoder.default``, which
        raises ``TypeError``.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        # np.floating covers float16/32/64; the np.float_ alias no longer
        # exists in NumPy 2.0, so it must not be referenced here.
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        return super().default(obj)


def _entity_result_key(entity):
    """Return the result key *entity* contributes to, or ``None`` if none."""
    entity_type = entity.entity_type
    if entity_type in _DIRECT_ENTITY_KEYS:
        return entity_type
    if entity_type == 'time':
        for label, key in _TIME_LABEL_KEYS:
            if entity.label == label:
                return key
    return None


def _collect_entity_texts(entities):
    """Group ``entity_text`` values of *entities* by result key."""
    collected = {key: [] for key in _DIRECT_ENTITY_KEYS}
    collected.update({key: [] for _, key in _TIME_LABEL_KEYS})
    for entity in entities:
        key = _entity_result_key(entity)
        if key is not None:
            collected[key].append(entity.entity_text)
    return collected


def predict(doc_id, text):
    """Run the extraction pipeline on one document and return JSON text.

    Args:
        doc_id: Identifier passed to the preprocessing step for the document.
        text: Document content to analyse.

    Returns:
        A JSON string (sorted keys, 4-space indent, non-ASCII kept as-is) built
        from the first result of ``Preprocessing.union_result``.
    """
    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(
        [[doc_id, text, "", "", ""]], useselffool=True)
    for article in list_articles:
        print(article.content)

    # NOTE(review): the progress labels below are printed exactly as in the
    # original and do not always name the stage that follows them.
    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
    print(codeName)
    premPredict.predict(list_sentences, list_entitys)
    roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
    print("epcPredict")
    epcPredict.predict(list_sentences, list_entitys)
    print("entityLink")
    timePredict.predict(list_sentences, list_entitys)
    print("timePredict")
    entityLink.link_entitys(list_entitys)
    print("getPREMs")
    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
    print("getPREMs")
    punish_dic = punish.get_punish_extracts(
        list_sentences, list_entitys, title='投诉处理 ', text=text)
    print(punish_dic)

    # Only one document is submitted, so index 0 is that document's result.
    attributes = prem[0][1]
    attributes['punish'] = punish_dic
    for key, texts in _collect_entity_texts(list_entitys[0]).items():
        # Deduplicate, then sort so the joined value is stable across runs
        # (plain set iteration order depends on string hash randomisation).
        attributes[key] = ';'.join(sorted(set(texts)))

    # Debug dump of every entity found in the document.
    for entitys in list_entitys:
        for entity in entitys:
            print(entity.entity_text, entity.entity_type, entity.label,
                  entity.values, entity.sentence_index, entity.begin_index,
                  entity.end_index, entity.wordOffset_begin,
                  entity.wordOffset_end)

    return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],
                      cls=MyEncoder, sort_keys=True, indent=4,
                      ensure_ascii=False)


def test(name, content, base_url="http://192.168.2.101:15015", timeout=300):
    """POST a document to the ``/article_extract`` service and return the reply.

    Args:
        name: Value sent as the ``id`` field of the request.
        content: Value sent as the ``content`` field of the request.
        base_url: Scheme, host and port of the extraction service.
        timeout: Seconds to wait for the service before ``requests`` raises a
            timeout error; pass ``None`` to wait indefinitely.

    Returns:
        The response body decoded as UTF-8 (it is also printed).
    """
    user = {
        "content": content,
        "id": name
    }
    myheaders = {'Content-Type': 'application/json'}
    _resp = requests.post(base_url + '/article_extract', json=user,
                          headers=myheaders, verify=True, timeout=timeout)
    resp_json = _resp.content.decode("utf-8")
    print(resp_json)
    return resp_json


if __name__ == "__main__":
    # Sample announcement text used for a manual smoke run of predict().
    text = (',候选人公示,项目信息,采购项目名称::立体库地标画线及通道围栏采购项目,采购项目编号::JJCG-20201'
            '2150002,采购人名称::一汽物流(长春陆顺)储运有限公司,采购人地址::长春市汽车厂飞跃路75号。采购项'
            '目名称::立体库地标画线及通道围栏采购项目,采购项目编号::JJCG-202012150002,采购人名称::一汽物流('
            '长春陆顺)储运有限公司,采购人地址::长春市汽车厂飞跃路75号。采购项目名称::立体库地标画线及通道围栏采购项'
            '目,采购项目编号::JJCG-202012150002,采购人名称::一')
    started = time.time()
    print("start")
    print(predict("投诉处理公告", text))
    print("takes", time.time() - started)