123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129 |
- '''
- Created on 2019年1月4日
- @author: User
- '''
- from bs4 import BeautifulSoup, Comment
- import copy
- import re
- import sys
- import os
- import codecs
- import requests
- import time
- _time1 = time.time()
- sys.path.append(os.path.abspath("../.."))
- import fool
- from BiddingKG.dl.interface.Connection import *
- from BiddingKG.dl.common.Utils import *
- from BiddingKG.dl.interface.Connection import getConnection
- import BiddingKG.dl.interface.predictor as predictor
- import BiddingKG.dl.interface.Preprocessing as Preprocessing
- import BiddingKG.dl.interface.getAttributes as getAttributes
- import BiddingKG.dl.entityLink.entityLink as entityLink
- import json
- '''
- doc_id = '3537ed94-9c7e-11e8-aed5-44a84246dbba'
- conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
- cursor.execute(" select content from articles where id='"+doc_id+"' ")
- row = cursor.fetchall()[0]
- #text = codecs.open("C:\\Users\\User\\Desktop\\1.html","r",encoding="utf8").read()
- #content = segment(tableToText(BeautifulSoup(row[0],"lxml")))
- '''
- ''''''
# Predictor instances are created once at module level; predict() reuses
# these same objects on every call instead of constructing new ones.
# NOTE(review): presumably each constructor loads model weights, which would
# make importing this module slow — confirm against BiddingKG.dl.interface.predictor.
codeNamePredict = predictor.CodeNamePredict()
premPredict = predictor.PREMPredict()
epcPredict = predictor.EPCPredict()
roleRulePredict = predictor.RoleRulePredictor()
# Custom JSON encoder
class MyEncoder(json.JSONEncoder):
    """JSON encoder that additionally serializes NumPy values and ``bytes``.

    Pass it as ``cls=MyEncoder`` to ``json.dumps``. The standard encoder only
    calls :meth:`default` for objects it cannot serialize itself.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of *obj*.

        Conversions:
            * ``np.ndarray`` -> nested Python list (``tolist()``)
            * ``bytes``      -> ``str`` decoded as UTF-8
            * NumPy floats   -> ``float``
            * NumPy integers -> ``int``
            * ``str``        -> returned unchanged

        Raises:
            TypeError: for any other type (raised by the base class).
            UnicodeDecodeError: if a ``bytes`` value is not valid UTF-8.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, bytes):
            return obj.decode("utf-8")
        # np.floating is the abstract parent of every NumPy float width.
        # The np.float_ alias used here before was removed in NumPy 2.0 and
        # made this branch raise AttributeError on current NumPy releases.
        if isinstance(obj, np.floating):
            return float(obj)
        # NumPy integer scalars are not subclasses of int, so the base
        # encoder rejects them; convert them explicitly.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, str):
            return obj
        return super().default(obj)
def predict(doc_id, text):
    """Run the extraction pipeline on one document and return JSON text.

    The document is preprocessed, then passed in order through the
    module-level predictors (code/name, PREM, role rules, EPC), entity
    linking and ``getAttributes.getPREMs``. Progress markers and every
    entity are printed to stdout along the way.

    Args:
        doc_id: Identifier placed in the first slot of the preprocessing row.
        text: Document content placed in the second slot of that row.

    Returns:
        ``str``: element ``[0][1]`` of ``Preprocessing.union_result(...)``,
        dumped with ``MyEncoder`` (sorted keys, 4-space indent, non-ASCII
        characters kept as-is).
    """
    articles_batch, sentences_batch, entities_batch, _ = Preprocessing.get_preprocessed(
        [[doc_id, text, "", "", ""]],
        useselffool=True,
    )
    for article in articles_batch:
        print(article.content)

    code_names = codeNamePredict.predict(sentences_batch, list_entitys=entities_batch)
    print(code_names)

    # The return values of the next stages are not used here.
    # NOTE(review): presumably they annotate the entity objects in place,
    # so the call order must be kept — confirm in predictor.py.
    premPredict.predict(sentences_batch, entities_batch)
    roleRulePredict.predict(articles_batch, sentences_batch, entities_batch, code_names)
    print("epcPredict")
    epcPredict.predict(sentences_batch, entities_batch)
    print("entityLink")
    entityLink.link_entitys(entities_batch)
    print("getPREMs")
    prems = getAttributes.getPREMs(sentences_batch, entities_batch, articles_batch)
    print("getPREMs")

    # Dump every entity of every document for manual inspection.
    for item in (entity for group in entities_batch for entity in group):
        print(
            item.entity_text,
            item.entity_type,
            item.label,
            item.values,
            item.sentence_index,
            item.wordOffset_begin,
            item.wordOffset_end,
        )

    merged = Preprocessing.union_result(code_names, prems)
    first_payload = merged[0][1]
    return json.dumps(
        first_payload,
        cls=MyEncoder,
        sort_keys=True,
        indent=4,
        ensure_ascii=False,
    )
def test(name, content, base_url="http://192.168.2.101:15015", timeout=300):
    """POST one document to the ``/article_extract`` endpoint and return the reply.

    Args:
        name: Value sent as the ``"id"`` field of the JSON payload.
        content: Value sent as the ``"content"`` field of the JSON payload.
        base_url: Scheme, host and port of the service; ``/article_extract``
            is appended. Defaults to the address that was previously
            hard-coded.
        timeout: Seconds to wait for the server, passed to ``requests.post``.
            Without a timeout the call could block forever on an
            unresponsive server. Pass ``None`` to restore that behavior.

    Returns:
        ``str``: the response body decoded as UTF-8 (also printed to stdout).

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    payload = {
        "content": content,
        "id": name,
    }
    headers = {'Content-Type': 'application/json'}
    response = requests.post(
        base_url + '/article_extract',
        json=payload,
        headers=headers,
        verify=True,
        timeout=timeout,
    )
    resp_json = response.content.decode("utf-8")
    print(resp_json)
    return resp_json
if __name__ == "__main__":
    # Manual smoke test: extract from one local HTML page and time the run.
    # The page path may be given as the first command-line argument;
    # otherwise the original default location is used.
    html_path = sys.argv[1] if len(sys.argv) > 1 else "C:\\Users\\User\\Desktop\\2.html"

    # Context manager so the file handle is closed even if reading fails.
    with codecs.open(html_path, "r", encoding="utf8") as html_file:
        text = html_file.read()

    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one
    # is installed and emits a warning; consider pinning one explicitly.
    body = BeautifulSoup(text).find("div", id="pcontent")
    if body is None:
        # Previously str(None) == "None" was silently fed into the pipeline.
        sys.exit('no <div id="pcontent"> element found in ' + html_path)
    content = str(body)

    a = time.time()
    print("start")
    print(predict("12", content))
    print("takes", time.time() - a)
|