# test4.py — end-to-end demo driver for the BiddingKG extraction pipeline
  1. '''
  2. Created on 2019年1月4日
  3. @author: User
  4. '''
  5. from bs4 import BeautifulSoup, Comment
  6. import copy
  7. import re
  8. import sys
  9. import os
  10. import codecs
  11. import requests
  12. import time
  13. _time1 = time.time()
  14. sys.path.append(os.path.abspath("../.."))
  15. import fool
  16. from BiddingKG.dl.interface.Connection import *
  17. from BiddingKG.dl.common.Utils import *
  18. from BiddingKG.dl.interface.Connection import getConnection
  19. import BiddingKG.dl.interface.predictor as predictor
  20. import BiddingKG.dl.interface.Preprocessing as Preprocessing
  21. import BiddingKG.dl.interface.getAttributes as getAttributes
  22. import BiddingKG.dl.entityLink.entityLink as entityLink
  23. import BiddingKG.dl.complaint.punish_rule as punish_rule
  24. import json
'''
doc_id = '3537ed94-9c7e-11e8-aed5-44a84246dbba'
conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
cursor = conn.cursor()
cursor.execute(" select content from articles where id='"+doc_id+"' ")
row = cursor.fetchall()[0]
#text = codecs.open("C:\\Users\\User\\Desktop\\1.html","r",encoding="utf8").read()
#content = segment(tableToText(BeautifulSoup(row[0],"lxml")))
'''
''''''
# Instantiate every predictor once at module import; predict() reuses them.
# NOTE(review): roles inferred from class names — confirm against the
# predictor module's own documentation.
codeNamePredict = predictor.CodeNamePredict()    # project code / project name extraction
premPredict = predictor.PREMPredict()            # presumably price/role/entity model
epcPredict = predictor.EPCPredict()              # presumably EPC (contact/person) model
roleRulePredict = predictor.RoleRulePredictor()  # rule-based role post-correction
timePredict = predictor.TimePredictor()          # classifies 'time' entities into labels
punish = punish_rule.Punish_Extract()            # complaint/punishment extraction rules
  41. #自定义jsonEncoder
  42. class MyEncoder(json.JSONEncoder):
  43. def default(self, obj):
  44. if isinstance(obj, np.ndarray):
  45. return obj.tolist()
  46. elif isinstance(obj, bytes):
  47. return str(obj, encoding='utf-8')
  48. elif isinstance(obj, (np.float_, np.float16, np.float32,
  49. np.float64)):
  50. return float(obj)
  51. elif isinstance(obj,str):
  52. return obj
  53. return json.JSONEncoder.default(self, obj)
def predict(doc_id, text):
    """Run the full extraction pipeline on one article and return the result as JSON.

    Pipeline order matters: preprocessing -> code/name -> PREM -> rule roles
    -> EPC -> time labels -> entity linking -> PREM aggregation -> punish
    extraction. Most steps mutate list_entitys / prem in place.

    Args:
        doc_id: article identifier handed to the preprocessor.
        text: raw article content (plain text or html).

    Returns:
        Pretty-printed JSON string (ensure_ascii=False) merging code/name,
        prem roles, punish info, bidway, moneySource, servicetime and the
        three time fields.
    """
    list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[doc_id,text,"","",""]],useselffool=True)
    for articles in list_articles:
        print(articles.content)
    ''''''
    codeName = codeNamePredict.predict(list_sentences,list_entitys=list_entitys)
    print(codeName)
    premPredict.predict(list_sentences,list_entitys)
    roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
    print("epcPredict")
    epcPredict.predict(list_sentences,list_entitys)
    print("entityLink")
    timePredict.predict(list_sentences, list_entitys)
    print("timePredict")
    entityLink.link_entitys(list_entitys)
    print("getPREMs")
    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
    print("getPREMs")
    # Complaint/punishment extraction; the hard-coded title matches the demo
    # article used in __main__ (trailing space is deliberate — do not strip).
    punish_dic = punish.get_punish_extracts(list_sentences, list_entitys, title='投诉处理 ', text=text)
    print(punish_dic)
    prem[0][1]['punish'] = punish_dic
    # Bidding method (招标方式)
    bidway = [entity.entity_text for entity in list_entitys[0] if entity.entity_type=='bidway']
    # Source of funds (资金来源)
    moneySource = [entity.entity_text for entity in list_entitys[0] if entity.entity_type=='moneySource']
    # Service period (服务时间)
    servicetime = [entity.entity_text for entity in list_entitys[0] if entity.entity_type=='servicetime']
    # Release time (发布时间): time entity with label 1
    time_release = [entity.entity_text for entity in list_entitys[0] if entity.entity_type == 'time' and entity.label==1]
    # Bid-opening time (开标时间): time entity with label 2
    time_bidopen = [entity.entity_text for entity in list_entitys[0] if entity.entity_type == 'time' and entity.label==2]
    # Bid-closing time (截标时间): time entity with label 3
    time_bidclose = [entity.entity_text for entity in list_entitys[0] if entity.entity_type == 'time' and entity.label == 3]
    # set() deduplicates, so ordering of the joined values is not stable.
    prem[0][1]['bidway'] = ';'.join(set(bidway))
    prem[0][1]['moneySource'] = ';'.join(set(moneySource))
    prem[0][1]['servicetime'] = ';'.join(set(servicetime))
    prem[0][1]['time_release'] = ';'.join(set(time_release))
    prem[0][1]['time_bidopen'] = ';'.join(set(time_bidopen))
    prem[0][1]['time_bidclose'] = ';'.join(set(time_bidclose))
    ''''''
    # Debug dump of every entity after all predictors have run.
    for entitys in list_entitys:
        for entity in entitys:
            print(entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.begin_index,entity.end_index,entity.wordOffset_begin,entity.wordOffset_end)
    #print(prem)
    return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
  99. def test(name,content):
  100. user = {
  101. "content": content,
  102. "id":name
  103. }
  104. myheaders = {'Content-Type': 'application/json'}
  105. _resp = requests.post("http://192.168.2.101:15015" + '/article_extract', json=user, headers=myheaders, verify=True)
  106. resp_json = _resp.content.decode("utf-8")
  107. print(resp_json)
  108. return resp_json
if __name__=="__main__":
    # --- earlier experiments kept for reference (html/csv fixtures) ---
    # filename = "比地_52_79929693.html"
    # #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
    # text = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
    # content = str(BeautifulSoup(text).find("div",id="pcontent"))
    # df_a = {"html":[]}
    # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
    # import pandas as pd
    # df = pd.DataFrame(df_a)
    # df.to_csv("C:\\Users\\User\\Desktop\\ba.csv")
    # print()
    #text = codecs.open("C:\\Users\\User\\Desktop\\a.html","r",encoding="utf8").read()
    # text = "张家港保税区宏宇建设咨询有限公司受张家港市给排水公司委托,就二次供水泵房浊度仪进行国内组织公开招标采购,欢迎符合条件的供应商参加投标。"
    # text = 'a建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,as,建设资金来自呜呜呜。'
    # text = '张家港保税区宏宇建设咨询有限公司受张家港市给排水公司委托,就二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,延时规则:在剩余数量小于最小购买数量时,竞价进'
    # text = '''大庆禾工煤炭分质清洁利用项目-临时用电二期工程设备、物资采购中标候选人公示,更多咨询报价请点击:http://bulletin.cebpubservice.com/candidateBulletin/2020-03-31/2678597.html,大庆禾工煤炭分质清洁利用顶目-临时用电二期工程设备、物资釆购中标候选人,(招标编号:XYwZ-20200309-5),公示结束时间:2020年04月03日,、评标情况,标段(包)[001大庆禾工煤嶽分质清洁利用项目-临时用屯二期工程设备、物资采购,中标候选人基本情况,
    # 中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天,中标候选人第2名:
    # 哈尔滨昊龙电气没备制造有限公司,投标报价:19.87万元,质,量:合格,工期/交货期/服务期:30天,'''
    # Demo article mixing bid-candidate, complaint, funding-source and auction
    # fragments so several predictors have something to extract.
    # NOTE(review): the backslash continuations splice the lines into ONE
    # string literal; any leading whitespace on the continuation lines would
    # become part of the text — they are kept at column 0 on purpose.
    text = '中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天。\
投诉处理公告,投诉人:张三。文章编号:京财采投字(2018)第42号。政府采购项目招标方式:公开招标,联系人:黎明。\
建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,\
二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
    a = time.time()
    print("start")
    # print(predict("12",content))
    print(predict("投诉处理公告", text))
    #test("12",text)
    # Wall-clock time of one full pipeline run.
    print("takes",time.time()-a)
    pass
  137. pass