# Repository: luojiehua / BIDI_ML_INFO_EXTRACTION
'''
Created on 2019-01-04

@author: User
'''
import os

from bs4 import BeautifulSoup, Comment
import copy
import re
import sys
import os
import codecs
import requests
import time

# Wall-clock timestamp captured at import time, before the BiddingKG imports
# below run. It is not referenced again in the visible part of this file.
_time1 = time.time()
# Add the directory two levels up to the module search path.
# NOTE(review): os.path.abspath resolves "../.." against the current working
# directory, not this file's location -- presumably this is what lets the
# BiddingKG imports below resolve; confirm the expected launch directory.
sys.path.append(os.path.abspath("../.."))

from BiddingKG.dl.common.Utils import *
import BiddingKG.dl.entityLink.entityLink as entityLink
import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing
import BiddingKG.dl.interface.getAttributes as getAttributes
import BiddingKG.dl.complaint.punish_predictor as punish_rule
import json


# Custom JSON encoder
class MyEncoder(json.JSONEncoder):
    """JSON encoder that also accepts NumPy values and UTF-8 ``bytes``.

    Pass it as ``cls=MyEncoder`` to ``json.dumps``. Objects the standard
    encoder cannot handle are converted as follows:

    * ``np.ndarray``  -> nested Python list (``tolist()``)
    * ``bytes``       -> ``str`` decoded as UTF-8
    * NumPy floats    -> ``float``
    * NumPy integers  -> ``int``
    * ``np.bool_``    -> ``bool``

    Anything else falls through to ``json.JSONEncoder.default``, which raises
    ``TypeError``.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        Raises:
            TypeError: if ``obj`` is of a type this encoder does not convert.
            UnicodeDecodeError: if ``obj`` is ``bytes`` that are not valid UTF-8.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        # np.floating is the abstract base of every NumPy float scalar type.
        # Checking it instead of a hand-written tuple avoids naming np.float_,
        # an alias that NumPy 2.0 removed (touching it raises AttributeError).
        if isinstance(obj, np.floating):
            return float(obj)
        # NumPy integer and boolean scalars are not JSON-serializable either.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        # json.dumps never routes a str here; kept so a direct default("...")
        # call behaves as before.
        if isinstance(obj, str):
            return obj
        return json.JSONEncoder.default(self, obj)

def predict(doc_id,text,title="",page_time="",**kwargs):
    """Run the extraction pipeline on one document and return the result as JSON text.

    The document is preprocessed once, then a fixed sequence of predictors is
    run over the resulting articles/sentences/entities. Several predictors
    return nothing and are called only for their effect on the shared lists,
    so the order of the steps below must be kept.

    Args:
        doc_id: Identifier of the document; only forwarded and logged here.
        text: Document content handed to preprocessing and to the
            ``product_attrs`` predictor.
        title: Document title, forwarded to preprocessing and to the
            ``channel`` predictor.
        page_time: Publication time, forwarded to preprocessing and to the
            ``product_attrs`` predictor.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        str: JSON (``indent=4``, sorted keys, non-ASCII kept as-is) of the
        merged predictor outputs plus two extra keys: ``cost_time`` (seconds
        per step, rounded to 2 decimals) and ``success`` (always ``True``).
    """
    cost_time = dict()

    start_time = time.time()
    log("start process doc %s"%(str(doc_id)))
    list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time]],useselffool=True)
    log("get preprocessed done of doc_id%s"%(doc_id))
    cost_time["preprocess"] = round(time.time()-start_time,2)
    cost_time.update(_cost_time)

    start_time = time.time()
    codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys)
    log("get codename done of doc_id%s"%(doc_id))
    cost_time["codename"] = round(time.time()-start_time,2)

    start_time = time.time()
    predictor.getPredictor("prem").predict(list_sentences,list_entitys)
    log("get prem done of doc_id%s"%(doc_id))
    cost_time["prem"] = round(time.time()-start_time,2)

    start_time = time.time()
    predictor.getPredictor("product").predict(list_sentences,list_entitys)
    log("get product done of doc_id%s"%(doc_id))
    cost_time["product"] = round(time.time()-start_time,2)

    start_time = time.time()
    product_attrs = predictor.getPredictor("product_attrs").predict(doc_id, text, page_time)
    log("get product attributes done of doc_id%s"%(doc_id))
    cost_time["product_attrs"] = round(time.time()-start_time,2)

    start_time = time.time()
    predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName)
    cost_time["rule"] = round(time.time()-start_time,2)

    start_time = time.time()
    predictor.getPredictor("epc").predict(list_sentences,list_entitys)
    log("get epc done of doc_id%s"%(doc_id))
    # NOTE: the timing of the "epc" predictor is stored under the key "person".
    cost_time["person"] = round(time.time()-start_time,2)

    start_time = time.time()
    predictor.getPredictor("time").predict(list_sentences, list_entitys)
    log("get time done of doc_id%s"%(doc_id))
    cost_time["time"] = round(time.time()-start_time,2)

    # Depends on sentence order.
    start_time = time.time()
    entityLink.link_entitys(list_entitys)
    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
    log("get attributes done of doc_id%s"%(doc_id))
    cost_time["attrs"] = round(time.time()-start_time,2)

    # Depends on sentence order. Only one document was preprocessed, so its
    # sentences are at index 0.
    start_time = time.time()
    list_channel_dic = predictor.getPredictor("channel").predict(title=title, content=list_sentences[0])
    cost_time["channel"] = round(time.time()-start_time,2)

    # NOTE(review): the punish result is computed and timed but is not merged
    # into the returned data below -- confirm whether that is intentional.
    start_time = time.time()
    list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
    cost_time["punish"] = round(time.time()-start_time,2)

    # Add every extracted product whose name occurs in a demand item's project
    # name to that item's product list.
    demand_items = product_attrs[1]['demand_info']['data']
    if len(demand_items)>0:
        # dict.fromkeys de-duplicates like set() but keeps first-seen order, so
        # the order of the appended products is reproducible between runs
        # (set iteration order over strings depends on the hash seed).
        unique_products = list(dict.fromkeys(prem[0]['product']))
        for d in demand_items:
            for product in unique_products:
                if product in d['project_name']:
                    d['product'].append(product)

    # Merge the per-document results; on a shared key the later part wins.
    # Sequential update() is used instead of dict(a, **b, **c, ...), which
    # raises TypeError as soon as two of the unpacked dicts share a key.
    data_res = dict(codeName[0])
    for part in (prem[0], list_channel_dic[0], product_attrs[0], product_attrs[1]):
        data_res.update(part)
    data_res["cost_time"] = cost_time
    data_res["success"] = True

    return json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)


def test(name,content,url="http://192.168.2.102:15030",timeout=None):
    """POST one document to an extraction service and return the raw response text.

    Args:
        name: Value sent as the ``"id"`` field of the JSON payload.
        content: Value sent as the ``"content"`` field of the JSON payload.
        url: Base URL of the service; ``/article_extract`` is appended to it.
            Defaults to the address that was previously hard-coded.
        timeout: Passed through to ``requests.post``. The default ``None``
            keeps the previous behaviour of waiting indefinitely; pass a
            number of seconds (or a ``(connect, read)`` tuple) to bound the
            call.

    Returns:
        str: The response body decoded as UTF-8. The HTTP status code is not
        checked.
    """
    user = {
        "content": content,
        "id": name,
    }
    myheaders = {'Content-Type': 'application/json'}
    _resp = requests.post(url + '/article_extract', json=user, headers=myheaders, verify=True, timeout=timeout)
    return _resp.content.decode("utf-8")


if __name__=="__main__":
    # Manual smoke run: feed rows 30..49 of a local spreadsheet through
    # predict() and print the demand/product parts of each result.
    import pandas as pd

    t1 = time.time()
    # Inline sample announcement and title; `text` is overwritten by the
    # spreadsheet rows in the loop below.
    text = '中标人：广州中医药有限公司，招标人：广州市第一人民医院， 代理机构：希达招标代理有限公司。招标金额：100万元， 手续费：100元，总投资：1亿元。中标金额：50000元。合同金额：50000万元。'
    title = '合同公告'

    df = pd.read_excel('E:/公告金额/产品名称采购需求预算金额采购时间等要素公告.xlsx')
    for row_idx in range(30, 50):
        text = df.loc[row_idx, 'dochtmlcon']
        rs = json.loads(predict('', text, ''))
        for key in ('demand_info', 'product', 'product_attrs'):
            print(rs[key])
    # Full result of the last processed row.
    print(rs)

    # Earlier ad-hoc experiments (an alternative input spreadsheet, a single
    # HTML file, and a batch re-prediction exported back to Excel) used to be
    # kept here as commented-out code; see version control history.