#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Repository: luojiehua/BIDI_ML_INFO_EXTRACTION
# @Author  : bidikeji
# @Time    : 2021/1/11 0011 13:52 

'''
Created on 2019-01-04

@author: User
'''

from bs4 import BeautifulSoup, Comment
import copy
import re
import sys
import os
import codecs
import requests
import time

_time1 = time.time()
sys.path.append(os.path.abspath("../.."))
import fool
from BiddingKG.dl.interface.Connection import *
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.interface.Connection import getConnection
import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing
import BiddingKG.dl.interface.getAttributes as getAttributes
import BiddingKG.dl.entityLink.entityLink as entityLink
import BiddingKG.dl.complaint.punish_predictor as punish_predictor
# import BiddingKG.dl.complaint.punish_rule as punish_predictor
import BiddingKG.dl.channel.channel_predictor as channel_predictor
import json

'''
doc_id = '3537ed94-9c7e-11e8-aed5-44a84246dbba'

conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")

cursor = conn.cursor()

cursor.execute(" select content from articles where id='"+doc_id+"' ")

row = cursor.fetchall()[0]


#text = codecs.open("C:\\Users\\User\\Desktop\\1.html","r",encoding="utf8").read()

#content = segment(tableToText(BeautifulSoup(row[0],"lxml")))
'''

''''''
# Module-level predictor instances. They are constructed once, when this
# module is imported, and shared by predict() and predict_back() below.
codeNamePredict = predictor.CodeNamePredict()
premPredict = predictor.PREMPredict()
epcPredict = predictor.EPCPredict()
# NOTE(review): the rule-based role predictor is disabled here, so this block
# does not define roleRulePredict even though predict_back() refers to it.
# roleRulePredict = predictor.RoleRulePredictor()
timePredict = predictor.TimePredictor()
# Alternative rule-based punish extractor (its import is commented out above).
# punish = punish_rule.Punish_Extract()
punish = punish_predictor.Punish_Extract()
productPredict = predictor.ProductPredictor()
channelPredict = channel_predictor.DocChannel()

# Custom JSON encoder
class MyEncoder(json.JSONEncoder):
    """JSON encoder that also serialises NumPy values and ``bytes``.

    ``json`` only calls :meth:`default` for objects it cannot serialise
    natively, so this class merely extends the set of accepted types.
    """

    def default(self, obj):
        """Return a JSON-serialisable equivalent of ``obj``.

        Args:
            obj: The object ``json`` could not serialise on its own.

        Returns:
            A list for ``np.ndarray``, a UTF-8 decoded ``str`` for ``bytes``,
            a built-in ``float``/``int``/``bool`` for NumPy scalars, or the
            object itself for ``str``.

        Raises:
            TypeError: For any other type (raised by the base class).
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        # np.floating is the common base class of every NumPy float width.
        # The previously used np.float_ alias was removed in NumPy 2.0 and
        # made this branch raise AttributeError there.
        if isinstance(obj, np.floating):
            return float(obj)
        # NumPy integers and booleans are not JSON-serialisable by default.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, str):
            return obj
        return super().default(obj)


def predict(doc_id, text, title=""):
    """Run the extraction pipeline on one document and return the result as JSON.

    The document is preprocessed, the module-level predictors are applied in
    sequence, and their outputs are merged with ``Preprocessing.union_result``.
    Intermediate results are printed to stdout for debugging.

    Args:
        doc_id: Document identifier handed to ``Preprocessing.get_preprocessed``.
        text: Document content to extract from.
        title: Document title; passed to preprocessing and to the channel
            predictor.

    Returns:
        str: The merged result serialised with ``MyEncoder`` (sorted keys,
        4-space indent, non-ASCII characters kept as-is).
    """
    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(
        [[doc_id, text, "", "", title]], useselffool=True)
    for articles in list_articles:
        print(articles.content)

    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
    print(codeName)
    # The return values of the following predictors are not used here;
    # presumably they annotate the entities in place -- TODO confirm.
    premPredict.predict(list_sentences, list_entitys)
    # roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
    print("epcPredict")
    epcPredict.predict(list_sentences, list_entitys)
    print("entityLink")
    timePredict.predict(list_sentences, list_entitys)
    print("timePredict")
    entityLink.link_entitys(list_entitys)
    print("getPREMs")
    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
    print("getPREMs")
    list_punish_dic = punish.get_punish_extracts(list_articles, list_sentences, list_entitys)
    product = productPredict.predict(list_sentences, list_entitys)
    channel = channelPredict.predict(title, list_sentences[0])

    # Texts of the first document's entities whose notes equal '总投资'
    # ("total investment").
    total_tendereeMoney_list = [entity.entity_text for entity in list_entitys[0]
                                if entity.notes == '总投资']
    # Fix: this used to be max([total_tendereeMoney_list]), i.e. the maximum of
    # a one-element list, which always returned the whole list instead of its
    # largest member.
    # NOTE(review): if entity_text is a str the comparison is lexicographic,
    # not numeric -- confirm the type produced by Preprocessing.
    total_tendereeMoney = max(total_tendereeMoney_list) if total_tendereeMoney_list else 0

    for entitys in list_entitys:
        for entity in entitys:
            print(entity.entity_text, entity.entity_type, entity.label, entity.values, entity.sentence_index,
                  entity.begin_index, entity.end_index, entity.wordOffset_begin, entity.wordOffset_end,entity.sentence_index)
    # print(prem)
    # Earlier variants of the return value, kept for reference:
    # return json.dumps(Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic),product)[0],
    #                   cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)   # list_punish_dic
    # return json.dumps(Preprocessing.union_result(
    #     Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic),product), channel)[0],
    #                   cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)   # list_punish_dic

    # Left fold over union_result, equivalent to the former nested calls:
    # ((((codeName + prem) + punish) + product) + total investment) + channel.
    # NOTE(review): unlike predict_back and the variants above, the merged
    # result is serialised as a whole, without taking element [0].
    merged = codeName
    for part in (prem, list_punish_dic, product,
                 [{'total_tendereeMoney': total_tendereeMoney}], channel):
        merged = Preprocessing.union_result(merged, part)
    return json.dumps(merged, cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)


def predict_back(doc_id, html):
    """Run the extraction pipeline including the rule-based role predictor.

    Compared with ``predict`` this variant takes no title, additionally calls
    ``roleRulePredict.predict`` and merges only the code/name, PREM and punish
    results. Intermediate results are printed to stdout for debugging.

    Args:
        doc_id: Document identifier handed to ``Preprocessing.get_preprocessed``.
        html: Document content to extract from.

    Returns:
        str: Element ``[0]`` of the merged result, serialised with
        ``MyEncoder`` (sorted keys, 4-space indent, non-ASCII kept as-is).
    """
    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(
        [[doc_id, html, "", "", ""]], useselffool=True)
    for articles in list_articles:
        print(articles.content)

    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)  # predict project code and name
    print(codeName)
    premPredict.predict(list_sentences, list_entitys)  # role / money model

    # Fix: the module-level ``roleRulePredict`` instantiation is commented out,
    # so referencing the bare name raised NameError. Reuse a module-level
    # instance when one exists, otherwise build it once and cache it.
    role_rule_predictor = globals().get("roleRulePredict")
    if role_rule_predictor is None:
        role_rule_predictor = predictor.RoleRulePredictor()
        globals()["roleRulePredict"] = role_rule_predictor
    role_rule_predictor.predict(list_articles, list_sentences, list_entitys, codeName)  # role rules

    print("epcPredict")
    epcPredict.predict(list_sentences, list_entitys)  # contact person model
    print("entityLink")
    timePredict.predict(list_sentences, list_entitys)  # time category model
    print("timePredict")
    entityLink.link_entitys(list_entitys)
    print("getPREMs")
    # Find packages and link the package numbers with the other elements.
    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
    print("getPREMs")
    # punish_dic = punish.get_punish_extracts(list_sentences, list_entitys, title=title, text=list_articles[0].content)
    list_punish_dic = punish.get_punish_extracts(list_articles, list_sentences, list_entitys)
    # punish_dic = punish.get_punish_extracts(list_articles,list_sentences, list_entitys)
    # print(punish_dic)
    # prem[0][1]['punish'] = punish_dic

    # bidway = []  # bidding method
    # moneySource = []  # funding source
    # servicetime = []  # service period
    # time_release = []  # release time
    # time_bidopen = []  # bid opening time
    # time_bidclose = []  # bid closing time
    # for entity in list_entitys[0]:
    #     if entity.entity_type == 'bidway':
    #         bidway.append(entity.entity_text)
    #     elif entity.entity_type == 'moneySource':
    #         moneySource.append(entity.entity_text)
    #     elif entity.entity_type == 'servicetime':
    #         servicetime.append(entity.entity_text)
    #     elif entity.entity_type == 'time' and entity.label == 1:
    #         time_release.append(entity.entity_text)
    #     elif entity.entity_type == 'time' and entity.label == 2:
    #         time_bidopen.append(entity.entity_text)
    #     elif entity.entity_type == 'time' and entity.label == 3:
    #         time_bidclose.append(entity.entity_text)
    #
    # prem[0][1]['bidway'] = '；'.join(set(bidway))
    # prem[0][1]['moneySource'] = '；'.join(set(moneySource))
    # prem[0][1]['servicetime'] = '；'.join(set(servicetime))
    # prem[0][1]['time_release'] = '；'.join(set(time_release))
    # prem[0][1]['time_bidopen'] = '；'.join(set(time_bidopen))
    # prem[0][1]['time_bidclose'] = '；'.join(set(time_bidclose))
    #
    # ''''''
    #
    # for entitys in list_entitys:
    #     for entity in entitys:
    #         print(entity.entity_text, entity.entity_type, entity.label, entity.values, entity.sentence_index,
    #               entity.begin_index, entity.end_index, entity.wordOffset_begin, entity.wordOffset_end)
    #
    # print(prem)
    merged = Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic)
    return json.dumps(merged[0], cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)

    # return json.dumps(Preprocessing.union_result(codeName, prem)[0][1], cls=MyEncoder, sort_keys=True, indent=4,
    #                   ensure_ascii=False)


def test(name, content, base_url="http://192.168.2.101:15015", timeout=None):
    """POST one document to the ``/article_extract`` HTTP service.

    Args:
        name: Sent as the ``id`` field of the JSON payload.
        content: Sent as the ``content`` field of the JSON payload.
        base_url: Scheme, host and port of the service. Defaults to the
            address that used to be hard-coded here.
        timeout: Forwarded to ``requests.post``. ``None`` (the default) waits
            indefinitely, which is the previous behaviour; pass a number of
            seconds to bound the call.

    Returns:
        str: The response body decoded as UTF-8. It is also printed.
    """
    user = {
        "content": content,
        "id": name,
    }
    myheaders = {'Content-Type': 'application/json'}
    _resp = requests.post(base_url + '/article_extract', json=user, headers=myheaders,
                          verify=True, timeout=timeout)
    resp_json = _resp.content.decode("utf-8")
    print(resp_json)
    return resp_json


# Manual smoke test: run predict() on one hard-coded sample announcement and
# print the JSON result together with the elapsed time.
if __name__ == "__main__":
    # Disabled helper code that fetched a document from Alibaba Cloud OTS.
    # SECURITY: an access key id and secret used to be embedded in the two
    # assignments below. They are redacted here; the original values should be
    # treated as leaked and rotated, and read from the environment if this
    # code is ever re-enabled.
    # from tablestore import *
    # endpoint = 'https://bxkc-ots.cn-hangzhou.ots.aliyuncs.com'
    # access_key_id = '<REDACTED>'
    # access_key_secret = '<REDACTED>'
    # instance_name = 'bxkc-ots'
    # ots_client = OTSClient(endpoint, access_key_id, access_key_secret, instance_name)
    #
    # def get_data(query, max_rows, table_name='document',
    #              index_name='document_index',
    #              column_names=['docid', 'dochtmlcon','doctitle', 'info_type', 'page_time'],
    #              sorters=[FieldSort("page_time", SortOrder.DESC), FieldSort("docid", SortOrder.DESC)]):
    #     '''
    #     Query data from Alibaba Cloud OTS.
    #     :param query: the query to run
    #     :param max_rows: maximum number of rows to return
    #     :param table_name: table name
    #     :param index_name: name of the table index
    #     :param column_names: names of the columns to return
    #     :param sorters: list of sort rules
    #     :return: list of processed rows
    #     '''
    #     next_token = None
    #     data = []
    #     all_rows = []
    #     rows, next_token, total_count, is_all_succeed = \
    #         ots_client.search(table_name,
    #                           index_name,
    #                           SearchQuery(query,
    #                                       next_token=next_token,
    #                                       sort=Sort(sorters=sorters),  # ASC = ascending order
    #                                       limit=100,
    #                                       get_total_count=True),
    #                           ColumnsToGet(column_names=column_names,
    #                                        return_type=ColumnReturnType.SPECIFIED))
    #     all_rows.extend(rows)
    #     while next_token:
    #         rows, next_token, total_count, is_all_succeed = \
    #             ots_client.search(table_name,
    #                               index_name,
    #                               SearchQuery(query,
    #                                           next_token=next_token,
    #                                           sort=None,
    #                                           limit=100,
    #                                           get_total_count=True),
    #                               ColumnsToGet(column_names=column_names,
    #                                            return_type=ColumnReturnType.SPECIFIED))
    #         all_rows.extend(rows)
    #         if len(all_rows) > max_rows:
    #             # message text: "fetched %d rows so far"
    #             print('已获取%d条数据' % len(all_rows))
    #             break
    #
    #     if all_rows:
    #         for row in all_rows:
    #             tmp = []
    #             tmp.append(row[0][1][1])
    #             for tup in row[1]:
    #                 tmp.append(tup[1])
    #             data.append(tmp)
    #     return data
    #
    #
    # bool_query = TermQuery('docid','124113339')
    # # bool_query = BoolQuery(
    # #     must_queries=[TermsQuery(field_name='info_type', column_values=['办公设备', '计算机设备']),
    # #                   RangeQuery('page_time', range_from='2020-11-01', range_to='2020-11-31')]
    # # )
    #
    # data = get_data(bool_query, 1)
    # print(data)
    # docid = str(data[0][0])
    # html = data[0][1]
    # title = data[0][2]
    # Disabled sample inputs. The Chinese text is test data for the extraction
    # models and is intentionally kept verbatim.
    # text = '中标候选人第1名：哈尔滨龙网电力设备有限公司，投标报价：19.98万元，质量，合格，工期/交货期/服务期：30天。\
    # 投诉处理公告，投诉人：张三。文章编号：京财采投字(2018)第42号。政府采购项目招标方式：公开招标，联系人：黎明。\
    # 建设资金来源及性质：资本金40%，自筹60%，,xx.=建设资金来源自筹，项目出资比例为100%，\
    # 二次供水泵房浊度仪进行国内组织公开招标采购，时间：2020-05-26，15:15:00，竞价结束时间：2020-05-26，15:45:00允许延时：是，'
    docid = ""
    # title = '招标公告'
    # html = '招标人：广州市人民医院。代理人：广州医疗代理服务公司。招标金额：3000元，总投资：5万元。中标人：比地科技有限公司，中标金额：1万元。'
    # Active sample: a space-separated (pre-tokenised) announcement and title.
    html = """， [ 正在 公告 ] 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) ， 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) 采购 结果 公告 ， 项目 名称 ， 公司 2020 - 2021 年度 打印 制作 服务 项目 编号 ， 20200803030110070001 采购 组织 人 ， 中 节能 建筑 节能 有限公司 河南 分公司 采购 方式 ， 谈判 采购 成交 信息 ， 序号 ， 标段 ( 包 ) 编号 ， 标段 ( 包 ) 名称 ， 成交 供应商 ， 成交 金额 20200803030110070001001 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) 郑州市 上街区 永达 文印部 null 元 公告 起 止 时间 2021年 04月 14日 - 2021年 04月 17日 ，
"""
    title = """[ 正在 公告 ] 公司 2020 - 2021 年度 打印 制作 服务 ( 重新 招标 ) ( 变更 采购 方式 ) ，
"""
    # Strip the separating spaces so the text is contiguous again.
    html = html.replace(' ', '')
    title = title.replace(' ', '')
    # html = '首都医科大学附属北京地坛医院1.5T核磁共振、16排CT和血管造影机维保服务医疗设备维修和保养服务采购项目政府采购中标候选人公示，中标人：广州比地科技有限公司，中标金额：6000万元'
    # html = '编号：厦财企〔2020〕12号，各有关单位：341号。处罚编号：厦财企〔2020〕12号，文章编号：京财采投字(2018)第42号。公告编号：闽建筑招〔2018〕5号。处罚编号：松公管监[2020]2号,'

    a = time.time()
    print("start")
    # print(predict('12',text))
    # NOTE(review): `title` is prepared above but an empty title is passed
    # here -- confirm whether title=title was intended.
    print(predict(docid, html,title=""))
    # test("12",text)
    print("takes", time.time() - a)
    pass