#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author : bidikeji
# @Time : 2021/1/11 0011 13:52
'''
Created on 2019-01-04
@author: User
'''
from bs4 import BeautifulSoup, Comment
import copy
import re
import sys
import os
import codecs
import requests
import time

_time1 = time.time()
sys.path.append(os.path.abspath("../.."))
import fool
from BiddingKG.dl.interface.Connection import *
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.interface.Connection import getConnection
import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing
import BiddingKG.dl.interface.getAttributes as getAttributes
import BiddingKG.dl.entityLink.entityLink as entityLink
import BiddingKG.dl.complaint.punish_predictor as punish_predictor
# import BiddingKG.dl.complaint.punish_rule as punish_predictor
import json
'''
doc_id = '3537ed94-9c7e-11e8-aed5-44a84246dbba'
conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
cursor = conn.cursor()
cursor.execute(" select content from articles where id='"+doc_id+"' ")
row = cursor.fetchall()[0]
#text = codecs.open("C:\\Users\\User\\Desktop\\1.html","r",encoding="utf8").read()
#content = segment(tableToText(BeautifulSoup(row[0],"lxml")))
'''
# Instantiate all extraction models once at import time
codeNamePredict = predictor.CodeNamePredict()
premPredict = predictor.PREMPredict()
epcPredict = predictor.EPCPredict()
roleRulePredict = predictor.RoleRulePredictor()
timePredict = predictor.TimePredictor()
# punish = punish_rule.Punish_Extract()
punish = punish_predictor.Punish_Extract()
productPredict = predictor.ProductPredictor()
# Custom JSON encoder: converts numpy and bytes values that the stock
# json.JSONEncoder cannot serialize
class MyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        elif isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
            return float(obj)
        elif isinstance(obj, str):
            return obj
        return json.JSONEncoder.default(self, obj)
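
# Minimal usage sketch for MyEncoder (illustrative data, not real model output):
# the predictors return numpy scalars/arrays and occasionally bytes, which the
# stock json.JSONEncoder rejects with a TypeError.
# demo = {"score": np.float32(0.98), "vector": np.array([1, 2, 3]), "raw": b"ok"}
# print(json.dumps(demo, cls=MyEncoder, ensure_ascii=False))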
def predict(doc_id, text, title=""):
    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(
        [[doc_id, text, "", "", title]], useselffool=True)
    for articles in list_articles:
        print(articles.content)

    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
    print(codeName)
    premPredict.predict(list_sentences, list_entitys)
    # roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
    print("epcPredict")
    epcPredict.predict(list_sentences, list_entitys)
    print("timePredict")
    timePredict.predict(list_sentences, list_entitys)
    print("entityLink")
    entityLink.link_entitys(list_entitys)
    print("getPREMs")
    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
    print("punish")
    list_punish_dic = punish.get_punish_extracts(list_articles, list_sentences, list_entitys)
    product = productPredict.predict(list_sentences, list_entitys)

    for entitys in list_entitys:
        for entity in entitys:
            print(entity.entity_text, entity.entity_type, entity.label, entity.values,
                  entity.sentence_index, entity.begin_index, entity.end_index,
                  entity.wordOffset_begin, entity.wordOffset_end)
    # print(prem)
    return json.dumps(
        Preprocessing.union_result(
            Preprocessing.union_result(
                Preprocessing.union_result(codeName, prem), list_punish_dic),
            product)[0],
        cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)
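
# Usage sketch (hypothetical doc id and html snippet; any Chinese announcement
# text can be substituted):
# print(predict("test-0001", "<div>某某单位办公设备采购项目招标公告…</div>", title="招标公告"))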
# predict_back mirrors predict() above, but enables the rule-based role step
# (roleRulePredict) and skips product extraction.
def predict_back(doc_id, html):
    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(
        [[doc_id, html, "", "", ""]], useselffool=True)
    for articles in list_articles:
        print(articles.content)

    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)  # predict project code and name
    print(codeName)
    premPredict.predict(list_sentences, list_entitys)  # role/money model
    roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)  # rule-based roles
    print("epcPredict")
    epcPredict.predict(list_sentences, list_entitys)  # contact-person model
    print("timePredict")
    timePredict.predict(list_sentences, list_entitys)  # time-classification model
    print("entityLink")
    entityLink.link_entitys(list_entitys)  # merge/link entities
    print("getPREMs")
    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)  # find bid packages and link package numbers to the other elements
    print("punish")
    # punish_dic = punish.get_punish_extracts(list_sentences, list_entitys, title=title, text=list_articles[0].content)
    list_punish_dic = punish.get_punish_extracts(list_articles, list_sentences, list_entitys)
    # print(punish_dic)
    # prem[0][1]['punish'] = punish_dic

    # bidway = []  # bidding method
    # moneySource = []  # source of funds
    # servicetime = []  # service period
    # time_release = []  # release time
    # time_bidopen = []  # bid-opening time
    # time_bidclose = []  # bid-closing time
    # for entity in list_entitys[0]:
    #     if entity.entity_type == 'bidway':
    #         bidway.append(entity.entity_text)
    #     elif entity.entity_type == 'moneySource':
    #         moneySource.append(entity.entity_text)
    #     elif entity.entity_type == 'servicetime':
    #         servicetime.append(entity.entity_text)
    #     elif entity.entity_type == 'time' and entity.label == 1:
    #         time_release.append(entity.entity_text)
    #     elif entity.entity_type == 'time' and entity.label == 2:
    #         time_bidopen.append(entity.entity_text)
    #     elif entity.entity_type == 'time' and entity.label == 3:
    #         time_bidclose.append(entity.entity_text)
    #
    # prem[0][1]['bidway'] = ';'.join(set(bidway))
    # prem[0][1]['moneySource'] = ';'.join(set(moneySource))
    # prem[0][1]['servicetime'] = ';'.join(set(servicetime))
    # prem[0][1]['time_release'] = ';'.join(set(time_release))
    # prem[0][1]['time_bidopen'] = ';'.join(set(time_bidopen))
    # prem[0][1]['time_bidclose'] = ';'.join(set(time_bidclose))
    #
    # for entitys in list_entitys:
    #     for entity in entitys:
    #         print(entity.entity_text, entity.entity_type, entity.label, entity.values, entity.sentence_index,
    #               entity.begin_index, entity.end_index, entity.wordOffset_begin, entity.wordOffset_end)
    #
    # print(prem)
    return json.dumps(
        Preprocessing.union_result(
            Preprocessing.union_result(codeName, prem), list_punish_dic)[0],
        cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)
    # return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],
    #                   cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)
def test(name, content):
    user = {
        "content": content,
        "id": name
    }
    myheaders = {'Content-Type': 'application/json'}
    _resp = requests.post("http://192.168.2.101:15015" + '/article_extract',
                          json=user, headers=myheaders, verify=True)
    resp_json = _resp.content.decode("utf-8")
    print(resp_json)
    return resp_json
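
# Usage sketch for the HTTP interface (assumes the extraction service is
# running at the hard-coded address above):
# test("doc-001", "某某项目招标公告全文…")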
if __name__ == "__main__":
    from tablestore import *
    endpoint = 'https://bxkc-ots.cn-hangzhou.ots.aliyuncs.com'
    access_key_id = 'LTAI4GJxbioV1y2WM3XcZTmP'
    access_key_secret = 'B3VITMoCnKtTQE6eAkDkat6UNFHped'
    instance_name = 'bxkc-ots'
    ots_client = OTSClient(endpoint, access_key_id, access_key_secret, instance_name)
    def get_data(query, max_rows, table_name='document',
                 index_name='document_index',
                 column_names=['docid', 'dochtmlcon', 'doctitle', 'info_type', 'page_time'],
                 sorters=[FieldSort("page_time", SortOrder.DESC), FieldSort("docid", SortOrder.DESC)]):
        '''
        Query data from Aliyun OTS (Tablestore).
        :param query: the search query to run
        :param max_rows: maximum number of rows to return
        :param table_name: table name
        :param index_name: search index name
        :param column_names: columns to return
        :param sorters: list of sort rules
        :return: list of processed rows
        '''
        next_token = None
        data = []
        all_rows = []
        rows, next_token, total_count, is_all_succeed = \
            ots_client.search(table_name,
                              index_name,
                              SearchQuery(query,
                                          next_token=next_token,
                                          sort=Sort(sorters=sorters),  # caller-supplied sort order
                                          limit=100,
                                          get_total_count=True),
                              ColumnsToGet(column_names=column_names,
                                           return_type=ColumnReturnType.SPECIFIED))
        all_rows.extend(rows)
        while next_token:
            rows, next_token, total_count, is_all_succeed = \
                ots_client.search(table_name,
                                  index_name,
                                  SearchQuery(query,
                                              next_token=next_token,
                                              sort=None,
                                              limit=100,
                                              get_total_count=True),
                                  ColumnsToGet(column_names=column_names,
                                               return_type=ColumnReturnType.SPECIFIED))
            all_rows.extend(rows)
            if len(all_rows) > max_rows:
                print('Fetched %d rows' % len(all_rows))
                break
        if all_rows:
            for row in all_rows:
                tmp = []
                tmp.append(row[0][1][1])  # value of the second primary-key column (docid)
                for tup in row[1]:
                    tmp.append(tup[1])  # attribute-column values
                data.append(tmp)
        return data
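
    # Note on paging: each ots_client.search call returns at most `limit` rows
    # plus a next_token for the following page; get_data keeps fetching until
    # the token is exhausted or the row count first exceeds max_rows (so the
    # result can overshoot max_rows by up to one page).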
    bool_query = TermQuery('docid', '124113339')
    # bool_query = BoolQuery(
    #     must_queries=[TermsQuery(field_name='info_type', column_values=['办公设备', '计算机设备']),
    #                   RangeQuery('page_time', range_from='2020-11-01', range_to='2020-11-30')]
    # )
    data = get_data(bool_query, 1)
    print(data)
    docid = str(data[0][0])
    html = data[0][1]
    title = data[0][2]
    # text = '中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天。\
    # 投诉处理公告,投诉人:张三。文章编号:京财采投字(2018)第42号。政府采购项目招标方式:公开招标,联系人:黎明。\
    # 建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,\
    # 二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
    # docid = ""
    # html = '首都医科大学附属北京地坛医院1.5T核磁共振、16排CT和血管造影机维保服务医疗设备维修和保养服务采购项目政府采购合同公告'
    # html = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
    a = time.time()
    print("start")
    # print(predict('12', text))
    print(predict(docid, html, title=title))
    # test("12", text)
    print("takes", time.time() - a)