
Add product field extraction, integrate product elements into the entity class and include them in the output, and switch the punish-code prediction to a frozen .pb call

bidi 4 years ago
parent
commit
043c4be7ce

BIN
BiddingKG/dl/complaint/models/punish_code.pb


+ 459 - 0
BiddingKG/dl/complaint/punish_predictor.py

@@ -0,0 +1,459 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/1/25 0025 16:35 
+
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2020/12/24 0024 15:23
+import re
+import os
+import time
+import tensorflow as tf
+# from BiddingKG.dl.common.Utils import *
+from tensorflow.contrib.crf import crf_log_likelihood
+from tensorflow.contrib.layers.python.layers import initializers
+# from keras.preprocessing.sequence import pad_sequences
+# import BiddingKG.dl.interface.Preprocessing as Preprocessing
+from BiddingKG.dl.interface.Preprocessing import *
+
+
+def decode(logits, trans, sequence_lengths, tag_num):
+    viterbi_sequences = []
+    for logit, length in zip(logits, sequence_lengths):
+        score = logit[:length]
+        viterbi_seq, viterbi_score = viterbi_decode(score, trans)
+        viterbi_sequences.append(viterbi_seq)
+    return viterbi_sequences
+
+class Punish_Extract():
+    def __init__(self, model_file = os.path.dirname(__file__)+"/models/punish_code.pb"):
+        print('model_file_path:',model_file)
+        self.sess = tf.Session(graph=tf.Graph())
+        self.code = ""
+        self.punish_dicition = ""
+        self.model_file = model_file #预测编号模型
+        self.load_model()
+
+    # load the punish-code prediction model from the frozen .pb graph
+    def load_model(self):
+        log("get model of time")
+        with self.sess.as_default():
+            with self.sess.graph.as_default():
+                output_graph_def = tf.GraphDef()
+                with open(self.model_file, 'rb') as f:
+                    output_graph_def.ParseFromString(f.read())
+                    tf.import_graph_def(output_graph_def, name="")
+                    self.sess.run(tf.global_variables_initializer())
+                    self.char_input = self.sess.graph.get_tensor_by_name("char_input:0")
+                    self.length = self.sess.graph.get_tensor_by_name("length:0")
+                    self.trans = self.sess.graph.get_tensor_by_name("crf_loss/transitons:0")
+                    self.logits = self.sess.graph.get_tensor_by_name("CRF/output/logits:0")
+
+    # punish-code prediction
+    def predict_punishCode(self,list_sentences, MAXlLEN=5000):
+        '''
+        Predict the punish code for every sentence.
+        :param list_sentences: sentence lists of multiple documents, [[sentences of one document], ...]
+        :param MAXlLEN: maximum sentence length; longer text is truncated
+        :return: punish-code string; multiple codes are joined by ";"
+        '''
+        re_ner = re.compile("12+?3")
+        article_ner_list = []
+        count = 0
+        with self.sess.as_default():
+            with self.sess.graph.as_default():
+                for sentences in list_sentences:
+                    count += 1
+                    # print(count)
+                    sentence_len = [len(sentence.sentence_text) for sentence in sentences]
+                    maxlen = max(sentence_len)
+                    sentences_x = []
+                    for sentence in sentences:
+                        sentence = sentence.sentence_text
+                        sentence = list(sentence)
+                        sentence2id = [getIndexOfWord(word) for word in sentence]
+                        sentences_x.append(sentence2id)
+                    sentences_x = pad_sequences(sentences_x, maxlen=maxlen, padding="post", truncating="post")
+                    sentences_x = [np.array(x) for x in sentences_x]
+                    _logits, _trans = self.sess.run([self.logits, self.trans],
+                                               feed_dict={self.char_input: np.array(sentences_x), self.length: sentence_len})
+                    viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=4)
+
+                    ner_list = []
+                    for _seq, sentence in zip(viterbi_sequence, sentences):
+                        sentence = sentence.sentence_text
+                        seq_id = ''.join([str(s) for s in _seq])
+                        if re_ner.search(seq_id):
+                            # print("sentence: ",sentence)
+                            for _ner in re_ner.finditer(seq_id):
+                                start = _ner.start()
+                                end = _ner.end()
+                                n = sentence[start:end]
+                                # print(n,'<==>',start,end)
+                                # ner_list.append((n, start, end))
+                                ner_list.append(n)  # 改为只返回实体字符
+                    # article_ner_list.append(ner_list)
+                    article_ner_list.append(';'.join(set(ner_list)))
+        return article_ner_list[0]
+
+    # punishment type classification
+    def get_punishType(self, x1, x2):
+        '''Classify the announcement by its title and body text.
+        x1: title
+        x2: body text
+        return: (matched keyword, category)'''
+        # x1 = x1.replace('(','(').replace(')', ')').replace(' ','')
+        # x2 = x2.replace('(', '(').replace(')', ')').replace(' ', '')
+        '''标题正则'''
+        # 未知公告
+        unknow = re.compile('采购方式|采购公告|采购招标|磋商公告|谈判公告|交易公告$|征集|征求|招标公告|竞标公告|中标公告|'
+                            '成交公告|成交信息|流标公告|废标公告|城市管理考评|决算表|决算|预算|资格考试|招聘|选聘'
+                            '|聘请|拟录用|无违规违法|无此项信息|暂无工程投标违法|管理办法|指导意见|无投诉|投诉办法'
+                            '公共资源交易情况|绩效评价|考试成绩|付息公告|不动产|办证|印发|转发')  #|结果公示 部分是
+        # 投诉处理
+        tscl = re.compile('投诉不予[处受]理|投诉不成立|终止投诉|投诉终止|不予受理|投诉事?项?的?处理')
+        # 行政处罚
+        xzcf = re.compile('行政处罚|行政处理|政处罚|行政裁决|防罚|公罚|医罚|环罚|政罚|文罚|局罚|旅罚|财罚|运罚')
+        # 监督检查
+        jdjc = re.compile('(监督检查的?问?题?(处理|整改|记分|结果|决定|处罚))|监督处罚|调查处理|监督处理')
+        # 严重违法
+        yzwf = re.compile('严重违法失信|黑名单|失信名单')
+        # 不良行为
+        blxw = re.compile('((不良|失信|不诚信|差错|不规范|违规|违约|处罚|违法)(行为|记录|信息))|((违约|违规|违法)(处理|操作|情况|问题))'
+                          '|通报批评|记分管理|迟到|早退|缺席|虚假材料|弄虚作假|履职不到位|诚信考核扣分|串通投标'
+                          '|审核不通过|码一致|地址一致|扣分处理|扣分通知|扣[0-9]+分|责令整改|信用信息认定书$'
+                          '|关于.{,30}的处罚|关于.{,10}的?考评通报|关于.{,30}扣分情况|不规范代理行为'
+                          '|(取消|暂停|限制).{,50}((专家|评标|评委|投标|竞价|被抽取|中标|供应商|候选人)资格)'
+                          '|(代理服?务?机构).{,10}(扣分)|(专家).{,30}(扣分|记分|处罚)|对.{,30}处理|冻结.{,30}账号')
+        # 其他不良行为
+        other = re.compile('质疑|代理机构进场交易情况|网上投诉办理|信用奖惩|信用奖罚|进场工作.{,5}考核'
+                           '|举报处理|结果无效|成交无效|行政复议')
+
+        '''正文内容正则'''
+        # 投诉处理
+        tscl_c = re.compile('(投诉(人|单位)[1-9]?(名称)?[::])|(投诉事项[1-5一二三四五、]*部?分?(成立|予以受理))'
+                            '|((驳回|撤回|撤销|终止)[^,。]{,60}(投诉|质疑))')
+        # 行政处罚
+        xzcf_c = re.compile('((处理依据及结果|处理结果|处罚结果)).*行政处罚|如下行政处罚|行政处罚决定')
+        # 诚信加分
+        cxjf_c = re.compile('处罚结果.*诚信加分')
+        # 严重违法失信
+        yzwf_c = re.compile('工商部门严重违法失信起名单|严重违法失信的具体情形') #|严重违法失信的具体情形
+        # 不良行为
+        blxw_c = re.compile('(取消|暂停|限制).{,30}((专家|评标|评委|投标|采购|竞价|被抽取|中标|供应商)的?资格)'
+                            '|(处罚结果|处罚情况).*(扣[1-9]*分|记分|不良行为|不良记录|不良信用|不诚信|扣除信用'
+                            '|诚信档案|信用信息|取消.*资格|口头警告|处罚机关|责令改正|罚款|限制投标|暂扣|禁止'
+                            '|暂停|封禁|暂无|行政处罚)|处罚结果'
+                            '|处罚主题|禁止参与.{,10}政府采购活动|列入不良行为|处罚如下|如下处罚|违规处罚|处罚违规'
+                            '|责令改正|责令整改|处罚依据|进行以下处理|处理依据及结果|处理结果|处罚决定书|'
+                            '(不规范|不良|不诚信)行为记录')
+        # 其他不良行为
+        other_c = re.compile('质疑(人|单位)[1-9]?(名称)?:|公告期内受质疑')
+
+        if re.search(unknow, x1):
+            return re.search(unknow, x1).group(0), '未知类别'
+        elif re.search(yzwf, x1):
+            return re.search(yzwf, x1).group(0), '严重违法'
+        elif re.search(yzwf_c, x2):
+            return re.search(yzwf_c, x2).group(0), '严重违法'
+
+        elif re.search(tscl, x1):
+            return re.search(tscl, x1).group(0), '投诉处理'
+        elif re.search(xzcf, x1):
+            return re.search(xzcf, x1).group(0), '行政处罚'
+        elif re.search(jdjc, x1):
+            return re.search(jdjc, x1).group(0), '监督检查'
+        elif re.search(blxw, x1):
+            return re.search(blxw, x1).group(0), '不良行为'
+        elif re.search(other, x1):
+            return re.search(other, x1).group(0), '其他不良行为'
+
+        elif re.search(tscl_c, x2):
+            return re.search(tscl_c, x2).group(0), '投诉处理'
+        elif re.search(xzcf_c, x2):
+            return re.search(xzcf_c, x2).group(0), '行政处罚'
+        elif re.search(cxjf_c, x2):
+            return re.search(cxjf_c, x2).group(0), '诚信加分'
+
+        elif re.search(blxw_c, x2):
+            return re.search(blxw_c, x2).group(0), '不良行为'
+        elif re.search(other_c, x2):
+            return re.search(other_c, x2).group(0), '其他不良行为'
+
+        return ' ', '未知类别'
+
+    # punishment decision
+    def get_punishDecision(self, x, x2):
+        '''Extract the handling decision from the body text by regex.
+        x: body text
+        x2: punishment category
+        return: decision string'''
+        rule1 = re.compile(
+            '(((如下|以下|处理|研究|本机关|我机关|本局|我局)决定)|((决定|处理|处理意见|行政处罚|处罚)(如下|如下))'
+            '|((以下|如下)(决定|处理|处理意见|行政处罚|处罚))|处理依据及结果|处理结果|处罚结果|处罚情况|限制行为'
+            '|整改意见)[::].{5,}')
+        rule2 = re.compile(
+            '(((如下|以下|处理|研究|本机关|我机关|本局|我局)决定)|((决定|处理|处罚|处理意见)(如下|如下))'
+            '|((以下|如下)(决定|处理|处理意见|处罚))|处理依据及结果|处理结果|处罚结果|处罚情况|限制行为'
+            '|处罚内容)[:,,].{10,}')
+        rule3 = re.compile('考评结果:?.*')
+        rule4 = re.compile('(依据|根据)《.*》.*')
+        if x2 == '未知类别':
+            return ' '
+        elif re.search(rule1, x[-int(len(x)*0.4):]):
+            return re.search(rule1, x[-int(len(x)*0.4):]).group(0)
+        elif re.search(rule1, x[-int(len(x)*0.6):]):
+            return re.search(rule1, x[-int(len(x)*0.6):]).group(0)
+        elif re.search(rule2, x[-int(len(x)*0.7):]):
+            return re.search(rule2, x[-int(len(x)*0.7):]).group(0)
+        elif re.search(rule3, x[-int(len(x)*0.6):]):
+            return re.search(rule3, x[-int(len(x)*0.6):]).group(0)
+        elif re.search(rule4, x[-int(len(x)*0.4):]):
+            return re.search(rule4, x[-int(len(x)*0.4):]).group(0)
+        else:
+            return ' '
+
+    # whether the complaint is upheld
+    def get_punishWhether(self, x1, x2, x3):
+        '''Decide whether the complaint is upheld by regex-matching the decision text.
+        x1: decision string
+        x2: body text
+        x3: punishment category
+        return: whether the complaint is upheld'''
+        p1 = re.compile('(投诉|投拆|质疑|举报)(事项|内容|事实)?[^不,。]{,10}(成立|属实|予以受理|予以支持)|责令|废标|(中标|成交)[^,。]{,10}无效'
+                        '|取消[^,。]{,60}资格|罚款|重新(组织|开展)?(招标|采购)|投诉成立|被投诉人存在违法违规行为'
+                        '|采购活动违法|(中标|评标|成交)结果无效')
+        p2 = re.compile('投诉不予[处受]理|((投诉|投拆|质疑|举报)(事项|内容|事实)?[^,。]{,10}(不成立|情?况?不属实|不予支持|缺乏事实依据))'
+                        '|((驳回|撤回|撤销|终止)[^,。]*(投诉|质疑|诉求))|终止[^,。]{,20}(行政裁决|投诉处理|采购活动)|投诉终止|投诉无效'
+                        '|予以驳回|不予受理|继续开展采购|被投诉人不存在违法违规行为|中标结果有效|投诉[^,。]{,10}不成立'
+                        '|维持被投诉人|不支持[^,。]{,20}投诉|无确凿证据')
+        if x3 != '投诉处理':
+            return ' '
+        elif re.search(p1, x1):
+            return '投诉成立'
+        elif re.search(p2, x1):
+            return '投诉无效'
+        elif re.search(p1, x2):
+            return '投诉成立'
+        elif re.search(p2, x2):
+            return '投诉无效'
+        return ' '
+
+    # enforcement agency and punishment time
+    def get_institution(self, title, sentences_l, entity_l):
+        '''
+        Use the text in front of each entity to decide whether it is the enforcement agency.
+        :param title: document title
+        :param sentences_l: sentence list of a single announcement
+        :param entity_l: entity list of a single announcement
+        :return: enforcement-agency and punishment-time strings; multiple values are joined by ";"
+        '''
+        institutions = []
+        punishTimes = []
+        institution_1 = re.compile("(?:处罚执行部门|认定部门|执法机关名称|执法单位|通报部门|处罚机关|处罚部门)[::]")
+        punishTimes_1 = re.compile("(?:处罚日期|限制行为开始时间|曝光开始日期|处罚决定日期|处罚期限|处罚时间|处理日期|公告开始时间)[::]")
+        # use the keywords in front of an entity to decide whether it is an enforcement agency or a punishment time
+        for ner in entity_l:
+            if ner.entity_type == 'org':
+                left = sentences_l[ner.sentence_index].sentence_text[
+                       max(0, ner.wordOffset_begin - 15):ner.wordOffset_begin]
+                if institution_1.search(left):
+                    institutions.append(ner)
+                elif institutions != [] and ner.sentence_index == institutions[-1].sentence_index and \
+                        ner.wordOffset_begin - institutions[-1].wordOffset_end < 2 and \
+                        sentences_l[ner.sentence_index].sentence_text[
+                        ner.wordOffset_begin:institutions[-1].wordOffset_end] \
+                        in ['', '、', '和', '及']:
+                    institutions.append(ner)
+            elif ner.entity_type == 'time':
+                left = sentences_l[ner.sentence_index].sentence_text[
+                       max(0, ner.wordOffset_begin - 15):ner.wordOffset_begin]
+                if punishTimes_1.search(left):
+                    punishTimes.append(ner)
+
+        institution_title = re.compile("财政局|财政厅|监督管理局|公管局|公共资源局|委员会")
+        institution_time = re.compile(
+            "(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
+        ins = ""
+        ptime = ""
+        # if no agency was found above, run NER on the title and check the result against the keyword regex
+        if institutions == [] and len(title)>10:
+            title_ners = getNers([title], useselffool=True)
+            if title_ners[0]:
+                for title_ner in title_ners[0]:
+                    if title_ner[2] == 'org' and institution_title.search(title_ner[3]):
+                        ins = title_ner[3]
+                        break
+        if punishTimes == [] or institutions == []:
+            # if the fields are still missing, look for a date right after an org entity and use that pair as agency and punishment time
+            for ner in [ner for ner in entity_l if ner.entity_type == 'org'][-5:][::-1]:
+                right = sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_end:ner.wordOffset_end + 16]
+                if institution_time.search(right):
+                    if ins == '':
+                        ins = ner.entity_text
+                    if ptime == '':
+                        ptime = institution_time.search(right).group(1)
+                    break
+            # if still nothing was found, take the last time entity as the punishment time when it sits at the very end of the document
+            if ptime == '':
+                n_time = [ner for ner in entity_l if ner.entity_type == 'time']
+                if len(n_time) != 0:
+                    ner = n_time[-1]
+                    if ner.sentence_index == len(sentences_l) - 1:
+                        textLong = len(sentences_l[ner.sentence_index].sentence_text)
+                        if ner.wordOffset_end > textLong - 3 and len(ner.entity_text) > 3:
+                            ptime = ner.entity_text
+        institutions = [ner.entity_text for ner in institutions]
+        punishTimes = [ner.entity_text for ner in punishTimes]
+        if institutions == [] and ins != "":
+            institutions.append(ins)
+        if punishTimes == [] and ptime != "":
+            punishTimes.append(ptime)
+        return ";".join(institutions), ";".join(punishTimes)
+
+    # complainant, respondent and punished party
+    def get_complainant(self, punishType, sentences_l, entity_l):
+        '''
+        Use the announcement category, sentence list and entity list to find the complainant, respondent and punished party by regex.
+        :param punishType: punishment category of the announcement
+        :param sentences_l: sentence list of a single announcement
+        :param entity_l: entity list of a single announcement
+        :return: complainants, respondents/punished parties
+        '''
+        complainants = []  # 投诉人
+        punishPeople = []  # 被投诉人、被处罚人
+        size = 16
+        # 投诉人、质疑人
+        complainants_rule1 = re.compile(
+            "(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+        # 被处罚人,被投诉人
+        punishPeople_rule1 = re.compile(
+            "(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+        punishPeople_rule2_1 = re.compile(",$")
+        punishPeople_rule2_2 = re.compile("^[::]")
+        punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
+        punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
+
+        punish_l = []  # 处罚实体列表
+        tmp = []
+        for ner in [ner for ner in entity_l if ner.entity_type in ['org', 'company', 'person']]:
+            if tmp == []:
+                tmp.append(ner)
+            elif ner.entity_type == tmp[-1].entity_type and ner.sentence_index == tmp[-1].sentence_index and \
+                    ner.wordOffset_begin - tmp[-1].wordOffset_end < 2 \
+                    and sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_begin:tmp[-1].wordOffset_end] in [
+                '',
+                '、',
+                '和',
+                '及']:
+                tmp.append(ner)
+            elif ner.entity_type in ['org', 'company'] and tmp[-1].entity_type in ['org', 'company'] and \
+                    ner.sentence_index == tmp[-1].sentence_index and ner.wordOffset_begin - tmp[-1].wordOffset_end < 2 \
+                    and sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_begin:tmp[-1].wordOffset_end] in [
+                '',
+                '、',
+                '和',
+                '及']:
+                tmp.append(ner)
+            else:
+                punish_l.append(tmp)
+                tmp = [ner]
+        if tmp:
+            punish_l.append(tmp)  # flush the final group so the last entities are not silently dropped
+        for ner_l in punish_l:
+            begin_index = ner_l[0].wordOffset_begin
+            end_index = ner_l[-1].wordOffset_end
+            left = sentences_l[ner_l[0].sentence_index].sentence_text[max(0, begin_index - size):begin_index]
+            right = sentences_l[ner_l[0].sentence_index].sentence_text[end_index:end_index + size]
+            if complainants_rule1.search(left):
+                complainants.append(ner_l)
+            elif punishPeople_rule1.search(left):
+                punishPeople.append(ner_l)
+            elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
+                if punishType == '投诉处理':
+                    complainants.append(ner_l)
+                else:
+                    punishPeople.append(ner_l)
+            elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
+                punishPeople.append(ner_l)
+        complainants = set([it.entity_text for l in complainants for it in l])
+        punishPeople = set([it.entity_text for l in punishPeople for it in l])
+        return ';'.join(complainants), ';'.join(punishPeople)
+
+    def get_punish_extracts(self,list_articles,list_sentences, list_entitys):
+        list_result = []
+        for article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
+            title = article.title
+            text=article.content
+            keyword, punishType = self.get_punishType(title, text)
+            if punishType == "未知类别":
+                list_result.append({"punish":{}})
+            else:
+                # print('处罚类型:',punishType)
+                punish_code = self.predict_punishCode(list_sentences)
+                # print('处罚编号: ',punish_code)
+                institutions, punishTimes = self.get_institution(title, list_sentence, list_entity)
+                # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
+                punishDecision = self.get_punishDecision(text, punishType)
+                # print('处罚决定:',punishDecision)
+                punishWhether= self.get_punishWhether(punishDecision, text, punishType)
+                # print('投诉是否成立:',punishWhether)
+                complainants, punishPeople = self.get_complainant(punishType, list_sentence, list_entity)
+                # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
+                punish_dic = {'punish_code':punish_code,
+                              'punishType':punishType,
+                              'punishDecision':punishDecision,
+                             'complainants':complainants,
+                             'punishPeople':punishPeople,
+                             'punishWhether':punishWhether,
+                             'institutions':institutions,
+                             'punishTimes':punishTimes}
+                list_result.append({"punish":punish_dic})
+        return list_result
+
+
+
+if __name__ == "__main__":
+    punish = Punish_Extract()
+
+    import pandas as pd
+    # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
+    df = pd.read_excel('G:/失信数据/ALLDATA_re2-3.xlsx', index=0)[2:10]
+    # i = 89
+    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+    # i = 92
+    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+
+    # t1 = time.time()
+    # for i in df.index:
+    #     punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+    #         get_punish_extracts(i, df.loc[i, 'PAGE_TITLE'], df.loc[i, 'PAGE_CONTENT'])
+    #     df.loc[i, '投诉人'] = complainants
+    #     df.loc[i, '被投诉人'] = punishPeople
+    #     df.loc[i, '执法机构'] = institutions
+    #     df.loc[i, '处罚时间'] = punishTimes
+    #     df.loc[i, '处罚编号'] = punish_code
+    #     print('完成第%d篇'%i)
+    # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=[['PAGE_TITLE', 'PAGE_CONTENT',
+    # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', 'punishPeople',
+    # #    'institution', 'punishTime', 'ner_test']])
+    # t2 = time.time()
+    # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
+    # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', '投诉人', 'punishPeople', '被投诉人',
+    # #    'institution', '执法机构', 'punishTime', '处罚时间', 'ner_test', '处罚编号'])
+    # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
+    #     '关键词', '类别', '处理决定', '投诉是否成立', '投诉人', '被投诉人','执法机构', '处罚时间', '处罚编号',
+    #    'DETAILLINK', 'sentences', 'PAGE_TIME'])
+    # t3 = time.time()
+    # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
+    s = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
+    # list_sentences = [s.split('。')]
+    # punish_code= punish.predict_punishCode( list_sentences)
+    # print(punish_code)
+
+    # punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+    #             get_punish_extracts(text=s)
+    # punish_dic = punish.get_punish_extracts(text=s)
+    # print(punish_dic)

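The new punish_predictor.py keeps the interface of punish_rule.py but loads the frozen models/punish_code.pb graph instead of a checkpoint. A minimal usage sketch, assuming the preprocessing call pattern from the test script added later in this commit; the doc_id and text below are hypothetical:

import BiddingKG.dl.interface.Preprocessing as Preprocessing
import BiddingKG.dl.complaint.punish_predictor as punish_predictor

punish = punish_predictor.Punish_Extract()  # loads models/punish_code.pb

# hypothetical input document
doc_id = "doc-0001"
title = "行政处罚决定书"
text = "处罚编号:厦财企〔2020〕12号,被处罚单位:某某公司,本局决定:罚款人民币五万元。"
list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(
    [[doc_id, text, "", "", title]], useselffool=True)

# one dict per article: {"punish": {"punish_code": ..., "punishType": ..., "punishDecision": ...,
#                                   "complainants": ..., "punishPeople": ..., "punishWhether": ...,
#                                   "institutions": ..., "punishTimes": ...}}
results = punish.get_punish_extracts(list_articles, list_sentences, list_entitys)
print(results)
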
+ 64 - 41
BiddingKG/dl/complaint/punish_rule.py

@@ -75,6 +75,7 @@ def BiLSTM_CRF_tfmodel(sess,weights):
             grads_vars = opt.compute_gradients(crf_loss)
             capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]
             train_op = opt.apply_gradients(capped_grads_vars,global_step)
+            print('tensor: ',char_input, length, trans, _logits)
             return char_input,_logits,target,length,crf_loss,trans,train_op
 
 def decode(logits, trans, sequence_lengths, tag_num):
@@ -125,6 +126,7 @@ class Punish_Extract():
                         sentences_x.append(sentence2id)
                     sentences_x = pad_sequences(sentences_x, maxlen=maxlen, padding="post", truncating="post")
                     sentences_x = [np.array(x) for x in sentences_x]
+                    print('punish tensor: ',self.logits, self.trans, self.char_input, self.length)
                     _logits, _trans = self.sess.run([self.logits, self.trans],
                                                feed_dict={self.char_input: np.array(sentences_x), self.length: sentence_len})
                     viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=4)
@@ -485,47 +487,68 @@ class Punish_Extract():
                 list_result.append({"punish":punish_dic})
         return list_result
 
-if __name__ == "__main__":
-    punish = Punish_Extract(model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt")
+def save_punish_code_model():
+    model_folder = os.path.dirname(__file__) + "/models/21-0.9990081295021194-0.3647936"
+    output_graph = os.path.dirname(__file__) + "/models/punish_code.pb"
+    ckpt = tf.train.get_checkpoint_state(model_folder)
+    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
+        input_checkpoint = ckpt.model_checkpoint_path
+        saver = tf.train.import_meta_graph(input_checkpoint+".meta", clear_devices=True)
+        graph = tf.get_default_graph()
+        input_graph_def = graph.as_graph_def()
+        with tf.Session() as sess:
+            saver.restore(sess, input_checkpoint)
+            output_graph_def = graph_util.convert_variables_to_constants(
+                sess = sess,
+                input_graph_def = input_graph_def,
+                output_node_names=["char_input","length","crf_loss/transitons","CRF/output/logits"]
+            )
+            with tf.gfile.GFile(output_graph, "wb") as f:
+                f.write(output_graph_def.SerializeToString())
 
-    import pandas as pd
-    # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
-    df = pd.read_excel('G:/失信数据/ALLDATA_re2-3.xlsx', index=0)[2:10]
-    # i = 89
-    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
-    # i = 92
-    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
 
-    # t1 = time.time()
-    # for i in df.index:
-    #     punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
-    #         get_punish_extracts(i, df.loc[i, 'PAGE_TITLE'], df.loc[i, 'PAGE_CONTENT'])
-    #     df.loc[i, '投诉人'] = complainants
-    #     df.loc[i, '被投诉人'] = punishPeople
-    #     df.loc[i, '执法机构'] = institutions
-    #     df.loc[i, '处罚时间'] = punishTimes
-    #     df.loc[i, '处罚编号'] = punish_code
-    #     print('完成第%d篇'%i)
-    # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=[['PAGE_TITLE', 'PAGE_CONTENT',
-    # #     '关键词', '类别', '处理决定', '投诉是否成立',
-    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', 'punishPeople',
-    # #    'institution', 'punishTime', 'ner_test']])
-    # t2 = time.time()
+if __name__ == "__main__":
+    save_punish_code_model()
+    # punish = Punish_Extract(model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt")
+    #
+    # import pandas as pd
+    # # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
+    # df = pd.read_excel('G:/失信数据/ALLDATA_re2-3.xlsx', index=0)[2:10]
+    # # i = 89
+    # # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+    # # i = 92
+    # # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+    #
+    # # t1 = time.time()
+    # # for i in df.index:
+    # #     punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+    # #         get_punish_extracts(i, df.loc[i, 'PAGE_TITLE'], df.loc[i, 'PAGE_CONTENT'])
+    # #     df.loc[i, '投诉人'] = complainants
+    # #     df.loc[i, '被投诉人'] = punishPeople
+    # #     df.loc[i, '执法机构'] = institutions
+    # #     df.loc[i, '处罚时间'] = punishTimes
+    # #     df.loc[i, '处罚编号'] = punish_code
+    # #     print('完成第%d篇'%i)
+    # # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=[['PAGE_TITLE', 'PAGE_CONTENT',
+    # # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', 'punishPeople',
+    # # #    'institution', 'punishTime', 'ner_test']])
+    # # t2 = time.time()
+    # # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
+    # # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', '投诉人', 'punishPeople', '被投诉人',
+    # # #    'institution', '执法机构', 'punishTime', '处罚时间', 'ner_test', '处罚编号'])
     # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
-    # #     '关键词', '类别', '处理决定', '投诉是否成立',
-    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', '投诉人', 'punishPeople', '被投诉人',
-    # #    'institution', '执法机构', 'punishTime', '处罚时间', 'ner_test', '处罚编号'])
-    # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
-    #     '关键词', '类别', '处理决定', '投诉是否成立', '投诉人', '被投诉人','执法机构', '处罚时间', '处罚编号',
-    #    'DETAILLINK', 'sentences', 'PAGE_TIME'])
-    # t3 = time.time()
-    # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
-    s = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
-    # list_sentences = [s.split('。')]
-    # punish_code= punish.predict_punishCode( list_sentences)
-    # print(punish_code)
-
-    # punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
-    #             get_punish_extracts(text=s)
-    punish_dic = punish.get_punish_extracts_backup(text=s)
-    print(punish_dic)
+    # #     '关键词', '类别', '处理决定', '投诉是否成立', '投诉人', '被投诉人','执法机构', '处罚时间', '处罚编号',
+    # #    'DETAILLINK', 'sentences', 'PAGE_TIME'])
+    # # t3 = time.time()
+    # # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
+    # s = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
+    # # list_sentences = [s.split('。')]
+    # # punish_code= punish.predict_punishCode( list_sentences)
+    # # print(punish_code)
+    #
+    # # punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+    # #             get_punish_extracts(text=s)
+    # punish_dic = punish.get_punish_extracts_backup(text=s)
+    # print(punish_dic)

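save_punish_code_model() freezes the BiLSTM-CRF checkpoint into models/punish_code.pb by whitelisting four node names, which punish_predictor.load_model() then looks up by name. A small sanity-check sketch (TF 1.x GraphDef APIs, the same ones used above; the path is the one added in this commit):

import tensorflow as tf

pb_path = "BiddingKG/dl/complaint/models/punish_code.pb"
graph_def = tf.GraphDef()
with open(pb_path, "rb") as f:
    graph_def.ParseFromString(f.read())

# the four whitelisted nodes ("transitons" spelling as exported)
wanted = {"char_input", "length", "crf_loss/transitons", "CRF/output/logits"}
found = {node.name for node in graph_def.node} & wanted
print("missing nodes:", wanted - found)  # expected: set()
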
+ 71 - 1
BiddingKG/dl/interface/predictor.py

@@ -16,6 +16,8 @@ sys.path.append(os.path.abspath("../.."))
 from BiddingKG.dl.common.Utils import *
 from BiddingKG.dl.interface.modelFactory import *
 import tensorflow as tf
+from tensorflow.python.framework import graph_util
+from BiddingKG.dl.product.data_util import decode, process_data, result_to_json
 from BiddingKG.dl.interface.Entitys import Entity
 
 from threading import RLock
@@ -223,7 +225,7 @@ class CodeNamePredict():
             list_entitys = [[] for _ in range(len(list_sentences))]
         for list_sentence,list_entity in zip(list_sentences,list_entitys):
             if len(list_sentence)==0:
-                result.append([list_sentence[0].doc_id,{"code":[],"name":""}])
+                result.append([{"code":[],"name":""}])
                 continue
             doc_id = list_sentence[0].doc_id
             # sentences = []
@@ -1201,6 +1203,73 @@ class TimePredictor():
                     values.append(item)
                     entity.set_Role(label, values)
 
+# product field extraction
+class ProductPredictor():
+    def __init__(self):
+        self.sess = tf.Session(graph=tf.Graph())
+        self.load_model()
+
+    def load_model(self):
+        model_path = os.path.dirname(__file__)+'/product_savedmodel/product.pb'
+        with self.sess.as_default():
+            with self.sess.graph.as_default():
+                output_graph_def = tf.GraphDef()
+                with open(model_path, 'rb') as f:
+                    output_graph_def.ParseFromString(f.read())
+                    tf.import_graph_def(output_graph_def, name='')
+                    self.sess.run(tf.global_variables_initializer())
+                    self.char_input = self.sess.graph.get_tensor_by_name('CharInputs:0')
+                    self.length = self.sess.graph.get_tensor_by_name("Sum:0")
+                    self.dropout = self.sess.graph.get_tensor_by_name("Dropout:0")
+                    self.logit = self.sess.graph.get_tensor_by_name("logits/Reshape:0")
+                    self.tran = self.sess.graph.get_tensor_by_name("crf_loss/transitions:0")
+
+    def predict(self, list_sentences,list_entitys=None, MAX_AREA=5000):
+        '''
+        Predict product entities; each sentence is truncated to at most MAX_AREA characters.
+        :param list_sentences: sentence lists of multiple announcements, [[sentences of announcement 1], [sentences of announcement 2], ...]
+        :param list_entitys: entity lists of multiple announcements
+        :param MAX_AREA: maximum number of characters kept per sentence
+        :return: predicted entities are appended to the entity lists; a per-announcement {"product": [...]} summary is also returned
+        '''
+        with self.sess.as_default() as sess:
+            with self.sess.graph.as_default():
+                result = []
+                if list_entitys is None:
+                    list_entitys = [[] for _ in range(len(list_sentences))]
+                for list_sentence, list_entity in zip(list_sentences,list_entitys):
+                    if len(list_sentence)==0:
+                        result.append({"product":[]})
+                        continue
+                    list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
+                    _begin_index = 0
+                    item = {"product":[]}
+                    temp_list = []
+                    MAX_LEN = len(list_sentence[_begin_index].sentence_text)
+                    if MAX_LEN > MAX_AREA:
+                        MAX_LEN = MAX_AREA
+                    chars = process_data([sentence.sentence_text[:MAX_LEN] for sentence in list_sentence])
+                    lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran], feed_dict={self.char_input: np.asarray(chars),
+                                                                                        self.dropout: 1.0
+                                                                                        })
+                    batch_paths = decode(scores, lengths, tran_)
+                    for sentence, path, length in zip(list_sentence,batch_paths, lengths):
+                        tags = ''.join([str(it) for it in path[:length]])
+                        for it in re.finditer("12*3", tags):
+                            start = it.start()
+                            end = it.end()
+                            _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
+                            sentence.doc_id, sentence.sentence_index, start, end),
+                                             entity_text=sentence.sentence_text[start:end],
+                                             entity_type="product", sentence_index=sentence.sentence_index, begin_index=0,
+                                             end_index=0, wordOffset_begin=start,
+                                             wordOffset_end=end)
+                            list_entity.append(_entity)
+                            temp_list.append(sentence.sentence_text[start:end])
+                    item["product"] = list(set(temp_list))
+                    result.append(item)
+                return result
+
 def getSavedModel():
     #predictor = FormPredictor()
     graph = tf.Graph()
@@ -1562,6 +1631,7 @@ def save_timesplit_model():
                                                "input1":time_model.input[1]},
                                        outputs={"outputs":time_model.output})
 
+
 if __name__=="__main__":
     #save_role_model()
     # save_codename_model()

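ProductPredictor follows the same frozen-graph pattern and appends product entities to the existing entity lists in place. A usage sketch with a hypothetical announcement; the outputs shown in comments are illustrative only:

import BiddingKG.dl.interface.predictor as predictor
import BiddingKG.dl.interface.Preprocessing as Preprocessing

productPredict = predictor.ProductPredictor()  # loads product_savedmodel/product.pb

list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(
    [["doc-0002", "本次采购台式计算机一批及打印机若干。", "", "", "采购公告"]], useselffool=True)

# returns something like [{"product": ["台式计算机", "打印机"]}] and also appends
# Entity objects with entity_type == "product" to list_entitys[0]
result = productPredict.predict(list_sentences, list_entitys)
print(result)
print([e.entity_text for e in list_entitys[0] if e.entity_type == "product"])
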
BIN
BiddingKG/dl/interface/product_savedmodel/product.pb


BIN
BiddingKG/dl/product/data/dev_data.pkl


BIN
BiddingKG/dl/product/data/dev_data2.pkl


BIN
BiddingKG/dl/product/data/train_data.pkl


BIN
BiddingKG/dl/product/data/train_data2.pkl


+ 155 - 0
BiddingKG/dl/product/data_util.py

@@ -0,0 +1,155 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/1/13 0013 14:19
+import re
+import math
+import random
+import psycopg2
+import numpy as np
+from tensorflow.contrib.crf import viterbi_decode
+from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_word
+
+id_to_tag = {0:'O',1:'B',2:'I',3:'E'}
+word_model = getModel_word()
+vocab, matrix = getVocabAndMatrix(word_model, Embedding_size=60)
+word2id = {k: v for v, k in enumerate(vocab)}
+max_id = len(vocab)
+conn = psycopg2.connect(dbname='iepy_product', user='postgres', password='postgres', host='192.168.2.101')
+cursor = conn.cursor()
+
+def get_label_data():
+    sql = "select human_identifier, text from corpus_iedocument where edittime NOTNULL AND jump_signal=0 \
+      and creation_date > to_timestamp('2021-01-14 00:00:00','yyyy-MM-dd HH24:mi:ss');"
+    cursor.execute(sql)
+    writer = open('label_data.txt', 'w', encoding='utf-8')
+    datas = []
+    for row in cursor.fetchall():
+        docid = row[0]
+        text = row[1]
+        # string = list(text)
+        tags = [0]*len(text)
+        sql_lb = "select b.value from brat_bratannotation as b where document_id = '{}' and b.value like 'T%product%';".format(docid)
+        cursor.execute(sql_lb)
+        for row_lb in cursor.fetchall():
+            label = row_lb[0]
+            _, _, begin, end, _ = re.split('\s',label)
+            begin = int(begin)
+            end = int(end)
+            if end-begin>=2:
+                tags[begin]=1
+                tags[end-1]=3
+                for i in range(begin+1,end-1):
+                    tags[i]=2
+        # datas.append([string, tags])
+        text_sentence = []
+        ids_sentence = []
+        tag_sentence = []
+        for i in range(len(text)):
+            text_sentence.append(text[i])
+            ids_sentence.append(word2id.get(text[i], max_id))
+            tag_sentence.append(tags[i])
+            writer.write("%s\t%s\n"%(text[i],tags[i]))
+            if text[i] in ['。','?','!',';']:
+                writer.write('\n')
+                if text_sentence:
+                    if len(text_sentence) > 100:
+                    # if len(text_sentence)>5 and len(text_sentence)<1000:
+                        datas.append([text_sentence, ids_sentence,tag_sentence])
+                    elif len(text_sentence) > 5:
+                        continue
+                    else:
+                        print('单句小于5或大于100,句子长度为:%d,文章ID:%s'%(len(text_sentence), docid))
+                    text_sentence = []
+                    ids_sentence = []
+                    tag_sentence = []
+        if text_sentence:
+            if len(text_sentence) > 5:
+            # if len(text_sentence) > 5 and len(text_sentence) < 1000:
+                datas.append([text_sentence, ids_sentence, tag_sentence])
+            else:
+                print('单句小于5或大于100,句子长度为:%d,文章ID:%s' % (len(text_sentence), docid))
+    writer.close()
+    return datas
+
+def input_from_line(line):
+    string = list(line)
+    ids = [word2id.get(k, max_id) for k in string]
+    tags = []
+    return [[string], [ids], [tags]]
+def process_data(sentences):
+    '''
+    Convert sentences to id sequences and pad them to a common length.
+    :param sentences: list of sentence strings, e.g. ['招标公告', '招标代理']
+    :return: padded id sequences of equal length
+    '''
+    maxLen = max([len(sentence) for sentence in sentences])
+    tags = [[word2id.get(k, max_id) for k in sentence] for sentence in sentences]
+    pad_tags = [tag[:maxLen]+[0]*(maxLen-len(tag)) for tag in tags]
+    return pad_tags
+
+def get_ner(BIE_tag):
+    ner = set()
+    for it in re.finditer('BI*E',BIE_tag):
+        ner.add((it.start(),it.end()))
+    return ner
+
+def decode(logits, lengths, matrix):
+    paths = []
+    small = -1000.0
+    start = np.asarray([[small]*4+[0]])
+    for score, length in zip(logits, lengths):
+        score = score[:length]
+        pad = small * np.ones([length, 1])
+        logits = np.concatenate([score, pad], axis=1)
+        logits = np.concatenate([start, logits], axis=0)
+        path, _  = viterbi_decode(logits, matrix)
+        paths.append(path[1:])
+    return paths
+
+def result_to_json(line, tags):
+    result = []
+    ner = []
+    tags = ''.join([str(it) for it in tags])
+    for it in re.finditer("12*3", tags):
+        start = it.start()
+        end = it.end()
+        ner.append([line[start:end], (start, end)])
+    result.append([line, ner])
+    print(tags)
+    return result
+
+
+class BatchManager(object):
+    def __init__(self, data, batch_size):
+        self.batch_data = self.sort_and_pad(data, batch_size)
+        self.len_data = len(self.batch_data)
+
+    def sort_and_pad(self, data, batch_size):
+        num_batch = int(math.ceil(len(data)/batch_size))
+        sorted_data = sorted(data, key=lambda x:len(x[0]))
+        print('最小句子长度:%d;最大句子长度:%d' % (len(sorted_data[0][0]), len(sorted_data[-1][0])))  # 临时增加打印句子长度
+        batch_data = list()
+        for i in range(num_batch):
+            batch_data.append(self.pad_data(sorted_data[i*int(batch_size):(i+1)*int(batch_size)]))
+        return batch_data
+
+    @staticmethod
+    def pad_data(data):
+        strings = []
+        chars = []
+        targets = []
+        max_length = max([len(sentence[0]) for sentence in data])
+        for line in data:
+            string, char, target = line
+            padding = [0]*(max_length-len(string))
+            strings.append(string + padding)
+            chars.append(char + padding)
+            targets.append(target + padding)
+        return [strings, chars, targets]
+
+    def iter_batch(self, shuffle=False):
+        if shuffle:
+            random.shuffle(self.batch_data)
+        for idx in range(self.len_data):
+            yield self.batch_data[idx]

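BatchManager sorts examples by length and zero-pads each batch to its longest sentence, while process_data/decode/result_to_json are the inference-side helpers used by ProductPredictor. A toy sketch of the batching layout; the ids and tags are made up, and importing data_util assumes the word-embedding model and the PostgreSQL connection configured at the top of the module are reachable:

from BiddingKG.dl.product.data_util import BatchManager

# each item is [char_list, id_list, tag_list]; the values below are illustrative only
data = [
    [list("打印机采购"), [11, 12, 13, 14, 15], [1, 2, 3, 0, 0]],
    [list("空调"), [21, 22], [1, 3]],
    [list("办公家具招标"), [31, 32, 33, 34, 35, 36], [1, 2, 2, 2, 3, 0]],
]
manager = BatchManager(data, batch_size=2)
for strings, chars, tags in manager.iter_batch(shuffle=False):
    # every sequence in a batch is padded with 0 to the batch's longest sentence
    print(len(strings), [len(c) for c in chars])
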
+ 117 - 0
BiddingKG/dl/product/main.py

@@ -0,0 +1,117 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/1/13 0013 14:03 
+from BiddingKG.dl.product.product_model import Product_Model
+from BiddingKG.dl.product.data_util import BatchManager, get_label_data, id_to_tag, input_from_line, decode, result_to_json
+import numpy as np
+import tensorflow as tf
+import random
+import pickle
+import os
+
+def train():
+    # all_data = get_label_data()
+    # random.shuffle(all_data)
+    # train_data = all_data[:int(len(all_data)*0.85)]
+    # dev_data = all_data[int(len(all_data)*0.85):]
+    # with open('data/train_data2.pkl', 'wb') as f:
+    #     pickle.dump(train_data, f)
+    # with open('data/dev_data2.pkl', 'wb') as f:
+    #     pickle.dump(dev_data, f)
+
+    with open('data/train_data2.pkl', 'rb') as f:
+        train_data = pickle.load(f)
+    with open('data/dev_data2.pkl', 'rb') as f:
+        dev_data = pickle.load(f)
+
+    train_manager = BatchManager(train_data, batch_size=128)
+    dev_manager = BatchManager(dev_data, batch_size=64)
+
+    tf_config = tf.ConfigProto()
+    tf_config.gpu_options.allow_growth = True
+    steps_per_epoch = train_manager.len_data
+    ckpt_path = "model"
+    with tf.Session(config=tf_config) as sess:
+        model = Product_Model()
+        sess.run(tf.global_variables_initializer())
+        # ckpt = tf.train.get_checkpoint_state(ckpt_path)
+        # if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
+        #     model.saver.restore(sess, ckpt.model_checkpoint_path)
+        #     print("从文件加载原来模型数据",ckpt.model_checkpoint_path)
+
+        print('准备训练数据')
+        loss = []
+        mix_loss = 1000
+        max_f1 = 0
+        for i in range(100):
+            print('epochs:',i)
+            # model.evaluate(sess, data_manager=dev_manager, id_to_tag=id_to_tag)
+            # break
+            for batch in train_manager.iter_batch(shuffle=True):
+                # print('batch:',len(batch))
+                # step, batch_loss = model.run_step(sess, True, batch)
+                step, batch_loss = model.run_step(sess, 'train', batch)
+                loss.append(batch_loss)
+                if step % 10 == 0:
+                    iteration = step // steps_per_epoch + 1
+                    print('iter:{} step:{} loss:{}'.format(iteration, step, np.mean(loss)))
+            if i >= 50 or i%5==0:
+                f1, precision, recall, evl_loss = model.evaluate(sess, data_manager=dev_manager, id_to_tag=id_to_tag)
+                print('f1:%.4f, precision:%.4f, recall:%.4f, evl_loss:%.4f' % (f1, precision, recall, evl_loss))
+                if max_f1 < f1:
+                    model.saver.save(sess, os.path.join(ckpt_path, "ner2.ckpt"))
+                    print("model save .bast f1 is %.4f" % f1)
+                    max_f1 = f1
+                    # if np.mean(loss)<mix_loss:
+                    #     mix_loss = np.mean(loss)
+                    #     model.saver.save(sess, os.path.join(ckpt_path, "ner.ckpt"))
+                    #     print("model saved, loss is:",mix_loss)
+                loss = []
+
+def evaluate_line():
+    ckpt_path = "model"
+    with tf.Session() as sess:
+        model = Product_Model()
+        sess.run(tf.global_variables_initializer())
+        ckpt = tf.train.get_checkpoint_state(ckpt_path)
+        if ckpt and tf.train.checkpoint_exists(ckpt_path):
+            print('模型文件:',ckpt.model_checkpoint_path)
+            model.saver.restore(sess, ckpt.model_checkpoint_path)
+            print(model.logits, model.lengths, model.trans, model.dropout, model.char_inputs)
+            while True:
+                line = input("请输入测试句子:")
+                result = model.evaluate_line(sess, line)
+                print(result)
+def predict():
+    pb_path = "model/product.pb"
+    with tf.Graph().as_default():
+        output_graph_def = tf.GraphDef()
+        with open(pb_path, 'rb') as f:
+            output_graph_def.ParseFromString(f.read())
+            tf.import_graph_def(output_graph_def, name='')  # note: do not pass a name prefix here, or the tensor lookups below will not match
+            with tf.Session() as sess:
+                sess.run(tf.global_variables_initializer())
+                for node in output_graph_def.node:
+                    print(node.name)
+                char_input = sess.graph.get_tensor_by_name("CharInputs:0")
+                length = sess.graph.get_tensor_by_name("Sum:0")
+                dropout = sess.graph.get_tensor_by_name("Dropout:0")
+                logit = sess.graph.get_tensor_by_name("logits/Reshape:0")
+                tran = sess.graph.get_tensor_by_name("crf_loss/transitions:0")
+                while True:
+                    line = input("请输入测试句子:")
+                    _, chars, tags = input_from_line(line)
+                    print(chars)
+                    lengths, scores, tran_ = sess.run([length,logit,tran],feed_dict={char_input:np.asarray(chars),
+                                                                dropout:1.0
+                                                                } )
+                    batch_paths = decode(scores, lengths, tran_)
+                    tags = batch_paths[0]  # note: batch_paths[0][:lengths] would be wrong here (lengths is a list)
+                    result = result_to_json(line, tags)
+                    print(result)
+
+if __name__ == "__main__":
+    # train()
+    # evaluate_line()
+    predict()

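predict() above reads model/product.pb by tensor name, but the commit ships the frozen graph without the export code. A hedged sketch of how it could be produced from the ner2.ckpt checkpoint, mirroring save_punish_code_model() in punish_rule.py and the node names that predict() and ProductPredictor.load_model() look up; treat the exact flow as an assumption:

import tensorflow as tf
from tensorflow.python.framework import graph_util

ckpt = tf.train.get_checkpoint_state("model")  # expects model/ner2.ckpt*
saver = tf.train.import_meta_graph(ckpt.model_checkpoint_path + ".meta", clear_devices=True)
graph_def = tf.get_default_graph().as_graph_def()
with tf.Session() as sess:
    saver.restore(sess, ckpt.model_checkpoint_path)
    frozen = graph_util.convert_variables_to_constants(
        sess, graph_def,
        # tensor names expected by the prediction code
        output_node_names=["CharInputs", "Sum", "Dropout", "logits/Reshape", "crf_loss/transitions"])
    with tf.gfile.GFile("model/product.pb", "wb") as f:
        f.write(frozen.SerializeToString())
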
+ 2 - 0
BiddingKG/dl/product/model/checkpoint

@@ -0,0 +1,2 @@
+model_checkpoint_path: "ner2.ckpt"
+all_model_checkpoint_paths: "ner2.ckpt"

BIN
BiddingKG/dl/product/model/ner2.ckpt.data-00000-of-00001


BIN
BiddingKG/dl/product/model/ner2.ckpt.index


BIN
BiddingKG/dl/product/model/ner2.ckpt.meta


BIN
BiddingKG/dl/product/model/product.pb


+ 240 - 0
BiddingKG/dl/product/product_model.py

@@ -0,0 +1,240 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/1/13 0013 10:12
+# from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_word
+from BiddingKG.dl.product.data_util import matrix,vocab,input_from_line,result_to_json,get_ner
+import tensorflow as tf
+import numpy as np
+from tensorflow.contrib.crf import crf_log_likelihood
+from tensorflow.contrib.crf import viterbi_decode
+from tensorflow.contrib.layers.python.layers import initializers
+
+# word_model = getModel_word()
+class Product_Model(object):
+    def __init__(self):
+        self.char_dim = 60
+        self.lstm_dim = 128
+        self.num_tags = 4
+        self.lr = 0.001
+        self.clip = 5.0
+        self.dropout_rate = 0.5
+        # vocab, matrix = getVocabAndMatrix(word_model, Embedding_size=60)
+        self.matrix = matrix
+        # self.word2id = {k:v for v,k in enumerate(self.vocab)}
+        self.num_chars = len(vocab)+1
+        self.emb_matrix = np.random.random((self.num_chars, self.char_dim))
+        self.emb_matrix[:self.num_chars-1:,:] = self.matrix
+
+
+        self.globel_step = tf.Variable(0, trainable=False)
+        self.best_dev_f1 = tf.Variable(0.0, trainable=False)
+        self.initializer = initializers.xavier_initializer()
+
+        self.char_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None],name='CharInputs')
+        self.targets = tf.placeholder(dtype=tf.int32, shape=[None, None],name='Targets')
+        self.dropout = tf.placeholder(dtype=tf.float32, name='Dropout')
+
+        used = tf.sign(tf.abs(self.char_inputs))
+        length = tf.reduce_sum(used, reduction_indices=1)
+        self.lengths = tf.cast(length, tf.int32)
+        self.batch_size = tf.shape(self.char_inputs)[0]
+        self.num_steps = tf.shape(self.char_inputs)[1]
+
+        with tf.variable_scope("char_embedding"):
+            self.char_lookup = tf.get_variable(
+                name="char_embedding",
+                # shape=[self.num_chars, self.char_dim],
+                initializer=np.array(self.emb_matrix,dtype=np.float32)
+            )
+        embed = tf.nn.embedding_lookup(self.char_lookup, self.char_inputs)
+
+        with tf.variable_scope("char_BiLSTM"):
+            lstm_cell = {}
+            for direction in ["forward", "backward"]:
+                with tf.variable_scope(direction):
+                    lstm_cell[direction] = tf.contrib.rnn.BasicLSTMCell(self.lstm_dim, state_is_tuple=True)
+            outputs, final_states = tf.nn.bidirectional_dynamic_rnn(
+                lstm_cell["forward"],
+                lstm_cell["backward"],
+                embed,
+                dtype=tf.float32,
+                sequence_length=self.lengths
+            )
+        outputs = tf.concat(outputs, axis=2)
+
+        with tf.variable_scope("project"):
+            with tf.variable_scope("hidden"):
+                W = tf.get_variable("W", shape=[self.lstm_dim*2, self.lstm_dim],
+                                    dtype=tf.float32,initializer=self.initializer)
+                b = tf.get_variable("b", shape=[self.lstm_dim],
+                                    dtype=tf.float32, initializer=self.initializer)
+                output = tf.reshape(outputs, shape=[-1, 2*self.lstm_dim])
+                hidden = tf.tanh(tf.nn.xw_plus_b(output, W, b))
+                hidden = tf.nn.dropout(hidden, keep_prob=self.dropout) # 添加dropout
+
+        with tf.variable_scope("logits"):
+            W = tf.get_variable("W", shape=[self.lstm_dim, self.num_tags],
+                                dtype=tf.float32, initializer=self.initializer)
+            b = tf.get_variable("b", shape=[self.num_tags])
+            pred = tf.nn.xw_plus_b(hidden, W, b)
+            self.logits = tf.reshape(pred, [-1, self.num_steps, self.num_tags])
+
+        with tf.variable_scope("crf_loss"):
+            small = -1000.0
+            start_logits = tf.concat(
+                [small*tf.ones(shape=[self.batch_size,1,self.num_tags]), tf.zeros(shape=[self.batch_size,1,1])], axis=-1
+            )
+            pad_logits = tf.cast(small*tf.ones([self.batch_size, self.num_steps, 1]), tf.float32)
+            logits = tf.concat([self.logits, pad_logits], axis=-1)
+            logits = tf.concat([start_logits, logits], axis=1)
+            targets = tf.concat([tf.cast(self.num_tags*tf.ones([self.batch_size,1]),tf.int32), self.targets], axis=-1)
+
+            self.trans = tf.get_variable(
+                name="transitions",
+                shape=[self.num_tags+1, self.num_tags+1],
+                initializer=self.initializer
+            )
+            log_likelihood, self.trans = crf_log_likelihood(
+                inputs=logits,
+                tag_indices=targets,
+                transition_params=self.trans,
+                sequence_lengths=self.lengths+1
+            )
+            self.loss = tf.reduce_mean(-log_likelihood)
+
+        with tf.variable_scope("optimizer"):
+            self.opt = tf.train.AdamOptimizer(learning_rate=self.lr)
+            grads_vars = self.opt.compute_gradients(self.loss)
+            capped_grads_vars = [[tf.clip_by_value(g, -self.clip, self.clip), v] for g,v in grads_vars]
+            self.train_op = self.opt.apply_gradients(capped_grads_vars, self.globel_step)
+
+        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
+
+    def create_feed_dict(self, run_type, batch): #is_train
+        '''
+        :param run_type: one of 'train', 'dev', 'predict'
+        :param batch: list of train/evaluate data
+        :return: structured data to feed
+        '''
+        _, chars, tags = batch
+        feed_dict = {
+            self.char_inputs:np.asarray(chars),
+            self.dropout:1.0
+        }
+        assert run_type in ['train', 'dev', 'predict']
+        if run_type=='train':
+            feed_dict[self.targets] = np.asarray(tags)
+            feed_dict[self.dropout] = self.dropout_rate
+        elif run_type=='dev':
+            feed_dict[self.targets] = np.asarray(tags)
+        return feed_dict
+
+    def run_step(self, sess, run_type, batch):
+        assert run_type in ['train', 'dev', 'predict']
+        feed_dict = self.create_feed_dict(run_type, batch)
+        if run_type=='train':
+            global_step, loss, _ = sess.run(
+                [self.globel_step, self.loss, self.train_op],
+                feed_dict=feed_dict
+            )
+            return global_step, loss
+        elif run_type=='dev':
+            lengths ,logits, loss = sess.run([self.lengths, self.logits, self.loss], feed_dict)
+            return lengths, logits, loss
+        else:
+            lengths ,logits = sess.run([self.lengths, self.logits], feed_dict)
+            return lengths, logits
+
+    def run_step_backup(self, sess, is_train, batch):
+        feed_dict = self.create_feed_dict(is_train, batch)
+        if is_train:
+            global_step, loss, _ = sess.run(
+                [self.globel_step, self.loss, self.train_op],
+                feed_dict=feed_dict
+            )
+            return global_step, loss
+        else:
+            lengths ,logits, loss = sess.run([self.lengths, self.logits, self.loss], feed_dict)
+            return lengths, logits, loss
+
+    def decode(self, logits, lengths, matrix):
+        paths = []
+        small = -1000.0
+        start = np.asarray([[small]*self.num_tags+[0]])
+        for score, length in zip(logits, lengths):
+            score = score[:length]
+            pad = small * np.ones([length, 1])
+            logits = np.concatenate([score, pad], axis=1)
+            logits = np.concatenate([start, logits], axis=0)
+            path, _  = viterbi_decode(logits, matrix)
+            paths.append(path[1:])
+        return paths
+
+    def evaluate(self, sess, data_manager, id_to_tag):
+        results = []
+        trans = self.trans.eval()
+        Precision = []
+        Recall = []
+        F1 = []
+        loss = []
+        pred_num = 0
+        gold_num = 0
+        equal_num = 0
+        for batch in data_manager.iter_batch():
+            strings = batch[0]
+            tags = batch[-1]
+            # lengths, scores, batch_loss = self.run_step(sess, False, batch)
+            lengths, scores, batch_loss = self.run_step(sess, 'dev', batch)
+            loss.append(batch_loss)
+            batch_paths = self.decode(scores, lengths, trans)
+            for i in range(len(strings)):
+                result = []
+                string = strings[i][:lengths[i]]
+                gold = [id_to_tag[int(x)] for x in tags[i][:lengths[i]]]
+                pred = [id_to_tag[int(x)] for x in batch_paths[i][:lengths[i]]]
+                gold_ner = get_ner("".join(gold))
+                pred_ner = get_ner("".join(pred))
+                # print('标签实体:',gold_ner)
+                # print('预测实体:',pred_ner)
+                pred_num += len(pred_ner)
+                gold_num += len(gold_ner)
+                equal_num += len(gold_ner&pred_ner)
+                # precision_temp = len(gold_ner&pred_ner)/(len(pred_ner)+1e-10)
+                # recall_temp = len(gold_ner&pred_ner)/(len(gold_ner)+1e-10)
+                # f1_temp = 2*(precision_temp*recall_temp)/(precision_temp+recall_temp+1e-10)
+                # Precision.append(precision_temp)
+                # Recall.append(recall_temp)
+                # F1.append(f1_temp)
+
+                # for char, gold, pred in zip(string, gold, pred):
+                #     result.append(" ".join([char, gold, pred]))
+                # results.append(result)
+                # with open('evaluate_result.txt','w', encoding='utf-8') as f:
+                #     for rs in results:
+                #         for line in rs:
+                #             f.write(line+'\n')
+                #         f.write('\n')
+
+        # return sum(F1)/len(F1),sum(Precision)/len(Precision),sum(Recall)/len(Recall)
+        precision = equal_num/(pred_num+1e-10)
+        recall = equal_num/(gold_num+1e-10)
+        f1 = 2*(precision*recall)/(precision+recall+1e-10)
+        return f1, precision, recall, np.mean(loss)
+
+
+    def evaluate_line(self, sess, line):
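+        # Predicts tags for a single raw text line: run the graph in 'predict' mode,
+        # Viterbi-decode with the learned transition matrix and format the result as JSON.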
+        trans = self.trans.eval(session=sess)
+        # lengths, scores = self.run_step(sess, False, input_from_line(line))
+        lengths, scores = self.run_step(sess, 'predict', input_from_line(line))
+        batch_paths = self.decode(scores, lengths, trans)
+        tags = batch_paths[0]  # note: batch_paths[0][:lengths] would be wrong here, since lengths is a list
+        return result_to_json(line, tags)
+
+
+
+
+
+
+
+

+ 2 - 1
BiddingKG/dl/test/test4.py

@@ -23,7 +23,8 @@ import BiddingKG.dl.interface.predictor as predictor
 import BiddingKG.dl.interface.Preprocessing as Preprocessing
 import BiddingKG.dl.interface.getAttributes as getAttributes
 import BiddingKG.dl.entityLink.entityLink as entityLink
-import BiddingKG.dl.complaint.punish_rule as punish_rule
+# import BiddingKG.dl.complaint.punish_rule as punish_rule
+import BiddingKG.dl.complaint.punish_predictor as punish_rule
 import json
 
 

+ 278 - 0
BiddingKG/dl/test/测试所有提取信息.py

@@ -0,0 +1,278 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/1/11 0011 13:52 
+
+'''
+Created on 2019-01-04
+
+@author: User
+'''
+
+from bs4 import BeautifulSoup, Comment
+import copy
+import re
+import sys
+import os
+import codecs
+import requests
+import time
+
+_time1 = time.time()
+sys.path.append(os.path.abspath("../.."))
+import fool
+from BiddingKG.dl.interface.Connection import *
+from BiddingKG.dl.common.Utils import *
+from BiddingKG.dl.interface.Connection import getConnection
+import BiddingKG.dl.interface.predictor as predictor
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+import BiddingKG.dl.interface.getAttributes as getAttributes
+import BiddingKG.dl.entityLink.entityLink as entityLink
+import BiddingKG.dl.complaint.punish_predictor as punish_predictor
+# import BiddingKG.dl.complaint.punish_rule as punish_predictor
+import json
+
+'''
+doc_id = '3537ed94-9c7e-11e8-aed5-44a84246dbba'
+
+conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
+
+cursor = conn.cursor()
+
+cursor.execute(" select content from articles where id='"+doc_id+"' ")
+
+row = cursor.fetchall()[0]
+
+
+#text = codecs.open("C:\\Users\\User\\Desktop\\1.html","r",encoding="utf8").read()
+
+#content = segment(tableToText(BeautifulSoup(row[0],"lxml")))
+'''
+
+''''''
+codeNamePredict = predictor.CodeNamePredict()
+premPredict = predictor.PREMPredict()
+epcPredict = predictor.EPCPredict()
+roleRulePredict = predictor.RoleRulePredictor()
+timePredict = predictor.TimePredictor()
+# punish = punish_rule.Punish_Extract()
+punish = punish_predictor.Punish_Extract()
+productPredict = predictor.ProductPredictor()
+
+# Custom JSON encoder
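+# (numpy arrays/floats and bytes are not JSON-serializable by default; this encoder converts
+# them so the extraction results can be dumped with json.dumps)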
+class MyEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        elif isinstance(obj, bytes):
+            return str(obj, encoding='utf-8')
+        elif isinstance(obj, (np.float_, np.float16, np.float32,
+                              np.float64)):
+            return float(obj)
+        elif isinstance(obj, str):
+            return obj
+        return json.JSONEncoder.default(self, obj)
+
+
+def predict(doc_id, text, title=""):
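+    # Full extraction pipeline for one document: preprocess into sentences/entities, predict
+    # project code/name, roles and amounts (PREM), contacts (EPC) and time labels, link
+    # entities, assemble the PREM attributes, extract punishment info and products, then
+    # union everything into a single JSON-serializable result.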
+    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", title]],
+                                                                                    useselffool=True)
+    for articles in list_articles:
+        print(articles.content)
+
+    ''''''
+
+    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
+    print(codeName)
+    premPredict.predict(list_sentences, list_entitys)
+    # roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
+    print("epcPredict")
+    epcPredict.predict(list_sentences, list_entitys)
+    print("timePredict")
+    timePredict.predict(list_sentences, list_entitys)
+    print("entityLink")
+    entityLink.link_entitys(list_entitys)
+    print("getPREMs")
+    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
+    print("punish & product")
+    list_punish_dic = punish.get_punish_extracts(list_articles, list_sentences, list_entitys)
+    product = productPredict.predict(list_sentences, list_entitys)
+
+    for entitys in list_entitys:
+        for entity in entitys:
+            print(entity.entity_text, entity.entity_type, entity.label, entity.values, entity.sentence_index,
+                  entity.begin_index, entity.end_index, entity.wordOffset_begin, entity.wordOffset_end)
+    # print(prem)
+    return json.dumps(Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic),product)[0],
+                      cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)   # list_punish_dic
+
+
+def predict_back(doc_id, html):
+    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, html, "", "", ""]],
+                                                                                    useselffool=True)
+    for articles in list_articles:
+        print(articles.content)
+
+    ''''''
+
+    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)  # predict project code and name
+    print(codeName)
+    premPredict.predict(list_sentences, list_entitys)  # role / amount model
+    roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)  # role rules
+    print("epcPredict")
+    epcPredict.predict(list_sentences, list_entitys)  # contact-person model
+    print("timePredict")
+    timePredict.predict(list_sentences, list_entitys)  # time-category model
+    print("entityLink")
+    entityLink.link_entitys(list_entitys)
+    print("getPREMs")
+    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)  # find bid packages and link package numbers to the other elements
+    print("punish")
+    # punish_dic = punish.get_punish_extracts(list_sentences, list_entitys, title=title, text=list_articles[0].content)
+    list_punish_dic = punish.get_punish_extracts(list_articles, list_sentences, list_entitys)
+    # punish_dic = punish.get_punish_extracts(list_articles,list_sentences, list_entitys)
+    # print(punish_dic)
+    # prem[0][1]['punish'] = punish_dic
+
+    # bidway = []  # bidding method
+    # moneySource = []  # source of funds
+    # servicetime = []  # service period
+    # time_release = []  # publication time
+    # time_bidopen = []  # bid opening time
+    # time_bidclose = []  # bid closing time
+    # for entity in list_entitys[0]:
+    #     if entity.entity_type == 'bidway':
+    #         bidway.append(entity.entity_text)
+    #     elif entity.entity_type == 'moneySource':
+    #         moneySource.append(entity.entity_text)
+    #     elif entity.entity_type == 'servicetime':
+    #         servicetime.append(entity.entity_text)
+    #     elif entity.entity_type == 'time' and entity.label == 1:
+    #         time_release.append(entity.entity_text)
+    #     elif entity.entity_type == 'time' and entity.label == 2:
+    #         time_bidopen.append(entity.entity_text)
+    #     elif entity.entity_type == 'time' and entity.label == 3:
+    #         time_bidclose.append(entity.entity_text)
+    #
+    # prem[0][1]['bidway'] = ';'.join(set(bidway))
+    # prem[0][1]['moneySource'] = ';'.join(set(moneySource))
+    # prem[0][1]['servicetime'] = ';'.join(set(servicetime))
+    # prem[0][1]['time_release'] = ';'.join(set(time_release))
+    # prem[0][1]['time_bidopen'] = ';'.join(set(time_bidopen))
+    # prem[0][1]['time_bidclose'] = ';'.join(set(time_bidclose))
+    #
+    # ''''''
+    #
+    # for entitys in list_entitys:
+    #     for entity in entitys:
+    #         print(entity.entity_text, entity.entity_type, entity.label, entity.values, entity.sentence_index,
+    #               entity.begin_index, entity.end_index, entity.wordOffset_begin, entity.wordOffset_end)
+    #
+    # print(prem)
+    return json.dumps(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic)[0],
+               cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)
+
+    # return json.dumps(Preprocessing.union_result(codeName, prem)[0][1], cls=MyEncoder, sort_keys=True, indent=4,
+    #                   ensure_ascii=False)
+
+
+def test(name, content):
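+    # Posts the document content to the /article_extract HTTP service and returns the raw JSON response.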
+    user = {
+        "content": content,
+        "id": name
+    }
+    myheaders = {'Content-Type': 'application/json'}
+    _resp = requests.post("http://192.168.2.101:15015" + '/article_extract', json=user, headers=myheaders, verify=True)
+    resp_json = _resp.content.decode("utf-8")
+    print(resp_json)
+    return resp_json
+
+
+if __name__ == "__main__":
+    from tablestore import *
+    endpoint = 'https://bxkc-ots.cn-hangzhou.ots.aliyuncs.com'
+    access_key_id = 'LTAI4GJxbioV1y2WM3XcZTmP'
+    access_key_secret = 'B3VITMoCnKtTQE6eAkDkat6UNFHped'
+    instance_name = 'bxkc-ots'
+    ots_client = OTSClient(endpoint, access_key_id, access_key_secret, instance_name)
+
+    def get_data(query, max_rows, table_name='document',
+                 index_name='document_index',
+                 column_names=['docid', 'dochtmlcon','doctitle', 'info_type', 'page_time'],
+                 sorters=[FieldSort("page_time", SortOrder.DESC), FieldSort("docid", SortOrder.DESC)]):
+        '''
+        Query data from Aliyun OTS (Tablestore).
+        :param query: search query
+        :param max_rows: maximum number of rows to return
+        :param table_name: table name
+        :param index_name: search index name
+        :param column_names: columns to return
+        :param sorters: list of sort rules
+        :return: list of processed rows
+        '''
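+        # OTS search results are paginated: the first search returns up to `limit` rows plus a
+        # next_token, and the loop below keeps fetching with that token until it is exhausted
+        # or more than max_rows rows have been collected.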
+        next_token = None
+        data = []
+        all_rows = []
+        rows, next_token, total_count, is_all_succeed = \
+            ots_client.search(table_name,
+                              index_name,
+                              SearchQuery(query,
+                                          next_token=next_token,
+                                          sort=Sort(sorters=sorters),  # apply the order given by `sorters`
+                                          limit=100,
+                                          get_total_count=True),
+                              ColumnsToGet(column_names=column_names,
+                                           return_type=ColumnReturnType.SPECIFIED))
+        all_rows.extend(rows)
+        while next_token:
+            rows, next_token, total_count, is_all_succeed = \
+                ots_client.search(table_name,
+                                  index_name,
+                                  SearchQuery(query,
+                                              next_token=next_token,
+                                              sort=None,
+                                              limit=100,
+                                              get_total_count=True),
+                                  ColumnsToGet(column_names=column_names,
+                                               return_type=ColumnReturnType.SPECIFIED))
+            all_rows.extend(rows)
+            if len(all_rows) > max_rows:
+                print('已获取%d条数据' % len(all_rows))
+                break
+
+        if all_rows:
+            for row in all_rows:
+                tmp = []
+                tmp.append(row[0][1][1])
+                for tup in row[1]:
+                    tmp.append(tup[1])
+                data.append(tmp)
+        return data
+
+
+    bool_query = TermQuery('docid', '124113339')
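+    # TermQuery fetches a single document by docid; the commented-out BoolQuery below shows how
+    # to pull a batch instead, filtered by info_type and a page_time range.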
+    # bool_query = BoolQuery(
+    #     must_queries=[TermsQuery(field_name='info_type', column_values=['办公设备', '计算机设备']),
+    #                   RangeQuery('page_time', range_from='2020-11-01', range_to='2020-11-31')]
+    # )
+
+    data = get_data(bool_query, 1)
+    print(data)
+    docid = str(data[0][0])
+    html = data[0][1]
+    title = data[0][2]
+    # text = '中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天。\
+    # 投诉处理公告,投诉人:张三。文章编号:京财采投字(2018)第42号。政府采购项目招标方式:公开招标,联系人:黎明。\
+    # 建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,\
+    # 二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
+    # docid = ""
+    # html = '首都医科大学附属北京地坛医院1.5T核磁共振、16排CT和血管造影机维保服务医疗设备维修和保养服务采购项目政府采购合同公告'
+    # html = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
+
+    a = time.time()
+    print("start")
+    # print(predict('12',text))
+    print(predict(docid, html, title=title))
+    # test("12",text)
+    print("takes", time.time() - a)
+    pass

+ 374 - 0
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -0,0 +1,374 @@
+'''
+Created on 2019-01-04
+
+@author: User
+'''
+
+from bs4 import BeautifulSoup, Comment
+import copy
+import re
+import sys
+import os
+import codecs
+import requests
+import time
+
+_time1 = time.time()
+sys.path.append(os.path.abspath("../.."))
+print('当前路径为:', os.getcwd())
+print('sys.path', sys.path)
+import fool
+from BiddingKG.dl.interface.Connection import *
+from BiddingKG.dl.common.Utils import *
+from BiddingKG.dl.interface.Connection import getConnection
+import BiddingKG.dl.interface.predictor as predictor
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+import BiddingKG.dl.interface.getAttributes as getAttributes
+import BiddingKG.dl.entityLink.entityLink as entityLink
+import json
+
+
+'''
+doc_id = '3537ed94-9c7e-11e8-aed5-44a84246dbba'
+
+conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
+
+cursor = conn.cursor()
+
+cursor.execute(" select content from articles where id='"+doc_id+"' ")
+
+row = cursor.fetchall()[0]
+
+
+#text = codecs.open("C:\\Users\\User\\Desktop\\1.html","r",encoding="utf8").read()
+
+#content = segment(tableToText(BeautifulSoup(row[0],"lxml")))
+'''
+
+'''''' 
+codeNamePredict = predictor.CodeNamePredict()
+premPredict = predictor.PREMPredict()
+epcPredict = predictor.EPCPredict()
+roleRulePredict = predictor.RoleRulePredictor()
+
+# Custom JSON encoder
+class MyEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        elif isinstance(obj, bytes):
+            return str(obj, encoding='utf-8')
+        elif isinstance(obj, (np.float_, np.float16, np.float32,
+                              np.float64)):
+            return float(obj)
+        elif isinstance(obj, str):
+            return obj
+        return json.JSONEncoder.default(self, obj)
+
+
+def predict(doc_id,text):
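+    # Reduced extraction pipeline: preprocess, predict code/name, roles and amounts, apply the
+    # role rules, predict contacts, link entities, assemble PREM attributes and dump the merged
+    # result as JSON.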
+    list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[doc_id,text,"","",""]],useselffool=True)
+    # for articles in list_articles:
+    #     print('预处理后文本信息')
+    #     print(articles.content)
+
+
+    ''''''
+        
+    codeName = codeNamePredict.predict(list_sentences,list_entitys=list_entitys)
+    # print(codeName)
+    premPredict.predict(list_sentences,list_entitys)
+    roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
+    # print("epcPredict")
+    epcPredict.predict(list_sentences,list_entitys)
+    # print("entityLink")
+    entityLink.link_entitys(list_entitys)
+    # print("getPREMs")
+    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
+    # print("getPREMs")
+    
+    
+    ''''''
+    
+    entitys_all = [[[entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.begin_index,entity.end_index] for entity in entitys] for entitys in list_entitys]
+    for entitys in entitys_all:
+        # print(entitys)
+        # en_types = set([it[1] for it in entitys])
+        print([(it[0],it[1], it[2],it[3][it[2]],it[4],it[5],it[6]) for it in entitys if it[1] in ('org', 'company', 'person')])
+        # print([it for it in entitys if it[1] in ('org','company','person')])
+        # for en_type in en_types:
+        #     print('***************************************')
+        #     print(en_type)
+        #     print([(it[0],it[2],it[3]) for it in entitys if it[1]==en_type])
+
+    # for entitys in list_entitys:
+    #     for entity in entitys:
+    #         print('**********实体信息****************')
+    #         print(entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.wordOffset_begin,entity.wordOffset_end)
+
+    #print(prem)
+    return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
+
+         
+# def test(name,content):
+#     user = {
+#             "content": content,
+#             "id":name
+#             }
+#     myheaders = {'Content-Type': 'application/json'}
+#     _resp = requests.post("http://192.168.2.101:15015" + '/article_extract', json=user, headers=myheaders, verify=True)
+#     resp_json = _resp.content.decode("utf-8")
+#     print(resp_json)
+#     return resp_json
+def get_result_online(docid):
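+    # Fetches the document text from the iepy Postgres corpus and posts it to the deployed
+    # extraction service on port 15030, returning the parsed JSON result.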
+    import psycopg2
+    conn = psycopg2.connect(dbname='iepy', user='postgres', password='postgres', host='192.168.2.101')
+    cursor = conn.cursor()
+    sql = """select human_identifier,sourcetext from corpus_iedocument where human_identifier in ('{0}');""".format(docid)
+    cursor.execute(sql)
+    rows = cursor.fetchall()
+    user = {
+            "content": rows[0][1],
+            "id":docid
+            }
+    myheaders = {'Content-Type': 'application/json'}
+    _resp = requests.post("http://192.168.2.101:15030" + '/article_extract', json=user, headers=myheaders, verify=True)  # 15015 = old model; 15030 = latest model
+    resp_json = _resp.content.decode("utf-8")
+    return json.loads(resp_json)
+
+def get_result(docid):
+    import psycopg2
+    conn = psycopg2.connect(dbname='iepy', user='postgres', password='postgres', host='192.168.2.101')
+    cursor = conn.cursor()
+    sql = """select human_identifier,sourcetext from corpus_iedocument where human_identifier in ('{0}');""".format(docid)
+    cursor.execute(sql)
+    rows = cursor.fetchall()
+    return json.loads(predict(docid, rows[0][1]))
+
+def analys_person_phone():
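+    # Compares the pipeline output against the annotated role/contact/phone spreadsheet:
+    # for each labelled document it collects predicted role+person and person+phone pairs,
+    # accumulates match counts to report recall/precision, and keeps the mismatched documents
+    # in error lists for later inspection.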
+    import pandas as pd
+    import time
+    t1 = time.time()
+    df = pd.read_excel(r'E:\workspace\BiddingKG\BiddingKG\dl\person\实习生标注信息角色联系人电话.xlsx', encoding='utf-8')
+    lab_num = pos_num = pre_num = 0
+    lab_num2 = pos_num2 = pre_num2 = 0
+    lab_person = pos_person = pre_person = 0
+    lab_role = pos_role = pre_role = 0
+    person_errors = []
+    phone_errors = []
+    join_errors = []
+    person_name_errors =[]
+    role_name_errors =[]
+    for docid in set(df['doc_id']):
+        print('开始处理 : ',docid)
+        df_tmp = df[df.loc[:, 'doc_id'] == docid]
+        values = list(df_tmp['value'])
+        a = [it.split() for it in values]
+        rel_person = [it for it in a if it[1] == 'rel_person']
+        rel_phone = [it for it in a if it[1] == 'rel_phone']
+        r1 = get_result(str(docid))
+        # r1 = get_result_online(str(docid))
+        label_role_person = []  # labelled role + contact-person pairs
+        for rel in rel_person:
+            role = [it for it in a if it[0] == rel[2].split(':')[-1]]
+            person = [it for it in a if it[0] == rel[3].split(':')[-1]]
+            if person != [] and role != []:
+                label_role_person.append(role[0][-1] +'+'+ person[0][-1])
+        label_person_phone = []  # labelled contact-person + phone pairs
+        for rel in rel_phone:
+            person = [it for it in a if it[0] == rel[2].split(':')[-1]]
+            phone = [it for it in a if it[0] == rel[3].split(':')[-1]]
+            if person != [] and phone != []:
+                label_person_phone.append(person[0][-1] +'+'+ phone[0][-1])
+        role_person = []
+        person_phone = []
+        if r1.get('success','')==False:
+            print(docid, '接口返回失败 ')
+        else:
+            for v in r1['prem'].values():
+                roleList = v['roleList']
+                for role in roleList:
+                    for it in role[3]:
+                        role_person.append(role[1] +'+'+ it[0])
+                for role in roleList:
+                    for it in role[3]:
+                        person_phone.append(it[0] +'+'+ it[1])
+                    # print(set(label_person_phone))
+            # print(set(person_phone))
+        pos_num += len(set(role_person) & set(label_role_person))
+        lab_num += len(set(label_role_person))
+        pre_num += len(set(role_person))
+        if set(role_person)&set(label_role_person) != set(label_role_person):
+            person_errors.append([docid, set(label_role_person), set(role_person)])
+            # Logic for judging role-contact correctness: 1) check whether the predicted roles
+            # are all in the labelled roles, 2) check whether the predicted contacts are in the
+            # labelled contacts.
+            # print(set(role_person))
+            # print(set(label_role_person))
+        if set(label_person_phone) & set(person_phone)!=set(label_person_phone):
+            phone_errors.append([docid, set(label_person_phone), set(person_phone)])
+        pos_num2 += len(set(label_person_phone) & set(person_phone))
+        lab_num2 += len(set(label_person_phone))
+        pre_num2 += len(set(person_phone))
+
+        lab_person += len(set([it.split('+')[1] for it in label_role_person]))
+        pos_person += len(set([it.split('+')[1] for it in label_role_person])&set([it.split('+')[1] for it in role_person]))
+        pre_person += len(set([it.split('+')[1] for it in role_person]))
+
+        lab_role += len(set([it.split('+')[0] for it in label_role_person]))
+        pos_role += len(set([it.split('+')[0] for it in label_role_person])&set([it.split('+')[0] for it in role_person]))
+        pre_role += len(set([it.split('+')[0] for it in role_person]))
+
+        if set([it.split('+')[0] for it in label_role_person]) != set([it.split('+')[0] for it in role_person]):
+            if set([it.split('+')[1] for it in label_role_person]) != set([it.split('+')[1] for it in role_person]):
+                person_name_errors.append([docid,set(label_role_person), set(role_person)])
+            else:
+                role_name_errors.append([docid, set(label_role_person), set(role_person)])
+        else:
+            if set([it.split('+')[1] for it in label_role_person]) != set([it.split('+')[1] for it in role_person]):
+                person_name_errors.append([docid, set(label_role_person), set(role_person)])
+            elif set(label_role_person)!= set(role_person):
+                print(docid,set(label_role_person), set(role_person))
+                join_errors.append([docid,set(label_role_person), set(role_person)])
+    print('单独角色召回率:%.4f,准确率:%.4f'%(pos_role/lab_role, pos_role/pre_role))
+    print('单独联系人召回率:%.4f, 准确率:%.4f'%(pos_person/lab_person, pos_person/pre_person))
+    print('联系人召回率:%.4f, 准确率:%.4f' % (pos_num / lab_num, pos_num / pre_num))
+    print('电话召回率:%.4f,准确率:%.4f' % (pos_num2 / lab_num2, pos_num2 / pre_num2))
+    print('总耗时:',time.time()-t1)
+    return person_errors, phone_errors, join_errors, role_name_errors, person_name_errors
+
+def predict_fromdb(docid, dbname="sys_document_23"):
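+    # Loads one document from the iepy Postgres corpus (the commented-out pymysql variant read
+    # from the sys_document tables instead) and runs the extraction pipeline, returning the
+    # intermediate article/sentence/entity lists plus the predicted codeName and prem.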
+    # import pymysql
+    # conn = pymysql.Connect(host='rm-bp1quo50k0q2ok73gi.mysql.rds.aliyuncs.com', port=3306, db='bxkc', user='bxkc_read', passwd='bxkc_20RE18AD') #新账号密码
+    # cursor = conn.cursor()
+    # sql = "SELECT  docid as id, dochtmlcon as content  from {1} WHERE DOCID='{0}';".format(docid, dbname)
+    import psycopg2
+    conn = psycopg2.connect(dbname='iepy', user='postgres', password='postgres', host='192.168.2.101')
+    cursor = conn.cursor()
+    sql = """select human_identifier as id,sourcetext as content from corpus_iedocument where human_identifier in ('{0}');""".format(docid)
+    cursor.execute(sql)
+    rows = cursor.fetchall()
+    doc_id = rows[0][0]
+    text = rows[0][1]
+    # text = '竟然很明显的表达没识别为代理,代理机构名称:国信国采(北京)招标咨询有限责任公司,代理机构地址:北京市海淀区首体南路22号国兴大厦11层,  1.采购人信息名 称:北京市植物园。'
+    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", ""]],useselffool=True)
+    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
+    # print(codeName)
+    premPredict.predict(list_sentences, list_entitys)
+    roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
+    # print("epcPredict")
+    epcPredict.predict(list_sentences, list_entitys)
+    # print("entityLink")
+    entityLink.link_entitys(list_entitys)
+    # print("getPREMs")
+    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
+    return list_articles, list_sentences, list_entitys, codeName, prem
+
+if __name__=="__main__":
+    # import pandas as pd
+    # import math
+    # import pymysql
+    # conn = pymysql.Connect(host='rm-bp1quo50k0q2ok73gi.mysql.rds.aliyuncs.com', port=3306, db='bxkc', user='bxkc_read', passwd='bxkc_20RE18AD') #新账号密码
+    # cursor = conn.cursor()
+    # df = pd.read_excel('G:/大网站规则识别/1027统计入库top100编号.xlsx')
+    # docs_list = []
+    # for i in range(100):
+    #     web_no = df.loc[i, '编号']
+    #     # num = math.ceil(int(df.loc[i, '1019-1023入库公告数量']) * 0.01)
+    #     num = 10
+    #     sql = "SELECT DOCID,DOCCHANNEL,DOCHTMLCON,WEB_SOURCE_NO from sys_document_23 where WEB_SOURCE_NO='{0}' and DOCCHANNEL='101' and DOCID%9=1 limit {1}".format(
+    #         web_no, num)
+    #     #  rows = cursor.execute(sql) 此处代码错误 rows 需要用 cursor.fetchall方法获取
+    #     cursor.execute(sql)
+    #     rows = cursor.fetchall()
+    #     docs_list.extend(list(rows))
+    # df_doc = pd.DataFrame(docs_list, columns=['docid', 'channel', 'html', 'web_no'])
+    # codenames = []
+    # prems = []
+    # for docid,text in zip(df_doc['docid'], df_doc['html']):
+    #     list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[docid, text, "", "", ""]],
+    #                                                                                     useselffool=True)
+    #     codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
+    #     # print(codeName)
+    #     premPredict.predict(list_sentences, list_entitys)
+    #     roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
+    #     # print("epcPredict")
+    #     epcPredict.predict(list_sentences, list_entitys)
+    #     # print("entityLink")
+    #     entityLink.link_entitys(list_entitys)
+    #     # print("getPREMs")
+    #     prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
+    #     if codeName:
+    #         codenames.append(codeName[0][1])
+    #     else:
+    #         codenames.append(" ")
+    #     if prem:
+    #         prems.append(prem[0][1])
+    #     else:
+    #         prems.append(" ")
+    # df_doc['codename'] = pd.Series(codenames)
+    # df_doc['prem'] = pd.Series(prems)
+    # df_doc.to_excel('G:/大网站规则识别/大网站规则调整后预测结果20201124.xlsx', columns=['docid', 'channel', 'html', 'prem', 'codename', 'web_no'])
+
+
+    list_articles, list_sentences, list_entitys, codeName, prem = predict_fromdb('100006370',dbname="sys_document_25")  #sys_document_23
+    print(prem)
+    print(codeName)
+    entitys_all = [[[entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.begin_index,entity.end_index] for entity in entitys] for entitys in list_entitys]
+    for entitys in entitys_all:
+        # print(entitys)
+        # en_types = set([it[1] for it in entitys])
+        print([(it[0],it[1], it[2],it[3][it[2]],it[4],it[5],it[6]) for it in entitys if it[1] in ('org', 'company', 'person')])
+    print(list_articles[0].content)
+
+    # print(get_result('100000203'))
+
+    # person_errors, phone_errors, join_errors, role_name_errors, person_name_errors = analys_person_phone()
+    # import pickle
+    # with open('phone_errors.pkl','wb') as f:
+    #     pickle.dump(phone_errors, f)
+
+    # filename = "比地_52_79929693.html"
+    # #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
+    # # text = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
+    # # text = codecs.open('F:/工作文档/实体识别实体对其/20190320/比地_101_58511386.html', encoding='utf-8').read()
+    # docid = '100000203'
+    # r1 = get_result(docid)
+    # r2 = get_result_online(docid)
+    # rolperson = []
+    # person_phone = []
+    # for v in r1['prem'].values():
+    #     roleList = v['roleList']
+    #     for role in roleList:
+    #         for it in role[3]:
+    #             rolperson.append(role[1] + it[0])
+    #     for role in roleList:
+    #         for it in role[3]:
+    #             person_phone.append(it[0]+it[1])
+    # print(r1['prem'])
+    # print(r2['prem'])
+    #
+    # import psycopg2
+    # conn = psycopg2.connect(dbname='iepy', user='postgres', password='postgres', host='192.168.2.101')
+    # cursor = conn.cursor()
+    # sql = """select human_identifier,sourcetext from corpus_iedocument where human_identifier in ('95008163');"""
+    # cursor.execute(sql)
+    # rows = cursor.fetchall()
+    # # print(len(rows), rows)
+    # content = rows[0][1]
+    # # content = str(BeautifulSoup(text).find("div",id="pcontent"))
+    # # content = text
+    # # print('content: ',content)
+    # #text = codecs.open("C:\\Users\\User\\Desktop\\a.html","r",encoding="utf8").read()
+    # #text = "张家港保税区宏宇建设咨询有限公司受张家港市给排水公司委托,就二次供水泵房浊度仪进行国内组织公开招标采购,欢迎符合条件的供应商参加投标。"
+    # a = time.time()
+    # print("start")
+    # # print(predict("12",content))
+    # result = predict("12",content)
+    # print(json.loads(result))
+    # #test("12",text)
+    # print("takes",time.time()-a)
+    # _time2 = time.time()
+    # print(predict("12",content))
+    # _time3 = time.time()
+    # print("init takes:%d"%((_time2-_time1)-(_time3-_time2)))
+    # pass