
Merge remote-tracking branch 'origin/master' into master

Jiasheng, 4 years ago
commit 74f9b2ba2f
31 files changed, 2643 additions and 388 deletions
  1. BiddingKG/dl/common/Utils.py (+4 -1)
  2. BiddingKG/dl/complaint/models/punish_code.pb (BIN)
  3. BiddingKG/dl/complaint/punish_predictor.py (+473 -0)
  4. BiddingKG/dl/complaint/punish_rule.py (+98 -70)
  5. BiddingKG/dl/interface/Preprocessing.py (+294 -173)
  6. BiddingKG/dl/interface/getAttributes.py (+4 -1)
  7. BiddingKG/dl/interface/modelFactory.py (+1 -1)
  8. BiddingKG/dl/interface/predictor.py (+87 -7)
  9. BiddingKG/dl/interface/product_savedmodel/product.pb (BIN)
  10. BiddingKG/dl/interface/timesplit_model/saved_model.pb (BIN)
  11. BiddingKG/dl/interface/timesplit_model/variables/variables.data-00000-of-00001 (BIN)
  12. BiddingKG/dl/interface/timesplit_model/variables/variables.index (BIN)
  13. BiddingKG/dl/product/data/dev_data.pkl (BIN)
  14. BiddingKG/dl/product/data/dev_data2.pkl (BIN)
  15. BiddingKG/dl/product/data/train_data.pkl (BIN)
  16. BiddingKG/dl/product/data/train_data2.pkl (BIN)
  17. BiddingKG/dl/product/data_util.py (+155 -0)
  18. BiddingKG/dl/product/main.py (+117 -0)
  19. BiddingKG/dl/product/model/checkpoint (+2 -0)
  20. BiddingKG/dl/product/model/ner2.ckpt.data-00000-of-00001 (BIN)
  21. BiddingKG/dl/product/model/ner2.ckpt.index (BIN)
  22. BiddingKG/dl/product/model/ner2.ckpt.meta (BIN)
  23. BiddingKG/dl/product/model/product.pb (BIN)
  24. BiddingKG/dl/product/product_model.py (+240 -0)
  25. BiddingKG/dl/test/test4.py (+14 -5)
  26. BiddingKG/dl/test/测试所有提取信息.py (+278 -0)
  27. BiddingKG/dl/test/测试整个要素提取流程.py (+374 -0)
  28. BiddingKG/dl/time/model_label_time_classify.model.hdf5 (BIN)
  29. BiddingKG/dl/time/train_2.py (+219 -100)
  30. BiddingKG/maxcompute/contactDumplicate.py (+169 -0)
  31. BiddingKG/maxcompute/documentDumplicate.py (+114 -30)

+ 4 - 1
BiddingKG/dl/common/Utils.py

@@ -134,7 +134,10 @@ def limitRun(sess,list_output,feed_dict,MAX_BATCH=1024):
         while(_begin<len_sample):
             new_dict = dict()
             for _key in feed_dict.keys():
-                new_dict[_key] = feed_dict[_key][_begin:_begin+MAX_BATCH]
+                if isinstance(feed_dict[_key],(float,int,np.int32,np.float_,np.float16,np.float32,np.float64)):
+                    new_dict[_key] = feed_dict[_key]
+                else:
+                    new_dict[_key] = feed_dict[_key][_begin:_begin+MAX_BATCH]
             _output = sess.run(list_output,feed_dict=new_dict)
             for _index in range(len(list_output)):
                 list_result[_index].extend(_output[_index])
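
The hunk above makes limitRun tolerate scalar values in feed_dict (for example a dropout keep-probability) by passing them through unchanged instead of slicing them per batch. A minimal standalone sketch of the same rule; batch_feed is an illustrative helper name, not a function from this repo:

    import numpy as np

    def batch_feed(feed_dict, begin, max_batch=1024):
        # Slice array-like feed values down to the current batch window;
        # scalar hyper-parameters (e.g. keep_prob) are passed through as-is.
        new_dict = {}
        for key, value in feed_dict.items():
            if isinstance(value, (int, float, np.integer, np.floating)):
                new_dict[key] = value
            else:
                new_dict[key] = value[begin:begin + max_batch]
        return new_dict

    # usage: first batch of at most 1024 rows, keep_prob left untouched
    feed = batch_feed({"x": np.zeros((4096, 10)), "keep_prob": 1.0}, begin=0)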

BIN
BiddingKG/dl/complaint/models/punish_code.pb


+ 473 - 0
BiddingKG/dl/complaint/punish_predictor.py

@@ -0,0 +1,473 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/1/25 0025 16:35 
+
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2020/12/24 0024 15:23
+import re
+import os
+import time
+import tensorflow as tf
+# from BiddingKG.dl.common.Utils import *
+from tensorflow.contrib.crf import crf_log_likelihood
+from tensorflow.contrib.layers.python.layers import initializers
+# from keras.preprocessing.sequence import pad_sequences
+# import BiddingKG.dl.interface.Preprocessing as Preprocessing
+from BiddingKG.dl.interface.Preprocessing import *
+
+
+def decode(logits, trans, sequence_lengths, tag_num):
+    viterbi_sequences = []
+    for logit, length in zip(logits, sequence_lengths):
+        score = logit[:length]
+        viterbi_seq, viterbi_score = viterbi_decode(score, trans)
+        viterbi_sequences.append(viterbi_seq)
+    return viterbi_sequences
+
+class Punish_Extract():
+    def __init__(self, model_file = os.path.dirname(__file__)+"/models/punish_code.pb"):
+        print('model_file_path:',model_file)
+        self.sess = tf.Session(graph=tf.Graph())
+        self.code = ""
+        self.punish_dicition = ""
+        self.model_file = model_file #预测编号模型
+        self.load_model()
+
+    # 加载处罚编号预测模型
+    def load_model(self):
+        log("get model of time")
+        with self.sess.as_default():
+            with self.sess.graph.as_default():
+                output_graph_def = tf.GraphDef()
+                with open(self.model_file, 'rb') as f:
+                    output_graph_def.ParseFromString(f.read())
+                    tf.import_graph_def(output_graph_def, name="")
+                    self.sess.run(tf.global_variables_initializer())
+                    self.char_input = self.sess.graph.get_tensor_by_name("char_input:0")
+                    self.length = self.sess.graph.get_tensor_by_name("length:0")
+                    self.trans = self.sess.graph.get_tensor_by_name("crf_loss/transitons:0")
+                    self.logits = self.sess.graph.get_tensor_by_name("CRF/output/logits:0")
+
+    # 处罚编号预测
+    def predict_punishCode(self,list_sentences, MAX_AREA=5000):
+        '''
+        每个句子预测处罚编号
+        :param list_sentences: 多篇文章句子列表[[每篇文章句子列表]]
+        :param MAX_AREA: 控制最大每个句子长度,超过截断
+        :return: 处罚编号字符串,若有多个;号隔开
+        '''
+        re_ner = re.compile("12+?3")
+        article_ner_list = []
+        count = 0
+        with self.sess.as_default():
+            with self.sess.graph.as_default():
+                for sentences in list_sentences:
+                    count += 1
+                    # print(count)
+                    sentences.sort(key=lambda x: len(x.sentence_text), reverse=True)
+                    _begin_index = 0
+                    while True:
+                        MAX_LEN = len(sentences[_begin_index].sentence_text)
+                        if MAX_LEN > MAX_AREA:
+                            MAX_LEN = MAX_AREA
+                        _LEN = MAX_AREA // MAX_LEN
+                        sentence_len = [len(sentence.sentence_text) for sentence in sentences[_begin_index:_begin_index+_LEN]]
+                        sentences_x = []
+                        for sentence in sentences[_begin_index:_begin_index+_LEN]:
+                            sentence = sentence.sentence_text
+                            sentence = list(sentence)
+                            sentence2id = [getIndexOfWord(word) for word in sentence]
+                            sentences_x.append(sentence2id)
+                        sentences_x = pad_sequences(sentences_x, maxlen=MAX_LEN, padding="post", truncating="post")
+                        sentences_x = [np.array(x) for x in sentences_x]
+                        _logits, _trans = self.sess.run([self.logits, self.trans],
+                                                   feed_dict={self.char_input: np.array(sentences_x), self.length: sentence_len})
+                        viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=4)
+
+                        ner_list = []
+                        for _seq, sentence in zip(viterbi_sequence, sentences[_begin_index:_begin_index+_LEN]):
+                            sentence = sentence.sentence_text
+                            seq_id = ''.join([str(s) for s in _seq])
+                            if re_ner.search(seq_id):
+                                # print("sentence: ",sentence)
+                                for _ner in re_ner.finditer(seq_id):
+                                    start = _ner.start()
+                                    end = _ner.end()
+                                    n = sentence[start:end]
+                                    # print(n,'<==>',start,end)
+                                    # ner_list.append((n, start, end))
+                                    ner_list.append(n)  # 改为只返回实体字符
+                        # article_ner_list.append(ner_list)
+                        article_ner_list.append(';'.join(set(ner_list)))
+                        if _begin_index+_LEN >= len(sentences):
+                            break
+                        _begin_index += _LEN
+        return article_ner_list[0]
+
+    # 处罚类型
+    def get_punishType(self, x1, x2):
+        '''通过文章标题及内容判断文章类别
+        x1: 标题
+        x2: 内容
+        return 类别'''
+        # x1 = x1.replace('(','(').replace(')', ')').replace(' ','')
+        # x2 = x2.replace('(', '(').replace(')', ')').replace(' ', '')
+        '''标题正则'''
+        # 未知公告
+        unknow = re.compile('采购方式|采购公告|采购招标|磋商公告|谈判公告|交易公告$|征集|征求|招标公告|竞标公告|中标公告|'
+                            '成交公告|成交信息|流标公告|废标公告|城市管理考评|决算表|决算|预算|资格考试|招聘|选聘'
+                            '|聘请|拟录用|无违规违法|无此项信息|暂无工程投标违法|管理办法|指导意见|无投诉|投诉办法'
+                            '公共资源交易情况|绩效评价|考试成绩|付息公告|不动产|办证|印发|转发')  #|结果公示 部分是
+        # 投诉处理
+        tscl = re.compile('投诉不予[处受]理|投诉不成立|终止投诉|投诉终止|不予受理|投诉事?项?的?处理')
+        # 行政处罚
+        xzcf = re.compile('行政处罚|行政处理|政处罚|行政裁决|防罚|公罚|医罚|环罚|政罚|文罚|局罚|旅罚|财罚|运罚')
+        # 监督检查
+        jdjc = re.compile('(监督检查的?问?题?(处理|整改|记分|结果|决定|处罚))|监督处罚|调查处理|监督处理')
+        # 严重违法
+        yzwf = re.compile('严重违法失信|黑名单|失信名单')
+        # 不良行为
+        blxw = re.compile('((不良|失信|不诚信|差错|不规范|违规|违约|处罚|违法)(行为|记录|信息))|((违约|违规|违法)(处理|操作|情况|问题))'
+                          '|通报批评|记分管理|迟到|早退|缺席|虚假材料|弄虚作假|履职不到位|诚信考核扣分|串通投标'
+                          '|审核不通过|码一致|地址一致|扣分处理|扣分通知|扣[0-9]+分|责令整改|信用信息认定书$'
+                          '|关于.{,30}的处罚|关于.{,10}的?考评通报|关于.{,30}扣分情况|不规范代理行为'
+                          '|(取消|暂停|限制).{,50}((专家|评标|评委|投标|竞价|被抽取|中标|供应商|候选人)资格)'
+                          '|(代理服?务?机构).{,10}(扣分)|(专家).{,30}(扣分|记分|处罚)|对.{,30}处理|冻结.{,30}账号')
+        # 其他不良行为
+        other = re.compile('质疑|代理机构进场交易情况|网上投诉办理|信用奖惩|信用奖罚|进场工作.{,5}考核'
+                           '|举报处理|结果无效|成交无效|行政复议')
+
+        '''正文内容正则'''
+        # 投诉处理
+        tscl_c = re.compile('(投诉(人|单位)[1-9]?(名称)?[::])|(投诉事项[1-5一二三四五、]*部?分?(成立|予以受理))'
+                            '|((驳回|撤回|撤销|终止)[^,。]{,60}(投诉|质疑))')
+        # 行政处罚
+        xzcf_c = re.compile('((处理依据及结果|处理结果|处罚结果)).*行政处罚|如下行政处罚|行政处罚决定')
+        # 诚信加分
+        cxjf_c = re.compile('处罚结果.*诚信加分')
+        # 严重违法失信
+        yzwf_c = re.compile('工商部门严重违法失信起名单|严重违法失信的具体情形') #|严重违法失信的具体情形
+        # 不良行为
+        blxw_c = re.compile('(取消|暂停|限制).{,30}((专家|评标|评委|投标|采购|竞价|被抽取|中标|供应商)的?资格)'
+                            '|(处罚结果|处罚情况).*(扣[1-9]*分|记分|不良行为|不良记录|不良信用|不诚信|扣除信用'
+                            '|诚信档案|信用信息|取消.*资格|口头警告|处罚机关|责令改正|罚款|限制投标|暂扣|禁止'
+                            '|暂停|封禁|暂无|行政处罚)|处罚结果'
+                            '|处罚主题|禁止参与.{,10}政府采购活动|列入不良行为|处罚如下|如下处罚|违规处罚|处罚违规'
+                            '|责令改正|责令整改|处罚依据|进行以下处理|处理依据及结果|处理结果|处罚决定书|'
+                            '(不规范|不良|不诚信)行为记录')
+        # 其他不良行为
+        other_c = re.compile('质疑(人|单位)[1-9]?(名称)?:|公告期内受质疑')
+
+        if re.search(unknow, x1):
+            return re.search(unknow, x1).group(0), '未知类别'
+        elif re.search(yzwf, x1):
+            return re.search(yzwf, x1).group(0), '严重违法'
+        elif re.search(yzwf_c, x2):
+            return re.search(yzwf_c, x2).group(0), '严重违法'
+
+        elif re.search(tscl, x1):
+            return re.search(tscl, x1).group(0), '投诉处理'
+        elif re.search(xzcf, x1):
+            return re.search(xzcf, x1).group(0), '行政处罚'
+        elif re.search(jdjc, x1):
+            return re.search(jdjc, x1).group(0), '监督检查'
+        elif re.search(blxw, x1):
+            return re.search(blxw, x1).group(0), '不良行为'
+        elif re.search(other, x1):
+            return re.search(other, x1).group(0), '其他不良行为'
+
+        elif re.search(tscl_c, x2):
+            return re.search(tscl_c, x2).group(0), '投诉处理'
+        elif re.search(xzcf_c, x2):
+            return re.search(xzcf_c, x2).group(0), '行政处罚'
+        elif re.search(cxjf_c, x2):
+            return re.search(cxjf_c, x2).group(0), '诚信加分'
+
+        elif re.search(blxw_c, x2):
+            return re.search(blxw_c, x2).group(0), '不良行为'
+        elif re.search(other_c, x2):
+            return re.search(other_c, x2).group(0), '其他不良行为'
+
+        return ' ', '未知类别'
+
+    # 处罚决定
+    def get_punishDecision(self, x, x2):
+        '''通过正则匹配文章内容中的处理决定
+        x:正文内容
+        x2: 处罚类别
+        return 处理决定字符串'''
+        rule1 = re.compile(
+            '(((如下|以下|处理|研究|本机关|我机关|本局|我局)决定)|((决定|处理|处理意见|行政处罚|处罚)(如下|如下))'
+            '|((以下|如下)(决定|处理|处理意见|行政处罚|处罚))|处理依据及结果|处理结果|处罚结果|处罚情况|限制行为'
+            '|整改意见)[::].{5,}')
+        rule2 = re.compile(
+            '(((如下|以下|处理|研究|本机关|我机关|本局|我局)决定)|((决定|处理|处罚|处理意见)(如下|如下))'
+            '|((以下|如下)(决定|处理|处理意见|处罚))|处理依据及结果|处理结果|处罚结果|处罚情况|限制行为'
+            '|处罚内容)[:,,].{10,}')
+        rule3 = re.compile('考评结果:?.*')
+        rule4 = re.compile('(依据|根据)《.*》.*')
+        if x2 == '未知类别':
+            return ' '
+        elif re.search(rule1, x[-int(len(x)*0.4):]):
+            return re.search(rule1, x[-int(len(x)*0.4):]).group(0)
+        elif re.search(rule1, x[-int(len(x)*0.6):]):
+            return re.search(rule1, x[-int(len(x)*0.6):]).group(0)
+        elif re.search(rule2, x[-int(len(x)*0.7):]):
+            return re.search(rule2, x[-int(len(x)*0.7):]).group(0)
+        elif re.search(rule3, x[-int(len(x)*0.6):]):
+            return re.search(rule3, x[-int(len(x)*0.6):]).group(0)
+        elif re.search(rule4, x[-int(len(x)*0.4):]):
+            return re.search(rule4, x[-int(len(x)*0.4):]).group(0)
+        else:
+            return ' '
+
+    # 投诉是否成立
+    def get_punishWhether(self, x1, x2, x3):
+        '''通过正则匹配处理决定判断投诉是否成立
+        x1: 处理决定字符串
+        x2: 正文内容
+        x3: 处罚类别
+        return 投诉是否成立'''
+        p1 = re.compile('(投诉|投拆|质疑|举报)(事项|内容|事实)?[^不,。]{,10}(成立|属实|予以受理|予以支持)|责令|废标|(中标|成交)[^,。]{,10}无效'
+                        '|取消[^,。]{,60}资格|罚款|重新(组织|开展)?(招标|采购)|投诉成立|被投诉人存在违法违规行为'
+                        '|采购活动违法|(中标|评标|成交)结果无效')
+        p2 = re.compile('投诉不予[处受]理|((投诉|投拆|质疑|举报)(事项|内容|事实)?[^,。]{,10}(不成立|情?况?不属实|不予支持|缺乏事实依据))'
+                        '|((驳回|撤回|撤销|终止)[^,。]*(投诉|质疑|诉求))|终止[^,。]{,20}(行政裁决|投诉处理|采购活动)|投诉终止|投诉无效'
+                        '|予以驳回|不予受理|继续开展采购|被投诉人不存在违法违规行为|中标结果有效|投诉[^,。]{,10}不成立'
+                        '|维持被投诉人|不支持[^,。]{,20}投诉|无确凿证据')
+        if x3 != '投诉处理':
+            return ' '
+        elif re.search(p1, x1):
+            return '投诉成立'
+        elif re.search(p2, x1):
+            return '投诉无效'
+        elif re.search(p1, x2):
+            return '投诉成立'
+        elif re.search(p2, x2):
+            return '投诉无效'
+        return ' '
+
+    # 执法机构、处罚时间
+    def get_institution(self, title, sentences_l, entity_l):
+        '''
+        通过判断实体前信息判断改实体是否为执法机构
+        :param title: 文章标题
+        :param sentences_l: 单篇公告句子列表
+        :param entity_l: 单篇公告实体列表
+        :return: 执法机构及处罚时间字符串,多个的用;号隔开
+        '''
+        institutions = []
+        punishTimes = []
+        institution_1 = re.compile("(?:处罚执行部门|认定部门|执法机关名称|执法单位|通报部门|处罚机关|处罚部门)[::]")
+        punishTimes_1 = re.compile("(?:处罚日期|限制行为开始时间|曝光开始日期|处罚决定日期|处罚期限|处罚时间|处理日期|公告开始时间)[::]")
+        # 通过实体前面关键词判断是否为执法机构或处罚时间
+        for ner in entity_l:
+            if ner.entity_type == 'org':
+                left = sentences_l[ner.sentence_index].sentence_text[
+                       max(0, ner.wordOffset_begin - 15):ner.wordOffset_begin]
+                if institution_1.search(left):
+                    institutions.append(ner)
+                elif institutions != [] and ner.sentence_index == institutions[-1].sentence_index and \
+                        ner.wordOffset_begin - institutions[-1].wordOffset_end < 2 and \
+                        sentences_l[ner.sentence_index].sentence_text[
+                        ner.wordOffset_begin:institutions[-1].wordOffset_end] \
+                        in ['', '、', '和', '及']:
+                    institutions.append(ner)
+            elif ner.entity_type == 'time':
+                left = sentences_l[ner.sentence_index].sentence_text[
+                       max(0, ner.wordOffset_begin - 15):ner.wordOffset_begin]
+                if punishTimes_1.search(left):
+                    punishTimes.append(ner)
+
+        institution_title = re.compile("财政局|财政厅|监督管理局|公管局|公共资源局|委员会")
+        institution_time = re.compile(
+            "(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
+        ins = ""
+        ptime = ""
+        # 如果前面步骤找不到处罚机构则在标题找实体,并正则检查是否有关键词
+        if institutions == [] and len(title)>10:
+            title_ners = getNers([title], useselffool=True)
+            if title_ners[0]:
+                for title_ner in title_ners[0]:
+                    if title_ner[2] == 'org' and institution_title.search(title_ner[3]):
+                        ins = title_ner[3]
+                        break
+        if punishTimes == [] or institutions == []:
+            # 如果前面步骤还没找到要素,则通过公司实体后面是否有日期关键词,有则作为处罚机构和处罚时间
+            for ner in [ner for ner in entity_l if ner.entity_type == 'org'][-5:][::-1]:
+                right = sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_end:ner.wordOffset_end + 16]
+                if institution_time.search(right):
+                    if ins == '':
+                        ins = ner.entity_text
+                    if ptime == '':
+                        ptime = institution_time.search(right).group(1)
+                    break
+            # 前面步骤都没找到则判断最后一个时间实体是否在文章末尾,是则作为处罚时间
+            if ptime == '':
+                n_time = [ner for ner in entity_l if ner.entity_type == 'time']
+                if len(n_time) != 0:
+                    ner = n_time[-1]
+                    if ner.sentence_index == len(sentences_l) - 1:
+                        textLong = len(sentences_l[ner.sentence_index].sentence_text)
+                        if ner.wordOffset_end > textLong - 3 and len(ner.entity_text) > 3:
+                            ptime = ner.entity_text
+        institutions = [ner.entity_text for ner in institutions]
+        punishTimes = [ner.entity_text for ner in punishTimes]
+        if institutions == [] and ins != "":
+            institutions.append(ins)
+        if punishTimes == [] and ptime != "":
+            punishTimes.append(ptime)
+        return ";".join(institutions), ";".join(punishTimes)
+
+    # 投诉人、被投诉人、被处罚人
+    def get_complainant(self, punishType, sentences_l, entity_l):
+        '''
+        通过对公告类别、句子列表、实体列表正则寻找投诉人、被投诉人、处罚人
+        :param punishType: 公告处罚类别
+        :param sentences_l: 单篇公告句子列表
+        :param entity_l: 单篇公告实体列表
+        :return: 投诉人、被投诉人
+        '''
+        complainants = []  # 投诉人
+        punishPeople = []  # 被投诉人、被处罚人
+        size = 16
+        # 投诉人、质疑人
+        complainants_rule1 = re.compile(
+            "(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+        # 被处罚人,被投诉人
+        punishPeople_rule1 = re.compile(
+            "(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+        punishPeople_rule2_1 = re.compile(",$")
+        punishPeople_rule2_2 = re.compile("^[::]")
+        punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
+        punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
+
+        punish_l = []  # 处罚实体列表
+        tmp = []
+        for ner in [ner for ner in entity_l if ner.entity_type in ['org', 'company', 'person']]:
+            if tmp == []:
+                tmp.append(ner)
+            elif ner.entity_type == tmp[-1].entity_type and ner.sentence_index == tmp[-1].sentence_index and \
+                    ner.wordOffset_begin - tmp[-1].wordOffset_end < 2 \
+                    and sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_begin:tmp[-1].wordOffset_end] in [
+                '',
+                '、',
+                '和',
+                '及']:
+                tmp.append(ner)
+            elif ner.entity_type in ['org', 'company'] and tmp[-1].entity_type in ['org', 'company'] and \
+                    ner.sentence_index == tmp[-1].sentence_index and ner.wordOffset_begin - tmp[-1].wordOffset_end < 2 \
+                    and sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_begin:tmp[-1].wordOffset_end] in [
+                '',
+                '、',
+                '和',
+                '及']:
+                tmp.append(ner)
+            else:
+                punish_l.append(tmp)
+                tmp = [ner]
+        for ner_l in punish_l:
+            begin_index = ner_l[0].wordOffset_begin
+            end_index = ner_l[-1].wordOffset_end
+            left = sentences_l[ner_l[0].sentence_index].sentence_text[max(0, begin_index - size):begin_index]
+            right = sentences_l[ner_l[0].sentence_index].sentence_text[end_index:end_index + size]
+            if complainants_rule1.search(left):
+                complainants.append(ner_l)
+            elif punishPeople_rule1.search(left):
+                punishPeople.append(ner_l)
+            elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
+                if punishType == '投诉处理':
+                    complainants.append(ner_l)
+                else:
+                    punishPeople.append(ner_l)
+            elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
+                punishPeople.append(ner_l)
+        complainants = set([it.entity_text for l in complainants for it in l])
+        punishPeople = set([it.entity_text for l in punishPeople for it in l])
+        return ';'.join(complainants), ';'.join(punishPeople)
+
+    def get_punish_extracts(self,list_articles,list_sentences, list_entitys):
+        list_result = []
+        for article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
+            title = article.title
+            text=article.content
+            keyword, punishType = self.get_punishType(title, text)
+
+            # print('处罚类型:',punishType)
+            punish_code = self.predict_punishCode(list_sentences)
+            # print('处罚编号: ',punish_code)
+            institutions, punishTimes = self.get_institution(title, list_sentence, list_entity)
+            # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
+            punishDecision = self.get_punishDecision(text, punishType)
+            # print('处罚决定:',punishDecision)
+            punishWhether= self.get_punishWhether(punishDecision, text, punishType)
+            # print('投诉是否成立:',punishWhether)
+            complainants, punishPeople = self.get_complainant(punishType, list_sentence, list_entity)
+            # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
+            punish_dic = {'punish_code':punish_code,
+                          'punishType':punishType,
+                          'punishDecision':punishDecision,
+                         'complainants':complainants,
+                         'punishPeople':punishPeople,
+                         'punishWhether':punishWhether,
+                         'institutions':institutions,
+                         'punishTimes':punishTimes}
+            _count = 0
+            for k,v in punish_dic.items():
+                if v!="":
+                    _count += 1
+            if _count>=2 and punish_dic["punishType"]!="未知类别":
+                list_result.append({"punish":punish_dic})
+            else:
+                list_result.append({"punish":{}})
+        return list_result
+
+
+
+if __name__ == "__main__":
+    punish = Punish_Extract()
+
+    import pandas as pd
+    # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
+    df = pd.read_excel('G:/失信数据/ALLDATA_re2-3.xlsx', index=0)[2:10]
+    # i = 89
+    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+    # i = 92
+    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+
+    # t1 = time.time()
+    # for i in df.index:
+    #     punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+    #         get_punish_extracts(i, df.loc[i, 'PAGE_TITLE'], df.loc[i, 'PAGE_CONTENT'])
+    #     df.loc[i, '投诉人'] = complainants
+    #     df.loc[i, '被投诉人'] = punishPeople
+    #     df.loc[i, '执法机构'] = institutions
+    #     df.loc[i, '处罚时间'] = punishTimes
+    #     df.loc[i, '处罚编号'] = punish_code
+    #     print('完成第%d篇'%i)
+    # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=[['PAGE_TITLE', 'PAGE_CONTENT',
+    # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', 'punishPeople',
+    # #    'institution', 'punishTime', 'ner_test']])
+    # t2 = time.time()
+    # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
+    # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', '投诉人', 'punishPeople', '被投诉人',
+    # #    'institution', '执法机构', 'punishTime', '处罚时间', 'ner_test', '处罚编号'])
+    # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
+    #     '关键词', '类别', '处理决定', '投诉是否成立', '投诉人', '被投诉人','执法机构', '处罚时间', '处罚编号',
+    #    'DETAILLINK', 'sentences', 'PAGE_TIME'])
+    # t3 = time.time()
+    # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
+    s = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
+    # list_sentences = [s.split('。')]
+    # punish_code= punish.predict_punishCode( list_sentences)
+    # print(punish_code)
+
+    # punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+    #             get_punish_extracts(text=s)
+    # punish_dic = punish.get_punish_extracts(text=s)
+    # print(punish_dic)
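
predict_punishCode above runs the frozen CRF graph, joins the per-character tags into a string, and maps every span matching 12+?3 back onto the sentence text. A self-contained sketch of just that decoding step; the tag string below is made up, on the assumption that tags 1/2/3 mark the start, inside and end of a code span:

    import re

    re_ner = re.compile("12+?3")
    sentence = "处罚编号:厦财企〔2020〕12号,各有关单位"
    tags = "00000" + "1" + "2" * 10 + "3" + "0" * 6   # one illustrative tag per character

    codes = [sentence[m.start():m.end()] for m in re_ner.finditer(tags)]
    print(";".join(set(codes)))   # -> 厦财企〔2020〕12号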

+ 98 - 70
BiddingKG/dl/complaint/punish_rule.py

@@ -75,6 +75,7 @@ def BiLSTM_CRF_tfmodel(sess,weights):
             grads_vars = opt.compute_gradients(crf_loss)
             capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]
             train_op = opt.apply_gradients(capped_grads_vars,global_step)
+            print('tensor: ',char_input, length, trans, _logits)
             return char_input,_logits,target,length,crf_loss,trans,train_op
 
 def decode(logits, trans, sequence_lengths, tag_num):
@@ -125,6 +126,7 @@ class Punish_Extract():
                         sentences_x.append(sentence2id)
                     sentences_x = pad_sequences(sentences_x, maxlen=maxlen, padding="post", truncating="post")
                     sentences_x = [np.array(x) for x in sentences_x]
+                    print('punish tensor: ',self.logits, self.trans, self.char_input, self.length)
                     _logits, _trans = self.sess.run([self.logits, self.trans],
                                                feed_dict={self.char_input: np.array(sentences_x), self.length: sentence_len})
                     viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=4)
@@ -143,7 +145,7 @@ class Punish_Extract():
                                 # ner_list.append((n, start, end))
                                 ner_list.append(n)  # 改为只返回实体字符
                     # article_ner_list.append(ner_list)
-                    article_ner_list.append(''.join(set(ner_list)))
+                    article_ner_list.append(';'.join(set(ner_list)))
         return article_ner_list[0]
 
     # 处罚类型
@@ -261,7 +263,7 @@ class Punish_Extract():
         elif re.search(rule4, x[-int(len(x)*0.4):]):
             return re.search(rule4, x[-int(len(x)*0.4):]).group(0)
         else:
-            return ' '
+            return ''
 
     # 投诉是否成立
     def get_punishWhether(self, x1, x2, x3):
@@ -278,7 +280,7 @@ class Punish_Extract():
                         '|予以驳回|不予受理|继续开展采购|被投诉人不存在违法违规行为|中标结果有效|投诉[^,。]{,10}不成立'
                         '|维持被投诉人|不支持[^,。]{,20}投诉|无确凿证据')
         if x3 != '投诉处理':
-            return ' '
+            return ''
         elif re.search(p1, x1):
             return '投诉成立'
         elif re.search(p2, x1):
@@ -287,7 +289,7 @@ class Punish_Extract():
             return '投诉成立'
         elif re.search(p2, x2):
             return '投诉无效'
-        return ' '
+        return ''
 
     # 执法机构、处罚时间
     def get_institution(self, title, sentences_l, entity_l):
@@ -296,7 +298,7 @@ class Punish_Extract():
         :param title: 文章标题
         :param sentences_l: 单篇公告句子列表
         :param entity_l: 单篇公告实体列表
-        :return: 执法机构及处罚时间字符串,多个的用号隔开
+        :return: 执法机构及处罚时间字符串,多个的用;号隔开
         '''
         institutions = []
         punishTimes = []
@@ -359,7 +361,7 @@ class Punish_Extract():
             institutions.append(ins)
         if punishTimes == [] and ptime != "":
             punishTimes.append(ptime)
-        return ";".join(institutions), ";".join(punishTimes)
+        return ";".join(institutions), ";".join(punishTimes)
 
     # 投诉人、被投诉人、被处罚人
     def get_complainant(self, punishType, sentences_l, entity_l):
@@ -426,7 +428,7 @@ class Punish_Extract():
                 punishPeople.append(ner_l)
         complainants = set([it.entity_text for l in complainants for it in l])
         punishPeople = set([it.entity_text for l in punishPeople for it in l])
-        return ';'.join(complainants), ';'.join(punishPeople)
+        return ';'.join(complainants), ';'.join(punishPeople)
 
     def get_punish_extracts_backup(self, doc_id=' ', title=' ', text=' '):
         list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", ""]],
@@ -459,73 +461,99 @@ class Punish_Extract():
         for article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
             title = article.title
             text=article.content
+
             keyword, punishType = self.get_punishType(title, text)
-            if punishType == "未知类别":
-                list_result.append({"punish":{}})
-            else:
-                # print('处罚类型:',punishType)
-                punish_code = self.predict_punishCode(list_sentences)
-                # print('处罚编号: ',punish_code)
-                institutions, punishTimes = self.get_institution(title, list_sentence, list_entity)
-                # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
-                punishDecision = self.get_punishDecision(text, punishType)
-                # print('处罚决定:',punishDecision)
-                punishWhether= self.get_punishWhether(punishDecision, text, punishType)
-                # print('投诉是否成立:',punishWhether)
-                complainants, punishPeople = self.get_complainant(punishType, list_sentence, list_entity)
-                # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
-                punish_dic = {'punish_code':punish_code,
-                              'punishType':punishType,
-                              'punishDecision':punishDecision,
-                             'complainants':complainants,
-                             'punishPeople':punishPeople,
-                             'punishWhether':punishWhether,
-                             'institutions':institutions,
-                             'punishTimes':punishTimes}
+            # print('处罚类型:',punishType)
+            punish_code = self.predict_punishCode(list_sentences)
+            # print('处罚编号: ',punish_code)
+            institutions, punishTimes = self.get_institution(title, list_sentence, list_entity)
+            # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
+            punishDecision = self.get_punishDecision(text, punishType)
+            # print('处罚决定:',punishDecision)
+            punishWhether= self.get_punishWhether(punishDecision, text, punishType)
+            # print('投诉是否成立:',punishWhether)
+            complainants, punishPeople = self.get_complainant(punishType, list_sentence, list_entity)
+            # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
+            punish_dic = {'punish_code':punish_code,
+                          'punishType':punishType,
+                          'punishDecision':punishDecision,
+                         'complainants':complainants,
+                         'punishPeople':punishPeople,
+                         'punishWhether':punishWhether,
+                         'institutions':institutions,
+                         'punishTimes':punishTimes}
+            _count = 0
+            for k,v in punish_dic.items():
+                if v!="":
+                    _count += 1
+            if _count>=2 and punish_dic["punishType"]!="未知类别":
                 list_result.append({"punish":punish_dic})
+            else:
+                list_result.append({"punish":{}})
         return list_result
 
-if __name__ == "__main__":
-    punish = Punish_Extract(model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt")
+def save_punish_code_model():
+    model_folder = os.path.dirname(__file__) + "/models/21-0.9990081295021194-0.3647936"
+    output_graph = os.path.dirname(__file__) + "/models/punish_code.pb"
+    ckpt = tf.train.get_checkpoint_state(model_folder)
+    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
+        input_checkpoint = ckpt.model_checkpoint_path
+        saver = tf.train.import_meta_graph(input_checkpoint+".meta", clear_devices=True)
+        graph = tf.get_default_graph()
+        input_graph_def = graph.as_graph_def()
+        with tf.Session() as sess:
+            saver.restore(sess, input_checkpoint)
+            output_graph_def = graph_util.convert_variables_to_constants(
+                sess = sess,
+                input_graph_def = input_graph_def,
+                output_node_names=["char_input","length","crf_loss/transitons","CRF/output/logits"]
+            )
+            with tf.gfile.GFile(output_graph, "wb") as f:
+                f.write(output_graph_def.SerializeToString())
 
-    import pandas as pd
-    # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
-    df = pd.read_excel('G:/失信数据/ALLDATA_re2-3.xlsx', index=0)[2:10]
-    # i = 89
-    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
-    # i = 92
-    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
 
-    # t1 = time.time()
-    # for i in df.index:
-    #     punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
-    #         get_punish_extracts(i, df.loc[i, 'PAGE_TITLE'], df.loc[i, 'PAGE_CONTENT'])
-    #     df.loc[i, '投诉人'] = complainants
-    #     df.loc[i, '被投诉人'] = punishPeople
-    #     df.loc[i, '执法机构'] = institutions
-    #     df.loc[i, '处罚时间'] = punishTimes
-    #     df.loc[i, '处罚编号'] = punish_code
-    #     print('完成第%d篇'%i)
-    # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=[['PAGE_TITLE', 'PAGE_CONTENT',
-    # #     '关键词', '类别', '处理决定', '投诉是否成立',
-    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', 'punishPeople',
-    # #    'institution', 'punishTime', 'ner_test']])
-    # t2 = time.time()
+if __name__ == "__main__":
+    save_punish_code_model()
+    # punish = Punish_Extract(model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt")
+    #
+    # import pandas as pd
+    # # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
+    # df = pd.read_excel('G:/失信数据/ALLDATA_re2-3.xlsx', index=0)[2:10]
+    # # i = 89
+    # # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+    # # i = 92
+    # # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+    #
+    # # t1 = time.time()
+    # # for i in df.index:
+    # #     punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+    # #         get_punish_extracts(i, df.loc[i, 'PAGE_TITLE'], df.loc[i, 'PAGE_CONTENT'])
+    # #     df.loc[i, '投诉人'] = complainants
+    # #     df.loc[i, '被投诉人'] = punishPeople
+    # #     df.loc[i, '执法机构'] = institutions
+    # #     df.loc[i, '处罚时间'] = punishTimes
+    # #     df.loc[i, '处罚编号'] = punish_code
+    # #     print('完成第%d篇'%i)
+    # # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=[['PAGE_TITLE', 'PAGE_CONTENT',
+    # # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', 'punishPeople',
+    # # #    'institution', 'punishTime', 'ner_test']])
+    # # t2 = time.time()
+    # # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
+    # # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', '投诉人', 'punishPeople', '被投诉人',
+    # # #    'institution', '执法机构', 'punishTime', '处罚时间', 'ner_test', '处罚编号'])
     # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
-    # #     '关键词', '类别', '处理决定', '投诉是否成立',
-    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', '投诉人', 'punishPeople', '被投诉人',
-    # #    'institution', '执法机构', 'punishTime', '处罚时间', 'ner_test', '处罚编号'])
-    # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
-    #     '关键词', '类别', '处理决定', '投诉是否成立', '投诉人', '被投诉人','执法机构', '处罚时间', '处罚编号',
-    #    'DETAILLINK', 'sentences', 'PAGE_TIME'])
-    # t3 = time.time()
-    # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
-    s = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
-    # list_sentences = [s.split('。')]
-    # punish_code= punish.predict_punishCode( list_sentences)
-    # print(punish_code)
-
-    # punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
-    #             get_punish_extracts(text=s)
-    punish_dic = punish.get_punish_extracts_backup(text=s)
-    print(punish_dic)
+    # #     '关键词', '类别', '处理决定', '投诉是否成立', '投诉人', '被投诉人','执法机构', '处罚时间', '处罚编号',
+    # #    'DETAILLINK', 'sentences', 'PAGE_TIME'])
+    # # t3 = time.time()
+    # # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
+    # s = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
+    # # list_sentences = [s.split('。')]
+    # # punish_code= punish.predict_punishCode( list_sentences)
+    # # print(punish_code)
+    #
+    # # punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+    # #             get_punish_extracts(text=s)
+    # punish_dic = punish.get_punish_extracts_backup(text=s)
+    # print(punish_dic)
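
Besides the new save_punish_code_model export helper, the rewritten get_punish_extracts no longer returns early on '未知类别': it always builds punish_dic and then keeps the result only when at least two fields are non-empty and the type is known. The same acceptance rule as a tiny standalone check (accept is an illustrative name, not part of the class):

    def accept(punish_dic):
        # Mirror the filter above: keep a result only if the category is known
        # and at least two of the extracted fields are non-empty strings.
        non_empty = sum(1 for v in punish_dic.values() if v != "")
        return non_empty >= 2 and punish_dic.get("punishType") != "未知类别"

    print(accept({"punishType": "投诉处理", "punish_code": "厦财企〔2020〕12号",
                  "punishDecision": "", "punishWhether": ""}))   # True
    print(accept({"punishType": "未知类别", "punish_code": "闽建筑招〔2018〕5号"}))  # False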

+ 294 - 173
BiddingKG/dl/interface/Preprocessing.py

@@ -107,7 +107,7 @@ def tableToText(soup):
             tr_line = []
             tds = tr.findChildren(['td','th'], recursive=False)
             for td in tds:
-                tr_line.append([re.sub('\xa0','',segment(td)),0])
+                tr_line.append([re.sub('\xa0','',segment(td,final=False)),0])
                 #tr_line.append([td.get_text(),0])
             inner_table.append(tr_line)
         return inner_table                          
@@ -628,7 +628,7 @@ def tableToText(soup):
         # packPattern = "(标包|[标包][号段名])"
         packPattern = "(标包|[标包][号段名]|((项目|物资|设备|场次|标段|标的|产品)(名称)))"  # 2020/11/23 大网站规则,补充采购类包名
         rankPattern = "(排名|排序|名次|序号|评标结果|评审结果|是否中标)"  # 2020/11/23 大网站规则,添加序号为排序
-        entityPattern = "(候选|([中投]标|报价)(人|单位|候选)|单位名称|供应商)"
+        entityPattern = "(候选|([中投]标|报价)|单位名称|供应商|金额)"
         height = len(inner_table)
         width = len(inner_table[0])
         text = ""
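
The widened entityPattern above drops the mandatory 人/单位/候选 suffix after 中标/投标/报价 and also treats amount headers (金额) as entity columns. A quick standalone check of what the new pattern matches, using only the regex from this hunk:

    import re

    entityPattern = "(候选|([中投]标|报价)|单位名称|供应商|金额)"
    for head in ["中标金额:", "投标人名称:", "报价:", "序号:"]:
        print(head, bool(re.search(entityPattern, head)))
    # 中标金额:, 投标人名称: and 报价: match; 序号: does not
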
@@ -639,7 +639,8 @@ def tableToText(soup):
             head_end = head_list[head_i+1]
                 
             direct = getDirect(inner_table, head_begin, head_end)
-            
+
+
             #若只有一行,则直接按行读取
             if head_end-head_begin==1:
                 text_line = ""
@@ -657,9 +658,75 @@ def tableToText(soup):
                     text_line = text_line+"。" if text_line!="" else text_line
                 text += text_line
             else:
-        
+                #构建一个共现矩阵
+                table_occurence = []
+                for i in range(head_begin,head_end):
+                    line_oc = []
+                    for j in range(width):
+                        cell = inner_table[i][j]
+                        line_oc.append({"text":cell[0],"type":cell[1],"occu_count":0,"left_head":"","top_head":""})
+                    table_occurence.append(line_oc)
+
+
+                occu_height = len(table_occurence)
+                occu_width = len(table_occurence[0]) if len(table_occurence)>0 else 0
+                #为每个属性值寻找表头
+                for i in range(occu_height):
+                    for j in range(occu_width):
+                        cell = table_occurence[i][j]
+                        #是属性值
+                        if cell["type"]==0 and cell["text"]!="":
+                            left_head = ""
+                            top_head = ""
+
+                            find_flag = False
+                            temp_head = ""
+                            for loop_i in range(1,i+1):
+                                if not key_direct:
+                                    key_values = [1,2]
+                                else:
+                                    key_values = [1]
+                                if table_occurence[i-loop_i][j]["type"] in key_values:
+                                    if find_flag:
+                                        if table_occurence[i-loop_i][j]["text"]!=temp_head:
+                                            top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
+                                    else:
+                                        top_head = table_occurence[i-loop_i][j]["text"]+":"+top_head
+                                    find_flag = True
+                                    temp_head = table_occurence[i-loop_i][j]["text"]
+                                    table_occurence[i-loop_i][j]["occu_count"] += 1
+                                else:
+                                    #找到表头后遇到属性值就返回
+                                    if find_flag:
+                                        break
+
+
+                            cell["top_head"] += top_head
+                            find_flag = False
+                            temp_head = ""
+
+
+
+                            for loop_j in range(1,j+1):
+                                if not key_direct:
+                                    key_values = [1,2]
+                                else:
+                                    key_values = [2]
+                                if table_occurence[i][j-loop_j]["type"] in key_values:
+                                    if find_flag:
+                                        if table_occurence[i][j-loop_j]["text"]!=temp_head:
+                                            left_head = table_occurence[i][j-loop_j]["text"]+":"+left_head
+                                    else:
+                                        left_head = table_occurence[i][j-loop_j]["text"]+":"+left_head
+                                    find_flag = True
+                                    temp_head = table_occurence[i][j-loop_j]["text"]
+                                    table_occurence[i][j-loop_j]["occu_count"] += 1
+                                else:
+                                    if find_flag:
+                                        break
+                            cell["left_head"] += left_head
                 if direct=="row":
-                    for i in range(head_begin,head_end):
+                    for i in range(occu_height):
                         pack_text = ""
                         rank_text = ""
                         entity_text = ""
@@ -667,131 +734,196 @@ def tableToText(soup):
                         #在同一句话中重复的可以去掉
                         text_set = set()
                         for j in range(width):
-                            cell = inner_table[i][j]
-                            #是属性值
-                            if cell[1]==0 and cell[0]!="":
-                                head = ""
-                                
-                                find_flag = False
-                                temp_head = ""
-                                for loop_i in range(0,i+1-head_begin):
-                                    if not key_direct:
-                                        key_values = [1,2]
-                                    else:
-                                        key_values = [1]
-                                    if inner_table[i-loop_i][j][1] in key_values:
-                                        if find_flag:
-                                            if inner_table[i-loop_i][j][0]!=temp_head:
-                                                head = inner_table[i-loop_i][j][0]+":"+head
-                                        else:
-                                            head = inner_table[i-loop_i][j][0]+":"+head
-                                        find_flag = True
-                                        temp_head = inner_table[i-loop_i][j][0]
-                                    else:
-                                        #找到表头后遇到属性值就返回
-                                        if find_flag:
-                                            break
-                                
-                                find_flag = False
-                                temp_head = ""
-                                
-                                
-                                
-                                for loop_j in range(1,j+1):
-                                    if not key_direct:
-                                        key_values = [1,2]
-                                    else:
-                                        key_values = [2]
-                                    if inner_table[i][j-loop_j][1] in key_values:
-                                        if find_flag:
-                                            if inner_table[i][j-loop_j][0]!=temp_head:
-                                                head = inner_table[i][j-loop_j][0]+":"+head
-                                        else:
-                                            head = inner_table[i][j-loop_j][0]+":"+head
-                                        find_flag = True
-                                        temp_head = inner_table[i][j-loop_j][0]
-                                    else:
-                                        if find_flag:
-                                            break
-                                
-                                if str(head+inner_table[i][j][0]) in text_set:
+                            cell = table_occurence[i][j]
+                            if cell["type"]==0 or (cell["type"]==1 and cell["occu_count"]==0):
+
+                                cell = table_occurence[i][j]
+                                head = (cell["top_head"]+":") if len(cell["top_head"])>0 else ""
+                                head += cell["left_head"]
+                                if str(head+cell["text"]) in text_set:
                                     continue
                                 if re.search(packPattern,head) is not None:
-                                    pack_text += head+inner_table[i][j][0]+","
+                                    pack_text += head+cell["text"]+","
                                 elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
                                     #排名替换为同一种表达
-                                    rank_text += head+inner_table[i][j][0]+","
+                                    rank_text += head+cell["text"]+","
                                     #print(rank_text)
                                 elif re.search(entityPattern,head) is not None:
-                                    entity_text += head+inner_table[i][j][0]+","
+                                    entity_text += head+cell["text"]+","
                                     #print(entity_text)
                                 else:
-                                    text_line += head+inner_table[i][j][0]+","
-                                text_set.add(str(head+inner_table[i][j][0]))
+                                    text_line += head+cell["text"]+","
+                                text_set.add(str(head+cell["text"]))
+
                         text += pack_text+rank_text+entity_text+text_line
                         text = text[:-1]+"。" if len(text)>0 else text
+
                 else:
-                    for j in range(width):
-                    
+                    for j in range(occu_width):
+                        pack_text = ""
                         rank_text = ""
                         entity_text = ""
                         text_line = ""
                         text_set = set()
-                        for i in range(head_begin,head_end):
-                            cell = inner_table[i][j]
-                            #是属性值
-                            if cell[1]==0 and cell[0]!="":
-                                find_flag = False
-                                head = ""
-                                temp_head = ""
-                                
-                                for loop_j in range(1,j+1):
-                                    if not key_direct:
-                                        key_values = [1,2]
-                                    else:
-                                        key_values = [2]
-                                    if inner_table[i][j-loop_j][1] in key_values:
-                                        if find_flag:
-                                            if inner_table[i][j-loop_j][0]!=temp_head:
-                                                head = inner_table[i][j-loop_j][0]+":"+head
-                                        else:
-                                            head = inner_table[i][j-loop_j][0]+":"+head
-                                        find_flag = True
-                                        temp_head = inner_table[i][j-loop_j][0]
-                                    else:
-                                        if find_flag:
-                                            break
-                                find_flag = False
-                                temp_head = ""
-                                for loop_i in range(0,i+1-head_begin):
-                                    if not key_direct:
-                                        key_values = [1,2]
-                                    else:
-                                        key_values = [1]
-                                    if inner_table[i-loop_i][j][1] in key_values:
-                                        if find_flag:
-                                            if inner_table[i-loop_i][j][0]!=temp_head:
-                                                head = inner_table[i-loop_i][j][0]+":"+head
-                                        else:
-                                            head = inner_table[i-loop_i][j][0]+":"+head
-                                        find_flag = True
-                                        temp_head = inner_table[i-loop_i][j][0]
-                                    else:
-                                        if find_flag:
-                                            break
-                                if str(head+inner_table[i][j][0]) in text_set:
+                        for i in range(occu_height):
+                            cell = table_occurence[i][j]
+                            if cell["type"]==0 or (cell["type"]==1 and cell["occu_count"]==0):
+
+                                cell = table_occurence[i][j]
+                                head = (cell["left_head"]+"") if len(cell["left_head"])>0 else ""
+                                head += cell["top_head"]
+                                if str(head+cell["text"]) in text_set:
                                     continue
-                                if re.search(rankPattern,head) is not None:
-                                    rank_text += head+inner_table[i][j][0]+","
+                                if re.search(packPattern,head) is not None:
+                                    pack_text += head+cell["text"]+","
+                                elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
+                                    #排名替换为同一种表达
+                                    rank_text += head+cell["text"]+","
                                     #print(rank_text)
                                 elif re.search(entityPattern,head) is not None:
-                                    entity_text += head+inner_table[i][j][0]+","
+                                    entity_text += head+cell["text"]+","
                                     #print(entity_text)
                                 else:
-                                    text_line += head+inner_table[i][j][0]+","
-                                text_set.add(str(head+inner_table[i][j][0]))
-                        text += rank_text+entity_text+text_line
+                                    text_line += head+cell["text"]+","
+                                text_set.add(str(head+cell["text"]))
+                        text += pack_text+rank_text+entity_text+text_line
                         text = text[:-1]+"。" if len(text)>0 else text
+
+
+                # if direct=="row":
+                #     for i in range(head_begin,head_end):
+                #         pack_text = ""
+                #         rank_text = ""
+                #         entity_text = ""
+                #         text_line = ""
+                #         #在同一句话中重复的可以去掉
+                #         text_set = set()
+                #         for j in range(width):
+                #             cell = inner_table[i][j]
+                #             #是属性值
+                #             if cell[1]==0 and cell[0]!="":
+                #                 head = ""
+                #
+                #                 find_flag = False
+                #                 temp_head = ""
+                #                 for loop_i in range(0,i+1-head_begin):
+                #                     if not key_direct:
+                #                         key_values = [1,2]
+                #                     else:
+                #                         key_values = [1]
+                #                     if inner_table[i-loop_i][j][1] in key_values:
+                #                         if find_flag:
+                #                             if inner_table[i-loop_i][j][0]!=temp_head:
+                #                                 head = inner_table[i-loop_i][j][0]+":"+head
+                #                         else:
+                #                             head = inner_table[i-loop_i][j][0]+":"+head
+                #                         find_flag = True
+                #                         temp_head = inner_table[i-loop_i][j][0]
+                #                     else:
+                #                         #找到表头后遇到属性值就返回
+                #                         if find_flag:
+                #                             break
+                #
+                #                 find_flag = False
+                #                 temp_head = ""
+                #
+                #
+                #
+                #                 for loop_j in range(1,j+1):
+                #                     if not key_direct:
+                #                         key_values = [1,2]
+                #                     else:
+                #                         key_values = [2]
+                #                     if inner_table[i][j-loop_j][1] in key_values:
+                #                         if find_flag:
+                #                             if inner_table[i][j-loop_j][0]!=temp_head:
+                #                                 head = inner_table[i][j-loop_j][0]+":"+head
+                #                         else:
+                #                             head = inner_table[i][j-loop_j][0]+":"+head
+                #                         find_flag = True
+                #                         temp_head = inner_table[i][j-loop_j][0]
+                #                     else:
+                #                         if find_flag:
+                #                             break
+                #
+                #                 if str(head+inner_table[i][j][0]) in text_set:
+                #                     continue
+                #                 if re.search(packPattern,head) is not None:
+                #                     pack_text += head+inner_table[i][j][0]+","
+                #                 elif re.search(rankPattern,head) is not None:   # 2020/11/23 大网站规则发现问题,if 改elif
+                #                     #排名替换为同一种表达
+                #                     rank_text += head+inner_table[i][j][0]+","
+                #                     #print(rank_text)
+                #                 elif re.search(entityPattern,head) is not None:
+                #                     entity_text += head+inner_table[i][j][0]+","
+                #                     #print(entity_text)
+                #                 else:
+                #                     text_line += head+inner_table[i][j][0]+","
+                #                 text_set.add(str(head+inner_table[i][j][0]))
+                #         text += pack_text+rank_text+entity_text+text_line
+                #         text = text[:-1]+"。" if len(text)>0 else text
+                # else:
+                #     for j in range(width):
+                #
+                #         rank_text = ""
+                #         entity_text = ""
+                #         text_line = ""
+                #         text_set = set()
+                #         for i in range(head_begin,head_end):
+                #             cell = inner_table[i][j]
+                #             #是属性值
+                #             if cell[1]==0 and cell[0]!="":
+                #                 find_flag = False
+                #                 head = ""
+                #                 temp_head = ""
+                #
+                #                 for loop_j in range(1,j+1):
+                #                     if not key_direct:
+                #                         key_values = [1,2]
+                #                     else:
+                #                         key_values = [2]
+                #                     if inner_table[i][j-loop_j][1] in key_values:
+                #                         if find_flag:
+                #                             if inner_table[i][j-loop_j][0]!=temp_head:
+                #                                 head = inner_table[i][j-loop_j][0]+":"+head
+                #                         else:
+                #                             head = inner_table[i][j-loop_j][0]+":"+head
+                #                         find_flag = True
+                #                         temp_head = inner_table[i][j-loop_j][0]
+                #                     else:
+                #                         if find_flag:
+                #                             break
+                #                 find_flag = False
+                #                 temp_head = ""
+                #                 for loop_i in range(0,i+1-head_begin):
+                #                     if not key_direct:
+                #                         key_values = [1,2]
+                #                     else:
+                #                         key_values = [1]
+                #                     if inner_table[i-loop_i][j][1] in key_values:
+                #                         if find_flag:
+                #                             if inner_table[i-loop_i][j][0]!=temp_head:
+                #                                 head = inner_table[i-loop_i][j][0]+":"+head
+                #                         else:
+                #                             head = inner_table[i-loop_i][j][0]+":"+head
+                #                         find_flag = True
+                #                         temp_head = inner_table[i-loop_i][j][0]
+                #                     else:
+                #                         if find_flag:
+                #                             break
+                #                 if str(head+inner_table[i][j][0]) in text_set:
+                #                     continue
+                #                 if re.search(rankPattern,head) is not None:
+                #                     rank_text += head+inner_table[i][j][0]+","
+                #                     #print(rank_text)
+                #                 elif re.search(entityPattern,head) is not None:
+                #                     entity_text += head+inner_table[i][j][0]+","
+                #                     #print(entity_text)
+                #                 else:
+                #                     text_line += head+inner_table[i][j][0]+","
+                #                 text_set.add(str(head+inner_table[i][j][0]))
+                #         text += rank_text+entity_text+text_line
+                #         text = text[:-1]+"。" if len(text)>0 else text
         return text
     
     def removeFix(inner_table,fix_value="~~"):
@@ -856,12 +988,13 @@ def tableToText(soup):
     # return list_innerTable
 
 # data cleaning
-def segment(soup):
+def segment(soup,final=True):
     # print("==")
     # print(soup)
     # print("====")
     #segList = ["tr","div","h1", "h2", "h3", "h4", "h5", "h6", "header"]
-    if soup.name=="td":
+    subspaceList = ["td",'a',"span","p"]
+    if soup.name in subspaceList:
         #count how many leaf descendants carry text
         _count = 0
         for child in soup.find_all(recursive=True):
@@ -874,27 +1007,26 @@ def segment(soup):
                 if '...' in soup.get_text() and (soup.get_text()[:-3]).strip() in soup.attrs['title']:
                     text = soup.attrs['title']
 
-            _list = []
-            for x in re.split("\s+",text):
-                if x.strip()!="":
-                    _list.append(len(x))
-            if len(_list)>0:
-                _minLength = min(_list)
-                if _minLength>2:
-                    _substr = ","
-                else:
-                    _substr = ""
-            else:
-                _substr = ""
-            text = _substr.join(re.split("(\s+)",text))
+            # _list = []
+            # for x in re.split("\s+",text):
+            #     if x.strip()!="":
+            #         _list.append(len(x))
+            # if len(_list)>0:
+            #     _minLength = min(_list)
+            #     if _minLength>2:
+            #         _substr = ","
+            #     else:
+            #         _substr = ""
+            # else:
+            #     _substr = ""
             text = text.replace("\r\n",",").replace("\n",",")
-            text = re.sub("^[,\s]*|[,\s]*$","",text)
+            text = re.sub("\s+","##space##",text)
             return text
     segList = ["title"]
     commaList = ["div","br","td","p"]
     #commaList = []
     spaceList = ["span"]
-    subspaceList = ["td",'a',"span","p"]
+
     tbodies = soup.find_all('tbody')
     if len(tbodies) == 0:
         tbodies = soup.find_all('table')
@@ -908,8 +1040,8 @@ def segment(soup):
         # if child.name in subspaceList:
         #     child.insert_before("#subs"+str(child.name)+"#")
         #     child.insert_after("#sube"+str(child.name)+"#")
-        if child.name in spaceList:
-            child.insert_after(" ")
+        # if child.name in spaceList:
+        #     child.insert_after(" ")
     text = str(soup.get_text())
 
     #replace ASCII colons with full-width Chinese colons
@@ -920,67 +1052,56 @@ def segment(soup):
     text = re.sub("(?<=[\u4e00-\u9fa5]);|;(?=[\u4e00-\u9fa5])",";",text)
     
          
-    #删除标签中的所有空格
-    for subs in subspaceList:
-        patten = "#subs"+str(subs)+"#(.*?)#sube"+str(subs)+"#"
-        while(True):
-            oneMatch = re.search(re.compile(patten),text)
-            if oneMatch is not None:
-                _match = oneMatch.group(1)
-                _minLength = min([len(x) for x in re.split("(\s*)",_match)])
-                if _minLength>2:
-                    _substr = ","
-                else:
-                    _substr = ""
-                text = text.replace("#subs"+str(subs)+"#"+oneMatch.group(1)+"#sube"+str(subs)+"#",re.sub("\s",_substr,oneMatch.group(1)))
-            else:
-                break
-    
-    
+
     #replace '"' with '“', otherwise the deepdive import breaks
     text = text.replace('"',"“").replace("\r","").replace("\n",",")
     text = re.sub("\s{4,}",",",text)   
     #normalize punctuation
-    while(True):
-        #替换连续的标点
-        punc = re.search(",(?P<punc>:|。|,|;)\s*",text)
-        if punc is not None:
-            text = re.sub(","+punc.group("punc")+"\s*",punc.group("punc"),text)
+
+    #collapse runs of consecutive punctuation
+
+    punc_pattern = "(?P<del>[。,;::,\s]+)"
+
+    list_punc = re.findall(punc_pattern,text)
+    list_punc.sort(key=lambda x:len(x),reverse=True)
+    for punc_del in list_punc:
+        if len(punc_del)>1:
+            text = re.sub(punc_del,punc_del[-1],text)
         
-        punc = re.search("(?P<punc>:|。|,|;)\s*,",text)
-        if punc is not None:
-            text = re.sub(punc.group("punc")+"\s*,",punc.group("punc"),text)
-        else:
-            #替换标点之后的空格
-            punc = re.search("(?P<punc>:|。|,|;)\s+",text)
-            if punc is not None:
-                text = re.sub(punc.group("punc")+"\s+",punc.group("punc"),text)
-            else:
-                break
+
     #collapse consecutive Chinese full stops into one
     text_split = text.split("。")
     text_split = [x for x in text_split if len(x)>0]
-    list_text = []
-    # for _t in text_split:
-    #     list_text.append(re.sub(")",")",re.sub("(","(",re.sub("\s*","",_t))))
     text = "。".join(text_split)
-    # text = text.replace(')',")").replace("(","(").replace("\s","")
-    #删除所有空格
+
+    # #删除标签中的所有空格
+    # for subs in subspaceList:
+    #     patten = "#subs"+str(subs)+"#(.*?)#sube"+str(subs)+"#"
+    #     while(True):
+    #         oneMatch = re.search(re.compile(patten),text)
+    #         if oneMatch is not None:
+    #             _match = oneMatch.group(1)
+    #             text = text.replace("#subs"+str(subs)+"#"+_match+"#sube"+str(subs)+"#",_match)
+    #         else:
+    #             break
+
     # overly long text raises errors, so it is processed in chunks
     LOOP_LEN = 10000
     LOOP_BEGIN = 0
     _text = ""
+
+
+
     if len(text)<10000000:
         while(LOOP_BEGIN<len(text)):
-            _text += re.sub(")",")",re.sub("(","(",re.sub("\s*","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
+            _text += re.sub(")",")",re.sub("(","(",re.sub("\s+","",text[LOOP_BEGIN:LOOP_BEGIN+LOOP_LEN])))
             LOOP_BEGIN += LOOP_LEN
-    else:
-        return text
-    # text = re.sub("\s*","",text)
-    # #替换中文括号为英文括号
-    # text = re.sub("(","(",text)
-    # text = re.sub(")",")",text)
-    return _text
+        text = _text
+
+    if final:
+        text = re.sub("##space##"," ",text)
+
+    return text
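
The old character-by-character while loop for merging punctuation is replaced above by a single findall-and-substitute pass over runs of punctuation and whitespace. A minimal standalone sketch of that collapse on a made-up sample string (only the idea is restated here; segment() additionally protects real spaces inside td/a/span/p tags with the ##space## placeholder until the final pass):

import re

def collapse_punct(text):
    # Collapse every run of consecutive punctuation/whitespace to its last character,
    # mirroring the punc_pattern logic added to segment() above.
    runs = re.findall(r"[。,;::,\s]+", text)
    runs.sort(key=lambda x: len(x), reverse=True)          # longest runs first, as in the diff
    for run in runs:
        if len(run) > 1:
            text = re.sub(re.escape(run), run[-1], text)   # the diff substitutes the raw run; escaped here for safety
    return text

print(collapse_punct("项目名称: ,,喷墨打印机。。,数量:1台"))   # -> 项目名称,喷墨打印机,数量:1台
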
 
 '''
 #数据清洗

+ 4 - 1
BiddingKG/dl/interface/getAttributes.py

@@ -833,7 +833,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity
                         if str(entity_before.label)=="1":
                             addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label])
                             #add pointer_money
-                            entity_before.pointer_money = entity_after
+                            entity_before.pointer_money = entity_money
                         break
                     p_entity_money -= 1
 
@@ -1177,6 +1177,9 @@ def getOtherAttributes(list_entity):
             dict_other["time_bidclose"] = timeFormat(entity.entity_text)
         elif entity.entity_type=="person" and entity.label ==4:
             dict_other["person_review"].append(entity.entity_text)
+        elif entity.entity_type=='product':
+            dict_other["product"].append(entity.entity_text)
+    dict_other["product"] = list(set(dict_other["product"]))
     return dict_other
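
The product list assembled here is deduplicated with list(set(...)), which also discards the order in which products were found. If stable output order ever matters, an order-preserving variant is trivial (hypothetical helper, not part of this diff):

def dedupe_keep_order(products):
    # Same deduplication as list(set(products)), but keeps first-seen order.
    seen = set()
    ordered = []
    for p in products:
        if p not in seen:
            seen.add(p)
            ordered.append(p)
    return ordered

print(dedupe_keep_order(["打印机", "喷墨打印机", "打印机"]))   # ['打印机', '喷墨打印机']
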
 
 

+ 1 - 1
BiddingKG/dl/interface/modelFactory.py

@@ -195,8 +195,8 @@ class Model_person_classify():
     '''
     
     def encode(self,tokens,begin_index,end_index,**kwargs):
+        # return embedding(spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=10),shape=(2,10,128))
         return embedding(spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=35),shape=(2,35,128))
-        # return embedding(spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=35),shape=(2,35,128))
 
     def predict(self,x):
         x = np.transpose(np.array(x),(1,0,2,3))
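
Each encoded sample above is a (2, 35, 128) pair of left/right context windows. The transpose in predict() regroups a whole batch so that, presumably, the left and right windows can be fed to the model's two inputs. A numpy-only sketch of the shape change (batch size 4 is arbitrary):

import numpy as np

batch = [np.zeros((2, 35, 128)) for _ in range(4)]   # 4 samples, each a (left, right) window pair
x = np.transpose(np.array(batch), (1, 0, 2, 3))      # (4, 2, 35, 128) -> (2, 4, 35, 128)
left, right = x[0], x[1]                             # two (4, 35, 128) tensors, one per input branch
print(x.shape, left.shape, right.shape)
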

+ 87 - 7
BiddingKG/dl/interface/predictor.py

@@ -16,6 +16,8 @@ sys.path.append(os.path.abspath("../.."))
 from BiddingKG.dl.common.Utils import *
 from BiddingKG.dl.interface.modelFactory import *
 import tensorflow as tf
+from tensorflow.python.framework import graph_util
+from BiddingKG.dl.product.data_util import decode, process_data, result_to_json
 from BiddingKG.dl.interface.Entitys import Entity
 
 from threading import RLock
@@ -223,7 +225,7 @@ class CodeNamePredict():
             list_entitys = [[] for _ in range(len(list_sentences))]
         for list_sentence,list_entity in zip(list_sentences,list_entitys):
             if len(list_sentence)==0:
-                result.append([list_sentence[0].doc_id,{"code":[],"name":""}])
+                result.append([{"code":[],"name":""}])
                 continue
             doc_id = list_sentence[0].doc_id
             # sentences = []
@@ -834,9 +836,9 @@ class FormPredictor():
         else:
             return self.getModel(type).predict(form_datas)
     
-    
+
 #角色规则
-#依据正则给所有无角色的实体赋予角色,给予等于阈值的最低概率    
+#依据正则给所有无角色的实体赋予角色,给予等于阈值的最低概率
 class RoleRulePredictor():
     
     def __init__(self):
@@ -1134,7 +1136,7 @@ class TimePredictor():
         self.sess = tf.Session(graph=tf.Graph())
         self.inputs_code = None
         self.outputs_code = None
-        self.input_shape = (2,30,60)
+        self.input_shape = (2,10,128)
         self.load_model()
 
     def load_model(self):
@@ -1168,10 +1170,13 @@ class TimePredictor():
                     while(p_sentences<len(list_sentence)):
                         sentence = list_sentence[p_sentences]
                         if entity.doc_id == sentence.doc_id and entity.sentence_index == sentence.sentence_index:
-                            left = sentence.sentence_text[max(0,entity.wordOffset_begin-self.input_shape[1]):entity.wordOffset_begin]
-                            right = sentence.sentence_text[entity.wordOffset_end:entity.wordOffset_end+self.input_shape[1]]
+                            # left = sentence.sentence_text[max(0,entity.wordOffset_begin-self.input_shape[1]):entity.wordOffset_begin]
+                            # right = sentence.sentence_text[entity.wordOffset_end:entity.wordOffset_end+self.input_shape[1]]
+                            s = spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=self.input_shape[1])
+                            left = s[0]
+                            right = s[1]
                             context = [left, right]
-                            x = embedding_word(context, shape=self.input_shape)
+                            x = embedding(context, shape=self.input_shape)
                             data_x.append(x)
                             points_entitys.append(entity)
                             break
@@ -1198,6 +1203,80 @@ class TimePredictor():
                     values.append(item)
                     entity.set_Role(label, values)
 
+# product field extraction
+class ProductPredictor():
+    def __init__(self):
+        self.sess = tf.Session(graph=tf.Graph())
+        self.load_model()
+
+    def load_model(self):
+        model_path = os.path.dirname(__file__)+'/product_savedmodel/product.pb'
+        with self.sess.as_default():
+            with self.sess.graph.as_default():
+                output_graph_def = tf.GraphDef()
+                with open(model_path, 'rb') as f:
+                    output_graph_def.ParseFromString(f.read())
+                    tf.import_graph_def(output_graph_def, name='')
+                    self.sess.run(tf.global_variables_initializer())
+                    self.char_input = self.sess.graph.get_tensor_by_name('CharInputs:0')
+                    self.length = self.sess.graph.get_tensor_by_name("Sum:0")
+                    self.dropout = self.sess.graph.get_tensor_by_name("Dropout:0")
+                    self.logit = self.sess.graph.get_tensor_by_name("logits/Reshape:0")
+                    self.tran = self.sess.graph.get_tensor_by_name("crf_loss/transitions:0")
+
+    def predict(self, list_sentences,list_entitys=None, MAX_AREA=5000):
+        '''
+        Predict product entities; each sentence keeps at most MAX_AREA characters, the rest is truncated.
+        :param list_sentences: sentence lists for multiple announcements, [[sentences of doc 1],[sentences of doc 2]]
+        :param list_entitys: entity lists for multiple announcements
+        :param MAX_AREA: maximum number of characters kept per sentence
+        :return: a list of {"product": [...]} per document; predicted entities are also appended to list_entitys
+        '''
+        with self.sess.as_default() as sess:
+            with self.sess.graph.as_default():
+                result = []
+                if list_entitys is None:
+                    list_entitys = [[] for _ in range(len(list_sentences))]
+                for list_sentence, list_entity in zip(list_sentences,list_entitys):
+                    if len(list_sentence)==0:
+                        result.append({"product":[]})
+                        continue
+                    list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
+                    _begin_index = 0
+                    item = {"product":[]}
+                    temp_list = []
+                    while True:
+                        MAX_LEN = len(list_sentence[_begin_index].sentence_text)
+                        if MAX_LEN > MAX_AREA:
+                            MAX_LEN = MAX_AREA
+                        _LEN = MAX_AREA//MAX_LEN
+                        chars = process_data([sentence.sentence_text[:MAX_LEN] for sentence in list_sentence[_begin_index:_begin_index+_LEN]])
+                        lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
+                                                          feed_dict={
+                                                                    self.char_input: np.asarray(chars),
+                                                                    self.dropout: 1.0
+                                                                    })
+                        batch_paths = decode(scores, lengths, tran_)
+                        for sentence, path, length in zip(list_sentence[_begin_index:_begin_index+_LEN],batch_paths, lengths):
+                            tags = ''.join([str(it) for it in path[:length]])
+                            for it in re.finditer("12*3", tags):
+                                start = it.start()
+                                end = it.end()
+                                _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
+                                sentence.doc_id, sentence.sentence_index, start, end),
+                                                 entity_text=sentence.sentence_text[start:end],
+                                                 entity_type="product", sentence_index=sentence.sentence_index,
+                                                 begin_index=0, end_index=0, wordOffset_begin=start,
+                                                 wordOffset_end=end)
+                                list_entity.append(_entity)
+                                temp_list.append(sentence.sentence_text[start:end])
+                        item["product"] = list(set(temp_list))
+                        result.append(item)
+                        if _begin_index+_LEN >= len(list_sentence):
+                            break
+                        _begin_index += _LEN
+                return result
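
A minimal usage sketch of the new ProductPredictor, following the calling convention used in the test scripts later in this diff (the exact output depends on the trained model, so the printed result is only indicative):

import BiddingKG.dl.interface.Preprocessing as Preprocessing
import BiddingKG.dl.interface.predictor as predictor

product_predictor = predictor.ProductPredictor()
list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(
    [["doc_1", "采购项目名称:喷墨打印机,规格型号:WF-7218,数量:1台", "", "", ""]], useselffool=True)
result = product_predictor.predict(list_sentences, list_entitys)
print(result)   # e.g. [{"product": ["喷墨打印机"]}]; the entities are also appended to list_entitys
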
+
 def getSavedModel():
     #predictor = FormPredictor()
     graph = tf.Graph()
@@ -1559,6 +1638,7 @@ def save_timesplit_model():
                                                "input1":time_model.input[1]},
                                        outputs={"outputs":time_model.output})
 
+
 if __name__=="__main__":
     #save_role_model()
     # save_codename_model()

BIN
BiddingKG/dl/interface/product_savedmodel/product.pb


BIN
BiddingKG/dl/interface/timesplit_model/saved_model.pb


BIN
BiddingKG/dl/interface/timesplit_model/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/interface/timesplit_model/variables/variables.index


BIN
BiddingKG/dl/product/data/dev_data.pkl


BIN
BiddingKG/dl/product/data/dev_data2.pkl


BIN
BiddingKG/dl/product/data/train_data.pkl


BIN
BiddingKG/dl/product/data/train_data2.pkl


+ 155 - 0
BiddingKG/dl/product/data_util.py

@@ -0,0 +1,155 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/1/13 0013 14:19
+import re
+import math
+import random
+import psycopg2
+import numpy as np
+from tensorflow.contrib.crf import viterbi_decode
+from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_word
+
+id_to_tag = {0:'O',1:'B',2:'I',3:'E'}
+word_model = getModel_word()
+vocab, matrix = getVocabAndMatrix(word_model, Embedding_size=60)
+word2id = {k: v for v, k in enumerate(vocab)}
+max_id = len(vocab)
+conn = psycopg2.connect(dbname='iepy_product', user='postgres', password='postgres', host='192.168.2.101')
+cursor = conn.cursor()
+
+def get_label_data():
+    sql = "select human_identifier, text from corpus_iedocument where edittime NOTNULL AND jump_signal=0 \
+      and creation_date > to_timestamp('2021-01-14 00:00:00','yyyy-MM-dd HH24:mi:ss');"
+    cursor.execute(sql)
+    writer = open('label_data.txt', 'w', encoding='utf-8')
+    datas = []
+    for row in cursor.fetchall():
+        docid = row[0]
+        text = row[1]
+        # string = list(text)
+        tags = [0]*len(text)
+        sql_lb = "select b.value from brat_bratannotation as b where document_id = '{}' and b.value like 'T%product%';".format(docid)
+        cursor.execute(sql_lb)
+        for row_lb in cursor.fetchall():
+            label = row_lb[0]
+            _, _, begin, end, _ = re.split('\s',label)
+            begin = int(begin)
+            end = int(end)
+            if end-begin>=2:
+                tags[begin]=1
+                tags[end-1]=3
+                for i in range(begin+1,end-1):
+                    tags[i]=2
+        # datas.append([string, tags])
+        text_sentence = []
+        ids_sentence = []
+        tag_sentence = []
+        for i in range(len(text)):
+            text_sentence.append(text[i])
+            ids_sentence.append(word2id.get(text[i], max_id))
+            tag_sentence.append(tags[i])
+            writer.write("%s\t%s\n"%(text[i],tags[i]))
+            if text[i] in ['。','?','!',';']:
+                writer.write('\n')
+                if text_sentence:
+                    if len(text_sentence) > 100:
+                    # if len(text_sentence)>5 and len(text_sentence)<1000:
+                        datas.append([text_sentence, ids_sentence,tag_sentence])
+                    elif len(text_sentence) > 5:
+                        continue
+                    else:
+                        print('单句小于5或大于100,句子长度为:%d,文章ID:%s'%(len(text_sentence), docid))
+                    text_sentence = []
+                    ids_sentence = []
+                    tag_sentence = []
+        if text_sentence:
+            if len(text_sentence) > 5:
+            # if len(text_sentence) > 5 and len(text_sentence) < 1000:
+                datas.append([text_sentence, ids_sentence, tag_sentence])
+            else:
+                print('单句小于5或大于100,句子长度为:%d,文章ID:%s' % (len(text_sentence), docid))
+    writer.close()
+    return datas
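
get_label_data() encodes every annotated span with the BIE scheme from id_to_tag (0:'O', 1:'B', 2:'I', 3:'E') and skips spans shorter than two characters. A worked example for a made-up sentence with one product span at offsets 4-9:

text = "本次采购喷墨打印机一台"
begin, end = 4, 9                      # the span "喷墨打印机" (begin inclusive, end exclusive, as in brat)
tags = [0] * len(text)
if end - begin >= 2:                   # same length filter as get_label_data()
    tags[begin] = 1                    # B
    tags[end - 1] = 3                  # E
    for i in range(begin + 1, end - 1):
        tags[i] = 2                    # I
print(list(zip(text, tags)))
# ..., ('喷', 1), ('墨', 2), ('打', 2), ('印', 2), ('机', 3), ('一', 0), ...
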
+
+def input_from_line(line):
+    string = list(line)
+    ids = [word2id.get(k, max_id) for k in string]
+    tags = []
+    return [[string], [ids], [tags]]
+def process_data(sentences):
+    '''
+    Convert sentence strings to character-id sequences and pad them to a uniform length.
+    :param sentences: list of sentence strings, e.g. ['招标公告','招标代理']
+    :return: id sequences padded to the length of the longest sentence
+    '''
+    maxLen = max([len(sentence) for sentence in sentences])
+    tags = [[word2id.get(k, max_id) for k in sentence] for sentence in sentences]
+    pad_tags = [tag[:maxLen]+[0]*(maxLen-len(tag)) for tag in tags]
+    return pad_tags
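
A quick usage sketch of process_data; the concrete ids depend on the word2vec vocabulary loaded above, so they are shown symbolically:

padded = process_data(["招标公告", "采购项目名称:喷墨打印机"])
# Both rows are padded with 0 to the length of the longest sentence (12 characters here):
# [[id(招), id(标), id(公), id(告), 0, 0, 0, 0, 0, 0, 0, 0],
#  [id(采), id(购), id(项), id(目), id(名), id(称), id(:), id(喷), id(墨), id(打), id(印), id(机)]]
print(len(padded[0]), len(padded[1]))   # 12 12
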
+
+def get_ner(BIE_tag):
+    ner = set()
+    for it in re.finditer('BI*E',BIE_tag):
+        ner.add((it.start(),it.end()))
+    return ner
+
+def decode(logits, lengths, matrix):
+    paths = []
+    small = -1000.0
+    start = np.asarray([[small]*4+[0]])
+    for score, length in zip(logits, lengths):
+        score = score[:length]
+        pad = small * np.ones([length, 1])
+        logits = np.concatenate([score, pad], axis=1)
+        logits = np.concatenate([start, logits], axis=0)
+        path, _  = viterbi_decode(logits, matrix)
+        paths.append(path[1:])
+    return paths
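
decode() prepends an artificial start tag to every score matrix before running viterbi_decode, so the transition matrix it expects is (num_tags+1) x (num_tags+1). A tiny smoke test with random scores (purely illustrative; a trained transition matrix would come from crf_loss/transitions):

import numpy as np

np.random.seed(0)
logits = np.random.rand(2, 7, 4)    # batch of 2 sentences, 7 steps, 4 tags (O/B/I/E)
lengths = [7, 5]                    # true lengths before padding
trans = np.random.rand(5, 5)        # 4 tags plus the artificial start tag
paths = decode(logits, lengths, trans)
print([len(p) for p in paths])      # [7, 5] - one predicted tag id per character
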
+
+def result_to_json(line, tags):
+    result = []
+    ner = []
+    tags = ''.join([str(it) for it in tags])
+    for it in re.finditer("12*3", tags):
+        start = it.start()
+        end = it.end()
+        ner.append([line[start:end], (start, end)])
+    result.append([line, ner])
+    print(tags)
+    return result
+
+
+class BatchManager(object):
+    def __init__(self, data, batch_size):
+        self.batch_data = self.sort_and_pad(data, batch_size)
+        self.len_data = len(self.batch_data)
+
+    def sort_and_pad(self, data, batch_size):
+        num_batch = int(math.ceil(len(data)/batch_size))
+        sorted_data = sorted(data, key=lambda x:len(x[0]))
+        print('最小句子长度:%d;最大句子长度:%d' % (len(sorted_data[0][0]), len(sorted_data[-1][0])))  # temporary: log min/max sentence length
+        batch_data = list()
+        for i in range(num_batch):
+            batch_data.append(self.pad_data(sorted_data[i*int(batch_size):(i+1)*int(batch_size)]))
+        return batch_data
+
+    @staticmethod
+    def pad_data(data):
+        strings = []
+        chars = []
+        targets = []
+        max_length = max([len(sentence[0]) for sentence in data])
+        for line in data:
+            string, char, target = line
+            padding = [0]*(max_length-len(string))
+            strings.append(string + padding)
+            chars.append(char + padding)
+            targets.append(target + padding)
+        return [strings, chars, targets]
+
+    def iter_batch(self, shuffle=False):
+        if shuffle:
+            random.shuffle(self.batch_data)
+        for idx in range(self.len_data):
+            yield self.batch_data[idx]
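
BatchManager sorts sentences by length and pads every batch to that batch's longest sentence. A usage sketch matching how main.py (below) consumes it; get_label_data() needs the annotation database configured at the top of this file:

data = get_label_data()                          # [[chars, char_ids, tags], ...]
manager = BatchManager(data, batch_size=128)
for strings, char_ids, tag_ids in manager.iter_batch(shuffle=True):
    # each of the three lists has shape batch_size x max_len_of_this_batch
    pass
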

+ 117 - 0
BiddingKG/dl/product/main.py

@@ -0,0 +1,117 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/1/13 0013 14:03 
+from BiddingKG.dl.product.product_model import Product_Model
+from BiddingKG.dl.product.data_util import BatchManager, get_label_data, id_to_tag, input_from_line, decode, result_to_json
+import numpy as np
+import tensorflow as tf
+import random
+import pickle
+import os
+
+def train():
+    # all_data = get_label_data()
+    # random.shuffle(all_data)
+    # train_data = all_data[:int(len(all_data)*0.85)]
+    # dev_data = all_data[int(len(all_data)*0.85):]
+    # with open('data/train_data2.pkl', 'wb') as f:
+    #     pickle.dump(train_data, f)
+    # with open('data/dev_data2.pkl', 'wb') as f:
+    #     pickle.dump(dev_data, f)
+
+    with open('data/train_data2.pkl', 'rb') as f:
+        train_data = pickle.load(f)
+    with open('data/dev_data2.pkl', 'rb') as f:
+        dev_data = pickle.load(f)
+
+    train_manager = BatchManager(train_data, batch_size=128)
+    dev_manager = BatchManager(dev_data, batch_size=64)
+
+    tf_config = tf.ConfigProto()
+    tf_config.gpu_options.allow_growth = True
+    steps_per_epoch = train_manager.len_data
+    ckpt_path = "model"
+    with tf.Session(config=tf_config) as sess:
+        model = Product_Model()
+        sess.run(tf.global_variables_initializer())
+        # ckpt = tf.train.get_checkpoint_state(ckpt_path)
+        # if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
+        #     model.saver.restore(sess, ckpt.model_checkpoint_path)
+        #     print("从文件加载原来模型数据",ckpt.model_checkpoint_path)
+
+        print('准备训练数据')
+        loss = []
+        mix_loss = 1000
+        max_f1 = 0
+        for i in range(100):
+            print('epochs:',i)
+            # model.evaluate(sess, data_manager=dev_manager, id_to_tag=id_to_tag)
+            # break
+            for batch in train_manager.iter_batch(shuffle=True):
+                # print('batch:',len(batch))
+                # step, batch_loss = model.run_step(sess, True, batch)
+                step, batch_loss = model.run_step(sess, 'train', batch)
+                loss.append(batch_loss)
+                if step % 10 == 0:
+                    iteration = step // steps_per_epoch + 1
+                    print('iter:{} step:{} loss:{}'.format(iteration, step, np.mean(loss)))
+            if i >= 50 or i%5==0:
+                f1, precision, recall, evl_loss = model.evaluate(sess, data_manager=dev_manager, id_to_tag=id_to_tag)
+                print('f1:%.4f, precision:%.4f, recall:%.4f, evl_loss:%.4f' % (f1, precision, recall, evl_loss))
+                if max_f1 < f1:
+                    model.saver.save(sess, os.path.join(ckpt_path, "ner2.ckpt"))
+                    print("model save .bast f1 is %.4f" % f1)
+                    max_f1 = f1
+                    # if np.mean(loss)<mix_loss:
+                    #     mix_loss = np.mean(loss)
+                    #     model.saver.save(sess, os.path.join(ckpt_path, "ner.ckpt"))
+                    #     print("model saved, loss is:",mix_loss)
+                loss = []
+
+def evaluate_line():
+    ckpt_path = "model"
+    with tf.Session() as sess:
+        model = Product_Model()
+        sess.run(tf.global_variables_initializer())
+        ckpt = tf.train.get_checkpoint_state(ckpt_path)
+        if ckpt and tf.train.checkpoint_exists(ckpt_path):
+            print('模型文件:',ckpt.model_checkpoint_path)
+            model.saver.restore(sess, ckpt.model_checkpoint_path)
+            print(model.logits, model.lengths, model.trans, model.dropout, model.char_inputs)
+            while True:
+                line = input("请输入测试句子:")
+                result = model.evaluate_line(sess, line)
+                print(result)
+def predict():
+    pb_path = "model/product.pb"
+    with tf.Graph().as_default():
+        output_graph_def = tf.GraphDef()
+        with open(pb_path, 'rb') as f:
+            output_graph_def.ParseFromString(f.read())
+            tf.import_graph_def(output_graph_def, name='')  # note: no name prefix may be added here
+            with tf.Session() as sess:
+                sess.run(tf.global_variables_initializer())
+                for node in output_graph_def.node:
+                    print(node.name)
+                char_input = sess.graph.get_tensor_by_name("CharInputs:0")
+                length = sess.graph.get_tensor_by_name("Sum:0")
+                dropout = sess.graph.get_tensor_by_name("Dropout:0")
+                logit = sess.graph.get_tensor_by_name("logits/Reshape:0")
+                tran = sess.graph.get_tensor_by_name("crf_loss/transitions:0")
+                while True:
+                    line = input("请输入测试句子:")
+                    _, chars, tags = input_from_line(line)
+                    print(chars)
+                    lengths, scores, tran_ = sess.run([length,logit,tran],feed_dict={char_input:np.asarray(chars),
+                                                                dropout:1.0
+                                                                } )
+                    batch_paths = decode(scores, lengths, tran_)
+                    tags = batch_paths[0]  # batch_paths[0][:lengths] would be wrong (lengths is a list)
+                    result = result_to_json(line, tags)
+                    print(result)
+
+if __name__ == "__main__":
+    # train()
+    # evaluate_line()
+    predict()
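
predict() above reads the frozen graph model/product.pb, but the export step itself is not part of this diff. A plausible sketch of how such a file could be produced from the ner2.ckpt checkpoint with graph_util.convert_variables_to_constants, assuming the output node names that predict() reads back (logits/Reshape, crf_loss/transitions, Sum):

import tensorflow as tf
from tensorflow.python.framework import graph_util
from BiddingKG.dl.product.product_model import Product_Model

def freeze_to_pb(ckpt_dir="model", pb_path="model/product.pb"):
    with tf.Session() as sess:
        model = Product_Model()
        sess.run(tf.global_variables_initializer())
        ckpt = tf.train.get_checkpoint_state(ckpt_dir)
        model.saver.restore(sess, ckpt.model_checkpoint_path)
        # Keep only the nodes that predict() fetches from the .pb file.
        frozen = graph_util.convert_variables_to_constants(
            sess, sess.graph_def, ["logits/Reshape", "crf_loss/transitions", "Sum"])
        with tf.gfile.GFile(pb_path, "wb") as f:
            f.write(frozen.SerializeToString())
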

+ 2 - 0
BiddingKG/dl/product/model/checkpoint

@@ -0,0 +1,2 @@
+model_checkpoint_path: "ner2.ckpt"
+all_model_checkpoint_paths: "ner2.ckpt"

BIN
BiddingKG/dl/product/model/ner2.ckpt.data-00000-of-00001


BIN
BiddingKG/dl/product/model/ner2.ckpt.index


BIN
BiddingKG/dl/product/model/ner2.ckpt.meta


BIN
BiddingKG/dl/product/model/product.pb


+ 240 - 0
BiddingKG/dl/product/product_model.py

@@ -0,0 +1,240 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/1/13 0013 10:12
+# from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_word
+from BiddingKG.dl.product.data_util import matrix,vocab,input_from_line,result_to_json,get_ner
+import tensorflow as tf
+import numpy as np
+from tensorflow.contrib.crf import crf_log_likelihood
+from tensorflow.contrib.crf import viterbi_decode
+from tensorflow.contrib.layers.python.layers import initializers
+
+# word_model = getModel_word()
+class Product_Model(object):
+    def __init__(self):
+        self.char_dim = 60
+        self.lstm_dim = 128
+        self.num_tags = 4
+        self.lr = 0.001
+        self.clip = 5.0
+        self.dropout_rate = 0.5
+        # vocab, matrix = getVocabAndMatrix(word_model, Embedding_size=60)
+        self.matrix = matrix
+        # self.word2id = {k:v for v,k in enumerate(self.vocab)}
+        self.num_chars = len(vocab)+1
+        self.emb_matrix = np.random.random((self.num_chars, self.char_dim))
+        self.emb_matrix[:self.num_chars-1:,:] = self.matrix
+
+
+        self.globel_step = tf.Variable(0, trainable=False)
+        self.best_dev_f1 = tf.Variable(0.0, trainable=False)
+        self.initializer = initializers.xavier_initializer()
+
+        self.char_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None],name='CharInputs')
+        self.targets = tf.placeholder(dtype=tf.int32, shape=[None, None],name='Targets')
+        self.dropout = tf.placeholder(dtype=tf.float32, name='Dropout')
+
+        used = tf.sign(tf.abs(self.char_inputs))
+        length = tf.reduce_sum(used, reduction_indices=1)
+        self.lengths = tf.cast(length, tf.int32)
+        self.batch_size = tf.shape(self.char_inputs)[0]
+        self.num_steps = tf.shape(self.char_inputs)[1]
+
+        with tf.variable_scope("char_embedding"):
+            self.char_lookup = tf.get_variable(
+                name="char_embedding",
+                # shape=[self.num_chars, self.char_dim],
+                initializer=np.array(self.emb_matrix,dtype=np.float32)
+            )
+        embed = tf.nn.embedding_lookup(self.char_lookup, self.char_inputs)
+
+        with tf.variable_scope("char_BiLSTM"):
+            lstm_cell = {}
+            for direction in ["forward", "backward"]:
+                with tf.variable_scope(direction):
+                    lstm_cell[direction] = tf.contrib.rnn.BasicLSTMCell(self.lstm_dim, state_is_tuple=True)
+            outputs, final_states = tf.nn.bidirectional_dynamic_rnn(
+                lstm_cell["forward"],
+                lstm_cell["backward"],
+                embed,
+                dtype=tf.float32,
+                sequence_length=self.lengths
+            )
+        outputs = tf.concat(outputs, axis=2)
+
+        with tf.variable_scope("project"):
+            with tf.variable_scope("hidden"):
+                W = tf.get_variable("W", shape=[self.lstm_dim*2, self.lstm_dim],
+                                    dtype=tf.float32,initializer=self.initializer)
+                b = tf.get_variable("b", shape=[self.lstm_dim],
+                                    dtype=tf.float32, initializer=self.initializer)
+                output = tf.reshape(outputs, shape=[-1, 2*self.lstm_dim])
+                hidden = tf.tanh(tf.nn.xw_plus_b(output, W, b))
+                hidden = tf.nn.dropout(hidden, keep_prob=self.dropout) # apply dropout
+
+        with tf.variable_scope("logits"):
+            W = tf.get_variable("W", shape=[self.lstm_dim, self.num_tags],
+                                dtype=tf.float32, initializer=self.initializer)
+            b = tf.get_variable("b", shape=[self.num_tags])
+            pred = tf.nn.xw_plus_b(hidden, W, b)
+            self.logits = tf.reshape(pred, [-1, self.num_steps, self.num_tags])
+
+        with tf.variable_scope("crf_loss"):
+            small = -1000.0
+            start_logits = tf.concat(
+                [small*tf.ones(shape=[self.batch_size,1,self.num_tags]), tf.zeros(shape=[self.batch_size,1,1])], axis=-1
+            )
+            pad_logits = tf.cast(small*tf.ones([self.batch_size, self.num_steps, 1]), tf.float32)
+            logits = tf.concat([self.logits, pad_logits], axis=-1)
+            logits = tf.concat([start_logits, logits], axis=1)
+            targets = tf.concat([tf.cast(self.num_tags*tf.ones([self.batch_size,1]),tf.int32), self.targets], axis=-1)
+
+            self.trans = tf.get_variable(
+                name="transitions",
+                shape=[self.num_tags+1, self.num_tags+1],
+                initializer=self.initializer
+            )
+            log_likelihood, self.trans = crf_log_likelihood(
+                inputs=logits,
+                tag_indices=targets,
+                transition_params=self.trans,
+                sequence_lengths=self.lengths+1
+            )
+            self.loss = tf.reduce_mean(-log_likelihood)
+
+        with tf.variable_scope("optimizer"):
+            self.opt = tf.train.AdamOptimizer(learning_rate=self.lr)
+            grads_vars = self.opt.compute_gradients(self.loss)
+            capped_grads_vars = [[tf.clip_by_value(g, -self.clip, self.clip), v] for g,v in grads_vars]
+            self.train_op = self.opt.apply_gradients(capped_grads_vars, self.globel_step)
+
+        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
+
+    def create_feed_dict(self, run_type, batch):
+        '''
+        :param run_type: one of 'train', 'dev', 'predict'; controls whether targets and dropout are fed
+        :param batch: [strings, char_ids, tags] as produced by BatchManager
+        :return: structured feed_dict for the corresponding run
+        '''
+        _, chars, tags = batch
+        feed_dict = {
+            self.char_inputs:np.asarray(chars),
+            self.dropout:1.0
+        }
+        assert run_type in ['train', 'dev', 'predict']
+        if run_type=='train':
+            feed_dict[self.targets] = np.asarray(tags)
+            feed_dict[self.dropout] = self.dropout_rate
+        elif run_type=='dev':
+            feed_dict[self.targets] = np.asarray(tags)
+        return feed_dict
+
+    def run_step(self, sess, run_type, batch):
+        assert run_type in ['train', 'dev', 'predict']
+        feed_dict = self.create_feed_dict(run_type, batch)
+        if run_type=='train':
+            global_step, loss, _ = sess.run(
+                [self.globel_step, self.loss, self.train_op],
+                feed_dict=feed_dict
+            )
+            return global_step, loss
+        elif run_type=='dev':
+            lengths ,logits, loss = sess.run([self.lengths, self.logits, self.loss], feed_dict)
+            return lengths, logits, loss
+        else:
+            lengths ,logits = sess.run([self.lengths, self.logits], feed_dict)
+            return lengths, logits
+
+    def run_step_backup(self, sess, is_train, batch):
+        feed_dict = self.create_feed_dict(is_train, batch)
+        if is_train:
+            global_step, loss, _ = sess.run(
+                [self.globel_step, self.loss, self.train_op],
+                feed_dict=feed_dict
+            )
+            return global_step, loss
+        else:
+            lengths ,logits, loss = sess.run([self.lengths, self.logits, self.loss], feed_dict)
+            return lengths, logits, loss
+
+    def decode(self, logits, lengths, matrix):
+        paths = []
+        small = -1000.0
+        start = np.asarray([[small]*self.num_tags+[0]])
+        for score, length in zip(logits, lengths):
+            score = score[:length]
+            pad = small * np.ones([length, 1])
+            logits = np.concatenate([score, pad], axis=1)
+            logits = np.concatenate([start, logits], axis=0)
+            path, _  = viterbi_decode(logits, matrix)
+            paths.append(path[1:])
+        return paths
+
+    def evaluate(self, sess, data_manager, id_to_tag):
+        results = []
+        trans = self.trans.eval()
+        Precision = []
+        Recall = []
+        F1 = []
+        loss = []
+        pred_num = 0
+        gold_num = 0
+        equal_num = 0
+        for batch in data_manager.iter_batch():
+            strings = batch[0]
+            tags = batch[-1]
+            # lengths, scores, batch_loss = self.run_step(sess, False, batch)
+            lengths, scores, batch_loss = self.run_step(sess, 'dev', batch)
+            loss.append(batch_loss)
+            batch_paths = self.decode(scores, lengths, trans)
+            for i in range(len(strings)):
+                result = []
+                string = strings[i][:lengths[i]]
+                gold = [id_to_tag[int(x)] for x in tags[i][:lengths[i]]]
+                pred = [id_to_tag[int(x)] for x in batch_paths[i][:lengths[i]]]
+                gold_ner = get_ner("".join(gold))
+                pred_ner = get_ner("".join(pred))
+                # print('标签实体:',gold_ner)
+                # print('预测实体:',pred_ner)
+                pred_num += len(pred_ner)
+                gold_num += len(gold_ner)
+                equal_num += len(gold_ner&pred_ner)
+                # precision_temp = len(gold_ner&pred_ner)/(len(pred_ner)+1e-10)
+                # recall_temp = len(gold_ner&pred_ner)/(len(gold_ner)+1e-10)
+                # f1_temp = 2*(precision_temp*recall_temp)/(precision_temp+recall_temp+1e-10)
+                # Precision.append(precision_temp)
+                # Recall.append(recall_temp)
+                # F1.append(f1_temp)
+
+                # for char, gold, pred in zip(string, gold, pred):
+                #     result.append(" ".join([char, gold, pred]))
+                # results.append(result)
+                # with open('evaluate_result.txt','w', encoding='utf-8') as f:
+                #     for rs in results:
+                #         for line in rs:
+                #             f.write(line+'\n')
+                #         f.write('\n')
+
+        # return sum(F1)/len(F1),sum(Precision)/len(Precision),sum(Recall)/len(Recall)
+        precision = equal_num/(pred_num+1e-10)
+        recall = equal_num/(gold_num+1e-10)
+        f1 = 2*(precision*recall)/(precision+recall+1e-10)
+        return f1, precision, recall, np.mean(loss)
+
+
+    def evaluate_line(self, sess, line):
+        trans = self.trans.eval(session=sess)
+        # lengths, scores = self.run_step(sess, False, input_from_line(line))
+        lengths, scores = self.run_step(sess, 'predict', input_from_line(line))
+        batch_paths = self.decode(scores, lengths, trans)
+        tags = batch_paths[0]  # batch_paths[0][:lengths] would be wrong (lengths is a list)
+        return result_to_json(line, tags)
+
+
+
+
+
+
+
+
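For readers following the crf_loss block above: the logits get an artificial fifth tag column and an extra start step before crf_log_likelihood is called, so every shape grows by one. A numpy walk-through of the shapes with batch_size=2, num_steps=3, num_tags=4 (illustration only):

import numpy as np

B, T, K = 2, 3, 4                                                      # batch, steps, tags
small = -1000.0
logits = np.zeros((B, T, K))
start_logits = np.concatenate([small * np.ones((B, 1, K)),
                               np.zeros((B, 1, 1))], axis=-1)          # (B, 1, K+1)
pad_logits = small * np.ones((B, T, 1))                                # (B, T, 1)
logits_full = np.concatenate([start_logits,
                              np.concatenate([logits, pad_logits], axis=-1)], axis=1)   # (B, T+1, K+1)
targets = np.concatenate([K * np.ones((B, 1), dtype=int),
                          np.zeros((B, T), dtype=int)], axis=-1)       # (B, T+1); step 0 holds the start tag K
print(logits_full.shape, targets.shape)                                # (2, 4, 5) (2, 4)
# crf_log_likelihood is then called with sequence_lengths = lengths + 1 to account for the start step.
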

+ 14 - 5
BiddingKG/dl/test/test4.py

@@ -23,7 +23,8 @@ import BiddingKG.dl.interface.predictor as predictor
 import BiddingKG.dl.interface.Preprocessing as Preprocessing
 import BiddingKG.dl.interface.getAttributes as getAttributes
 import BiddingKG.dl.entityLink.entityLink as entityLink
-import BiddingKG.dl.complaint.punish_rule as punish_rule
+# import BiddingKG.dl.complaint.punish_rule as punish_rule
+import BiddingKG.dl.complaint.punish_predictor as punish_rule
 import json
 
 
@@ -51,7 +52,7 @@ epcPredict = predictor.EPCPredict()
 roleRulePredict = predictor.RoleRulePredictor()
 timePredict = predictor.TimePredictor()
 punish = punish_rule.Punish_Extract()
-
+productPredict = predictor.ProductPredictor()
 
 #自定义jsonEncoder
 class MyEncoder(json.JSONEncoder):
@@ -68,8 +69,8 @@ class MyEncoder(json.JSONEncoder):
         return json.JSONEncoder.default(self, obj)
 
 
-def predict(doc_id,text):
-    list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[doc_id,text,"","",""]],useselffool=True)
+def predict(doc_id,text,title=""):
+    list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[doc_id,text,"","",title]],useselffool=True)
     for articles in list_articles:
         print(articles.content)
 
@@ -79,6 +80,7 @@ def predict(doc_id,text):
     codeName = codeNamePredict.predict(list_sentences,list_entitys=list_entitys)
     print(codeName)
     premPredict.predict(list_sentences,list_entitys)
+    productPredict.predict(list_sentences,list_entitys)
     # roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
     print("epcPredict")
     epcPredict.predict(list_sentences,list_entitys)
@@ -135,7 +137,14 @@ if __name__=="__main__":
     # 二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
     a = time.time()
     # text = '''
-    # ,光大证券统一认证系统服务器硬件设备更新项目中标候选人公示,项目名称:光大证券统一认证系统服务器硬件设备更新项目,招标编号:CG-202011-030-001,公告日期:2020年12月3日,评标日期:2020年11月30日13时32分,评标地点:光大证券集中采购管理平台,推荐中标候选人:上海致为信息技术有限公司,联系人:殷志超,联系电话:021-22169419
+    # ,清远市清新区治理道路货物运输车辆非法超限超载工作领导小组清远市清新区治理道路货物运输车辆非法超限超载工作领导小组喷墨打印机网上商城合同
+    # 验收报告,一、合同编号:GDMALL2019123563,。二、合同名称:清远市清新区治理道路货物运输车辆非法超限超载工作领导小组喷墨打印机网上商城合同。
+    # 三、中标、成交供应商:广州爱联科技有限公司,地址:广州市黄埔大道西468号勤建商务大厦14层。联系人:周勇联系电话:020-85180120,。
+    # 四、合同金额(元):¥3,270.00,。五、合同详细信息:。采购项目编号::441827-201910-531001-0013,中标/成交标的名称::喷墨打印机,
+    # 数量::1台。采购项目名称::喷墨打印机,规格型号::WF-7218,中标/成交金额(元)::3,270.00。服务要求::,。,。六、验收结论:已通过。
+    # 七、验收小组成员名单::。八、联系事项:。(一)采购人:清远市清新区治理道路货物运输车辆非法超限超载工作领导小组,地址:太和镇玄真路49号。
+    # 联系人:苏美彩,联系电话:0763-5835988,。(二)采购代理机构:地址::。联系人:联系电话::。附件::。
+    # 发布人:清远市清新区治理道路货物运输车辆非法超限超载工作领导小组。发布时间:2019年11月26日
     # '''
     print("start")
     print(predict("12",content))

+ 278 - 0
BiddingKG/dl/test/测试所有提取信息.py

@@ -0,0 +1,278 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/1/11 0011 13:52 
+
+'''
+Created on 2019年1月4日
+
+@author: User
+'''
+
+from bs4 import BeautifulSoup, Comment
+import copy
+import re
+import sys
+import os
+import codecs
+import requests
+import time
+
+_time1 = time.time()
+sys.path.append(os.path.abspath("../.."))
+import fool
+from BiddingKG.dl.interface.Connection import *
+from BiddingKG.dl.common.Utils import *
+from BiddingKG.dl.interface.Connection import getConnection
+import BiddingKG.dl.interface.predictor as predictor
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+import BiddingKG.dl.interface.getAttributes as getAttributes
+import BiddingKG.dl.entityLink.entityLink as entityLink
+import BiddingKG.dl.complaint.punish_predictor as punish_predictor
+# import BiddingKG.dl.complaint.punish_rule as punish_predictor
+import json
+
+'''
+doc_id = '3537ed94-9c7e-11e8-aed5-44a84246dbba'
+
+conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
+
+cursor = conn.cursor()
+
+cursor.execute(" select content from articles where id='"+doc_id+"' ")
+
+row = cursor.fetchall()[0]
+
+
+#text = codecs.open("C:\\Users\\User\\Desktop\\1.html","r",encoding="utf8").read()
+
+#content = segment(tableToText(BeautifulSoup(row[0],"lxml")))
+'''
+
+''''''
+codeNamePredict = predictor.CodeNamePredict()
+premPredict = predictor.PREMPredict()
+epcPredict = predictor.EPCPredict()
+roleRulePredict = predictor.RoleRulePredictor()
+timePredict = predictor.TimePredictor()
+# punish = punish_rule.Punish_Extract()
+punish = punish_predictor.Punish_Extract()
+productPredict = predictor.ProductPredictor()
+
+# 自定义jsonEncoder
+class MyEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        elif isinstance(obj, bytes):
+            return str(obj, encoding='utf-8')
+        elif isinstance(obj, (np.float_, np.float16, np.float32,
+                              np.float64)):
+            return float(obj)
+        elif isinstance(obj, str):
+            return obj
+        return json.JSONEncoder.default(self, obj)
+
+
+def predict(doc_id, text, title=""):
+    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", title]],
+                                                                                    useselffool=True)
+    for articles in list_articles:
+        print(articles.content)
+
+    ''''''
+
+    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
+    print(codeName)
+    premPredict.predict(list_sentences, list_entitys)
+    # roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
+    print("epcPredict")
+    epcPredict.predict(list_sentences, list_entitys)
+    print("entityLink")
+    timePredict.predict(list_sentences, list_entitys)
+    print("timePredict")
+    entityLink.link_entitys(list_entitys)
+    print("getPREMs")
+    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
+    print("getPREMs")
+    list_punish_dic = punish.get_punish_extracts(list_articles, list_sentences, list_entitys)
+    product = productPredict.predict(list_sentences,list_entitys)
+
+    for entitys in list_entitys:
+        for entity in entitys:
+            print(entity.entity_text, entity.entity_type, entity.label, entity.values, entity.sentence_index,
+                  entity.begin_index, entity.end_index, entity.wordOffset_begin, entity.wordOffset_end,entity.sentence_index)
+    # print(prem)
+    return json.dumps(Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic),product)[0],
+                      cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)   # list_punish_dic
+
+
+def predict_back(doc_id, html):
+    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, html, "", "", ""]],
+                                                                                    useselffool=True)
+    for articles in list_articles:
+        print(articles.content)
+
+    ''''''
+
+    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)  # predict project code and name
+    print(codeName)
+    premPredict.predict(list_sentences, list_entitys)  # role/money model
+    roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName) # role rules
+    print("epcPredict")
+    epcPredict.predict(list_sentences, list_entitys)  # contact-person model
+    print("entityLink")
+    timePredict.predict(list_sentences, list_entitys) # time-category model
+    print("timePredict")
+    entityLink.link_entitys(list_entitys) #
+    print("getPREMs")
+    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles) # find bid packages and link package numbers to the other elements
+    print("getPREMs")
+    # punish_dic = punish.get_punish_extracts(list_sentences, list_entitys, title=title, text=list_articles[0].content)
+    list_punish_dic = punish.get_punish_extracts(list_articles, list_sentences, list_entitys)
+    # punish_dic = punish.get_punish_extracts(list_articles,list_sentences, list_entitys)
+    # print(punish_dic)
+    # prem[0][1]['punish'] = punish_dic
+
+    # bidway = []  # 招标方式
+    # moneySource = []  # 资金来源
+    # servicetime = []  # 服务时间
+    # time_release = []  # 发布时间
+    # time_bidopen = []  # 开标时间
+    # time_bidclose = []  # 截标时间
+    # for entity in list_entitys[0]:
+    #     if entity.entity_type == 'bidway':
+    #         bidway.append(entity.entity_text)
+    #     elif entity.entity_type == 'moneySource':
+    #         moneySource.append(entity.entity_text)
+    #     elif entity.entity_type == 'servicetime':
+    #         servicetime.append(entity.entity_text)
+    #     elif entity.entity_type == 'time' and entity.label == 1:
+    #         time_release.append(entity.entity_text)
+    #     elif entity.entity_type == 'time' and entity.label == 2:
+    #         time_bidopen.append(entity.entity_text)
+    #     elif entity.entity_type == 'time' and entity.label == 3:
+    #         time_bidclose.append(entity.entity_text)
+    #
+    # prem[0][1]['bidway'] = ';'.join(set(bidway))
+    # prem[0][1]['moneySource'] = ';'.join(set(moneySource))
+    # prem[0][1]['servicetime'] = ';'.join(set(servicetime))
+    # prem[0][1]['time_release'] = ';'.join(set(time_release))
+    # prem[0][1]['time_bidopen'] = ';'.join(set(time_bidopen))
+    # prem[0][1]['time_bidclose'] = ';'.join(set(time_bidclose))
+    #
+    # ''''''
+    #
+    # for entitys in list_entitys:
+    #     for entity in entitys:
+    #         print(entity.entity_text, entity.entity_type, entity.label, entity.values, entity.sentence_index,
+    #               entity.begin_index, entity.end_index, entity.wordOffset_begin, entity.wordOffset_end)
+    #
+    # print(prem)
+    return json.dumps(Preprocessing.union_result(Preprocessing.union_result(codeName, prem), list_punish_dic)[0],
+               cls=MyEncoder, sort_keys=True, indent=4, ensure_ascii=False)
+
+    # return json.dumps(Preprocessing.union_result(codeName, prem)[0][1], cls=MyEncoder, sort_keys=True, indent=4,
+    #                   ensure_ascii=False)
+
+
+def test(name, content):
+    user = {
+        "content": content,
+        "id": name
+    }
+    myheaders = {'Content-Type': 'application/json'}
+    _resp = requests.post("http://192.168.2.101:15015" + '/article_extract', json=user, headers=myheaders, verify=True)
+    resp_json = _resp.content.decode("utf-8")
+    print(resp_json)
+    return resp_json
+
+
+if __name__ == "__main__":
+    from tablestore import *
+    endpoint = 'https://bxkc-ots.cn-hangzhou.ots.aliyuncs.com'
+    access_key_id = 'LTAI4GJxbioV1y2WM3XcZTmP'
+    access_key_secret = 'B3VITMoCnKtTQE6eAkDkat6UNFHped'
+    instance_name = 'bxkc-ots'
+    ots_client = OTSClient(endpoint, access_key_id, access_key_secret, instance_name)
+
+    def get_data(query, max_rows, table_name='document',
+                 index_name='document_index',
+                 column_names=['docid', 'dochtmlcon','doctitle', 'info_type', 'page_time'],
+                 sorters=[FieldSort("page_time", SortOrder.DESC), FieldSort("docid", SortOrder.DESC)]):
+        '''
+        Query data from Aliyun OTS (Tablestore).
+        :param query: the search query
+        :param max_rows: maximum number of rows to return
+        :param table_name: table name
+        :param index_name: search index name
+        :param column_names: columns to return
+        :param sorters: list of sort rules
+        :return: list of processed rows
+        '''
+        next_token = None
+        data = []
+        all_rows = []
+        rows, next_token, total_count, is_all_succeed = \
+            ots_client.search(table_name,
+                              index_name,
+                              SearchQuery(query,
+                                          next_token=next_token,
+                                          sort=Sort(sorters=sorters),  # ASC升序
+                                          limit=100,
+                                          get_total_count=True),
+                              ColumnsToGet(column_names=column_names,
+                                           return_type=ColumnReturnType.SPECIFIED))
+        all_rows.extend(rows)
+        while next_token:
+            rows, next_token, total_count, is_all_succeed = \
+                ots_client.search(table_name,
+                                  index_name,
+                                  SearchQuery(query,
+                                              next_token=next_token,
+                                              sort=None,
+                                              limit=100,
+                                              get_total_count=True),
+                                  ColumnsToGet(column_names=column_names,
+                                               return_type=ColumnReturnType.SPECIFIED))
+            all_rows.extend(rows)
+            if len(all_rows) > max_rows:
+                print('已获取%d条数据' % len(all_rows))
+                break
+
+        if all_rows:
+            for row in all_rows:
+                tmp = []
+                tmp.append(row[0][1][1])
+                for tup in row[1]:
+                    tmp.append(tup[1])
+                data.append(tmp)
+        return data
+
+
+    bool_query = TermQuery('docid','124113339')
+    # bool_query = BoolQuery(
+    #     must_queries=[TermsQuery(field_name='info_type', column_values=['办公设备', '计算机设备']),
+    #                   RangeQuery('page_time', range_from='2020-11-01', range_to='2020-11-31')]
+    # )
+
+    data = get_data(bool_query, 1)
+    print(data)
+    docid = str(data[0][0])
+    html = data[0][1]
+    title = data[0][2]
+    # text = '中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天。\
+    # 投诉处理公告,投诉人:张三。文章编号:京财采投字(2018)第42号。政府采购项目招标方式:公开招标,联系人:黎明。\
+    # 建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,\
+    # 二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
+    # docid = ""
+    # html = '首都医科大学附属北京地坛医院1.5T核磁共振、16排CT和血管造影机维保服务医疗设备维修和保养服务采购项目政府采购合同公告'
+    # html = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
+
+    a = time.time()
+    print("start")
+    # print(predict('12',text))
+    print(predict(docid, html,title=""))
+    # test("12",text)
+    print("takes", time.time() - a)
+    pass

+ 374 - 0
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -0,0 +1,374 @@
+'''
+Created on 2019-01-04
+
+@author: User
+'''
+
+from bs4 import BeautifulSoup, Comment
+import copy
+import re
+import sys
+import os
+import codecs
+import requests
+import time
+
+_time1 = time.time()
+sys.path.append(os.path.abspath("../.."))
+print('当前路径为:',os.getcwd())
+print('sys.path',sys.path)
+import fool
+from BiddingKG.dl.interface.Connection import *
+from BiddingKG.dl.common.Utils import *
+from BiddingKG.dl.interface.Connection import getConnection
+import BiddingKG.dl.interface.predictor as predictor
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+import BiddingKG.dl.interface.getAttributes as getAttributes
+import BiddingKG.dl.entityLink.entityLink as entityLink
+import json
+
+
+'''
+doc_id = '3537ed94-9c7e-11e8-aed5-44a84246dbba'
+
+conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
+
+cursor = conn.cursor()
+
+cursor.execute(" select content from articles where id='"+doc_id+"' ")
+
+row = cursor.fetchall()[0]
+
+
+#text = codecs.open("C:\\Users\\User\\Desktop\\1.html","r",encoding="utf8").read()
+
+#content = segment(tableToText(BeautifulSoup(row[0],"lxml")))
+'''
+
+'''''' 
+codeNamePredict = predictor.CodeNamePredict()
+premPredict = predictor.PREMPredict()
+epcPredict = predictor.EPCPredict()
+roleRulePredict = predictor.RoleRulePredictor()
+
+# custom JSON encoder
+class MyEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        elif isinstance(obj, bytes):
+            return str(obj, encoding='utf-8')
+        elif isinstance(obj, (np.float_, np.float16, np.float32, 
+        np.float64)):
+            return float(obj)
+        elif isinstance(obj,str):
+            return obj
+        return json.JSONEncoder.default(self, obj)
+
+
+def predict(doc_id,text):
+    list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[doc_id,text,"","",""]],useselffool=True)
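+    # pipeline below: code/name prediction -> PREM role model -> rule-based role correction -> EPC (person/phone) -> entity linking -> PREM assembly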
+    # for articles in list_articles:
+    #     print('预处理后文本信息')
+    #     print(articles.content)
+
+
+    ''''''
+        
+    codeName = codeNamePredict.predict(list_sentences,list_entitys=list_entitys)
+    # print(codeName)
+    premPredict.predict(list_sentences,list_entitys)
+    roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
+    # print("epcPredict")
+    epcPredict.predict(list_sentences,list_entitys)
+    # print("entityLink")
+    entityLink.link_entitys(list_entitys)
+    # print("getPREMs")
+    prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
+    # print("getPREMs")
+    
+    
+    ''''''
+    
+    entitys_all = [[[entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.begin_index,entity.end_index] for entity in entitys] for entitys in list_entitys]
+    for entitys in entitys_all:
+        # print(entitys)
+        # en_types = set([it[1] for it in entitys])
+        print([(it[0],it[1], it[2],it[3][it[2]],it[4],it[5],it[6]) for it in entitys if it[1] in ('org', 'company', 'person')])
+        # print([it for it in entitys if it[1] in ('org','company','person')])
+        # for en_type in en_types:
+        #     print('***************************************')
+        #     print(en_type)
+        #     print([(it[0],it[2],it[3]) for it in entitys if it[1]==en_type])
+
+    # for entitys in list_entitys:
+    #     for entity in entitys:
+    #         print('**********实体信息****************')
+    #         print(entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.wordOffset_begin,entity.wordOffset_end)
+
+    #print(prem)
+    return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
+
+         
+# def test(name,content):
+#     user = {
+#             "content": content,
+#             "id":name
+#             }
+#     myheaders = {'Content-Type': 'application/json'}
+#     _resp = requests.post("http://192.168.2.101:15015" + '/article_extract', json=user, headers=myheaders, verify=True)
+#     resp_json = _resp.content.decode("utf-8")
+#     print(resp_json)
+#     return resp_json
+def get_result_online(docid):
+    import psycopg2
+    conn = psycopg2.connect(dbname='iepy', user='postgres', password='postgres', host='192.168.2.101')
+    cursor = conn.cursor()
+    sql = """select human_identifier,sourcetext from corpus_iedocument where human_identifier in ('{0}');""".format(docid)
+    cursor.execute(sql)
+    rows = cursor.fetchall()
+    user = {
+            "content": rows[0][1],
+            "id":docid
+            }
+    myheaders = {'Content-Type': 'application/json'}
+    _resp = requests.post("http://192.168.2.101:15030" + '/article_extract', json=user, headers=myheaders, verify=True)  # 15015  #最新模型15030
+    resp_json = _resp.content.decode("utf-8")
+    return json.loads(resp_json)
+
+def get_result(docid):
+    import psycopg2
+    conn = psycopg2.connect(dbname='iepy', user='postgres', password='postgres', host='192.168.2.101')
+    cursor = conn.cursor()
+    sql = """select human_identifier,sourcetext from corpus_iedocument where human_identifier in ('{0}');""".format(docid)
+    cursor.execute(sql)
+    rows = cursor.fetchall()
+    return(json.loads(predict(docid, rows[0][1])))
+
+def analys_person_phone():
+    import pandas as pd
+    import time
+    t1 = time.time()
+    df = pd.read_excel(r'E:\workspace\BiddingKG\BiddingKG\dl\person\实习生标注信息角色联系人电话.xlsx', encoding='utf-8')
+    lab_num = pos_num = pre_num = 0
+    lab_num2 = pos_num2 = pre_num2 = 0
+    lab_person = pos_person = pre_person = 0
+    lab_role = pos_role = pre_role = 0
+    person_errors = []
+    phone_errors = []
+    join_errors = []
+    person_name_errors =[]
+    role_name_errors =[]
+    for docid in set(df['doc_id']):
+        print('开始处理 : ',docid)
+        df_tmp = df[df.loc[:, 'doc_id'] == docid]
+        values = list(df_tmp['value'])
+        a = [it.split() for it in values]
+        rel_person = [it for it in a if it[1] == 'rel_person']
+        rel_phone = [it for it in a if it[1] == 'rel_phone']
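+        # each `value` appears to be a whitespace-separated brat-style record ([ann_id, type, arg refs..., text]);
+        # rel_person links a role to a person and rel_phone links a person to a phone via their 'Xxx:Tn' argument ids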
+        r1 = get_result(str(docid))
+        # r1 = get_result_online(str(docid))
+        label_role_person = []  # labelled role + contact person pairs
+        for rel in rel_person:
+            role = [it for it in a if it[0] == rel[2].split(':')[-1]]
+            person = [it for it in a if it[0] == rel[3].split(':')[-1]]
+            if person != [] and role != []:
+                label_role_person.append(role[0][-1] +'+'+ person[0][-1])
+        label_person_phone = []  # labelled contact person + phone pairs
+        for rel in rel_phone:
+            person = [it for it in a if it[0] == rel[2].split(':')[-1]]
+            phone = [it for it in a if it[0] == rel[3].split(':')[-1]]
+            if person != [] and phone != []:
+                label_person_phone.append(person[0][-1] +'+'+ phone[0][-1])
+        role_person = []
+        person_phone = []
+        if r1.get('success','')==False:
+            print(docid, '接口返回失败 ')
+        else:
+            for v in r1['prem'].values():
+                roleList = v['roleList']
+                for role in roleList:
+                    for it in role[3]:
+                        role_person.append(role[1] +'+'+ it[0])
+                for role in roleList:
+                    for it in role[3]:
+                        person_phone.append(it[0] +'+'+ it[1])
+                    # print(set(label_person_phone))
+            # print(set(person_phone))
+        pos_num += len(set(role_person) & set(label_role_person))
+        lab_num += len(set(label_role_person))
+        pre_num += len(set(role_person))
+        if set(role_person)&set(label_role_person) != set(label_role_person):
+            person_errors.append([docid, set(label_role_person), set(role_person)])
+            # logic for checking role+person correctness: 1) check whether every predicted role is among the labelled roles, 2) check whether the predicted persons are among the labelled persons
+            # print(set(role_person))
+            # print(set(label_role_person))
+        if set(label_person_phone) & set(person_phone)!=set(label_person_phone):
+            phone_errors.append([docid, set(label_person_phone), set(person_phone)])
+        pos_num2 += len(set(label_person_phone) & set(person_phone))
+        lab_num2 += len(set(label_person_phone))
+        pre_num2 += len(set(person_phone))
+
+        lab_person += len(set([it.split('+')[1] for it in label_role_person]))
+        pos_person += len(set([it.split('+')[1] for it in label_role_person])&set([it.split('+')[1] for it in role_person]))
+        pre_person += len(set([it.split('+')[1] for it in role_person]))
+
+        lab_role += len(set([it.split('+')[0] for it in label_role_person]))
+        pos_role += len(set([it.split('+')[0] for it in label_role_person])&set([it.split('+')[0] for it in role_person]))
+        pre_role += len(set([it.split('+')[0] for it in role_person]))
+
+        if set([it.split('+')[0] for it in label_role_person]) != set([it.split('+')[0] for it in role_person]):
+            if set([it.split('+')[1] for it in label_role_person]) != set([it.split('+')[1] for it in role_person]):
+                person_name_errors.append([docid,set(label_role_person), set(role_person)])
+            else:
+                role_name_errors.append([docid, set(label_role_person), set(role_person)])
+        else:
+            if set([it.split('+')[1] for it in label_role_person]) != set([it.split('+')[1] for it in role_person]):
+                person_name_errors.append([docid, set(label_role_person), set(role_person)])
+            elif set(label_role_person)!= set(role_person):
+                print(docid,set(label_role_person), set(role_person))
+                join_errors.append([docid,set(label_role_person), set(role_person)])
+    print('单独角色召回率:%.4f,准确率:%.4f'%(pos_role/lab_role, pos_role/pre_role))
+    print('单独联系人召回率:%.4f, 准确率:%.4f'%(pos_person/lab_person, pos_person/pre_person))
+    print('联系人召回率:%.4f, 准确率:%.4f' % (pos_num / lab_num, pos_num / pre_num))
+    print('电话召回率:%.4f,准确率:%.4f' % (pos_num2 / lab_num2, pos_num2 / pre_num2))
+    print('总耗时:',time.time()-t1)
+    return person_errors, phone_errors, join_errors, role_name_errors, person_name_errors
+
+def predict_fromdb(docid, dbname="sys_document_23"):
+    # import pymysql
+    # conn = pymysql.Connect(host='rm-bp1quo50k0q2ok73gi.mysql.rds.aliyuncs.com', port=3306, db='bxkc', user='bxkc_read', passwd='bxkc_20RE18AD') #新账号密码
+    # cursor = conn.cursor()
+    # sql = "SELECT  docid as id, dochtmlcon as content  from {1} WHERE DOCID='{0}';".format(docid, dbname)
+    import psycopg2
+    conn = psycopg2.connect(dbname='iepy', user='postgres', password='postgres', host='192.168.2.101')
+    cursor = conn.cursor()
+    sql = """select human_identifier as id,sourcetext as content from corpus_iedocument where human_identifier in ('{0}');""".format(docid)
+    cursor.execute(sql)
+    rows = cursor.fetchall()
+    doc_id = rows[0][0]
+    text = rows[0][1]
+    # text = '竟然很明显的表达没识别为代理,代理机构名称:国信国采(北京)招标咨询有限责任公司,代理机构地址:北京市海淀区首体南路22号国兴大厦11层,  1.采购人信息名 称:北京市植物园。'
+    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", ""]],useselffool=True)
+    codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
+    # print(codeName)
+    premPredict.predict(list_sentences, list_entitys)
+    roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
+    # print("epcPredict")
+    epcPredict.predict(list_sentences, list_entitys)
+    # print("entityLink")
+    entityLink.link_entitys(list_entitys)
+    # print("getPREMs")
+    prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
+    return list_articles, list_sentences, list_entitys, codeName, prem
+
+if __name__=="__main__":
+    # import pandas as pd
+    # import math
+    # import pymysql
+    # conn = pymysql.Connect(host='rm-bp1quo50k0q2ok73gi.mysql.rds.aliyuncs.com', port=3306, db='bxkc', user='bxkc_read', passwd='bxkc_20RE18AD') #新账号密码
+    # cursor = conn.cursor()
+    # df = pd.read_excel('G:/大网站规则识别/1027统计入库top100编号.xlsx')
+    # docs_list = []
+    # for i in range(100):
+    #     web_no = df.loc[i, '编号']
+    #     # num = math.ceil(int(df.loc[i, '1019-1023入库公告数量']) * 0.01)
+    #     num = 10
+    #     sql = "SELECT DOCID,DOCCHANNEL,DOCHTMLCON,WEB_SOURCE_NO from sys_document_23 where WEB_SOURCE_NO='{0}' and DOCCHANNEL='101' and DOCID%9=1 limit {1}".format(
+    #         web_no, num)
+    #     #  rows = cursor.execute(sql) 此处代码错误 rows 需要用 cursor.fetchall方法获取
+    #     cursor.execute(sql)
+    #     rows = cursor.fetchall()
+    #     docs_list.extend(list(rows))
+    # df_doc = pd.DataFrame(docs_list, columns=['docid', 'channel', 'html', 'web_no'])
+    # codenames = []
+    # prems = []
+    # for docid,text in zip(df_doc['docid'], df_doc['html']):
+    #     list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[docid, text, "", "", ""]],
+    #                                                                                     useselffool=True)
+    #     codeName = codeNamePredict.predict(list_sentences, list_entitys=list_entitys)
+    #     # print(codeName)
+    #     premPredict.predict(list_sentences, list_entitys)
+    #     roleRulePredict.predict(list_articles, list_sentences, list_entitys, codeName)
+    #     # print("epcPredict")
+    #     epcPredict.predict(list_sentences, list_entitys)
+    #     # print("entityLink")
+    #     entityLink.link_entitys(list_entitys)
+    #     # print("getPREMs")
+    #     prem = getAttributes.getPREMs(list_sentences, list_entitys, list_articles)
+    #     if codeName:
+    #         codenames.append(codeName[0][1])
+    #     else:
+    #         codenames.append(" ")
+    #     if prem:
+    #         prems.append(prem[0][1])
+    #     else:
+    #         prems.append(" ")
+    # df_doc['codename'] = pd.Series(codenames)
+    # df_doc['prem'] = pd.Series(prems)
+    # df_doc.to_excel('G:/大网站规则识别/大网站规则调整后预测结果20201124.xlsx', columns=['docid', 'channel', 'html', 'prem', 'codename', 'web_no'])
+
+
+    list_articles, list_sentences, list_entitys, codeName, prem = predict_fromdb('100006370',dbname="sys_document_25")  #sys_document_23
+    print(prem)
+    print(codeName)
+    entitys_all = [[[entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.begin_index,entity.end_index] for entity in entitys] for entitys in list_entitys]
+    for entitys in entitys_all:
+        # print(entitys)
+        # en_types = set([it[1] for it in entitys])
+        print([(it[0],it[1], it[2],it[3][it[2]],it[4],it[5],it[6]) for it in entitys if it[1] in ('org', 'company', 'person')])
+    print(list_articles[0].content)
+
+    # print(get_result('100000203'))
+
+    # person_errors, phone_errors, join_errors, role_name_errors, person_name_errors = analys_person_phone()
+    # import pickle
+    # with open('phone_errors.pkl','wb') as f:
+    #     pickle.dump(phone_errors, f)
+
+    # filename = "比地_52_79929693.html"
+    # #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
+    # # text = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
+    # # text = codecs.open('F:/工作文档/实体识别实体对其/20190320/比地_101_58511386.html', encoding='utf-8').read()
+    # docid = '100000203'
+    # r1 = get_result(docid)
+    # r2 = get_result_online(docid)
+    # rolperson = []
+    # person_phone = []
+    # for v in r1['prem'].values():
+    #     roleList = v['roleList']
+    #     for role in roleList:
+    #         for it in role[3]:
+    #             rolperson.append(role[1] + it[0])
+    #     for role in roleList:
+    #         for it in role[3]:
+    #             person_phone.append(it[0]+it[1])
+    # print(r1['prem'])
+    # print(r2['prem'])
+    #
+    # import psycopg2
+    # conn = psycopg2.connect(dbname='iepy', user='postgres', password='postgres', host='192.168.2.101')
+    # cursor = conn.cursor()
+    # sql = """select human_identifier,sourcetext from corpus_iedocument where human_identifier in ('95008163');"""
+    # cursor.execute(sql)
+    # rows = cursor.fetchall()
+    # # print(len(rows), rows)
+    # content = rows[0][1]
+    # # content = str(BeautifulSoup(text).find("div",id="pcontent"))
+    # # content = text
+    # # print('content: ',content)
+    # #text = codecs.open("C:\\Users\\User\\Desktop\\a.html","r",encoding="utf8").read()
+    # #text = "张家港保税区宏宇建设咨询有限公司受张家港市给排水公司委托,就二次供水泵房浊度仪进行国内组织公开招标采购,欢迎符合条件的供应商参加投标。"
+    # a = time.time()
+    # print("start")
+    # # print(predict("12",content))
+    # result = predict("12",content)
+    # print(json.loads(result))
+    # #test("12",text)
+    # print("takes",time.time()-a)
+    # _time2 = time.time()
+    # print(predict("12",content))
+    # _time3 = time.time()
+    # print("init takes:%d"%((_time2-_time1)-(_time3-_time2)))
+    # pass

BIN
BiddingKG/dl/time/model_label_time_classify.model.hdf5


+ 219 - 100
BiddingKG/dl/time/train_2.py

@@ -13,57 +13,64 @@ from sklearn.utils import shuffle,class_weight
 import matplotlib.pyplot as plt
 
 input_shape = (2,30,60)
+input_shape2 = (2,10,128)
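+# the shapes are presumably (left/right context, window length, embedding dim):
+# input_shape is a 30-step window of 60-d character vectors, input_shape2 a 10-token window of 128-d word vectors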
 output_shape = [4]
 
+def get_data():
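+    # pull the brat time annotations for the labelled documents from the iepy Postgres DB,
+    # parse the stringified token/offset columns and cache the frame via save('db_time_data.pk')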
+    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
+    id_set = set()
+    for id in data_load['document_id']:
+        id_set.add(id)
+    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.101")
+    sql = "SELECT A.human_identifier,A.sentences,A.tokens,A.offsets_to_text,B.value " \
+          "FROM corpus_iedocument A,brat_bratannotation B " \
+          "WHERE A.human_identifier = '%s' " \
+          "AND A.human_identifier = B.document_id "
+    db_data = []
+    count = 0
+    for id in list(id_set):
+        count+=1
+        print(count)
+        cur1 = conn.cursor()
+        cur1.execute(sql % (id))
+        db_data.extend(cur1.fetchall())
+        cur1.close()
+    conn.close()
+    columns = ['document_id','sentences','tokens','offsets_to_text','value']
+    df = pd.DataFrame(db_data, columns=columns)
+    df = df[df['value'].str.contains('time')]
+    df = df.reset_index(drop=True)
+    print(len(df))
+    time_label = df['value'].str.split(expand=True)
+    time_label.columns = ['_', 'label_type', 'begin_index', 'end_index', 'entity_text']
+    time_label = time_label.drop('_', axis=1)
+    df = pd.concat([df, time_label], axis=1)
+    print(df.info())
+    df['tokens'] = [token[2:-2].split("', '") for token in df['tokens']]
+    df['sentences'] = [sentence[1:-1].split(", ") for sentence in df['sentences']]
+    df['sentences'] = [[int(s) for s in sentence] for sentence in df['sentences']]
+    df['offsets_to_text'] = [offset[1:-1].split(", ") for offset in df['offsets_to_text']]
+    df['offsets_to_text'] = [[int(o) for o in offset] for offset in df['offsets_to_text']]
+    save(df,'db_time_data.pk')
+
 def getModel():
     '''
     @summary: 时间分类模型
     '''
-    L_input = layers.Input(shape=input_shape[1:], dtype='float32')
-    R_input = layers.Input(shape=input_shape[1:], dtype='float32')
-    L_lstm = layers.Bidirectional(layers.LSTM(32,return_sequences=True,dropout=0.1))(L_input)
+    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    L_lstm = layers.Bidirectional(layers.LSTM(40,return_sequences=True,dropout=0.1))(L_input)
     # L_lstm = layers.LSTM(32,return_sequences=True,dropout=0.2)(L_input)
     avg_l = layers.GlobalAveragePooling1D()(L_lstm)
-    R_lstm = layers.Bidirectional(layers.LSTM(32,return_sequences=True,dropout=0.1))(R_input)
+    R_lstm = layers.Bidirectional(layers.LSTM(40,return_sequences=True,dropout=0.1))(R_input)
     # R_lstm = layers.LSTM(32, return_sequences=True, dropout=0.2)(R_input)
     avg_r = layers.GlobalAveragePooling1D()(R_lstm)
     concat = layers.merge([avg_l, avg_r], mode='concat')
-    # concat = layers.merge([L_lstm, R_lstm], mode='concat')
     # lstm = layers.LSTM(24,return_sequences=False,dropout=0.2)(concat)
     output = layers.Dense(output_shape[0],activation="softmax")(concat)
 
-    # L_lstm = layers.LSTM(32,return_sequences=True,dropout=0.2)(L_input)
-    # avg = layers.GlobalAveragePooling1D()(L_GRU)
-    # output = layers.Dense(output_shape[0],activation="softmax")(avg)
-
     model = models.Model(inputs=[L_input,R_input], outputs=output)
-    # model = models.Model(inputs=L_input, outputs=output)
-    learn_rate = 0.0005
-    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
-                  loss=losses.binary_crossentropy,
-                  metrics=[precision,recall,f1_score])
-    model.summary()
-    return model
-
-def getModel_center():
-    '''
-    @summary: 时间分类模型
-    '''
-    L_input = layers.Input(shape=input_shape[1:], dtype='float32')
-    R_input = layers.Input(shape=input_shape[1:], dtype='float32')
-    center_shape = (25, 60)
-    C_input = layers.Input(shape=center_shape, dtype='float32')
-    L_lstm = layers.Bidirectional(layers.LSTM(32,return_sequences=True,dropout=0.2))(L_input)
-    avg_l = layers.GlobalAveragePooling1D()(L_lstm)
-    C_lstm = layers.LSTM(32,return_sequences=True,dropout=0.2)(C_input)
-    avg_c = layers.GlobalAveragePooling1D()(C_lstm)
-    R_lstm = layers.Bidirectional(layers.LSTM(32,return_sequences=True,dropout=0.2))(R_input)
-    avg_r = layers.GlobalAveragePooling1D()(R_lstm)
-    concat = layers.merge([avg_l, avg_c, avg_r], mode='concat')
 
-    output = layers.Dense(output_shape[0],activation="softmax")(concat)
-
-    model = models.Model(inputs=[L_input,C_input,R_input], outputs=output)
     learn_rate = 0.0005
     model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                   loss=losses.binary_crossentropy,
@@ -73,8 +80,9 @@ def getModel_center():
 
 
 def training():
-    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30.csv", index_col=0)
-    test_data = data_load.sample(frac=0.25, random_state=7)
+    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
+    data_load = data_load.reset_index(drop=True)
+    test_data = data_load.sample(frac=0.2, random_state=8)
     train_data = data_load.drop(test_data.index, axis=0)
     train_data =train_data.reset_index(drop=True)
 
@@ -83,8 +91,12 @@ def training():
     for left, right, label in zip(train_data['context_left'], train_data['context_right'], train_data['re_label']):
         y = np.zeros(output_shape)
         y[label] = 1
-        left = ''.join(str(left))
-        right = ''.join(str(right))
+        left = str(left)
+        right = str(right)
+        if left=='nan': left = ''
+        if right=='nan': right = ''
+        left = list(left)
+        right = list(right)
         context = [left, right]
         x = embedding_word(context, shape=input_shape)
         train_x.append(x)
@@ -95,8 +107,12 @@ def training():
     for left, right, label in zip(test_data['context_left'], test_data['context_right'], test_data['re_label']):
         y = np.zeros(output_shape)
         y[label] = 1
-        left = ''.join(str(left))
-        right = ''.join(str(right))
+        left = str(left)
+        right = str(right)
+        if left == 'nan': left = ''
+        if right == 'nan': right = ''
+        left = list(left)
+        right = list(right)
         context = [left, right]
         x = embedding_word(context, shape=input_shape)
         test_x.append(x)
@@ -107,7 +123,7 @@ def training():
     train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
 
     model = getModel()
-    epochs = 100
+    epochs = 150
     batch_size = 256
     checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
                                  save_best_only=True, mode='min')
@@ -123,7 +139,7 @@ def training():
         callbacks=[checkpoint],
         class_weight='auto'
     )
-    plot_loss(history=history)
+    # plot_loss(history=history)
     load_model = models.load_model("model_label_time_classify.model.hdf5",
                                    custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
     y_pre = load_model.predict([test_x[0], test_x[1]])
@@ -136,35 +152,32 @@ def training():
     res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
     print(res2)
 
-def training_center():
-    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata.csv", index_col=0)
-    test_data = data_load.sample(frac=0.25, random_state=7)
+def train2():
+    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv", index_col=0)
+    data_load = data_load.reset_index(drop=True)
+    data_load['context_left'] = [left[2:-2].split("', '") for left in data_load['context_left']]
+    data_load['context_right'] = [right[2:-2].split("', '") for right in data_load['context_right']]
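+    # context_left/right were stored as str(list); strip the [' ... '] wrapper and split back into token lists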
+    test_data = data_load.sample(frac=0.2, random_state=8)
     train_data = data_load.drop(test_data.index, axis=0)
     train_data =train_data.reset_index(drop=True)
 
     train_x = []
     train_y = []
-    for left, center, right, label in zip(train_data['context_left'], train_data['entity_time'], train_data['context_right'], train_data['re_label']):
+    for left, right, label in zip(train_data['context_left'], train_data['context_right'], train_data['label']):
         y = np.zeros(output_shape)
         y[label] = 1
-        left = ''.join(str(left))
-        right = ''.join(str(right))
-        center = ''.join(str(center))
-        context = [left,center, right]
-        x = embedding_word(context, shape=(3,25,60))
+        context = [left, right]
+        x = embedding(context, shape=input_shape2)
         train_x.append(x)
         train_y.append(y)
 
     test_x = []
     test_y = []
-    for left, center, right, label in zip(test_data['context_left'], train_data['entity_time'], test_data['context_right'], test_data['re_label']):
+    for left, right, label in zip(test_data['context_left'], test_data['context_right'], test_data['label']):
         y = np.zeros(output_shape)
         y[label] = 1
-        left = ''.join(str(left))
-        right = ''.join(str(right))
-        center = ''.join(str(center))
-        context = [left, center, right]
-        x = embedding_word(context, shape=(3,25,60))
+        context = [left, right]
+        x = embedding(context, shape=input_shape2)
         test_x.append(x)
         test_y.append(y)
 
@@ -172,79 +185,194 @@ def training_center():
     train_x, test_x = (np.array(train_x), np.array(test_x))
     train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
 
-    model = getModel_center()
-    epochs = 70
+    model = getModel()
+    epochs = 150
     batch_size = 256
     checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
                                  save_best_only=True, mode='min')
     # cw = class_weight.compute_class_weight('auto',np.unique(np.argmax(train_y,axis=1)),np.argmax(train_y,axis=1))
     # cw = dict(enumerate(cw))
     history = model.fit(
-        x=[train_x[0], train_x[1], train_x[2]],
+        x=[train_x[0], train_x[1]],
         y=train_y,
-        validation_data=([test_x[0], test_x[1], test_x[2]], test_y),
-        # validation_data=(test_x[0],test_y),
+        validation_data=([test_x[0], test_x[1]], test_y),
         epochs=epochs,
         batch_size=batch_size,
         shuffle=True,
         callbacks=[checkpoint],
         class_weight='auto'
     )
-    plot_loss(history = history)
+    # plot_loss(history=history)
     load_model = models.load_model("model_label_time_classify.model.hdf5",
                                    custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
-    y_pre = load_model.predict([test_x[0], test_x[1], test_x[2]])
+    y_pre = load_model.predict([test_x[0], test_x[1]])
     # y_pre = load_model.predict(test_x[0])
     # 各类别预测评估
     res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
     print(res1)
-    y_pre2 = load_model.predict([train_x[0], train_x[1], train_x[2]])
+    y_pre2 = load_model.predict([train_x[0], train_x[1]])
     # y_pre2 = load_model.predict(train_x[0])
     res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
     print(res2)
 
-def predict():
+
+def predict2():
     model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
-    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30.csv", index_col=0)
+    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv", index_col=0)
+    data_load['context_left'] = [left[2:-2].split("', '") for left in data_load['context_left']]
+    data_load['context_right'] = [right[2:-2].split("', '") for right in data_load['context_right']]
     test_x = []
     test_y = []
-    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label']):
+    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['label']):
         y = np.zeros(output_shape)
         y[label] = 1
-        left = ''.join(str(left))
-        right = ''.join(str(right))
         context = [left, right]
-        x = embedding_word(context, shape=input_shape)
+        x = embedding(context, shape=input_shape2)
         test_x.append(x)
         test_y.append(y)
     test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
     pre_y = model1.predict([test_x[0],test_x[1]])
     data_load['pre'] = [np.argmax(item) for item in pre_y]
-    error_data = data_load[data_load['re_label']!=data_load['pre']]
+    error_data = data_load[data_load['label']!=data_load['pre']]
     # print(error_data.info())
-    error_data.to_csv("C:\\Users\\admin\\Desktop\\test\\error4-0.2-0.6_30.csv")
+    error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")
 
-def predict_center():
+def predict():
     model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
-    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata.csv", index_col=0)
+    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
     test_x = []
     test_y = []
-    for left, center, right, label in zip(data_load['context_left'],data_load['entity_time'], data_load['context_right'], data_load['re_label']):
+    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label']):
         y = np.zeros(output_shape)
         y[label] = 1
-        left = ''.join(str(left))
-        right = ''.join(str(right))
-        center = ''.join(str(center))
-        context = [left, center, right]
-        x = embedding_word(context, shape=(3, 25, 60))
+        left = str(left)
+        right = str(right)
+        if left == 'nan': left = ''
+        if right == 'nan': right = ''
+        left = list(left)
+        right = list(right)
+        context = [left, right]
+        x = embedding_word(context, shape=input_shape)
         test_x.append(x)
         test_y.append(y)
     test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
-    pre_y = model1.predict([test_x[0],test_x[1],test_x[2]])
+    pre_y = model1.predict([test_x[0],test_x[1]])
     data_load['pre'] = [np.argmax(item) for item in pre_y]
     error_data = data_load[data_load['re_label']!=data_load['pre']]
     # print(error_data.info())
-    error_data.to_csv("C:\\Users\\admin\\Desktop\\test\\error_center.csv")
+    error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")
+
+
+def data_process():
+    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30.csv", index_col=0)
+    re_left = re.compile("。[^。]*?$")
+    re_right = re.compile("^[^。]*?。")
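+    # trim each context to its own sentence: keep only the text after the last '。' on the left and up to the first '。' on the right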
+    left_list = []
+    right_list = []
+    for left, right in zip(data_load['context_left'], data_load['context_right']):
+        left = str(left)
+        right = str(right)
+        if right=='nan':
+            right = ''
+            # print(1)
+        if re.search("。",left):
+            left = re_left.search(left)
+            left = left.group()[1:]
+        if re.search("。",right):
+            right = re_right.search(right)
+            right = right.group()
+        left_list.append(left)
+        right_list.append(right)
+    data_load['context_left'] = left_list
+    data_load['context_right'] = right_list
+    data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv")
+
+def data_process2():
+    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
+    left_list = []
+    right_list = []
+    for left, right in zip(data_load['context_left'], data_load['context_right']):
+        left = str(left)
+        right = str(right)
+        if right=='nan':
+            right = ''
+        if left=='nan':
+            left = ''
+        left = left[max(len(left)-20,0):]
+        right = right[:20]
+        left_list.append(left)
+        right_list.append(right)
+    data_load['context_left'] = left_list
+    data_load['context_right'] = right_list
+    data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_20_prc.csv")
+
+def data_process3():
+    data = load('db_time_data.pk')
+    data = data.drop('value', axis=1)
+    token_begin = []
+    token_end = []
+    context_left = []
+    context_right = []
+    data2 = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc2.csv")
+    label = []
+    # data=data[:20]
+    for id,sentences,tokens,offset,begin,end,entity_text in zip(data['document_id'],data['sentences'],data['tokens'],data['offsets_to_text'],
+                                                             data['begin_index'],data['end_index'],data['entity_text']):
+        _label = data2[(data2['document_id']==int(id)) & (data2['begin_index']==int(begin))][:1]
+        if not _label.empty:
+            _label = int(_label['re_label'])
+        else:
+            _label=0
+        label.append(_label)
+        begin = int(begin)
+        end = int(end)
+        entity_tbegin = 0
+        entity_tend = 0
+        find_begin = False
+
+        for t in range(len(offset)):
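+        # map the character offsets (begin/end) onto token indices using offsets_to_text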
+            if not find_begin:
+                if offset[t]==begin:
+                    entity_tbegin = t
+                    find_begin = True
+                if offset[t]>begin:
+                    entity_tbegin = t-1
+                    find_begin = True
+            if offset[t] >= end:
+                entity_tend = t
+                break
+        token_begin.append(entity_tbegin)
+        token_end.append(entity_tend)
+        s = spanWindow(tokens=tokens,begin_index=entity_tbegin,end_index=entity_tend,size=10)
+        s1 = s[0]
+        _temp1 = []
+        for i in range(len(s1)):
+            if s1[i]=="。":
+                _temp1.append(i)
+        if _temp1:
+            s1 = s1[_temp1[-1]+1:]
+        s2 = s[1]
+        _temp2 = []
+        for i in range(len(s2)):
+            if s2[i] == "。":
+                _temp2.append(i)
+                break
+        if _temp2:
+            s2 = s2[:_temp2[0]+1]
+            # print(s2)
+        context_left.append(s1)
+        context_right.append(s2)
+        print(id)
+        # print(_label)
+        # print(entity_text)
+        # print(tokens[entity_tbegin:entity_tend])
+    data['token_begin'] = token_begin
+    data['token_end'] = token_end
+    data['context_left'] = context_left
+    data['context_right'] = context_right
+    data['label'] = label
+    data = data.drop(['tokens','offsets_to_text','sentences'],axis=1)
+    data.to_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv")
 
 def plot_loss(history):
     plt.plot(history.history['loss'])
@@ -256,23 +384,14 @@ def plot_loss(history):
     plt.show()
 
 if __name__ == '__main__':
+    # get_data()
     # getModel()
-    # getModel_center()
     # training()
-    # training_center()
+    # train2()
+    # data_process()
+    # data_process2()
+    # data_process3()
     # predict()
-    # predict_center()
-    model1 = models.load_model("model_label_time_classify.model.hdf5",
-                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
-    test_x = []
-    test_y = []
-    left = '8675.20元人民币,(3)服务期限:'
-    right = '(4)质量:符合竞争性磋商文件规定的质'
-    context = [left, right]
-    x = embedding_word(context, shape=input_shape)
-    test_x.append(x)
-    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
-    pre_y = model1.predict([test_x[0],test_x[1]])
-    rs = [np.argmax(item) for item in pre_y]
-    print(pre_y, rs)
+    # predict2()
+
     pass

+ 169 - 0
BiddingKG/maxcompute/contactDumplicate.py

@@ -0,0 +1,169 @@
+from odps.udf import annotate
+from odps.udf import BaseUDAF
+from odps.udf import BaseUDTF
+
+@annotate('string,string,string,string,bigint,datetime,string,string,string,string->string')
+class dumplicate(BaseUDAF):
+
+    def __init__(self):
+        import datetime
+        import json
+        import logging
+        global datetime,json,logging,MyEncoder
+
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        class MyEncoder(json.JSONEncoder):
+
+            def default(self, obj):
+                if isinstance(obj, bytes):
+                    return str(obj, encoding='utf-8')
+                return json.JSONEncoder.default(self, obj)
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self, buffer, company_name,mobile_no,phone_no,contact_person,level,create_time,email,company_addr,province,city):
+        logging.info(company_name)
+        buffer[0].append([company_name.strip(),mobile_no,phone_no,contact_person,level,create_time.timestamp(),email,company_addr,province,city])
+        logging.info(company_name)
+
+    def merge(self, buffer, pbuffer):
+        logging.info('-3=')
+        buffer[0].extend(pbuffer[0])
+        logging.info('-4=')
+
+    def terminate(self, buffer):
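+        # keep the most recently created contact record (the create_time timestamp sits at index 5)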
+        logging.info('-1=')
+        buffer[0].sort(key=lambda x:x[5],reverse=True)
+        company_name,mobile_no,phone_no,contact_person,level,create_time,email,company_addr,province,city = buffer[0][0]
+        logging.info("-2=")
+        return json.dumps([company_name,mobile_no,phone_no,contact_person,level,create_time,email,company_addr,province,city],cls=MyEncoder,ensure_ascii=False)
+
+
+@annotate("string->string,string,string,string,bigint,datetime,string,string,string,string")
+class liberate(BaseUDTF):
+
+    def __init__(self):
+        import json
+        import time
+        import logging
+        import datetime
+        # import sys
+        # reload(sys)
+        # sys.setdefaultencoding('utf8')
+        global json,MyEncoder,logging,time,datetime
+
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        class MyEncoder(json.JSONEncoder):
+
+            def default(self, obj):
+                if isinstance(obj, bytes):
+                    return str(obj, encoding='utf-8')
+                return json.JSONEncoder.default(self, obj)
+
+
+    def process(self, json_dumplicate):
+        try:
+            logging.info(json_dumplicate)
+            json_dumplicate = json_dumplicate.replace("\\n","").replace('\\"','').replace("\\r","")
+            company_name,mobile_no,phone_no,contact_person,level,create_time,email,company_addr,province,city = json.loads(json_dumplicate)
+            create_time = datetime.datetime.fromtimestamp(create_time)
+            self.forward(company_name,mobile_no,phone_no,contact_person,level,create_time,email,company_addr,province,city)
+        except Exception as e:
+            pass
+
+import re
+mobile_pattern = re.compile("^1\d{10}$")
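+# an 11-digit number starting with 1 is treated as a mobile number, anything else as a landline,
+# e.g. recog_likeType("13800138000") -> "mobile", recog_likeType("0755-1234567") -> "phone"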
+def recog_likeType(phone):
+    if re.search(mobile_pattern,phone) is not None:
+        return "mobile"
+    else:
+        return "phone"
+
+@annotate("string,string,string,string,string,string->string")
+class f_tojson_docuentContact(object):
+
+    def __init__(self):
+        import json
+        global json
+
+
+    def evaluate(self, tenderee,tenderee_contact,tenderee_phone,agency,agency_contact,agency_phone):
+        list_contact = []
+        if tenderee!="" and tenderee_contact!="" and tenderee_phone!='' and tenderee_phone is not None:
+            _dict = {"company":tenderee,"contact_person":tenderee_contact,"level":20}
+            if recog_likeType(tenderee_phone)=="mobile":
+                _dict["mobile_no"] = tenderee_phone
+            else:
+                _dict["phone_no"] = tenderee_phone
+            list_contact.append(_dict)
+        if agency!="" and agency_contact!="" and agency_phone!='' and agency_phone is not None:
+            _dict = {"company":agency,"contact_person":agency_contact,"level":20}
+            if recog_likeType(agency_phone)=="mobile":
+                _dict["mobile_no"] = agency_phone
+            else:
+                _dict["phone_no"] = agency_phone
+            list_contact.append(_dict)
+        return json.dumps(list_contact)
+
+@annotate("string->string,string,string,string,bigint,string")
+class f_liberate_contactJson(BaseUDTF):
+
+    def __init__(self):
+        import json
+        import time
+        import logging
+        import datetime
+        # import sys
+        # reload(sys)
+        # sys.setdefaultencoding('utf8')
+        global json,MyEncoder,logging,time,datetime
+
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+
+    def process(self, json_contact):
+        try:
+            list_dict = json.loads(json_contact)
+            for _dict in list_dict:
+                company = _dict.get("company")
+                contact_person = _dict.get("contact_person")
+                mobile_no = _dict.get("mobile_no","")
+                if mobile_no is None:
+                    mobile_no = ""
+                phone_no = _dict.get("phone_no","")
+                if phone_no is None:
+                    phone_no = ""
+                else:
+                    phone_no = re.sub('[^0-9\-转]','',phone_no)
+                    if len(phone_no)<6:
+                        phone_no = ""
+                level = _dict.get("level")
+                mail = _dict.get("mail","")
+                self.forward(company,contact_person,mobile_no,phone_no,level,mail)
+        except Exception as e:
+            logging.info(str(e))
+            logging.info(json_contact)
+
+@annotate('string->bigint')
+class f_count_company(BaseUDAF):
+
+    def __init__(self):
+        import datetime
+        import json
+        import logging
+        global datetime,json,logging,MyEncoder
+
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [set()]
+
+    def iterate(self, buffer, company_name):
+        buffer[0].add(company_name)
+
+    def merge(self, buffer, pbuffer):
+        buffer[0] |= pbuffer[0]
+
+    def terminate(self, buffer):
+        return len(buffer[0])

+ 114 - 30
BiddingKG/maxcompute/documentDumplicate.py

@@ -190,7 +190,8 @@ class f_set_docid(BaseUDAF):
                 _set_column = set()
                 _set_tenderee = set()
                 for j in range(_begin,i+1):
-                    _set_tenderee.add(list_docs[j]["tenderee"])
+                    if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
+                        _set_tenderee.add(list_docs[j]["tenderee"])
                     _set_column.add(list_docs[j]["defind_column"])
                     _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})
 
@@ -213,7 +214,8 @@ class f_set_docid(BaseUDAF):
             _set_tenderee = set()
             _group = []
             for j in range(_begin,len(list_docs)):
-                _set_tenderee.add(list_docs[j]["tenderee"])
+                if list_docs[j]["tenderee"] is not None and list_docs[j]["tenderee"]!="":
+                    _set_tenderee.add(list_docs[j]["tenderee"])
                 _set_column.add(list_docs[j]["defind_column"])
                 _group.append({"docid":list_docs[j]["docid"],"extract_count":list_docs[j]["extract_count"]})
             if len(_group)>=3 and len(_set_tenderee)>1:
@@ -373,6 +375,22 @@ class decare_document(BaseUDTF):
                                     new_json_set_docid.append(_item2)
                             self.forward(_doc1["id"],_doc2["id"],json.dumps(new_json_set_docid))
 
+def getBestDocid(list_pair):
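+    # each pair appears to be (docid1, extract_count1, docid2, extract_count2);
+    # pick the docid with the highest extract_count, breaking ties by the smallest docid,
+    # e.g. getBestDocid([[1,3,2,5],[1,3,4,5]]) -> 2 (docids 2 and 4 both have count 5, the smaller one wins)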
+    list_pair.sort(key=lambda x:x[3],reverse=True)
+    _max_count = max(list_pair[0][3],list_pair[0][1])
+    set_candidate = set()
+    if list_pair[0][1]==_max_count:
+        set_candidate.add(list_pair[0][0])
+    for item in list_pair:
+        if item[3]==_max_count:
+            set_candidate.add(item[2])
+        else:
+            break
+    list_candidate = list(set_candidate)
+    list_candidate.sort(key=lambda x:x)
+    return list_candidate[0]
+
+
 @annotate('bigint,bigint,bigint,bigint->string')
 class choose_document(BaseUDAF):
     '''
@@ -394,28 +412,15 @@ class choose_document(BaseUDAF):
 
     def terminate(self, buffer):
         list_pair = buffer[0]
-        list_pair.sort(key=lambda x:x[3],reverse=True)
-        _max_count = list_pair[0][3]
-        save_flag = 0
-        list_dumplicate = []
         _set = set()
         for item in buffer[0]:
             _set.add(str(item[2]))
-        #不包含这条公告
-        # _set.add(list_pair[0][0])
-        if list_pair[0][1]>_max_count:
+        list_dumplicate = list(_set)
+        best_docid = getBestDocid(list_pair)
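+        # save_flag=1 only when this document itself is the best of its duplicate group; the dumplicates list keeps every other docid either way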
+        if best_docid==list_pair[0][0]:
             save_flag = 1
-            # _set.remove(list_pair[0][0])
-            list_dumplicate = list(_set)
         else:
             save_flag = 0
-            less_docid = list_pair[0][2]
-            for item in list_pair:
-                if item[3]>=_max_count and item[2]<less_docid:
-                    less_docid = item[2]
-            _set.remove(str(less_docid))
-            list_dumplicate = list(_set)
-            list_dumplicate.insert(0,str(less_docid))
         return json.dumps({"save_flag":save_flag,"dumplicates":list_dumplicate})
 
 
@@ -459,22 +464,11 @@ class group_document_bestFirst(BaseUDAF):
 
     def terminate(self, buffer):
         list_pair = buffer[0]
-        list_pair.sort(key=lambda x:x[3],reverse=True)
-        _max_count = list_pair[0][3]
-        save_flag = 0
-        list_dumplicate = []
         _set = set()
         for item in buffer[0]:
             _set.add(item[2])
         _set.add(list_pair[0][0])
-        best_docid = None
-        if list_pair[0][1]>_max_count:
-            best_docid = list_pair[0][0]
-        else:
-            best_docid = list_pair[0][2]
-            for item in list_pair:
-                if item[3]>=_max_count and item[2]<best_docid:
-                    best_docid = item[2]
+        best_docid = getBestDocid(list_pair)
         _set.remove(best_docid)
         list_dumplicate = list(_set)
         list_dumplicate.sort(key=lambda x:x)
@@ -611,3 +605,93 @@ class get_count_dump(object):
             _count = len(title.split(","))
         return _count
 
+def getSet(list_dict,key):
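+    # distinct non-empty values of `key` within the group; numeric strings are normalised through float(),
+    # e.g. getSet([{'a':'100'},{'a':'100.0'},{'a':''}],'a') -> {'100.0'}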
+    _set = set()
+    for item in list_dict:
+        if key in item:
+            if item[key]!='' and item[key] is not None:
+                if re.search("^\d[\d\.]*$",item[key]) is not None:
+                    _set.add(str(float(item[key])))
+                else:
+                    _set.add(str(item[key]))
+    return _set
+
+@annotate('bigint,string -> bigint,bigint')
+class f_getGroup_dumpFinal(BaseUDTF):
+    '''
+    Extract the duplicate groups from the final result: emit (docid, docid) plus one (docid, duplicate_docid) row per entry in dumplicates
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,docid,dumplicates):
+        self.forward(int(docid),int(docid))
+        if dumplicates is not None:
+            list_docids = dumplicates.split(",")
+            for _docid in list_docids:
+                self.forward(int(docid),int(_docid))
+
+@annotate('bigint,bigint,string,string,string,string,bigint,bigint->string')
+class f_redump_limit_num(BaseUDAF):
+    '''
+    Re-check a merged duplicate group: when the group has more than 5 documents, doctitle, tenderee,
+    win_tenderer and bidding_budget must each take a single value within the group;
+    for groups of 5 or fewer only tenderee, win_tenderer and bidding_budget are checked.
+    '''
+    def __init__(self):
+        import logging
+        import json,re
+        global json,logging,re
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [list()]
+
+    def iterate(self, buffer,main_docid,docid,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,extract_count1,extract_count2):
+        buffer[0].append({"main_docid":main_docid,"docid":docid,"set_limit_column1":set_limit_column1,"set_limit_column2":set_limit_column2,
+                          "set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,"extract_count1":extract_count1,"extract_count2":extract_count2})
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def terminate(self, buffer):
+        list_group = []
+        the_group = buffer[0]
+        if len(the_group)>5:
+            keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
+        else:
+            keys = ["set_limit_column2","set_limit_column3","set_limit_column4"]
+        stay = True
+        for _key in keys:
+            if len(getSet(the_group,_key))>1:
+                stay = False
+                break
+        final_group = []
+        if stay:
+            main_docid = the_group[0]["main_docid"]
+            for item in the_group:
+                if item["docid"]!=main_docid:
+                    final_group.append({"docid1":main_docid,"docid2":item["docid"],"extract_count1":item["extract_count1"],"extract_count2":item["extract_count2"]})
+
+        return json.dumps(final_group)
+
+@annotate('string -> bigint,bigint,bigint,bigint')
+class f_get_dumpFinal_checked(BaseUDTF):
+    '''
+    Unpack the re-checked duplicate pairs (docid1, docid2, extract_count1, extract_count2) from the JSON emitted by f_redump_limit_num
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,list_group):
+        if list_group is not None:
+            final_group = json.loads(list_group)
+            for _group in final_group:
+                self.forward(_group["docid1"],_group["docid2"],_group["extract_count1"],_group["extract_count2"])