Browse Source

Clean up the code and fix the problems the tensorflow.contrib.crf package caused in production, as well as the circular import issue

rogel 4 years ago
parent
commit
2b179e8877

+ 3 - 3
BiddingKG/dl/BertNer/BertCRF.py

@@ -4,9 +4,9 @@ Created on 2019年12月31日
 @author: User
 '''
 
-from tensorflow.contrib import rnn
-from tensorflow.contrib.crf import crf_log_likelihood
-from tensorflow.contrib.layers.python.layers import initializers
+# from tensorflow.contrib import rnn
+# from tensorflow.contrib.crf import crf_log_likelihood
+# from tensorflow.contrib.layers.python.layers import initializers
 import numpy as np
 from BiddingKG.dl.common.Utils import viterbi_decode
 from zipfile import ZipFile

+ 3 - 3
BiddingKG/dl/BertNer/Pretrain.py

@@ -8,9 +8,9 @@ import os
 from BiddingKG.dl.BertNer.BertModel import *
 import tensorflow as tf
 
-from tensorflow.contrib import rnn
-from tensorflow.contrib.crf import crf_log_likelihood
-from tensorflow.contrib.layers.python.layers import initializers
+# from tensorflow.contrib import rnn
+# from tensorflow.contrib.crf import crf_log_likelihood
+# from tensorflow.contrib.layers.python.layers import initializers
 import numpy as np
 from BiddingKG.dl.common.Utils import viterbi_decode
 from zipfile import ZipFile

+ 2 - 0
BiddingKG/dl/common/Utils.py

@@ -11,6 +11,7 @@ from keras import backend as K
 import os
 import time
 
+
 from threading import RLock
 
 # from pai_tf_predict_proto import tf_predict_pb2
@@ -124,6 +125,7 @@ def viterbi_decode(score, transition_params):
     viterbi_score = np.max(trellis[-1])
     return viterbi, viterbi_score
 
+
 def limitRun(sess,list_output,feed_dict,MAX_BATCH=1024):
     len_sample = 0
     if len(feed_dict.keys())>0:
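Every call site in this commit that previously used tf.contrib.crf.viterbi_decode now calls the pure-NumPy viterbi_decode kept in BiddingKG.dl.common.Utils (its tail is visible in the hunk above). For reference, a minimal sketch of a decoder with the same (score, transition_params) -> (viterbi, viterbi_score) contract; the actual implementation in Utils.py may differ in detail:

import numpy as np

def viterbi_decode(score, transition_params):
    # score: [seq_len, num_tags] unary potentials; transition_params: [num_tags, num_tags]
    trellis = np.zeros_like(score)
    backpointers = np.zeros_like(score, dtype=np.int32)
    trellis[0] = score[0]
    for t in range(1, score.shape[0]):
        # score of reaching each tag at step t through every tag at step t-1
        v = np.expand_dims(trellis[t - 1], 1) + transition_params
        trellis[t] = score[t] + np.max(v, 0)
        backpointers[t] = np.argmax(v, 0)
    # follow the backpointers from the best final tag
    viterbi = [int(np.argmax(trellis[-1]))]
    for bp in reversed(backpointers[1:]):
        viterbi.append(int(bp[viterbi[-1]]))
    viterbi.reverse()
    viterbi_score = np.max(trellis[-1])
    return viterbi, viterbi_score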

+ 104 - 0
BiddingKG/dl/common/nerUtils.py

@@ -0,0 +1,104 @@
+from BiddingKG.dl.foolnltk import selffool
+
+
+def getTokensAndNers(sentences,MAXAREA = 10000,useselffool=False):
+    '''
+    @param: sentences:句子数
+    @return 限流执行后的分词和实体识别list
+    '''
+    def getData(tokens,ners,process_data):
+        process_sentences = [item[1] for item in process_data]
+
+        token_ = selffool.cut(process_sentences)
+        if useselffool:
+            ner_ = selffool.self_ner(process_sentences)
+        else:
+            ner_ = selffool.ner(process_sentences)
+        for i in range(len(token_)):
+            the_index = process_data[i][0]
+            tokens[the_index] = token_[i]
+            ners[the_index] = ner_[i]
+    sents = []
+    for i in range(len(sentences)):
+        sents.append([i,sentences[i]])
+    sents.sort(key=lambda x:len(x[1]),reverse=True)
+    index_ = 0
+    tokens = [[]for i in range(len(sentences))]
+    ners = [[]for i in range(len(sentences))]
+
+    while(True):
+        width = len(sents[index_][1])
+        height = MAXAREA//width+1
+        if height>len(sents)-index_:
+            height = len(sents)-index_
+        process_data = sents[index_:index_+height]
+        getData(tokens, ners, process_data)
+        index_ += height
+        if index_>=len(sents):
+            break
+    return tokens,ners
+
+def getTokens(sentences,MAXAREA = 10000,useselffool=True):
+    '''
+     @param: sentences:句子数
+     @return 限流执行后的分词list
+     '''
+    def getData(tokens,process_data):
+        process_sentences = [item[1] for item in process_data]
+
+        token_ = selffool.cut(process_sentences)
+        for i in range(len(token_)):
+            the_index = process_data[i][0]
+            tokens[the_index] = token_[i]
+    sents = []
+    for i in range(len(sentences)):
+        sents.append([i,sentences[i]])
+    sents.sort(key=lambda x:len(x[1]),reverse=True)
+    index_ = 0
+    tokens = [[]for i in range(len(sentences))]
+
+    while(True):
+        width = len(sents[index_][1])
+        height = MAXAREA//width+1
+        if height>len(sents)-index_:
+            height = len(sents)-index_
+        process_data = sents[index_:index_+height]
+        getData(tokens, process_data)
+        index_ += height
+        if index_>=len(sents):
+            break
+    return tokens
+
+def getNers(sentences,MAXAREA = 10000,useselffool=False):
+    '''
+    @param: sentences:句子数
+    @return 限流执行后的实体识别list
+    '''
+    def getData(ners,process_data):
+        process_sentences = [item[1] for item in process_data]
+
+        if useselffool:
+            ner_ = selffool.self_ner(process_sentences)
+        else:
+            ner_ = selffool.ner(process_sentences)
+        for i in range(len(ner_)):
+            the_index = process_data[i][0]
+            ners[the_index] = ner_[i]
+    sents = []
+    for i in range(len(sentences)):
+        sents.append([i,sentences[i]])
+    sents.sort(key=lambda x:len(x[1]),reverse=True)
+    index_ = 0
+    ners = [[]for i in range(len(sentences))]
+
+    while(True):
+        width = len(sents[index_][1])
+        height = MAXAREA//width+1
+        if height>len(sents)-index_:
+            height = len(sents)-index_
+        process_data = sents[index_:index_+height]
+        getData( ners, process_data)
+        index_ += height
+        if index_>=len(sents):
+            break
+    return ners
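The helpers above throttle selffool by grouping sentences so that each batch's character "area" (longest sentence length times batch size) stays near MAXAREA, then restore the original sentence order. A hypothetical usage sketch, assuming the selffool models ship with the package; the exact shape of each entity tuple comes from selffool and is only illustrated here:

from BiddingKG.dl.common.nerUtils import getTokens, getNers

# sample announcement sentences (placeholders)
sentences = ["招标人:某某建设集团有限公司。",
             "处罚决定书编号:厦财企〔2020〕12号。"]

tokens = getTokens(sentences)                  # one token list per input sentence
ners = getNers(sentences, useselffool=True)    # one entity list per input sentence

for sentence_tokens, sentence_ners in zip(tokens, ners):
    print(sentence_tokens)
    print(sentence_ners)   # selffool entity tuples, e.g. (start, end, type, text)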

+ 3 - 3
BiddingKG/dl/complaint/punishNo_tf.py

@@ -1,6 +1,6 @@
 import tensorflow as tf
-from tensorflow.contrib.crf import crf_log_likelihood
-from tensorflow.contrib.layers.python.layers import initializers
+# from tensorflow.contrib.crf import crf_log_likelihood
+# from tensorflow.contrib.layers.python.layers import initializers
 import numpy as np
 import pandas as pd
 from zipfile import ZipFile
@@ -215,7 +215,7 @@ def getAcc(y_batch,logits,trans,lengths):
         # logit = np.concatenate([score, pad], axis=1)
         # logit = np.concatenate([start, logit], axis=0)
         # path, _ = tf.contrib.crf.viterbi_decode(logit, trans)
-        path, _ = tf.contrib.crf.viterbi_decode(score, trans)
+        path, _ = viterbi_decode(score, trans)
         preds += path[0:]
         # preds += path[1:]
         index += 1

+ 3 - 6
BiddingKG/dl/complaint/punish_predictor.py

@@ -11,12 +11,9 @@ import re
 import os
 import time
 import tensorflow as tf
-# from BiddingKG.dl.common.Utils import *
-from tensorflow.contrib.crf import crf_log_likelihood
-from tensorflow.contrib.layers.python.layers import initializers
-# from keras.preprocessing.sequence import pad_sequences
-# import BiddingKG.dl.interface.Preprocessing as Preprocessing
-from BiddingKG.dl.interface.Preprocessing import *
+from BiddingKG.dl.common.Utils import *
+from BiddingKG.dl.common.nerUtils import *
+from keras.preprocessing.sequence import pad_sequences
 
 
 def decode(logits, trans, sequence_lengths, tag_num):

+ 0 - 559
BiddingKG/dl/complaint/punish_rule.py

@@ -1,559 +0,0 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
-# @Author  : bidikeji
-# @Time    : 2020/12/24 0024 15:23
-import re
-import os
-import time
-import tensorflow as tf
-from BiddingKG.dl.common.Utils import *
-from tensorflow.contrib.crf import crf_log_likelihood
-from tensorflow.contrib.layers.python.layers import initializers
-from keras.preprocessing.sequence import pad_sequences
-import BiddingKG.dl.interface.Preprocessing as Preprocessing
-from BiddingKG.dl.interface.Preprocessing import *
-
-def BiLSTM_CRF_tfmodel(sess,weights):
-    BiRNN_Units = 140
-    chunk_tags = {
-        'O': 0,
-        'PN_B': 1,
-        'PN_M': 2,
-        'PN_E': 3
-    }
-
-    def embedding_layer(input):
-        embedding = tf.get_variable("embedding",initializer=np.array(weights,dtype=np.float32) if weights is not None else None,dtype=tf.float32)
-        return tf.nn.embedding_lookup(params=embedding,ids=input)
-
-    def BiLSTM_Layer(input,length):
-        with tf.variable_scope("BiLSTM"):
-            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
-            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
-        output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell,backward_cell,input,dtype=tf.float32,sequence_length=length)
-        output = tf.concat(output,2)
-        return output
-
-    def CRF_layer(input,num_tags,BiRNN_Units,time_step):
-        with tf.variable_scope("CRF"):
-            with tf.variable_scope("hidden"):
-                w_hidden = tf.get_variable(name='w_hidden',shape=(BiRNN_Units,BiRNN_Units//2),dtype=tf.float32,
-                                           initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
-                b_hidden = tf.get_variable(name='b_hidden',shape=(BiRNN_Units//2),dtype=tf.float32,initializer=tf.zeros_initializer())
-                # print(input)
-                input_reshape = tf.reshape(input,shape=(-1,BiRNN_Units))
-                hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape,w_hidden,b_hidden))
-            with tf.variable_scope("output"):
-                w_output = tf.get_variable(name='w_output',shape=(BiRNN_Units//2,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
-                b_output = tf.get_variable(name='b_output',shape=(num_tags),dtype=tf.float32,initializer=tf.zeros_initializer())
-                pred = tf.nn.xw_plus_b(hidden,w_output,b_output)
-                logits_ = tf.reshape(pred,shape=(-1,time_step,num_tags),name='logits')
-        return logits_
-
-    def layer_loss(input,true_target,num_tags,length):
-        with tf.variable_scope("crf_loss"):
-            trans = tf.get_variable(name='transitons',shape=(num_tags,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer())
-            log_likelihood,trans = crf_log_likelihood(inputs=input,tag_indices=true_target,transition_params=trans,sequence_lengths=length)
-            return tf.reduce_mean(-log_likelihood),trans
-
-    with sess.graph.as_default():
-        char_input = tf.placeholder(name='char_input',shape=(None,None),dtype=tf.int32)
-        target = tf.placeholder(name='target',shape=(None,None),dtype=tf.int32)
-        length = tf.placeholder(name='length',shape=(None,),dtype=tf.int32)
-        # keepprob = tf.placeholder(name='keepprob',dtype=tf.float32)
-
-        _embedding = embedding_layer(char_input)
-        _shape = tf.shape(char_input)
-        batch_size = _shape[0]
-        step_size = _shape[-1]
-        bilstm = BiLSTM_Layer(_embedding,length)
-        _logits = CRF_layer(bilstm,num_tags=len(chunk_tags),BiRNN_Units=BiRNN_Units,time_step=step_size)
-        crf_loss,trans = layer_loss(_logits,true_target=target,num_tags=len(chunk_tags),length=length)
-        global_step = tf.Variable(0,trainable=False)
-        with tf.variable_scope("optimizer"):
-            opt = tf.train.AdamOptimizer(0.002)
-            grads_vars = opt.compute_gradients(crf_loss)
-            capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]
-            train_op = opt.apply_gradients(capped_grads_vars,global_step)
-            print('tensor: ',char_input, length, trans, _logits)
-            return char_input,_logits,target,length,crf_loss,trans,train_op
-
-def decode(logits, trans, sequence_lengths, tag_num):
-    viterbi_sequences = []
-    for logit, length in zip(logits, sequence_lengths):
-        score = logit[:length]
-        viterbi_seq, viterbi_score = viterbi_decode(score, trans)
-        viterbi_sequences.append(viterbi_seq)
-    return viterbi_sequences
-
-class Punish_Extract():
-    def __init__(self, model_file = os.path.dirname(__file__)+"/models/21-0.9990081295021194-0.3647936/model.ckpt"):
-        print('model_file_path:',model_file)
-        self.sess = tf.Session(graph=tf.Graph())
-        self.code = ""
-        self.punish_dicition = ""
-        self.model_file = model_file #预测编号模型
-        self.load_model()
-
-    # 加载处罚编号预测模型
-    def load_model(self):
-        with self.sess.as_default() as sess:
-            with sess.graph.as_default():
-                vocab_model = getModel_word()
-                vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
-                self.char_input, self.logits, self.target, self.length, self.crf_loss, self.trans, self.train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
-                sess.run(tf.global_variables_initializer())
-                saver = tf.train.Saver()
-                saver.restore(sess, self.model_file)
-
-    # 处罚编号预测
-    def predict_punishCode(self,list_sentences):
-        re_ner = re.compile("12+?3")
-        article_ner_list = []
-        count = 0
-        with self.sess.as_default():
-            with self.sess.graph.as_default():
-                for sentences in list_sentences:
-                    count += 1
-                    # print(count)
-                    sentence_len = [len(sentence.sentence_text) for sentence in sentences]
-                    maxlen = max(sentence_len)
-                    sentences_x = []
-                    for sentence in sentences:
-                        sentence = sentence.sentence_text
-                        sentence = list(sentence)
-                        sentence2id = [getIndexOfWord(word) for word in sentence]
-                        sentences_x.append(sentence2id)
-                    sentences_x = pad_sequences(sentences_x, maxlen=maxlen, padding="post", truncating="post")
-                    sentences_x = [np.array(x) for x in sentences_x]
-                    print('punish tensor: ',self.logits, self.trans, self.char_input, self.length)
-                    _logits, _trans = self.sess.run([self.logits, self.trans],
-                                               feed_dict={self.char_input: np.array(sentences_x), self.length: sentence_len})
-                    viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=4)
-
-                    ner_list = []
-                    for _seq, sentence in zip(viterbi_sequence, sentences):
-                        sentence = sentence.sentence_text
-                        seq_id = ''.join([str(s) for s in _seq])
-                        if re_ner.search(seq_id):
-                            # print("sentence: ",sentence)
-                            for _ner in re_ner.finditer(seq_id):
-                                start = _ner.start()
-                                end = _ner.end()
-                                n = sentence[start:end]
-                                # print(n,'<==>',start,end)
-                                # ner_list.append((n, start, end))
-                                ner_list.append(n)  # 改为只返回实体字符
-                    # article_ner_list.append(ner_list)
-                    article_ner_list.append(';'.join(set(ner_list)))
-        return article_ner_list[0]
-
-    # 处罚类型
-    def get_punishType(self, x1, x2):
-        '''通过文章标题及内容判断文章类别
-        x1: 标题
-        x2: 内容
-        return 类别'''
-        # x1 = x1.replace('(','(').replace(')', ')').replace(' ','')
-        # x2 = x2.replace('(', '(').replace(')', ')').replace(' ', '')
-        '''标题正则'''
-        # 未知公告
-        unknow = re.compile('采购方式|采购公告|磋商公告|谈判公告|交易公告$|征集|征求|招标公告|竞标公告|中标公告|'
-                            '成交公告|成交信息|流标公告|废标公告|城市管理考评|决算表|决算|预算|资格考试|招聘|选聘'
-                            '|聘请|拟录用|无违规违法|无此项信息|暂无工程投标违法|管理办法|指导意见|无投诉|投诉办法'
-                            '公共资源交易情况|绩效评价|考试成绩|付息公告|不动产|办证|印发|转发')  #|结果公示 部分是
-        # 投诉处理
-        tscl = re.compile('投诉不予[处受]理|投诉不成立|终止投诉|投诉终止|不予受理|投诉事?项?的?处理')
-        # 行政处罚
-        xzcf = re.compile('行政处罚|行政处理|政处罚|行政裁决|防罚|公罚|医罚|环罚|政罚|文罚|局罚|旅罚|财罚|运罚')
-        # 监督检查
-        jdjc = re.compile('(监督检查的?问?题?(处理|整改|记分|结果|决定|处罚))|监督处罚|调查处理|监督处理')
-        # 严重违法
-        yzwf = re.compile('严重违法失信|黑名单|失信名单')
-        # 不良行为
-        blxw = re.compile('((不良|失信|不诚信|差错|不规范|违规|违约|处罚|违法)(行为|记录|信息))|((违约|违规|违法)(处理|操作|情况|问题))'
-                          '|通报批评|记分管理|迟到|早退|缺席|虚假材料|弄虚作假|履职不到位|诚信考核扣分|串通投标'
-                          '|审核不通过|码一致|地址一致|扣分处理|扣分通知|扣[0-9]+分|责令整改|信用信息认定书$'
-                          '|关于.{,30}的处罚|关于.{,10}的?考评通报|关于.{,30}扣分情况|不规范代理行为'
-                          '|(取消|暂停|限制).{,50}((专家|评标|评委|投标|竞价|被抽取|中标|供应商|候选人)资格)'
-                          '|(代理服?务?机构).{,10}(扣分)|(专家).{,30}(扣分|记分|处罚)|对.{,30}处理|冻结.{,30}账号')
-        # 其他不良行为
-        other = re.compile('质疑|代理机构进场交易情况|网上投诉办理|信用奖惩|信用奖罚|进场工作.{,5}考核'
-                           '|举报处理|结果无效|成交无效|行政复议')
-
-        '''正文内容正则'''
-        # 投诉处理
-        tscl_c = re.compile('(投诉(人|单位)[1-9]?(名称)?[::])|(投诉事项[1-5一二三四五、]*部?分?(成立|予以受理))'
-                            '|((驳回|撤回|撤销|终止)[^,。]{,60}(投诉|质疑))')
-        # 行政处罚
-        xzcf_c = re.compile('((处理依据及结果|处理结果|处罚结果)).*行政处罚|如下行政处罚|行政处罚决定')
-        # 诚信加分
-        cxjf_c = re.compile('处罚结果.*诚信加分')
-        # 严重违法失信
-        yzwf_c = re.compile('工商部门严重违法失信起名单|严重违法失信的具体情形') #|严重违法失信的具体情形
-        # 不良行为
-        blxw_c = re.compile('(取消|暂停|限制).{,30}((专家|评标|评委|投标|采购|竞价|被抽取|中标|供应商)的?资格)'
-                            '|(处罚结果|处罚情况).*(扣[1-9]*分|记分|不良行为|不良记录|不良信用|不诚信|扣除信用'
-                            '|诚信档案|信用信息|取消.*资格|口头警告|处罚机关|责令改正|罚款|限制投标|暂扣|禁止'
-                            '|暂停|封禁|暂无|行政处罚)|处罚结果'
-                            '|处罚主题|禁止参与.{,10}政府采购活动|列入不良行为|处罚如下|如下处罚|违规处罚|处罚违规'
-                            '|责令改正|责令整改|处罚依据|进行以下处理|处理依据及结果|处理结果|处罚决定书|'
-                            '(不规范|不良|不诚信)行为记录')
-        # 其他不良行为
-        other_c = re.compile('质疑(人|单位)[1-9]?(名称)?:|公告期内受质疑')
-
-        if re.search(unknow, x1):
-            return re.search(unknow, x1).group(0), '未知类别'
-        elif re.search(yzwf, x1):
-            return re.search(yzwf, x1).group(0), '严重违法'
-        elif re.search(yzwf_c, x2):
-            return re.search(yzwf_c, x2).group(0), '严重违法'
-
-        elif re.search(tscl, x1):
-            return re.search(tscl, x1).group(0), '投诉处理'
-        elif re.search(xzcf, x1):
-            return re.search(xzcf, x1).group(0), '行政处罚'
-        elif re.search(jdjc, x1):
-            return re.search(jdjc, x1).group(0), '监督检查'
-        elif re.search(blxw, x1):
-            return re.search(blxw, x1).group(0), '不良行为'
-        elif re.search(other, x1):
-            return re.search(other, x1).group(0), '其他不良行为'
-
-        elif re.search(tscl_c, x2):
-            return re.search(tscl_c, x2).group(0), '投诉处理'
-        elif re.search(xzcf_c, x2):
-            return re.search(xzcf_c, x2).group(0), '行政处罚'
-        elif re.search(cxjf_c, x2):
-            return re.search(cxjf_c, x2).group(0), '诚信加分'
-
-        elif re.search(blxw_c, x2):
-            return re.search(blxw_c, x2).group(0), '不良行为'
-        elif re.search(other_c, x2):
-            return re.search(other_c, x2).group(0), '其他不良行为'
-
-        return ' ', '未知类别'
-
-    # 处罚决定
-    def get_punishDecision(self, x, x2):
-        '''通过正则匹配文章内容中的处理决定
-        x:正文内容
-        x2: 处罚类别
-        return 处理决定字符串'''
-        rule1 = re.compile(
-            '(((如下|以下|处理|研究|本机关|我机关|本局|我局)决定)|((决定|处理|处理意见|行政处罚|处罚)(如下|如下))'
-            '|((以下|如下)(决定|处理|处理意见|行政处罚|处罚))|处理依据及结果|处理结果|处罚结果|处罚情况|限制行为'
-            '|整改意见)[::].{5,}')
-        rule2 = re.compile(
-            '(((如下|以下|处理|研究|本机关|我机关|本局|我局)决定)|((决定|处理|处罚|处理意见)(如下|如下))'
-            '|((以下|如下)(决定|处理|处理意见|处罚))|处理依据及结果|处理结果|处罚结果|处罚情况|限制行为'
-            '|处罚内容)[:,,].{10,}')
-        rule3 = re.compile('考评结果:?.*')
-        rule4 = re.compile('(依据|根据)《.*》.*')
-        if x2 == '未知类别':
-            return ' '
-        elif re.search(rule1, x[-int(len(x)*0.4):]):
-            return re.search(rule1, x[-int(len(x)*0.4):]).group(0)
-        elif re.search(rule1, x[-int(len(x)*0.6):]):
-            return re.search(rule1, x[-int(len(x)*0.6):]).group(0)
-        elif re.search(rule2, x[-int(len(x)*0.7):]):
-            return re.search(rule2, x[-int(len(x)*0.7):]).group(0)
-        elif re.search(rule3, x[-int(len(x)*0.6):]):
-            return re.search(rule3, x[-int(len(x)*0.6):]).group(0)
-        elif re.search(rule4, x[-int(len(x)*0.4):]):
-            return re.search(rule4, x[-int(len(x)*0.4):]).group(0)
-        else:
-            return ''
-
-    # 投诉是否成立
-    def get_punishWhether(self, x1, x2, x3):
-        '''通过正则匹配处理决定判断投诉是否成立
-        x1: 处理决定字符串
-        x2: 正文内容
-        x3: 处罚类别
-        return 投诉是否成立'''
-        p1 = re.compile('(投诉|投拆|质疑|举报)(事项|内容|事实)?[^不,。]{,10}(成立|属实|予以受理|予以支持)|责令|废标|(中标|成交)[^,。]{,10}无效'
-                        '|取消[^,。]{,60}资格|罚款|重新(组织|开展)?(招标|采购)|投诉成立|被投诉人存在违法违规行为'
-                        '|采购活动违法|(中标|评标|成交)结果无效')
-        p2 = re.compile('投诉不予[处受]理|((投诉|投拆|质疑|举报)(事项|内容|事实)?[^,。]{,10}(不成立|情?况?不属实|不予支持|缺乏事实依据))'
-                        '|((驳回|撤回|撤销|终止)[^,。]*(投诉|质疑|诉求))|终止[^,。]{,20}(行政裁决|投诉处理|采购活动)|投诉终止|投诉无效'
-                        '|予以驳回|不予受理|继续开展采购|被投诉人不存在违法违规行为|中标结果有效|投诉[^,。]{,10}不成立'
-                        '|维持被投诉人|不支持[^,。]{,20}投诉|无确凿证据')
-        if x3 != '投诉处理':
-            return ''
-        elif re.search(p1, x1):
-            return '投诉成立'
-        elif re.search(p2, x1):
-            return '投诉无效'
-        elif re.search(p1, x2):
-            return '投诉成立'
-        elif re.search(p2, x2):
-            return '投诉无效'
-        return ''
-
-    # 执法机构、处罚时间
-    def get_institution(self, title, sentences_l, entity_l):
-        '''
-        通过判断实体前信息判断改实体是否为执法机构
-        :param title: 文章标题
-        :param sentences_l: 单篇公告句子列表
-        :param entity_l: 单篇公告实体列表
-        :return: 执法机构及处罚时间字符串,多个的用;号隔开
-        '''
-        institutions = []
-        punishTimes = []
-        institution_1 = re.compile("(?:处罚执行部门|认定部门|执法机关名称|执法单位|通报部门|处罚机关|处罚部门)[::]")
-        punishTimes_1 = re.compile("(?:处罚日期|限制行为开始时间|曝光开始日期|处罚决定日期|处罚期限|处罚时间|处理日期|公告开始时间)[::]")
-        # 通过实体前面关键词判断是否为执法机构或处罚时间
-        for ner in entity_l:
-            if ner.entity_type == 'org':
-                left = sentences_l[ner.sentence_index].sentence_text[
-                       max(0, ner.wordOffset_begin - 15):ner.wordOffset_begin]
-                if institution_1.search(left):
-                    institutions.append(ner)
-                elif institutions != [] and ner.sentence_index == institutions[-1].sentence_index and \
-                        ner.wordOffset_begin - institutions[-1].wordOffset_end < 2 and \
-                        sentences_l[ner.sentence_index].sentence_text[
-                        ner.wordOffset_begin:institutions[-1].wordOffset_end] \
-                        in ['', '、', '和', '及']:
-                    institutions.append(ner)
-            elif ner.entity_type == 'time':
-                left = sentences_l[ner.sentence_index].sentence_text[
-                       max(0, ner.wordOffset_begin - 15):ner.wordOffset_begin]
-                if punishTimes_1.search(left):
-                    punishTimes.append(ner)
-
-        institution_title = re.compile("财政局|财政厅|监督管理局|公管局|公共资源局|委员会")
-        institution_time = re.compile(
-            "(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
-        ins = ""
-        ptime = ""
-        # 如果前面步骤找不到处罚机构则在标题找实体,并正则检查是否有关键词
-        if institutions == [] and len(title)>10:
-            title_ners = getNers([title], useselffool=True)
-            if title_ners[0]:
-                for title_ner in title_ners[0]:
-                    if title_ner[2] == 'org' and institution_title.search(title_ner[3]):
-                        ins = title_ner[3]
-                        break
-        if punishTimes == [] or institutions == []:
-            # 如果前面步骤还没找到要素,则通过公司实体后面是否有日期关键词,有则作为处罚机构和处罚时间
-            for ner in [ner for ner in entity_l if ner.entity_type == 'org'][-5:][::-1]:
-                right = sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_end:ner.wordOffset_end + 16]
-                if institution_time.search(right):
-                    if ins == '':
-                        ins = ner.entity_text
-                    if ptime == '':
-                        ptime = institution_time.search(right).group(1)
-                    break
-            # 前面步骤都没找到则判断最后一个时间实体是否在文章末尾,是则作为处罚时间
-            if ptime == '':
-                n_time = [ner for ner in entity_l if ner.entity_type == 'time']
-                if len(n_time) != 0:
-                    ner = n_time[-1]
-                    if ner.sentence_index == len(sentences_l) - 1:
-                        textLong = len(sentences_l[ner.sentence_index].sentence_text)
-                        if ner.wordOffset_end > textLong - 3 and len(ner.entity_text) > 3:
-                            ptime = ner.entity_text
-        institutions = [ner.entity_text for ner in institutions]
-        punishTimes = [ner.entity_text for ner in punishTimes]
-        if institutions == [] and ins != "":
-            institutions.append(ins)
-        if punishTimes == [] and ptime != "":
-            punishTimes.append(ptime)
-        return ";".join(institutions), ";".join(punishTimes)
-
-    # 投诉人、被投诉人、被处罚人
-    def get_complainant(self, punishType, sentences_l, entity_l):
-        '''
-        通过对公告类别、句子列表、实体列表正则寻找投诉人、被投诉人、处罚人
-        :param punishType: 公告处罚类别
-        :param sentences_l: 单篇公告句子列表
-        :param entity_l: 单篇公告实体列表
-        :return: 投诉人、被投诉人
-        '''
-        complainants = []  # 投诉人
-        punishPeople = []  # 被投诉人、被处罚人
-        size = 16
-        # 投诉人、质疑人
-        complainants_rule1 = re.compile(
-            "(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
-        # 被处罚人,被投诉人
-        punishPeople_rule1 = re.compile(
-            "(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
-        punishPeople_rule2_1 = re.compile(",$")
-        punishPeople_rule2_2 = re.compile("^[::]")
-        punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
-        punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
-
-        punish_l = []  # 处罚实体列表
-        tmp = []
-        for ner in [ner for ner in entity_l if ner.entity_type in ['org', 'company', 'person']]:
-            if tmp == []:
-                tmp.append(ner)
-            elif ner.entity_type == tmp[-1].entity_type and ner.sentence_index == tmp[-1].sentence_index and \
-                    ner.wordOffset_begin - tmp[-1].wordOffset_end < 2 \
-                    and sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_begin:tmp[-1].wordOffset_end] in [
-                '',
-                '、',
-                '和',
-                '及']:
-                tmp.append(ner)
-            elif ner.entity_type in ['org', 'company'] and tmp[-1].entity_type in ['org', 'company'] and \
-                    ner.sentence_index == tmp[-1].sentence_index and ner.wordOffset_begin - tmp[-1].wordOffset_end < 2 \
-                    and sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_begin:tmp[-1].wordOffset_end] in [
-                '',
-                '、',
-                '和',
-                '及']:
-                tmp.append(ner)
-            else:
-                punish_l.append(tmp)
-                tmp = [ner]
-        for ner_l in punish_l:
-            begin_index = ner_l[0].wordOffset_begin
-            end_index = ner_l[-1].wordOffset_end
-            left = sentences_l[ner_l[0].sentence_index].sentence_text[max(0, begin_index - size):begin_index]
-            right = sentences_l[ner_l[0].sentence_index].sentence_text[end_index:end_index + size]
-            if complainants_rule1.search(left):
-                complainants.append(ner_l)
-            elif punishPeople_rule1.search(left):
-                punishPeople.append(ner_l)
-            elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
-                if punishType == '投诉处理':
-                    complainants.append(ner_l)
-                else:
-                    punishPeople.append(ner_l)
-            elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
-                punishPeople.append(ner_l)
-        complainants = set([it.entity_text for l in complainants for it in l])
-        punishPeople = set([it.entity_text for l in punishPeople for it in l])
-        return ';'.join(complainants), ';'.join(punishPeople)
-
-    def get_punish_extracts_backup(self, doc_id=' ', title=' ', text=' '):
-        list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", ""]],
-                                                                                        useselffool=True)
-        punish_code = punish.predict_punishCode(list_sentences)
-        # print('处罚编号: ',punish_code)
-        institutions, punishTimes = punish.get_institution(title, list_sentences[0], list_entitys[0])
-        # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
-        keyword, punishType = punish.get_punishType(title, text)
-        # print('处罚类型:',punishType)
-        punishDecision = punish.get_punishDecision(text, punishType)
-        # print('处罚决定:',punishDecision)
-        punishWhether= punish.get_punishWhether(punishDecision, text, punishType)
-        # print('投诉是否成立:',punishWhether)
-        complainants, punishPeople = punish.get_complainant(punishType, list_sentences[0], list_entitys[0])
-        # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
-        punish_dic = {'punish_code':punish_code,
-                      'punishType':punishType,
-                      'punishDecision':punishDecision,
-                     'complainants':complainants,
-                     'punishPeople':punishPeople,
-                     'punishWhether':punishWhether,
-                     'institutions':institutions,
-                     'punishTimes':punishTimes}
-        return punish_dic
-        # return punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether,institutions, punishTimes
-
-    def get_punish_extracts(self,list_articles,list_sentences, list_entitys):
-        list_result = []
-        for article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
-            title = article.title
-            text=article.content
-
-            keyword, punishType = self.get_punishType(title, text)
-            # print('处罚类型:',punishType)
-            punish_code = self.predict_punishCode(list_sentences)
-            # print('处罚编号: ',punish_code)
-            institutions, punishTimes = self.get_institution(title, list_sentence, list_entity)
-            # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
-            punishDecision = self.get_punishDecision(text, punishType)
-            # print('处罚决定:',punishDecision)
-            punishWhether= self.get_punishWhether(punishDecision, text, punishType)
-            # print('投诉是否成立:',punishWhether)
-            complainants, punishPeople = self.get_complainant(punishType, list_sentence, list_entity)
-            # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
-            punish_dic = {'punish_code':punish_code,
-                          'punishType':punishType,
-                          'punishDecision':punishDecision,
-                         'complainants':complainants,
-                         'punishPeople':punishPeople,
-                         'punishWhether':punishWhether,
-                         'institutions':institutions,
-                         'punishTimes':punishTimes}
-            _count = 0
-            for k,v in punish_dic.items():
-                if v!="":
-                    _count += 1
-            if _count>=2 and punish_dic["punishType"]!="未知类别":
-                list_result.append({"punish":punish_dic})
-            else:
-                list_result.append({"punish":{}})
-        return list_result
-
-def save_punish_code_model():
-    model_folder = os.path.dirname(__file__) + "/models/21-0.9990081295021194-0.3647936"
-    output_graph = os.path.dirname(__file__) + "/models/punish_code.pb"
-    ckpt = tf.train.get_checkpoint_state(model_folder)
-    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
-        input_checkpoint = ckpt.model_checkpoint_path
-        saver = tf.train.import_meta_graph(input_checkpoint+".meta", clear_devices=True)
-        graph = tf.get_default_graph()
-        input_graph_def = graph.as_graph_def()
-        with tf.Session() as sess:
-            saver.restore(sess, input_checkpoint)
-            output_graph_def = graph_util.convert_variables_to_constants(
-                sess = sess,
-                input_graph_def = input_graph_def,
-                output_node_names=["char_input","length","crf_loss/transitons","CRF/output/logits"]
-            )
-            with tf.gfile.GFile(output_graph, "wb") as f:
-                f.write(output_graph_def.SerializeToString())
-
-
-if __name__ == "__main__":
-    save_punish_code_model()
-    # punish = Punish_Extract(model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt")
-    #
-    # import pandas as pd
-    # # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
-    # df = pd.read_excel('G:/失信数据/ALLDATA_re2-3.xlsx', index=0)[2:10]
-    # # i = 89
-    # # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
-    # # i = 92
-    # # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
-    #
-    # # t1 = time.time()
-    # # for i in df.index:
-    # #     punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
-    # #         get_punish_extracts(i, df.loc[i, 'PAGE_TITLE'], df.loc[i, 'PAGE_CONTENT'])
-    # #     df.loc[i, '投诉人'] = complainants
-    # #     df.loc[i, '被投诉人'] = punishPeople
-    # #     df.loc[i, '执法机构'] = institutions
-    # #     df.loc[i, '处罚时间'] = punishTimes
-    # #     df.loc[i, '处罚编号'] = punish_code
-    # #     print('完成第%d篇'%i)
-    # # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=[['PAGE_TITLE', 'PAGE_CONTENT',
-    # # #     '关键词', '类别', '处理决定', '投诉是否成立',
-    # # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', 'punishPeople',
-    # # #    'institution', 'punishTime', 'ner_test']])
-    # # t2 = time.time()
-    # # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
-    # # #     '关键词', '类别', '处理决定', '投诉是否成立',
-    # # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', '投诉人', 'punishPeople', '被投诉人',
-    # # #    'institution', '执法机构', 'punishTime', '处罚时间', 'ner_test', '处罚编号'])
-    # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
-    # #     '关键词', '类别', '处理决定', '投诉是否成立', '投诉人', '被投诉人','执法机构', '处罚时间', '处罚编号',
-    # #    'DETAILLINK', 'sentences', 'PAGE_TIME'])
-    # # t3 = time.time()
-    # # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
-    # s = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
-    # # list_sentences = [s.split('。')]
-    # # punish_code= punish.predict_punishCode( list_sentences)
-    # # print(punish_code)
-    #
-    # # punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
-    # #             get_punish_extracts(text=s)
-    # punish_dic = punish.get_punish_extracts_backup(text=s)
-    # print(punish_dic)

+ 0 - 0
BiddingKG/dl/complaint/test/__init__.py


+ 1 - 1
BiddingKG/dl/complaint/test1.py → BiddingKG/dl/complaint/test/test1.py

@@ -1,6 +1,6 @@
 import sys
 import os
-sys.path.append(os.path.abspath("../.."))
+sys.path.append(os.path.abspath("../../.."))
 import pandas as pd
 import re
 from BiddingKG.dl.common.Utils import *

+ 3 - 3
BiddingKG/dl/foolnltk/bi_lstm_crf.py

@@ -3,9 +3,9 @@
 
 
 import tensorflow as tf
-from tensorflow.contrib import rnn
-from tensorflow.contrib.crf import crf_log_likelihood
-from tensorflow.contrib.layers.python.layers import initializers
+# from tensorflow.contrib import rnn
+# from tensorflow.contrib.crf import crf_log_likelihood
+# from tensorflow.contrib.layers.python.layers import initializers
 import numpy as np
 from BiddingKG.dl.common.Utils import viterbi_decode
 from zipfile import ZipFile

+ 2 - 2
BiddingKG/dl/foolnltk/retrain.py

@@ -10,7 +10,7 @@ import sys
 sys.path.append(os.path.abspath("../../"))
 
 import tensorflow as tf
-from tensorflow.contrib.crf import crf_log_likelihood
+# from tensorflow.contrib.crf import crf_log_likelihood
 import json
 import numpy as np
 from BiddingKG.dl.foolnltk.label import labelEntity,readlabeldata,getContext
@@ -66,7 +66,7 @@ def getAcc(y_batch,logits,trans,lengths):
         pad = small * np.ones([length, 1])
         logit = np.concatenate([score, pad], axis=1)
         logit = np.concatenate([start, logit], axis=0)
-        path, _ = tf.contrib.crf.viterbi_decode(logit, trans)
+        path, _ = viterbi_decode(logit, trans)
         preds += path[1:]
         index += 1
 

+ 2 - 105
BiddingKG/dl/interface/Preprocessing.py

@@ -11,8 +11,8 @@ sys.path.append(os.path.abspath("../.."))
 sys.path.append(os.path.abspath(".."))
 from BiddingKG.dl.common.Utils import *
 from BiddingKG.dl.interface.Entitys import *
-from BiddingKG.dl.interface.predictor import *
-from BiddingKG.dl.foolnltk import selffool
+from BiddingKG.dl.interface.predictor import getPredictor
+from BiddingKG.dl.common.nerUtils import *
 from BiddingKG.dl.money.moneySource.ruleExtra import extract_moneySource
 from BiddingKG.dl.time.re_servicetime import extract_servicetime
 from BiddingKG.dl.bidway.re_bidway import extract_bidway
@@ -1210,109 +1210,6 @@ def union_ner(list_ner):
         result_list.append((list_ner[item[0]][0],list_ner[item[1]][1],'company',str(list_ner[item[0]][3])+str(list_ner[item[1]][3])))
     return result_list
                 
-                
-def getTokensAndNers(sentences,MAXAREA = 10000,useselffool=False):
-    '''
-    @param: sentences:句子数
-    @return 限流执行后的分词和实体识别list
-    '''
-    def getData(tokens,ners,process_data):
-        process_sentences = [item[1] for item in process_data]
-        
-        token_ = selffool.cut(process_sentences)
-        if useselffool:
-            ner_ = selffool.self_ner(process_sentences)
-        else:
-            ner_ = selffool.ner(process_sentences)
-        for i in range(len(token_)):
-            the_index = process_data[i][0]
-            tokens[the_index] = token_[i]
-            ners[the_index] = ner_[i]
-    sents = []
-    for i in range(len(sentences)):
-        sents.append([i,sentences[i]])
-    sents.sort(key=lambda x:len(x[1]),reverse=True)
-    index_ = 0
-    tokens = [[]for i in range(len(sentences))]
-    ners = [[]for i in range(len(sentences))]
-    
-    while(True):
-        width = len(sents[index_][1])
-        height = MAXAREA//width+1
-        if height>len(sents)-index_:
-            height = len(sents)-index_
-        process_data = sents[index_:index_+height]
-        getData(tokens, ners, process_data)
-        index_ += height
-        if index_>=len(sents):
-            break
-    return tokens,ners
-
-def getTokens(sentences,MAXAREA = 10000,useselffool=True):
-    '''
-     @param: sentences:句子数
-     @return 限流执行后的分词list
-     '''
-    def getData(tokens,process_data):
-        process_sentences = [item[1] for item in process_data]
-
-        token_ = selffool.cut(process_sentences)
-        for i in range(len(token_)):
-            the_index = process_data[i][0]
-            tokens[the_index] = token_[i]
-    sents = []
-    for i in range(len(sentences)):
-        sents.append([i,sentences[i]])
-    sents.sort(key=lambda x:len(x[1]),reverse=True)
-    index_ = 0
-    tokens = [[]for i in range(len(sentences))]
-
-    while(True):
-        width = len(sents[index_][1])
-        height = MAXAREA//width+1
-        if height>len(sents)-index_:
-            height = len(sents)-index_
-        process_data = sents[index_:index_+height]
-        getData(tokens, process_data)
-        index_ += height
-        if index_>=len(sents):
-            break
-    return tokens
-
-def getNers(sentences,MAXAREA = 10000,useselffool=False):
-    '''
-    @param: sentences:句子数
-    @return 限流执行后的实体识别list
-    '''
-    def getData(ners,process_data):
-        process_sentences = [item[1] for item in process_data]
-
-        if useselffool:
-            ner_ = selffool.self_ner(process_sentences)
-        else:
-            ner_ = selffool.ner(process_sentences)
-        for i in range(len(ner_)):
-            the_index = process_data[i][0]
-            ners[the_index] = ner_[i]
-    sents = []
-    for i in range(len(sentences)):
-        sents.append([i,sentences[i]])
-    sents.sort(key=lambda x:len(x[1]),reverse=True)
-    index_ = 0
-    ners = [[]for i in range(len(sentences))]
-
-    while(True):
-        width = len(sents[index_][1])
-        height = MAXAREA//width+1
-        if height>len(sents)-index_:
-            height = len(sents)-index_
-        process_data = sents[index_:index_+height]
-        getData( ners, process_data)
-        index_ += height
-        if index_>=len(sents):
-            break
-    return ners
-    
 
 # def get_preprocessed(articles,useselffool=False):
 #     '''
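Read together with the punish_predictor.py and predictor.py changes in this commit, this hunk is what breaks the circular import named in the commit message: the shared tokenization/NER helpers move into a leaf module, Preprocessing imports only getPredictor instead of the whole predictor namespace, and the punish code no longer imports Preprocessing at all. A rough sketch of the resulting import direction (my reading of the diffs, not an authoritative dependency graph):

# BiddingKG/dl/common/nerUtils.py            -> depends only on selffool
# BiddingKG/dl/interface/Preprocessing.py    -> nerUtils helpers + predictor.getPredictor
# BiddingKG/dl/complaint/punish_predictor.py -> nerUtils + common.Utils, no Preprocessing
# BiddingKG/dl/interface/predictor.py        -> punish_predictor (for Punish_Extract)
#
# Before this commit the punish code wildcard-imported Preprocessing while
# Preprocessing wildcard-imported predictor, so registering Punish_Extract in
# predictor.py would have closed a cycle; after it, every arrow points one way.

from BiddingKG.dl.common.nerUtils import getNers, getTokens   # leaf utilities
from BiddingKG.dl.interface.predictor import getPredictor     # lazy factory, no eager model load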

+ 66 - 20
BiddingKG/dl/interface/extract.py

@@ -25,14 +25,7 @@ import json
 
 
 
-''''''
-codeNamePredict = predictor.CodeNamePredict()
-premPredict = predictor.PREMPredict()
-epcPredict = predictor.EPCPredict()
-roleRulePredict = predictor.RoleRulePredictor()
-timePredict = predictor.TimePredictor()
-punish = punish_rule.Punish_Extract()
-productPredict = predictor.ProductPredictor()
+
 
 #自定义jsonEncoder
 class MyEncoder(json.JSONEncoder):
@@ -49,21 +42,74 @@ class MyEncoder(json.JSONEncoder):
         return json.JSONEncoder.default(self, obj)
 
 def predict(doc_id,text,title=""):
-    list_articles,list_sentences,list_entitys,_ = Preprocessing.get_preprocessed([[doc_id,text,"","",title]],useselffool=True)
-
-    codeName = codeNamePredict.predict(list_sentences,list_entitys=list_entitys)
 
-    premPredict.predict(list_sentences,list_entitys)
-    productPredict.predict(list_sentences,list_entitys)
-
-    roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
-    epcPredict.predict(list_sentences,list_entitys)
-    timePredict.predict(list_sentences, list_entitys)
+    cost_time = dict()
+
+    start_time = time.time()
+    log("start process doc %s"%(str(doc_id)))
+    list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title]],useselffool=True)
+    log("get preprocessed done of doc_id%s"%(doc_id))
+    cost_time["preprocess"] = time.time()-start_time
+    cost_time.update(_cost_time)
+
+    start_time = time.time()
+    codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=5000,list_entitys=list_entitys)
+    log("get codename done of doc_id%s"%(doc_id))
+    cost_time["codename"] = time.time()-start_time
+
+    start_time = time.time()
+    predictor.getPredictor("prem").predict(list_sentences,list_entitys)
+    log("get prem done of doc_id%s"%(doc_id))
+    cost_time["prem"] = time.time()-start_time
+
+    start_time = time.time()
+    predictor.getPredictor("product").predict(list_sentences,list_entitys)
+    log("get product done of doc_id%s"%(doc_id))
+    cost_time["product"] = time.time()-start_time
+
+    start_time = time.time()
+    predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName)
+    cost_time["rule"] = time.time()-start_time
+
+    start_time = time.time()
+    predictor.getPredictor("epc").predict(list_sentences,list_entitys)
+    log("get epc done of doc_id%s"%(doc_id))
+    cost_time["person"] = time.time()-start_time
+
+    start_time = time.time()
+    predictor.getPredictor("time").predict(list_sentences, list_entitys)
+    log("get time done of doc_id%s"%(doc_id))
+    cost_time["time"] = time.time()-start_time
+
+    start_time = time.time()
     entityLink.link_entitys(list_entitys)
     prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
-    list_punish_dic = punish.get_punish_extracts(list_articles,list_sentences, list_entitys)
-
-    return json.dumps(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
+    log("get attributes done of doc_id%s"%(doc_id))
+    cost_time["attrs"] = time.time()-start_time
+
+    start_time = time.time()
+    list_punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
+    cost_time["punish"] = time.time()-start_time
+
+    #print(prem)
+    data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
+    data_res["cost_time"] = cost_time
+    data_res["success"] = True
+
+
+    # codeName = codeNamePredict.predict(list_sentences,list_entitys=list_entitys)
+    #
+    # premPredict.predict(list_sentences,list_entitys)
+    # productPredict.predict(list_sentences,list_entitys)
+    #
+    # roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
+    # epcPredict.predict(list_sentences,list_entitys)
+    # timePredict.predict(list_sentences, list_entitys)
+    # entityLink.link_entitys(list_entitys)
+    # prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
+    # list_punish_dic = punish.get_punish_extracts(list_articles,list_sentences, list_entitys)
+
+    return json.dumps(data_res,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
 
 
 def test(name,content):
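extract.predict() now obtains its predictors on demand through predictor.getPredictor(...) instead of building them all at import time, and it reports per-stage timings plus a success flag in the returned JSON. A hypothetical call (doc id and text are placeholders):

import json
from BiddingKG.dl.interface import extract

result = json.loads(extract.predict("doc-0001", "某某项目招标公告……", title="招标公告"))

print(result["success"])     # True once extraction has completed
print(result["cost_time"])   # seconds per stage: preprocess, codename, prem, product, ...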

+ 11 - 78
BiddingKG/dl/interface/predictor.py

@@ -16,16 +16,19 @@ sys.path.append(os.path.abspath("../.."))
 from BiddingKG.dl.common.Utils import *
 from BiddingKG.dl.interface.modelFactory import *
 import tensorflow as tf
-from tensorflow.python.framework import graph_util
 from BiddingKG.dl.product.data_util import decode, process_data
 from BiddingKG.dl.interface.Entitys import Entity
+from BiddingKG.dl.complaint.punish_predictor import Punish_Extract
 
 from threading import RLock
 dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
               "prem":{"predictor":None,"Lock":RLock()},
               "epc":{"predictor":None,"Lock":RLock()},
               "roleRule":{"predictor":None,"Lock":RLock()},
-                  "form":{"predictor":None,"Lock":RLock()}}
+                  "form":{"predictor":None,"Lock":RLock()},
+                  "time":{"predictor":None,"Lock":RLock()},
+                  "punish":{"predictor":None,"Lock":RLock()},
+                  "product":{"predictor":None,"Lock":RLock()}}
 
 
 def getPredictor(_type):
@@ -42,6 +45,12 @@ def getPredictor(_type):
                     dict_predictor[_type]["predictor"] = RoleRulePredictor()
                 if _type=="form":
                     dict_predictor[_type]["predictor"] = FormPredictor()
+                if _type=="time":
+                    dict_predictor[_type]["predictor"] = TimePredictor()
+                if _type=="punish":
+                    dict_predictor[_type]["predictor"] = Punish_Extract()
+                if _type=="product":
+                    dict_predictor[_type]["predictor"] = ProductPredictor()
             return dict_predictor[_type]["predictor"]
     raise NameError("no this type of predictor")
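The registry above now also covers the time, punish and product predictors, and it builds each one lazily behind its RLock, so heavyweight models such as Punish_Extract are only loaded on first use. A hypothetical usage sketch (the model checkpoints must be on disk for the constructors to succeed):

from BiddingKG.dl.interface import predictor

punish = predictor.getPredictor("punish")          # Punish_Extract is built on the first request
assert punish is predictor.getPredictor("punish")  # later calls reuse the cached instance

try:
    predictor.getPredictor("unknown")
except NameError as err:
    print(err)                                     # "no this type of predictor"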
 
@@ -1322,82 +1331,6 @@ def getBiLSTMCRFModel(MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
     model.compile(optimizer = 'adam', loss = crf.loss_function, metrics = [crf.accuracy])
     return model
 
-from tensorflow.contrib.crf import crf_log_likelihood
-from tensorflow.contrib.layers.python.layers import initializers
-def BiLSTM_CRF_tfmodel(sess,embedding_weights):
-    '''
-    :param embedding_weights: 预训练的字向量矩阵
-
-    '''
-    BiRNN_Unit = 100
-    chunk_tags = {
-        'O': 0,
-        'PN_B': 1,
-        'PN_M': 2,
-        'PN_E': 3,
-        'PC_B': 4,
-        'PC_M': 5,
-        'PC_E': 6,
-    }
-
-    def embedding_layer(input,keepprob):
-        # 加载预训练的字向量矩阵
-        embedding = tf.get_variable(name="embedding",initializer=np.array(embedding_weights, dtype=np.float32),dtype=tf.float32)
-        embedding = tf.nn.embedding_lookup(params=embedding,ids=input)
-        embedding_drop = tf.nn.dropout(embedding,keepprob)
-        return embedding_drop
-
-    def BiLSTM_Layer(input,length):
-        with tf.variable_scope("BiLSTM"):
-            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Unit,state_is_tuple=True)
-            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Unit,state_is_tuple=True)
-        output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell,backward_cell,input,dtype=tf.float32,sequence_length=length)
-        output = tf.concat(output,2)
-        return output
-
-    def CRF_layer(input,num_tags,BiRNN_Unit,time_step,keepprob):
-        with tf.variable_scope("CRF"):
-            with tf.variable_scope("hidden"):
-                w_hidden = tf.get_variable(name='w_hidden',shape=(BiRNN_Unit*2,BiRNN_Unit),dtype=tf.float32,
-                                           initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
-                b_hidden = tf.get_variable(name='b_hidden',shape=(BiRNN_Unit),dtype=tf.float32,initializer=tf.zeros_initializer())
-                # print(input)
-                input_reshape = tf.reshape(input,shape=(-1,BiRNN_Unit*2))
-                hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape,w_hidden,b_hidden))
-                hidden = tf.nn.dropout(hidden,keepprob)
-            with tf.variable_scope("output"):
-                w_output = tf.get_variable(name='w_output',shape=(BiRNN_Unit,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
-                b_output = tf.get_variable(name='b_output',shape=(num_tags),dtype=tf.float32,initializer=tf.zeros_initializer())
-                pred = tf.nn.xw_plus_b(hidden,w_output,b_output)
-                logits_ = tf.reshape(pred,shape=(-1,time_step,num_tags),name='logits')
-        return logits_
-
-    def layer_loss(input,true_target,num_tags,length):
-        with tf.variable_scope("crf_loss"):
-            trans = tf.get_variable(name='transitons',shape=(num_tags,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer())
-            log_likelihood,trans = crf_log_likelihood(inputs=input,tag_indices=true_target,transition_params=trans,sequence_lengths=length)
-            return tf.reduce_mean(-log_likelihood),trans
-
-    with sess.graph.as_default():
-        char_input = tf.placeholder(name='char_input',shape=(None,None),dtype=tf.int32)
-        target = tf.placeholder(name='target',shape=(None,None),dtype=tf.int32)
-        length = tf.placeholder(name='length',shape=(None,),dtype=tf.int32)
-        keepprob = tf.placeholder(name='keepprob',dtype=tf.float32)
-
-        _embedding = embedding_layer(char_input,keepprob)
-        _shape = tf.shape(char_input)
-        batch_size = _shape[0]
-        step_size = _shape[-1]
-        bilstm = BiLSTM_Layer(_embedding,length)
-        _logits = CRF_layer(bilstm,num_tags=len(chunk_tags),BiRNN_Unit=BiRNN_Unit,time_step=step_size,keepprob=keepprob)
-        crf_loss,trans = layer_loss(_logits,true_target=target,num_tags=len(chunk_tags),length=length)
-        global_step = tf.Variable(0,trainable=False)
-        with tf.variable_scope("optimizer"):
-            opt = tf.train.AdamOptimizer(0.002)
-            grads_vars = opt.compute_gradients(crf_loss)
-            capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]
-            train_op = opt.apply_gradients(capped_grads_vars,global_step)
-            return char_input,_logits,target,keepprob,length,crf_loss,trans,train_op
 
 import h5py
 def h5_to_graph(sess,graph,h5file):

+ 1 - 1
BiddingKG/dl/money/moneySource/ruleExtra.py

@@ -153,7 +153,7 @@ def extract_moneySource(text):
         else:
             wordOffset_begin = result['index'] + (result['start']+result['moneySource']).find(entity_text)
             wordOffset_end = wordOffset_begin + len(entity_text)
-            print(entity_text,wordOffset_begin,wordOffset_end)
+            # print(entity_text,wordOffset_begin,wordOffset_end)
             _moneySource = dict()
             _moneySource['body'] = entity_text
             _moneySource['begin_index'] = wordOffset_begin

+ 1 - 2
BiddingKG/dl/product/data_util.py

@@ -6,8 +6,7 @@ import re
 import math
 import random
 import numpy as np
-from tensorflow.contrib.crf import viterbi_decode
-from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_word
+from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_word,viterbi_decode
 
 id_to_tag = {0:'O',1:'B',2:'I',3:'E'}
 word_model = getModel_word()

+ 3 - 3
BiddingKG/dl/product/product_model.py

@@ -6,9 +6,9 @@
 from BiddingKG.dl.product.data_util import matrix,vocab,input_from_line,result_to_json,get_ner
 import tensorflow as tf
 import numpy as np
-from tensorflow.contrib.crf import crf_log_likelihood
-from tensorflow.contrib.crf import viterbi_decode
-from tensorflow.contrib.layers.python.layers import initializers
+# from tensorflow.contrib.crf import crf_log_likelihood
+# from tensorflow.contrib.crf import viterbi_decode
+# from tensorflow.contrib.layers.python.layers import initializers
 
 # word_model = getModel_word()
 class Product_Model(object):

+ 1 - 1
BiddingKG/dl/projectCode/ProjectCodeNameRecognition.py

@@ -74,7 +74,7 @@ def getAcc(y_batch,logits,trans,lengths):
         # logit = np.concatenate([score, pad], axis=1)
         # logit = np.concatenate([start, logit], axis=0)
         # path, _ = tf.contrib.crf.viterbi_decode(logit, trans)
-        path, _ = tf.contrib.crf.viterbi_decode(score, trans)
+        path, _ = viterbi_decode(score, trans)
         preds += path[0:]
         # preds += path[1:]
         index += 1

+ 2 - 2
BiddingKG/dl/projectCode/models.py

@@ -668,8 +668,8 @@ def getBiLSTMCRFModel(MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
     return model
 
 def getBilstmCRF_tf(sess,MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
-    from tensorflow.contrib.layers.python.layers import initializers
-    from tensorflow.contrib.crf import crf_log_likelihood
+    # from tensorflow.contrib.layers.python.layers import initializers
+    # from tensorflow.contrib.crf import crf_log_likelihood
     def layer_embedding(input):
         embedding = tf.get_variable("embedding",initializer=np.array(weights,dtype=np.float32) if weights is not None else None,dtype=tf.float32)
         return tf.nn.embedding_lookup(params=embedding,ids=input)

+ 3 - 3
BiddingKG/dl/projectCode/projectCodeAndName_tf.py

@@ -1,6 +1,6 @@
 import tensorflow as tf
-from tensorflow.contrib.crf import crf_log_likelihood
-from tensorflow.contrib.layers.python.layers import initializers
+# from tensorflow.contrib.crf import crf_log_likelihood
+# from tensorflow.contrib.layers.python.layers import initializers
 import numpy as np
 import pandas as pd
 import os
@@ -432,7 +432,7 @@ def getAcc(y_batch,logits,trans,lengths):
     true_tags = []
     for score, length in zip(logits, lengths):
         score = score[:length]
-        path, _ = tf.contrib.crf.viterbi_decode(score, trans)
+        path, _ = viterbi_decode(score, trans)
         preds += path[0:]
         index += 1
 

+ 3 - 3
BiddingKG/dl/selffool/bi_lstm_crf.py

@@ -3,9 +3,9 @@
 
 
 import tensorflow as tf
-from tensorflow.contrib import rnn
-from tensorflow.contrib.crf import crf_log_likelihood
-from tensorflow.contrib.layers.python.layers import initializers
+# from tensorflow.contrib import rnn
+# from tensorflow.contrib.crf import crf_log_likelihood
+# from tensorflow.contrib.layers.python.layers import initializers
 import numpy as np
 from BiddingKG.dl.common.Utils import viterbi_decode
 from zipfile import ZipFile

+ 1 - 1
BiddingKG/dl/test/test4.py

@@ -35,7 +35,7 @@ def test(name,content):
 if __name__=="__main__":
     # filename = "比地_52_79929693.html"
     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
-    text = codecs.open("C:\\Users\\admin\\Desktop\\新建文本文档 (3).txt","r",encoding="utf8").read()
+    text = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
     content = str(BeautifulSoup(text).find("div",id="pcontent"))
     # df_a = {"html":[]}
     # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))

+ 0 - 324
BiddingKG/maxcompute/evaluates.py

@@ -61,329 +61,6 @@ def init_env(list_files,package_name):
     # sys.path.append(os.path.join(os.path.abspath("local_package"),"interface_real"))
 
 
-
-
-# UDF主程序
-# 由于Series可能在多处调用,所以先在__init__中将其定义为全局类。
-@annotate("string->string")
-class JiebaCut(object):
-
-    def __init__(self):
-        # zip_01 = include_package_path('testB01.zip')
-        # zip_02 = include_package_path('testB02.zip')
-        # self.cat_cmd = "cat %s %s > %s"%(zip_01+"/files/*",zip_02+"/files/*","testH.zip")
-        # import os
-        import sys
-        # os.system(self.cat_cmd)
-        # self.out = str(os.path.getsize("testH.zip"))
-        # # self.out = str(os.listdir(zip_01+"/files/testB01/"))
-        # os.system("mkdir jieba_t")
-        # os.system("unzip testH.zip -d jieba_t")
-        # self.out = str(os.listdir("jieba_t"))
-        # sys.path.append(".")
-        # # sys.path.append(os.path.dirname(os.path.normpath("jieba_test")))
-        # # import jieba_test
-
-        # from jieba_t import cut
-
-        include_package_path("jiebaA.zip")
-        import jieba
-        reload(sys)
-        sys.setdefaultencoding('utf-8')
-        global jieba
-
-    def evaluate(self, x):
-        import os
-        # return self.out
-        # return str(os.listdir("jieba_test"))
-        # return self.cat_cmd
-        return '--'.join(jieba.cut(x))
-
-
-@annotate("string->string")
-class Preprocess(BaseUDTF):
-
-    def __init__(self):
-        # init_env(["gensim_package.zip.env"],"local_package1")
-        import sys
-        import uuid
-        self.out = init_env(["BiddingKG.zip.env"],"local_package")
-        self.out = init_env(["wiki_128_word_embedding_new.vector.env"],".")
-        self.out = include_package_path("envs_py37.env.zip")
-        # self.out = init_env(["envs_py37.zip.env"],"local_package")
-        self.out = init_env(["so.env"],".")
-
-        import BiddingKG.dl.interface.predictor as predictor
-        import BiddingKG.dl.interface.Preprocessing as Preprocessing
-        import BiddingKG.dl.entityLink.entityLink as entityLink
-        import BiddingKG.dl.interface.getAttributes as getAttributes
-        global Preprocessing,entityLink,predictor,uuid,getAttributes
-        # import gensim
-        # include_package_path("numpy.zip")
-        # init_env(["tensorflow-1.14.0-cp37-cp37m-manylinux1_x86_64.whl"])
-        # so_file = get_cache_file("tensorflow-1.14.0-cp37-cp37m-manylinux1_x86_64.whl")
-        # import os
-        # self.out = os.path.abspath(so_file.name)
-        # import tensorflow
-
-    def process(self, x):
-        k = str(uuid.uuid4())
-        list_articles = Preprocessing.get_preprocessed_articles([[k,x,"","_doc_id",""]],useselffool=True)
-        self.forward(list_articles[0].toJson())
-
-        # list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[k,x,"","_doc_id",""]],useselffool=True)
-        #
-        # codeName = predictor.getPredictor("codeName").predict(list_sentences,MAX_AREA=2000,list_entitys=list_entitys)
-        #
-        # predictor.getPredictor("prem").predict(list_sentences,list_entitys)
-        #
-        # predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName)
-        #
-        # predictor.getPredictor("epc").predict(list_sentences,list_entitys)
-        #
-        # entityLink.link_entitys(list_entitys)
-        #
-        # prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
-        # # return str(self.out)
-        # return "1"
-        # list_articles,list_sentences,list_entitys,_ = Preprocessing.get_articles_processed([["doc_id",x,"","",""]],useselffool=True)
-        # if len(list_articles)==1:
-        #     json_article = list_articles[0]
-        # self.forward(list_sentences[0][0].sentence_text)
-
-@annotate("string -> string,string")
-class Preprocess_article(BaseUDTF):
-
-    def __init__(self):
-        # self.out = init_env(["BiddingKG.z01","BiddingKG.z02"],"local_package")
-        self.out = init_env(["BiddingKG.zip.env"],"local_package")
-        self.out = init_env(["wiki_128_word_embedding_new.vector.env"],".")
-        self.out = init_env(["envs_py37.zip.env"],"local_package")
-        self.out = init_env(["so.env"],".")
-        import uuid
-        import BiddingKG.dl.interface.predictor as predictor
-        import BiddingKG.dl.interface.Preprocessing as Preprocessing
-        import BiddingKG.dl.entityLink.entityLink as entityLink
-        import BiddingKG.dl.interface.getAttributes as getAttributes
-        global Preprocessing,entityLink,predictor,uuid,getAttributes
-
-    def process(self, x):
-        if x is not None:
-            k = str(uuid.uuid4())
-            list_articles = Preprocessing.get_preprocessed_article([[k,x,"","_doc_id",""]])
-            self.forward(list_articles[0].id,list_articles[0].toJson())
-
-@annotate("string->string,string")
-class Preprocess_sentences(BaseUDTF):
-
-    def __init__(self):
-        # self.out = init_env(["BiddingKG.z01","BiddingKG.z02"],"local_package")
-        self.out = init_env(["BiddingKG.zip.env"],"local_package")
-        self.out = init_env(["wiki_128_word_embedding_new.vector.env"],".")
-        self.out = init_env(["envs_py37.zip.env"],"local_package")
-        self.out = init_env(["so.env"],".")
-        import BiddingKG.dl.interface.Preprocessing as Preprocessing
-        import BiddingKG.dl.interface.Entitys as Entitys
-        import json
-        global Preprocessing,Entitys,json
-
-    def process(self,x):
-        _article = Entitys.Article.fromJson(x)
-        list_sentences = Preprocessing.get_preprocessed_sentences([_article],True)
-        list_out = []
-        for _sentence in list_sentences[0]:
-            list_out.append(_sentence.toJson())
-        self.forward(_article.id,json.dumps(list_out))
-
-@annotate("string->string,string")
-class Preprocess_entitys(BaseUDTF):
-    def __init__(self):
-        # self.out = init_env(["BiddingKG.z01","BiddingKG.z02"],"local_package")
-        self.out = init_env(["BiddingKG.zip.env"],"local_package")
-        self.out = init_env(["wiki_128_word_embedding_new.vector.env"],".")
-        self.out = init_env(["envs_py37.zip.env"],"local_package")
-        self.out = init_env(["so.env"],".")
-        import BiddingKG.dl.interface.Preprocessing as Preprocessing
-        import BiddingKG.dl.interface.Entitys as Entitys
-        import json
-        global Preprocessing,Entitys,json
-
-    def process(self,x):
-        list_sentence = []
-        for _x in json.loads(x):
-            list_sentence.append(Entitys.Sentences.fromJson(_x))
-        list_out = []
-        list_entitys = Preprocessing.get_preprocessed_entitys([list_sentence],True)
-        for _entity in list_entitys[0]:
-            list_out.append(_entity.toJson())
-        self.forward(list_sentence[0].doc_id,json.dumps(list_out))
-
-@annotate("string->string,string")
-class Predict_codename(BaseUDTF):
-    def __init__(self):
-        # self.out = init_env(["BiddingKG.z01","BiddingKG.z02"],"local_package")
-        self.out = init_env(["BiddingKG.zip.env"],"local_package")
-        self.out = init_env(["wiki_128_word_embedding_new.vector.env"],".")
-        self.out = init_env(["envs_py37.zip.env"],"local_package")
-        self.out = init_env(["so.env"],".")
-        import BiddingKG.dl.interface.predictor as predictor
-        import BiddingKG.dl.interface.Entitys as Entitys
-        import json
-        global predictor,Entitys,json
-
-    def process(self,x):
-        list_sentence = []
-        for _x in json.loads(x):
-            list_sentence.append(Entitys.Sentences.fromJson(_x))
-        codename = predictor.getPredictor("codeName").predict([list_sentence],MAX_AREA=2000)
-        self.forward(codename[0][0],json.dumps(codename[0]))
-
-@annotate("string,string->string,string")
-class Predict_role(BaseUDTF):
-    def __init__(self):
-        # self.out = init_env(["BiddingKG.z01","BiddingKG.z02"],"local_package")
-        self.out = init_env(["BiddingKG.zip.env"],"local_package")
-        self.out = init_env(["wiki_128_word_embedding_new.vector.env"],".")
-        self.out = init_env(["envs_py37.zip.env"],"local_package")
-        self.out = init_env(["so.env"],".")
-        import BiddingKG.dl.interface.predictor as predictor
-        import BiddingKG.dl.interface.Entitys as Entitys
-        import json
-        global predictor,Entitys,json
-
-    def process(self,x,y):
-        list_sentence = []
-        list_entity = []
-        for _x in json.loads(x):
-            list_sentence.append(Entitys.Sentences.fromJson(_x))
-        for _y in json.loads(y):
-            list_entity.append(Entitys.Entity.fromJson(_y))
-        predictor.getPredictor("prem").predict_role([list_sentence],[list_entity])
-        list_out = []
-        for _entity in list_entity:
-            if _entity.label is not None:
-                list_out.append(_entity.toJson())
-        self.forward(list_sentence[0].doc_id,json.dumps(list_out))
-
-@annotate("string,string->string,string")
-class Predict_money(BaseUDTF):
-    def __init__(self):
-        # self.out = init_env(["BiddingKG.z01","BiddingKG.z02"],"local_package")
-        self.out = init_env(["BiddingKG.zip.env"],"local_package")
-        self.out = init_env(["wiki_128_word_embedding_new.vector.env"],".")
-        self.out = init_env(["envs_py37.zip.env"],"local_package")
-        self.out = init_env(["so.env"],".")
-        import BiddingKG.dl.interface.predictor as predictor
-        import BiddingKG.dl.interface.Entitys as Entitys
-        import json
-        global predictor,Entitys,json
-
-    def process(self,x,y):
-        list_sentence = []
-        list_entity = []
-        for _x in json.loads(x):
-            list_sentence.append(Entitys.Sentences.fromJson(_x))
-        for _y in json.loads(y):
-            list_entity.append(Entitys.Entity.fromJson(_y))
-        predictor.getPredictor("prem").predict_money([list_sentence],[list_entity])
-        list_out = []
-        for _entity in list_entity:
-            if _entity.label is not None:
-                list_out.append(_entity.toJson())
-        self.forward(list_sentence[0].doc_id,json.dumps(list_out))
-
-@annotate("string,string->string,string")
-class Predict_person(BaseUDTF):
-    def __init__(self):
-        # self.out = init_env(["BiddingKG.z01","BiddingKG.z02"],"local_package")
-        self.out = init_env(["BiddingKG.zip.env"],"local_package")
-        self.out = init_env(["wiki_128_word_embedding_new.vector.env"],".")
-        self.out = init_env(["envs_py37.zip.env"],"local_package")
-        self.out = init_env(["so.env"],".")
-        import BiddingKG.dl.interface.predictor as predictor
-        import BiddingKG.dl.interface.Entitys as Entitys
-        import json
-        global predictor,Entitys,json
-
-    def process(self,x,y):
-        list_sentence = []
-        list_entity = []
-        for _x in json.loads(x):
-            list_sentence.append(Entitys.Sentences.fromJson(_x))
-        for _y in json.loads(y):
-            list_entity.append(Entitys.Entity.fromJson(_y))
-        predictor.getPredictor("epc").predict_person([list_sentence],[list_entity])
-        list_out = []
-        for _entity in list_entity:
-            if _entity.label is not None:
-                list_out.append(_entity.toJson())
-        self.forward(list_sentence[0].doc_id,json.dumps(list_out))
-
-@annotate("string,string,string,string,string,string,string->string,string,string")
-class ContentUnion(BaseUDTF):
-    def __init__(self):
-
-        # self.out = init_env(["BiddingKG.z01","BiddingKG.z02"],"local_package")
-        self.out = init_env(["BiddingKG.zip.env"],"local_package")
-        self.out = init_env(["wiki_128_word_embedding_new.vector.env"],".")
-        self.out = init_env(["envs_py37.zip.env"],"local_package")
-        self.out = init_env(["so.env"],".")
-        import BiddingKG.dl.interface.predictor as predictor
-        import BiddingKG.dl.interface.Entitys as Entitys
-        import BiddingKG.dl.interface.getAttributes as getAttributes
-        import BiddingKG.dl.entityLink.entityLink as entityLink
-        import BiddingKG.dl.interface.Preprocessing as Preprocessing
-        import json
-
-
-        global predictor,Entitys,getAttributes,entityLink,json,MyEncoder,Preprocessing
-        # Custom JSON encoder
-        class MyEncoder(json.JSONEncoder):
-
-            def __init__(self):
-                import numpy as np
-                global np
-
-            def default(self, obj):
-                if isinstance(obj, np.ndarray):
-                    return obj.tolist()
-                elif isinstance(obj, bytes):
-                    return str(obj, encoding='utf-8')
-                elif isinstance(obj, (np.float_, np.float16, np.float32,
-                                      np.float64)):
-                    return float(obj)
-                return json.JSONEncoder.default(self, obj)
-
-
-
-    def process(self,json_article,list_json_sentence,list_json_entity,list_json_entity_role,list_json_entity_money,list_json_entity_person,json_codename):
-        dict_entity = {}
-        list_sentence = []
-        list_entity = []
-        _article = Entitys.Article.fromJson(json_article)
-        for list_json in [list_json_entity_role,list_json_entity_money,list_json_entity_person]:
-            for _json_entity in json.loads(list_json):
-                _entity = Entitys.Entity.fromJson(_json_entity)
-                _key = "%s-%s-%s"%(str(_entity.doc_id),str(_entity.entity_id),str(_entity.entity_type))
-                dict_entity[_key] = _entity
-        for _json_sentence in json.loads(list_json_sentence):
-            list_sentence.append(Entitys.Sentences.fromJson(_json_sentence))
-        for _json_entity in json.loads(list_json_entity):
-            _entity = Entitys.Entity.fromJson(_json_entity)
-            _key = "%s-%s-%s"%(str(_entity.doc_id),str(_entity.entity_id),str(_entity.entity_type))
-            if _key in dict_entity:
-                list_entity.append(dict_entity[_key])
-            else:
-                list_entity.append(_entity)
-        codeName = json.loads(json_codename)
-        predictor.getPredictor("roleRule").predict([_article],[list_sentence], [list_entity],[codeName])
-        entityLink.link_entitys([list_entity])
-        prem = getAttributes.getPREMs([list_sentence],[list_entity],[_article])
-
-        # result = json.dumps(Preprocessing.union_result([codeName], prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
-        result = json.dumps(Preprocessing.union_result([codeName], prem)[0][1],ensure_ascii=False)
-        self.forward(_article.id,_article.doc_id,result)
-
 @annotate("string,bigint,string,string->string,bigint,string")
 class Extract(BaseUDTF):
 
@@ -422,7 +99,6 @@ class Extract(BaseUDTF):
         import time
 
         from BiddingKG.dl.common.Utils import log
-        logging.info("time7"+str(datetime.datetime.now().strftime('%y-%m-%d %H:%M:%S')))
         import numpy as np
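The large block deleted above (the bulk of the file's 324 removed lines) held the per-stage MaxCompute UDFs/UDTFs (`JiebaCut`, the `Preprocess_*` and `Predict_*` classes, and `ContentUnion`) that shuttled intermediate results between stages as JSON; only the single `Extract` UDTF that runs the whole pipeline in one pass is kept. For orientation, the general shape of such a UDTF — a minimal sketch against the PyODPS UDF API, with hypothetical column types and no real model loading — is:

```python
from odps.udf import annotate, BaseUDTF


@annotate("string,bigint->string,bigint")
class ExtractSketch(BaseUDTF):
    """Minimal UDTF skeleton: one input row in, one output row out."""

    def __init__(self):
        # Heavy set-up (packages, word vectors, models) runs once per worker,
        # which is why the removed classes placed their init_env calls here.
        self.ready = True

    def process(self, content, doc_id):
        # The real Extract class runs the full BiddingKG pipeline here;
        # this sketch just echoes its inputs.
        if content is not None:
            self.forward(content, doc_id)
```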