Add complaint/punishment announcement extraction model: CRF extraction of punishment numbers, plus regex extraction of elements such as category, handling decision and complainant. First commit.

bidi, 4 years ago
parent
commit
5e0d431c28
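A minimal usage sketch of the new interface (based on punish_rule.py in this commit; get_punish_extracts reads the module-level punish instance, so it is driven from that script's own __main__ block):

    punish = Punish_Extract(model_file="models/21-0.9990081295021194-0.3647936/model.ckpt")
    punish_code, punishType, punishDecision, complainants, punishPeople, \
        punishWhether, institutions, punishTimes = get_punish_extracts(
            text='行政处罚厦建招诉决【2019】34号。')
    print(punish_code, punishType, punishDecision)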

+ 6 - 0
BiddingKG/dl/complaint/models/21-0.9990081295021194-0.3647936/checkpoint

@@ -0,0 +1,6 @@
+model_checkpoint_path: "model.ckpt"
+all_model_checkpoint_paths: "..\\9-0.9983888954343817-0.6076048\\model.ckpt"
+all_model_checkpoint_paths: "..\\10-0.9984710946469133-0.58896327\\model.ckpt"
+all_model_checkpoint_paths: "..\\11-0.9986902925469974-0.50287944\\model.ckpt"
+all_model_checkpoint_paths: "..\\16-0.9989259302895879-0.39168403\\model.ckpt"
+all_model_checkpoint_paths: "model.ckpt"

BIN
BiddingKG/dl/complaint/models/21-0.9990081295021194-0.3647936/model.ckpt.data-00000-of-00001


BIN
BiddingKG/dl/complaint/models/21-0.9990081295021194-0.3647936/model.ckpt.index


BIN
BiddingKG/dl/complaint/models/21-0.9990081295021194-0.3647936/model.ckpt.meta


+ 364 - 0
BiddingKG/dl/complaint/punishNo_tf.py

@@ -0,0 +1,364 @@
+import tensorflow as tf
+from tensorflow.contrib.crf import crf_log_likelihood, viterbi_decode
+from tensorflow.contrib.layers.python.layers import initializers
+import numpy as np
+import pandas as pd
+from zipfile import ZipFile
+import os
+import re
+import pickle
+from BiddingKG.dl.common.Utils import *
+from keras.preprocessing.sequence import pad_sequences
+
+# class BiLSTM_CRF_tf(object):
+#     def __init__(self):
+
+def BiLSTM_CRF_tfmodel(sess,weights):
+    BiRNN_Units = 140
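+    # Tag scheme for sequence labelling: PN_B / PN_M / PN_E mark the first, middle and last
+    # characters of a punishment document number (e.g. "厦建招诉决【2019】34号"); O is any other character.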
+    chunk_tags = {
+        'O': 0,
+        'PN_B': 1,
+        'PN_M': 2,
+        'PN_E': 3
+    }
+
+    def embedding_layer(input):
+        embedding = tf.get_variable("embedding",initializer=np.array(weights,dtype=np.float32) if weights is not None else None,dtype=tf.float32)
+        return tf.nn.embedding_lookup(params=embedding,ids=input)
+
+    def BiLSTM_Layer(input,length):
+        with tf.variable_scope("BiLSTM"):
+            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
+            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
+        output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell,backward_cell,input,dtype=tf.float32,sequence_length=length)
+        output = tf.concat(output,2)
+        return output
+
+    def CRF_layer(input,num_tags,BiRNN_Units,time_step):
+        with tf.variable_scope("CRF"):
+            with tf.variable_scope("hidden"):
+                w_hidden = tf.get_variable(name='w_hidden',shape=(BiRNN_Units,BiRNN_Units//2),dtype=tf.float32,
+                                           initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
+                b_hidden = tf.get_variable(name='b_hidden',shape=(BiRNN_Units//2),dtype=tf.float32,initializer=tf.zeros_initializer())
+                # print(input)
+                input_reshape = tf.reshape(input,shape=(-1,BiRNN_Units))
+                hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape,w_hidden,b_hidden))
+            with tf.variable_scope("output"):
+                w_output = tf.get_variable(name='w_output',shape=(BiRNN_Units//2,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
+                b_output = tf.get_variable(name='b_output',shape=(num_tags),dtype=tf.float32,initializer=tf.zeros_initializer())
+                pred = tf.nn.xw_plus_b(hidden,w_output,b_output)
+                logits_ = tf.reshape(pred,shape=(-1,time_step,num_tags),name='logits')
+        return logits_
+
+    def layer_loss(input,true_target,num_tags,length):
+        with tf.variable_scope("crf_loss"):
+            trans = tf.get_variable(name='transitons',shape=(num_tags,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer())
+            log_likelihood,trans = crf_log_likelihood(inputs=input,tag_indices=true_target,transition_params=trans,sequence_lengths=length)
+            return tf.reduce_mean(-log_likelihood),trans
+
+    with sess.graph.as_default():
+        char_input = tf.placeholder(name='char_input',shape=(None,None),dtype=tf.int32)
+        target = tf.placeholder(name='target',shape=(None,None),dtype=tf.int32)
+        length = tf.placeholder(name='length',shape=(None,),dtype=tf.int32)
+        # keepprob = tf.placeholder(name='keepprob',dtype=tf.float32)
+
+        _embedding = embedding_layer(char_input)
+        _shape = tf.shape(char_input)
+        batch_size = _shape[0]
+        step_size = _shape[-1]
+        bilstm = BiLSTM_Layer(_embedding,length)
+        _logits = CRF_layer(bilstm,num_tags=len(chunk_tags),BiRNN_Units=BiRNN_Units,time_step=step_size)
+        crf_loss,trans = layer_loss(_logits,true_target=target,num_tags=len(chunk_tags),length=length)
+        global_step = tf.Variable(0,trainable=False)
+        with tf.variable_scope("optimizer"):
+            opt = tf.train.AdamOptimizer(0.002)
+            grads_vars = opt.compute_gradients(crf_loss)
+            capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]
+            train_op = opt.apply_gradients(capped_grads_vars,global_step)
+            return char_input,_logits,target,length,crf_loss,trans,train_op
+
+def train():
+    vocab_model = getModel_word()
+    vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
+    # print(w2v_matrix)
+    punishNo = {
+        'O': 0,
+        'PN_B': 1,
+        'PN_M': 2,
+        'PN_E': 3
+    }
+    punishNo_2 = {
+        'O': np.array([1, 0, 0, 0]),
+        'PN_B': np.array([0, 1, 0, 0]),
+        'PN_M': np.array([0, 0, 1, 0]),
+        'PN_E': np.array([0, 0, 0, 1])
+    }
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\db_alldata.csv", index_col=0)
+
+    train_data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishment_code_new.csv", index_col=0)
+    train_data['text'] = [data['text'][data['document_id'] == id] for id in train_data['document_id']]
+    data_x = []
+    data_y = []
+
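+    # build per-character O/PN_B/PN_M/PN_E label sequences from the annotated entity spans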
+    articles_label = ['' for _ in range(13500)]
+    punishNo_in_text = set()
+    for textId, begin, end, entity_text, text in zip(train_data['document_id'], train_data['begin_index'],
+                                                     train_data['end_index'],
+                                                     train_data['entity_text'], train_data['text']):
+        punishNo_in_text.add(textId)
+        text = list(text)[0]
+        l = len(text)
+        if not articles_label[textId]:
+            articles_label[textId] = ['O' for _ in range(l)]
+        articles_label[textId][begin] = 'PN_B'
+        articles_label[textId][end - 1] = 'PN_E'
+        for i in range(begin + 1, end - 1):
+            articles_label[textId][i] = 'PN_M'
+    punishNo_in_text = list(punishNo_in_text)
+
+    # take negative samples that contain digit sequences
+    data = data.dropna(subset=['text'])
+    re_rule1 = re.compile('\[|\]')
+    data['sentences'] = [re_rule1.sub('', sentences).split(',') for sentences in data['sentences']]
+    data['sentences'] = [[int(s) for s in sentences] for sentences in data['sentences']]
+    re_rule2 = re.compile("[\d,.]{4,}")
+    for id, article, sentences in zip(data['document_id'], data['text'], data['sentences']):
+        if id < 2826 or id in punishNo_in_text:
+            # print(id)
+            article = str(article)
+            l = len(article)
+            text_word = list(article)
+            text_word_index = [getIndexOfWord(word) for word in text_word]
+            sentence_count = len(sentences)
+            if articles_label[id]:
+                label_list = articles_label[id]
+            else:
+                label_list = ['O' for _ in range(l)]
+            for i in range(sentence_count - 1):
+                if re_rule2.search(article[sentences[i]:sentences[i + 1]]):
+                    data_x.append(np.array(text_word_index[sentences[i]:sentences[i + 1]]))
+                    data_y.append(np.array(label_list[sentences[i]:sentences[i + 1]]))
+
+    data_x = np.array(data_x)
+    x_len = [250 if len(x)>250 else len(x) for x in data_x]
+    data_x = pad_sequences(data_x, maxlen=250, padding="post", truncating="post")
+    # train_x = train_x.reshape(-1)
+    data_y = [np.array([punishNo[_y] for _y in y]) for y in data_y]
+    # data_y = np.array(data_y).reshape(-1)
+    data_y = np.array(data_y)
+    data_y = pad_sequences(data_y, maxlen=250, padding="post", truncating="post")
+    # print(data_x[:5])
+    # print(data_y[:5])
+    # data_x = np.array(list(data_x))
+    # data_y = np.array(list(data_y))
+    indices = np.random.permutation(data_x.shape[0])
+    count = len(data_x)
+    test_count = int(0.2 * count)
+    test_idx, train_idx = indices[:test_count], indices[test_count:]
+    # print(test_idx)
+    train_x, test_x = data_x[train_idx, :], data_x[test_idx, :]
+    train_y, test_y = data_y[train_idx, :], data_y[test_idx, :]
+    train_x_len = np.array([x_len[idx] for idx in train_idx])
+    test_x_len = np.array([x_len[idx] for idx in test_idx])
+
+    with tf.Session(graph=tf.Graph()) as sess:
+        char_input,logits,target,length,crf_loss,trans,train_op = BiLSTM_CRF_tfmodel(sess,w2v_matrix)
+        sess.run(tf.global_variables_initializer())
+        saver = tf.train.Saver()
+        epochs = 60
+        batch_size = 300
+        _test_loss = 10000.
+        for epoch in range(epochs):
+            for x_batch,y_batch,x_len_batch in batch_iter(train_x,train_y,train_x_len,batch_size=batch_size):
+                # for x,y,l in zip(x_batch,y_batch,x_len_batch):
+                    # print(l,'=>',x)
+                    # print(y)
+                train_loss,_ = sess.run([crf_loss,train_op],feed_dict={char_input:x_batch,target:y_batch,length:x_len_batch,})
+            test_loss,_logits,_trans = sess.run([crf_loss,logits,trans],feed_dict={char_input:test_x,target:test_y,length:test_x_len})
+            acc = getAcc(test_y,_logits,_trans,test_x_len)
+            print("==>epoch:"+str(epoch))
+            print("--test --"," acc:",acc,'test_loss:',test_loss)
+            print("--train--","loss:",train_loss,"have_done")
+            if test_loss<_test_loss:
+                _test_loss = test_loss
+                print("Saving-"+str(epoch)+"-model,test_loss:"+str(test_loss))
+                saver.save(sess,"models/"+str(epoch)+"-"+str(acc)+"-"+str(test_loss)+"/model.ckpt")
+
+def batch_iter(x, y,x_len, batch_size=256):
+    '''
+    :param x: content2id
+    :param y: label2id
+    :param batch_size: number of sentences fed to the model per batch
+    :return:
+    '''
+    data_len = len(x)
+    num_batch = int((data_len - 1) / batch_size) + 1  # number of batches per epoch
+
+    # indices = np.random.permutation(data_len)  # shuffle the indices
+    # x_shuffle = x[indices]
+    # y_shuffle = y[indices]
+    # x_len_shuffle = x_len[indices]
+    for i in range(num_batch):
+        start_id = batch_size * i
+        end_id = min(batch_size*(i+1), data_len)
+        yield x[start_id:end_id], y[start_id:end_id],x_len[start_id:end_id]
+from sklearn.metrics import accuracy_score
+def getAcc(y_batch,logits,trans,lengths):
+    index = 0
+    small = -1000.0
+    start = np.asarray([[small] * 4 + [0]])
+
+    preds = []
+    true_tags = []
+    for score, length in zip(logits, lengths):
+        score = score[:length]
+        # pad = small * np.ones([length, 1])
+        # logit = np.concatenate([score, pad], axis=1)
+        # logit = np.concatenate([start, logit], axis=0)
+        # path, _ = tf.contrib.crf.viterbi_decode(logit, trans)
+        path, _ = tf.contrib.crf.viterbi_decode(score, trans)
+        preds += path[0:]
+        # preds += path[1:]
+        index += 1
+
+    for y, length in zip(y_batch, lengths):
+        y = y.tolist()
+        true_tags += y[: length]
+    acc = accuracy_score(np.reshape(true_tags,(-1)), np.reshape(preds,(-1)))
+    return acc
+
+def predict(articles,model_file):
+
+    vocab_model = getModel_word()
+    vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
+    model_file = model_file
+    sess = tf.Session(graph=tf.Graph())
+    with sess:
+        char_input, logits, target, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
+        sess.run(tf.global_variables_initializer())
+        saver = tf.train.Saver()
+        saver.restore(sess, model_file)
+        re_ner = re.compile("12+?3")
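+        # decoded tag ids are joined into a digit string (1=PN_B, 2=PN_M, 3=PN_E), so this matches every B M+ E span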
+        article_ner_list = []
+        count = 0
+        for sentences in articles:
+            count += 1
+            print(count)
+            sentence_len = [ len(sentence) for sentence in sentences]
+            maxlen = max(sentence_len)
+            sentences_x = []
+            for sentence in sentences:
+                sentence = list(sentence)
+                sentence2id = [getIndexOfWord(word) for word in sentence]
+                sentences_x.append(sentence2id)
+            sentences_x = pad_sequences(sentences_x,maxlen=maxlen,padding="post", truncating="post")
+            sentences_x = [np.array(x) for x in sentences_x]
+
+            _logits,_trans = sess.run([logits,trans],feed_dict={char_input:np.array(sentences_x),length:sentence_len})
+
+            viterbi_sequence = decode(logits=_logits,trans=_trans,sequence_lengths=sentence_len,tag_num=4)
+
+            ner_list = []
+            for _seq,sentence in zip(viterbi_sequence,sentences):
+                seq_id = ''.join([str(s) for s in _seq])
+                if re_ner.search(seq_id):
+                    # print("sentence: ",sentence)
+                    for _ner in re_ner.finditer(seq_id):
+                        start = _ner.start()
+                        end = _ner.end()
+                        n = sentence[start:end]
+                        # print(n,'<==>',start,end)
+                        ner_list.append((n,start,end))
+            article_ner_list.append(ner_list)
+    return article_ner_list
+
+def decode(logits, trans, sequence_lengths, tag_num):
+    viterbi_sequences = []
+    for logit, length in zip(logits, sequence_lengths):
+        score = logit[:length]
+        viterbi_seq, viterbi_score = viterbi_decode(score, trans)
+        viterbi_sequences.append(viterbi_seq)
+    return viterbi_sequences
+
+def test2():
+    punishNo = {
+        'O': 0,
+        'PN_B': 1,
+        'PN_M': 2,
+        'PN_E': 3
+    }
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\db_alldata.csv", index_col=0)
+
+    train_data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishment_code_new.csv", index_col=0)
+    punishNo_in_text = set()
+    for textId in train_data['document_id']:
+        punishNo_in_text.add(textId)
+    for _ in range(1,2821):
+        punishNo_in_text.add(_)
+    punishNo_in_text = list(punishNo_in_text)
+    data = data[data['document_id'].isin(punishNo_in_text)]
+    data = data.dropna(subset=['text'])
+    re_rule1 = re.compile('\[|\]')
+    data['sentences'] = [re_rule1.sub('', sentences).split(',') for sentences in data['sentences']]
+    data['sentences'] = [[int(s) for s in sentences] for sentences in data['sentences']]
+    article_sentences = []
+    for id,text,sentences in zip(data['document_id'],data['text'],data['sentences']):
+        # if id in punishNo_in_text:
+        sentences_count = len(sentences)
+        sentence_list = []
+        for i in range(sentences_count-1):
+            sentence = text[sentences[i]:sentences[i+1]]
+            sentence_list.append(sentence)
+        article_sentences.append(sentence_list)
+    model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"
+    punishNo_ner = predict(article_sentences,model_file)
+    data['punishNo_test'] = punishNo_ner
+    punishNo_label = [[] for _ in range(13500)]
+    for textId, begin, end, entity_text in zip(train_data['document_id'], train_data['begin_index'],
+                                                train_data['end_index'],train_data['entity_text']):
+        punishNo_label[textId].append((entity_text,begin,end))
+    punishNo_right = []
+    for id in data['document_id']:
+        punishNo_right.append(punishNo_label[id])
+    data['punishNo_right'] = punishNo_right
+    test_res = []
+    for test,label_list in zip(data['punishNo_test'],data['punishNo_right']):
+        if set(test)==set(label_list):
+            test_res.append(1)
+        else:
+            test_res.append(0)
+    data['test_res'] = test_res
+    data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishNo_test.xlsx",encoding='utf-8')
+
+
+
+
+def test():
+    data = pd.read_csv("data/ALLDATA.csv", index_col=0)[500:600]
+    model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"
+    # data = data[35000:45000]
+    sentences_list = []
+    for sentences in data['sentences']:
+        sentences = sentences.split("*#*>")
+        sentences_list.append(sentences)
+    print(len(sentences_list))
+    pn_ner = predict(sentences_list,model_file)
+    print('*'*20)
+    print(len(pn_ner),pn_ner)
+    data['ner_test'] = pn_ner
+    print(data.head(3))
+    # data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-3.xlsx",encoding='utf-8')
+
+if __name__ == '__main__':
+    # train()
+    # test()
+    model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"
+    sentences_list = '行政处罚厦建招诉决【2019】34号。行政处罚厦建招诉决【2019】34号。行政处罚厦建招诉决【2019】34号。行政处罚厦建招诉决【2019】34号,'.split('。')
+    pn_ner = predict([sentences_list], model_file)
+    print(pn_ner)
+    # test2()
+    # data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv",index_col=0)
+    # sentences = data['sentences'][51313]
+    # sentences = sentences.split("*#*>")
+    # model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"
+    # predict(sentences,model_file)
+    pass

+ 488 - 0
BiddingKG/dl/complaint/punish_rule.py

@@ -0,0 +1,488 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2020/12/24 0024 15:23
+import re
+import time
+import numpy as np
+import tensorflow as tf
+from BiddingKG.dl.common.Utils import *
+from tensorflow.contrib.crf import crf_log_likelihood, viterbi_decode
+from tensorflow.contrib.layers.python.layers import initializers
+from keras.preprocessing.sequence import pad_sequences
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+from BiddingKG.dl.interface.Preprocessing import *
+
+def BiLSTM_CRF_tfmodel(sess,weights):
+    BiRNN_Units = 140
+    chunk_tags = {
+        'O': 0,
+        'PN_B': 1,
+        'PN_M': 2,
+        'PN_E': 3
+    }
+
+    def embedding_layer(input):
+        embedding = tf.get_variable("embedding",initializer=np.array(weights,dtype=np.float32) if weights is not None else None,dtype=tf.float32)
+        return tf.nn.embedding_lookup(params=embedding,ids=input)
+
+    def BiLSTM_Layer(input,length):
+        with tf.variable_scope("BiLSTM"):
+            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
+            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
+        output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell,backward_cell,input,dtype=tf.float32,sequence_length=length)
+        output = tf.concat(output,2)
+        return output
+
+    def CRF_layer(input,num_tags,BiRNN_Units,time_step):
+        with tf.variable_scope("CRF"):
+            with tf.variable_scope("hidden"):
+                w_hidden = tf.get_variable(name='w_hidden',shape=(BiRNN_Units,BiRNN_Units//2),dtype=tf.float32,
+                                           initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
+                b_hidden = tf.get_variable(name='b_hidden',shape=(BiRNN_Units//2),dtype=tf.float32,initializer=tf.zeros_initializer())
+                # print(input)
+                input_reshape = tf.reshape(input,shape=(-1,BiRNN_Units))
+                hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape,w_hidden,b_hidden))
+            with tf.variable_scope("output"):
+                w_output = tf.get_variable(name='w_output',shape=(BiRNN_Units//2,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
+                b_output = tf.get_variable(name='b_output',shape=(num_tags),dtype=tf.float32,initializer=tf.zeros_initializer())
+                pred = tf.nn.xw_plus_b(hidden,w_output,b_output)
+                logits_ = tf.reshape(pred,shape=(-1,time_step,num_tags),name='logits')
+        return logits_
+
+    def layer_loss(input,true_target,num_tags,length):
+        with tf.variable_scope("crf_loss"):
+            trans = tf.get_variable(name='transitons',shape=(num_tags,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer())
+            log_likelihood,trans = crf_log_likelihood(inputs=input,tag_indices=true_target,transition_params=trans,sequence_lengths=length)
+            return tf.reduce_mean(-log_likelihood),trans
+
+    with sess.graph.as_default():
+        char_input = tf.placeholder(name='char_input',shape=(None,None),dtype=tf.int32)
+        target = tf.placeholder(name='target',shape=(None,None),dtype=tf.int32)
+        length = tf.placeholder(name='length',shape=(None,),dtype=tf.int32)
+        # keepprob = tf.placeholder(name='keepprob',dtype=tf.float32)
+
+        _embedding = embedding_layer(char_input)
+        _shape = tf.shape(char_input)
+        batch_size = _shape[0]
+        step_size = _shape[-1]
+        bilstm = BiLSTM_Layer(_embedding,length)
+        _logits = CRF_layer(bilstm,num_tags=len(chunk_tags),BiRNN_Units=BiRNN_Units,time_step=step_size)
+        crf_loss,trans = layer_loss(_logits,true_target=target,num_tags=len(chunk_tags),length=length)
+        global_step = tf.Variable(0,trainable=False)
+        with tf.variable_scope("optimizer"):
+            opt = tf.train.AdamOptimizer(0.002)
+            grads_vars = opt.compute_gradients(crf_loss)
+            capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]
+            train_op = opt.apply_gradients(capped_grads_vars,global_step)
+            return char_input,_logits,target,length,crf_loss,trans,train_op
+
+def decode(logits, trans, sequence_lengths, tag_num):
+    viterbi_sequences = []
+    for logit, length in zip(logits, sequence_lengths):
+        score = logit[:length]
+        viterbi_seq, viterbi_score = viterbi_decode(score, trans)
+        viterbi_sequences.append(viterbi_seq)
+    return viterbi_sequences
+
+class Punish_Extract():
+    def __init__(self, model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"):
+        self.sess = tf.Session(graph=tf.Graph())
+        self.code = ""
+        self.punish_dicition = ""
+        self.model_file = model_file  # punishment-number prediction model
+        self.load_model()
+
+    # load the punishment-number prediction model
+    def load_model(self):
+        with self.sess.as_default() as sess:
+            with sess.graph.as_default():
+                vocab_model = getModel_word()
+                vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
+                self.char_input, self.logits, self.target, self.length, self.crf_loss, self.trans, self.train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
+                sess.run(tf.global_variables_initializer())
+                saver = tf.train.Saver()
+                saver.restore(sess, self.model_file)
+
+    # predict punishment numbers
+    def predict_punishCode(self,list_sentences):
+        re_ner = re.compile("12+?3")
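+        # decoded tag ids are joined into a digit string (1=PN_B, 2=PN_M, 3=PN_E), so this matches every B M+ E span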
+        article_ner_list = []
+        count = 0
+        with self.sess.as_default():
+            with self.sess.graph.as_default():
+                for sentences in list_sentences:
+                    count += 1
+                    # print(count)
+                    sentence_len = [len(sentence.sentence_text) for sentence in sentences]
+                    maxlen = max(sentence_len)
+                    sentences_x = []
+                    for sentence in sentences:
+                        sentence = sentence.sentence_text
+                        sentence = list(sentence)
+                        sentence2id = [getIndexOfWord(word) for word in sentence]
+                        sentences_x.append(sentence2id)
+                    sentences_x = pad_sequences(sentences_x, maxlen=maxlen, padding="post", truncating="post")
+                    sentences_x = [np.array(x) for x in sentences_x]
+                    _logits, _trans = self.sess.run([self.logits, self.trans],
+                                               feed_dict={self.char_input: np.array(sentences_x), self.length: sentence_len})
+                    viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=4)
+
+                    ner_list = []
+                    for _seq, sentence in zip(viterbi_sequence, sentences):
+                        sentence = sentence.sentence_text
+                        seq_id = ''.join([str(s) for s in _seq])
+                        if re_ner.search(seq_id):
+                            # print("sentence: ",sentence)
+                            for _ner in re_ner.finditer(seq_id):
+                                start = _ner.start()
+                                end = _ner.end()
+                                n = sentence[start:end]
+                                # print(n,'<==>',start,end)
+                                # ner_list.append((n, start, end))
+                                ner_list.append(n)  # changed to return only the entity text
+                    # article_ner_list.append(ner_list)
+                    article_ner_list.append(';'.join(set(ner_list)))
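+        # note: only the first document's result (entities deduplicated and joined with ';') is returned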
+        return article_ner_list[0]
+
+    # punishment type
+    def get_punishType(self, x1, x2):
+        '''Classify the announcement by its title and content.
+        x1: title
+        x2: content
+        return: (matched keyword, category)'''
+        # x1 = x1.replace('(','(').replace(')', ')').replace(' ','')
+        # x2 = x2.replace('(', '(').replace(')', ')').replace(' ', '')
+        '''title regexes'''
+        # unknown announcement
+        unknow = re.compile('采购方式|采购公告|磋商公告|谈判公告|交易公告$|征集|征求|招标公告|竞标公告|中标公告|'
+                            '成交公告|成交信息|流标公告|废标公告|城市管理考评|决算表|决算|预算|资格考试|招聘|选聘'
+                            '|聘请|拟录用|无违规违法|无此项信息|暂无工程投标违法|管理办法|指导意见|无投诉|投诉办法'
+                            '公共资源交易情况|绩效评价|考试成绩|付息公告|不动产|办证|印发|转发')  #|结果公示 部分是
+        # complaint handling
+        tscl = re.compile('投诉不予[处受]理|投诉不成立|终止投诉|投诉终止|不予受理|投诉事?项?的?处理')
+        # administrative penalty
+        xzcf = re.compile('行政处罚|行政处理|政处罚|行政裁决|防罚|公罚|医罚|环罚|政罚|文罚|局罚|旅罚|财罚|运罚')
+        # supervision and inspection
+        jdjc = re.compile('(监督检查的?问?题?(处理|整改|记分|结果|决定|处罚))|监督处罚|调查处理|监督处理')
+        # serious violation
+        yzwf = re.compile('严重违法失信|黑名单|失信名单')
+        # misconduct
+        blxw = re.compile('((不良|失信|不诚信|差错|不规范|违规|违约|处罚|违法)(行为|记录|信息))|((违约|违规|违法)(处理|操作|情况|问题))'
+                          '|通报批评|记分管理|迟到|早退|缺席|虚假材料|弄虚作假|履职不到位|诚信考核扣分|串通投标'
+                          '|审核不通过|码一致|地址一致|扣分处理|扣分通知|扣[0-9]+分|责令整改|信用信息认定书$'
+                          '|关于.{,30}的处罚|关于.{,10}的?考评通报|关于.{,30}扣分情况|不规范代理行为'
+                          '|(取消|暂停|限制).{,50}((专家|评标|评委|投标|竞价|被抽取|中标|供应商|候选人)资格)'
+                          '|(代理服?务?机构).{,10}(扣分)|(专家).{,30}(扣分|记分|处罚)|对.{,30}处理|冻结.{,30}账号')
+        # other misconduct
+        other = re.compile('质疑|代理机构进场交易情况|网上投诉办理|信用奖惩|信用奖罚|进场工作.{,5}考核'
+                           '|举报处理|结果无效|成交无效|行政复议')
+
+        '''content regexes'''
+        # complaint handling
+        tscl_c = re.compile('(投诉(人|单位)[1-9]?(名称)?[::])|(投诉事项[1-5一二三四五、]*部?分?(成立|予以受理))'
+                            '|((驳回|撤回|撤销|终止)[^,。]{,60}(投诉|质疑))')
+        # administrative penalty
+        xzcf_c = re.compile('((处理依据及结果|处理结果|处罚结果)).*行政处罚|如下行政处罚|行政处罚决定')
+        # integrity bonus points
+        cxjf_c = re.compile('处罚结果.*诚信加分')
+        # serious violation / dishonesty
+        yzwf_c = re.compile('工商部门严重违法失信起名单|严重违法失信的具体情形') #|严重违法失信的具体情形
+        # misconduct
+        blxw_c = re.compile('(取消|暂停|限制).{,30}((专家|评标|评委|投标|采购|竞价|被抽取|中标|供应商)的?资格)'
+                            '|(处罚结果|处罚情况).*(扣[1-9]*分|记分|不良行为|不良记录|不良信用|不诚信|扣除信用'
+                            '|诚信档案|信用信息|取消.*资格|口头警告|处罚机关|责令改正|罚款|限制投标|暂扣|禁止'
+                            '|暂停|封禁|暂无|行政处罚)|处罚结果'
+                            '|处罚主题|禁止参与.{,10}政府采购活动|列入不良行为|处罚如下|如下处罚|违规处罚|处罚违规'
+                            '|责令改正|责令整改|处罚依据|进行以下处理|处理依据及结果|处理结果|处罚决定书|'
+                            '(不规范|不良|不诚信)行为记录')
+        # other misconduct
+        other_c = re.compile('质疑(人|单位)[1-9]?(名称)?:|公告期内受质疑')
+
+        if re.search(unknow, x1):
+            return re.search(unknow, x1).group(0), '未知类别'
+        elif re.search(yzwf, x1):
+            return re.search(yzwf, x1).group(0), '严重违法'
+        elif re.search(yzwf_c, x2):
+            return re.search(yzwf_c, x2).group(0), '严重违法'
+
+        elif re.search(tscl, x1):
+            return re.search(tscl, x1).group(0), '投诉处理'
+        elif re.search(xzcf, x1):
+            return re.search(xzcf, x1).group(0), '行政处罚'
+        elif re.search(jdjc, x1):
+            return re.search(jdjc, x1).group(0), '监督检查'
+        elif re.search(blxw, x1):
+            return re.search(blxw, x1).group(0), '不良行为'
+        elif re.search(other, x1):
+            return re.search(other, x1).group(0), '其他不良行为'
+
+        elif re.search(tscl_c, x2):
+            return re.search(tscl_c, x2).group(0), '投诉处理'
+        elif re.search(xzcf_c, x2):
+            return re.search(xzcf_c, x2).group(0), '行政处罚'
+        elif re.search(cxjf_c, x2):
+            return re.search(cxjf_c, x2).group(0), '诚信加分'
+
+        elif re.search(blxw_c, x2):
+            return re.search(blxw_c, x2).group(0), '不良行为'
+        elif re.search(other_c, x2):
+            return re.search(other_c, x2).group(0), '其他不良行为'
+
+        return ' ', '未知类别'
+
+    # punishment decision
+    def get_punishDecision(self, x, x2):
+        '''Extract the handling decision from the announcement content via regex.
+        x: content
+        x2: punishment category
+        return: handling-decision string'''
+        rule1 = re.compile(
+            '(((如下|以下|处理|研究|本机关|我机关|本局|我局)决定)|((决定|处理|处理意见|行政处罚|处罚)(如下|如下))'
+            '|((以下|如下)(决定|处理|处理意见|行政处罚|处罚))|处理依据及结果|处理结果|处罚结果|处罚情况|限制行为'
+            '|整改意见)[::].{5,}')
+        rule2 = re.compile(
+            '(((如下|以下|处理|研究|本机关|我机关|本局|我局)决定)|((决定|处理|处罚|处理意见)(如下|如下))'
+            '|((以下|如下)(决定|处理|处理意见|处罚))|处理依据及结果|处理结果|处罚结果|处罚情况|限制行为'
+            '|处罚内容)[:,,].{10,}')
+        rule3 = re.compile('考评结果:?.*')
+        rule4 = re.compile('(依据|根据)《.*》.*')
+        if x2 == '未知类别':
+            return ' '
+        elif re.search(rule1, x[-int(len(x)*0.4):]):
+            return re.search(rule1, x[-int(len(x)*0.4):]).group(0)
+        elif re.search(rule1, x[-int(len(x)*0.6):]):
+            return re.search(rule1, x[-int(len(x)*0.6):]).group(0)
+        elif re.search(rule2, x[-int(len(x)*0.7):]):
+            return re.search(rule2, x[-int(len(x)*0.7):]).group(0)
+        elif re.search(rule3, x[-int(len(x)*0.6):]):
+            return re.search(rule3, x[-int(len(x)*0.6):]).group(0)
+        elif re.search(rule4, x[-int(len(x)*0.4):]):
+            return re.search(rule4, x[-int(len(x)*0.4):]).group(0)
+        else:
+            return ' '
+
+    # whether the complaint is upheld
+    def get_punishWhether(self, x1, x2, x3):
+        '''Decide via regex over the handling decision whether the complaint is upheld.
+        x1: handling-decision string
+        x2: content
+        x3: punishment category
+        return: whether the complaint is upheld'''
+        p1 = re.compile('(投诉|投拆|质疑|举报)(事项|内容|事实)?[^不,。]{,10}(成立|属实|予以受理|予以支持)|责令|废标|(中标|成交)[^,。]{,10}无效'
+                        '|取消[^,。]{,60}资格|罚款|重新(组织|开展)?(招标|采购)|投诉成立|被投诉人存在违法违规行为'
+                        '|采购活动违法|(中标|评标|成交)结果无效')
+        p2 = re.compile('投诉不予[处受]理|((投诉|投拆|质疑|举报)(事项|内容|事实)?[^,。]{,10}(不成立|情?况?不属实|不予支持|缺乏事实依据))'
+                        '|((驳回|撤回|撤销|终止)[^,。]*(投诉|质疑|诉求))|终止[^,。]{,20}(行政裁决|投诉处理|采购活动)|投诉终止|投诉无效'
+                        '|予以驳回|不予受理|继续开展采购|被投诉人不存在违法违规行为|中标结果有效|投诉[^,。]{,10}不成立'
+                        '|维持被投诉人|不支持[^,。]{,20}投诉|无确凿证据')
+        if x3 != '投诉处理':
+            return ' '
+        elif re.search(p1, x1):
+            return '投诉成立'
+        elif re.search(p2, x1):
+            return '投诉无效'
+        elif re.search(p1, x2):
+            return '投诉成立'
+        elif re.search(p2, x2):
+            return '投诉无效'
+        return ' '
+
+    # enforcement agency and punishment time
+    def get_institution(self, title, sentences_l, entity_l):
+        '''
+        Decide whether an entity is an enforcement agency from the text preceding it.
+        :param title: announcement title
+        :param sentences_l: sentence list of a single announcement
+        :param entity_l: entity list of a single announcement
+        :return: enforcement-agency and punishment-time strings, multiple values joined by ";"
+        '''
+        institutions = []
+        punishTimes = []
+        institution_1 = re.compile("(?:处罚执行部门|认定部门|执法机关名称|执法单位|通报部门|处罚机关|处罚部门)[::]")
+        punishTimes_1 = re.compile("(?:处罚日期|限制行为开始时间|曝光开始日期|处罚决定日期|处罚期限|处罚时间|处理日期|公告开始时间)[::]")
+        # use keywords preceding an entity to decide whether it is an enforcement agency or a punishment time
+        for ner in entity_l:
+            if ner.entity_type == 'org':
+                left = sentences_l[ner.sentence_index].sentence_text[
+                       max(0, ner.wordOffset_begin - 15):ner.wordOffset_begin]
+                if institution_1.search(left):
+                    institutions.append(ner)
+                elif institutions != [] and ner.sentence_index == institutions[-1].sentence_index and \
+                        ner.wordOffset_begin - institutions[-1].wordOffset_end < 2 and \
+                        sentences_l[ner.sentence_index].sentence_text[
+                        ner.wordOffset_begin:institutions[-1].wordOffset_end] \
+                        in ['', '、', '和', '及']:
+                    institutions.append(ner)
+            elif ner.entity_type == 'time':
+                left = sentences_l[ner.sentence_index].sentence_text[
+                       max(0, ner.wordOffset_begin - 15):ner.wordOffset_begin]
+                if punishTimes_1.search(left):
+                    punishTimes.append(ner)
+
+        institution_title = re.compile("财政局|财政厅|监督管理局|公管局|公共资源局|委员会")
+        institution_time = re.compile(
+            "(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
+        ins = ""
+        ptime = ""
+        # if no enforcement agency was found above, look for entities in the title and regex-check them for keywords
+        if institutions == []:
+            title_ners = getNers([title], useselffool=True)
+            if title_ners[0]:
+                for title_ner in title_ners[0]:
+                    if title_ner[2] == 'org' and institution_title.search(title_ner[3]):
+                        ins = title_ner[3]
+                        break
+        if punishTimes == [] or institutions == []:
+            # if the elements are still missing, check whether a date follows an org/company entity; if so, take them as agency and punishment time
+            for ner in [ner for ner in entity_l if ner.entity_type == 'org'][-5:][::-1]:
+                right = sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_end:ner.wordOffset_end + 16]
+                if institution_time.search(right):
+                    if ins == '':
+                        ins = ner.entity_text
+                    if ptime == '':
+                        ptime = institution_time.search(right).group(1)
+                    break
+            # if still nothing was found, take the last time entity as the punishment time when it sits at the end of the article
+            if ptime == '':
+                n_time = [ner for ner in entity_l if ner.entity_type == 'time']
+                if len(n_time) != 0:
+                    ner = n_time[-1]
+                    if ner.sentence_index == len(sentences_l) - 1:
+                        textLong = len(sentences_l[ner.sentence_index].sentence_text)
+                        if ner.wordOffset_end > textLong - 3 and len(ner.entity_text) > 3:
+                            ptime = ner.entity_text
+        institutions = [ner.entity_text for ner in institutions]
+        punishTimes = [ner.entity_text for ner in punishTimes]
+        if institutions == [] and ins != "":
+            institutions.append(ins)
+        if punishTimes == [] and ptime != "":
+            punishTimes.append(ptime)
+        return ";".join(institutions), ";".join(punishTimes)
+
+    # complainant, respondent, punished party
+    def get_complainant(self, punishType, sentences_l, entity_l):
+        '''
+        Find the complainant and the respondent/punished party via regexes over the category, sentence list and entity list.
+        :param punishType: announcement punishment category
+        :param sentences_l: sentence list of a single announcement
+        :param entity_l: entity list of a single announcement
+        :return: complainant, respondent
+        '''
+        complainants = []  # complainant entities
+        punishPeople = []  # respondent / punished-party entities
+        size = 16
+        # complainant / challenger
+        complainants_rule1 = re.compile(
+            "(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+        # punished party, respondent
+        punishPeople_rule1 = re.compile(
+            "(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+        punishPeople_rule2_1 = re.compile(",$")
+        punishPeople_rule2_2 = re.compile("^[::]")
+        punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
+        punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
+
+        punish_l = []  # punishment entity list
+        tmp = []
+        for ner in [ner for ner in entity_l if ner.entity_type in ['org', 'company', 'person']]:
+            if tmp == []:
+                tmp.append(ner)
+            elif ner.entity_type == tmp[-1].entity_type and ner.sentence_index == tmp[-1].sentence_index and \
+                    ner.wordOffset_begin - tmp[-1].wordOffset_end < 2 \
+                    and sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_begin:tmp[-1].wordOffset_end] in [
+                '',
+                '、',
+                '和',
+                '及']:
+                tmp.append(ner)
+            elif ner.entity_type in ['org', 'company'] and tmp[-1].entity_type in ['org', 'company'] and \
+                    ner.sentence_index == tmp[-1].sentence_index and ner.wordOffset_begin - tmp[-1].wordOffset_end < 2 \
+                    and sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_begin:tmp[-1].wordOffset_end] in [
+                '',
+                '、',
+                '和',
+                '及']:
+                tmp.append(ner)
+            else:
+                punish_l.append(tmp)
+                tmp = [ner]
+        if tmp:
+            punish_l.append(tmp)  # keep the last entity group as well
+        for ner_l in punish_l:
+            begin_index = ner_l[0].wordOffset_begin
+            end_index = ner_l[-1].wordOffset_end
+            left = sentences_l[ner_l[0].sentence_index].sentence_text[max(0, begin_index - size):begin_index]
+            right = sentences_l[ner_l[0].sentence_index].sentence_text[end_index:end_index + size]
+            if complainants_rule1.search(left):
+                complainants.append(ner_l)
+            elif punishPeople_rule1.search(left):
+                punishPeople.append(ner_l)
+            elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
+                if punishType == '投诉处理':
+                    complainants.append(ner_l)
+                else:
+                    punishPeople.append(ner_l)
+            elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
+                punishPeople.append(ner_l)
+        complainants = set([it.entity_text for l in complainants for it in l])
+        punishPeople = set([it.entity_text for l in punishPeople for it in l])
+        return ';'.join(complainants), ';'.join(punishPeople)
+
+def get_punish_extracts(doc_id=' ', title=' ', text=' '):
+    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", ""]],
+                                                                                    useselffool=True)
+    punish_code = punish.predict_punishCode(list_sentences)
+    # print('处罚编号: ',punish_code)
+    institutions, punishTimes = punish.get_institution(title, list_sentences[0], list_entitys[0])
+    # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
+    keyword, punishType = punish.get_punishType(title, text)
+    # print('处罚类型:',punishType)
+    punishDecision = punish.get_punishDecision(text, punishType)
+    # print('处罚决定:',punishDecision)
+    punishWhether= punish.get_punishWhether(punishDecision, text, punishType)
+    # print('投诉是否成立:',punishWhether)
+    complainants, punishPeople = punish.get_complainant(punishType, list_sentences[0], list_entitys[0])
+    # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
+    return punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether,institutions, punishTimes
+
+if __name__ == "__main__":
+    punish = Punish_Extract(model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt")
+
+    import pandas as pd
+    # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
+    df = pd.read_excel('G:/失信数据/ALLDATA_re2-3.xlsx', index=0)[2:10]
+    # i = 89
+    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+    # i = 92
+    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+
+    # t1 = time.time()
+    # for i in df.index:
+    #     punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+    #         get_punish_extracts(i, df.loc[i, 'PAGE_TITLE'], df.loc[i, 'PAGE_CONTENT'])
+    #     df.loc[i, '投诉人'] = complainants
+    #     df.loc[i, '被投诉人'] = punishPeople
+    #     df.loc[i, '执法机构'] = institutions
+    #     df.loc[i, '处罚时间'] = punishTimes
+    #     df.loc[i, '处罚编号'] = punish_code
+    #     print('完成第%d篇'%i)
+    # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=[['PAGE_TITLE', 'PAGE_CONTENT',
+    # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', 'punishPeople',
+    # #    'institution', 'punishTime', 'ner_test']])
+    # t2 = time.time()
+    # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
+    # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', '投诉人', 'punishPeople', '被投诉人',
+    # #    'institution', '执法机构', 'punishTime', '处罚时间', 'ner_test', '处罚编号'])
+    # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
+    #     '关键词', '类别', '处理决定', '投诉是否成立', '投诉人', '被投诉人','执法机构', '处罚时间', '处罚编号',
+    #    'DETAILLINK', 'sentences', 'PAGE_TIME'])
+    # t3 = time.time()
+    # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
+    s = '厦财企〔2020〕12号,各有关单位:341号。厦财企〔2020〕12号,各有关单位:行政处罚厦建招诉决【2019】342号。行政处罚厦建招诉决【2019】343号。行政处罚厦建招诉决【2019】344号,'
+    # list_sentences = [s.split('。')]
+    # punish_code= punish.predict_punishCode( list_sentences)
+    # print(punish_code)
+
+    punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+                get_punish_extracts(text=s)
+    print(punish_code)

+ 831 - 0
BiddingKG/dl/complaint/test1.py

@@ -0,0 +1,831 @@
+import sys
+import os
+sys.path.append(os.path.abspath("../.."))
+import pandas as pd
+import re
+import time
+from BiddingKG.dl.common.Utils import *
+from BiddingKG.dl.interface.Entitys import *
+from BiddingKG.dl.interface.predictor import *
+from BiddingKG.dl.foolnltk import selffool
+from BiddingKG.dl.interface.Preprocessing import *
+
+def get_data1():
+    load1 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_TOU_SU_CHU_LI.csv")
+    load2 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_WEI_FA_JI_LU.csv")
+    load3 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_QI_TA_SHI_XIN.csv")
+    load = pd.concat([load1, load2, load3], axis=0)
+    load = load.reset_index(drop=True)
+    load['PAGE_CONTENT'] = get_article1(load['PAGE_CONTENT'])
+    sentences_list = get_sentences1(load['PAGE_CONTENT'])
+    load['sentences'] = ['*#*>'.join(_sentences) for _sentences in sentences_list ]
+    load.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv")
+
+def get_ners():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv",index_col=0)
+    # data = data.head(3)
+    nersList = []
+    for index,_sentences in zip(data.index,data['sentences']):
+        _sentences = _sentences.split('*#*>')
+        _ners = getNers(_sentences,useselffool=True)
+        word_index = 0
+        for ners,sentence in zip(_ners, _sentences):
+            if len(ners) != 0:
+                word_ner_list = ['O']*len(sentence)
+
+                for ner in ners:
+                    nerDict = dict()
+                    entity_type = ner[2]
+                    nerDict['entity_type'] = entity_type
+                    entity_text = ner[3]
+                    nerDict['entity_text'] = entity_text
+                    begin_index = ner[0]
+                    nerDict['begin_index'] = begin_index
+                    end_index = ner[1] - 1
+                    nerDict['end_index'] = end_index
+                    wordOffset_begin = word_index + begin_index
+                    nerDict['wordOffset_begin'] = wordOffset_begin
+                    wordOffset_end = wordOffset_begin + len(entity_text)
+                    nerDict['wordOffset_end'] = wordOffset_end
+                    nerDict['sentence'] = sentence
+                    nerDict['article_index'] = index
+                    # print('====')
+                    # print(begin_index,end_index,entity_type,entity_text)
+                    nersList.append(nerDict)
+                    # print(nerDict)
+                    word_ner_list[begin_index] = 'B'
+                    word_ner_list[begin_index+1:end_index] = ['I']*(end_index-begin_index-1)
+            word_index += len(sentence)
+    # save(nersList,"nersList.pk")
+
+# merge adjacent (org/company) and (person) entities
+def get_unionNers():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
+    ners = load("nersList.pk")
+    org_companys = [[] for _ in range(len(data))]
+    type1 = ['org', 'company', 'union_oc']
+    persons = [[] for _ in range(len(data))]
+    type2 = ['person', 'union_person']
+    for ner in ners:
+        if ner['entity_type'] in type1:
+            org_companys[ner['article_index']].append(ner)
+        if ner['entity_type'] in type2:
+            persons[ner['article_index']].append(ner)
+    # merge org and company entities
+    new_org_companys = []
+    for org_company in org_companys:
+        if org_company and len(org_company) > 1:
+            union_nums = 0
+            for i in range(len(org_company)-1):
+                if org_company[i]['end_index'] == org_company[i + 1]['begin_index'] - 1 and org_company[i]['sentence'][org_company[i]['end_index']] == '、' \
+                        and org_company[i]['sentence'] == org_company[i + 1]['sentence']:
+                    # print(1)
+                    org_company[i + 1]['begin_index'] = org_company[i]['begin_index']
+                    org_company[i + 1]['wordOffset_begin'] = org_company[i]['wordOffset_begin']
+                    org_company[i + 1]['entity_text'] = org_company[i]['entity_text'] + '+' + org_company[i+1]['entity_text']
+                    # print(org_company[i + 1]['entity_text'])
+                    org_company[i] = 0
+                    union_nums += 1
+                elif org_company[i]['end_index'] == org_company[i + 1]['begin_index'] and org_company[i]['sentence'] == org_company[i+1]['sentence']:
+                    # print(2)
+                    org_company[i + 1]['begin_index'] = org_company[i]['begin_index']
+                    org_company[i + 1]['wordOffset_begin'] = org_company[i]['wordOffset_begin']
+                    org_company[i + 1]['entity_text'] = org_company[i]['entity_text'] + '+' + org_company[i+1]['entity_text']
+                    # print(org_company[i + 1]['entity_text'])
+                    org_company[i] = 0
+                    union_nums += 1
+            for _ in range(union_nums):
+                org_company.remove(0)
+        new_org_companys.append(org_company)
+    # merge person entities
+    new_persons = []
+    for person in persons:
+        if person and len(person) > 1:
+            union_nums = 0
+            for i in range(len(person) - 1):
+                if person[i]['end_index'] == person[i + 1]['begin_index'] - 1 and person[i]['sentence'][person[i]['end_index']] == '、' \
+                        and person[i]['sentence'] == person[i + 1]['sentence']:
+                    # print(1)
+                    person[i + 1]['begin_index'] = person[i]['begin_index']
+                    person[i + 1]['wordOffset_begin'] = person[i]['wordOffset_begin']
+                    person[i + 1]['entity_text'] = person[i]['entity_text'] + '+' + person[i + 1]['entity_text']
+                    # print(person[i + 1]['entity_text'])
+                    person[i] = 0
+                    union_nums += 1
+                elif person[i]['end_index'] == person[i + 1]['begin_index'] and person[i]['sentence'] == person[i + 1]['sentence']:
+                    # print(2)
+                    person[i + 1]['begin_index'] = person[i]['begin_index']
+                    person[i + 1]['wordOffset_begin'] = person[i]['wordOffset_begin']
+                    person[i + 1]['entity_text'] = person[i]['entity_text'] + '+' + person[i + 1]['entity_text']
+                    # print(person[i + 1]['entity_text'])
+                    person[i] = 0
+                    union_nums += 1
+            for _ in range(union_nums):
+                person.remove(0)
+        new_persons.append(person)
+    # save([new_org_companys,new_persons],"unionNers.pk")
+
+def test02():
+    load = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv",index_col=0)
+
+    text_rule = re.compile("监管调查|通报|不诚信|监督检查|不良|投诉|质疑|处罚|违法|违规|不予[受处]理|处理")
+    title_rule = re.compile("中标公告|中标[(\(]成交[\))]公告|采购结果公[示告]|评审结果公告|[侯候]选人公[示告]|成交公[示告]"
+                            "|补贴公[示告]|废标公[示告]")
+    # need_index = []
+    # for index, title, text in zip(load.index, load['PAGE_TITLE'], load['PAGE_CONTENT']):
+    #     a = 0
+    #     if text_rule.search(text):
+    #         a = 1
+    #     if title_rule.search(title):
+    #         a = 0
+    #     if text_rule.search(title):
+    #         a = 1
+    #     if a:
+    #         need_index.append(index)
+    # print(len(need_index))
+    # load = load.loc[need_index]
+    # print(len(load))
+    # load = load.reset_index(drop=True)
+
+    complainants_rule1 = re.compile("[^被]投[诉拆][人方]之?[\d一二三四五六七八九十]?(?:(.+?))?[::]+?")
+    complaint_rule = re.compile("(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|疑问[人方]|检举[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?名称)?[::]+")
+    complainants_list = []
+    a = 1
+    load = load[9744:9745]
+    for article,sentences in zip(load['PAGE_CONTENT'],load['sentences']):
+        print(a)
+        a+=1
+        getSentences = sentences.split('*#*>')
+        # print(getSentences)
+        ners = getNers(getSentences,useselffool=True)
+        print(ners)
+        print('======================')
+        word_index = 0
+        ners_list = []
+        for ner,sentence in zip(ners,getSentences):
+            size = 16
+            complainants = []
+            if len(ner)!=0:
+                for aner in ner:
+
+                    entity_type = aner[2]
+                    entity_text = aner[3]
+                    # begin = word_index + aner[0]
+                    # end = begin + len(entity_text)
+                    # complainant
+                    if entity_type in ['org','company','person']:
+                        left = sentence[max(0, aner[0] - size):aner[0]]
+
+                        print(entity_text,left,sentence)
+                        if complaint_rule.search(left):
+                            print('yes')
+                            entity_type = 'complainant'
+                            complainants.append(entity_text)
+                    # ners_list.append([begin, end, entity_type, entity_text])
+            word_index += len(sentence)
+        complainants_list.append(complainants)
+
+        # test
+        # for i in ners_list:
+        #     print(i[3])
+        #     print(processed[0][i[0]:i[1]])
+    load['complainant'] = complainants_list
+    # load.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\test01.csv")
+
+# complainant, respondent, punished party
+def get_complainant():
+    data = pd.read_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2.xlsx",index_col=0)
+    # ners = load("nersList.pk")
+    unionNers = load("unionNers.pk")
+    ners = [i+j for i,j in zip(unionNers[0],unionNers[1])]
+    complainants = [[] for _ in range(len(data))]
+    punishPeople = [[] for _ in range(len(data))]
+    a = ['org','company','person']
+    size = 16
+    # complainant / challenger
+    complainants_rule1 = re.compile("(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+    # punished party, respondent
+    punishPeople_rule1 = re.compile("(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+    punishPeople_rule2_1 = re.compile(",$")
+    punishPeople_rule2_2 = re.compile("^[::]")
+    punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
+    punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
+
+    time1 = time.time()
+    for _ner in ners:
+        if _ner:
+            for ner in _ner:
+                left = ner['sentence'][max(0,ner['begin_index']-size):ner['begin_index']]
+                right = ner['sentence'][ner['end_index']:min(ner['end_index']+size,len(ner['sentence']))]
+                # print(left)
+                if complainants_rule1.search(left):
+                    complainants[ner['article_index']].append(ner['entity_text'])
+                elif punishPeople_rule1.search(left):
+                    punishPeople[ner['article_index']].append(ner['entity_text'])
+                elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
+                    if data['类别'][ner['article_index']] == '投诉处理':
+                        complainants[ner['article_index']].append(ner['entity_text'])
+                    else:
+                        punishPeople[ner['article_index']].append(ner['entity_text'])
+                elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
+                    punishPeople[ner['article_index']].append(ner['entity_text'])
+    data['complainant'] = complainants
+    data['punishPeople'] = punishPeople
+    print(time.time()-time1)
+    data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-1.xlsx")
+
+def get_complainant2(list_sentences, list_entitys, text_type):
+    '''
+    list_sentences: list_sentences from get_preprocessed()
+    list_entitys: list_entitys from get_preprocessed()
+    text_type: announcement category (punishment type)
+    :return:
+    complainants: list of complainants
+    punishPeople: respondents / punished parties
+    '''
+    sentences_list = list_sentences
+    entitys_list = list_entitys
+    size = 16
+    a = ['org', 'company', 'person']
+    b = ['org', 'company', 'union_org_company']
+    c = ['person', 'union_person']
+    need_entitys = []
+    for entity in entitys_list:
+        if entity.entity_type in a:
+            need_entitys.append(entity)
+    # merge adjacent entities
+    drop_count = 0
+    for i in range(1, len(need_entitys)):
+        entity = need_entitys[i]
+        entity_begin = entity.wordOffset_begin
+        entity_end = entity.wordOffset_end
+        sentence = sentences_list[entity.sentence_index].sentence_text
+        last_entity = need_entitys[i - 1]
+        if entity.sentence_index == last_entity.sentence_index:
+            if (entity.entity_type in b and last_entity.entity_type in b) or (
+                    entity.entity_type in c and last_entity.entity_type in c):
+                if entity_begin - last_entity.wordOffset_end < 2 and sentence[
+                                                                     last_entity.wordOffset_end:entity_begin] in ['',
+                                                                                                                  '、',
+                                                                                                                  '和',
+                                                                                                                  '及']:
+                    need_entitys[i].wordOffset_begin = last_entity.wordOffset_begin
+                    need_entitys[i].begin_index = last_entity.begin_index
+                    need_entitys[i].entity_text = last_entity.entity_text + '+' + entity.entity_text
+                    if entity.entity_type in b:
+                        need_entitys[i].entity_type = 'union_org_company'
+                    else:
+                        need_entitys[i].entity_type = 'union_person'
+                    need_entitys[i - 1] = 0
+                    drop_count += 1
+    for _ in range(drop_count):
+        need_entitys.remove(0)
+    # complainant / challenger
+    complainants_rule1 = re.compile(
+        "(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+    # punished party, respondent
+    punishPeople_rule1 = re.compile(
+        "(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+    punishPeople_rule2_1 = re.compile(",$")
+    punishPeople_rule2_2 = re.compile("^[::]")
+    punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
+    punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
+    complainants = []
+    punishPeople = []
+    for i in range(len(need_entitys)):
+        entity = need_entitys[i]
+        entity_begin = entity.wordOffset_begin
+        entity_end = entity.wordOffset_end
+
+        # sentence containing the entity
+        sentence = sentences_list[entity.sentence_index].sentence_text
+        left = sentence[max(0, entity_begin - size):entity_begin]
+        right = sentence[entity_end:min(entity_end + size, len(sentence))]
+
+        if complainants_rule1.search(left):
+            complainants.append(entity)
+        elif punishPeople_rule1.search(left):
+            punishPeople.append(entity)
+        elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
+            if text_type == '投诉处理':
+                complainants.append(entity)
+            else:
+                punishPeople.append(entity)
+        elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
+            punishPeople.append(entity)
+
+    result_complainants = []
+    result_punishPeople = []
+    for entity in complainants:
+        if entity.entity_type in ['union_org_company', 'union_person']:
+            entity_text = entity.entity_text.split('+')
+            for item in entity_text:
+                result_complainants.append(item)
+        else:
+            result_complainants.append(entity.entity_text)
+    for entity in punishPeople:
+        if entity.entity_type in ['union_org_company', 'union_person']:
+            entity_text = entity.entity_text.split('+')
+            for item in entity_text:
+                result_punishPeople.append(item)
+        else:
+            result_punishPeople.append(entity.entity_text)
+    return list(set(result_complainants)), list(set(result_punishPeople))
+
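+# Illustrative sketch (not part of the extraction pipeline): the rules above
+# classify each entity by inspecting a fixed-size window of text on either
+# side of it. The sentence, offsets and simplified pattern below are made up
+# purely for demonstration.
+def _demo_left_window_rule():
+    sentence = "投诉人:福建光正工程项目管理有限公司,联系地址:泉州台商投资区通港路大创商厦"
+    entity_begin, entity_end = 4, 18  # character offsets of the company entity
+    size = 16
+    left = sentence[max(0, entity_begin - size):entity_begin]
+    right = sentence[entity_end:min(entity_end + size, len(sentence))]
+    # the complainant rule fires when a label such as "投诉人:" ends the left window
+    print(re.search("投诉人[::]$", left) is not None)  # True
+    print(left, "|", right)
+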
+# announcement classification
+def textClassify():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
+    # complaint keywords: 投诉人 / 检举人 / 举报人 / 质疑人 / 质疑函
+    patten1 = "投诉人|检举人|举报人|质疑人|质疑函|投诉处理|质疑单位"
+    re1 = re.compile(patten1)
+    patten2 = "不予[处受]理|撤诉|撤[销回]投诉|投诉终止"
+    re2 = re.compile(patten2)
+    patten3 = "关于[^,。]+?(?:处罚|通报|处理意见)|被处罚人|处罚决定|限制行为开始时间|处罚执行部门"
+    re3 = re.compile(patten3)
+    patten4 = "不良行为|不良信用|不良记录|不规范行为|不诚信行为"
+    re4 = re.compile(patten4)
+    patten5 = "行政处罚|行政处理|监督检查|监管调查|监督处理|违规处[罚理]|违法处[罚理]"
+    re5 = re.compile(patten5)
+    patten6 = "严重违法失信起名单|严重违法失信行为|严重违法失信企业"
+    re6 = re.compile(patten6)
+    patten7 = '处理决定'
+    re7 = re.compile(patten7)
+    patten8 = "处[理罚]依据|处罚日期|扣分依据|认定依据"
+    re8 = re.compile(patten8)
+    pos = []
+    _type = []
+    for title,text in zip(data['PAGE_TITLE'],data["PAGE_CONTENT"]):
+        p = []
+        t = ''
+        if re1.search(text) or re1.search(title):
+            p.append(patten1)
+            t = '投诉'
+        elif re2.search(text) and re.search('投诉',text):
+            p.append('投诉+'+patten2)
+            t = '投诉'
+        elif re.search("回复",title):
+            p.append("回复")
+            t = '投诉'
+        if len(p)==0:
+            if re3.search(title) or re3.search(text):
+                p.append(patten3)
+                t = '处罚'
+            elif re4.search(title):
+                p.append(patten4)
+                t = '处罚'
+            elif re5.search(title) or re5.search(text):
+                p.append(patten5)
+                t = '处罚'
+            elif re6.search(text) or re6.search(title):
+                p.append(patten6)
+                t = '处罚'
+            elif re8.search(text):
+                p.append(patten8)
+                t = '处罚'
+        if len(p) == 0:
+            if re7.search(text) and re.search('投诉', text):
+                p.append('投诉+' + patten7)
+                t = '投诉'
+            elif re7.search(text) or re7.search(title):
+                p.append("处罚+"+patten7)
+                t = '处罚'
+        pos.append(p)
+        _type.append(t)
+    data['pos'] = pos
+    data['type'] = _type
+    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv")
+
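+# Illustrative sketch: the branch order above means complaint (投诉) evidence
+# always wins over punishment (处罚) evidence for the same article. The
+# patterns here are simplified stand-ins, not the full rules used above.
+def _demo_classify_priority():
+    title = "关于某供应商投诉事项的处理决定"
+    text = "经查,投诉人反映的问题部分属实,决定给予行政处罚。"
+    if re.search("投诉人|质疑人", title + text):
+        label = '投诉'   # complaint patterns are tested first
+    elif re.search("行政处罚", title + text):
+        label = '处罚'   # punishment is only assigned when no complaint evidence matched
+    else:
+        label = ''
+    print(label)  # prints 投诉, even though the text also mentions 行政处罚
+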
+# whether the complaint was upheld
+def get_punishWhether01():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv",index_col=0)
+    data = data[data['type']=='投诉']
+    punishWhether_1 = re.compile("投诉[^。,,不]+?成立|投诉[^。,,]*[^不]属实|情况[^。,,]*[^不]属实|投诉成立|情况属实|予以支持")
+    punishWhether_0 = re.compile("投诉[^。,,]*不能?成立|撤诉|[^逾将]{4,}不予[受处]理|撤[回销][^。,,]*(?:举报|投诉)|驳回[^。,,]*投诉|投诉终止|终止[^。,,]*投诉|情况[^。,,]*不属实|投诉[^。,,]*不属实|缺乏事实依据|不予支持|予以驳回")
+    punishWhether = []
+    punishDecision = []
+    punishDecision_1 = re.compile("(?:决定|认定|综上所述|决定如下|处理结果|处理如下|处理结果公布)[::]((?:(?:[\d一二三四五六七八九十]|[\((][\d一二三四五六七八九十][\))]|投[诉拆]事项[\d一二三四五六七八九十]).+?。)+)")
+    punishDecision_2 = re.compile("(?:决定|认定|综上所述|决定如下|处理结果|处理如下|处理结果公布)[::]([^。]+?(?:。|$))")
+    punishDecision_3 = re.compile("[\d一二三四五六七八九十]、(?:处理,?意见|(?:裁决|处理)依据及结果|处理(?:决定|结果)|投诉处理决定),(.+?)。[\d一二三四五六七八九十]、")
+    punishDecision_4 = re.compile("(?:[\d一二三四五六七八九十]、处理,?意见|综上所述|(?:裁决|处理)依据及结果|综上|[\d一二三四五六七八九十]、处理(?:决定|结果)|经研究决定|[\d一二三四五六七八九十]、投诉处理决定),([^。]+?(?:。|$))")
+    punishDecision_5 = re.compile("(本机关决定|本机关认为|经审查.+?(?:。|$))")
+    punishDecision_6 = re.compile("((?:依据|按照|根据|依照)[^::。].+?(?:。|$))")
+
+    def findDecision(text):
+        decision = ''
+        if punishDecision_1.search(text):
+            decision = punishDecision_1.search(text).group(1)
+
+        elif punishDecision_2.search(text):
+            decision = punishDecision_2.search(text).group(1)
+        elif punishDecision_3.search(text):
+            decision = punishDecision_3.search(text).group(1)
+        elif punishDecision_4.search(text):
+            decision = punishDecision_4.findall(text)
+            decision = decision[-1]
+        elif punishDecision_5.search(text):
+            decision = punishDecision_5.search(text).group(1)
+        elif punishDecision_6.search(text):
+            decision = punishDecision_6.findall(text)
+            decision1 = decision[-1]
+            if re.search("诉讼",decision1) and len(decision)>1:
+                decision1 = decision[-2]
+            decision = decision1
+        return decision
+
+    for text in data['PAGE_CONTENT']:
+        pw = ''
+        if punishWhether_1.search(text):
+            pw = 1
+        elif punishWhether_0.search(text):
+            pw = 0
+        punishWhether.append(pw)
+
+        mid = len(text)//2
+        lower_half = text[mid:]
+        decision = findDecision(lower_half)
+        if decision == '':
+            decision = findDecision(text)
+
+        punishDecision.append(decision)
+    data['punishWhether'] = punishWhether
+    data['punishDecision'] = punishDecision
+    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishWhether&Decision.csv")
+# punishment decision (punishment announcements)
+def get_punishDecision():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv", index_col=0)
+    data = data[data['type'] == '处罚']
+    punishDecision_1 = re.compile("(?:处罚结果|处理结果|处罚结论|处罚内容|处理意见|考评结果|我局决定|处罚决定|[以如]下行政处罚|如下监督处理决定|如下处理决定|处理意见如下|处罚[以如]下|[以如]下处罚|决定如下|处理如下)[::]+((?:(?:[\d一二三四五六七八九十]|[\((][\d一二三四五六七八九十][\))]).+?。)+)")
+    punishDecision_2 = re.compile("(?:处罚结果|处理结果|处罚结论|处罚内容|处理意见|考评结果|我局决定|处罚决定|[以如]下行政处罚|如下监督处理决定|如下处理决定|处理意见如下|处罚[以如]下|[以如]下处罚|决定如下|处理如下)[::]+(.+?(?:。|$))")
+    punishDecision_3 = re.compile("(扣分分?值[::][\d.]+分?)")
+    punishDecision_4 = re.compile("[\d一二三四五六七八九十]、(?:处理结果|处理决定|处理依据[和及]处理结果|处理依据及结果|处罚决定|处罚结果|整改意见),(.+?)。[\d一二三四五六七八九十]、")
+    punishDecision_5 = re.compile("(?:处理结果|[\d一二三四五六七八九十]、处理决定|处理依据及处理结果|处理依据及结果|经研究|经研究决定|[\d一二三四五六七八九十]、处罚决定|处罚结果|整改意见),+(.+?(?:。|$))")
+    punishDecision_6 = re.compile("(?:本机关决定|我局决定)(.+?(?:。|$))")
+    punishDecision_7 = re.compile("((?:依据|按照|根据|依照)[^::。].+?(?:。|$))")
+    punishDecision = []
+    for text in data['PAGE_CONTENT']:
+        decision = ''
+        if punishDecision_1.search(text):
+            decision = punishDecision_1.search(text).group(1)
+        elif punishDecision_2.search(text):
+            decision = punishDecision_2.search(text).group(1)
+        elif punishDecision_3.search(text):
+            decision = punishDecision_3.search(text).group(1)
+        elif punishDecision_4.search(text):
+            decision = punishDecision_4.search(text).group(1)
+        elif punishDecision_5.search(text):
+            decision = punishDecision_5.findall(text)
+            decision = decision[-1]
+        elif punishDecision_6.search(text):
+            decision = punishDecision_6.search(text).group(1)
+        elif punishDecision_7.search(text):
+            decision = punishDecision_7.findall(text)
+            decision = decision[-1]
+        punishDecision.append(decision)
+    data['punishDecision'] = punishDecision
+    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishDecision处罚.csv")
+
+# enforcement institution and punishment date
+def get_institution():
+    data = pd.read_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-1.xlsx", index_col=0)
+    ners = load("nersList.pk")
+    orgs = [[] for _ in range(len(data))]
+    times = [[] for _ in range(len(data))]
+    institutions = [[] for _ in range(len(data))]
+    punishTimes = [[] for _ in range(len(data))]
+    institution_1 = re.compile("(?:处罚执行部门|认定部门|执法机关名称|执法单位|通报部门|处罚机关|处罚部门)[::]")
+    punishTimes_1 = re.compile("(?:处罚日期|限制行为开始时间|曝光开始日期|处罚决定日期|处罚期限|处罚时间|处理日期|公告开始时间)[::]")
+    for ner in ners:
+        if ner['entity_type'] == 'org':
+            left = ner['sentence'][max(0,ner['begin_index']-15):ner['begin_index']]
+            if institution_1.search(left):
+                institutions[ner['article_index']].append(ner['entity_text'])
+            orgs[ner['article_index']].append(ner)
+        elif ner['entity_type'] =='time':
+            left = ner['sentence'][max(0, ner['begin_index'] - 15):ner['begin_index']]
+            if punishTimes_1.search(left):
+                punishTimes[ner['article_index']].append(ner['entity_text'])
+            times[ner['article_index']].append(ner)
+    orgs = [org[-5:] if len(org)>5 else org for org in orgs]
+    times = [time[-3:] if len(time)>3 else time for time in times]
+    data['org'] = orgs
+    data['time'] = times
+    data['institution'] = institutions
+    data['punishTime'] = punishTimes
+    # data = data[data['type'].isin(["投诉","处罚"])]
+    print(len(data))
+    # data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\get_institution.csv")
+    # data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\get_institution.csv", index_col=0)
+    institution_list = []
+    punishTime_list = []
+    institution_title = re.compile("财政局|财政厅|监督管理局|公管局|公共资源局|委员会")
+    institution_time = re.compile("(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
+    for title,text,org,n_time,institution,punishTime in zip(data['PAGE_TITLE'],data['PAGE_CONTENT'],data['org'],data['time'],data['institution'],data['punishTime']):
+        ins = ''
+        ptime = ''
+        if punishTime:
+            ptime = punishTime
+        if institution:
+            ins = institution
+        else:
+            title_ners = getNers([title],useselffool=True)
+            if title_ners[0]:
+
+                for title_ner in title_ners[0]:
+
+                    if title_ner[2]=='org' and institution_title.search(title_ner[3]):
+                        # 'title:'+
+                        ins = title_ner[3]
+                        # print(title_ner[3])
+                        break
+
+        # if ins == '':
+        for _org in org[::-1]:
+            right = _org['sentence'][_org['end_index']:min(len(_org['sentence']),_org['end_index']+16)]
+            if institution_time.search(right):
+                if ins == '':
+                    # "text_EndWithTime:" +
+                    ins = _org['entity_text']
+                if ptime == '':
+                    # "text_EndWithIns:" +
+                    ptime =institution_time.search(right).group(1)
+                break
+        if ptime == '' and len(n_time) != 0:
+            textLong = len(text)
+            if n_time[-1]['wordOffset_end'] > textLong-3 and len(n_time[-1]['entity_text'])>3:
+                # "EndOfText:" +
+                ptime = n_time[-1]['entity_text']
+
+        institution_list.append(ins)
+        punishTime_list.append(ptime)
+    data['institution'] = institution_list
+    data['punishTime'] = punishTime_list
+    data = data.drop(columns=['org','time'],axis=1)
+    data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-2.xlsx")
+
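+# Illustrative sketch: when no explicit "处罚机关:"-style label is found, the
+# code above falls back to (1) a regulator-looking org entity in the title,
+# then (2) the last org entity that is immediately followed by a date -- the
+# typical "机关名 + 落款日期" signature at the end of an announcement. Toy
+# example of heuristic (2), with fabricated offsets:
+def _demo_org_followed_by_date():
+    sentence = "特此通报。泉州市财政局,2020年5月8日。"
+    org_end = 11  # end offset (exclusive) of "泉州市财政局"
+    right = sentence[org_end:org_end + 16]
+    m = re.search("^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?", right)
+    print(m.group() if m else None)  # ,2020年5月8日
+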
+# punishment type classification
+def get_punishType():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
+    # tentative categories: 严重违法失信 (serious dishonesty), 行政处罚 (administrative punishment), 投诉处理 (complaint handling), 监督检查 (supervision & inspection), 其他失信记录 (other bad-credit records)
+
+    # unrelated announcements (filtered purely by title)
+    title_rule = re.compile("(?:中标公[示告]|中标[(\(]成交[\))]公告|采购结果公[示告]|评审结果公告|[侯候]选人公[示告]|成交公[示告]"
+                            "|补贴公[示告]|废标公[示告]|备案公[示告]|数据统计|选取公告|流标公告|变更公告|入围公告|征集公告|执行情况|"
+                            "登记公告|竞争性磋商公告|报名的公[示告]|竞争性谈判公告|邀请函|竞标公告|采购公告|招标公告|议标公告|预审公告|"
+                            "询价公告|竞争性磋商(磋商)公告|竞[谈价]公告|合同公告|人员(名单)?公示|批复|终止公告|入围结果公告|中标结果公[示告]|"
+                            "意见公示)(?:[\((].+?[\))])?$|关于.*通知(?:[^书]|$)")
+    othertype = "其他无关公告"
+    # complaint handling
+    re1_1 = re.compile("投诉[人方]|检举人|举报人[::]|投诉处理|终止投诉|投诉终止|撤诉|撤回投诉|质疑人|质疑单位|质疑[^,,。]*答复")
+    re1_2 = re.compile("处理决定|回复")
+    re1_type = '投诉处理'
+    # supervision and inspection
+    re2 = re.compile("监督检查|监管调查|监督处理")
+    re2_type = "监督检查"
+    # administrative punishment
+    re3 = re.compile("行政处罚|行政处理")
+    re3_type = "行政处罚"
+    # serious illegal and dishonest conduct
+    re4 = re.compile("严重违法失信行为|严重违法失信企业|严重违法失信起名单")
+    re4_type = "严重违法失信"
+    # other bad-credit announcements
+    re_other = re.compile("关于[^,。]+?(?:处罚|处理|通报)|不良行为|不良信用|不良记录|不规范行为|不诚信行为|"
+                          "违[规法约]处[罚理]|处[理罚]依据|处罚日期|扣分依据|认定依据|处罚决定|违规情况|"
+                          "违[规法]行为|违规事项|考评依据|失信行为")
+    re_otherType = "其他失信公告"
+    punishType_list = []
+    for title,text in zip(data['PAGE_TITLE'],data['PAGE_CONTENT']):
+        punishType = ''
+        titleWithText = title + text
+        if title_rule.search(title):
+            punishType = othertype
+        elif re1_1.search(titleWithText) or re.search("投[诉拆]",title):
+            punishType = re1_type
+        elif re1_2.search(titleWithText) and re.search("投诉",titleWithText):
+            punishType = re1_type
+        elif re2.search(titleWithText):
+            punishType = re2_type
+        elif re3.search(titleWithText):
+            punishType = re3_type
+        elif re4.search(titleWithText):
+            punishType = re4_type
+        elif re_other.search(titleWithText) or re.search("处罚",title):
+            punishType = re_otherType
+        punishType_list.append(punishType)
+    data['punishType'] = punishType_list
+    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishType_test.csv",encoding='utf-8')
+
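+# Illustrative sketch: get_punishType() first discards ordinary procurement
+# announcements purely from the title, before any punishment rule is tried.
+# A toy check with a trimmed-down version of title_rule:
+def _demo_title_prefilter():
+    unrelated = re.compile("(?:中标公[示告]|成交公[示告]|招标公告)(?:[\((].+?[\))])?$")
+    print(unrelated.search("某某项目中标公告") is not None)              # True  -> 其他无关公告
+    print(unrelated.search("关于某公司违规行为的处罚公告") is not None)  # False -> falls through to the punishment rules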
+
+def getNers_my(sentences,MAXAREA = 10000,useselffool=False):
+    '''
+    @param sentences: list of sentences to run NER on
+    @return: NER result list, produced with throttled (batched) execution
+    '''
+    def getData(ners,process_data):
+        process_sentences = [item[1] for item in process_data]
+        print(process_data)
+        if useselffool:
+            ner_ = selffool.self_ner(process_sentences)
+        else:
+            ner_ = selffool.ner(process_sentences)
+        print('ner_ :',ner_)
+        for i in range(len(ner_)):
+            the_index = process_data[i][0]
+            ners[the_index] = ner_[i]
+    sents = []
+    for i in range(len(sentences)):
+        sents.append([i,sentences[i]])
+    sents.sort(key=lambda x:len(x[1]),reverse=True)
+    print(sents)
+    index_ = 0
+    ners = [[]for i in range(len(sentences))]
+
+    while(True):
+        width = len(sents[index_][1])
+        height = MAXAREA//width+1
+        if height>len(sents)-index_:
+            height = len(sents)-index_
+        process_data = sents[index_:index_+height]
+        getData( ners, process_data)
+        index_ += height
+        if index_>=len(sents):
+            break
+    return ners
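+
+# Illustrative sketch: getNers_my() throttles NER by sorting sentences by
+# length and batching them so that (longest sentence in batch) x (batch size)
+# stays close to MAXAREA. The batch-size computation in isolation, on toy
+# sentence lengths:
+def _demo_batch_sizes(lengths, MAXAREA=10000):
+    lengths = sorted(lengths, reverse=True)
+    index_, batches = 0, []
+    while index_ < len(lengths):
+        width = lengths[index_]
+        height = min(MAXAREA // width + 1, len(lengths) - index_)
+        batches.append(lengths[index_:index_ + height])
+        index_ += height
+    return batches
+# _demo_batch_sizes([5000, 4000, 30, 20, 10]) -> [[5000, 4000, 30], [20, 10]]
+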
+# raw HTML announcement preprocessing
+def get_article1(articles,cost_time = dict(),useselffool=True):
+    '''
+    :param articles: source HTML of the articles to process
+    :param useselffool: whether to use selffool
+    :return: list_articles
+    '''
+
+    list_articles = []
+    for article in articles:
+        a_time = time.time()
+        sourceContent = article
+        # table handling
+        key_preprocess = "tableToText"
+        start_time = time.time()
+        article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
+
+        # log(article_processed)
+
+        if key_preprocess not in cost_time:
+            cost_time[key_preprocess] = 0
+        cost_time[key_preprocess] += time.time()-start_time
+
+        #article_processed = article[1]
+        list_articles.append(article_processed)
+        print(time.time()-a_time)
+    return list_articles
+# sentence splitting
+def get_sentences1(list_articles,useselffool=True,cost_time=dict()):
+    '''
+    :param list_articles: preprocessed article texts
+    :return: list_sentences
+    '''
+
+    list_sentences = []
+    for article in list_articles:
+        a_time = time.time()
+        list_sentences_temp = []
+        # table handling (already done upstream; kept here only for timing bookkeeping)
+        key_preprocess = "tableToText"
+        start_time = time.time()
+        article_processed = article
+
+
+        if key_preprocess not in cost_time:
+            cost_time[key_preprocess] = 0
+        cost_time[key_preprocess] += time.time()-start_time
+
+        # NLP processing
+        if article_processed is not None and len(article_processed)!=0:
+            split_patten = "。"
+            sentences = []
+            _begin = 0
+            sentences_set = set()
+            for _iter in re.finditer(split_patten,article_processed):
+                _sen = article_processed[_begin:_iter.span()[1]]
+                if len(_sen)>0 and _sen not in sentences_set:
+                    sentences.append(_sen)
+                    sentences_set.add(_sen)
+                _begin = _iter.span()[1]
+            _sen = article_processed[_begin:]
+            if len(_sen)>0 and _sen not in sentences_set:
+                sentences.append(_sen)
+                sentences_set.add(_sen)
+            # article = "".join(sentences)
+            # # sentences.append(article_processed[_begin:])
+            #
+            # lemmas = []
+            # doc_offsets = []
+            # dep_types = []
+            # dep_tokens = []
+            #
+            # time1 = time.time()
+
+            '''
+            tokens_all = fool.cut(sentences)
+            #pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
+            #ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
+            ner_entitys_all = fool.ner(sentences)
+            '''
+            # throttled execution
+            key_nerToken = "nerToken"
+            start_time = time.time()
+            # tokens_all = getTokens(sentences,useselffool=useselffool)
+            if key_nerToken not in cost_time:
+                cost_time[key_nerToken] = 0
+            cost_time[key_nerToken] += time.time()-start_time
+
+
+            for sentence_index in range(len(sentences)):
+
+                sentence_text = sentences[sentence_index]
+                # tokens = tokens_all[sentence_index]
+                #
+                # #pos_tag = pos_all[sentence_index]
+                # pos_tag = ""
+                #
+                # ner_entitys = ""
+
+                list_sentences_temp.append(sentence_text)
+
+        if len(list_sentences_temp)==0:
+            # fallback: no sentence was produced (empty article), so append the
+            # article text itself instead of an unbound sentence_text
+            list_sentences_temp.append(article_processed if article_processed else "")
+        list_sentences.append(list_sentences_temp)
+        print('2:',time.time()-a_time)
+    return list_sentences
+
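+# Illustrative sketch: the splitter above cuts on "。" and drops exact
+# duplicate sentences (boilerplate repeated within one announcement). Toy run:
+def _demo_split_dedup():
+    text = "第一段。第二段。第一段。结尾没有句号"
+    sentences, seen, _begin = [], set(), 0
+    for _iter in re.finditer("。", text):
+        _sen = text[_begin:_iter.span()[1]]
+        if len(_sen) > 0 and _sen not in seen:
+            sentences.append(_sen)
+            seen.add(_sen)
+        _begin = _iter.span()[1]
+    _sen = text[_begin:]
+    if len(_sen) > 0 and _sen not in seen:
+        sentences.append(_sen)
+    return sentences
+# _demo_split_dedup() -> ['第一段。', '第二段。', '结尾没有句号']  (the repeated '第一段。' is dropped)
+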
+# entity-merging experiment ("ronghe" = merge): joins adjacent NER spans separated by "、" or ","
+def ronghe():
+    a = ",投诉处理决定书,投诉人:福建光正工程项目管理有限公司,联系地址:福建省漳州市芗城区水仙大道与东环城路交叉口西南角新城苑北区1幢1301-1305室,被投诉人:泉州台商投资区城市建设发展有限公司,泉州台商投资区水务投资经营有限公司,福建省富诚工程管理有限公司,联系地址:泉州台商投资区通港路大创商厦,一、投诉人投诉事项,投诉人按中标候选人公示的要求参加会议,由于提供的身份证原件于复印件版本不同而被废标,认为废标理由不成立。"
+    ners = [(13, 28, 'company', '福建光正工程项目管理有限公司'), (33, 75, 'location', '福建省漳州市芗城区水仙大道与东环城路交叉口西南角新城苑北区1幢1301-1305室'), (80, 98, 'company', '泉州台商投资区城市建设发展有限公司'), (98, 116, 'company', '泉州台商投资区水务投资经营有限公司'), (116, 130, 'company', '福建省富诚工程管理有限公司'), (135, 150, 'location', '泉州台商投资区通港路大创商厦')]
+    s = ['person', 'org', 'company', 'union']
+    remove_num = 0
+    for i in range(len(ners) - 1):  # stop one early: the loop body reads ners[i + 1]
+        print(0)
+        ner = ners[i]
+        begin = ner[0]
+        end = ner[1]
+        type = ner[2]
+
+        if type in s:
+            if end == ners[i+1][0] and a[end-1]=='、':
+                print(1)
+                new_begin = begin
+                new_end = ners[i+1][1]
+                new_type = 'union'
+                new_text = ner[3]+'、'+ners[i+1][3]
+                new_ner = (new_begin,new_end,new_type,new_text)
+                ners[i] = 0
+                ners[i+1] = new_ner
+                remove_num += 1
+                continue
+            if end == ners[i + 1][0] and a[end-1] == ',' and a[ners[i + 1][1]-1]==a[end-1]:
+                print(2)
+                new_begin = begin
+                new_end = ners[i + 1][1]
+                new_type = 'union'
+                new_text = ner[3] + ',' + ners[i + 1][3]
+                new_ner = (new_begin, new_end, new_type, new_text)
+                ners[i] = 0
+                ners[i + 1] = new_ner
+                remove_num += 1
+
+    for i in range(remove_num):
+        ners.remove(0)
+    print(ners)
+
+if __name__ == '__main__':
+    # get_data1()
+    # get_ners()
+    # test02()
+    # get_unionNers()
+    # complainant / complained-against or punished party
+    # get_complainant()
+    # ronghe()
+    # classification
+    # textClassify()
+    # whether the complaint was upheld, and decision (complaint announcements)
+    # get_punishWhether01()
+    # punishment decision (punishment announcements)
+    # get_punishDecision()
+    # enforcement institution and punishment date
+    get_institution()
+    # punishment type
+    # get_punishType()
+
+    pass

二进制
BiddingKG/dl/complaint/vocab_word.pk