@@ -0,0 +1,831 @@
+import sys
+import os
+sys.path.append(os.path.abspath("../.."))
+import re
+import time  # used below; imported explicitly rather than via the star imports
+import pandas as pd
+from bs4 import BeautifulSoup  # used in get_article1()
+from BiddingKG.dl.common.Utils import *
+from BiddingKG.dl.interface.Entitys import *
+from BiddingKG.dl.interface.predictor import *
+from BiddingKG.dl.foolnltk import selffool
+from BiddingKG.dl.interface.Preprocessing import *
+
+# Load the three raw CSV dumps, clean the HTML, split into sentences, save ALLDATA.csv
+def get_data1():
+    load1 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_TOU_SU_CHU_LI.csv")
+    load2 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_WEI_FA_JI_LU.csv")
+    load3 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_QI_TA_SHI_XIN.csv")
+    load = pd.concat([load1, load2, load3], axis=0)
+    load = load.reset_index(drop=True)
+    load['PAGE_CONTENT'] = get_article1(load['PAGE_CONTENT'])
+    sentences_list = get_sentences1(load['PAGE_CONTENT'])
+    # join sentences with a separator that is unlikely to occur in the text
+    load['sentences'] = ['*#*>'.join(_sentences) for _sentences in sentences_list]
+    load.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv")
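+
+# Minimal sketch (not part of the pipeline): sentences are flattened into one CSV
+# cell with the '*#*>' separator and recovered later by the exact inverse split.
+def _demo_sentence_roundtrip():
+    sentences = ['投诉处理决定书。', '投诉人:某公司。']
+    cell = '*#*>'.join(sentences)
+    assert cell.split('*#*>') == sentences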
+
+# Run NER over every article's sentences and collect one dict per entity
+def get_ners():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
+    # data = data.head(3)
+    nersList = []
+    for index, _sentences in zip(data.index, data['sentences']):
+        _sentences = _sentences.split('*#*>')
+        _ners = getNers(_sentences, useselffool=True)
+        word_index = 0  # running character offset of the sentence within the article
+        for ners, sentence in zip(_ners, _sentences):
+            if len(ners) != 0:
+                word_ner_list = ['O'] * len(sentence)  # per-character BIO tags
+                for ner in ners:
+                    nerDict = dict()
+                    entity_type = ner[2]
+                    nerDict['entity_type'] = entity_type
+                    entity_text = ner[3]
+                    nerDict['entity_text'] = entity_text
+                    begin_index = ner[0]
+                    nerDict['begin_index'] = begin_index
+                    end_index = ner[1] - 1
+                    nerDict['end_index'] = end_index
+                    # offsets relative to the whole article
+                    wordOffset_begin = word_index + begin_index
+                    nerDict['wordOffset_begin'] = wordOffset_begin
+                    wordOffset_end = wordOffset_begin + len(entity_text)
+                    nerDict['wordOffset_end'] = wordOffset_end
+                    nerDict['sentence'] = sentence
+                    nerDict['article_index'] = index
+                    nersList.append(nerDict)
+                    word_ner_list[begin_index] = 'B'
+                    word_ner_list[begin_index + 1:end_index] = ['I'] * (end_index - begin_index - 1)
+            word_index += len(sentence)
+    # save(nersList, "nersList.pk")
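+
+# Minimal, self-contained sketch (not part of the pipeline) of the offset
+# bookkeeping above: a getNers()-style tuple is assumed to be
+# (begin, end, type, text), and word_index shifts sentence offsets to
+# article-level offsets.
+def _demo_ner_offsets():
+    sentence = "投诉人:福建光正工程项目管理有限公司。"
+    ner = (4, 18, 'company', '福建光正工程项目管理有限公司')  # hypothetical NER output
+    entity_text = ner[3]
+    assert sentence[ner[0]:ner[0] + len(entity_text)] == entity_text
+    word_index = 0  # this is the first sentence of the article
+    print('article offsets:', word_index + ner[0], word_index + ner[0] + len(entity_text))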
+
+# Merge adjacent (org/company) and (person) entities, e.g. "A公司、B公司" -> one entity
+def get_unionNers():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
+    ners = load("nersList.pk")
+    org_companys = [[] for _ in range(len(data))]
+    type1 = ['org', 'company', 'union_oc']
+    persons = [[] for _ in range(len(data))]
+    type2 = ['person', 'union_person']
+    for ner in ners:
+        if ner['entity_type'] in type1:
+            org_companys[ner['article_index']].append(ner)
+        if ner['entity_type'] in type2:
+            persons[ner['article_index']].append(ner)
+    # merge org and company entities that sit next to each other (or are separated
+    # only by '、') in the same sentence
+    new_org_companys = []
+    for org_company in org_companys:
+        if org_company and len(org_company) > 1:
+            union_nums = 0
+            for i in range(len(org_company) - 1):
+                if org_company[i]['end_index'] == org_company[i + 1]['begin_index'] - 1 \
+                        and org_company[i]['sentence'][org_company[i]['end_index']] == '、' \
+                        and org_company[i]['sentence'] == org_company[i + 1]['sentence']:
+                    org_company[i + 1]['begin_index'] = org_company[i]['begin_index']
+                    org_company[i + 1]['wordOffset_begin'] = org_company[i]['wordOffset_begin']
+                    org_company[i + 1]['entity_text'] = org_company[i]['entity_text'] + '+' + org_company[i + 1]['entity_text']
+                    org_company[i] = 0  # mark the absorbed entity for removal
+                    union_nums += 1
+                elif org_company[i]['end_index'] == org_company[i + 1]['begin_index'] \
+                        and org_company[i]['sentence'] == org_company[i + 1]['sentence']:
+                    org_company[i + 1]['begin_index'] = org_company[i]['begin_index']
+                    org_company[i + 1]['wordOffset_begin'] = org_company[i]['wordOffset_begin']
+                    org_company[i + 1]['entity_text'] = org_company[i]['entity_text'] + '+' + org_company[i + 1]['entity_text']
+                    org_company[i] = 0
+                    union_nums += 1
+            for _ in range(union_nums):
+                org_company.remove(0)
+        # append unconditionally so the list stays aligned with article indices
+        new_org_companys.append(org_company)
+    # merge persons the same way
+    new_persons = []
+    for person in persons:
+        if person and len(person) > 1:
+            union_nums = 0
+            for i in range(len(person) - 1):
+                if person[i]['end_index'] == person[i + 1]['begin_index'] - 1 \
+                        and person[i]['sentence'][person[i]['end_index']] == '、' \
+                        and person[i]['sentence'] == person[i + 1]['sentence']:
+                    person[i + 1]['begin_index'] = person[i]['begin_index']
+                    person[i + 1]['wordOffset_begin'] = person[i]['wordOffset_begin']
+                    person[i + 1]['entity_text'] = person[i]['entity_text'] + '+' + person[i + 1]['entity_text']
+                    person[i] = 0
+                    union_nums += 1
+                elif person[i]['end_index'] == person[i + 1]['begin_index'] \
+                        and person[i]['sentence'] == person[i + 1]['sentence']:
+                    person[i + 1]['begin_index'] = person[i]['begin_index']
+                    person[i + 1]['wordOffset_begin'] = person[i]['wordOffset_begin']
+                    person[i + 1]['entity_text'] = person[i]['entity_text'] + '+' + person[i + 1]['entity_text']
+                    person[i] = 0
+                    union_nums += 1
+            for _ in range(union_nums):
+                person.remove(0)
+        new_persons.append(person)
+    # save([new_org_companys, new_persons], "unionNers.pk")
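+
+# Minimal sketch (not part of the pipeline) of the adjacency rule above, on fake
+# dicts; end_index is taken here as pointing one past the entity's last character,
+# which is how the merge condition reads it.
+def _demo_union_merge():
+    sentence = "被投诉人:A公司、B公司。"
+    e1 = {'entity_text': 'A公司', 'begin_index': 5, 'end_index': 8, 'sentence': sentence}
+    e2 = {'entity_text': 'B公司', 'begin_index': 9, 'end_index': 12, 'sentence': sentence}
+    if e1['end_index'] == e2['begin_index'] - 1 and sentence[e1['end_index']] == '、':
+        e2['begin_index'] = e1['begin_index']
+        e2['entity_text'] = e1['entity_text'] + '+' + e2['entity_text']
+    print(e2['entity_text'])  # -> A公司+B公司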
+
+def test02():
+    load = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
+
+    text_rule = re.compile("监管调查|通报|不诚信|监督检查|不良|投诉|质疑|处罚|违法|违规|不予[受处]理|处理")
+    title_rule = re.compile("中标公告|中标[(\(]成交[\))]公告|采购结果公[示告]|评审结果公告|[侯候]选人公[示告]|成交公[示告]"
+                            "|补贴公[示告]|废标公[示告]")
+    # need_index = []
+    # for index, title, text in zip(load.index, load['PAGE_TITLE'], load['PAGE_CONTENT']):
+    #     a = 0
+    #     if text_rule.search(text):
+    #         a = 1
+    #     if title_rule.search(title):
+    #         a = 0
+    #     if text_rule.search(title):
+    #         a = 1
+    #     if a:
+    #         need_index.append(index)
+    # print(len(need_index))
+    # load = load.loc[need_index]
+    # print(len(load))
+    # load = load.reset_index(drop=True)
+
+    complainants_rule1 = re.compile("[^被]投[诉拆][人方]之?[\d一二三四五六七八九十]?(?:(.+?))?[::]+?")
+    complaint_rule = re.compile("(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|疑问[人方]|检举[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?名称)?[::]+")
+    complainants_list = []
+    a = 1
+    load = load[9744:9745]  # debug: inspect a single article
+    for article, sentences in zip(load['PAGE_CONTENT'], load['sentences']):
+        print(a)
+        a += 1
+        getSentences = sentences.split('*#*>')
+        # print(getSentences)
+        ners = getNers(getSentences, useselffool=True)
+        print(ners)
+        print('======================')
+        word_index = 0
+        ners_list = []
+        for ner, sentence in zip(ners, getSentences):
+            size = 16  # chars of left context to inspect
+            complainants = []
+            if len(ner) != 0:
+                for aner in ner:
+                    entity_type = aner[2]
+                    entity_text = aner[3]
+                    # begin = word_index + aner[0]
+                    # end = begin + len(entity_text)
+                    # complainant?
+                    if entity_type in ['org', 'company', 'person']:
+                        left = sentence[max(0, aner[0] - size):aner[0]]
+                        print(entity_text, left, sentence)
+                        if complaint_rule.search(left):
+                            print('yes')
+                            entity_type = 'complainant'
+                            complainants.append(entity_text)
+                            # ners_list.append([begin, end, entity_type, entity_text])
+            word_index += len(sentence)
+            complainants_list.append(complainants)  # note: one list per sentence
+
+    # test
+    # for i in ners_list:
+    #     print(i[3])
+    #     print(processed[0][i[0]:i[1]])
+    load['complainant'] = complainants_list
+    # load.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\test01.csv")
+
+# Complainant, complained-about party, punished party
+def get_complainant():
+    data = pd.read_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2.xlsx", index_col=0)
+    # ners = load("nersList.pk")
+    unionNers = load("unionNers.pk")
+    ners = [i + j for i, j in zip(unionNers[0], unionNers[1])]
+    complainants = [[] for _ in range(len(data))]
+    punishPeople = [[] for _ in range(len(data))]
+    a = ['org', 'company', 'person']
+    size = 16  # context window (chars) on each side of the entity
+    # complainant / challenger
+    complainants_rule1 = re.compile("(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+    # punished party / complained-about party
+    punishPeople_rule1 = re.compile("(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+    punishPeople_rule2_1 = re.compile(",$")
+    punishPeople_rule2_2 = re.compile("^[::]")
+    punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
+    punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
+
+    time1 = time.time()
+    for _ner in ners:
+        if _ner:
+            for ner in _ner:
+                left = ner['sentence'][max(0, ner['begin_index'] - size):ner['begin_index']]
+                right = ner['sentence'][ner['end_index']:min(ner['end_index'] + size, len(ner['sentence']))]
+                if complainants_rule1.search(left):
+                    complainants[ner['article_index']].append(ner['entity_text'])
+                elif punishPeople_rule1.search(left):
+                    punishPeople[ner['article_index']].append(ner['entity_text'])
+                elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
+                    # ambiguous label pattern: the article class decides which side
+                    if data['类别'][ner['article_index']] == '投诉处理':
+                        complainants[ner['article_index']].append(ner['entity_text'])
+                    else:
+                        punishPeople[ner['article_index']].append(ner['entity_text'])
+                elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
+                    punishPeople[ner['article_index']].append(ner['entity_text'])
+    data['complainant'] = complainants
+    data['punishPeople'] = punishPeople
+    print(time.time() - time1)
+    data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-1.xlsx")
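+
+# Minimal sketch (not part of the pipeline) of the left-context rule: take up to
+# `size` chars before the entity and test the complainant pattern against them.
+def _demo_context_window():
+    rule = re.compile("(?:[^被]|^)投诉人[::]+$")  # simplified stand-in for complainants_rule1
+    sentence = "投诉人:张三,被投诉人:李四。"
+    begin_index = 4  # offset of the entity '张三'
+    left = sentence[max(0, begin_index - 16):begin_index]
+    print(bool(rule.search(left)))  # -> True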
+
+def get_complainant2(list_sentences, list_entitys, text_type):
+    '''
+    list_sentences: list_sentences from get_preprocessed()
+    list_entitys: list_entitys from get_preprocessed()
+    text_type: article category (punishment type)
+    :return:
+    complainants: list of complainants
+    punishPeople: complained-about / punished parties
+    '''
+    sentences_list = list_sentences
+    entitys_list = list_entitys
+    size = 16
+    a = ['org', 'company', 'person']
+    b = ['org', 'company', 'union_org_company']
+    c = ['person', 'union_person']
+    need_entitys = []
+    for entity in entitys_list:
+        if entity.entity_type in a:
+            need_entitys.append(entity)
+    # merge adjacent entities of the same family within a sentence
+    drop_count = 0
+    for i in range(1, len(need_entitys)):
+        entity = need_entitys[i]
+        entity_begin = entity.wordOffset_begin
+        entity_end = entity.wordOffset_end
+        sentence = sentences_list[entity.sentence_index].sentence_text
+        last_entity = need_entitys[i - 1]
+        if entity.sentence_index == last_entity.sentence_index:
+            if (entity.entity_type in b and last_entity.entity_type in b) or \
+                    (entity.entity_type in c and last_entity.entity_type in c):
+                if entity_begin - last_entity.wordOffset_end < 2 and \
+                        sentence[last_entity.wordOffset_end:entity_begin] in ['', '、', '和', '及']:
+                    need_entitys[i].wordOffset_begin = last_entity.wordOffset_begin
+                    need_entitys[i].begin_index = last_entity.begin_index
+                    need_entitys[i].entity_text = last_entity.entity_text + '+' + entity.entity_text
+                    if entity.entity_type in b:
+                        need_entitys[i].entity_type = 'union_org_company'
+                    else:
+                        need_entitys[i].entity_type = 'union_person'
+                    need_entitys[i - 1] = 0  # mark the absorbed entity for removal
+                    drop_count += 1
+    for _ in range(drop_count):
+        need_entitys.remove(0)
+    # complainant / challenger
+    complainants_rule1 = re.compile(
+        "(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+    # punished party / complained-about party
+    punishPeople_rule1 = re.compile(
+        "(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+    punishPeople_rule2_1 = re.compile(",$")
+    punishPeople_rule2_2 = re.compile("^[::]")
+    punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
+    punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
+    complainants = []
+    punishPeople = []
+    for i in range(len(need_entitys)):
+        entity = need_entitys[i]
+        entity_begin = entity.wordOffset_begin
+        entity_end = entity.wordOffset_end
+        # the sentence containing this entity
+        sentence = sentences_list[entity.sentence_index].sentence_text
+        left = sentence[max(0, entity_begin - size):entity_begin]
+        right = sentence[entity_end:min(entity_end + size, len(sentence))]
+        if complainants_rule1.search(left):
+            complainants.append(entity)
+        elif punishPeople_rule1.search(left):
+            punishPeople.append(entity)
+        elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
+            if text_type == '投诉处理':
+                complainants.append(entity)
+            else:
+                punishPeople.append(entity)
+        elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
+            punishPeople.append(entity)
+
+    # split merged union entities back into their component names
+    result_complainants = []
+    result_punishPeople = []
+    for entity in complainants:
+        if entity.entity_type in ['union_org_company', 'union_person']:
+            entity_text = entity.entity_text.split('+')
+            for item in entity_text:
+                result_complainants.append(item)
+        else:
+            result_complainants.append(entity.entity_text)
+    for entity in punishPeople:
+        if entity.entity_type in ['union_org_company', 'union_person']:
+            entity_text = entity.entity_text.split('+')
+            for item in entity_text:
+                result_punishPeople.append(item)
+        else:
+            result_punishPeople.append(entity.entity_text)
+    return list(set(result_complainants)), list(set(result_punishPeople))
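+
+# Hypothetical usage sketch (not part of the original): the arguments come from the
+# project's preprocessing step; the exact producer function is assumed here.
+# list_sentences, list_entitys = preprocess(articles)   # assumed helper
+# complainants, punishPeople = get_complainant2(list_sentences[0], list_entitys[0], '投诉处理')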
+
+# Announcement classification: complaint vs. punishment
+def textClassify():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
+    # complainant / informant / whistle-blower / challenger keywords
+    patten1 = "投诉人|检举人|举报人|质疑人|质疑函|投诉处理|质疑单位"
+    re1 = re.compile(patten1)
+    patten2 = "不予[处受]理|撤诉|撤[销回]投诉|投诉终止"
+    re2 = re.compile(patten2)
+    patten3 = "关于[^,。]+?(?:处罚|通报|处理意见)|被处罚人|处罚决定|限制行为开始时间|处罚执行部门"
+    re3 = re.compile(patten3)
+    patten4 = "不良行为|不良信用|不良记录|不规范行为|不诚信行为"
+    re4 = re.compile(patten4)
+    patten5 = "行政处罚|行政处理|监督检查|监管调查|监督处理|违规处[罚理]|违法处[罚理]"
+    re5 = re.compile(patten5)
+    patten6 = "严重违法失信起名单|严重违法失信行为|严重违法失信企业"
+    re6 = re.compile(patten6)
+    patten7 = '处理决定'
+    re7 = re.compile(patten7)
+    patten8 = "处[理罚]依据|处罚日期|扣分依据|认定依据"
+    re8 = re.compile(patten8)
+    pos = []
+    _type = []
+    for title, text in zip(data['PAGE_TITLE'], data["PAGE_CONTENT"]):
+        p = []
+        t = ''
+        if re1.search(text) or re1.search(title):
+            p.append(patten1)
+            t = '投诉'
+        elif re2.search(text) and re.search('投诉', text):
+            p.append('投诉+' + patten2)
+            t = '投诉'
+        elif re.search("回复", title):
+            p.append("回复")
+            t = '投诉'
+        if len(p) == 0:
+            if re3.search(title) or re3.search(text):
+                p.append(patten3)
+                t = '处罚'
+            elif re4.search(title):
+                p.append(patten4)
+                t = '处罚'
+            elif re5.search(title) or re5.search(text):
+                p.append(patten5)
+                t = '处罚'
+            elif re6.search(text) or re6.search(title):
+                p.append(patten6)
+                t = '处罚'
+            elif re8.search(text):
+                p.append(patten8)
+                t = '处罚'
+        if len(p) == 0:
+            if re7.search(text) and re.search('投诉', text):
+                p.append('投诉+' + patten7)
+                t = '投诉'
+            elif re7.search(text) or re7.search(title):
+                p.append("处罚+" + patten7)
+                t = '处罚'
+        pos.append(p)
+        _type.append(t)
+    data['pos'] = pos
+    data['type'] = _type
+    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv")
+
+# Whether the complaint is upheld, plus the decision text
+def get_punishWhether01():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv", index_col=0)
+    data = data[data['type'] == '投诉']
+    punishWhether_1 = re.compile("投诉[^。,,不]+?成立|投诉[^。,,]*[^不]属实|情况[^。,,]*[^不]属实|投诉成立|情况属实|予以支持")
+    punishWhether_0 = re.compile("投诉[^。,,]*不能?成立|撤诉|[^逾将]{4,}不予[受处]理|撤[回销][^。,,]*(?:举报|投诉)|驳回[^。,,]*投诉|投诉终止|终止[^。,,]*投诉|情况[^。,,]*不属实|投诉[^。,,]*不属实|缺乏事实依据|不予支持|予以驳回")
+    punishWhether = []
+    punishDecision = []
+    punishDecision_1 = re.compile("(?:决定|认定|综上所述|决定如下|处理结果|处理如下|处理结果公布)[::]((?:(?:[\d一二三四五六七八九十]|[\((][\d一二三四五六七八九十][\))]|投[诉拆]事项[\d一二三四五六七八九十]).+?。)+)")
+    punishDecision_2 = re.compile("(?:决定|认定|综上所述|决定如下|处理结果|处理如下|处理结果公布)[::]([^。]+?(?:。|$))")
+    punishDecision_3 = re.compile("[\d一二三四五六七八九十]、(?:处理,?意见|[裁决|处理]依据及结果|处理(?:决定|结果)|投诉处理决定),(.+?)。[\d一二三四五六七八九十]、")
+    punishDecision_4 = re.compile("(?:[\d一二三四五六七八九十]、处理,?意见|综上所述|[裁决|处理]依据及结果|综上|[\d一二三四五六七八九十]、处理(?:决定|结果)|经研究决定|[\d一二三四五六七八九十]、投诉处理决定),([^。]+?(?:。|$))")
+    punishDecision_5 = re.compile("(本机关决定|本机关认为|经审查.+?(?:。|$))")
+    punishDecision_6 = re.compile("((?:依据|按照|根据|依照)[^::。].+?(?:。|$))")
+
+    def findDecision(text):
+        decision = ''
+        if punishDecision_1.search(text):
+            decision = punishDecision_1.search(text).group(1)
+        elif punishDecision_2.search(text):
+            decision = punishDecision_2.search(text).group(1)
+        elif punishDecision_3.search(text):
+            decision = punishDecision_3.search(text).group(1)
+        elif punishDecision_4.search(text):
+            decision = punishDecision_4.findall(text)
+            decision = decision[-1]
+        elif punishDecision_5.search(text):
+            decision = punishDecision_5.search(text).group(1)
+        elif punishDecision_6.search(text):
+            decision = punishDecision_6.findall(text)
+            decision1 = decision[-1]
+            # skip a final clause that merely mentions litigation rights
+            if re.search("诉讼", decision1) and len(decision) > 1:
+                decision1 = decision[-2]
+            decision = decision1
+        return decision
+
+    for text in data['PAGE_CONTENT']:
+        pw = ''
+        if punishWhether_1.search(text):
+            pw = 1
+        elif punishWhether_0.search(text):
+            pw = 0
+        punishWhether.append(pw)
+
+        # decisions usually sit near the end; search the second half first
+        mid = len(text) // 2
+        lower_half = text[mid:]
+        decision = findDecision(lower_half)
+        if decision == '':
+            decision = findDecision(text)
+        punishDecision.append(decision)
+    data['punishWhether'] = punishWhether
+    data['punishDecision'] = punishDecision
+    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishWhether&Decision.csv")
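+
+# Minimal sketch (not part of the pipeline) of the upheld / not-upheld patterns,
+# using trimmed variants of punishWhether_1 / punishWhether_0.
+def _demo_punish_whether():
+    upheld = re.compile("投诉[^。,,不]+?成立|投诉成立|情况属实|予以支持")
+    rejected = re.compile("投诉[^。,,]*不能?成立|撤诉|不予支持")
+    print(upheld.search("经查,投诉事项成立。") is not None)          # -> True
+    print(rejected.search("投诉事项不成立,予以驳回。") is not None)  # -> True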
+
+# Punishment decision (for punishment announcements)
+def get_punishDecision():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv", index_col=0)
+    data = data[data['type'] == '处罚']
+    punishDecision_1 = re.compile("(?:处罚结果|处理结果|处罚结论|处罚内容|处理意见|考评结果|我局决定|处罚决定|[以如]下行政处罚|如下监督处理决定|如下处理决定|处理意见如下|处罚[以如]下|[以如]下处罚|决定如下|处理如下)[::]+((?:(?:[\d一二三四五六七八九十]|[\((][\d一二三四五六七八九十][\))]).+?。)+)")
+    punishDecision_2 = re.compile("(?:处罚结果|处理结果|处罚结论|处罚内容|处理意见|考评结果|我局决定|处罚决定|[以如]下行政处罚|如下监督处理决定|如下处理决定|处理意见如下|处罚[以如]下|[以如]下处罚|决定如下|处理如下)[::]+(.+?(?:。|$))")
+    punishDecision_3 = re.compile("(扣分分?值[::][\d.]+分?)")
+    punishDecision_4 = re.compile("[\d一二三四五六七八九十]、(?:处理结果|处理决定|处理依据[和及]处理结果|处理依据及结果|处罚决定|处罚结果|整改意见),(.+?)。[\d一二三四五六七八九十]、")
+    punishDecision_5 = re.compile("(?:处理结果|[\d一二三四五六七八九十]、处理决定|处理依据及处理结果|处理依据及结果|经研究|经研究决定|[\d一二三四五六七八九十]、处罚决定|处罚结果|整改意见),+(.+?(?:。|$))")
+    punishDecision_6 = re.compile("(?:本机关决定|我局决定)(.+?(?:。|$))")
+    punishDecision_7 = re.compile("((?:依据|按照|根据|依照)[^::。].+?(?:。|$))")
+    punishDecision = []
+    for text in data['PAGE_CONTENT']:
+        decision = ''
+        if punishDecision_1.search(text):
+            decision = punishDecision_1.search(text).group(1)
+        elif punishDecision_2.search(text):
+            decision = punishDecision_2.search(text).group(1)
+        elif punishDecision_3.search(text):
+            decision = punishDecision_3.search(text).group(1)
+        elif punishDecision_4.search(text):
+            decision = punishDecision_4.search(text).group(1)
+        elif punishDecision_5.search(text):
+            decision = punishDecision_5.findall(text)
+            decision = decision[-1]
+        elif punishDecision_6.search(text):
+            decision = punishDecision_6.search(text).group(1)
+        elif punishDecision_7.search(text):
+            decision = punishDecision_7.findall(text)
+            decision = decision[-1]
+        punishDecision.append(decision)
+    data['punishDecision'] = punishDecision
+    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishDecision处罚.csv")
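+
+# Minimal sketch (not part of the pipeline): a punishDecision_3-style score-deduction
+# capture and a punishDecision_2-style "result:" capture.
+def _demo_punish_decision():
+    deduction = re.compile("(扣分分?值[::][\d.]+分?)")
+    print(deduction.search("考评结果,扣分值:3分。").group(1))  # -> 扣分值:3分
+    decision = re.compile("(?:处罚结果|处理结果)[::]+(.+?(?:。|$))")
+    print(decision.search("处理结果:给予警告。").group(1))  # -> 给予警告。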
+
+# Enforcement institution and punishment time
+def get_institution():
+    data = pd.read_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-1.xlsx", index_col=0)
+    ners = load("nersList.pk")
+    orgs = [[] for _ in range(len(data))]
+    times = [[] for _ in range(len(data))]
+    institutions = [[] for _ in range(len(data))]
+    punishTimes = [[] for _ in range(len(data))]
+    institution_1 = re.compile("(?:处罚执行部门|认定部门|执法机关名称|执法单位|通报部门|处罚机关|处罚部门)[::]")
+    punishTimes_1 = re.compile("(?:处罚日期|限制行为开始时间|曝光开始日期|处罚决定日期|处罚期限|处罚时间|处理日期|公告开始时间)[::]")
+    for ner in ners:
+        if ner['entity_type'] == 'org':
+            left = ner['sentence'][max(0, ner['begin_index'] - 15):ner['begin_index']]
+            if institution_1.search(left):
+                institutions[ner['article_index']].append(ner['entity_text'])
+            orgs[ner['article_index']].append(ner)
+        elif ner['entity_type'] == 'time':
+            left = ner['sentence'][max(0, ner['begin_index'] - 15):ner['begin_index']]
+            if punishTimes_1.search(left):
+                punishTimes[ner['article_index']].append(ner['entity_text'])
+            times[ner['article_index']].append(ner)
+    # keep only the last few candidates per article
+    orgs = [org[-5:] if len(org) > 5 else org for org in orgs]
+    times = [t[-3:] if len(t) > 3 else t for t in times]  # `t`, not `time`: avoid shadowing the module
+    data['org'] = orgs
+    data['time'] = times
+    data['institution'] = institutions
+    data['punishTime'] = punishTimes
+    # data = data[data['type'].isin(["投诉","处罚"])]
+    print(len(data))
+    # data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\get_institution.csv")
+    # data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\get_institution.csv", index_col=0)
+    institution_list = []
+    punishTime_list = []
+    institution_title = re.compile("财政局|财政厅|监督管理局|公管局|公共资源局|委员会")
+    institution_time = re.compile("(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
+    for title, text, org, n_time, institution, punishTime in zip(data['PAGE_TITLE'], data['PAGE_CONTENT'], data['org'], data['time'], data['institution'], data['punishTime']):
+        ins = ''
+        ptime = ''
+        if punishTime:
+            ptime = punishTime
+        if institution:
+            ins = institution
+        else:
+            # fall back to an org entity found in the title
+            title_ners = getNers([title], useselffool=True)
+            if title_ners[0]:
+                for title_ner in title_ners[0]:
+                    if title_ner[2] == 'org' and institution_title.search(title_ner[3]):
+                        # 'title:' +
+                        ins = title_ner[3]
+                        break
+            # fall back to the last org entity immediately followed by a date
+            # if ins == '':
+            for _org in org[::-1]:
+                right = _org['sentence'][_org['end_index']:min(len(_org['sentence']), _org['end_index'] + 16)]
+                if institution_time.search(right):
+                    if ins == '':
+                        # "text_EndWithTime:" +
+                        ins = _org['entity_text']
+                    if ptime == '':
+                        # "text_EndWithIns:" +
+                        ptime = institution_time.search(right).group(1)
+                    break
+        if ptime == '' and len(n_time) != 0:
+            textLong = len(text)
+            # a long time entity at the very end of the article is likely the sign-off date
+            if n_time[-1]['wordOffset_end'] > textLong - 3 and len(n_time[-1]['entity_text']) > 3:
+                # "EndOfText:" +
+                ptime = n_time[-1]['entity_text']
+
+        institution_list.append(ins)
+        punishTime_list.append(ptime)
+    data['institution'] = institution_list
+    data['punishTime'] = punishTime_list
+    data = data.drop(columns=['org', 'time'], axis=1)
+    data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-2.xlsx")
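+
+# Minimal sketch (not part of the pipeline): an org entity immediately followed by a
+# date is read as the signing institution, and the date as the punish time.
+def _demo_institution_time():
+    institution_time = re.compile("(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
+    right = ",2019年5月10日"  # hypothetical text to the right of an org entity
+    print(institution_time.search(right).group(1))  # -> ,2019年5月10日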
+
+# Punishment type
+def get_punishType():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
+    # Tentative categories: serious illegal/dishonest conduct, administrative punishment,
+    # complaint handling, supervision & inspection, other dishonesty records
+
+    # other, unrelated announcements
+    title_rule = re.compile("(?:中标公[示告]|中标[(\(]成交[\))]公告|采购结果公[示告]|评审结果公告|[侯候]选人公[示告]|成交公[示告]"
+                            "|补贴公[示告]|废标公[示告]|备案公[示告]|数据统计|选取公告|流标公告|变更公告|入围公告|征集公告|执行情况|"
+                            "登记公告|竞争性磋商公告|报名的公[示告]|竞争性谈判公告|邀请函|竞标公告|采购公告|招标公告|议标公告|预审公告|"
+                            "询价公告|竞争性磋商(磋商)公告|竞[谈价]公告|合同公告|人员(名单)?公示|批复|终止公告|入围结果公告|中标结果公[示告]|"
+                            "意见公示)(?:[\((].+?[\))])?$|关于.*通知(?:[^书]|$)")
+    othertype = "其他无关公告"
+    # complaint handling
+    re1_1 = re.compile("投诉[人方]|检举人|举报人[::]|投诉处理|终止投诉|投诉终止|撤诉|撤回投诉|质疑人|质疑单位|质疑[^,,。]*答复")
+    re1_2 = re.compile("处理决定|回复")
+    re1_type = '投诉处理'
+    # supervision & inspection
+    re2 = re.compile("监督检查|监管调查|监督处理")
+    re2_type = "监督检查"
+    # administrative punishment
+    re3 = re.compile("行政处罚|行政处理")
+    re3_type = "行政处罚"
+    # serious illegal / dishonest conduct
+    re4 = re.compile("严重违法失信行为|严重违法失信企业|严重违法失信起名单")
+    re4_type = "严重违法失信"
+    # other dishonesty announcements
+    re_other = re.compile("关于[^,。]+?(?:处罚|处理|通报)|不良行为|不良信用|不良记录|不规范行为|不诚信行为|"
+                          "违[规法约]处[罚理]|处[理罚]依据|处罚日期|扣分依据|认定依据|处罚决定|违规情况|"
+                          "违[规法]行为|违规事项|考评依据|失信行为")
+    re_otherType = "其他失信公告"
+    punishType_list = []
+    for title, text in zip(data['PAGE_TITLE'], data['PAGE_CONTENT']):
+        punishType = ''
+        titleWithText = title + text
+        if title_rule.search(title):
+            punishType = othertype
+        elif re1_1.search(titleWithText) or re.search("投[诉拆]", title):
+            punishType = re1_type
+        elif re1_2.search(titleWithText) and re.search("投诉", titleWithText):
+            punishType = re1_type
+        elif re2.search(titleWithText):
+            punishType = re2_type
+        elif re3.search(titleWithText):
+            punishType = re3_type
+        elif re4.search(titleWithText):
+            punishType = re4_type
+        elif re_other.search(titleWithText) or re.search("处罚", title):
+            punishType = re_otherType
+        punishType_list.append(punishType)
+    data['punishType'] = punishType_list
+    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishType_test.csv", encoding='utf-8')
+
+def getNers_my(sentences, MAXAREA=10000, useselffool=False):
+    '''
+    @param sentences: list of sentences
+    @return: NER results, computed in rate-limited batches (width * height <= MAXAREA)
+    '''
+    def getData(ners, process_data):
+        process_sentences = [item[1] for item in process_data]
+        print(process_data)
+        if useselffool:
+            ner_ = selffool.self_ner(process_sentences)
+        else:
+            ner_ = selffool.ner(process_sentences)
+        print('ner_ :', ner_)
+        for i in range(len(ner_)):
+            the_index = process_data[i][0]
+            ners[the_index] = ner_[i]
+
+    sents = []
+    for i in range(len(sentences)):
+        sents.append([i, sentences[i]])
+    # longest sentences first, so each batch's width is its first element
+    sents.sort(key=lambda x: len(x[1]), reverse=True)
+    print(sents)
+    index_ = 0
+    ners = [[] for i in range(len(sentences))]
+
+    while True:
+        width = len(sents[index_][1])
+        height = MAXAREA // width + 1
+        if height > len(sents) - index_:
+            height = len(sents) - index_
+        process_data = sents[index_:index_ + height]
+        getData(ners, process_data)
+        index_ += height
+        if index_ >= len(sents):
+            break
+    return ners
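+
+# Minimal sketch (not part of the pipeline) of the MAXAREA batching idea above:
+# sort by length, take width*height-bounded batches, then write results back to
+# their original positions. `fake_ner` stands in for selffool.
+def _demo_batching():
+    def fake_ner(batch):
+        return [[(0, len(s), 'len', s)] for s in batch]
+    sentences = ['短句。', '这是一条比较长的句子。', '中等长度句子。']
+    MAXAREA = 12
+    sents = sorted(enumerate(sentences), key=lambda x: len(x[1]), reverse=True)
+    ners = [[] for _ in sentences]
+    index_ = 0
+    while index_ < len(sents):
+        width = len(sents[index_][1])
+        height = min(MAXAREA // width + 1, len(sents) - index_)
+        batch = sents[index_:index_ + height]
+        for (orig_i, _), ner in zip(batch, fake_ner([s for _, s in batch])):
+            ners[orig_i] = ner
+        index_ += height
+    print([n[0][3] for n in ners])  # results restored to input order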
+
+# HTML announcement preprocessing
+def get_article1(articles, cost_time=dict(), useselffool=True):
+    '''
+    :param articles: raw article source HTML to process
+    :param useselffool: whether to use selffool
+    :return: list_articles
+    '''
+    list_articles = []
+    for article in articles:
+        a_time = time.time()
+        sourceContent = article
+        # table handling
+        key_preprocess = "tableToText"
+        start_time = time.time()
+        article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))
+
+        # log(article_processed)
+
+        if key_preprocess not in cost_time:
+            cost_time[key_preprocess] = 0
+        cost_time[key_preprocess] += time.time() - start_time
+
+        # article_processed = article[1]
+        list_articles.append(article_processed)
+        print(time.time() - a_time)
+    return list_articles
+
+# Sentence splitting
+def get_sentences1(list_articles, useselffool=True, cost_time=dict()):
+    '''
+    :param list_articles: preprocessed article text
+    :return: list_sentences
+    '''
+    list_sentences = []
+    for article in list_articles:
+        a_time = time.time()
+        list_sentences_temp = []
+        # table handling
+        key_preprocess = "tableToText"
+        start_time = time.time()
+        article_processed = article
+
+        if key_preprocess not in cost_time:
+            cost_time[key_preprocess] = 0
+        cost_time[key_preprocess] += time.time() - start_time
+
+        # NLP handling
+        if article_processed is not None and len(article_processed) != 0:
+            split_patten = "。"
+            sentences = []
+            _begin = 0
+            sentences_set = set()  # drop duplicate sentences
+            for _iter in re.finditer(split_patten, article_processed):
+                _sen = article_processed[_begin:_iter.span()[1]]
+                if len(_sen) > 0 and _sen not in sentences_set:
+                    sentences.append(_sen)
+                    sentences_set.add(_sen)
+                _begin = _iter.span()[1]
+            _sen = article_processed[_begin:]
+            if len(_sen) > 0 and _sen not in sentences_set:
+                sentences.append(_sen)
+                sentences_set.add(_sen)
+            # article = "".join(sentences)
+            '''
+            tokens_all = fool.cut(sentences)
+            #pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
+            #ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
+            ner_entitys_all = fool.ner(sentences)
+            '''
+            # rate-limited execution
+            key_nerToken = "nerToken"
+            start_time = time.time()
+            # tokens_all = getTokens(sentences, useselffool=useselffool)
+            if key_nerToken not in cost_time:
+                cost_time[key_nerToken] = 0
+            cost_time[key_nerToken] += time.time() - start_time
+
+            for sentence_index in range(len(sentences)):
+                sentence_text = sentences[sentence_index]
+                list_sentences_temp.append(sentence_text)
+
+            if len(list_sentences_temp) == 0:
+                # fall back to the whole article as a single "sentence"
+                # (the original appended the undefined loop variable here)
+                list_sentences_temp.append(article_processed)
+        list_sentences.append(list_sentences_temp)
+        print('2:', time.time() - a_time)
+    return list_sentences
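+
+# Minimal sketch (not part of the pipeline) of the '。'-based split with
+# de-duplication used above.
+def _demo_split_sentences():
+    article = "第一句。第二句。第一句。结尾无句号"
+    sentences, seen, _begin = [], set(), 0
+    for _iter in re.finditer("。", article):
+        _sen = article[_begin:_iter.span()[1]]
+        if len(_sen) > 0 and _sen not in seen:
+            sentences.append(_sen)
+            seen.add(_sen)
+        _begin = _iter.span()[1]
+    _sen = article[_begin:]
+    if len(_sen) > 0 and _sen not in seen:
+        sentences.append(_sen)
+    print(sentences)  # -> ['第一句。', '第二句。', '结尾无句号']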
+
+# Scratch test of merging adjacent entities ("ronghe" = merge) on a hard-coded example
+def ronghe():
+    a = ",投诉处理决定书,投诉人:福建光正工程项目管理有限公司,联系地址:福建省漳州市芗城区水仙大道与东环城路交叉口西南角新城苑北区1幢1301-1305室,被投诉人:泉州台商投资区城市建设发展有限公司,泉州台商投资区水务投资经营有限公司,福建省富诚工程管理有限公司,联系地址:泉州台商投资区通港路大创商厦,一、投诉人投诉事项,投诉人按中标候选人公示的要求参加会议,由于提供的身份证原件于复印件版本不同而被废标,认为废标理由不成立。"
+    ners = [(13, 28, 'company', '福建光正工程项目管理有限公司'), (33, 75, 'location', '福建省漳州市芗城区水仙大道与东环城路交叉口西南角新城苑北区1幢1301-1305室'), (80, 98, 'company', '泉州台商投资区城市建设发展有限公司'), (98, 116, 'company', '泉州台商投资区水务投资经营有限公司'), (116, 130, 'company', '福建省富诚工程管理有限公司'), (135, 150, 'location', '泉州台商投资区通港路大创商厦')]
+    s = ['person', 'org', 'company', 'union']
+    remove_num = 0
+    for i in range(len(ners) - 1):  # stop before the last entity: ners[i + 1] is accessed
+        ner = ners[i]
+        begin = ner[0]
+        end = ner[1]
+        ner_type = ner[2]  # renamed from `type` to avoid shadowing the builtin
+
+        if ner_type in s:
+            if end == ners[i + 1][0] and a[end - 1] == '、':
+                print(1)
+                new_begin = begin
+                new_end = ners[i + 1][1]
+                new_type = 'union'
+                new_text = ner[3] + '、' + ners[i + 1][3]
+                new_ner = (new_begin, new_end, new_type, new_text)
+                ners[i] = 0
+                ners[i + 1] = new_ner
+                remove_num += 1
+                continue
+            if end == ners[i + 1][0] and a[end - 1] == ',' and a[ners[i + 1][1] - 1] == a[end - 1]:
+                print(2)
+                new_begin = begin
+                new_end = ners[i + 1][1]
+                new_type = 'union'
+                new_text = ner[3] + ',' + ners[i + 1][3]
+                new_ner = (new_begin, new_end, new_type, new_text)
+                ners[i] = 0
+                ners[i + 1] = new_ner
+                remove_num += 1
+
+    for i in range(remove_num):
+        ners.remove(0)
+    print(ners)
+
+if __name__ == '__main__':
+    # get_data1()
+    # get_ners()
+    # test02()
+    # get_unionNers()
+    # complainant / complained-about & punished parties
+    # get_complainant()
+    # ronghe()
+    # classification
+    # textClassify()
+    # whether the complaint is upheld, decision text (complaints)
+    # get_punishWhether01()
+    # decision text (punishments)
+    # get_punishDecision()
+    # enforcement institution, punishment time
+    get_institution()
+    # punishment type
+    # get_punishType()
+
+    pass