Add complaint/punishment announcement extraction model: CRF extraction of punishment numbers, plus regex extraction of elements such as category, handling decision and complainant. First commit.

bidi, 4 years ago
parent
commit
5e0d431c28
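A minimal usage sketch of the new interface (based on punish_rule.py in this commit; get_punish_extracts reads the module-level punish instance, so it is driven from that script's own __main__ block):

    punish = Punish_Extract(model_file="models/21-0.9990081295021194-0.3647936/model.ckpt")
    punish_code, punishType, punishDecision, complainants, punishPeople, \
        punishWhether, institutions, punishTimes = get_punish_extracts(
            text='行政处罚厦建招诉决【2019】34号。')
    print(punish_code, punishType, punishDecision)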

+ 6 - 0
BiddingKG/dl/complaint/models/21-0.9990081295021194-0.3647936/checkpoint

@@ -0,0 +1,6 @@
+model_checkpoint_path: "model.ckpt"
+all_model_checkpoint_paths: "..\\9-0.9983888954343817-0.6076048\\model.ckpt"
+all_model_checkpoint_paths: "..\\10-0.9984710946469133-0.58896327\\model.ckpt"
+all_model_checkpoint_paths: "..\\11-0.9986902925469974-0.50287944\\model.ckpt"
+all_model_checkpoint_paths: "..\\16-0.9989259302895879-0.39168403\\model.ckpt"
+all_model_checkpoint_paths: "model.ckpt"

BIN
BiddingKG/dl/complaint/models/21-0.9990081295021194-0.3647936/model.ckpt.data-00000-of-00001


BIN
BiddingKG/dl/complaint/models/21-0.9990081295021194-0.3647936/model.ckpt.index


BIN
BiddingKG/dl/complaint/models/21-0.9990081295021194-0.3647936/model.ckpt.meta


+ 364 - 0
BiddingKG/dl/complaint/punishNo_tf.py

@@ -0,0 +1,364 @@
+import tensorflow as tf
+from tensorflow.contrib.crf import crf_log_likelihood, viterbi_decode
+from tensorflow.contrib.layers.python.layers import initializers
+import numpy as np
+import pandas as pd
+from zipfile import ZipFile
+import os
+import re
+import pickle
+from BiddingKG.dl.common.Utils import *
+from keras.preprocessing.sequence import pad_sequences
+
+# class BiLSTM_CRF_tf(object):
+#     def __init__(self):
+
+def BiLSTM_CRF_tfmodel(sess,weights):
+    BiRNN_Units = 140
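+    # Tag scheme for sequence labelling: PN_B / PN_M / PN_E mark the first, middle and last
+    # characters of a punishment document number (e.g. "厦建招诉决【2019】34号"); O is any other character.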
+    chunk_tags = {
+        'O': 0,
+        'PN_B': 1,
+        'PN_M': 2,
+        'PN_E': 3
+    }
+
+    def embedding_layer(input):
+        embedding = tf.get_variable("embedding",initializer=np.array(weights,dtype=np.float32) if weights is not None else None,dtype=tf.float32)
+        return tf.nn.embedding_lookup(params=embedding,ids=input)
+
+    def BiLSTM_Layer(input,length):
+        with tf.variable_scope("BiLSTM"):
+            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
+            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
+        output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell,backward_cell,input,dtype=tf.float32,sequence_length=length)
+        output = tf.concat(output,2)
+        return output
+
+    def CRF_layer(input,num_tags,BiRNN_Units,time_step):
+        with tf.variable_scope("CRF"):
+            with tf.variable_scope("hidden"):
+                w_hidden = tf.get_variable(name='w_hidden',shape=(BiRNN_Units,BiRNN_Units//2),dtype=tf.float32,
+                                           initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
+                b_hidden = tf.get_variable(name='b_hidden',shape=(BiRNN_Units//2),dtype=tf.float32,initializer=tf.zeros_initializer())
+                # print(input)
+                input_reshape = tf.reshape(input,shape=(-1,BiRNN_Units))
+                hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape,w_hidden,b_hidden))
+            with tf.variable_scope("output"):
+                w_output = tf.get_variable(name='w_output',shape=(BiRNN_Units//2,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
+                b_output = tf.get_variable(name='b_output',shape=(num_tags),dtype=tf.float32,initializer=tf.zeros_initializer())
+                pred = tf.nn.xw_plus_b(hidden,w_output,b_output)
+                logits_ = tf.reshape(pred,shape=(-1,time_step,num_tags),name='logits')
+        return logits_
+
+    def layer_loss(input,true_target,num_tags,length):
+        with tf.variable_scope("crf_loss"):
+            trans = tf.get_variable(name='transitons',shape=(num_tags,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer())
+            log_likelihood,trans = crf_log_likelihood(inputs=input,tag_indices=true_target,transition_params=trans,sequence_lengths=length)
+            return tf.reduce_mean(-log_likelihood),trans
+
+    with sess.graph.as_default():
+        char_input = tf.placeholder(name='char_input',shape=(None,None),dtype=tf.int32)
+        target = tf.placeholder(name='target',shape=(None,None),dtype=tf.int32)
+        length = tf.placeholder(name='length',shape=(None,),dtype=tf.int32)
+        # keepprob = tf.placeholder(name='keepprob',dtype=tf.float32)
+
+        _embedding = embedding_layer(char_input)
+        _shape = tf.shape(char_input)
+        batch_size = _shape[0]
+        step_size = _shape[-1]
+        bilstm = BiLSTM_Layer(_embedding,length)
+        _logits = CRF_layer(bilstm,num_tags=len(chunk_tags),BiRNN_Units=BiRNN_Units,time_step=step_size)
+        crf_loss,trans = layer_loss(_logits,true_target=target,num_tags=len(chunk_tags),length=length)
+        global_step = tf.Variable(0,trainable=False)
+        with tf.variable_scope("optimizer"):
+            opt = tf.train.AdamOptimizer(0.002)
+            grads_vars = opt.compute_gradients(crf_loss)
+            capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]
+            train_op = opt.apply_gradients(capped_grads_vars,global_step)
+            return char_input,_logits,target,length,crf_loss,trans,train_op
+
+def train():
+    vocab_model = getModel_word()
+    vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
+    # print(w2v_matrix)
+    punishNo = {
+        'O': 0,
+        'PN_B': 1,
+        'PN_M': 2,
+        'PN_E': 3
+    }
+    punishNo_2 = {
+        'O': np.array([1, 0, 0, 0]),
+        'PN_B': np.array([0, 1, 0, 0]),
+        'PN_M': np.array([0, 0, 1, 0]),
+        'PN_E': np.array([0, 0, 0, 1])
+    }
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\db_alldata.csv", index_col=0)
+
+    train_data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishment_code_new.csv", index_col=0)
+    train_data['text'] = [data['text'][data['document_id'] == id] for id in train_data['document_id']]
+    data_x = []
+    data_y = []
+
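+    # build per-character O/PN_B/PN_M/PN_E label sequences from the annotated entity spans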
+    articles_label = ['' for _ in range(13500)]
+    punishNo_in_text = set()
+    for textId, begin, end, entity_text, text in zip(train_data['document_id'], train_data['begin_index'],
+                                                     train_data['end_index'],
+                                                     train_data['entity_text'], train_data['text']):
+        punishNo_in_text.add(textId)
+        text = list(text)[0]
+        l = len(text)
+        if not articles_label[textId]:
+            articles_label[textId] = ['O' for _ in range(l)]
+        articles_label[textId][begin] = 'PN_B'
+        articles_label[textId][end - 1] = 'PN_E'
+        for i in range(begin + 1, end - 1):
+            articles_label[textId][i] = 'PN_M'
+    punishNo_in_text = list(punishNo_in_text)
+
+    # take negative samples that contain digit sequences
+    data = data.dropna(subset=['text'])
+    re_rule1 = re.compile('\[|\]')
+    data['sentences'] = [re_rule1.sub('', sentences).split(',') for sentences in data['sentences']]
+    data['sentences'] = [[int(s) for s in sentences] for sentences in data['sentences']]
+    re_rule2 = re.compile("[\d,.]{4,}")
+    for id, article, sentences in zip(data['document_id'], data['text'], data['sentences']):
+        if id < 2826 or id in punishNo_in_text:
+            # print(id)
+            article = str(article)
+            l = len(article)
+            text_word = list(article)
+            text_word_index = [getIndexOfWord(word) for word in text_word]
+            sentence_count = len(sentences)
+            if articles_label[id]:
+                label_list = articles_label[id]
+            else:
+                label_list = ['O' for _ in range(l)]
+            for i in range(sentence_count - 1):
+                if re_rule2.search(article[sentences[i]:sentences[i + 1]]):
+                    data_x.append(np.array(text_word_index[sentences[i]:sentences[i + 1]]))
+                    data_y.append(np.array(label_list[sentences[i]:sentences[i + 1]]))
+
+    data_x = np.array(data_x)
+    x_len = [250 if len(x)>250 else len(x) for x in data_x]
+    data_x = pad_sequences(data_x, maxlen=250, padding="post", truncating="post")
+    # train_x = train_x.reshape(-1)
+    data_y = [np.array([punishNo[_y] for _y in y]) for y in data_y]
+    # data_y = np.array(data_y).reshape(-1)
+    data_y = np.array(data_y)
+    data_y = pad_sequences(data_y, maxlen=250, padding="post", truncating="post")
+    # print(data_x[:5])
+    # print(data_y[:5])
+    # data_x = np.array(list(data_x))
+    # data_y = np.array(list(data_y))
+    indices = np.random.permutation(data_x.shape[0])
+    count = len(data_x)
+    test_count = int(0.2 * count)
+    test_idx, train_idx = indices[:test_count], indices[test_count:]
+    # print(test_idx)
+    train_x, test_x = data_x[train_idx, :], data_x[test_idx, :]
+    train_y, test_y = data_y[train_idx, :], data_y[test_idx, :]
+    train_x_len = np.array([x_len[idx] for idx in train_idx])
+    test_x_len = np.array([x_len[idx] for idx in test_idx])
+
+    with tf.Session(graph=tf.Graph()) as sess:
+        char_input,logits,target,length,crf_loss,trans,train_op = BiLSTM_CRF_tfmodel(sess,w2v_matrix)
+        sess.run(tf.global_variables_initializer())
+        saver = tf.train.Saver()
+        epochs = 60
+        batch_size = 300
+        _test_loss = 10000.
+        for epoch in range(epochs):
+            for x_batch,y_batch,x_len_batch in batch_iter(train_x,train_y,train_x_len,batch_size=batch_size):
+                # for x,y,l in zip(x_batch,y_batch,x_len_batch):
+                    # print(l,'=>',x)
+                    # print(y)
+                train_loss,_ = sess.run([crf_loss,train_op],feed_dict={char_input:x_batch,target:y_batch,length:x_len_batch,})
+            test_loss,_logits,_trans = sess.run([crf_loss,logits,trans],feed_dict={char_input:test_x,target:test_y,length:test_x_len})
+            acc = getAcc(test_y,_logits,_trans,test_x_len)
+            print("==>epoch:"+str(epoch))
+            print("--test --"," acc:",acc,'test_loss:',test_loss)
+            print("--train--","loss:",train_loss,"have_done")
+            if test_loss<_test_loss:
+                _test_loss = test_loss
+                print("Saving-"+str(epoch)+"-model,test_loss:"+str(test_loss))
+                saver.save(sess,"models/"+str(epoch)+"-"+str(acc)+"-"+str(test_loss)+"/model.ckpt")
+
+def batch_iter(x, y,x_len, batch_size=256):
+    '''
+    :param x: content2id
+    :param y: label2id
+    :param batch_size: number of sentences fed to the model per batch
+    :return:
+    '''
+    data_len = len(x)
+    num_batch = int((data_len - 1) / batch_size) + 1  # number of batches per epoch
+
+    # indices = np.random.permutation(data_len)  # shuffle the indices
+    # x_shuffle = x[indices]
+    # y_shuffle = y[indices]
+    # x_len_shuffle = x_len[indices]
+    for i in range(num_batch):
+        start_id = batch_size * i
+        end_id = min(batch_size*(i+1), data_len)
+        yield x[start_id:end_id], y[start_id:end_id],x_len[start_id:end_id]
+from sklearn.metrics import accuracy_score
+def getAcc(y_batch,logits,trans,lengths):
+    index = 0
+    small = -1000.0
+    start = np.asarray([[small] * 4 + [0]])
+
+    preds = []
+    true_tags = []
+    for score, length in zip(logits, lengths):
+        score = score[:length]
+        # pad = small * np.ones([length, 1])
+        # logit = np.concatenate([score, pad], axis=1)
+        # logit = np.concatenate([start, logit], axis=0)
+        # path, _ = tf.contrib.crf.viterbi_decode(logit, trans)
+        path, _ = tf.contrib.crf.viterbi_decode(score, trans)
+        preds += path[0:]
+        # preds += path[1:]
+        index += 1
+
+    for y, length in zip(y_batch, lengths):
+        y = y.tolist()
+        true_tags += y[: length]
+    acc = accuracy_score(np.reshape(true_tags,(-1)), np.reshape(preds,(-1)))
+    return acc
+
+def predict(articles,model_file):
+
+    vocab_model = getModel_word()
+    vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
+    model_file = model_file
+    sess = tf.Session(graph=tf.Graph())
+    with sess:
+        char_input, logits, target, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
+        sess.run(tf.global_variables_initializer())
+        saver = tf.train.Saver()
+        saver.restore(sess, model_file)
+        re_ner = re.compile("12+?3")
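+        # decoded tag ids are joined into a digit string (1=PN_B, 2=PN_M, 3=PN_E), so this matches every B M+ E span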
+        article_ner_list = []
+        count = 0
+        for sentences in articles:
+            count += 1
+            print(count)
+            sentence_len = [ len(sentence) for sentence in sentences]
+            maxlen = max(sentence_len)
+            sentences_x = []
+            for sentence in sentences:
+                sentence = list(sentence)
+                sentence2id = [getIndexOfWord(word) for word in sentence]
+                sentences_x.append(sentence2id)
+            sentences_x = pad_sequences(sentences_x,maxlen=maxlen,padding="post", truncating="post")
+            sentences_x = [np.array(x) for x in sentences_x]
+
+            _logits,_trans = sess.run([logits,trans],feed_dict={char_input:np.array(sentences_x),length:sentence_len})
+
+            viterbi_sequence = decode(logits=_logits,trans=_trans,sequence_lengths=sentence_len,tag_num=4)
+
+            ner_list = []
+            for _seq,sentence in zip(viterbi_sequence,sentences):
+                seq_id = ''.join([str(s) for s in _seq])
+                if re_ner.search(seq_id):
+                    # print("sentence: ",sentence)
+                    for _ner in re_ner.finditer(seq_id):
+                        start = _ner.start()
+                        end = _ner.end()
+                        n = sentence[start:end]
+                        # print(n,'<==>',start,end)
+                        ner_list.append((n,start,end))
+            article_ner_list.append(ner_list)
+    return article_ner_list
+
+def decode(logits, trans, sequence_lengths, tag_num):
+    viterbi_sequences = []
+    for logit, length in zip(logits, sequence_lengths):
+        score = logit[:length]
+        viterbi_seq, viterbi_score = viterbi_decode(score, trans)
+        viterbi_sequences.append(viterbi_seq)
+    return viterbi_sequences
+
+def test2():
+    punishNo = {
+        'O': 0,
+        'PN_B': 1,
+        'PN_M': 2,
+        'PN_E': 3
+    }
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\db_alldata.csv", index_col=0)
+
+    train_data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishment_code_new.csv", index_col=0)
+    punishNo_in_text = set()
+    for textId in train_data['document_id']:
+        punishNo_in_text.add(textId)
+    for _ in range(1,2821):
+        punishNo_in_text.add(_)
+    punishNo_in_text = list(punishNo_in_text)
+    data = data[data['document_id'].isin(punishNo_in_text)]
+    data = data.dropna(subset=['text'])
+    re_rule1 = re.compile('\[|\]')
+    data['sentences'] = [re_rule1.sub('', sentences).split(',') for sentences in data['sentences']]
+    data['sentences'] = [[int(s) for s in sentences] for sentences in data['sentences']]
+    article_sentences = []
+    for id,text,sentences in zip(data['document_id'],data['text'],data['sentences']):
+        # if id in punishNo_in_text:
+        sentences_count = len(sentences)
+        sentence_list = []
+        for i in range(sentences_count-1):
+            sentence = text[sentences[i]:sentences[i+1]]
+            sentence_list.append(sentence)
+        article_sentences.append(sentence_list)
+    model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"
+    punishNo_ner = predict(article_sentences,model_file)
+    data['punishNo_test'] = punishNo_ner
+    punishNo_label = [[] for _ in range(13500)]
+    for textId, begin, end, entity_text in zip(train_data['document_id'], train_data['begin_index'],
+                                                train_data['end_index'],train_data['entity_text']):
+        punishNo_label[textId].append((entity_text,begin,end))
+    punishNo_right = []
+    for id in data['document_id']:
+        punishNo_right.append(punishNo_label[id])
+    data['punishNo_right'] = punishNo_right
+    test_res = []
+    for test,label_list in zip(data['punishNo_test'],data['punishNo_right']):
+        if set(test)==set(label_list):
+            test_res.append(1)
+        else:
+            test_res.append(0)
+    data['test_res'] = test_res
+    data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishNo_test.xlsx",encoding='utf-8')
+
+
+
+
+def test():
+    data = pd.read_csv("data/ALLDATA.csv", index_col=0)[500:600]
+    model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"
+    # data = data[35000:45000]
+    sentences_list = []
+    for sentences in data['sentences']:
+        sentences = sentences.split("*#*>")
+        sentences_list.append(sentences)
+    print(len(sentences_list))
+    pn_ner = predict(sentences_list,model_file)
+    print('*'*20)
+    print(len(pn_ner),pn_ner)
+    data['ner_test'] = pn_ner
+    print(data.head(3))
+    # data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-3.xlsx",encoding='utf-8')
+
+if __name__ == '__main__':
+    # train()
+    # test()
+    model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"
+    sentences_list = '行政处罚厦建招诉决【2019】34号。行政处罚厦建招诉决【2019】34号。行政处罚厦建招诉决【2019】34号。行政处罚厦建招诉决【2019】34号,'.split('。')
+    pn_ner = predict([sentences_list], model_file)
+    print(pn_ner)
+    # test2()
+    # data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv",index_col=0)
+    # sentences = data['sentences'][51313]
+    # sentences = sentences.split("*#*>")
+    # model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"
+    # predict(sentences,model_file)
+    pass

+ 488 - 0
BiddingKG/dl/complaint/punish_rule.py

@@ -0,0 +1,488 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2020/12/24 0024 15:23
+import re
+import time
+import numpy as np
+import tensorflow as tf
+from BiddingKG.dl.common.Utils import *
+from tensorflow.contrib.crf import crf_log_likelihood, viterbi_decode
+from tensorflow.contrib.layers.python.layers import initializers
+from keras.preprocessing.sequence import pad_sequences
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+from BiddingKG.dl.interface.Preprocessing import *
+
+def BiLSTM_CRF_tfmodel(sess,weights):
+    BiRNN_Units = 140
+    chunk_tags = {
+        'O': 0,
+        'PN_B': 1,
+        'PN_M': 2,
+        'PN_E': 3
+    }
+
+    def embedding_layer(input):
+        embedding = tf.get_variable("embedding",initializer=np.array(weights,dtype=np.float32) if weights is not None else None,dtype=tf.float32)
+        return tf.nn.embedding_lookup(params=embedding,ids=input)
+
+    def BiLSTM_Layer(input,length):
+        with tf.variable_scope("BiLSTM"):
+            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
+            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
+        output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell,backward_cell,input,dtype=tf.float32,sequence_length=length)
+        output = tf.concat(output,2)
+        return output
+
+    def CRF_layer(input,num_tags,BiRNN_Units,time_step):
+        with tf.variable_scope("CRF"):
+            with tf.variable_scope("hidden"):
+                w_hidden = tf.get_variable(name='w_hidden',shape=(BiRNN_Units,BiRNN_Units//2),dtype=tf.float32,
+                                           initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
+                b_hidden = tf.get_variable(name='b_hidden',shape=(BiRNN_Units//2),dtype=tf.float32,initializer=tf.zeros_initializer())
+                # print(input)
+                input_reshape = tf.reshape(input,shape=(-1,BiRNN_Units))
+                hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape,w_hidden,b_hidden))
+            with tf.variable_scope("output"):
+                w_output = tf.get_variable(name='w_output',shape=(BiRNN_Units//2,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
+                b_output = tf.get_variable(name='b_output',shape=(num_tags),dtype=tf.float32,initializer=tf.zeros_initializer())
+                pred = tf.nn.xw_plus_b(hidden,w_output,b_output)
+                logits_ = tf.reshape(pred,shape=(-1,time_step,num_tags),name='logits')
+        return logits_
+
+    def layer_loss(input,true_target,num_tags,length):
+        with tf.variable_scope("crf_loss"):
+            trans = tf.get_variable(name='transitons',shape=(num_tags,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer())
+            log_likelihood,trans = crf_log_likelihood(inputs=input,tag_indices=true_target,transition_params=trans,sequence_lengths=length)
+            return tf.reduce_mean(-log_likelihood),trans
+
+    with sess.graph.as_default():
+        char_input = tf.placeholder(name='char_input',shape=(None,None),dtype=tf.int32)
+        target = tf.placeholder(name='target',shape=(None,None),dtype=tf.int32)
+        length = tf.placeholder(name='length',shape=(None,),dtype=tf.int32)
+        # keepprob = tf.placeholder(name='keepprob',dtype=tf.float32)
+
+        _embedding = embedding_layer(char_input)
+        _shape = tf.shape(char_input)
+        batch_size = _shape[0]
+        step_size = _shape[-1]
+        bilstm = BiLSTM_Layer(_embedding,length)
+        _logits = CRF_layer(bilstm,num_tags=len(chunk_tags),BiRNN_Units=BiRNN_Units,time_step=step_size)
+        crf_loss,trans = layer_loss(_logits,true_target=target,num_tags=len(chunk_tags),length=length)
+        global_step = tf.Variable(0,trainable=False)
+        with tf.variable_scope("optimizer"):
+            opt = tf.train.AdamOptimizer(0.002)
+            grads_vars = opt.compute_gradients(crf_loss)
+            capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]
+            train_op = opt.apply_gradients(capped_grads_vars,global_step)
+            return char_input,_logits,target,length,crf_loss,trans,train_op
+
+def decode(logits, trans, sequence_lengths, tag_num):
+    viterbi_sequences = []
+    for logit, length in zip(logits, sequence_lengths):
+        score = logit[:length]
+        viterbi_seq, viterbi_score = viterbi_decode(score, trans)
+        viterbi_sequences.append(viterbi_seq)
+    return viterbi_sequences
+
+class Punish_Extract():
+    def __init__(self, model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"):
+        self.sess = tf.Session(graph=tf.Graph())
+        self.code = ""
+        self.punish_dicition = ""
+        self.model_file = model_file  # punishment-number prediction model
+        self.load_model()
+
+    # load the punishment-number prediction model
+    def load_model(self):
+        with self.sess.as_default() as sess:
+            with sess.graph.as_default():
+                vocab_model = getModel_word()
+                vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
+                self.char_input, self.logits, self.target, self.length, self.crf_loss, self.trans, self.train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
+                sess.run(tf.global_variables_initializer())
+                saver = tf.train.Saver()
+                saver.restore(sess, self.model_file)
+
+    # predict punishment numbers
+    def predict_punishCode(self,list_sentences):
+        re_ner = re.compile("12+?3")
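+        # decoded tag ids are joined into a digit string (1=PN_B, 2=PN_M, 3=PN_E), so this matches every B M+ E span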
+        article_ner_list = []
+        count = 0
+        with self.sess.as_default():
+            with self.sess.graph.as_default():
+                for sentences in list_sentences:
+                    count += 1
+                    # print(count)
+                    sentence_len = [len(sentence.sentence_text) for sentence in sentences]
+                    maxlen = max(sentence_len)
+                    sentences_x = []
+                    for sentence in sentences:
+                        sentence = sentence.sentence_text
+                        sentence = list(sentence)
+                        sentence2id = [getIndexOfWord(word) for word in sentence]
+                        sentences_x.append(sentence2id)
+                    sentences_x = pad_sequences(sentences_x, maxlen=maxlen, padding="post", truncating="post")
+                    sentences_x = [np.array(x) for x in sentences_x]
+                    _logits, _trans = self.sess.run([self.logits, self.trans],
+                                               feed_dict={self.char_input: np.array(sentences_x), self.length: sentence_len})
+                    viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=4)
+
+                    ner_list = []
+                    for _seq, sentence in zip(viterbi_sequence, sentences):
+                        sentence = sentence.sentence_text
+                        seq_id = ''.join([str(s) for s in _seq])
+                        if re_ner.search(seq_id):
+                            # print("sentence: ",sentence)
+                            for _ner in re_ner.finditer(seq_id):
+                                start = _ner.start()
+                                end = _ner.end()
+                                n = sentence[start:end]
+                                # print(n,'<==>',start,end)
+                                # ner_list.append((n, start, end))
+                                ner_list.append(n)  # changed to return only the entity text
+                    # article_ner_list.append(ner_list)
+                    article_ner_list.append(';'.join(set(ner_list)))
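+        # note: only the first document's result (entities deduplicated and joined with ';') is returned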
+        return article_ner_list[0]
+
+    # punishment type
+    def get_punishType(self, x1, x2):
+        '''Classify the announcement by its title and content.
+        x1: title
+        x2: content
+        return: (matched keyword, category)'''
+        # x1 = x1.replace('(','(').replace(')', ')').replace(' ','')
+        # x2 = x2.replace('(', '(').replace(')', ')').replace(' ', '')
+        '''title regexes'''
+        # unknown announcement
+        unknow = re.compile('采购方式|采购公告|磋商公告|谈判公告|交易公告$|征集|征求|招标公告|竞标公告|中标公告|'
+                            '成交公告|成交信息|流标公告|废标公告|城市管理考评|决算表|决算|预算|资格考试|招聘|选聘'
+                            '|聘请|拟录用|无违规违法|无此项信息|暂无工程投标违法|管理办法|指导意见|无投诉|投诉办法'
+                            '公共资源交易情况|绩效评价|考试成绩|付息公告|不动产|办证|印发|转发')  #|结果公示 部分是
+        # complaint handling
+        tscl = re.compile('投诉不予[处受]理|投诉不成立|终止投诉|投诉终止|不予受理|投诉事?项?的?处理')
+        # administrative penalty
+        xzcf = re.compile('行政处罚|行政处理|政处罚|行政裁决|防罚|公罚|医罚|环罚|政罚|文罚|局罚|旅罚|财罚|运罚')
+        # supervision and inspection
+        jdjc = re.compile('(监督检查的?问?题?(处理|整改|记分|结果|决定|处罚))|监督处罚|调查处理|监督处理')
+        # serious violation
+        yzwf = re.compile('严重违法失信|黑名单|失信名单')
+        # misconduct
+        blxw = re.compile('((不良|失信|不诚信|差错|不规范|违规|违约|处罚|违法)(行为|记录|信息))|((违约|违规|违法)(处理|操作|情况|问题))'
+                          '|通报批评|记分管理|迟到|早退|缺席|虚假材料|弄虚作假|履职不到位|诚信考核扣分|串通投标'
+                          '|审核不通过|码一致|地址一致|扣分处理|扣分通知|扣[0-9]+分|责令整改|信用信息认定书$'
+                          '|关于.{,30}的处罚|关于.{,10}的?考评通报|关于.{,30}扣分情况|不规范代理行为'
+                          '|(取消|暂停|限制).{,50}((专家|评标|评委|投标|竞价|被抽取|中标|供应商|候选人)资格)'
+                          '|(代理服?务?机构).{,10}(扣分)|(专家).{,30}(扣分|记分|处罚)|对.{,30}处理|冻结.{,30}账号')
+        # other misconduct
+        other = re.compile('质疑|代理机构进场交易情况|网上投诉办理|信用奖惩|信用奖罚|进场工作.{,5}考核'
+                           '|举报处理|结果无效|成交无效|行政复议')
+
+        '''content regexes'''
+        # complaint handling
+        tscl_c = re.compile('(投诉(人|单位)[1-9]?(名称)?[::])|(投诉事项[1-5一二三四五、]*部?分?(成立|予以受理))'
+                            '|((驳回|撤回|撤销|终止)[^,。]{,60}(投诉|质疑))')
+        # administrative penalty
+        xzcf_c = re.compile('((处理依据及结果|处理结果|处罚结果)).*行政处罚|如下行政处罚|行政处罚决定')
+        # integrity bonus points
+        cxjf_c = re.compile('处罚结果.*诚信加分')
+        # serious violation / dishonesty
+        yzwf_c = re.compile('工商部门严重违法失信起名单|严重违法失信的具体情形') #|严重违法失信的具体情形
+        # misconduct
+        blxw_c = re.compile('(取消|暂停|限制).{,30}((专家|评标|评委|投标|采购|竞价|被抽取|中标|供应商)的?资格)'
+                            '|(处罚结果|处罚情况).*(扣[1-9]*分|记分|不良行为|不良记录|不良信用|不诚信|扣除信用'
+                            '|诚信档案|信用信息|取消.*资格|口头警告|处罚机关|责令改正|罚款|限制投标|暂扣|禁止'
+                            '|暂停|封禁|暂无|行政处罚)|处罚结果'
+                            '|处罚主题|禁止参与.{,10}政府采购活动|列入不良行为|处罚如下|如下处罚|违规处罚|处罚违规'
+                            '|责令改正|责令整改|处罚依据|进行以下处理|处理依据及结果|处理结果|处罚决定书|'
+                            '(不规范|不良|不诚信)行为记录')
+        # other misconduct
+        other_c = re.compile('质疑(人|单位)[1-9]?(名称)?:|公告期内受质疑')
+
+        if re.search(unknow, x1):
+            return re.search(unknow, x1).group(0), '未知类别'
+        elif re.search(yzwf, x1):
+            return re.search(yzwf, x1).group(0), '严重违法'
+        elif re.search(yzwf_c, x2):
+            return re.search(yzwf_c, x2).group(0), '严重违法'
+
+        elif re.search(tscl, x1):
+            return re.search(tscl, x1).group(0), '投诉处理'
+        elif re.search(xzcf, x1):
+            return re.search(xzcf, x1).group(0), '行政处罚'
+        elif re.search(jdjc, x1):
+            return re.search(jdjc, x1).group(0), '监督检查'
+        elif re.search(blxw, x1):
+            return re.search(blxw, x1).group(0), '不良行为'
+        elif re.search(other, x1):
+            return re.search(other, x1).group(0), '其他不良行为'
+
+        elif re.search(tscl_c, x2):
+            return re.search(tscl_c, x2).group(0), '投诉处理'
+        elif re.search(xzcf_c, x2):
+            return re.search(xzcf_c, x2).group(0), '行政处罚'
+        elif re.search(cxjf_c, x2):
+            return re.search(cxjf_c, x2).group(0), '诚信加分'
+
+        elif re.search(blxw_c, x2):
+            return re.search(blxw_c, x2).group(0), '不良行为'
+        elif re.search(other_c, x2):
+            return re.search(other_c, x2).group(0), '其他不良行为'
+
+        return ' ', '未知类别'
+
+    # punishment decision
+    def get_punishDecision(self, x, x2):
+        '''Extract the handling decision from the announcement content via regex.
+        x: content
+        x2: punishment category
+        return: handling-decision string'''
+        rule1 = re.compile(
+            '(((如下|以下|处理|研究|本机关|我机关|本局|我局)决定)|((决定|处理|处理意见|行政处罚|处罚)(如下|如下))'
+            '|((以下|如下)(决定|处理|处理意见|行政处罚|处罚))|处理依据及结果|处理结果|处罚结果|处罚情况|限制行为'
+            '|整改意见)[::].{5,}')
+        rule2 = re.compile(
+            '(((如下|以下|处理|研究|本机关|我机关|本局|我局)决定)|((决定|处理|处罚|处理意见)(如下|如下))'
+            '|((以下|如下)(决定|处理|处理意见|处罚))|处理依据及结果|处理结果|处罚结果|处罚情况|限制行为'
+            '|处罚内容)[:,,].{10,}')
+        rule3 = re.compile('考评结果:?.*')
+        rule4 = re.compile('(依据|根据)《.*》.*')
+        if x2 == '未知类别':
+            return ' '
+        elif re.search(rule1, x[-int(len(x)*0.4):]):
+            return re.search(rule1, x[-int(len(x)*0.4):]).group(0)
+        elif re.search(rule1, x[-int(len(x)*0.6):]):
+            return re.search(rule1, x[-int(len(x)*0.6):]).group(0)
+        elif re.search(rule2, x[-int(len(x)*0.7):]):
+            return re.search(rule2, x[-int(len(x)*0.7):]).group(0)
+        elif re.search(rule3, x[-int(len(x)*0.6):]):
+            return re.search(rule3, x[-int(len(x)*0.6):]).group(0)
+        elif re.search(rule4, x[-int(len(x)*0.4):]):
+            return re.search(rule4, x[-int(len(x)*0.4):]).group(0)
+        else:
+            return ' '
+
+    # whether the complaint is upheld
+    def get_punishWhether(self, x1, x2, x3):
+        '''Decide via regex over the handling decision whether the complaint is upheld.
+        x1: handling-decision string
+        x2: content
+        x3: punishment category
+        return: whether the complaint is upheld'''
+        p1 = re.compile('(投诉|投拆|质疑|举报)(事项|内容|事实)?[^不,。]{,10}(成立|属实|予以受理|予以支持)|责令|废标|(中标|成交)[^,。]{,10}无效'
+                        '|取消[^,。]{,60}资格|罚款|重新(组织|开展)?(招标|采购)|投诉成立|被投诉人存在违法违规行为'
+                        '|采购活动违法|(中标|评标|成交)结果无效')
+        p2 = re.compile('投诉不予[处受]理|((投诉|投拆|质疑|举报)(事项|内容|事实)?[^,。]{,10}(不成立|情?况?不属实|不予支持|缺乏事实依据))'
+                        '|((驳回|撤回|撤销|终止)[^,。]*(投诉|质疑|诉求))|终止[^,。]{,20}(行政裁决|投诉处理|采购活动)|投诉终止|投诉无效'
+                        '|予以驳回|不予受理|继续开展采购|被投诉人不存在违法违规行为|中标结果有效|投诉[^,。]{,10}不成立'
+                        '|维持被投诉人|不支持[^,。]{,20}投诉|无确凿证据')
+        if x3 != '投诉处理':
+            return ' '
+        elif re.search(p1, x1):
+            return '投诉成立'
+        elif re.search(p2, x1):
+            return '投诉无效'
+        elif re.search(p1, x2):
+            return '投诉成立'
+        elif re.search(p2, x2):
+            return '投诉无效'
+        return ' '
+
+    # enforcement agency and punishment time
+    def get_institution(self, title, sentences_l, entity_l):
+        '''
+        Decide whether an entity is an enforcement agency from the text preceding it.
+        :param title: announcement title
+        :param sentences_l: sentence list of a single announcement
+        :param entity_l: entity list of a single announcement
+        :return: enforcement-agency and punishment-time strings, multiple values joined by ";"
+        '''
+        institutions = []
+        punishTimes = []
+        institution_1 = re.compile("(?:处罚执行部门|认定部门|执法机关名称|执法单位|通报部门|处罚机关|处罚部门)[::]")
+        punishTimes_1 = re.compile("(?:处罚日期|限制行为开始时间|曝光开始日期|处罚决定日期|处罚期限|处罚时间|处理日期|公告开始时间)[::]")
+        # use keywords preceding an entity to decide whether it is an enforcement agency or a punishment time
+        for ner in entity_l:
+            if ner.entity_type == 'org':
+                left = sentences_l[ner.sentence_index].sentence_text[
+                       max(0, ner.wordOffset_begin - 15):ner.wordOffset_begin]
+                if institution_1.search(left):
+                    institutions.append(ner)
+                elif institutions != [] and ner.sentence_index == institutions[-1].sentence_index and \
+                        ner.wordOffset_begin - institutions[-1].wordOffset_end < 2 and \
+                        sentences_l[ner.sentence_index].sentence_text[
+                        ner.wordOffset_begin:institutions[-1].wordOffset_end] \
+                        in ['', '、', '和', '及']:
+                    institutions.append(ner)
+            elif ner.entity_type == 'time':
+                left = sentences_l[ner.sentence_index].sentence_text[
+                       max(0, ner.wordOffset_begin - 15):ner.wordOffset_begin]
+                if punishTimes_1.search(left):
+                    punishTimes.append(ner)
+
+        institution_title = re.compile("财政局|财政厅|监督管理局|公管局|公共资源局|委员会")
+        institution_time = re.compile(
+            "(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
+        ins = ""
+        ptime = ""
+        # if no enforcement agency was found above, look for entities in the title and regex-check them for keywords
+        if institutions == []:
+            title_ners = getNers([title], useselffool=True)
+            if title_ners[0]:
+                for title_ner in title_ners[0]:
+                    if title_ner[2] == 'org' and institution_title.search(title_ner[3]):
+                        ins = title_ner[3]
+                        break
+        if punishTimes == [] or institutions == []:
+            # if the elements are still missing, check whether a date follows an org/company entity; if so, take them as agency and punishment time
+            for ner in [ner for ner in entity_l if ner.entity_type == 'org'][-5:][::-1]:
+                right = sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_end:ner.wordOffset_end + 16]
+                if institution_time.search(right):
+                    if ins == '':
+                        ins = ner.entity_text
+                    if ptime == '':
+                        ptime = institution_time.search(right).group(1)
+                    break
+            # if still nothing was found, take the last time entity as the punishment time when it sits at the end of the article
+            if ptime == '':
+                n_time = [ner for ner in entity_l if ner.entity_type == 'time']
+                if len(n_time) != 0:
+                    ner = n_time[-1]
+                    if ner.sentence_index == len(sentences_l) - 1:
+                        textLong = len(sentences_l[ner.sentence_index].sentence_text)
+                        if ner.wordOffset_end > textLong - 3 and len(ner.entity_text) > 3:
+                            ptime = ner.entity_text
+        institutions = [ner.entity_text for ner in institutions]
+        punishTimes = [ner.entity_text for ner in punishTimes]
+        if institutions == [] and ins != "":
+            institutions.append(ins)
+        if punishTimes == [] and ptime != "":
+            punishTimes.append(ptime)
+        return ";".join(institutions), ";".join(punishTimes)
+
+    # complainant, respondent, punished party
+    def get_complainant(self, punishType, sentences_l, entity_l):
+        '''
+        Find the complainant and the respondent/punished party via regexes over the category, sentence list and entity list.
+        :param punishType: announcement punishment category
+        :param sentences_l: sentence list of a single announcement
+        :param entity_l: entity list of a single announcement
+        :return: complainant, respondent
+        '''
+        complainants = []  # complainant entities
+        punishPeople = []  # respondent / punished-party entities
+        size = 16
+        # complainant / challenger
+        complainants_rule1 = re.compile(
+            "(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+        # punished party, respondent
+        punishPeople_rule1 = re.compile(
+            "(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+        punishPeople_rule2_1 = re.compile(",$")
+        punishPeople_rule2_2 = re.compile("^[::]")
+        punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
+        punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
+
+        punish_l = []  # punishment entity list
+        tmp = []
+        for ner in [ner for ner in entity_l if ner.entity_type in ['org', 'company', 'person']]:
+            if tmp == []:
+                tmp.append(ner)
+            elif ner.entity_type == tmp[-1].entity_type and ner.sentence_index == tmp[-1].sentence_index and \
+                    ner.wordOffset_begin - tmp[-1].wordOffset_end < 2 \
+                    and sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_begin:tmp[-1].wordOffset_end] in [
+                '',
+                '、',
+                '和',
+                '及']:
+                tmp.append(ner)
+            elif ner.entity_type in ['org', 'company'] and tmp[-1].entity_type in ['org', 'company'] and \
+                    ner.sentence_index == tmp[-1].sentence_index and ner.wordOffset_begin - tmp[-1].wordOffset_end < 2 \
+                    and sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_begin:tmp[-1].wordOffset_end] in [
+                '',
+                '、',
+                '和',
+                '及']:
+                tmp.append(ner)
+            else:
+                punish_l.append(tmp)
+                tmp = [ner]
+        if tmp:
+            punish_l.append(tmp)  # keep the last entity group as well
+        for ner_l in punish_l:
+            begin_index = ner_l[0].wordOffset_begin
+            end_index = ner_l[-1].wordOffset_end
+            left = sentences_l[ner_l[0].sentence_index].sentence_text[max(0, begin_index - size):begin_index]
+            right = sentences_l[ner_l[0].sentence_index].sentence_text[end_index:end_index + size]
+            if complainants_rule1.search(left):
+                complainants.append(ner_l)
+            elif punishPeople_rule1.search(left):
+                punishPeople.append(ner_l)
+            elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
+                if punishType == '投诉处理':
+                    complainants.append(ner_l)
+                else:
+                    punishPeople.append(ner_l)
+            elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
+                punishPeople.append(ner_l)
+        complainants = set([it.entity_text for l in complainants for it in l])
+        punishPeople = set([it.entity_text for l in punishPeople for it in l])
+        return ';'.join(complainants), ';'.join(punishPeople)
+
+def get_punish_extracts(doc_id=' ', title=' ', text=' '):
+    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", ""]],
+                                                                                    useselffool=True)
+    punish_code = punish.predict_punishCode(list_sentences)
+    # print('处罚编号: ',punish_code)
+    institutions, punishTimes = punish.get_institution(title, list_sentences[0], list_entitys[0])
+    # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
+    keyword, punishType = punish.get_punishType(title, text)
+    # print('处罚类型:',punishType)
+    punishDecision = punish.get_punishDecision(text, punishType)
+    # print('处罚决定:',punishDecision)
+    punishWhether= punish.get_punishWhether(punishDecision, text, punishType)
+    # print('投诉是否成立:',punishWhether)
+    complainants, punishPeople = punish.get_complainant(punishType, list_sentences[0], list_entitys[0])
+    # print('投诉人:%s  被投诉人:%s'%(complainants, punishPeople))
+    return punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether,institutions, punishTimes
+
+if __name__ == "__main__":
+    punish = Punish_Extract(model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt")
+
+    import pandas as pd
+    # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
+    df = pd.read_excel('G:/失信数据/ALLDATA_re2-3.xlsx', index=0)[2:10]
+    # i = 89
+    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+    # i = 92
+    # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
+
+    # t1 = time.time()
+    # for i in df.index:
+    #     punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+    #         get_punish_extracts(i, df.loc[i, 'PAGE_TITLE'], df.loc[i, 'PAGE_CONTENT'])
+    #     df.loc[i, '投诉人'] = complainants
+    #     df.loc[i, '被投诉人'] = punishPeople
+    #     df.loc[i, '执法机构'] = institutions
+    #     df.loc[i, '处罚时间'] = punishTimes
+    #     df.loc[i, '处罚编号'] = punish_code
+    #     print('完成第%d篇'%i)
+    # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=[['PAGE_TITLE', 'PAGE_CONTENT',
+    # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', 'punishPeople',
+    # #    'institution', 'punishTime', 'ner_test']])
+    # t2 = time.time()
+    # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
+    # #     '关键词', '类别', '处理决定', '投诉是否成立',
+    # #    'DETAILLINK', 'sentences', 'PAGE_TIME', 'complainant', '投诉人', 'punishPeople', '被投诉人',
+    # #    'institution', '执法机构', 'punishTime', '处罚时间', 'ner_test', '处罚编号'])
+    # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
+    #     '关键词', '类别', '处理决定', '投诉是否成立', '投诉人', '被投诉人','执法机构', '处罚时间', '处罚编号',
+    #    'DETAILLINK', 'sentences', 'PAGE_TIME'])
+    # t3 = time.time()
+    # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
+    s = '厦财企〔2020〕12号,各有关单位:341号。厦财企〔2020〕12号,各有关单位:行政处罚厦建招诉决【2019】342号。行政处罚厦建招诉决【2019】343号。行政处罚厦建招诉决【2019】344号,'
+    # list_sentences = [s.split('。')]
+    # punish_code= punish.predict_punishCode( list_sentences)
+    # print(punish_code)
+
+    punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
+                get_punish_extracts(text=s)
+    print(punish_code)

+ 831 - 0
BiddingKG/dl/complaint/test1.py

@@ -0,0 +1,831 @@
+import sys
+import os
+sys.path.append(os.path.abspath("../.."))
+import pandas as pd
+import re
+import time
+from BiddingKG.dl.common.Utils import *
+from BiddingKG.dl.interface.Entitys import *
+from BiddingKG.dl.interface.predictor import *
+from BiddingKG.dl.foolnltk import selffool
+from BiddingKG.dl.interface.Preprocessing import *
+
+def get_data1():
+    load1 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_TOU_SU_CHU_LI.csv")
+    load2 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_WEI_FA_JI_LU.csv")
+    load3 = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\T_QI_TA_SHI_XIN.csv")
+    load = pd.concat([load1, load2, load3], axis=0)
+    load = load.reset_index(drop=True)
+    load['PAGE_CONTENT'] = get_article1(load['PAGE_CONTENT'])
+    sentences_list = get_sentences1(load['PAGE_CONTENT'])
+    load['sentences'] = ['*#*>'.join(_sentences) for _sentences in sentences_list ]
+    load.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv")
+
+def get_ners():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv",index_col=0)
+    # data = data.head(3)
+    nersList = []
+    for index,_sentences in zip(data.index,data['sentences']):
+        _sentences = _sentences.split('*#*>')
+        _ners = getNers(_sentences,useselffool=True)
+        word_index = 0
+        for ners,sentence in zip(_ners, _sentences):
+            if len(ners) != 0:
+                word_ner_list = ['O']*len(sentence)
+
+                for ner in ners:
+                    nerDict = dict()
+                    entity_type = ner[2]
+                    nerDict['entity_type'] = entity_type
+                    entity_text = ner[3]
+                    nerDict['entity_text'] = entity_text
+                    begin_index = ner[0]
+                    nerDict['begin_index'] = begin_index
+                    end_index = ner[1] - 1
+                    nerDict['end_index'] = end_index
+                    wordOffset_begin = word_index + begin_index
+                    nerDict['wordOffset_begin'] = wordOffset_begin
+                    wordOffset_end = wordOffset_begin + len(entity_text)
+                    nerDict['wordOffset_end'] = wordOffset_end
+                    nerDict['sentence'] = sentence
+                    nerDict['article_index'] = index
+                    # print('====')
+                    # print(begin_index,end_index,entity_type,entity_text)
+                    nersList.append(nerDict)
+                    # print(nerDict)
+                    word_ner_list[begin_index] = 'B'
+                    word_ner_list[begin_index+1:end_index] = ['I']*(end_index-begin_index-1)
+            word_index += len(sentence)
+    # save(nersList,"nersList.pk")
+
+# merge adjacent (org/company) and (person) entities
+def get_unionNers():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
+    ners = load("nersList.pk")
+    org_companys = [[] for _ in range(len(data))]
+    type1 = ['org', 'company', 'union_oc']
+    persons = [[] for _ in range(len(data))]
+    type2 = ['person', 'union_person']
+    for ner in ners:
+        if ner['entity_type'] in type1:
+            org_companys[ner['article_index']].append(ner)
+        if ner['entity_type'] in type2:
+            persons[ner['article_index']].append(ner)
+    # merge org and company entities
+    new_org_companys = []
+    for org_company in org_companys:
+        if org_company and len(org_company) > 1:
+            union_nums = 0
+            for i in range(len(org_company)-1):
+                if org_company[i]['end_index'] == org_company[i + 1]['begin_index'] - 1 and org_company[i]['sentence'][org_company[i]['end_index']] == '、' \
+                        and org_company[i]['sentence'] == org_company[i + 1]['sentence']:
+                    # print(1)
+                    org_company[i + 1]['begin_index'] = org_company[i]['begin_index']
+                    org_company[i + 1]['wordOffset_begin'] = org_company[i]['wordOffset_begin']
+                    org_company[i + 1]['entity_text'] = org_company[i]['entity_text'] + '+' + org_company[i+1]['entity_text']
+                    # print(org_company[i + 1]['entity_text'])
+                    org_company[i] = 0
+                    union_nums += 1
+                elif org_company[i]['end_index'] == org_company[i + 1]['begin_index'] and org_company[i]['sentence'] == org_company[i+1]['sentence']:
+                    # print(2)
+                    org_company[i + 1]['begin_index'] = org_company[i]['begin_index']
+                    org_company[i + 1]['wordOffset_begin'] = org_company[i]['wordOffset_begin']
+                    org_company[i + 1]['entity_text'] = org_company[i]['entity_text'] + '+' + org_company[i+1]['entity_text']
+                    # print(org_company[i + 1]['entity_text'])
+                    org_company[i] = 0
+                    union_nums += 1
+            for _ in range(union_nums):
+                org_company.remove(0)
+        new_org_companys.append(org_company)
+    # merge person entities
+    new_persons = []
+    for person in persons:
+        if person and len(person) > 1:
+            union_nums = 0
+            for i in range(len(person) - 1):
+                if person[i]['end_index'] == person[i + 1]['begin_index'] - 1 and person[i]['sentence'][person[i]['end_index']] == '、' \
+                        and person[i]['sentence'] == person[i + 1]['sentence']:
+                    # print(1)
+                    person[i + 1]['begin_index'] = person[i]['begin_index']
+                    person[i + 1]['wordOffset_begin'] = person[i]['wordOffset_begin']
+                    person[i + 1]['entity_text'] = person[i]['entity_text'] + '+' + person[i + 1]['entity_text']
+                    # print(person[i + 1]['entity_text'])
+                    person[i] = 0
+                    union_nums += 1
+                elif person[i]['end_index'] == person[i + 1]['begin_index'] and person[i]['sentence'] == person[i + 1]['sentence']:
+                    # print(2)
+                    person[i + 1]['begin_index'] = person[i]['begin_index']
+                    person[i + 1]['wordOffset_begin'] = person[i]['wordOffset_begin']
+                    person[i + 1]['entity_text'] = person[i]['entity_text'] + '+' + person[i + 1]['entity_text']
+                    # print(person[i + 1]['entity_text'])
+                    person[i] = 0
+                    union_nums += 1
+            for _ in range(union_nums):
+                person.remove(0)
+        new_persons.append(person)
+    # save([new_org_companys,new_persons],"unionNers.pk")
+
+def test02():
+    load = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv",index_col=0)
+
+    text_rule = re.compile("监管调查|通报|不诚信|监督检查|不良|投诉|质疑|处罚|违法|违规|不予[受处]理|处理")
+    title_rule = re.compile("中标公告|中标[(\(]成交[\))]公告|采购结果公[示告]|评审结果公告|[侯候]选人公[示告]|成交公[示告]"
+                            "|补贴公[示告]|废标公[示告]")
+    # need_index = []
+    # for index, title, text in zip(load.index, load['PAGE_TITLE'], load['PAGE_CONTENT']):
+    #     a = 0
+    #     if text_rule.search(text):
+    #         a = 1
+    #     if title_rule.search(title):
+    #         a = 0
+    #     if text_rule.search(title):
+    #         a = 1
+    #     if a:
+    #         need_index.append(index)
+    # print(len(need_index))
+    # load = load.loc[need_index]
+    # print(len(load))
+    # load = load.reset_index(drop=True)
+
+    complainants_rule1 = re.compile("[^被]投[诉拆][人方]之?[\d一二三四五六七八九十]?(?:(.+?))?[::]+?")
+    complaint_rule = re.compile("(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|疑问[人方]|检举[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?名称)?[::]+")
+    complainants_list = []
+    a = 1
+    load = load[9744:9745]
+    for article,sentences in zip(load['PAGE_CONTENT'],load['sentences']):
+        print(a)
+        a+=1
+        getSentences = sentences.split('*#*>')
+        # print(getSentences)
+        ners = getNers(getSentences,useselffool=True)
+        print(ners)
+        print('======================')
+        word_index = 0
+        ners_list = []
+        for ner,sentence in zip(ners,getSentences):
+            size = 16
+            complainants = []
+            if len(ner)!=0:
+                for aner in ner:
+
+                    entity_type = aner[2]
+                    entity_text = aner[3]
+                    # begin = word_index + aner[0]
+                    # end = begin + len(entity_text)
+                    # complainant
+                    if entity_type in ['org','company','person']:
+                        left = sentence[max(0, aner[0] - size):aner[0]]
+
+                        print(entity_text,left,sentence)
+                        if complaint_rule.search(left):
+                            print('yes')
+                            entity_type = 'complainant'
+                            complainants.append(entity_text)
+                    # ners_list.append([begin, end, entity_type, entity_text])
+            word_index += len(sentence)
+        complainants_list.append(complainants)
+
+        # test
+        # for i in ners_list:
+        #     print(i[3])
+        #     print(processed[0][i[0]:i[1]])
+    load['complainant'] = complainants_list
+    # load.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\test01.csv")
+
+# complainant, respondent, punished party
+def get_complainant():
+    data = pd.read_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2.xlsx",index_col=0)
+    # ners = load("nersList.pk")
+    unionNers = load("unionNers.pk")
+    ners = [i+j for i,j in zip(unionNers[0],unionNers[1])]
+    complainants = [[] for _ in range(len(data))]
+    punishPeople = [[] for _ in range(len(data))]
+    a = ['org','company','person']
+    size = 16
+    # complainant / challenger
+    complainants_rule1 = re.compile("(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+    # punished party, respondent
+    punishPeople_rule1 = re.compile("(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+    punishPeople_rule2_1 = re.compile(",$")
+    punishPeople_rule2_2 = re.compile("^[::]")
+    punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
+    punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
+
+    time1 = time.time()
+    for _ner in ners:
+        if _ner:
+            for ner in _ner:
+                left = ner['sentence'][max(0,ner['begin_index']-size):ner['begin_index']]
+                right = ner['sentence'][ner['end_index']:min(ner['end_index']+size,len(ner['sentence']))]
+                # print(left)
+                if complainants_rule1.search(left):
+                    complainants[ner['article_index']].append(ner['entity_text'])
+                elif punishPeople_rule1.search(left):
+                    punishPeople[ner['article_index']].append(ner['entity_text'])
+                elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
+                    if data['类别'][ner['article_index']] == '投诉处理':
+                        complainants[ner['article_index']].append(ner['entity_text'])
+                    else:
+                        punishPeople[ner['article_index']].append(ner['entity_text'])
+                elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
+                    punishPeople[ner['article_index']].append(ner['entity_text'])
+    data['complainant'] = complainants
+    data['punishPeople'] = punishPeople
+    print(time.time()-time1)
+    data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-1.xlsx")
+
+def get_complainant2(list_sentences, list_entitys, text_type):
+    '''
+    list_sentences: list_sentences from get_preprocessed()
+    list_entitys: list_entitys from get_preprocessed()
+    text_type: announcement category (punishment type)
+    :return:
+    complainants: list of complainants
+    punishPeople: respondents / punished parties
+    '''
+    sentences_list = list_sentences
+    entitys_list = list_entitys
+    size = 16
+    a = ['org', 'company', 'person']
+    b = ['org', 'company', 'union_org_company']
+    c = ['person', 'union_person']
+    need_entitys = []
+    for entity in entitys_list:
+        if entity.entity_type in a:
+            need_entitys.append(entity)
+    # merge adjacent entities
+    drop_count = 0
+    for i in range(1, len(need_entitys)):
+        entity = need_entitys[i]
+        entity_begin = entity.wordOffset_begin
+        entity_end = entity.wordOffset_end
+        sentence = sentences_list[entity.sentence_index].sentence_text
+        last_entity = need_entitys[i - 1]
+        if entity.sentence_index == last_entity.sentence_index:
+            if (entity.entity_type in b and last_entity.entity_type in b) or (
+                    entity.entity_type in c and last_entity.entity_type in c):
+                if entity_begin - last_entity.wordOffset_end < 2 and sentence[
+                                                                     last_entity.wordOffset_end:entity_begin] in ['',
+                                                                                                                  '、',
+                                                                                                                  '和',
+                                                                                                                  '及']:
+                    need_entitys[i].wordOffset_begin = last_entity.wordOffset_begin
+                    need_entitys[i].begin_index = last_entity.begin_index
+                    need_entitys[i].entity_text = last_entity.entity_text + '+' + entity.entity_text
+                    if entity.entity_type in b:
+                        need_entitys[i].entity_type = 'union_org_company'
+                    else:
+                        need_entitys[i].entity_type = 'union_person'
+                    need_entitys[i - 1] = 0
+                    drop_count += 1
+    for _ in range(drop_count):
+        need_entitys.remove(0)
+    # complainant / challenger
+    complainants_rule1 = re.compile(
+        "(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+    # punished party, respondent
+    punishPeople_rule1 = re.compile(
+        "(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
+    punishPeople_rule2_1 = re.compile(",$")
+    punishPeople_rule2_2 = re.compile("^[::]")
+    punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
+    punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")
+    complainants = []
+    punishPeople = []
+    for i in range(len(need_entitys)):
+        entity = need_entitys[i]
+        entity_begin = entity.wordOffset_begin
+        entity_end = entity.wordOffset_end
+
+        # sentence containing the entity
+        sentence = sentences_list[entity.sentence_index].sentence_text
+        left = sentence[max(0, entity_begin - size):entity_begin]
+        right = sentence[entity_end:min(entity_end + size, len(sentence))]
+
+        if complainants_rule1.search(left):
+            complainants.append(entity)
+        elif punishPeople_rule1.search(left):
+            punishPeople.append(entity)
+        elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
+            if text_type == '投诉处理':
+                complainants.append(entity)
+            else:
+                punishPeople.append(entity)
+        elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
+            punishPeople.append(entity)
+
+    result_complainants = []
+    result_punishPeople = []
+    for entity in complainants:
+        if entity.entity_type in ['union_org_company', 'union_person']:
+            entity_text = entity.entity_text.split('+')
+            for item in entity_text:
+                result_complainants.append(item)
+        else:
+            result_complainants.append(entity.entity_text)
+    for entity in punishPeople:
+        if entity.entity_type in ['union_org_company', 'union_person']:
+            entity_text = entity.entity_text.split('+')
+            for item in entity_text:
+                result_punishPeople.append(item)
+        else:
+            result_punishPeople.append(entity.entity_text)
+    return list(set(result_complainants)), list(set(result_punishPeople))
+
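+# Illustrative sketch (not part of the extraction pipeline): the rules above
+# classify each entity by inspecting a fixed-size window of text on either
+# side of it. The sentence, offsets and simplified pattern below are made up
+# purely for demonstration.
+def _demo_left_window_rule():
+    sentence = "投诉人:福建光正工程项目管理有限公司,联系地址:泉州台商投资区通港路大创商厦"
+    entity_begin, entity_end = 4, 18  # character offsets of the company entity
+    size = 16
+    left = sentence[max(0, entity_begin - size):entity_begin]
+    right = sentence[entity_end:min(entity_end + size, len(sentence))]
+    # the complainant rule fires when a label such as "投诉人:" ends the left window
+    print(re.search("投诉人[::]$", left) is not None)  # True
+    print(left, "|", right)
+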
+# announcement classification
+def textClassify():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
+    # complaint keywords: 投诉人 / 检举人 / 举报人 / 质疑人 / 质疑函
+    patten1 = "投诉人|检举人|举报人|质疑人|质疑函|投诉处理|质疑单位"
+    re1 = re.compile(patten1)
+    patten2 = "不予[处受]理|撤诉|撤[销回]投诉|投诉终止"
+    re2 = re.compile(patten2)
+    patten3 = "关于[^,。]+?(?:处罚|通报|处理意见)|被处罚人|处罚决定|限制行为开始时间|处罚执行部门"
+    re3 = re.compile(patten3)
+    patten4 = "不良行为|不良信用|不良记录|不规范行为|不诚信行为"
+    re4 = re.compile(patten4)
+    patten5 = "行政处罚|行政处理|监督检查|监管调查|监督处理|违规处[罚理]|违法处[罚理]"
+    re5 = re.compile(patten5)
+    patten6 = "严重违法失信起名单|严重违法失信行为|严重违法失信企业"
+    re6 = re.compile(patten6)
+    patten7 = '处理决定'
+    re7 = re.compile(patten7)
+    patten8 = "处[理罚]依据|处罚日期|扣分依据|认定依据"
+    re8 = re.compile(patten8)
+    pos = []
+    _type = []
+    for title,text in zip(data['PAGE_TITLE'],data["PAGE_CONTENT"]):
+        p = []
+        t = ''
+        if re1.search(text) or re1.search(title):
+            p.append(patten1)
+            t = '投诉'
+        elif re2.search(text) and re.search('投诉',text):
+            p.append('投诉+'+patten2)
+            t = '投诉'
+        elif re.search("回复",title):
+            p.append("回复")
+            t = '投诉'
+        if len(p)==0:
+            if re3.search(title) or re3.search(text):
+                p.append(patten3)
+                t = '处罚'
+            elif re4.search(title):
+                p.append(patten4)
+                t = '处罚'
+            elif re5.search(title) or re5.search(text):
+                p.append(patten5)
+                t = '处罚'
+            elif re6.search(text) or re6.search(title):
+                p.append(patten6)
+                t = '处罚'
+            elif re8.search(text):
+                p.append(patten8)
+                t = '处罚'
+        if len(p) == 0:
+            if re7.search(text) and re.search('投诉', text):
+                p.append('投诉+' + patten7)
+                t = '投诉'
+            elif re7.search(text) or re7.search(title):
+                p.append("处罚+"+patten7)
+                t = '处罚'
+        pos.append(p)
+        _type.append(t)
+    data['pos'] = pos
+    data['type'] = _type
+    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv")
+
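+# Illustrative sketch: the branch order above means complaint (投诉) evidence
+# always wins over punishment (处罚) evidence for the same article. The
+# patterns here are simplified stand-ins, not the full rules used above.
+def _demo_classify_priority():
+    title = "关于某供应商投诉事项的处理决定"
+    text = "经查,投诉人反映的问题部分属实,决定给予行政处罚。"
+    if re.search("投诉人|质疑人", title + text):
+        label = '投诉'   # complaint patterns are tested first
+    elif re.search("行政处罚", title + text):
+        label = '处罚'   # punishment is only assigned when no complaint evidence matched
+    else:
+        label = ''
+    print(label)  # prints 投诉, even though the text also mentions 行政处罚
+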
+# whether the complaint was upheld
+def get_punishWhether01():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv",index_col=0)
+    data = data[data['type']=='投诉']
+    punishWhether_1 = re.compile("投诉[^。,,不]+?成立|投诉[^。,,]*[^不]属实|情况[^。,,]*[^不]属实|投诉成立|情况属实|予以支持")
+    punishWhether_0 = re.compile("投诉[^。,,]*不能?成立|撤诉|[^逾将]{4,}不予[受处]理|撤[回销][^。,,]*(?:举报|投诉)|驳回[^。,,]*投诉|投诉终止|终止[^。,,]*投诉|情况[^。,,]*不属实|投诉[^。,,]*不属实|缺乏事实依据|不予支持|予以驳回")
+    punishWhether = []
+    punishDecision = []
+    punishDecision_1 = re.compile("(?:决定|认定|综上所述|决定如下|处理结果|处理如下|处理结果公布)[::]((?:(?:[\d一二三四五六七八九十]|[\((][\d一二三四五六七八九十][\))]|投[诉拆]事项[\d一二三四五六七八九十]).+?。)+)")
+    punishDecision_2 = re.compile("(?:决定|认定|综上所述|决定如下|处理结果|处理如下|处理结果公布)[::]([^。]+?(?:。|$))")
+    punishDecision_3 = re.compile("[\d一二三四五六七八九十]、(?:处理,?意见|(?:裁决|处理)依据及结果|处理(?:决定|结果)|投诉处理决定),(.+?)。[\d一二三四五六七八九十]、")
+    punishDecision_4 = re.compile("(?:[\d一二三四五六七八九十]、处理,?意见|综上所述|(?:裁决|处理)依据及结果|综上|[\d一二三四五六七八九十]、处理(?:决定|结果)|经研究决定|[\d一二三四五六七八九十]、投诉处理决定),([^。]+?(?:。|$))")
+    punishDecision_5 = re.compile("(本机关决定|本机关认为|经审查.+?(?:。|$))")
+    punishDecision_6 = re.compile("((?:依据|按照|根据|依照)[^::。].+?(?:。|$))")
+
+    def findDecision(text):
+        decision = ''
+        if punishDecision_1.search(text):
+            decision = punishDecision_1.search(text).group(1)
+
+        elif punishDecision_2.search(text):
+            decision = punishDecision_2.search(text).group(1)
+        elif punishDecision_3.search(text):
+            decision = punishDecision_3.search(text).group(1)
+        elif punishDecision_4.search(text):
+            decision = punishDecision_4.findall(text)
+            decision = decision[-1]
+        elif punishDecision_5.search(text):
+            decision = punishDecision_5.search(text).group(1)
+        elif punishDecision_6.search(text):
+            decision = punishDecision_6.findall(text)
+            decision1 = decision[-1]
+            if re.search("诉讼",decision1) and len(decision)>1:
+                decision1 = decision[-2]
+            decision = decision1
+        return decision
+
+    for text in data['PAGE_CONTENT']:
+        pw = ''
+        if punishWhether_1.search(text):
+            pw = 1
+        elif punishWhether_0.search(text):
+            pw = 0
+        punishWhether.append(pw)
+
+        mid = len(text)//2
+        lower_half = text[mid:]
+        decision = findDecision(lower_half)
+        if decision == '':
+            decision = findDecision(text)
+
+        punishDecision.append(decision)
+    data['punishWhether'] = punishWhether
+    data['punishDecision'] = punishDecision
+    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishWhether&Decision.csv")
+# punishment decision (punishment announcements)
+def get_punishDecision():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\textClassify01.csv", index_col=0)
+    data = data[data['type'] == '处罚']
+    punishDecision_1 = re.compile("(?:处罚结果|处理结果|处罚结论|处罚内容|处理意见|考评结果|我局决定|处罚决定|[以如]下行政处罚|如下监督处理决定|如下处理决定|处理意见如下|处罚[以如]下|[以如]下处罚|决定如下|处理如下)[::]+((?:(?:[\d一二三四五六七八九十]|[\((][\d一二三四五六七八九十][\))]).+?。)+)")
+    punishDecision_2 = re.compile("(?:处罚结果|处理结果|处罚结论|处罚内容|处理意见|考评结果|我局决定|处罚决定|[以如]下行政处罚|如下监督处理决定|如下处理决定|处理意见如下|处罚[以如]下|[以如]下处罚|决定如下|处理如下)[::]+(.+?(?:。|$))")
+    punishDecision_3 = re.compile("(扣分分?值[::][\d.]+分?)")
+    punishDecision_4 = re.compile("[\d一二三四五六七八九十]、(?:处理结果|处理决定|处理依据[和及]处理结果|处理依据及结果|处罚决定|处罚结果|整改意见),(.+?)。[\d一二三四五六七八九十]、")
+    punishDecision_5 = re.compile("(?:处理结果|[\d一二三四五六七八九十]、处理决定|处理依据及处理结果|处理依据及结果|经研究|经研究决定|[\d一二三四五六七八九十]、处罚决定|处罚结果|整改意见),+(.+?(?:。|$))")
+    punishDecision_6 = re.compile("(?:本机关决定|我局决定)(.+?(?:。|$))")
+    punishDecision_7 = re.compile("((?:依据|按照|根据|依照)[^::。].+?(?:。|$))")
+    punishDecision = []
+    for text in data['PAGE_CONTENT']:
+        decision = ''
+        if punishDecision_1.search(text):
+            decision = punishDecision_1.search(text).group(1)
+        elif punishDecision_2.search(text):
+            decision = punishDecision_2.search(text).group(1)
+        elif punishDecision_3.search(text):
+            decision = punishDecision_3.search(text).group(1)
+        elif punishDecision_4.search(text):
+            decision = punishDecision_4.search(text).group(1)
+        elif punishDecision_5.search(text):
+            decision = punishDecision_5.findall(text)
+            decision = decision[-1]
+        elif punishDecision_6.search(text):
+            decision = punishDecision_6.search(text).group(1)
+        elif punishDecision_7.search(text):
+            decision = punishDecision_7.findall(text)
+            decision = decision[-1]
+        punishDecision.append(decision)
+    data['punishDecision'] = punishDecision
+    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishDecision处罚.csv")
+
+# enforcement institution and punishment date
+def get_institution():
+    data = pd.read_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-1.xlsx", index_col=0)
+    ners = load("nersList.pk")
+    orgs = [[] for _ in range(len(data))]
+    times = [[] for _ in range(len(data))]
+    institutions = [[] for _ in range(len(data))]
+    punishTimes = [[] for _ in range(len(data))]
+    institution_1 = re.compile("(?:处罚执行部门|认定部门|执法机关名称|执法单位|通报部门|处罚机关|处罚部门)[::]")
+    punishTimes_1 = re.compile("(?:处罚日期|限制行为开始时间|曝光开始日期|处罚决定日期|处罚期限|处罚时间|处理日期|公告开始时间)[::]")
+    for ner in ners:
+        if ner['entity_type'] == 'org':
+            left = ner['sentence'][max(0,ner['begin_index']-15):ner['begin_index']]
+            if institution_1.search(left):
+                institutions[ner['article_index']].append(ner['entity_text'])
+            orgs[ner['article_index']].append(ner)
+        elif ner['entity_type'] =='time':
+            left = ner['sentence'][max(0, ner['begin_index'] - 15):ner['begin_index']]
+            if punishTimes_1.search(left):
+                punishTimes[ner['article_index']].append(ner['entity_text'])
+            times[ner['article_index']].append(ner)
+    orgs = [org[-5:] if len(org)>5 else org for org in orgs]
+    times = [time[-3:] if len(time)>3 else time for time in times]
+    data['org'] = orgs
+    data['time'] = times
+    data['institution'] = institutions
+    data['punishTime'] = punishTimes
+    # data = data[data['type'].isin(["投诉","处罚"])]
+    print(len(data))
+    # data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\get_institution.csv")
+    # data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\get_institution.csv", index_col=0)
+    institution_list = []
+    punishTime_list = []
+    institution_title = re.compile("财政局|财政厅|监督管理局|公管局|公共资源局|委员会")
+    institution_time = re.compile("(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
+    for title,text,org,n_time,institution,punishTime in zip(data['PAGE_TITLE'],data['PAGE_CONTENT'],data['org'],data['time'],data['institution'],data['punishTime']):
+        ins = ''
+        ptime = ''
+        if punishTime:
+            ptime = punishTime
+        if institution:
+            ins = institution
+        else:
+            title_ners = getNers([title],useselffool=True)
+            if title_ners[0]:
+
+                for title_ner in title_ners[0]:
+
+                    if title_ner[2]=='org' and institution_title.search(title_ner[3]):
+                        # 'title:'+
+                        ins = title_ner[3]
+                        # print(title_ner[3])
+                        break
+
+        # if ins == '':
+        for _org in org[::-1]:
+            right = _org['sentence'][_org['end_index']:min(len(_org['sentence']),_org['end_index']+16)]
+            if institution_time.search(right):
+                if ins == '':
+                    # "text_EndWithTime:" +
+                    ins = _org['entity_text']
+                if ptime == '':
+                    # "text_EndWithIns:" +
+                    ptime =institution_time.search(right).group(1)
+                break
+        if ptime == '' and len(n_time) != 0:
+            textLong = len(text)
+            if n_time[-1]['wordOffset_end'] > textLong-3 and len(n_time[-1]['entity_text'])>3:
+                # "EndOfText:" +
+                ptime = n_time[-1]['entity_text']
+
+        institution_list.append(ins)
+        punishTime_list.append(ptime)
+    data['institution'] = institution_list
+    data['punishTime'] = punishTime_list
+    data = data.drop(columns=['org','time'],axis=1)
+    data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-2.xlsx")
+
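+# Illustrative sketch: when no explicit "处罚机关:"-style label is found, the
+# code above falls back to (1) a regulator-looking org entity in the title,
+# then (2) the last org entity that is immediately followed by a date -- the
+# typical "机关名 + 落款日期" signature at the end of an announcement. Toy
+# example of heuristic (2), with fabricated offsets:
+def _demo_org_followed_by_date():
+    sentence = "特此通报。泉州市财政局,2020年5月8日。"
+    org_end = 11  # end offset (exclusive) of "泉州市财政局"
+    right = sentence[org_end:org_end + 16]
+    m = re.search("^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?", right)
+    print(m.group() if m else None)  # ,2020年5月8日
+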
+# punishment type classification
+def get_punishType():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv", index_col=0)
+    # tentative categories: 严重违法失信 (serious dishonesty), 行政处罚 (administrative punishment), 投诉处理 (complaint handling), 监督检查 (supervision & inspection), 其他失信记录 (other bad-credit records)
+
+    # unrelated announcements (filtered purely by title)
+    title_rule = re.compile("(?:中标公[示告]|中标[(\(]成交[\))]公告|采购结果公[示告]|评审结果公告|[侯候]选人公[示告]|成交公[示告]"
+                            "|补贴公[示告]|废标公[示告]|备案公[示告]|数据统计|选取公告|流标公告|变更公告|入围公告|征集公告|执行情况|"
+                            "登记公告|竞争性磋商公告|报名的公[示告]|竞争性谈判公告|邀请函|竞标公告|采购公告|招标公告|议标公告|预审公告|"
+                            "询价公告|竞争性磋商(磋商)公告|竞[谈价]公告|合同公告|人员(名单)?公示|批复|终止公告|入围结果公告|中标结果公[示告]|"
+                            "意见公示)(?:[\((].+?[\))])?$|关于.*通知(?:[^书]|$)")
+    othertype = "其他无关公告"
+    # complaint handling
+    re1_1 = re.compile("投诉[人方]|检举人|举报人[::]|投诉处理|终止投诉|投诉终止|撤诉|撤回投诉|质疑人|质疑单位|质疑[^,,。]*答复")
+    re1_2 = re.compile("处理决定|回复")
+    re1_type = '投诉处理'
+    # supervision and inspection
+    re2 = re.compile("监督检查|监管调查|监督处理")
+    re2_type = "监督检查"
+    # administrative punishment
+    re3 = re.compile("行政处罚|行政处理")
+    re3_type = "行政处罚"
+    # serious illegal and dishonest conduct
+    re4 = re.compile("严重违法失信行为|严重违法失信企业|严重违法失信起名单")
+    re4_type = "严重违法失信"
+    # other bad-credit announcements
+    re_other = re.compile("关于[^,。]+?(?:处罚|处理|通报)|不良行为|不良信用|不良记录|不规范行为|不诚信行为|"
+                          "违[规法约]处[罚理]|处[理罚]依据|处罚日期|扣分依据|认定依据|处罚决定|违规情况|"
+                          "违[规法]行为|违规事项|考评依据|失信行为")
+    re_otherType = "其他失信公告"
+    punishType_list = []
+    for title,text in zip(data['PAGE_TITLE'],data['PAGE_CONTENT']):
+        punishType = ''
+        titleWithText = title + text
+        if title_rule.search(title):
+            punishType = othertype
+        elif re1_1.search(titleWithText) or re.search("投[诉拆]",title):
+            punishType = re1_type
+        elif re1_2.search(titleWithText) and re.search("投诉",titleWithText):
+            punishType = re1_type
+        elif re2.search(titleWithText):
+            punishType = re2_type
+        elif re3.search(titleWithText):
+            punishType = re3_type
+        elif re4.search(titleWithText):
+            punishType = re4_type
+        elif re_other.search(titleWithText) or re.search("处罚",title):
+            punishType = re_otherType
+        punishType_list.append(punishType)
+    data['punishType'] = punishType_list
+    data.to_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishType_test.csv",encoding='utf-8')
+
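+# Illustrative sketch: get_punishType() first discards ordinary procurement
+# announcements purely from the title, before any punishment rule is tried.
+# A toy check with a trimmed-down version of title_rule:
+def _demo_title_prefilter():
+    unrelated = re.compile("(?:中标公[示告]|成交公[示告]|招标公告)(?:[\((].+?[\))])?$")
+    print(unrelated.search("某某项目中标公告") is not None)              # True  -> 其他无关公告
+    print(unrelated.search("关于某公司违规行为的处罚公告") is not None)  # False -> falls through to the punishment rules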
+
+def getNers_my(sentences,MAXAREA = 10000,useselffool=False):
+    '''
+    @param sentences: list of sentences to run NER on
+    @return: NER result list, produced with throttled (batched) execution
+    '''
+    def getData(ners,process_data):
+        process_sentences = [item[1] for item in process_data]
+        print(process_data)
+        if useselffool:
+            ner_ = selffool.self_ner(process_sentences)
+        else:
+            ner_ = selffool.ner(process_sentences)
+        print('ner_ :',ner_)
+        for i in range(len(ner_)):
+            the_index = process_data[i][0]
+            ners[the_index] = ner_[i]
+    sents = []
+    for i in range(len(sentences)):
+        sents.append([i,sentences[i]])
+    sents.sort(key=lambda x:len(x[1]),reverse=True)
+    print(sents)
+    index_ = 0
+    ners = [[]for i in range(len(sentences))]
+
+    while(True):
+        width = len(sents[index_][1])
+        height = MAXAREA//width+1
+        if height>len(sents)-index_:
+            height = len(sents)-index_
+        process_data = sents[index_:index_+height]
+        getData( ners, process_data)
+        index_ += height
+        if index_>=len(sents):
+            break
+    return ners
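+
+# Illustrative sketch: getNers_my() throttles NER by sorting sentences by
+# length and batching them so that (longest sentence in batch) x (batch size)
+# stays close to MAXAREA. The batch-size computation in isolation, on toy
+# sentence lengths:
+def _demo_batch_sizes(lengths, MAXAREA=10000):
+    lengths = sorted(lengths, reverse=True)
+    index_, batches = 0, []
+    while index_ < len(lengths):
+        width = lengths[index_]
+        height = min(MAXAREA // width + 1, len(lengths) - index_)
+        batches.append(lengths[index_:index_ + height])
+        index_ += height
+    return batches
+# _demo_batch_sizes([5000, 4000, 30, 20, 10]) -> [[5000, 4000, 30], [20, 10]]
+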
+# raw HTML announcement preprocessing
+def get_article1(articles,cost_time = dict(),useselffool=True):
+    '''
+    :param articles: source HTML of the articles to process
+    :param useselffool: whether to use selffool
+    :return: list_articles
+    '''
+
+    list_articles = []
+    for article in articles:
+        a_time = time.time()
+        sourceContent = article
+        # table handling
+        key_preprocess = "tableToText"
+        start_time = time.time()
+        article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
+
+        # log(article_processed)
+
+        if key_preprocess not in cost_time:
+            cost_time[key_preprocess] = 0
+        cost_time[key_preprocess] += time.time()-start_time
+
+        #article_processed = article[1]
+        list_articles.append(article_processed)
+        print(time.time()-a_time)
+    return list_articles
+# sentence splitting
+def get_sentences1(list_articles,useselffool=True,cost_time=dict()):
+    '''
+    :param list_articles: preprocessed article texts
+    :return: list_sentences
+    '''
+
+    list_sentences = []
+    for article in list_articles:
+        a_time = time.time()
+        list_sentences_temp = []
+        # table handling (already done upstream; kept here only for timing bookkeeping)
+        key_preprocess = "tableToText"
+        start_time = time.time()
+        article_processed = article
+
+
+        if key_preprocess not in cost_time:
+            cost_time[key_preprocess] = 0
+        cost_time[key_preprocess] += time.time()-start_time
+
+        # NLP processing
+        if article_processed is not None and len(article_processed)!=0:
+            split_patten = "。"
+            sentences = []
+            _begin = 0
+            sentences_set = set()
+            for _iter in re.finditer(split_patten,article_processed):
+                _sen = article_processed[_begin:_iter.span()[1]]
+                if len(_sen)>0 and _sen not in sentences_set:
+                    sentences.append(_sen)
+                    sentences_set.add(_sen)
+                _begin = _iter.span()[1]
+            _sen = article_processed[_begin:]
+            if len(_sen)>0 and _sen not in sentences_set:
+                sentences.append(_sen)
+                sentences_set.add(_sen)
+            # article = "".join(sentences)
+            # # sentences.append(article_processed[_begin:])
+            #
+            # lemmas = []
+            # doc_offsets = []
+            # dep_types = []
+            # dep_tokens = []
+            #
+            # time1 = time.time()
+
+            '''
+            tokens_all = fool.cut(sentences)
+            #pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
+            #ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
+            ner_entitys_all = fool.ner(sentences)
+            '''
+            # throttled execution
+            key_nerToken = "nerToken"
+            start_time = time.time()
+            # tokens_all = getTokens(sentences,useselffool=useselffool)
+            if key_nerToken not in cost_time:
+                cost_time[key_nerToken] = 0
+            cost_time[key_nerToken] += time.time()-start_time
+
+
+            for sentence_index in range(len(sentences)):
+
+                sentence_text = sentences[sentence_index]
+                # tokens = tokens_all[sentence_index]
+                #
+                # #pos_tag = pos_all[sentence_index]
+                # pos_tag = ""
+                #
+                # ner_entitys = ""
+
+                list_sentences_temp.append(sentence_text)
+
+        if len(list_sentences_temp)==0:
+            # fallback: no sentence was produced (empty article), so append the
+            # article text itself instead of an unbound sentence_text
+            list_sentences_temp.append(article_processed if article_processed else "")
+        list_sentences.append(list_sentences_temp)
+        print('2:',time.time()-a_time)
+    return list_sentences
+
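+# Illustrative sketch: the splitter above cuts on "。" and drops exact
+# duplicate sentences (boilerplate repeated within one announcement). Toy run:
+def _demo_split_dedup():
+    text = "第一段。第二段。第一段。结尾没有句号"
+    sentences, seen, _begin = [], set(), 0
+    for _iter in re.finditer("。", text):
+        _sen = text[_begin:_iter.span()[1]]
+        if len(_sen) > 0 and _sen not in seen:
+            sentences.append(_sen)
+            seen.add(_sen)
+        _begin = _iter.span()[1]
+    _sen = text[_begin:]
+    if len(_sen) > 0 and _sen not in seen:
+        sentences.append(_sen)
+    return sentences
+# _demo_split_dedup() -> ['第一段。', '第二段。', '结尾没有句号']  (the repeated '第一段。' is dropped)
+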
+# entity-merging experiment ("ronghe" = merge): joins adjacent NER spans separated by "、" or ","
+def ronghe():
+    a = ",投诉处理决定书,投诉人:福建光正工程项目管理有限公司,联系地址:福建省漳州市芗城区水仙大道与东环城路交叉口西南角新城苑北区1幢1301-1305室,被投诉人:泉州台商投资区城市建设发展有限公司,泉州台商投资区水务投资经营有限公司,福建省富诚工程管理有限公司,联系地址:泉州台商投资区通港路大创商厦,一、投诉人投诉事项,投诉人按中标候选人公示的要求参加会议,由于提供的身份证原件于复印件版本不同而被废标,认为废标理由不成立。"
+    ners = [(13, 28, 'company', '福建光正工程项目管理有限公司'), (33, 75, 'location', '福建省漳州市芗城区水仙大道与东环城路交叉口西南角新城苑北区1幢1301-1305室'), (80, 98, 'company', '泉州台商投资区城市建设发展有限公司'), (98, 116, 'company', '泉州台商投资区水务投资经营有限公司'), (116, 130, 'company', '福建省富诚工程管理有限公司'), (135, 150, 'location', '泉州台商投资区通港路大创商厦')]
+    s = ['person', 'org', 'company', 'union']
+    remove_num = 0
+    for i in range(len(ners) - 1):  # stop one early: the loop body reads ners[i + 1]
+        print(0)
+        ner = ners[i]
+        begin = ner[0]
+        end = ner[1]
+        type = ner[2]
+
+        if type in s:
+            if end == ners[i+1][0] and a[end-1]=='、':
+                print(1)
+                new_begin = begin
+                new_end = ners[i+1][1]
+                new_type = 'union'
+                new_text = ner[3]+'、'+ners[i+1][3]
+                new_ner = (new_begin,new_end,new_type,new_text)
+                ners[i] = 0
+                ners[i+1] = new_ner
+                remove_num += 1
+                continue
+            if end == ners[i + 1][0] and a[end-1] == ',' and a[ners[i + 1][1]-1]==a[end-1]:
+                print(2)
+                new_begin = begin
+                new_end = ners[i + 1][1]
+                new_type = 'union'
+                new_text = ner[3] + ',' + ners[i + 1][3]
+                new_ner = (new_begin, new_end, new_type, new_text)
+                ners[i] = 0
+                ners[i + 1] = new_ner
+                remove_num += 1
+
+    for i in range(remove_num):
+        ners.remove(0)
+    print(ners)
+
+if __name__ == '__main__':
+    # get_data1()
+    # get_ners()
+    # test02()
+    # get_unionNers()
+    # complainant / complained-against or punished party
+    # get_complainant()
+    # ronghe()
+    # classification
+    # textClassify()
+    # whether the complaint was upheld, and decision (complaint announcements)
+    # get_punishWhether01()
+    # punishment decision (punishment announcements)
+    # get_punishDecision()
+    # enforcement institution and punishment date
+    get_institution()
+    # punishment type
+    # get_punishType()
+
+    pass

二进制
BiddingKG/dl/complaint/vocab_word.pk