#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author : bidikeji
# @Time : 2020/12/24 0024 15:23
import re
import os
import time
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import graph_util
from BiddingKG.dl.common.Utils import *
from tensorflow.contrib.crf import crf_log_likelihood, viterbi_decode
from tensorflow.contrib.layers.python.layers import initializers
from keras.preprocessing.sequence import pad_sequences
import BiddingKG.dl.interface.Preprocessing as Preprocessing
from BiddingKG.dl.interface.Preprocessing import *

def BiLSTM_CRF_tfmodel(sess, weights):
    BiRNN_Units = 140
    # BMEO tagging scheme: PN_B/PN_M/PN_E mark the beginning/middle/end
    # characters of a punishment document number, O is outside any number
    chunk_tags = {
        'O': 0,
        'PN_B': 1,
        'PN_M': 2,
        'PN_E': 3
    }

    def embedding_layer(input):
        # look up pretrained character embeddings; weights is a (vocab_size, embedding_size) matrix
        embedding = tf.get_variable("embedding", initializer=np.array(weights, dtype=np.float32) if weights is not None else None, dtype=tf.float32)
        return tf.nn.embedding_lookup(params=embedding, ids=input)

    def BiLSTM_Layer(input, length):
        with tf.variable_scope("BiLSTM"):
            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2, state_is_tuple=True)
            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2, state_is_tuple=True)
            output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell, backward_cell, input, dtype=tf.float32, sequence_length=length)
            output = tf.concat(output, 2)
        return output

    def CRF_layer(input, num_tags, BiRNN_Units, time_step):
        # two-layer projection from the BiLSTM outputs to per-tag emission scores
        with tf.variable_scope("CRF"):
            with tf.variable_scope("hidden"):
                w_hidden = tf.get_variable(name='w_hidden', shape=(BiRNN_Units, BiRNN_Units//2), dtype=tf.float32,
                                           initializer=initializers.xavier_initializer(), regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_hidden = tf.get_variable(name='b_hidden', shape=(BiRNN_Units//2), dtype=tf.float32, initializer=tf.zeros_initializer())
                # print(input)
                input_reshape = tf.reshape(input, shape=(-1, BiRNN_Units))
                hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape, w_hidden, b_hidden))
            with tf.variable_scope("output"):
                w_output = tf.get_variable(name='w_output', shape=(BiRNN_Units//2, num_tags), dtype=tf.float32,
                                           initializer=initializers.xavier_initializer(), regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_output = tf.get_variable(name='b_output', shape=(num_tags), dtype=tf.float32, initializer=tf.zeros_initializer())
                pred = tf.nn.xw_plus_b(hidden, w_output, b_output)
                logits_ = tf.reshape(pred, shape=(-1, time_step, num_tags), name='logits')
                return logits_

    def layer_loss(input, true_target, num_tags, length):
        with tf.variable_scope("crf_loss"):
            # note: the 'transitons' spelling must stay as-is; it matches the saved
            # checkpoints and the frozen-graph node name in save_punish_code_model()
            trans = tf.get_variable(name='transitons', shape=(num_tags, num_tags), dtype=tf.float32, initializer=initializers.xavier_initializer())
            log_likelihood, trans = crf_log_likelihood(inputs=input, tag_indices=true_target, transition_params=trans, sequence_lengths=length)
            return tf.reduce_mean(-log_likelihood), trans

    with sess.graph.as_default():
        char_input = tf.placeholder(name='char_input', shape=(None, None), dtype=tf.int32)
        target = tf.placeholder(name='target', shape=(None, None), dtype=tf.int32)
        length = tf.placeholder(name='length', shape=(None,), dtype=tf.int32)
        # keepprob = tf.placeholder(name='keepprob',dtype=tf.float32)

        _embedding = embedding_layer(char_input)
        _shape = tf.shape(char_input)
        batch_size = _shape[0]
        step_size = _shape[-1]
        bilstm = BiLSTM_Layer(_embedding, length)
        _logits = CRF_layer(bilstm, num_tags=len(chunk_tags), BiRNN_Units=BiRNN_Units, time_step=step_size)
        crf_loss, trans = layer_loss(_logits, true_target=target, num_tags=len(chunk_tags), length=length)
        global_step = tf.Variable(0, trainable=False)
        with tf.variable_scope("optimizer"):
            opt = tf.train.AdamOptimizer(0.002)
            grads_vars = opt.compute_gradients(crf_loss)
            # clip gradients to [-5, 5] to stabilise training
            capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g, v in grads_vars]
            train_op = opt.apply_gradients(capped_grads_vars, global_step)
        print('tensor: ', char_input, length, trans, _logits)
        return char_input, _logits, target, length, crf_loss, trans, train_op
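
# A minimal training-loop sketch for the graph built above (illustrative only;
# `w2v_matrix` and the `batches` iterator are assumptions, the real data pipeline
# lives elsewhere in BiddingKG):
#
#     sess = tf.Session(graph=tf.Graph())
#     char_input, logits, target, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
#     with sess.as_default():
#         sess.run(tf.global_variables_initializer())
#         for batch_x, batch_y, batch_len in batches:
#             loss, _ = sess.run([crf_loss, train_op],
#                                feed_dict={char_input: batch_x, target: batch_y, length: batch_len})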

def decode(logits, trans, sequence_lengths, tag_num):
    # Viterbi-decode each sentence's emission scores with the learned transition
    # matrix; tag_num is unused but kept for interface compatibility
    viterbi_sequences = []
    for logit, length in zip(logits, sequence_lengths):
        score = logit[:length]
        viterbi_seq, viterbi_score = viterbi_decode(score, trans)
        viterbi_sequences.append(viterbi_seq)
    return viterbi_sequences
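
# Usage sketch for decode() with dummy shapes (values are illustrative):
#
#     logits = np.zeros((2, 10, 4), dtype=np.float32)   # (batch, time, num_tags)
#     trans = np.zeros((4, 4), dtype=np.float32)        # CRF transition matrix
#     paths = decode(logits, trans, sequence_lengths=[10, 7], tag_num=4)
#     # paths[0] has length 10, paths[1] has length 7; entries are tag ids 0..3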

class Punish_Extract():
    def __init__(self, model_file=os.path.dirname(__file__) + "/models/21-0.9990081295021194-0.3647936/model.ckpt"):
        print('model_file_path:', model_file)
        self.sess = tf.Session(graph=tf.Graph())
        self.code = ""
        self.punish_dicition = ""
        self.model_file = model_file  # model for predicting punishment codes
        self.load_model()

    # load the punishment-code prediction model
    def load_model(self):
        with self.sess.as_default() as sess:
            with sess.graph.as_default():
                vocab_model = getModel_word()
                vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
                self.char_input, self.logits, self.target, self.length, self.crf_loss, self.trans, self.train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
                sess.run(tf.global_variables_initializer())
                saver = tf.train.Saver()
                saver.restore(sess, self.model_file)

    # predict punishment codes
    def predict_punishCode(self, list_sentences):
        # the decoded tag ids of each sentence form a digit string; "12+?3" matches
        # the id sequence PN_B (1), PN_M (2)+, PN_E (3), i.e. one predicted code span
        re_ner = re.compile("12+?3")
        article_ner_list = []
        count = 0
        with self.sess.as_default():
            with self.sess.graph.as_default():
                for sentences in list_sentences:
                    count += 1
                    # print(count)
                    sentence_len = [len(sentence.sentence_text) for sentence in sentences]
                    maxlen = max(sentence_len)
                    sentences_x = []
                    for sentence in sentences:
                        sentence = sentence.sentence_text
                        sentence = list(sentence)
                        sentence2id = [getIndexOfWord(word) for word in sentence]
                        sentences_x.append(sentence2id)
                    sentences_x = pad_sequences(sentences_x, maxlen=maxlen, padding="post", truncating="post")
                    sentences_x = [np.array(x) for x in sentences_x]
                    print('punish tensor: ', self.logits, self.trans, self.char_input, self.length)
                    _logits, _trans = self.sess.run([self.logits, self.trans],
                                                    feed_dict={self.char_input: np.array(sentences_x), self.length: sentence_len})
                    viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=4)

                    ner_list = []
                    for _seq, sentence in zip(viterbi_sequence, sentences):
                        sentence = sentence.sentence_text
                        seq_id = ''.join([str(s) for s in _seq])
                        if re_ner.search(seq_id):
                            # print("sentence: ",sentence)
                            for _ner in re_ner.finditer(seq_id):
                                start = _ner.start()
                                end = _ner.end()
                                n = sentence[start:end]
                                # print(n,'<==>',start,end)
                                # ner_list.append((n, start, end))
                                ner_list.append(n)  # changed to return only the entity text
                    # article_ner_list.append(ner_list)
                    article_ner_list.append(';'.join(set(ner_list)))
        # only the first article's codes are returned; callers pass one article at a time
        return article_ner_list[0]
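
    # Example of the tag-to-span matching above (illustrative): for a sentence like
    # "处罚编号:厦财企〔2020〕12号,..." the model should emit ids such as
    # "0000012222222223..."; re_ner then recovers the span "厦财企〔2020〕12号".
    # Actual ids depend on the trained checkpoint.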

    # punishment type
    def get_punishType(self, x1, x2):
        '''Classify the announcement by its title and body text.
        x1: title
        x2: body text
        return: (matched keyword, category)'''
        # x1 = x1.replace('(','(').replace(')', ')').replace(' ','')
        # x2 = x2.replace('(', '(').replace(')', ')').replace(' ', '')
        '''title patterns'''
        # unknown announcements
        unknow = re.compile('采购方式|采购公告|磋商公告|谈判公告|交易公告$|征集|征求|招标公告|竞标公告|中标公告|'
                            '成交公告|成交信息|流标公告|废标公告|城市管理考评|决算表|决算|预算|资格考试|招聘|选聘'
                            '|聘请|拟录用|无违规违法|无此项信息|暂无工程投标违法|管理办法|指导意见|无投诉|投诉办法'
                            '|公共资源交易情况|绩效评价|考试成绩|付息公告|不动产|办证|印发|转发')  # '结果公示' only partly belongs here
        # complaint handling
        tscl = re.compile('投诉不予[处受]理|投诉不成立|终止投诉|投诉终止|不予受理|投诉事?项?的?处理')
        # administrative penalty
        xzcf = re.compile('行政处罚|行政处理|政处罚|行政裁决|防罚|公罚|医罚|环罚|政罚|文罚|局罚|旅罚|财罚|运罚')
        # supervision and inspection
        jdjc = re.compile('(监督检查的?问?题?(处理|整改|记分|结果|决定|处罚))|监督处罚|调查处理|监督处理')
        # serious violations
        yzwf = re.compile('严重违法失信|黑名单|失信名单')
        # misconduct
        blxw = re.compile('((不良|失信|不诚信|差错|不规范|违规|违约|处罚|违法)(行为|记录|信息))|((违约|违规|违法)(处理|操作|情况|问题))'
                          '|通报批评|记分管理|迟到|早退|缺席|虚假材料|弄虚作假|履职不到位|诚信考核扣分|串通投标'
                          '|审核不通过|码一致|地址一致|扣分处理|扣分通知|扣[0-9]+分|责令整改|信用信息认定书$'
                          '|关于.{,30}的处罚|关于.{,10}的?考评通报|关于.{,30}扣分情况|不规范代理行为'
                          '|(取消|暂停|限制).{,50}((专家|评标|评委|投标|竞价|被抽取|中标|供应商|候选人)资格)'
                          '|(代理服?务?机构).{,10}(扣分)|(专家).{,30}(扣分|记分|处罚)|对.{,30}处理|冻结.{,30}账号')
        # other misconduct
        other = re.compile('质疑|代理机构进场交易情况|网上投诉办理|信用奖惩|信用奖罚|进场工作.{,5}考核'
                           '|举报处理|结果无效|成交无效|行政复议')

        '''body-text patterns'''
        # complaint handling
        tscl_c = re.compile('(投诉(人|单位)[1-9]?(名称)?[::])|(投诉事项[1-5一二三四五、]*部?分?(成立|予以受理))'
                            '|((驳回|撤回|撤销|终止)[^,。]{,60}(投诉|质疑))')
        # administrative penalty
        xzcf_c = re.compile('((处理依据及结果|处理结果|处罚结果)).*行政处罚|如下行政处罚|行政处罚决定')
        # integrity bonus points
        cxjf_c = re.compile('处罚结果.*诚信加分')
        # serious violation / loss of credit
        yzwf_c = re.compile('工商部门严重违法失信起名单|严重违法失信的具体情形')
        # misconduct
        blxw_c = re.compile('(取消|暂停|限制).{,30}((专家|评标|评委|投标|采购|竞价|被抽取|中标|供应商)的?资格)'
                            '|(处罚结果|处罚情况).*(扣[1-9]*分|记分|不良行为|不良记录|不良信用|不诚信|扣除信用'
                            '|诚信档案|信用信息|取消.*资格|口头警告|处罚机关|责令改正|罚款|限制投标|暂扣|禁止'
                            '|暂停|封禁|暂无|行政处罚)|处罚结果'
                            '|处罚主题|禁止参与.{,10}政府采购活动|列入不良行为|处罚如下|如下处罚|违规处罚|处罚违规'
                            '|责令改正|责令整改|处罚依据|进行以下处理|处理依据及结果|处理结果|处罚决定书|'
                            '(不规范|不良|不诚信)行为记录')
        # other misconduct
        other_c = re.compile('质疑(人|单位)[1-9]?(名称)?:|公告期内受质疑')

        if re.search(unknow, x1):
            return re.search(unknow, x1).group(0), '未知类别'
        elif re.search(yzwf, x1):
            return re.search(yzwf, x1).group(0), '严重违法'
        elif re.search(yzwf_c, x2):
            return re.search(yzwf_c, x2).group(0), '严重违法'

        elif re.search(tscl, x1):
            return re.search(tscl, x1).group(0), '投诉处理'
        elif re.search(xzcf, x1):
            return re.search(xzcf, x1).group(0), '行政处罚'
        elif re.search(jdjc, x1):
            return re.search(jdjc, x1).group(0), '监督检查'
        elif re.search(blxw, x1):
            return re.search(blxw, x1).group(0), '不良行为'
        elif re.search(other, x1):
            return re.search(other, x1).group(0), '其他不良行为'

        elif re.search(tscl_c, x2):
            return re.search(tscl_c, x2).group(0), '投诉处理'
        elif re.search(xzcf_c, x2):
            return re.search(xzcf_c, x2).group(0), '行政处罚'
        elif re.search(cxjf_c, x2):
            return re.search(cxjf_c, x2).group(0), '诚信加分'

        elif re.search(blxw_c, x2):
            return re.search(blxw_c, x2).group(0), '不良行为'
        elif re.search(other_c, x2):
            return re.search(other_c, x2).group(0), '其他不良行为'

        return ' ', '未知类别'
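
    # Usage sketch (illustrative title; the category strings are the ones returned above):
    #
    #     keyword, ptype = punish.get_punishType('关于对某代理机构的行政处罚决定', '...')
    #     # -> ('行政处罚', '行政处罚')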

    # punishment decision
    def get_punishDecision(self, x, x2):
        '''Extract the handling decision from the body text by regex,
        searching progressively larger tail windows of the article.
        x: body text
        x2: punishment category
        return: decision string'''
        rule1 = re.compile(
            '(((如下|以下|处理|研究|本机关|我机关|本局|我局)决定)|((决定|处理|处理意见|行政处罚|处罚)(如下|如下))'
            '|((以下|如下)(决定|处理|处理意见|行政处罚|处罚))|处理依据及结果|处理结果|处罚结果|处罚情况|限制行为'
            '|整改意见)[::].{5,}')
        rule2 = re.compile(
            '(((如下|以下|处理|研究|本机关|我机关|本局|我局)决定)|((决定|处理|处罚|处理意见)(如下|如下))'
            '|((以下|如下)(决定|处理|处理意见|处罚))|处理依据及结果|处理结果|处罚结果|处罚情况|限制行为'
            '|处罚内容)[:,,].{10,}')
        rule3 = re.compile('考评结果:?.*')
        rule4 = re.compile('(依据|根据)《.*》.*')
        if x2 == '未知类别':
            return ' '
        # decisions usually sit near the end, so search the last 40%/60%/70% of the text
        elif re.search(rule1, x[-int(len(x)*0.4):]):
            return re.search(rule1, x[-int(len(x)*0.4):]).group(0)
        elif re.search(rule1, x[-int(len(x)*0.6):]):
            return re.search(rule1, x[-int(len(x)*0.6):]).group(0)
        elif re.search(rule2, x[-int(len(x)*0.7):]):
            return re.search(rule2, x[-int(len(x)*0.7):]).group(0)
        elif re.search(rule3, x[-int(len(x)*0.6):]):
            return re.search(rule3, x[-int(len(x)*0.6):]).group(0)
        elif re.search(rule4, x[-int(len(x)*0.4):]):
            return re.search(rule4, x[-int(len(x)*0.4):]).group(0)
        else:
            return ''
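
    # Usage sketch (illustrative; the decision clause must fall inside the tail
    # windows searched above):
    #
    #     d = punish.get_punishDecision('……' * 10 + '经研究,本机关决定:给予警告并罚款5000元。', '行政处罚')
    #     # -> '本机关决定:给予警告并罚款5000元。'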

    # whether the complaint is upheld
    def get_punishWhether(self, x1, x2, x3):
        '''Decide by regex over the handling decision whether the complaint is upheld.
        x1: decision string
        x2: body text
        x3: punishment category
        return: '投诉成立' (upheld) / '投诉无效' (rejected) / empty string'''
        p1 = re.compile('(投诉|投拆|质疑|举报)(事项|内容|事实)?[^不,。]{,10}(成立|属实|予以受理|予以支持)|责令|废标|(中标|成交)[^,。]{,10}无效'
                        '|取消[^,。]{,60}资格|罚款|重新(组织|开展)?(招标|采购)|投诉成立|被投诉人存在违法违规行为'
                        '|采购活动违法|(中标|评标|成交)结果无效')
        p2 = re.compile('投诉不予[处受]理|((投诉|投拆|质疑|举报)(事项|内容|事实)?[^,。]{,10}(不成立|情?况?不属实|不予支持|缺乏事实依据))'
                        '|((驳回|撤回|撤销|终止)[^,。]*(投诉|质疑|诉求))|终止[^,。]{,20}(行政裁决|投诉处理|采购活动)|投诉终止|投诉无效'
                        '|予以驳回|不予受理|继续开展采购|被投诉人不存在违法违规行为|中标结果有效|投诉[^,。]{,10}不成立'
                        '|维持被投诉人|不支持[^,。]{,20}投诉|无确凿证据')
        if x3 != '投诉处理':
            return ''
        elif re.search(p1, x1):
            return '投诉成立'
        elif re.search(p2, x1):
            return '投诉无效'
        elif re.search(p1, x2):
            return '投诉成立'
        elif re.search(p2, x2):
            return '投诉无效'
        return ''
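
    # Usage sketch (decision text is illustrative):
    #
    #     w = punish.get_punishWhether('投诉事项成立,责令重新开展采购活动', '...', '投诉处理')
    #     # -> '投诉成立'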

    # enforcement agency and punishment time
    def get_institution(self, title, sentences_l, entity_l):
        '''
        Decide from the text preceding an entity whether it is the enforcement agency.
        :param title: article title
        :param sentences_l: sentence list of one announcement
        :param entity_l: entity list of one announcement
        :return: enforcement agencies and punishment times, multiple values joined by ";"
        '''
        institutions = []
        punishTimes = []
        institution_1 = re.compile("(?:处罚执行部门|认定部门|执法机关名称|执法单位|通报部门|处罚机关|处罚部门)[::]")
        punishTimes_1 = re.compile("(?:处罚日期|限制行为开始时间|曝光开始日期|处罚决定日期|处罚期限|处罚时间|处理日期|公告开始时间)[::]")
        # use the keywords before an entity to decide whether it is an agency or a punishment time
        for ner in entity_l:
            if ner.entity_type == 'org':
                left = sentences_l[ner.sentence_index].sentence_text[
                       max(0, ner.wordOffset_begin - 15):ner.wordOffset_begin]
                if institution_1.search(left):
                    institutions.append(ner)
                elif institutions != [] and ner.sentence_index == institutions[-1].sentence_index and \
                        ner.wordOffset_begin - institutions[-1].wordOffset_end < 2 and \
                        sentences_l[ner.sentence_index].sentence_text[
                        institutions[-1].wordOffset_end:ner.wordOffset_begin] \
                        in ['', '、', '和', '及']:
                    # an adjacent org separated only by a list delimiter joins the agency list
                    institutions.append(ner)
            elif ner.entity_type == 'time':
                left = sentences_l[ner.sentence_index].sentence_text[
                       max(0, ner.wordOffset_begin - 15):ner.wordOffset_begin]
                if punishTimes_1.search(left):
                    punishTimes.append(ner)

        institution_title = re.compile("财政局|财政厅|监督管理局|公管局|公共资源局|委员会")
        institution_time = re.compile(
            "(^,?[\d一二三四五六七八九十]{4},?[/年-][\d一二三四五六七八九十]{1,2},?[/月-][\d一二三四五六七八九十]{1,2},?[/日-]?)")
        ins = ""
        ptime = ""
        # if no agency was found above, look for entities in the title and check them against the keywords
        if institutions == [] and len(title) > 10:
            title_ners = getNers([title], useselffool=True)
            if title_ners[0]:
                for title_ner in title_ners[0]:
                    if title_ner[2] == 'org' and institution_title.search(title_ner[3]):
                        ins = title_ner[3]
                        break
        if punishTimes == [] or institutions == []:
            # if elements are still missing, check whether a date follows one of the last
            # org entities; if so, take that entity and date as agency and punishment time
            for ner in [ner for ner in entity_l if ner.entity_type == 'org'][-5:][::-1]:
                right = sentences_l[ner.sentence_index].sentence_text[ner.wordOffset_end:ner.wordOffset_end + 16]
                if institution_time.search(right):
                    if ins == '':
                        ins = ner.entity_text
                    if ptime == '':
                        ptime = institution_time.search(right).group(1)
                    break
            # as a last resort, take the final time entity as the punishment time
            # if it sits at the very end of the article
            if ptime == '':
                n_time = [ner for ner in entity_l if ner.entity_type == 'time']
                if len(n_time) != 0:
                    ner = n_time[-1]
                    if ner.sentence_index == len(sentences_l) - 1:
                        textLong = len(sentences_l[ner.sentence_index].sentence_text)
                        if ner.wordOffset_end > textLong - 3 and len(ner.entity_text) > 3:
                            ptime = ner.entity_text
        institutions = [ner.entity_text for ner in institutions]
        punishTimes = [ner.entity_text for ner in punishTimes]
        if institutions == [] and ins != "":
            institutions.append(ins)
        if punishTimes == [] and ptime != "":
            punishTimes.append(ptime)
        return ";".join(institutions), ";".join(punishTimes)

    # complainant, respondent, punished party
    def get_complainant(self, punishType, sentences_l, entity_l):
        '''
        Find complainants and respondents/punished parties by regex over the
        announcement category, sentence list and entity list.
        :param punishType: announcement punishment category
        :param sentences_l: sentence list of one announcement
        :param entity_l: entity list of one announcement
        :return: complainants, respondents/punished parties
        '''
        complainants = []  # complainants
        punishPeople = []  # respondents / punished parties
        size = 16
        # complainant / challenger
        complainants_rule1 = re.compile(
            "(?:[^被]|^)(?:投[诉拆][人方]|质疑[人方]|质疑供应商|质疑单位|疑问[人方]|检举[人方]|举报[人方])[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
        # punished party / respondent
        punishPeople_rule1 = re.compile(
            "(被投[诉拆][人方]|被检举[人方]|被举报[人方]|被处罚人|被处罚单位|行政相对人|单位名称|不良行为单位或个人|被查单位|处罚主题|企业|主体|违规对象|违规单位|当事人)[\d一二三四五六七八九十]?(\(.+?\))?(:?,?名称[\d一二三四五六七八九十]?)?(?:[::,]+.{0,3}$|$)")
        punishPeople_rule2_1 = re.compile(",$")
        punishPeople_rule2_2 = re.compile("^[::]")
        punishPeople_rule3_1 = re.compile("(?:关于|对)[^,。]*$")
        punishPeople_rule3_2 = re.compile("^[^,。]*(?:通报|处罚|披露|处理|信用奖惩|不良行为|不良记录)")

        punish_l = []  # grouped candidate entities
        tmp = []
        for ner in [ner for ner in entity_l if ner.entity_type in ['org', 'company', 'person']]:
            if tmp == []:
                tmp.append(ner)
            elif ner.entity_type == tmp[-1].entity_type and ner.sentence_index == tmp[-1].sentence_index and \
                    ner.wordOffset_begin - tmp[-1].wordOffset_end < 2 \
                    and sentences_l[ner.sentence_index].sentence_text[tmp[-1].wordOffset_end:ner.wordOffset_begin] in ['', '、', '和', '及']:
                # a same-type entity directly adjacent or separated by a list delimiter
                tmp.append(ner)
            elif ner.entity_type in ['org', 'company'] and tmp[-1].entity_type in ['org', 'company'] and \
                    ner.sentence_index == tmp[-1].sentence_index and ner.wordOffset_begin - tmp[-1].wordOffset_end < 2 \
                    and sentences_l[ner.sentence_index].sentence_text[tmp[-1].wordOffset_end:ner.wordOffset_begin] in ['', '、', '和', '及']:
                # org and company entities may also be grouped together
                tmp.append(ner)
            else:
                punish_l.append(tmp)
                tmp = [ner]
        if tmp:
            punish_l.append(tmp)  # keep the final group as well
        for ner_l in punish_l:
            begin_index = ner_l[0].wordOffset_begin
            end_index = ner_l[-1].wordOffset_end
            left = sentences_l[ner_l[0].sentence_index].sentence_text[max(0, begin_index - size):begin_index]
            right = sentences_l[ner_l[0].sentence_index].sentence_text[end_index:end_index + size]
            if complainants_rule1.search(left):
                complainants.append(ner_l)
            elif punishPeople_rule1.search(left):
                punishPeople.append(ner_l)
            elif punishPeople_rule2_1.search(left) and punishPeople_rule2_2.search(right):
                # a bare "...," before and a ":" after reads as a label line
                if punishType == '投诉处理':
                    complainants.append(ner_l)
                else:
                    punishPeople.append(ner_l)
            elif punishPeople_rule3_1.search(left) and punishPeople_rule3_2.search(right):
                punishPeople.append(ner_l)
        complainants = set([it.entity_text for l in complainants for it in l])
        punishPeople = set([it.entity_text for l in punishPeople for it in l])
        return ';'.join(complainants), ';'.join(punishPeople)

    def get_punish_extracts_backup(self, doc_id=' ', title=' ', text=' '):
        list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([[doc_id, text, "", "", ""]],
                                                                                        useselffool=True)
        punish_code = self.predict_punishCode(list_sentences)
        # print('处罚编号: ',punish_code)
        institutions, punishTimes = self.get_institution(title, list_sentences[0], list_entitys[0])
        # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
        keyword, punishType = self.get_punishType(title, text)
        # print('处罚类型:',punishType)
        punishDecision = self.get_punishDecision(text, punishType)
        # print('处罚决定:',punishDecision)
        punishWhether = self.get_punishWhether(punishDecision, text, punishType)
        # print('投诉是否成立:',punishWhether)
        complainants, punishPeople = self.get_complainant(punishType, list_sentences[0], list_entitys[0])
        # print('投诉人:%s 被投诉人:%s'%(complainants, punishPeople))
        punish_dic = {'punish_code': punish_code,
                      'punishType': punishType,
                      'punishDecision': punishDecision,
                      'complainants': complainants,
                      'punishPeople': punishPeople,
                      'punishWhether': punishWhether,
                      'institutions': institutions,
                      'punishTimes': punishTimes}
        return punish_dic
        # return punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes

    def get_punish_extracts(self, list_articles, list_sentences, list_entitys):
        list_result = []
        for article, list_sentence, list_entity in zip(list_articles, list_sentences, list_entitys):
            title = article.title
            text = article.content

            keyword, punishType = self.get_punishType(title, text)
            # print('处罚类型:',punishType)
            punish_code = self.predict_punishCode([list_sentence])  # predict on this article's sentences only
            # print('处罚编号: ',punish_code)
            institutions, punishTimes = self.get_institution(title, list_sentence, list_entity)
            # print('执法机构:',institutions, '\n 处罚时间:', punishTimes)
            punishDecision = self.get_punishDecision(text, punishType)
            # print('处罚决定:',punishDecision)
            punishWhether = self.get_punishWhether(punishDecision, text, punishType)
            # print('投诉是否成立:',punishWhether)
            complainants, punishPeople = self.get_complainant(punishType, list_sentence, list_entity)
            # print('投诉人:%s 被投诉人:%s'%(complainants, punishPeople))
            punish_dic = {'punish_code': punish_code,
                          'punishType': punishType,
                          'punishDecision': punishDecision,
                          'complainants': complainants,
                          'punishPeople': punishPeople,
                          'punishWhether': punishWhether,
                          'institutions': institutions,
                          'punishTimes': punishTimes}
            # keep the extraction only if at least two fields are filled and the type is known
            _count = 0
            for k, v in punish_dic.items():
                if v != "":
                    _count += 1
            if _count >= 2 and punish_dic["punishType"] != "未知类别":
                list_result.append({"punish": punish_dic})
            else:
                list_result.append({"punish": {}})
        return list_result
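
# End-to-end usage sketch (illustrative; the Preprocessing call mirrors
# get_punish_extracts_backup above and the commented examples under __main__):
#
#     punish = Punish_Extract()
#     list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(
#         [['doc_id', '公告正文...', "", "", ""]], useselffool=True)
#     results = punish.get_punish_extracts(list_articles, list_sentences, list_entitys)
#     # -> [{'punish': {...}}] or [{'punish': {}}] if too few fields were extracted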

def save_punish_code_model():
    # freeze the trained checkpoint into a single .pb graph for serving
    model_folder = os.path.dirname(__file__) + "/models/21-0.9990081295021194-0.3647936"
    output_graph = os.path.dirname(__file__) + "/models/punish_code.pb"
    ckpt = tf.train.get_checkpoint_state(model_folder)
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        input_checkpoint = ckpt.model_checkpoint_path
        saver = tf.train.import_meta_graph(input_checkpoint + ".meta", clear_devices=True)
        graph = tf.get_default_graph()
        input_graph_def = graph.as_graph_def()
        with tf.Session() as sess:
            saver.restore(sess, input_checkpoint)
            output_graph_def = graph_util.convert_variables_to_constants(
                sess=sess,
                input_graph_def=input_graph_def,
                output_node_names=["char_input", "length", "crf_loss/transitons", "CRF/output/logits"]
            )
            with tf.gfile.GFile(output_graph, "wb") as f:
                f.write(output_graph_def.SerializeToString())
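
# Loading the frozen graph back (a sketch, assuming the node names exported above):
#
#     with tf.gfile.GFile(os.path.dirname(__file__) + "/models/punish_code.pb", "rb") as f:
#         graph_def = tf.GraphDef()
#         graph_def.ParseFromString(f.read())
#     with tf.Graph().as_default() as g:
#         tf.import_graph_def(graph_def, name="")
#         char_input = g.get_tensor_by_name("char_input:0")
#         length = g.get_tensor_by_name("length:0")
#         logits = g.get_tensor_by_name("CRF/output/logits:0")
#         trans = g.get_tensor_by_name("crf_loss/transitons:0")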


if __name__ == "__main__":
    save_punish_code_model()
    # punish = Punish_Extract(model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt")
    #
    # import pandas as pd
    # # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
    # df = pd.read_excel('G:/失信数据/ALLDATA_re2-3.xlsx', index=0)[2:10]
    # # i = 89
    # # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
    # # i = 92
    # # predict('2', df.loc[i, 'PAGE_TITLE'],df.loc[i, 'PAGE_CONTENT'])
    #
    # # t1 = time.time()
    # # for i in df.index:
    # #     punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
    # #         get_punish_extracts(i, df.loc[i, 'PAGE_TITLE'], df.loc[i, 'PAGE_CONTENT'])
    # #     df.loc[i, '投诉人'] = complainants
    # #     df.loc[i, '被投诉人'] = punishPeople
    # #     df.loc[i, '执法机构'] = institutions
    # #     df.loc[i, '处罚时间'] = punishTimes
    # #     df.loc[i, '处罚编号'] = punish_code
    # #     print('完成第%d篇'%i)
    # # t2 = time.time()
    # # df.to_excel('G:/失信数据/ALLDATA_re2-4.xlsx', encoding='utf-8',columns=['PAGE_TITLE', 'PAGE_CONTENT',
    # #             '关键词', '类别', '处理决定', '投诉是否成立', '投诉人', '被投诉人', '执法机构', '处罚时间', '处罚编号',
    # #             'DETAILLINK', 'sentences', 'PAGE_TIME'])
    # # t3 = time.time()
    # # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
    # s = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
    # # list_sentences = [s.split('。')]
    # # punish_code = punish.predict_punishCode(list_sentences)
    # # print(punish_code)
    #
    # # punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
    # #     get_punish_extracts(text=s)
    # punish_dic = punish.get_punish_extracts_backup(text=s)
    # print(punish_dic)