#! -*- coding:utf-8 -*-
import os, sys
# parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# sys.path.insert(0,parentdir)
# import json
import re
import numpy as np
import tensorflow as tf  # used by OurBidirectional.reverse_sequence and save_model
# from random import choice
# from tqdm import tqdm
from BiddingKG.dl.common.models import *
from itertools import groupby


def seq_padding(X, padding=0):
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
    ])


from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.callbacks import Callback
from keras.optimizers import Adam


def seq_gather(x):
    """seq has shape [None, seq_len, s_size] and idxs has shape [None, 1];
    for the i-th sequence in seq, pick the idxs[i]-th vector,
    producing an output of shape [None, s_size].
    """
    seq, idxs = x
    idxs = K.cast(idxs, 'int32')
    batch_idxs = K.arange(0, K.shape(seq)[0])
    batch_idxs = K.expand_dims(batch_idxs, 1)
    idxs = K.concatenate([batch_idxs, idxs], 1)
    return K.tf.gather_nd(seq, idxs)


def seq_maxpool(x):
    """seq has shape [None, seq_len, s_size] and mask has shape [None, seq_len, 1];
    mask out the padded positions first, then max-pool over the time axis.
    """
    seq, mask = x
    seq -= (1 - mask) * 1e10
    return K.max(seq, 1, keepdims=True)
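# Added usage sketch (illustrative only, not part of the original module): seq_padding
# right-pads a ragged batch to the batch maximum length; seq_gather/seq_maxpool above
# operate on [batch, seq_len, dim] tensors inside Lambda layers.
def _seq_padding_example():
    batch = [[1, 2], [3, 4, 5]]
    padded = seq_padding(batch)   # -> array([[1, 2, 0], [3, 4, 5]])
    assert padded.shape == (2, 3)
    return padded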
def dilated_gated_conv1d(seq, mask, dilation_rate=1):
    """Dilated gated convolution (residual form)."""
    dim = K.int_shape(seq)[-1]
    h = Conv1D(dim * 2, 3, padding='same', dilation_rate=dilation_rate)(seq)

    def _gate(x):
        dropout_rate = 0.2
        s, h = x
        g, h = h[:, :, :dim], h[:, :, dim:]
        g = K.in_train_phase(K.dropout(g, dropout_rate), g)
        g = K.sigmoid(g)
        return g * s + (1 - g) * h

    seq = Lambda(_gate)([seq, h])
    seq = Lambda(lambda x: x[0] * x[1])([seq, mask])
    return seq


class OurLayer(Layer):
    """Custom base Layer that adds a `reuse` method, so existing layers can be
    called while defining a new Layer.
    """
    def reuse(self, layer, *args, **kwargs):
        if not layer.built:
            if len(args) > 0:
                inputs = args[0]
            else:
                inputs = kwargs['inputs']
            if isinstance(inputs, list):
                input_shape = [K.int_shape(x) for x in inputs]
            else:
                input_shape = K.int_shape(inputs)
            layer.build(input_shape)
        outputs = layer.call(*args, **kwargs)
        for w in layer.trainable_weights:
            if w not in self._trainable_weights:
                self._trainable_weights.append(w)
        for w in layer.non_trainable_weights:
            if w not in self._non_trainable_weights:
                self._non_trainable_weights.append(w)
        for u in layer.updates:
            if not hasattr(self, '_updates'):
                self._updates = []
            if u not in self._updates:
                self._updates.append(u)
        return outputs


class OurBidirectional(OurLayer):
    """Hand-rolled bidirectional RNN wrapper that accepts a mask, so the reversed
    sequence stays aligned with the real (unpadded) lengths.
    """
    def __init__(self, layer, **args):
        super(OurBidirectional, self).__init__(**args)
        self.forward_layer = layer.__class__.from_config(layer.get_config())
        self.backward_layer = layer.__class__.from_config(layer.get_config())
        self.forward_layer.name = 'forward_' + self.forward_layer.name
        self.backward_layer.name = 'backward_' + self.backward_layer.name

    def reverse_sequence(self, x, mask):
        """Here mask.shape is [batch_size, seq_len, 1]."""
        seq_len = K.round(K.sum(mask, 1)[:, 0])
        seq_len = K.cast(seq_len, 'int32')
        return tf.reverse_sequence(x, seq_len, seq_dim=1)

    def call(self, inputs):
        x, mask = inputs
        x_forward = self.reuse(self.forward_layer, x)
        x_backward = self.reverse_sequence(x, mask)
        x_backward = self.reuse(self.backward_layer, x_backward)
        x_backward = self.reverse_sequence(x_backward, mask)
        x = K.concatenate([x_forward, x_backward], -1)
        if K.ndim(x) == 3:
            return x * mask
        else:
            return x

    def compute_output_shape(self, input_shape):
        return input_shape[0][:-1] + (self.forward_layer.units * 2,)


class Attention(Layer):
    """Multi-head attention."""
    def __init__(self, nb_head, size_per_head, **kwargs):
        self.nb_head = nb_head
        self.size_per_head = size_per_head
        self.out_dim = nb_head * size_per_head
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        super(Attention, self).build(input_shape)
        q_in_dim = input_shape[0][-1]
        k_in_dim = input_shape[1][-1]
        v_in_dim = input_shape[2][-1]
        self.q_kernel = self.add_weight(name='q_kernel',
                                        shape=(q_in_dim, self.out_dim),
                                        initializer='glorot_normal')
        self.k_kernel = self.add_weight(name='k_kernel',
                                        shape=(k_in_dim, self.out_dim),
                                        initializer='glorot_normal')
        self.v_kernel = self.add_weight(name='w_kernel',
                                        shape=(v_in_dim, self.out_dim),
                                        initializer='glorot_normal')

    def mask(self, x, mask, mode='mul'):
        if mask is None:
            return x
        else:
            for _ in range(K.ndim(x) - K.ndim(mask)):
                mask = K.expand_dims(mask, K.ndim(mask))
            if mode == 'mul':
                return x * mask
            else:
                return x - (1 - mask) * 1e10

    def call(self, inputs):
        q, k, v = inputs[:3]
        v_mask, q_mask = None, None
        if len(inputs) > 3:
            v_mask = inputs[3]
            if len(inputs) > 4:
                q_mask = inputs[4]
        # linear projections
        qw = K.dot(q, self.q_kernel)
        kw = K.dot(k, self.k_kernel)
        vw = K.dot(v, self.v_kernel)
        # reshape to [batch, seq_len, heads, size_per_head]
        qw = K.reshape(qw, (-1, K.shape(qw)[1], self.nb_head, self.size_per_head))
        kw = K.reshape(kw, (-1, K.shape(kw)[1], self.nb_head, self.size_per_head))
        vw = K.reshape(vw, (-1, K.shape(vw)[1], self.nb_head, self.size_per_head))
        # transpose to [batch, heads, seq_len, size_per_head]
        qw = K.permute_dimensions(qw, (0, 2, 1, 3))
        kw = K.permute_dimensions(kw, (0, 2, 1, 3))
        vw = K.permute_dimensions(vw, (0, 2, 1, 3))
        # attention
        a = K.batch_dot(qw, kw, [3, 3]) / self.size_per_head ** 0.5
        a = K.permute_dimensions(a, (0, 3, 2, 1))
        a = self.mask(a, v_mask, 'add')
        a = K.permute_dimensions(a, (0, 3, 2, 1))
        a = K.softmax(a)
        # combine values and project back to [batch, seq_len, out_dim]
        o = K.batch_dot(a, vw, [3, 2])
        o = K.permute_dimensions(o, (0, 2, 1, 3))
        o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
        o = self.mask(o, q_mask, 'mul')
        return o

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], self.out_dim)


def position_id(x):
    if isinstance(x, list) and len(x) == 2:
        x, r = x
    else:
        r = 0
    pid = K.arange(K.shape(x)[1])
    pid = K.expand_dims(pid, 0)
    pid = K.tile(pid, [K.shape(x)[0], 1])
    return K.abs(pid - K.cast(r, 'int32'))


add_dict = load(os.path.dirname(__file__) + '/../relation_extraction/add_words_dict.pkl')
add_words = ['','','','','']
model_w2v = getModel_w2v()


def get_words_matrix(words):
    if words in add_words:
        return add_dict[words]
    else:
        item_not_space = re.sub(r"\s*", "", words)
        if item_not_space in model_w2v.vocab:
            return model_w2v[item_not_space]
        else:
            return add_dict['']


entity_type_dict = {
    'org': '',
    'company': '',
    'location': '',
    'phone': '',
    'person': ''
}


class Relation_extraction():
    def __init__(self, is_train=False):
        self.is_train = is_train
        # self.words_vocab = load(os.path.dirname(__file__)+'/../relation_extraction/words_vocab.pkl')
        # id2word = {i: j for i, j in enumerate(self.words_vocab)}
        # self.words2id = {j: i for i, j in id2word.items()}
        self.words_size = 128
        self.id2predicate = {
            0: "rel_person",   # company — contact person
            1: "rel_phone",    # contact person — phone
            2: "rel_address"   # company — address
        }
        self.predicate2id = dict({j: i for i, j in self.id2predicate.items()})
        self.num_classes = len(self.id2predicate)
        self.maxlen = 512
        # self.word2vec = None
        # if self.is_train:
        #     self.word2vec = load('words2v_matrix.pkl')
        self.model_path = os.path.dirname(__file__) + '/../relation_extraction/models/my_best_model_oneoutput.weights'
        self.get_model()
        if self.model_path:
            self.train_model.load_weights(self.model_path)
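    # Added descriptive note (not in the original source): get_model() below builds three
    # Keras models that share one encoder:
    #   * subject_model([t2, t3])    -> per-token probability that the token is a subject
    #   * object_model([t2, t3, k1]) -> per-token, per-relation probability of being the
    #                                   object of the subject located at position k1
    #   * train_model                -> both heads plus the mask-weighted binary
    #                                   cross-entropy losses used for training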
    def get_model(self):
        words_size = self.words_size
        t2_in = Input(shape=(None, words_size))        # word vectors
        t3_in = Input(shape=(None,))                   # mask list
        s1_in = Input(shape=(None,))
        k1_in = Input(shape=(1,))
        o1_in = Input(shape=(None, self.num_classes))
        t2, t3, s1, k1, o1 = t2_in, t3_in, s1_in, k1_in, o1_in
        mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(t3)
        pid = Lambda(position_id)(t2)
        position_embedding = Embedding(self.maxlen, words_size, embeddings_initializer='zeros')
        pv = position_embedding(pid)
        # t2 = Embedding(len(self.words2id), words_size, weights=[self.word2vec] if self.is_train else None, trainable=True,name="words_embedding")(t2)
        t = Add()([t2, pv])
        t = Dropout(0.25)(t)
        t = Lambda(lambda x: x[0] * x[1])([t, mask])
        if K.tensorflow_backend._get_available_gpus():
            # on GPU (training), use the masked bidirectional RNN wrapper
            t = OurBidirectional(CuDNNGRU(64, return_sequences=True))([t, mask])
        else:
            # on CPU (prediction), use Keras' built-in bidirectional RNN without the mask
            t = Bidirectional(GRU(64, return_sequences=True, reset_after=True))(t)
        t_dim = K.int_shape(t)[-1]
        pn1 = Dense(words_size, activation='relu')(t)
        pn1 = Dense(1, activation='sigmoid')(pn1)
        h = Attention(8, 16)([t, t, t, mask])
        h = Concatenate()([t, h])
        h = Conv1D(words_size, 3, activation='relu', padding='same')(h)
        ps1 = Dense(1, activation='sigmoid')(h)
        ps1 = Lambda(lambda x: x[0] * x[1])([ps1, pn1])
        self.subject_model = Model([t2_in, t3_in], [ps1])  # model that predicts subjects

        t_max = Lambda(seq_maxpool)([t, mask])
        pc = Dense(words_size, activation='relu')(t_max)
        pc = Dense(self.num_classes, activation='sigmoid')(pc)

        def get_k_inter(x, n=6):
            seq, k1 = x
            # k_inter = [K.round(k1 * a + k2 * (1 - a)) for a in np.arange(n) / (n - 1.)]
            k_inter = [seq_gather([seq, k1])] * 2
            k_inter = [K.expand_dims(k, 1) for k in k_inter]
            k_inter = K.concatenate(k_inter, 1)
            return k_inter

        k = Lambda(get_k_inter, output_shape=(2, t_dim))([t, k1])
        if K.tensorflow_backend._get_available_gpus():
            k = Bidirectional(CuDNNGRU(t_dim))(k)
        else:
            k = Bidirectional(GRU(t_dim, reset_after=True))(k)
        k1v = position_embedding(Lambda(position_id)([t, k1]))
        kv = Concatenate()([k1v, k1v])
        k = Lambda(lambda x: K.expand_dims(x[0], 1) + x[1])([k, kv])
        h = Attention(8, 16)([t, t, t, mask])
        h = Concatenate()([t, h, k])
        h = Conv1D(words_size, 3, activation='relu', padding='same')(h)
        po = Dense(1, activation='sigmoid')(h)
        po1 = Dense(self.num_classes, activation='sigmoid')(h)
        po1 = Lambda(lambda x: x[0] * x[1] * x[2] * x[3])([po, po1, pc, pn1])
        self.object_model = Model([t2_in, t3_in, k1_in], [po1])

        train_model = Model([t2_in, t3_in, s1_in, k1_in, o1_in], [ps1, po1])
        # loss
        s1 = K.expand_dims(s1, 2)
        s1_loss = K.binary_crossentropy(s1, ps1)
        s1_loss = K.sum(s1_loss * mask) / K.sum(mask)
        o1_loss = K.sum(K.binary_crossentropy(o1, po1), 2, keepdims=True)
        o1_loss = K.sum(o1_loss * mask) / K.sum(mask)
        loss = s1_loss + o1_loss
        train_model.add_loss(loss)
        train_model.compile(optimizer=Adam(1e-3))
        # train_model.summary()
        self.train_model = train_model
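    # Added descriptive note (not in the original source): extract_items() decodes in two
    # stages. First, subject_model scores every token and positions whose score exceeds
    # `rate` become candidate subjects. The inputs are then repeated once per candidate,
    # and object_model is run with the candidate position k1; every (token, relation)
    # cell scoring above 0.5 yields one (subject, relation, object) triple.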
    def extract_items(self, text_in, words, rate=0.5):
        text_words = text_in
        R = []
        # _t2 = [self.words2id.get(c, 1) for c in words]
        _t2 = np.zeros((len(words), self.words_size))
        for i in range(len(words)):
            _t2[i] = np.array(get_words_matrix(words[i]))
        _t2 = np.array([_t2])
        _t3 = [1 for _ in words]
        _t3 = np.array([_t3])
        _k1 = self.subject_model.predict([_t2, _t3])
        _k1 = _k1[0, :, 0]
        _k1 = np.where(_k1 > rate)[0]
        _subjects = []
        for i in _k1:
            _subject = text_in[i]
            _subjects.append((_subject, i, i))
        if _subjects:
            _t2 = np.repeat(_t2, len(_subjects), 0)
            _t3 = np.repeat(_t3, len(_subjects), 0)
            _k1, _ = np.array([_s[1:] for _s in _subjects]).T.reshape((2, -1, 1))
            _o1 = self.object_model.predict([_t2, _t3, _k1])
            for i, _subject in enumerate(_subjects):
                _oo1 = np.where(_o1[i] > 0.5)
                for _ooo1, _c1 in zip(*_oo1):
                    _object = text_in[_ooo1]
                    _predicate = self.id2predicate[_c1]
                    R.append((_subject[0], _predicate, _object))
            return R
        else:
            return []

    def predict(self, text, words):
        res = self.extract_items(text, words)
        return res

    @staticmethod
    def get_predata(entity_list, list_sentence):
        list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
        entity_list = sorted(entity_list, key=lambda x: (x.sentence_index, x.begin_index))
        pre_data = []
        text_data = []
        last_sentence_index = -1
        for key, group in groupby(entity_list, key=lambda x: x.sentence_index):
            if key - last_sentence_index > 1:
                # pass through the sentences that contain no entities
                for i in range(last_sentence_index + 1, key):
                    pre_data.extend(list_sentence[i].tokens)
                    text_data.extend([0] * len(list_sentence[i].tokens))
            group = list(group)
            for i in range(len(group)):
                ent = group[i]
                _tokens = list_sentence[key].tokens
                if i == len(group) - 1:
                    if i == 0:
                        pre_data.extend(_tokens[:ent.begin_index])
                        text_data.extend([0] * len(_tokens[:ent.begin_index]))
                        pre_data.append(entity_type_dict[ent.entity_type])
                        text_data.append(ent)
                        pre_data.extend(_tokens[ent.end_index + 1:])
                        text_data.extend([0] * len(_tokens[ent.end_index + 1:]))
                        break
                    else:
                        pre_data.append(entity_type_dict[ent.entity_type])
                        text_data.append(ent)
                        pre_data.extend(_tokens[ent.end_index + 1:])
                        text_data.extend([0] * len(_tokens[ent.end_index + 1:]))
                        break
                if i == 0:
                    pre_data.extend(_tokens[:ent.begin_index])
                    text_data.extend([0] * len(_tokens[:ent.begin_index]))
                    pre_data.append(entity_type_dict[ent.entity_type])
                    text_data.append(ent)
                    pre_data.extend(_tokens[ent.end_index + 1:group[i + 1].begin_index])
                    text_data.extend([0] * len(_tokens[ent.end_index + 1:group[i + 1].begin_index]))
                else:
                    pre_data.append(entity_type_dict[ent.entity_type])
                    text_data.append(ent)
                    pre_data.extend(_tokens[ent.end_index + 1:group[i + 1].begin_index])
                    text_data.extend([0] * len(_tokens[ent.end_index + 1:group[i + 1].begin_index]))
            last_sentence_index = key
        return text_data, pre_data


def save_model():
    graph = tf.Graph()
    with graph.as_default() as graph:
        with tf.Session(graph=graph).as_default() as sess:
            test_model = Relation_extraction()
            tf.saved_model.simple_save(sess,
                                       "models2/object_model/",
                                       inputs={"input0": test_model.object_model.input[0],
                                               "input1": test_model.object_model.input[1],
                                               "input2": test_model.object_model.input[2]},
                                       outputs={"outputs": test_model.object_model.output})
            tf.saved_model.simple_save(sess,
                                       "models2/subject_model/",
                                       inputs={"input0": test_model.subject_model.input[0],
                                               "input1": test_model.subject_model.input[1]},
                                       outputs={"outputs": test_model.subject_model.output})
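# Hedged end-to-end sketch (added for illustration; `_relation_extraction_example` is not
# part of the original module). It assumes `entity_list` / `list_sentence` are the Entity
# and Sentence objects used elsewhere in BiddingKG, i.e. objects exposing sentence_index /
# begin_index / end_index / entity_type and tokens respectively.
def _relation_extraction_example(entity_list, list_sentence):
    extractor = Relation_extraction()
    # text_data aligns one slot per token: 0 for plain tokens, the Entity object where an
    # entity was replaced by its type placeholder; pre_data holds the word strings.
    text_data, pre_data = Relation_extraction.get_predata(entity_list, list_sentence)
    # Returns a list of (subject, relation, object) triples, with relation in
    # {"rel_person", "rel_phone", "rel_address"}.
    return extractor.predict(text_data, pre_data)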
"索引||号||:||014583788||/||2018-00038||,||成文||日期||:||2018-11-19||,||关于||国家税务总局都昌县税务局||办公楼||七||楼||会议室||维修||改造||项目||综合||比价||成交||公告||,||关于||国家税务总局都昌县税务局||办公楼七楼会议室||维修||改造||项目||(||比价||编号||:||JXXL2018-JJ-DC001||)||综合||比价||成交||公告||,||江西新立建设管理有限公司九江分公司||受||国家税务总局都昌县税务局||委托||,||就||其||办公楼||七||楼||会议室||维修||改造||项目||(||控制||价||:||294788.86||元||)||进行||综合||比价||方式||,||比价||活动||于||2018年||11月||16日||15:30||在||都昌县万里大道和平宾馆旁三楼||江西新立建设管理有限公司九江分公司||进行||,||经||比价||小组||评审||,||比价人||确定||,||现||将||比价||结果||公式||如下||:||序号||:||1||,||比价||编号||,||JXXL2018-JJ-DC001||,||项目||内容||名称||,||都昌县税务局||办公楼||七||楼||会议室||维修||改造||项目||,||数量||:||1||,||成交||供应商||名称||,||江西芙蓉建筑工程有限公司||,||成交价||(||元||)||,||284687.67||。||一||、||比价||小组||成员||:||杨忠辉||李燕杨瑾||,||本||公告||自||发布||之||日||起||1||个||工作日||内||若||无||异议||,||将||向||中标人||发出||《||成交||通知书||》||,||二||、||联系||方式||,||单位||:||国家税务总局都昌县税务局||,||比价||代理||机构||:||江西新立建设管理有限公司九江分公司||,||联系人||:||詹女士||,||电话||:||15979976088||,||江西新立建设管理有限公司九江分公司" words = "索引||号||:||014583788||/||2018-00038||,||成文||日期||:||2018-11-19||,||关于||国家税务总局都昌县税务局||" \ "办公楼||七||楼||会议室||维修||改造||项目||综合||比价||成交||公告||,||关于||国家税务总局都昌县税务局||办公楼七楼会议室||" \ "维修||改造||项目||(||比价||编号||:||JXXL2018-JJ-DC001||)||综合||比价||成交||公告||,||||" \ "受||国家税务总局都昌县税务局||委托||,||就||其||办公楼||七||楼||会议室||维修||改造||项目||(||控制||价||:||294788.86||元||)||" \ "进行||综合||比价||方式||,||比价||活动||于||2018年||11月||16日||15:30||在||都昌县万里大道和平宾馆旁三楼||||" \ "进行||,||经||比价||小组||评审||,||比价人||确定||,||现||将||比价||结果||公式||如下||:||序号||:||1||,||比价||编号||," \ "||JXXL2018-JJ-DC001||,||项目||内容||名称||,||都昌县税务局||办公楼||七||楼||会议室||维修||改造||项目||,||数量||:||1||,||成交||" \ "供应商||名称||,||||,||成交价||(||元||)||,||284687.67||。||一||、||比价||小组||成员||:||杨忠辉||李燕杨瑾||," \ "||本||公告||自||发布||之||日||起||1||个||工作日||内||若||无||异议||,||将||向||中标人||发出||《||成交||通知书||》||,||二||、||联系||方式||," \ "||单位||:||||,||比价||代理||机构||:||||,||联系人||:||||,||电话||:||||,||江西新立建设管理有限公司九江分公司" # text_in = "索引" # words = "索引" # res = test_model.predict(text_in.split("||"),words.split("||")) # print(res) # print(test_model.predict(text_in.split("||"),words.split("||")))