- # -*- coding: utf-8 -*-
- import os,sys
- # parentdir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
- # sys.path.insert(0,parentdir)
- # import json
- import numpy as np
- # from random import choice
- # from tqdm import tqdm
- from BiddingKG.dl.common.models import *
- from itertools import groupby
- import re
- import tensorflow as tf
- from keras.layers import *
- from keras.models import Model
- import keras.backend as K
- from keras.callbacks import Callback
- from keras.optimizers import Adam
- def seq_padding(X, padding=0):
- """Right-pad every sequence in X with `padding` so that all sequences share the batch's maximum length."""
- L = [len(x) for x in X]
- ML = max(L)
- return np.array([
- np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
- ])
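- # Hedged example: seq_padding([[1, 2, 3], [4]]) -> array([[1, 2, 3], [4, 0, 0]]);
- # shorter sequences are right-padded with `padding` up to the longest one in the batch.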
- def seq_gather(x):
- """seq是[None, seq_len, s_size]的格式,
- idxs是[None, 1]的格式,在seq的第i个序列中选出第idxs[i]个向量,
- 最终输出[None, s_size]的向量。
- """
- seq, idxs = x
- idxs = K.cast(idxs, 'int32')
- batch_idxs = K.arange(0, K.shape(seq)[0])
- batch_idxs = K.expand_dims(batch_idxs, 1)
- idxs = K.concatenate([batch_idxs, idxs], 1)
- return K.tf.gather_nd(seq, idxs)
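- # Hedged example: for seq of shape (batch, seq_len, s_size) and idxs == [[2], [0]],
- # gather_nd receives the indices [[0, 2], [1, 0]] and returns seq[0, 2] and seq[1, 0],
- # i.e. a (batch, s_size) tensor holding the vector at the chosen position of each sequence.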
- def seq_maxpool(x):
- """seq是[None, seq_len, s_size]的格式,
- mask是[None, seq_len, 1]的格式,先除去mask部分,
- 然后再做maxpooling。
- """
- seq, mask = x
- seq -= (1 - mask) * 1e10
- return K.max(seq, 1, keepdims=True)
- def dilated_gated_conv1d(seq, mask, dilation_rate=1):
- """膨胀门卷积(残差式)
- """
- dim = K.int_shape(seq)[-1]
- h = Conv1D(dim*2, 3, padding='same', dilation_rate=dilation_rate)(seq)
- def _gate(x):
- dropout_rate = 0.2
- s, h = x
- g, h = h[:, :, :dim], h[:, :, dim:]
- g = K.in_train_phase(K.dropout(g, dropout_rate), g)
- g = K.sigmoid(g)
- return g * s + (1 - g) * h
- seq = Lambda(_gate)([seq, h])
- seq = Lambda(lambda x: x[0] * x[1])([seq, mask])
- return seq
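- # The gate above computes output = g * seq + (1 - g) * conv_half, where g is a sigmoid gate
- # (with dropout applied to g only during the training phase); when g is close to 1 the layer
- # passes its input through unchanged, giving a residual-style shortcut.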
- class OurLayer(Layer):
- """定义新的Layer,增加reuse方法,允许在定义Layer时调用现成的层
- """
- def reuse(self, layer, *args, **kwargs):
- if not layer.built:
- if len(args) > 0:
- inputs = args[0]
- else:
- inputs = kwargs['inputs']
- if isinstance(inputs, list):
- input_shape = [K.int_shape(x) for x in inputs]
- else:
- input_shape = K.int_shape(inputs)
- layer.build(input_shape)
- outputs = layer.call(*args, **kwargs)
- for w in layer.trainable_weights:
- if w not in self._trainable_weights:
- self._trainable_weights.append(w)
- for w in layer.non_trainable_weights:
- if w not in self._non_trainable_weights:
- self._non_trainable_weights.append(w)
- for u in layer.updates:
- if not hasattr(self, '_updates'):
- self._updates = []
- if u not in self._updates:
- self._updates.append(u)
- return outputs
- class OurBidirectional(OurLayer):
- """自己封装双向RNN,允许传入mask,保证对齐
- """
- def __init__(self, layer, **args):
- super(OurBidirectional, self).__init__(**args)
- self.forward_layer = layer.__class__.from_config(layer.get_config())
- self.backward_layer = layer.__class__.from_config(layer.get_config())
- self.forward_layer.name = 'forward_' + self.forward_layer.name
- self.backward_layer.name = 'backward_' + self.backward_layer.name
- def reverse_sequence(self, x, mask):
- """这里的mask.shape是[batch_size, seq_len, 1]
- """
- seq_len = K.round(K.sum(mask, 1)[:, 0])
- seq_len = K.cast(seq_len, 'int32')
- return tf.reverse_sequence(x, seq_len, seq_dim=1)
- def call(self, inputs):
- x, mask = inputs
- x_forward = self.reuse(self.forward_layer, x)
- x_backward = self.reverse_sequence(x, mask)
- x_backward = self.reuse(self.backward_layer, x_backward)
- x_backward = self.reverse_sequence(x_backward, mask)
- x = K.concatenate([x_forward, x_backward], -1)
- if K.ndim(x) == 3:
- return x * mask
- else:
- return x
- def compute_output_shape(self, input_shape):
- return input_shape[0][:-1] + (self.forward_layer.units * 2,)
- class Attention(Layer):
- """多头注意力机制
- """
- def __init__(self, nb_head, size_per_head, **kwargs):
- self.nb_head = nb_head
- self.size_per_head = size_per_head
- self.out_dim = nb_head * size_per_head
- super(Attention, self).__init__(**kwargs)
- def build(self, input_shape):
- super(Attention, self).build(input_shape)
- q_in_dim = input_shape[0][-1]
- k_in_dim = input_shape[1][-1]
- v_in_dim = input_shape[2][-1]
- self.q_kernel = self.add_weight(name='q_kernel',
- shape=(q_in_dim, self.out_dim),
- initializer='glorot_normal')
- self.k_kernel = self.add_weight(name='k_kernel',
- shape=(k_in_dim, self.out_dim),
- initializer='glorot_normal')
- self.v_kernel = self.add_weight(name='v_kernel',
- shape=(v_in_dim, self.out_dim),
- initializer='glorot_normal')
- def mask(self, x, mask, mode='mul'):
- if mask is None:
- return x
- else:
- for _ in range(K.ndim(x) - K.ndim(mask)):
- mask = K.expand_dims(mask, K.ndim(mask))
- if mode == 'mul':
- return x * mask
- else:
- return x - (1 - mask) * 1e10
- def call(self, inputs):
- q, k, v = inputs[:3]
- v_mask, q_mask = None, None
- if len(inputs) > 3:
- v_mask = inputs[3]
- if len(inputs) > 4:
- q_mask = inputs[4]
- # Linear projections of q, k, v
- qw = K.dot(q, self.q_kernel)
- kw = K.dot(k, self.k_kernel)
- vw = K.dot(v, self.v_kernel)
- # Reshape to [batch, seq_len, nb_head, size_per_head]
- qw = K.reshape(qw, (-1, K.shape(qw)[1], self.nb_head, self.size_per_head))
- kw = K.reshape(kw, (-1, K.shape(kw)[1], self.nb_head, self.size_per_head))
- vw = K.reshape(vw, (-1, K.shape(vw)[1], self.nb_head, self.size_per_head))
- # Transpose to [batch, nb_head, seq_len, size_per_head]
- qw = K.permute_dimensions(qw, (0, 2, 1, 3))
- kw = K.permute_dimensions(kw, (0, 2, 1, 3))
- vw = K.permute_dimensions(vw, (0, 2, 1, 3))
- # Attention
- a = K.batch_dot(qw, kw, [3, 3]) / self.size_per_head**0.5
- a = K.permute_dimensions(a, (0, 3, 2, 1))
- a = self.mask(a, v_mask, 'add')
- a = K.permute_dimensions(a, (0, 3, 2, 1))
- a = K.softmax(a)
- # Combine the heads into the final output
- o = K.batch_dot(a, vw, [3, 2])
- o = K.permute_dimensions(o, (0, 2, 1, 3))
- o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
- o = self.mask(o, q_mask, 'mul')
- return o
- def compute_output_shape(self, input_shape):
- return (input_shape[0][0], input_shape[0][1], self.out_dim)
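- # Usage sketch (as in get_model below): Attention(8, 16)([t, t, t, mask]) runs scaled
- # dot-product self-attention with 8 heads of size 16 and returns a (batch, seq_len, 128)
- # tensor; the 'add' mask pushes attention scores for padded key positions down to -1e10.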
- def position_id(x):
- if isinstance(x, list) and len(x) == 2:
- x, r = x
- else:
- r = 0
- pid = K.arange(K.shape(x)[1])
- pid = K.expand_dims(pid, 0)
- pid = K.tile(pid, [K.shape(x)[0], 1])
- return K.abs(pid - K.cast(r, 'int32'))
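- # Hedged example: for sequences of length 5 and r == 2, position_id returns rows of
- # [2, 1, 0, 1, 2], i.e. each position's absolute distance from index r (with r == 0 it is
- # simply the position index, as used for the position embedding in get_model).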
- add_dict = load(os.path.dirname(__file__)+'/../relation_extraction/add_words_dict.pkl')
- add_words = ['<unk>','<company/org>','<location>','<phone>','<contact_person>']
- def get_words_matrix(words):
- """Return the embedding vector for a single token: placeholder tokens come from add_dict, ordinary tokens from the word2vec model, and anything unknown falls back to '<unk>'."""
- model_w2v = getModel_w2v()
- if words in add_words:
- return add_dict[words]
- else:
- item_not_space = re.sub(r"\s", "", words)
- if item_not_space in model_w2v.vocab:
- return model_w2v[item_not_space]
- else:
- return add_dict['<unk>']
- entity_type_dict = {
- 'org': '<company/org>',
- 'company': '<company/org>',
- 'location': '<location>',
- 'phone': '<phone>',
- 'person': '<contact_person>'
- }
- class Relation_extraction():
- """Relation extraction model: subject_model scores every token as a possible subject, and object_model, conditioned on a chosen subject position, scores every token as the object of each relation type."""
- def __init__(self, is_train=False):
- self.is_train = is_train
- # self.words_vocab = load(os.path.dirname(__file__)+'/../relation_extraction/words_vocab.pkl')
- # id2word = {i: j for i, j in enumerate(self.words_vocab)}
- # self.words2id = {j: i for i, j in id2word.items()}
- self.words_size = 128
- self.id2predicate = {
- 0: "rel_person", # company -- contact person
- 1: "rel_phone", # contact person -- phone
- 2: "rel_address" # company -- address
- }
- self.predicate2id = {j: i for i, j in self.id2predicate.items()}
- self.num_classes = len(self.id2predicate)
- self.maxlen = 512
- # self.word2vec = None
- # if self.is_train:
- # self.word2vec = load('words2v_matrix.pkl')
- self.model_path = os.path.dirname(__file__)+'/../relation_extraction/models/my_best_model_oneoutput2.weights'
- self.get_model()
- if self.model_path:
- self.train_model.load_weights(self.model_path)
- def get_model(self):
- """Build the shared encoder and the three Keras models: subject_model, object_model and train_model."""
- words_size = self.words_size
- t2_in = Input(shape=(None, words_size)) # word vectors
- t3_in = Input(shape=(None,)) # mask sequence (1 for real tokens, 0 for padding)
- s1_in = Input(shape=(None,))
- k1_in = Input(shape=(1,))
- o1_in = Input(shape=(None, self.num_classes))
- t2, t3, s1, k1, o1 = t2_in, t3_in, s1_in, k1_in, o1_in
- mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(t3)
- pid = Lambda(position_id)(t2)
- position_embedding = Embedding(self.maxlen, words_size, embeddings_initializer='zeros')
- pv = position_embedding(pid)
- # t2 = Embedding(len(self.words2id), words_size, weights=[self.word2vec] if self.is_train else None, trainable=True,name="words_embedding")(t2)
- t = Add()([t2, pv])
- t = Dropout(0.25)(t)
- t = Lambda(lambda x: x[0] * x[1])([t, mask])
- if K.tensorflow_backend._get_available_gpus():
- # On GPU (training), use the custom mask-aware bidirectional RNN
- t = OurBidirectional(CuDNNGRU(64, return_sequences=True))([t, mask])
- else:
- # On CPU (inference), use Keras's built-in bidirectional RNN without the mask
- t = Bidirectional(GRU(64, return_sequences=True, reset_after=True))(t)
- t_dim = K.int_shape(t)[-1]
- pn1 = Dense(words_size, activation='relu')(t)
- pn1 = Dense(1, activation='sigmoid')(pn1)
- h = Attention(8, 16)([t, t, t, mask])
- h = Concatenate()([t, h])
- h = Conv1D(words_size, 3, activation='relu', padding='same')(h)
- ps1 = Dense(1, activation='sigmoid')(h)
- ps1 = Lambda(lambda x: x[0] * x[1])([ps1, pn1])
- self.subject_model = Model([t2_in, t3_in], [ps1]) # model that predicts subject positions
- t_max = Lambda(seq_maxpool)([t, mask])
- pc = Dense(words_size, activation='relu')(t_max)
- pc = Dense(self.num_classes, activation='sigmoid')(pc)
- def get_k_inter(x, n=6):
- seq, k1 = x
- # k_inter = [K.round(k1 * a + k2 * (1 - a)) for a in np.arange(n) / (n - 1.)]
- k_inter = [seq_gather([seq, k1])] * 2
- k_inter = [K.expand_dims(k, 1) for k in k_inter]
- k_inter = K.concatenate(k_inter, 1)
- return k_inter
- k = Lambda(get_k_inter, output_shape=(2, t_dim))([t, k1])
- if K.tensorflow_backend._get_available_gpus():
- k = Bidirectional(CuDNNGRU(t_dim))(k)
- else:
- k = Bidirectional(GRU(t_dim, reset_after=True))(k)
- k1v = position_embedding(Lambda(position_id)([t, k1]))
- kv = Concatenate()([k1v, k1v])
- k = Lambda(lambda x: K.expand_dims(x[0], 1) + x[1])([k, kv])
- h = Attention(8, 16)([t, t, t, mask])
- h = Concatenate()([t, h, k])
- h = Conv1D(words_size, 3, activation='relu', padding='same')(h)
- po = Dense(1, activation='sigmoid')(h)
- po1 = Dense(self.num_classes, activation='sigmoid')(h)
- po1 = Lambda(lambda x: x[0] * x[1] * x[2] * x[3])([po, po1, pc, pn1])
- self.object_model = Model([t2_in,t3_in,k1_in], [po1])
- train_model = Model([t2_in,t3_in, s1_in, k1_in, o1_in],
- [ps1, po1])
- # loss
- s1 = K.expand_dims(s1, 2)
- s1_loss = K.binary_crossentropy(s1, ps1)
- s1_loss = K.sum(s1_loss * mask) / K.sum(mask)
- o1_loss = K.sum(K.binary_crossentropy(o1, po1), 2, keepdims=True)
- o1_loss = K.sum(o1_loss * mask) / K.sum(mask)
- loss = s1_loss + o1_loss
- train_model.add_loss(loss)
- train_model.compile(optimizer=Adam(1e-3))
- # train_model.summary()
- self.train_model = train_model
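- # Three views over the same graph: subject_model maps [word vectors, mask] to per-token
- # subject probabilities, object_model maps [word vectors, mask, subject position] to
- # per-token, per-relation object probabilities, and train_model adds the masked binary
- # cross-entropy losses of both heads for training.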
- def extract_items(self, text_in, words, rate=0.5):
- """Predict subjects whose score exceeds `rate`, then run object_model once per subject and collect (subject, predicate, object) triples."""
- text_words = text_in
- R = []
- # _t2 = [self.words2id.get(c, 1) for c in words]
- _t2 = np.zeros((len(words), self.words_size))
- for i in range(len(words)):
- _t2[i] = np.array(get_words_matrix(words[i]))
- _t2 = np.array([_t2])
- _t3 = [1 for _ in words]
- _t3 = np.array([_t3])
- _k1 = self.subject_model.predict([_t2,_t3])
- _k1 = _k1[0, :, 0]
- _k1 = np.where(_k1 > rate)[0]
- _subjects = []
- for i in _k1:
- _subject = text_in[i]
- _subjects.append((_subject, i, i))
- if _subjects:
- _t2 = np.repeat(_t2, len(_subjects), 0)
- _t3 = np.repeat(_t3, len(_subjects), 0)
- _k1, _ = np.array([_s[1:] for _s in _subjects]).T.reshape((2, -1, 1))
- _o1 = self.object_model.predict([_t2,_t3,_k1])
- for i, _subject in enumerate(_subjects):
- _oo1 = np.where(_o1[i] > 0.5)
- for _ooo1, _c1 in zip(*_oo1):
- _object = text_in[_ooo1]
- _predicate = self.id2predicate[_c1]
- R.append((_subject[0], _predicate, _object))
- return R
- else:
- return []
- def predict(self,text, words):
- res = self.extract_items(text,words)
- return res
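- # Hedged usage sketch (mirrors the __main__ block at the bottom of this file):
- # model = Relation_extraction()
- # triples = model.predict(text_in.split("||"), words.split("||"))
- # Each triple has the form (subject_token, relation_name, object_token) -- illustrative only.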
- @staticmethod
- def get_predata(entity_list, list_sentence):
- """Flatten the sentences into one token sequence, replacing each entity span with its placeholder tag; returns (text_data, pre_data), where pre_data holds the tokens fed to the model and text_data keeps the entity objects aligned with them (0 for ordinary tokens)."""
- list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
- entity_list = sorted(entity_list,key=lambda x:(x.sentence_index,x.begin_index))
- pre_data = []
- text_data = []
- last_sentence_index = -1
- for key, group in groupby(entity_list,key=lambda x:x.sentence_index):
- if key-last_sentence_index>1:
- for i in range(last_sentence_index+1,key):
- pre_data.extend(list_sentence[i].tokens)
- text_data.extend([0]*len(list_sentence[i].tokens))
- group = list(group)
- for i in range(len(group)):
- ent = group[i]
- _tokens = list_sentence[key].tokens
- if i==len(group)-1:
- if i==0:
- pre_data.extend(_tokens[:ent.begin_index])
- text_data.extend([0]*len(_tokens[:ent.begin_index]))
- pre_data.append(entity_type_dict[ent.entity_type])
- text_data.append(ent)
- pre_data.extend(_tokens[ent.end_index+1:])
- text_data.extend([0]*len(_tokens[ent.end_index+1:]))
- break
- else:
- pre_data.append(entity_type_dict[ent.entity_type])
- text_data.append(ent)
- pre_data.extend(_tokens[ent.end_index+1:])
- text_data.extend([0]*len(_tokens[ent.end_index+1:]))
- break
- if i==0:
- pre_data.extend(_tokens[:ent.begin_index])
- text_data.extend([0] * len(_tokens[:ent.begin_index]))
- pre_data.append(entity_type_dict[ent.entity_type])
- text_data.append(ent)
- pre_data.extend(_tokens[ent.end_index+1:group[i+1].begin_index])
- text_data.extend([0] * len(_tokens[ent.end_index+1:group[i+1].begin_index]))
- else:
- pre_data.append(entity_type_dict[ent.entity_type])
- text_data.append(ent)
- pre_data.extend(_tokens[ent.end_index+1:group[i + 1].begin_index])
- text_data.extend([0] * len(_tokens[ent.end_index+1:group[i+1].begin_index]))
- last_sentence_index = key
- return text_data,pre_data
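- # Hedged illustration (each sentence exposes .sentence_index/.tokens, each entity
- # .sentence_index/.begin_index/.end_index/.entity_type): tokens ['联系人', ':', '詹女士']
- # with a 'person' entity covering '詹女士' become pre_data = ['联系人', ':', '<contact_person>']
- # and text_data = [0, 0, <that entity object>].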
- def save_model():
- """Export the subject and object sub-models as TensorFlow SavedModels under models2/."""
- graph = tf.Graph()
- with graph.as_default() as graph:
- with tf.Session(graph=graph).as_default() as sess:
- test_model = Relation_extraction()
- tf.saved_model.simple_save(sess,
- "models2/object_model/",
- inputs={"input0": test_model.object_model.input[0],
- "input1": test_model.object_model.input[1],
- "input2": test_model.object_model.input[2]},
- outputs={"outputs": test_model.object_model.output})
- tf.saved_model.simple_save(sess,
- "models2/subject_model/",
- inputs={"input0": test_model.subject_model.input[0],
- "input1": test_model.subject_model.input[1]},
- outputs={"outputs": test_model.subject_model.output})
- if __name__ == '__main__':
- test_model = Relation_extraction()
- test_model.train_model.summary()
- print("object_model=====================")
- test_model.object_model.summary()
- print("subject_model=======================")
- test_model.subject_model.summary()
- # save_model()
- # ['<pad>','<unk>','<company/org>','<location>','<phone>','<contact_person>']
- # add_words = ['<unk>','<company/org>','<location>','<phone>','<contact_person>']
- # add_dict = dict()
- # for layer in test_model.train_model.layers:
- # if layer.name=="words_embedding":
- # save(layer.get_weights()[0],"trained_words.pkl")
- # for i,j in zip(add_words,layer.get_weights()[0][1:6]):
- # add_dict[i] = j
- # print(i,'\n',j)
- # print(layer.get_weights()[0][1:6])
- # save(add_dict,"add_words_dict.pkl")
- text_in = "索引||号||:||014583788||/||2018-00038||,||成文||日期||:||2018-11-19||,||关于||国家税务总局都昌县税务局||办公楼||七||楼||会议室||维修||改造||项目||综合||比价||成交||公告||,||关于||国家税务总局都昌县税务局||办公楼七楼会议室||维修||改造||项目||(||比价||编号||:||JXXL2018-JJ-DC001||)||综合||比价||成交||公告||,||江西新立建设管理有限公司九江分公司||受||国家税务总局都昌县税务局||委托||,||就||其||办公楼||七||楼||会议室||维修||改造||项目||(||控制||价||:||294788.86||元||)||进行||综合||比价||方式||,||比价||活动||于||2018年||11月||16日||15:30||在||都昌县万里大道和平宾馆旁三楼||江西新立建设管理有限公司九江分公司||进行||,||经||比价||小组||评审||,||比价人||确定||,||现||将||比价||结果||公式||如下||:||序号||:||1||,||比价||编号||,||JXXL2018-JJ-DC001||,||项目||内容||名称||,||都昌县税务局||办公楼||七||楼||会议室||维修||改造||项目||,||数量||:||1||,||成交||供应商||名称||,||江西芙蓉建筑工程有限公司||,||成交价||(||元||)||,||284687.67||。||一||、||比价||小组||成员||:||杨忠辉||李燕杨瑾||,||本||公告||自||发布||之||日||起||1||个||工作日||内||若||无||异议||,||将||向||中标人||发出||《||成交||通知书||》||,||二||、||联系||方式||,||单位||:||国家税务总局都昌县税务局||,||比价||代理||机构||:||江西新立建设管理有限公司九江分公司||,||联系人||:||詹女士||,||电话||:||15979976088||,||江西新立建设管理有限公司九江分公司"
- words = "索引||号||:||014583788||/||2018-00038||,||成文||日期||:||2018-11-19||,||关于||国家税务总局都昌县税务局||" \
- "办公楼||七||楼||会议室||维修||改造||项目||综合||比价||成交||公告||,||关于||国家税务总局都昌县税务局||办公楼七楼会议室||" \
- "维修||改造||项目||(||比价||编号||:||JXXL2018-JJ-DC001||)||综合||比价||成交||公告||,||<company/org>||" \
- "受||国家税务总局都昌县税务局||委托||,||就||其||办公楼||七||楼||会议室||维修||改造||项目||(||控制||价||:||294788.86||元||)||" \
- "进行||综合||比价||方式||,||比价||活动||于||2018年||11月||16日||15:30||在||都昌县万里大道和平宾馆旁三楼||<company/org>||" \
- "进行||,||经||比价||小组||评审||,||比价人||确定||,||现||将||比价||结果||公式||如下||:||序号||:||1||,||比价||编号||," \
- "||JXXL2018-JJ-DC001||,||项目||内容||名称||,||都昌县税务局||办公楼||七||楼||会议室||维修||改造||项目||,||数量||:||1||,||成交||" \
- "供应商||名称||,||<company/org>||,||成交价||(||元||)||,||284687.67||。||一||、||比价||小组||成员||:||杨忠辉||李燕杨瑾||," \
- "||本||公告||自||发布||之||日||起||1||个||工作日||内||若||无||异议||,||将||向||中标人||发出||《||成交||通知书||》||,||二||、||联系||方式||," \
- "||单位||:||<company/org>||,||比价||代理||机构||:||<company/org>||,||联系人||:||<contact_person>||,||电话||:||<phone>||,||江西新立建设管理有限公司九江分公司"
- # text_in = "索引"
- # words = "索引"
- # res = test_model.predict(text_in.split("||"),words.split("||"))
- # print(res)
- # print(test_model.predict(text_in.split("||"),words.split("||")))