- import sys
- import os
- sys.path.append(os.path.abspath("../.."))
- # sys.path.append('/data/python_znj/znj/BIDI_ML_INFO_EXTRACTION/')
- import pandas as pd
- import re
- import psycopg2
- from keras.callbacks import ModelCheckpoint
- from keras import layers,models,optimizers,losses
- from keras.layers import *
- from BiddingKG.dl.common.Utils import *
- from BiddingKG.dl.common.models import *
- from sklearn.metrics import classification_report
- from sklearn.utils import shuffle,class_weight
- import matplotlib.pyplot as plt
- import random
- # numpy / tensorflow are used directly below; imported explicitly rather than relying on the starred imports
- import numpy as np
- import tensorflow as tf
- input_shape = (2,30,60)
- input_shape2 = (2,40,128)
- # output_shape = [4]
- time_label_dict = {
- 'time': 0,
- 'time_release': 1, # release time
- 'time_bidopen': 2, # bid opening time
- 'time_bidclose': 3, # bid closing time
- 'time_bidstart': 12, # bid submission (start) time / response document receipt (start) time
- 'time_publicityStart': 4, # publicity start time (publicity time / publicity period)
- 'time_publicityEnd': 5, # publicity end time
- 'time_getFileStart': 6, # document acquisition start time (document acquisition time)
- 'time_getFileEnd': 7, # document acquisition deadline
- 'time_registrationStart': 8, # registration start time (registration time)
- 'time_registrationEnd': 9, # registration deadline
- 'time_earnestMoneyStart': 10, # earnest money (bid bond) submission start time
- 'time_earnestMoneyEnd': 11, # earnest money (bid bond) submission deadline
- 'time_commencement': 13, # commencement date
- 'time_completion': 14 # completion date
- }
- output_shape = [len(time_label_dict)]
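- # get_data(): pull the annotated documents listed in newdata_30_prc.csv from the iepy/brat Postgres tables,
- # keep only the time-typed annotations and cache the resulting DataFrame with save() (a helper assumed to
- # come from the starred BiddingKG.dl.common imports).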
- def get_data():
- data_load = pd.read_csv("newdata_30_prc.csv", index_col=0)
- id_set = set()
- for id in data_load['document_id']:
- id_set.add(id)
- conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.103")
- sql = "SELECT A.human_identifier,A.sentences,A.tokens,A.offsets_to_text,B.value " \
- "FROM corpus_iedocument A,brat_bratannotation B " \
- "WHERE A.human_identifier = '%s' " \
- "AND A.human_identifier = B.document_id "
- db_data = []
- count = 0
- for id in list(id_set):
- count+=1
- print(count)
- cur1 = conn.cursor()
- cur1.execute(sql % (id))
- db_data.extend(cur1.fetchall())
- cur1.close()
- conn.close()
- columns = ['document_id','sentences','tokens','offsets_to_text','value']
- df = pd.DataFrame(db_data, columns=columns)
- df = df[df['value'].str.contains('time')]
- df = df.reset_index(drop=True)
- print(len(df))
- time_label = df['value'].str.split(expand=True)
- time_label.columns = ['_', 'label_type', 'begin_index', 'end_index', 'entity_text']
- time_label = time_label.drop('_', axis=1)
- df = pd.concat([df, time_label], axis=1)
- print(df.info())
- df['tokens'] = [token[2:-2].split("', '") for token in df['tokens']]
- df['sentences'] = [eval(sentence) for sentence in df['sentences']]
- # df['sentences'] = [sentence[1:-1].split(", ") for sentence in df['sentences']]
- # df['sentences'] = [[int(s) for s in sentence] for sentence in df['sentences']]
- df['offsets_to_text'] = [eval(offset) for offset in df['offsets_to_text']]
- # df['offsets_to_text'] = [offset[1:-1].split(", ") for offset in df['offsets_to_text']]
- # df['offsets_to_text'] = [[int(o) for o in offset] for offset in df['offsets_to_text']]
- save(df,'db_time_data.pk')
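- # getModel(): character-level baseline. Each context side runs through a BiLSTM followed by global
- # average pooling; the two pooled vectors are concatenated and classified with a softmax layer.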
- def getModel():
- '''
- @summary: time classification model
- '''
- L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
- R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
- L_lstm = layers.Bidirectional(layers.LSTM(40,return_sequences=True,dropout=0.1))(L_input)
- # L_lstm = layers.LSTM(32,return_sequences=True,dropout=0.2)(L_input)
- avg_l = layers.GlobalAveragePooling1D()(L_lstm)
- R_lstm = layers.Bidirectional(layers.LSTM(40,return_sequences=True,dropout=0.1))(R_input)
- # R_lstm = layers.LSTM(32, return_sequences=True, dropout=0.2)(R_input)
- avg_r = layers.GlobalAveragePooling1D()(R_lstm)
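- # NOTE: layers.merge(mode='concat') below is the legacy Keras 1.x functional API; Keras 2 replaced it
- # with layers.concatenate.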
- concat = layers.merge([avg_l, avg_r], mode='concat')
- # lstm = layers.LSTM(24,return_sequences=False,dropout=0.2)(concat)
- output = layers.Dense(output_shape[0],activation="softmax")(concat)
- model = models.Model(inputs=[L_input,R_input], outputs=output)
- learn_rate = 0.0005
- model.compile(optimizer=optimizers.Adam(lr=learn_rate),
- loss=losses.binary_crossentropy,
- metrics=[precision,recall,f1_score])
- model.summary()
- return model
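- # getModel2(): the variant whose weights save_model() exports. Left/right contexts are masked, encoded by
- # OurBidirectional GRUs and pooled with Attention02 per side as well as over the joined sequence; each
- # side's attention vector is summed with the joint one before the final softmax.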
- def getModel2():
- '''
- @summary: time classification model
- '''
- L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
- L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(L_input)
- R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
- R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
- L_input_drop = Dropout(0.3)(L_input)
- R_input_drop = Dropout(0.3)(R_input)
- # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
- L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
- L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
- # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
- R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop,R_mask])
- R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
- L_R = layers.merge([L_lstm, R_lstm],concat_axis=1, mode='concat')
- L_R_mask = layers.merge([L_mask, R_mask],concat_axis=1, mode='concat')
- L_R_att = Attention02()(L_R,mask=K.squeeze(L_R_mask,axis=-1))
- L_att = layers.add([L_att,L_R_att])
- R_att = layers.add([R_att,L_R_att])
- concat = layers.merge([L_att, R_att], mode='concat')
- concat = Dropout(0.2)(concat)
- output = layers.Dense(output_shape[0],activation="softmax")(concat)
- model = models.Model(inputs=[L_input,R_input], outputs=output)
- learn_rate = 0.00005
- model.compile(optimizer=optimizers.Adam(lr=learn_rate),
- loss=losses.binary_crossentropy,
- metrics=[precision,recall,f1_score])
- model.summary()
- return model
- # def getModel2():
- # '''
- # @summary: time classification model
- # '''
- # L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
- # L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(L_input)
- # R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
- # R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
- #
- # L_input_drop = Dropout(0.3)(L_input)
- # R_input_drop = Dropout(0.3)(R_input)
- # # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
- # L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
- # L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
- # # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
- # R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop,R_mask])
- # R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
- # concat = layers.merge([L_att, R_att], mode='concat')
- #
- # concat = Dropout(0.2)(concat)
- # output = layers.Dense(output_shape[0],activation="softmax")(concat)
- #
- # model = models.Model(inputs=[L_input,R_input], outputs=output)
- #
- # learn_rate = 0.00005
- # model.compile(optimizer=optimizers.Adam(lr=learn_rate),
- # loss=losses.binary_crossentropy,
- # metrics=[precision,recall,f1_score])
- # model.summary()
- # return model
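- # getModel3(): alternative head that concatenates both GRU sequences along the time axis and applies a
- # single Attention02 pooling over the joint sequence (kept for experiments; not used by save_model()).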
- def getModel3():
- '''
- @summary: time classification model
- '''
- L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
- L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(L_input)
- R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
- R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
- L_input_drop = Dropout(0.3)(L_input)
- R_input_drop = Dropout(0.3)(R_input)
- # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
- L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
- # L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
- # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
- R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop,R_mask])
- concat = layers.merge([L_lstm,R_lstm], mode='concat',concat_axis=1)
- concat_mask = layers.merge([L_mask,R_mask], mode='concat',concat_axis=1)
- att = Attention02()(concat,mask=K.squeeze(concat_mask,axis=-1))
- # R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
- # concat = layers.merge([L_att, R_att], mode='concat')
- att = Dropout(0.2)(att)
- output = layers.Dense(output_shape[0],activation="softmax")(att)
- model = models.Model(inputs=[L_input,R_input], outputs=output)
- learn_rate = 0.0001
- model.compile(optimizer=optimizers.Adam(lr=learn_rate),
- loss=losses.binary_crossentropy,
- metrics=[precision,recall,f1_score])
- model.summary()
- return model
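- # Scaled dot-product multi-head attention; call() expects [q, k, v] plus optional value/query masks.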
- class Attention(Layer):
- """多头注意力机制
- """
- def __init__(self, nb_head, size_per_head, **kwargs):
- self.nb_head = nb_head
- self.size_per_head = size_per_head
- self.out_dim = nb_head * size_per_head
- super(Attention, self).__init__(**kwargs)
- def build(self, input_shape):
- super(Attention, self).build(input_shape)
- q_in_dim = input_shape[0][-1]
- k_in_dim = input_shape[1][-1]
- v_in_dim = input_shape[2][-1]
- self.q_kernel = self.add_weight(name='q_kernel',
- shape=(q_in_dim, self.out_dim),
- initializer='glorot_normal')
- self.k_kernel = self.add_weight(name='k_kernel',
- shape=(k_in_dim, self.out_dim),
- initializer='glorot_normal')
- self.v_kernel = self.add_weight(name='w_kernel',
- shape=(v_in_dim, self.out_dim),
- initializer='glorot_normal')
- def mask(self, x, mask, mode='mul'):
- if mask is None:
- return x
- else:
- for _ in range(K.ndim(x) - K.ndim(mask)):
- mask = K.expand_dims(mask, K.ndim(mask))
- if mode == 'mul':
- return x * mask
- else:
- return x - (1 - mask) * 1e10
- def call(self, inputs):
- q, k, v = inputs[:3]
- v_mask, q_mask = None, None
- if len(inputs) > 3:
- v_mask = inputs[3]
- if len(inputs) > 4:
- q_mask = inputs[4]
- # Linear projections
- qw = K.dot(q, self.q_kernel)
- kw = K.dot(k, self.k_kernel)
- vw = K.dot(v, self.v_kernel)
- # Reshape into heads
- qw = K.reshape(qw, (-1, K.shape(qw)[1], self.nb_head, self.size_per_head))
- kw = K.reshape(kw, (-1, K.shape(kw)[1], self.nb_head, self.size_per_head))
- vw = K.reshape(vw, (-1, K.shape(vw)[1], self.nb_head, self.size_per_head))
- # Permute dimensions
- qw = K.permute_dimensions(qw, (0, 2, 1, 3))
- kw = K.permute_dimensions(kw, (0, 2, 1, 3))
- vw = K.permute_dimensions(vw, (0, 2, 1, 3))
- # Attention
- a = K.batch_dot(qw, kw, [3, 3]) / self.size_per_head**0.5
- a = K.permute_dimensions(a, (0, 3, 2, 1))
- a = self.mask(a, v_mask, 'add')
- a = K.permute_dimensions(a, (0, 3, 2, 1))
- a = K.softmax(a)
- # Assemble the output
- o = K.batch_dot(a, vw, [3, 2])
- o = K.permute_dimensions(o, (0, 2, 1, 3))
- o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
- o = self.mask(o, q_mask, 'mul')
- return o
- def compute_output_shape(self, input_shape):
- return (input_shape[0][0], input_shape[0][1], self.out_dim)
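- # Attention02: additive attention pooling over the time axis (with optional mask) that reduces a
- # sequence to a single vector.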
- class Attention02(Layer):
- def __init__(self, **kwargs):
- self.init = initializers.get('normal')
- self.supports_masking = True
- self.attention_dim = 50
- super(Attention02, self).__init__(**kwargs)
- def build(self, input_shape):
- assert len(input_shape) == 3
- self.W = K.variable(self.init((input_shape[-1], 1)))
- self.b = K.variable(self.init((self.attention_dim,)))
- self.u = K.variable(self.init((self.attention_dim, 1)))
- self.trainable_weights = [self.W, self.b, self.u]
- super(Attention02, self).build(input_shape)
- def compute_mask(self, inputs, mask=None):
- return mask
- def call(self, x, mask=None):
- uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
- ait = K.dot(uit, self.u)
- ait = K.squeeze(ait, -1)
- ait = K.exp(ait)
- if mask is not None:
- ait = ait * K.cast(mask, K.floatx())
- # ait = ait * mask
- ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
- ait = K.expand_dims(ait)
- weighted_input = x * ait
- output = K.sum(weighted_input, axis=1)
- return output
- def compute_output_shape(self, input_shape):
- return (input_shape[0], input_shape[-1])
- class OurLayer(Layer):
- """定义新的Layer,增加reuse方法,允许在定义Layer时调用现成的层
- """
- def reuse(self, layer, *args, **kwargs):
- if not layer.built:
- if len(args) > 0:
- inputs = args[0]
- else:
- inputs = kwargs['inputs']
- if isinstance(inputs, list):
- input_shape = [K.int_shape(x) for x in inputs]
- else:
- input_shape = K.int_shape(inputs)
- layer.build(input_shape)
- outputs = layer.call(*args, **kwargs)
- for w in layer.trainable_weights:
- if w not in self._trainable_weights:
- self._trainable_weights.append(w)
- for w in layer.non_trainable_weights:
- if w not in self._non_trainable_weights:
- self._non_trainable_weights.append(w)
- for u in layer.updates:
- if not hasattr(self, '_updates'):
- self._updates = []
- if u not in self._updates:
- self._updates.append(u)
- return outputs
- class OurBidirectional(OurLayer):
- """自己封装双向RNN,允许传入mask,保证对齐
- """
- def __init__(self, layer, **args):
- super(OurBidirectional, self).__init__(**args)
- self.forward_layer = layer.__class__.from_config(layer.get_config())
- self.backward_layer = layer.__class__.from_config(layer.get_config())
- self.forward_layer.name = 'forward_' + self.forward_layer.name
- self.backward_layer.name = 'backward_' + self.backward_layer.name
- def reverse_sequence(self, x, mask):
- """这里的mask.shape是[batch_size, seq_len, 1]
- """
- seq_len = K.round(K.sum(mask, 1)[:, 0])
- seq_len = K.cast(seq_len, 'int32')
- return tf.reverse_sequence(x, seq_len, seq_dim=1)
- def call(self, inputs):
- x, mask = inputs
- x_forward = self.reuse(self.forward_layer, x)
- x_backward = self.reverse_sequence(x, mask)
- x_backward = self.reuse(self.backward_layer, x_backward)
- x_backward = self.reverse_sequence(x_backward, mask)
- x = K.concatenate([x_forward, x_backward], -1)
- if K.ndim(x) == 3:
- return x * mask
- else:
- return x
- def compute_output_shape(self, input_shape):
- return input_shape[0][:-1] + (self.forward_layer.units * 2,)
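- # training(): character-level pipeline on newdata_30_prc.csv. embedding_word() (assumed to come from the
- # starred common imports) builds (2, 30, 60) inputs; note that getModel() is currently defined on
- # input_shape2, so the shapes would need re-aligning to rerun this older routine.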
- def training():
- data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
- data_load = data_load.reset_index(drop=True)
- test_data = data_load.sample(frac=0.2, random_state=8)
- train_data = data_load.drop(test_data.index, axis=0)
- train_data =train_data.reset_index(drop=True)
- train_x = []
- train_y = []
- for left, right, label in zip(train_data['context_left'], train_data['context_right'], train_data['re_label']):
- y = np.zeros(output_shape)
- y[label] = 1
- left = str(left)
- right = str(right)
- if left=='nan': left = ''
- if right=='nan': right = ''
- left = list(left)
- right = list(right)
- context = [left, right]
- x = embedding_word(context, shape=input_shape)
- train_x.append(x)
- train_y.append(y)
- test_x = []
- test_y = []
- for left, right, label in zip(test_data['context_left'], test_data['context_right'], test_data['re_label']):
- y = np.zeros(output_shape)
- y[label] = 1
- left = str(left)
- right = str(right)
- if left == 'nan': left = ''
- if right == 'nan': right = ''
- left = list(left)
- right = list(right)
- context = [left, right]
- x = embedding_word(context, shape=input_shape)
- test_x.append(x)
- test_y.append(y)
- train_y, test_y = (np.array(train_y), np.array(test_y))
- train_x, test_x = (np.array(train_x), np.array(test_x))
- train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
- model = getModel()
- epochs = 150
- batch_size = 256
- checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
- save_best_only=True, mode='min')
- # cw = class_weight.compute_class_weight('auto',np.unique(np.argmax(train_y,axis=1)),np.argmax(train_y,axis=1))
- # cw = dict(enumerate(cw))
- history = model.fit(
- x=[train_x[0], train_x[1]],
- y=train_y,
- validation_data=([test_x[0], test_x[1]], test_y),
- epochs=epochs,
- batch_size=batch_size,
- shuffle=True,
- callbacks=[checkpoint],
- class_weight='auto'
- )
- # plot_loss(history=history)
- load_model = models.load_model("model_label_time_classify.model.hdf5",
- custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
- y_pre = load_model.predict([test_x[0], test_x[1]])
- # y_pre = load_model.predict(test_x[0])
- # Per-class evaluation of the predictions
- res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
- print(res1)
- y_pre2 = load_model.predict([train_x[0], train_x[1]])
- # y_pre2 = load_model.predict(train_x[0])
- res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
- print(res2)
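- # train2(): same pipeline as training() but token-level, feeding embedding() outputs of shape (2, 40, 128)
- # (input_shape2) into getModel().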
- def train2():
- data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv", index_col=0)
- data_load = data_load.reset_index(drop=True)
- data_load['context_left'] = [left[2:-2].split("', '") for left in data_load['context_left']]
- data_load['context_right'] = [right[2:-2].split("', '") for right in data_load['context_right']]
- test_data = data_load.sample(frac=0.2, random_state=8)
- train_data = data_load.drop(test_data.index, axis=0)
- train_data =train_data.reset_index(drop=True)
- train_x = []
- train_y = []
- for left, right, label in zip(train_data['context_left'], train_data['context_right'], train_data['label']):
- y = np.zeros(output_shape)
- y[label] = 1
- context = [left, right]
- x = embedding(context, shape=input_shape2)
- train_x.append(x)
- train_y.append(y)
- test_x = []
- test_y = []
- for left, right, label in zip(test_data['context_left'], test_data['context_right'], test_data['label']):
- y = np.zeros(output_shape)
- y[label] = 1
- context = [left, right]
- x = embedding(context, shape=input_shape2)
- test_x.append(x)
- test_y.append(y)
- train_y, test_y = (np.array(train_y), np.array(test_y))
- train_x, test_x = (np.array(train_x), np.array(test_x))
- train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
- model = getModel()
- epochs = 150
- batch_size = 256
- checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
- save_best_only=True, mode='min')
- # cw = class_weight.compute_class_weight('auto',np.unique(np.argmax(train_y,axis=1)),np.argmax(train_y,axis=1))
- # cw = dict(enumerate(cw))
- history = model.fit(
- x=[train_x[0], train_x[1]],
- y=train_y,
- validation_data=([test_x[0], test_x[1]], test_y),
- epochs=epochs,
- batch_size=batch_size,
- shuffle=True,
- callbacks=[checkpoint],
- class_weight='auto'
- )
- # plot_loss(history=history)
- load_model = models.load_model("model_label_time_classify.model.hdf5",
- custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
- y_pre = load_model.predict([test_x[0], test_x[1]])
- # y_pre = load_model.predict(test_x[0])
- # Per-class evaluation of the predictions
- res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
- print(res1)
- y_pre2 = load_model.predict([train_x[0], train_x[1]])
- # y_pre2 = load_model.predict(train_x[0])
- res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
- print(res2)
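- # train3(): trains getModel2() through DataGenerator on several relabelled Excel exports; the extra loops
- # below also append truncated copies of the contexts as a simple augmentation before splitting off 13%
- # of the data for validation.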
- def train3():
- # data_load = pd.read_excel("tokens_tolabel_data1.xlsx", index_col=0)
- data_load = pd.read_excel("tokens_tolabel_data1_res12.xlsx", index_col=0)
- # data_load = pd.concat([data_load[data_load['re_label']==0],data_load])
- # data_load = data_load[data_load['pre_label_prob']>0.97]
- # data_load = data_load[data_load['is_same']==1]
- data_zero = pd.read_excel("tokens_label0_data1.xlsx")
- # data_old = pd.read_excel("tokens_data_02.xlsx")
- data_old = pd.read_excel("tokens_data_02_res6.xlsx")
- data_zero = data_zero[(data_zero['label']!=0)|(data_zero['is_same']==2)]
- # data_zero = pd.concat([data_zero,data_zero])
- # data_zero = pd.concat([data_zero[(data_zero['label']!=0)|(data_zero['is_same']==2)],data_zero.sample(n=3000)])
- # data_zero = data_zero.sample(n=80000)
- print("输入shape:",input_shape2)
- data_x = []
- data_y = []
- for left, right, label,_label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label'], data_load['label']):
- if label==_label:
- y = np.zeros(output_shape)
- y[label] = 1
- left = eval(left)
- left = left[-40:]
- right = eval(right)
- right = right[:40]
- context = [left, right]
- # x = embedding(context, shape=input_shape2)
- data_x.append(context)
- data_y.append(y)
- data_load2 = data_load[data_load['re_label']==0]
- for left, right, label,_label in zip(data_load2['context_left'], data_load2['context_right'], data_load2['re_label'], data_load2['label']):
- if label==_label:
- y = np.zeros(output_shape)
- y[label] = 1
- left = eval(left)
- left = left[-40:]
- if len(left)>30:
- left = left[2:]
- elif len(left)>15:
- left = left[1:]
- right = eval(right)
- right = right[:40]
- if len(right)>15:
- right = right[:-1]
- context = [left, right]
- # x = embedding(context, shape=input_shape2)
- data_x.append(context)
- data_y.append(y)
- for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
- y = np.zeros(output_shape)
- y[label] = 1
- left = eval(left)
- left = left[-40:]
- right = eval(right)
- right = right[:40]
- context = [left, right]
- # x = embedding(context, shape=input_shape2)
- data_x.append(context)
- data_y.append(y)
- for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
- y = np.zeros(output_shape)
- y[label] = 1
- left = eval(left)
- left = left[-40:]
- if len(left) > 30:
- left = left[2:]
- elif len(left) > 15:
- left = left[1:]
- right = eval(right)
- right = right[:40]
- if len(right) > 15:
- right = right[:-1]
- context = [left, right]
- # x = embedding(context, shape=input_shape2)
- data_x.append(context)
- data_y.append(y)
- # for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
- # y = np.zeros(output_shape)
- # y[label] = 1
- # left = eval(left)
- # left = left[-40:]
- # right = eval(right)
- # right = right[:40]
- # context = [left, right]
- # # x = embedding(context, shape=input_shape2)
- # data_x.append(context)
- # data_y.append(y)
- _data = [d for d in zip(data_x,data_y)]
- import random
- random.shuffle(_data)
- data_x = [i[0] for i in _data]
- data_y = [i[1] for i in _data]
- test_len = int(len(data_x) * 0.13)
- test_x = data_x[:test_len]
- test_y = data_y[:test_len]
- print("测试数据量:", len(test_x))
- train_x = data_x[test_len:]
- train_y = data_y[test_len:]
- for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
- y = np.zeros(output_shape)
- y[label] = 1
- left = eval(left)
- left = left[-40:]
- right = eval(right)
- right = right[:40]
- context = [left, right]
- # x = embedding(context, shape=input_shape2)
- train_x.append(context)
- train_y.append(y)
- print("训练数据量:", len(train_x))
- # train_y, test_y = np.array(train_y), np.array(test_y)
- # train_x = np.array(train_x)
- # test_x = np.array(test_x)
- # test_x = np.transpose(test_x, (1, 0, 2, 3))
- # train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
- training_generator = DataGenerator(train_x, train_y)
- # training_generator = DataGenerator(data_x, data_y)
- validation_generator = DataGenerator(test_x, test_y)
- # model = getModel3()
- model = getModel2()
- epochs = 100
- # batch_size = 256
- checkpoint = ModelCheckpoint("model_time_classify.weights",save_weights_only=True, monitor="val_loss", verbose=1,
- save_best_only=True, mode='min')
- # checkpoint = ModelCheckpoint("model_time_classify2.weights",save_weights_only=True, monitor="loss", verbose=1,
- # save_best_only=True, mode='min')
- history = model.fit_generator(
- generator=training_generator,
- validation_data=validation_generator,
- use_multiprocessing=True, workers=2,
- epochs=epochs,
- shuffle=True,
- callbacks=[checkpoint],
- class_weight='auto'
- )
- # plot_loss(history=history)
- # load_model = models.load_model("model_label_time_classify.model.hdf5",
- # custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
- # y_pre = load_model.predict([test_x[0], test_x[1]])
- # # y_pre = load_model.predict(test_x[0])
- # # Per-class evaluation of the predictions
- # res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
- # print(res1)
- # y_pre2 = load_model.predict([train_x[0], train_x[1]])
- # # y_pre2 = load_model.predict(train_x[0])
- # res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
- # print(res2)
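- # train4(): like train3() but adds the delayTime datasets; old label-0 rows that disagree with the
- # prediction are either relabelled with pre_label (when pre_label > 3) or dropped. Random truncation is
- # left to DataGenerator(is_train=True) instead of manually appended copies.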
- def train4():
- # data_load = pd.read_excel("tokens_tolabel_data1.xlsx", index_col=0)
- data_load = pd.read_excel("tokens_tolabel_data1_res13New.xlsx", index_col=0)
- # data_load = pd.concat([data_load[data_load['re_label']==0],data_load])
- # data_load = data_load[data_load['pre_label_prob']>0.97]
- # data_load = data_load[data_load['is_same']==1]
- data_zero = pd.read_excel("time_entity5.xlsx")
- data_zero = data_zero[(data_zero['viewed']==1)|(data_zero['is_same']==2)]
- # data_old = pd.read_excel("tokens_data_02.xlsx")
- data_old = pd.read_excel("tokens_data_02_res7New.xlsx")
- data_delay1 = pd.read_excel("delayTime_entity1.xlsx")
- data_delay1 = data_delay1[data_delay1['label']!=0]
- data_delay2 = pd.read_excel("delayTime_entity2.xlsx")
- # data_zero = pd.concat([data_zero,data_zero])
- # data_zero = pd.concat([data_zero[(data_zero['label']!=0)|(data_zero['is_same']==2)],data_zero.sample(n=3000)])
- # data_zero = data_zero.sample(n=80000)
- print("输入shape:",input_shape2)
- data_x = []
- data_y = []
- import random
- for left, right, label,_label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label'], data_load['label']):
- # if label==_label:
- y = np.zeros(output_shape)
- y[label] = 1
- left = eval(left)
- left = left[-40:]
- right = eval(right)
- right = right[:40]
- context = [left, right]
- # x = embedding(context, shape=input_shape2)
- data_x.append(context)
- data_y.append(y)
- # data_load2 = data_load[data_load['re_label']==0]
- # for left, right, label,_label in zip(data_load2['context_left'], data_load2['context_right'], data_load2['re_label'], data_load2['label']):
- # if label==_label:
- # y = np.zeros(output_shape)
- # y[label] = 1
- # left = eval(left)
- # left = left[-40:]
- # if len(left)>30:
- # left = left[2:]
- # elif len(left)>15:
- # left = left[1:]
- # right = eval(right)
- # right = right[:40]
- # if len(right)>15:
- # right = right[:-1]
- # context = [left, right]
- # # x = embedding(context, shape=input_shape2)
- # data_x.append(context)
- # data_y.append(y)
- for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['re_label']):
- y = np.zeros(output_shape)
- y[label] = 1
- left = eval(left)
- left = left[-40:]
- right = eval(right)
- right = right[:40]
- context = [left, right]
- # x = embedding(context, shape=input_shape2)
- data_x.append(context)
- data_y.append(y)
- for left, right, label in zip(data_delay1['context_left'], data_delay1['context_right'], data_delay1['label']):
- y = np.zeros(output_shape)
- y[label] = 1
- left = eval(left)
- left = left[-40:]
- right = eval(right)
- right = right[:40]
- context = [left, right]
- # x = embedding(context, shape=input_shape2)
- data_x.append(context)
- data_y.append(y)
- for left, right, label in zip(data_delay2['context_left'], data_delay2['context_right'], data_delay2['re_label']):
- y = np.zeros(output_shape)
- y[label] = 1
- left = eval(left)
- left = left[-40:]
- right = eval(right)
- right = right[:40]
- context = [left, right]
- # x = embedding(context, shape=input_shape2)
- data_x.append(context)
- data_y.append(y)
- # for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
- # y = np.zeros(output_shape)
- # y[label] = 1
- # left = eval(left)
- # left = left[-40:]
- # if len(left) > 30:
- # left = left[2:]
- # elif len(left) > 15:
- # left = left[1:]
- # right = eval(right)
- # right = right[:40]
- # if len(right) > 15:
- # right = right[:-1]
- # context = [left, right]
- # # x = embedding(context, shape=input_shape2)
- # data_x.append(context)
- # data_y.append(y)
- # for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
- # y = np.zeros(output_shape)
- # y[label] = 1
- # left = eval(left)
- # left = left[-40:]
- # right = eval(right)
- # right = right[:40]
- # context = [left, right]
- # # x = embedding(context, shape=input_shape2)
- # data_x.append(context)
- # data_y.append(y)
- for left, right, label,pre_label,is_same in zip(data_old['context_left'], data_old['context_right'], data_old['label'],
- data_old['pre_label'],data_old['is_same']):
- if label==0:
- if is_same==1:
- pass
- else:
- if pre_label>3:
- label = pre_label
- else:
- continue
- y = np.zeros(output_shape)
- y[label] = 1
- left = eval(left)
- left = left[-40:]
- right = eval(right)
- right = right[:40]
- context = [left, right]
- # x = embedding(context, shape=input_shape2)
- data_x.append(context)
- data_y.append(y)
- _data = [d for d in zip(data_x,data_y)]
- random.shuffle(_data)
- data_x = [i[0] for i in _data]
- data_y = [i[1] for i in _data]
- test_len = int(len(data_x) * 0.11)
- test_x = data_x[:test_len]
- test_y = data_y[:test_len]
- print("测试数据量:", len(test_x))
- train_x = data_x[test_len:]
- train_y = data_y[test_len:]
- # for left, right, label,pre_label,is_same in zip(data_old['context_left'], data_old['context_right'], data_old['label'],
- # data_old['pre_label'],data_old['is_same']):
- # # if label==0:
- # # if random.random()>0.25:
- # # continue
- # if label==0:
- # if is_same==1:
- # pass
- # else:
- # if pre_label>3:
- # label = pre_label
- # else:
- # continue
- # y = np.zeros(output_shape)
- # y[label] = 1
- # left = eval(left)
- # left = left[-40:]
- # right = eval(right)
- # right = right[:40]
- # context = [left, right]
- # # x = embedding(context, shape=input_shape2)
- # train_x.append(context)
- # train_y.append(y)
- print("训练数据量:", len(train_x))
- # train_y, test_y = np.array(train_y), np.array(test_y)
- # train_x = np.array(train_x)
- # test_x = np.array(test_x)
- # test_x = np.transpose(test_x, (1, 0, 2, 3))
- # train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
- training_generator = DataGenerator(train_x, train_y,is_train=True)
- # training_generator = DataGenerator(data_x, data_y)
- validation_generator = DataGenerator(test_x, test_y,is_train=False,shuffle=False)
- # model = getModel3()
- model = getModel2()
- epochs = 100
- # batch_size = 256
- checkpoint = ModelCheckpoint("model_time_classify.weights",save_weights_only=True, monitor="val_loss", verbose=1,
- save_best_only=True, mode='min')
- # checkpoint = ModelCheckpoint("model_time_classify2.weights",save_weights_only=True, monitor="loss", verbose=1,
- # save_best_only=True, mode='min')
- history = model.fit_generator(
- generator=training_generator,
- validation_data=validation_generator,
- use_multiprocessing=True, workers=2,
- epochs=epochs,
- shuffle=True,
- callbacks=[checkpoint],
- class_weight='auto'
- )
- from keras.utils import Sequence,to_categorical
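- # DataGenerator: keras.utils.Sequence that embeds each batch on the fly with embedding_mywords(); when
- # is_train=True it randomly trims long left/right contexts as a light augmentation.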
- class DataGenerator(Sequence):
- 'Generates data for Keras'
- def __init__(self, texts, labels, is_train=True,batch_size=256,
- n_classes=len(time_label_dict), shuffle=True):
- 'Initialization'
- # self.dim = dim
- self.batch_size = batch_size
- self.labels = labels
- self.texts = texts
- self.n_classes = n_classes
- self.shuffle = shuffle
- self.is_train = is_train
- self.on_epoch_end()
- def __len__(self):
- 'Denotes the number of batches per epoch'
- _len = len(self.texts) // self.batch_size
- if len(self.texts) % self.batch_size != 0:
- _len += 1
- return _len
- def __getitem__(self, index):
- 'Generate one batch of data'
- # Generate indexes of the batch
- indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
- # Find list of IDs
- list_texts = [self.texts[k] for k in indexes]
- _label = [self.labels[k] for k in indexes]
- # Generate data
- X, y = self.__data_generation(list_texts,_label)
- return X, y
- def on_epoch_end(self):
- 'Updates indexes after each epoch'
- self.indexes = np.arange(len(self.texts))
- if self.shuffle == True:
- np.random.shuffle(self.indexes)
- def __data_generation(self, list_texts,_label):
- 'Generates data containing batch_size samples'
- # Initialization
- # X = np.empty((self.batch_size, *self.dim))
- # y = np.empty((self.batch_size), dtype=int)
- # batch_len = len(list_texts)
- # x = np.empty((batch_len, *self.dim))
- x = []
- # y = np.empty((batch_len), dtype=int)
- # Generate data
- for i, context in enumerate(list_texts):
- # Store sample
- if self.is_train:
- left = context[0]
- if len(left) > 30:
- if random.random() > 0.5:
- left = left[2:]
- elif len(left) > 15:
- if random.random() > 0.5:
- left = left[1:]
- right = context[1]
- if len(right) > 30:
- if random.random() > 0.5:
- right = right[:-2]
- elif len(right) > 15:
- if random.random() > 0.5:
- right = right[:-1]
- context = [left, right]
- words_matrix = embedding_mywords(context, shape=input_shape2)
- # Store class
- # y[i] = _label[i]
- x.append(words_matrix)
- x = np.array(x)
- x = np.transpose(x, (1, 0, 2, 3))
- return [x[0],x[1]], np.array(_label)
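- # predict2(): score the labelled token data with the saved hdf5 model and export the misclassified rows
- # for manual review.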
- def predict2():
- model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
- data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv", index_col=0)
- data_load['context_left'] = [left[2:-2].split("', '") for left in data_load['context_left']]
- data_load['context_right'] = [right[2:-2].split("', '") for right in data_load['context_right']]
- test_x = []
- test_y = []
- for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['label']):
- y = np.zeros(output_shape)
- y[label] = 1
- context = [left, right]
- x = embedding(context, shape=input_shape2)
- test_x.append(x)
- test_y.append(y)
- test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
- pre_y = model1.predict([test_x[0],test_x[1]])
- data_load['pre'] = [np.argmax(item) for item in pre_y]
- error_data = data_load[data_load['label']!=data_load['pre']]
- # print(error_data.info())
- error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")
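- # predict3(): chunked inference over new_tokens_data1.csv with 10-token contexts, recording the predicted
- # label and an is_same flag.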
- def predict3():
- data = pd.read_csv("new_tokens_data1.csv", chunksize=5000)
- model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
- new_data = pd.DataFrame()
- idx = 0
- for _data in data:
- test_x = []
- test_y = []
- for left, right, label in zip(_data['context_left'], _data['context_right'], _data['label']):
- left = eval(left)
- left = left[-10:]
- right = eval(right)
- right = right[:10]
- label = int(label)
- y = np.zeros(output_shape)
- y[label] = 1
- context = [left, right]
- x = embedding(context, shape=input_shape2)
- test_x.append(x)
- test_y.append(y)
- test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
- pre_y = model1.predict([test_x[0], test_x[1]])
- _data['pre'] = [np.argmax(item) for item in pre_y]
- _data['is_same'] = [1 if int(_label)==_pre else 0 for _label,_pre in zip(_data['label'],_data['pre'])]
- # data['label'] = label
- new_data = pd.concat([new_data, _data])
- idx += 5000
- print(idx)
- # data.to_csv("new_tokens_data1.csv")
- new_data.to_excel("new_tokens_data1_res.xlsx")
- def predict4():
- data = pd.read_csv("tokens_data_02_res6New.csv", chunksize=3000)
- # data = pd.read_excel("C:\\Users\\Administrator\\Desktop\\time_entity4.xlsx")
- # data.to_csv("C:\\Users\\Administrator\\Desktop\\time_entity4.csv")
- # data = pd.read_csv("C:\\Users\\Administrator\\Desktop\\time_entity4.csv", chunksize=3000)
- model1 = getModel2()
- model1.load_weights("model_time_classify.weights")
- new_data = pd.DataFrame()
- idx = 0
- for _data in data:
- test_x = []
- test_y = []
- for left, right, label in zip(_data['context_left'], _data['context_right'], _data['re_label']):
- left = eval(left)
- left = left[-40:]
- right = eval(right)
- right = right[:40]
- label = int(label)
- y = np.zeros(output_shape)
- y[label] = 1
- context = [left, right]
- x = embedding_mywords(context, shape=input_shape2)
- test_x.append(x)
- test_y.append(y)
- test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
- pre_y = model1.predict([test_x[0], test_x[1]])
- _data['pre_label'] = [np.argmax(item) for item in pre_y]
- _data['pre_label_prob'] = [max(item) for item in pre_y]
- _data['is_same'] = [1 if int(_label)==_pre else 0 for _label,_pre in zip(_data['label'],_data['pre_label'])]
- # _data['is_same'] = [1 if int(_re)==int(_pre) and int(_re)==int(_label) else 0 for _label,_re,_pre in zip(_data['label'],_data['re_label'],_data['pre_label'])]
- # data['label'] = label
- new_data = pd.concat([new_data, _data])
- idx += 3000
- print(idx)
- # new_data.to_csv("tokens_data_02_res7New.csv")
- new_data.to_excel("tokens_data_02_res7New.xlsx")
- # new_data.to_excel("C:\\Users\\Administrator\\Desktop\\tokens_data_02_res7New.xlsx")
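- # predict(): character-level counterpart of predict2(), using embedding_word() and the hdf5 model.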
- def predict():
- model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
- data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
- test_x = []
- test_y = []
- for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label']):
- y = np.zeros(output_shape)
- y[label] = 1
- left = str(left)
- right = str(right)
- if left == 'nan': left = ''
- if right == 'nan': right = ''
- left = list(left)
- right = list(right)
- context = [left, right]
- x = embedding_word(context, shape=input_shape)
- test_x.append(x)
- test_y.append(y)
- test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
- pre_y = model1.predict([test_x[0],test_x[1]])
- data_load['pre'] = [np.argmax(item) for item in pre_y]
- error_data = data_load[data_load['re_label']!=data_load['pre']]
- # print(error_data.info())
- error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")
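- # data_process(): clip each context at the nearest sentence boundary ("。"), so the left context starts
- # after the previous full stop and the right context ends at the first one.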
- def data_process():
- data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30.csv", index_col=0)
- re_left = re.compile("。[^。]*?$")
- re_right = re.compile("^[^。]*?。")
- left_list = []
- right_list = []
- for left, right in zip(data_load['context_left'], data_load['context_right']):
- left = str(left)
- right = str(right)
- if right=='nan':
- right = ''
- # print(1)
- if re.search("。",left):
- left = re_left.search(left)
- left = left.group()[1:]
- if re.search("。",right):
- right = re_right.search(right)
- right = right.group()
- left_list.append(left)
- right_list.append(right)
- data_load['context_left'] = left_list
- data_load['context_right'] = right_list
- data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv")
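- # data_process2(): further truncate both contexts to at most 20 characters per side.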
- def data_process2():
- data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
- left_list = []
- right_list = []
- for left, right in zip(data_load['context_left'], data_load['context_right']):
- left = str(left)
- right = str(right)
- if right=='nan':
- right = ''
- if left=='nan':
- left = ''
- left = left[max(len(left)-20,0):]
- right = right[:20]
- left_list.append(left)
- right_list.append(right)
- data_load['context_left'] = left_list
- data_load['context_right'] = right_list
- data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_20_prc.csv")
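- # data_process3(): map character offsets back to token indices, take a 40-token window around each entity
- # with spanWindow() (assumed to come from the starred common imports), clip it at sentence boundaries and
- # export the result to Excel.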
- def data_process3():
- data = load('db_time_data.pk')
- data = data.drop('value', axis=1)
- token_begin = []
- token_end = []
- context_left = []
- context_right = []
- data2 = pd.read_csv("newdata_30_prc2.csv")
- label = []
- # data=data[:20]
- for id,sentences,tokens,offset,begin,end,entity_text in zip(data['document_id'],data['sentences'],data['tokens'],data['offsets_to_text'],
- data['begin_index'],data['end_index'],data['entity_text']):
- _label = data2[(data2['document_id']==int(id)) & (data2['begin_index']==int(begin))][:1]
- if not _label.empty:
- _label = int(_label['re_label'])
- else:
- _label=0
- label.append(_label)
- begin = int(begin)
- end = int(end)
- entity_tbegin = 0
- entity_tend = 0
- find_begin = False
- for t in range(len(offset)):
- if not find_begin:
- if offset[t]==begin:
- entity_tbegin = t
- find_begin = True
- if offset[t]>begin:
- entity_tbegin = t-1
- find_begin = True
- if offset[t] >= end:
- entity_tend = t
- break
- token_begin.append(entity_tbegin)
- token_end.append(entity_tend)
- s = spanWindow(tokens=tokens,begin_index=entity_tbegin,end_index=entity_tend-1,size=40)
- s1 = s[0]
- _temp1 = []
- for i in range(len(s1)):
- if s1[i]=="。":
- _temp1.append(i)
- if _temp1:
- s1 = s1[_temp1[-1]+1:]
- s2 = s[1]
- _temp2 = []
- for i in range(len(s2)):
- if s2[i] == "。":
- _temp2.append(i)
- break
- if _temp2:
- s2 = s2[:_temp2[0]+1]
- # print(s2)
- context_left.append(s1)
- context_right.append(s2)
- print(id)
- # print(_label)
- # print(entity_text)
- # print(tokens[entity_tbegin:entity_tend])
- data['token_begin'] = token_begin
- data['token_end'] = token_end
- data['context_left'] = context_left
- data['context_right'] = context_right
- data['label'] = label
- data = data.drop(['tokens','offsets_to_text','sentences'],axis=1)
- # data.to_csv("tokens_data_02.csv")
- data.to_excel("tokens_data_02.xlsx")
- def plot_loss(history):
- plt.plot(history.history['loss'])
- plt.plot(history.history['val_loss'])
- plt.title('Model loss')
- plt.ylabel('Loss')
- plt.xlabel('Epoch')
- plt.legend(['Train', 'Test'], loc='upper left')
- plt.show()
- def embedding_mywords(datas,shape):
- '''
- @summary: look up the word vector for each token
- @param:
- datas: list of token lists
- shape: shape of the result
- @return: array, the word embeddings in the requested shape
- '''
- model_w2v = getModel_w2v()
- embed = np.zeros(shape)
- length = shape[1]
- out_index = 0
- #print(datas)
- for data in datas:
- index = 0
- for item in data:
- item_not_space = re.sub(r"\s*", "", item)
- if index>=length:
- break
- if item_not_space in model_w2v.vocab:
- embed[out_index][index] = model_w2v[item_not_space]
- index += 1
- else:
- embed[out_index][index] = model_w2v['unk']
- index += 1
- out_index += 1
- return embed
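- # save_model(): rebuild getModel2(), load the trained weights and export them as a TensorFlow 1.x
- # SavedModel via tf.saved_model.simple_save for serving.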
- def save_model():
- graph = tf.Graph()
- with graph.as_default() as graph:
- with tf.Session(graph=graph).as_default() as sess:
- test_model = getModel2()
- test_model.load_weights("model_time_classify.weights")
- tf.saved_model.simple_save(sess,
- "models/timesplit_model2/",
- inputs={"input0": test_model.input[0],
- "input1":test_model.input[1]
- },
- outputs={"outputs": test_model.output})
- if __name__ == '__main__':
- # get_data()
- # getModel()
- # getModel2()
- # getModel3()
- # training()
- # train2()
- # train3()
- # train4()
- # data_process()
- # data_process2()
- # data_process3()
- # predict()
- # predict2()
- # predict3()
- # predict4()
- save_model()
- pass