import os
import sys
# sys.path.append('/data/python/znj/BIDI_ML_INFO_EXTRACTION/')
import re
import numpy as np
import tensorflow as tf
from BiddingKG.dl.common.models import *  # also provides the precision/recall/f1_score metrics used below
from keras.layers import *
from keras.models import Model
from keras.utils import Sequence, to_categorical
import keras.backend as K
from keras import initializers, losses, optimizers
from keras.callbacks import ModelCheckpoint
import pandas as pd
from sklearn.metrics import classification_report

maxlen = 512       # maximum token sequence length fed to the model
words_size = 128   # word2vec embedding dimension
# batch_size = 64
class Attention(Layer):
    """Multi-head attention.
    """
    def __init__(self, nb_head, size_per_head, **kwargs):
        self.nb_head = nb_head
        self.size_per_head = size_per_head
        self.out_dim = nb_head * size_per_head
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        super(Attention, self).build(input_shape)
        q_in_dim = input_shape[0][-1]
        k_in_dim = input_shape[1][-1]
        v_in_dim = input_shape[2][-1]
        self.q_kernel = self.add_weight(name='q_kernel',
                                        shape=(q_in_dim, self.out_dim),
                                        initializer='glorot_normal')
        self.k_kernel = self.add_weight(name='k_kernel',
                                        shape=(k_in_dim, self.out_dim),
                                        initializer='glorot_normal')
        self.v_kernel = self.add_weight(name='v_kernel',  # was misnamed 'w_kernel'
                                        shape=(v_in_dim, self.out_dim),
                                        initializer='glorot_normal')

    def mask(self, x, mask, mode='mul'):
        """Multiplicative masking zeroes padded positions; additive masking pushes
        them to -1e10 so softmax assigns them ~0 probability."""
        if mask is None:
            return x
        else:
            for _ in range(K.ndim(x) - K.ndim(mask)):
                mask = K.expand_dims(mask, K.ndim(mask))
            if mode == 'mul':
                return x * mask
            else:
                return x - (1 - mask) * 1e10

    def call(self, inputs):
        q, k, v = inputs[:3]
        v_mask, q_mask = None, None
        if len(inputs) > 3:
            v_mask = inputs[3]
            if len(inputs) > 4:
                q_mask = inputs[4]
        # Linear projections
        qw = K.dot(q, self.q_kernel)
        kw = K.dot(k, self.k_kernel)
        vw = K.dot(v, self.v_kernel)
        # Reshape to (batch, seq_len, nb_head, size_per_head)
        qw = K.reshape(qw, (-1, K.shape(qw)[1], self.nb_head, self.size_per_head))
        kw = K.reshape(kw, (-1, K.shape(kw)[1], self.nb_head, self.size_per_head))
        vw = K.reshape(vw, (-1, K.shape(vw)[1], self.nb_head, self.size_per_head))
        # Transpose to (batch, nb_head, seq_len, size_per_head)
        qw = K.permute_dimensions(qw, (0, 2, 1, 3))
        kw = K.permute_dimensions(kw, (0, 2, 1, 3))
        vw = K.permute_dimensions(vw, (0, 2, 1, 3))
        # Scaled dot-product attention
        a = K.batch_dot(qw, kw, [3, 3]) / self.size_per_head**0.5
        a = K.permute_dimensions(a, (0, 3, 2, 1))
        a = self.mask(a, v_mask, 'add')
        a = K.permute_dimensions(a, (0, 3, 2, 1))
        a = K.softmax(a)
        # Combine heads into the output
        o = K.batch_dot(a, vw, [3, 2])
        o = K.permute_dimensions(o, (0, 2, 1, 3))
        o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
        o = self.mask(o, q_mask, 'mul')
        return o

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], self.out_dim)
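
# Usage sketch (illustrative): self-attention over a masked sequence, as done in
# classify_model() below. `t` is (batch, seq_len, dim) and `mask` is
# (batch, seq_len, 1) with 1s on real tokens:
#   h = Attention(8, 16)([t, t, t, mask])   # -> (batch, seq_len, 8 * 16)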
class OurLayer(Layer):
    """Custom base Layer with a `reuse` method, so off-the-shelf layers can be
    called inside a custom layer while their weights are still tracked.
    """
    def reuse(self, layer, *args, **kwargs):
        if not layer.built:
            if len(args) > 0:
                inputs = args[0]
            else:
                inputs = kwargs['inputs']
            if isinstance(inputs, list):
                input_shape = [K.int_shape(x) for x in inputs]
            else:
                input_shape = K.int_shape(inputs)
            layer.build(input_shape)
        outputs = layer.call(*args, **kwargs)
        # Register the inner layer's weights/updates on this layer so training sees them
        for w in layer.trainable_weights:
            if w not in self._trainable_weights:
                self._trainable_weights.append(w)
        for w in layer.non_trainable_weights:
            if w not in self._non_trainable_weights:
                self._non_trainable_weights.append(w)
        for u in layer.updates:
            if not hasattr(self, '_updates'):
                self._updates = []
            if u not in self._updates:
                self._updates.append(u)
        return outputs
class OurBidirectional(OurLayer):
    """Hand-rolled bidirectional RNN wrapper that takes an explicit mask, so the
    backward pass reverses only the real (unpadded) part of each sequence.
    """
    def __init__(self, layer, **args):
        super(OurBidirectional, self).__init__(**args)
        self.forward_layer = layer.__class__.from_config(layer.get_config())
        self.backward_layer = layer.__class__.from_config(layer.get_config())
        self.forward_layer.name = 'forward_' + self.forward_layer.name
        self.backward_layer.name = 'backward_' + self.backward_layer.name

    def reverse_sequence(self, x, mask):
        """mask.shape is [batch_size, seq_len, 1]."""
        seq_len = K.round(K.sum(mask, 1)[:, 0])
        seq_len = K.cast(seq_len, 'int32')
        return tf.reverse_sequence(x, seq_len, seq_dim=1)

    def call(self, inputs):
        x, mask = inputs
        x_forward = self.reuse(self.forward_layer, x)
        x_backward = self.reverse_sequence(x, mask)
        x_backward = self.reuse(self.backward_layer, x_backward)
        x_backward = self.reverse_sequence(x_backward, mask)
        x = K.concatenate([x_forward, x_backward], -1)
        if K.ndim(x) == 3:
            return x * mask
        else:
            return x

    def compute_output_shape(self, input_shape):
        return input_shape[0][:-1] + (self.forward_layer.units * 2,)
def classify_model():
    num_classes = 6
    embed_input = Input(shape=(None, words_size))
    # 1 where the embedding row is non-zero (real token), 0 on padding
    mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x, axis=-1, keepdims=True), 0), 'float32'))(embed_input)
    # mask = Lambda(lambda x: K.cast(K.not_equal(x, np.zeros(words_size, dtype=float)), 'float32'))(embed_input)
    # test_model = Model([embed_input], mask)
    input_drop = Dropout(0.25)(embed_input)
    t = OurBidirectional(GRU(64, return_sequences=True))([input_drop, mask])
    h = Attention(8, 16)([t, t, t, mask])
    h = Concatenate()([t, h])
    # avg = layers.GlobalAveragePooling1D()(h)
    # output = Dense(num_classes, activation='softmax')(avg)
    h = Lambda(lambda x: x[0] * x[1])([h, mask])
    h_dim = K.int_shape(h)[-1]
    # Padded timesteps are all-zero after the multiplication above, so a scalar
    # mask_value of 0 lets the downstream GRU skip them
    h = Masking(mask_value=0., input_shape=(maxlen, h_dim))(h)
    h = Dropout(0.25)(h)
    gru_output = Bidirectional(GRU(128))(h)
    output = Dense(num_classes, activation='softmax')(gru_output)
    # Alternative head kept from earlier experiments:
    # h = Dropout(0.25)(h)
    # atten = Attention02()(h, mask=K.squeeze(mask, axis=-1))
    # output = Dense(num_classes, activation='softmax')(atten)
    model = Model([embed_input], output)
    model.summary()
    learn_rate = 0.0002
    # Note: binary_crossentropy is kept from the original; with a 6-way softmax,
    # categorical_crossentropy would be the conventional choice.
    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                  loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    return model
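
# Usage sketch (illustrative): build the model and run a dummy batch to
# sanity-check shapes.
#   m = classify_model()
#   dummy = np.ones((2, maxlen, words_size), dtype='float32')
#   print(m.predict([dummy]).shape)   # expected: (2, 6)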
from BiddingKG.dl.common.nerUtils import getTokens
import jieba

def preprocess(text):
    """Normalize whitespace/filler characters, truncate, then tokenize sentence by sentence."""
    text = re.sub(r"\n+", ',', text)
    text = re.sub(r"\s+|?+", '', text)
    text = re.sub(r"[\.·_]{2,}", ',', text)
    text = re.sub("_", '', text)
    text = text[:2500]
    sentences = text.split("。")
    sentences = [s + "。" for s in sentences if s]
    if not sentences:
        return []
    tokens = getTokens(sentences)
    new_tokens = []
    for t in tokens:
        new_tokens.extend(t)
    return new_tokens

def preprocess2(text):
    """Same normalization as preprocess(), but tokenize the whole text with jieba."""
    text = re.sub(r"\n+", ',', text)
    text = re.sub(r"\s+|?+", '', text)
    text = re.sub(r"[\.·_]{2,}", ',', text)
    text = re.sub("_", '', text)
    text = text[:2500]
    tokens = list(jieba.cut(text))
    return tokens

from BiddingKG.dl.common.Utils import getModel_w2v
model_w2v = getModel_w2v()

def get_words_matrix(words):
    """Look up the word2vec vector for a token, falling back to the 'unk' vector."""
    if words in model_w2v.vocab:
        return model_w2v[words]
    else:
        return model_w2v['unk']
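
# Helper reconstructing the disabled per-row vectorization branch inside
# data_generate(): since data_generate() now returns raw text, callers such as
# train_1() need this to build model inputs. Shapes follow the original loop;
# the function name itself is new.
def texts_to_arrays(texts, labels, n_classes=6):
    """Vectorize raw texts into (N, maxlen, words_size) and labels into one-hot (N, n_classes)."""
    x = np.zeros((len(texts), maxlen, words_size))
    y = np.zeros((len(texts), n_classes))
    for i, (text, label) in enumerate(zip(texts, labels)):
        tokens = preprocess2(str(text))[:maxlen]
        for j, token in enumerate(tokens):
            x[i, j] = np.array(get_words_matrix(token))
        y[i, int(label)] = 1  # one-hot label
    return x, y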
def data_generate():
    train_x = []
    train_y = []
    train_text = []
    # # Dataset 1 (earlier labeling rounds, disabled)
    # attachmentcon_list = []
    # re_label_list = []
    # data = pd.read_excel("attachment_data_relabel01.xlsx")
    # data = data[data['re_label'] != 6]
    # attachmentcon_list.extend([i for i in data['attachmentcon']])
    # re_label_list.extend([i for i in data['re_label']])
    # # filetitle_工程量清单标注.xlsx
    # data2 = pd.read_excel("filetitle_3.xlsx")
    # data2 = data2[:1887]
    # attachmentcon_list.extend([i for i in data2['attachmentcon']])
    # re_label_list.extend([i for i in data2['re_label']])
    # # filetitle_评标办法.xlsx
    # data3 = pd.read_excel("filetitle_5.xlsx")
    # attachmentcon_list.extend([i for i in data3['attachmentcon']])
    # re_label_list.extend([i for i in data3['re_label']])
    # # filetitle_限价(控制价).xlsx
    # data7 = pd.read_excel("filetitle_2.xlsx")
    # attachmentcon_list.extend([i for i in data7['attachmentcon']])
    # re_label_list.extend([i for i in data7['re_label']])
    #
    # data4 = pd.read_excel("attachment_data_pred_label2.xlsx")
    # data4 = data4[(data4['pred_label'] == 2) | (data4['pred_label'] == 5) | (data4['pred_label'] == 6)]
    # attachmentcon_list.extend([i for i in data4['attachmentcon']])
    # re_label_list.extend([i for i in data4['re_label']])
    #
    # data5 = pd.read_excel("attachment_data_nolabel01_test_pred.xlsx")
    # data5 = data5[(data5['pred_label'] == 5) | (data5['pred_label'] == 6)]
    # attachmentcon_list.extend([i for i in data5['attachmentcon']])
    # re_label_list.extend([i for i in data5['re_label']])
    # # filetitle_采购清单.xlsx
    # data6 = pd.read_excel("filetitle_4.xlsx")
    # data6 = data6[:900]
    # for filetitle, attachmentcon, re_label in zip(data6['filetitle'], data6['attachmentcon'], data6['re_label']):
    #     if re_label == 6:
    #         re_label = 4
    #         attachmentcon = filetitle + attachmentcon
    #     attachmentcon_list.append(attachmentcon)
    #     re_label_list.append(re_label)
    #
    # data8 = pd.read_excel("attachment_data_relabel01_test_pred2.xlsx")
    # data8 = data8[(data8['pred_label'] == 5) | (data8['pred_label'] == 6)]
    # attachmentcon_list.extend([i for i in data8['attachmentcon']])
    # re_label_list.extend([i for i in data8['re_label']])
    #
    # for text, label in zip(attachmentcon_list, re_label_list):
    #     text = str(text)
    #     tokens = preprocess2(text)[:maxlen]
    #     train_text.append("".join(tokens))
    #     words_matrix = np.zeros((maxlen, words_size))
    #     for i in range(len(tokens)):
    #         words_matrix[i] = np.array(get_words_matrix(tokens[i]))
    #     train_x.append(words_matrix)
    #     y = np.zeros(6)
    #     y[int(label)] = 1
    #     train_y.append(y)
    # Corrected data
    # data = pd.read_excel("test_pre_result4.xlsx")
    # Corrected data (with the newly added "bid evaluation result" relabeling)
    data = pd.read_excel("test_pre_result5.xlsx")
    for text, label in zip(data['text'], data['re_label']):
        text = str(text)
        # Raw text and integer label only; vectorization happens later, in
        # DataGenerator (train_2) or texts_to_arrays (train_1).
        train_text.append(text)
        train_y.append(label)
        train_x.append(text)
    # 'filetitle_评标办法222.xlsx' (disabled)
    # data2 = pd.read_excel("filetitle_5222.xlsx")
    # data2 = data2[(data2['filetype'] != 'zip') & (data2['filetype'] != 'rar')]
    # for text, label in zip(data2['attachmentcon'], data2['re_label']):
    #     train_y.append(label)
    #     train_x.append(str(text))
    # filetitle_pingbiaojieguo0_pred2: "bid evaluation result" class (disabled)
    # data3 = pd.read_excel("filetitle_pingbiaojieguo0_pred2.xlsx")
    # data3 = data3[data3['re_label'] != 6]
    # for text, label in zip(data3['attachmentcon'], data3['re_label']):
    #     train_y.append(label)
    #     train_x.append(str(text))
    print("Total samples:", len(train_x))
    # (the shuffle/train-test split that used to live here was moved into train_1)
    return train_x, train_y, train_text
def add_data():
    train_x = []
    train_y = []
    # Extra weakly-labeled data: keep only confident predictions (prob > 0.965)
    # plus everything predicted as class 0
    data = pd.read_excel("time_202196_pred3.xlsx")
    data = data[(data['filetype'] != 'zip') & (data['filetype'] != 'rar')]
    data = data[(data['label_prob'] > 0.965) | (data['pred_label'] == 0)]
    for text, label in zip(data['attachmentcon'], data['pred_label']):
        train_y.append(label)
        train_x.append(str(text))
    print("Added samples:", len(train_x))
    return train_x, train_y
def train_1():
    model = classify_model()
    # Load data
    train_x, train_y, train_text = data_generate()
    data_len = len(train_x)
    import random
    random.seed(7)
    train_data = list(zip(train_x, train_y, train_text))
    random.shuffle(train_data)
    train_text = [i[2] for i in train_data]
    # data_generate() returns raw text, so vectorize here (see texts_to_arrays above)
    train_x, train_y = texts_to_arrays([i[0] for i in train_data],
                                       [i[1] for i in train_data])
    # Hold out the first 10% (after shuffling) as the test split
    test_len = int(data_len * 0.1)
    test_x = train_x[:test_len]
    test_y = train_y[:test_len]
    test_text = train_text[:test_len]
    print("Test samples:", len(test_x))
    train_x = train_x[test_len:]
    train_y = train_y[test_len:]
    train_text = train_text[test_len:]
    print("Training samples:", len(train_x))
    epochs = 45
    batch_size = 256
    checkpoint = ModelCheckpoint("model_label_classify3.weights", save_weights_only=True, monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    model.fit(x=[train_x], y=train_y, validation_data=([test_x], test_y),
              epochs=epochs, batch_size=batch_size, shuffle=True, class_weight='auto', callbacks=[checkpoint])
    model.load_weights("model_label_classify3.weights")
    y_pre = model.predict([test_x])
    # Per-class evaluation
    res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    print(res1)
    y_pre2 = model.predict([train_x])
    res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    print(res2)
    result_df = pd.DataFrame({"text": test_text + train_text,
                              'y_label': [np.argmax(i) for i in test_y.tolist() + train_y.tolist()],
                              'pre_label': [np.argmax(i) for i in y_pre.tolist() + y_pre2.tolist()]})
    result_df['is_same'] = [1 if i == j else 0 for i, j in zip(result_df['y_label'], result_df['pre_label'])]
    result_df.to_excel("test_pre_result5.xlsx")
    return model
class DataGenerator(Sequence):
    """Generates vectorized batches for Keras from raw texts."""
    def __init__(self, texts, labels, batch_size=256, dim=(maxlen, words_size),
                 n_classes=6, shuffle=True):
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.texts = texts
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch (last partial batch included)."""
        _len = len(self.texts) // self.batch_size
        if len(self.texts) % self.batch_size != 0:
            _len += 1
        return _len

    def __getitem__(self, index):
        """Generate one batch of data."""
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        list_texts = [self.texts[k] for k in indexes]
        _label = [self.labels[k] for k in indexes]
        X, y = self.__data_generation(list_texts, _label)
        return X, y

    def on_epoch_end(self):
        """Reshuffle sample order after each epoch."""
        self.indexes = np.arange(len(self.texts))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_texts, _label):
        """Tokenize, look up word vectors, and one-hot encode one batch."""
        batch_len = len(list_texts)
        x = np.empty((batch_len, *self.dim))
        y = np.empty((batch_len,), dtype=int)
        for i, text in enumerate(list_texts):
            tokens = preprocess2(text)
            tokens = tokens[:maxlen]
            words_matrix = np.zeros((maxlen, words_size))
            for j in range(len(tokens)):
                words_matrix[j] = np.array(get_words_matrix(tokens[j]))
            x[i] = words_matrix
            y[i] = _label[i]
        return x, to_categorical(y, num_classes=self.n_classes)
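
# Usage sketch (illustrative): wrap raw texts/labels and feed the generator to
# fit_generator, as train_2() does below.
#   gen = DataGenerator(["some text", "another text"], [0, 3], batch_size=2)
#   X, y = gen[0]   # X: (2, maxlen, words_size), y: (2, 6) one-hot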
def train_2():
    model = classify_model()
    # Load data: weakly-labeled pool plus the corrected set
    train_x, train_y = add_data()
    train_x2, train_y2, train_text = data_generate()
    data_len = len(train_x)
    import random
    random.seed(7)
    train_data = list(zip(train_x, train_y))
    random.shuffle(train_data)
    train_x = [i[0] for i in train_data]
    train_y = [i[1] for i in train_data]
    # The test split is drawn from the weakly-labeled pool only
    test_len = int(data_len * 0.1)
    test_x = train_x[:test_len]
    test_y = train_y[:test_len]
    print("Test samples:", len(test_x))
    train_x = train_x[test_len:]
    train_y = train_y[test_len:]
    # Append the corrected data (train_x2/train_y2) to the training split
    train_x.extend(train_x2)
    train_y.extend(train_y2)
    print("Training samples:", len(train_x))
    epochs = 30
    batch_size = 256
    checkpoint = ModelCheckpoint("model_label_classify6.weights", save_weights_only=True, monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    training_generator = DataGenerator(train_x, train_y, batch_size=batch_size)
    validation_generator = DataGenerator(test_x, test_y, batch_size=batch_size)
    model.fit_generator(generator=training_generator, validation_data=validation_generator,
                        use_multiprocessing=True,
                        workers=3,
                        epochs=epochs, shuffle=True, class_weight='auto', callbacks=[checkpoint])
    # (a per-class evaluation block identical to train_1's was disabled here)
    return model
class Attention02(Layer):
    """Single-query attention pooling (Hierarchical-Attention-style); currently unused."""
    def __init__(self, **kwargs):
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = 50
        super(Attention02, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        # W must map input_dim -> attention_dim so that bias_add with b of shape
        # (attention_dim,) is well-formed; the original shape (input_dim, 1) was a bug.
        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
        self.b = K.variable(self.init((self.attention_dim,)))
        self.u = K.variable(self.init((self.attention_dim, 1)))
        self.trainable_weights = [self.W, self.b, self.u]
        super(Attention02, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        return mask

    def call(self, x, mask=None):
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)
        ait = K.exp(ait)
        if mask is not None:
            ait = ait * K.cast(mask, K.floatx())
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)
        weighted_input = x * ait
        output = K.sum(weighted_input, axis=1)
        return output

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])
def predict_one(text, model):
    """Predict the class index and its probability for a single document."""
    text = str(text)
    tokens = preprocess2(text)
    tokens = tokens[:maxlen]
    words_matrix = np.zeros((maxlen, words_size))
    for i in range(len(tokens)):
        words_matrix[i] = np.array(get_words_matrix(tokens[i]))
    y = model.predict([np.array([words_matrix])])
    y_label = np.argmax(y[0])
    prob = y[0][y_label]
    return y_label, prob
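
# Usage sketch (illustrative):
#   m = classify_model()
#   m.load_weights("model_label_classify6.weights")
#   label, prob = predict_one("某招标文件正文……", m)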
def test01():
    # Batch test & predict over a CSV read in chunks
    model = classify_model()
    model.load_weights("model_label_classify6.weights")
    test_batch_size = 2000
    # data = pd.read_csv("attachment_data_relabel01.csv")
    # data = pd.read_csv("time_202196.csv", chunksize=test_batch_size)
    data = pd.read_csv("time_20210923.csv", chunksize=test_batch_size)
    # data = pd.read_csv("filetitle_pingbiaojieguo0.csv", chunksize=test_batch_size)
    classes_dict = {
        0: '其他',          # other
        1: '招标文件',      # bidding documents
        2: '限价(控制价)',  # price limit (control price)
        3: '工程量清单',    # bill of quantities
        4: '采购清单',      # procurement list
        5: '评标办法'       # bid evaluation method
    }
    # data = data[data['new_label'] == 6]
    # print("test_nums", len(data))
    idx = 0
    new_df = pd.DataFrame()
    for df in data:
        train_x = []
        train_text = []
        for text in df['attachmentcon']:
            text = str(text)
            tokens = preprocess2(text)
            tokens = tokens[:maxlen]
            train_text.append("".join(tokens))
            words_matrix = np.zeros((maxlen, words_size))
            for i in range(len(tokens)):
                words_matrix[i] = np.array(get_words_matrix(tokens[i]))
            train_x.append(words_matrix)
        train_x = np.array(train_x)
        y_pre = model.predict([train_x])
        pred_label = [np.argmax(i) for i in y_pre.tolist()]
        label_prob = [y[y_label] for y, y_label in zip(y_pre.tolist(), pred_label)]
        classes = [classes_dict[label] for label in pred_label]
        text_len = [len(text) for text in train_text]
        df['pred_label'] = pred_label
        df['类别'] = classes  # human-readable class-name column
        df['label_prob'] = label_prob
        df['attachmentcon'] = train_text
        df['text_len'] = text_len
        new_df = pd.concat([new_df, df])
        idx += test_batch_size
        print(idx)
    new_df.to_excel("time_20210923_pred.xlsx")
# Bulk labeling
def data_process4():
    model = classify_model()  # classify_model() returns a single model
    model.load_weights("model_label_classify6.weights")
    # data = pd.read_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data02.csv")
    data = pd.read_excel("C:/Users/Administrator/Desktop/attachment_data/time_202196.xlsx")
    print(data.info())
    pred_label = []
    label_prob = []
    idx = 0
    for text in data['attachmentcon']:
        print(idx)
        idx += 1
        y_label, prob = predict_one(text, model)
        pred_label.append(y_label)
        label_prob.append(prob)
    data['pred_label'] = pred_label
    data['label_prob'] = label_prob
    # Truncate the raw HTML column so the output sheet stays manageable
    data['attachmenthtml'] = [i[:4500] for i in data['attachmenthtml']]
    data.to_excel("attachment_data_pred_label3.xlsx")
def save_model():
    graph = tf.Graph()
    with graph.as_default() as graph:
        with tf.Session(graph=graph).as_default() as sess:
            test_model = classify_model()
            test_model.load_weights("model_label_classify6.weights")
            tf.saved_model.simple_save(sess,
                                       "models2/model_attachment_classify/",
                                       inputs={"input0": test_model.input},
                                       outputs={"outputs": test_model.output})
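
# Loading sketch (illustrative, TF1 API): the exported SavedModel can be
# restored for serving with:
#   with tf.Session(graph=tf.Graph()) as sess:
#       tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING],
#                                  "models2/model_attachment_classify/")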
if __name__ == '__main__':
    # model = classify_model()
    test_text = '''招标文件项目编号:SDGP370302202102000110项目名称:淄川经济开发区中心小学校园智能化采购项目采购人:山东淄川经
济开发区管理委员会采购代理机构:淄博正益招标有限公司发出日期:2021年8月目录第一章投标邀请7一、项目基本情况7二、申请人的资格要
求8三、获取招标文件8四、提交投标文件截止时间、开标时间和地点8五、公告期限9六、其他补充事宜9第二章投标人须知11一、总则161.采
购人、采购代理机构及投标人162.资金来源183.投标费用184.适用法律18二、招标文件185.招标文件构成186.招标文件的澄清与修改207.投
标截止时间的顺延20三、投标文件的编制208.编制要求209.投标范围及投标文件中标准和计量单位的使用2110.投标文件构成2211.投标报价241
2.电子版投标文件2513.投标保证金2614.投标有效期2615.投标文件的签署及规定26四、投标文件的递交2616.投标文件的递交2617.递交
投标文件的截止时间2718.投标文件的接收、修改与撤回27五、开标及评标2719.开标2720.资格审查2821.组建评标委员会2922.投标文件符
合性审查与澄清3023.投标偏离3224.投标无效3225.比较和评价3326.废标3527.保密要求36六、确定中标3628.中标候选人的确定原则及标
准3629.确定中标候选人和中标人3630.采购任务取消3631.中标通知书3632.签订合同3633.履约保证金3734.政府采购融资担保3735.预付
款3736.廉洁自律规定3737.人员回避3738.质疑与接收3739.项目其他相关费用3940.合同公示3941.验收4042.履约验收公示4043.招标文
件解释权40第三章货物需求41一、项目概述41
    '''
    # test_text = re.sub('\n', '', test_text)
    # print(preprocess(test_text))
    # Training entry points:
    # model = train_1()
    # model = train_2()
    # Single-document check:
    # tokens = preprocess(test_text)
    # tokens = tokens[:maxlen]
    # words_matrix = np.zeros((maxlen, words_size))
    # for i in range(len(tokens)):
    #     words_matrix[i] = np.array(get_words_matrix(tokens[i]))
    # y = model.predict([np.array([words_matrix])])
    # print('y:', y)
    # y_label = np.argmax(y[0])
    # print('y_label:', y_label, y[0][y_label])
    # test_mask = test_model.predict([np.array([words_matrix])])
    # print('test_mask:', test_mask)
    test01()
    # save_model()
    # print(jieba.lcut("他来到上海交通大学"))
    # data_process4()
    # Scratch snippets from earlier data wrangling, kept for reference:
    # d1 = pd.read_excel("C:/Users/Administrator/Desktop/attachment_data/attachment_data_pred_label3.xlsx")
    # d2 = pd.read_excel("C:/Users/Administrator/Desktop/attachment_data/attachment_data_pred_label2.xlsx")
    # d1 = pd.concat([d1, d2])
    # print(len(d1))
    # d1 = d1[d1['pred_label'] != 0]
    # print(len(d1))
    # d1 = d1[d1['pred_label'] != 1]
    # print(len(d1))
    # data = pd.read_excel("C:/Users/Administrator/Desktop/attachment_data/time_202196_pred2.xlsx")
    # d0 = data[data['pred_label'] == 0][:2000]
    # d1 = data[data['pred_label'] == 1][:2000]
    # d2 = data[data['pred_label'] == 2][:2000]
    # d3 = data[data['pred_label'] == 3][:2000]
    # d4 = data[data['pred_label'] == 4][:2000]
    # d5 = data[data['pred_label'] == 5][:2000]
    # d = pd.concat([d0, d1, d2, d3, d4, d5])
    # d.to_excel("C:/Users/Administrator/Desktop/attachment_data/test_pred.xlsx")
    # d1 = pd.read_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data_relabel01_test_pred2.csv")
    # d1.to_excel("C:/Users/Administrator/Desktop/attachment_data/attachment_data_relabel01_test_pred2.xlsx")
    # d1 = pd.read_excel("C:/Users/Administrator/Desktop/attachment_data/filetitle_评标结果0.xlsx")
    # d1.to_csv("C:/Users/Administrator/Desktop/attachment_data/filetitle_评标结果0.csv")
    # model_w2v = getModel_w2v()
    # print('unk', model_w2v.vocab['unk'])
    # print('unk', model_w2v.similar_by_word('unk'))
    # print('pad', model_w2v.vocab['pad'])
    # print('pad', model_w2v.similar_by_word('pad'))
    pass