
"时间分类优化"

znj 3 years ago
parent
commit
efc874fcac

+ 20 - 3
BiddingKG/dl/interface/getAttributes.py

@@ -2071,6 +2071,11 @@ def getOtherAttributes(list_entity):
                   "product":[],
                   "total_tendereeMoney":0,
                   "total_tendereeMoneyUnit":''}
+    dict_time = {
+        "time_release": [],
+        "time_bidopen": [],
+        "time_bidclose": []
+    }
     for entity in list_entity:
         if entity.entity_type == 'bidway':
             dict_other["bidway"] = turnBidWay(entity.entity_text)
@@ -2079,11 +2084,17 @@ def getOtherAttributes(list_entity):
         elif entity.entity_type=='serviceTime':
             dict_other["serviceTime"] = entity.entity_text
         elif entity.entity_type == 'time' and entity.label==1:
-            dict_other["time_release"] = timeFormat(entity.entity_text)
+            if entity.values[entity.label]>0.6:
+                dict_time['time_release'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
+            # dict_other["time_release"] = timeFormat(entity.entity_text)
         elif entity.entity_type == 'time' and entity.label==2:
-            dict_other["time_bidopen"] = timeFormat(entity.entity_text)
+            if entity.values[entity.label]>0.6:
+                dict_time['time_bidopen'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
+            # dict_other["time_bidopen"] = timeFormat(entity.entity_text)
         elif entity.entity_type == 'time' and entity.label == 3:
-            dict_other["time_bidclose"] = timeFormat(entity.entity_text)
+            if entity.values[entity.label]>0.6:
+                dict_time['time_bidclose'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
+            # dict_other["time_bidclose"] = timeFormat(entity.entity_text)
         elif entity.entity_type=="person" and entity.label ==4:
             dict_other["person_review"].append(entity.entity_text)
         elif entity.entity_type=='product':
@@ -2091,6 +2102,12 @@ def getOtherAttributes(list_entity):
         elif entity.entity_type=='money' and entity.notes=='总投资' and dict_other["total_tendereeMoney"]<float(entity.entity_text):
             dict_other["total_tendereeMoney"] = float(entity.entity_text)
             dict_other["total_tendereeMoneyUnit"] = entity.money_unit
+    # Time categories: for each type, keep the highest-probability candidate
+    for time_type, list_time in dict_time.items():
+        if list_time:
+            list_time.sort(key=lambda x:x[1],reverse=True)
+            dict_other[time_type] = list_time[0][0]
     dict_other["product"] = list(set(dict_other["product"]))
     return dict_other
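
Note: the hunk above switches the three time fields from "last entity wins" to a
confidence-based pick. A minimal standalone sketch of the selection rule (the data
values here are made up for illustration):

    # Keep candidates above the 0.6 threshold, then take the most confident
    # one per time type -- this mirrors the loop added above.
    candidates = {
        "time_release": [("2021-03-01", 0.92), ("2021-03-05", 0.71)],
        "time_bidopen": [],
        "time_bidclose": [("2021-03-20", 0.65)],
    }
    result = {}
    for time_type, scored in candidates.items():
        scored = [c for c in scored if c[1] > 0.6]
        if scored:
            result[time_type] = max(scored, key=lambda c: c[1])[0]
    print(result)  # {'time_release': '2021-03-01', 'time_bidclose': '2021-03-20'}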
 

+ 36 - 4
BiddingKG/dl/interface/predictor.py

@@ -1357,7 +1357,7 @@ class TimePredictor():
         self.sess = tf.Session(graph=tf.Graph())
         self.inputs_code = None
         self.outputs_code = None
-        self.input_shape = (2,10,128)
+        self.input_shape = (2,40,128)
         self.load_model()
 
     def load_model(self):
@@ -1385,6 +1385,7 @@ class TimePredictor():
         for list_sentence, list_entity in zip(list_sentences, list_entitys):
             p_entitys = 0
             p_sentences = 0
+            list_sentence.sort(key=lambda x: x.sentence_index)
             while(p_entitys<len(list_entity)):
                 entity = list_entity[p_entitys]
                 if entity.entity_type in ['time']:
@@ -1397,7 +1398,7 @@ class TimePredictor():
                             left = s[0]
                             right = s[1]
                             context = [left, right]
-                            x = embedding(context, shape=self.input_shape)
+                            x = self.embedding_words(context, shape=self.input_shape)
                             data_x.append(x)
                             points_entitys.append(entity)
                             break
@@ -1408,6 +1409,33 @@ class TimePredictor():
         data_x = np.transpose(np.array(data_x), (1, 0, 2, 3))
         return [data_x, points_entitys]
 
+    def embedding_words(self, datas, shape):
+        '''
+        @summary: look up the word vector for each token
+        @param:
+            datas: list of token lists (left and right context)
+            shape: shape of the returned array
+        @return: array of the given shape holding the word embeddings
+        '''
+        model_w2v = getModel_w2v()
+        embed = np.zeros(shape)
+        length = shape[1]
+        out_index = 0
+        for data in datas:
+            index = 0
+            for item in data:
+                item_not_space = re.sub(r"\s*", "", item)
+                if index >= length:
+                    break
+                if item_not_space in model_w2v.vocab:
+                    embed[out_index][index] = model_w2v[item_not_space]
+                    index += 1
+                else:
+                    embed[out_index][index] = model_w2v['unk']
+                    index += 1
+            out_index += 1
+        return embed
+
     def predict(self, list_sentences,list_entitys):
         datas = self.search_time_data(list_sentences, list_entitys)
         if datas is None:
@@ -1422,7 +1450,11 @@ class TimePredictor():
                 values = []
                 for item in predict_y[i]:
                     values.append(item)
-                    entity.set_Role(label, values)
+                if label != 0:
+                    if not timeFormat(entity.entity_text):
+                        label = 0
+                        values[0] = 0.5
+                entity.set_Role(label, values)
 
 # Product field extraction
 class ProductPredictor():
@@ -2232,7 +2264,7 @@ def save_timesplit_model():
 if __name__=="__main__":
     #save_role_model()
     # save_codename_model()
-    save_money_model()
+    # save_money_model()
     #save_person_model()
     #save_form_model()
     #save_codesplit_model()
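
Note: embedding_words truncates each context to shape[1] tokens, strips
whitespace from every token, and looks it up in the word2vec vocabulary, falling
back to the 'unk' vector; unfilled positions stay zero-padded. A self-contained
sketch of that padding behavior, with a toy dict standing in for getModel_w2v()
(the 4-dim vectors are an assumption for illustration):

    import numpy as np

    toy_w2v = {"招标": np.ones(4), "时间": np.full(4, 2.0), "unk": np.full(4, -1.0)}

    def embed_contexts(contexts, shape=(2, 5, 4)):
        # Zero-pad to shape[1] tokens; unknown tokens map to the 'unk' vector.
        out = np.zeros(shape)
        for row, tokens in enumerate(contexts):
            for col, tok in enumerate(tokens[:shape[1]]):
                out[row][col] = toy_w2v.get(tok, toy_w2v["unk"])
        return out

    x = embed_contexts([["招标", "时间"], ["???", "时间"]])
    print(x.shape)   # (2, 5, 4)
    print(x[1][0])   # [-1. -1. -1. -1.] -> unknown token fell back to 'unk'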

BIN
BiddingKG/dl/interface/timesplit_model/saved_model.pb


BIN
BiddingKG/dl/interface/timesplit_model/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/interface/timesplit_model/variables/variables.index


+ 1 - 1
BiddingKG/dl/relation_extraction/model.py

@@ -198,8 +198,8 @@ def position_id(x):
 
 add_dict = load(os.path.dirname(__file__)+'/../relation_extraction/add_words_dict.pkl')
 add_words = ['<unk>','<company/org>','<location>','<phone>','<contact_person>']
-model_w2v = getModel_w2v()
 def get_words_matrix(words):
+    model_w2v = getModel_w2v()
     if words in add_words:
         return add_dict[words]
     else:
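
Note: moving model_w2v = getModel_w2v() from module scope into get_words_matrix
defers loading the word2vec model until the first call, so importing model.py no
longer pays that cost up front. If getModel_w2v() did not already cache the
model internally, a memoized wrapper would keep the per-call overhead
negligible; a sketch (nothing here is from the repo):

    from functools import lru_cache

    @lru_cache(maxsize=1)
    def get_w2v_cached():
        # The expensive load runs only on the first call; later calls reuse it.
        print("loading word2vec ...")
        return {"unk": [0.0] * 4}  # placeholder for the real gensim model

    get_w2v_cached()  # prints "loading word2vec ..."
    get_w2v_cached()  # served from the cache, no second load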

+ 7 - 7
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -51,6 +51,7 @@ codeNamePredict = predictor.CodeNamePredict()
 premPredict = predictor.PREMPredict()
 epcPredict = predictor.EPCPredict()
 roleRulePredict = predictor.RoleRulePredictor()
+timePredictor = predictor.TimePredictor()
 
 #自定义jsonEncoder
 class MyEncoder(json.JSONEncoder):
@@ -91,6 +92,8 @@ def predict(doc_id,text):
     roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
     # print("epcPredict")
     epcPredict.predict(list_sentences,list_entitys)
+
+    timePredictor.predict(list_sentences,list_entitys)
     # print("entityLink")
     entityLink.link_entitys(list_entitys)
     # print("getPREMs")
@@ -98,7 +101,7 @@ def predict(doc_id,text):
     # print("getPREMs")
     print("公司——联系人:", end=' ')
     print(prem[0])
-    print(prem[0]['prem']['Project']['roleList'])
+    # print(prem[0]['prem']['Project']['roleList'])
 
     
     ''''''
@@ -123,7 +126,7 @@ def predict(doc_id,text):
                 print(entity.sentence_index)
             elif entity.entity_type=="time":
                 print("time:",end=" ")
-                print(entity.entity_text)
+                print(entity.entity_text, entity.label, entity.values)
             elif entity.entity_type in ['org','company']:
                 _sentence = list_sentences[0][entity.sentence_index]
                 if entity.pointer_person:
@@ -141,9 +144,6 @@ def predict(doc_id,text):
             #         print('pointer_pack_name:',entity.pointer_pack.entity_text)
             # elif entity.entity_type in ['package']:
             #     print('pack_entity:',entity.entity_text)
-            # elif entity.entity_type=='time':
-            #     print("时间:", end=' ')
-            #     print(entity.entity_text, entity.label, entity.values)
             # print(entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.wordOffset_begin,entity.wordOffset_end)
 
     #print(prem)
@@ -429,9 +429,9 @@ if __name__=="__main__":
 # '''
     a = time.time()
     print("start")
-    print(predict("12",content))
+    # print(predict("12",content))
     # result = predict("12",text)
-    # result = predict("12",content)
+    result = predict("12",content)
     # print(json.loads(result))
     #test("12",text)
     print("takes",time.time()-a)

BIN
BiddingKG/dl/time/model_label_time_classify.model.hdf5


BIN
BiddingKG/dl/time/model_time_classify.weights


BIN
BiddingKG/dl/time/models/timesplit_model/saved_model.pb


BIN
BiddingKG/dl/time/models/timesplit_model/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/time/models/timesplit_model/variables/variables.index


+ 504 - 10
BiddingKG/dl/time/train_2.py

@@ -1,11 +1,13 @@
 import sys
 import os
 sys.path.append(os.path.abspath("../.."))
+# sys.path.append('/data/python_znj/znj/BIDI_ML_INFO_EXTRACTION/')
 import pandas as pd
 import re
 import psycopg2
 from keras.callbacks import ModelCheckpoint
 from keras import layers,models,optimizers,losses
+from keras.layers import *
 from BiddingKG.dl.common.Utils import *
 from BiddingKG.dl.common.models import *
 from sklearn.metrics import classification_report
@@ -13,15 +15,15 @@ from sklearn.utils import shuffle,class_weight
 import matplotlib.pyplot as plt
 
 input_shape = (2,30,60)
-input_shape2 = (2,10,128)
+input_shape2 = (2,40,128)
 output_shape = [4]
 
 def get_data():
-    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
+    data_load = pd.read_csv("newdata_30_prc.csv", index_col=0)
     id_set = set()
     for id in data_load['document_id']:
         id_set.add(id)
-    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.101")
+    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.103")
     sql = "SELECT A.human_identifier,A.sentences,A.tokens,A.offsets_to_text,B.value " \
           "FROM corpus_iedocument A,brat_bratannotation B " \
           "WHERE A.human_identifier = '%s' " \
@@ -47,10 +49,12 @@ def get_data():
     df = pd.concat([df, time_label], axis=1)
     print(df.info())
     df['tokens'] = [token[2:-2].split("', '") for token in df['tokens']]
-    df['sentences'] = [sentence[1:-1].split(", ") for sentence in df['sentences']]
-    df['sentences'] = [[int(s) for s in sentence] for sentence in df['sentences']]
-    df['offsets_to_text'] = [offset[1:-1].split(", ") for offset in df['offsets_to_text']]
-    df['offsets_to_text'] = [[int(o) for o in offset] for offset in df['offsets_to_text']]
+    df['sentences'] = [eval(sentence) for sentence in df['sentences']]
+    # df['sentences'] = [sentence[1:-1].split(", ") for sentence in df['sentences']]
+    # df['sentences'] = [[int(s) for s in sentence] for sentence in df['sentences']]
+    df['offsets_to_text'] = [eval(offset) for offset in df['offsets_to_text']]
+    # df['offsets_to_text'] = [offset[1:-1].split(", ") for offset in df['offsets_to_text']]
+    # df['offsets_to_text'] = [[int(o) for o in offset] for offset in df['offsets_to_text']]
     save(df,'db_time_data.pk')
 
 def getModel():
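
Note: the hunk above replaces the manual "[1:-1].split(...)" parsing of the DB
columns with eval, which handles nested lists correctly but will execute any
code embedded in the data. ast.literal_eval parses the same Python literals
safely; a sketch:

    import ast

    raw = "[0, 35, 72]"           # a 'sentences' offset list as stored in the DB
    print(ast.literal_eval(raw))  # [0, 35, 72] -- same as eval(), no code execution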
@@ -78,6 +82,163 @@ def getModel():
     model.summary()
     return model
 
+def getModel2():
+    '''
+    @summary: time classification model
+    '''
+    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(L_input)
+    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
+
+    L_input_drop = Dropout(0.2)(L_input)
+    R_input_drop = Dropout(0.2)(R_input)
+    # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
+    L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
+    L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
+    # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
+    R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop,R_mask])
+    R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
+    concat = layers.merge([L_att, R_att], mode='concat')
+    concat = Dropout(0.3)(concat)
+    output = layers.Dense(output_shape[0],activation="softmax")(concat)
+
+    model = models.Model(inputs=[L_input,R_input], outputs=output)
+
+    learn_rate = 0.00005
+    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
+                  loss=losses.binary_crossentropy,
+                  metrics=[precision,recall,f1_score])
+    model.summary()
+    return model
+
+def getModel3():
+    '''
+    @summary: time classification model (variant: single attention over the concatenated contexts)
+    '''
+    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(L_input)
+    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
+
+    L_input_drop = Dropout(0.2)(L_input)
+    R_input_drop = Dropout(0.2)(R_input)
+    # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
+    L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
+    # L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
+    # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
+    R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop,R_mask])
+    concat = layers.merge([L_lstm,R_lstm], mode='concat',concat_axis=1)
+    concat_mask = layers.merge([L_mask,R_mask], mode='concat',concat_axis=1)
+    att = Attention02()(concat,mask=K.squeeze(concat_mask,axis=-1))
+    # R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
+    # concat = layers.merge([L_att, R_att], mode='concat')
+    att = Dropout(0.3)(att)
+    output = layers.Dense(output_shape[0],activation="softmax")(att)
+
+    model = models.Model(inputs=[L_input,R_input], outputs=output)
+
+    learn_rate = 0.0001
+    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
+                  loss=losses.binary_crossentropy,
+                  metrics=[precision,recall,f1_score])
+    model.summary()
+    return model
+
+class Attention02(Layer):
+    def __init__(self, **kwargs):
+        self.init = initializers.get('normal')
+        self.supports_masking = True
+        self.attention_dim = 50
+        super(Attention02, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        assert len(input_shape) == 3
+        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))  # project to attention_dim so bias_add matches self.b
+        self.b = K.variable(self.init((self.attention_dim,)))
+        self.u = K.variable(self.init((self.attention_dim, 1)))
+        self.trainable_weights = [self.W, self.b, self.u]
+        super(Attention02, self).build(input_shape)
+
+    def compute_mask(self, inputs, mask=None):
+        return mask
+
+    def call(self, x, mask=None):
+        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
+        ait = K.dot(uit, self.u)
+        ait = K.squeeze(ait, -1)
+        ait = K.exp(ait)
+
+        if mask is not None:
+            ait = ait * K.cast(mask, K.floatx())
+            # ait = ait * mask
+
+        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
+        ait = K.expand_dims(ait)
+        weighted_input = x * ait
+        output = K.sum(weighted_input, axis=1)
+        return output
+
+    def compute_output_shape(self, input_shape):
+        return (input_shape[0], input_shape[-1])
+
+class OurLayer(Layer):
+    """定义新的Layer,增加reuse方法,允许在定义Layer时调用现成的层
+    """
+    def reuse(self, layer, *args, **kwargs):
+        if not layer.built:
+            if len(args) > 0:
+                inputs = args[0]
+            else:
+                inputs = kwargs['inputs']
+            if isinstance(inputs, list):
+                input_shape = [K.int_shape(x) for x in inputs]
+            else:
+                input_shape = K.int_shape(inputs)
+            layer.build(input_shape)
+        outputs = layer.call(*args, **kwargs)
+        for w in layer.trainable_weights:
+            if w not in self._trainable_weights:
+                self._trainable_weights.append(w)
+        for w in layer.non_trainable_weights:
+            if w not in self._non_trainable_weights:
+                self._non_trainable_weights.append(w)
+        for u in layer.updates:
+            if not hasattr(self, '_updates'):
+                self._updates = []
+            if u not in self._updates:
+                self._updates.append(u)
+        return outputs
+class OurBidirectional(OurLayer):
+    """自己封装双向RNN,允许传入mask,保证对齐
+    """
+    def __init__(self, layer, **args):
+        super(OurBidirectional, self).__init__(**args)
+        self.forward_layer = layer.__class__.from_config(layer.get_config())
+        self.backward_layer = layer.__class__.from_config(layer.get_config())
+        self.forward_layer.name = 'forward_' + self.forward_layer.name
+        self.backward_layer.name = 'backward_' + self.backward_layer.name
+    def reverse_sequence(self, x, mask):
+        """这里的mask.shape是[batch_size, seq_len, 1]
+        """
+        seq_len = K.round(K.sum(mask, 1)[:, 0])
+        seq_len = K.cast(seq_len, 'int32')
+        return tf.reverse_sequence(x, seq_len, seq_dim=1)
+    def call(self, inputs):
+        x, mask = inputs
+        x_forward = self.reuse(self.forward_layer, x)
+        x_backward = self.reverse_sequence(x, mask)
+        x_backward = self.reuse(self.backward_layer, x_backward)
+        x_backward = self.reverse_sequence(x_backward, mask)
+        x = K.concatenate([x_forward, x_backward], -1)
+        if K.ndim(x) == 3:
+            return x * mask
+        else:
+            return x
+    def compute_output_shape(self, input_shape):
+        return input_shape[0][:-1] + (self.forward_layer.units * 2,)
+
+
 
 def training():
     data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
@@ -215,6 +376,222 @@ def train2():
     res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
     print(res2)
 
+def train3():
+    # data_load = pd.read_excel("tokens_tolabel_data1.xlsx", index_col=0)
+    data_load = pd.read_excel("tokens_tolabel_data1_res12.xlsx", index_col=0)
+    # data_load = pd.concat([data_load[data_load['re_label']==0],data_load])
+    # data_load = data_load[data_load['pre_label_prob']>0.97]
+    # data_load = data_load[data_load['is_same']==1]
+    data_zero = pd.read_excel("tokens_label0_data1.xlsx")
+    # data_old = pd.read_excel("tokens_data_02.xlsx")
+    data_old = pd.read_excel("tokens_data_02_res6.xlsx")
+    data_zero = data_zero[(data_zero['label']!=0)|(data_zero['is_same']==2)]
+    # data_zero = pd.concat([data_zero,data_zero])
+    # data_zero = pd.concat([data_zero[(data_zero['label']!=0)|(data_zero['is_same']==2)],data_zero.sample(n=3000)])
+    # data_zero = data_zero.sample(n=80000)
+    print("输入shape:",input_shape2)
+    data_x = []
+    data_y = []
+    for left, right, label,_label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label'], data_load['label']):
+        if label==_label:
+            y = np.zeros(output_shape)
+            y[label] = 1
+            left = eval(left)
+            left = left[-40:]
+            right = eval(right)
+            right = right[:40]
+            context = [left, right]
+            # x = embedding(context, shape=input_shape2)
+            data_x.append(context)
+            data_y.append(y)
+    data_load2 = data_load[data_load['re_label']==0]
+    for left, right, label,_label in zip(data_load2['context_left'], data_load2['context_right'], data_load2['re_label'], data_load2['label']):
+            if label==_label:
+                y = np.zeros(output_shape)
+                y[label] = 1
+                left = eval(left)
+                left = left[-40:]
+                if len(left)>30:
+                    left = left[2:]
+                elif len(left)>15:
+                    left = left[1:]
+                right = eval(right)
+                right = right[:40]
+                if len(right)>15:
+                    right = right[:-1]
+                context = [left, right]
+                # x = embedding(context, shape=input_shape2)
+                data_x.append(context)
+                data_y.append(y)
+
+    for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
+        y = np.zeros(output_shape)
+        y[label] = 1
+        left = eval(left)
+        left = left[-40:]
+        right = eval(right)
+        right = right[:40]
+        context = [left, right]
+        # x = embedding(context, shape=input_shape2)
+        data_x.append(context)
+        data_y.append(y)
+
+    for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
+            y = np.zeros(output_shape)
+            y[label] = 1
+            left = eval(left)
+            left = left[-40:]
+            if len(left) > 30:
+                left = left[2:]
+            elif len(left) > 15:
+                left = left[1:]
+            right = eval(right)
+            right = right[:40]
+            if len(right) > 15:
+                right = right[:-1]
+            context = [left, right]
+            # x = embedding(context, shape=input_shape2)
+            data_x.append(context)
+            data_y.append(y)
+
+    # for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
+    #         y = np.zeros(output_shape)
+    #         y[label] = 1
+    #         left = eval(left)
+    #         left = left[-40:]
+    #         right = eval(right)
+    #         right = right[:40]
+    #         context = [left, right]
+    #         # x = embedding(context, shape=input_shape2)
+    #         data_x.append(context)
+    #         data_y.append(y)
+
+    _data = list(zip(data_x, data_y))
+    import random
+    random.shuffle(_data)
+    data_x = [i[0] for i in _data]
+    data_y = [i[1] for i in _data]
+    test_len = int(len(data_x) * 0.13)
+    test_x = data_x[:test_len]
+    test_y = data_y[:test_len]
+    print("测试数据量:", len(test_x))
+    train_x = data_x[test_len:]
+    train_y = data_y[test_len:]
+
+    for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
+            y = np.zeros(output_shape)
+            y[label] = 1
+            left = eval(left)
+            left = left[-40:]
+            right = eval(right)
+            right = right[:40]
+            context = [left, right]
+            # x = embedding(context, shape=input_shape2)
+            train_x.append(context)
+            train_y.append(y)
+    print("训练数据量:", len(train_x))
+
+    # train_y, test_y = np.array(train_y), np.array(test_y)
+    # train_x = np.array(train_x)
+    # test_x = np.array(test_x)
+    # test_x = np.transpose(test_x, (1, 0, 2, 3))
+    # train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
+    training_generator = DataGenerator(train_x, train_y)
+    # training_generator = DataGenerator(data_x, data_y)
+    validation_generator = DataGenerator(test_x, test_y)
+
+    # model = getModel3()
+    model = getModel2()
+    epochs = 100
+    # batch_size = 256
+    checkpoint = ModelCheckpoint("model_time_classify.weights",save_weights_only=True, monitor="val_loss", verbose=1,
+                                 save_best_only=True, mode='min')
+    # checkpoint = ModelCheckpoint("model_time_classify2.weights",save_weights_only=True, monitor="loss", verbose=1,
+    #                                  save_best_only=True, mode='min')
+
+    history = model.fit_generator(
+        generator=training_generator,
+        validation_data=validation_generator,
+        use_multiprocessing=True, workers=2,
+        epochs=epochs,
+        shuffle=True,
+        callbacks=[checkpoint],
+        class_weight='auto'
+    )
+    # plot_loss(history=history)
+    # load_model = models.load_model("model_label_time_classify.model.hdf5",
+    #                                custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
+    # y_pre = load_model.predict([test_x[0], test_x[1]])
+    # # y_pre = load_model.predict(test_x[0])
+    # # 各类别预测评估
+    # res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
+    # print(res1)
+    # y_pre2 = load_model.predict([train_x[0], train_x[1]])
+    # # y_pre2 = load_model.predict(train_x[0])
+    # res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
+    # print(res2)
+from keras.utils import Sequence,to_categorical
+class DataGenerator(Sequence):
+    'Generates data for Keras'
+    def __init__(self, texts, labels, batch_size=256,
+                 n_classes=4, shuffle=True):
+        'Initialization'
+        # self.dim = dim
+        self.batch_size = batch_size
+        self.labels = labels
+        self.texts = texts
+        self.n_classes = n_classes
+        self.shuffle = shuffle
+        self.on_epoch_end()
+
+    def __len__(self):
+        'Denotes the number of batches per epoch'
+        _len = len(self.texts) // self.batch_size
+        if len(self.texts) % self.batch_size != 0:
+            _len += 1
+        return _len
+
+    def __getitem__(self, index):
+        'Generate one batch of data'
+        # Generate indexes of the batch
+        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
+
+        # Find list of IDs
+        list_texts = [self.texts[k] for k in indexes]
+        _label = [self.labels[k] for k in indexes]
+        # Generate data
+        X, y = self.__data_generation(list_texts,_label)
+
+        return X, y
+
+    def on_epoch_end(self):
+        'Updates indexes after each epoch'
+        self.indexes = np.arange(len(self.texts))
+        if self.shuffle == True:
+            np.random.shuffle(self.indexes)
+
+    def __data_generation(self, list_texts,_label):
+        'Generates data containing batch_size samples'
+        # Initialization
+        # X = np.empty((self.batch_size, *self.dim))
+        # y = np.empty((self.batch_size), dtype=int)
+        # batch_len = len(list_texts)
+        # x = np.empty((batch_len, *self.dim))
+        x = []
+        # y = np.empty((batch_len), dtype=int)
+
+        # Generate data
+        for i, context in enumerate(list_texts):
+            # Store sample
+            # tokens = preprocess2(text)
+            # tokens = tokens[:maxlen]
+            words_matrix = embedding_mywords(context, shape=input_shape2)
+            # Store class
+            # y[i] = _label[i]
+            x.append(words_matrix)
+        x = np.array(x)
+        x = np.transpose(x, (1, 0, 2, 3))
+        return [x[0],x[1]], np.array(_label)
 
 def predict2():
     model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
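
Note: DataGenerator defers embedding to batch time (embedding_mywords runs
inside __data_generation), so only raw token lists are held in memory, and each
batch is transposed from (batch, side, seq, emb) into the [left, right] pair the
two-input model expects. A minimal sketch of that reshaping step:

    import numpy as np

    batch = np.zeros((3, 2, 40, 128))          # 3 samples, each a (left, right) pair
    batch = np.transpose(batch, (1, 0, 2, 3))  # -> (side, batch, seq, emb)
    left, right = batch[0], batch[1]
    print(left.shape, right.shape)             # (3, 40, 128) (3, 40, 128)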
@@ -237,6 +614,73 @@ def predict2():
     # print(error_data.info())
     error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")
 
+def predict3():
+    data = pd.read_csv("new_tokens_data1.csv", chunksize=5000)
+    model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
+    new_data = pd.DataFrame()
+    idx = 0
+    for _data in data:
+
+        test_x = []
+        test_y = []
+        for left, right, label in zip(_data['context_left'], _data['context_right'], _data['label']):
+            left = eval(left)
+            left = left[-10:]
+            right = eval(right)
+            right = right[:10]
+            label = int(label)
+            y = np.zeros(output_shape)
+            y[label] = 1
+            context = [left, right]
+            x = embedding(context, shape=input_shape2)
+            test_x.append(x)
+            test_y.append(y)
+        test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
+        pre_y = model1.predict([test_x[0], test_x[1]])
+        _data['pre'] = [np.argmax(item) for item in pre_y]
+        _data['is_same'] = [1 if int(_label)==_pre else 0 for _label,_pre in zip(_data['label'],_data['pre'])]
+        # data['label'] = label
+        new_data = pd.concat([new_data, _data])
+        idx += 5000
+        print(idx)
+    # data.to_csv("new_tokens_data1.csv")
+    new_data.to_excel("new_tokens_data1_res.xlsx")
+
+def predict4():
+    data = pd.read_csv("tokens_tolabel_data1_res11.csv", chunksize=3000)
+    model1 = getModel2()
+    model1.load_weights("model_time_classify.weights")
+    new_data = pd.DataFrame()
+    idx = 0
+    for _data in data:
+        test_x = []
+        test_y = []
+        for left, right, label in zip(_data['context_left'], _data['context_right'], _data['re_label']):
+            left = eval(left)
+            left = left[-40:]
+            right = eval(right)
+            right = right[:40]
+            label = int(label)
+            y = np.zeros(output_shape)
+            y[label] = 1
+            context = [left, right]
+            x = embedding_mywords(context, shape=input_shape2)
+            test_x.append(x)
+            test_y.append(y)
+        test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
+        pre_y = model1.predict([test_x[0], test_x[1]])
+        _data['pre_label'] = [np.argmax(item) for item in pre_y]
+        _data['pre_label_prob'] = [max(item) for item in pre_y]
+        _data['is_same'] = [1 if int(_label)==_pre else 0 for _label,_pre in zip(_data['re_label'],_data['pre_label'])]
+        # _data['is_same'] = [1 if int(_re)==int(_pre) and int(_re)==int(_label) else 0 for _label,_re,_pre in zip(_data['label'],_data['re_label'],_data['pre_label'])]
+        # data['label'] = label
+        new_data = pd.concat([new_data, _data])
+        idx += 3000
+        print(idx)
+    # data.to_csv("new_tokens_data1.csv")
+    new_data.to_excel("tokens_tolabel_data1_res12.xlsx")
+
+
 def predict():
     model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
     data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
@@ -313,7 +757,7 @@ def data_process3():
     token_end = []
     context_left = []
     context_right = []
-    data2 = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc2.csv")
+    data2 = pd.read_csv("newdata_30_prc2.csv")
     label = []
     # data=data[:20]
     for id,sentences,tokens,offset,begin,end,entity_text in zip(data['document_id'],data['sentences'],data['tokens'],data['offsets_to_text'],
@@ -343,7 +787,7 @@ def data_process3():
                 break
         token_begin.append(entity_tbegin)
         token_end.append(entity_tend)
-        s = spanWindow(tokens=tokens,begin_index=entity_tbegin,end_index=entity_tend,size=10)
+        s = spanWindow(tokens=tokens,begin_index=entity_tbegin,end_index=entity_tend-1,size=40)
         s1 = s[0]
         _temp1 = []
         for i in range(len(s1)):
@@ -372,7 +816,8 @@ def data_process3():
     data['context_right'] = context_right
     data['label'] = label
     data = data.drop(['tokens','offsets_to_text','sentences'],axis=1)
-    data.to_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv")
+    # data.to_csv("tokens_data_02.csv")
+    data.to_excel("tokens_data_02.xlsx")
 
 def plot_loss(history):
     plt.plot(history.history['loss'])
@@ -383,15 +828,64 @@ def plot_loss(history):
     plt.legend(['Train', 'Test'], loc='upper left')
     plt.show()
 
+def embedding_mywords(datas,shape):
+    '''
+    @summary: look up the word vector for each token
+    @param:
+        datas: list of token lists (left and right context)
+        shape: shape of the returned array
+    @return: array of the given shape holding the word embeddings
+    '''
+    model_w2v = getModel_w2v()
+    embed = np.zeros(shape)
+    length = shape[1]
+    out_index = 0
+    #print(datas)
+    for data in datas:
+        index = 0
+        for item in data:
+            item_not_space = re.sub(r"\s*", "", item)
+            if index>=length:
+                break
+            if item_not_space in model_w2v.vocab:
+                embed[out_index][index] = model_w2v[item_not_space]
+                index += 1
+            else:
+                embed[out_index][index] = model_w2v['unk']
+                index += 1
+        out_index += 1
+    return embed
+
+def save_model():
+    graph = tf.Graph()
+    with graph.as_default() as graph:
+        with tf.Session(graph=graph).as_default() as sess:
+            test_model = getModel2()
+            test_model.load_weights("model_time_classify.weights")
+            tf.saved_model.simple_save(sess,
+                                       "models/timesplit_model/",
+                                       inputs={"input0": test_model.input[0],
+                                               "input1":test_model.input[1]
+                                               },
+                                       outputs={"outputs": test_model.output})
+
+
+
 if __name__ == '__main__':
     # get_data()
     # getModel()
+    # getModel2()
+    # getModel3()
     # training()
     # train2()
+    # train3()
     # data_process()
     # data_process2()
     # data_process3()
     # predict()
     # predict2()
+    # predict3()
+    # predict4()
+    save_model()
 
     pass
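
Note: save_model() exports the Keras weights as a TF1 SavedModel via
tf.saved_model.simple_save, which is what the timesplit_model directories
updated in this commit contain. A loading sketch (the "serving_default"
signature key and the "input0"/"input1"/"outputs" names follow from the
simple_save call above; the zero inputs are placeholders):

    import numpy as np
    import tensorflow as tf

    with tf.Session(graph=tf.Graph()) as sess:
        meta = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING], "models/timesplit_model/")
        sig = meta.signature_def["serving_default"]
        feed = {
            sess.graph.get_tensor_by_name(sig.inputs["input0"].name): np.zeros((1, 40, 128)),
            sess.graph.get_tensor_by_name(sig.inputs["input1"].name): np.zeros((1, 40, 128)),
        }
        out = sess.run(sess.graph.get_tensor_by_name(sig.outputs["outputs"].name), feed)
        print(out.shape)  # (1, 4) -> softmax over the four time classes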