
Merge remote-tracking branch 'origin/master'

luojiehua 3 years ago
parent
commit
345114b5ec

+ 1 - 1
.idea/misc.xml

@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5 (py3.5)" project-jdk-type="Python SDK" />
   <component name="PythonCompatibilityInspectionAdvertiser">
     <option name="version" value="3" />
   </component>

+ 1 - 1
BiddingKG.iml

@@ -7,7 +7,7 @@
   </component>
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="Python 3.6" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.5 (py3.5)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
     <orderEntry type="library" exported="" name="Python 3.5 (dl_nlp) interpreter library" level="application" />
   </component>

+ 1 - 1
BiddingKG/dl/common/models.py

@@ -12,7 +12,7 @@ import keras.backend as K
 import tensorflow as tf
 import math
 import six
-layers.Dense
+

 def getTextCNNModel(input_shape,vocab,embedding_weights,classes):
 
 

+ 0 - 1
BiddingKG/dl/interface/Entitys.py

@@ -166,7 +166,6 @@ class Entity():
         # self.person_phone = person_phone
         self.person_phone = []
         self.is_tail = False
-        self.person_phone = person_phone
         self.notes = ''  # added 2021/7/20: stores notes such as the capitalized amount and its unit
         self.money_unit = '' # added 2021/8/17: stores the money unit: 元, 万元 or 亿元
 
 

+ 20 - 3
BiddingKG/dl/interface/getAttributes.py

@@ -2071,6 +2071,11 @@ def getOtherAttributes(list_entity):
                   "product":[],
                   "product":[],
                   "total_tendereeMoney":0,
                   "total_tendereeMoney":0,
                   "total_tendereeMoneyUnit":''}
                   "total_tendereeMoneyUnit":''}
+    dict_time = {
+        "time_release": [],
+        "time_bidopen": [],
+        "time_bidclose": []
+    }
     for entity in list_entity:
     for entity in list_entity:
         if entity.entity_type == 'bidway':
         if entity.entity_type == 'bidway':
             dict_other["bidway"] = turnBidWay(entity.entity_text)
             dict_other["bidway"] = turnBidWay(entity.entity_text)
@@ -2079,11 +2084,17 @@ def getOtherAttributes(list_entity):
         elif entity.entity_type=='serviceTime':
             dict_other["serviceTime"] = entity.entity_text
         elif entity.entity_type == 'time' and entity.label==1:
-            dict_other["time_release"] = timeFormat(entity.entity_text)
+            if entity.values[entity.label]>0.6:
+                dict_time['time_release'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
+            # dict_other["time_release"] = timeFormat(entity.entity_text)
         elif entity.entity_type == 'time' and entity.label==2:
-            dict_other["time_bidopen"] = timeFormat(entity.entity_text)
+            if entity.values[entity.label]>0.6:
+                dict_time['time_bidopen'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
+            # dict_other["time_bidopen"] = timeFormat(entity.entity_text)
         elif entity.entity_type == 'time' and entity.label == 3:
-            dict_other["time_bidclose"] = timeFormat(entity.entity_text)
+            if entity.values[entity.label]>0.6:
+                dict_time['time_bidclose'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
+            # dict_other["time_bidclose"] = timeFormat(entity.entity_text)
         elif entity.entity_type=="person" and entity.label ==4:
             dict_other["person_review"].append(entity.entity_text)
         elif entity.entity_type=='product':
@@ -2091,6 +2102,12 @@ def getOtherAttributes(list_entity):
         elif entity.entity_type=='money' and entity.notes=='总投资' and dict_other["total_tendereeMoney"]<float(entity.entity_text):
             dict_other["total_tendereeMoney"] = float(entity.entity_text)
             dict_other["total_tendereeMoneyUnit"] = entity.money_unit
+    # time categories
+    for time_type,value in dict_time.items():
+        list_time = dict_time[time_type]
+        if list_time:
+            list_time.sort(key=lambda x:x[1],reverse=True)
+            dict_other[time_type] = list_time[0][0]
     dict_other["product"] = list(set(dict_other["product"]))
     return dict_other
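Note on the hunk above: rather than overwriting time_release / time_bidopen / time_bidclose with whichever entity comes last, candidates scoring above 0.6 are collected per category and the highest-confidence one wins. A minimal standalone sketch of that selection step, with made-up values:

    # (formatted_time, confidence) pairs, mirroring dict_time above; values are invented
    dict_time = {
        "time_release": [("2021-07-18", 0.71), ("2021-07-20", 0.92)],
        "time_bidopen": [],
        "time_bidclose": [("2021-08-01", 0.64)],
    }
    dict_other = {}
    for time_type, list_time in dict_time.items():
        if list_time:
            # sort by confidence, descending, and keep the best candidate
            list_time.sort(key=lambda x: x[1], reverse=True)
            dict_other[time_type] = list_time[0][0]
    print(dict_other)  # {'time_release': '2021-07-20', 'time_bidclose': '2021-08-01'}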
 
 

+ 3 - 3
BiddingKG/dl/interface/modelFactory.py

@@ -260,7 +260,7 @@ class Model_relation_extraction():
 
 
     def predict(self,text_in, words, rate=0.5):
         # text_words = text_in
-        R = []
+        triple_list = []
         # _t2 = [self.words2id.get(c, 1) for c in words]
         _t2 = np.zeros((len(words), self.words_size))
         for i in range(len(words)):
@@ -292,8 +292,8 @@ class Model_relation_extraction():
                 for _ooo1, _c1 in zip(*_oo1):
                     _object = text_in[_ooo1]
                     _predicate = self.id2predicate[_c1]
-                    R.append((_subject[0], _predicate, _object))
-            return R
+                    triple_list.append((_subject[0], _predicate, _object))
+            return triple_list
         else:
             return []
 
 

+ 36 - 4
BiddingKG/dl/interface/predictor.py

@@ -1357,7 +1357,7 @@ class TimePredictor():
         self.sess = tf.Session(graph=tf.Graph())
         self.inputs_code = None
         self.outputs_code = None
-        self.input_shape = (2,10,128)
+        self.input_shape = (2,40,128)
         self.load_model()

     def load_model(self):
@@ -1385,6 +1385,7 @@ class TimePredictor():
         for list_sentence, list_entity in zip(list_sentences, list_entitys):
             p_entitys = 0
             p_sentences = 0
+            list_sentence.sort(key=lambda x: x.sentence_index)
             while(p_entitys<len(list_entity)):
                 entity = list_entity[p_entitys]
                 if entity.entity_type in ['time']:
@@ -1397,7 +1398,7 @@ class TimePredictor():
                             left = s[0]
                             right = s[1]
                             context = [left, right]
-                            x = embedding(context, shape=self.input_shape)
+                            x = self.embedding_words(context, shape=self.input_shape)
                             data_x.append(x)
                             points_entitys.append(entity)
                             break
@@ -1408,6 +1409,33 @@ class TimePredictor():
         data_x = np.transpose(np.array(data_x), (1, 0, 2, 3))
         return [data_x, points_entitys]
 
 
+    def embedding_words(self, datas, shape):
+        '''
+        @summary: look up the word vector for each token
+        @param:
+            datas: list of token lists
+            shape: shape of the result
+        @return: array, the word embeddings in the given shape
+        '''
+        model_w2v = getModel_w2v()
+        embed = np.zeros(shape)
+        length = shape[1]
+        out_index = 0
+        for data in datas:
+            index = 0
+            for item in data:
+                item_not_space = re.sub("\s*", "", item)
+                if index >= length:
+                    break
+                if item_not_space in model_w2v.vocab:
+                    embed[out_index][index] = model_w2v[item_not_space]
+                    index += 1
+                else:
+                    embed[out_index][index] = model_w2v['unk']
+                    index += 1
+            out_index += 1
+        return embed
+
     def predict(self, list_sentences,list_entitys):
         datas = self.search_time_data(list_sentences, list_entitys)
         if datas is None:
@@ -1422,7 +1450,11 @@ class TimePredictor():
                 values = []
                 for item in predict_y[i]:
                     values.append(item)
-                    entity.set_Role(label, values)
+                if label != 0:
+                    if not timeFormat(entity.entity_text):
+                        label = 0
+                        values[0] = 0.5
+                entity.set_Role(label, values)

 # product field extraction
 class ProductPredictor():
@@ -2232,7 +2264,7 @@ def save_timesplit_model():
 if __name__=="__main__":
     #save_role_model()
     # save_codename_model()
-    save_money_model()
+    # save_money_model()
     #save_person_model()
     #save_form_model()
     #save_codesplit_model()
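A rough usage sketch for the embedding_words helper added above, assuming getModel_w2v() returns a gensim-style KeyedVectors with 128-dimensional vectors and an 'unk' entry (which is what the code implies): each of the two context windows is embedded token by token into a fixed (2, 40, 128) array, with unknown tokens mapped to 'unk' and unused slots left as zeros.

    # sketch only: assumes the word2vec model and TimePredictor weights are available locally
    predictor = TimePredictor()
    left = ["开标", "时间", "："]             # tokens to the left of the time entity
    right = ["上午", "9", "时", "30", "分"]   # tokens to the right
    x = predictor.embedding_words([left, right], shape=(2, 40, 128))
    print(x.shape)  # (2, 40, 128)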

BIN
BiddingKG/dl/interface/timesplit_model/saved_model.pb

BIN
BiddingKG/dl/interface/timesplit_model/variables/variables.data-00000-of-00001

BIN
BiddingKG/dl/interface/timesplit_model/variables/variables.index


+ 1 - 1
BiddingKG/dl/relation_extraction/model.py

@@ -198,8 +198,8 @@ def position_id(x):
 
 
 add_dict = load(os.path.dirname(__file__)+'/../relation_extraction/add_words_dict.pkl')
 add_words = ['<unk>','<company/org>','<location>','<phone>','<contact_person>']
-model_w2v = getModel_w2v()
 def get_words_matrix(words):
+    model_w2v = getModel_w2v()
     if words in add_words:
         return add_dict[words]
     else:
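The hunk above moves the getModel_w2v() call from module import time into get_words_matrix, so the word2vec model is only loaded when the function is first called. If getModel_w2v() does not cache internally, a memoized wrapper (a sketch, not part of this commit) keeps the lazy-loading benefit without paying the load cost on every call:

    _model_w2v = None  # hypothetical module-level cache

    def get_model_w2v_cached():
        global _model_w2v
        if _model_w2v is None:       # load once, on first use
            _model_w2v = getModel_w2v()
        return _model_w2v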

+ 7 - 7
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -51,6 +51,7 @@ codeNamePredict = predictor.CodeNamePredict()
 premPredict = predictor.PREMPredict()
 epcPredict = predictor.EPCPredict()
 roleRulePredict = predictor.RoleRulePredictor()
+timePredictor = predictor.TimePredictor()

 #custom jsonEncoder
 class MyEncoder(json.JSONEncoder):
@@ -91,6 +92,8 @@ def predict(doc_id,text):
     roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
     # print("epcPredict")
     epcPredict.predict(list_sentences,list_entitys)
+
+    timePredictor.predict(list_sentences,list_entitys)
     # print("entityLink")
     entityLink.link_entitys(list_entitys)
     # print("getPREMs")
@@ -98,7 +101,7 @@ def predict(doc_id,text):
     # print("getPREMs")
     # print("getPREMs")
     print("公司——联系人:", end=' ')
     print("公司——联系人:", end=' ')
     print(prem[0])
     print(prem[0])
-    print(prem[0]['prem']['Project']['roleList'])
+    # print(prem[0]['prem']['Project']['roleList'])
 
 
     
     
     ''''''
     ''''''
@@ -123,7 +126,7 @@ def predict(doc_id,text):
                 print(entity.sentence_index)
             elif entity.entity_type=="time":
                 print("time:",end=" ")
-                print(entity.entity_text)
+                print(entity.entity_text, entity.label, entity.values)
             elif entity.entity_type in ['org','company']:
                 _sentence = list_sentences[0][entity.sentence_index]
                 if entity.pointer_person:
@@ -141,9 +144,6 @@ def predict(doc_id,text):
             #         print('pointer_pack_name:',entity.pointer_pack.entity_text)
             # elif entity.entity_type in ['package']:
             #     print('pack_entity:',entity.entity_text)
-            # elif entity.entity_type=='time':
-            #     print("时间:", end=' ')
-            #     print(entity.entity_text, entity.label, entity.values)
             # print(entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.wordOffset_begin,entity.wordOffset_end)

     #print(prem)
@@ -430,8 +430,8 @@ if __name__=="__main__":
     a = time.time()
     print("start")
     # print(predict("12",content))
-    result = predict("12",text)
-    # result = predict("12",content)
+    # result = predict("12",text)
+    result = predict("12",content)
     # print(json.loads(result))
     #test("12",text)
     print("takes",time.time()-a)

BIN
BiddingKG/dl/time/model_label_time_classify.model.hdf5

BIN
BiddingKG/dl/time/model_time_classify.weights

BIN
BiddingKG/dl/time/models/timesplit_model/saved_model.pb

BIN
BiddingKG/dl/time/models/timesplit_model/variables/variables.data-00000-of-00001

BIN
BiddingKG/dl/time/models/timesplit_model/variables/variables.index


+ 504 - 10
BiddingKG/dl/time/train_2.py

@@ -1,11 +1,13 @@
 import sys
 import os
 sys.path.append(os.path.abspath("../.."))
+# sys.path.append('/data/python_znj/znj/BIDI_ML_INFO_EXTRACTION/')
 import pandas as pd
 import re
 import psycopg2
 from keras.callbacks import ModelCheckpoint
 from keras import layers,models,optimizers,losses
+from keras.layers import *
 from BiddingKG.dl.common.Utils import *
 from BiddingKG.dl.common.models import *
 from sklearn.metrics import classification_report
@@ -13,15 +15,15 @@ from sklearn.utils import shuffle,class_weight
 import matplotlib.pyplot as plt

 input_shape = (2,30,60)
-input_shape2 = (2,10,128)
+input_shape2 = (2,40,128)
 output_shape = [4]

 def get_data():
-    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
+    data_load = pd.read_csv("newdata_30_prc.csv", index_col=0)
     id_set = set()
     for id in data_load['document_id']:
         id_set.add(id)
-    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.101")
+    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.103")
     sql = "SELECT A.human_identifier,A.sentences,A.tokens,A.offsets_to_text,B.value " \
           "FROM corpus_iedocument A,brat_bratannotation B " \
           "WHERE A.human_identifier = '%s' " \
@@ -47,10 +49,12 @@ def get_data():
     df = pd.concat([df, time_label], axis=1)
     print(df.info())
     df['tokens'] = [token[2:-2].split("', '") for token in df['tokens']]
-    df['sentences'] = [sentence[1:-1].split(", ") for sentence in df['sentences']]
-    df['sentences'] = [[int(s) for s in sentence] for sentence in df['sentences']]
-    df['offsets_to_text'] = [offset[1:-1].split(", ") for offset in df['offsets_to_text']]
-    df['offsets_to_text'] = [[int(o) for o in offset] for offset in df['offsets_to_text']]
+    df['sentences'] = [eval(sentence) for sentence in df['sentences']]
+    # df['sentences'] = [sentence[1:-1].split(", ") for sentence in df['sentences']]
+    # df['sentences'] = [[int(s) for s in sentence] for sentence in df['sentences']]
+    df['offsets_to_text'] = [eval(offset) for offset in df['offsets_to_text']]
+    # df['offsets_to_text'] = [offset[1:-1].split(", ") for offset in df['offsets_to_text']]
+    # df['offsets_to_text'] = [[int(o) for o in offset] for offset in df['offsets_to_text']]
     save(df,'db_time_data.pk')

 def getModel():
@@ -78,6 +82,163 @@ def getModel():
     model.summary()
     return model
 
 
+def getModel2():
+    '''
+    @summary: time classification model
+    '''
+    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(L_input)
+    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
+
+    L_input_drop = Dropout(0.2)(L_input)
+    R_input_drop = Dropout(0.2)(R_input)
+    # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
+    L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
+    L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
+    # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
+    R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop,R_mask])
+    R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
+    concat = layers.merge([L_att, R_att], mode='concat')
+    concat = Dropout(0.3)(concat)
+    output = layers.Dense(output_shape[0],activation="softmax")(concat)
+
+    model = models.Model(inputs=[L_input,R_input], outputs=output)
+
+    learn_rate = 0.00005
+    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
+                  loss=losses.binary_crossentropy,
+                  metrics=[precision,recall,f1_score])
+    model.summary()
+    return model
+
+def getModel3():
+    '''
+    @summary: time classification model
+    '''
+    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(L_input)
+    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
+
+    L_input_drop = Dropout(0.2)(L_input)
+    R_input_drop = Dropout(0.2)(R_input)
+    # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
+    L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
+    # L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
+    # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
+    R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop,R_mask])
+    concat = layers.merge([L_lstm,R_lstm], mode='concat',concat_axis=1)
+    concat_mask = layers.merge([L_mask,R_mask], mode='concat',concat_axis=1)
+    att = Attention02()(concat,mask=K.squeeze(concat_mask,axis=-1))
+    # R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
+    # concat = layers.merge([L_att, R_att], mode='concat')
+    att = Dropout(0.3)(att)
+    output = layers.Dense(output_shape[0],activation="softmax")(att)
+
+    model = models.Model(inputs=[L_input,R_input], outputs=output)
+
+    learn_rate = 0.0001
+    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
+                  loss=losses.binary_crossentropy,
+                  metrics=[precision,recall,f1_score])
+    model.summary()
+    return model
+
+class Attention02(Layer):
+    def __init__(self, **kwargs):
+        self.init = initializers.get('normal')
+        self.supports_masking = True
+        self.attention_dim = 50
+        super(Attention02, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        assert len(input_shape) == 3
+        self.W = K.variable(self.init((input_shape[-1], 1)))
+        self.b = K.variable(self.init((self.attention_dim,)))
+        self.u = K.variable(self.init((self.attention_dim, 1)))
+        self.trainable_weights = [self.W, self.b, self.u]
+        super(Attention02, self).build(input_shape)
+
+    def compute_mask(self, inputs, mask=None):
+        return mask
+
+    def call(self, x, mask=None):
+        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
+        ait = K.dot(uit, self.u)
+        ait = K.squeeze(ait, -1)
+        ait = K.exp(ait)
+
+        if mask is not None:
+            ait = ait * K.cast(mask, K.floatx())
+            # ait = ait * mask
+
+        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
+        ait = K.expand_dims(ait)
+        weighted_input = x * ait
+        output = K.sum(weighted_input, axis=1)
+        return output
+
+    def compute_output_shape(self, input_shape):
+        return (input_shape[0], input_shape[-1])
+
+class OurLayer(Layer):
+    """定义新的Layer,增加reuse方法,允许在定义Layer时调用现成的层
+    """
+    def reuse(self, layer, *args, **kwargs):
+        if not layer.built:
+            if len(args) > 0:
+                inputs = args[0]
+            else:
+                inputs = kwargs['inputs']
+            if isinstance(inputs, list):
+                input_shape = [K.int_shape(x) for x in inputs]
+            else:
+                input_shape = K.int_shape(inputs)
+            layer.build(input_shape)
+        outputs = layer.call(*args, **kwargs)
+        for w in layer.trainable_weights:
+            if w not in self._trainable_weights:
+                self._trainable_weights.append(w)
+        for w in layer.non_trainable_weights:
+            if w not in self._non_trainable_weights:
+                self._non_trainable_weights.append(w)
+        for u in layer.updates:
+            if not hasattr(self, '_updates'):
+                self._updates = []
+            if u not in self._updates:
+                self._updates.append(u)
+        return outputs
+class OurBidirectional(OurLayer):
+    """自己封装双向RNN,允许传入mask,保证对齐
+    """
+    def __init__(self, layer, **args):
+        super(OurBidirectional, self).__init__(**args)
+        self.forward_layer = layer.__class__.from_config(layer.get_config())
+        self.backward_layer = layer.__class__.from_config(layer.get_config())
+        self.forward_layer.name = 'forward_' + self.forward_layer.name
+        self.backward_layer.name = 'backward_' + self.backward_layer.name
+    def reverse_sequence(self, x, mask):
+        """这里的mask.shape是[batch_size, seq_len, 1]
+        """
+        seq_len = K.round(K.sum(mask, 1)[:, 0])
+        seq_len = K.cast(seq_len, 'int32')
+        return tf.reverse_sequence(x, seq_len, seq_dim=1)
+    def call(self, inputs):
+        x, mask = inputs
+        x_forward = self.reuse(self.forward_layer, x)
+        x_backward = self.reverse_sequence(x, mask)
+        x_backward = self.reuse(self.backward_layer, x_backward)
+        x_backward = self.reverse_sequence(x_backward, mask)
+        x = K.concatenate([x_forward, x_backward], -1)
+        if K.ndim(x) == 3:
+            return x * mask
+        else:
+            return x
+    def compute_output_shape(self, input_shape):
+        return input_shape[0][:-1] + (self.forward_layer.units * 2,)
+
+
 
 
 def training():
     data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
@@ -215,6 +376,222 @@ def train2():
     res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
     print(res2)
 
 
+def train3():
+    # data_load = pd.read_excel("tokens_tolabel_data1.xlsx", index_col=0)
+    data_load = pd.read_excel("tokens_tolabel_data1_res12.xlsx", index_col=0)
+    # data_load = pd.concat([data_load[data_load['re_label']==0],data_load])
+    # data_load = data_load[data_load['pre_label_prob']>0.97]
+    # data_load = data_load[data_load['is_same']==1]
+    data_zero = pd.read_excel("tokens_label0_data1.xlsx")
+    # data_old = pd.read_excel("tokens_data_02.xlsx")
+    data_old = pd.read_excel("tokens_data_02_res6.xlsx")
+    data_zero = data_zero[(data_zero['label']!=0)|(data_zero['is_same']==2)]
+    # data_zero = pd.concat([data_zero,data_zero])
+    # data_zero = pd.concat([data_zero[(data_zero['label']!=0)|(data_zero['is_same']==2)],data_zero.sample(n=3000)])
+    # data_zero = data_zero.sample(n=80000)
+    print("输入shape:",input_shape2)
+    data_x = []
+    data_y = []
+    for left, right, label,_label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label'], data_load['label']):
+        if label==_label:
+            y = np.zeros(output_shape)
+            y[label] = 1
+            left = eval(left)
+            left = left[-40:]
+            right = eval(right)
+            right = right[:40]
+            context = [left, right]
+            # x = embedding(context, shape=input_shape2)
+            data_x.append(context)
+            data_y.append(y)
+    data_load2 = data_load[data_load['re_label']==0]
+    for left, right, label,_label in zip(data_load2['context_left'], data_load2['context_right'], data_load2['re_label'], data_load2['label']):
+            if label==_label:
+                y = np.zeros(output_shape)
+                y[label] = 1
+                left = eval(left)
+                left = left[-40:]
+                if len(left)>30:
+                    left = left[2:]
+                elif len(left)>15:
+                    left = left[1:]
+                right = eval(right)
+                right = right[:40]
+                if len(right)>15:
+                    right = right[:-1]
+                context = [left, right]
+                # x = embedding(context, shape=input_shape2)
+                data_x.append(context)
+                data_y.append(y)
+
+    for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
+        y = np.zeros(output_shape)
+        y[label] = 1
+        left = eval(left)
+        left = left[-40:]
+        right = eval(right)
+        right = right[:40]
+        context = [left, right]
+        # x = embedding(context, shape=input_shape2)
+        data_x.append(context)
+        data_y.append(y)
+
+    for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
+            y = np.zeros(output_shape)
+            y[label] = 1
+            left = eval(left)
+            left = left[-40:]
+            if len(left) > 30:
+                left = left[2:]
+            elif len(left) > 15:
+                left = left[1:]
+            right = eval(right)
+            right = right[:40]
+            if len(right) > 15:
+                right = right[:-1]
+            context = [left, right]
+            # x = embedding(context, shape=input_shape2)
+            data_x.append(context)
+            data_y.append(y)
+
+    # for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
+    #         y = np.zeros(output_shape)
+    #         y[label] = 1
+    #         left = eval(left)
+    #         left = left[-40:]
+    #         right = eval(right)
+    #         right = right[:40]
+    #         context = [left, right]
+    #         # x = embedding(context, shape=input_shape2)
+    #         data_x.append(context)
+    #         data_y.append(y)
+
+    _data = [d for d in zip(data_x,data_y)]
+    import random
+    random.shuffle(_data)
+    data_x = [i[0] for i in _data]
+    data_y = [i[1] for i in _data]
+    test_len = int(len(data_x) * 0.13)
+    test_x = data_x[:test_len]
+    test_y = data_y[:test_len]
+    print("测试数据量:", len(test_x))
+    train_x = data_x[test_len:]
+    train_y = data_y[test_len:]
+
+    for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
+            y = np.zeros(output_shape)
+            y[label] = 1
+            left = eval(left)
+            left = left[-40:]
+            right = eval(right)
+            right = right[:40]
+            context = [left, right]
+            # x = embedding(context, shape=input_shape2)
+            train_x.append(context)
+            train_y.append(y)
+    print("训练数据量:", len(train_x))
+
+    # train_y, test_y = np.array(train_y), np.array(test_y)
+    # train_x = np.array(train_x)
+    # test_x = np.array(test_x)
+    # test_x = np.transpose(test_x, (1, 0, 2, 3))
+    # train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
+    training_generator = DataGenerator(train_x, train_y)
+    # training_generator = DataGenerator(data_x, data_y)
+    validation_generator = DataGenerator(test_x, test_y)
+
+    # model = getModel3()
+    model = getModel2()
+    epochs = 100
+    # batch_size = 256
+    checkpoint = ModelCheckpoint("model_time_classify.weights",save_weights_only=True, monitor="val_loss", verbose=1,
+                                 save_best_only=True, mode='min')
+    # checkpoint = ModelCheckpoint("model_time_classify2.weights",save_weights_only=True, monitor="loss", verbose=1,
+    #                                  save_best_only=True, mode='min')
+
+    history = model.fit_generator(
+        generator=training_generator,
+        validation_data=validation_generator,
+        use_multiprocessing=True, workers=2,
+        epochs=epochs,
+        shuffle=True,
+        callbacks=[checkpoint],
+        class_weight='auto'
+    )
+    # plot_loss(history=history)
+    # load_model = models.load_model("model_label_time_classify.model.hdf5",
+    #                                custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
+    # y_pre = load_model.predict([test_x[0], test_x[1]])
+    # # y_pre = load_model.predict(test_x[0])
+    # # 各类别预测评估
+    # res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
+    # print(res1)
+    # y_pre2 = load_model.predict([train_x[0], train_x[1]])
+    # # y_pre2 = load_model.predict(train_x[0])
+    # res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
+    # print(res2)
+from keras.utils import Sequence,to_categorical
+class DataGenerator(Sequence):
+    'Generates data for Keras'
+    def __init__(self, texts, labels, batch_size=256,
+                 n_classes=4, shuffle=True):
+        'Initialization'
+        # self.dim = dim
+        self.batch_size = batch_size
+        self.labels = labels
+        self.texts = texts
+        self.n_classes = n_classes
+        self.shuffle = shuffle
+        self.on_epoch_end()
+
+    def __len__(self):
+        'Denotes the number of batches per epoch'
+        _len = len(self.texts) // self.batch_size
+        if len(self.texts) % self.batch_size != 0:
+            _len += 1
+        return _len
+
+    def __getitem__(self, index):
+        'Generate one batch of data'
+        # Generate indexes of the batch
+        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
+
+        # Find list of IDs
+        list_texts = [self.texts[k] for k in indexes]
+        _label = [self.labels[k] for k in indexes]
+        # Generate data
+        X, y = self.__data_generation(list_texts,_label)
+
+        return X, y
+
+    def on_epoch_end(self):
+        'Updates indexes after each epoch'
+        self.indexes = np.arange(len(self.texts))
+        if self.shuffle == True:
+            np.random.shuffle(self.indexes)
+
+    def __data_generation(self, list_texts,_label):
+        'Generates data containing batch_size samples'
+        # Initialization
+        # X = np.empty((self.batch_size, *self.dim))
+        # y = np.empty((self.batch_size), dtype=int)
+        # batch_len = len(list_texts)
+        # x = np.empty((batch_len, *self.dim))
+        x = []
+        # y = np.empty((batch_len), dtype=int)
+
+        # Generate data
+        for i, context in enumerate(list_texts):
+            # Store sample
+            # tokens = preprocess2(text)
+            # tokens = tokens[:maxlen]
+            words_matrix = embedding_mywords(context, shape=input_shape2)
+            # Store class
+            # y[i] = _label[i]
+            x.append(words_matrix)
+        x = np.array(x)
+        x = np.transpose(x, (1, 0, 2, 3))
+        return [x[0],x[1]], np.array(_label)
 
 
 def predict2():
     model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
@@ -237,6 +614,73 @@ def predict2():
     # print(error_data.info())
     error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")
 
 
+def predict3():
+    data = pd.read_csv("new_tokens_data1.csv", chunksize=5000)
+    model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
+    new_data = pd.DataFrame()
+    idx = 0
+    for _data in data:
+
+        test_x = []
+        test_y = []
+        for left, right, label in zip(_data['context_left'], _data['context_right'], _data['label']):
+            left = eval(left)
+            left = left[-10:]
+            right = eval(right)
+            right = right[:10]
+            label = int(label)
+            y = np.zeros(output_shape)
+            y[label] = 1
+            context = [left, right]
+            x = embedding(context, shape=input_shape2)
+            test_x.append(x)
+            test_y.append(y)
+        test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
+        pre_y = model1.predict([test_x[0], test_x[1]])
+        _data['pre'] = [np.argmax(item) for item in pre_y]
+        _data['is_same'] = [1 if int(_label)==_pre else 0 for _label,_pre in zip(_data['label'],_data['pre'])]
+        # data['label'] = label
+        new_data = pd.concat([new_data, _data])
+        idx += 5000
+        print(idx)
+    # data.to_csv("new_tokens_data1.csv")
+    new_data.to_excel("new_tokens_data1_res.xlsx")
+
+def predict4():
+    data = pd.read_csv("tokens_tolabel_data1_res11.csv", chunksize=3000)
+    model1 = getModel2()
+    model1.load_weights("model_time_classify.weights")
+    new_data = pd.DataFrame()
+    idx = 0
+    for _data in data:
+        test_x = []
+        test_y = []
+        for left, right, label in zip(_data['context_left'], _data['context_right'], _data['re_label']):
+            left = eval(left)
+            left = left[-40:]
+            right = eval(right)
+            right = right[:40]
+            label = int(label)
+            y = np.zeros(output_shape)
+            y[label] = 1
+            context = [left, right]
+            x = embedding_mywords(context, shape=input_shape2)
+            test_x.append(x)
+            test_y.append(y)
+        test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
+        pre_y = model1.predict([test_x[0], test_x[1]])
+        _data['pre_label'] = [np.argmax(item) for item in pre_y]
+        _data['pre_label_prob'] = [max(item) for item in pre_y]
+        _data['is_same'] = [1 if int(_label)==_pre else 0 for _label,_pre in zip(_data['re_label'],_data['pre_label'])]
+        # _data['is_same'] = [1 if int(_re)==int(_pre) and int(_re)==int(_label) else 0 for _label,_re,_pre in zip(_data['label'],_data['re_label'],_data['pre_label'])]
+        # data['label'] = label
+        new_data = pd.concat([new_data, _data])
+        idx += 3000
+        print(idx)
+    # data.to_csv("new_tokens_data1.csv")
+    new_data.to_excel("tokens_tolabel_data1_res12.xlsx")
+
+
 def predict():
     model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
     data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
@@ -313,7 +757,7 @@ def data_process3():
     token_end = []
     context_left = []
     context_right = []
-    data2 = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc2.csv")
+    data2 = pd.read_csv("newdata_30_prc2.csv")
     label = []
     # data=data[:20]
     for id,sentences,tokens,offset,begin,end,entity_text in zip(data['document_id'],data['sentences'],data['tokens'],data['offsets_to_text'],
@@ -343,7 +787,7 @@ def data_process3():
                 break
         token_begin.append(entity_tbegin)
         token_end.append(entity_tend)
-        s = spanWindow(tokens=tokens,begin_index=entity_tbegin,end_index=entity_tend,size=10)
+        s = spanWindow(tokens=tokens,begin_index=entity_tbegin,end_index=entity_tend-1,size=40)
         s1 = s[0]
         _temp1 = []
         for i in range(len(s1)):
@@ -372,7 +816,8 @@ def data_process3():
     data['context_right'] = context_right
     data['label'] = label
     data = data.drop(['tokens','offsets_to_text','sentences'],axis=1)
-    data.to_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv")
+    # data.to_csv("tokens_data_02.csv")
+    data.to_excel("tokens_data_02.xlsx")

 def plot_loss(history):
     plt.plot(history.history['loss'])
@@ -383,15 +828,64 @@ def plot_loss(history):
     plt.legend(['Train', 'Test'], loc='upper left')
     plt.show()
 
 
+def embedding_mywords(datas,shape):
+    '''
+    @summary: look up the word vector for each token
+    @param:
+        datas: list of token lists
+        shape: shape of the result
+    @return: array, the word embeddings in the given shape
+    '''
+    model_w2v = getModel_w2v()
+    embed = np.zeros(shape)
+    length = shape[1]
+    out_index = 0
+    #print(datas)
+    for data in datas:
+        index = 0
+        for item in data:
+            item_not_space = re.sub("\s*","",item)
+            if index>=length:
+                break
+            if item_not_space in model_w2v.vocab:
+                embed[out_index][index] = model_w2v[item_not_space]
+                index += 1
+            else:
+                embed[out_index][index] = model_w2v['unk']
+                index += 1
+        out_index += 1
+    return embed
+
+def save_model():
+    graph = tf.Graph()
+    with graph.as_default() as graph:
+        with tf.Session(graph=graph).as_default() as sess:
+            test_model = getModel2()
+            test_model.load_weights("model_time_classify.weights")
+            tf.saved_model.simple_save(sess,
+                                       "models/timesplit_model/",
+                                       inputs={"input0": test_model.input[0],
+                                               "input1":test_model.input[1]
+                                               },
+                                       outputs={"outputs": test_model.output})
+
+
+
 if __name__ == '__main__':
     # get_data()
     # getModel()
+    # getModel2()
+    # getModel3()
     # training()
     # train2()
+    # train3()
     # data_process()
     # data_process2()
     # data_process3()
     # predict()
     # predict2()
+    # predict3()
+    # predict4()
+    save_model()

     pass
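For reference, a sketch of reading back the SavedModel written by save_model() above, using the TF1 loader. The SERVING tag and the 'serving_default' signature are the tf.saved_model.simple_save defaults, and the input/output keys match the ones exported above; adapt as needed.

    import tensorflow as tf

    with tf.Session(graph=tf.Graph()) as sess:
        # simple_save exports under the SERVING tag with the default signature key
        meta_graph = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING], "models/timesplit_model/")
        sig = meta_graph.signature_def["serving_default"]
        input0 = sess.graph.get_tensor_by_name(sig.inputs["input0"].name)
        input1 = sess.graph.get_tensor_by_name(sig.inputs["input1"].name)
        outputs = sess.graph.get_tensor_by_name(sig.outputs["outputs"].name)
        # feed two (batch, 40, 128) context arrays, matching input_shape2 above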