
Merge remote-tracking branch 'origin/master'

luojiehua 3 years ago
parent
commit
345114b5ec

+ 1 - 1
.idea/misc.xml

@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5 (py3.5)" project-jdk-type="Python SDK" />
   <component name="PythonCompatibilityInspectionAdvertiser">
     <option name="version" value="3" />
   </component>

+ 1 - 1
BiddingKG.iml

@@ -7,7 +7,7 @@
   </component>
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="Python 3.6" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.5 (py3.5)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
     <orderEntry type="library" exported="" name="Python 3.5 (dl_nlp) interpreter library" level="application" />
   </component>

+ 1 - 1
BiddingKG/dl/common/models.py

@@ -12,7 +12,7 @@ import keras.backend as K
 import tensorflow as tf
 import math
 import six
-layers.Dense
+

 def getTextCNNModel(input_shape,vocab,embedding_weights,classes):
 
 

+ 0 - 1
BiddingKG/dl/interface/Entitys.py

@@ -166,7 +166,6 @@ class Entity():
         # self.person_phone = person_phone
         self.person_phone = []
         self.is_tail = False
-        self.person_phone = person_phone
         self.notes = ''  # added 2021/7/20: stores notes such as the capitalized amount and its unit
         self.money_unit = '' # added 2021/8/17: stores the money unit: 元, 万元 or 亿元
 
 

+ 20 - 3
BiddingKG/dl/interface/getAttributes.py

@@ -2071,6 +2071,11 @@ def getOtherAttributes(list_entity):
                   "product":[],
                   "product":[],
                   "total_tendereeMoney":0,
                   "total_tendereeMoney":0,
                   "total_tendereeMoneyUnit":''}
                   "total_tendereeMoneyUnit":''}
+    dict_time = {
+        "time_release": [],
+        "time_bidopen": [],
+        "time_bidclose": []
+    }
     for entity in list_entity:
     for entity in list_entity:
         if entity.entity_type == 'bidway':
         if entity.entity_type == 'bidway':
             dict_other["bidway"] = turnBidWay(entity.entity_text)
             dict_other["bidway"] = turnBidWay(entity.entity_text)
@@ -2079,11 +2084,17 @@ def getOtherAttributes(list_entity):
         elif entity.entity_type=='serviceTime':
             dict_other["serviceTime"] = entity.entity_text
         elif entity.entity_type == 'time' and entity.label==1:
-            dict_other["time_release"] = timeFormat(entity.entity_text)
+            if entity.values[entity.label]>0.6:
+                dict_time['time_release'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
+            # dict_other["time_release"] = timeFormat(entity.entity_text)
         elif entity.entity_type == 'time' and entity.label==2:
-            dict_other["time_bidopen"] = timeFormat(entity.entity_text)
+            if entity.values[entity.label]>0.6:
+                dict_time['time_bidopen'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
+            # dict_other["time_bidopen"] = timeFormat(entity.entity_text)
         elif entity.entity_type == 'time' and entity.label == 3:
-            dict_other["time_bidclose"] = timeFormat(entity.entity_text)
+            if entity.values[entity.label]>0.6:
+                dict_time['time_bidclose'].append((timeFormat(entity.entity_text),entity.values[entity.label]))
+            # dict_other["time_bidclose"] = timeFormat(entity.entity_text)
         elif entity.entity_type=="person" and entity.label ==4:
             dict_other["person_review"].append(entity.entity_text)
         elif entity.entity_type=='product':
@@ -2091,6 +2102,12 @@ def getOtherAttributes(list_entity):
         elif entity.entity_type=='money' and entity.notes=='总投资' and dict_other["total_tendereeMoney"]<float(entity.entity_text):
             dict_other["total_tendereeMoney"] = float(entity.entity_text)
             dict_other["total_tendereeMoneyUnit"] = entity.money_unit
+    # time categories
+    for time_type,value in dict_time.items():
+        list_time = dict_time[time_type]
+        if list_time:
+            list_time.sort(key=lambda x:x[1],reverse=True)
+            dict_other[time_type] = list_time[0][0]
     dict_other["product"] = list(set(dict_other["product"]))
     return dict_other
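Note on the hunk above: rather than overwriting time_release / time_bidopen / time_bidclose with whichever entity comes last, candidates scoring above 0.6 are collected per category and the highest-confidence one wins. A minimal standalone sketch of that selection step, with made-up values:

    # (formatted_time, confidence) pairs, mirroring dict_time above; values are invented
    dict_time = {
        "time_release": [("2021-07-18", 0.71), ("2021-07-20", 0.92)],
        "time_bidopen": [],
        "time_bidclose": [("2021-08-01", 0.64)],
    }
    dict_other = {}
    for time_type, list_time in dict_time.items():
        if list_time:
            # sort by confidence, descending, and keep the best candidate
            list_time.sort(key=lambda x: x[1], reverse=True)
            dict_other[time_type] = list_time[0][0]
    print(dict_other)  # {'time_release': '2021-07-20', 'time_bidclose': '2021-08-01'}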
 
 

+ 3 - 3
BiddingKG/dl/interface/modelFactory.py

@@ -260,7 +260,7 @@ class Model_relation_extraction():
 
 
     def predict(self,text_in, words, rate=0.5):
         # text_words = text_in
-        R = []
+        triple_list = []
         # _t2 = [self.words2id.get(c, 1) for c in words]
         _t2 = np.zeros((len(words), self.words_size))
         for i in range(len(words)):
@@ -292,8 +292,8 @@ class Model_relation_extraction():
                 for _ooo1, _c1 in zip(*_oo1):
                     _object = text_in[_ooo1]
                     _predicate = self.id2predicate[_c1]
-                    R.append((_subject[0], _predicate, _object))
-            return R
+                    triple_list.append((_subject[0], _predicate, _object))
+            return triple_list
         else:
             return []
 
 

+ 36 - 4
BiddingKG/dl/interface/predictor.py

@@ -1357,7 +1357,7 @@ class TimePredictor():
         self.sess = tf.Session(graph=tf.Graph())
         self.inputs_code = None
         self.outputs_code = None
-        self.input_shape = (2,10,128)
+        self.input_shape = (2,40,128)
         self.load_model()

     def load_model(self):
@@ -1385,6 +1385,7 @@ class TimePredictor():
         for list_sentence, list_entity in zip(list_sentences, list_entitys):
             p_entitys = 0
             p_sentences = 0
+            list_sentence.sort(key=lambda x: x.sentence_index)
             while(p_entitys<len(list_entity)):
                 entity = list_entity[p_entitys]
                 if entity.entity_type in ['time']:
@@ -1397,7 +1398,7 @@ class TimePredictor():
                             left = s[0]
                             right = s[1]
                             context = [left, right]
-                            x = embedding(context, shape=self.input_shape)
+                            x = self.embedding_words(context, shape=self.input_shape)
                             data_x.append(x)
                             points_entitys.append(entity)
                             break
@@ -1408,6 +1409,33 @@ class TimePredictor():
         data_x = np.transpose(np.array(data_x), (1, 0, 2, 3))
         return [data_x, points_entitys]
 
 
+    def embedding_words(self, datas, shape):
+        '''
+        @summary: look up the word vector for each token
+        @param:
+            datas: list of token lists
+            shape: shape of the result
+        @return: array, the word embeddings in the given shape
+        '''
+        model_w2v = getModel_w2v()
+        embed = np.zeros(shape)
+        length = shape[1]
+        out_index = 0
+        for data in datas:
+            index = 0
+            for item in data:
+                item_not_space = re.sub("\s*", "", item)
+                if index >= length:
+                    break
+                if item_not_space in model_w2v.vocab:
+                    embed[out_index][index] = model_w2v[item_not_space]
+                    index += 1
+                else:
+                    embed[out_index][index] = model_w2v['unk']
+                    index += 1
+            out_index += 1
+        return embed
+
     def predict(self, list_sentences,list_entitys):
         datas = self.search_time_data(list_sentences, list_entitys)
         if datas is None:
@@ -1422,7 +1450,11 @@ class TimePredictor():
                 values = []
                 for item in predict_y[i]:
                     values.append(item)
-                    entity.set_Role(label, values)
+                if label != 0:
+                    if not timeFormat(entity.entity_text):
+                        label = 0
+                        values[0] = 0.5
+                entity.set_Role(label, values)

 # product field extraction
 class ProductPredictor():
@@ -2232,7 +2264,7 @@ def save_timesplit_model():
 if __name__=="__main__":
     #save_role_model()
     # save_codename_model()
-    save_money_model()
+    # save_money_model()
     #save_person_model()
     #save_form_model()
     #save_codesplit_model()
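A rough usage sketch for the embedding_words helper added above, assuming getModel_w2v() returns a gensim-style KeyedVectors with 128-dimensional vectors and an 'unk' entry (which is what the code implies): each of the two context windows is embedded token by token into a fixed (2, 40, 128) array, with unknown tokens mapped to 'unk' and unused slots left as zeros.

    # sketch only: assumes the word2vec model and TimePredictor weights are available locally
    predictor = TimePredictor()
    left = ["开标", "时间", "："]             # tokens to the left of the time entity
    right = ["上午", "9", "时", "30", "分"]   # tokens to the right
    x = predictor.embedding_words([left, right], shape=(2, 40, 128))
    print(x.shape)  # (2, 40, 128)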

BIN
BiddingKG/dl/interface/timesplit_model/saved_model.pb

BIN
BiddingKG/dl/interface/timesplit_model/variables/variables.data-00000-of-00001

BIN
BiddingKG/dl/interface/timesplit_model/variables/variables.index


+ 1 - 1
BiddingKG/dl/relation_extraction/model.py

@@ -198,8 +198,8 @@ def position_id(x):
 
 
 add_dict = load(os.path.dirname(__file__)+'/../relation_extraction/add_words_dict.pkl')
 add_words = ['<unk>','<company/org>','<location>','<phone>','<contact_person>']
-model_w2v = getModel_w2v()
 def get_words_matrix(words):
+    model_w2v = getModel_w2v()
     if words in add_words:
         return add_dict[words]
     else:
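The hunk above moves the getModel_w2v() call from module import time into get_words_matrix, so the word2vec model is only loaded when the function is first called. If getModel_w2v() does not cache internally, a memoized wrapper (a sketch, not part of this commit) keeps the lazy-loading benefit without paying the load cost on every call:

    _model_w2v = None  # hypothetical module-level cache

    def get_model_w2v_cached():
        global _model_w2v
        if _model_w2v is None:       # load once, on first use
            _model_w2v = getModel_w2v()
        return _model_w2v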

+ 7 - 7
BiddingKG/dl/test/测试整个要素提取流程.py

@@ -51,6 +51,7 @@ codeNamePredict = predictor.CodeNamePredict()
 premPredict = predictor.PREMPredict()
 epcPredict = predictor.EPCPredict()
 roleRulePredict = predictor.RoleRulePredictor()
+timePredictor = predictor.TimePredictor()

 #custom jsonEncoder
 class MyEncoder(json.JSONEncoder):
@@ -91,6 +92,8 @@ def predict(doc_id,text):
     roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
     # print("epcPredict")
     epcPredict.predict(list_sentences,list_entitys)
+
+    timePredictor.predict(list_sentences,list_entitys)
     # print("entityLink")
     entityLink.link_entitys(list_entitys)
     # print("getPREMs")
@@ -98,7 +101,7 @@ def predict(doc_id,text):
     # print("getPREMs")
     # print("getPREMs")
     print("公司——联系人:", end=' ')
     print("公司——联系人:", end=' ')
     print(prem[0])
     print(prem[0])
-    print(prem[0]['prem']['Project']['roleList'])
+    # print(prem[0]['prem']['Project']['roleList'])
 
 
     
     
     ''''''
     ''''''
@@ -123,7 +126,7 @@ def predict(doc_id,text):
                 print(entity.sentence_index)
             elif entity.entity_type=="time":
                 print("time:",end=" ")
-                print(entity.entity_text)
+                print(entity.entity_text, entity.label, entity.values)
             elif entity.entity_type in ['org','company']:
                 _sentence = list_sentences[0][entity.sentence_index]
                 if entity.pointer_person:
@@ -141,9 +144,6 @@ def predict(doc_id,text):
             #         print('pointer_pack_name:',entity.pointer_pack.entity_text)
             # elif entity.entity_type in ['package']:
             #     print('pack_entity:',entity.entity_text)
-            # elif entity.entity_type=='time':
-            #     print("时间:", end=' ')
-            #     print(entity.entity_text, entity.label, entity.values)
             # print(entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.wordOffset_begin,entity.wordOffset_end)

     #print(prem)
@@ -430,8 +430,8 @@ if __name__=="__main__":
     a = time.time()
     print("start")
     # print(predict("12",content))
-    result = predict("12",text)
-    # result = predict("12",content)
+    # result = predict("12",text)
+    result = predict("12",content)
     # print(json.loads(result))
     #test("12",text)
     print("takes",time.time()-a)

BIN
BiddingKG/dl/time/model_label_time_classify.model.hdf5

BIN
BiddingKG/dl/time/model_time_classify.weights

BIN
BiddingKG/dl/time/models/timesplit_model/saved_model.pb

BIN
BiddingKG/dl/time/models/timesplit_model/variables/variables.data-00000-of-00001

BIN
BiddingKG/dl/time/models/timesplit_model/variables/variables.index


+ 504 - 10
BiddingKG/dl/time/train_2.py

@@ -1,11 +1,13 @@
 import sys
 import os
 sys.path.append(os.path.abspath("../.."))
+# sys.path.append('/data/python_znj/znj/BIDI_ML_INFO_EXTRACTION/')
 import pandas as pd
 import re
 import psycopg2
 from keras.callbacks import ModelCheckpoint
 from keras import layers,models,optimizers,losses
+from keras.layers import *
 from BiddingKG.dl.common.Utils import *
 from BiddingKG.dl.common.models import *
 from sklearn.metrics import classification_report
@@ -13,15 +15,15 @@ from sklearn.utils import shuffle,class_weight
 import matplotlib.pyplot as plt

 input_shape = (2,30,60)
-input_shape2 = (2,10,128)
+input_shape2 = (2,40,128)
 output_shape = [4]

 def get_data():
-    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
+    data_load = pd.read_csv("newdata_30_prc.csv", index_col=0)
     id_set = set()
     for id in data_load['document_id']:
         id_set.add(id)
-    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.101")
+    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.103")
     sql = "SELECT A.human_identifier,A.sentences,A.tokens,A.offsets_to_text,B.value " \
           "FROM corpus_iedocument A,brat_bratannotation B " \
           "WHERE A.human_identifier = '%s' " \
@@ -47,10 +49,12 @@ def get_data():
     df = pd.concat([df, time_label], axis=1)
     print(df.info())
     df['tokens'] = [token[2:-2].split("', '") for token in df['tokens']]
-    df['sentences'] = [sentence[1:-1].split(", ") for sentence in df['sentences']]
-    df['sentences'] = [[int(s) for s in sentence] for sentence in df['sentences']]
-    df['offsets_to_text'] = [offset[1:-1].split(", ") for offset in df['offsets_to_text']]
-    df['offsets_to_text'] = [[int(o) for o in offset] for offset in df['offsets_to_text']]
+    df['sentences'] = [eval(sentence) for sentence in df['sentences']]
+    # df['sentences'] = [sentence[1:-1].split(", ") for sentence in df['sentences']]
+    # df['sentences'] = [[int(s) for s in sentence] for sentence in df['sentences']]
+    df['offsets_to_text'] = [eval(offset) for offset in df['offsets_to_text']]
+    # df['offsets_to_text'] = [offset[1:-1].split(", ") for offset in df['offsets_to_text']]
+    # df['offsets_to_text'] = [[int(o) for o in offset] for offset in df['offsets_to_text']]
     save(df,'db_time_data.pk')

 def getModel():
@@ -78,6 +82,163 @@ def getModel():
     model.summary()
     return model
 
 
+def getModel2():
+    '''
+    @summary: time classification model
+    '''
+    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(L_input)
+    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
+
+    L_input_drop = Dropout(0.2)(L_input)
+    R_input_drop = Dropout(0.2)(R_input)
+    # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
+    L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
+    L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
+    # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
+    R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop,R_mask])
+    R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
+    concat = layers.merge([L_att, R_att], mode='concat')
+    concat = Dropout(0.3)(concat)
+    output = layers.Dense(output_shape[0],activation="softmax")(concat)
+
+    model = models.Model(inputs=[L_input,R_input], outputs=output)
+
+    learn_rate = 0.00005
+    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
+                  loss=losses.binary_crossentropy,
+                  metrics=[precision,recall,f1_score])
+    model.summary()
+    return model
+
+def getModel3():
+    '''
+    @summary: time classification model
+    '''
+    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(L_input)
+    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
+
+    L_input_drop = Dropout(0.2)(L_input)
+    R_input_drop = Dropout(0.2)(R_input)
+    # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
+    L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
+    # L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
+    # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
+    R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop,R_mask])
+    concat = layers.merge([L_lstm,R_lstm], mode='concat',concat_axis=1)
+    concat_mask = layers.merge([L_mask,R_mask], mode='concat',concat_axis=1)
+    att = Attention02()(concat,mask=K.squeeze(concat_mask,axis=-1))
+    # R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
+    # concat = layers.merge([L_att, R_att], mode='concat')
+    att = Dropout(0.3)(att)
+    output = layers.Dense(output_shape[0],activation="softmax")(att)
+
+    model = models.Model(inputs=[L_input,R_input], outputs=output)
+
+    learn_rate = 0.0001
+    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
+                  loss=losses.binary_crossentropy,
+                  metrics=[precision,recall,f1_score])
+    model.summary()
+    return model
+
+class Attention02(Layer):
+    def __init__(self, **kwargs):
+        self.init = initializers.get('normal')
+        self.supports_masking = True
+        self.attention_dim = 50
+        super(Attention02, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        assert len(input_shape) == 3
+        self.W = K.variable(self.init((input_shape[-1], 1)))
+        self.b = K.variable(self.init((self.attention_dim,)))
+        self.u = K.variable(self.init((self.attention_dim, 1)))
+        self.trainable_weights = [self.W, self.b, self.u]
+        super(Attention02, self).build(input_shape)
+
+    def compute_mask(self, inputs, mask=None):
+        return mask
+
+    def call(self, x, mask=None):
+        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
+        ait = K.dot(uit, self.u)
+        ait = K.squeeze(ait, -1)
+        ait = K.exp(ait)
+
+        if mask is not None:
+            ait = ait * K.cast(mask, K.floatx())
+            # ait = ait * mask
+
+        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
+        ait = K.expand_dims(ait)
+        weighted_input = x * ait
+        output = K.sum(weighted_input, axis=1)
+        return output
+
+    def compute_output_shape(self, input_shape):
+        return (input_shape[0], input_shape[-1])
+
+class OurLayer(Layer):
+    """定义新的Layer,增加reuse方法,允许在定义Layer时调用现成的层
+    """
+    def reuse(self, layer, *args, **kwargs):
+        if not layer.built:
+            if len(args) > 0:
+                inputs = args[0]
+            else:
+                inputs = kwargs['inputs']
+            if isinstance(inputs, list):
+                input_shape = [K.int_shape(x) for x in inputs]
+            else:
+                input_shape = K.int_shape(inputs)
+            layer.build(input_shape)
+        outputs = layer.call(*args, **kwargs)
+        for w in layer.trainable_weights:
+            if w not in self._trainable_weights:
+                self._trainable_weights.append(w)
+        for w in layer.non_trainable_weights:
+            if w not in self._non_trainable_weights:
+                self._non_trainable_weights.append(w)
+        for u in layer.updates:
+            if not hasattr(self, '_updates'):
+                self._updates = []
+            if u not in self._updates:
+                self._updates.append(u)
+        return outputs
+class OurBidirectional(OurLayer):
+    """自己封装双向RNN,允许传入mask,保证对齐
+    """
+    def __init__(self, layer, **args):
+        super(OurBidirectional, self).__init__(**args)
+        self.forward_layer = layer.__class__.from_config(layer.get_config())
+        self.backward_layer = layer.__class__.from_config(layer.get_config())
+        self.forward_layer.name = 'forward_' + self.forward_layer.name
+        self.backward_layer.name = 'backward_' + self.backward_layer.name
+    def reverse_sequence(self, x, mask):
+        """这里的mask.shape是[batch_size, seq_len, 1]
+        """
+        seq_len = K.round(K.sum(mask, 1)[:, 0])
+        seq_len = K.cast(seq_len, 'int32')
+        return tf.reverse_sequence(x, seq_len, seq_dim=1)
+    def call(self, inputs):
+        x, mask = inputs
+        x_forward = self.reuse(self.forward_layer, x)
+        x_backward = self.reverse_sequence(x, mask)
+        x_backward = self.reuse(self.backward_layer, x_backward)
+        x_backward = self.reverse_sequence(x_backward, mask)
+        x = K.concatenate([x_forward, x_backward], -1)
+        if K.ndim(x) == 3:
+            return x * mask
+        else:
+            return x
+    def compute_output_shape(self, input_shape):
+        return input_shape[0][:-1] + (self.forward_layer.units * 2,)
+
+
 
 
 def training():
     data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
@@ -215,6 +376,222 @@ def train2():
     res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
     print(res2)
 
 
+def train3():
+    # data_load = pd.read_excel("tokens_tolabel_data1.xlsx", index_col=0)
+    data_load = pd.read_excel("tokens_tolabel_data1_res12.xlsx", index_col=0)
+    # data_load = pd.concat([data_load[data_load['re_label']==0],data_load])
+    # data_load = data_load[data_load['pre_label_prob']>0.97]
+    # data_load = data_load[data_load['is_same']==1]
+    data_zero = pd.read_excel("tokens_label0_data1.xlsx")
+    # data_old = pd.read_excel("tokens_data_02.xlsx")
+    data_old = pd.read_excel("tokens_data_02_res6.xlsx")
+    data_zero = data_zero[(data_zero['label']!=0)|(data_zero['is_same']==2)]
+    # data_zero = pd.concat([data_zero,data_zero])
+    # data_zero = pd.concat([data_zero[(data_zero['label']!=0)|(data_zero['is_same']==2)],data_zero.sample(n=3000)])
+    # data_zero = data_zero.sample(n=80000)
+    print("输入shape:",input_shape2)
+    data_x = []
+    data_y = []
+    for left, right, label,_label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label'], data_load['label']):
+        if label==_label:
+            y = np.zeros(output_shape)
+            y[label] = 1
+            left = eval(left)
+            left = left[-40:]
+            right = eval(right)
+            right = right[:40]
+            context = [left, right]
+            # x = embedding(context, shape=input_shape2)
+            data_x.append(context)
+            data_y.append(y)
+    data_load2 = data_load[data_load['re_label']==0]
+    for left, right, label,_label in zip(data_load2['context_left'], data_load2['context_right'], data_load2['re_label'], data_load2['label']):
+            if label==_label:
+                y = np.zeros(output_shape)
+                y[label] = 1
+                left = eval(left)
+                left = left[-40:]
+                if len(left)>30:
+                    left = left[2:]
+                elif len(left)>15:
+                    left = left[1:]
+                right = eval(right)
+                right = right[:40]
+                if len(right)>15:
+                    right = right[:-1]
+                context = [left, right]
+                # x = embedding(context, shape=input_shape2)
+                data_x.append(context)
+                data_y.append(y)
+
+    for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
+        y = np.zeros(output_shape)
+        y[label] = 1
+        left = eval(left)
+        left = left[-40:]
+        right = eval(right)
+        right = right[:40]
+        context = [left, right]
+        # x = embedding(context, shape=input_shape2)
+        data_x.append(context)
+        data_y.append(y)
+
+    for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
+            y = np.zeros(output_shape)
+            y[label] = 1
+            left = eval(left)
+            left = left[-40:]
+            if len(left) > 30:
+                left = left[2:]
+            elif len(left) > 15:
+                left = left[1:]
+            right = eval(right)
+            right = right[:40]
+            if len(right) > 15:
+                right = right[:-1]
+            context = [left, right]
+            # x = embedding(context, shape=input_shape2)
+            data_x.append(context)
+            data_y.append(y)
+
+    # for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
+    #         y = np.zeros(output_shape)
+    #         y[label] = 1
+    #         left = eval(left)
+    #         left = left[-40:]
+    #         right = eval(right)
+    #         right = right[:40]
+    #         context = [left, right]
+    #         # x = embedding(context, shape=input_shape2)
+    #         data_x.append(context)
+    #         data_y.append(y)
+
+    _data = [d for d in zip(data_x,data_y)]
+    import random
+    random.shuffle(_data)
+    data_x = [i[0] for i in _data]
+    data_y = [i[1] for i in _data]
+    test_len = int(len(data_x) * 0.13)
+    test_x = data_x[:test_len]
+    test_y = data_y[:test_len]
+    print("测试数据量:", len(test_x))
+    train_x = data_x[test_len:]
+    train_y = data_y[test_len:]
+
+    for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
+            y = np.zeros(output_shape)
+            y[label] = 1
+            left = eval(left)
+            left = left[-40:]
+            right = eval(right)
+            right = right[:40]
+            context = [left, right]
+            # x = embedding(context, shape=input_shape2)
+            train_x.append(context)
+            train_y.append(y)
+    print("训练数据量:", len(train_x))
+
+    # train_y, test_y = np.array(train_y), np.array(test_y)
+    # train_x = np.array(train_x)
+    # test_x = np.array(test_x)
+    # test_x = np.transpose(test_x, (1, 0, 2, 3))
+    # train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
+    training_generator = DataGenerator(train_x, train_y)
+    # training_generator = DataGenerator(data_x, data_y)
+    validation_generator = DataGenerator(test_x, test_y)
+
+    # model = getModel3()
+    model = getModel2()
+    epochs = 100
+    # batch_size = 256
+    checkpoint = ModelCheckpoint("model_time_classify.weights",save_weights_only=True, monitor="val_loss", verbose=1,
+                                 save_best_only=True, mode='min')
+    # checkpoint = ModelCheckpoint("model_time_classify2.weights",save_weights_only=True, monitor="loss", verbose=1,
+    #                                  save_best_only=True, mode='min')
+
+    history = model.fit_generator(
+        generator=training_generator,
+        validation_data=validation_generator,
+        use_multiprocessing=True, workers=2,
+        epochs=epochs,
+        shuffle=True,
+        callbacks=[checkpoint],
+        class_weight='auto'
+    )
+    # plot_loss(history=history)
+    # load_model = models.load_model("model_label_time_classify.model.hdf5",
+    #                                custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
+    # y_pre = load_model.predict([test_x[0], test_x[1]])
+    # # y_pre = load_model.predict(test_x[0])
+    # # 各类别预测评估
+    # res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
+    # print(res1)
+    # y_pre2 = load_model.predict([train_x[0], train_x[1]])
+    # # y_pre2 = load_model.predict(train_x[0])
+    # res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
+    # print(res2)
+from keras.utils import Sequence,to_categorical
+class DataGenerator(Sequence):
+    'Generates data for Keras'
+    def __init__(self, texts, labels, batch_size=256,
+                 n_classes=4, shuffle=True):
+        'Initialization'
+        # self.dim = dim
+        self.batch_size = batch_size
+        self.labels = labels
+        self.texts = texts
+        self.n_classes = n_classes
+        self.shuffle = shuffle
+        self.on_epoch_end()
+
+    def __len__(self):
+        'Denotes the number of batches per epoch'
+        _len = len(self.texts) // self.batch_size
+        if len(self.texts) % self.batch_size != 0:
+            _len += 1
+        return _len
+
+    def __getitem__(self, index):
+        'Generate one batch of data'
+        # Generate indexes of the batch
+        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
+
+        # Find list of IDs
+        list_texts = [self.texts[k] for k in indexes]
+        _label = [self.labels[k] for k in indexes]
+        # Generate data
+        X, y = self.__data_generation(list_texts,_label)
+
+        return X, y
+
+    def on_epoch_end(self):
+        'Updates indexes after each epoch'
+        self.indexes = np.arange(len(self.texts))
+        if self.shuffle == True:
+            np.random.shuffle(self.indexes)
+
+    def __data_generation(self, list_texts,_label):
+        'Generates data containing batch_size samples'
+        # Initialization
+        # X = np.empty((self.batch_size, *self.dim))
+        # y = np.empty((self.batch_size), dtype=int)
+        # batch_len = len(list_texts)
+        # x = np.empty((batch_len, *self.dim))
+        x = []
+        # y = np.empty((batch_len), dtype=int)
+
+        # Generate data
+        for i, context in enumerate(list_texts):
+            # Store sample
+            # tokens = preprocess2(text)
+            # tokens = tokens[:maxlen]
+            words_matrix = embedding_mywords(context, shape=input_shape2)
+            # Store class
+            # y[i] = _label[i]
+            x.append(words_matrix)
+        x = np.array(x)
+        x = np.transpose(x, (1, 0, 2, 3))
+        return [x[0],x[1]], np.array(_label)
 
 
 def predict2():
     model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
@@ -237,6 +614,73 @@ def predict2():
     # print(error_data.info())
     error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")
 
 
+def predict3():
+    data = pd.read_csv("new_tokens_data1.csv", chunksize=5000)
+    model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
+    new_data = pd.DataFrame()
+    idx = 0
+    for _data in data:
+
+        test_x = []
+        test_y = []
+        for left, right, label in zip(_data['context_left'], _data['context_right'], _data['label']):
+            left = eval(left)
+            left = left[-10:]
+            right = eval(right)
+            right = right[:10]
+            label = int(label)
+            y = np.zeros(output_shape)
+            y[label] = 1
+            context = [left, right]
+            x = embedding(context, shape=input_shape2)
+            test_x.append(x)
+            test_y.append(y)
+        test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
+        pre_y = model1.predict([test_x[0], test_x[1]])
+        _data['pre'] = [np.argmax(item) for item in pre_y]
+        _data['is_same'] = [1 if int(_label)==_pre else 0 for _label,_pre in zip(_data['label'],_data['pre'])]
+        # data['label'] = label
+        new_data = pd.concat([new_data, _data])
+        idx += 5000
+        print(idx)
+    # data.to_csv("new_tokens_data1.csv")
+    new_data.to_excel("new_tokens_data1_res.xlsx")
+
+def predict4():
+    data = pd.read_csv("tokens_tolabel_data1_res11.csv", chunksize=3000)
+    model1 = getModel2()
+    model1.load_weights("model_time_classify.weights")
+    new_data = pd.DataFrame()
+    idx = 0
+    for _data in data:
+        test_x = []
+        test_y = []
+        for left, right, label in zip(_data['context_left'], _data['context_right'], _data['re_label']):
+            left = eval(left)
+            left = left[-40:]
+            right = eval(right)
+            right = right[:40]
+            label = int(label)
+            y = np.zeros(output_shape)
+            y[label] = 1
+            context = [left, right]
+            x = embedding_mywords(context, shape=input_shape2)
+            test_x.append(x)
+            test_y.append(y)
+        test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
+        pre_y = model1.predict([test_x[0], test_x[1]])
+        _data['pre_label'] = [np.argmax(item) for item in pre_y]
+        _data['pre_label_prob'] = [max(item) for item in pre_y]
+        _data['is_same'] = [1 if int(_label)==_pre else 0 for _label,_pre in zip(_data['re_label'],_data['pre_label'])]
+        # _data['is_same'] = [1 if int(_re)==int(_pre) and int(_re)==int(_label) else 0 for _label,_re,_pre in zip(_data['label'],_data['re_label'],_data['pre_label'])]
+        # data['label'] = label
+        new_data = pd.concat([new_data, _data])
+        idx += 3000
+        print(idx)
+    # data.to_csv("new_tokens_data1.csv")
+    new_data.to_excel("tokens_tolabel_data1_res12.xlsx")
+
+
 def predict():
     model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
     data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
@@ -313,7 +757,7 @@ def data_process3():
     token_end = []
     context_left = []
     context_right = []
-    data2 = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc2.csv")
+    data2 = pd.read_csv("newdata_30_prc2.csv")
     label = []
     # data=data[:20]
     for id,sentences,tokens,offset,begin,end,entity_text in zip(data['document_id'],data['sentences'],data['tokens'],data['offsets_to_text'],
@@ -343,7 +787,7 @@ def data_process3():
                 break
         token_begin.append(entity_tbegin)
         token_end.append(entity_tend)
-        s = spanWindow(tokens=tokens,begin_index=entity_tbegin,end_index=entity_tend,size=10)
+        s = spanWindow(tokens=tokens,begin_index=entity_tbegin,end_index=entity_tend-1,size=40)
         s1 = s[0]
         _temp1 = []
         for i in range(len(s1)):
@@ -372,7 +816,8 @@ def data_process3():
     data['context_right'] = context_right
     data['label'] = label
     data = data.drop(['tokens','offsets_to_text','sentences'],axis=1)
-    data.to_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv")
+    # data.to_csv("tokens_data_02.csv")
+    data.to_excel("tokens_data_02.xlsx")

 def plot_loss(history):
     plt.plot(history.history['loss'])
@@ -383,15 +828,64 @@ def plot_loss(history):
     plt.legend(['Train', 'Test'], loc='upper left')
     plt.show()
 
 
+def embedding_mywords(datas,shape):
+    '''
+    @summary: look up the word vector for each token
+    @param:
+        datas: list of token lists
+        shape: shape of the result
+    @return: array, the word embeddings in the given shape
+    '''
+    model_w2v = getModel_w2v()
+    embed = np.zeros(shape)
+    length = shape[1]
+    out_index = 0
+    #print(datas)
+    for data in datas:
+        index = 0
+        for item in data:
+            item_not_space = re.sub("\s*","",item)
+            if index>=length:
+                break
+            if item_not_space in model_w2v.vocab:
+                embed[out_index][index] = model_w2v[item_not_space]
+                index += 1
+            else:
+                embed[out_index][index] = model_w2v['unk']
+                index += 1
+        out_index += 1
+    return embed
+
+def save_model():
+    graph = tf.Graph()
+    with graph.as_default() as graph:
+        with tf.Session(graph=graph).as_default() as sess:
+            test_model = getModel2()
+            test_model.load_weights("model_time_classify.weights")
+            tf.saved_model.simple_save(sess,
+                                       "models/timesplit_model/",
+                                       inputs={"input0": test_model.input[0],
+                                               "input1":test_model.input[1]
+                                               },
+                                       outputs={"outputs": test_model.output})
+
+
+
 if __name__ == '__main__':
     # get_data()
     # getModel()
+    # getModel2()
+    # getModel3()
     # training()
     # train2()
+    # train3()
     # data_process()
     # data_process2()
     # data_process3()
     # predict()
     # predict2()
+    # predict3()
+    # predict4()
+    save_model()

     pass
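For reference, a sketch of reading back the SavedModel written by save_model() above, using the TF1 loader. The SERVING tag and the 'serving_default' signature are the tf.saved_model.simple_save defaults, and the input/output keys match the ones exported above; adapt as needed.

    import tensorflow as tf

    with tf.Session(graph=tf.Graph()) as sess:
        # simple_save exports under the SERVING tag with the default signature key
        meta_graph = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING], "models/timesplit_model/")
        sig = meta_graph.signature_def["serving_default"]
        input0 = sess.graph.get_tensor_by_name(sig.inputs["input0"].name)
        input1 = sess.graph.get_tensor_by_name(sig.inputs["input1"].name)
        outputs = sess.graph.get_tensor_by_name(sig.outputs["outputs"].name)
        # feed two (batch, 40, 128) context arrays, matching input_shape2 above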