
Update and optimize the encoding scheme of the time classification model

admin 4 years ago
parent
commit
07d547b5e5

+ 7 - 4
BiddingKG/dl/interface/predictor.py

@@ -1134,7 +1134,7 @@ class TimePredictor():
         self.sess = tf.Session(graph=tf.Graph())
         self.inputs_code = None
         self.outputs_code = None
-        self.input_shape = (2,30,60)
+        self.input_shape = (2,10,128)
         self.load_model()
 
     def load_model(self):
@@ -1168,10 +1168,13 @@ class TimePredictor():
                     while(p_sentences<len(list_sentence)):
                         sentence = list_sentence[p_sentences]
                         if entity.doc_id == sentence.doc_id and entity.sentence_index == sentence.sentence_index:
-                            left = sentence.sentence_text[max(0,entity.wordOffset_begin-self.input_shape[1]):entity.wordOffset_begin]
-                            right = sentence.sentence_text[entity.wordOffset_end:entity.wordOffset_end+self.input_shape[1]]
+                            # left = sentence.sentence_text[max(0,entity.wordOffset_begin-self.input_shape[1]):entity.wordOffset_begin]
+                            # right = sentence.sentence_text[entity.wordOffset_end:entity.wordOffset_end+self.input_shape[1]]
+                            s = spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=self.input_shape[1])
+                            left = s[0]
+                            right = s[1]
                             context = [left, right]
-                            x = embedding_word(context, shape=self.input_shape)
+                            x = embedding(context, shape=self.input_shape)
                             data_x.append(x)
                             points_entitys.append(entity)
                             break
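Note on the hunk above: it replaces raw character slices fed through embedding_word with token windows taken by spanWindow and fed through embedding, matching the new input_shape of (2, 10, 128): two contexts of 10 tokens embedded as 128-dim vectors, instead of 30 characters at 60 dims. A minimal sketch of what the spanWindow call is assumed to return (it is a project helper; its internals are not part of this diff):

# Sketch only: spanWindow is a BiddingKG helper whose implementation is not
# shown here; this illustrates the assumed (left, right) token-window output.
def span_window_sketch(tokens, begin_index, end_index, size=10):
    # Left context: up to `size` tokens before the entity.
    left = tokens[max(0, begin_index - size):begin_index]
    # Right context: up to `size` tokens from end_index on (end_index treated
    # as exclusive here, matching how data_process3 below computes entity_tend;
    # the real helper may define it differently).
    right = tokens[end_index:end_index + size]
    return left, right

tokens = ["评标", "日期", ":", "2020", "年", "11", "月", "30", "日"]
left, right = span_window_sketch(tokens, begin_index=3, end_index=9, size=10)
print(left)   # ['评标', '日期', ':']
print(right)  # []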

BIN
BiddingKG/dl/interface/timesplit_model/saved_model.pb


BIN
BiddingKG/dl/interface/timesplit_model/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/interface/timesplit_model/variables/variables.index


+ 12 - 5
BiddingKG/dl/test/test4.py

@@ -114,7 +114,7 @@ def test(name,content):
 if __name__=="__main__":
     # filename = "比地_52_79929693.html"
     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
-    text = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
+    text = codecs.open("C:\\Users\\admin\\Desktop\\新建文本文档 (2).txt","r",encoding="utf8").read()
     content = str(BeautifulSoup(text).find("div",id="pcontent"))
     # df_a = {"html":[]}
     # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
@@ -134,11 +134,18 @@ if __name__=="__main__":
     # 建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,\
     # 二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
     a = time.time()
-    # text = '''
-    # ,光大证券统一认证系统服务器硬件设备更新项目中标候选人公示,项目名称:光大证券统一认证系统服务器硬件设备更新项目,招标编号:CG-202011-030-001,公告日期:2020年12月3日,评标日期:2020年11月30日13时32分,评标地点:光大证券集中采购管理平台,推荐中标候选人:上海致为信息技术有限公司,联系人:殷志超,联系电话:021-22169419
-    # '''
+    text = '''
+    ,清远市清新区治理道路货物运输车辆非法超限超载工作领导小组清远市清新区治理道路货物运输车辆非法超限超载工作领导小组喷墨打印机网上商城合同
+    验收报告,一、合同编号:GDMALL2019123563,。二、合同名称:清远市清新区治理道路货物运输车辆非法超限超载工作领导小组喷墨打印机网上商城合同。
+    三、中标、成交供应商:广州爱联科技有限公司,地址:广州市黄埔大道西468号勤建商务大厦14层。联系人:周勇联系电话:020-85180120,。
+    四、合同金额(元):¥3,270.00,。五、合同详细信息:。采购项目编号::441827-201910-531001-0013,中标/成交标的名称::喷墨打印机,
+    数量::1台。采购项目名称::喷墨打印机,规格型号::WF-7218,中标/成交金额(元)::3,270.00。服务要求::,。,。六、验收结论:已通过。
+    七、验收小组成员名单::。八、联系事项:。(一)采购人:清远市清新区治理道路货物运输车辆非法超限超载工作领导小组,地址:太和镇玄真路49号。
+    联系人:苏美彩,联系电话:0763-5835988,。(二)采购代理机构:地址::。联系人:联系电话::。附件::。
+    发布人:清远市清新区治理道路货物运输车辆非法超限超载工作领导小组。发布时间:2019年11月26日
+    '''
     print("start")
-    print(predict("12",content,"重庆市綦江区人民法院关于重庆市綦江区文龙街道沙溪路22号银海新城六期45号楼、46号楼、47号楼负一层213号车位(第一次拍卖)的公告"))
+    print(predict("12",text,"重庆市綦江区人民法院关于重庆市綦江区文龙街道沙溪路22号银海新城六期45号楼、46号楼、47号楼负一层213号车位(第一次拍卖)的公告"))
     # print(predict("投诉处理公告", text))
     #test("12",text)
     print("takes",time.time()-a)

BIN
BiddingKG/dl/time/model_label_time_classify.model.hdf5


+ 176 - 86
BiddingKG/dl/time/train_2.py

@@ -13,14 +13,52 @@ from sklearn.utils import shuffle,class_weight
 import matplotlib.pyplot as plt
 
 input_shape = (2,30,60)
+input_shape2 = (2,10,128)
 output_shape = [4]
 
+def get_data():
+    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
+    id_set = set()
+    for id in data_load['document_id']:
+        id_set.add(id)
+    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.101")
+    sql = "SELECT A.human_identifier,A.sentences,A.tokens,A.offsets_to_text,B.value " \
+          "FROM corpus_iedocument A,brat_bratannotation B " \
+          "WHERE A.human_identifier = '%s' " \
+          "AND A.human_identifier = B.document_id "
+    db_data = []
+    count = 0
+    for id in list(id_set):
+        count+=1
+        print(count)
+        cur1 = conn.cursor()
+        cur1.execute(sql % (id))
+        db_data.extend(cur1.fetchall())
+        cur1.close()
+    conn.close()
+    columns = ['document_id','sentences','tokens','offsets_to_text','value']
+    df = pd.DataFrame(db_data, columns=columns)
+    df = df[df['value'].str.contains('time')]
+    df = df.reset_index(drop=True)
+    print(len(df))
+    time_label = df['value'].str.split(expand=True)
+    time_label.columns = ['_', 'label_type', 'begin_index', 'end_index', 'entity_text']
+    time_label = time_label.drop('_', axis=1)
+    df = pd.concat([df, time_label], axis=1)
+    print(df.info())
+    df['tokens'] = [token[2:-2].split("', '") for token in df['tokens']]
+    df['sentences'] = [sentence[1:-1].split(", ") for sentence in df['sentences']]
+    df['sentences'] = [[int(s) for s in sentence] for sentence in df['sentences']]
+    df['offsets_to_text'] = [offset[1:-1].split(", ") for offset in df['offsets_to_text']]
+    df['offsets_to_text'] = [[int(o) for o in offset] for offset in df['offsets_to_text']]
+    save(df,'db_time_data.pk')
+
 def getModel():
     '''
     @summary: time classification model
     '''
-    L_input = layers.Input(shape=input_shape[1:], dtype='float32')
-    R_input = layers.Input(shape=input_shape[1:], dtype='float32')
+    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
+    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
     L_lstm = layers.Bidirectional(layers.LSTM(40,return_sequences=True,dropout=0.1))(L_input)
     # L_lstm = layers.LSTM(32,return_sequences=True,dropout=0.2)(L_input)
     avg_l = layers.GlobalAveragePooling1D()(L_lstm)
@@ -40,36 +78,11 @@ def getModel():
     model.summary()
     return model
 
-def getModel_center():
-    '''
-    @summary: time classification model
-    '''
-    L_input = layers.Input(shape=input_shape[1:], dtype='float32')
-    R_input = layers.Input(shape=input_shape[1:], dtype='float32')
-    center_shape = (25, 60)
-    C_input = layers.Input(shape=center_shape, dtype='float32')
-    L_lstm = layers.Bidirectional(layers.LSTM(32,return_sequences=True,dropout=0.2))(L_input)
-    avg_l = layers.GlobalAveragePooling1D()(L_lstm)
-    C_lstm = layers.LSTM(32,return_sequences=True,dropout=0.2)(C_input)
-    avg_c = layers.GlobalAveragePooling1D()(C_lstm)
-    R_lstm = layers.Bidirectional(layers.LSTM(32,return_sequences=True,dropout=0.2))(R_input)
-    avg_r = layers.GlobalAveragePooling1D()(R_lstm)
-    concat = layers.merge([avg_l, avg_c, avg_r], mode='concat')
-
-    output = layers.Dense(output_shape[0],activation="softmax")(concat)
-
-    model = models.Model(inputs=[L_input,C_input,R_input], outputs=output)
-    learn_rate = 0.0005
-    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
-                  loss=losses.binary_crossentropy,
-                  metrics=[precision,recall,f1_score])
-    model.summary()
-    return model
-
 
 def training():
     data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
-    test_data = data_load.sample(frac=0.2, random_state=7)
+    data_load = data_load.reset_index(drop=True)
+    test_data = data_load.sample(frac=0.2, random_state=8)
     train_data = data_load.drop(test_data.index, axis=0)
     train_data =train_data.reset_index(drop=True)
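The added reset_index(drop=True) before the sample/drop split is not cosmetic: newdata_30_prc.csv is loaded with index_col=0, and if those index labels repeat, drop(test_data.index) removes every row sharing a sampled label. A small demonstration of the failure mode and the fix:

# Why reset_index matters before a sample/drop split: with duplicate index
# labels, label-based drop can remove more rows than were sampled.
import pandas as pd

df = pd.DataFrame({"x": range(4)}, index=[0, 0, 1, 1])
test = df.sample(frac=0.5, random_state=8)
print(len(df.drop(test.index)))       # can be 0: every row sharing a label goes

df = df.reset_index(drop=True)
test = df.sample(frac=0.5, random_state=8)
train = df.drop(test.index)
print(len(train), len(test))          # 2 2: a clean, disjoint split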
 
@@ -139,35 +152,32 @@ def training():
     res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
     print(res2)
 
-def training_center():
-    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata.csv", index_col=0)
-    test_data = data_load.sample(frac=0.25, random_state=7)
+def train2():
+    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv", index_col=0)
+    data_load = data_load.reset_index(drop=True)
+    data_load['context_left'] = [left[2:-2].split("', '") for left in data_load['context_left']]
+    data_load['context_right'] = [right[2:-2].split("', '") for right in data_load['context_right']]
+    test_data = data_load.sample(frac=0.2, random_state=8)
     train_data = data_load.drop(test_data.index, axis=0)
     train_data =train_data.reset_index(drop=True)
 
     train_x = []
     train_y = []
-    for left, center, right, label in zip(train_data['context_left'], train_data['entity_time'], train_data['context_right'], train_data['re_label']):
+    for left, right, label in zip(train_data['context_left'], train_data['context_right'], train_data['label']):
         y = np.zeros(output_shape)
         y[label] = 1
-        left = ''.join(str(left))
-        right = ''.join(str(right))
-        center = ''.join(str(center))
-        context = [left,center, right]
-        x = embedding_word(context, shape=(3,25,60))
+        context = [left, right]
+        x = embedding(context, shape=input_shape2)
         train_x.append(x)
         train_y.append(y)
 
     test_x = []
     test_y = []
-    for left, center, right, label in zip(test_data['context_left'], train_data['entity_time'], test_data['context_right'], test_data['re_label']):
+    for left, right, label in zip(test_data['context_left'], test_data['context_right'], test_data['label']):
         y = np.zeros(output_shape)
         y[label] = 1
-        left = ''.join(str(left))
-        right = ''.join(str(right))
-        center = ''.join(str(center))
-        context = [left, center, right]
-        x = embedding_word(context, shape=(3,25,60))
+        context = [left, right]
+        x = embedding(context, shape=input_shape2)
         test_x.append(x)
         test_y.append(y)
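train2 and predict2 rebuild token lists from their CSV string form with slicing plus split("', '"), which assumes every list was serialized exactly as Python's repr with single quotes. Where that assumption is shaky (embedded quotes, empty lists), ast.literal_eval is a sturdier equivalent; a sketch:

# Alternative parsing sketch for the stringified token lists above.
import ast

raw = "['投标', '截止', '时间', ':']"
tokens = ast.literal_eval(raw)        # safe eval of a Python literal
print(tokens)                         # ['投标', '截止', '时间', ':']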
 
@@ -175,79 +185,83 @@ def training_center():
     train_x, test_x = (np.array(train_x), np.array(test_x))
     train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
 
-    model = getModel_center()
-    epochs = 70
+    model = getModel()
+    epochs = 150
     batch_size = 256
     checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
                                  save_best_only=True, mode='min')
     # cw = class_weight.compute_class_weight('auto',np.unique(np.argmax(train_y,axis=1)),np.argmax(train_y,axis=1))
     # cw = dict(enumerate(cw))
     history = model.fit(
-        x=[train_x[0], train_x[1], train_x[2]],
+        x=[train_x[0], train_x[1]],
         y=train_y,
-        validation_data=([test_x[0], test_x[1], test_x[2]], test_y),
-        # validation_data=(test_x[0],test_y),
+        validation_data=([test_x[0], test_x[1]], test_y),
         epochs=epochs,
         batch_size=batch_size,
         shuffle=True,
         callbacks=[checkpoint],
         class_weight='auto'
     )
-    plot_loss(history = history)
+    # plot_loss(history=history)
     load_model = models.load_model("model_label_time_classify.model.hdf5",
                                    custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
-    y_pre = load_model.predict([test_x[0], test_x[1], test_x[2]])
+    y_pre = load_model.predict([test_x[0], test_x[1]])
     # y_pre = load_model.predict(test_x[0])
     # per-class prediction evaluation
     res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
     print(res1)
-    y_pre2 = load_model.predict([train_x[0], train_x[1], train_x[2]])
+    y_pre2 = load_model.predict([train_x[0], train_x[1]])
     # y_pre2 = load_model.predict(train_x[0])
     res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
     print(res2)
 
-def predict():
+
+def predict2():
     model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
-    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30.csv", index_col=0)
+    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv", index_col=0)
+    data_load['context_left'] = [left[2:-2].split("', '") for left in data_load['context_left']]
+    data_load['context_right'] = [right[2:-2].split("', '") for right in data_load['context_right']]
     test_x = []
     test_y = []
-    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label']):
+    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['label']):
         y = np.zeros(output_shape)
         y[label] = 1
-        left = ''.join(str(left))
-        right = ''.join(str(right))
         context = [left, right]
-        x = embedding_word(context, shape=input_shape)
+        x = embedding(context, shape=input_shape2)
         test_x.append(x)
         test_y.append(y)
     test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
     pre_y = model1.predict([test_x[0],test_x[1]])
     data_load['pre'] = [np.argmax(item) for item in pre_y]
-    error_data = data_load[data_load['re_label']!=data_load['pre']]
+    error_data = data_load[data_load['label']!=data_load['pre']]
     # print(error_data.info())
-    error_data.to_csv("C:\\Users\\admin\\Desktop\\test\\error4-0.2-0.6_30.csv")
+    error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")
 
-def predict_center():
+def predict():
     model1 = models.load_model("model_label_time_classify.model.hdf5",custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
-    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata.csv", index_col=0)
+    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
     test_x = []
     test_y = []
-    for left, center, right, label in zip(data_load['context_left'],data_load['entity_time'], data_load['context_right'], data_load['re_label']):
+    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label']):
         y = np.zeros(output_shape)
         y[label] = 1
-        left = ''.join(str(left))
-        right = ''.join(str(right))
-        center = ''.join(str(center))
-        context = [left, center, right]
-        x = embedding_word(context, shape=(3, 25, 60))
+        left = str(left)
+        right = str(right)
+        if left == 'nan': left = ''
+        if right == 'nan': right = ''
+        left = list(left)
+        right = list(right)
+        context = [left, right]
+        x = embedding_word(context, shape=input_shape)
         test_x.append(x)
         test_y.append(y)
     test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
-    pre_y = model1.predict([test_x[0],test_x[1],test_x[2]])
+    pre_y = model1.predict([test_x[0],test_x[1]])
     data_load['pre'] = [np.argmax(item) for item in pre_y]
     error_data = data_load[data_load['re_label']!=data_load['pre']]
     # print(error_data.info())
-    error_data.to_csv("C:\\Users\\admin\\Desktop\\test\\error_center.csv")
+    error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")
+
 
 def data_process():
     data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30.csv", index_col=0)
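A note on the shape handling shared by train2, predict2 and predict above: samples are stacked as a (N, 2, 10, 128) array of [left, right] pairs, and np.transpose(..., (1, 0, 2, 3)) moves the pair axis to the front so that indices 0 and 1 line up with the model's two Input layers. A minimal sketch:

# How the (1, 0, 2, 3) transpose splits stacked [left, right] pairs into the
# two inputs the model expects (shapes follow input_shape2 = (2, 10, 128)).
import numpy as np

n_samples = 5
batch = np.zeros((n_samples, 2, 10, 128))   # each row is one [left, right] pair
inputs = np.transpose(batch, (1, 0, 2, 3))  # -> (2, n_samples, 10, 128)
left_input, right_input = inputs[0], inputs[1]
print(left_input.shape, right_input.shape)  # (5, 10, 128) (5, 10, 128)
# model.predict([left_input, right_input]) then matches the two Input layers.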
@@ -273,6 +287,93 @@ def data_process():
     data_load['context_right'] = right_list
     data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv")
 
+def data_process2():
+    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
+    left_list = []
+    right_list = []
+    for left, right in zip(data_load['context_left'], data_load['context_right']):
+        left = str(left)
+        right = str(right)
+        if right=='nan':
+            right = ''
+        if left=='nan':
+            left = ''
+        left = left[max(len(left)-20,0):]
+        right = right[:20]
+        left_list.append(left)
+        right_list.append(right)
+    data_load['context_left'] = left_list
+    data_load['context_right'] = right_list
+    data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_20_prc.csv")
+
+def data_process3():
+    data = load('db_time_data.pk')
+    data = data.drop('value', axis=1)
+    token_begin = []
+    token_end = []
+    context_left = []
+    context_right = []
+    data2 = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc2.csv")
+    label = []
+    # data=data[:20]
+    for id,sentences,tokens,offset,begin,end,entity_text in zip(data['document_id'],data['sentences'],data['tokens'],data['offsets_to_text'],
+                                                             data['begin_index'],data['end_index'],data['entity_text']):
+        _label = data2[(data2['document_id']==int(id)) & (data2['begin_index']==int(begin))][:1]
+        if not _label.empty:
+            _label = int(_label['re_label'])
+        else:
+            _label=0
+        label.append(_label)
+        begin = int(begin)
+        end = int(end)
+        entity_tbegin = 0
+        entity_tend = 0
+        find_begin = False
+
+        for t in range(len(offset)):
+            if not find_begin:
+                if offset[t]==begin:
+                    entity_tbegin = t
+                    find_begin = True
+                if offset[t]>begin:
+                    entity_tbegin = t-1
+                    find_begin = True
+            if offset[t] >= end:
+                entity_tend = t
+                break
+        token_begin.append(entity_tbegin)
+        token_end.append(entity_tend)
+        s = spanWindow(tokens=tokens,begin_index=entity_tbegin,end_index=entity_tend,size=10)
+        s1 = s[0]
+        _temp1 = []
+        for i in range(len(s1)):
+            if s1[i]=="。":
+                _temp1.append(i)
+        if _temp1:
+            s1 = s1[_temp1[-1]+1:]
+        s2 = s[1]
+        _temp2 = []
+        for i in range(len(s2)):
+            if s2[i] == "。":
+                _temp2.append(i)
+                break
+        if _temp2:
+            s2 = s2[:_temp2[0]+1]
+            # print(s2)
+        context_left.append(s1)
+        context_right.append(s2)
+        print(id)
+        # print(_label)
+        # print(entity_text)
+        # print(tokens[entity_tbegin:entity_tend])
+    data['token_begin'] = token_begin
+    data['token_end'] = token_end
+    data['context_left'] = context_left
+    data['context_right'] = context_right
+    data['label'] = label
+    data = data.drop(['tokens','offsets_to_text','sentences'],axis=1)
+    data.to_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv")
+
 def plot_loss(history):
     plt.plot(history.history['loss'])
     plt.plot(history.history['val_loss'])
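The data_process3 hunk above aligns brat character offsets to token indices with a linear scan: token_begin is the last token starting at or before the entity's begin offset, token_end the first token starting at or past its end, and each spanWindow context is then trimmed at the nearest "。" sentence boundary. An equivalent sketch of the alignment using bisect (assuming offsets_to_text is sorted, as token start offsets are):

# Compact equivalent of the offset-to-token alignment in data_process3.
from bisect import bisect_left, bisect_right

def char_span_to_token_span(offsets, begin, end):
    i = bisect_right(offsets, begin) - 1  # last token starting at or before `begin`
    j = bisect_left(offsets, end)         # first token starting at or after `end`
    return max(i, 0), j

offsets = [0, 2, 5, 9, 12]                # token start offsets
print(char_span_to_token_span(offsets, 5, 9))   # (2, 3)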
@@ -283,25 +384,14 @@ def plot_loss(history):
     plt.show()
 
 if __name__ == '__main__':
+    # get_data()
     # getModel()
-    # getModel_center()
     # training()
+    # train2()
     # data_process()
-    # training_center()
+    # data_process2()
+    # data_process3()
     # predict()
-    # predict_center()
-    model1 = models.load_model("model_label_time_classify.model.hdf5",
-                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
-    test_x = []
-    test_y = []
-    left = '8675.20元人民币,(3)服务期限:'
-    right = '(4)质量:符合竞争性磋商文件规定的质'
-    context = [left, right]
-    x = embedding_word(context, shape=input_shape)
-    test_x.append(x)
-    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
-    pre_y = model1.predict([test_x[0],test_x[1]])
-    rs = [np.argmax(item) for item in pre_y]
-    print(pre_y, rs)
+    # predict2()
 
     pass