123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172 |
- import module.model as model
- import featureEngine
- from keras.callbacks import ModelCheckpoint
- import numpy as np
- import pickle
- import os
- from module.Utils import *
- from keras import models
- def save(object_to_save, path):
- '''
- 保存对象
- @Arugs:
- object_to_save: 需要保存的对象
- @Return:
- 保存的路径
- '''
- with open(path, 'wb') as f:
- pickle.dump(object_to_save, f)
- def load(path):
- '''
- 读取对象
- @Arugs:
- path: 读取的路径
- @Return:
- 读取的对象
- '''
- with open(path, 'rb') as f:
- object = pickle.load(f)
- return object
- def train():
- pk_file = "iterator/data_28849_16.pk"
- if os.path.exists(pk_file):
- data = load(pk_file)
- #data = featureEngine.paddinig(data, pad=False)
- #data[1] = np.argmax(data[1],-1)
- #print(np.shape(data[0]))
-
-
- else:
- data = featureEngine.getAllData()
- save(data,"data_"+str(len(data[1]))+".pk")
- model1 = model.getBiRNNModel()
- #model1.load_weights("../model_data/ep028-loss0.062-val_loss0.102-f10.9624.h5")
- model_file = "contentExtract.h5"
- log_dir = "log/"
- train_percent = 0.8
- test_percent=0.9
- print(np.shape(data[0]))
- train_len = round(len(data[0])*train_percent)
- test_len = round(len(data[0])*test_percent)
- checkpoint = ModelCheckpoint(log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_acc{val_acc:.3f}-val_loss{val_loss:.3f}-f1{val_f1_score:.4f}.h5',
- monitor='loss', save_best_only=True, period=1,mode="min")
- history_model = model1.fit(x=data[0][:train_len],y=data[1][:train_len],validation_data=(data[0][train_len:test_len],data[1][train_len:test_len]),epochs=400,batch_size=256,shuffle=True,callbacks=[checkpoint])
-
- def predict(x):
- '''
- model1 = model.getBiRNNModel()
- model1.load_weights("../model_data/ep133-loss-0.991-val_acc0.972-val_loss-0.951-f10.3121.h5")
- '''
- path = "log/ep011-loss0.160-val_acc0.900-val_loss0.156-f10.4536.h5"
- model1 = models.load_model(path, custom_objects={"acc":acc,"precision":precision,"recall":recall,"f1_score":f1_score,"my_loss":my_loss})
-
- return model1.predict(x,batch_size=1)
- def test(url):
- os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
- os.environ["CUDA_VISIBLE_DEVICES"] = ""
- data = featureEngine.getInput_byJS(url)
- if data is not None:
- x,list_inner,list_xpath,_ = data
- print("x:",x)
- p = predict(x)
- print(p)
- print(np.argmax(p,1))
- print(p[0][np.argmax(p,1)[0][1]])
- print(list_inner[np.argmax(p,1)[0][1]])
- #print(list_inner[4])
- print(list_xpath[np.argmax(p,1)[0][1]])
-
- def val():
- pk_file = "iterator/data_28849_16.pk"
- data = load(pk_file)
- train_percent = 0.9
- train_len = round(len(data[0])*train_percent)
- #print(np.shape(data))
- predict_y = np.argmax(predict(data[0][train_len:]),1)
- label_y = np.argmax(data[1][train_len:],1)
- list_url = data[2][train_len:]
- size_predict = 0
- size_considence = 0
- dict_root_true_wrong = dict()
- for _predict,_label,_url in zip(predict_y,label_y,list_url):
- root = _url.split("/")[2]
- if root not in dict_root_true_wrong:
- dict_root_true_wrong[root] = [0,0]
- if _predict[1]==_label[1]:
- size_considence += 1
- dict_root_true_wrong[root][0] += 1
- else:
- dict_root_true_wrong[root][1] += 1
- print(_url)
- size_predict += 1
- list_root_true_wrong = []
- for _key in dict_root_true_wrong.keys():
- list_root_true_wrong.append([_key,dict_root_true_wrong[_key]])
- list_root_true_wrong.sort(key=lambda x:x[1][1]/(x[1][0]+x[1][1]))
- print(list_root_true_wrong)
- print(size_considence,size_predict)
-
- def iteratorLabel():
- '''
- @summary: 迭代地进行数据的修复
- '''
- data_file = "iterator/data_28849_35.pk"
- threshold = 0.93
- train_epochs = 10
- batch_size=96
- data = load(data_file)
- data_split = round(len(data[1])*0.5)
- last_change_set = set()
- this_change_set = set()
- max_not_change_times = 10
- _not_change_times = 0
- _time = 0
- while(True):
- _time += 1
- #训练模型
- model_1 = model.getBiRNNModel()
- model_2 = model.getBiRNNModel()
- model_1.fit(x=data[0][:data_split],y=data[1][:data_split],epochs=train_epochs,batch_size=batch_size,shuffle=True)
- model_2.fit(x=data[0][data_split:],y=data[1][data_split:],epochs=train_epochs,batch_size=batch_size,shuffle=True)
-
- predict_1 = model_1.predict(data[0])
- predict_2 = model_2.predict(data[0])
- _index = 0
- for _max_1,_max_2,_y1,_y2,Y,_url in zip(np.max(predict_1,1),np.max(predict_2,1),np.argmax(predict_1,1),np.argmax(predict_2,1),np.argmax(data[1],1),data[2]):
- if _y1[1]==_y2[1] and _y1[1]!=Y[1] and _max_1[1]>threshold and _max_2[1]>threshold:
- #修改标注
- data[1][_index][Y[1]] = 0
- data[1][_index][_y1[1]] = 1
- this_change_set.add(_url)
- _index += 1
- if len(this_change_set-last_change_set)<10:
- _not_change_times += 1
- else:
- _not_change_times = 0
- if _not_change_times>=max_not_change_times:
- break
- last_change_set = this_change_set
- this_change_set = set()
- save(data,"iterator/data_"+str(len(data[1]))+"_1.pk")
-
-
-
-
- if __name__=="__main__":
- #train()
- test(url = "https://www.600757.com.cn/show-106-14208-1.html")
- #val()
- #print(2248/2555)
- #iteratorLabel()
- pass
-
|