train.py

'''
Created on 2019-03-26

@author: User
'''
import sys
import os
sys.path.append(os.path.abspath("../.."))
from BiddingKG.dl.common.models import *
from keras.callbacks import ModelCheckpoint
from keras import models
import numpy as np
import pandas as pd
import time
from BiddingKG.dl.common.Utils import *
import tensorflow as tf
import matplotlib.pyplot as plt
from generateData import *
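
# Trains the TextCNN form classifier on rows read from train.xls / test.xls and
# checkpoints the weights with the best validation loss. getTextCNNModel() and
# getData() come from the wildcard-imported project modules; getData() is assumed
# to return encoded inputs and one-hot labels.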
def train():
    model = getTextCNNModel()
    train_x, train_y = getData("train.xls")
    test_x, test_y = getData("test.xls")
    callback = ModelCheckpoint('log/' + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1{val_f1_score:.4f}.h5',
                               monitor="val_loss", verbose=1, save_best_only=True, mode="min")
    model.fit(x=train_x, y=train_y, batch_size=96, epochs=400, callbacks=[callback], shuffle=True,
              validation_data=(test_x, test_y))
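
# BiLSTM variant of the training routine: the prepared train/test split is cached
# in all_data.pk (load()/save() are assumed to be pickle helpers from the common
# Utils module) and the model is built inside its own tf.Graph/tf.Session.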
def train1():
    data_pk = "all_data.pk"
    if os.path.exists(data_pk):
        train_x, train_y, test_x, test_y, test_text = load(data_pk)
    else:
        train_x, train_y, test_x, test_y, test_text = getTrainData()
        save((train_x, train_y, test_x, test_y, test_text), data_pk)
    with tf.Session(graph=tf.Graph()) as sess:
        with sess.graph.as_default():
            print("loading vocabulary and embedding matrix")
            vocab, matrix = getVocabAndMatrix(getModel_word())
            model = getBiLSTMModel(input_shape=(1, 50, 60), vocab=vocab, embedding_weights=matrix, classes=2)
            # model = getTextCNNModel(input_shape=(1, 30, 60), vocab=vocab, embedding_weights=matrix, classes=2)
            print("model built, starting training")
            callback = ModelCheckpoint('log/' + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1{val_f1_score:.4f}.h5',
                                       monitor="val_loss", verbose=1, save_best_only=True, save_weights_only=True, mode="min")
            model.fit(x=train_x, y=train_y, batch_size=128, epochs=400, callbacks=[callback], shuffle=True,
                      validation_data=(test_x, test_y))
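
# Runs a checkpointed model over the cached test set and writes two spreadsheets:
# vali_true.xls with correctly classified rows and vali_wrong.xls with the
# misclassified ones (filename, text, gold label, prediction, predicted probability).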
def vali():
    data_pk = "all_data.pk"
    train_x, train_y, test_x, test_y, test_text = load(data_pk)
    model = models.load_model("log/loss_ep106-loss0.008-val_loss0.134-f10.9768.h5",
                              custom_objects={"precision": precision, "recall": recall, "f1_score": f1_score,
                                              "Attention": Attention})
    predict = model.predict(test_x)
    predict_y = np.argmax(predict, 1)
    # correctly classified rows -> vali_true.xls
    list_filename = []
    list_text = []
    list_label = []
    list_predict = []
    list_prob = []
    data = []
    for y, y_, text, prob in zip(np.argmax(test_y, 1), predict_y, test_text, predict):
        if y == y_:
            data.append([text[0], text[1], y, y_, prob[y_]])
    data.sort(key=lambda x: x[2])
    for item in data:
        list_filename.append(item[0])
        list_text.append(item[1])
        list_label.append(item[2])
        list_predict.append(item[3])
        list_prob.append(item[4])
    df = pd.DataFrame(
        {"list_filename": list_filename, "list_text": list_text, "list_label": list_label,
         "list_predict": list_predict, "list_prob": list_prob})
    df.to_excel("vali_true.xls", columns=["list_filename", "list_text", "list_label", "list_predict", "list_prob"])
    # misclassified rows -> vali_wrong.xls
    list_filename = []
    list_text = []
    list_label = []
    list_predict = []
    list_prob = []
    data = []
    for y, y_, text, prob in zip(np.argmax(test_y, 1), predict_y, test_text, predict):
        if y != y_:
            data.append([text[0], text[1], y, y_, prob[y_]])
    data.sort(key=lambda x: x[2])
    for item in data:
        list_filename.append(item[0])
        list_text.append(item[1])
        list_label.append(item[2])
        list_predict.append(item[3])
        list_prob.append(item[4])
    df = pd.DataFrame(
        {"list_filename": list_filename, "list_text": list_text, "list_label": list_label,
         "list_predict": list_predict, "list_prob": list_prob})
    df.to_excel("vali_wrong.xls", columns=["list_filename", "list_text", "list_label", "list_predict", "list_prob"])
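
# Ad-hoc inference helper: encodes a list of table-row strings (encoding() is
# assumed to convert a string into the model's fixed-size character-id input),
# loads checkpointed TextCNN weights and returns the raw class probabilities.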
def test(list_text):
    x = []
    for text in list_text:
        x.append(encoding(text))
    x = np.array(x)
    # x = np.expand_dims(encoding(text), 0)
    # test_x, test_y = getData("test.xls")
    model = getTextCNNModel()
    model.load_weights("log/ep082-loss0.044-val_loss0.126-f10.9592.h5")
    a = time.time()
    predict_y = model.predict(x)
    print("cost", time.time() - a)
    # model.save("model/model_form.model.hdf5")
    return predict_y
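
# Sweeps the prediction-confidence threshold from 0.5 towards 1.0 on the test set
# and plots the resulting F1 score, to help choose a cutoff below which
# predictions should be discarded.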
def getBestThreshold():
    # Returns the F1 score computed over predictions whose top-class probability exceeds the threshold.
    def getAccuracyRecall(predict, threshold, test_y):
        nums = 0
        counts = 0
        for item, _max, y in zip(predict, np.argmax(predict, 1), np.argmax(test_y, 1)):
            if item[_max] > threshold:
                if _max == y:
                    nums += 1
                counts += 1
        if counts == 0 or nums == 0:
            return 0
        precision = nums / counts
        recall = nums / len(test_y)
        return 2 * ((precision * recall) / (precision + recall))
        # return precision, recall

    model = getTextCNNModel()
    model.load_weights("model/model_form.model.hdf5")
    test_x, test_y = getData("test.xls")
    predict_y = model.predict(test_x)
    threshold = 0.5
    x = []
    y = []
    while threshold < 1:
        x.append(threshold)
        t0 = getAccuracyRecall(predict_y, threshold, test_y)
        y.append(t0)
        print(threshold, t0)
        threshold += 0.001
    plt.plot(x, y)
    plt.show()
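
# Rebuilds the BiLSTM form model, loads the selected checkpoint weights and
# exports a TensorFlow SavedModel to ./form_savedmodel/ for serving.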
def save_form_model():
    with tf.Session(graph=tf.Graph()).as_default() as sess:
        with sess.graph.as_default():
            vocab, matrix = getVocabAndMatrix(getModel_word())
            model = getBiLSTMModel(input_shape=(1, 50, 60), vocab=vocab, embedding_weights=matrix, classes=2)
            model.load_weights(filepath="log/ep029-loss0.044-val_loss0.057-f10.9788.h5")
            tf.saved_model.simple_save(sess,
                                       "./form_savedmodel/",
                                       inputs={"inputs": model.input},
                                       outputs={"outputs": model.output})

if __name__ == "__main__":
    # train()
    # print(test(["序号|项目名称|中选人"]))
    # getBestThreshold()
    # train1()
    # vali()
    save_form_model()