#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author : bidikeji
# @Time : 2020/4/24 0024 15:20
# "evaluate" is the entry method of this UDF and must use exactly that name
from odps.udf import annotate
from odps.distcache import get_cache_archive
from odps.distcache import get_cache_file
import time


def recall(y_true, y_pred):
    '''
    Compute recall.
    @Args:
        y_true: ground-truth labels
        y_pred: labels predicted by the model
    @Return:
        recall
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))  # true positives
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))           # actual positives
    if c3 == 0:
        return 0
    recall = c1 / c3
    return recall


def f1_score(y_true, y_pred):
    '''
    Compute the F1 score.
    @Args:
        y_true: ground-truth labels
        y_pred: labels predicted by the model
    @Return:
        F1 score
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))  # true positives
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))           # predicted positives
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))           # actual positives
    precision = c1 / (c2 + K.epsilon())  # epsilon avoids division by zero
    if c3 == 0:
        recall = 0
    else:
        recall = c1 / c3
    f1_score = 2 * (precision * recall) / (precision + recall + K.epsilon())
    return f1_score


def precision(y_true, y_pred):
    '''
    Compute precision.
    @Args:
        y_true: ground-truth labels
        y_pred: labels predicted by the model
    @Return:
        precision
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))  # true positives
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))           # predicted positives
    precision = c1 / (c2 + K.epsilon())  # epsilon avoids division by zero
    return precision
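

# Sanity check of the metric math on a toy batch (a sketch using plain numbers
# in place of keras.backend tensors; inside this UDF the functions are only
# needed as custom_objects when the saved model is loaded):
#   y_true = [1, 0, 1, 1], y_pred = [1, 0, 0, 1]
#   c1 (true positives) = 2, c2 (predicted positives) = 2, c3 (actual positives) = 3
#   precision = 2/2 = 1.0, recall = 2/3 ~= 0.667, F1 = 2*(1*2/3)/(1+2/3) = 0.8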


# Set up a bundled dependency package (e.g. pandas) from a cached archive resource
def include_package_path(res_name):
    import os, sys
    archive_files = get_cache_archive(res_name)
    # the shortest directory name is the archive root
    dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
                        if '.dist_info' not in f.name], key=lambda v: len(v))
    sys.path.append(dir_names[0])
    return os.path.dirname(dir_names[0])
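

# Usage sketch: with a zipped site-packages tree uploaded as the archive
# resource "envs_py37.env.zip" (the name used below in Extractor.__init__),
#   include_package_path("envs_py37.env.zip")
# puts the archive root on sys.path so that "import numpy", "import keras",
# etc. resolve from the bundle.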


# Restore a business data package by hand: upload size limits, Python-version
# differences and inconsistent archive extraction make the automatic
# mechanisms unreliable
def init_env(list_files, package_name):
    import os, sys
    if len(list_files) == 1:
        so_file = get_cache_file(list_files[0])
        cmd_line = os.path.abspath(so_file.name)
        os.system("unzip %s -d %s" % (cmd_line, package_name))
    elif len(list_files) > 1:
        # multi-part archive: concatenate the split parts, then unzip
        cmd_line = "cat"
        for _file in list_files:
            so_file = get_cache_file(_file)
            cmd_line += " " + os.path.abspath(so_file.name)
        cmd_line += " > temp.zip"
        os.system(cmd_line)
        os.system("unzip temp.zip -d %s" % (package_name))
    sys.path.append(os.path.abspath(package_name))
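

# Usage sketch (hypothetical resource names): a package too large for a single
# upload is stored as split-zip parts and restored with
#   init_env(['mypkg.z01', 'mypkg.z02'], 'mypkg')
# which cats the parts back into temp.zip and unzips it into ./mypkg; the
# single-file form unzips one cached resource directly.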


# Main UDF program
@annotate("string->string")
class Extractor(object):
    def __init__(self):
        # Declare the globals up front (a "global" statement must precede any
        # assignment to the name, otherwise Python 3 raises a SyntaxError);
        # the heavyweight imports happen below, after the cached archives
        # holding them have been unpacked.
        global log, json, re, np, tf, K
        import logging as log
        import os
        log.basicConfig(level=log.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        logger = log.getLogger(__name__)
        model_path = os.path.abspath(get_cache_file('model_changemedium_acc90.model').name)  # attentiongruacc0.932.model changed to New_attentionGUR_embed100_newlabel_20201020.h5 20201023
        log.info('model_path:%s' % model_path)
        log.info(os.path.exists(model_path))
        # init_env(['pyhanlp.z01', 'pyhanlp.z02', 'pyhanlp.z03', 'pyhanlp.z04'], 'pyhanlp')
        start_time = time.time()
        init_env(['pyhanlp.z01', 'pyhanlp.z02'], 'pyhanlp')
        log.info("init pyhanlp takes:%d" % (time.time() - start_time))
        start_time = time.time()
        # init_env(['envs_py37.zip.env'], 'envs_py37')
        include_package_path("envs_py37.env.zip")
        log.info("init envs_py37 takes:%d" % (time.time() - start_time))
        start_time = time.time()
        init_env(['so.env'], '.')
        init_env(['pkl_csv.z01'], '.')
        log.info("init pkl_csv takes:%d" % (time.time() - start_time))
        start_time = time.time()
        import pickle
        import csv
        import re
        import tensorflow as tf
        import numpy as np
        import keras.backend as K
        from keras import models
        from keras.engine.topology import Layer
        import json
        log.info('import package done------------------')
        # dirpath = os.path.abspath('pyhanlp')
        # path = dirpath + '/pyhanlp/static/__init__.py'  # return dirpath
        # dirpath = os.path.dirname(os.path.abspath(get_cache_file('pyhanlp.z01').name))
        # return '; '.join([a for a in os.listdir(os.listdir(dirpath)[0])])
        # path2 = os.path.abspath(get_cache_file('hanlpinit.txt').name)
        # content = []
        # with open(path2, encoding='utf-8') as f:
        #     for line in f:
        #         content.append(line)
        # # return '; '.join(content)
        # with open(path, 'w', encoding='utf-8') as f:
        #     f.writelines(content)
        # log.info('rewrite hanlp path done--------------------')
        # archive_files = get_cache_archive('token_stopwds.zip')
        # names = [os.path.dirname(os.path.normpath(f.name)) for f in archive_files]
        # with open(names[0] + '/bidi_classify_stop_words.csv', 'r', encoding='utf-8') as f:
        #     self.stopwords = [row[0] for row in csv.reader(f)]
        # with open(names[0] + '/word_index_955871.pk', 'rb') as f:
        #     self.word_index = pickle.load(f)
        from pyhanlp import HanLP, JClass
        HanLP.Config = JClass('com.hankcs.hanlp.HanLP$Config')
        HanLP.Config.ShowTermNature = False  # segment() should return plain words, no part-of-speech tags
        self.hanlp = HanLP
        log.info('import hanlp done---------------------')

        class Attention(Layer):
            log.info('******attention****************')
            print('-------attention------------------')

            def __init__(self, **kwargs):
                super(Attention, self).__init__(**kwargs)

            def build(self, input_shape):
                # W: (EMBED_SIZE, 1)
                # b: (MAX_TIMESTEPS, 1)
                # u: (MAX_TIMESTEPS, MAX_TIMESTEPS)
                self.W = self.add_weight(name="W_{:s}".format(self.name),
                                         shape=(input_shape[-1], 1),
                                         initializer="normal")
                self.b = self.add_weight(name="b_{:s}".format(self.name),
                                         shape=(input_shape[1], 1),
                                         initializer="zeros")
                self.u = self.add_weight(name="u_{:s}".format(self.name),
                                         shape=(input_shape[1], input_shape[1]),
                                         initializer="normal")
                super(Attention, self).build(input_shape)

            def call(self, x, mask=None):
                # input: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
                # et: (BATCH_SIZE, MAX_TIMESTEPS)
                et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
                # at: (BATCH_SIZE, MAX_TIMESTEPS)
                at = K.dot(et, self.u)
                at = K.exp(at)
                if mask is not None:
                    at *= K.cast(mask, K.floatx())
                # ot: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
                at /= K.cast(K.sum(at, axis=1, keepdims=True) + K.epsilon(), K.floatx())
                atx = K.expand_dims(at, axis=-1)
                ot = atx * x
                # output: (BATCH_SIZE, EMBED_SIZE)
                return K.sum(ot, axis=1)

            def compute_mask(self, input, input_mask=None):
                # do not pass the mask to the next layers
                return None

            def compute_output_shape(self, input_shape):
                # output shape: (BATCH_SIZE, EMBED_SIZE)
                return (input_shape[0], input_shape[-1])

            def get_config(self):
                return super(Attention, self).get_config()
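
        # What the layer above computes per batch element:
        #   et = tanh(x . W + b)           one score per timestep
        #   at = softmax(et . u)           (masked) attention weights
        #   output = sum_t at[t] * x[t]    attention-weighted sum of the inputs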

        # load_model must be given every custom metric/layer the model was
        # saved with, keyed by the names used at save time
        self.model = models.load_model(model_path,
                                       custom_objects={'precision': precision,
                                                       'recall': recall,
                                                       'f1_score': f1_score,
                                                       'Attention': Attention})
        log.info('init model end --')
        pk_path = os.path.abspath('pkl_csv')
        with open(pk_path + '/id2label.pkl', 'rb') as f:  # '/label_mapping210.pkl' changed to id2label.pkl 20201023
            self.label_map = pickle.load(f)
        print('load label_map done')
        with open(pk_path + '/bidi_classify_stop_words.csv', 'r', encoding='utf-8') as f:
            self.stopwords = [row[0] for row in csv.reader(f)]
        with open(pk_path + '/word_index_955871.pk', 'rb') as f:
            self.word_index = pickle.load(f)
        with open(pk_path + '/class2dalei_menlei.pkl', 'rb') as f:  # class_subclass_dic211.pk changed to class2dalei_menlei.pkl 20201023
            self.class_dic = pickle.load(f)
        log.info('class init done ----')

    def evaluate(self, text):
        # Strip HTML: collapse whitespace, drop <script>/<style> blocks, then remove any remaining tags
        text = re.sub(r'\s', '', str(text))
        text = re.sub(r'<\s*script[^>]*>.*?<\s*/\s*script\s*>', '', text)
        text = re.sub(r'<\s*style[^>]*>.*?<\s*/\s*style\s*>', '', text)
        text = re.sub(r'</?\w+[^>]*>', '', text)
        # Drop noise characters (Latin letters, dates, digits, punctuation); keep the first 500 characters
        text = re.sub(r'\{.*font.*\}|\{.*Font.*\}|[^\u4e00-\u9fa5]', '', text)[:500]
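        # [^\u4e00-\u9fa5] matches anything outside the CJK Unified Ideographs
        # block, so only Chinese characters survive into segmentation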
        # Segment with HanLP
        result = self.hanlp.segment(text)
        text_list = [str(result.get(i)) for i in range(result.size())]
        # Filter stopwords
        # text_list = [word for word in text_list if word not in self.stopwords and len(word) > 1]  # stopword filtering disabled 20201023
        # Order-preserving deduplication
        # l2 = []
        # [l2.append(i) for i in text_list if i not in l2]  # dedup disabled 20201023
        # Map words to vocabulary ids (unknown words map to 0)
        text_list = [str(self.word_index.get(word, 0)) for word in text_list]  # l2 changed to text_list 20201023
        # Pad/truncate to a fixed length and convert to an array
        text_list = text_list[:150] if len(text_list) > 150 else text_list + ['0'] * (150 - len(text_list))  # length changed from 100 to 150 words 20201023
        features = np.array([text_list])  # exactly 150 ids after the line above
        log.info('numericalization done-------------------')
        # features = np.array([s.split(',')[:100] if len(s.split(',')) > 100 else s.split(',') + [0] * (100 - len(s.split(',')))])
        with tf.get_default_graph().as_default():
            log.info('about to predict-------------------')
            logits = self.model.predict(features)
            # return ','.join(logits[0])
            # result = self.label_map(np.argmax(logits[0]))
            # return result
            log.info('prediction done-------------------')
            # Indices of the three highest-scoring classes
            top3 = np.argsort(-logits[0], axis=-1)[:3]
            prob = ['%.4f' % (logits[0][i]) for i in top3]
            pre = [self.label_map[i] for i in top3]
            rd = {}
            i = 1
            for a in pre:
                sub, father = self.class_dic[a].split(',')
                rd['top' + str(i)] = {'subclass': sub, 'class_name': a, 'class': father}
                i += 1
            log.info('about to return the result string')
            return json.dumps(rd, ensure_ascii=False)
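

# Minimal local smoke test (a sketch: Extractor() expects the model, pyhanlp,
# envs_py37, so.env and pkl_csv resources in the ODPS distributed cache, which
# normally exists only on the MaxCompute worker, so this is illustrative
# rather than runnable off-cluster; the sample HTML snippet is made up):
# if __name__ == '__main__':
#     extractor = Extractor()
#     print(extractor.evaluate('<p>某某单位办公设备采购项目招标公告</p>'))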