# data_process.py
from collections import Counter
import pandas as pd
import fool
import pickle
import json
import time
import re
import numpy as np
import gensim

w2v = r"D:\bidi\BIDI_ML_INFO_EXTRACTION\BiddingKG\dl\wiki_128_word_embedding_new.vector"
# w2v = "/data/python/lishimin/BiddingKG/dl/wiki_128_word_embedding_new.vector"
model_w2v = gensim.models.KeyedVectors.load_word2vec_format(w2v, binary=True)

def cut_words(text_list):
    # FoolNLTK segmentation; given a list of texts, returns one token list per text
    return fool.cut(text_list)

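# Usage sketch (hypothetical strings; mirrors the 4-field call in dump_segwords below):
#   title_, project_, product_, winner_ = cut_words(['某小学教学楼工程招标公告', '教学楼工程', '教学楼', '某建筑公司'])
#   # each returned item is a token list, e.g. title_ -> ['某', '小学', '教学楼', '工程', '招标', '公告'] (illustrative split)
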
def words_docs_list2array(words_docs_list, maxSententLen=20):
    """Map each token list to a (maxSententLen, 128) block of w2v vectors; OOV positions stay zero."""
    array = np.zeros((len(words_docs_list), maxSententLen, 128))
    for i in range(len(words_docs_list)):
        words = words_docs_list[i][-maxSententLen:]  # keep only the last maxSententLen words
        for j in range(len(words)):
            if words[j] in model_w2v:
                array[i][j] = model_w2v[words[j]]
            else:
                print('word not in w2v: ', words[j])
    return array

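# Usage sketch (hypothetical tokens; assumes model_w2v is loaded above):
#   arr = words_docs_list2array([['风电', '设备'], ['市政', '道路', '工程']], maxSententLen=20)
#   arr.shape  # (2, 20, 128); vectors fill positions 0..len(words)-1, trailing rows are zero padding
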
def dump_segwords():
    # df = pd.read_excel('data/新标准所有规则标注数据帅选数据.xlsx')[:]
    # df = pd.read_excel('data/215类数据太少类别补充数据.xlsx')[:]
    df = pd.read_excel('data/新标准所有规则标注数据去除产品长度大于100帅选并补充后165595条215类.xlsx')[:]
    df.dropna(subset=['industry_type'], inplace=True)
    print(df.head(5))
    df.fillna(' ', inplace=True)
    print('len_df', len(df))
    df = df[df['industry_type'] != ' ']
    print('len_df', len(df))
    c = Counter(df['industry_type'])
    print('num classes:', len(c))

    def del_words(tenderee, text):
        # normalize full-width brackets, strip procedural boilerplate terms, then drop the tenderee name
        tenderee = tenderee.replace('(', '(').replace(')', ')')
        text = text.replace('(', '(').replace(')', ')')
        text = re.sub(
            '(废标|终止|综?合?评审|评标|开标|资审|履约|验收|成交|中标人?|中选人?|单一来源|合同|候选人|结果|变更|更正|答疑|澄清|意向|需求|采购|招标|询比?价|磋商|谈判|比选|比价|竞价|议价)的?(公告|预告|公示)?|关于为?|选取|定点|直接|邀请函?|通知书?|备案|公开|公示|公告|记录|竞争性',
            '', text)
        text = text.replace(tenderee, '')
        text = ' ' if text == "" else text
        return text

    def get_winner(sub_docs_json, winner):
        # fall back to the first non-empty win_tenderer in sub_docs_json when winner is missing
        if winner == ' ' and 'win_tenderer' in sub_docs_json:
            l = json.loads(sub_docs_json)
            for d in l:
                if d.get('win_tenderer', '') != "":
                    winner = d.get('win_tenderer', '')
                    return winner
        return winner

    df['doctitle'] = df.apply(lambda x: del_words(x['tenderee'], x['doctitle']), axis=1)
    df['project_name'] = df.apply(lambda x: del_words(x['tenderee'], x['project_name']), axis=1)
    # df['win_tenderer'] = df['sub_docs_json'].apply(lambda x: get_winner(x))
    df['win_tenderer'] = df.apply(lambda x: get_winner(x['sub_docs_json'], x['win_tenderer']), axis=1)
    labels = sorted(set(df['industry_type']))
    lb2id = {k: v for v, k in enumerate(labels)}
    print(df.head(5))
    print(labels, len(labels))
    print(lb2id[labels[0]], labels[0])
    t1 = time.time()
    f = open('data/all_segwords.txt', 'w', encoding='utf-8')
    # f = open('data/all_segwords.txt', 'a', encoding='utf-8')
    n = 0
    for docid, title, project, product, win_tenderer, lb in \
            zip(df['docid'], df['doctitle'], df['project_name'], df['product'], df['win_tenderer'], df['industry_type']):
        try:
            title_, project_, product_, win_tenderer_ = cut_words([title, project, product, win_tenderer])
            segwords = ' '.join(title_) + '#split#' + ' '.join(project_) + '#split#' + ' '.join(product_) + '#split#' + ' '.join(win_tenderer_)
            f.write('%s\t%s\t%s\n' % (lb, segwords, docid))
            n += 1
            if n % 1000 == 0:
                print('processed %d docs' % n)
        except Exception as e:
            print('error : ', e)
    f.close()
    # all_data = []
    # all_data = [(docid, cut_words([title, project, product, win_tenderer]), lb) for docid, title, project, product, win_tenderer, lb in zip(df['docid'], df['doctitle'], df['project_name'], df['product'], df['win_tenderer'], df['industry_type'])]
    # print('all_data_len:', len(all_data))
    # with open('data/all_data.pkl', 'wb') as f:
    #     pickle.dump(all_data, f)
    # with open('data/lb2id.pkl', 'wb') as f:
    #     pickle.dump(lb2id, f)
    t2 = time.time()
    print('total time:', t2 - t1)
# dump_segwords()

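# Each line written by dump_segwords() has the form:
#   label \t title-tokens#split#project-tokens#split#product-tokens#split#winner-tokens \t docid
# Hypothetical example line (label and docid invented for illustration):
#   市政工程\t教学楼 工程#split#教学楼 工程#split#教学楼#split#某 建筑 公司\t12345
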
def get_array_data():
    with open('E:/行业分类/all_data.pkl', 'rb') as f:
        all_data = pickle.load(f)
    with open('E:/行业分类/lb2id.pkl', 'rb') as f:
        lb2id = pickle.load(f)
    print('all_data_len', len(all_data))
    print(lb2id)
    title = [it[0][0] for it in all_data]
    project = [it[0][1] for it in all_data]
    product = [it[0][2] for it in all_data]
    labels_type = [it[1] for it in all_data]
    title_array = words_docs_list2array(title)
    project_array = words_docs_list2array(project)
    product_array = words_docs_list2array(product)
    label_ids = [lb2id[it] for it in labels_type]
    label_array = np.zeros(shape=(len(label_ids), len(lb2id)))
    for i in range(len(label_ids)):
        label_array[i][label_ids[i]] = 1
    print(title_array.shape, label_array.shape)
    return title_array, project_array, product_array, label_array, all_data, lb2id

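# Style note: the one-hot loop in get_array_data (and the copies below) is equivalent to the
# vectorized form:
#   label_array = np.eye(len(lb2id))[label_ids]
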
def split_train_test():
    # with open('E:/行业分类/all_data.pkl', 'rb') as f:
    #     all_data = pickle.load(f)
    # df = pd.DataFrame(all_data, columns=['segword_list', 'label'])
    # df['segword_list'] = df['segword_list'].apply(lambda x: json.dumps(x, ensure_ascii=False))
    # df.drop_duplicates(subset=['segword_list', 'label'], inplace=True)
    # df.drop_duplicates(subset=['segword_list'], inplace=True)
    with open('data/all_segwords.txt', 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    lines = [it.strip().split('\t') for it in lines if '#split#' in it]
    print(lines[:3])
    df = pd.DataFrame(lines, columns=['label', 'segwords', 'docid'])
    print(len(df), df.head(3))
    df.drop_duplicates(subset=['segwords'], inplace=True)
    print(len(df))
    c = Counter(df['label'])
    less_10 = [k for k, v in c.items() if v < 10]  # drop classes with fewer than 10 samples
    df = df[~df['label'].isin(less_10)]
    df_test = pd.DataFrame()
    for lb in set(df['label']):
        df_tmp = df[df['label'] == lb]
        n = int(len(df_tmp) * 0.2)  # stratified 20% test sample per class
        df_test = pd.concat([df_test, df_tmp.sample(n=n)], ignore_index=True)
    df_test = df_test.sample(frac=1)
    # df_train = df[~df['segword_list'].isin(df_test['segword_list'])]
    df_train = df[~df['segwords'].isin(df_test['segwords'])]
    df_train = df_train.sample(frac=1)
    print(len(df), len(df_train), len(df_test))
    df_train.to_excel('data/新标准215类train.xlsx')
    df_test.to_excel('data/新标准215类test.xlsx')
    return df_train, df_test

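# Usage sketch: a per-class stratified 20% test split over the segwords file.
#   df_train, df_test = split_train_test()
#   # both frames carry 'label', 'segwords', 'docid'; classes with < 10 rows were dropped first
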
def df_to_array(df, seq_len=20):
    # df['segword_list'] = df['segword_list'].apply(lambda x: json.loads(x))
    # with open('E:/行业分类/lb2id.pkl', 'rb') as f:
    #     lb2id = pickle.load(f)
    labels = sorted(set(df['label']))
    lb2id = {k: v for v, k in enumerate(labels)}
    # all_data = list(df['segword_list'])
    all_data = df['segwords'].apply(lambda x: x.split('#split#'))
    for l in all_data:
        assert len(l) == 4  # title / project_name / product / win_tenderer
    print('all_data_len', len(all_data))
    print(lb2id)
    title = [it[0].split() for it in all_data]
    project = [it[1].split() for it in all_data]
    product = [it[2].split() for it in all_data]
    labels_type = [it for it in df['label']]
    # keep only tokens made of pure Chinese characters
    title = [[it for it in l if re.search('^[\u4e00-\u9fa5]+$', it)] for l in title]
    project = [[it for it in l if re.search('^[\u4e00-\u9fa5]+$', it)] for l in project]
    product = [[it for it in l if re.search('^[\u4e00-\u9fa5]+$', it)] for l in product]
    title_array = words_docs_list2array(title, seq_len)
    project_array = words_docs_list2array(project, seq_len)
    product_array = words_docs_list2array(product, seq_len)
    label_ids = [lb2id[it] for it in labels_type]
    label_array = np.zeros(shape=(len(label_ids), len(lb2id)))
    for i in range(len(label_ids)):
        label_array[i][label_ids[i]] = 1
    print(title_array.shape, label_array.shape)
    return title_array, project_array, product_array, label_array, all_data, lb2id

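# Usage sketch (reads a file written by split_train_test above):
#   df_train = pd.read_excel('data/新标准215类train.xlsx')
#   t_arr, p_arr, pr_arr, y, _, lb2id = df_to_array(df_train, seq_len=20)
#   # t_arr.shape == (len(df_train), 20, 128); y.shape == (len(df_train), len(lb2id))
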
def getVocabAndMatrix(model):
    """
    Load a word (or character) vector model and return the vocabulary list and embedding matrix,
    filtering out tokens that are not pure Chinese.
    :param model: a loaded gensim.models.keyedvectors.KeyedVectors word/char vector model
    :return: vocab, embedding_matrix, word2index
    """
    vocab = ['<pad>'] + [it for it in model.index2word if re.search('^[\u4e00-\u9fa5]+$', it)]
    word2index = {v: k for k, v in enumerate(vocab)}  # token -> row index in embedding_matrix
    Embedding_size = model[model.index2word[0]].shape[0]
    embedding_matrix = np.zeros((len(vocab), Embedding_size))
    for i in range(1, len(vocab)):  # row 0 stays all-zero for '<pad>'
        embedding_matrix[i] = model[vocab[i]]
    return vocab, embedding_matrix, word2index

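# Usage sketch (assumes a gensim-3.x KeyedVectors exposing .index2word, like model_w2v above):
#   vocab, embedding_matrix, word2index = getVocabAndMatrix(model_w2v)
#   embedding_matrix.shape  # (len(vocab), 128); word2index['<pad>'] == 0
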
def df_to_ids(df, word2index):
    def words2ids(words_docs_list, maxSententLen=20):
        # note: OOV words keep id 0, the same index as '<pad>'
        array = np.zeros((len(words_docs_list), maxSententLen))
        for i in range(len(words_docs_list)):
            words = words_docs_list[i][-maxSententLen:]
            for j in range(len(words)):
                if words[j] in word2index:
                    array[i][j] = word2index[words[j]]
        return array

    df['segword_list'] = df['segword_list'].apply(lambda x: json.loads(x))
    # with open('E:/行业分类/lb2id.pkl', 'rb') as f:
    #     lb2id = pickle.load(f)
    labels = sorted(set(df['label']))
    lb2id = {k: v for v, k in enumerate(labels)}
    all_data = list(df['segword_list'])
    print('all_data_len', len(all_data))
    print(lb2id)
    title = [it[0] for it in all_data]
    project = [it[1] for it in all_data]
    product = [it[2] for it in all_data]
    labels_type = [it for it in df['label']]
    title_ids = words2ids(title)
    project_ids = words2ids(project)
    product_ids = words2ids(product)
    label_ids = [lb2id[it] for it in labels_type]
    label_array = np.zeros(shape=(len(label_ids), len(lb2id)))
    for i in range(len(label_ids)):
        label_array[i][label_ids[i]] = 1
    return title_ids, project_ids, product_ids, label_array, all_data, lb2id

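# Usage sketch (assumes df has a JSON-encoded 'segword_list' column and a 'label' column,
# as built in the commented block at the top of split_train_test):
#   vocab, embedding_matrix, word2index = getVocabAndMatrix(model_w2v)
#   title_ids, project_ids, product_ids, y, _, lb2id = df_to_ids(df, word2index)
#   # each *_ids array is (len(df), 20) of row indices into embedding_matrix
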
if __name__ == "__main__":
    # dump_segwords()
    # split_train_test()
    # df = pd.read_excel('E:/行业分类/新标准所有规则标注数据过滤掉标题、项目、产品三要素重复的剩余498976条215类.xlsx')[:]
    # df.fillna(' ', inplace=True)
    # c = Counter(df['industry_type'])
    import copy
    # df_fill = pd.DataFrame()
    # for k, v in c.items():
    #     if v > 1000:
    #         print(k, v)
    #         df2 = copy.deepcopy(df[df['industry_type'] == k])
    #         print(len(df2))
    #         if len(df2) > 1000:
    #             df2.drop_duplicates(subset=['doctitle'], inplace=True)
    #             print(len(df2))
    #         if len(df2) > 1000:
    #             df2.drop_duplicates(subset=['project_name'], inplace=True)
    #             print(len(df2))
    #         if len(df2) > 1000:
    #             df2.drop_duplicates(subset=['product'], inplace=True)
    #             print(len(df2))
    #         if len(df2) > 1000:
    #             df2.drop_duplicates(subset=['tenderee'], inplace=True)
    #             print(len(df2))
    #
    #         df_fill = df_fill.append(df2, ignore_index=True)
    #     else:
    #         df2 = copy.deepcopy(df[df['industry_type'] == k])
    #         df_fill = df_fill.append(df2, ignore_index=True)
    # print('len_df_fill', len(df_fill))
    # df_fill = df_fill.sample(frac=1)
    # df_fill.to_excel('E:/行业分类/新标准所有规则标注数据帅选数据.xlsx')
    # import re
    # df = pd.read_excel('E:/行业分类/新标准所有规则标注数据帅选数据.xlsx')
    # c = Counter(df['industry_type'])
    # print(c.most_common())
    # l = []
    # pos = neg = 0
    # for i in df.index:
    #     title = df.loc[i, 'doctitle']
    #     name = df.loc[i, 'project_name']
    #     ree = df.loc[i, 'tenderee']
    #     name = re.sub(
    #         '(废标|终止|综?合?评审|评标|开标|资审|履约|验收|成交|中标人?|中选人?|单一来源|合同|候选人|结果|变更|更正|答疑|澄清|意向|需求|采购|招标|询比?价|磋商|谈判|比选|比价|竞价|议价)的?(公告|预告|公示)?|关于为?|选取|定点|直接|邀请函?|通知书?|备案|公开|公示|公告|记录|竞争性',
    #         '', name)
    #     if name in title:
    #         pos += 1
    #     else:
    #         neg += 1
    #         print(name, title)
    #     text = title.replace(name, '##').replace(ree, '')
    #     text_l = text.split('##')
    #     for w in text_l:
    #         l.append(w)
    # c = Counter(l)
    # print(c.most_common(100))
    # print('pos:%d, neg:%d' % (pos, neg))
    # with open('E:/行业分类/过滤词.txt', 'w', encoding='utf-8') as f:
    #     for it in c.most_common(200):
    #         f.write(it[0] + '\n')