from collections import Counter
import pandas as pd
import fool
import pickle
import json
import random
import time
import re
import numpy as np
import gensim

w2v = r"D:\bidi\BIDI_ML_INFO_EXTRACTION\BiddingKG\dl\wiki_128_word_embedding_new.vector"
# w2v = "/data/python/lishimin/BiddingKG/dl/wiki_128_word_embedding_new.vector"
model_w2v = gensim.models.KeyedVectors.load_word2vec_format(w2v, binary=True)


def cut_words(text_list):
    return fool.cut(text_list)


def words_docs_list2array(words_docs_list, maxSententLen=20):
    # Map each document (a list of tokens) to a (maxSententLen, 128) matrix of
    # word2vec vectors, keeping only the last maxSententLen tokens and leaving
    # zero rows for padding and out-of-vocabulary words.
    array = np.zeros((len(words_docs_list), maxSententLen, 128))
    for i in range(len(words_docs_list)):
        words = words_docs_list[i][-maxSententLen:]
        for j in range(len(words)):
            if words[j] in model_w2v:
                array[i][j] = model_w2v[words[j]]
            else:
                print('word not in w2v: ', words[j])
    return array


def dump_segwords():
    # df = pd.read_excel('data/新标准所有规则标注数据帅选数据.xlsx')[:]
    # df = pd.read_excel('data/215类数据太少类别补充数据.xlsx')[:]
    df = pd.read_excel('data/新标准所有规则标注数据去除产品长度大于100帅选并补充后165595条215类.xlsx')[:]
    df.dropna(subset=['industry_type'], inplace=True)
    print(df.head(5))
    df.fillna(' ', inplace=True)
    print('len_df', len(df))
    df = df[df['industry_type'] != ' ']
    print('len_df', len(df))
    c = Counter(df['industry_type'])
    print('number of classes:', len(c))

    def del_words(tenderee, text):
        # Normalize half-width parentheses to full-width, strip procurement
        # boilerplate words and the tenderee name from the text.
        tenderee = tenderee.replace('(', '(').replace(')', ')')
        text = text.replace('(', '(').replace(')', ')')
        text = re.sub(
            '(废标|终止|综?合?评审|评标|开标|资审|履约|验收|成交|中标人?|中选人?|单一来源|合同|候选人|结果|变更|更正|答疑|澄清|意向|需求|采购|招标|询比?价|磋商|谈判|比选|比价|竞价|议价)的?(公告|预告|公示)?|关于为?|选取|定点|直接|邀请函?|通知书?|备案|公开|公示|公告|记录|竞争性',
            '', text)
        text = text.replace(tenderee, '')
        text = ' ' if text == "" else text
        return text

    def get_winer(sub_docs_json, winner):
        # Fall back to the winning tenderer recorded in sub_docs_json when the
        # win_tenderer column is empty.
        # winner = ' '
        if winner == ' ' and 'win_tenderer' in sub_docs_json:
            l = json.loads(sub_docs_json)
            for d in l:
                if d.get('win_tenderer', '') != "":
                    winner = d.get('win_tenderer', '')
                    return winner
        return winner

    df['doctitle'] = df.apply(lambda x: del_words(x['tenderee'], x['doctitle']), axis=1)
    df['project_name'] = df.apply(lambda x: del_words(x['tenderee'], x['project_name']), axis=1)
    # df['win_tenderer'] = df['sub_docs_json'].apply(lambda x: get_winer(x))
    df['win_tenderer'] = df.apply(lambda x: get_winer(x['sub_docs_json'], x['win_tenderer']), axis=1)
    labels = sorted(set(df['industry_type']))
    lb2id = {k: v for v, k in enumerate(labels)}
    print(df.head(5))
    print(labels, len(labels))
    print(lb2id[labels[0]], labels[0])
    t1 = time.time()
    f = open('data/all_segwords.txt', 'w', encoding='utf-8')
    # f = open('data/all_segwords.txt', 'a', encoding='utf-8')
    n = 0
    for docid, title, project, product, win_tenderer, lb in \
            zip(df['docid'], df['doctitle'], df['project_name'], df['product'], df['win_tenderer'], df['industry_type']):
        try:
            title_, project_, product_, win_tenderer_ = cut_words([title, project, product, win_tenderer])
            segwords = ' '.join(title_) + '#split#' + ' '.join(project_) + '#split#' + ' '.join(product_) + '#split#' + ' '.join(win_tenderer_)
            f.write('%s\t%s\t%s\n' % (lb, segwords, docid))
            n += 1
            if n % 1000 == 0:
                print('processed %d documents' % n)
        except Exception as e:
            print('error : ', e)
    f.close()
    # all_data = []
    # all_data = [(docid, cut_words([title, project, product, win_tenderer]), lb)
    #             for docid, title, project, product, win_tenderer, lb in
    #             zip(df['docid'], df['doctitle'], df['project_name'], df['product'], df['win_tenderer'], df['industry_type'])]
    # print('all_data_len:', len(all_data))
    # with open('data/all_data.pkl', 'wb') as f:
    #     pickle.dump(all_data, f)
    # with open('data/lb2id.pkl', 'wb') as f:
    #     pickle.dump(lb2id, f)
    t2 = time.time()
    print('total time:', t2 - t1)
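
# A minimal sketch (hypothetical helper, not called anywhere in this script)
# of the record format dump_segwords writes: one line per document, tab-
# separated as label\tsegwords\tdocid, with the four segmented fields joined
# by '#split#'. The field values and label below are made up for illustration.
def _demo_segwords_line(docid=12345, label='demo_label'):
    title, project, product, winner = cut_words(['办公楼维修工程', '办公楼维修项目', '维修服务', '某建筑公司'])
    segwords = '#split#'.join([' '.join(title), ' '.join(project), ' '.join(product), ' '.join(winner)])
    return '%s\t%s\t%s' % (label, segwords, docid)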
# dump_segwords()


def get_array_data():
    # Load the pickled (segword_list, label) pairs and turn them into
    # embedding arrays plus a one-hot label matrix.
    with open('E:/行业分类/all_data.pkl', 'rb') as f:
        all_data = pickle.load(f)
    with open('E:/行业分类/lb2id.pkl', 'rb') as f:
        lb2id = pickle.load(f)
    print('all_data_len', len(all_data))
    print(lb2id)
    title = [it[0][0] for it in all_data]
    project = [it[0][1] for it in all_data]
    product = [it[0][2] for it in all_data]
    labels_type = [it[1] for it in all_data]
    title_array = words_docs_list2array(title)
    project_array = words_docs_list2array(project)
    product_array = words_docs_list2array(product)
    label_ids = [lb2id[it] for it in labels_type]
    label_array = np.zeros(shape=(len(label_ids), len(lb2id)))
    for i in range(len(label_ids)):
        label_array[i][label_ids[i]] = 1
    print(title_array.shape, label_array.shape)
    return title_array, project_array, product_array, label_array, all_data, lb2id


def split_train_test():
    # with open('E:/行业分类/all_data.pkl', 'rb') as f:
    #     all_data = pickle.load(f)
    # df = pd.DataFrame(all_data, columns=['segword_list', 'label'])
    # df['segword_list'] = df['segword_list'].apply(lambda x: json.dumps(x, ensure_ascii=False))
    # df.drop_duplicates(subset=['segword_list', 'label'], inplace=True)
    # df.drop_duplicates(subset=['segword_list'], inplace=True)
    with open('data/all_segwords.txt', 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    lines = [it.strip().split('\t') for it in lines if '#split#' in it]
    print(lines[:3])
    df = pd.DataFrame(lines, columns=['label', 'segwords', 'docid'])
    print(len(df), df.head(3))
    df.drop_duplicates(subset=['segwords'], inplace=True)
    print(len(df))
    # Drop classes with fewer than 10 samples, then take a stratified 20%
    # sample of each remaining class as the test set.
    c = Counter(df['label'])
    less_10 = []
    for k, v in c.items():
        if v < 10:
            less_10.append(k)
    df = df[~df['label'].isin(less_10)]
    df_test = pd.DataFrame()
    for lb in set(df['label']):
        df_tmp = df[df['label'] == lb]
        n = int(len(df_tmp) * 0.2)
        df_test = pd.concat([df_test, df_tmp.sample(n=n)], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
    df_test = df_test.sample(frac=1)
    # df_train = df[~df['segword_list'].isin(df_test['segword_list'])]
    df_train = df[~df['segwords'].isin(df_test['segwords'])]
    df_train = df_train.sample(frac=1)
    print(len(df), len(df_train), len(df_test))
    df_train.to_excel('data/新标准215类train.xlsx')
    df_test.to_excel('data/新标准215类test.xlsx')
    return df_train, df_test


def df_to_array(df, seq_len=20):
    # df['segword_list'] = df['segword_list'].apply(lambda x: json.loads(x))
    # with open('E:/行业分类/lb2id.pkl', 'rb') as f:
    #     lb2id = pickle.load(f)
    labels = sorted(set(df['label']))
    lb2id = {k: v for v, k in enumerate(labels)}
    # all_data = list(df['segword_list'])
    all_data = df['segwords'].apply(lambda x: x.split('#split#'))
    for l in all_data:
        assert len(l) == 4
    print('all_data_len', len(all_data))
    print(lb2id)
    title = [it[0].split() for it in all_data]
    project = [it[1].split() for it in all_data]
    product = [it[2].split() for it in all_data]
    labels_type = [it for it in df['label']]
    # Keep only tokens made of pure Chinese characters.
    title = [[it for it in l if re.search('^[\u4e00-\u9fa5]+$', it)] for l in title]
    project = [[it for it in l if re.search('^[\u4e00-\u9fa5]+$', it)] for l in project]
    product = [[it for it in l if re.search('^[\u4e00-\u9fa5]+$', it)] for l in product]
    title_array = words_docs_list2array(title, seq_len)
    project_array = words_docs_list2array(project, seq_len)
    product_array = words_docs_list2array(product, seq_len)
    label_ids = [lb2id[it] for it in labels_type]
    label_array = np.zeros(shape=(len(label_ids), len(lb2id)))
    for i in range(len(label_ids)):
        label_array[i][label_ids[i]] = 1
    print(title_array.shape, label_array.shape)
    return title_array, project_array, product_array, label_array, all_data, lb2id
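
# A small illustration (hypothetical helper with made-up tokens) of the
# truncation/padding behaviour shared by df_to_array and get_array_data: each
# document keeps at most its last maxSententLen tokens, each in-vocabulary
# token becomes its 128-d word2vec vector, and all other positions stay as
# zero rows (missing tokens just trigger the printed notice).
def _demo_padding():
    docs = [['项目', '采购'], ['工程']]
    arr = words_docs_list2array(docs, maxSententLen=5)
    assert arr.shape == (2, 5, 128)
    return arr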
def getVocabAndMatrix(model):
    '''
    Load the word (or character) vectors and return the vocabulary, the
    embedding matrix and a word-to-index mapping, keeping only tokens made of
    pure Chinese characters. Index 0 is reserved for padding.
    :param model: a loaded gensim.models.keyedvectors.KeyedVectors instance
    :return: vocab, embedding_matrix, word2index
    '''
    vocab = [''] + [it for it in model.index2word if re.search('^[\u4e00-\u9fa5]+$', it)]
    word2index = {v: k for k, v in enumerate(vocab)}  # maps each word to its row in the embedding matrix
    embedding_size = model[model.index2word[0]].shape[0]
    embedding_matrix = np.zeros((len(vocab), embedding_size))
    for i in range(1, len(vocab)):
        embedding_matrix[i] = model[vocab[i]]
    return vocab, embedding_matrix, word2index


def df_to_ids(df, word2index):
    def words2ids(words_docs_list, maxSententLen=20):
        # Same truncation/padding as words_docs_list2array, but producing
        # integer ids instead of embedding vectors; 0 means padding/unknown.
        array = np.zeros((len(words_docs_list), maxSententLen))
        for i in range(len(words_docs_list)):
            words = words_docs_list[i][-maxSententLen:]
            for j in range(len(words)):
                if words[j] in word2index:
                    array[i][j] = word2index[words[j]]
        return array

    df['segword_list'] = df['segword_list'].apply(lambda x: json.loads(x))
    # with open('E:/行业分类/lb2id.pkl', 'rb') as f:
    #     lb2id = pickle.load(f)
    labels = sorted(set(df['label']))
    lb2id = {k: v for v, k in enumerate(labels)}
    all_data = list(df['segword_list'])
    print('all_data_len', len(all_data))
    print(lb2id)
    title = [it[0] for it in all_data]
    project = [it[1] for it in all_data]
    product = [it[2] for it in all_data]
    labels_type = [it for it in df['label']]
    title_ids = words2ids(title)
    project_ids = words2ids(project)
    product_ids = words2ids(product)
    label_ids = [lb2id[it] for it in labels_type]
    label_array = np.zeros(shape=(len(label_ids), len(lb2id)))
    for i in range(len(label_ids)):
        label_array[i][label_ids[i]] = 1
    return title_ids, project_ids, product_ids, label_array, all_data, lb2id


if __name__ == "__main__":
    # dump_segwords()
    # split_train_test()

    # df = pd.read_excel('E:/行业分类/新标准所有规则标注数据过滤掉标题、项目、产品三要素重复的剩余498976条215类.xlsx')[:]
    # df.fillna(' ', inplace=True)
    # c = Counter(df['industry_type'])
    import copy
    # df_fill = pd.DataFrame()
    # for k, v in c.items():
    #     if v > 1000:
    #         print(k, v)
    #         df2 = copy.deepcopy(df[df['industry_type'] == k])
    #         print(len(df2))
    #         if len(df2) > 1000:
    #             df2.drop_duplicates(subset=['doctitle'], inplace=True)
    #             print(len(df2))
    #         if len(df2) > 1000:
    #             df2.drop_duplicates(subset=['project_name'], inplace=True)
    #             print(len(df2))
    #         if len(df2) > 1000:
    #             df2.drop_duplicates(subset=['product'], inplace=True)
    #             print(len(df2))
    #         if len(df2) > 1000:
    #             df2.drop_duplicates(subset=['tenderee'], inplace=True)
    #             print(len(df2))
    #         df_fill = df_fill.append(df2, ignore_index=True)
    #     else:
    #         df2 = copy.deepcopy(df[df['industry_type'] == k])
    #         df_fill = df_fill.append(df2, ignore_index=True)
    # print('len_df_fill', len(df_fill))
    # df_fill = df_fill.sample(frac=1)
    # df_fill.to_excel('E:/行业分类/新标准所有规则标注数据帅选数据.xlsx')

    # import re
    # df = pd.read_excel('E:/行业分类/新标准所有规则标注数据帅选数据.xlsx')
    # c = Counter(df['industry_type'])
    # print(c.most_common())
    # l = []
    # pos = neg = 0
    # for i in df.index:
    #     title = df.loc[i, 'doctitle']
    #     name = df.loc[i, 'project_name']
    #     ree = df.loc[i, 'tenderee']
    #     name = re.sub(
    #         '(废标|终止|综?合?评审|评标|开标|资审|履约|验收|成交|中标人?|中选人?|单一来源|合同|候选人|结果|变更|更正|答疑|澄清|意向|需求|采购|招标|询比?价|磋商|谈判|比选|比价|竞价|议价)的?(公告|预告|公示)?|关于为?|选取|定点|直接|邀请函?|通知书?|备案|公开|公示|公告|记录|竞争性',
    #         '', name)
    #     if name in title:
    #         pos += 1
    #     else:
    #         neg += 1
    #         print(name, title)
    #     text = title.replace(name, '##').replace(ree, '')
    #     text_l = text.split('##')
    #     for w in text_l:
    #         l.append(w)
    # c = Counter(l)
    # print(c.most_common(100))
    # print('pos:%d, neg:%d' % (pos, neg))
    # with open('E:/行业分类/过滤词.txt', 'w', encoding='utf-8') as f:
    #     for it in c.most_common(200):
    #         f.write(it[0] + '\n')
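
# Example wiring for the id-based pipeline (commented out; it assumes a
# DataFrame `df` with a JSON-encoded 'segword_list' column and a 'label'
# column, i.e. the pickled format referenced in the commented code above):
# vocab, embedding_matrix, word2index = getVocabAndMatrix(model_w2v)
# title_ids, project_ids, product_ids, label_array, all_data, lb2id = df_to_ids(df, word2index)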