- from collections import Counter
- import pandas as pd
- import fool
- import pickle
- import json
- import random
- import time
- import re
- import numpy as np
- import gensim
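- # Pretrained 128-dim Chinese Wikipedia word2vec vectors, stored in binary word2vec format.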
- w2v = r"D:\bidi\BIDI_ML_INFO_EXTRACTION\BiddingKG\dl\wiki_128_word_embedding_new.vector"
- # w2v = "/data/python/lishimin/BiddingKG/dl/wiki_128_word_embedding_new.vector"
- model_w2v = gensim.models.KeyedVectors.load_word2vec_format(w2v, binary=True)
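- # Batch word segmentation with FoolNLTK: returns one token list per input text.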
- def cut_words(text_list):
- return fool.cut(text_list)
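- # Pack a batch of token lists into a (batch, maxSententLen, 128) float array of word vectors,
- # keeping only the last maxSententLen tokens of each document; OOV positions stay all-zero.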
- def words_docs_list2array(words_docs_list, maxSententLen=20):
- array = np.zeros((len(words_docs_list), maxSententLen, 128))
- for i in range(len(words_docs_list)):
- words = words_docs_list[i][-maxSententLen:]
- for j in range(len(words)):
- if words[j] in model_w2v:
- array[i][j] = model_w2v[words[j]]
- else:
- print('word not in w2v: ', words[j])
- return array
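- # Read the rule-labelled Excel data, clean doctitle/project_name, resolve the winning bidder,
- # segment the four text fields and dump "label\tsegwords\tdocid" lines to data/all_segwords.txt.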
- def dump_segwords():
- # df = pd.read_excel('data/新标准所有规则标注数据帅选数据.xlsx')[:]
- # df = pd.read_excel('data/215类数据太少类别补充数据.xlsx')[:]
- df = pd.read_excel('data/新标准所有规则标注数据去除产品长度大于100帅选并补充后165595条215类.xlsx')[:]
- df.dropna(subset=['industry_type'], inplace=True)
- print(df.head(5))
- df.fillna(' ', inplace=True)
- print('len_df', len(df))
- df = df[df['industry_type']!=' ']
- print('len_df', len(df))
- c = Counter(df['industry_type'])
- print('number of classes:', len(c))
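- # Remove announcement boilerplate (award/bid/notice phrases) and the tenderee name from a title,
- # after normalizing full-width parentheses, so only content words remain.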
- def del_words(tenderee, text):
- tenderee = tenderee.replace('(', '(').replace(')', ')')
- text = text.replace('(', '(').replace(')', ')')
- text = re.sub(
- '(废标|终止|综?合?评审|评标|开标|资审|履约|验收|成交|中标人?|中选人?|单一来源|合同|候选人|结果|变更|更正|答疑|澄清|意向|需求|采购|招标|询比?价|磋商|谈判|比选|比价|竞价|议价)的?(公告|预告|公示)?|关于为?|选取|定点|直接|邀请函?|通知书?|备案|公开|公示|公告|记录|竞争性',
- '', text)
- text = text.replace(tenderee, '')
- text = ' ' if text=="" else text
- return text
- def get_winer(sub_docs_json, winner):
- # if winner is still the placeholder ' ', take the first non-empty win_tenderer from sub_docs_json
- if winner == ' ' and 'win_tenderer' in sub_docs_json:
- for d in json.loads(sub_docs_json):
- if d.get('win_tenderer', '') != '':
- return d.get('win_tenderer', '')
- return winner
- df['doctitle'] = df.apply(lambda x:del_words(x['tenderee'], x['doctitle']), axis=1)
- df['project_name'] = df.apply(lambda x:del_words(x['tenderee'], x['project_name']), axis=1)
- # df['win_tenderer'] = df['sub_docs_json'].apply(lambda x:get_winer(x))
- df['win_tenderer'] = df.apply(lambda x:get_winer(x['sub_docs_json'], x['win_tenderer']), axis=1)
- labels = sorted(set(df['industry_type']))
- lb2id = {k:v for v, k in enumerate(labels)}
- print(df.head(5))
- print(labels, len(labels))
- print(lb2id[labels[0]], labels[0])
- t1 = time.time()
- f = open('data/all_segwords.txt', 'w', encoding='utf-8')
- # f = open('data/all_segwords.txt', 'a', encoding='utf-8')
- n = 0
- for docid, title, project, product, win_tenderer, lb in \
- zip(df['docid'], df['doctitle'], df['project_name'], df['product'], df['win_tenderer'], df['industry_type']):
- try:
- title_, project_, product_, win_tenderer_ = cut_words([title, project, product, win_tenderer])
- segwords = ' '.join(title_)+'#split#'+' '.join(project_)+'#split#'+' '.join(product_)+'#split#'+' '.join(win_tenderer_)
- f.write('%s\t%s\t%s\n'%(lb, segwords, docid))
- n += 1
- if n %1000==0:
- print('processed %d docs' % n)
- except Exception as e:
- print('error : ', e)
- f.close()
- # all_data = []
- # all_data = [(docid,cut_words([title, project, product, win_tenderer]), lb) for docid,title,project,product,win_tenderer,lb in zip(df['docid'],df['doctitle'],df['project_name'],df['product'],df['win_tenderer'],df['industry_type'])]
- # print('all_data_len:', len(all_data))
- # with open('data/all_data.pkl', 'wb') as f:
- # pickle.dump(all_data, f)
- # with open('data/lb2id.pkl', 'wb') as f:
- # pickle.dump(lb2id, f)
- t2 = time.time()
- print('total elapsed time:', t2 - t1)
- # dump_segwords()
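- # Turn the pickled all_data into embedding arrays plus one-hot labels.
- # Assumes each all_data item is (segword_list, label); if it was produced by the commented-out
- # triple (docid, segword_list, label) in dump_segwords, the indices below would shift by one.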
- def get_array_data():
- with open('E:/行业分类/all_data.pkl', 'rb') as f:
- all_data = pickle.load(f)
- with open('E:/行业分类/lb2id.pkl', 'rb') as f:
- lb2id = pickle.load(f)
- print('all_data_len', len(all_data))
- print(lb2id)
- title = [it[0][0] for it in all_data]
- project = [it[0][1] for it in all_data]
- product = [it[0][2] for it in all_data]
- labels_type = [it[1] for it in all_data]
- title_array = words_docs_list2array(title)
- project_array = words_docs_list2array(project)
- product_array = words_docs_list2array(product)
- label_ids = [lb2id[it] for it in labels_type]
- label_array = np.zeros((len(label_ids), len(lb2id)))
- for i in range(len(label_ids)):
- label_array[i][label_ids[i]] = 1
- print(title_array.shape, label_array.shape)
- return title_array, project_array, product_array, label_array, all_data, lb2id
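- # Deduplicate data/all_segwords.txt, drop classes with fewer than 10 samples,
- # then hold out ~20% of each class as a shuffled test set and save both splits to Excel.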
- def split_train_test():
- # with open('E:/行业分类/all_data.pkl', 'rb') as f:
- # all_data = pickle.load(f)
- # df = pd.DataFrame(all_data, columns=['segword_list', 'label'])
- # df['segword_list'] = df['segword_list'].apply(lambda x: json.dumps(x, ensure_ascii=False))
- # df.drop_duplicates(subset=['segword_list', 'label'], inplace=True)
- # df.drop_duplicates(subset=['segword_list'], inplace=True)
- with open('data/all_segwords.txt', 'r', encoding='utf-8') as f:
- lines = f.read().split('\n')
- lines = [it.strip().split('\t') for it in lines if '#split#' in it]
- print(lines[:3])
- df = pd.DataFrame(lines, columns=['label', 'segwords', 'docid'])
- print(len(df), df.head(3))
- df.drop_duplicates(subset=['segwords'], inplace=True)
- print(len(df))
- c = Counter(df['label'])
- # drop classes with fewer than 10 labelled samples
- less_10 = [k for k, v in c.items() if v < 10]
- df = df[~df['label'].isin(less_10)]
- df_test_parts = []
- for lb in set(df['label']):
- df_tmp = df[df['label']==lb]
- n = int(len(df_tmp)*0.2)
- df_test_parts.append(df_tmp.sample(n=n))  # hold out ~20% of each class
- df_test = pd.concat(df_test_parts, ignore_index=True)  # DataFrame.append was removed in pandas 2.0
- df_test = df_test.sample(frac=1)
- # df_train = df[~df['segword_list'].isin(df_test['segword_list'])]
- df_train = df[~df['segwords'].isin(df_test['segwords'])]
- df_train = df_train.sample(frac=1)
- print(len(df), len(df_train), len(df_test))
- df_train.to_excel('data/新标准215类train.xlsx')
- df_test.to_excel('data/新标准215类test.xlsx')
- return df_train, df_test
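- # Convert a train/test DataFrame (label + '#split#'-joined segwords) into padded embedding
- # arrays and one-hot labels, keeping only pure-Chinese tokens.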
- def df_to_array(df, seq_len=20):
- # df['segword_list'] = df['segword_list'].apply(lambda x:json.loads(x))
- # with open('E:/行业分类/lb2id.pkl', 'rb') as f:
- # lb2id = pickle.load( f)
- labels = sorted(set(df['label']))
- lb2id = {k:v for v, k in enumerate(labels)}
- # all_data = list(df['segword_list'])
- all_data = df['segwords'].apply(lambda x:x.split('#split#'))
- for l in all_data:
- assert len(l) == 4
- print('all_data_len', len(all_data))
- print(lb2id)
- title = [it[0].split() for it in all_data]
- project = [it[1].split() for it in all_data]
- product = [it[2].split() for it in all_data]
- labels_type = list(df['label'])
- title = [[it for it in l if re.search('^[\u4e00-\u9fa5]+$', it)] for l in title]
- project = [[it for it in l if re.search('^[\u4e00-\u9fa5]+$', it)] for l in project]
- product = [[it for it in l if re.search('^[\u4e00-\u9fa5]+$', it)] for l in product]
- title_array = words_docs_list2array(title, seq_len)
- project_array = words_docs_list2array(project, seq_len)
- product_array = words_docs_list2array(product, seq_len)
- label_ids = [lb2id[it] for it in labels_type]
- label_array = np.zeros((len(label_ids), len(lb2id)))
- for i in range(len(label_ids)):
- label_array[i][label_ids[i]] = 1
- print(title_array.shape, label_array.shape)
- return title_array, project_array, product_array, label_array, all_data, lb2id
- def getVocabAndMatrix(model):
- '''
- Load a word- or character-vector model and return the vocabulary, the embedding matrix and a word-to-index map, filtering out tokens that are not pure Chinese.
- :param model: loaded word/char vector model, gensim.models.keyedvectors.KeyedVectors
- :return: vocab, embedding_matrix, word2index
- '''
- # NOTE: model.index2word and model[word] are the gensim<4.0 KeyedVectors API;
- # gensim 4 renamed index2word to index_to_key.
- vocab = ['<pad>'] + [it for it in model.index2word if re.search('^[\u4e00-\u9fa5]+$', it)]
- word2index = {k: v for v, k in enumerate(vocab)}  # word -> index; index 0 is '<pad>'
- Embedding_size = model[model.index2word[0]].shape[0]
- embedding_matrix = np.zeros((len(vocab), Embedding_size))  # row 0 stays all-zero for '<pad>'
- for i in range(1, len(vocab)):
- embedding_matrix[i] = model[vocab[i]]
- return vocab, embedding_matrix, word2index
- def df_to_ids(df, word2index):
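- # Build fixed-length id inputs: the nested words2ids maps token lists to id arrays where
- # 0 doubles as both '<pad>' and OOV, matching index 0 of the vocab from getVocabAndMatrix.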
- def words2ids(words_docs_list, maxSententLen=20):
- array = np.zeros((len(words_docs_list), maxSententLen))
- for i in range(len(words_docs_list)):
- words = words_docs_list[i][-maxSententLen:]
- for j in range(len(words)):
- if words[j] in word2index:
- array[i][j] = word2index[words[j]]
- return array
- df['segword_list'] = df['segword_list'].apply(json.loads)
- # with open('E:/行业分类/lb2id.pkl', 'rb') as f:
- # lb2id = pickle.load( f)
- labels = sorted(set(df['label']))
- lb2id = {k:v for v, k in enumerate(labels)}
- all_data = list(df['segword_list'])
- print('all_data_len', len(all_data))
- print(lb2id)
- title = [it[0] for it in all_data]
- project = [it[1] for it in all_data]
- product = [it[2] for it in all_data]
- labels_type = list(df['label'])
- title_ids = words2ids(title)
- project_ids = words2ids(project)
- product_ids = words2ids(product)
- label_ids = [lb2id[it] for it in labels_type]
- label_array = np.zeros((len(label_ids), len(lb2id)))
- for i in range(len(label_ids)):
- label_array[i][label_ids[i]] = 1
- return title_ids, project_ids, product_ids, label_array, all_data, lb2id
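- # Minimal usage sketch (an assumption, not part of the original pipeline): feeding the Excel
- # split from split_train_test into the id-based path. df_to_ids expects a JSON 'segword_list'
- # column holding four token lists, so the '#split#' format is converted first.
- # vocab, embedding_matrix, word2index = getVocabAndMatrix(model_w2v)
- # df_train = pd.read_excel('data/新标准215类train.xlsx')
- # df_train['segword_list'] = df_train['segwords'].apply(
- #     lambda x: json.dumps([part.split() for part in x.split('#split#')], ensure_ascii=False))
- # title_ids, project_ids, product_ids, label_array, all_data, lb2id = df_to_ids(df_train, word2index)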
- if __name__ == "__main__":
- # dump_segwords()
- # split_train_test()
- # df = pd.read_excel('E:/行业分类/新标准所有规则标注数据过滤掉标题、项目、产品三要素重复的剩余498976条215类.xlsx')[:]
- # df.fillna(' ', inplace=True)
- # c = Counter(df['industry_type'])
- import copy  # used only by the commented-out class-balancing block below
- # df_fill = pd.DataFrame()
- # for k, v in c.items():
- # if v > 1000:
- # print(k, v)
- # df2 = copy.deepcopy(df[df['industry_type']==k])
- # print(len(df2))
- # if len(df2)>1000:
- # df2.drop_duplicates(subset=['doctitle'], inplace=True)
- # print(len(df2))
- # if len(df2) > 1000:
- # df2.drop_duplicates(subset=['project_name'], inplace=True)
- # print(len(df2))
- # if len(df2) > 1000:
- # df2.drop_duplicates(subset=['product'], inplace=True)
- # print(len(df2))
- # if len(df2) > 1000:
- # df2.drop_duplicates(subset=['tenderee'], inplace=True)
- # print(len(df2))
- #
- # df_fill = df_fill.append(df2, ignore_index=True)
- # else:
- # df2 = copy.deepcopy(df[df['industry_type'] == k])
- # df_fill = df_fill.append(df2, ignore_index=True)
- # print('len_df_fill', len(df_fill))
- # df_fill = df_fill.sample(frac=1)
- # df_fill.to_excel('E:/行业分类/新标准所有规则标注数据帅选数据.xlsx')
- # import re
- # df = pd.read_excel('E:/行业分类/新标准所有规则标注数据帅选数据.xlsx')
- # c = Counter(df['industry_type'])
- # print(c.most_common())
- # l = []
- # pos = neg = 0
- # for i in df.index:
- # title = df.loc[i, 'doctitle']
- # name = df.loc[i, 'project_name']
- # ree = df.loc[i, 'tenderee']
- # name = re.sub(
- # '(废标|终止|综?合?评审|评标|开标|资审|履约|验收|成交|中标人?|中选人?|单一来源|合同|候选人|结果|变更|更正|答疑|澄清|意向|需求|采购|招标|询比?价|磋商|谈判|比选|比价|竞价|议价)的?(公告|预告|公示)?|关于为?|选取|定点|直接|邀请函?|通知书?|备案|公开|公示|公告|记录|竞争性',
- # '', name)
- # if name in title:
- # pos += 1
- # else:
- # neg += 1
- # print(name, title)
- # text = title.replace(name, '##').replace(ree, '')
- # text_l = text.split('##')
- # for w in text_l:
- # l.append(w)
- # c = Counter(l)
- # print(c.most_common(100))
- # print('pos:%d, neg:%d'%(pos,neg))
- # with open('E:/行业分类/过滤词.txt', 'w', encoding='utf-8') as f:
- # for it in c.most_common(200):
- # f.write(it[0]+'\n')