# data_process.py
from collections import Counter
import pandas as pd
import fool
import pickle
import json
import time
import re
import numpy as np
import gensim

w2v = r"D:\bidi\BIDI_ML_INFO_EXTRACTION\BiddingKG\dl\wiki_128_word_embedding_new.vector"
# w2v = "/data/python/lishimin/BiddingKG/dl/wiki_128_word_embedding_new.vector"
model_w2v = gensim.models.KeyedVectors.load_word2vec_format(w2v, binary=True)

def cut_words(text_list):
    # FoolNLTK segmentation; given a list of texts, returns one token list per text
    return fool.cut(text_list)

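# Usage sketch (hypothetical strings; mirrors the 4-field call in dump_segwords below):
#   title_, project_, product_, winner_ = cut_words(['某小学教学楼工程招标公告', '教学楼工程', '教学楼', '某建筑公司'])
#   # each returned item is a token list, e.g. title_ -> ['某', '小学', '教学楼', '工程', '招标', '公告'] (illustrative split)
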
def words_docs_list2array(words_docs_list, maxSententLen=20):
    """Map each token list to a (maxSententLen, 128) block of w2v vectors; OOV positions stay zero."""
    array = np.zeros((len(words_docs_list), maxSententLen, 128))
    for i in range(len(words_docs_list)):
        words = words_docs_list[i][-maxSententLen:]  # keep only the last maxSententLen words
        for j in range(len(words)):
            if words[j] in model_w2v:
                array[i][j] = model_w2v[words[j]]
            else:
                print('word not in w2v: ', words[j])
    return array

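# Usage sketch (hypothetical tokens; assumes model_w2v is loaded above):
#   arr = words_docs_list2array([['风电', '设备'], ['市政', '道路', '工程']], maxSententLen=20)
#   arr.shape  # (2, 20, 128); vectors fill positions 0..len(words)-1, trailing rows are zero padding
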
def dump_segwords():
    # df = pd.read_excel('data/新标准所有规则标注数据帅选数据.xlsx')[:]
    # df = pd.read_excel('data/215类数据太少类别补充数据.xlsx')[:]
    df = pd.read_excel('data/新标准所有规则标注数据去除产品长度大于100帅选并补充后165595条215类.xlsx')[:]
    df.dropna(subset=['industry_type'], inplace=True)
    print(df.head(5))
    df.fillna(' ', inplace=True)
    print('len_df', len(df))
    df = df[df['industry_type'] != ' ']
    print('len_df', len(df))
    c = Counter(df['industry_type'])
    print('num classes:', len(c))

    def del_words(tenderee, text):
        # normalize full-width brackets, strip procedural boilerplate terms, then drop the tenderee name
        tenderee = tenderee.replace('(', '(').replace(')', ')')
        text = text.replace('(', '(').replace(')', ')')
        text = re.sub(
            '(废标|终止|综?合?评审|评标|开标|资审|履约|验收|成交|中标人?|中选人?|单一来源|合同|候选人|结果|变更|更正|答疑|澄清|意向|需求|采购|招标|询比?价|磋商|谈判|比选|比价|竞价|议价)的?(公告|预告|公示)?|关于为?|选取|定点|直接|邀请函?|通知书?|备案|公开|公示|公告|记录|竞争性',
            '', text)
        text = text.replace(tenderee, '')
        text = ' ' if text == "" else text
        return text

    def get_winner(sub_docs_json, winner):
        # fall back to the first non-empty win_tenderer in sub_docs_json when winner is missing
        if winner == ' ' and 'win_tenderer' in sub_docs_json:
            l = json.loads(sub_docs_json)
            for d in l:
                if d.get('win_tenderer', '') != "":
                    winner = d.get('win_tenderer', '')
                    return winner
        return winner

    df['doctitle'] = df.apply(lambda x: del_words(x['tenderee'], x['doctitle']), axis=1)
    df['project_name'] = df.apply(lambda x: del_words(x['tenderee'], x['project_name']), axis=1)
    # df['win_tenderer'] = df['sub_docs_json'].apply(lambda x: get_winner(x))
    df['win_tenderer'] = df.apply(lambda x: get_winner(x['sub_docs_json'], x['win_tenderer']), axis=1)
    labels = sorted(set(df['industry_type']))
    lb2id = {k: v for v, k in enumerate(labels)}
    print(df.head(5))
    print(labels, len(labels))
    print(lb2id[labels[0]], labels[0])
    t1 = time.time()
    f = open('data/all_segwords.txt', 'w', encoding='utf-8')
    # f = open('data/all_segwords.txt', 'a', encoding='utf-8')
    n = 0
    for docid, title, project, product, win_tenderer, lb in \
            zip(df['docid'], df['doctitle'], df['project_name'], df['product'], df['win_tenderer'], df['industry_type']):
        try:
            title_, project_, product_, win_tenderer_ = cut_words([title, project, product, win_tenderer])
            segwords = ' '.join(title_) + '#split#' + ' '.join(project_) + '#split#' + ' '.join(product_) + '#split#' + ' '.join(win_tenderer_)
            f.write('%s\t%s\t%s\n' % (lb, segwords, docid))
            n += 1
            if n % 1000 == 0:
                print('processed %d docs' % n)
        except Exception as e:
            print('error : ', e)
    f.close()
    # all_data = []
    # all_data = [(docid, cut_words([title, project, product, win_tenderer]), lb) for docid, title, project, product, win_tenderer, lb in zip(df['docid'], df['doctitle'], df['project_name'], df['product'], df['win_tenderer'], df['industry_type'])]
    # print('all_data_len:', len(all_data))
    # with open('data/all_data.pkl', 'wb') as f:
    #     pickle.dump(all_data, f)
    # with open('data/lb2id.pkl', 'wb') as f:
    #     pickle.dump(lb2id, f)
    t2 = time.time()
    print('total time:', t2 - t1)
# dump_segwords()

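# Each line written by dump_segwords() has the form:
#   label \t title-tokens#split#project-tokens#split#product-tokens#split#winner-tokens \t docid
# Hypothetical example line (label and docid invented for illustration):
#   市政工程\t教学楼 工程#split#教学楼 工程#split#教学楼#split#某 建筑 公司\t12345
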
def get_array_data():
    with open('E:/行业分类/all_data.pkl', 'rb') as f:
        all_data = pickle.load(f)
    with open('E:/行业分类/lb2id.pkl', 'rb') as f:
        lb2id = pickle.load(f)
    print('all_data_len', len(all_data))
    print(lb2id)
    title = [it[0][0] for it in all_data]
    project = [it[0][1] for it in all_data]
    product = [it[0][2] for it in all_data]
    labels_type = [it[1] for it in all_data]
    title_array = words_docs_list2array(title)
    project_array = words_docs_list2array(project)
    product_array = words_docs_list2array(product)
    label_ids = [lb2id[it] for it in labels_type]
    label_array = np.zeros(shape=(len(label_ids), len(lb2id)))
    for i in range(len(label_ids)):
        label_array[i][label_ids[i]] = 1
    print(title_array.shape, label_array.shape)
    return title_array, project_array, product_array, label_array, all_data, lb2id

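# Style note: the one-hot loop in get_array_data (and the copies below) is equivalent to the
# vectorized form:
#   label_array = np.eye(len(lb2id))[label_ids]
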
def split_train_test():
    # with open('E:/行业分类/all_data.pkl', 'rb') as f:
    #     all_data = pickle.load(f)
    # df = pd.DataFrame(all_data, columns=['segword_list', 'label'])
    # df['segword_list'] = df['segword_list'].apply(lambda x: json.dumps(x, ensure_ascii=False))
    # df.drop_duplicates(subset=['segword_list', 'label'], inplace=True)
    # df.drop_duplicates(subset=['segword_list'], inplace=True)
    with open('data/all_segwords.txt', 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    lines = [it.strip().split('\t') for it in lines if '#split#' in it]
    print(lines[:3])
    df = pd.DataFrame(lines, columns=['label', 'segwords', 'docid'])
    print(len(df), df.head(3))
    df.drop_duplicates(subset=['segwords'], inplace=True)
    print(len(df))
    c = Counter(df['label'])
    less_10 = [k for k, v in c.items() if v < 10]  # drop classes with fewer than 10 samples
    df = df[~df['label'].isin(less_10)]
    df_test = pd.DataFrame()
    for lb in set(df['label']):
        df_tmp = df[df['label'] == lb]
        n = int(len(df_tmp) * 0.2)  # stratified 20% test sample per class
        df_test = pd.concat([df_test, df_tmp.sample(n=n)], ignore_index=True)
    df_test = df_test.sample(frac=1)
    # df_train = df[~df['segword_list'].isin(df_test['segword_list'])]
    df_train = df[~df['segwords'].isin(df_test['segwords'])]
    df_train = df_train.sample(frac=1)
    print(len(df), len(df_train), len(df_test))
    df_train.to_excel('data/新标准215类train.xlsx')
    df_test.to_excel('data/新标准215类test.xlsx')
    return df_train, df_test

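# Usage sketch: a per-class stratified 20% test split over the segwords file.
#   df_train, df_test = split_train_test()
#   # both frames carry 'label', 'segwords', 'docid'; classes with < 10 rows were dropped first
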
def df_to_array(df, seq_len=20):
    # df['segword_list'] = df['segword_list'].apply(lambda x: json.loads(x))
    # with open('E:/行业分类/lb2id.pkl', 'rb') as f:
    #     lb2id = pickle.load(f)
    labels = sorted(set(df['label']))
    lb2id = {k: v for v, k in enumerate(labels)}
    # all_data = list(df['segword_list'])
    all_data = df['segwords'].apply(lambda x: x.split('#split#'))
    for l in all_data:
        assert len(l) == 4  # title / project_name / product / win_tenderer
    print('all_data_len', len(all_data))
    print(lb2id)
    title = [it[0].split() for it in all_data]
    project = [it[1].split() for it in all_data]
    product = [it[2].split() for it in all_data]
    labels_type = [it for it in df['label']]
    # keep only tokens made of pure Chinese characters
    title = [[it for it in l if re.search('^[\u4e00-\u9fa5]+$', it)] for l in title]
    project = [[it for it in l if re.search('^[\u4e00-\u9fa5]+$', it)] for l in project]
    product = [[it for it in l if re.search('^[\u4e00-\u9fa5]+$', it)] for l in product]
    title_array = words_docs_list2array(title, seq_len)
    project_array = words_docs_list2array(project, seq_len)
    product_array = words_docs_list2array(product, seq_len)
    label_ids = [lb2id[it] for it in labels_type]
    label_array = np.zeros(shape=(len(label_ids), len(lb2id)))
    for i in range(len(label_ids)):
        label_array[i][label_ids[i]] = 1
    print(title_array.shape, label_array.shape)
    return title_array, project_array, product_array, label_array, all_data, lb2id

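# Usage sketch (reads a file written by split_train_test above):
#   df_train = pd.read_excel('data/新标准215类train.xlsx')
#   t_arr, p_arr, pr_arr, y, _, lb2id = df_to_array(df_train, seq_len=20)
#   # t_arr.shape == (len(df_train), 20, 128); y.shape == (len(df_train), len(lb2id))
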
def getVocabAndMatrix(model):
    """
    Load a word (or character) vector model and return the vocabulary list and embedding matrix,
    filtering out tokens that are not pure Chinese.
    :param model: a loaded gensim.models.keyedvectors.KeyedVectors word/char vector model
    :return: vocab, embedding_matrix, word2index
    """
    vocab = ['<pad>'] + [it for it in model.index2word if re.search('^[\u4e00-\u9fa5]+$', it)]
    word2index = {v: k for k, v in enumerate(vocab)}  # token -> row index in embedding_matrix
    Embedding_size = model[model.index2word[0]].shape[0]
    embedding_matrix = np.zeros((len(vocab), Embedding_size))
    for i in range(1, len(vocab)):  # row 0 stays all-zero for '<pad>'
        embedding_matrix[i] = model[vocab[i]]
    return vocab, embedding_matrix, word2index

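# Usage sketch (assumes a gensim-3.x KeyedVectors exposing .index2word, like model_w2v above):
#   vocab, embedding_matrix, word2index = getVocabAndMatrix(model_w2v)
#   embedding_matrix.shape  # (len(vocab), 128); word2index['<pad>'] == 0
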
def df_to_ids(df, word2index):
    def words2ids(words_docs_list, maxSententLen=20):
        # note: OOV words keep id 0, the same index as '<pad>'
        array = np.zeros((len(words_docs_list), maxSententLen))
        for i in range(len(words_docs_list)):
            words = words_docs_list[i][-maxSententLen:]
            for j in range(len(words)):
                if words[j] in word2index:
                    array[i][j] = word2index[words[j]]
        return array

    df['segword_list'] = df['segword_list'].apply(lambda x: json.loads(x))
    # with open('E:/行业分类/lb2id.pkl', 'rb') as f:
    #     lb2id = pickle.load(f)
    labels = sorted(set(df['label']))
    lb2id = {k: v for v, k in enumerate(labels)}
    all_data = list(df['segword_list'])
    print('all_data_len', len(all_data))
    print(lb2id)
    title = [it[0] for it in all_data]
    project = [it[1] for it in all_data]
    product = [it[2] for it in all_data]
    labels_type = [it for it in df['label']]
    title_ids = words2ids(title)
    project_ids = words2ids(project)
    product_ids = words2ids(product)
    label_ids = [lb2id[it] for it in labels_type]
    label_array = np.zeros(shape=(len(label_ids), len(lb2id)))
    for i in range(len(label_ids)):
        label_array[i][label_ids[i]] = 1
    return title_ids, project_ids, product_ids, label_array, all_data, lb2id

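# Usage sketch (assumes df has a JSON-encoded 'segword_list' column and a 'label' column,
# as built in the commented block at the top of split_train_test):
#   vocab, embedding_matrix, word2index = getVocabAndMatrix(model_w2v)
#   title_ids, project_ids, product_ids, y, _, lb2id = df_to_ids(df, word2index)
#   # each *_ids array is (len(df), 20) of row indices into embedding_matrix
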
if __name__ == "__main__":
    # dump_segwords()
    # split_train_test()
    # df = pd.read_excel('E:/行业分类/新标准所有规则标注数据过滤掉标题、项目、产品三要素重复的剩余498976条215类.xlsx')[:]
    # df.fillna(' ', inplace=True)
    # c = Counter(df['industry_type'])
    import copy
    # df_fill = pd.DataFrame()
    # for k, v in c.items():
    #     if v > 1000:
    #         print(k, v)
    #         df2 = copy.deepcopy(df[df['industry_type'] == k])
    #         print(len(df2))
    #         if len(df2) > 1000:
    #             df2.drop_duplicates(subset=['doctitle'], inplace=True)
    #             print(len(df2))
    #         if len(df2) > 1000:
    #             df2.drop_duplicates(subset=['project_name'], inplace=True)
    #             print(len(df2))
    #         if len(df2) > 1000:
    #             df2.drop_duplicates(subset=['product'], inplace=True)
    #             print(len(df2))
    #         if len(df2) > 1000:
    #             df2.drop_duplicates(subset=['tenderee'], inplace=True)
    #             print(len(df2))
    #
    #         df_fill = df_fill.append(df2, ignore_index=True)
    #     else:
    #         df2 = copy.deepcopy(df[df['industry_type'] == k])
    #         df_fill = df_fill.append(df2, ignore_index=True)
    # print('len_df_fill', len(df_fill))
    # df_fill = df_fill.sample(frac=1)
    # df_fill.to_excel('E:/行业分类/新标准所有规则标注数据帅选数据.xlsx')
    # import re
    # df = pd.read_excel('E:/行业分类/新标准所有规则标注数据帅选数据.xlsx')
    # c = Counter(df['industry_type'])
    # print(c.most_common())
    # l = []
    # pos = neg = 0
    # for i in df.index:
    #     title = df.loc[i, 'doctitle']
    #     name = df.loc[i, 'project_name']
    #     ree = df.loc[i, 'tenderee']
    #     name = re.sub(
    #         '(废标|终止|综?合?评审|评标|开标|资审|履约|验收|成交|中标人?|中选人?|单一来源|合同|候选人|结果|变更|更正|答疑|澄清|意向|需求|采购|招标|询比?价|磋商|谈判|比选|比价|竞价|议价)的?(公告|预告|公示)?|关于为?|选取|定点|直接|邀请函?|通知书?|备案|公开|公示|公告|记录|竞争性',
    #         '', name)
    #     if name in title:
    #         pos += 1
    #     else:
    #         neg += 1
    #         print(name, title)
    #     text = title.replace(name, '##').replace(ree, '')
    #     text_l = text.split('##')
    #     for w in text_l:
    #         l.append(w)
    # c = Counter(l)
    # print(c.most_common(100))
    # print('pos:%d, neg:%d' % (pos, neg))
    # with open('E:/行业分类/过滤词.txt', 'w', encoding='utf-8') as f:
    #     for it in c.most_common(200):
    #         f.write(it[0] + '\n')