# projectCodeAndName_tf.py

import tensorflow as tf
# TF 1.x contrib APIs used by the CRF layer and the weight initializers below
from tensorflow.contrib.crf import crf_log_likelihood
from tensorflow.contrib.layers.python.layers import initializers
import numpy as np
import pandas as pd
import os
import psycopg2
import re
import pickle
from BiddingKG.dl.common.Utils import *
from keras.preprocessing.sequence import pad_sequences
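# Overview: this script builds a character-level BiLSTM-CRF tagger for project codes (PC_*) and
# project names (PN_*) in bidding announcements. get_data() pulls brat annotations from the iepy
# Postgres database, data_process()/add_data_process() turn them into per-character tag sequences,
# train2() trains the model defined in BiLSTM_CRF_tfmodel(), and predict_CodeName() restores a
# checkpoint and extracts (text, begin, end) spans from new articles.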
def get_data():
    # with open("viewTrain.txt", 'r', encoding='utf-8') as f1, open("viewTest.txt", 'r', encoding='utf-8') as f2:
    #     rows1 = f1.readlines()
    #     rows2 = f2.readlines()
    #     rows = rows1 + rows2
    #     sentence = []
    #     sentence_label = []
    #     sentences_and_labels = []
    #     for row in rows:
    #         if row[1] != '#':
    #             sentence.append(row[0])
    #             sentence_label.append(row[2:-1])
    #         else:
    #             sentences_and_labels.append((sentence, sentence_label))
    #             sentence = []
    #             sentence_label = []
    #     print(sentences_and_labels)
    #     save(sentences_and_labels, "data/old_datas.pk")
    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.101")
    user_list = [
        ["test1", "2020-08-01", "2020-11-25"],
        ["test11", "2020-08-01", "2020-11-25"],
        ["test12", "2020-08-01", "2020-11-25"],
        ["test17", "2020-08-01", "2020-10-31"],
        ["test19", "2020-08-01", "2020-11-25"],
        ["test2", "2020-08-01", "2020-11-25"],
        ["test3", "2020-08-01", "2020-11-25"],
        ["test7", "2020-08-01", "2020-11-25"],
        ["test8", "2020-08-01", "2020-11-25"],
        ["test9", "2020-08-01", "2020-11-25"],
    ]
    db_data = []
    for u in user_list:
        cur1 = conn.cursor()
        sql = "SELECT B.document_id,A.text,A.sentences,B.value " \
              "FROM corpus_iedocument A,brat_bratannotation B " \
              "WHERE A.human_identifier = B.document_id " \
              "AND A.edituser = '%s' " \
              "AND A.edittime >= '%s':: date " \
              "AND A.edittime <= '%s':: date "
        # "ORDER BY B.document_id"
        cur1.execute(sql % (u[0], u[1], u[2]))
        db_data.extend(cur1.fetchall())
        cur1.close()
    # print(len(db_data))
    # print(db_data[0])
    columns = ['document_id', 'text', 'sentences', 'value']
    df = pd.DataFrame(db_data, columns=columns)
    df = df[df['value'].str.contains('^T')]
    # df = df[df['value'].str.contains('code|name|org|company')]
    df = df[df['value'].str.contains('code|name')]
    df = df.reset_index(drop=True)
    value_split = df['value'].str.split(expand=True)
    value_split.columns = ['_', 'entity_type', 'begin', 'end', 'entity_text']
    value_split = value_split.drop('_', axis=1)
    df = pd.concat([df, value_split], axis=1)
    df = df.drop('value', axis=1)
    df['begin'] = [int(_) for _ in df['begin']]
    df['end'] = [int(_) for _ in df['end']]
    code_left_list = []
    for begin, text, entity_type in zip(df['begin'], df['text'], df['entity_type']):
        code_left = ''
        if entity_type == 'code':
            # keep up to 8 characters of left context for later filtering of false "code" labels
            code_left = text[max(0, begin - 8):begin]
        code_left_list.append(code_left)
    df['code_left'] = code_left_list
    df.to_excel("C:\\Users\\admin\\Desktop\\项目编号和名称\\Code&Name_dbData.xlsx")
    conn.close()
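# Illustrative sketch (assumed annotation format): each brat_bratannotation `value` parsed above
# looks like "T12 name 34 52 某某工程项目", which str.split(expand=True) turns into the columns
# ['_', 'entity_type', 'begin', 'end', 'entity_text'].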
def data_process():
    data = pd.read_excel("C:\\Users\\admin\\Desktop\\项目编号和名称\\Code&Name_dbData.xlsx", index_col=0)
    data['sentences'] = [sentences[1:-1].split(',') for sentences in data['sentences']]
    data['sentences'] = [[int(s) for s in sentences] for sentences in data['sentences']]
    memory_set = set()
    id_list = []
    text_list = []
    text_tagLabels = dict()
    for _id, _text, _sentences in zip(data['document_id'], data['text'], data['sentences']):
        if _id not in memory_set:
            memory_set.add(_id)
            text_list.append(_text)
            id_list.append(_id)
            text_tagLabels[_id] = [[], []]
    # strip common announcement-type suffixes (tender / award / result notice, etc.) from project names
    re_drop = re.compile("((?:公开)?招标?|中标(?:结果)?|结果|公[告示]?|招标公告?|中标公[告示]?|候选人公[告示]|终止|"
                         "[流废]标|资格预审|预审|成交(?:结果)?|交易|交易信息|入围|合同|通知书)$")
    # left-context patterns indicating the labelled "code" is not a project code (ID card, asset number, etc.)
    re_errorCode = re.compile("账号|身份证|机构编号|代理机构|品目|单位编[号码]|索引号|标准[^项目]*$|资产编号|型号|序列号"
                              "|宗地编号|地块编号|监测编号|不动产证")
    # |备案[^,.;,。;]*[号码]
    for id, text, sentences, entity_type, begin, end, entity_text, code_left in zip(
            data['document_id'], data['text'], data['sentences'], data['entity_type'],
            data['begin'], data['end'], data['entity_text'], data['code_left']):
        if entity_type == 'name':
            if re_drop.search(entity_text):
                name_2 = re_drop.sub('', re_drop.sub('', entity_text))
                entity_text = name_2
            text_tagLabels[id][0].append(entity_text)
        if entity_type == 'code':
            if not re_errorCode.search(str(code_left)):
                text_tagLabels[id][1].append(entity_text)
    train_data = []
    max_len = 400

    def hasNotBeenLabeled(items, code_begin, code):
        for i in range(code_begin, code_begin + len(code)):
            if items[i] != "O":
                return False
        return True

    count = 0
    for id, text in zip(id_list, text_list):
        count += 1
        print(count)
        names = text_tagLabels[id][0]
        names = list(set(names))
        names.sort(key=lambda x: len(x), reverse=True)
        codes = text_tagLabels[id][1]
        codes = list(set(codes))
        codes.sort(key=lambda x: len(x), reverse=True)
        sentences = text.split('。')
        for sentence in sentences:
            l = len(sentence)
            if l == 0:
                continue
            elif l > max_len:
                l = max_len
                sentence = sentence[:400]
            sentence_label = ['O'] * l
            code_find_flag = False
            name_find_flag = False
            if names:
                for name in names:
                    name_begins = findAllIndex(name, sentence)
                    for name_begin in name_begins:
                        if hasNotBeenLabeled(sentence_label, name_begin, name):
                            for j in range(name_begin, name_begin + len(name)):
                                if j == name_begin:
                                    sentence_label[j] = "PN_B"
                                elif j == name_begin + len(name) - 1:
                                    sentence_label[j] = "PN_E"
                                else:
                                    sentence_label[j] = "PN_M"
                            name_find_flag = True
            if codes:
                for code in codes:
                    code_begins = findAllIndex(code, sentence)
                    for code_begin in code_begins:
                        if hasNotBeenLabeled(sentence_label, code_begin, code):
                            for j in range(code_begin, code_begin + len(code)):
                                if j == code_begin:
                                    sentence_label[j] = "PC_B"
                                elif j == code_begin + len(code) - 1:
                                    sentence_label[j] = "PC_E"
                                else:
                                    sentence_label[j] = "PC_M"
                            code_find_flag = True
            if code_find_flag or name_find_flag:
                train_data.append([sentence, sentence_label])
            else:
                # keep 75% of sentences without any entity as negative samples
                if np.random.random() <= 0.75:
                    train_data.append([sentence, sentence_label])
    print(len(train_data))
    save(train_data, 'train_data_new.pk')
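# Illustrative example of one train_data item produced above (hypothetical sentence):
#   sentence       = "项目编号:AB-01"
#   sentence_label = ['O', 'O', 'O', 'O', 'O', 'PC_B', 'PC_M', 'PC_M', 'PC_M', 'PC_E']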
def add_data_process():
    def hasNotBeenLabeled(items, code_begin, code):
        for i in range(code_begin, code_begin + len(code)):
            if items[i] != "O":
                return False
        return True

    train_data = []
    max_len = 400
    data_path = "C:\\Users\\admin\\Desktop\\项目编号和名称\\补充数据\\data_"
    data_names = ["合同编号", "出让公告", "询价编号", "询价单编号", "出让成交公示",
                  "的通知", "公告编号", "交易编号", "询价单号", "房产英文类项目名称",
                  "挂牌编号", "申购单号", "订单编号", "询价书编号"]
    for data_name in data_names:
        data = pd.read_csv(data_path + data_name + "_process.csv", index_col=0, encoding='utf-8')
        count = 0
        for text, _name, _code in zip(data['text'], data['pj_name'], data['pj_code']):
            count += 1
            print(count)
            names = str(_name).split('+')
            names.sort(key=lambda x: len(x), reverse=True)
            codes = str(_code).split('+')
            codes.sort(key=lambda x: len(x), reverse=True)
            sentences = text.split('。')
            for sentence in sentences:
                l = len(sentence)
                if l == 0:
                    continue
                elif l > max_len:
                    l = max_len
                    sentence = sentence[:400]
                sentence_label = ['O'] * l
                if names:
                    for name in names:
                        name_begins = findAllIndex(name, sentence)
                        for name_begin in name_begins:
                            if hasNotBeenLabeled(sentence_label, name_begin, name):
                                for j in range(name_begin, name_begin + len(name)):
                                    if j == name_begin:
                                        sentence_label[j] = "PN_B"
                                    elif j == name_begin + len(name) - 1:
                                        sentence_label[j] = "PN_E"
                                    else:
                                        sentence_label[j] = "PN_M"
                if codes:
                    for code in codes:
                        code_begins = findAllIndex(code, sentence)
                        for code_begin in code_begins:
                            if hasNotBeenLabeled(sentence_label, code_begin, code):
                                for j in range(code_begin, code_begin + len(code)):
                                    if j == code_begin:
                                        sentence_label[j] = "PC_B"
                                    elif j == code_begin + len(code) - 1:
                                        sentence_label[j] = "PC_E"
                                    else:
                                        sentence_label[j] = "PC_M"
                train_data.append([sentence, sentence_label])
    d = load('train_data_new.pk')
    print(len(d))
    train_data = d + train_data
    print(len(train_data))
    print('ok')
    save(train_data, 'train_data_new2.pk')
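# Note: each data_<name>_process.csv read above is expected to provide 'text', 'pj_name' and
# 'pj_code' columns, with multiple names/codes per row joined by '+' (hence the split('+') calls).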
def train2():
    chunk_tags = {
        'O': 0,
        'PN_B': 1,
        'PN_M': 2,
        'PN_E': 3,
        'PC_B': 4,
        'PC_M': 5,
        'PC_E': 6,
    }
    # load the pretrained character embedding matrix
    w2v_matrix = load('w2v_matrix.pk')
    # print(w2v_matrix[:3])
    vocab = load('codename_vocab.pk')
    word2index = dict((w, i) for i, w in enumerate(np.array(vocab)))
    print(vocab[:2])
    MAXLEN = 400
    data_x = []
    data_y = []
    data1 = load('train_data_new2.pk')
    for _data in data1:
        _x = list(_data[0])
        _x = [word2index.get(_, word2index.get('<unk>')) for _ in _x]
        _y = _data[1]
        data_x.append(_x)
        data_y.append(_y)
    # add the old annotated data
    old_datas = load("data/old_datas2.pk")
    for old_data in old_datas:
        data_x.append([word2index.get(word, word2index.get('<unk>')) for word in old_data[0]])
        data_y.append(old_data[1])
    print("数据量:", len(data_x))
    data_x = np.array([np.array(x) for x in data_x])
    x_len = [MAXLEN if len(x) > MAXLEN else len(x) for x in data_x]
    data_y = np.array([np.array([chunk_tags[_] for _ in y]) for y in data_y])
    data_x = pad_sequences(data_x, maxlen=MAXLEN, padding="post", truncating="post")
    data_y = pad_sequences(data_y, maxlen=MAXLEN, padding="post", truncating="post")
    indices = np.random.permutation(data_x.shape[0])
    count = len(data_x)
    test_count = int(0.2 * count)
    test_idx, train_idx = indices[:test_count], indices[test_count:]
    train_x, test_x = data_x[train_idx, :], data_x[test_idx, :]
    train_y, test_y = data_y[train_idx, :], data_y[test_idx, :]
    train_x_len = np.array([x_len[idx] for idx in train_idx])
    test_x_len = np.array([x_len[idx] for idx in test_idx])
    print("训练数据量:", len(train_x))
    print("测试数据量:", len(test_x))
    # save([test_x, test_y, test_x_len], 'my_test_data.pk')
    with tf.Session(graph=tf.Graph()) as sess:
        char_input, logits, target, keepprob, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, embedding_weights=w2v_matrix)
        sess.run(tf.global_variables_initializer())
        epochs = 150
        saver = tf.train.Saver(max_to_keep=max(epochs, 10))
        batch_size = 1024
        _test_loss = 10000.
        _test_f1 = 0.
        for epoch in range(epochs):
            batch_nums = 0
            for x_batch, y_batch, x_len_batch in batch_iter(train_x, train_y, train_x_len, batch_size=batch_size):
                train_loss, _ = sess.run([crf_loss, train_op],
                                         feed_dict={char_input: x_batch, target: y_batch, length: x_len_batch, keepprob: 0.7})
                batch_nums += 1
                print("--epoch:" + str(epoch))
                print("--" + str(batch_nums) + "batch_train--", "loss:", train_loss)
            test_loss_sum = 0.
            test_sum = 0
            acc_sum = 0.
            precision_1 = 0
            precision_2 = 0
            recall_1 = 0
            recall_2 = 0
            for test_xbatch, test_ybatch, test_xlen in batch_iter(test_x, test_y, test_x_len, batch_size=batch_size):
                test_loss, _logits, _trans = sess.run([crf_loss, logits, trans],
                                                      feed_dict={char_input: test_xbatch, target: test_ybatch, length: test_xlen, keepprob: 1.0})
                acc, _precision, _recall = getAcc(test_ybatch, _logits, _trans, test_xlen)
                batch_len = len(test_xbatch)
                test_sum += batch_len
                acc_sum += acc * batch_len
                precision_1 += _precision[0]
                precision_2 += _precision[1]
                recall_1 += _recall[0]
                recall_2 += _recall[1]
                test_loss_sum += test_loss * batch_len
            print("==>epoch:" + str(epoch) + " have_done")
            epoch_test_loss = test_loss_sum / test_sum
            epoch_test_acc = acc_sum / test_sum
            test_precision = precision_1 / precision_2
            test_recall = recall_1 / recall_2
            test_f1 = ner_f1_score(test_precision, test_recall)
            print("--test --", " acc:", epoch_test_acc, 'test_loss:', epoch_test_loss)
            print('test_precision:', test_precision, 'test_recall', test_recall, 'test_f1', test_f1)
            # if test_f1 > _test_f1:
            #     _test_f1 = test_f1
            print("Saving-" + str(epoch) + "-model,test_loss:" + str(epoch_test_loss), 'test_f1', test_f1)
            saver.save(sess, "models_tf/" + str(epoch) + "-L" + str(epoch_test_loss) + "-F" + str(test_f1) + "-P" + str(test_precision) + "-R" + str(test_recall) + "/model.ckpt")
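# Checkpoints are written as models_tf/<epoch>-L<test_loss>-F<f1>-P<precision>-R<recall>/model.ckpt,
# which is the directory naming scheme the model_path values in __main__ point at.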
def BiLSTM_CRF_tfmodel(sess, embedding_weights):
    '''
    :param embedding_weights: pretrained character embedding matrix
    '''
    BiRNN_Unit = 100
    chunk_tags = {
        'O': 0,
        'PN_B': 1,
        'PN_M': 2,
        'PN_E': 3,
        'PC_B': 4,
        'PC_M': 5,
        'PC_E': 6,
    }

    def embedding_layer(input, keepprob):
        # load the pretrained character embedding matrix
        embedding = tf.get_variable(name="embedding", initializer=np.array(embedding_weights, dtype=np.float32), dtype=tf.float32)
        embedding = tf.nn.embedding_lookup(params=embedding, ids=input)
        embedding_drop = tf.nn.dropout(embedding, keepprob)
        return embedding_drop

    def BiLSTM_Layer(input, length):
        with tf.variable_scope("BiLSTM"):
            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Unit, state_is_tuple=True)
            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Unit, state_is_tuple=True)
            output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell, backward_cell, input, dtype=tf.float32, sequence_length=length)
            output = tf.concat(output, 2)
        return output

    def CRF_layer(input, num_tags, BiRNN_Unit, time_step, keepprob):
        with tf.variable_scope("CRF"):
            with tf.variable_scope("hidden"):
                w_hidden = tf.get_variable(name='w_hidden', shape=(BiRNN_Unit * 2, BiRNN_Unit), dtype=tf.float32,
                                           initializer=initializers.xavier_initializer(), regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_hidden = tf.get_variable(name='b_hidden', shape=(BiRNN_Unit), dtype=tf.float32, initializer=tf.zeros_initializer())
                # print(input)
                input_reshape = tf.reshape(input, shape=(-1, BiRNN_Unit * 2))
                hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape, w_hidden, b_hidden))
                hidden = tf.nn.dropout(hidden, keepprob)
            with tf.variable_scope("output"):
                w_output = tf.get_variable(name='w_output', shape=(BiRNN_Unit, num_tags), dtype=tf.float32,
                                           initializer=initializers.xavier_initializer(), regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_output = tf.get_variable(name='b_output', shape=(num_tags), dtype=tf.float32, initializer=tf.zeros_initializer())
                pred = tf.nn.xw_plus_b(hidden, w_output, b_output)
                logits_ = tf.reshape(pred, shape=(-1, time_step, num_tags), name='logits')
            return logits_

    def layer_loss(input, true_target, num_tags, length):
        with tf.variable_scope("crf_loss"):
            # variable name 'transitons' (sic) kept as-is so existing checkpoints still restore
            trans = tf.get_variable(name='transitons', shape=(num_tags, num_tags), dtype=tf.float32, initializer=initializers.xavier_initializer())
            log_likelihood, trans = crf_log_likelihood(inputs=input, tag_indices=true_target, transition_params=trans, sequence_lengths=length)
            return tf.reduce_mean(-log_likelihood), trans

    with sess.graph.as_default():
        char_input = tf.placeholder(name='char_input', shape=(None, None), dtype=tf.int32)
        target = tf.placeholder(name='target', shape=(None, None), dtype=tf.int32)
        length = tf.placeholder(name='length', shape=(None,), dtype=tf.int32)
        keepprob = tf.placeholder(name='keepprob', dtype=tf.float32)
        _embedding = embedding_layer(char_input, keepprob)
        _shape = tf.shape(char_input)
        batch_size = _shape[0]
        step_size = _shape[-1]
        bilstm = BiLSTM_Layer(_embedding, length)
        _logits = CRF_layer(bilstm, num_tags=len(chunk_tags), BiRNN_Unit=BiRNN_Unit, time_step=step_size, keepprob=keepprob)
        crf_loss, trans = layer_loss(_logits, true_target=target, num_tags=len(chunk_tags), length=length)
        global_step = tf.Variable(0, trainable=False)
        with tf.variable_scope("optimizer"):
            opt = tf.train.AdamOptimizer(0.002)
            grads_vars = opt.compute_gradients(crf_loss)
            capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g, v in grads_vars]
            train_op = opt.apply_gradients(capped_grads_vars, global_step)
        return char_input, _logits, target, keepprob, length, crf_loss, trans, train_op
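# Note: the graph above relies on TensorFlow 1.x APIs (tf.placeholder, tf.contrib.rnn,
# tf.contrib.crf, tf.contrib.layers); it is assumed to run under a TF 1.x environment, not TF 2.x.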
def batch_iter(x, y, x_len, batch_size=256):
    '''
    :param x: content2id
    :param y: label2id
    :param batch_size: number of sentences per training batch
    :return:
    '''
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1  # number of batches per epoch
    indices = np.random.permutation(data_len)  # shuffle the sample order
    x = x[indices]
    y = y[indices]
    x_len = x_len[indices]
    for i in range(num_batch):
        start_id = batch_size * i
        end_id = min(batch_size * (i + 1), data_len)
        yield x[start_id:end_id], y[start_id:end_id], x_len[start_id:end_id]
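# Usage sketch (assuming padded arrays as built in train2()):
#   for x_batch, y_batch, len_batch in batch_iter(train_x, train_y, train_x_len, batch_size=1024):
#       ...  # x, y and sequence lengths stay aligned after the shared shuffle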
from sklearn.metrics import accuracy_score

def getAcc(y_batch, logits, trans, lengths):
    index = 0
    small = -1000.0
    preds = []
    true_tags = []
    for score, length in zip(logits, lengths):
        score = score[:length]
        path, _ = viterbi_decode(score, trans)
        preds += path[0:]
        index += 1
    for y, length in zip(y_batch, lengths):
        y = y.tolist()
        true_tags += y[:length]
    _preds = list(preds)
    _true_tags = list(true_tags)
    acc = accuracy_score(np.reshape(true_tags, (-1)), np.reshape(preds, (-1)))
    precision_1, precision_2, _ = ner_precision(_preds, _true_tags)
    recall_1, recall_2, _ = ner_recall(_preds, _true_tags)
    return acc, [precision_1, precision_2], [recall_1, recall_2]
def decode(logits, trans, sequence_lengths, tag_num):
    viterbi_sequences = []
    for logit, length in zip(logits, sequence_lengths):
        score = logit[:length]
        viterbi_seq, viterbi_score = viterbi_decode(score, trans)
        viterbi_sequences.append(viterbi_seq)
    return viterbi_sequences
def new_process():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\项目编号和名称\\data_询价书编号.csv", index_col=0, encoding='utf-8')
    text_list = []
    for id, text in zip(data['id'], data['text']):
        # id_list.append(id)
        text_list.append(text)
    page_content = get_article1(text_list)
    data['text'] = page_content
    data.to_csv('C:\\Users\\admin\\Desktop\\项目编号和名称\\data_询价书编号_process.csv')
def new_test_code():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\code_test_process2.csv", index_col=0)
    sentences_list = []
    for text in data['text']:
        sentences = text.split("。")
        sentences_list.append(sentences)
    model_path = "models_tf/27-0.984184712668-0.598231307426/model.ckpt"
    name_list, code_list = predict_CodeName(sentences_list, model_path)
    data['code'] = code_list
    data['name'] = name_list
    data.to_csv("C:\\Users\\admin\\Desktop\\code_test结果2-3.csv")
def predict_CodeName(articles, model_path):
    w2v_matrix = load('w2v_matrix.pk')
    vocab = load('codename_vocab.pk')
    word2index = dict((w, i) for i, w in enumerate(np.array(vocab)))
    sess = tf.Session(graph=tf.Graph())
    with sess:
        char_input, logits, target, keepprob, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, model_path)
        # decoded tag-id sequences: project name = PN_B(1) PN_M(2)* PN_E(3), project code = PC_B(4) PC_M(5)* PC_E(6)
        re_name = re.compile("12*3")
        re_code = re.compile("45*6")
        article_name_list = []
        article_code_list = []
        count = 0
        for sentences in articles:
            if len(sentences) > 500:
                sentences = sentences[:500]
            # print(len(sentences))
            count += 1
            print(count)
            sentence_len = [min(len(sentence), 2000) for sentence in sentences]
            maxlen = max(sentence_len)
            sentences_x = []
            for sentence in sentences:
                sentence = list(sentence)
                sentence2id = [word2index.get(word, word2index.get('<unk>')) for word in sentence]
                sentences_x.append(sentence2id)
            sentences_x = pad_sequences(sentences_x, maxlen=maxlen, padding="post", truncating="post")
            sentences_x = [np.array(x) for x in sentences_x]
            _logits, _trans = sess.run([logits, trans],
                                       feed_dict={char_input: np.array(sentences_x), length: sentence_len, keepprob: 1.0})
            viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=7)
            # print("==", _logits)
            name_list = []
            code_list = []
            sentence_index = 0
            for _seq, sentence in zip(viterbi_sequence, sentences):
                seq_id = ''.join([str(s) for s in _seq])
                if re_name.search(seq_id):
                    for _name in re_name.finditer(seq_id):
                        start = _name.start()
                        end = _name.end()
                        n = sentence[start:end]
                        name_list.append((n, start + sentence_index, end + sentence_index))
                if re_code.search(seq_id):
                    for _code in re_code.finditer(seq_id):
                        start = _code.start()
                        end = _code.end()
                        c = sentence[start:end]
                        # print(c, '<==>', start, end)
                        code_list.append((c, start + sentence_index, end + sentence_index))
                sentence_index += len(sentence)
            article_name_list.append(name_list)
            article_code_list.append(code_list)
    return article_name_list, article_code_list
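# Illustrative decoding example (hypothetical): for a 12-character sentence whose Viterbi path is
# [0,4,5,5,5,6,0,1,2,2,2,3], seq_id == "045556012223", so re_code matches "45556" (characters 1-5,
# a project code span) and re_name matches "12223" (characters 7-11, a project name span).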
from BiddingKG.dl.interface.Preprocessing import *

# announcement html preprocessing
def get_article1(articles, cost_time=dict(), useselffool=True):
    '''
    :param articles: source html of the articles to be processed
    :param useselffool: whether to use selffool
    :return: list_articles
    '''
    list_articles = []
    for article in articles:
        a_time = time.time()
        sourceContent = article
        # table processing
        key_preprocess = "tableToText"
        start_time = time.time()
        article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))
        # log(article_processed)
        if key_preprocess not in cost_time:
            cost_time[key_preprocess] = 0
        cost_time[key_preprocess] += time.time() - start_time
        # article_processed = article[1]
        list_articles.append(article_processed)
        print(time.time() - a_time)
    return list_articles
# sentence splitting
def get_sentences1(list_articles, useselffool=True, cost_time=dict()):
    '''
    :param list_articles: preprocessed article text
    :return: list_sentences
    '''
    list_sentences = []
    for article in list_articles:
        a_time = time.time()
        list_sentences_temp = []
        # table processing
        key_preprocess = "tableToText"
        start_time = time.time()
        article_processed = article
        if key_preprocess not in cost_time:
            cost_time[key_preprocess] = 0
        cost_time[key_preprocess] += time.time() - start_time
        # nlp processing
        if article_processed is not None and len(article_processed) != 0:
            split_patten = "。"
            sentences = []
            _begin = 0
            sentences_set = set()
            for _iter in re.finditer(split_patten, article_processed):
                _sen = article_processed[_begin:_iter.span()[1]]
                if len(_sen) > 0 and _sen not in sentences_set:
                    sentences.append(_sen)
                    sentences_set.add(_sen)
                _begin = _iter.span()[1]
            _sen = article_processed[_begin:]
            if len(_sen) > 0 and _sen not in sentences_set:
                sentences.append(_sen)
                sentences_set.add(_sen)
            '''
            tokens_all = fool.cut(sentences)
            #pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
            #ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
            ner_entitys_all = fool.ner(sentences)
            '''
            # rate-limited execution
            key_nerToken = "nerToken"
            start_time = time.time()
            # tokens_all = getTokens(sentences, useselffool=useselffool)
            if key_nerToken not in cost_time:
                cost_time[key_nerToken] = 0
            cost_time[key_nerToken] += time.time() - start_time
            for sentence_index in range(len(sentences)):
                sentence_text = sentences[sentence_index]
                list_sentences_temp.append(sentence_text)
            if len(list_sentences_temp) == 0:
                list_sentences_temp.append(sentence_text)
        list_sentences.append(list_sentences_temp)
        print('2:', time.time() - a_time)
    return list_sentences
def _find_tag(labels, B_label, M_label, E_label):
    result = []
    ner_begin = 0
    ner_end = 0
    for num in range(len(labels)):
        if labels[num] == B_label:
            ner_begin = num
            continue
        if labels[num] == M_label and labels[num - 1] == B_label:
            continue
        if labels[num] == M_label and labels[num - 1] == M_label:
            continue
        if labels[num] == E_label:
            if labels[num - 1] == M_label or labels[num - 1] == B_label:
                ner_end = num + 1
                result.append((ner_begin, ner_end))
            ner_begin = 0
            ner_end = 0
    return result
def find_all_tag(labels):
    # tags = [("PN_B", "PN_M", "PN_E"), ("PC_B", "PC_M", "PC_E")]
    tags = [(1, 2, 3), (4, 5, 6)]
    result = []
    for tag in tags:
        res = _find_tag(labels, B_label=tag[0], M_label=tag[1], E_label=tag[2])
        result.append(res)
    return result
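# Illustrative example: with tag ids O=0, PN_B/M/E=1/2/3 and PC_B/M/E=4/5/6,
# find_all_tag([0, 1, 2, 3, 0, 4, 6]) returns [[(1, 4)], [(5, 7)]]:
# one name span at positions 1-3 and one code span at positions 5-6 (end indices are exclusive).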
def ner_precision(pre_labels, true_labels):
    '''
    :param pre_labels: list of predicted tag ids
    :param true_labels: list of gold tag ids
    :return: (number of correct predicted entities, number of predicted entities, precision)
    '''
    pre = []
    pre_result = find_all_tag(pre_labels)
    for item in pre_result:
        for _item in item:
            if pre_labels[_item[0]:_item[1]] == true_labels[_item[0]:_item[1]]:
                pre.append(1)
            else:
                pre.append(0)
    _sum = sum(pre)
    _l = len(pre)
    if not _l:
        _l = 0.0001
    return _sum, _l, _sum / _l
def ner_recall(pre_labels, true_labels):
    '''
    :param pre_labels: list of predicted tag ids
    :param true_labels: list of gold tag ids
    :return: (number of gold entities found, number of gold entities, recall)
    '''
    recall = []
    true_result = find_all_tag(true_labels)
    for item in true_result:
        for _item in item:
            if pre_labels[_item[0]:_item[1]] == true_labels[_item[0]:_item[1]]:
                recall.append(1)
            else:
                recall.append(0)
    _sum = sum(recall)
    _l = len(recall)
    if not _l:
        _l = 0.0001
    return _sum, _l, _sum / _l
def ner_f1_score(precision, recall):
    _temp = precision + recall
    if not _temp:
        _temp = 0.0001
    return (2 * precision * recall) / _temp
def old_data_update():
    data = load('data/old_datas.pk')
    # print(len(data))
    re_code = re.compile("(?:(?:公告|合同)[^,,。:;]{,3}编号[::]*|寻源单据?号|计划[编文]?号|交易编[号码]|询价单编?[码号]|采购项目编号)([\-\d\w\(\)\(\)\[\]\【\】号]{3,})", re.A)
    index = 0
    updat_list = []
    for d in data:
        sentence = ''.join(d[0])
        label = d[1]
        if re_code.search(sentence):
            for item in re_code.finditer(sentence):
                begin, end = item.span()
                # print(sentence[max(0, begin - 8):end])
                # print(sentence[begin:end])
                la = label[begin:end]
                if 'PC_B' not in la:
                    updat_list.append(index)
        index += 1
    updat_list = list(set(updat_list))
    print(len(updat_list))
    for u in updat_list:
        item = data[u]
        sentence = ''.join(item[0])
        label = item[1]
        re_res = re_code.findall(sentence)
        for res in re_res:
            begin = findAllIndex(res, sentence)
            for b in begin:
                e = b + len(res)
                label[b] = 'PC_B'
                label[e - 1] = 'PC_E'
                for i in range(b + 1, e - 1):
                    label[i] = 'PC_M'
        data[u] = (item[0], label)
        # print(sentence)
        # print('---')
        # print(label)
    save(data, 'data/old_datas2.pk')
def get_word_matrix():
    # fetch the pretrained character embeddings
    vocab_model = getModel_word()
    _, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
    # drop the original all-zero <pad> first row
    w2v_matrix = w2v_matrix[1:]
    # <pad>
    pad_0 = np.zeros((1, w2v_matrix.shape[1]), dtype=float)
    # <unk>
    unk_1 = np.random.normal(-0.25, 0.25, (1, w2v_matrix.shape[1]))
    w2v_matrix = np.concatenate((pad_0, unk_1, w2v_matrix), axis=0)
    print(w2v_matrix[:3])
    save(w2v_matrix, "w2v_matrix.pk")
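# Resulting matrix layout: row 0 is the all-zero <pad> vector, row 1 a random <unk> vector, and the
# pretrained character vectors follow from row 2 onward; codename_vocab.pk is assumed to use the
# same index order so the word2index lookups elsewhere in this file stay aligned.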
if __name__ == '__main__':
    # get_data()
    # data_process()
    # add_data_process()
    # train2()
    # test2()
    # new_test()
    # new_process()
    # new_test_code()
    # get_word_matrix()
    # old_data_update()
    # model_path = "models_tf/76-L0.472526232355-F0.8848208266348597-P0.8845455959355073-R0.8850962286662862/model.ckpt"
    model_path = "models_tf/59-L0.471516189943-F0.8802154826344823-P0.8789179683459191-R0.8815168335321886/model.ckpt"
    text = '''[X2002185]2020年11月麻城市生活垃圾焚烧发电项目厂前区零星计划
    '''
    name_list, code_list = predict_CodeName([text.split('。')], model_path)
    print(name_list)
    print(code_list)
    pass