# projectCodeAndName_tf.py

import tensorflow as tf
from tensorflow.contrib.crf import crf_log_likelihood
from tensorflow.contrib.layers.python.layers import initializers
import numpy as np
import pandas as pd
import os
import psycopg2
import re
import pickle
from BiddingKG.dl.common.Utils import *
from keras.preprocessing.sequence import pad_sequences

def get_data():
    # with open("viewTrain.txt",'r',encoding='utf-8') as f1,open("viewTest.txt",'r',encoding='utf-8') as f2:
    #     rows1 = f1.readlines()
    #     rows2 = f2.readlines()
    #     rows = rows1 + rows2
    #     sentence = []
    #     sentence_label = []
    #     sentences_and_labels = []
    #     for row in rows:
    #         if row[1]!='#':
    #             sentence.append(row[0])
    #             sentence_label.append(row[2:-1])
    #         else:
    #             sentences_and_labels.append((sentence,sentence_label))
    #             sentence = []
    #             sentence_label = []
    #     print(sentences_and_labels)
    #     save(sentences_and_labels,"data/old_datas.pk")
    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.101")
    user_list = [
        ["test1", "2020-08-01", "2020-11-25"],
        ["test11", "2020-08-01", "2020-11-25"],
        ["test12", "2020-08-01", "2020-11-25"],
        ["test17", "2020-08-01", "2020-10-31"],
        ["test19", "2020-08-01", "2020-11-25"],
        ["test2", "2020-08-01", "2020-11-25"],
        ["test3", "2020-08-01", "2020-11-25"],
        ["test7", "2020-08-01", "2020-11-25"],
        ["test8", "2020-08-01", "2020-11-25"],
        ["test9", "2020-08-01", "2020-11-25"],
    ]
    db_data = []
    for u in user_list:
        cur1 = conn.cursor()
        sql = "SELECT B.document_id,A.text,A.sentences,B.value " \
              "FROM corpus_iedocument A,brat_bratannotation B " \
              "WHERE A.human_identifier = B.document_id " \
              "AND A.edituser = '%s' " \
              "AND A.edittime >= '%s':: date " \
              "AND A.edittime <= '%s':: date "
              # "ORDER BY B.document_id"
        cur1.execute(sql % (u[0], u[1], u[2]))
        db_data.extend(cur1.fetchall())
        cur1.close()
    # print(len(db_data))
    # print(db_data[0])
    columns = ['document_id', 'text', 'sentences', 'value']
    df = pd.DataFrame(db_data, columns=columns)
    df = df[df['value'].str.contains('^T')]
    # df = df[df['value'].str.contains('code|name|org|company')]
    df = df[df['value'].str.contains('code|name')]
    df = df.reset_index(drop=True)
    value_split = df['value'].str.split(expand=True)
    value_split.columns = ['_', 'entity_type', 'begin', 'end', 'entity_text']
    value_split = value_split.drop('_', axis=1)
    df = pd.concat([df, value_split], axis=1)
    df = df.drop('value', axis=1)
    df['begin'] = [int(_) for _ in df['begin']]
    df['end'] = [int(_) for _ in df['end']]
    code_left_list = []
    for begin, text, entity_type in zip(df['begin'], df['text'], df['entity_type']):
        code_left = ''
        if entity_type == 'code':
            code_left = text[max(0, begin - 8):begin]
        code_left_list.append(code_left)
    df['code_left'] = code_left_list
    df.to_excel("C:\\Users\\admin\\Desktop\\项目编号和名称\\Code&Name_dbData.xlsx")
    conn.close()

def data_process():
    data = pd.read_excel("C:\\Users\\admin\\Desktop\\项目编号和名称\\Code&Name_dbData.xlsx", index_col=0)
    data['sentences'] = [sentences[1:-1].split(',') for sentences in data['sentences']]
    data['sentences'] = [[int(s) for s in sentences] for sentences in data['sentences']]
    memory_set = set()
    id_list = []
    text_list = []
    text_tagLabels = dict()
    for _id, _text, _sentences in zip(data['document_id'], data['text'], data['sentences']):
        if _id not in memory_set:
            memory_set.add(_id)
            text_list.append(_text)
            id_list.append(_id)
            text_tagLabels[_id] = [[], []]
    re_drop = re.compile("((?:公开)?招标?|中标(?:结果)?|结果|公[告示]?|招标公告?|中标公[告示]?|候选人公[告示]|终止|"
                         "[流废]标|资格预审|预审|成交(?:结果)?|交易|交易信息|入围|合同|通知书)$")
    re_errorCode = re.compile("账号|身份证|机构编号|代理机构|品目|单位编[号码]|索引号|标准[^项目]*$|资产编号|型号|序列号"
                              "|宗地编号|地块编号|监测编号|不动产证")
    # |备案[^,.;,。;]*[号码]
    for id, text, sentences, entity_type, begin, end, entity_text, code_left in zip(data['document_id'], data['text'],
                                                                                    data['sentences'], data['entity_type'],
                                                                                    data['begin'], data['end'],
                                                                                    data['entity_text'], data['code_left']):
        if entity_type == 'name':
            if re_drop.search(entity_text):
                name_2 = re_drop.sub('', re_drop.sub('', entity_text))
                entity_text = name_2
            text_tagLabels[id][0].append(entity_text)
        if entity_type == 'code':
            if not re_errorCode.search(str(code_left)):
                text_tagLabels[id][1].append(entity_text)
    train_data = []
    max_len = 400

    def hasNotBeenLabeled(items, code_begin, code):
        for i in range(code_begin, code_begin + len(code)):
            if items[i] != "O":
                return False
        return True

    count = 0
    for id, text in zip(id_list, text_list):
        count += 1
        print(count)
        names = text_tagLabels[id][0]
        names = list(set(names))
        names.sort(key=lambda x: len(x), reverse=True)
        codes = text_tagLabels[id][1]
        codes = list(set(codes))
        codes.sort(key=lambda x: len(x), reverse=True)
        sentences = text.split('。')
        for sentence in sentences:
            l = len(sentence)
            if l == 0:
                continue
            elif l > max_len:
                l = max_len
                sentence = sentence[:400]
            sentence_label = ['O'] * l
            code_find_flag = False
            name_find_flag = False
            if names:
                for name in names:
                    name_begins = findAllIndex(name, sentence)
                    for name_begin in name_begins:
                        if hasNotBeenLabeled(sentence_label, name_begin, name):
                            for j in range(name_begin, name_begin + len(name)):
                                if j == name_begin:
                                    sentence_label[j] = "PN_B"
                                elif j == name_begin + len(name) - 1:
                                    sentence_label[j] = "PN_E"
                                else:
                                    sentence_label[j] = "PN_M"
                            name_find_flag = True
            if codes:
                for code in codes:
                    code_begins = findAllIndex(code, sentence)
                    for code_begin in code_begins:
                        if hasNotBeenLabeled(sentence_label, code_begin, code):
                            for j in range(code_begin, code_begin + len(code)):
                                if j == code_begin:
                                    sentence_label[j] = "PC_B"
                                elif j == code_begin + len(code) - 1:
                                    sentence_label[j] = "PC_E"
                                else:
                                    sentence_label[j] = "PC_M"
                            code_find_flag = True
            if code_find_flag or name_find_flag:
                train_data.append([sentence, sentence_label])
            else:
                if np.random.random() <= 0.75:
                    train_data.append([sentence, sentence_label])
    print(len(train_data))
    save(train_data, 'train_data_new.pk')
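
# Illustrative sketch (hypothetical sentence, shown only to clarify the labeling scheme
# used by data_process() and add_data_process()): each character gets a BIO-style tag,
# PN_B/PN_M/PN_E for project-name spans and PC_B/PC_M/PC_E for project-code spans.
#   sentence       = "项目编号:A123"
#   sentence_label = ['O','O','O','O','O','PC_B','PC_M','PC_M','PC_E']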

def add_data_process():
    def hasNotBeenLabeled(items, code_begin, code):
        for i in range(code_begin, code_begin + len(code)):
            if items[i] != "O":
                return False
        return True

    train_data = []
    max_len = 400
    data_path = "C:\\Users\\admin\\Desktop\\项目编号和名称\\补充数据\\data_"
    data_names = ["合同编号", "出让公告", "询价编号", "询价单编号", "出让成交公示",
                  "的通知", "公告编号", "交易编号", "询价单号", "房产英文类项目名称",
                  "挂牌编号", "申购单号", "订单编号", "询价书编号"]
    for data_name in data_names:
        data = pd.read_csv(data_path + data_name + "_process.csv", index_col=0, encoding='utf-8')
        count = 0
        for text, _name, _code in zip(data['text'], data['pj_name'], data['pj_code']):
            count += 1
            print(count)
            names = str(_name).split('+')
            names.sort(key=lambda x: len(x), reverse=True)
            codes = str(_code).split('+')
            codes.sort(key=lambda x: len(x), reverse=True)
            sentences = text.split('。')
            for sentence in sentences:
                l = len(sentence)
                if l == 0:
                    continue
                elif l > max_len:
                    l = max_len
                    sentence = sentence[:400]
                sentence_label = ['O'] * l
                if names:
                    for name in names:
                        name_begins = findAllIndex(name, sentence)
                        for name_begin in name_begins:
                            if hasNotBeenLabeled(sentence_label, name_begin, name):
                                for j in range(name_begin, name_begin + len(name)):
                                    if j == name_begin:
                                        sentence_label[j] = "PN_B"
                                    elif j == name_begin + len(name) - 1:
                                        sentence_label[j] = "PN_E"
                                    else:
                                        sentence_label[j] = "PN_M"
                if codes:
                    for code in codes:
                        code_begins = findAllIndex(code, sentence)
                        for code_begin in code_begins:
                            if hasNotBeenLabeled(sentence_label, code_begin, code):
                                for j in range(code_begin, code_begin + len(code)):
                                    if j == code_begin:
                                        sentence_label[j] = "PC_B"
                                    elif j == code_begin + len(code) - 1:
                                        sentence_label[j] = "PC_E"
                                    else:
                                        sentence_label[j] = "PC_M"
                train_data.append([sentence, sentence_label])
    d = load('train_data_new.pk')
    print(len(d))
    train_data = d + train_data
    print(len(train_data))
    print('ok')
    save(train_data, 'train_data_new2.pk')

def train2():
    chunk_tags = {
        'O': 0,
        'PN_B': 1,
        'PN_M': 2,
        'PN_E': 3,
        'PC_B': 4,
        'PC_M': 5,
        'PC_E': 6,
    }
    # load the pretrained character-embedding matrix
    w2v_matrix = load('w2v_matrix.pk')
    # print(w2v_matrix[:3])
    vocab = load('codename_vocab.pk')
    word2index = dict((w, i) for i, w in enumerate(np.array(vocab)))
    print(vocab[:2])
    MAXLEN = 400
    data_x = []
    data_y = []
    data1 = load('train_data_new2.pk')
    for _data in data1:
        _x = list(_data[0])
        _x = [word2index.get(_, word2index.get('<unk>')) for _ in _x]
        _y = _data[1]
        data_x.append(_x)
        data_y.append(_y)
    # merge in the old annotated data
    old_datas = load("data/old_datas2.pk")
    for old_data in old_datas:
        data_x.append([word2index.get(word, word2index.get('<unk>')) for word in old_data[0]])
        data_y.append(old_data[1])
    print("sample count:", len(data_x))
    data_x = np.array([np.array(x) for x in data_x])
    x_len = [MAXLEN if len(x) > MAXLEN else len(x) for x in data_x]
    data_y = np.array([np.array([chunk_tags[_] for _ in y]) for y in data_y])
    data_x = pad_sequences(data_x, maxlen=MAXLEN, padding="post", truncating="post")
    data_y = pad_sequences(data_y, maxlen=MAXLEN, padding="post", truncating="post")
    indices = np.random.permutation(data_x.shape[0])
    count = len(data_x)
    test_count = int(0.2 * count)
    test_idx, train_idx = indices[:test_count], indices[test_count:]
    train_x, test_x = data_x[train_idx, :], data_x[test_idx, :]
    train_y, test_y = data_y[train_idx, :], data_y[test_idx, :]
    train_x_len = np.array([x_len[idx] for idx in train_idx])
    test_x_len = np.array([x_len[idx] for idx in test_idx])
    print("train size:", len(train_x))
    print("test size:", len(test_x))
    # save([test_x,test_y,test_x_len],'my_test_data.pk')
    with tf.Session(graph=tf.Graph()) as sess:
        char_input, logits, target, keepprob, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, embedding_weights=w2v_matrix)
        sess.run(tf.global_variables_initializer())
        epochs = 150
        saver = tf.train.Saver(max_to_keep=max(epochs, 10))
        batch_size = 1024
        _test_loss = 10000.
        _test_f1 = 0.
        for epoch in range(epochs):
            batch_nums = 0
            for x_batch, y_batch, x_len_batch in batch_iter(train_x, train_y, train_x_len, batch_size=batch_size):
                train_loss, _ = sess.run([crf_loss, train_op],
                                         feed_dict={char_input: x_batch, target: y_batch, length: x_len_batch, keepprob: 0.7})
                batch_nums += 1
                print("--epoch:" + str(epoch))
                print("--" + str(batch_nums) + "batch_train--", "loss:", train_loss)
            test_loss_sum = 0.
            test_sum = 0
            acc_sum = 0.
            precision_1 = 0
            precision_2 = 0
            recall_1 = 0
            recall_2 = 0
            for test_xbatch, test_ybatch, test_xlen in batch_iter(test_x, test_y, test_x_len, batch_size=batch_size):
                test_loss, _logits, _trans = sess.run([crf_loss, logits, trans],
                                                      feed_dict={char_input: test_xbatch, target: test_ybatch, length: test_xlen, keepprob: 1.0})
                acc, _precision, _recall = getAcc(test_ybatch, _logits, _trans, test_xlen)
                batch_len = len(test_xbatch)
                test_sum += batch_len
                acc_sum += acc * batch_len
                precision_1 += _precision[0]
                precision_2 += _precision[1]
                recall_1 += _recall[0]
                recall_2 += _recall[1]
                test_loss_sum += test_loss * batch_len
            print("==>epoch:" + str(epoch) + "have_done")
            epoch_test_loss = test_loss_sum / test_sum
            epoch_test_acc = acc_sum / test_sum
            test_precision = precision_1 / precision_2
            test_recall = recall_1 / recall_2
            test_f1 = ner_f1_score(test_precision, test_recall)
            print("--test --", " acc:", epoch_test_acc, 'test_loss:', epoch_test_loss)
            print('test_precision:', test_precision, 'test_recall', test_recall, 'test_f1', test_f1)
            # if test_f1 > _test_f1:
            #     _test_f1 = test_f1
            print("Saving-" + str(epoch) + "-model,test_loss:" + str(epoch_test_loss), 'test_f1', test_f1)
            saver.save(sess, "models_tf/" + str(epoch) + "-L" + str(epoch_test_loss) + "-F" + str(test_f1) + "-P" + str(test_precision) + "-R" + str(test_recall) + "/model.ckpt")

def BiLSTM_CRF_tfmodel(sess, embedding_weights):
    '''
    :param embedding_weights: pretrained character-embedding matrix
    '''
    BiRNN_Unit = 100
    chunk_tags = {
        'O': 0,
        'PN_B': 1,
        'PN_M': 2,
        'PN_E': 3,
        'PC_B': 4,
        'PC_M': 5,
        'PC_E': 6,
    }

    def embedding_layer(input, keepprob):
        # load the pretrained character-embedding matrix
        embedding = tf.get_variable(name="embedding", initializer=np.array(embedding_weights, dtype=np.float32), dtype=tf.float32)
        embedding = tf.nn.embedding_lookup(params=embedding, ids=input)
        embedding_drop = tf.nn.dropout(embedding, keepprob)
        return embedding_drop

    def BiLSTM_Layer(input, length):
        with tf.variable_scope("BiLSTM"):
            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Unit, state_is_tuple=True)
            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Unit, state_is_tuple=True)
            output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell, backward_cell, input, dtype=tf.float32, sequence_length=length)
            output = tf.concat(output, 2)
        return output

    def CRF_layer(input, num_tags, BiRNN_Unit, time_step, keepprob):
        with tf.variable_scope("CRF"):
            with tf.variable_scope("hidden"):
                w_hidden = tf.get_variable(name='w_hidden', shape=(BiRNN_Unit * 2, BiRNN_Unit), dtype=tf.float32,
                                           initializer=initializers.xavier_initializer(), regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_hidden = tf.get_variable(name='b_hidden', shape=(BiRNN_Unit), dtype=tf.float32, initializer=tf.zeros_initializer())
                # print(input)
                input_reshape = tf.reshape(input, shape=(-1, BiRNN_Unit * 2))
                hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape, w_hidden, b_hidden))
                hidden = tf.nn.dropout(hidden, keepprob)
            with tf.variable_scope("output"):
                w_output = tf.get_variable(name='w_output', shape=(BiRNN_Unit, num_tags), dtype=tf.float32, initializer=initializers.xavier_initializer(), regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_output = tf.get_variable(name='b_output', shape=(num_tags), dtype=tf.float32, initializer=tf.zeros_initializer())
                pred = tf.nn.xw_plus_b(hidden, w_output, b_output)
                logits_ = tf.reshape(pred, shape=(-1, time_step, num_tags), name='logits')
            return logits_

    def layer_loss(input, true_target, num_tags, length):
        with tf.variable_scope("crf_loss"):
            trans = tf.get_variable(name='transitons', shape=(num_tags, num_tags), dtype=tf.float32, initializer=initializers.xavier_initializer())
            log_likelihood, trans = crf_log_likelihood(inputs=input, tag_indices=true_target, transition_params=trans, sequence_lengths=length)
            return tf.reduce_mean(-log_likelihood), trans

    with sess.graph.as_default():
        char_input = tf.placeholder(name='char_input', shape=(None, None), dtype=tf.int32)
        target = tf.placeholder(name='target', shape=(None, None), dtype=tf.int32)
        length = tf.placeholder(name='length', shape=(None,), dtype=tf.int32)
        keepprob = tf.placeholder(name='keepprob', dtype=tf.float32)
        _embedding = embedding_layer(char_input, keepprob)
        _shape = tf.shape(char_input)
        batch_size = _shape[0]
        step_size = _shape[-1]
        bilstm = BiLSTM_Layer(_embedding, length)
        _logits = CRF_layer(bilstm, num_tags=len(chunk_tags), BiRNN_Unit=BiRNN_Unit, time_step=step_size, keepprob=keepprob)
        crf_loss, trans = layer_loss(_logits, true_target=target, num_tags=len(chunk_tags), length=length)
        global_step = tf.Variable(0, trainable=False)
        with tf.variable_scope("optimizer"):
            opt = tf.train.AdamOptimizer(0.002)
            grads_vars = opt.compute_gradients(crf_loss)
            capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g, v in grads_vars]
            train_op = opt.apply_gradients(capped_grads_vars, global_step)
        return char_input, _logits, target, keepprob, length, crf_loss, trans, train_op

def batch_iter(x, y, x_len, batch_size=256):
    '''
    :param x: content2id
    :param y: label2id
    :param batch_size: number of sentences per training batch
    :return:
    '''
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1  # number of batches per epoch
    indices = np.random.permutation(data_len)  # random permutation of the sample indices
    x = x[indices]
    y = y[indices]
    x_len = x_len[indices]
    for i in range(num_batch):
        start_id = batch_size * i
        end_id = min(batch_size * (i + 1), data_len)
        yield x[start_id:end_id], y[start_id:end_id], x_len[start_id:end_id]

from sklearn.metrics import accuracy_score
def getAcc(y_batch, logits, trans, lengths):
    index = 0
    small = -1000.0
    preds = []
    true_tags = []
    for score, length in zip(logits, lengths):
        score = score[:length]
        path, _ = tf.contrib.crf.viterbi_decode(score, trans)
        preds += path[0:]
        index += 1
    for y, length in zip(y_batch, lengths):
        y = y.tolist()
        true_tags += y[:length]
    _preds = list(preds)
    _true_tags = list(true_tags)
    acc = accuracy_score(np.reshape(true_tags, (-1)), np.reshape(preds, (-1)))
    precision_1, precision_2, _ = ner_precision(_preds, _true_tags)
    recall_1, recall_2, _ = ner_recall(_preds, _true_tags)
    return acc, [precision_1, precision_2], [recall_1, recall_2]

def decode(logits, trans, sequence_lengths, tag_num):
    viterbi_sequences = []
    for logit, length in zip(logits, sequence_lengths):
        score = logit[:length]
        viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(score, trans)
        viterbi_sequences.append(viterbi_seq)
    return viterbi_sequences

def new_process():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\项目编号和名称\\data_询价书编号.csv", index_col=0, encoding='utf-8')
    text_list = []
    for id, text in zip(data['id'], data['text']):
        # id_list.append(id)
        text_list.append(text)
    page_content = get_article1(text_list)
    data['text'] = page_content
    data.to_csv('C:\\Users\\admin\\Desktop\\项目编号和名称\\data_询价书编号_process.csv')

def new_test_code():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\code_test_process2.csv", index_col=0)
    sentences_list = []
    for text in data['text']:
        sentences = text.split("。")
        sentences_list.append(sentences)
    model_path = "models_tf/27-0.984184712668-0.598231307426/model.ckpt"
    name_list, code_list = predict_CodeName(sentences_list, model_path)
    data['code'] = code_list
    data['name'] = name_list
    data.to_csv("C:\\Users\\admin\\Desktop\\code_test结果2-3.csv")

def predict_CodeName(articles, model_path):
    w2v_matrix = load('w2v_matrix.pk')
    vocab = load('codename_vocab.pk')
    word2index = dict((w, i) for i, w in enumerate(np.array(vocab)))
    model_path = model_path
    sess = tf.Session(graph=tf.Graph())
    with sess:
        char_input, logits, target, keepprob, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, model_path)
        re_name = re.compile("12*3")
        re_code = re.compile("45*6")
        article_name_list = []
        article_code_list = []
        count = 0
        for sentences in articles:
            if len(sentences) > 500:
                sentences = sentences[:500]
            # print(len(sentences))
            count += 1
            print(count)
            sentence_len = [min(len(sentence), 2000) for sentence in sentences]
            # maxlen = max(sentence_len)
            maxlen = max(sentence_len)
            sentences_x = []
            for sentence in sentences:
                sentence = list(sentence)
                sentence2id = [word2index.get(word, word2index.get('<unk>')) for word in sentence]
                sentences_x.append(sentence2id)
            sentences_x = pad_sequences(sentences_x, maxlen=maxlen, padding="post", truncating="post")
            sentences_x = [np.array(x) for x in sentences_x]
            _logits, _trans = sess.run([logits, trans], feed_dict={char_input: np.array(sentences_x), length: sentence_len, keepprob: 1.0})
            viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=7)
            # print("==",_logits)
            name_list = []
            code_list = []
            sentence_index = 0
            for _seq, sentence in zip(viterbi_sequence, sentences):
                seq_id = ''.join([str(s) for s in _seq])
                if re_name.search(seq_id):
                    for _name in re_name.finditer(seq_id):
                        start = _name.start()
                        end = _name.end()
                        n = sentence[start:end]
                        name_list.append((n, start + sentence_index, end + sentence_index))
                if re_code.search(seq_id):
                    for _code in re_code.finditer(seq_id):
                        start = _code.start()
                        end = _code.end()
                        c = sentence[start:end]
                        # print(n,'<==>',start,end)
                        code_list.append((c, start + sentence_index, end + sentence_index))
                sentence_index += len(sentence)
            article_name_list.append(name_list)
            article_code_list.append(code_list)
        return article_name_list, article_code_list
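
# Sketch of predict_CodeName()'s return shape (illustrative values, not real output):
# one list per input article, each entry a (text, begin, end) tuple whose offsets are
# accumulated over that article's sentences.
#   article_name_list = [[('某某项目', 12, 30)], ...]
#   article_code_list = [[('A-2020-001', 3, 13), ...], ...]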

from BiddingKG.dl.interface.Preprocessing import *
# process raw announcement HTML
def get_article1(articles, cost_time=dict(), useselffool=True):
    '''
    :param articles: article source html to be processed
    :param useselffool: whether to use selffool
    :return: list_articles
    '''
    list_articles = []
    for article in articles:
        a_time = time.time()
        sourceContent = article
        # table handling
        key_preprocess = "tableToText"
        start_time = time.time()
        article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))
        # log(article_processed)
        if key_preprocess not in cost_time:
            cost_time[key_preprocess] = 0
        cost_time[key_preprocess] += time.time() - start_time
        # article_processed = article[1]
        list_articles.append(article_processed)
        print(time.time() - a_time)
    return list_articles

# sentence splitting
def get_sentences1(list_articles, useselffool=True, cost_time=dict()):
    '''
    :param list_articles: preprocessed article text
    :return: list_sentences
    '''
    list_sentences = []
    for article in list_articles:
        a_time = time.time()
        list_sentences_temp = []
        # table handling
        key_preprocess = "tableToText"
        start_time = time.time()
        article_processed = article
        if key_preprocess not in cost_time:
            cost_time[key_preprocess] = 0
        cost_time[key_preprocess] += time.time() - start_time
        # nlp processing
        if article_processed is not None and len(article_processed) != 0:
            split_patten = "。"
            sentences = []
            _begin = 0
            sentences_set = set()
            for _iter in re.finditer(split_patten, article_processed):
                _sen = article_processed[_begin:_iter.span()[1]]
                if len(_sen) > 0 and _sen not in sentences_set:
                    sentences.append(_sen)
                    sentences_set.add(_sen)
                _begin = _iter.span()[1]
            _sen = article_processed[_begin:]
            if len(_sen) > 0 and _sen not in sentences_set:
                sentences.append(_sen)
                sentences_set.add(_sen)
            '''
            tokens_all = fool.cut(sentences)
            #pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
            #ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
            ner_entitys_all = fool.ner(sentences)
            '''
            # rate-limited execution
            key_nerToken = "nerToken"
            start_time = time.time()
            # tokens_all = getTokens(sentences,useselffool=useselffool)
            if key_nerToken not in cost_time:
                cost_time[key_nerToken] = 0
            cost_time[key_nerToken] += time.time() - start_time
            for sentence_index in range(len(sentences)):
                sentence_text = sentences[sentence_index]
                list_sentences_temp.append(sentence_text)
            if len(list_sentences_temp) == 0:
                list_sentences_temp.append(sentence_text)
            list_sentences.append(list_sentences_temp)
        print('2:', time.time() - a_time)
    return list_sentences

def _find_tag(labels, B_label, M_label, E_label):
    result = []
    ner_begin = 0
    ner_end = 0
    for num in range(len(labels)):
        if labels[num] == B_label:
            ner_begin = num
            continue
        if labels[num] == M_label and labels[num - 1] == B_label:
            continue
        if labels[num] == M_label and labels[num - 1] == M_label:
            continue
        if labels[num] == E_label:
            if labels[num - 1] == M_label or labels[num - 1] == B_label:
                ner_end = num + 1
                result.append((ner_begin, ner_end))
                ner_begin = 0
                ner_end = 0
    return result

def find_all_tag(labels):
    # tags = [("PN_B","PN_M","PN_E"),("PC_B","PC_M","PC_E")]
    tags = [(1, 2, 3), (4, 5, 6)]
    result = []
    for tag in tags:
        res = _find_tag(labels, B_label=tag[0], M_label=tag[1], E_label=tag[2])
        result.append(res)
    return result
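
# Sketch of find_all_tag() on a toy decoded id sequence (hypothetical input, shown only
# for clarity): the first sub-list holds project-name spans (tags 1/2/3), the second
# project-code spans (tags 4/5/6), each span as (begin, end) offsets.
#   find_all_tag([1, 2, 3, 0, 4, 5, 6])  ->  [[(0, 3)], [(4, 7)]]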

def ner_precision(pre_labels, true_labels):
    '''
    :param pre_tags: list
    :param true_tags: list
    :return:
    '''
    pre = []
    pre_result = find_all_tag(pre_labels)
    for item in pre_result:
        for _item in item:
            if pre_labels[_item[0]:_item[1]] == true_labels[_item[0]:_item[1]]:
                pre.append(1)
            else:
                pre.append(0)
    _sum = sum(pre)
    _l = len(pre)
    if not _l:
        _l = 0.0001
    return _sum, _l, _sum / _l

def ner_recall(pre_labels, true_labels):
    '''
    :param pre_tags: list
    :param true_tags: list
    :return:
    '''
    recall = []
    true_result = find_all_tag(true_labels)
    for item in true_result:
        for _item in item:
            if pre_labels[_item[0]:_item[1]] == true_labels[_item[0]:_item[1]]:
                recall.append(1)
            else:
                recall.append(0)
    _sum = sum(recall)
    _l = len(recall)
    if not _l:
        _l = 0.0001
    return _sum, _l, _sum / _l

def ner_f1_score(precision, recall):
    _temp = precision + recall
    if not _temp:
        _temp = 0.0001
    return (2 * precision * recall) / (_temp)
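
# Toy example of the span-level metrics above (hypothetical label sequences, for
# clarity): one of two predicted spans matches the truth, and the single true span
# is recovered.
#   pred = [1, 2, 3, 0, 4, 6]
#   true = [1, 2, 3, 0, 0, 0]
#   hit, total, p = ner_precision(pred, true)   # -> 1, 2, 0.5
#   hit, total, r = ner_recall(pred, true)      # -> 1, 1, 1.0
#   f1 = ner_f1_score(p, r)                     # -> 2*0.5*1.0/1.5 ≈ 0.667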

def old_data_update():
    data = load('data/old_datas.pk')
    # print(len(data))
    re_code = re.compile("(?:(?:公告|合同)[^,,。:;]{,3}编号[::]*|寻源单据?号|计划[编文]?号|交易编[号码]|询价单编?[码号]|采购项目编号)([\-\d\w\(\)\(\)\[\]\【\】号]{3,})", re.A)
    index = 0
    updat_list = []
    for d in data:
        sentence = ''.join(d[0])
        label = d[1]
        if re_code.search(sentence):
            for item in re_code.finditer(sentence):
                begin, end = item.span()
                # print(sentence[max(0,begin-8):end])
                # print(sentence[begin:end])
                la = label[begin:end]
                if 'PC_B' not in la:
                    updat_list.append(index)
        index += 1
    updat_list = list(set(updat_list))
    print(len(updat_list))
    for u in updat_list:
        item = data[u]
        sentence = ''.join(item[0])
        label = item[1]
        re_res = re_code.findall(sentence)
        for res in re_res:
            begin = findAllIndex(res, sentence)
            for b in begin:
                e = b + len(res)
                label[b] = 'PC_B'
                label[e - 1] = 'PC_E'
                for i in range(b + 1, e - 1):
                    label[i] = 'PC_M'
        data[u] = (item[0], label)
        # print(sentence)
        # print('---')
        # print(label)
    save(data, 'data/old_datas2.pk')

def get_word_matrix():
    # fetch the pretrained character embeddings
    vocab_model = getModel_word()
    _, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
    # drop the first all-zero <pad> row
    w2v_matrix = w2v_matrix[1:]
    # <pad>
    pad_0 = np.zeros((1, w2v_matrix.shape[1]), dtype=float)
    # <unk>
    unk_1 = np.random.normal(-0.25, 0.25, (1, w2v_matrix.shape[1]))
    w2v_matrix = np.concatenate((pad_0, unk_1, w2v_matrix), axis=0)
    print(w2v_matrix[:3])
    save(w2v_matrix, "w2v_matrix.pk")

if __name__ == '__main__':
    # get_data()
    # data_process()
    # add_data_process()
    # train2()
    # test2()
    # new_test()
    # new_process()
    # new_test_code()
    # get_word_matrix()
    # old_data_update()
    # model_path = "models_tf/76-L0.472526232355-F0.8848208266348597-P0.8845455959355073-R0.8850962286662862/model.ckpt"
    model_path = "models_tf/59-L0.471516189943-F0.8802154826344823-P0.8789179683459191-R0.8815168335321886/model.ckpt"
    text = '''[X2002185]2020年11月麻城市生活垃圾焚烧发电项目厂前区零星计划
'''
    name_list, code_list = predict_CodeName([text.split('。')], model_path)
    print(name_list)
    print(code_list)
    pass