@@ -0,0 +1,796 @@
+import tensorflow as tf
+from tensorflow.contrib.crf import crf_log_likelihood, viterbi_decode
+from tensorflow.contrib.layers.python.layers import initializers
+import numpy as np
+import pandas as pd
+import os
+import psycopg2
+import re
+import pickle
+from BiddingKG.dl.common.Utils import *
+from keras.preprocessing.sequence import pad_sequences
+
+def get_data():
+    # with open("viewTrain.txt",'r',encoding='utf-8') as f1,open("viewTest.txt",'r',encoding='utf-8') as f2:
+    #     rows1 = f1.readlines()
+    #     rows2 = f2.readlines()
+    #     rows = rows1 + rows2
+    #     sentence = []
+    #     sentence_label = []
+    #     sentences_and_labels = []
+    #     for row in rows:
+    #         if row[1]!='#':
+    #             sentence.append(row[0])
+    #             sentence_label.append(row[2:-1])
+    #         else:
+    #             sentences_and_labels.append((sentence,sentence_label))
+    #             sentence = []
+    #             sentence_label = []
+    #     print(sentences_and_labels)
+    #     save(sentences_and_labels,"data/old_datas.pk")
+    conn = psycopg2.connect(dbname="iepy",user="postgres",password="postgres",host="192.168.2.101")
+    user_list = [
+        ["test1","2020-08-01","2020-11-25"],
+        ["test11","2020-08-01","2020-11-25"],
+        ["test12","2020-08-01","2020-11-25"],
+        ["test17","2020-08-01","2020-10-31"],
+        ["test19","2020-08-01","2020-11-25"],
+        ["test2","2020-08-01","2020-11-25"],
+        ["test3","2020-08-01","2020-11-25"],
+        ["test7","2020-08-01","2020-11-25"],
+        ["test8","2020-08-01","2020-11-25"],
+        ["test9","2020-08-01","2020-11-25"],
+    ]
+    db_data = []
+    for u in user_list:
+        cur1 = conn.cursor()
+        sql = "SELECT B.document_id,A.text,A.sentences,B.value " \
+              "FROM corpus_iedocument A,brat_bratannotation B " \
+              "WHERE A.human_identifier = B.document_id " \
+              "AND A.edituser = '%s' " \
+              "AND A.edittime >= '%s':: date " \
+              "AND A.edittime <= '%s':: date "
+        # "ORDER BY B.document_id"
+        cur1.execute(sql % (u[0], u[1], u[2]))
+        db_data.extend(cur1.fetchall())
+        cur1.close()
+    # print(len(db_data))
+    # print(db_data[0])
+    columns = ['document_id','text', 'sentences', 'value']
+    df = pd.DataFrame(db_data, columns=columns)
+    df = df[df['value'].str.contains('^T')]
+    # df = df[df['value'].str.contains('code|name|org|company')]
+    df = df[df['value'].str.contains('code|name')]
+    df = df.reset_index(drop=True)
+    value_split = df['value'].str.split(expand=True)
+    value_split.columns = ['_', 'entity_type', 'begin', 'end', 'entity_text']
+    value_split = value_split.drop('_', axis=1)
+    df = pd.concat([df, value_split], axis=1)
+    df = df.drop('value', axis=1)
+    df['begin'] = [int(_) for _ in df['begin']]
+    df['end'] = [int(_) for _ in df['end']]
+    code_left_list = []
+    for begin,text,entity_type in zip(df['begin'],df['text'],df['entity_type']):
+        code_left = ''
+        if entity_type == 'code':
+            code_left = text[max(0,begin-8):begin]
+        code_left_list.append(code_left)
+    df['code_left'] = code_left_list
+    df.to_excel("C:\\Users\\admin\\Desktop\\项目编号和名称\\Code&Name_dbData.xlsx")
+    conn.close()
+
+def data_process():
+    data = pd.read_excel("C:\\Users\\admin\\Desktop\\项目编号和名称\\Code&Name_dbData.xlsx",index_col=0)
+    data['sentences'] = [sentences[1:-1].split(',') for sentences in data['sentences']]
+    data['sentences'] = [[int(s) for s in sentences] for sentences in data['sentences']]
+    memory_set = set()
+    id_list = []
+    text_list = []
+    text_tagLabels = dict()
+    for _id, _text, _sentences in zip(data['document_id'], data['text'], data['sentences']):
+        if _id not in memory_set:
+            memory_set.add(_id)
+            text_list.append(_text)
+            id_list.append(_id)
+            text_tagLabels[_id] = [[],[]]
+    re_drop = re.compile("((?:公开)?招标?|中标(?:结果)?|结果|公[告示]?|招标公告?|中标公[告示]?|候选人公[告示]|终止|"
+                         "[流废]标|资格预审|预审|成交(?:结果)?|交易|交易信息|入围|合同|通知书)$")
+    re_errorCode = re.compile("账号|身份证|机构编号|代理机构|品目|单位编[号码]|索引号|标准[^项目]*$|资产编号|型号|序列号"
+                              "|宗地编号|地块编号|监测编号|不动产证")
+    # |备案[^,.;,。;]*[号码]
+    for id, text, sentences, entity_type, begin, end, entity_text, code_left in zip(data['document_id'], data['text'],
+                                                                                    data['sentences'], data['entity_type'],
+                                                                                    data['begin'], data['end'],
+                                                                                    data['entity_text'], data['code_left']):
+        if entity_type == 'name':
+            if re_drop.search(entity_text):
+                name_2 = re_drop.sub('', re_drop.sub('', entity_text))
+                entity_text = name_2
+            text_tagLabels[id][0].append(entity_text)
+        if entity_type == 'code':
+            if not re_errorCode.search(str(code_left)):
+                text_tagLabels[id][1].append(entity_text)
+    train_data = []
+    max_len = 400
+    def hasNotBeenLabeled(items,code_begin,code):
+        for i in range(code_begin,code_begin+len(code)):
+            if items[i]!="O":
+                return False
+        return True
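+    # hasNotBeenLabeled() returns True only if every position of the candidate
+    # span is still 'O'. Combined with labeling the longest names/codes first
+    # (see the length-descending sorts below), this keeps shorter matches from
+    # overwriting or overlapping a span that has already been tagged.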
+    count = 0
+    for id,text in zip(id_list,text_list):
+        count += 1
+        print(count)
+        names = text_tagLabels[id][0]
+        names = list(set(names))
+        names.sort(key=lambda x: len(x), reverse=True)
+        codes = text_tagLabels[id][1]
+        codes = list(set(codes))
+        codes.sort(key=lambda x: len(x), reverse=True)
+        sentences = text.split('。')
+        for sentence in sentences:
+            l = len(sentence)
+            if l==0:
+                continue
+            elif l > max_len:
+                l = max_len
+                sentence = sentence[:max_len]
+            sentence_label = ['O']*l
+            code_find_flag = False
+            name_find_flag = False
+            if names:
+                for name in names:
+                    name_begins = findAllIndex(name,sentence)
+                    for name_begin in name_begins:
+                        if hasNotBeenLabeled(sentence_label,name_begin,name):
+                            for j in range(name_begin,name_begin+len(name)):
+                                if j==name_begin:
+                                    sentence_label[j] = "PN_B"
+                                elif j==name_begin+len(name)-1:
+                                    sentence_label[j] = "PN_E"
+                                else:
+                                    sentence_label[j] = "PN_M"
+                            name_find_flag = True
+            if codes:
+                for code in codes:
+                    code_begins = findAllIndex(code,sentence)
+                    for code_begin in code_begins:
+                        if hasNotBeenLabeled(sentence_label,code_begin,code):
+                            for j in range(code_begin,code_begin+len(code)):
+                                if j==code_begin:
+                                    sentence_label[j] = "PC_B"
+                                elif j==code_begin+len(code)-1:
+                                    sentence_label[j] = "PC_E"
+                                else:
+                                    sentence_label[j] = "PC_M"
+                            code_find_flag = True
+            if code_find_flag or name_find_flag:
+                train_data.append([sentence,sentence_label])
+            else:
+                # keep 75% of the sentences without any entity as negative samples
+                if np.random.random() <= 0.75:
+                    train_data.append([sentence,sentence_label])
+    print(len(train_data))
+    save(train_data,'train_data_new.pk')
+
+def add_data_process():
+    def hasNotBeenLabeled(items, code_begin, code):
+        for i in range(code_begin, code_begin + len(code)):
+            if items[i] != "O":
+                return False
+        return True
+    train_data = []
+    max_len = 400
+    data_path = "C:\\Users\\admin\\Desktop\\项目编号和名称\\补充数据\\data_"
+    data_names = ["合同编号","出让公告","询价编号","询价单编号","出让成交公示",
+                  "的通知","公告编号","交易编号","询价单号","房产英文类项目名称",
+                  "挂牌编号","申购单号","订单编号","询价书编号"]
+    for data_name in data_names:
+        data = pd.read_csv(data_path+data_name+"_process.csv",index_col=0,encoding='utf-8')
+        count = 0
+        for text,_name,_code in zip(data['text'],data['pj_name'],data['pj_code']):
+            count += 1
+            print(count)
+            names = str(_name).split('+')
+            names.sort(key=lambda x: len(x), reverse=True)
+            codes = str(_code).split('+')
+            codes.sort(key=lambda x: len(x), reverse=True)
+            sentences = text.split('。')
+            for sentence in sentences:
+                l = len(sentence)
+                if l == 0:
+                    continue
+                elif l > max_len:
+                    l = max_len
+                    sentence = sentence[:max_len]
+                sentence_label = ['O'] * l
+                if names:
+                    for name in names:
+                        name_begins = findAllIndex(name, sentence)
+                        for name_begin in name_begins:
+                            if hasNotBeenLabeled(sentence_label, name_begin, name):
+                                for j in range(name_begin, name_begin + len(name)):
+                                    if j == name_begin:
+                                        sentence_label[j] = "PN_B"
+                                    elif j == name_begin + len(name) - 1:
+                                        sentence_label[j] = "PN_E"
+                                    else:
+                                        sentence_label[j] = "PN_M"
+                if codes:
+                    for code in codes:
+                        code_begins = findAllIndex(code, sentence)
+                        for code_begin in code_begins:
+                            if hasNotBeenLabeled(sentence_label, code_begin, code):
+                                for j in range(code_begin, code_begin + len(code)):
+                                    if j == code_begin:
+                                        sentence_label[j] = "PC_B"
+                                    elif j == code_begin + len(code) - 1:
+                                        sentence_label[j] = "PC_E"
+                                    else:
+                                        sentence_label[j] = "PC_M"
+                train_data.append([sentence, sentence_label])
+    d = load('train_data_new.pk')
+    print(len(d))
+    train_data = d + train_data
+    print(len(train_data))
+    print('ok')
+    save(train_data, 'train_data_new2.pk')
+
+def train2():
+    chunk_tags = {
+        'O':0,
+        'PN_B':1,
+        'PN_M':2,
+        'PN_E':3,
+        'PC_B':4,
+        'PC_M':5,
+        'PC_E':6,
+    }
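+    # Tagging scheme illustration (toy example, not taken from the data): for a
+    # hypothetical sentence "编号:A12" whose project code is "A12", the
+    # character-level labels would be ['O', 'O', 'O', 'PC_B', 'PC_M', 'PC_E'];
+    # PN_* marks project-name spans and PC_* marks project-code spans, with
+    # B/M/E for the first, middle and last characters of a span.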
+    # load the pretrained character embedding matrix
+    w2v_matrix = load('w2v_matrix.pk')
+    # print(w2v_matrix[:3])
+    vocab = load('codename_vocab.pk')
+    word2index = dict((w, i) for i, w in enumerate(np.array(vocab)))
+    print(vocab[:2])
+    MAXLEN = 400
+
+    data_x = []
+    data_y = []
+    data1 = load('train_data_new2.pk')
+    for _data in data1:
+        _x = list(_data[0])
+        _x = [word2index.get(_,word2index.get('<unk>')) for _ in _x]
+        _y = _data[1]
+        data_x.append(_x)
+        data_y.append(_y)
+    # append the older annotated data
+    old_datas = load("data/old_datas2.pk")
+    for old_data in old_datas:
+        data_x.append([word2index.get(word,word2index.get('<unk>')) for word in old_data[0]])
+        data_y.append(old_data[1])
+    print("data size:",len(data_x))
+    data_x = np.array([np.array(x) for x in data_x])
+    x_len = [MAXLEN if len(x) > MAXLEN else len(x) for x in data_x]
+    data_y = np.array([np.array([chunk_tags[_] for _ in y]) for y in data_y])
+    data_x = pad_sequences(data_x, maxlen=MAXLEN, padding="post", truncating="post")
+    data_y = pad_sequences(data_y, maxlen=MAXLEN, padding="post", truncating="post")
+    indices = np.random.permutation(data_x.shape[0])
+    count = len(data_x)
+    test_count = int(0.2 * count)
+    test_idx, train_idx = indices[:test_count], indices[test_count:]
+    train_x, test_x = data_x[train_idx, :], data_x[test_idx, :]
+    train_y, test_y = data_y[train_idx, :], data_y[test_idx, :]
+    train_x_len = np.array([x_len[idx] for idx in train_idx])
+    test_x_len = np.array([x_len[idx] for idx in test_idx])
+    print("train data size:",len(train_x))
+    print("test data size:",len(test_x))
+    # save([test_x,test_y,test_x_len],'my_test_data.pk')
+    with tf.Session(graph=tf.Graph()) as sess:
+        char_input,logits,target,keepprob,length,crf_loss,trans,train_op = BiLSTM_CRF_tfmodel(sess,embedding_weights=w2v_matrix)
+        sess.run(tf.global_variables_initializer())
+        epochs = 150
+        saver = tf.train.Saver(max_to_keep=max(epochs,10))
+        batch_size = 1024
+        _test_loss = 10000.
+        _test_f1 = 0.
+        for epoch in range(epochs):
+            batch_nums = 0
+            for x_batch,y_batch,x_len_batch in batch_iter(train_x,train_y,train_x_len,batch_size=batch_size):
+                train_loss,_ = sess.run([crf_loss,train_op],feed_dict={char_input:x_batch,target:y_batch,length:x_len_batch,keepprob:0.7})
+                batch_nums += 1
+                print("--epoch:" + str(epoch))
+                print("--"+str(batch_nums)+"batch_train--", "loss:", train_loss)
+            test_loss_sum = 0.
+            test_sum = 0
+            acc_sum = 0.
+            precision_1 = 0
+            precision_2 = 0
+            recall_1 = 0
+            recall_2 = 0
+            for test_xbatch,test_ybatch,test_xlen in batch_iter(test_x,test_y,test_x_len,batch_size=batch_size):
+                test_loss,_logits,_trans = sess.run([crf_loss,logits,trans],feed_dict={char_input:test_xbatch,target:test_ybatch,length:test_xlen,keepprob:1.0})
+                acc,_precision,_recall = getAcc(test_ybatch, _logits, _trans, test_xlen)
+                batch_len = len(test_xbatch)
+                test_sum += batch_len
+                acc_sum += acc*batch_len
+                precision_1 += _precision[0]
+                precision_2 += _precision[1]
+                recall_1 += _recall[0]
+                recall_2 += _recall[1]
+                test_loss_sum += test_loss*batch_len
+            print("==>epoch:" + str(epoch) + " done")
+            epoch_test_loss = test_loss_sum/test_sum
+            epoch_test_acc = acc_sum/test_sum
+            test_precision = precision_1/precision_2
+            test_recall = recall_1/recall_2
+            test_f1 = ner_f1_score(test_precision,test_recall)
+            print("--test --"," acc:",epoch_test_acc,'test_loss:',epoch_test_loss)
+            print('test_precision:',test_precision,'test_recall',test_recall,'test_f1',test_f1)
+            # if test_f1 > _test_f1:
+            #     _test_f1 = test_f1
+            print("Saving-"+str(epoch)+"-model,test_loss:"+str(epoch_test_loss),'test_f1',test_f1)
+            saver.save(sess,"models_tf/"+str(epoch)+"-L"+str(epoch_test_loss)+"-F"+str(test_f1)+"-P"+str(test_precision)+"-R"+str(test_recall)+"/model.ckpt")
+
+def BiLSTM_CRF_tfmodel(sess,embedding_weights):
+    '''
+    :param embedding_weights: pretrained character embedding matrix
+
+    '''
+    BiRNN_Unit = 100
+    chunk_tags = {
+        'O': 0,
+        'PN_B': 1,
+        'PN_M': 2,
+        'PN_E': 3,
+        'PC_B': 4,
+        'PC_M': 5,
+        'PC_E': 6,
+    }
+
+    def embedding_layer(input,keepprob):
+        # load the pretrained character embedding matrix
+        embedding = tf.get_variable(name="embedding",initializer=np.array(embedding_weights, dtype=np.float32),dtype=tf.float32)
+        embedding = tf.nn.embedding_lookup(params=embedding,ids=input)
+        embedding_drop = tf.nn.dropout(embedding,keepprob)
+        return embedding_drop
+
+    def BiLSTM_Layer(input,length):
+        with tf.variable_scope("BiLSTM"):
+            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Unit,state_is_tuple=True)
+            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Unit,state_is_tuple=True)
+            output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell,backward_cell,input,dtype=tf.float32,sequence_length=length)
+        output = tf.concat(output,2)
+        return output
+
+    def CRF_layer(input,num_tags,BiRNN_Unit,time_step,keepprob):
+        with tf.variable_scope("CRF"):
+            with tf.variable_scope("hidden"):
+                w_hidden = tf.get_variable(name='w_hidden',shape=(BiRNN_Unit*2,BiRNN_Unit),dtype=tf.float32,
+                                           initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
+                b_hidden = tf.get_variable(name='b_hidden',shape=(BiRNN_Unit),dtype=tf.float32,initializer=tf.zeros_initializer())
+                # print(input)
+                input_reshape = tf.reshape(input,shape=(-1,BiRNN_Unit*2))
+                hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape,w_hidden,b_hidden))
+                hidden = tf.nn.dropout(hidden,keepprob)
+            with tf.variable_scope("output"):
+                w_output = tf.get_variable(name='w_output',shape=(BiRNN_Unit,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
+                b_output = tf.get_variable(name='b_output',shape=(num_tags),dtype=tf.float32,initializer=tf.zeros_initializer())
+                pred = tf.nn.xw_plus_b(hidden,w_output,b_output)
+        logits_ = tf.reshape(pred,shape=(-1,time_step,num_tags),name='logits')
+        return logits_
+
+    def layer_loss(input,true_target,num_tags,length):
+        with tf.variable_scope("crf_loss"):
+            trans = tf.get_variable(name='transitons',shape=(num_tags,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer())
+            log_likelihood,trans = crf_log_likelihood(inputs=input,tag_indices=true_target,transition_params=trans,sequence_lengths=length)
+            return tf.reduce_mean(-log_likelihood),trans
+
+    with sess.graph.as_default():
+        char_input = tf.placeholder(name='char_input',shape=(None,None),dtype=tf.int32)
+        target = tf.placeholder(name='target',shape=(None,None),dtype=tf.int32)
+        length = tf.placeholder(name='length',shape=(None,),dtype=tf.int32)
+        keepprob = tf.placeholder(name='keepprob',dtype=tf.float32)
+
+        _embedding = embedding_layer(char_input,keepprob)
+        _shape = tf.shape(char_input)
+        batch_size = _shape[0]
+        step_size = _shape[-1]
+        bilstm = BiLSTM_Layer(_embedding,length)
+        _logits = CRF_layer(bilstm,num_tags=len(chunk_tags),BiRNN_Unit=BiRNN_Unit,time_step=step_size,keepprob=keepprob)
+        crf_loss,trans = layer_loss(_logits,true_target=target,num_tags=len(chunk_tags),length=length)
+        global_step = tf.Variable(0,trainable=False)
+        with tf.variable_scope("optimizer"):
+            opt = tf.train.AdamOptimizer(0.002)
+            grads_vars = opt.compute_gradients(crf_loss)
+            capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]
+            train_op = opt.apply_gradients(capped_grads_vars,global_step)
+        return char_input,_logits,target,keepprob,length,crf_loss,trans,train_op
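+# BiLSTM_CRF_tfmodel() builds the graph and returns, in order: the char-id input
+# placeholder [batch, time], the per-character logits [batch, time, 7], the gold
+# tag placeholder, the dropout keep-probability placeholder, the sequence-length
+# placeholder, the CRF negative log-likelihood loss, the learned [7, 7] transition
+# matrix, and the Adam train op (gradients clipped to [-5, 5]).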
+
+
+def batch_iter(x, y,x_len, batch_size=256):
+    '''
+    :param x: content2id
+    :param y: label2id
+    :param batch_size: number of sentences per training batch
+    :return:
+    '''
+    data_len = len(x)
+    num_batch = int((data_len - 1) / batch_size) + 1  # number of batches per epoch
+    indices = np.random.permutation(data_len)  # shuffle the sample order
+    x = x[indices]
+    y = y[indices]
+    x_len = x_len[indices]
+    for i in range(num_batch):
+        start_id = batch_size * i
+        end_id = min(batch_size*(i+1), data_len)
+        yield x[start_id:end_id], y[start_id:end_id],x_len[start_id:end_id]
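+# Example usage (illustrative only, using the names from train2()): one pass over
+# the shuffled training data.
+#     for x_batch, y_batch, len_batch in batch_iter(train_x, train_y, train_x_len, batch_size=1024):
+#         sess.run(train_op, feed_dict={char_input: x_batch, target: y_batch,
+#                                       length: len_batch, keepprob: 0.7})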
+from sklearn.metrics import accuracy_score
+def getAcc(y_batch,logits,trans,lengths):
+    index = 0
+    small = -1000.0
+
+    preds = []
+    true_tags = []
+    for score, length in zip(logits, lengths):
+        score = score[:length]
+        path, _ = tf.contrib.crf.viterbi_decode(score, trans)
+        preds += path[0:]
+        index += 1
+
+    for y, length in zip(y_batch, lengths):
+        y = y.tolist()
+        true_tags += y[: length]
+    _preds = list(preds)
+    _true_tags = list(true_tags)
+    acc = accuracy_score(np.reshape(true_tags,(-1)), np.reshape(preds,(-1)))
+    precision_1,precision_2,_ = ner_precision(_preds,_true_tags)
+    recall_1,recall_2,_ = ner_recall(_preds,_true_tags)
+
+    return acc,[precision_1,precision_2],[recall_1,recall_2]
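+# Note on the return convention: getAcc() returns the character-level tag accuracy
+# plus [matched predicted spans, total predicted spans] and [matched gold spans,
+# total gold spans]; train2() sums these numerators and denominators over all test
+# batches before dividing, i.e. precision and recall are micro-averaged over the
+# whole test set.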
+
+
+def decode(logits, trans, sequence_lengths, tag_num):
+    viterbi_sequences = []
+    for logit, length in zip(logits, sequence_lengths):
+        score = logit[:length]
+        viterbi_seq, viterbi_score = viterbi_decode(score, trans)
+        viterbi_sequences.append(viterbi_seq)
+    return viterbi_sequences
+
+
+def new_process():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\项目编号和名称\\data_询价书编号.csv",index_col=0,encoding='utf-8')
+    text_list = []
+    for id,text in zip(data['id'],data['text']):
+        # id_list.append(id)
+        text_list.append(text)
+    page_content = get_article1(text_list)
+    data['text'] = page_content
+    data.to_csv('C:\\Users\\admin\\Desktop\\项目编号和名称\\data_询价书编号_process.csv')
+
+def new_test_code():
+    data = pd.read_csv("C:\\Users\\admin\\Desktop\\code_test_process2.csv",index_col=0)
+    sentences_list = []
+    for text in data['text']:
+        sentences = text.split("。")
+        sentences_list.append(sentences)
+    model_path = "models_tf/27-0.984184712668-0.598231307426/model.ckpt"
+    name_list,code_list = predict_CodeName(sentences_list,model_path)
+    data['code'] = code_list
+    data['name'] = name_list
+    data.to_csv("C:\\Users\\admin\\Desktop\\code_test结果2-3.csv")
+
+
+def predict_CodeName(articles,model_path):
+
+    w2v_matrix = load('w2v_matrix.pk')
+    vocab = load('codename_vocab.pk')
+    word2index = dict((w, i) for i, w in enumerate(np.array(vocab)))
+
+    sess = tf.Session(graph=tf.Graph())
+    with sess:
+        char_input, logits, target, keepprob,length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
+        sess.run(tf.global_variables_initializer())
+        saver = tf.train.Saver()
+        saver.restore(sess, model_path)
+        re_name = re.compile("12*3")
+        re_code = re.compile("45*6")
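+        # The Viterbi output for each sentence is joined into a digit string of
+        # tag ids (see chunk_tags: 1/2/3 = PN_B/PN_M/PN_E, 4/5/6 = PC_B/PC_M/PC_E),
+        # so "12*3" matches a complete project-name span and "45*6" a complete
+        # project-code span; each regex match offset is therefore also a character
+        # offset into the original sentence.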
+        article_name_list = []
+        article_code_list = []
+        count = 0
+        for sentences in articles:
+            if len(sentences)>500:
+                sentences = sentences[:500]
+            # print(len(sentences))
+            count += 1
+            print(count)
+            sentence_len = [ min(len(sentence),2000) for sentence in sentences]
+            # maxlen = max(sentence_len)
+            maxlen = max(sentence_len)
+            sentences_x = []
+            for sentence in sentences:
+                sentence = list(sentence)
+                sentence2id = [word2index.get(word,word2index.get('<unk>')) for word in sentence]
+                sentences_x.append(sentence2id)
+            sentences_x = pad_sequences(sentences_x,maxlen=maxlen,padding="post", truncating="post")
+            sentences_x = [np.array(x) for x in sentences_x]
+
+            _logits,_trans = sess.run([logits,trans],feed_dict={char_input:np.array(sentences_x),length:sentence_len,keepprob:1.0})
+
+            viterbi_sequence = decode(logits=_logits,trans=_trans,sequence_lengths=sentence_len,tag_num=7)
+            # print("==",_logits)
+            name_list = []
+            code_list = []
+            sentence_index = 0
+            for _seq,sentence in zip(viterbi_sequence,sentences):
+                seq_id = ''.join([str(s) for s in _seq])
+                if re_name.search(seq_id):
+                    for _name in re_name.finditer(seq_id):
+                        start = _name.start()
+                        end = _name.end()
+                        n = sentence[start:end]
+                        name_list.append((n,start + sentence_index,end + sentence_index))
+                if re_code.search(seq_id):
+                    for _code in re_code.finditer(seq_id):
+                        start = _code.start()
+                        end = _code.end()
+                        c = sentence[start:end]
+                        # print(n,'<==>',start,end)
+                        code_list.append((c,start + sentence_index,end + sentence_index))
+                sentence_index += len(sentence)
+            article_name_list.append(name_list)
+            article_code_list.append(code_list)
+    return article_name_list,article_code_list
+from BiddingKG.dl.interface.Preprocessing import *
+# preprocess the raw announcement html
+def get_article1(articles,cost_time = dict(),useselffool=True):
+    '''
+    :param articles: article source html to be processed
+    :param useselffool: whether to use selffool
+    :return: list_articles
+    '''
+
+    list_articles = []
+    for article in articles:
+        a_time = time.time()
+        sourceContent = article
+        # table handling
+        key_preprocess = "tableToText"
+        start_time = time.time()
+        article_processed = segment(tableToText(BeautifulSoup(sourceContent,"lxml")))
+
+        # log(article_processed)
+
+        if key_preprocess not in cost_time:
+            cost_time[key_preprocess] = 0
+        cost_time[key_preprocess] += time.time()-start_time
+
+        #article_processed = article[1]
+        list_articles.append(article_processed)
+        print(time.time()-a_time)
+    return list_articles
+# split each article into sentences
+def get_sentences1(list_articles,useselffool=True,cost_time=dict()):
+    '''
+
+    :param list_articles: preprocessed article text
+    :return: list_sentences
+    '''
+
+    list_sentences = []
+    for article in list_articles:
+        a_time = time.time()
+        list_sentences_temp = []
+        # table handling
+        key_preprocess = "tableToText"
+        start_time = time.time()
+        article_processed = article
+
+
+        if key_preprocess not in cost_time:
+            cost_time[key_preprocess] = 0
+        cost_time[key_preprocess] += time.time()-start_time
+
+        # nlp processing
+        if article_processed is not None and len(article_processed)!=0:
+            split_patten = "。"
+            sentences = []
+            _begin = 0
+            sentences_set = set()
+            for _iter in re.finditer(split_patten,article_processed):
+                _sen = article_processed[_begin:_iter.span()[1]]
+                if len(_sen)>0 and _sen not in sentences_set:
+                    sentences.append(_sen)
+                    sentences_set.add(_sen)
+                _begin = _iter.span()[1]
+            _sen = article_processed[_begin:]
+            if len(_sen)>0 and _sen not in sentences_set:
+                sentences.append(_sen)
+                sentences_set.add(_sen)
+
+            '''
+            tokens_all = fool.cut(sentences)
+            #pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
+            #ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
+            ner_entitys_all = fool.ner(sentences)
+            '''
+            # rate-limited execution
+            key_nerToken = "nerToken"
+            start_time = time.time()
+            # tokens_all = getTokens(sentences,useselffool=useselffool)
+            if key_nerToken not in cost_time:
+                cost_time[key_nerToken] = 0
+            cost_time[key_nerToken] += time.time()-start_time
+
+
+            for sentence_index in range(len(sentences)):
+
+                sentence_text = sentences[sentence_index]
+
+                list_sentences_temp.append(sentence_text)
+
+        if len(list_sentences_temp)==0:
+            # fall back to an empty placeholder sentence so every article yields one entry
+            list_sentences_temp.append('')
+        list_sentences.append(list_sentences_temp)
+        print('2:',time.time()-a_time)
+    return list_sentences
+
+def _find_tag(labels,B_label,M_label,E_label):
+    result = []
+    ner_begin = 0
+    ner_end = 0
+    for num in range(len(labels)):
+        if labels[num] == B_label:
+            ner_begin = num
+            continue
+        if labels[num] == M_label and labels[num-1] == B_label:
+            continue
+        if labels[num] == M_label and labels[num-1] == M_label:
+            continue
+        if labels[num] == E_label:
+            if labels[num-1] == M_label or labels[num-1] == B_label:
+                ner_end = num+1
+                result.append((ner_begin,ner_end))
+                ner_begin = 0
+                ner_end = 0
+    return result
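+# Worked example (illustrative only): with B_label=1, M_label=2, E_label=3,
+# _find_tag([0, 1, 2, 3, 0], 1, 2, 3) returns [(1, 4)], i.e. the half-open
+# character span [1, 4) covering the B/M/E run.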
+
+
+def find_all_tag(labels):
+    # tags = [("PN_B","PN_M","PN_E"),("PC_B","PC_M","PC_E")]
+    tags = [(1,2,3),(4,5,6)]
+    result = []
+    for tag in tags:
+        res = _find_tag(labels,B_label=tag[0],M_label=tag[1],E_label=tag[2])
+        result.append(res)
+    return result
+
+
+def ner_precision(pre_labels,true_labels):
+    '''
+    :param pre_labels: list
+    :param true_labels: list
+    :return:
+    '''
+    pre = []
+
+    pre_result = find_all_tag(pre_labels)
+    for item in pre_result:
+        for _item in item:
+            if pre_labels[_item[0]:_item[1]] == true_labels[_item[0]:_item[1]]:
+                pre.append(1)
+            else:
+                pre.append(0)
+    _sum = sum(pre)
+    _l = len(pre)
+    if not _l:
+        _l = 0.0001
+    return _sum,_l,_sum/_l
+
+
+def ner_recall(pre_labels,true_labels):
+    '''
+    :param pre_labels: list
+    :param true_labels: list
+    :return:
+    '''
+    recall = []
+
+    true_result = find_all_tag(true_labels)
+    for item in true_result:
+        for _item in item:
+            if pre_labels[_item[0]:_item[1]] == true_labels[_item[0]:_item[1]]:
+                recall.append(1)
+            else:
+                recall.append(0)
+    _sum = sum(recall)
+    _l = len(recall)
+    if not _l:
+        _l = 0.0001
+    return _sum, _l, _sum/_l
+
+
+def ner_f1_score(precision,recall):
+    _temp = precision+recall
+    if not _temp:
+        _temp = 0.0001
+    return (2*precision*recall)/(_temp)
+
+def old_data_update():
+    data = load('data/old_datas.pk')
+    # print(len(data))
+    re_code = re.compile("(?:(?:公告|合同)[^,,。:;]{,3}编号[::]*|寻源单据?号|计划[编文]?号|交易编[号码]|询价单编?[码号]|采购项目编号)([\-\d\w\(\)\(\)\[\]\【\】号]{3,})",re.A)
+    index = 0
+    updat_list = []
+    for d in data:
+        sentence = ''.join(d[0])
+        label = d[1]
+        if re_code.search(sentence):
+            for item in re_code.finditer(sentence):
+                begin,end = item.span()
+                # print(sentence[max(0,begin-8):end])
+                # print(sentence[begin:end])
+                la = label[begin:end]
+                if 'PC_B' not in la:
+                    updat_list.append(index)
+        index += 1
+    updat_list = list(set(updat_list))
+    print(len(updat_list))
+    for u in updat_list:
+        item = data[u]
+        sentence = ''.join(item[0])
+        label = item[1]
+        re_res = re_code.findall(sentence)
+        for res in re_res:
+            begin = findAllIndex(res,sentence)
+            for b in begin:
+                e = b + len(res)
+                label[b] = 'PC_B'
+                label[e-1] = 'PC_E'
+                for i in range(b+1,e-1):
+                    label[i] = 'PC_M'
+        data[u] = (item[0],label)
+        # print(sentence)
+        # print('---')
+        # print(label)
+    save(data,'data/old_datas2.pk')
+
+def get_word_matrix():
+    # fetch the pretrained character embeddings
+    vocab_model = getModel_word()
+    _, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
+    # drop the original all-zero <pad> row in the first position
+    w2v_matrix = w2v_matrix[1:]
+    # <pad>
+    pad_0 = np.zeros((1, w2v_matrix.shape[1]), dtype=float)
+    # <unk>
+    unk_1 = np.random.normal(-0.25, 0.25, (1, w2v_matrix.shape[1]))
+    w2v_matrix = np.concatenate((pad_0, unk_1, w2v_matrix), axis=0)
+    print(w2v_matrix[:3])
+    save(w2v_matrix,"w2v_matrix.pk")
+
+if __name__ == '__main__':
+    # get_data()
+    # data_process()
+    # add_data_process()
+    # train2()
+    # test2()
+    # new_test()
+    # new_process()
+    # new_test_code()
+    # get_word_matrix()
+    # old_data_update()
+
+    # model_path = "models_tf/76-L0.472526232355-F0.8848208266348597-P0.8845455959355073-R0.8850962286662862/model.ckpt"
+    model_path = "models_tf/59-L0.471516189943-F0.8802154826344823-P0.8789179683459191-R0.8815168335321886/model.ckpt"
+    text = '''[X2002185]2020年11月麻城市生活垃圾焚烧发电项目厂前区零星计划
+    '''
+    name_list, code_list = predict_CodeName([text.split('。')], model_path)
+    print(name_list)
+    print(code_list)
+
+    pass