import tensorflow as tf
# these two contrib imports are required below (crf_log_likelihood, initializers)
from tensorflow.contrib.crf import crf_log_likelihood
from tensorflow.contrib.layers.python.layers import initializers
import numpy as np
import pandas as pd
import os
import psycopg2
import re
import pickle
from BiddingKG.dl.common.Utils import *
from keras.preprocessing.sequence import pad_sequences


def get_data():
    # with open("viewTrain.txt",'r',encoding='utf-8') as f1,open("viewTest.txt",'r',encoding='utf-8') as f2:
    #     rows1 = f1.readlines()
    #     rows2 = f2.readlines()
    #     rows = rows1 + rows2
    #     sentence = []
    #     sentence_label = []
    #     sentences_and_labels = []
    #     for row in rows:
    #         if row[1]!='#':
    #             sentence.append(row[0])
    #             sentence_label.append(row[2:-1])
    #         else:
    #             sentences_and_labels.append((sentence,sentence_label))
    #             sentence = []
    #             sentence_label = []
    #     print(sentences_and_labels)
    #     save(sentences_and_labels,"data/old_datas.pk")
    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.101")
    user_list = [
        ["test1", "2020-08-01", "2020-11-25"],
        ["test11", "2020-08-01", "2020-11-25"],
        ["test12", "2020-08-01", "2020-11-25"],
        ["test17", "2020-08-01", "2020-10-31"],
        ["test19", "2020-08-01", "2020-11-25"],
        ["test2", "2020-08-01", "2020-11-25"],
        ["test3", "2020-08-01", "2020-11-25"],
        ["test7", "2020-08-01", "2020-11-25"],
        ["test8", "2020-08-01", "2020-11-25"],
        ["test9", "2020-08-01", "2020-11-25"],
    ]
    db_data = []
    for u in user_list:
        cur1 = conn.cursor()
        sql = "SELECT B.document_id,A.text,A.sentences,B.value " \
              "FROM corpus_iedocument A,brat_bratannotation B " \
              "WHERE A.human_identifier = B.document_id " \
              "AND A.edituser = '%s' " \
              "AND A.edittime >= '%s':: date " \
              "AND A.edittime <= '%s':: date "
        # "ORDER BY B.document_id"
        cur1.execute(sql % (u[0], u[1], u[2]))
        db_data.extend(cur1.fetchall())
        cur1.close()
    # print(len(db_data))
    # print(db_data[0])
    columns = ['document_id', 'text', 'sentences', 'value']
    df = pd.DataFrame(db_data, columns=columns)
    df = df[df['value'].str.contains('^T')]
    # df = df[df['value'].str.contains('code|name|org|company')]
    df = df[df['value'].str.contains('code|name')]
    df = df.reset_index(drop=True)
    value_split = df['value'].str.split(expand=True)
    value_split.columns = ['_', 'entity_type', 'begin', 'end', 'entity_text']
    value_split = value_split.drop('_', axis=1)
    df = pd.concat([df, value_split], axis=1)
    df = df.drop('value', axis=1)
    df['begin'] = [int(_) for _ in df['begin']]
    df['end'] = [int(_) for _ in df['end']]
    code_left_list = []
    for begin, text, entity_type in zip(df['begin'], df['text'], df['entity_type']):
        code_left = ''
        if entity_type == 'code':
            # keep the 8 characters to the left of a code entity for later filtering
            code_left = text[max(0, begin - 8):begin]
        code_left_list.append(code_left)
    df['code_left'] = code_left_list
    df.to_excel("C:\\Users\\admin\\Desktop\\项目编号和名称\\Code&Name_dbData.xlsx")
    conn.close()
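
# Illustrative sketch (not part of the original pipeline): get_data() relies on each brat
# annotation "value" splitting into five whitespace-separated fields, mirroring the
# value_split columns above. The sample string below is hypothetical.
def _example_split_brat_value():
    value = "T12 code 105 117 XJ-2020-0001"                   # hypothetical brat annotation value
    _, entity_type, begin, end, entity_text = value.split(maxsplit=4)
    return entity_type, int(begin), int(end), entity_text     # ('code', 105, 117, 'XJ-2020-0001')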


def data_process():
    data = pd.read_excel("C:\\Users\\admin\\Desktop\\项目编号和名称\\Code&Name_dbData.xlsx", index_col=0)
    data['sentences'] = [sentences[1:-1].split(',') for sentences in data['sentences']]
    data['sentences'] = [[int(s) for s in sentences] for sentences in data['sentences']]
    memory_set = set()
    id_list = []
    text_list = []
    text_tagLabels = dict()
    for _id, _text, _sentences in zip(data['document_id'], data['text'], data['sentences']):
        if _id not in memory_set:
            memory_set.add(_id)
            text_list.append(_text)
            id_list.append(_id)
            text_tagLabels[_id] = [[], []]
    re_drop = re.compile("((?:公开)?招标?|中标(?:结果)?|结果|公[告示]?|招标公告?|中标公[告示]?|候选人公[告示]|终止|"
                         "[流废]标|资格预审|预审|成交(?:结果)?|交易|交易信息|入围|合同|通知书)$")
    re_errorCode = re.compile("账号|身份证|机构编号|代理机构|品目|单位编[号码]|索引号|标准[^项目]*$|资产编号|型号|序列号"
                              "|宗地编号|地块编号|监测编号|不动产证")  # |备案[^,.;,。;]*[号码]
    for id, text, sentences, entity_type, begin, end, entity_text, code_left in zip(
            data['document_id'], data['text'], data['sentences'], data['entity_type'],
            data['begin'], data['end'], data['entity_text'], data['code_left']):
        if entity_type == 'name':
            if re_drop.search(entity_text):
                # strip announcement-type suffixes from the project name (applied twice for stacked suffixes)
                name_2 = re_drop.sub('', re_drop.sub('', entity_text))
                entity_text = name_2
            text_tagLabels[id][0].append(entity_text)
        if entity_type == 'code':
            # drop codes whose left context matches known non-project-code patterns
            if not re_errorCode.search(str(code_left)):
                text_tagLabels[id][1].append(entity_text)
    train_data = []
    max_len = 400

    def hasNotBeenLabeled(items, code_begin, code):
        for i in range(code_begin, code_begin + len(code)):
            if items[i] != "O":
                return False
        return True

    count = 0
    for id, text in zip(id_list, text_list):
        count += 1
        print(count)
        names = text_tagLabels[id][0]
        names = list(set(names))
        names.sort(key=lambda x: len(x), reverse=True)
        codes = text_tagLabels[id][1]
        codes = list(set(codes))
        codes.sort(key=lambda x: len(x), reverse=True)
        sentences = text.split('。')
        for sentence in sentences:
            l = len(sentence)
            if l == 0:
                continue
            elif l > max_len:
                l = max_len
                sentence = sentence[:400]
            sentence_label = ['O'] * l
            code_find_flag = False
            name_find_flag = False
            if names:
                for name in names:
                    name_begins = findAllIndex(name, sentence)
                    for name_begin in name_begins:
                        if hasNotBeenLabeled(sentence_label, name_begin, name):
                            for j in range(name_begin, name_begin + len(name)):
                                if j == name_begin:
                                    sentence_label[j] = "PN_B"
                                elif j == name_begin + len(name) - 1:
                                    sentence_label[j] = "PN_E"
                                else:
                                    sentence_label[j] = "PN_M"
                            name_find_flag = True
            if codes:
                for code in codes:
                    code_begins = findAllIndex(code, sentence)
                    for code_begin in code_begins:
                        if hasNotBeenLabeled(sentence_label, code_begin, code):
                            for j in range(code_begin, code_begin + len(code)):
                                if j == code_begin:
                                    sentence_label[j] = "PC_B"
                                elif j == code_begin + len(code) - 1:
                                    sentence_label[j] = "PC_E"
                                else:
                                    sentence_label[j] = "PC_M"
                            code_find_flag = True
            if code_find_flag or name_find_flag:
                train_data.append([sentence, sentence_label])
            else:
                # keep only 75% of sentences that contain no entity as negative samples
                if np.random.random() <= 0.75:
                    train_data.append([sentence, sentence_label])
    print(len(train_data))
    save(train_data, 'train_data_new.pk')
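

# Illustrative sketch (not part of the original pipeline): the character-level tag scheme
# produced by data_process()/add_data_process(). The sentence and entities below are made up;
# only the PN_*/PC_* label layout mirrors the code above.
def _example_char_labels():
    sentence = "项目名称:绿化工程,编号:A12"
    name, code = "绿化工程", "A12"
    labels = ['O'] * len(sentence)
    n_b, c_b = sentence.find(name), sentence.find(code)
    labels[n_b:n_b + len(name)] = ["PN_B"] + ["PN_M"] * (len(name) - 2) + ["PN_E"]
    labels[c_b:c_b + len(code)] = ["PC_B"] + ["PC_M"] * (len(code) - 2) + ["PC_E"]
    return list(zip(sentence, labels))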


def add_data_process():
    def hasNotBeenLabeled(items, code_begin, code):
        for i in range(code_begin, code_begin + len(code)):
            if items[i] != "O":
                return False
        return True

    train_data = []
    max_len = 400
    data_path = "C:\\Users\\admin\\Desktop\\项目编号和名称\\补充数据\\data_"
    data_names = ["合同编号", "出让公告", "询价编号", "询价单编号", "出让成交公示",
                  "的通知", "公告编号", "交易编号", "询价单号", "房产英文类项目名称",
                  "挂牌编号", "申购单号", "订单编号", "询价书编号"]
    for data_name in data_names:
        data = pd.read_csv(data_path + data_name + "_process.csv", index_col=0, encoding='utf-8')
        count = 0
        for text, _name, _code in zip(data['text'], data['pj_name'], data['pj_code']):
            count += 1
            print(count)
            names = str(_name).split('+')
            names.sort(key=lambda x: len(x), reverse=True)
            codes = str(_code).split('+')
            codes.sort(key=lambda x: len(x), reverse=True)
            sentences = text.split('。')
            for sentence in sentences:
                l = len(sentence)
                if l == 0:
                    continue
                elif l > max_len:
                    l = max_len
                    sentence = sentence[:400]
                sentence_label = ['O'] * l
                if names:
                    for name in names:
                        name_begins = findAllIndex(name, sentence)
                        for name_begin in name_begins:
                            if hasNotBeenLabeled(sentence_label, name_begin, name):
                                for j in range(name_begin, name_begin + len(name)):
                                    if j == name_begin:
                                        sentence_label[j] = "PN_B"
                                    elif j == name_begin + len(name) - 1:
                                        sentence_label[j] = "PN_E"
                                    else:
                                        sentence_label[j] = "PN_M"
                if codes:
                    for code in codes:
                        code_begins = findAllIndex(code, sentence)
                        for code_begin in code_begins:
                            if hasNotBeenLabeled(sentence_label, code_begin, code):
                                for j in range(code_begin, code_begin + len(code)):
                                    if j == code_begin:
                                        sentence_label[j] = "PC_B"
                                    elif j == code_begin + len(code) - 1:
                                        sentence_label[j] = "PC_E"
                                    else:
                                        sentence_label[j] = "PC_M"
                train_data.append([sentence, sentence_label])
    d = load('train_data_new.pk')
    print(len(d))
    train_data = d + train_data
    print(len(train_data))
    print('ok')
    save(train_data, 'train_data_new2.pk')


def train2():
    chunk_tags = {
        'O': 0,
        'PN_B': 1, 'PN_M': 2, 'PN_E': 3,
        'PC_B': 4, 'PC_M': 5, 'PC_E': 6,
    }
    # pretrained character embedding matrix
    w2v_matrix = load('w2v_matrix.pk')
    # print(w2v_matrix[:3])
    vocab = load('codename_vocab.pk')
    word2index = dict((w, i) for i, w in enumerate(np.array(vocab)))
    print(vocab[:2])
    MAXLEN = 400
    data_x = []
    data_y = []
    data1 = load('train_data_new2.pk')
    for _data in data1:
        _x = list(_data[0])
        _x = [word2index.get(_, word2index.get('')) for _ in _x]
        _y = _data[1]
        data_x.append(_x)
        data_y.append(_y)
    # merge in the older labelled data
    old_datas = load("data/old_datas2.pk")
    for old_data in old_datas:
        data_x.append([word2index.get(word, word2index.get('')) for word in old_data[0]])
        data_y.append(old_data[1])
    print("数据量:", len(data_x))
    data_x = np.array([np.array(x) for x in data_x])
    x_len = [MAXLEN if len(x) > MAXLEN else len(x) for x in data_x]
    data_y = np.array([np.array([chunk_tags[_] for _ in y]) for y in data_y])
    data_x = pad_sequences(data_x, maxlen=MAXLEN, padding="post", truncating="post")
    data_y = pad_sequences(data_y, maxlen=MAXLEN, padding="post", truncating="post")
    indices = np.random.permutation(data_x.shape[0])
    count = len(data_x)
    test_count = int(0.2 * count)
    test_idx, train_idx = indices[:test_count], indices[test_count:]
    train_x, test_x = data_x[train_idx, :], data_x[test_idx, :]
    train_y, test_y = data_y[train_idx, :], data_y[test_idx, :]
    train_x_len = np.array([x_len[idx] for idx in train_idx])
    test_x_len = np.array([x_len[idx] for idx in test_idx])
    print("训练数据量:", len(train_x))
    print("测试数据量:", len(test_x))
    # save([test_x,test_y,test_x_len],'my_test_data.pk')
    with tf.Session(graph=tf.Graph()) as sess:
        char_input, logits, target, keepprob, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, embedding_weights=w2v_matrix)
        sess.run(tf.global_variables_initializer())
        epochs = 150
        saver = tf.train.Saver(max_to_keep=max(epochs, 10))
        batch_size = 1024
        _test_loss = 10000.
        _test_f1 = 0.
        for epoch in range(epochs):
            batch_nums = 0
            for x_batch, y_batch, x_len_batch in batch_iter(train_x, train_y, train_x_len, batch_size=batch_size):
                train_loss, _ = sess.run([crf_loss, train_op],
                                         feed_dict={char_input: x_batch, target: y_batch, length: x_len_batch, keepprob: 0.7})
                batch_nums += 1
                print("--epoch:" + str(epoch))
                print("--" + str(batch_nums) + "batch_train--", "loss:", train_loss)
            test_loss_sum = 0.
            test_sum = 0
            acc_sum = 0.
            precision_1 = 0
            precision_2 = 0
            recall_1 = 0
            recall_2 = 0
            for test_xbatch, test_ybatch, test_xlen in batch_iter(test_x, test_y, test_x_len, batch_size=batch_size):
                test_loss, _logits, _trans = sess.run([crf_loss, logits, trans],
                                                      feed_dict={char_input: test_xbatch, target: test_ybatch, length: test_xlen, keepprob: 1.0})
                acc, _precision, _recall = getAcc(test_ybatch, _logits, _trans, test_xlen)
                batch_len = len(test_xbatch)
                test_sum += batch_len
                acc_sum += acc * batch_len
                precision_1 += _precision[0]
                precision_2 += _precision[1]
                recall_1 += _recall[0]
                recall_2 += _recall[1]
                test_loss_sum += test_loss * batch_len
            print("==>epoch:" + str(epoch) + "have_done")
            epoch_test_loss = test_loss_sum / test_sum
            epoch_test_acc = acc_sum / test_sum
            test_precision = precision_1 / precision_2
            test_recall = recall_1 / recall_2
            test_f1 = ner_f1_score(test_precision, test_recall)
            print("--test --", " acc:", epoch_test_acc, 'test_loss:', epoch_test_loss)
            print('test_precision:', test_precision, 'test_recall', test_recall, 'test_f1', test_f1)
            # if test_f1 > _test_f1:
            #     _test_f1 = test_f1
            print("Saving-" + str(epoch) + "-model,test_loss:" + str(epoch_test_loss), 'test_f1', test_f1)
            saver.save(sess, "models_tf/" + str(epoch) + "-L" + str(epoch_test_loss) + "-F" + str(test_f1) +
                       "-P" + str(test_precision) + "-R" + str(test_recall) + "/model.ckpt")
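

# Illustrative sketch (not part of the original pipeline): train2() pads label sequences
# with 0, which coincides with the 'O' tag id; padded steps are ignored by the CRF loss
# through the `length` feed. The toy sequences below are made up.
def _example_padding():
    y = [[1, 2, 3], [4, 6]]                                            # PN_B PN_M PN_E / PC_B PC_E
    padded = pad_sequences(y, maxlen=5, padding="post", truncating="post")
    lengths = [len(seq) for seq in y]                                   # fed to the `length` placeholder
    return padded, lengths                                              # [[1 2 3 0 0], [4 6 0 0 0]], [3, 2]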


def BiLSTM_CRF_tfmodel(sess, embedding_weights):
    '''
    :param embedding_weights: pretrained character embedding matrix
    '''
    BiRNN_Unit = 100
    chunk_tags = {
        'O': 0,
        'PN_B': 1, 'PN_M': 2, 'PN_E': 3,
        'PC_B': 4, 'PC_M': 5, 'PC_E': 6,
    }

    def embedding_layer(input, keepprob):
        # load the pretrained character embedding matrix and look up the inputs
        embedding = tf.get_variable(name="embedding",
                                    initializer=np.array(embedding_weights, dtype=np.float32), dtype=tf.float32)
        embedding = tf.nn.embedding_lookup(params=embedding, ids=input)
        embedding_drop = tf.nn.dropout(embedding, keepprob)
        return embedding_drop

    def BiLSTM_Layer(input, length):
        with tf.variable_scope("BiLSTM"):
            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Unit, state_is_tuple=True)
            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Unit, state_is_tuple=True)
            output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell, backward_cell, input,
                                                        dtype=tf.float32, sequence_length=length)
        output = tf.concat(output, 2)
        return output

    def CRF_layer(input, num_tags, BiRNN_Unit, time_step, keepprob):
        with tf.variable_scope("CRF"):
            with tf.variable_scope("hidden"):
                w_hidden = tf.get_variable(name='w_hidden', shape=(BiRNN_Unit * 2, BiRNN_Unit), dtype=tf.float32,
                                           initializer=initializers.xavier_initializer(),
                                           regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_hidden = tf.get_variable(name='b_hidden', shape=(BiRNN_Unit), dtype=tf.float32,
                                           initializer=tf.zeros_initializer())
                # print(input)
                input_reshape = tf.reshape(input, shape=(-1, BiRNN_Unit * 2))
                hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape, w_hidden, b_hidden))
                hidden = tf.nn.dropout(hidden, keepprob)
            with tf.variable_scope("output"):
                w_output = tf.get_variable(name='w_output', shape=(BiRNN_Unit, num_tags), dtype=tf.float32,
                                           initializer=initializers.xavier_initializer(),
                                           regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_output = tf.get_variable(name='b_output', shape=(num_tags), dtype=tf.float32,
                                           initializer=tf.zeros_initializer())
                pred = tf.nn.xw_plus_b(hidden, w_output, b_output)
                logits_ = tf.reshape(pred, shape=(-1, time_step, num_tags), name='logits')
                return logits_

    def layer_loss(input, true_target, num_tags, length):
        with tf.variable_scope("crf_loss"):
            trans = tf.get_variable(name='transitons', shape=(num_tags, num_tags), dtype=tf.float32,
                                    initializer=initializers.xavier_initializer())
            log_likelihood, trans = crf_log_likelihood(inputs=input, tag_indices=true_target,
                                                       transition_params=trans, sequence_lengths=length)
            return tf.reduce_mean(-log_likelihood), trans

    with sess.graph.as_default():
        char_input = tf.placeholder(name='char_input', shape=(None, None), dtype=tf.int32)
        target = tf.placeholder(name='target', shape=(None, None), dtype=tf.int32)
        length = tf.placeholder(name='length', shape=(None,), dtype=tf.int32)
        keepprob = tf.placeholder(name='keepprob', dtype=tf.float32)
        _embedding = embedding_layer(char_input, keepprob)
        _shape = tf.shape(char_input)
        batch_size = _shape[0]
        step_size = _shape[-1]
        bilstm = BiLSTM_Layer(_embedding, length)
        _logits = CRF_layer(bilstm, num_tags=len(chunk_tags), BiRNN_Unit=BiRNN_Unit, time_step=step_size, keepprob=keepprob)
        crf_loss, trans = layer_loss(_logits, true_target=target, num_tags=len(chunk_tags), length=length)
        global_step = tf.Variable(0, trainable=False)
        with tf.variable_scope("optimizer"):
            opt = tf.train.AdamOptimizer(0.002)
            grads_vars = opt.compute_gradients(crf_loss)
            capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g, v in grads_vars]
            train_op = opt.apply_gradients(capped_grads_vars, global_step)
        return char_input, _logits, target, keepprob, length, crf_loss, trans, train_op


def batch_iter(x, y, x_len, batch_size=256):
    '''
    :param x: content2id
    :param y: label2id
    :param batch_size: number of sentences per training batch
    :return:
    '''
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1  # number of batches per epoch
    indices = np.random.permutation(data_len)  # shuffle the sample order
    x = x[indices]
    y = y[indices]
    x_len = x_len[indices]
    for i in range(num_batch):
        start_id = batch_size * i
        end_id = min(batch_size * (i + 1), data_len)
        yield x[start_id:end_id], y[start_id:end_id], x_len[start_id:end_id]
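

# Illustrative sketch (not part of the original pipeline): minimal use of batch_iter on toy
# numpy arrays, just to show the shapes it yields. All values are made up.
def _example_batch_iter():
    x = np.zeros((10, 400), dtype=np.int32)
    y = np.zeros((10, 400), dtype=np.int32)
    x_len = np.full((10,), 400, dtype=np.int32)
    shapes = [(xb.shape, yb.shape, lb.shape) for xb, yb, lb in batch_iter(x, y, x_len, batch_size=4)]
    return shapes  # [((4, 400), (4, 400), (4,)), ((4, 400), (4, 400), (4,)), ((2, 400), (2, 400), (2,))]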


from sklearn.metrics import accuracy_score


def getAcc(y_batch, logits, trans, lengths):
    index = 0
    small = -1000.0
    preds = []
    true_tags = []
    for score, length in zip(logits, lengths):
        score = score[:length]
        path, _ = viterbi_decode(score, trans)
        preds += path[0:]
        index += 1
    for y, length in zip(y_batch, lengths):
        y = y.tolist()
        true_tags += y[:length]
    _preds = list(preds)
    _true_tags = list(true_tags)
    acc = accuracy_score(np.reshape(true_tags, (-1)), np.reshape(preds, (-1)))
    precision_1, precision_2, _ = ner_precision(_preds, _true_tags)
    recall_1, recall_2, _ = ner_recall(_preds, _true_tags)
    return acc, [precision_1, precision_2], [recall_1, recall_2]


def decode(logits, trans, sequence_lengths, tag_num):
    viterbi_sequences = []
    for logit, length in zip(logits, sequence_lengths):
        score = logit[:length]
        viterbi_seq, viterbi_score = viterbi_decode(score, trans)
        viterbi_sequences.append(viterbi_seq)
    return viterbi_sequences


def new_process():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\项目编号和名称\\data_询价书编号.csv", index_col=0, encoding='utf-8')
    text_list = []
    for id, text in zip(data['id'], data['text']):
        # id_list.append(id)
        text_list.append(text)
    page_content = get_article1(text_list)
    data['text'] = page_content
    data.to_csv('C:\\Users\\admin\\Desktop\\项目编号和名称\\data_询价书编号_process.csv')


def new_test_code():
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\code_test_process2.csv", index_col=0)
    sentences_list = []
    for text in data['text']:
        sentences = text.split("。")
        sentences_list.append(sentences)
    model_path = "models_tf/27-0.984184712668-0.598231307426/model.ckpt"
    name_list, code_list = predict_CodeName(sentences_list, model_path)
    data['code'] = code_list
    data['name'] = name_list
    data.to_csv("C:\\Users\\admin\\Desktop\\code_test结果2-3.csv")


def predict_CodeName(articles, model_path):
    w2v_matrix = load('w2v_matrix.pk')
    vocab = load('codename_vocab.pk')
    word2index = dict((w, i) for i, w in enumerate(np.array(vocab)))
    model_path = model_path
    sess = tf.Session(graph=tf.Graph())
    with sess:
        char_input, logits, target, keepprob, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, model_path)
        # decoded tag-id patterns: name = PN_B PN_M* PN_E -> "12*3", code = PC_B PC_M* PC_E -> "45*6"
        re_name = re.compile("12*3")
        re_code = re.compile("45*6")
        article_name_list = []
        article_code_list = []
        count = 0
        for sentences in articles:
            if len(sentences) > 500:
                sentences = sentences[:500]
            # print(len(sentences))
            count += 1
            print(count)
            sentence_len = [min(len(sentence), 2000) for sentence in sentences]
            maxlen = max(sentence_len)
            sentences_x = []
            for sentence in sentences:
                sentence = list(sentence)
                sentence2id = [word2index.get(word, word2index.get('')) for word in sentence]
                sentences_x.append(sentence2id)
            sentences_x = pad_sequences(sentences_x, maxlen=maxlen, padding="post", truncating="post")
            sentences_x = [np.array(x) for x in sentences_x]
            _logits, _trans = sess.run([logits, trans],
                                       feed_dict={char_input: np.array(sentences_x), length: sentence_len, keepprob: 1.0})
            viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=7)
            # print("==",_logits)
            name_list = []
            code_list = []
            sentence_index = 0
            for _seq, sentence in zip(viterbi_sequence, sentences):
                seq_id = ''.join([str(s) for s in _seq])
                if re_name.search(seq_id):
                    for _name in re_name.finditer(seq_id):
                        start = _name.start()
                        end = _name.end()
                        n = sentence[start:end]
                        name_list.append((n, start + sentence_index, end + sentence_index))
                if re_code.search(seq_id):
                    for _code in re_code.finditer(seq_id):
                        start = _code.start()
                        end = _code.end()
                        c = sentence[start:end]
                        # print(n,'<==>',start,end)
                        code_list.append((c, start + sentence_index, end + sentence_index))
                sentence_index += len(sentence)
            article_name_list.append(name_list)
            article_code_list.append(code_list)
    return article_name_list, article_code_list
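

# Illustrative sketch (not part of the original pipeline): how predict_CodeName() turns a
# decoded tag-id sequence into entity spans via the "12*3"/"45*6" regexes. The sentence and
# tag sequence below are made up.
def _example_decode_tag_sequence():
    sentence = "编号A12项目绿化工程"
    tag_ids = [0, 0, 4, 5, 6, 0, 0, 1, 2, 2, 3]   # O O PC_B PC_M PC_E O O PN_B PN_M PN_M PN_E
    seq_id = ''.join(str(t) for t in tag_ids)
    code = [sentence[m.start():m.end()] for m in re.finditer("45*6", seq_id)]
    name = [sentence[m.start():m.end()] for m in re.finditer("12*3", seq_id)]
    return code, name   # (['A12'], ['绿化工程'])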


from BiddingKG.dl.interface.Preprocessing import *


# raw announcement html processing
def get_article1(articles, cost_time=dict(), useselffool=True):
    '''
    :param articles: raw article source html to be processed
    :param useselffool: whether to use selffool
    :return: list_articles
    '''
    list_articles = []
    for article in articles:
        a_time = time.time()
        sourceContent = article
        # table processing
        key_preprocess = "tableToText"
        start_time = time.time()
        article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))
        # log(article_processed)
        if key_preprocess not in cost_time:
            cost_time[key_preprocess] = 0
        cost_time[key_preprocess] += round(time.time() - start_time, 2)
        # article_processed = article[1]
        list_articles.append(article_processed)
        print(time.time() - a_time)
    return list_articles


# sentence splitting
def get_sentences1(list_articles, useselffool=True, cost_time=dict()):
    '''
    :param list_articles: preprocessed article text
    :return: list_sentences
    '''
    list_sentences = []
    for article in list_articles:
        a_time = time.time()
        list_sentences_temp = []
        # table processing
        key_preprocess = "tableToText"
        start_time = time.time()
        article_processed = article
        if key_preprocess not in cost_time:
            cost_time[key_preprocess] = 0
        cost_time[key_preprocess] += round(time.time() - start_time, 2)
        # nlp processing
        if article_processed is not None and len(article_processed) != 0:
            split_patten = "。"
            sentences = []
            _begin = 0
            sentences_set = set()
            for _iter in re.finditer(split_patten, article_processed):
                _sen = article_processed[_begin:_iter.span()[1]]
                if len(_sen) > 0 and _sen not in sentences_set:
                    sentences.append(_sen)
                    sentences_set.add(_sen)
                _begin = _iter.span()[1]
            _sen = article_processed[_begin:]
            if len(_sen) > 0 and _sen not in sentences_set:
                sentences.append(_sen)
                sentences_set.add(_sen)
            '''
            tokens_all = fool.cut(sentences)
            #pos_all = fool.LEXICAL_ANALYSER.pos(tokens_all)
            #ner_tag_all = fool.LEXICAL_ANALYSER.ner_labels(sentences,tokens_all)
            ner_entitys_all = fool.ner(sentences)
            '''
            # rate-limited execution
            key_nerToken = "nerToken"
            start_time = time.time()
            # tokens_all = getTokens(sentences,useselffool=useselffool)
            if key_nerToken not in cost_time:
                cost_time[key_nerToken] = 0
            cost_time[key_nerToken] += time.time() - start_time
            for sentence_index in range(len(sentences)):
                sentence_text = sentences[sentence_index]
                list_sentences_temp.append(sentence_text)
            if len(list_sentences_temp) == 0:
                list_sentences_temp.append(sentence_text)
        list_sentences.append(list_sentences_temp)
        print('2:', time.time() - a_time)
    return list_sentences


def _find_tag(labels, B_label, M_label, E_label):
    # collect (begin, end) spans of one B/M/E tag family from a label sequence
    result = []
    ner_begin = 0
    ner_end = 0
    for num in range(len(labels)):
        if labels[num] == B_label:
            ner_begin = num
            continue
        if labels[num] == M_label and labels[num - 1] == B_label:
            continue
        if labels[num] == M_label and labels[num - 1] == M_label:
            continue
        if labels[num] == E_label:
            if labels[num - 1] == M_label or labels[num - 1] == B_label:
                ner_end = num + 1
                result.append((ner_begin, ner_end))
                ner_begin = 0
                ner_end = 0
    return result


def find_all_tag(labels):
    # tags = [("PN_B","PN_M","PN_E"),("PC_B","PC_M","PC_E")]
    tags = [(1, 2, 3), (4, 5, 6)]
    result = []
    for tag in tags:
        res = _find_tag(labels, B_label=tag[0], M_label=tag[1], E_label=tag[2])
        result.append(res)
    return result


def ner_precision(pre_labels, true_labels):
    '''
    :param pre_labels: list of predicted tag ids
    :param true_labels: list of gold tag ids
    :return:
    '''
    pre = []
    pre_result = find_all_tag(pre_labels)
    for item in pre_result:
        for _item in item:
            if pre_labels[_item[0]:_item[1]] == true_labels[_item[0]:_item[1]]:
                pre.append(1)
            else:
                pre.append(0)
    _sum = sum(pre)
    _l = len(pre)
    if not _l:
        _l = 0.0001
    return _sum, _l, _sum / _l


def ner_recall(pre_labels, true_labels):
    '''
    :param pre_labels: list of predicted tag ids
    :param true_labels: list of gold tag ids
    :return:
    '''
    recall = []
    true_result = find_all_tag(true_labels)
    for item in true_result:
        for _item in item:
            if pre_labels[_item[0]:_item[1]] == true_labels[_item[0]:_item[1]]:
                recall.append(1)
            else:
                recall.append(0)
    _sum = sum(recall)
    _l = len(recall)
    if not _l:
        _l = 0.0001
    return _sum, _l, _sum / _l


def ner_f1_score(precision, recall):
    _temp = precision + recall
    if not _temp:
        _temp = 0.0001
    return (2 * precision * recall) / _temp


def old_data_update():
    data = load('data/old_datas.pk')
    # print(len(data))
    re_code = re.compile("(?:(?:公告|合同)[^,,。:;]{,3}编号[::]*|寻源单据?号|计划[编文]?号|交易编[号码]|询价单编?[码号]|采购项目编号)([\-\d\w\(\)\(\)\[\]\【\】号]{3,})", re.A)
    index = 0
    updat_list = []
    for d in data:
        sentence = ''.join(d[0])
        label = d[1]
        if re_code.search(sentence):
            for item in re_code.finditer(sentence):
                begin, end = item.span()
                # print(sentence[max(0,begin-8):end])
                # print(sentence[begin:end])
                la = label[begin:end]
                if 'PC_B' not in la:
                    updat_list.append(index)
        index += 1
    updat_list = list(set(updat_list))
    print(len(updat_list))
    for u in updat_list:
        item = data[u]
        sentence = ''.join(item[0])
        label = item[1]
        re_res = re_code.findall(sentence)
        for res in re_res:
            begin = findAllIndex(res, sentence)
            for b in begin:
                e = b + len(res)
                label[b] = 'PC_B'
                label[e - 1] = 'PC_E'
                for i in range(b + 1, e - 1):
                    label[i] = 'PC_M'
        data[u] = (item[0], label)
        # print(sentence)
        # print('---')
        # print(label)
    save(data, 'data/old_datas2.pk')
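

# Illustrative sketch (not part of the original pipeline): a tiny worked example of the
# entity-level precision/recall/F1 computed by ner_precision/ner_recall/ner_f1_score.
# The tag-id sequences below are made up.
def _example_ner_metrics():
    true_labels = [0, 1, 2, 3, 0, 4, 6, 0]   # one name span, one code span
    pre_labels = [0, 1, 2, 3, 0, 0, 0, 0]    # name found, code missed
    p = ner_precision(pre_labels, true_labels)[2]   # 1/1 = 1.0
    r = ner_recall(pre_labels, true_labels)[2]      # 1/2 = 0.5
    return p, r, ner_f1_score(p, r)                 # (1.0, 0.5, 0.666...)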


def get_word_matrix():
    # load the pretrained character vectors
    vocab_model = getModel_word()
    _, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
    # drop the first all-zero row
    w2v_matrix = w2v_matrix[1:]
    # prepend a zero padding row and a random unknown-token row
    pad_0 = np.zeros((1, w2v_matrix.shape[1]), dtype=float)
    unk_1 = np.random.normal(-0.25, 0.25, (1, w2v_matrix.shape[1]))
    w2v_matrix = np.concatenate((pad_0, unk_1, w2v_matrix), axis=0)
    print(w2v_matrix[:3])
    save(w2v_matrix, "w2v_matrix.pk")


if __name__ == '__main__':
    # get_data()
    # data_process()
    # add_data_process()
    # train2()
    # test2()
    # new_test()
    # new_process()
    # new_test_code()
    # get_word_matrix()
    # old_data_update()
    # model_path = "models_tf/76-L0.472526232355-F0.8848208266348597-P0.8845455959355073-R0.8850962286662862/model.ckpt"
    model_path = "models_tf/59-L0.471516189943-F0.8802154826344823-P0.8789179683459191-R0.8815168335321886/model.ckpt"
    text = '''[X2002185]2020年11月麻城市生活垃圾焚烧发电项目厂前区零星计划 '''
    name_list, code_list = predict_CodeName([text.split('。')], model_path)
    print(name_list)
    print(code_list)
    pass