#encoding:utf-8
import codecs
import json
import psycopg2
from collections import Counter
from keras.preprocessing.sequence import pad_sequences
import numpy
import re

projectName_pattern_str = "工程|项目|系统"


def importProjectData():
    '''
    @summary: import the project codes and names produced by the original algorithm
    '''
    conn = psycopg2.connect(dbname="BiddingKG", user="postgres", password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    file = "C:\\Users\\User\\Desktop\\bxkc.zhongbiao_extraction.json"
    with codecs.open(file, "r", encoding="utf8") as f:
        data = f.read().strip()
    # the export is a stream of JSON objects; add commas after each "}" and wrap in []
    # so the whole file parses as one JSON array (assumes no nested "}" inside objects)
    data = "[" + data.replace("}", "},")[:-1] + "]"
    for item in json.loads(data):
        if len(item.keys()) != 3:
            continue
        doc_id = item["document_id"]
        projectCode = item["project_code"]
        projectName = item["project_name"]
        sql = " insert into project_compare(doc_id,projectcode,projectname) values('" + doc_id + "','" + projectCode + "','" + projectName + "')"
        print(sql)
        cursor.execute(sql)
    conn.commit()
    conn.close()
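
# A minimal sketch (not called anywhere in this module) of the same import done with
# psycopg2 parameter binding instead of string concatenation, so quotes inside the
# extracted values cannot break the SQL. Table and column names are taken from
# importProjectData above; the function name itself is hypothetical.
def importProjectDataParameterized(json_path="C:\\Users\\User\\Desktop\\bxkc.zhongbiao_extraction.json"):
    conn = psycopg2.connect(dbname="BiddingKG", user="postgres", password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    with codecs.open(json_path, "r", encoding="utf8") as f:
        data = f.read().strip()
    data = "[" + data.replace("}", "},")[:-1] + "]"
    insert_sql = "insert into project_compare(doc_id,projectcode,projectname) values(%s,%s,%s)"
    for item in json.loads(data):
        if len(item.keys()) != 3:
            continue
        # psycopg2 fills in and escapes the %s placeholders itself
        cursor.execute(insert_sql, (item["document_id"], item["project_code"], item["project_name"]))
    conn.commit()
    conn.close()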

def generateDatas(MAX_LEN, min_freq=1, vocab_set=None):
    '''
    @summary: query the labeled data from the database and build numericized train/test data
    @param:
        MAX_LEN: maximum sentence length
        min_freq: minimum character frequency
        vocab_set: vocabulary of the pretrained character vectors; if None, build one from the labeled data
    @return:
        train_process: numericized training data
        test_process: numericized test data
        vocab: vocabulary
        chunk_tags: tag list
    '''
    conn = psycopg2.connect(dbname="BiddingKG", user="postgres", password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    sql = " select A.content,B.projectcode,B.projectname,B.doc_id from articles_processed A,project_relabel B where A.id=B.doc_id "
    order_sql = " order by B.doc_id "
    train_sql = " and B.doc_id not in (select doc_id from project order by doc_id limit 300) "
    test_sql = " and B.doc_id in (select doc_id from project order by doc_id limit 300) "

    def hasNotBeenLabeled(items, code_begin, code):
        # True if none of the characters covered by the entity has been tagged yet
        for i in range(code_begin, code_begin + len(code)):
            if items[i][1] != "O":
                return False
        return True

    def findAllIndex(substr, wholestr):
        # return the start index of every non-overlapping occurrence of substr in wholestr
        copystr = wholestr
        result = []
        indexappend = 0
        while True:
            index = copystr.find(substr)
            if index < 0:
                break
            result.append(indexappend + index)
            indexappend += index + len(substr)
            copystr = copystr[index + len(substr):]
        return result

    def replaceCharAndNumber(data):
        # normalize letters to "C" and digits to "N"
        char_replace = re.compile("[a-zA-Z]")
        number_replace = re.compile("[0-9]")
        for i in range(len(data)):
            for j in range(len(data[i])):
                data[i][j][0] = re.sub(number_replace, "N", re.sub(char_replace, "C", data[i][j][0]))

    # training split
    print(sql + train_sql + order_sql)
    cursor.execute(sql + train_sql + order_sql)
    rows = cursor.fetchall()
    train = []
    for row in rows:
        code = row[1] if row[1] != "" else ""
        name = row[2] if row[2] != "" else ""
        codes = re.split("[;;]", code)
        codes.sort(key=lambda x: len(x), reverse=True)
        names = re.split("[;;]", name)
        names.sort(key=lambda x: len(x), reverse=True)
        contents = re.split("。", str(row[0]))
        for content in contents:
            out_of_len = False
            double_flag = False
            data_item = [[ch, "O"] for ch in content]
            code_find_flag = False
            name_find_flag = False
            for code in codes:
                if len(code) == 0:
                    continue
                code_begins = findAllIndex(code, content)
                for code_begin in code_begins:
                    if code_begin + len(code) > MAX_LEN:
                        # the code would be cut off when the sentence is truncated to MAX_LEN
                        out_of_len = True
                    if code_begin > 5:
                        double_begin = code_begin - 5
                    else:
                        double_begin = 0
                    if len(re.findall("编号|名称", content[double_begin:code_begin])) > 0:
                        double_flag = True
                    code_find_flag = True
                    if len(code) == 1:
                        if hasNotBeenLabeled(data_item, code_begin, code):
                            data_item[code_begin][1] = "PC_S"
                    else:
                        if hasNotBeenLabeled(data_item, code_begin, code):
                            for j in range(code_begin, code_begin + len(code)):
                                if j == code_begin:
                                    data_item[j][1] = "PC_B"
                                elif j == code_begin + len(code) - 1:
                                    data_item[j][1] = "PC_E"
                                else:
                                    data_item[j][1] = "PC_M"
            for name in names:
                if len(name) == 0:
                    continue
                name_begins = findAllIndex(name, content)
                for name_begin in name_begins:
                    if name_begin + len(name) > MAX_LEN:
                        out_of_len = True
                    if name_begin > 5:
                        double_begin = name_begin - 5
                    else:
                        double_begin = 0
                    if len(re.findall("编号|名称", content[double_begin:name_begin])) > 0:
                        double_flag = True
                    name_find_flag = True
                    if len(name) == 1:
                        if hasNotBeenLabeled(data_item, name_begin, name):
                            data_item[name_begin][1] = "PN_S"
                    else:
                        if hasNotBeenLabeled(data_item, name_begin, name):
                            for j in range(name_begin, name_begin + len(name)):
                                if j == name_begin:
                                    data_item[j][1] = "PN_B"
                                elif j == name_begin + len(name) - 1:
                                    data_item[j][1] = "PN_E"
                                else:
                                    data_item[j][1] = "PN_M"
            if code_find_flag or name_find_flag:
                if not out_of_len:
                    if double_flag:
                        # sentences preceded by a "编号/名称" cue are added twice
                        train.append(data_item)
                    train.append(data_item)
            else:
                # keep 5% of the sentences without any code/name as negative samples
                if numpy.random.random() <= 0.05:
                    train.append(data_item)

    # test split
    print(sql + test_sql + order_sql)
    cursor.execute(sql + test_sql + order_sql)
    rows = cursor.fetchall()
    test = []
    list_docid_content = []
    for row in rows:
        code = row[1] if row[1] != "" else ""
        name = row[2] if row[2] != "" else ""
        codes = re.split("[;;]", code)
        codes.sort(key=lambda x: len(x), reverse=True)
        names = re.split("[;;]", name)
        names.sort(key=lambda x: len(x), reverse=True)
        contents = re.split("。", str(row[0]))
        for content in contents:
            data_item = [[ch, "O"] for ch in content]
            code_find_flag = False
            name_find_flag = False
            for code in codes:
                if len(code) == 0:
                    continue
                code_begins = findAllIndex(code, content)
                for code_begin in code_begins:
                    code_find_flag = True
                    if len(code) == 1:
                        if hasNotBeenLabeled(data_item, code_begin, code):
                            data_item[code_begin][1] = "PC_S"
                    else:
                        if hasNotBeenLabeled(data_item, code_begin, code):
                            for j in range(code_begin, code_begin + len(code)):
                                if j == code_begin:
                                    data_item[j][1] = "PC_B"
                                elif j == code_begin + len(code) - 1:
                                    data_item[j][1] = "PC_E"
                                else:
                                    data_item[j][1] = "PC_M"
            for name in names:
                if len(name) == 0:
                    continue
                name_begins = findAllIndex(name, content)
                for name_begin in name_begins:
                    name_find_flag = True
                    if len(name) == 1:
                        if hasNotBeenLabeled(data_item, name_begin, name):
                            data_item[name_begin][1] = "PN_S"
                    else:
                        if hasNotBeenLabeled(data_item, name_begin, name):
                            for j in range(name_begin, name_begin + len(name)):
                                if j == name_begin:
                                    data_item[j][1] = "PN_B"
                                elif j == name_begin + len(name) - 1:
                                    data_item[j][1] = "PN_E"
                                else:
                                    data_item[j][1] = "PN_M"
            if code_find_flag or name_find_flag:
                test.append(data_item)
                list_docid_content.append([row[3], content])
            else:
                if numpy.random.random() <= 0.05:
                    test.append(data_item)
                    list_docid_content.append([row[3], content])
    conn.close()

    #replaceCharAndNumber(train)
    #replaceCharAndNumber(test)
    word_counts = Counter(ch[0] for sample in train for ch in sample)
    chunk_tags = sorted(list(set(ch[1] for sample in train + test for ch in sample)))  # in alphabetic order
    if vocab_set is not None:
        vocab = vocab_set
    else:
        vocab = ['<pad>', '<unk>'] + [w for w, f in iter(word_counts.items()) if f >= min_freq]

    with codecs.open("viewTest.txt", "w", encoding="utf8") as f:
        for t in test:
            for h in t:
                f.write(str(h[0]) + " " + str(h[1]))
                f.write("\n")
            f.write("$#\n")
        f.flush()

    with codecs.open("viewTrain.txt", "w", encoding="utf8") as f:
        for t in train:
            for h in t:
                f.write(str(h[0]) + " " + str(h[1]))
                f.write("\n")
            f.write("$#\n")
        f.flush()

    with codecs.open("docid_content.txt", "w", encoding="utf8") as f:
        for t in list_docid_content:
            f.write(str(t[0]) + " " + t[1])
            f.write("\n")
        f.flush()

    train_process = _process_data(train, vocab, chunk_tags, MAX_LEN)
    test_process = _process_data(test, vocab, chunk_tags, MAX_LEN)
    #return (train[0][:-200],train[1][:-200]),(train[0][-200:],train[1][-200:]),(vocab, chunk_tags)
    return train_process, test_process, (vocab, chunk_tags), test
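
# The labeling in generateDatas follows a character-level BMES-style scheme: project
# codes are tagged PC_B/PC_M/PC_E (PC_S for a single character), project names
# PN_B/PN_M/PN_E, and every other character stays "O". The small standalone sketch
# below (a hypothetical helper, not used by generateDatas) shows the tags produced
# for one toy sentence, assuming the same scheme as the loops above.
def _demo_tag_entity(content, entity, prefix):
    tags = ["O"] * len(content)
    begin = content.find(entity)
    if begin >= 0:
        if len(entity) == 1:
            tags[begin] = prefix + "_S"
        else:
            tags[begin] = prefix + "_B"
            tags[begin + len(entity) - 1] = prefix + "_E"
            for j in range(begin + 1, begin + len(entity) - 1):
                tags[j] = prefix + "_M"
    return list(zip(list(content), tags))

# Example: _demo_tag_entity("项目编号:ABC123", "ABC123", "PC")
# -> [('项', 'O'), ('目', 'O'), ('编', 'O'), ('号', 'O'), (':', 'O'),
#     ('A', 'PC_B'), ('B', 'PC_M'), ('C', 'PC_M'), ('1', 'PC_M'), ('2', 'PC_M'), ('3', 'PC_E')]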
codecs.open("docid_content.txt","w",encoding="utf8") as f: for t in list_docid_content: f.write(t[0]+" "+t[1]) f.write("\n") f.flush() train_process = _process_data(train, vocab, chunk_tags,MAX_LEN) test_process = _process_data(test, vocab, chunk_tags,MAX_LEN) #return (train[0][:-200],train[1][:-200]),(train[0][-200:],train[1][-200:]),(vocab, chunk_tags) return train_process, test_process, (vocab, chunk_tags),test def _process_data(data, vocab,chunk_tags, maxlen): ''' @summary:处理数据 @param: data:句子list vocab:字典 chunk_tags:标签list maxlen:限定最大的句子长度,若不设置则为最长的句子的长度 @return: x:经过补全和数值化的数据 y_chunk:对应的标签 ''' if maxlen is None: maxlen = max(len(s) for s in data) word2idx = dict((w, i) for i, w in enumerate(numpy.array(vocab))) print(len(vocab)) print(vocab.index("")) index_unk = word2idx.get("") index_pad = word2idx.get("") print("unk",index_unk,"pad",index_pad) x = [[word2idx.get(w[0], index_unk) for w in s] for s in data] # set to (index 1) if not in vocab x_len = [maxlen if len(s)>maxlen else len(s) for s in data] y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data] x = pad_sequences(x, maxlen,value=index_pad,padding='post',truncating='post') # left padding y_chunk = pad_sequences(y_chunk, maxlen, value=0,padding='post',truncating='post') # y_chunk = numpy.expand_dims(y_chunk, 2) return x, y_chunk,x_len if __name__=="__main__": ''' importProjectData() ''' # (train_x, train_y), (test_x, test_y), (vocab,class_labels),test = generateDatas(MAX_LEN=300) # print(len(train_x)) # print(len(test_x)) # print(class_labels)