#encoding:utf-8
import codecs
import json
import re
from collections import Counter

import numpy
import psycopg2
from keras.preprocessing.sequence import pad_sequences

# Alternation of cue suffixes for project names ("工程|项目|系统").
projectName_pattern_str="工程|项目|系统"
def importProjectData():
    '''
    @summary: Import (doc_id, project code, project name) records produced by
        the original extraction algorithm from a JSON dump into the
        project_compare table.
    '''
    conn = psycopg2.connect(dbname="BiddingKG", user="postgres",
                            password="postgres", host="192.168.2.101")
    try:
        cursor = conn.cursor()
        file = "C:\\Users\\User\\Desktop\\bxkc.zhongbiao_extraction.json"
        with codecs.open(file, "r", encoding="utf8") as f:
            data = f.read().strip()
        # The dump is a stream of bare JSON objects ("{...}{...}"); insert
        # commas and wrap in brackets to make it a parseable JSON array.
        data = "[" + data.replace("}", "},")[:-1] + "]"
        # Parameterized query: avoids SQL injection and broken quoting when
        # codes/names contain apostrophes.
        sql = (" insert into project_compare(doc_id,projectcode,projectname)"
               " values(%s,%s,%s)")
        for item in json.loads(data):
            # Skip malformed records that do not have exactly the three
            # expected keys (document_id, project_code, project_name).
            if len(item.keys()) != 3:
                continue
            cursor.execute(sql, (item["document_id"],
                                 item["project_code"],
                                 item["project_name"]))
        # Single commit for the whole batch instead of one per row.
        conn.commit()
    finally:
        # Release the connection even if parsing or an insert fails.
        conn.close()
-
def generateDatas(MAX_LEN,min_freq=1,vocab_set=None):
    '''
    @summary: Query labeled articles from the database and numericalize them
        into train/test datasets for the project code/name tagger.
    @param:
        MAX_LEN: maximum sentence length
        min_freq: minimum character frequency kept in the vocabulary
        vocab_set: pre-trained character vocabulary; if None, one is built
            from the training data
    @return:
        train_process: numericalized training data
        test_process: numericalized test data
        (vocab, chunk_tags): vocabulary and tag list
        test: the raw (pre-numericalization) test sentences
    '''
    conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
    cursor = conn.cursor()

    # Articles joined with their (relabeled) project code/name annotations.
    sql = " select A.content,B.projectcode,B.projectname,B.doc_id from articles_processed A,project_relabel B where A.id=B.doc_id "

    order_sql = " order by B.doc_id "
    # The first 300 doc_ids (by id order) form the test split; the rest train.
    train_sql = " and B.doc_id not in (select doc_id from project order by doc_id limit 300) "
    test_sql = " and B.doc_id in (select doc_id from project order by doc_id limit 300) "


    def hasNotBeenLabeled(items,code_begin,code):
        # True if every char in [code_begin, code_begin+len(code)) is still
        # tagged "O", i.e. the span is free to be labeled.
        for i in range(code_begin,code_begin+len(code)):
            if items[i][1]!="O":
                return False
        return True


    def findAllIndex(substr,wholestr):
        # Start indices of every non-overlapping occurrence of substr.
        copystr = wholestr
        result = []
        indexappend = 0
        while(True):
            index = copystr.find(substr)
            if index<0:
                break
            else:
                result.append(indexappend+index)
                indexappend += index+len(substr)
                copystr = copystr[index+len(substr):]
        return result

    def replaceCharAndNumber(data):
        # Normalize in place: letters -> "C", digits -> "N".
        # (Currently unused; see the commented-out calls below.)
        char_replace = re.compile("[a-zA-Z]")
        number_replace = re.compile("[0-9]")
        for i in range(len(data)):
            for j in range(len(data[i])):
                data[i][j][0] = re.sub(number_replace,"N",re.sub(char_replace,"C",data[i][j][0]))

    print(sql+train_sql+order_sql)
    cursor.execute(sql+train_sql+order_sql)
    rows = cursor.fetchall()
    train = []


    for row in rows:
        code = row[1] if row[1]!="" else ""
        name = row[2] if row[2]!="" else ""
        # Multiple codes/names are ";"-separated; label the longest first so
        # a shorter string cannot claim part of a longer match.
        codes = re.split("[;;]",code)
        codes.sort(key = lambda x:len(x),reverse = True)
        names = re.split("[;;]",name)
        names.sort(key = lambda x:len(x),reverse = True)
        # Split the article into sentences on the Chinese full stop.
        contents = re.split("。",str(row[0]))
        for content in contents:

            # Set when an entity straddles the MAX_LEN truncation boundary;
            # such sentences are excluded from train.
            out_of_len = False

            # Set when an entity is preceded (within 5 chars) by a cue word
            # "编号"/"名称" (number/name).
            double_flag = False
            # data_item: list of [char, tag] pairs, initialized to "O".
            data_item = []
            for i in range(len(str(content))):
                data_item_item = []
                data_item_item.append(content[i])
                data_item_item.append("O")
                data_item.append(data_item_item)

            code_find_flag = False
            name_find_flag = False
            for code in codes:
                if len(code)==0:
                    continue
                code_begins = findAllIndex(code,content)
                for code_begin in code_begins:

                    if code_begin<MAX_LEN and (code_begin+len(code))>MAX_LEN:
                        out_of_len = True

                    # Look back up to 5 chars for a cue word before the match.
                    if code_begin>5:
                        double_begin = code_begin-5
                    else:
                        double_begin = 0
                    if len(re.findall("编号|名称",content[double_begin:code_begin]))>0:
                        double_flag = True

                    code_find_flag = True
                    # BMES-style tagging: PC_S single char; PC_B/PC_M/PC_E span.
                    if len(code)==1:
                        if hasNotBeenLabeled(data_item, code_begin, code):
                            data_item[code_begin][1] = "PC_S"
                    else:
                        if hasNotBeenLabeled(data_item, code_begin, code):
                            for j in range(code_begin,code_begin+len(code)):
                                if j==code_begin:
                                    data_item[j][1] = "PC_B"
                                elif j==code_begin+len(code)-1:
                                    data_item[j][1] = "PC_E"
                                else:
                                    data_item[j][1] = "PC_M"
            for name in names:
                if len(name)==0:
                    continue

                name_begins = findAllIndex(name,content)
                for name_begin in name_begins:

                    if name_begin<MAX_LEN and (name_begin+len(name))>MAX_LEN:
                        out_of_len = True

                    if name_begin>5:
                        double_begin = name_begin-5
                    else:
                        double_begin = 0
                    if len(re.findall("编号|名称",content[double_begin:name_begin]))>0:
                        double_flag = True

                    name_find_flag = True
                    if len(name)==1:
                        # NOTE(review): single-char names are tagged "PC_S"
                        # (a code tag), not "PN_S" -- looks like a copy-paste
                        # slip from the code branch; confirm intent.
                        if hasNotBeenLabeled(data_item, name_begin, name):
                            data_item[name_begin][1] = "PC_S"
                    else:
                        if hasNotBeenLabeled(data_item, name_begin, name):
                            for j in range(name_begin,name_begin+len(name)):
                                if j==name_begin:
                                    data_item[j][1] = "PN_B"
                                elif j==name_begin+len(name)-1:
                                    data_item[j][1] = "PN_E"
                                else:
                                    data_item[j][1] = "PN_M"
            # Train sampling: keep positives that fit within MAX_LEN, with an
            # extra duplicate when a cue word precedes an entity; keep only
            # ~5% of negative (entity-free) sentences.
            if code_find_flag or name_find_flag:
                if not out_of_len:
                    if double_flag:
                        train.append(data_item)
                    train.append(data_item)
            else:
                if numpy.random.random()<=0.05:
                    train.append(data_item)
    print(sql+test_sql+order_sql)
    cursor.execute(sql+test_sql+order_sql)
    rows = cursor.fetchall()
    test = []
    list_docid_content = []
    for row in rows:
        code = row[1] if row[1]!="" else ""
        name = row[2] if row[2]!="" else ""
        codes = re.split("[;;]",code)
        codes.sort(key = lambda x:len(x),reverse = True)
        names = re.split("[;;]",name)
        names.sort(key = lambda x:len(x),reverse = True)

        contents = re.split("。",str(row[0]))
        for content in contents:
            data_item = []
            for i in range(len(str(content))):
                data_item_item = []
                data_item_item.append(content[i])
                data_item_item.append("O")
                data_item.append(data_item_item)

            code_find_flag = False
            name_find_flag = False
            for code in codes:
                if len(code)==0:
                    continue
                code_begins = findAllIndex(code,content)
                for code_begin in code_begins:
                    code_find_flag = True
                    if len(code)==1:
                        if hasNotBeenLabeled(data_item, code_begin, code):
                            data_item[code_begin][1] = "PC_S"
                    else:
                        if hasNotBeenLabeled(data_item, code_begin, code):
                            for j in range(code_begin,code_begin+len(code)):
                                if j==code_begin:
                                    data_item[j][1] = "PC_B"
                                elif j==code_begin+len(code)-1:
                                    data_item[j][1] = "PC_E"
                                else:
                                    data_item[j][1] = "PC_M"
            for name in names:
                if len(name)==0:
                    continue
                name_begins = findAllIndex(name,content)
                for name_begin in name_begins:
                    name_find_flag = True
                    if len(name)==1:
                        # NOTE(review): "PC_S" again -- same suspected slip as
                        # in the train loop above; confirm it should be "PN_S".
                        if hasNotBeenLabeled(data_item, name_begin, name):
                            data_item[name_begin][1] = "PC_S"
                    else:
                        if hasNotBeenLabeled(data_item, name_begin, name):
                            for j in range(name_begin,name_begin+len(name)):
                                if j==name_begin:
                                    data_item[j][1] = "PN_B"
                                elif j==name_begin+len(name)-1:
                                    data_item[j][1] = "PN_E"
                                else:
                                    data_item[j][1] = "PN_M"
            # Test sampling: keep all positives; ~5% of negatives. Record
            # (doc_id, sentence) alongside each kept test sentence.
            if code_find_flag or name_find_flag:
                test.append(data_item)
                doc_id_content = [row[3],content]
                list_docid_content.append(doc_id_content)
            else:
                if numpy.random.random()<=0.05:
                    test.append(data_item)
                    doc_id_content = [row[3],content]
                    list_docid_content.append(doc_id_content)

    #replaceCharAndNumber(train)
    #replaceCharAndNumber(test)

    # Character frequencies over the training split only.
    word_counts = Counter(row[0] for sample in train for row in sample)
    chunk_tags = sorted(list(set(row[1] for sample in train + test for row in sample)))  # in alphabetic order

    if vocab_set is not None:
        vocab = vocab_set
    else:
        # Reserved indices: 0 = <pad>, 1 = <unk>; then chars above min_freq.
        vocab = ['<pad>', '<unk>'] + [w for w, f in iter(word_counts.items()) if f >= min_freq]


    # Dump the splits as "char tag" lines, sentences separated by "$#",
    # for manual inspection.
    with codecs.open("viewTest.txt","w",encoding="utf8") as f:
        for t in test:
            for h in t:
                f.write(str(h[0])+" "+str(h[1]))
                f.write("\n")
            f.write("$#\n")
        f.flush()
        f.close()

    with codecs.open("viewTrain.txt","w",encoding="utf8") as f:
        for t in train:
            for h in t:
                f.write(str(h[0])+" "+str(h[1]))
                f.write("\n")
            f.write("$#\n")
        f.flush()
        f.close()

    with codecs.open("docid_content.txt","w",encoding="utf8") as f:
        for t in list_docid_content:
            f.write(t[0]+" "+t[1])
            f.write("\n")
        f.flush()
    train_process = _process_data(train, vocab, chunk_tags,MAX_LEN)
    test_process = _process_data(test, vocab, chunk_tags,MAX_LEN)
    #return (train[0][:-200],train[1][:-200]),(train[0][-200:],train[1][-200:]),(vocab, chunk_tags)
    return train_process, test_process, (vocab, chunk_tags),test
-
-
def _process_data(data, vocab, chunk_tags, maxlen):
    '''
    @summary: Numericalize tagged sentences for training.
    @param:
        data: list of sentences; each sentence is a list of [char, tag] pairs
        vocab: character vocabulary containing '<pad>' and '<unk>'
        chunk_tags: list of tag names
        maxlen: maximum sentence length; if None, the longest sentence is used
    @return:
        x: padded/truncated matrix of character indices
        y_chunk: matching matrix of tag indices
        x_len: effective (pre-padding) length of each sentence, capped at maxlen
    '''
    if maxlen is None:
        maxlen = max(len(s) for s in data)
    # Plain enumerate is enough here; wrapping vocab in numpy.array only
    # produced numpy.str_ keys for no benefit.
    word2idx = dict((w, i) for i, w in enumerate(vocab))
    index_unk = word2idx.get("<unk>")
    index_pad = word2idx.get("<pad>")

    print("unk", index_unk, "pad", index_pad)
    # Characters missing from the vocabulary map to <unk>.
    x = [[word2idx.get(w[0], index_unk) for w in s] for s in data]
    x_len = [min(len(s), maxlen) for s in data]
    y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]
    # Right padding / right truncation (padding='post'); the old comment
    # claiming "left padding" was wrong.
    x = pad_sequences(x, maxlen, value=index_pad, padding='post', truncating='post')
    y_chunk = pad_sequences(y_chunk, maxlen, value=0, padding='post', truncating='post')
    # y_chunk = numpy.expand_dims(y_chunk, 2)
    return x, y_chunk, x_len
-
-
if __name__=="__main__":
    # One-off entry points, kept disabled; uncomment whichever run is needed.
    '''
    importProjectData()
    '''
    # (train_x, train_y), (test_x, test_y), (vocab,class_labels),test = generateDatas(MAX_LEN=300)
    # print(len(train_x))
    # print(len(test_x))
    # print(class_labels)
-
|