import codecs
import html
import os
import pickle
import re

import fool
import pandas as pd
import psycopg2


def _connect():
    """Open a new connection to the BiddingKG PostgreSQL database."""
    return psycopg2.connect(
        dbname="BiddingKG",
        user="postgres",
        password="postgres",
        host="192.168.2.101",
    )


def getHandLabelDatas():
    """Insert hand-labelled project codes and names into the ``project`` table.

    Every file in the current working directory whose name ends in ``pk`` is
    unpickled and read row by row through ``.loc[i]`` using the keys
    ``doc_id``, ``projectcode`` and ``projectname``.  Rows where both the
    code and the name are empty strings are skipped.  All inserts are
    committed together at the end; the connection is closed even on error.
    """

    def load(path):
        """Return the object pickled at *path*."""
        # NOTE(security): pickle.load can execute arbitrary code; only use
        # this on pickle files produced by a trusted labelling session.
        with open(path, "rb") as f:
            return pickle.load(f)

    insert_sql = (
        " insert into project(doc_id,projectCode,projectName)"
        " values(%s,%s,%s)"
    )

    conn = _connect()
    try:
        cursor = conn.cursor()
        # os.listdir("") raises FileNotFoundError on Python 3; the files are
        # loaded from "./", so list the current directory explicitly.
        for file in os.listdir("."):
            if not file.endswith("pk"):
                continue
            df = load("./" + file)
            for i in range(len(df)):
                row = df.loc[i]
                if row['projectcode'] != "" or row['projectname'] != "":
                    print(file, i)
                    # Parameterised query: values containing quotes no longer
                    # break (or inject into) the statement.
                    cursor.execute(
                        insert_sql,
                        (
                            str(row['doc_id']),
                            str(row['projectcode']),
                            str(row['projectname']),
                        ),
                    )
        conn.commit()
    finally:
        conn.close()


def _extract_spans(pattern, tags, text):
    """Return the slices of *text* whose tags match *pattern*.

    *tags* is the comma-joined tag sequence.  Tag number ``k`` is mapped to
    ``text[k]``, so the number of commas before a match gives the index of
    the first matched position in *text*.
    """
    spans = []
    for match in pattern.finditer(tags):
        begin, end = match.span()
        first = tags.count(",", 0, begin)
        length = tags.count(",", begin, end) + 1
        spans.append(text[first:first + length])
    return spans


def getPredictCodeAndName():
    """Parse model test output and report how well codes/names are predicted.

    Reads ``predict_test.txt``: samples are separated by a blank line and
    every line of a sample holds three whitespace-separated columns (token,
    gold tag, predicted tag).  For each sample the project-code spans
    (``PC_B,(PC_M,)+PC_E``) and project-name spans (``PN_B,(PN_M,)+PN_E``)
    are extracted from both tag sequences, written as one row of an HTML
    table to ``projectcodename.html``, and counted.  The per-sample counters
    (has gold span / has predicted span / gold and predicted share a span)
    are printed at the end.
    """
    file = "predict_test.txt"
    with codecs.open(file, "r", encoding="utf8") as f:
        contents = f.read()
    data = [
        [row.split() for row in sample.split("\n")]
        for sample in contents.strip().split("\n\n")
    ]

    code_pattern = re.compile("PC_B,(PC_M,)+PC_E")
    name_pattern = re.compile("PN_B,(PN_M,)+PN_E")

    sum_label_code = 0
    sum_predict_code = 0
    sum_label_predict_code = 0
    sum_label_name = 0
    sum_predict_name = 0
    sum_label_predict_name = 0

    headers = ["句子", "标签编号", "标签名称", "预测标号", "预测名称"]

    with codecs.open("projectcodename.html", "w", encoding="utf8") as f:
        # Plain HTML table: one header row, then one row per sample.
        f.write(
            '<html>'
            '<head>'
            '<meta charset="utf-8">'
            '</head>'
            '<body>'
            '<table border="1">'
            '<tr>'
            + "".join("<td>" + title + "</td>" for title in headers)
            + '</tr>'
        )
        for sample in data:
            tokens, labels, predictions = zip(*sample)
            text = "".join(tokens)
            label = ",".join(labels)
            predict = ",".join(predictions)

            label_code = _extract_spans(code_pattern, label, text)
            label_name = _extract_spans(name_pattern, label, text)
            predict_code = _extract_spans(code_pattern, predict, text)
            predict_name = _extract_spans(name_pattern, predict, text)

            if len(label_code) > 0:
                sum_label_code += 1
            if len(predict_code) > 0:
                sum_predict_code += 1
            if len(set(label_code) & set(predict_code)) > 0:
                sum_label_predict_code += 1
            if len(label_name) > 0:
                sum_label_name += 1
            if len(predict_name) > 0:
                sum_predict_name += 1
            if len(set(label_name) & set(predict_name)) > 0:
                sum_label_predict_name += 1

            cells = [
                text,
                ";".join(label_code),
                ";".join(label_name),
                ";".join(predict_code),
                ";".join(predict_name),
            ]
            # Escape the cell text so characters such as "<" or "&" in the
            # document cannot break the table markup.
            f.write(
                "<tr>"
                + "".join("<td>" + html.escape(cell) + "</td>" for cell in cells)
                + "</tr>"
            )
            f.write("\n")
        f.write(
            '</table>'
            '</body>'
            '</html>'
        )

    print("sum_label_code:%d,sum_predict_code:%d,sum_label_predict_code:%s"%(sum_label_code,sum_predict_code,sum_label_predict_code))
    print("sum_label_name:%d,sum_predict_name:%d,sum_label_predict_name:%s"%(sum_label_name,sum_predict_name,sum_label_predict_name))


def relabelHandlabels():
    """Extend labelled project names with an organisation found just before them.

    For every (content, projectname, doc_id) row joined from
    ``articles_processed`` and ``project``, each occurrence of a labelled name
    inside a sentence of the content is located.  ``fool.ner`` is run on up to
    30 characters preceding the occurrence; when it reports an ``org`` or
    ``company`` entity whose second field equals ``len(prefix) + 1``, the
    entity text concatenated with the name is added as a new name.  Documents
    that gained at least one name are inserted into ``relabelproject`` with
    all names joined by ``;``.
    """

    def findAllIndex(substr, wholestr):
        """Return start offsets of all non-overlapping *substr* in *wholestr*."""
        if not substr:
            # An empty needle matches everywhere and would never advance.
            return []
        result = []
        index = wholestr.find(substr)
        while index >= 0:
            result.append(index)
            index = wholestr.find(substr, index + len(substr))
        return result

    select_sql = " select A.content,B.projectname,B.doc_id from articles_processed A,project B where A.id=B.doc_id "
    insert_sql = " insert into relabelproject(doc_id,names) values(%s,%s)"

    conn = _connect()
    try:
        cursor = conn.cursor()
        cursor.execute(select_sql)
        rows = cursor.fetchall()

        updateData = []
        for row_index, row in enumerate(rows):
            print(len(rows), row_index)
            doc_id = row[2]
            # A NULL project name would make re.split raise; treat it as "".
            name = row[1] or ""
            # Split on the ASCII and the full-width semicolon.
            names = re.split("[;；]", name)
            contents = re.split("。", str(row[0]))
            first_len_name = len(names)
            for content in contents:
                # `names` grows while it is being iterated, so names added
                # below are themselves searched for in this and later
                # sentences.
                for name in names:
                    if len(name) == 0:
                        continue
                    for begin_index in findAllIndex(name, content):
                        # At most 30 characters immediately before the name.
                        test_text = content[max(begin_index - 30, 0):begin_index]
                        entitys = fool.ner(test_text)[0]
                        for entity in entitys:
                            if len(entity) == 0:
                                continue
                            # entity[1] / entity[2] / entity[3] are presumably
                            # end offset / type / text -- confirm against the
                            # fool (FoolNLTK) documentation.
                            if int(entity[1]) == len(test_text) + 1 and entity[2] in ["org", "company"]:
                                if entity[3] + name not in names:
                                    names.append(entity[3] + name)
            if len(names) > first_len_name:
                updateData.append([doc_id, ";".join(names)])

        print("lenUpdatedata:", len(updateData))
        for item in updateData:
            # Parameterised query instead of string concatenation.
            cursor.execute(insert_sql, (item[0], item[1]))
        conn.commit()
    finally:
        conn.close()


if __name__=="__main__":
    #getHandLabelDatas()
    getPredictCodeAndName()
    #relabelHandlabels()