import os
import pandas as pd
import pickle
import psycopg2
import codecs
import re
import fool
def getHandLabelDatas(directory="."):
    '''
    @summary: Insert the data that was labelled by hand (in jupyter) and saved
        as pickled DataFrames into the ``project`` table.
    @Args:
        directory: folder that is scanned for files whose name ends in "pk".
            Defaults to the current directory, which is where the files are
            loaded from.
    '''
    def load(path):
        '''
        Read a pickled object.
        @Args:
            path: path of the pickle file
        @Return:
            the unpickled object
        '''
        # NOTE(review): pickle.load can execute arbitrary code contained in
        # the file; only feed it label files produced by trusted tooling.
        with open(path, 'rb') as f:
            return pickle.load(f)

    # Placeholders let the driver quote the values, so labels containing
    # quotes neither break the statement nor inject SQL.
    sql = " insert into project(doc_id,projectCode,projectName) values(%s,%s,%s)"

    # NOTE(review): connection settings are hard-coded; consider moving them
    # to configuration.
    conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
    try:
        cursor = conn.cursor()
        for file in os.listdir(directory):
            if not file.endswith("pk"):
                continue
            df = load(os.path.join(directory, file))
            # Iterate by position so the rows are read correctly whatever
            # index the DataFrame was saved with.
            records = zip(df['doc_id'], df['projectcode'], df['projectname'])
            for i, (doc_id, project_code, project_name) in enumerate(records):
                # Rows where both the code and the name are empty carry no label.
                if project_code != "" or project_name != "":
                    print(file, i)
                    cursor.execute(sql, (doc_id, project_code, project_name))
        conn.commit()
    finally:
        # Closing without a commit discards the pending inserts on failure.
        conn.close()
def _extract_spans(tokens, tags, pattern):
    '''
    Return the text of every run of tags that matches ``pattern``.
    @Args:
        tokens: sequence of token strings, parallel to ``tags``
        tags: sequence of tag strings (one per token)
        pattern: compiled regex applied to the tags joined with ","
    @Return:
        list with the concatenated tokens of each matching run
    '''
    joined = ",".join(tags)
    spans = []
    for match in pattern.finditer(joined):
        begin, end = match.span()
        # The number of commas before the match is the index of its first tag;
        # the commas inside the match give the number of tags it covers.
        first = joined.count(",", 0, begin)
        length = joined.count(",", begin, end) + 1
        # Slice the token sequence (not the joined text) so tokens longer
        # than one character are still extracted correctly.
        spans.append("".join(tokens[first:first + length]))
    return spans


def _html_row(cells):
    '''Render the given strings as one escaped HTML table row.'''
    from html import escape
    return "<tr>" + "".join("<td>" + escape(cell) + "</td>" for cell in cells) + "</tr>\n"


def getPredictCodeAndName(predict_file="predict_test.txt", output_file="projectcodename.html"):
    '''
    @summary: Parse the test output of the model and report how well it
        performs. Every sample is written as one row of an HTML table
        (sentence, labelled codes, labelled names, predicted codes, predicted
        names) and the per-sample counts are printed.
    @Args:
        predict_file: utf8 text file with one "token label prediction" triple
            per line and a blank line between samples
        output_file: path of the HTML report that is written
    '''
    with codecs.open(predict_file,"r",encoding="utf8") as f:
        contents = f.read()

    # Split on blank lines (tolerating "\r\n" and whitespace-only lines) and
    # keep only well-formed three-column rows, so an empty file or a stray
    # line does not crash the unpacking below.
    data = []
    for sample in re.split(r"\n\s*\n", contents.strip()):
        rows = [row.split() for row in sample.splitlines()]
        rows = [row for row in rows if len(row) == 3]
        if rows:
            data.append(rows)

    # A code/name is a B tag, one or more M tags and an E tag in sequence.
    code_pattern = re.compile("PC_B,(PC_M,)+PC_E")
    name_pattern = re.compile("PN_B,(PN_M,)+PN_E")

    sum_label_code = 0
    sum_predict_code = 0
    sum_label_predict_code = 0
    sum_label_name = 0
    sum_predict_name = 0
    sum_label_predict_name = 0

    with codecs.open(output_file,"w",encoding="utf8") as f:
        f.write('<html><head><meta charset="utf-8"></head><body><table border="1">\n')
        f.write(_html_row(["句子","标签编号","标签名称","预测标号","预测名称"]))
        for rows in data:
            tokens, labels, predicts = zip(*rows)
            text = "".join(tokens)
            label_code = _extract_spans(tokens, labels, code_pattern)
            label_name = _extract_spans(tokens, labels, name_pattern)
            predict_code = _extract_spans(tokens, predicts, code_pattern)
            predict_name = _extract_spans(tokens, predicts, name_pattern)

            # Counts are per sample: labelled, predicted, and samples where
            # at least one predicted span equals a labelled span.
            if label_code:
                sum_label_code += 1
            if predict_code:
                sum_predict_code += 1
            if set(label_code) & set(predict_code):
                sum_label_predict_code += 1
            if label_name:
                sum_label_name += 1
            if predict_name:
                sum_predict_name += 1
            if set(label_name) & set(predict_name):
                sum_label_predict_name += 1

            f.write(_html_row([
                text,
                ";".join(label_code),
                ";".join(label_name),
                ";".join(predict_code),
                ";".join(predict_name),
            ]))
        f.write('</table></body></html>\n')

    print("sum_label_code:%d,sum_predict_code:%d,sum_label_predict_code:%s"%(sum_label_code,sum_predict_code,sum_label_predict_code))
    print("sum_label_name:%d,sum_predict_name:%d,sum_label_predict_name:%s"%(sum_label_name,sum_predict_name,sum_label_predict_name))
def relabelHandlabels():
    '''
    @summary: For every labelled project name, look at the text right in
        front of each occurrence of the name in the article; when fool finds
        an "org"/"company" entity there, the entity plus the name is added as
        a further project name. Documents that gained names are inserted into
        the ``relabelproject`` table.
    '''
    def findAllIndex(substr,wholestr):
        '''
        Return the start offset of every non-overlapping occurrence of
        ``substr`` in ``wholestr``, from left to right.
        '''
        result = []
        if not substr:
            # An empty needle matches everywhere and would never advance.
            return result
        index = wholestr.find(substr)
        while index >= 0:
            result.append(index)
            index = wholestr.find(substr, index + len(substr))
        return result

    # NOTE(review): connection settings are hard-coded; consider moving them
    # to configuration.
    conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
    try:
        cursor = conn.cursor()
        sql = " select A.content,B.projectname,B.doc_id from articles_processed A,project B where A.id=B.doc_id "
        cursor.execute(sql)
        rows = cursor.fetchall()

        updateData = []
        for row_index, row in enumerate(rows):
            print(len(rows),row_index)
            doc_id = row[2]
            # A NULL projectname is treated like an empty one.
            names = re.split("[;;]", row[1] or "")
            contents = re.split("。",str(row[0]))
            first_len_name = len(names)
            for content in contents:
                # ``names`` can grow while it is being iterated: a name that
                # was just extended with an entity is itself searched for in
                # this and the following sentences.
                for name in names:
                    if len(name)==0:
                        continue
                    for begin_index in findAllIndex(name,content):
                        # Only the (up to) 30 characters before the name are
                        # scanned for entities.
                        test_text = content[max(0, begin_index-30):begin_index]
                        if not test_text:
                            continue
                        entitys = fool.ner(test_text)[0]
                        for entity in entitys:
                            if len(entity)==0:
                                continue
                            # entity[1] is compared with len(test_text)+1,
                            # presumably "the entity ends right where the name
                            # starts" — TODO confirm fool's offset convention.
                            if int(entity[1])==len(test_text)+1 and entity[2] in ["org","company"]:
                                candidate = entity[3]+name
                                if candidate not in names:
                                    names.append(candidate)
            if len(names)>first_len_name:
                updateData.append((doc_id,";".join(names)))

        print("lenUpdatedata:",len(updateData))
        # Placeholders let the driver quote the values, so names containing
        # quotes neither break the statement nor inject SQL.
        cursor.executemany(" insert into relabelproject(doc_id,names) values(%s,%s)", updateData)
        conn.commit()
    finally:
        # Closing without a commit discards the pending inserts on failure.
        conn.close()
if __name__ == "__main__":
    # Script entry point: only the prediction report is produced. The two
    # database jobs are kept here, disabled, to be switched on by hand.
    # getHandLabelDatas()
    getPredictCodeAndName()
    # relabelHandlabels()