123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216 |
- import os
- import pandas as pd
- import pickle
- import psycopg2
- import codecs
- import re
- import fool
def getHandLabelDatas():
    """Insert hand-labelled project codes and names into the ``project`` table.

    Every file in the current working directory whose name ends in ``pk`` is
    unpickled (presumably a pandas DataFrame saved from the Jupyter labelling
    session -- confirm) and each row whose ``projectcode`` or ``projectname``
    is non-empty is inserted as ``(doc_id, projectCode, projectName)``.
    All inserts are committed together at the end; the connection is closed
    even when loading or inserting fails.
    """
    def load(path):
        """Return the object unpickled from ``path``."""
        # NOTE(review): pickle.load can execute arbitrary code embedded in the
        # file; only point this at label files produced by a trusted source.
        with open(path, 'rb') as f:
            return pickle.load(f)

    # Placeholders let the driver quote the values, so labels containing
    # quotes no longer break (or inject into) the statement.
    insert_sql = ("insert into project(doc_id,projectCode,projectName) "
                  "values(%s,%s,%s)")

    conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
    try:
        cursor = conn.cursor()
        # os.listdir("") raises FileNotFoundError; the files are opened
        # relative to "./", so the current directory is what must be listed.
        for file in os.listdir("."):
            if not file.endswith("pk"):
                continue
            df = load("./"+file)
            # Rows are addressed by label 0..len(df)-1, as in the original;
            # this assumes the frame carries a default integer index.
            for i in range(len(df)):
                row = df.loc[i]
                if row['projectcode']!="" or row['projectname']!="":
                    print(file,i)
                    cursor.execute(
                        insert_sql,
                        (row['doc_id'], row['projectcode'], row['projectname']),
                    )
        conn.commit()
    finally:
        conn.close()
-
def _extract_spans(tokens, tags, pattern):
    """Return the token spans whose tag sequence matches ``pattern``.

    ``pattern`` is searched in the tags joined with ","; each match is mapped
    back to token positions by counting the commas before and inside it, and
    the corresponding tokens are concatenated into one string.
    """
    joined = ",".join(tags)
    spans = []
    for match in pattern.finditer(joined):
        begin = joined.count(",", 0, match.start())
        length = match.group().count(",") + 1
        # Slice the token sequence (not the joined text) so the offsets stay
        # correct even when a token is longer than one character.
        spans.append("".join(tokens[begin:begin+length]))
    return spans


def getPredictCodeAndName(predict_file="predict_test.txt", output_file="projectcodename.html"):
    """Parse the model's test predictions and report how well it did.

    ``predict_file`` holds samples separated by blank lines; every non-blank
    line has three whitespace-separated columns: token, gold tag, predicted
    tag.  Project codes are tag runs ``PC_B, PC_M..., PC_E`` and project names
    are ``PN_B, PN_M..., PN_E`` (at least one ``_M`` tag is required by the
    patterns).  For each sample one table row is written to ``output_file``
    with the sentence, the labelled codes/names and the predicted codes/names.
    Finally, the number of samples having a labelled span, a predicted span,
    and at least one span common to both is printed for codes and for names.

    Args:
        predict_file: path of the prediction dump to read (UTF-8).
        output_file: path of the HTML report to write (UTF-8).

    Raises:
        ValueError: if a sample does not unpack into exactly three columns.
    """
    from html import escape

    code_pattern = re.compile("PC_B,(PC_M,)+PC_E")
    name_pattern = re.compile("PN_B,(PN_M,)+PN_E")

    with codecs.open(predict_file,"r",encoding="utf8") as f:
        contents = f.read()

    # Group rows into samples; any blank (or whitespace-only) line ends the
    # current sample.  This tolerates an empty file, trailing "\r" and runs of
    # several blank lines, all of which previously produced empty rows that
    # crashed the column unpacking below.
    data = []
    current = []
    for line in contents.split("\n"):
        fields = line.split()
        if fields:
            current.append(fields)
        elif current:
            data.append(current)
            current = []
    if current:
        data.append(current)

    sum_label_code = 0
    sum_predict_code = 0
    sum_label_predict_code = 0
    sum_label_name = 0
    sum_predict_name = 0
    sum_label_predict_name = 0

    with codecs.open(output_file,"w",encoding="utf8") as f:
        f.write('<html><head>'
                '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">'
                '</head>'
                '<body bgcolor="#FFFFFF">'
                '<table border="1">'
                '<tr>'
                '<td>句子</td>'
                '<td>标签编号</td>'
                '<td>标签名称</td>'
                '<td>预测标号</td>'
                '<td>预测名称</td>'
                '</tr>\n')
        for sample in data:
            tokens, labels, predicts = zip(*sample)
            text = "".join(tokens)
            label_code = _extract_spans(tokens, labels, code_pattern)
            label_name = _extract_spans(tokens, labels, name_pattern)
            predict_code = _extract_spans(tokens, predicts, code_pattern)
            predict_name = _extract_spans(tokens, predicts, name_pattern)

            if len(label_code)>0:
                sum_label_code += 1
            if len(predict_code)>0:
                sum_predict_code += 1
            if len(set(label_code)&set(predict_code))>0:
                sum_label_predict_code += 1

            if len(label_name)>0:
                sum_label_name += 1
            if len(predict_name)>0:
                sum_predict_name += 1
            if len(set(label_name)&set(predict_name))>0:
                sum_label_predict_name += 1

            cells = [
                text,
                ";".join(label_code),
                ";".join(label_name),
                ";".join(predict_code),
                ";".join(predict_name),
            ]
            # Every data row now opens its own <tr>, and the text is escaped
            # so "<" or "&" inside a sentence cannot corrupt the table.
            f.write("<tr>"+"".join("<td>"+escape(cell)+"</td>" for cell in cells)+"</tr>\n")
        f.write('</table>'
                '</body>'
                '</html>')
    # The with-block has already flushed and closed the report file.
    print("sum_label_code:%d,sum_predict_code:%d,sum_label_predict_code:%s"%(sum_label_code,sum_predict_code,sum_label_predict_code))
    print("sum_label_name:%d,sum_predict_name:%d,sum_label_predict_name:%s"%(sum_label_name,sum_predict_name,sum_label_predict_name))
-
def relabelHandlabels():
    """Extend labelled project names with an organisation found just before them.

    Joins ``articles_processed`` with ``project`` and, for every labelled
    project name occurring in a sentence of the article, runs ``fool.ner`` on
    the (at most) 30 characters preceding the occurrence.  When an entity of
    type ``org`` or ``company`` is reported as ending right at the name, the
    entity text prepended to the name is added as an additional name.
    Documents that gained at least one name are inserted into
    ``relabelproject`` as ``(doc_id, names)`` with the names joined by ";".
    The connection is closed even when a query or the NER call fails.
    """
    def findAllIndex(substr,wholestr):
        """Return the start offset of every non-overlapping occurrence of ``substr``."""
        return [match.start() for match in re.finditer(re.escape(substr), wholestr)]

    conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
    try:
        cursor = conn.cursor()

        sql = " select A.content,B.projectname,B.doc_id from articles_processed A,project B where A.id=B.doc_id "
        cursor.execute(sql)
        rows = cursor.fetchall()

        updateData = []

        for row_index, row in enumerate(rows):
            print(len(rows),row_index)
            doc_id = row[2]
            # A NULL projectname arrives as None; treat it like an empty one
            # instead of letting re.split raise TypeError.
            name = row[1] or ""
            # NOTE(review): the character class lists ";" twice -- possibly a
            # full-width semicolon was intended for the second one; confirm.
            names = re.split("[;;]",name)
            contents = re.split("。",str(row[0]))

            first_len_name = len(names)
            for content in contents:
                # ``names`` can grow inside this loop; names appended here are
                # visited by the same iteration, as in the original code.
                for name in names:
                    if len(name)==0:
                        continue
                    for begin_index in findAllIndex(name,content):
                        # Up to 30 characters immediately before the name.
                        test_text = content[max(0, begin_index-30):begin_index]
                        entitys = fool.ner(test_text)[0]
                        for entity in entitys:
                            if len(entity)==0:
                                continue
                            # entity[1] presumably is the entity's end offset
                            # and this test means "ends where the name starts"
                            # -- verify against the fool.ner output format.
                            if int(entity[1])==len(test_text)+1 and entity[2] in ["org","company"]:
                                candidate = entity[3]+name
                                if candidate not in names:
                                    names.append(candidate)
            if len(names)>first_len_name:
                updateData.append([doc_id,";".join(names)])

        print("lenUpdatedata:",len(updateData))
        # Placeholders let the driver quote the values, so names containing
        # quotes no longer break (or inject into) the statement.
        cursor.executemany(
            "insert into relabelproject(doc_id,names) values(%s,%s)",
            updateData,
        )

        conn.commit()
    finally:
        conn.close()
-
-
-
if __name__ == "__main__":
    # Script entry point: only the prediction report is enabled; the two
    # database jobs are kept here, switched off.
    # getHandLabelDatas()
    getPredictCodeAndName()
    # relabelHandlabels()
|