# getDatas.py
  1. import os
  2. import pandas as pd
  3. import pickle
  4. import psycopg2
  5. import codecs
  6. import re
  7. import fool
  8. def getHandLabelDatas():
  9. '''
  10. @summary:对使用jupyter标注的数据插入到数据库中
  11. '''
  12. conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
  13. cursor = conn.cursor()
  14. def load(path):
  15. '''
  16. 读取对象
  17. @Arugs:
  18. path: 读取的路径
  19. @Return:
  20. 读取的对象
  21. '''
  22. with open(path, 'rb') as f:
  23. object = pickle.load(f)
  24. return object
  25. for file in os.listdir(""):
  26. if file[-2:]=="pk":
  27. #if file[-9:-6] in ["l0.","l1.","l2.","l8."]:
  28. df = load("./"+file)
  29. for i in range(len(df)):
  30. if df.loc[i]['projectcode']!="" or df.loc[i]['projectname']!="":
  31. sql = " insert into project(doc_id,projectCode,projectName) values('"+df.loc[i]['doc_id']+"','"+df.loc[i]['projectcode']+"','"+df.loc[i]['projectname']+"')"
  32. #print(sql)
  33. print(file,i)
  34. cursor.execute(sql)
  35. conn.commit()
  36. conn.close()
  37. def getPredictCodeAndName():
  38. '''
  39. @summary:对模型的测试数据进行解析,判断模型的效果
  40. '''
  41. file = "predict_test.txt"
  42. with codecs.open(file,"r",encoding="utf8") as f:
  43. contents = f.read()
  44. data = [[row.split() for row in sample.split("\n")]for sample in contents.strip().split("\n\n")]
  45. '''
  46. with codecs.open("docid_content.txt","r",encoding="utf8") as f:
  47. docid_content = f.read()
  48. data_docid_content = [sample.split() for sample in docid_content.strip().split("\n")]
  49. print(len(data),len(data_docid_content))
  50. assert len(data)==len(data_docid_content)
  51. '''
  52. code_pattern = re.compile("PC_B,(PC_M,)+PC_E")
  53. name_pattern = re.compile("PN_B,(PN_M,)+PN_E")
  54. sum_label_code = 0
  55. sum_predict_code = 0
  56. sum_label_predict_code = 0
  57. sum_label_name = 0
  58. sum_predict_name = 0
  59. sum_label_predict_name = 0
  60. with codecs.open("projectcodename.html","w",encoding="utf8") as f:
  61. f.write('<html><head>\
  62. <meta http-equiv="Content-Type"\
  63. content="text/html; charset=UTF-8">\
  64. </head>\
  65. <body bgcolor="#FFFFFF">\
  66. <table border="1">\
  67. <tr>\
  68. <td>句子</td>\
  69. <td>标签编号</td>\
  70. <td>标签名称</td>\
  71. <td>预测标号</td>\
  72. <td>预测名称<</td>\
  73. </tr>')
  74. for i in range(len(data)):
  75. a,b,c = zip(*data[i])
  76. text = "".join(a)
  77. label = ",".join(b)
  78. predict = ",".join(c)
  79. label_code = []
  80. label_name = []
  81. predict_code = []
  82. predict_name = []
  83. for match in re.finditer(code_pattern,label):
  84. (match_begin,match_end) = match.span()
  85. text_begin = len(re.split(",",label[:match_begin]))-1
  86. text_length = len(re.split(",",label[match_begin:match_end]))
  87. label_code.append(text[text_begin:text_begin+text_length])
  88. for match in re.finditer(name_pattern,label):
  89. (match_begin,match_end) = match.span()
  90. text_begin = len(re.split(",",label[:match_begin]))-1
  91. text_length = len(re.split(",",label[match_begin:match_end]))
  92. label_name.append(text[text_begin:text_begin+text_length])
  93. for match in re.finditer(code_pattern,predict):
  94. (match_begin,match_end) = match.span()
  95. text_begin = len(re.split(",",predict[:match_begin]))-1
  96. text_length = len(re.split(",",predict[match_begin:match_end]))
  97. predict_code.append(text[text_begin:text_begin+text_length])
  98. for match in re.finditer(name_pattern,predict):
  99. (match_begin,match_end) = match.span()
  100. text_begin = len(re.split(",",predict[:match_begin]))-1
  101. text_length = len(re.split(",",predict[match_begin:match_end]))
  102. predict_name.append(text[text_begin:text_begin+text_length])
  103. if len(label_code)>0:
  104. sum_label_code += 1
  105. if len(predict_code)>0:
  106. sum_predict_code += 1
  107. if len(set(label_code)&set(predict_code))>0:
  108. sum_label_predict_code += 1
  109. if len(label_name)>0:
  110. sum_label_name += 1
  111. if len(predict_name)>0:
  112. sum_predict_name += 1
  113. if len(set(label_name)&set(predict_name))>0:
  114. sum_label_predict_name += 1
  115. #f.write("<td>"+str(docid_content[i][0])+"</td>"+"<td>"+str(docid_content[i][1])+"</td>"+"<td>"+str(";".join(label_code))+"</td>"+"<td>"+str(";".join(label_name))+"</td>"+"<td>"+str(";".join(predict_code))+"</td>"+"<td>"+str(";".join(predict_name))+"</td>")
  116. f.write("<td>"+text+"</td>"+"<td>"+str(";".join(label_code))+"</td>"+"<td>"+str(";".join(label_name))+"</td>"+"<td>"+str(";".join(predict_code))+"</td>"+"<td>"+str(";".join(predict_name))+"</td>")
  117. f.write("</tr>")
  118. f.write("\n")
  119. f.write('</tr>\
  120. </table>\
  121. </body>\
  122. </html>')
  123. print("sum_label_code:%d,sum_predict_code:%d,sum_label_predict_code:%s"%(sum_label_code,sum_predict_code,sum_label_predict_code))
  124. print("sum_label_name:%d,sum_predict_name:%d,sum_label_predict_name:%s"%(sum_label_name,sum_predict_name,sum_label_predict_name))
  125. f.flush()
  126. f.close()
  127. def relabelHandlabels():
  128. '''
  129. @summary:对标注的项目名称,若前面含有实体,则加入到项目名称中,使用fool在发现实体
  130. '''
  131. conn = psycopg2.connect(dbname="BiddingKG",user="postgres",password="postgres",host="192.168.2.101")
  132. cursor = conn.cursor()
  133. sql = " select A.content,B.projectname,B.doc_id from articles_processed A,project B where A.id=B.doc_id "
  134. cursor.execute(sql)
  135. def findAllIndex(substr,wholestr):
  136. copystr = wholestr
  137. result = []
  138. indexappend = 0
  139. while(True):
  140. index = copystr.find(substr)
  141. if index<0:
  142. break
  143. else:
  144. result.append(indexappend+index)
  145. indexappend += index+len(substr)
  146. copystr = copystr[index+len(substr):]
  147. return result
  148. rows = cursor.fetchall()
  149. updateData = []
  150. row_index = 0
  151. for row in rows:
  152. print(len(rows),row_index)
  153. row_index += 1
  154. doc_id = row[2]
  155. name = row[1] if row[1]!="" else ""
  156. names = re.split("[;;]",name)
  157. contents = re.split("。",str(row[0]))
  158. first_len_name = len(names)
  159. for content in contents:
  160. for name in names:
  161. if len(name)==0:
  162. continue
  163. all_begin_index = findAllIndex(name,content)
  164. for begin_index in all_begin_index:
  165. if begin_index<30:
  166. test_text = content[:begin_index]
  167. else:
  168. test_text = content[begin_index-30:begin_index]
  169. entitys = fool.ner(test_text)[0]
  170. for entity in entitys:
  171. if len(entity)==0:
  172. continue
  173. if int(entity[1])==len(test_text)+1 and entity[2] in ["org","company"]:
  174. if entity[3]+name not in names:
  175. names.append(entity[3]+name)
  176. if len(names)>first_len_name:
  177. data_item = [doc_id,";".join(names)]
  178. updateData.append(data_item)
  179. print("lenUpdatedata:",len(updateData))
  180. for item in updateData:
  181. sql = " insert into relabelproject(doc_id,names) values('"+item[0]+"','"+item[1]+"')"
  182. cursor.execute(sql)
  183. conn.commit()
  184. conn.close()
  185. if __name__=="__main__":
  186. #getHandLabelDatas()
  187. getPredictCodeAndName()
  188. #relabelHandlabels()