- '''
- Created on 2019-06-04
- @author: User
- '''
- import fool
- import BiddingKG.dl.interface.Preprocessing as Preprocessing
- from bs4 import BeautifulSoup
- import re
- import codecs
- from BiddingKG.dl.common.Utils import save,load, findAllIndex
- import glob
- import os
- import threading
- import numpy as np
- import time
- from zipfile import ZipFile
- import json
- import psycopg2
- import pandas as pd
- import math
- from BiddingKG.dl.foolnltk.bi_lstm_crf import BiLSTM
- import copy
- from BiddingKG.dl.interface.Entitys import *
- from BiddingKG.dl.foolnltk.Entity2DB import *
- import tensorflow as tf
- import requests
- def getNers(sentences,MAXAREA = 100000,userselffool=False):
- '''
- @param: sentences: list of sentences to process
- @return: list of word-segmentation and entity-recognition results per sentence, computed in throttled batches
- '''
-
- def getData(ners,process_data):
- process_sentences = [item[1] for item in process_data]
- if userselffool:
- ner_ = Preprocessing.selffool.ner(process_sentences)
- else:
- ner_ = fool.ner(process_sentences)
- for i in range(len(ner_)):
- the_index = process_data[i][0]
- ners[the_index] = ner_[i]
- sents = []
- for i in range(len(sentences)):
- sents.append([i,sentences[i]])
- sents.sort(key=lambda x:len(x[1]),reverse=True)
- index_ = 0
- ners = [[]for i in range(len(sentences))]
-
- while(True):
- width = len(sents[index_][1])
- height = MAXAREA//width+1
- if height>len(sents)-index_:
- height = len(sents)-index_
- process_data = sents[index_:index_+height]
- getData(ners, process_data)
- index_ += height
- if index_>=len(sents):
- break
- return ners
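- # Usage sketch (illustrative, not part of the original code): sentences are sorted by
- # length and processed in batches whose (longest sentence length) * (batch size) stays
- # roughly under MAXAREA, e.g.
- #   ners = getNers(["招标人:某公司。", "开标时间:2019年6月4日。"])
- #   # ners[i] is the list of (begin_index, end_index, type, text) tuples for sentences[i]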
- def preprocess(list_articles):
- '''
- @summary: preprocess the text and store the foolnltk recognition results in the database for easy review and correction
- '''
- import psycopg2
- conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
- list_filename_text = []
- cursor = conn.cursor()
- for article in list_articles:
- doc_id = article[0]
- text = Preprocessing.segment(Preprocessing.tableToText(BeautifulSoup(article[1],"lxml")))
- list_filename_text.append([doc_id,text,article[2]])
- list_sent = []
- for x in re.split("[。]", text):
- if len(x)>0:
- list_sent.append(x+"。")
- for n in getNers(list_sent):
- for _entity in n:
- print(_entity)
- sql = " insert into fool_ner_train(filename,begin_index,end_index,type,text) values('"+str(doc_id)+"',"+str(_entity[0])+","+str(_entity[1])+",'"+str(_entity[2])+"','"+str(_entity[3])+"')"
- cursor.execute(sql)
- conn.commit()
- conn.close()
- return list_filename_text
-
- def hasNotBeenLabeled(items,code_begin,code):
- for i in range(code_begin,code_begin+len(code)):
- if items[i][1]!="O":
- return False
- return True
-
-
- def findAllIndex(substr,wholestr):
- copystr = wholestr
- result = []
- indexappend = 0
- while(True):
- index = copystr.find(substr)
- if index<0:
- break
- else:
- result.append(indexappend+index)
- indexappend += index+len(substr)
- copystr = copystr[index+len(substr):]
- return result
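- # Example (illustrative): findAllIndex returns the start offsets of the non-overlapping
- # occurrences of substr in wholestr, e.g.
- #   findAllIndex("公司", "A公司与B公司") -> [1, 5]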
- def labelEntity():
- '''
- @summary: label the data; query entity information from the database and generate label data for the texts
- '''
- import psycopg2
- conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
- list_filename_text = load("list_filename_text_wrongEntity.pk")
- list_sent_label = []
-
- list_text_label = []
- sql = " select distinct filename from fool_ner t where not exists(select 1 from fool_ner a where t.filename=a.filename and type_0 in('org','company') and new_type is NULL) "
- cursor.execute(sql)
- set_filename = set()
- for row in cursor.fetchall():
- set_filename.add(row[0])
-
- for filename_text in list_filename_text:
- filename = filename_text[0]
- text = filename_text[1]
- if filename not in set_filename:
- continue
-
-
- sql = " select text,type_0,new_text,new_type from fool_ner where filename='"+filename+"' group by text,type_0,new_text,new_type"
- print(sql)
- cursor.execute(sql)
- rows = cursor.fetchall()
- rows.sort(key=lambda x:len(x[0]))
-
- list_entity = []
- for row in rows:
- entity = row[0]
- type = row[1]
- new_entity = row[2]
- new_type = row[3]
- _entitys = []
- if new_type is None or new_type=="" or new_type=="nan":
- _type = type
- else:
- _type = new_type
- if new_entity=="1":
- continue
- elif new_entity is None or new_entity =="" or new_entity=="nan":
- list_entity.append([entity,_type])
- _entitys.append([entity,_type])
- else:
- for _entity in new_entity.split("##"):
- list_entity.append([_entity,_type])
- _entitys.append([_entity,_type])
- if len(_entitys)>=2:
- data_item = []
- _find_flag = False
- for i in range(len(str(entity))):
- _item = []
- _item.append(entity[i])
- _item.append("O")
- data_item.append(_item)
- for _entity_type in _entitys:
- _entity = _entity_type[0]
- _type = _entity_type[1]
- if _type not in ["person","company","org","job","time","location"]:
- continue
- for _index in findAllIndex(_entity, entity):
- _find_flag = True
- if len(_entity)==1:
- if hasNotBeenLabeled(data_item, _index, _entity):
- data_item[_index][1] = "S_"+_type
- else:
- if hasNotBeenLabeled(data_item, _index, _entity):
- for j in range(_index,_index+len(_entity)):
- if j==_index:
- data_item[j][1] = "B_"+_type
- elif j==_index+len(_entity)-1:
- data_item[j][1] = "E_"+_type
- else:
- data_item[j][1] = "M_"+_type
- if _find_flag:
- list_text_label.append(data_item)
-
- list_insert = [" ","根据","就","受",",",",","。",":",":","#","&","$","、","/","-","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","RR","S","TA","U","V","Wa","X","YG","Z","a","b","c","d","e","f","g"]
- for insert_item in list_insert:
- if np.random.random()>0.7:
- copy_data_item = copy.copy(data_item)
- list_index = []
- for i in range(len(copy_data_item)):
- _split = copy_data_item[i][1].split("_")
- if len(_split)==2:
- if _split[0]=="B":
- list_index.append(i)
- if _split[0]=="E":
- list_index.append(i+1)
- list_index.sort(key=lambda x:x,reverse=True)
- for _index in list_index:
- if np.random.random()>0.5:
- for j in range(len(insert_item)):
- copy_data_item.insert(_index+j,[insert_item[j],"O"])
- if np.random.random()>0.5:
- break
- list_text_label.append(copy_data_item)
-
- ''''''
- list_entity.sort(key=lambda x:len(x[0]),reverse=True)
-
- for _sent in text.split("。"):
- _sent+= "。"
-
- _find_flag = False
- data_item = []
- for i in range(len(str(_sent))):
- _item = []
- _item.append(_sent[i])
- _item.append("O")
- data_item.append(_item)
-
- for _entity_type in list_entity:
- _entity = _entity_type[0]
- _type = _entity_type[1]
- if _type not in ["person","company","org","job","time","location"]:
- continue
- for _index in findAllIndex(_entity, _sent):
- _find_flag = True
- if len(_entity)==1:
- if hasNotBeenLabeled(data_item, _index, _entity):
- data_item[_index][1] = "S_"+_type
- else:
- if hasNotBeenLabeled(data_item, _index, _entity):
- for j in range(_index,_index+len(_entity)):
- if j==_index:
- data_item[j][1] = "B_"+_type
- elif j==_index+len(_entity)-1:
- data_item[j][1] = "E_"+_type
- else:
- data_item[j][1] = "M_"+_type
- # decide whether to add this sentence to the training data based on whether it contains any entity
- if _find_flag:
- list_sent_label.append(data_item)
- else:
- if np.random.random()>0.9:
- list_sent_label.append(data_item)
-
- ''''''
- with codecs.open("ner_label.txt","w",encoding="utf8") as f:
- for _sent_label in list_sent_label:
- for _word,_label in _sent_label:
- f.write(_word+" "+_label+"\n")
- f.write("\n")
- f.flush()
-
- with codecs.open("ner_label_split.txt","w",encoding="utf8") as f:
- for _sent_label in list_text_label:
- for _word,_label in _sent_label:
- f.write(_word+" "+_label+"\n")
- f.write("\n")
- f.flush()
- return list_sent_label
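- # Output format note (descriptive of the writing loops above): ner_label.txt and
- # ner_label_split.txt contain one "character label" pair per line and an empty line
- # between samples, e.g.
- #   某 B_company
- #   公 M_company
- #   司 E_company
- #   。 O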
- class MyThread(threading.Thread):
- def __init__(self,func,args=()):
- super(MyThread,self).__init__()
- self.func = func
- self.args = args
- def run(self):
- self.result = self.func(*self.args)
- def get_result(self):
- try:
- return self.result # if the child thread is not joined first, self.result may not exist yet and an error is raised
- except Exception:
- return None
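- # Usage sketch (illustrative): run a function in a worker thread and collect its return value.
- #   t = MyThread(getNers, args=([list_sent],))
- #   t.start()
- #   t.join()
- #   ners = t.get_result()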
- def deal():
- list_articles = []
- path = "C:\\Users\\User\\Desktop\\fool语料\\*.html"
- set_doc_id = set()
- for file in glob.glob(path):
- filename = file.split("\\")[-1]
- doc_id = filename.split("_")[-1][:-5]
- text = codecs.open(file,"r",encoding="utf8").read()
- wrong_entity = "".join(filename.split("_")[:-1])
- if doc_id in set_doc_id:
- for item in list_articles:
- if doc_id==item[0]:
- item[2].append(wrong_entity)
- else:
-
- set_doc_id.add(doc_id)
- list_articles.append([doc_id,text,[wrong_entity]])
- save(list_articles,"list_filename_html_wrongEntity.pk")
-
- def dataSplit(data,parts=2):
- _index = 0
- part_len = len(data)//parts
- while(True):
- if _index+part_len<len(data):
- yield data[_index:_index+part_len]
- _index += part_len
- else:
- yield data[_index:]
- break
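- # Example (illustrative): dataSplit yields len(data)//parts items per chunk and puts the
- # remainder into the last chunk, e.g.
- #   list(dataSplit([1, 2, 3, 4, 5], parts=2)) -> [[1, 2], [3, 4], [5]]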
-
- def makeFoolTrainData():
- '''
- @summary: generate fool training data
- '''
- import psycopg2
- conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
- list_articles = []
- #list_path = ["C:\\Users\\User\\Desktop\\20190306要素\\*.html","C:\\Users\\User\\Desktop\\20190320要素\\*.html"]
- list_path = ["C:\\Users\\User\\Desktop\\data_20190703\\*.html"]
- set_doc_id = set()
- for path in list_path:
- for file in glob.glob(path):
- filename = file.split("\\")[-1]
- text = codecs.open(file,"r",encoding="utf8").read()
- if filename in set_doc_id:
- continue
- else:
- set_doc_id.add(filename)
- list_articles.append([filename,text])
- list_filename_text = []
- cursor = conn.cursor()
- _count = 0
- for article in list_articles:
- _count += 1
- print(str(_count)+"/"+str(len(list_articles)))
- doc_id = article[0]
- text = Preprocessing.segment(Preprocessing.tableToText(BeautifulSoup(article[1],"lxml")))
- list_filename_text.append([doc_id,text])
- list_sent = []
- for x in re.split("[。]", text):
- if len(x)>0:
- list_sent.append(x+"。")
- for n in getNers(list_sent,userselffool=True):
- for _entity in n:
- sql = " insert into fool_ner_train_1(filename,begin_index,end_index,type_0,text) values('"+str(doc_id)+"',"+str(_entity[0])+","+str(_entity[1])+",'"+str(_entity[2])+"','"+str(_entity[3])+"')"
- cursor.execute(sql)
- conn.commit()
- conn.close()
- save(list_filename_text,"list_filename_text_train_1.pk")
- return list_filename_text
- def makeLabel(sent,list_entity_type):
- _find_flag = False
- data_item = []
- list_entity_type.sort(key=lambda x:len(x[0]),reverse=True)
- for i in range(len(str(sent))):
- _item = []
- _item.append(sent[i])
- _item.append("O")
- data_item.append(_item)
-
- for _entity_type in list_entity_type:
- _entity = _entity_type[0]
- _type = _entity_type[1]
- if _type not in ["person","company","org","job","time","location"]:
- continue
- for _index in findAllIndex(_entity, sent):
- _find_flag = True
- if len(_entity)==1:
- if hasNotBeenLabeled(data_item, _index, _entity):
- data_item[_index][1] = "S_"+_type
- else:
- if hasNotBeenLabeled(data_item, _index, _entity):
- for j in range(_index,_index+len(_entity)):
- if j==_index:
- data_item[j][1] = "B_"+_type
- elif j==_index+len(_entity)-1:
- data_item[j][1] = "E_"+_type
- else:
- data_item[j][1] = "M_"+_type
- return data_item,_find_flag
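- # Example (illustrative, hypothetical input): makeLabel tags each character with the
- # S_/B_/M_/E_ scheme and leaves unmatched characters as "O", e.g.
- #   makeLabel("招标人:某公司。", [["某公司", "company"]])
- #   -> ([['招','O'],['标','O'],['人','O'],[':','O'],
- #        ['某','B_company'],['公','M_company'],['司','E_company'],['。','O']], True)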
- def makeTrainTxt():
- '''
- @summary: generate the training data text files
- '''
- import psycopg2
- conn = psycopg2.connect(dbname="article_label",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
- list_filename_text = load("list_filename_text_train.pk")
- list_sent_label = []
-
-
- list_text_label = []
- for filename_text in list_filename_text:
- filename = filename_text[0]
- text = filename_text[1]
-
-
- sql = " select text,type_0,new_text,new_type from fool_ner_train where filename='"+filename+"' group by text,type_0,new_text,new_type"
- print(sql)
- cursor.execute(sql)
- rows = cursor.fetchall()
- rows.sort(key=lambda x:len(x[0]))
- list_entity = []
- for row in rows:
- entity = row[0]
- type = row[1]
- new_entity = row[2]
- new_type = row[3]
- _entitys = []
- if new_type is None or new_type=="" or new_type=="nan":
- _type = type
- else:
- _type = new_type
- if new_entity=="1":
- continue
- elif new_entity is None or new_entity =="" or new_entity=="nan":
- list_entity.append([entity,_type])
- _entitys.append([entity,_type])
- else:
- for _entity in new_entity.split("##"):
- list_entity.append([_entity,_type])
- _entitys.append([_entity,_type])
-
- if len(_entitys)>=2:
- data_item = []
- _find_flag = False
- for i in range(len(str(entity))):
- _item = []
- _item.append(entity[i])
- _item.append("O")
- data_item.append(_item)
- for _entity_type in _entitys:
- _entity = _entity_type[0]
- _type = _entity_type[1]
- if _type not in ["person","company","org","job","time","location"]:
- continue
- for _index in findAllIndex(_entity, entity):
- _find_flag = True
- if len(_entity)==1:
- if hasNotBeenLabeled(data_item, _index, _entity):
- data_item[_index][1] = "S_"+_type
- else:
- if hasNotBeenLabeled(data_item, _index, _entity):
- for j in range(_index,_index+len(_entity)):
- if j==_index:
- data_item[j][1] = "B_"+_type
- elif j==_index+len(_entity)-1:
- data_item[j][1] = "E_"+_type
- else:
- data_item[j][1] = "M_"+_type
- if _find_flag:
- list_text_label.append(data_item)
-
- list_insert = ["根据","就",",",",","。",":",":"]
- for insert_item in list_insert:
- if np.random.random()>0.5:
- copy_data_item = copy.copy(data_item)
- list_index = []
- for i in range(len(copy_data_item)):
- _split = copy_data_item[i][1].split("_")
- if len(_split)==2:
- if _split[0]=="B":
- list_index.append(i)
- if _split[0]=="E":
- list_index.append(i+1)
- list_index.sort(key=lambda x:x,reverse=True)
- for _index in list_index:
- if np.random.random()>0.5:
- for j in range(len(insert_item)):
- copy_data_item.insert(_index+j,[insert_item[j],"O"])
- list_text_label.append(copy_data_item)
- list_insert = [" ","根据","就","受",",",",","。",":",":","#","&","$","、","/","-","A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","RR","S","TA","U","V","Wa","X","YG","Z","a","b","c","d","e","f","g"]
- for insert_item in list_insert:
- if np.random.random()>0.7:
- copy_data_item = copy.copy(data_item)
- list_index = []
- for i in range(len(copy_data_item)):
- _split = copy_data_item[i][1].split("_")
- if len(_split)==2:
- if _split[0]=="B":
- list_index.append(i)
- if _split[0]=="E":
- list_index.append(i+1)
- list_index.sort(key=lambda x:x,reverse=True)
- for _index in list_index:
- if np.random.random()>0.5:
- for j in range(len(insert_item)):
- copy_data_item.insert(_index+j,[insert_item[j],"O"])
- if np.random.random()>0.5:
- break
- list_text_label.append(copy_data_item)
- ''''''
- list_entity.sort(key=lambda x:len(x[0]),reverse=True)
-
-
- for _sent in text.split("。"):
- _sent+= "。"
-
- _find_flag = False
- data_item = []
- for i in range(len(str(_sent))):
- _item = []
- _item.append(_sent[i])
- _item.append("O")
- data_item.append(_item)
-
- for _entity_type in list_entity:
- _entity = _entity_type[0]
- _type = _entity_type[1]
- if _type not in ["person","company","org","job","time","location"]:
- continue
- for _index in findAllIndex(_entity, _sent):
- _find_flag = True
- if len(_entity)==1:
- if hasNotBeenLabeled(data_item, _index, _entity):
- data_item[_index][1] = "S_"+_type
- else:
- if hasNotBeenLabeled(data_item, _index, _entity):
- for j in range(_index,_index+len(_entity)):
- if j==_index:
- data_item[j][1] = "B_"+_type
- elif j==_index+len(_entity)-1:
- data_item[j][1] = "E_"+_type
- else:
- data_item[j][1] = "M_"+_type
- # decide whether to add this sentence to the training data based on whether it contains any entity
- if _find_flag:
- list_sent_label.append(data_item)
- else:
- if np.random.random()>0.9:
- list_sent_label.append(data_item)
-
-
- ''' '''
- with codecs.open("ner_train.txt","w",encoding="utf8") as f:
- for _sent_label in list_sent_label:
- for _word,_label in _sent_label:
- f.write(_word+" "+_label+"\n")
- f.write("\n")
- f.flush()
-
- with codecs.open("ner_train_split.txt","w",encoding="utf8") as f:
- for _sent_label in list_text_label:
- for _word,_label in _sent_label:
- f.write(_word+" "+_label+"\n")
- f.write("\n")
- f.flush()
-
- def _load_map_file(path, char_map_name, id_map_name):
- with ZipFile(path) as myzip:
- with myzip.open('all_map.json') as myfile:
- content = myfile.readline()
- content = content.decode()
- data = json.loads(content)
- return data.get(char_map_name), data.get(id_map_name)
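- # Assumed layout of data/map.zip (an assumption inferred from the keys used below, not
- # verified against the actual file): all_map.json holds one JSON object whose "char_map"
- # maps characters to ids (including an "<OOV>" entry) and whose "ner_map" maps tag ids
- # (as strings) to tag names such as "O", "B_company", "E_company".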
-
- def getContext(file):
- char_to_id, id_to_seg = _load_map_file("data/map.zip", "char_map", "ner_map")
- id_to_tag = {int(k):v for k,v in id_to_seg.items()}
- tag_to_id = {v:int(k) for k,v in id_to_seg.items()}
- list_sent_label = []
- with codecs.open(file,"r",encoding="utf8") as f:
- sentence = []
- while(True):
- line = f.readline()
- if not line:
- break
- if len(line)==1:
- if len(sentence)>0:
- list_sent_label.append(sentence)
- sentence = []
- else:
- _word_id = char_to_id.get(line[0]) if line[0] in char_to_id.keys() else char_to_id.get("<OOV>")
- _tag_id = tag_to_id.get(line.split()[-1].strip())
- sentence.append([_word_id,_tag_id])
- return list_sent_label
- def readlabeldata(file,list_context,MAX_LEN=300,keep_prob=1):
- '''
- @summary: read the labeled data from the file
- '''
-
- def addContext(_sentence,entity_sent,entity_label,id_B_company,id_E_company):
- _sent = []
- _label = []
- _flag = 0
- _find_flag = False
- for item in _sentence:
- if _flag==0:
- if item[1]==id_B_company:
- for word_id,tag_id in zip(entity_sent,entity_label):
- _sent.append(word_id)
- _label.append(tag_id)
- _flag = 1
- _find_flag = True
- else:
- _sent.append(item[0])
- _label.append(item[1])
- elif _flag==1:
- if item[1]==id_E_company:
- _flag = 2
- else:
- continue
- else:
- _sent.append(item[0])
- _label.append(item[1])
- return _sent,_label,_find_flag
-
- def spreadContext(_sent,_label,id_to_char,id_to_tag):
- list_sent_label = []
- for _word,_l in zip(_sent,_label):
- list_sent_label.append([id_to_char.get(_word),id_to_tag.get(_l)])
- print(list_sent_label)
-
- list_sent_label_lengths = []
- char_to_id, id_to_seg = _load_map_file("data/map.zip", "char_map", "ner_map")
- id_to_char = {int(v):k for k,v in char_to_id.items()}
- id_to_tag = {int(k):v for k,v in id_to_seg.items()}
- tag_to_id = {v:int(k) for k,v in id_to_seg.items()}
- id_B_company = tag_to_id.get("B_company")
- id_E_company = tag_to_id.get("E_company")
- with codecs.open(file,"r",encoding="utf8") as f:
- _sent = []
- _label = []
- while(True):
- line = f.readline()
- if not line:
- break
- if len(line)==1:
- if np.random.rand()<keep_prob:
- if len(_label)>0 and _label[0]==id_B_company and _label[-1]==id_E_company:
- if np.random.rand()<0.8:
- _int_random = np.random.randint(0,len(list_context))
- _sentence = list_context[_int_random]
- _sent_context,_label_context,_find_flag = addContext(_sentence, _sent, _label, id_B_company, id_E_company)
-
- if _find_flag:
- if len(_sent_context)<MAX_LEN:
- list_sent_label_lengths.append([_sent_context,_label_context,len(_sent_context)])
- else:
- if len(_sent)<MAX_LEN and len(_sent)>0:
- list_sent_label_lengths.append([_sent,_label,len(_sent)])
- '''
- print("====")
- spreadContext(_sent, _label, id_to_char, id_to_tag)
- spreadContext(_sent_context, _label_context, id_to_char, id_to_tag)
- print("====")
- '''
- else:
- if len(_sent)<MAX_LEN and len(_sent)>0:
- list_sent_label_lengths.append([_sent,_label,len(_sent)])
- _sent = []
- _label = []
- else:
- _sent.append(char_to_id.get(line[0]) if line[0] in char_to_id.keys() else char_to_id.get("<OOV>"))
- tag = line.split()[-1].strip()
- _label.append(tag_to_id.get(tag))
- return list_sent_label_lengths
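- # Usage sketch (mirrors the commented-out call in __main__ below): augment bare entity
- # samples with sentence context drawn from previously labeled data.
- #   list_context = getContext("ner_train.txt")
- #   samples = readlabeldata("cleanedEntity.txt", list_context, MAX_LEN=300, keep_prob=1)
- #   # each sample is [char_ids, tag_ids, length]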
- def gt3():
- '''
- @summary: get the records whose wrong-entity label is longer than 3 characters
- '''
- list_articles = []
- list_filename_html_wrongEntity = load("list_filename_html_wrongEntity.pk")
- for row in list_filename_html_wrongEntity:
- if len(row[2])>1 or len(row[2][0])>3:
- list_articles.append(row)
- print(len(list_articles))
- save(list_articles,"list_filename_html_wrongEntity_gt3.pk")
-
- def selectByRule():
- '''
- @summary: query records matching the rules from the database, to make corrections easier
- '''
- conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
-
- tables = ["fool_ner","fool_ner_train"]
- #tables = ["fool_ner"]
-
- for table in tables:
- sql = " select filename,type_0,text,new_type,new_text from "+table+" where ctid in (select max(ctid) from "+table+" where type_0 in ('org','company','location') group by filename,text) order by text "
- cursor.execute(sql)
-
- rows = cursor.fetchall()
- list_filename = []
- list_type_0 = []
- list_text = []
- list_new_type = []
- list_new_text = []
- pattern = "室"
- list_keyword = ["厂","所","出","院","部","行","局","社","采招办","酒店","办事处","分理处","管理处","集团","组织","支队","部队","支行","银行","支局","分行","分公司","公司","中心","医院","卫生院","小学","中学","大学","学校","政府","委员会","委会","财政局"]
- list_second_keyword = ["处","厅","园","委","队","室","站","会","办","馆","共和国","科技"]
- for row in rows:
- filename = row[0]
- type_0 = row[1]
- entity = row[2]
- new_type = row[3]
- new_entity = row[4]
-
- list_entity = []
- if new_type is None or new_type=="" or new_type=="nan":
- _type = type_0
- else:
- _type = new_type
- if new_entity=="1":
- continue
- elif new_entity is None or new_entity =="" or new_entity=="nan":
- list_entity.append([entity,_type,new_entity])
- else:
- for _entity in new_entity.split("##"):
- list_entity.append([_entity,_type,entity])
- _flag = False
- _index = 0
-
-
- for _entity in list_entity:
- '''
- if re.search('监狱.{,4}$',entity) is not None:
- _flag = True
- '''
- if (len(entity)>2 and entity[-1]==entity[-2]) or (len(entity)>4 and entity[-4:-2]==entity[-2:]):
- _flag = True
-
- '''
- pattern = "|".join(list_keyword)
- for _iter in re.finditer(pattern,text):
- if _iter.span()[1]>_index:
- _index = _iter.span()[1]
- new_text = text[:_index]
- if _index == 0:
- for _iter in re.finditer("|".join(list_second_keyword),text):
- if _iter.span()[1]>_index:
- _index = _iter.span()[1]
- new_text = text[:_index]
- '''
- '''
- for keyword in list_keyword:
- if _flag:
- break
- allindex = findAllIndex(keyword, text)
- if len(allindex)>0:
- _flag = True
- _index = allindex[-1]+len(keyword)
- new_text = text[:_index]
- '''
- if _flag:
- list_filename.append(filename)
- list_type_0.append(type_0)
- list_text.append(entity)
- list_new_type.append(new_type)
- list_new_text.append(new_entity)
- data = {"list_filename":list_filename,"list_type_0":list_type_0,"list_text":list_text,"list_new_type":list_new_type,"list_new_text":list_new_text}
- df = pd.DataFrame(data,columns=["list_filename","list_type_0","list_text","list_new_type","list_new_text"])
- df.to_excel(table+".xls")
-
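- # Note on the duplicate-tail rule in selectByRule above (examples are illustrative):
- # (len(entity)>2 and entity[-1]==entity[-2]) flags entities like "某某公司司", and
- # (len(entity)>4 and entity[-4:-2]==entity[-2:]) flags entities like "某某公司公司",
- # both of which usually indicate truncation/duplication errors in the recognizer output.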
- def makeDict_filename_content():
-
- dict_filename_content = {}
- path = "C:\\Users\\User\\Desktop\\fool语料\\*.html"
- set_doc_id = set()
- for file in glob.glob(path):
- filename = file.split("\\")[-1]
- doc_id = filename.split("_")[-1][:-5]
- text = codecs.open(file,"r",encoding="utf8").read()
- dict_filename_content[doc_id] = text
- list_path = ["C:\\Users\\User\\Desktop\\20190416要素\\*.html","C:\\Users\\User\\Desktop\\20190306要素\\*.html","C:\\Users\\User\\Desktop\\20190320要素\\*.html","C:\\Users\\User\\Desktop\\data_20190703\\*.html","C:\\Users\\User\\Desktop\\20190715\\*.html"]
- for path in list_path:
- for file in glob.glob(path):
- filename = file.split("\\")[-1]
- text = codecs.open(file,"r",encoding="utf8").read()
- dict_filename_content[filename] = text
- save(dict_filename_content,"dict_filename_content.pk")
-
- def importLabelData():
- conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
-
- for file in glob.glob("label/*.xls"):
-
- if len(file.split("_"))>1:
- table = "fool_ner"
- else:
- table = "fool_ner_train"
- print(file,table)
- df = pd.read_excel(file)
- for filename,type_0,text,new_type,new_text in zip(df["list_filename"],df["list_type_0"],df["list_text"],df["list_new_type"],df["list_new_text"]):
- sql = " insert into "+table+" (filename,type_0,text,new_type,new_text) values('"+str(filename).replace(".0","")+"','"+str(type_0)+"','"+str(text)+"','"+str(new_type)+"','"+str(new_text)+"')"
- #sql = " update "+table+" set new_text='"+str(new_text)+"',new_type='"+str(new_type)+"' where filename='"+str(filename)+"' and text='"+str(text)+"' "
- cursor.execute(sql)
- conn.commit()
- conn.close()
-
- def checklabel():
- '''
- @summary: check whether the labels are annotated correctly
- '''
- with codecs.open("ner_train.txt","r",encoding="utf8") as f:
- a = ""
- b = ""
- c = ""
- _index = 0
- while(True):
- _index += 1
- line = f.readline()
- if not line:
- break
- c = line.split(" ")[0].strip()
- if a=="新" and b=="乡" and c=="华":
- print(_index)
- a = b
- b = c
- def updateLabel():
- '''
- @summary: update the labeled data
- '''
- conn = psycopg2.connect(dbname="article_label", user="postgres", password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
- tables = ["fool_ner","fool_ner_train"]
- for table in tables:
- file = table+".xls"
- df = pd.read_excel(file)
- for filename,type_0,text,new_type,new_text in zip(df["list_filename"],df["list_type_0"],df["list_text"],df["list_new_type"],df["list_new_text"]):
- sql = " update "+table+" set new_type='"+str(new_type)+"',new_text='"+str(new_text)+"' where filename='"+str(filename).replace(".0","")+"' and text='"+str(text)+"'"
- print(sql)
- cursor.execute(sql)
- conn.commit()
- conn.close()
-
- def makeCertainEntity():
- fileList = ["C:\\Users\\User\\Desktop\\cleanedEntity.tsv","C:\\Users\\User\\Desktop\\company_found.tsv"]
- for file in fileList:
- outfile = file.split(".")[0]+".txt"
- with codecs.open(outfile,"w",encoding="utf8") as f_w:
- with codecs.open(file,"r",encoding="utf8") as f:
- while(True):
- line = f.readline().strip()
- if not line:
- break
- for i in range(len(line)):
- if i==0:
- f_w.write(line[i]+" B_company\n")
- elif i==len(line)-1:
- f_w.write(line[i]+" E_company\n")
- f_w.write("\n")
- else:
- f_w.write(line[i]+" M_company\n")
-
-
- def addContextToTheEntity(entity_file):
-
- def getContext(file):
- list_sent_label = []
- with codecs.open(file,"r",encoding="utf8") as f:
- sentence = []
- while(True):
- line = f.readline()
- if not line:
- break
- if len(line)==1:
- list_sent_label.append(sentence)
- sentence = []
- else:
- sentence.append([line[0],line.split()[-1].strip()])
- return list_sent_label
- list_sent_label = getContext("ner_label.txt")
- print("getContent done",len(list_sent_label))
- context_len = len(list_sent_label)
- outputfile = entity_file.split(".")[0]+"_addContext.txt"
- with codecs.open(outputfile,"w",encoding="utf8") as f_w:
- with codecs.open(entity_file,"r",encoding="utf8") as f_r:
- while(True):
- entity = f_r.readline().strip()
- if not entity:
- break
- random_int = np.random.randint(0,context_len)
- _sentence = list_sent_label[random_int]
- _flag = 0
- for item in _sentence:
- if _flag==0:
- if item[1]=="B_company":
- for word_index in range(len(entity)):
- if word_index==0:
- f_w.write(entity[word_index]+" B_company\n")
- elif word_index==len(entity)-1:
- f_w.write(entity[word_index]+" E_company\n")
- else:
- f_w.write(entity[word_index]+" M_company\n")
- _flag = 1
- else:
- f_w.write(item[0]+" "+item[1]+"\n")
- elif _flag==1:
- if item[1]=="E_company":
- _flag = 2
- else:
- continue
- else:
- f_w.write(item[0]+" "+item[1]+"\n")
- f_w.write("\n")
- def makeContext_by_fool_selffool():
- '''
- @summary: use the fool and selffool results to judge whether a sentence was recognized correctly; if fool and selffool agree it is treated as correct, otherwise it is left undecided
- '''
- import psycopg2
- conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
- list_filename_text = []
- cursor = conn.cursor()
- file_index = 0
- for file in glob.glob("C:\\Users\\User\\Desktop\\测试数据20190812\\*.html"):
- try:
- filename = file.split("\\")[-1]
-
- sql = " select count(1) from articles_processed_selffool where id='"+filename+"'"
- cursor.execute(sql)
- rows = cursor.fetchall()
- if rows[0][0]>0:
- continue
-
- content = codecs.open(file,"r",encoding="utf8").read()
- print(file_index,filename)
- text = Preprocessing.segment(Preprocessing.tableToText(BeautifulSoup(content,"lxml")))
- _article = Article(id=filename, content=text, sourceContent="", doc_id="", title="")
- persistArticle(conn,[_article],"articles_processed_selffool")
- list_sentences = []
- _sent_index = 0
- set_sentences = set()
- for x in re.split("[。]", text):
- if len(x)>0:
- if x in set_sentences:
- continue
- set_sentences.add(x)
- _sentence = Sentences(doc_id=filename,sentence_index=_sent_index,sentence_text=x+"。",tokens=[],pos_tags=[],ner_tags=[])
- list_sentences.append(_sentence)
-
-
- _ner_fool = fool.ner(_sentence.sentence_text)
- _ner_selffool = Preprocessing.selffool.ner(_sentence.sentence_text)
-
- if len(set(_ner_fool[0])&set(_ner_selffool[0]))==len(_ner_fool[0]):
- table_entity = "entity_mention_selffool"
- else:
- table_entity = "entity_mention_selffool_notsame"
-
- list_entitys = []
- for item in _ner_selffool[0]:
- _entity_id = filename+"_"+str(_sent_index)+"_"+str(item[0])+"_"+str(item[1])
- _entity = Entity(doc_id=filename,entity_id=_entity_id,entity_text=item[3],entity_type=item[2],sentence_index=_sent_index,begin_index=item[0],end_index=item[1])
- list_entitys.append(_entity)
- persistEntity(conn,list_entitys,table_entity)
- _sent_index += 1
- persistSentence(conn,list_sentences,"sentences_selffool")
- conn.commit()
- except Exception as e:
- print(e)
- conn.close()
- conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
- finally:
- file_index += 1
-
- conn.close()
- def makeCompare():
- '''
- @summary: judge confidence by comparing fool with multiple selffool versions
- '''
-
-
- bilstm_new = BiLSTM()
- path_add = "new_model/"
- path = 'model/'+path_add+'model.ckpt'
- bilstm_new.restore(path)
-
-
- conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
-
- sql = " select doc_id,sentence_index,sentence_text from sentences_selffool A where exists(select 1 from entity_mention_selffool_notsame B where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and fool_version='selffool') and not exists(select 1 from entity_mention_selffool_notsame B where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index and fool_version='fool') "
-
- cursor.execute(sql)
- rows = cursor.fetchall()
- table_entity = "entity_mention_selffool_notsame"
- _index = 0
- try:
- for row in rows:
- _index += 1
- print(_index,len(rows))
- doc_id = row[0]
- sentence_index = row[1]
- '''
- sql = " select count(1) from "+table_entity+" where doc_id='"+doc_id+"' and sentence_index="+str(sentence_index)+" and fool_version='fool' "
- cursor.execute(sql)
- count_rows = cursor.fetchall()
- if count_rows[0][0]>0:
- continue
- '''
- text = row[2]
- _ner_entity_fool = set()
- _ner_entity_selffool = set()
- _ner_fool = fool.ner(text)[0]
- _ner_selffool = bilstm_new.ner(text)[0]
-
- list_entitys = []
- for item in _ner_fool:
- _entity_id = doc_id+"_"+str(sentence_index)+"_"+str(item[0])+"_"+str(item[1])
- _entity = Entity(doc_id=doc_id,entity_id=_entity_id,entity_text=item[3],entity_type=item[2],sentence_index=sentence_index,begin_index=item[0],end_index=item[1])
- list_entitys.append(_entity)
- persistEntity(conn,list_entitys,table_entity)
-
- conn.commit()
- '''
- for item in _ner_fool:
- if item[2] in ["org","company"]:
- _ner_entity_fool.add(item)
- for item in _ner_selffool:
- if item[2] in ["org","company"]:
- _ner_entity_selffool.add(item)
- if len(_ner_entity_fool&_ner_entity_selffool)==len(_ner_entity_fool) and len(_ner_entity_fool)==len(_ner_entity_selffool):
- print(text)
- print(_ner_selffool)
- '''
- except Exception as e:
- print(e)
- conn.close()
- conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
-
- conn.close()
- def cluster_difference():
- '''
- @summary: cluster the truncated entity tails
- '''
- conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
-
- sql = " select entity_id,doc_id,sentence_index,begin_index,end_index,entity_type,entity_text,fool_version from entity_mention_selffool_notsame where entity_type in ('org','company') order by entity_id "
- cursor.execute(sql)
- rows = cursor.fetchall()
-
- row_begin = 0
- DIFF_LEN = 2
- dict_diff_list = dict()
- while(row_begin<len(rows)-1):
- print(row_begin,len(rows))
- doc_id = rows[row_begin][1]
- sentence_index = rows[row_begin][2]
- row_end = row_begin
- for _i in range(row_begin+1,len(rows)):
- row_end = _i
- if rows[_i][1]==doc_id and rows[_i][2]==sentence_index:
- continue
- else:
- break
-
- list_entitys_fool = []
- list_entitys_selffool = []
- # group the entities belonging to the same sentence of the same article
- for _row in rows[row_begin:row_end]:
- entity_id = _row[0]
- begin_index = _row[3]
- end_index = _row[4]-1
- entity_type = _row[5]
- entity_text = _row[6]
- fool_version = _row[7]
- if entity_type in ["org","company"]:
- _entity = [entity_id,begin_index,end_index,entity_text,entity_type]
- if fool_version=="fool":
- list_entitys_fool.append(_entity)
- else:
- list_entitys_selffool.append(_entity)
- row_begin = row_end
- # iterate over the fool and selffool results separately and cluster the differences
- list_key_entityid = []
- for _entity_fool in list_entitys_fool:
- entity_id_src = _entity_fool[0]
- begin_index_src = _entity_fool[1]
- end_index_src = _entity_fool[2]
- entity_text_src = _entity_fool[3]
- entity_type_src = _entity_fool[4]
-
- for _entity_selffool in list_entitys_selffool:
- entity_id_des = _entity_selffool[0]
- begin_index_des = _entity_selffool[1]
- end_index_des = _entity_selffool[2]
- entity_text_des = _entity_selffool[3]
- entity_type_des = _entity_selffool[4]
-
- if min(end_index_src,end_index_des)>max(begin_index_des,begin_index_src):
- if begin_index_src==begin_index_des:
- _key_begin = "SAME"
- else:
- _key_begin = entity_text_src[0:min(DIFF_LEN,len(entity_text_src))]+"#"+entity_text_des[0:min(DIFF_LEN,len(entity_text_des))]
- if end_index_src==end_index_des:
- _key_end = "SAME"
- else:
- _key_end = entity_text_src[-min(DIFF_LEN,len(entity_text_src)):]+"#"+entity_text_des[-min(DIFF_LEN,len(entity_text_des)):]
- _key = _key_begin+"|"+_key_end
- list_key_entityid.append([_key,[entity_id_src,entity_id_des]])
- # check whether the entity appears in only one version
- for _entity_fool in list_entitys_fool:
- entity_id_src = _entity_fool[0]
- begin_index_src = _entity_fool[1]
- end_index_src = _entity_fool[2]
- entity_text_src = _entity_fool[3]
- entity_type_src = _entity_fool[4]
-
- find_flag = False
- for item in list_key_entityid:
- if entity_id_src in item[1]:
- find_flag = True
- if not find_flag:
- _key = "fool|"+entity_text_src[-min(DIFF_LEN,len(entity_text_src)):]
- list_key_entityid.append([_key,[entity_id_src]])
- for _entity_fool in list_entitys_selffool:
- entity_id_src = _entity_fool[0]
- begin_index_src = _entity_fool[1]
- end_index_src = _entity_fool[2]
- entity_text_src = _entity_fool[3]
- entity_type_src = _entity_fool[4]
-
- find_flag = False
- for item in list_key_entityid:
- if entity_id_src in item[1]:
- find_flag = True
- if not find_flag:
- _key = "selffool|"+entity_text_src[-min(DIFF_LEN,len(entity_text_src)):]
- list_key_entityid.append([_key,[entity_id_src]])
- # clustering
- for item in list_key_entityid:
- find_flag = False
- if item[0]=="SAME|SAME":
- continue
- for _key in dict_diff_list.keys():
- if item[0]==_key:
- dict_diff_list[_key].append(item[1])
- find_flag = True
- if not find_flag:
- dict_diff_list[item[0]] = [item[1]]
-
- print(len(dict_diff_list.keys()))
- list_key_count = []
- for _key in dict_diff_list.keys():
- list_key_count.append([_key,len(dict_diff_list[_key])])
- list_key_count.sort(key=lambda x:x[1],reverse=True)
- with codecs.open("diff_key_count.txt","w",encoding="utf8") as f:
- for item in list_key_count:
- f.write(item[0]+"\t"+str(item[1])+"\n")
-
- save(dict_diff_list,"dict_diff_list.pk")
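- # Cluster key format (derived from the code above): each key is "<begin-diff>|<end-diff>",
- # where a side is "SAME" when fool and selffool agree on that boundary and otherwise the
- # first/last DIFF_LEN characters of both entity texts joined by "#" (e.g. "市人#某某|SAME");
- # keys of the form "fool|..." or "selffool|..." mark entities found by only one version.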
-
- dict_sentence = None
- def get_sentence(doc_id,sentence_index):
- global dict_sentence
- file_dict_sentence = "dict_sentence.pk"
- if dict_sentence is None:
- if os.path.exists(file_dict_sentence):
- dict_sentence = load(file_dict_sentence)
- else:
- conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
-
- sql = " select doc_id,sentence_index,sentence_text from sentences_selffool "
- cursor.execute(sql)
-
- dict_sentence = dict()
- rows = cursor.fetchall()
- for row in rows:
- _doc_id = row[0]
- _sentence_index = row[1]
- _sentence_text = row[2]
- _key = _doc_id+str(_sentence_index)
- dict_sentence[_key] = _sentence_text
- save(dict_sentence,file_dict_sentence)
- _key = doc_id+str(sentence_index)
- if _key in dict_sentence.keys():
- return dict_sentence[_key]
- return None
- dict_diff_list = None
- def viewEntityByKey():
- global dict_diff_list
- if dict_diff_list is None:
- dict_diff_list = load("dict_diff_list.pk")
- CONTEXT_LEN = 15
- for key in dict_diff_list.keys():
- diff_list = dict_diff_list[key]
- file = "cluster_view/"+re.sub("[\*\|\/\r\n:]","",key.strip())+".xls"
- if os.path.exists(file):
- continue
- list_entityid = []
- list_before = []
- list_center = []
- list_after = []
- list_change = []
- list_type = []
- list_version = []
- if len(diff_list[0])==2:
- for item in diff_list:
- for i in range(len(item)):
- if i==0:
- list_version.append("fool")
- else:
- list_version.append("selffool")
- entityid = item[i]
- split_entityid = entityid.split("html")[1].split("_")
- doc_id = entityid.split("html")[0]+"html"
- sentence_index = split_entityid[1]
- sentence_text = get_sentence(doc_id, sentence_index)
- begin_index = int(split_entityid[2])
- end_index = int(split_entityid[3])-1
- list_entityid.append(entityid)
- before = sentence_text[max(0,begin_index-CONTEXT_LEN):begin_index]
- center = sentence_text[begin_index:end_index]
- after = sentence_text[end_index:min(end_index+CONTEXT_LEN,len(sentence_text))]
- list_before.append(before)
- list_center.append(center)
- list_after.append(after)
- list_change.append(center)
- list_type.append("")
- else:
- version = key.split("|")[0]
- for item in diff_list:
- list_version.append(version)
- entityid = item[0]
- split_entityid = entityid.split("html")[1].split("_")
- doc_id = entityid.split("html")[0]+"html"
- sentence_index = split_entityid[1]
- sentence_text = get_sentence(doc_id, sentence_index)
- begin_index = int(split_entityid[2])
- end_index = int(split_entityid[3])-1
- list_entityid.append(entityid)
- before = sentence_text[max(0,begin_index-CONTEXT_LEN):begin_index]
- center = sentence_text[begin_index:end_index]
- after = sentence_text[end_index:min(end_index+CONTEXT_LEN,len(sentence_text))]
- list_before.append(before)
- list_center.append(center)
- list_after.append(after)
- list_change.append(center)
- list_type.append("")
- data = {"list_entityid":list_entityid,
- "list_before":list_before,
- "list_center":list_center,
- "list_after":list_after,
- "list_change":list_change,
- "list_type":list_type,
- "list_version":list_version}
- df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
- df.to_excel(file)
-
- def alterFileByRule(file):
- df = pd.read_excel(file)
- _location = "location"
- for _index in range(len(df["list_entityid"])):
- version = df["list_version"][_index]
- if version=="selffool":
- ''''''
-
- df["list_change"][_index] = df["list_change"][_index-1]
-
- df.to_excel(file,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
- def getCompanyByTianyan():
- token = "b775e2ed-d919-4d5f-8ab1-406d82d6bb56"
- headers = {"Authorization":token}
- url = "http://open.api.tianyancha.com/services/v4/open/searchV2?word="
- _inTianYan = "inTianYan"
- _inSource = "inSource"
- _dict = load("dict_company.pk")
-
-
- count = 0
- for entity in load("set_company.pk"):
- try:
- count += 1
- print(count,len(_dict.keys()))
- if entity in _dict:
- _dict[entity][_inSource] = True
- if _dict[entity][_inTianYan]:
- continue
- else:
- _dict[entity] = {_inTianYan:False,_inSource:True}
- r = requests.get(url+entity,headers=headers)
- r_json = r.json()
- if r_json["error_code"]==0:
- for item in r_json["result"]["items"]:
- companyName = re.sub("</?em>","",item["name"]).replace("(","(").replace(")",")")
- if companyName in _dict:
- _dict[companyName][_inTianYan] = True
- else:
- _dict[companyName] = {_inTianYan:True,_inSource:False}
- elif r_json["error_code"]==300007:
- print("剩余次数不足")
- break
- except Exception as e:
- print(str(e))
- save(_dict,"dict_company.pk")
-
- def labelByTianyan():
- '''
- @summary: obtain labels via the Tianyancha data API
- '''
- list_entityid = []
- list_before = []
- list_center = []
- list_after = []
- list_change = []
- list_type = []
- list_version = []
-
- list_entityid_notmatch = []
- list_before_notmatch = []
- list_center_notmatch = []
- list_after_notmatch = []
- list_change_notmatch = []
- list_type_notmatch = []
- list_version_notmatch = []
- _inTianYan = "inTianYan"
- _inSource = "inSource"
- _dict_company = load("dict_company.pk")
- is_compare = False
- for file in glob.glob("cluster_view/add/*.xls"):
- df = pd.read_excel(file)
- for _index in range(len(df["list_change"])):
- version = df["list_version"][_index]
- if version in ["selffool","fool"]:
- _match_count = 0
- true_entity = None
- if df["list_change"][_index] in _dict_company:
- if _dict_company[df["list_change"][_index]][_inTianYan]:
- _match_count += 1
- true_entity = df["list_change"][_index]
- if is_compare:
- if df["list_change"][_index-1] in _dict_company:
- if _dict_company[df["list_change"][_index-1]][_inTianYan]:
- _match_count += 1
- true_entity = df["list_change"][_index-1]
- if _match_count==1:
- if is_compare:
- list_entityid.append(df["list_entityid"][_index-1])
- list_before.append(df["list_before"][_index-1])
- list_center.append(df["list_center"][_index-1])
- list_after.append(df["list_after"][_index-1])
- list_change.append(df["list_change"][_index-1])
- list_type.append(df["list_type"][_index-1])
- list_version.append(df["list_version"][_index-1])
- list_entityid.append(df["list_entityid"][_index])
- list_before.append(df["list_before"][_index])
- list_center.append(df["list_center"][_index])
- list_after.append(df["list_after"][_index])
- list_change.append(true_entity)
- list_type.append(df["list_type"][_index])
- list_version.append(df["list_version"][_index])
- else:
- if is_compare:
- list_entityid_notmatch.append(df["list_entityid"][_index-1])
- list_before_notmatch.append(df["list_before"][_index-1])
- list_center_notmatch.append(df["list_center"][_index-1])
- list_after_notmatch.append(df["list_after"][_index-1])
- list_change_notmatch.append(df["list_change"][_index-1])
- list_type_notmatch.append(df["list_type"][_index-1])
- list_version_notmatch.append(df["list_version"][_index-1])
- list_entityid_notmatch.append(df["list_entityid"][_index])
- list_before_notmatch.append(df["list_before"][_index])
- list_center_notmatch.append(df["list_center"][_index])
- list_after_notmatch.append(df["list_after"][_index])
- list_change_notmatch.append(df["list_change"][_index])
- list_type_notmatch.append(df["list_type"][_index])
- list_version_notmatch.append(df["list_version"][_index])
- data = {"list_entityid":list_entityid,
- "list_before":list_before,
- "list_center":list_center,
- "list_after":list_after,
- "list_change":list_change,
- "list_type":list_type,
- "list_version":list_version}
- df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
- df.to_excel("cluster_view/add_match.xls")
-
- nums = 50000
- _begin = 0
- while(_begin<len(list_entityid_notmatch)):
- data = {"list_entityid":list_entityid_notmatch[_begin:_begin+nums],
- "list_before":list_before_notmatch[_begin:_begin+nums],
- "list_center":list_center_notmatch[_begin:_begin+nums],
- "list_after":list_after_notmatch[_begin:_begin+nums],
- "list_change":list_change_notmatch[_begin:_begin+nums],
- "list_type":list_type_notmatch[_begin:_begin+nums],
- "list_version":list_version_notmatch[_begin:_begin+nums]}
- df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
- df.to_excel("cluster_view/add_notmatch_"+str(_begin)+".xls")
- _begin += nums
-
- def cluster_entitys():
- '''
- @summary: cluster the entities and apply consistent truncation
- '''
- tail_pattern = re.compile("学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
-
- dict_tail_entitys = {}
- listfile = ["cluster_view/*_match.xls","cluster_view/done/*.xls","cluster_view/tofix/done/*.xls"]
- count = 0
- for globfile in listfile:
- for file in glob.glob(globfile):
- isadd = re.search("fool|add",file) is not None
- count += 1
- print(count,file)
- df = pd.read_excel(file)
- list_entityid = df["list_entityid"]
- list_before = df["list_before"]
- list_center = df["list_center"]
- list_after = df["list_after"]
- list_change = df["list_change"]
- list_type = df["list_type"]
- list_version = df["list_version"]
- for _index in range(len(list_entityid)):
- '''
- #skip entries labeled as 1
- if str(list_change[_index])=="1":
- continue
- '''
- # skip the fool rows that are only used for comparison labeling
- if not isadd and list_version[_index]=="fool":
- continue
- if str(list_change[_index])=="1":
- _key = "-1-"
- else:
- _find = re.findall(tail_pattern,str(list_change[_index]))
- if len(_find)==0:
- _key = "other"
- else:
- _key = "-".join(_find)
- if _key in dict_tail_entitys:
- dict_tail_entitys[_key]["list_entityid"].append(list_entityid[_index])
- dict_tail_entitys[_key]["list_before"].append(list_before[_index])
- dict_tail_entitys[_key]["list_center"].append(list_center[_index])
- dict_tail_entitys[_key]["list_after"].append(list_after[_index])
- dict_tail_entitys[_key]["list_change"].append(list_change[_index])
- dict_tail_entitys[_key]["list_type"].append(list_type[_index])
- dict_tail_entitys[_key]["list_version"].append(list_version[_index])
- else:
- dict_tail_entitys[_key] = {"list_entityid":[list_entityid[_index]],
- "list_before":[list_before[_index]],
- "list_center":[list_center[_index]],
- "list_after":[list_after[_index]],
- "list_change":[list_change[_index]],
- "list_type":[list_type[_index]],
- "list_version":[list_version[_index]]}
- print(len(dict_tail_entitys.keys()))
- for _key in dict_tail_entitys.keys():
-
- filename = "cluster_view/cluster/"+_key+".xls"
- nums = 50000
- _begin = 0
- if os.path.exists(filename):
- continue
- while(_begin*nums<len(dict_tail_entitys[_key]["list_entityid"])):
- data = {"list_entityid":dict_tail_entitys[_key]["list_entityid"][_begin*nums:(_begin+1)*nums],
- "list_before":dict_tail_entitys[_key]["list_before"][_begin*nums:(_begin+1)*nums],
- "list_center":dict_tail_entitys[_key]["list_center"][_begin*nums:(_begin+1)*nums],
- "list_after":dict_tail_entitys[_key]["list_after"][_begin*nums:(_begin+1)*nums],
- "list_change":dict_tail_entitys[_key]["list_change"][_begin*nums:(_begin+1)*nums],
- "list_type":dict_tail_entitys[_key]["list_type"][_begin*nums:(_begin+1)*nums],
- "list_version":dict_tail_entitys[_key]["list_version"][_begin*nums:(_begin+1)*nums]}
- df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
- df.to_excel("cluster_view/cluster/"+_key+"-"+str(_begin)+".xls")
- _begin += 1
-
- def tofix():
- '''
- @summary: fetch all data to be labeled, filter some out with rules, then split it evenly
- '''
- conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
- sql = " select entity_id,fool_version,entity_text from entity_mention_selffool_notsame"
- cursor.execute(sql)
-
- dict_idversion_entity = {}
- rows = cursor.fetchall()
- for row in rows:
- entity_id = row[0]
- fool_version = row[1]
- entity_text = row[2]
- _key = entity_id+"-"+fool_version
- dict_idversion_entity[_key] = entity_text
-
- list_entityid_tofix = []
- list_before_tofix = []
- list_center_tofix = []
- list_after_tofix = []
- list_change_tofix = []
- list_type_tofix = []
- list_version_tofix = []
-
- list_entityid_nottofix = []
- list_before_nottofix = []
- list_center_nottofix = []
- list_after_nottofix = []
- list_change_nottofix = []
- list_type_nottofix = []
- list_version_nottofix = []
-
- nottofix_pattern = "^[A-Za-z0-9\-]*[省市区县州镇]|^[A-Za-z0-9\-]+$"
- listfile = ["cluster_view/*notmatch*.xls"]
- count = 0
- data = []
- for globfile in listfile:
- for file in glob.glob(globfile):
- isadd = re.search("fool|add",file) is not None
- count += 1
- print(count)
- df = pd.read_excel(file)
- list_entityid = df["list_entityid"]
- list_before = df["list_before"]
- list_center = df["list_center"]
- list_after = df["list_after"]
- list_change = df["list_change"]
- list_type = df["list_type"]
- list_version = df["list_version"]
- for _index in range(len(list_entityid)):
- if not isadd and list_version[_index]=="fool":
- continue
- _key = str(list_entityid[_index])+"-"+str(list_version[_index])
- if _key in dict_idversion_entity and dict_idversion_entity[_key]!=list_center[_index]:
- list_center[_index] = dict_idversion_entity[_key]
- list_change[_index] = dict_idversion_entity[_key]
- data.append([str(list_entityid[_index]),str(list_before[_index]),str(list_center[_index]),str(list_after[_index]),str(list_change[_index]),str(list_type[_index]),str(list_version[_index])])
- data.sort(key=lambda x:x[4])
-
- for item in data:
- entityid = item[0]
- before = item[1]
- center = item[2]
- after = item[3]
- change = item[4]
- type = item[5]
- version = item[6]
- if re.search(nottofix_pattern,change) is not None:
- list_entityid_nottofix.append(entityid)
- list_before_nottofix.append(before)
- list_center_nottofix.append(center)
- list_after_nottofix.append(after)
- list_change_nottofix.append(change)
- list_type_nottofix.append(type)
- list_version_nottofix.append(version)
- else:
- list_entityid_tofix.append(entityid)
- list_before_tofix.append(before)
- list_center_tofix.append(center)
- list_after_tofix.append(after)
- list_change_tofix.append(change)
- list_type_tofix.append(type)
- list_version_tofix.append(version)
-
- parts = 16
- nums = len(list_entityid_tofix)//parts
- _begin = 0
- while(_begin*nums<len(list_entityid_tofix)):
- data = {"list_entityid":list_entityid_tofix[_begin*nums:(_begin+1)*nums],
- "list_before":list_before_tofix[_begin*nums:(_begin+1)*nums],
- "list_center":list_center_tofix[_begin*nums:(_begin+1)*nums],
- "list_after":list_after_tofix[_begin*nums:(_begin+1)*nums],
- "list_change":list_change_tofix[_begin*nums:(_begin+1)*nums],
- "list_type":list_type_tofix[_begin*nums:(_begin+1)*nums],
- "list_version":list_version_tofix[_begin*nums:(_begin+1)*nums]}
- df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
- df.to_excel("cluster_view/tofix/"+str(_begin)+".xls")
- _begin += 1
- data = {"list_entityid":list_entityid_nottofix,
- "list_before":list_before_nottofix,
- "list_center":list_center_nottofix,
- "list_after":list_after_nottofix,
- "list_change":list_change_nottofix,
- "list_type":list_type_nottofix,
- "list_version":list_version_nottofix}
- df = pd.DataFrame(data,columns=["list_entityid","list_before","list_center","list_after","list_change","list_type","list_version"])
- df.to_excel("cluster_view/tofix/nottofix.xls")
-
- def updateEntityview():
- '''
- @summary: write the data from the view back to the database, taking the selffool rows as the basis for updates; for rows added by fool, perform an insert and change fool_version to selffool_add
- '''
- conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
- listfile = ["cluster_view/cluster/*.xls"]
- count = 0
- for globfile in listfile:
- for file in glob.glob(globfile):
- count += 1
- print(count,file)
- df = pd.read_excel(file)
- for _index in range(len(df["list_entityid"])):
- entity_id = df["list_entityid"][_index]
- doc_id = entity_id.split("html")[0]+"html"
- list_index = entity_id.split("html")[1].split("_")
- sentence_index = list_index[1]
- begin_index = list_index[2]
- end_index = list_index[3]
- change = str(df["list_change"][_index])
- type = str(df["list_type"][_index])
- version = str(df["list_version"][_index])
- if version=="fool":
- sql = " update entity_mention_selffool_notsame set new_text='"+str(change)+"',new_type='"+type+"',fool_version='fool_add' where entity_id='"+entity_id+"' and fool_version='fool' "
- cursor.execute(sql)
- else:
- sql = " update entity_mention_selffool_notsame set new_text='"+str(change)+"',new_type='"+type+"' where entity_id='"+entity_id+"' and fool_version='selffool' "
- cursor.execute(sql)
- conn.commit()
- conn.close()
-
- def makeLabelText():
- '''
- @summary: updating the database is too slow, so query the data directly from the database, apply the replacements, and then generate the training data
- '''
- conn = psycopg2.connect(dbname="selffool",user="postgres",password="postgres",host="192.168.2.101")
- cursor = conn.cursor()
- # build the replacement dict from the clustering results
- dict_replace = dict()
- listfile = ["cluster_view/cluster/*.xls"]
- count = 0
- for globfile in listfile:
- for file in glob.glob(globfile):
- count += 1
- print(count,file)
- df = pd.read_excel(file)
- for _index in range(len(df["list_entityid"])):
- entity_id = df["list_entityid"][_index]
- doc_id = entity_id.split("html")[0]+"html"
- list_index = entity_id.split("html")[1].split("_")
- sentence_index = list_index[1]
- begin_index = list_index[2]
- end_index = list_index[3]
- change = str(df["list_change"][_index])
- type = str(df["list_type"][_index])
- version = str(df["list_version"][_index])
- _key = entity_id+version
- dict_replace[_key] = [change,type]
- print("get dict_replace done")
- # query the entities from the database, ordered by entity_id
- data_entity = []
- sql = " select entity_id,doc_id,sentence_index,entity_text,entity_type,'selffool' from entity_mention_selffool order by entity_id "
- cursor.execute(sql)
- rows = cursor.fetchall()
- data_entity = data_entity+rows
- sql = " select entity_id,doc_id,sentence_index,entity_text,entity_type,fool_version from entity_mention_selffool_notsame order by entity_id "
- cursor.execute(sql)
- rows = cursor.fetchall()
- data_entity = data_entity+rows
- # build a dict mapping doc_id-sentence_index to a list of [entity, type]
- dict_sent_entitys = dict()
- _begin = 0
- while(_begin<len(data_entity)-1):
- _begin_doc_id = data_entity[_begin][1]
- _begin_sentence_index = data_entity[_begin][2]
- _end = _begin
- print(_begin)
- for end in range(_begin+1,len(data_entity)):
- _end = end
- _end_doc_id = data_entity[end][1]
- _end_sentence_index = data_entity[end][2]
- if _begin_doc_id==_end_doc_id and _begin_sentence_index==_end_sentence_index:
- continue
- else:
- break
- for item in data_entity[_begin:_end]:
- entity_id = item[0]
- doc_id = item[1]
- sentence_index = item[2]
- entity_text = item[3]
- entity_type = item[4]
- version = item[5]
- _key = doc_id+"-"+str(sentence_index)
- if _key not in dict_sent_entitys:
- dict_sent_entitys[_key] = []
- # apply the replacement
- if entity_type in ["org","company"]:
- _key1 = entity_id+version
- if _key1 in dict_replace:
- if str(dict_replace[_key1][0])=="1":
- continue
- if dict_replace[_key1][1] in ["org","company","person","location"]:
- for item in re.split("##",dict_replace[_key1][0]):
- dict_sent_entitys[_key].append([item,dict_replace[_key1][1]])
- else:
- for item in re.split("##",dict_replace[_key1][0]):
- dict_sent_entitys[_key].append([item,entity_type])
- else:
- if version=="selffool":
- dict_sent_entitys[_key].append([entity_text,entity_type])
- else:
- dict_sent_entitys[_key].append([entity_text,entity_type])
- _begin = _end
- print("get dict_sent_entitys done")
- # query the sentences
- sql = " select doc_id,sentence_index,sentence_text from sentences_selffool order by doc_id "
- cursor.execute(sql)
-
- list_sentence = cursor.fetchall()
- count = 0
- with codecs.open("selffool_train.txt","w",encoding="utf8") as f:
-
-
- for sent in list_sentence:
-
- count += 1
- print(count)
- _key = sent[0]+"-"+str(sent[1])
- sentence = sent[2]
- if len(sentence)>2000:
- continue
- if _key in dict_sent_entitys:
- data_item,_find_flag = makeLabel(sentence, dict_sent_entitys[_key])
- for _word,_label in data_item:
- f.write(_word+" "+_label+"\n")
- else:
- if np.random.random()>0.8:
- data_item,_find_flag = makeLabel(sentence, [])
- for _word,_label in data_item:
- f.write(_word+" "+_label+"\n")
- f.write("\n")
- f.flush()
-
-
-
-
- if __name__=="__main__":
- #makeFoolTrainData()
- #makeTrainTxt()
- #labelEntity()
- #readlabeldata("cleanedEntity.txt",getContext("ner_train.txt"))
- #makeDict_filename_content()
- #selectByRule()
- #updateLabel()
- #importLabelData()
- #makeCertainEntity()
- #addContextToTheEntity("company_found.tsv")
- #makeContext_by_fool_selffool()
- #makeCompare()
- #cluster_difference()
- #viewEntityByKey()
- #alterFileByRule("cluster_view/change/SAME版社#大学.xls")
- #getCompanyByTianyan()
- '''
- data = load("dict_company.pk")
- for item in data.keys():
- print(item,data[item])
- '''
- #labelByTianyan()
- '''
- token = "b775e2ed-d919-4d5f-8ab1-406d82d6bb56"
- headers = {"Authorization":token}
- url = "http://open.api.tianyancha.com/services/v4/open/searchV2?word="
- r = requests.get(url+"安阳鑫龙煤业(集团)龙山煤业有限责任公司",headers=headers)
- r_json = r.json()
- print(r_json)
- '''
- #tofix()
- cluster_entitys()
- makeLabelText()