luojiehua
/
BIDI_ML_INFO_EXTRACTION


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007
01702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275
12761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775
17761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275
22762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304
							

from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date
from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
from decimal import Decimal
import re
import copy
import math
import pandas as pd
import os
from scipy.optimize import linear_sum_assignment
from BiddingKG.dl.interface.Entitys import Match
import numpy as np

def getTheRole(entity,role_list):
    '''
    @summary: Find the position of the first role that contains the entity.
    @param:
        entity: entity name to look for
        role_list: list of roles; each element is tested with ``entity in element``
    @return: index of the first element containing the entity, or None when
        no element contains it
    '''
    matches = (
        position
        for position, role in enumerate(role_list)
        if entity in role
    )
    return next(matches, None)

# Role label (as a string) -> role name; the label is the position of the
# name in the tuple below.
dict_role_id = {
    str(_label): _name
    for _label, _name in enumerate((
        "tenderee",
        "agency",
        "win_tenderer",
        "second_tenderer",
        "third_tenderer",
    ))
}

def getPackage(packageList,sentence_index,begin_index,roleid,MAX_DIS=None,DIRECT=None):
    '''
    @summary: Choose the package (bid section) an entity belongs to. All
        packages whose scope contains the entity position are collected in
        list order; the first one that does not yet hold ``roleid`` is
        returned and marked as holding it. If every such package already
        holds ``roleid`` the first of them is returned. If no package scope
        contains the entity, None is returned.
    @param:
        packageList: list of package dicts. The keys read here are
            "sentence_index", "offsetWords_begin", "scope"
            ([[begin_sentence, begin_offset], [end_sentence, end_offset]]),
            "hit" (a set of role ids, mutated by this function) and "pointer".
        sentence_index: sentence the entity is in
        begin_index: start offset of the entity inside that sentence
        roleid: role identifier recorded in the chosen package's "hit" set
        MAX_DIS: when not None, packages whose own sentence is more than
            MAX_DIS sentences away from the entity are ignored
        DIRECT: "L" ignores packages located after the entity, "R" ignores
            packages located before it, anything else applies no filter
    @return: tuple (pointer, flag). ``flag`` is True only when every in-scope
        package already held ``roleid`` (so the returned pointer is a
        fallback); (None, False) when nothing matches.
    '''
    if len(packageList)==0:
        return None,False

    list_legalPack = []
    for pack in packageList:
        pack_sentence = pack["sentence_index"]
        # The word offset is only consulted when the package sits in the same
        # sentence as the entity. Both directions use the same
        # "offsetWords_begin" key (the "R" branch used to read a misspelled
        # "offsetwords_begin" and raised KeyError).
        if DIRECT=="L" and (pack_sentence>sentence_index or (pack_sentence==sentence_index and pack["offsetWords_begin"]>begin_index)):
            continue
        if DIRECT=="R" and (pack_sentence<sentence_index or (pack_sentence==sentence_index and pack["offsetWords_begin"]<begin_index)):
            continue

        scope_begin,scope_end = pack["scope"][0],pack["scope"][1]
        starts_before = scope_begin[0]<sentence_index or (scope_begin[0]==sentence_index and scope_begin[1]<=begin_index)
        ends_after = scope_end[0]>sentence_index or (scope_end[0]==sentence_index and scope_end[1]>=begin_index)
        if not (starts_before and ends_after):
            continue
        if MAX_DIS is not None and abs(sentence_index-pack_sentence)>MAX_DIS:
            continue
        list_legalPack.append(pack)

    for pack in list_legalPack:
        if roleid not in pack["hit"]:
            pack["hit"].add(roleid)
            return pack["pointer"],False

    if len(list_legalPack)>0:
        # Every in-scope package already holds this role: fall back to the
        # first in-scope package (not packageList[0], which may be out of scope).
        return list_legalPack[0]["pointer"],True
    return None,False

# Generate the legal (non-conflicting) role combinations.
def get_legal_comba(list_entity,dict_role_combination):
    '''
    @summary: Enumerate role assignments per package, prune them, combine
        them across packages and split combinations in which one entity is
        used both by the "Project" package and by another package.
    @param:
        list_entity: entities of the article; only forwarded to
            get_dict_entity_prob to obtain probabilities
        dict_role_combination: mapping package name -> {role -> collection of
            entity texts}; the empty string "" is skipped as a placeholder
    @return: list of dicts whose keys are "<package>$$<role>" and whose
        values are entity texts
    '''
    
    # Collect all legal combinations inside one package.
    # Returns a list of {role: entity} dicts. The first role that contributes
    # a non-empty entity seeds the list; each later role extends it.
    def circle_package(_dict_legal_combination):
        list_dict_role_first = []
        for _role in _dict_legal_combination:
            if len(list_dict_role_first)==0:
                for _entity in _dict_legal_combination[_role]:
                    if _entity !="":
                        list_dict_role_first.append({_role:_entity})
            else:
                list_dict_role_after = []
                # NOTE(review): _find_count is incremented but never read.
                _find_count = 0
                for _entity in _dict_legal_combination[_role]:
                    if _entity !="":
                        for _dict in list_dict_role_first:
                            # _flag ends up False when the entity is already
                            # used in _dict; the last matching key decides.
                            _flag = True
                            for _key1 in _dict:
                                if _entity==_dict[_key1]:
                                    # changed: the tenderee and the agency may be the same entity
                                    if str(_key1) in ["0","1"] and str(_role) in ["0","1"]:
                                        _flag = True
                                    else:
                                        _flag = False
                            if _flag:
                                _find_count += 1
                                _new_dict = copy.copy(_dict)
                                _new_dict[_role] = _entity
                                # size cap; the break only leaves the loop over list_dict_role_first
                                if len(list_dict_role_after)>100000:
                                    break
                                list_dict_role_after.append(_new_dict)
                            else:
                                # 2021/5/25 update: same entity (entity_text) with a different role
                                if len(list_dict_role_after) > 100000:
                                    break
                                # NOTE(review): this inner loop rebinds _dict and
                                # builds _new_dict without using it; what gets
                                # appended is the single-role dict {_role:_entity}.
                                for _dict in list_dict_role_first:
                                    for _key1 in _dict:
                                        if _entity == _dict[_key1]:
                                            _new_dict = copy.copy(_dict)
                                            _new_dict.pop(_key1)
                                            _new_dict[_role] = _entity
                                            list_dict_role_after.append({_role:_entity})
                if len(list_dict_role_after)==0:
                    pass
                else:
                    list_dict_role_first.extend(list_dict_role_after)

        return list_dict_role_first


    # Recursive variant of the per-package enumeration. Its only call site in
    # this function is commented out; it appends results to list_all_selution.
    def recursive_package(_dict_legal_combination,set_legal_entity,dict_one_selution,list_all_selution):
        last_layer = False
        # an empty combination yields an empty result
        if len(_dict_legal_combination.keys())==0:
            return []
        # flag the last recursion level
        if len(_dict_legal_combination.keys())==1:
            last_layer = True
        # take one role and iterate over its candidates
        _key_role = list(_dict_legal_combination.keys())[0]
        for item in _dict_legal_combination[_key_role]:
            copy_dict_one_selution = copy.copy(dict_one_selution)
            copy_dict_legal_combination = {}
            copy_set_legal_entity = copy.copy(set_legal_entity)
            
            # copy all remaining roles for the next recursion round
            for _key in _dict_legal_combination.keys():
                if _key!=_key_role:
                    copy_dict_legal_combination[_key] = _dict_legal_combination[_key]

            # changed: the tenderee and the agency may be the same entity
            if item !="":
                _flag = True
                if str(_key_role) in ["0","1"]:
                    for _key_flag in copy_dict_one_selution:
                        if _key_flag not in ["0","1"] and copy_dict_one_selution[_key_flag]==item:
                            _flag = False
                else:
                    for _key_flag in copy_dict_one_selution:
                        if copy_dict_one_selution[_key_flag]==item:
                            _flag = False
                if _flag:
                    copy_dict_one_selution[_key_role] = item
                    
            '''
            if item not in copy_set_legal_entity:
                if item !="":
                    copy_dict_one_selution[_key_role] = item
            '''
            copy_set_legal_entity.add(item)
            if last_layer:
                list_all_selution.append(copy_dict_one_selution)
            else:
                recursive_package(copy_dict_legal_combination,copy_set_legal_entity,copy_dict_one_selution,list_all_selution)
    

    # Recursively combine the results of the packages. Its only call site in
    # this function is commented out; circle_pageages is used instead.
    def recursive_packages(_dict_legal_combination,dict_one_selution,list_all_selution):
        last_layer = False
        if len(_dict_legal_combination.keys())==0:
            return []
        if len(_dict_legal_combination.keys())==1:
            last_layer = True
        _key_pack = list(_dict_legal_combination.keys())[0]
        for item in _dict_legal_combination[_key_pack]:
            copy_dict_one_selution = copy.copy(dict_one_selution)
            copy_dict_legal_combination = {}
            for _key in _dict_legal_combination.keys():
                if _key!=_key_pack:
                    copy_dict_legal_combination[_key] = _dict_legal_combination[_key]
            for _key_role in item.keys():
                copy_dict_one_selution[_key_pack+"$$"+_key_role] = item[_key_role]
            if last_layer:
                list_all_selution.append(copy_dict_one_selution)
            else:
                recursive_packages(copy_dict_legal_combination,copy_dict_one_selution,list_all_selution)
        return list_all_selution
    
    # Iteratively build all cross-package combinations: the cartesian product
    # of the per-package lists, merged into dicts keyed "<package>$$<role>".
    def circle_pageages(_dict_legal_combination):
        list_all_selution = []
        for _key_pack in _dict_legal_combination.keys():
            list_key_selution = []
            for item in _dict_legal_combination[_key_pack]:
                _dict = dict()
                for _key_role in item.keys():
                    _dict[_key_pack+"$$"+_key_role] = item[_key_role]
                list_key_selution.append(_dict)
            if len(list_all_selution)==0:
                list_all_selution = list_key_selution
            else:
                _list_all_selution = []
                for item_1 in list_all_selution:
                    for item_2 in list_key_selution:
                        _list_all_selution.append(dict(item_1,**item_2))
                list_all_selution = _list_all_selution
        return list_all_selution
                      
    # Collect the parsed result of each package.
    _dict_legal_combination = {}
    for packageName in dict_role_combination.keys():
        _list_all_selution = []

        # recursive_package(dict_role_combination[packageName], set(), {}, _list_all_selution)
        _list_all_selution = circle_package(dict_role_combination[packageName])
        '''
        # print("===1")
        # print(packageName)
        for item in _list_all_selution:
            # print(item)
        # print("===2")
        '''
        # drop combinations that are subsets of another combination
        list_all_selution_simple = []
        _list_set_all_selution = []
        for item_selution in _list_all_selution:
            item_set_selution = set()
            for _key in item_selution.keys():
                item_set_selution.add((_key,item_selution[_key]))
            _list_set_all_selution.append(item_set_selution)
        # the pairwise subset check below is skipped for more than 1000 combinations
        if len(_list_set_all_selution)>1000:
            _dict_legal_combination[packageName] = _list_all_selution
            continue
        for i in range(len(_list_set_all_selution)):
            
            be_included = False
            for j in range(len(_list_set_all_selution)):
                if i!=j:
                    # i is contained in j and the two differ in size
                    if len(set(_list_set_all_selution[i])&set(_list_set_all_selution[j]))==len(_list_set_all_selution[i]) and len(_list_set_all_selution[i])!=len(_list_set_all_selution[j]):
                        be_included = True
            if not be_included:
                list_all_selution_simple.append(_list_all_selution[i])
        _dict_legal_combination[packageName] = list_all_selution_simple
    _list_final_comba = []
    # combine the per-package results: first count the size of the product
    _comba_count = 1
    for _key in _dict_legal_combination.keys():
        _comba_count *= len(_dict_legal_combination[_key])
    # if the product is too large, keep only the highest-scoring combination per package
    dict_pack_entity_prob = get_dict_entity_prob(list_entity)
    if _comba_count>250:
        new_dict_legal_combination = dict()
        for _key_pack in _dict_legal_combination.keys():
            MAX_PROB = -1000
            _MAX_PROB_COMBA = None
            for item in _dict_legal_combination[_key_pack]:
                # print(_key_pack,item)
                _dict = dict()
                for _key in item.keys():
                    _dict[str(_key_pack)+"$$"+str(_key)] = item[_key]
                _prob = getSumExpectation(dict_pack_entity_prob, _dict)
                if _prob>MAX_PROB:
                    MAX_PROB = _prob
                    _MAX_PROB_COMBA = [item]
            if _MAX_PROB_COMBA is not None:
                new_dict_legal_combination[_key_pack] = _MAX_PROB_COMBA
        _dict_legal_combination = new_dict_legal_combination
    #recursive_packages(_dict_legal_combination, {}, _list_final_comba)
    _list_final_comba = circle_pageages(_dict_legal_combination)
    # apart from the Project package (tenderee and agency), the other packages do not conflict
    # check whether an entity appears both in the Project package and in another package; if so, split the combination
    _list_real_comba = []
    for dict_item in _list_final_comba:
        set_project = set()
        set_other = set()
        for _key in list(dict_item.keys()):
            if _key.split("$$")[0]=="Project":
                set_project.add(dict_item[_key])
            else:
                set_other.add(dict_item[_key])
        set_common = set_project&set_other
        if len(set_common)>0:
            # two variants: one keeps the shared entities only under Project,
            # the other keeps them only under the non-Project packages
            dict_project = {}
            dict_not_project = {}
            for _key in list(dict_item.keys()):
                if dict_item[_key] in set_common:
                    if str(_key.split("$$")[0])=="Project":
                        dict_project[_key] = dict_item[_key]
                    else:
                        dict_not_project[_key] = dict_item[_key]
                else:
                    dict_project[_key] = dict_item[_key]
                    dict_not_project[_key] = dict_item[_key]
            
            _list_real_comba.append(dict_project)
            _list_real_comba.append(dict_not_project)
        else:
            _list_real_comba.append(dict_item)

    return _list_real_comba

def get_dict_entity_prob(list_entity,on_value=0.5):
    '''
    @summary: For every org/company entity whose probability for its own
        label reaches the threshold (label "5" excluded), keep the highest
        probability seen per package / role / entity text.
    @param:
        list_entity: entities; the attributes read are entity_type, values,
            label, packageName and entity_text
        on_value: probability threshold (inclusive)
    @return: dict mapping "<packageName>$$<label>$text$<entity_text>" to
        [entity_text, probability]
    '''
    best_by_key = {}
    for ent in list_entity:
        if ent.entity_type not in ['org','company']:
            continue
        label_values = ent.values
        prob = float(label_values[int(ent.label)])
        pack_role_key = ent.packageName+"$$"+str(ent.label)
        if not (prob>=on_value and str(ent.label)!="5"):
            continue
        text_key = pack_role_key+"$text$"+ent.entity_text
        known = best_by_key.get(text_key)
        # first sighting, or a strictly better probability than the stored one
        if known is None or prob>known[1]:
            best_by_key[text_key] = [ent.entity_text,prob]
    return best_by_key


# Compute the summed expectation of one combination.
def getSumExpectation(dict_pack_entity_prob,combination,on_value=0.5):
    '''
    @summary: Score a role combination. Every entry of dict_pack_entity_prob
        adds probability**2 when the combination assigns exactly that entity
        text to that package/role key, and subtracts probability**2 otherwise.
    @param:
        dict_pack_entity_prob: mapping "<pack>$$<role>$text$<entity_text>"
            -> [entity_text, probability]
        combination: mapping "<pack>$$<role>" -> entity text
        on_value: not used by the computation
    @return: the summed score (the int 0 when dict_pack_entity_prob is empty)
    '''
    total = 0
    for prob_key, entry in dict_pack_entity_prob.items():
        pack_role = prob_key.split("$text$")[0]
        prob = entry[1]
        chosen = pack_role in combination and combination[pack_role]==entry[0]
        # selected entries count positively, all others negatively
        signed = prob if chosen else -prob
        sign = 1 if signed>0 else -1
        total += sign*math.pow(signed,2)
    return total


def getRoleList(list_sentence,list_entity,on_value = 0.5):
    '''
    @summary: Build every non-conflicting role combination and keep the one
        with the largest summed expectation.
    @param:
        list_sentence: all sentences of the article
        list_entity: all entities of the article; qualifying entities get
            roleName, packageName and (for labels other than "0"/"1")
            packageCode and possibly pointer_pack assigned
        on_value: probability threshold
    @return: None when getPackagesFromArticle returns None, otherwise the
        tuple (RoleList, RoleSet, PackageList, PackageSet)
    '''
    pack = getPackagesFromArticle(list_sentence,list_entity)
    if pack is None:
        return None
    PackageList,PackageSet,dict_PackageCode = pack

    # package name -> {role label -> set of candidate entity texts}
    dict_role_combination = {}
    for entity in list_entity:
        if entity.entity_type not in ['org','company']:
            continue
        # entities of three characters or fewer are filtered out
        if len(entity.entity_text)<=3:
            continue
        role_prob = float(entity.values[int(entity.label)])
        label = str(entity.label)
        if not (role_prob>=on_value and label!="5"):
            continue

        if label in ["0","1"]:
            packageName = "Project"
        else:
            # fall back to "Project" unless a package scope claims the entity
            packageName = "Project"
            if len(PackageSet)>0:
                packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"role-"+label)
                if packagePointer is not None:
                    entity.pointer_pack = packagePointer
                    packageName = packagePointer.entity_text
            entity.packageCode = dict_PackageCode.get(packageName,"")

        entity.roleName = dict_role_id.get(label)
        entity.packageName = packageName

        roles_of_pack = dict_role_combination.get(packageName)
        if roles_of_pack is None:
            # new package: every role starts with the empty placeholder
            roles_of_pack = {str(_roleId):set([""]) for _roleId in [0,1,2,3,4]}
            dict_role_combination[packageName] = roles_of_pack
            roles_of_pack[label].add(entity.entity_text)
        elif label in roles_of_pack:
            roles_of_pack[label].add(entity.entity_text)
        else:
            roles_of_pack[label] = set([entity.entity_text])

    list_real_comba = get_legal_comba(list_entity,dict_role_combination)

    # pick the combination with the largest expectation (first one wins ties)
    dict_pack_entity_prob = get_dict_entity_prob(list_entity)
    max_index = 0
    max_expect = -100
    for _index,item_combination in enumerate(list_real_comba):
        expect = getSumExpectation(dict_pack_entity_prob, item_combination)
        if expect>max_expect:
            max_index,max_expect = _index,expect

    RoleList = []
    RoleSet = set()
    if len(list_real_comba)>0:
        for _key,entity_text in list_real_comba[max_index].items():
            key_parts = _key.split("$$")
            packageName = key_parts[0]
            role_name = dict_role_id.get(str(key_parts[1]))
            packagecode = dict_PackageCode.get(packageName,"")
            RoleList.append(PREM(packageName,packagecode,role_name,entity_text,0,0,0.0,[]))
            RoleSet.add(entity_text)

    # drop entity -> package links that the chosen combination does not confirm
    for _entity in list_entity:
        if _entity.pointer_pack is None:
            continue
        _pack_name = _entity.pointer_pack.entity_text
        confirmed = any(
            _prem.packageName==_pack_name and _prem.entity_text==_entity.entity_text
            for _prem in RoleList
        )
        if not confirmed:
            _entity.pointer_pack = None
    return RoleList,RoleSet,PackageList,PackageSet

def getPackageScopePattern():
    '''
    @summary: Build the regular-expression source (a plain string, not a
        compiled pattern) for the package-scope keywords.
    @return: "(<word>|<word>|...)" followed by a fixed suffix. The words come
        from the "list_word" column of end.xls, located next to this module;
        the characters ( ) . [ ] - in each word are backslash-escaped.
    '''
    # os.path.join keeps the path relative when dirname() is empty; plain
    # concatenation would turn it into "/end.xls".
    df = pd.read_excel(os.path.join(os.path.dirname(__file__),"end.xls"))
    escaped_words = []
    for item in df["list_word"]:
        item = str(item)
        # Escape via "\\"+char instead of "\(", "\." ...: those are invalid
        # escape sequences in a normal string literal and warn on new Pythons.
        for _char in "().[]-":
            item = item.replace(_char,"\\"+_char)
        escaped_words.append(item)
    # join() keeps the group balanced even when the column is empty
    return "("+"|".join(escaped_words)+")[:：是为]|业绩.{,30}标段[0-9A-Za-z一二三四五六七八九十]{0,3}"
        
# Regex source for package-scope keywords, built once when the module is
# imported; getPackageScopePattern() reads end.xls, so importing this module
# performs that file read.
pattern_packageScope = getPackageScopePattern()   
def getPackagesFromArticle(list_sentence,list_entity):
    '''
    @summary: Find the package / bid-section mentions of an article and insert them into list_entity as "package" entities
    @param:
        list_sentence: sentences of the article; sorted in place by sentence_index
        list_entity: entities of the article; the package entities that are found are appended to it
    @return: None when list_sentence is empty, otherwise the tuple (PackageList,PackageSet,dict_packageCode)
        PackageList: type:list of dict, one per accepted package mention, with the keys name, sentence_index,
            offsetWords_begin, offsetWord_begin, offsetWord_end, scope ([begin,end], each being
            [sentence_index,token offset]), hit (empty set) and pointer (the package entity)
        PackageSet: type:set, the package names found
        dict_packageCode: type:dict, package name -> code extracted next to one of its mentions
    '''

    if len(list_sentence)==0:
        return None
    list_sentence.sort(key=lambda x:x.sentence_index)

    PackageList = []
    PackageList_scope = []
    PackageSet = set()
    dict_packageCode = dict()

    package_name_pattern = re.compile("((标[段号的包]|分包)的?(名[称单]?|包名))[:：]?([^:：]{3,30}?)，{1}")
    package_N_name_pattern = re.compile(r"[^承](分?包|标段|标包|标|包|包组|子项目|包件|项目类型)编?号?[:：]?[\(（]?([0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]){1,2}，{1}")
    package_number_pattern = re.compile(r"(([^承](包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型)[:：]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.])[^至]?|([^\.]?第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))")
    # 2020/11/23 rule adjustment for large sites: "项目" in package_N_name_pattern was changed to "子项目"
    other_package_pattern = re.compile('((项目|物资|设备|场次|标段|标的|产品)(名称))[:：]([^，。]{2,50}?)[，。]')
    win_tenderer_pattern = re.compile('(中标人|供应商)[:：](.{2,25})[，。]')  # 2020/11/23 rule adjustment for large sites
    model_pattern = re.compile('(型号|序号)[:：]([^，。]{2,20})[，。]')  # 2020/11/23 rule adjustment for large sites
    number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}")

    package_code_pattern = re.compile(r"(?:编号[:：]?\s*)([-\dA-Za-z\(\)]+)")
    # purely numeric package numbers are unified, e.g. '01' and '1'
    re_digital = re.compile(r"^\d+$")
    pattern_punctuation = r"[:：（）\(\),，。；;]"

    def changeIndexFromWordToWords(tokens,word_index):
        '''
        @summary: Convert a character offset into the index of the token that contains it
        @return: type:int, or None when word_index lies beyond the end of the tokens
        '''
        before_index = 0
        after_index = 0
        for i in range(len(tokens)):
            after_index = after_index+len(tokens[i])
            if before_index<=word_index and after_index>=word_index:
                return i
            before_index = after_index
        return None

    def extractPackageCode(tokens,word_index,size=20,pattern = package_code_pattern):
        '''
        @summary: Extract the code written closest to a package mention
        @param:
            tokens: tokens of the sentence containing the package
            word_index: character offset of the package
            size: number of tokens taken on each side of the package
            pattern: regex extracting the code (group 1)
        @return: type:string, meaning: the code, or None when nothing is found
        '''
        index = changeIndexFromWordToWords(tokens,word_index)
        if index is None:
            # the offset cannot be mapped to a token: no window to search in
            return None
        # fix: the window used to start at the package itself when index<size,
        # dropping the whole left context
        begin = max(0,index-size)
        end = min(index+size,len(tokens))
        # phrase made of the tokens on both sides
        text = "".join(tokens[begin:end])
        # character offset of the package inside the phrase
        new_word_index = word_index-len("".join(tokens[:begin]))
        min_distance = len(text)
        packageCode = None
        for the_iter in re.finditer(pattern,text):
            # keep the match with the smallest distance to the package
            distance = min(abs(new_word_index-the_iter.span()[0]),abs(new_word_index-the_iter.span()[1]))
            if distance<min_distance:
                min_distance = distance
                packageCode = the_iter.group(1)
        return packageCode

    def buildPackageItem(name,sentence,word_begin,word_end):
        '''
        @summary: Build the dict describing one mention; name is "" for a scope keyword
        '''
        return {"name":name,"sentence_index":sentence.sentence_index,"offsetWords_begin":changeIndexFromWordToWords(sentence.tokens,word_begin),"offsetWord_begin":word_begin,"offsetWord_end":word_end}

    def registerPackage(name,sentence,word_begin,word_end,list_item):
        '''
        @summary: Record one package mention in list_item, PackageSet and dict_packageCode
        '''
        if re.search(re_digital,name):
            name = str(int(name))
        list_item.append(buildPackageItem(name,sentence,word_begin,word_end))
        code = extractPackageCode(sentence.tokens,word_begin)
        if code is not None:
            dict_packageCode[name] = code
        PackageSet.add(name)

    def mergeWithScopeKeywords(list_item,sentence):
        '''
        @summary: Return the package mentions of a sentence together with its scope keywords, ordered by character offset
        '''
        list_merged = list_item+[buildPackageItem("",sentence,_match.span()[0],_match.span()[1]) for _match in re.finditer(pattern_packageScope,sentence.sentence_text)]
        list_merged.sort(key=lambda x:x["offsetWord_begin"])
        return list_merged

    # extract package name / package number pairs from the package description table
    package_names = []
    for _sentence in list_sentence:
        content = _sentence.sentence_text
        names = re.findall(package_name_pattern,content)
        if names == []:
            names = re.findall(other_package_pattern, content)
        N_names = re.findall(package_N_name_pattern,content)
        if len(names)==1 and len(N_names)==1:
            package_names.append([names[0][-1],N_names[0][-1]])
    # 2021/6/23 de-duplicate the package names (order preserved); done once for all sentences
    list_name = []
    for name in package_names:
        if name not in list_name:
            list_name.append(name)

    for _sentence in list_sentence:
        PackageList_item = []
        content = _sentence.sentence_text
        # mentions by package name; only the first 20 distinct names are searched
        for name in list_name[:20]:
            for index in findAllIndex(name[0],content):
                temp_package_number = re.findall(number_pattern,name[1])[0]
                registerPackage(temp_package_number,_sentence,index,index+len(name[0]),PackageList_item)
        # mentions by package number
        for _match in re.finditer(package_number_pattern,content):
            temp_package_number = re.findall(number_pattern,_match.group())[0]
            registerPackage(temp_package_number,_sentence,_match.span()[0],_match.span()[1],PackageList_item)
        PackageList_scope.extend(mergeWithScopeKeywords(PackageList_item,_sentence))

    # 2020/11/23 rule adjustment for large sites: when no package was found and the article has more than
    # one distinct org/company with label 2, use "xx名称" values as package names in the sentences
    # that hold exactly one such name and exactly one winner/supplier
    if len(PackageSet)==0 and len(set([it.entity_text for it in list_entity if it.entity_type in ['org', 'company'] and it.label==2]))>1:
        for _sentence in list_sentence:
            content = _sentence.sentence_text
            names = re.findall(other_package_pattern, content)
            N_names = re.findall(win_tenderer_pattern, content)
            if len(names) != 1 or len(N_names) != 1:
                continue
            PackageList_item = []
            for _match in re.finditer(other_package_pattern,content):
                temp_package_number = _match.group(4)
                xinghao = re.search(model_pattern, content)
                if xinghao:
                    temp_package_number = temp_package_number + '+' + xinghao.group(2)
                registerPackage(temp_package_number,_sentence,_match.span()[0],_match.span()[1],PackageList_item)
            PackageList_scope.extend(mergeWithScopeKeywords(PackageList_item,_sentence))

    # NOTE(review): i is a position in the sorted list_sentence and is compared with sentence_index,
    # which presumably assumes sentence_index values are 0..n-1 -- confirm
    for i in range(len(list_sentence)):
        sentence_text = list_sentence[i].sentence_text
        for j in range(len(PackageList_scope)):
            _pack = PackageList_scope[j]
            if i!=_pack["sentence_index"] or _pack["name"]=="":
                continue
            word_begin = _pack["offsetWord_begin"]
            # fix: clamp the start, a negative start made the slice count from the end of the sentence
            left_str = sentence_text[max(0,word_begin-30):word_begin+1]
            right_str = sentence_text[word_begin:word_begin+30]
            _left_find = re.findall(pattern_punctuation,left_str)
            _right_find = re.findall(pattern_punctuation,right_str)
            # "同一..." is not a package named "一"
            if re.search("同",left_str[-1:]) is not None and _pack["name"]=="一":
                continue
            if re.search("划分",right_str[:10]) is not None:
                continue
            # a colon as the nearest punctuation on either side: the scope starts at the package itself
            _flag = False
            if len(_left_find)>0 and _left_find[-1] in [":","："]:
                _flag = True
            if len(_right_find)>0 and _right_find[0] in [":","："]:
                _flag = True
            if _flag:
                scope_begin = [_pack["sentence_index"],_pack["offsetWords_begin"]]
            elif j==0:
                scope_begin = [0,0]
            else:
                scope_begin = [PackageList_scope[j-1]["sentence_index"],PackageList_scope[j-1]["offsetWords_begin"]]
            if j==len(PackageList_scope)-1:
                scope_end = [list_sentence[-1].sentence_index,changeIndexFromWordToWords(list_sentence[-1].tokens, len(list_sentence[-1].sentence_text))]
            else:
                scope_end = [PackageList_scope[j+1]["sentence_index"],PackageList_scope[j+1]["offsetWords_begin"]]
            # skip a mention lying inside the previous one
            # fix: the first entry has no previous one; index -1 wrapped around to the last entry,
            # so a single mention was compared with itself and always dropped
            if j>0:
                _previous = PackageList_scope[j-1]
                if _previous["sentence_index"]==_pack["sentence_index"] and _previous["offsetWord_begin"]<=_pack["offsetWord_begin"] and _previous["offsetWord_end"]>=_pack["offsetWord_end"]:
                    continue

            # add the package to the entities
            # (entity_id repeats offsetWord_begin twice; kept unchanged so existing ids stay the same)
            _pack_entity = Entity(doc_id=list_sentence[0].doc_id,entity_id="%s_%s_%s_%s"%(list_sentence[0].doc_id,i,_pack["offsetWord_begin"],_pack["offsetWord_begin"]),entity_text=_pack["name"],entity_type="package",sentence_index=_pack["sentence_index"],begin_index=changeIndexFromWordToWords(list_sentence[i].tokens,_pack["offsetWord_begin"]),end_index=changeIndexFromWordToWords(list_sentence[i].tokens,_pack["offsetWord_end"]),wordOffset_begin=_pack["offsetWord_begin"],wordOffset_end=_pack["offsetWord_end"])
            list_entity.append(_pack_entity)
            copy_pack = copy.copy(_pack)
            copy_pack["scope"] = [scope_begin,scope_end]
            copy_pack["hit"] = set()
            copy_pack["pointer"] = _pack_entity

            PackageList.append(copy_pack)
    return PackageList,PackageSet,dict_packageCode

from BiddingKG.dl.interface.modelFactory import Model_relation_extraction
# Relation-extraction model, instantiated once when this module is imported.
relationExtraction_model = Model_relation_extraction()
def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_entity,list_sentence,on_value = 0.5,on_value_person=0.5,sentence_len=4):
    '''
    @param:
        PackDict:文章包dict
        roleSet:文章所有角色的公司名称
        PackageList:文章的包信息
        PackageSet:文章所有包的名称
        list_entity:文章所有经过模型处理的实体
        on_value:金额模型的阈值
        on_value_person:联系人模型的阈值
        sentence_len:公司和属性间隔句子的最大长度
    @return:添加了属性信息的角色list
    '''
    
    #根据roleid添加金额到rolelist中
    def addMoneyByRoleid(packDict,packageName,roleid,money,money_prob):
        for i in range(len(packDict[packageName]["roleList"])):
            if packDict[packageName]["roleList"][i].role_name==dict_role_id.get(str(roleid)):
                if money_prob>packDict[packageName]["roleList"][i].money_prob:
                    packDict[packageName]["roleList"][i].money = money
                    packDict[packageName]["roleList"][i].money_prob = money_prob
        return packDict
                    
    #根据实体名称添加金额到rolelist中
    def addMoneyByEntity(packDict,packageName,entity,money,money_prob):
        for i in range(len(packDict[packageName]["roleList"])):
            if packDict[packageName]["roleList"][i].entity_text==entity:
                # if money_prob>packDict[packageName]["roleList"][i].money_prob:
                #     packDict[packageName]["roleList"][i].money = money
                #     packDict[packageName]["roleList"][i].money_prob = money_prob
                if packDict[packageName]["roleList"][i].money_prob==0 :  # 2021/7/20第一次更新金额
                    packDict[packageName]["roleList"][i].money = money.entity_text
                    packDict[packageName]["roleList"][i].money_prob = money_prob
                    packDict[packageName]["roleList"][i].money_unit = money.money_unit
                elif money_prob>packDict[packageName]["roleList"][i].money_prob+0.2 or money.notes in ['大写']: # 2021/7/20改为优先选择大写金额,
                    # print('已连接金额概率：money_prob:',packDict[packageName]["roleList"][i].money_prob)
                    # print('链接金额备注 ',money.notes, money.entity_text, money.values)
                    packDict[packageName]["roleList"][i].money = money.entity_text
                    packDict[packageName]["roleList"][i].money_prob = money_prob
                    packDict[packageName]["roleList"][i].money_unit = money.money_unit
                # print('链接中的金额：{0}, 单位：{1}'.format(money.entity_text, money.money_unit))
        return packDict
    
    #根据实体名称得到角色
    def getRoleWithText(packDict,entity_text):
        for pack in packDict.keys():
            for i in range(len(packDict[pack]["roleList"])):
                if packDict[pack]["roleList"][i].entity_text==entity_text:
                    return packDict[pack]["roleList"][i].role_name
    
    def doesEntityOrLinkedEntity_inRoleSet(entity,RoleSet):
        _list_entitys = [entity]+entity.linked_entitys
        for _entity in _list_entitys:
            if _entity.entity_text in RoleSet:
                return True
    
    p_entity = 0

    # 2021/7/19 顺序比较金额，前面是后面的一万倍则把前面金额/10000
    money_list = [it for it in list_entity if it.entity_type=="money"]
    for i in range(len(money_list)-1):
        for j in range(1, len(money_list)):
            if (float(money_list[i].entity_text) > 5000000000 or money_list[j].notes=='大写') and \
                    Decimal(money_list[i].entity_text)/Decimal(money_list[j].entity_text)==10000:
                money_list[i].entity_text = str(Decimal(money_list[i].entity_text)/10000)
                # print('连接前修改大于50亿金额：前面是后面的一万倍则把前面金额/10000')
    
    #遍历所有实体
    while(p_entity<len(list_entity)):
        entity = list_entity[p_entity]
        '''
        #招标金额从后往前找
        if entity.entity_type=="money":
            if entity.values[entity.label]>=on_value:
                if str(entity.label)=="0":
                    packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
                    if packagePointer is None:
                        packageName = "Project"
                    else:
                        packageName = packagePointer.entity_text
                    addMoneyByRoleid(PackDict, packageName, "0", entity.entity_text, entity.values[entity.label])
        '''
        ''' # 2020/11/25 与下面的联系人连接步骤重复，取消
        if entity.entity_type=="person":
            if entity.values[entity.label]>=on_value_person:
                if str(entity.label)=="1":
                    for i in range(len(PackDict["Project"]["roleList"])):
                        if PackDict["Project"]["roleList"][i].role_name=="tenderee":
                            PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
                        # add pointer_person
                        for _entity in list_entity:
                            if dict_role_id.get(str(_entity.label))=="tenderee":
                                for i in range(len(PackDict["Project"]["roleList"])):
                                    if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
                                        _entity.pointer_person = entity
                elif str(entity.label)=="2":
                    for i in range(len(PackDict["Project"]["roleList"])):
                        if PackDict["Project"]["roleList"][i].role_name=="agency":
                            PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
                    # add pointer_person
                    for _entity in list_entity:
                        if dict_role_id.get(str(_entity.label))=="agency":
                            for i in range(len(PackDict["Project"]["roleList"])):
                                if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
                                    _entity.pointer_person = entity
    '''
        # #金额往前找实体
        # if entity.entity_type=="money":
        #     if entity.values[entity.label]>=on_value:
        #         p_entity_money= p_entity
        #         entity_money = list_entity[p_entity_money]
        #         if len(PackageSet)>0:
        #             packagePointer,_ = getPackage(PackageList,entity_money.sentence_index,entity_money.begin_index,"money-"+str(entity_money.entity_text)+"-"+str(entity_money.label))
        #             if packagePointer is None:
        #                 packageName_entity = "Project"
        #             else:
        #                 packageName_entity = packagePointer.entity_text
        #         else:
        #             packageName_entity = "Project"
        #         while(p_entity_money>0):
        #             entity_before = list_entity[p_entity_money]
        #             if entity_before.entity_type in ['org','company']:
        #                 if str(entity_before.label)=="1":
        #                     addMoneyByEntity(PackDict, packageName_entity, entity_before.entity_text, entity_money.entity_text, entity_money.values[entity_money.label])
        #                     #add pointer_money
        #                     entity_before.pointer_money = entity_money
        #                 break
        #             p_entity_money -= 1


        #如果实体属于角色集合，则往后找属性
        if doesEntityOrLinkedEntity_inRoleSet(entity, roleSet):
            
            p_entity += 1
            #循环查找符合的属性
            while(p_entity<len(list_entity)):
                
                entity_after = list_entity[p_entity]
                if entity_after.sentence_index-entity.sentence_index>=sentence_len:
                    p_entity -= 1
                    break
                #若是遇到公司实体，则跳出循环
                if entity_after.entity_type in ['org','company']:
                    p_entity -= 1
                    break
                if entity_after.values is not None:
                    if entity_after.entity_type=="money":
                        if entity_after.values[entity_after.label]>=on_value:
                            '''
                            #招标金额从后往前找
                            if str(entity_after.label)=="0":
                                packagePointer,_ = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label))
                                if packagePointer is None:
                                    packageName = "Project"
                                else:
                                    packageName = packagePointer.entity_text
                                addMoneyByRoleid(PackDict, packageName, "0", entity_after.entity_text, entity_after.values[entity_after.label])
                            '''
                            if str(entity_after.label)=="1":
                                #print(entity_after.entity_text,entity.entity_text)
                                _list_entitys = [entity]+entity.linked_entitys
                                if len(PackageSet)>0:
                                    packagePointer,_ = getPackage(PackageList,entity_after.sentence_index,entity_after.begin_index,"money-"+str(entity_after.entity_text)+"-"+str(entity_after.label))
                                    if packagePointer is None:
                                        packageName_entity = "Project"
                                    else:
                                        packageName_entity = packagePointer.entity_text
                                else:
                                    packageName_entity = "Project"
                                if str(entity.label) in ["2","3","4"]:
                                    # addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after.entity_text, entity_after.values[entity_after.label])
                                    if entity_after.notes == '单价' or float(entity_after.entity_text)<5000: #2021/12/17 调整小金额阈值，避免203608823.html 两次金额一次万元没提取到的情况
                                        addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
                                                         0.5)
                                        entity.pointer_money = entity_after
                                        # print('role zhao money', entity.entity_text, '中标金额：', entity_after.entity_text)
                                    else:
                                        addMoneyByEntity(PackDict, packageName_entity, entity.entity_text, entity_after,
                                                         entity_after.values[entity_after.label])
                                        entity.pointer_money = entity_after
                                        # print('role zhao money', entity.entity_text, '中标金额：', entity_after.entity_text)
                                        if entity_after.values[entity_after.label]>0.6:
                                            break # 2021/7/16 新增，找到中标金额，非单价即停止，不再往后找金额
                                    #add pointer_money
                                    # entity.pointer_money = entity_after
                                    # print('role zhao money', entity.entity_text, '中标金额：', entity_after.entity_text)
                                    # if entity_after.notes!='单价':
                                    #     break  # 2021/7/16 新增，找到中标金额即停止，不再往后找金额
                        '''
                    if entity_after.entity_type=="person":
                        if entity_after.values[entity_after.label]>=on_value_person:
                            if str(entity_after.label)=="1":
                                for i in range(len(roleList)):
                                    if roleList[i].role_name=="tenderee":
                                        roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                            elif str(entity_after.label)=="2":
                                for i in range(len(roleList)):
                                    if roleList[i].role_name=="agency":
                                        roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                            elif str(entity_after.label)=="3":
                                _list_entitys = [entity]+entity.linked_entitys
                                for _entity in _list_entitys:
                                    for i in range(len(roleList)):
                                        if roleList[i].entity_text==_entity.entity_text:
                                            if entity_after.sentence_index-_entity.sentence_index>1 and len(roleList[i].linklist)>0:
                                                break
                                            roleList[i].linklist.append((entity_after.entity_text,entity_after.person_phone))
                    '''
                    
                p_entity += 1  
                
        p_entity += 1
    
    ''''''
    # 通过模型分类的招标/代理联系人
    list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
    person_list = [entity for entity in list_entity if entity.entity_type == 'person' and entity.label in [1, 2]]
    tenderee_contact = set()
    tenderee_phone = set()
    agency_contact = set()
    agency_phone = set()
    winter_contact = set()
    for _person in person_list:
        if _person.label == 1:
            tenderee_contact.add(_person.entity_text)
        if _person.label == 2:
            agency_contact.add(_person.entity_text)
    # 正则匹配无 '主体/联系人' 的电话
    # 例："采购人联系方式：0833-5226788，"
    phone_pattern = '(1[3|4|5|6|7|8|9][0-9][-—－―]?\d{4}[-—－―]?\d{4}|' \
                    '\+86.?1[3|4|5|6|7|8|9]\d{9}|' \
                    '0[1-9]\d{1,2}[-—－―][1-9]\d{6,7}/[1-9]\d{6,10}|' \
                    '0[1-9]\d{1,2}[-—－―]\d{7,8}.?转\d{1,4}|' \
                    '0[1-9]\d{1,2}[-—－―]\d{7,8}[-—－―]\d{1,4}|' \
                    '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|' \
                   '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?)|' \
                   '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|' \
                   '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?|' \
                   '[\（|\(]0[1-9]\d{1,2}[\）|\)]-?\d{7,8}-?\d{,4}|' \
                   '[2-9]\d{6,7})'
    re_tenderee_phone = re.compile(
        "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,5}(?:电话|联系方式|联系人)[:：]?[^。]{0,7}?)"
        # 电话号码
        + phone_pattern)
    # 例："采购人地址和联系方式：峨边彝族自治县教育局，0833-5226788，"
    re_tenderee_phone2 = re.compile(
        "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人)[:：]?[^。]{0,20}?)"
        # 电话号码
        + phone_pattern)
    re_agent_phone = re.compile(
        "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,5}(?:电话|联系方式|联系人)[:：]?[^。]{0,7}?)"
        # 电话号码
        + phone_pattern)
    re_agent_phone2 = re.compile(
        "(?:(?:代理(?:人|机构|公司|单位|组织|方)|采购机构|集中采购机构|集采机构|招标机构)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人)[:：]?[^。]{0,20}?)"
        # 电话号码
        + phone_pattern)
    content = ""
    for _sentence in list_sentence:
        content += "".join(_sentence.tokens)
    _content = copy.deepcopy(content)
    while re.search("(.)(，)([^0-9])|([^0-9])(，)(.)", content):
        content_words = list(content)
        for i in re.finditer("(.)(，)([^0-9])", content):
            content_words[i.span(2)[0]] = ""
        for i in re.finditer("([^0-9])(，)(.)", content):
            content_words[i.span(2)[0]] = ""
        content = "".join(content_words)
    content = re.sub("[:：]|[\(（]|[\)）]", "", content)
    _tenderee_phone = re.findall(re_tenderee_phone, content)
    # 更新正则确定的角色属性
    for i in range(len(PackDict["Project"]["roleList"])):
        if PackDict["Project"]["roleList"][i].role_name == "tenderee":
            _tenderee_phone = re.findall(re_tenderee_phone, content)
            if _tenderee_phone:
                for _phone in _tenderee_phone:
                    _phone = _phone.split("/") # 分割多个号码
                    for one_phone in _phone:
                        PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
                        tenderee_phone.add(one_phone)
            _tenderee_phone2 = re.findall(re_tenderee_phone2, content)
            if _tenderee_phone2:
                for _phone in _tenderee_phone2:
                    _phone = _phone.split("/")
                    for one_phone in _phone:
                        PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
                        tenderee_phone.add(one_phone)
        if PackDict["Project"]["roleList"][i].role_name == "agency":
            _agent_phone = re.findall(re_agent_phone, content)
            if _agent_phone:
                for _phone in _agent_phone:
                    _phone = _phone.split("/")
                    for one_phone in _phone:
                        PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
                        agency_phone.add(one_phone)
            _agent_phone2 = re.findall(re_agent_phone2, content)
            if _agent_phone2:
                for _phone in _agent_phone2:
                    _phone = _phone.split("/")
                    for one_phone in _phone:
                        PackDict["Project"]["roleList"][i].linklist.append(("", one_phone))
                        agency_phone.add(one_phone)
    # km配对方法
    def dispatch(match_list):
        """Choose a one-to-one pairing of main roles and attributes (KM / assignment algorithm).

        Args:
            match_list: list of objects exposing ``main_role``, ``attribute`` and
                ``value``. ``main_role`` and ``attribute`` must be hashable;
                ``value`` is the score of the candidate pair (the callers visible
                in this function pass non-positive distance penalties).

        Returns:
            A list of ``(main_role, attribute)`` tuples. Each main role and each
            attribute appears at most once, and only pairs that were present in
            ``match_list`` are returned.
        """
        # Nothing to assign: skip building and solving an empty cost matrix.
        if not match_list:
            return []

        main_roles = list({match.main_role for match in match_list})
        attributes = list({match.attribute for match in match_list})
        # Position lookup tables replace a linear ``list.index`` scan per match.
        role_pos = {role: pos for pos, role in enumerate(main_roles)}
        attr_pos = {attr: pos for pos, attr in enumerate(attributes)}

        # Candidate pairs are shifted by +10000 so that they stand out from the
        # zero cells of pairs that were never proposed. If the same pair occurs
        # several times, the last value wins.
        score = np.zeros(shape=(len(main_roles), len(attributes)))
        for match in match_list:
            score[role_pos[match.main_role], attr_pos[match.attribute]] = match.value + 10000

        # linear_sum_assignment minimises total cost, so negate the scores.
        cost = -score
        rows, cols = linear_sum_assignment(cost)
        # A zero cell means the solver paired two items that were never a
        # candidate pair (or whose value was exactly -10000); drop those.
        return [(main_roles[r], attributes[c]) for r, c in zip(rows, cols) if cost[r, c]]

    # 正则提取电话号码实体
    # key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)([0-1]\d{6,11})')
    phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—－―]?\d{4}[-—－―]?\d{4}|'
                       '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
                       # '0[^0]\d{1,2}[-—－―][1-9]\d{6,7}/[1-9]\d{6,10}|'
                       '0[1-9]\d{1,2}[-—－―]\d{7,8}.?转\d{1,4}|'
                       '0[1-9]\d{1,2}[-—－―]\d{7,8}[-—－―]\d{1,4}|'
                       '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|'
                       '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?)|'
                       '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
                       '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?|'
                       '[\（|\(]0[1-9]\d{1,2}[\）|\)]-?\d{7,8}-?\d{,4}|'
                       '[2-9]\d{6,7}')
    phone_entitys = []
    for _sentence in list_sentence:
        sentence_text = _sentence.sentence_text
        list_tokenbegin = []
        begin = 0
        for i in range(0, len(_sentence.tokens)):
            list_tokenbegin.append(begin)
            begin += len(str(_sentence.tokens[i]))
        list_tokenbegin.append(begin + 1)

        res_set = set()
        for i in re.finditer(phone, sentence_text):
            res_set.add((i.group(), i.start(), i.end()))
        res_set = sorted(list(res_set),key=lambda x:x[1])
        last_phone_mask = True
        for item_idx in range(len(res_set)):
            item = res_set[item_idx]
            phone_left = sentence_text[max(0, item[1] - 10):item[1]]
            phone_right = sentence_text[item[2]:item[2] + 8]
            # 排除“传真号”和其它错误项
            if re.search("传，?真|信，?箱|邮，?[箱件]", phone_left):
                if not re.search("电，?话", phone_left):
                    last_phone_mask = False
                    continue
            if re.search("注册[证号]|帐，?号|编，?[号码]|报，?价|证，?号|价，?格|[\(\（]万?元[\)\）]|[a-zA-Z]+\d*$", phone_left):
                last_phone_mask = False
                continue
            if re.search("^\d{0,4}[.,]\d{2,}|^[0-9a-zA-Z\.]*@|^\d*[a-zA-Z]+", phone_right):
                last_phone_mask = False
                continue
            # if:上一个phone实体不符合条件
            if not last_phone_mask:
                item_start = item[1]
                last_item_end = res_set[item_idx-1][2]
                if item_start - last_item_end<=1 or re.search("^\d+$",sentence_text[last_item_end:item_start]):
                    last_phone_mask = False
                    continue
            for j in range(len(list_tokenbegin)):
                if list_tokenbegin[j] == item[1]:
                    begin_index = j
                    break
                elif list_tokenbegin[j] > item[1]:
                    begin_index = j - 1
                    break
            for j in range(begin_index, len(list_tokenbegin)):
                if list_tokenbegin[j] >= item[2]:
                    end_index = j - 1
                    break
            _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, begin_index, end_index, item[1],
                             item[2])
            phone_entitys.append(_entity)
            last_phone_mask = True

    def is_company(entity,text):
        """Decide whether an entity is a real organisation mention rather than an address/place.

        Args:
            entity: object exposing ``label``, ``values`` (indexable by ``label``),
                ``is_tail`` and ``wordOffset_begin``.
            text: text of the sentence that contains ``entity``.

        Returns:
            True when the entity should be kept, False when it is a tail entity
            or its left context mentions an address, a place or a bank.
        """
        # A role label other than 5 with probability above 0.5 is accepted
        # immediately (presumably 5 is the "no role" class -- TODO confirm).
        if entity.label != 5 and entity.values[entity.label] > 0.5:
            return True
        # Read the flag from the parameter; the original read the enclosing
        # loop variable `ent`, which only worked because the caller passed it.
        if entity.is_tail == True:
            return False
        # Look at up to 10 characters before the entity, then keep the last 5.
        left_context = text[max(0, entity.wordOffset_begin - 10):entity.wordOffset_begin]
        # NOTE(review): this pattern matches only the literal sequence "，（）():：",
        # not any single one of those characters. It is kept unchanged because a
        # character class would also strip the colon that the bank check below needs.
        left_context = re.sub(r"，（）\(\):：", "", left_context)
        left_context = left_context[-5:]
        # "地址" (address), "地点" (place) or "银行:" (bank) right before the
        # entity marks it as part of an address or bank line.
        if re.search("地址|地点|银行[：:]", left_context):
            return False
        return True
    pre_entity = []
    for ent in list_entity:
        if (ent.entity_type in ['company','org','phone'] and is_company(ent,list_sentence[ent.sentence_index].sentence_text)) or (ent.entity_type=='person' and ent.label in [1,2,3]) \
                or (ent.entity_type=='location' and len(ent.entity_text)>5):
            pre_entity.append(ent)
    text_data,pre_data = relationExtraction_model.encode(pre_entity + phone_entitys, list_sentence)
    # print(pre_data)
    maxlen = 512
    relation_list = []
    if 0<len(text_data)<=maxlen:
        relation_list = relationExtraction_model.predict(text_data, pre_data)
    else:
        # 公告大于maxlen时，分段预测
        start = 0
        while start<len(pre_data):
            _pre_data = pre_data[start:start+maxlen]
            _text_data = text_data[start:start+maxlen]
            relation_list.extend(relationExtraction_model.predict(_text_data,_pre_data))
            start = start + maxlen - 120
        # 去重结果
        relation_list = list(set(relation_list))
    # print(relation_list)
    tokens_num_dict = dict()
    last_tokens_num = 0
    for sentence in list_sentence:
        _index = sentence.sentence_index
        if _index == 0:
            tokens_num_dict[_index] = 0
        else:
            tokens_num_dict[_index] = tokens_num_dict[_index - 1] + last_tokens_num
        last_tokens_num = len(sentence.tokens)
    right_combination = [('org','person'),('company','person'),('company','location'),('org','location'),('person','phone')]
    linked_company = set()
    linked_person = set()
    for predicate in ["rel_address","rel_phone","rel_person"]:
        _match_list = []
        _match_combo = []
        for relation in relation_list:
            _subject = relation[0]
            _object = relation[2]
            if isinstance(_subject,Entity) and isinstance(_object,Entity) and (_subject.entity_type,_object.entity_type) in right_combination:
                if relation[1]==predicate:
                    if predicate=="rel_person":
                        if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact):
                            continue
                    distance = (tokens_num_dict[_object.sentence_index] + _object.begin_index) - (
                                tokens_num_dict[_subject.sentence_index] + _subject.end_index)
                    if distance>0:
                        value = (-1 / 2 * (distance ** 2))/10000
                    else:
                        distance = abs(distance)
                        value = (-1 / 2 * (distance ** 2))
                    _match_list.append(Match(_subject,_object,value))
                    _match_combo.append((_subject,_object))
        match_result = dispatch(_match_list)
        error_list = []
        for mat in list(set(_match_combo)-set(match_result)):
            for temp in match_result:
                if mat[1]==temp[1] and mat[0]!=temp[0]:
                    error_list.append(mat)
                    break
        result = list(set(_match_combo)-set(error_list))
        if predicate=='rel_person':
            # Update state from back to front: an attribute that is already linked backward is not linked forward again (avoids wrong links)
            result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
            for combo in result:
                is_continue = False
                if not combo[0].pointer_person:
                    combo[0].pointer_person = []
                if combo[1].begin_index<combo[0].begin_index:
                    if combo[0].pointer_person:
                        for temp in combo[0].pointer_person:
                            if temp.begin_index>combo[0].begin_index:
                                is_continue = True
                                break
                if is_continue: continue
                combo[0].pointer_person.append(combo[1])
                linked_company.add(combo[0])
                linked_person.add(combo[1])
                # print(1,combo[0].entity_text,combo[1].entity_text)
        if predicate=='rel_address':
            result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
            for combo in result:
                if combo[0].pointer_address:
                    continue
                combo[0].pointer_address = combo[1]
                # print(2,combo[0].entity_text,combo[1].entity_text)
        if predicate=='rel_phone':
            result = sorted(result,key=lambda x:x[1].begin_index,reverse=True)
            for combo in result:
                is_continue = False
                if not combo[0].person_phone:
                    combo[0].person_phone = []
                if combo[1].begin_index<combo[0].begin_index:
                    if combo[0].person_phone:
                        for temp in combo[0].person_phone:
                            if temp.begin_index>combo[0].begin_index:
                                is_continue = True
                                break
                if is_continue: continue
                combo[0].person_phone.append(combo[1])
                if combo[0].label in [1,2]:
                    if PackDict.get("Project"):
                        for i in range(len(PackDict["Project"]["roleList"])):
                            if (combo[0].label==1 and PackDict["Project"]["roleList"][i].role_name=='tenderee') \
                                    or (combo[0].label==2 and PackDict["Project"]["roleList"][i].role_name=='agency'):
                                PackDict["Project"]["roleList"][i].linklist.append((combo[0].entity_text,combo[1].entity_text))
                                break
                # print(3,combo[0].entity_text,combo[1].entity_text)
    # 更新 PackDict
    not_sure_linked = []
    for link_p in list(linked_company):
        for k in PackDict.keys():
            for i in range(len(PackDict[k]["roleList"])):
                if PackDict[k]["roleList"][i].role_name == "tenderee":
                    if PackDict[k]["roleList"][i].entity_text != link_p.entity_text and link_p.label == 0:
                        not_sure_linked.append(link_p)
                        continue
                    if PackDict[k]["roleList"][i].entity_text == link_p.entity_text:
                        for per in link_p.pointer_person:
                            person_phone = [phone for phone in per.person_phone] if per.person_phone else []
                            if not person_phone:
                                if per.entity_text not in agency_contact:
                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
                                    continue
                            for _p in person_phone:
                                if per.entity_text not in agency_contact and _p.entity_text not in agency_phone:
                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
                elif PackDict[k]["roleList"][i].role_name == "agency":
                    if PackDict[k]["roleList"][i].entity_text != link_p.entity_text and link_p.label == 1:
                        not_sure_linked.append(link_p)
                        continue
                    if PackDict[k]["roleList"][i].entity_text == link_p.entity_text:
                        for per in link_p.pointer_person:
                            person_phone = [phone for phone in per.person_phone] if per.person_phone else []
                            if not person_phone:
                                if per.entity_text not in tenderee_contact:
                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
                                    continue
                            for _p in person_phone:
                                if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone:
                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
                else:
                    if PackDict[k]["roleList"][i].entity_text == link_p.entity_text:
                        for per in link_p.pointer_person:
                            person_phone = [phone for phone in per.person_phone] if per.person_phone else []
                            if not person_phone:
                                if per.entity_text not in tenderee_contact and per.entity_text not in agency_contact:
                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
                                    winter_contact.add(per.entity_text)
                                    continue
                            for _p in person_phone:
                                if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and \
                                        per.entity_text not in agency_contact and _p.entity_text not in agency_phone:
                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
                                    winter_contact.add(per.entity_text)
    # 更新org/company实体label为0，1的链接
    for link_p in not_sure_linked:
        for k in PackDict.keys():
            for i in range(len(PackDict[k]["roleList"])):
                if PackDict[k]["roleList"][i].role_name == "tenderee":
                    if link_p.label == 0:
                        for per in link_p.pointer_person:
                            person_phone = [phone for phone in per.person_phone] if per.person_phone else []
                            if not person_phone:
                                if per.entity_text not in agency_contact and per.entity_text not in winter_contact:
                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
                                    continue
                            for _p in person_phone:
                                if per.entity_text not in agency_contact and _p.entity_text not in agency_phone and per.entity_text not in winter_contact:
                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))
                elif PackDict[k]["roleList"][i].role_name == "agency":
                    if link_p.label == 1:
                        for per in link_p.pointer_person:
                            person_phone = [phone for phone in per.person_phone] if per.person_phone else []
                            if not person_phone:
                                if per.entity_text not in tenderee_contact and per.entity_text not in winter_contact:
                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, ""))
                                    continue
                            for _p in person_phone:
                                if per.entity_text not in tenderee_contact and _p.entity_text not in tenderee_phone and per.entity_text not in winter_contact:
                                    PackDict[k]["roleList"][i].linklist.append((per.entity_text, _p.entity_text))

    re_split = re.compile("[^\u4e00-\u9fa5、](十一|十二|十三|十四|十五|一|二|三|四|五|六|七|八|九|十)、")
    split_list = [0] * 16
    split_dict = {
        "一、": 1,
        "二、": 2,
        "三、": 3,
        "四、": 4,
        "五、": 5,
        "六、": 6,
        "七、": 7,
        "八、": 8,
        "九、": 9,
        "十、": 10,
        "十一、": 11,
        "十二、": 12,
        "十三、": 13,
        "十四、": 14,
        "十五、": 15
    }

    for item in re.finditer(re_split, _content):
        _index = split_dict.get(item.group()[1:])
        if not split_list[_index]:
            split_list[_index] = item.span()[0] + 1
    split_list = [i for i in split_list if i != 0]
    start = 0
    new_split_list = []
    for idx in split_list:
        new_split_list.append((start, idx))
        start = idx
    new_split_list.append((start, len(_content)))
    # 实体列表按照“公告分段”分组
    words_num_dict = dict()
    last_words_num = 0
    for sentence in list_sentence:
        _index = sentence.sentence_index
        if _index == 0:
            words_num_dict[_index] = 0
        else:
            words_num_dict[_index] = words_num_dict[_index - 1] + last_words_num
        last_words_num = len(sentence.sentence_text)

    # 公司-联系人连接（km算法）
    re_phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—－―]?\d{4}[-—－―]?\d{4}|'
                       '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
                       '0[1-9]\d{1,2}[-—－―][1-9]\d{6,7}/[1-9]\d{6,10}|'
                       '0[1-9]\d{1,2}[-—－―]\d{7,8}.?转\d{1,4}|'
                       '0[1-9]\d{1,2}[-—－―]\d{7,8}[-—－―]\d{1,4}|'
                       '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=1[3|4|5|6|7|8|9]\d{9})|'
                       '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?)|'
                       '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?(?=[1-9]\d{6,7})|'
                       '0[1-9]\d{1,2}[-—－―]?[1-9]\d{6}\d?|'
                       '[\（|\(]0[1-9]\d{1,2}[\）|\)]-?\d{7,8}-?\d{,4}|'
                       '[2-9]\d{6,7}')
    key_phone = re.compile("联系方式|电话|联系人|负责人")
    temporary_list2 = []
    for entity in list_entity:
        # if entity.entity_type in ['org', 'company', 'person'] and entity.is_tail==False:
        if entity.entity_type in ['org', 'company', 'person']:
            temporary_list2.append(entity)
    temporary_list2 = sorted(temporary_list2, key=lambda x: (x.sentence_index, x.begin_index))
    new_temporary_list2 = []
    for _split in new_split_list:
        temp_list = []
        for _entity in temporary_list2:
            if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[
                _entity.sentence_index] + _entity.wordOffset_end < _split[1]:
                temp_list.append(_entity)
            elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]:
                break
        new_temporary_list2.append(temp_list)
    # print(new_temporary_list2)
    match_list2 = []
    for split_index in range(len(new_temporary_list2)):
        split_entitys = new_temporary_list2[split_index]
        is_skip = False
        for index in range(len(split_entitys)):
            entity = split_entitys[index]
            if is_skip:
                is_skip = False
                continue
            else:
                if entity.entity_type in ['org', 'company']:
                    if entity.label != 5 or entity.entity_text in roleSet:
                        match_nums = 0
                        for after_index in range(index + 1, min(len(split_entitys), index + 4)):
                            after_entity = split_entitys[after_index]
                            if after_entity.entity_type in ['person']:
                                # 实体为中标人/候选人，联系人已确定类别【1，2】
                                if entity.label in [2, 3, 4] and after_entity.label in [1, 2]:
                                    break
                                if after_entity.label in [1, 2, 3]:
                                    distance = (tokens_num_dict[
                                                    after_entity.sentence_index] + after_entity.begin_index) - (
                                                           tokens_num_dict[entity.sentence_index] + entity.end_index)
                                    sentence_distance = after_entity.sentence_index - entity.sentence_index
                                    if sentence_distance == 0:
                                        if distance < 100:
                                            if (entity.label == 0 and after_entity.label == 1) or (
                                                    entity.label == 1 and after_entity.label == 2):
                                                distance = distance / 100
                                            value = (-1 / 2 * (distance ** 2)) / 10000
                                            match_list2.append(Match(entity, after_entity, value))
                                            match_nums += 1
                                    else:
                                        if distance < 60:
                                            if (entity.label == 0 and after_entity.label == 1) or (
                                                    entity.label == 1 and after_entity.label == 2):
                                                distance = distance / 100
                                            value = (-1 / 2 * (distance ** 2)) / 10000
                                            match_list2.append(Match(entity, after_entity, value))
                                            match_nums += 1
                            if after_entity.entity_type in ['org', 'company']:
                                # 解决在‘地址’中识别出org/company的问题
                                # if entity.label in [0,1] and after_index==index+1 and after_entity.label not in [0,1]:
                                if entity.label != 5 and after_index == index + 1 and (
                                        after_entity.label == entity.label or after_entity.label == 5):
                                    distance = (tokens_num_dict[
                                                    after_entity.sentence_index] + after_entity.begin_index) - (
                                                           tokens_num_dict[entity.sentence_index] + entity.end_index)
                                    if distance < 20:
                                        after_entity_left = list_sentence[after_entity.sentence_index].tokens[max(0,
                                                                                                                  after_entity.begin_index - 10):after_entity.begin_index]
                                        after_entity_right = list_sentence[after_entity.sentence_index].tokens[
                                                             after_entity.end_index + 1:after_entity.end_index + 6]
                                        after_entity_left = "".join(after_entity_left)
                                        if len(after_entity_left) > 20:
                                            after_entity_left = after_entity_left[-20:]
                                        after_entity_right = "".join(after_entity_right)[:10]
                                        if re.search("地，?址", after_entity_left):
                                            is_skip = True
                                            continue
                                        if re.search("\(|（", after_entity_left) and re.search("\)|）",
                                                                                              after_entity_right):
                                            is_skip = True
                                            continue
                                if entity.label in [0, 1] and after_entity.label in [0,
                                                                                     1] and entity.label == after_entity.label:
                                    break
                                if entity.label in [0, 1] and after_entity.label in [0, 1] and split_entitys[
                                    index + 1].entity_type == "person":
                                    break
                                if entity.label in [0, 1] and after_entity.label in [2, 3, 4]:
                                    break
                                if entity.label in [2, 3, 4] and after_entity.label in [0, 1]:
                                    break
                        # 搜索没有联系人的电话
                        mid_tokens = []
                        is_same_sentence = False
                        if index == len(split_entitys) - 1:
                            for i in range(entity.sentence_index, len(list_sentence)):
                                mid_tokens += list_sentence[i].tokens
                            mid_tokens = mid_tokens[entity.end_index + 1:]
                            mid_sentence = "".join(mid_tokens)
                            have_phone = re.findall(re_phone, mid_sentence)
                            if have_phone:
                                if re.findall(re_phone, mid_sentence.split("。")[0]):
                                    is_same_sentence = True
                                _phone = have_phone[0]
                                phone_begin = mid_sentence.find(_phone)
                                if words_num_dict[entity.sentence_index] + entity.wordOffset_begin + phone_begin < \
                                        new_split_list[split_index][1]:
                                    mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace("，", "")
                                    if re.search(key_phone, mid_sentence):
                                        distance = 1
                                        if is_same_sentence:
                                            if phone_begin <= 200:
                                                value = (-1 / 2 * (distance ** 2)) / 10000
                                                match_list2.append(Match(entity, (entity, _phone), value))
                                                match_nums += 1
                                        else:
                                            if phone_begin <= 60:
                                                value = (-1 / 2 * (distance ** 2)) / 10000
                                                match_list2.append(Match(entity, (entity, _phone), value))
                                                match_nums += 1
                        else:
                            next_entity = split_entitys[index + 1]
                            if entity.sentence_index == next_entity.sentence_index:
                                mid_tokens += list_sentence[entity.sentence_index].tokens[
                                              entity.end_index + 1:next_entity.begin_index]
                            else:
                                sentence_index = entity.sentence_index
                                while sentence_index <= next_entity.sentence_index:
                                    mid_tokens += list_sentence[sentence_index].tokens
                                    sentence_index += 1
                                mid_tokens = mid_tokens[entity.end_index + 1:-(len(
                                    list_sentence[next_entity.sentence_index].tokens) - next_entity.begin_index) + 1]
                            mid_sentence = "".join(mid_tokens)
                            have_phone = re.findall(re_phone, mid_sentence)
                            if have_phone:
                                if re.findall(re_phone, mid_sentence.split("。")[0]):
                                    is_same_sentence = True
                                _phone = have_phone[0]
                                phone_begin = mid_sentence.find(_phone)
                                mid_sentence = mid_sentence[max(0, phone_begin - 15):phone_begin].replace("，", "")
                                if re.search(key_phone, mid_sentence):
                                    p_phone = [p.entity_text for p in next_entity.person_phone] if next_entity.person_phone else []
                                    if next_entity.entity_type == 'person' and _phone in p_phone:
                                        pass
                                    else:
                                        distance = (tokens_num_dict[
                                                        next_entity.sentence_index] + next_entity.begin_index) - (
                                                           tokens_num_dict[entity.sentence_index] + entity.end_index)
                                        distance = distance / 2
                                        if is_same_sentence:
                                            if phone_begin <= 200:
                                                value = (-1 / 2 * (distance ** 2)) / 10000
                                                match_list2.append(Match(entity, (entity, _phone), value))
                                                match_nums += 1
                                        else:
                                            if phone_begin <= 60:
                                                value = (-1 / 2 * (distance ** 2)) / 10000
                                                match_list2.append(Match(entity, (entity, _phone), value))
                                                match_nums += 1
                        # 实体无匹配时，尝试前向查找匹配
                        if not match_nums:
                            if entity.label != 5 and entity.values[entity.label] > 0.5 and index != 0:
                                previous_entity = split_entitys[index - 1]
                                if previous_entity.entity_type == 'person' and previous_entity.label in [1, 2, 3]:
                                    if entity.label in [2, 3, 4] and previous_entity.label in [1, 2]:
                                        continue
                                    if previous_entity.sentence_index == entity.sentence_index:
                                        distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
                                                tokens_num_dict[
                                                    previous_entity.sentence_index] + previous_entity.end_index)
                                        if distance < 20:
                                            # 距离相等时，前向添加处罚值
                                            # distance += 1
                                            # 前向 没有 /10000
                                            value = (-1 / 2 * (distance ** 2))
                                            match_list2.append(Match(entity, previous_entity, value))
    # print(match_list2)
    match_list2 = [mat for mat in match_list2 if mat.main_role not in linked_company and mat.attribute not in linked_person]
    # print(match_list2)
    # km算法分配求解
    result2 = dispatch(match_list2)
    # print(result2)
    for match in result2:
        entity = match[0]
        # print(entity.entity_text)
        # print(match.attribute)
        entity_index = list_entity.index(entity)
        is_update = False
        if isinstance(match[1], tuple):
            person_ = ''
            phone_ = match[1][1].split("/") # 分割多个号码
            # print(person_,phone_)
        else:
            person_ = match[1].entity_text
            phone_ = [i.entity_text for i in match[1].person_phone] if match[1].person_phone else []
        for k in PackDict.keys():
            for i in range(len(PackDict[k]["roleList"])):
                if PackDict[k]["roleList"][i].role_name == "tenderee":
                    if not PackDict[k]["roleList"][i].linklist:
                        if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 0:
                            if person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0 and person_ not in winter_contact:
                                if not phone_:
                                    PackDict[k]["roleList"][i].linklist.append((person_, ""))
                                for p in phone_:
                                    # if not person_ and len()
                                    PackDict[k]["roleList"][i].linklist.append((person_, p))
                                is_update = True
                elif PackDict[k]["roleList"][i].role_name == "agency":
                    if not PackDict[k]["roleList"][i].linklist:
                        if PackDict[k]["roleList"][i].entity_text == entity.entity_text or entity.label == 1 and person_ not in winter_contact:
                            if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0:
                                if not phone_:
                                    PackDict[k]["roleList"][i].linklist.append((person_, ""))
                                for p in phone_:
                                    PackDict[k]["roleList"][i].linklist.append((person_, p))
                                is_update = True
                else:
                    if PackDict[k]["roleList"][i].entity_text == entity.entity_text:
                        if not PackDict[k]["roleList"][i].linklist:
                            if person_ not in tenderee_contact and len(set(phone_)&set(tenderee_phone))==0 and \
                                    person_ not in agency_contact and len(set(phone_)&set(agency_phone))==0:
                                if not phone_:
                                    PackDict[k]["roleList"][i].linklist.append((person_, ""))
                                for p in phone_:
                                    PackDict[k]["roleList"][i].linklist.append((person_, p))
                                is_update = True
        if not person_:
            is_update = False
        if is_update:
            # 更新 list_entity
            if not list_entity[entity_index].pointer_person:
                list_entity[entity_index].pointer_person = []
            list_entity[entity_index].pointer_person.append(match[1])

    linked_person = []
    linked_persons_with = []
    for company_entity in [entity for entity in list_entity if entity.entity_type in ['company','org']]:
        if company_entity.pointer_person:
            for _person in company_entity.pointer_person:
                linked_person.append(_person)
                linked_persons_with.append(company_entity)

    # 一个公司对应多个联系人的补充
    person_entitys = [entity for entity in list_entity if entity.entity_type=='person']
    person_entitys = person_entitys[::-1]
    for index in range(len(person_entitys)):
        entity = person_entitys[index]
        prepare_link = []
        if entity not in linked_person:
            prepare_link.append(entity)
            last_person = entity
            for after_index in range(index + 1, min(len(person_entitys), index + 5)):
                after_entity = person_entitys[after_index]
                if after_entity.sentence_index==last_person.sentence_index and last_person.begin_index-after_entity.end_index<5:
                    if after_entity in linked_person:
                        _index = linked_person.index(after_entity)
                        with_company = linked_persons_with[_index]
                        for i in range(len(PackDict["Project"]["roleList"])):
                            if PackDict["Project"]["roleList"][i].role_name == "tenderee":
                                if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 0:
                                    for item in prepare_link:
                                        person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
                                        for _p in person_phone:
                                            PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
                                        with_company.pointer_person.append(item)
                                        linked_person.append(item)
                            elif PackDict["Project"]["roleList"][i].role_name == "agency":
                                if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text or with_company.label == 1:
                                    for item in prepare_link:
                                        person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
                                        for _p in person_phone:
                                            PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
                                        with_company.pointer_person.append(item)
                                        linked_person.append(item)
                            else:
                                if PackDict["Project"]["roleList"][i].entity_text == with_company.entity_text:
                                    for item in prepare_link:
                                        person_phone = [p.entity_text for p in item.person_phone] if item.person_phone else []
                                        for _p in person_phone:
                                            PackDict["Project"]["roleList"][i].linklist.append((item.entity_text, _p))
                                        with_company.pointer_person.append(item)
                                        linked_person.append(item)
                        break
                    else:
                        prepare_link.append(after_entity)
                        last_person = after_entity
                        continue

    # 统一同类角色的属性
    if PackDict.get("Project"):
        for i in range(len(PackDict["Project"]["roleList"])):
            # if PackDict["Project"]["roleList"][i].role_name in ["tenderee","agency"]:
            for _entity in list_entity:
                if _entity.entity_type in ['org','company']:
                    is_similar = False
                    # entity_text相同
                    if _entity.entity_text==PackDict["Project"]["roleList"][i].entity_text:
                        is_similar = True
                    # entity.label为【0，1】
                    if _entity.label in [0,1] and dict_role_id[str(_entity.label)]==PackDict["Project"]["roleList"][i].role_name:
                        is_similar = True
                    if is_similar:
                        linked_entitys = _entity.linked_entitys
                        if linked_entitys:
                            for linked_entity in linked_entitys:
                                pointer_person = linked_entity.pointer_person if linked_entity.pointer_person else []
                                for _pointer_person in pointer_person:
                                    _phone = [p.entity_text for p in _pointer_person.person_phone] if _pointer_person.person_phone else []
                                    for _p in _phone:
                                        if (_pointer_person.entity_text,_p) not in PackDict["Project"]["roleList"][i].linklist:
                                            PackDict["Project"]["roleList"][i].linklist.append((_pointer_person.entity_text,_p))

    # "roleList"中联系人电话去重
    for i in range(len(PackDict["Project"]["roleList"])):
        # print(123, PackDict["Project"]["roleList"][i].linklist)
        # 带有联系人的电话
        with_person = [person_phone[1] for person_phone in PackDict["Project"]["roleList"][i].linklist if person_phone[0]]
        # 带有电话的联系人
        with_phone = [person_phone[0] for person_phone in PackDict["Project"]["roleList"][i].linklist if person_phone[1]]
        remove_list = []
        for item in PackDict["Project"]["roleList"][i].linklist:
            if not item[0]:
                if item[1] in with_person:
                    # 删除重复的无联系人电话
                    remove_list.append(item)
            elif not item[1]:
                if item[0] in with_phone:
                    remove_list.append(item)
        for _item in remove_list:
            PackDict["Project"]["roleList"][i].linklist.remove(_item)

    # 联系人——电子邮箱链接
    temporary_list3 = [entity for entity in list_entity if entity.entity_type=='email' or (entity.entity_type=='person' and entity.label in [1,2,3])]
    temporary_list3 = sorted(temporary_list3, key=lambda x: (x.sentence_index, x.begin_index))
    new_temporary_list3 = []
    for _split in new_split_list:
        temp_list = []
        for _entity in temporary_list3:
            if words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[0] and words_num_dict[
                _entity.sentence_index] + _entity.wordOffset_end < _split[1]:
                temp_list.append(_entity)
            elif words_num_dict[_entity.sentence_index] + _entity.wordOffset_begin >= _split[1]:
                break
        new_temporary_list3.append(temp_list)
    # print(new_temporary_list3)
    match_list3 = []
    for split_index in range(len(new_temporary_list3)):
        split_entitys = new_temporary_list3[split_index]
        for index in range(len(split_entitys)):
            entity = split_entitys[index]
            if entity.entity_type == 'person':
                match_nums = 0
                for after_index in range(index + 1, min(len(split_entitys), index + 4)):
                    after_entity = split_entitys[after_index]
                    if match_nums > 2:
                        break
                    if after_entity.entity_type == 'email':
                        distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
                                           tokens_num_dict[entity.sentence_index] + entity.end_index)
                        sentence_distance = after_entity.sentence_index - entity.sentence_index
                        if sentence_distance == 0:
                            if distance < 100:
                                if (entity.label == 0 and after_entity.label == 1) or (
                                        entity.label == 1 and after_entity.label == 2):
                                    distance = distance / 100
                                value = (-1 / 2 * (distance ** 2)) / 10000
                                match_list3.append(Match(entity, after_entity, value))
                                match_nums += 1
                        else:
                            if distance < 60:
                                if (entity.label == 0 and after_entity.label == 1) or (
                                        entity.label == 1 and after_entity.label == 2):
                                    distance = distance / 100
                                value = (-1 / 2 * (distance ** 2)) / 10000
                                match_list3.append(Match(entity, after_entity, value))
                                match_nums += 1
                # 前向查找匹配
                # if not match_nums:
                if index != 0:
                    previous_entity = split_entitys[index - 1]
                    if previous_entity.entity_type == 'email':
                        if previous_entity.sentence_index == entity.sentence_index:
                            distance = (tokens_num_dict[entity.sentence_index] + entity.begin_index) - (
                                    tokens_num_dict[
                                        previous_entity.sentence_index] + previous_entity.end_index)
                            if distance < 30:
                                # 距离相等时，前向添加处罚值
                                # distance += 1
                                # 前向 没有 /10000
                                value = (-1 / 2 * (distance ** 2))
                                match_list3.append(Match(entity, previous_entity, value))
    # print(match_list3)
    # km算法分配求解
    result3 = dispatch(match_list3)
    for match in result3:
        match_person = match[0]
        match_email = match[1]
        match_person.pointer_email = match_email

    # # 1）第一个公司实体的招标人，则看看下一个实体是否为代理人，如果是则联系人错位连接 。2）在同一句中往后找联系人。3）连接不上在整个文章找联系人。
    # temp_ent_list = []  # 临时列表，记录0,1角色及3联系人
    # other_person = []  # 阈值以上的联系人列表
    # link_person = []   # 有电话没联系上角色的person列表
    # other_ent = []
    # link_ent = []
    # found_person = False
    # ent_list = []
    # for entity in list_entity:
    #     if entity.entity_type in ['org','company','person']:
    #         ent_list.append(entity)
    # # ent_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']]
    # #for list_index in range(len(ent_list)):
    #     #if ent_list[list_index].entity_type in ['org','company'] and ent_list[list_index].label == 0 and list_index+2<len(ent_list) and \
    #        #ent_list[list_index+1].entity_type in ['org','company'] and ent_list[list_index+1].label == 1 and ent_list[list_index+2].entity_type in ['person']:
    #         #ent_list[list_index+1], ent_list[list_index+2] = ent_list[list_index+2], ent_list[list_index+1]
    # # 2020/11/25增加确定角色联系人判断
    # sure_person_set = set([entity.entity_text for entity in ent_list if entity.entity_type == 'person' and entity.label in [1, 2]])
    # # 招标/代理在同一句中交叉情况的处理
    # for index in range(len(ent_list)):
    #     entity = ent_list[index]
    #     if entity.entity_text in roleSet and entity.label in [0, 1] and index+3<len(ent_list):
    #         if entity.sentence_index==ent_list[index+1].sentence_index==ent_list[index+2].sentence_index==ent_list[index+3].sentence_index:
    #             if ent_list[index+1].begin_index - entity.end_index < 30:
    #                 if ent_list[index+1].entity_text in roleSet and ent_list[index+1].label in [0, 1] and entity.label!=ent_list[index+1].label:
    #                     if ent_list[index+2].entity_type=="person" and ent_list[index+3].entity_type=="person" and \
    #                             ent_list[index+2].label==3 and ent_list[index+3].label==3:
    #                         ent_list[index + 1], ent_list[index + 2] = ent_list[index + 2], ent_list[index + 1]
    #
    #
    # for index in range(len(ent_list)):
    #     entity = ent_list[index]
    #     if entity.entity_type=="person":
    #         if str(entity.label) == "0":  # 2020/11/25 非联系人直接跳过
    #             continue
    #         if entity.values[entity.label]>on_value_person:
    #             if str(entity.label)=="1":
    #                 for i in range(len(PackDict["Project"]["roleList"])):
    #                     if PackDict["Project"]["roleList"][i].role_name=="tenderee":
    #                         PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
    #                         link_person.append(entity.entity_text)
    #                         link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
    #                 # add pointer_person
    #                 for _entity in list_entity:
    #                     if dict_role_id.get(str(_entity.label))=="tenderee":
    #                         for i in range(len(PackDict["Project"]["roleList"])):
    #                             if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="tenderee":
    #                                 _entity.pointer_person = entity
    #             elif str(entity.label)=="2":
    #                 for i in range(len(PackDict["Project"]["roleList"])):
    #                     if PackDict["Project"]["roleList"][i].role_name=="agency":
    #                         PackDict["Project"]["roleList"][i].linklist.append((entity.entity_text,entity.person_phone))
    #                         link_person.append(entity.entity_text)
    #                         link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
    #                 # add pointer_person
    #                 for _entity in list_entity:
    #                     if dict_role_id.get(str(_entity.label))=="agency":
    #                         for i in range(len(PackDict["Project"]["roleList"])):
    #                             if PackDict["Project"]["roleList"][i].entity_text==_entity.entity_text and PackDict["Project"]["roleList"][i].role_name=="agency":
    #                                 _entity.pointer_person = entity
    #             elif str(entity.label)=="3":
    #                 if entity.entity_text in sure_person_set:  # 2020/11/25 排除已经确定角色的联系人
    #                     continue
    #                 #not_link_person.append((entity_after.entity_text,entity_after.person_phone))
    #                 other_person.append(entity.entity_text)
    #                 temp_ent_list.append((entity.entity_text,entity.person_phone,entity))
    #
    #     #if entity.entity_text in roleSet:
    #     if entity.entity_text in roleSet:
    #         if entity.label in [0,1]:
    #             other_ent.append(entity.entity_text)
    #             temp_ent_list.append((entity.entity_text, entity.label,entity))
    #         for behind_index in range(index+1, len(ent_list)):
    #             entity_after = ent_list[behind_index]
    #             if entity_after.sentence_index-entity.sentence_index>=1 or entity_after.entity_type in ['org','company']:   # 只在本句中找联系人
    #                 break
    #             if entity_after.values is not None:
    #                 if entity_after.entity_type=="person":
    #                     if str(entity_after.label) == "0": # 2020/11/25角色后面为非联系人 停止继续往后找
    #                         break
    #                     if entity_after.values[entity_after.label]>on_value_person:
    #                         if str(entity_after.label)=="1":
    #                             for i in range(len(PackDict["Project"]["roleList"])):
    #                                 if PackDict["Project"]["roleList"][i].role_name=="tenderee":
    #                                     PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
    #                                     link_person.append(entity_after.entity_text)
    #                                     link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
    #                         elif str(entity_after.label)=="2":
    #                             for i in range(len(PackDict["Project"]["roleList"])):
    #                                 if PackDict["Project"]["roleList"][i].role_name=="agency":
    #                                     PackDict["Project"]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
    #                                     link_person.append(entity_after.entity_text)
    #                                     link_ent.append(PackDict["Project"]["roleList"][i].entity_text)
    #                         elif str(entity_after.label)=="3":
    #                             if entity_after.entity_text in sure_person_set: # 2020/11/25 如果姓名已经出现在确定角色联系人中则停止往后找
    #                                 break
    #                             elif entity_after.begin_index - entity.end_index > 30:#2020/10/25 如果角色实体与联系人实体间隔大于阈值停止
    #                                 break
    #                             for pack in PackDict.keys():
    #                                 for i in range(len(PackDict[pack]["roleList"])):
    #                                     if PackDict[pack]["roleList"][i].entity_text==entity.entity_text:
    #                                         #if entity_after.sentence_index-entity.sentence_index>1 and len(roleList[i].linklist)>0:
    #                                             #break
    #                                         PackDict[pack]["roleList"][i].linklist.append((entity_after.entity_text,entity_after.person_phone))
    #                                         link_person.append(entity_after.entity_text)
    #                                         #add pointer_person
    #                                         entity.pointer_person = entity_after
    #
    # not_link_person = [person for person in other_person if person not in link_person]
    # not_link_ent = [ent for ent in other_ent if ent not in link_ent]
    # if len(not_link_person) > 0 and len(not_link_ent) > 0 :
    #     item = temp_ent_list
    #     for i in range(len(item)):
    #         if item[i][0] in not_link_ent and item[i][1] == 0 and i+3 < len(item):
    #             if item[i+1][0] in other_ent and item[i+1][1] == 1 and item[i+2][0] in other_person and item[i+3][0] in other_person:
    #                 item[i+1], item[i+2] = item[i+2], item[i+1]
    #     for i in range(len(item)-1, -1, -1):
    #         if item[i][0] in not_link_ent:
    #             for pack in PackDict.keys():
    #                 for role in PackDict[pack]["roleList"]:
    #                     if role.entity_text == item[i][0] and len(role.linklist) < 1:
    #                         for j in range(i+1, len(item)):
    #                             if item[j][0] in not_link_person:
    #                                 role.linklist.append(item[j][:2])
    #                                 #add pointer_person
    #                                 item[i][2].pointer_person = item[j][2]
    #                                 break
    #                             else:
    #                                 break
    # # 电话没有联系人的处理
    # role_with_no_phone = []
    # for i in range(len(PackDict["Project"]["roleList"])):
    #     if PackDict["Project"]["roleList"][i].role_name in ["tenderee","agency"]:
    #         if len(PackDict["Project"]["roleList"][i].linklist)==0: # 找出没有联系人的招标/代理人
    #             role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text)
    #         else:
    #             phone_nums = 0
    #             for link in PackDict["Project"]["roleList"][i].linklist:
    #                 if link[1]:
    #                     phone_nums += 1
    #                     break
    #             if not phone_nums:
    #                 role_with_no_phone.append(PackDict["Project"]["roleList"][i].entity_text)
    # if role_with_no_phone:
    #     phone_with_person = [entity.person_phone for entity in list_entity if entity.entity_type == "person"]
    #     # phone_with_person = [phone for phone in phone_with_person if phone]
    #
    #     dict_index_sentence = {}
    #     for _sentence in list_sentence:
    #         dict_index_sentence[_sentence.sentence_index] = _sentence
    #     new_entity_list = [entity for entity in list_entity if entity.entity_type in ['org','company','person']]
    #     for index in range(len(new_entity_list)):
    #         entity = new_entity_list[index]
    #         if entity.entity_text in role_with_no_phone:
    #             e_sentence = dict_index_sentence[entity.sentence_index]
    #             entity_right = e_sentence.tokens[entity.end_index:entity.end_index+40]
    #             entity_right = "".join(entity_right)
    #             if index+1<len(new_entity_list) and entity_right.find(new_entity_list[index+1].entity_text)>-1:
    #                 entity_right = entity_right[:entity_right.find(new_entity_list[index+1].entity_text)]
    #             have_phone = re.findall(phone,entity_right)
    #             if have_phone:
    #                 _phone = have_phone[0]
    #                 phone_begin = entity_right.find(_phone)
    #                 if _phone not in phone_with_person and re.search(key_phone,entity_right[:phone_begin]):
    #                     # entity.person_phone = _phone
    #                     for i in range(len(PackDict["Project"]["roleList"])):
    #                         if PackDict["Project"]["roleList"][i].entity_text == entity.entity_text:
    #                             PackDict["Project"]["roleList"][i].linklist.append(('', _phone))

    
    #寻找多标段招标金额
    p_entity = len(list_entity)-1

    set_tenderer_money = set()
    list_tenderer_money = []  #2021/7/16 新增列表，倒序保存所有中标金额
    unit_list = [] #2021/8/17 新增，保存金额单位

    #遍历所有实体
    while(p_entity>=0):
        entity = list_entity[p_entity]
        if entity.entity_type=="money":
            # 2021/12/03 添加成本警戒线、保证金
            if entity.notes in ['保证金', '成本警戒线']:
                packagePointer, _flag = getPackage(PackageList, entity.sentence_index, entity.begin_index,
                                                   "money-" + str(entity.label), MAX_DIS=2, DIRECT="L")
                if packagePointer is None:
                    packageName = "Project"
                else:
                    packageName = packagePointer.entity_text

                if packageName == "Project":
                    # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
                    #     PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
                    if entity.notes=="保证金" and "bond" not in PackDict["Project"]:
                        PackDict["Project"]["bond"] = float(entity.entity_text)
                    elif entity.notes=="成本警戒线" and "cost_warning" not in PackDict["Project"]:
                        PackDict["Project"]["cost_warning"] = float(entity.entity_text)

                else:
                    if entity.notes == "保证金" and "bond" not in PackDict[packageName]:
                        PackDict[packageName]["bond"] = float(entity.entity_text)
                    elif entity.notes == "成本警戒线" and "cost_warning" not in PackDict[packageName]:
                        PackDict[packageName]["cost_warning"] = float(entity.entity_text)

            elif entity.values[entity.label]>=on_value:
                if str(entity.label)=="1":
                    set_tenderer_money.add(float(entity.entity_text))
                    list_tenderer_money.append(float(entity.entity_text))  # 2021/7/16 新增列表，倒序保存所有中标金额
                    unit_list.append(entity.money_unit)
                # if str(entity.label)=="0":
                if str(entity.label)=="0" and entity.notes!='总投资':
                    '''
                    if p_entity>0:
                        p_before = list_entity[p_entity-1]
                        if p_before.entity_type=="money" and p_before.label==entity.label and p_before.entity_text==entity.entity_text and abs(entity.begin_index-p_before.end_index)<=2:
                            p_entity -= 1
                            continue
                    '''
                    packagePointer,_flag = getPackage(PackageList,entity.sentence_index,entity.begin_index,"money-"+str(entity.label),MAX_DIS=2,DIRECT="L")
                    if packagePointer is None:
                        packageName = "Project"
                    else:
                        packageName = packagePointer.entity_text
                        
                    if packageName=="Project":
                        # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
                        #     PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
                        if entity.values[entity.label]>on_value:
                            PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
                            PackDict["Project"]["tendereeMoneyUnit"] = entity.money_unit
                    else:
                        PackDict[packageName]["tendereeMoney"] = float(entity.entity_text)
                        PackDict[packageName]["tendereeMoneyUnit"] = entity.money_unit
                        #add pointer_tendereeMoney
                        packagePointer.pointer_tendereeMoney = entity
        p_entity -= 1            
    
        
    #删除一个机构有多个角色的数据
    #删除重复人、概率不回传
    final_roleList = []
    list_pop = []
    set_tenderer_role = set()
    dict_pack_tenderer_money = dict()

    for pack in PackDict.keys():
        #删除无效包
        if PackDict[pack]["code"]=="" and PackDict[pack]["tendereeMoney"]==0 and len(PackDict[pack]["roleList"])==0:
            list_pop.append(pack)
        for i in range(len(PackDict[pack]["roleList"])):
            if PackDict[pack]["roleList"][i].role_name=="win_tenderer":
                if PackDict[pack]["roleList"][i].money==0:
                    set_tenderer_role.add(PackDict[pack]["roleList"][i])
                    dict_pack_tenderer_money[pack] = [PackDict[pack]["roleList"][i],set()]
    #找到包的中投标金额
    for _index in range(len(PackageList)):
        if "hit" in PackageList[_index]:
            for _hit in list(PackageList[_index]["hit"]):
                _money = float(_hit.split("-")[1]) if _hit.split("-")[0]=="money" else None
                if PackageList[_index]["name"] in dict_pack_tenderer_money and _money is not None:
                    dict_pack_tenderer_money[PackageList[_index]["name"]][1].add(_money)
    #只找到一个中标人和中标金额
    if len(set_tenderer_money)==1 and len(set_tenderer_role)==1:
        list(set_tenderer_role)[0].money = list(set_tenderer_money)[0]
        list(set_tenderer_role)[0].money_unit = unit_list[0]
        # print('一个中标人一个金额：', list(set_tenderer_money)[0])
    #找到一个中标人和多个招标金额
    if len(set_tenderer_money)>1 and len(set_tenderer_role)==1:
        _maxMoney = 0
        _sumMoney = 0
        for _m in list(set_tenderer_money):
            _sumMoney += _m
            if _m>_maxMoney:
                _maxMoney = _m
        if _sumMoney/_maxMoney==2:
            list(set_tenderer_role)[0].money = _maxMoney
            # print('一人多金额分项合计 取最大金额：', _maxMoney)
        else:
            # list(set_tenderer_role)[0].money = _maxMoney
            if min(list_tenderer_money)>200000 and list_tenderer_money[-1]/min(list_tenderer_money)>9000:
                list(set_tenderer_role)[0].money = min(list_tenderer_money)
                list(set_tenderer_role)[0].money_unit = unit_list[list_tenderer_money.index(min(list_tenderer_money))]
                # print('一人多金额 且最小的大于20万第一个金额比最小金额大几千倍的最小中标金额：', min(list_tenderer_money))
            else:
                list(set_tenderer_role)[0].money = list_tenderer_money[-1]  # 2021/7/16 修改 不是单价合计方式取第一个中标金额
                list(set_tenderer_role)[0].money_unit = unit_list[-1] # 金额单位
                # print('一人多金额 取第一个中标金额：', list_tenderer_money[-1])
    #每个包都只找到一个金额
    _flag_pack_money = True
    for k,v in dict_pack_tenderer_money.items():
        if len(v[1])!=1:
            _flag_pack_money = False
    if _flag_pack_money and len(PackageSet)==len(dict_pack_tenderer_money.keys()):
        for k,v in dict_pack_tenderer_money.items():
            v[0].money = list(v[1])[0]
            # print('k,v in dict_pack_tenderer_money.items', k, v)

    # 2021/7/16 #增加判断中标金额是否远大于招标金额逻辑
    for pack in PackDict.keys():
        for i in range(len(PackDict[pack]["roleList"])):
            if PackDict[pack]["tendereeMoney"] > 0:
                # print('金额数据类型：',type(PackDict[pack]["roleList"][i].money))
                if float(PackDict[pack]["roleList"][i].money) >10000000 and \
                        float(PackDict[pack]["roleList"][i].money)/float(PackDict[pack]["tendereeMoney"])>=1000:
                    PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
                    # print('招标金额校正中标金额')

    # 2021/7/19 #增加判断中标金额是否远大于第二三中标金额
    for pack in PackDict.keys():
        tmp_moneys = []
        for i in range(len(PackDict[pack]["roleList"])):
            if float(PackDict[pack]["roleList"][i].money) >100000:
                tmp_moneys.append(float(PackDict[pack]["roleList"][i].money))
        if len(tmp_moneys)>2 and max(tmp_moneys)/min(tmp_moneys)>1000:
            for i in range(len(PackDict[pack]["roleList"])):
                if float(PackDict[pack]["roleList"][i].money)/min(tmp_moneys)>1000:
                    PackDict[pack]["roleList"][i].money = float(PackDict[pack]["roleList"][i].money) / 10000
                    # print('通过其他中标人投标金额校正中标金额')


    for pack in PackDict.keys():
        for i in range(len(PackDict[pack]["roleList"])):
            PackDict[pack]["roleList"][i] = PackDict[pack]["roleList"][i].getString()

    for item in list_pop:
        PackDict.pop(item)
        
    return PackDict 

def initPackageAttr(RoleList,PackageSet):
    '''
    @summary: Build the initial per-package result dict from the extracted roles and package names.
    @param:
        RoleList: iterable of role entities; each must expose packageName, packageCode,
                  role_name and entity_text
        PackageSet: iterable of package names found in the article
    @return: dict keyed by package name (always including "Project"); each value holds
             "code", "tendereeMoney", "roleList" and "tendereeMoneyUnit"
    '''
    def _blank_package():
        # A fresh dict (and a fresh roleList) per package so entries never share state.
        return {"code":"","tendereeMoney":0,"roleList":[], 'tendereeMoneyUnit':''}

    packages = {"Project": _blank_package()}
    for package_name in PackageSet:
        packages[package_name] = _blank_package()

    for role_entity in RoleList:
        slot = packages[role_entity.packageName]
        # Only the first role seen for a package contributes the package code.
        if slot["code"] == "":
            slot["code"] = role_entity.packageCode
        # Per the original note, Role takes (role name, entity name, role threshold, money,
        # money threshold, link list[, money unit]) -- confirm against the Role definition.
        slot["roleList"].append(Role(role_entity.role_name, role_entity.entity_text, 0, 0, 0.0, []))
    return packages
                
def getPackageRoleMoney(list_sentence,list_entity):
    '''
    Run the role -> package -> attribute pipeline for a single article.
    @param:
        list_sentence: list of sentences of the article
        list_entity: list of entities of the article
    @return: an empty list when getRoleList yields nothing; otherwise the package dict
             produced by initPackageAttr after findAttributeAfterEntity has processed it
             (package - section code - role - entity name - money - contact - phone,
             per the original description)
    '''
    role_info = getRoleList(list_sentence, list_entity)
    if role_info:
        RoleList, RoleSet, PackageList, PackageSet = role_info
        package_dict = initPackageAttr(RoleList, PackageSet)
        return findAttributeAfterEntity(package_dict, RoleSet, PackageList, PackageSet,
                                        list_entity, list_sentence)
    return []

# Raw bid-way phrases mapped to their canonical category. Lookup is by exact match.
_BIDWAY_CANONICAL = {
    "邀请招标": "邀请招标",
    "采购方式：邀请": "邀请招标",
    "询价": "询价",
    "询单": "询价",
    "询比": "询价",
    "采购方式：询价": "询价",
    "竞谈": "竞争性谈判",
    "竞争性谈判": "竞争性谈判",
    "公开竞谈": "竞争性谈判",
    "竞争性磋商": "竞争性磋商",
    "磋商": "竞争性磋商",
    "竞价": "竞价",
    "竞标": "竞价",
    "电子竞价": "竞价",
    "以电子竞价": "竞价",
    "电子书面竞投": "竞价",
    "公开招标": "公开招标",
    "网上电子投标": "公开招标",
    "网上招标": "公开招标",
    "采购方式：公开": "公开招标",
    "招标为其他": "公开招标",
    "单一来源": "单一来源",
    "比选": "比选",
}


def turnBidWay(bidway):
    '''
    Normalize a raw bid-way phrase to its canonical category.
    @param:
        bidway: raw bid-way text (must be hashable; None is accepted)
    @return: the canonical category string, or "其他" when the phrase is not recognized
    '''
    # Exact-match lookup. The previous `bidway in ("单一来源")` / `in ("比选")` checks were
    # substring tests against a plain string (no trailing comma, so not a tuple), which
    # classified "" and fragments such as "单一" or "比" as a known category and raised
    # TypeError for None.
    return _BIDWAY_CANONICAL.get(bidway, "其他")

def getOtherAttributes(list_entity):
    '''
    Collect the article-level attributes that are not tied to a package or role.
    @param:
        list_entity: list of entities of the article; the attributes read depend on
                     entity_type (entity_text, label, values, notes, money_unit)
    @return: dict with moneysource, person_review, time_release, time_bidopen,
             time_bidclose, serviceTime, product, total_tendereeMoney and
             total_tendereeMoneyUnit; a "bidway" key is added only when a bidway
             entity is present
    '''
    attributes = {"moneysource":"",
                  "person_review":[],
                  "time_release":"",
                  "time_bidopen":"",
                  "time_bidclose":"",
                  "serviceTime":"",
                  "product":[],
                  "total_tendereeMoney":0,
                  "total_tendereeMoneyUnit":''
                  }
    # Time entity label -> output key it feeds.
    time_key_by_label = {1: "time_release", 2: "time_bidopen", 3: "time_bidclose"}
    # Output key -> list of (formatted time, score) candidates.
    time_candidates = {key: [] for key in time_key_by_label.values()}

    for entity in list_entity:
        kind = entity.entity_type
        if kind == 'bidway':
            attributes["bidway"] = turnBidWay(entity.entity_text)
        elif kind == 'moneysource':
            attributes["moneysource"] = entity.entity_text
        elif kind == 'serviceTime':
            attributes["serviceTime"] = entity.entity_text
        elif kind == 'time' and entity.label in time_key_by_label:
            score = entity.values[entity.label]
            # Only candidates scoring above 0.6 are kept.
            if score > 0.6:
                time_candidates[time_key_by_label[entity.label]].append((timeFormat(entity.entity_text), score))
        elif kind == "person" and entity.label == 4:
            attributes["person_review"].append(entity.entity_text)
        elif kind == 'product':
            attributes["product"].append(entity.entity_text)
        elif kind == 'money' and entity.notes == '总投资':
            # Keep the largest amount among entities noted as '总投資'-type total investment.
            amount = float(entity.entity_text)
            if attributes["total_tendereeMoney"] < amount:
                attributes["total_tendereeMoney"] = amount
                attributes["total_tendereeMoneyUnit"] = entity.money_unit

    # For each time category pick the highest-scoring candidate (first one wins on ties).
    for time_key, candidates in time_candidates.items():
        if candidates:
            attributes[time_key] = max(candidates, key=lambda pair: pair[1])[0]

    # De-duplicate products; the resulting order is whatever set iteration yields.
    attributes["product"] = list(set(attributes["product"]))
    return attributes

def getMoneyRange(RoleList):
    '''
    Placeholder that is not implemented: ignores RoleList and returns None.
    '''
    return None

def getPREMs(list_sentences,list_entitys,list_articles):
    '''
    Assemble the final extraction record for every article.
    @param:
        list_sentences: sentence lists, one per article
        list_entitys: entity lists, one per article
        list_articles: article objects, one per article; each must expose id, fingerprint,
                       match_enterprise, match_enterprise_type, attachmentTypes and bidway
    @return: list of dict, one per article, holding "prem" (package - role - entity name -
             money - contact - phone), "docid", the keys from getOtherAttributes and the
             article-level metadata
    '''
    result = []
    for list_sentence,list_entity,list_article in zip(list_sentences,list_entitys,list_articles):
        record = {"prem": getPackageRoleMoney(list_sentence,list_entity), "docid": list_article.id}
        record.update(getOtherAttributes(list_entity))
        # Merged last so the article-level values win. The previous
        # dict(..., **other, **meta) form raised TypeError (duplicate keyword 'bidway')
        # whenever getOtherAttributes also produced a "bidway" key.
        record.update({"fingerprint": list_article.fingerprint,
                       "match_enterprise": list_article.match_enterprise,
                       "match_enterprise_type": list_article.match_enterprise_type,
                       "process_time": getCurrent_date(),
                       "attachmentTypes": list_article.attachmentTypes,
                       "bidway": list_article.bidway})
        result.append(record)
    return result


# NOTE: the whole body below consists of adjacent triple-quoted string literals, so running
# this module as a script executes nothing. The text is a disabled evaluation script that
# queried a database for documents, ran getPackageRoleMoney on each and wrote an HTML table
# to getAttribute.html.
if __name__=="__main__":
    '''
    conn = getConnection()
    cursor = conn.cursor()
    #sql = " select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200"
    sql = " select B.doc_id,B.prem from articles_processed A, articles_validation B where A.id=B.doc_id "
    
    result = []
    
    cursor.execute(sql)
    rows = cursor.fetchall()
    count = 0
    for row in rows:
        
        count += 1
        # print(count)
        doc_id = row[0]
        
        roleList = getPackageRoleMoney(doc_id)
        result.append([doc_id,str(roleList),row[1]])
        ''''''
    with codecs.open("getAttribute.html","w",encoding="utf8") as f:
        f.write('<html><head>\
        <meta http-equiv="Content-Type"\
        content="text/html; charset=UTF-8">\
        </head>\
        <body bgcolor="#FFFFFF">\
        <table border="1">\
        <tr>\
        <td>doc_id</td>\
        <td>角色</td>\
        </tr>')
        for item in result:
            f.write("<tr>"+"<td>"+item[0]+"</td>"+"<td>"+item[1]+"</td>"+"<td>"+item[2]+"</td>"+"</tr>")
        f.write("</table></body>")
    '''