# luojiehua / BIDI_ML_INFO_EXTRACTION
'''
Created on 2018年12月26日

@author: User
'''

import os
import sys
from BiddingKG.dl.common.nerUtils import *
sys.path.append(os.path.abspath("../.."))
# from keras.engine import topology
# from keras import models
# from keras import layers
# from keras_contrib.layers.crf import CRF
# from keras.preprocessing.sequence import pad_sequences
# from keras import optimizers,losses,metrics
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.interface.modelFactory import *
import tensorflow as tf
from BiddingKG.dl.product.data_util import decode, process_data
from BiddingKG.dl.interface.Entitys import Entity
from BiddingKG.dl.complaint.punish_predictor import Punish_Extract
from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
from bs4 import BeautifulSoup
import copy
import calendar
import datetime

from threading import RLock
# Registry of predictor slots, keyed by predictor type. Every slot starts
# without a predictor instance and owns its own re-entrant lock.
dict_predictor = {
    _predictor_type: {"predictor": None, "Lock": RLock()}
    for _predictor_type in (
        "codeName",
        "prem",
        "epc",
        "roleRule",
        "roleRuleFinal",
        "form",
        "time",
        "punish",
        "product",
        "product_attrs",
        "channel",
        "deposit_payment_way",
        "total_unit_money",
    )
}


def getPredictor(_type):
    '''
    @summary: Return the shared predictor registered under ``_type``,
              constructing it on first request while holding that type's lock.
    @param _type: key of ``dict_predictor`` (e.g. "codeName", "prem", "time")
    @return: the predictor instance stored in the registry slot
    @raise NameError: if ``_type`` is not a key of ``dict_predictor``
    '''
    if _type not in dict_predictor:
        raise NameError("no this type of predictor")

    slot = dict_predictor[_type]
    with slot["Lock"]:
        if slot["predictor"] is None:
            # Factories are wrapped in lambdas so that only the class that is
            # actually requested gets looked up and instantiated.
            factories = {
                "codeName": lambda: CodeNamePredict(),
                "prem": lambda: PREMPredict(),
                "epc": lambda: EPCPredict(),
                "roleRule": lambda: RoleRulePredictor(),
                "roleRuleFinal": lambda: RoleRuleFinalAdd(),
                "form": lambda: FormPredictor(),
                "time": lambda: TimePredictor(),
                "punish": lambda: Punish_Extract(),
                "product": lambda: ProductPredictor(),
                "product_attrs": lambda: ProductAttributesPredictor(),
                "channel": lambda: DocChannel(),
                'deposit_payment_way': lambda: DepositPaymentWay(),
                'total_unit_money': lambda: TotalUnitMoney(),
            }
            build = factories.get(_type)
            if build is not None:
                slot["predictor"] = build()
        return slot["predictor"]


# Project code / project name model
class CodeNamePredict():
    
    def __init__(self,EMBED_DIM=None,BiRNN_UNITS=None,lazyLoad=getLazyLoad()):
        '''
        @summary: Load the vocabulary and class labels, build the label-id
                  patterns for project codes and names, and create one
                  TensorFlow session (each on its own graph) per model.
        @param EMBED_DIM: embedding size, 60 when None; used here to build self.filepath
        @param BiRNN_UNITS: BiRNN size, 200 when None; used here to build self.filepath
        @param lazyLoad: when falsy, both saved models are loaded immediately
        '''
        self.model = None
        self.MAX_LEN = None
        self.model_code = None
        self.EMBED_DIM = 60 if EMBED_DIM is None else EMBED_DIM
        self.BiRNN_UNITS = 200 if BiRNN_UNITS is None else BiRNN_UNITS

        module_dir = os.path.dirname(__file__)
        self.filepath = module_dir+"/../projectCode/models/model_project_"+str(self.EMBED_DIM)+"_"+str(self.BiRNN_UNITS)+".hdf5"
        self.filepath_code = module_dir+"/../projectCode/models/model_code.hdf5"
        self.vocab = load(module_dir+"/codename_vocab.pk")
        self.class_labels = load(module_dir+"/codename_classlabels.pk")

        # Build the regexes that pick code (PC) and name (PN) spans out of a
        # string of concatenated label ids: one B id, any number of M ids, one E id.
        label_ids = {
            tag: self.class_labels.index(tag)
            for tag in ("PC_B", "PC_M", "PC_E", "PN_B", "PN_M", "PN_E")
        }

        def span_pattern(prefix):
            return re.compile(
                str(label_ids[prefix+"_B"])+str(label_ids[prefix+"_M"])+"*"+str(label_ids[prefix+"_E"])
            )

        self.PC_pattern = span_pattern("PC")
        self.PN_pattern = span_pattern("PN")
        self.word2index = {word: position for position, word in enumerate(np.array(self.vocab))}

        self.inputs = None
        self.outputs = None
        self.sess_codename = tf.Session(graph=tf.Graph())
        self.sess_codesplit = tf.Session(graph=tf.Graph())
        self.inputs_code = None
        self.outputs_code = None
        if lazyLoad:
            return
        self.getModel()
        self.getModel_code()
        
        
    def getModel(self):
        '''
        @summary: 取得编号和名称模型
        '''
        if self.inputs is None:
            log("get model of codename")
            with self.sess_codename.as_default():
                with self.sess_codename.graph.as_default():
                    meta_graph_def = tf.saved_model.loader.load(self.sess_codename, ["serve"], export_dir=os.path.dirname(__file__)+"/codename_savedmodel_tf")
                    signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
                    signature_def = meta_graph_def.signature_def
                    self.inputs = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs"].name)
                    self.inputs_length = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs_length"].name)
                    self.keepprob = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["keepprob"].name)
                    self.logits = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["logits"].name)
                    self.trans = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["trans"].name)

                return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans
        else:
            return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans
        '''    
        if self.model is None:
            self.model = self.getBiLSTMCRFModel(self.MAX_LEN, self.vocab, self.EMBED_DIM, self.BiRNN_UNITS, self.class_labels,weights=None)
            self.model.load_weights(self.filepath)
        return self.model
        '''
    
    def getModel_code(self):
        if self.inputs_code is None:
            log("get model of code")
            with self.sess_codesplit.as_default():
                with self.sess_codesplit.graph.as_default():
                    meta_graph_def = tf.saved_model.loader.load(self.sess_codesplit, ["serve"], export_dir=os.path.dirname(__file__)+"/codesplit_savedmodel")
                    signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
                    signature_def = meta_graph_def.signature_def
                    self.inputs_code = []
                    self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name))
                    self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name))
                    self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name))
                    self.outputs_code = self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
                    self.sess_codesplit.graph.finalize()
                    return self.inputs_code,self.outputs_code
        else:
            return self.inputs_code,self.outputs_code
        '''
        if self.model_code is None:
            log("get model of model_code")
            with self.sess_codesplit.as_default():
                with self.sess_codesplit.graph.as_default():
                    self.model_code = models.load_model(self.filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
        return self.model_code
        '''
    
    def getBiLSTMCRFModel(self,MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
        '''
        @summary: Build, print a summary of, and compile an
                  Embedding -> Bidirectional LSTM -> TimeDistributed(Dense) -> CRF model.
        @param MAX_LEN: not used by this method
        @param vocab: vocabulary; only its length is used (embedding input size)
        @param EMBED_DIM: embedding output size
        @param BiRNN_UNITS: each LSTM direction gets BiRNN_UNITS//2 units
        @param chunk_tags: tag list; only its length is used (Dense/CRF output size)
        @param weights: initial embedding matrix, or None for the default initialisation
        @return: the compiled model
        '''
        # NOTE(review): layers, models and CRF are presumably keras / keras_contrib
        # names - confirm they are in scope, the direct imports are commented out.
        tag_count = len(chunk_tags)
        token_ids = layers.Input(shape=(None,))

        embedding_kwargs = {"mask_zero": True}
        if weights is not None:
            embedding_kwargs["weights"] = [weights]
            embedding_kwargs["trainable"] = True
        embedded = layers.embeddings.Embedding(len(vocab), EMBED_DIM, **embedding_kwargs)(token_ids)

        recurrent = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2, return_sequences=True))(embedded)
        tag_scores = layers.TimeDistributed(layers.Dense(tag_count))(recurrent)
        crf = CRF(tag_count, sparse_target=True)
        crf_out = crf(tag_scores)

        model = models.Model(input=[token_ids], output=[crf_out])
        model.summary()
        model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])
        return model
    
    # Complete, by rule, the brackets on either side of a code or name
    def fitDataByRule(self,data):
        '''
        @summary: Repair a code/name string whose opening and closing brackets
                  are off by exactly one. Recognised brackets are ( ) （ ） [ ] 【 】.
        @param data: the extracted text
        @return: ``data`` unchanged when the bracket counts are equal or differ
                 by more than one; otherwise ``data`` with the bracket at its
                 edge removed, or with the missing partner bracket added.
        '''
        # Maps every recognised bracket to its partner (both directions).
        symbol_dict = {"(":")",
                       "（":"）",
                       "[":"]",
                       "【":"】",
                       ")":"(",
                       "）":"（",
                       "]":"[",
                       "】":"【"}
        # Raw strings: "\(" and friends are invalid escapes in a normal literal.
        leftfinds = re.findall(r"[\(（\[【]", data)
        rightfinds = re.findall(r"[\)）\]】]", data)
        surplus = len(leftfinds)-len(rightfinds)

        if surplus == 1:
            # One opening bracket too many: drop a bracket sitting at the very
            # start, otherwise append the partner of the first opening bracket.
            if data[0] in symbol_dict:
                return data[1:]
            return data+symbol_dict[leftfinds[0]]
        if surplus == -1:
            # One closing bracket too many: drop a bracket sitting at the very
            # end, otherwise prepend the partner of the first closing bracket.
            if data[-1] in symbol_dict:
                return data[:-1]
            return symbol_dict[rightfinds[0]]+data
        return data

    def decode(self,logits, trans, sequence_lengths, tag_num):
        '''
        @summary: Run viterbi_decode on each sequence's scores, truncated to
                  that sequence's length, and collect the decoded sequences.
        @param logits: per-sequence score arrays
        @param trans: transition parameters passed to viterbi_decode
        @param sequence_lengths: length of each sequence, paired with logits
        @param tag_num: not used by this method
        @return: list with one decoded sequence per (logit, length) pair
        '''
        decoded = (
            viterbi_decode(logit[:length], trans)
            for logit, length in zip(logits, sequence_lengths)
        )
        # viterbi_decode yields (sequence, score); only the sequence is kept.
        return [best_sequence for best_sequence, _best_score in decoded]
    
    def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000):
        #@summary: 获取每篇文章的code和name
        
        pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")

        result = []
        index_unk = self.word2index.get("<unk>")
        # index_pad = self.word2index.get("<pad>")
        if list_entitys is None:
            list_entitys = [[] for _ in range(len(list_sentences))]
        for list_sentence,list_entity in zip(list_sentences,list_entitys):
            if len(list_sentence)==0:
                result.append([{"code":[],"name":""}])
                continue
            doc_id = list_sentence[0].doc_id
            # sentences = []
            # for sentence in list_sentence:
            #     if len(sentence.sentence_text)>MAX_AREA:
            #         for _sentence_comma in re.split("[;；，\n]",sentence):
            #             _comma_index = 0
            #             while(_comma_index<len(_sentence_comma)):
            #                 sentences.append(_sentence_comma[_comma_index:_comma_index+MAX_AREA])
            #                 _comma_index += MAX_AREA
            #     else:
            #         sentences.append(sentence+"。")
            list_sentence.sort(key=lambda x:len(x.sentence_text),reverse=True)
            _begin_index = 0
            
            item = {"code":[],"name":""}
            code_set = set()
            dict_name_freq_score = dict()
            while(True):
                MAX_LEN = len(list_sentence[_begin_index].sentence_text)
                if MAX_LEN>MAX_AREA:
                    MAX_LEN = MAX_AREA
                _LEN = MAX_AREA//MAX_LEN
                #预测

                x = [[self.word2index.get(word,index_unk)for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
                # x = [[getIndexOfWord(word) for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
                x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x]
                x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")

                if USE_PAI_EAS:
                    request = tf_predict_pb2.PredictRequest()
                    request.inputs["inputs"].dtype = tf_predict_pb2.DT_INT32
                    request.inputs["inputs"].array_shape.dim.extend(np.shape(x))
                    request.inputs["inputs"].int_val.extend(np.array(x,dtype=np.int32).reshape(-1))
                    request_data = request.SerializeToString()
                    list_outputs = ["outputs"]
                    _result = vpc_requests(codename_url, codename_authorization, request_data, list_outputs)
                    if _result is not None:
                        predict_y = _result["outputs"]
                    else:
                        with self.sess_codename.as_default():
                            t_input,t_output = self.getModel()
                            predict_y = self.sess_codename.run(t_output,feed_dict={t_input:x})
                else:
                    with self.sess_codename.as_default():
                        t_input,t_input_length,t_keepprob,t_logits,t_trans = self.getModel()
                        _logits,_trans = self.sess_codename.run([t_logits,t_trans],feed_dict={t_input:x,
                                                                                              t_input_length:x_len,
                                                                                              t_keepprob:1.0})
                        predict_y = self.decode(_logits,_trans,x_len,7)
                        # print('==========',_logits)

                        '''
                        for item11 in np.argmax(predict_y,-1):
                            print(item11)
                        print(predict_y)
                        '''
                # print(predict_y)
                for sentence,predict in zip(list_sentence[_begin_index:_begin_index+_LEN],np.array(predict_y)):
                    pad_sentence = sentence.sentence_text[:MAX_LEN]
                    join_predict = "".join([str(s) for s in predict])
                    # print(pad_sentence)
                    # print(join_predict)
                    code_x = []
                    code_text = []
                    temp_entitys = []
                    for iter in re.finditer(self.PC_pattern,join_predict):
                        get_len = 40
                        if iter.span()[0]<get_len:
                            begin = 0
                        else:
                            begin = iter.span()[0]-get_len
                        end = iter.span()[1]+get_len
                        code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]],pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
                        code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]])
                        _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=pad_sentence[iter.span()[0]:iter.span()[1]],entity_type="code",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1])
                        temp_entitys.append(_entity)
                    #print("code",code_text)
                    if len(code_x)>0:
                        code_x = np.transpose(np.array(code_x,dtype=np.float32),(1,0,2,3))
                        if USE_PAI_EAS:
                            request = tf_predict_pb2.PredictRequest()
                            request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
                            request.inputs["input0"].array_shape.dim.extend(np.shape(code_x[0]))
                            request.inputs["input0"].float_val.extend(np.array(code_x[0],dtype=np.float64).reshape(-1))
                            request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
                            request.inputs["input1"].array_shape.dim.extend(np.shape(code_x[1]))
                            request.inputs["input1"].float_val.extend(np.array(code_x[1],dtype=np.float64).reshape(-1))
                            request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
                            request.inputs["input2"].array_shape.dim.extend(np.shape(code_x[2]))
                            request.inputs["input2"].float_val.extend(np.array(code_x[2],dtype=np.float64).reshape(-1))
                            request_data = request.SerializeToString()
                            list_outputs = ["outputs"]
                            _result = vpc_requests(codeclasses_url, codeclasses_authorization, request_data, list_outputs)
                            if _result is not None:
                                predict_code = _result["outputs"]
                            else:
                                with self.sess_codesplit.as_default():
                                    with self.sess_codesplit.graph.as_default():
                                        predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
                        else:
                            with self.sess_codesplit.as_default():
                                with self.sess_codesplit.graph.as_default():
                                    inputs_code,outputs_code = self.getModel_code()
                                    predict_code = limitRun(self.sess_codesplit,[outputs_code],feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]},MAX_BATCH=2)[0]

                                    #predict_code = self.sess_codesplit.run(outputs_code,feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]})
                                    #predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
                        for h in range(len(predict_code)):
                            if predict_code[h][0]>0.5:
                                the_code = self.fitDataByRule(code_text[h])

                                #add code to entitys
                                list_entity.append(temp_entitys[h])

                                if the_code not in code_set:
                                    code_set.add(the_code)
                                    item['code'] = list(code_set)
                    for iter in re.finditer(self.PN_pattern,join_predict):
                        _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])

                        #add name to entitys
                        _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1])
                        list_entity.append(_entity)
                        w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[:：\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
                        if _name not in dict_name_freq_score:
                            # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05)*w]
                        else:
                            dict_name_freq_score[_name][0] += 1
                    '''
                    for iter in re.finditer(self.PN_pattern,join_predict):
                        print("name-",self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]))
                    if item[1]['name']=="":
                        for iter in re.finditer(self.PN_pattern,join_predict):
                            #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                            item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                            break
                    '''
                if _begin_index+_LEN>=len(list_sentence):
                    break
                _begin_index += _LEN
            
            list_name_freq_score = []

            # 2020/11/23 大网站规则调整
            if len(dict_name_freq_score) == 0:
                name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[:：\s]+([^，。：；]{2,60})[，。]'
                for sentence in list_sentence:
                    # pad_sentence = sentence.sentence_text
                    othername = re.search(name_re1, sentence.sentence_text)
                    if othername != None:
                        project_name = othername.group(3)
                        beg = find_index([project_name], sentence.sentence_text)[0]
                        end = beg + len(project_name)
                        _name = self.fitDataByRule(sentence.sentence_text[beg:end])
                        # add name to entitys
                        _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
                        sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
                                         entity_type="name", sentence_index=sentence.sentence_index, begin_index=0,
                                         end_index=0, wordOffset_begin=beg, wordOffset_end=end)
                        list_entity.append(_entity)
                        w = 1
                        if _name not in dict_name_freq_score:
                            # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w]
                        else:
                            dict_name_freq_score[_name][0] += 1
                # othername = re.search(name_re1, sentence.sentence_text)
                # if othername != None:
                #     _name = othername.group(3)
                #     if _name not in dict_name_freq_score:
                #         dict_name_freq_score[_name] = [1, len(re.findall(pattern_score, _name)) + len(_name) * 0.1]
                #     else:
                #         dict_name_freq_score[_name][0] += 1

            for _name in dict_name_freq_score.keys():
                list_name_freq_score.append([_name,dict_name_freq_score[_name]])
            # print(list_name_freq_score)
            if len(list_name_freq_score)>0:
                list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1],reverse=True)
                item['name'] = list_name_freq_score[0][0]
                # if list_name_freq_score[0][1][0]>1:
                #     item[1]['name'] = list_name_freq_score[0][0]
                # else:
                #     list_name_freq_score.sort(key=lambda x:x[1][1],reverse=True)
                #     item[1]["name"] = list_name_freq_score[0][0]
                
            #下面代码加上去用正则添加某些识别不到的项目编号
            if item['code'] == []:
                for sentence in list_sentence:
                    # othercode = re.search('(采购计划编号|询价编号)[\)）]?[:：]?([\[\]a-zA-Z0-9\-]{5,30})', sentence.sentence_text)
                    # if othercode != None:
                    #     item[1]['code'].append(othercode.group(2))
                    # 2020/11/23 大网站规则调整
                    othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价单|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告)(单号|编号|标号|编码|代码|备案号|号)[:：\s]+([^，。；：、]{8,30}[a-zA-Z0-9\号])[\)，。]', sentence.sentence_text)
                    if othercode != None:
                        item['code'].append(othercode.group(3))
            item['code'].sort(key=lambda x:len(x),reverse=True)
            result.append(item)

            list_sentence.sort(key=lambda x: x.sentence_index,reverse=False)
        return result
                        
                
    '''
    #当数据量过大时会报错
    def predict(self,articles,MAX_LEN = None):
        sentences = []
        for article in articles:
            for sentence in article.content.split("。"):
                sentences.append([sentence,article.id])
        if MAX_LEN is None:
            sent_len = [len(sentence[0]) for sentence in sentences]
            MAX_LEN = max(sent_len)
            #print(MAX_LEN)
           
        #若为空，则直接返回空
        result = [] 
        if MAX_LEN==0:
            for article in articles:
                result.append([article.id,{"code":[],"name":""}])
            return result
        
        index_unk = self.word2index.get("<unk>")
        index_pad = self.word2index.get("<pad>")
        
        x = [[self.word2index.get(word,index_unk)for word in sentence[0]]for sentence in sentences]
        x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
        
        predict_y = self.getModel().predict(x)
        
        
        last_doc_id = ""
        item = []
        for sentence,predict in zip(sentences,np.argmax(predict_y,-1)):
            pad_sentence = sentence[0][:MAX_LEN]
            doc_id = sentence[1]
            join_predict = "".join([str(s) for s in predict])
            if doc_id!=last_doc_id:
                if last_doc_id!="":
                    result.append(item)
                item = [doc_id,{"code":[],"name":""}]
                code_set = set()
            code_x = []
            code_text = []
            for iter in re.finditer(self.PC_pattern,join_predict):
                get_len = 40
                if iter.span()[0]<get_len:
                    begin = 0
                else:
                    begin = iter.span()[0]-get_len
                end = iter.span()[1]+get_len
                code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]],pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
                code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]])
            if len(code_x)>0:
                code_x = np.transpose(np.array(code_x),(1,0,2,3))
                predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
                for h in range(len(predict_code)):
                    if predict_code[h][0]>0.5:
                        the_code = self.fitDataByRule(code_text[h])
                        if the_code not in code_set:
                            code_set.add(the_code)
                            item[1]['code'] = list(code_set)
            if item[1]['name']=="":
                for iter in re.finditer(self.PN_pattern,join_predict):
                    #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                    item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                    break
                
            last_doc_id = doc_id
        result.append(item)
        return result
    '''
        
#角色金额模型        
class PREMPredict():
    """Role and money classification for entities extracted from a document.

    ``predict_role`` labels ``org``/``company`` entities and ``predict_money``
    labels ``money`` entities.  Both write their result back onto the entity
    objects (``set_Role`` / ``set_Money``) and return nothing.
    """

    def __init__(self):
        self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5"
        self.model_role = Model_role_classify_word()
        self.model_money = Model_money_classify()

    @staticmethod
    def _index_sentences(list_sentence):
        """Map ``(doc_id, sentence_index)`` to the first sentence with that key."""
        sentence_by_key = {}
        for sentence in list_sentence:
            sentence_by_key.setdefault((sentence.doc_id, sentence.sentence_index), sentence)
        return sentence_by_key

    @staticmethod
    def _build_eas_request(arrays):
        """Serialize ``arrays`` as the float inputs ``input0``, ``input1``, ... of a PredictRequest."""
        request = tf_predict_pb2.PredictRequest()
        for position, array in enumerate(arrays):
            tensor = request.inputs["input%d" % position]
            tensor.dtype = tf_predict_pb2.DT_FLOAT
            tensor.array_shape.dim.extend(np.shape(array))
            tensor.float_val.extend(np.array(array,dtype=np.float64).reshape(-1))
        return request.SerializeToString()

    def search_role_data(self,list_sentences,list_entitys):
        '''
        Collect the role-model input for every ``org``/``company`` entity.

        Each entity list and sentence list is sorted in place by
        ``sentence_index``.  An entity whose sentence cannot be found is
        skipped on its own; it no longer prevents later entities from being
        collected.

        @param:
            list_sentences: per-document lists of sentences
            list_entitys: per-document lists of entities (same order)
        @return: ``[data_x, points_entitys, text_list]`` (encoded inputs, the
            entities they belong to, and the sentence text from 10 characters
            before the entity to 10 characters after it), or ``None`` when no
            entity qualified
        '''
        text_list = []
        data_x = []
        points_entitys = []
        for list_entity,list_sentence in zip(list_entitys,list_sentences):
            # Callers observe the in-place ordering, so keep sorting both lists.
            list_entity.sort(key=lambda x:x.sentence_index)
            list_sentence.sort(key=lambda x:x.sentence_index)
            sentence_by_key = self._index_sentences(list_sentence)
            for entity in list_entity:
                if entity.entity_type not in ('org','company'):
                    continue
                sentence = sentence_by_key.get((entity.doc_id, entity.sentence_index))
                if sentence is None:
                    continue
                text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-10):entity.wordOffset_end+10])
                data_x.append(self.model_role.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,entity_text=entity.entity_text))
                points_entitys.append(entity)

        if not points_entitys:
            return None

        return [data_x,points_entitys, text_list]

    def search_money_data(self,list_sentences,list_entitys):
        '''
        Collect the money-model input for every ``money`` entity.

        Each entity list and sentence list is sorted in place by
        ``sentence_index``.  Entities without a matching sentence are skipped.

        @param:
            list_sentences: per-document lists of sentences
            list_entitys: per-document lists of entities (same order)
        @return: ``[data_x, points_entitys, text_list]`` (encoded inputs, the
            entities they belong to, and the sentence text from 8 characters
            before the entity up to the entity end), or ``None`` when no
            entity qualified
        '''
        text_list = []
        data_x = []
        points_entitys = []
        for list_entity,list_sentence in zip(list_entitys,list_sentences):
            list_entity.sort(key=lambda x:x.sentence_index)
            list_sentence.sort(key=lambda x:x.sentence_index)
            sentence_by_key = self._index_sentences(list_sentence)
            for entity in list_entity:
                if entity.entity_type!="money":
                    continue
                sentence = sentence_by_key.get((entity.doc_id, entity.sentence_index))
                if sentence is None:
                    continue
                text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin - 8):entity.wordOffset_end])
                data_x.append(self.model_money.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index))
                points_entitys.append(entity)

        if not points_entitys:
            return None

        return [data_x,points_entitys, text_list]

    def predict_role(self,list_sentences, list_entitys):
        """Predict a role for every org/company entity and store it via ``set_Role``.

        When ``USE_PAI_EAS`` is set the remote service is tried first and the
        local model is used if the request returns ``None``.  A few regex rules
        on the surrounding text then override the predicted label.
        NOTE(review): the rules suggest 0 = tenderee, 1 = agency,
        2 = winning bidder -- confirm against the model definition.
        """
        datas = self.search_role_data(list_sentences, list_entitys)
        if datas is None:
            return
        data_x, points_entitys, text_list = datas

        if USE_PAI_EAS:
            # The service takes the three model inputs separately, so move the
            # input axis in front of the sample axis.
            _data = np.transpose(np.array(data_x),(1,0,2))
            request_data = self._build_eas_request([_data[0], _data[1], _data[2]])
            _result = vpc_requests(role_url, role_authorization, request_data, ["outputs"])
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                predict_y = self.model_role.predict(data_x)
        else:
            predict_y = self.model_role.predict(np.array(data_x,dtype=np.float64))

        for entity, values, text in zip(points_entitys, predict_y, text_list):
            label = np.argmax(values)
            if label == 2:
                if re.search('中标单位和.{,25}签订合同', text):
                    label = 0
                    values[label] = 0.501
                elif re.search('尊敬的供应商：.{,25}我公司', text):
                    label = 0
                    values[label] = 0.801
            elif label == 1 and re.search('委托(单位|人|方)[是为：]+', text[:10]) and re.search('受委托(单位|人|方)[是为：]+', text[:10]) is None:
                label = 0
                values[label] = 0.501
            elif label == 1 and re.search('([，。：]|^)(服务|中选)机构(名称)?', text[:-10]):
                label = 2
                values[label] = 0.501
            entity.set_Role(label, values)

    def predict_money(self,list_sentences,list_entitys):
        """Predict a label for every money entity and store it via ``set_Money``.

        When ``USE_PAI_EAS`` is set the remote service is tried first.  If it
        returns ``None`` the local model is called with the same untransposed
        input as the non-EAS path (previously it received the transposed
        request tensor).
        """
        datas = self.search_money_data(list_sentences, list_entitys)
        if datas is None:
            return
        data_x, points_entitys, text_list = datas

        if USE_PAI_EAS:
            _data = np.transpose(np.array(data_x),(1,0,2,3))
            request_data = self._build_eas_request([_data[0], _data[1], _data[2]])
            _result = vpc_requests(money_url, money_authorization, request_data, ["outputs"])
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                predict_y = self.model_money.predict(data_x)
        else:
            predict_y = self.model_money.predict(data_x)

        for entity, values, text in zip(points_entitys, predict_y, text_list):
            label = np.argmax(values)
            # Both rules lower the winning score to 0.49 without changing the
            # label -- presumably just under a downstream threshold; confirm.
            if label == 1 and re.search('[:：，。](总金额|总价|单价)', text):
                values[label] = 0.49
            elif label == 0 and entity.notes in ["投资", "工程造价"]:
                values[label] = 0.49
            entity.set_Money(label, values)

    def predict(self,list_sentences,list_entitys):
        """Run role prediction followed by money prediction on the same inputs."""
        self.predict_role(list_sentences,list_entitys)
        self.predict_money(list_sentences,list_entitys)
        
        
#联系人模型    
class EPCPredict():
    """Contact-person classification and person/phone pairing."""

    def __init__(self):
        self.model_person = Model_person_classify()

    def search_person_data(self,list_sentences,list_entitys):
        '''
        Collect the person-model input for every ``person`` entity.

        A person whose ``sentence_index`` has no sentence in the matching
        sentence list is skipped instead of raising ``KeyError``.

        @param:
            list_sentences: per-document lists of sentences
            list_entitys: per-document lists of entities (same order)
        @return: ``[data_x, points_entitys]`` (encoded inputs and the entities
            they belong to), or ``None`` when there is no person entity
        '''
        data_x = []
        points_entitys = []
        for list_entity,list_sentence in zip(list_entitys,list_sentences):
            dict_index_sentence = {_sentence.sentence_index: _sentence for _sentence in list_sentence}
            for entity in list_entity:
                if entity.entity_type!="person":
                    continue
                sentence = dict_index_sentence.get(entity.sentence_index)
                if sentence is None:
                    continue
                data_x.append(self.model_person.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index))
                points_entitys.append(entity)

        if not points_entitys:
            return None

        return [data_x,points_entitys]

    def predict_person(self,list_sentences, list_entitys):
        """Predict a label for every person entity and store it via ``set_Person``.

        When ``USE_PAI_EAS`` is set the remote service is tried first and the
        local model is used if the request returns ``None``.  The phone list
        passed to ``set_Person`` is always empty here.
        """
        datas = self.search_person_data(list_sentences, list_entitys)
        if datas is None:
            return
        data_x, points_entitys = datas

        if USE_PAI_EAS:
            # The service takes the two model inputs separately, so move the
            # input axis in front of the sample axis.
            _data = np.transpose(np.array(data_x),(1,0,2,3))
            request = tf_predict_pb2.PredictRequest()
            for input_name, array in (("input0", _data[0]), ("input1", _data[1])):
                tensor = request.inputs[input_name]
                tensor.dtype = tf_predict_pb2.DT_FLOAT
                tensor.array_shape.dim.extend(np.shape(array))
                tensor.float_val.extend(np.array(array,dtype=np.float64).reshape(-1))
            request_data = request.SerializeToString()
            _result = vpc_requests(person_url, person_authorization, request_data, ["outputs"])
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                predict_y = self.model_person.predict(data_x)
        else:
            predict_y = self.model_person.predict(data_x)

        assert len(predict_y)==len(points_entitys)
        for entity, scores in zip(points_entitys, predict_y):
            entity.set_Person(np.argmax(scores), list(scores), [])

    def person_search_phone(self,list_sentences, list_entitys):
        """Attach the most plausible phone number to each labelled person.

        For every document, phone numbers are extracted from the sentence text
        with regular expressions, candidate (person, phone) pairs are scored
        by character distance, and a one-to-one assignment is chosen with
        ``scipy.optimize.linear_sum_assignment``.  Every person entity gets
        ``person_phone`` reset to ``""`` and matched persons then receive the
        phone text.  Only persons whose ``label`` is 1, 2 or 3 are matched.
        """
        # Kept function-local as in the original code, but no longer
        # re-executed for every document.
        from scipy.optimize import linear_sum_assignment
        from BiddingKG.dl.interface.Entitys import Match

        def dispatch(match_list):
            """Pick the best one-to-one person/phone assignment."""
            if not match_list:
                return []
            main_roles = list(set([match.main_role for match in match_list]))
            attributes = list(set([match.attribute for match in match_list]))

            label = np.zeros(shape=(len(main_roles), len(attributes)))
            for match in match_list:
                # +10000 keeps every real candidate strictly positive, so a
                # zero cell always means "no candidate".
                label[main_roles.index(match.main_role), attributes.index(match.attribute)] = match.value + 10000
            # linear_sum_assignment minimises, so negate to maximise the score.
            gragh = -label
            row, col = linear_sum_assignment(gragh)
            return [Match(main_roles[i], attributes[j]) for i, j in zip(row, col) if gragh[i, j]]

        key_word = re.compile(r'((?:电话|联系方式|联系人).{0,4}?)(\d{7,12})')
        phone = re.compile(r'1[3|4|5|6|7|8|9][0-9][-—－―]?\d{4}[-—－―]?\d{4}|'
                           r'\+86.?1[3|4|5|6|7|8|9]\d{9}|'
                           r'0\d{2,3}[-—－―][1-9]\d{6,7}/[1-9]\d{6,10}|'
                           r'0\d{2,3}[-—－―]\d{7,8}转\d{1,4}|'
                           r'0\d{2,3}[-—－―]?[1-9]\d{6,7}|'
                           r'[\（|\(]0\d{2,3}[\）|\)]-?\d{7,8}-?\d{,4}|'
                           r'[1-9]\d{6,7}')

        for list_entity, list_sentence in zip(list_entitys, list_sentences):
            # 1. Extract phone candidates from every sentence.
            phone_entitys = []
            for _sentence in list_sentence:
                sentence_text = _sentence.sentence_text
                res_set = set()
                for found in phone.finditer(sentence_text):
                    res_set.add((found.group(), found.start(), found.end()))
                for found in key_word.finditer(sentence_text):
                    res_set.add((found.group(2), found.start()+len(found.group(1)), found.end()))
                # Sorted so that candidates sharing a start offset keep a
                # reproducible order (set iteration order is not stable).
                for item in sorted(res_set, key=lambda x: (x[1], x[2])):
                    phone_left = sentence_text[max(0,item[1]-10):item[1]]
                    phone_right = sentence_text[item[2]:item[2]+8]
                    # Drop fax / mailbox numbers unless "电话" is also mentioned.
                    if re.search("传，?真|信，?箱|邮，?箱",phone_left):
                        if not re.search("电，?话",phone_left):
                            continue
                    # Drop account numbers, serial numbers, prices and the like.
                    if re.search(r"帐，?号|编，?号|报，?价|证，?号|价，?格|[\(（]万?元[\)）]",phone_left):
                        continue
                    # Digits followed by a decimal part are amounts, not phones.
                    if re.search(r"[.,]\d{2,}",phone_right):
                        continue
                    _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, None, None,item[1], item[2])
                    phone_entitys.append(_entity)

            # 2. Merge phones and persons in reading order.
            person_entitys = []
            for entity in list_entity:
                if entity.entity_type == "person":
                    entity.person_phone = ""
                    person_entitys.append(entity)
            _list_entity = sorted(phone_entitys + person_entitys,key=lambda x:(x.sentence_index,x.wordOffset_begin))

            # 3. Character offset at which each sentence starts in the document.
            # A running total also works when sentence indices do not start at
            # 0 or have gaps (looking up index-1 raised KeyError there).
            words_num_dict = dict()
            words_before = 0
            for sentence in sorted(list_sentence, key=lambda x: x.sentence_index):
                words_num_dict[sentence.sentence_index] = words_before
                words_before += len(sentence.sentence_text)

            # 4. Score candidate pairs.
            match_list = []
            for index, entity in enumerate(_list_entity):
                if not (entity.entity_type=="person" and entity.label in [1,2,3]):
                    continue
                if entity.sentence_index not in words_num_dict:
                    # No sentence to measure distances against.
                    continue
                match_nums = 0
                # Look at up to four entities after the person.
                for after_entity in _list_entity[index + 1:index + 5]:
                    if after_entity.entity_type=="phone":
                        sentence_distance = after_entity.sentence_index - entity.sentence_index
                        distance = (words_num_dict[after_entity.sentence_index] + after_entity.wordOffset_begin) - (
                                words_num_dict[entity.sentence_index] + entity.wordOffset_end)
                        if sentence_distance < 2 and distance < 50:
                            value = (-1 / 2 * (distance ** 2)) / 10000
                            match_list.append(Match(entity, after_entity, value))
                            match_nums += 1
                        else:
                            break
                    if after_entity.entity_type=="person":
                        if after_entity.label not in [1,2,3]:
                            break
                if not match_nums:
                    # Fall back to up to four entities before the person.  The
                    # stop value is -1 so that position 0 is reachable; the old
                    # max(0, ...) bound never inspected the first entity.
                    for previous_index in range(index-1, max(-1,index-5), -1):
                        previous_entity = _list_entity[previous_index]
                        if previous_entity.entity_type == "phone":
                            sentence_distance = entity.sentence_index - previous_entity.sentence_index
                            distance = (words_num_dict[entity.sentence_index] + entity.wordOffset_begin) - (
                                    words_num_dict[previous_entity.sentence_index] + previous_entity.wordOffset_end)
                            if sentence_distance < 1 and distance<30:
                                # Backward scores are not divided by 10000.
                                value = (-1 / 2 * (distance ** 2))
                                match_list.append(Match(entity, previous_entity, value))
                            else:
                                break

            # 5. Resolve conflicts and write the phone onto the person entity.
            for match in dispatch(match_list):
                entity_index = list_entity.index(match.main_role)
                list_entity[entity_index].person_phone = match.attribute.entity_text

    def predict(self,list_sentences,list_entitys):
        """Run contact-person prediction."""
        self.predict_person(list_sentences,list_entitys)
            
#表格预测
class FormPredictor():
    """Facade over the two form (table) models: ``item`` and ``context``.

    ``model_dict`` also registers the path of a "line" model, but nothing in
    this class loads it, so only the "item" and "context" types are usable.
    """

    def __init__(self,lazyLoad=getLazyLoad()):
        """Create both form models and record the model file locations.

        Args:
            lazyLoad: accepted for interface compatibility; it is not read by
                this constructor.
        """
        self.model_file_line = os.path.dirname(__file__)+"/../form/model/model_form.model_line.hdf5"
        self.model_file_item = os.path.dirname(__file__)+"/../form/model/model_form.model_item.hdf5"
        self.model_form_item = Model_form_item()
        self.model_form_context = Model_form_context()
        self.model_dict = {"line":[None,self.model_file_line]}

    def getModel(self,type):
        """Return the model object registered for ``type``.

        Args:
            type: "item" or "context".

        Returns:
            ``self.model_form_item`` or ``self.model_form_context``.

        Raises:
            ValueError: for any other type. The previous fallback called
                ``self.getModel(type)`` again with the same argument and could
                only end in a RecursionError.
        """
        if type=="item":
            return self.model_form_item
        if type=="context":
            return self.model_form_context
        raise ValueError("unsupported form model type: %r" % (type,))

    def encode(self,data,**kwargs):
        """Encode one sample with ``encodeInput`` (word_len=50, word_flag=True).

        ``data`` is wrapped in a one-element list and the first encoded element
        is returned. ``**kwargs`` is accepted but ignored.
        """
        # The old second statement (`return encodeInput_form(data)`) was
        # unreachable and has been dropped.
        return encodeInput([data], word_len=50, word_flag=True,userFool=False)[0]

    def predict(self,form_datas,type):
        """Run the model selected by ``type`` on ``form_datas``.

        Raises:
            ValueError: if ``type`` is neither "item" nor "context".
        """
        return self.getModel(type).predict(form_datas)

    
#角色规则
#依据正则给所有无角色的实体赋予角色，给予等于阈值的最低概率
class RoleRulePredictor():
    
    def __init__(self):
        # (?P<tenderee_left_w1> 正则组名 后面的 w1 为概率权重关键词
        self.pattern_tenderee_left = "(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
                                "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需方|买方|业主|(业主|采购人|招标人)联系方式[，：]公司名称：|权属人|甲方当事人|询价书企业|比选发起人|项目单位[，：]单位名称|结算单位)"\
                                "[）)]?(信息[，：])?(名称)?([(（](全称|盖章)[）)])?(是|为|：|:|\s*)+$)"
        self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>((遴选|采购|招标|竞价|议价|比选|委托|询价|评选|谈判|邀标|邀请|洽谈|约谈)" \
                                     "(人|公司|单位|组织|用户|业主|主体|方|部门))" \
                                     "(名称)?([(（](全称|盖章)[）)])?(是|为|：|:|\s*)+$)"
        self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托))"
        self.pattern_tenderee_right = "(?P<tenderee_right>^([(（](以下简称)?[，\"“]*(招标|采购)(人|单位|机构)[，\"”]*[)）])|^委托|^现委托|^的\w{2,10}正在进行)"  #|(^[^.。，,:：](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
        self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|集采机构|[招议)）]+标机构)(.{,4}名，?称|全称|是|为|：|:|[,，]?\s*)$|(受.{5,20}委托，?$))"
        self.pattern_agency_right = "(?P<agency_right>^([(（](以下简称)?[，\"“]*(代理)(人|单位|机构)[，\"”]*[)）])|^受.{5,20}委托|^受委?托，)"  # |^受托  会与 受托生产等冲突，代理表达一般会在后面有逗号
        # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
        self.pattern_winTenderer_left = "(?P<winTenderer_left>(乙|承做|施工|供货|承包|承建|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[：:是为]+$|" \
                                        "(选定单位|指定的中介服务机构|实施主体|承制单位|供方)[：:是为]+$|((评审结果|名次|排名|中标结果)[:：]*第?[一1]名?)[：:是为]+$|" \
                                        "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[：:是为]+$|(供应|供货|承销|服务|实施)(机构|单位|商|方)(名称)?[：:是为]+$)"
        self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[：:是为]+$)" #取消逗号 并拒绝执行改进计划的供应商，华新水泥将可能终止与其合作关系
        # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[：:是为])"
        # self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低，确定为本项目成交供应商)"
        self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
                                        "^(报价|价格)最低，确定为本项目成交供应商|^：贵公司参与|^：?你方于|^中标。|^成为[\w、()（）]+项目的成交供应商))"
        self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|谈判结果：由.{5,20}供货)|中标通知书.{,15}你方"   # 2020//11/24 大网站规则 中标关键词添加 谈判结果：由.{5,20}供货

        # self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[：:]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|：|:|\s*$)|((评审结果|名次|排名)[:：]第?[一1]名?)|(单一来源(采购)?方式向.?$)"

        self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))[：:是为]+$)|((评审结果|名次|排名)[:：]第?[二2]名?，?投标商名称[:：]+$))"
        self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
        
        self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))[：:是为]+$|((评审结果|名次|排名)[:：]第?[三3]名?，?投标商名称[:：]+$))"
        self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"

        self.pattern_whole = [self.pattern_tenderee_left,
                              self.pattern_tenderee_left_w1,
                              self.pattern_tenderee_center,
                              self.pattern_tenderee_right,
                              self.pattern_tendereeORagency_right,
                              self.pattern_agency_left,
                              self.pattern_agency_right,
                              self.pattern_winTenderer_left,
                              self.pattern_winTenderer_left_w1,
                              self.pattern_winTenderer_whole,
                              self.pattern_winTenderer_right,
                              self.pattern_secondTenderer_left,
                              self.pattern_secondTenderer_right,
                              self.pattern_thirdTenderer_left,
                              self.pattern_thirdTenderer_right
                              ]  # 需按顺序排列， 第二、三中标要在中标正则后面

        self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
        
        self.pattern_money_tenderee = re.compile("投标最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|采购(单位|人)委托价|限价|拦标价|预算金额")
        self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收)[）\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况")
        self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
        self.pattern_money_other = re.compile("代理费|服务费")
        self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[:：]?[\(（]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
        
    def _check_input(self,text, ignore=False):
        if not text:
            return []
        
        if not isinstance(text, list):
            text = [text]
        
        null_index = [i for i, t in enumerate(text) if not t]
        if null_index and not ignore:
            raise Exception("null text in input ")
        
        return text


    def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5):

        for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences,
                                                                      list_codenames):
            list_sentence.sort(key=lambda x: x.sentence_index)  # 2022/1/5 按句子顺序排序
            # list_name = list_codename["name"]
            list_name = []  # 2022/1/5  改为实体列表内所有项目名称
            for entity in list_entity:
                if entity.entity_type == 'name':
                    list_name.append(entity.entity_text)
            list_name = self._check_input(list_name) + [article.title]
            for p_entity in list_entity:

                if p_entity.entity_type in ["org", "company"]:
                    # 只解析角色为无的或者概率低于阈值的
                    if p_entity.label is None:
                        continue
                    # 将上下文包含标题的实体概率置为0.6，因为标题中的实体不一定是招标人
                    if str(p_entity.label) == "0":
                        find_flag = False
                        for _sentence in list_sentence:
                            if _sentence.sentence_index == p_entity.sentence_index:
                                _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                                   end_index=p_entity.end_index, size=20, center_include=True,
                                                   word_flag=True, use_text=True,
                                                   text=re.sub("）", ")", re.sub("（", "(", p_entity.entity_text)))
                                for _name in list_name:
                                    if _name != "" and str(_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0:
                                        find_flag = True
                                        if p_entity.values[0] > on_value:
                                            p_entity.values[0] = 0.6 + (p_entity.values[0] - 0.6) / 10
                                        else:
                                            p_entity.values[0] = on_value  # 2022/03/08 修正类似 223985179 公司在文章开头的项目名称概率又没达到0.5的情况
                        if find_flag:
                            continue

                    # 正则从概率低于阈值或其他类别中召回角色
                    role_prob = float(p_entity.values[int(p_entity.label)])
                    if role_prob < on_value or str(p_entity.label) == "5":
                        # 将标题中的实体置为招标人
                        _list_name = self._check_input(list_name, ignore=True)
                        find_flag = False
                        for _name in _list_name:  # 2022/1/5修正只要项目名称出现过的角色，所有位置都标注为招标人
                            if str(_name).find(re.sub("）", ")", re.sub("（", "(",
                                                                       p_entity.entity_text))) >= 0 and p_entity.sentence_index < 4:
                                for _sentence in list_sentence:
                                    if _sentence.sentence_index == p_entity.sentence_index:
                                        _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                                           end_index=p_entity.end_index, size=20, center_include=True,
                                                           word_flag=True, use_text=True, text=re.sub("）", ")",
                                                                                                      re.sub("（", "(",
                                                                                                             p_entity.entity_text)))
                                        if str(_span[1] + _span[2][:len(str(_name))]).find(
                                                _name) >= 0:
                                            find_flag = True
                                            _label = 0
                                            p_entity.label = _label
                                            p_entity.values[int(_label)] = on_value
                                            break
                                    if p_entity.sentence_index >= 4:
                                        break
                            if find_flag:
                                break
                            # if str(_name).find(p_entity.entity_text)>=0:
                            #     find_flag = True
                            #     _label = 0
                            #     p_entity.label = _label
                            #     p_entity.values[int(_label)] = on_value
                            #     break
                        # 若是实体在标题中，默认为招标人，不进行以下的规则匹配
                        if find_flag:
                            continue

                        for s_index in range(len(list_sentence)):
                            if p_entity.doc_id == list_sentence[s_index].doc_id and p_entity.sentence_index == \
                                    list_sentence[s_index].sentence_index:
                                tokens = list_sentence[s_index].tokens
                                begin_index = p_entity.begin_index
                                end_index = p_entity.end_index
                                size = 15
                                spans = spanWindow(tokens, begin_index, end_index, size, center_include=True,
                                                   word_flag=True, use_text=False)
                                # _flag = False

                                # 使用正则+距离解决冲突
                                # 2021/6/11update center: spans[1] --> spans[0][-30:]+spans[1]
                                list_spans = [spans[0][-30:], spans[0][-10:] + spans[1] + spans[2][:10], spans[2]] # 实体左、中、右 信息
                                for _i_span in range(len(list_spans)):
                                    _flag = False
                                    _prob_weight = 1

                                    # print(list_spans[_i_span],p_entity.entity_text)
                                    for _pattern in self.pattern_whole:
                                        for _iter in re.finditer(_pattern, list_spans[_i_span]):
                                            for _group, _v_group in _iter.groupdict().items():
                                                if _v_group is not None and _v_group != "":
                                                    _role = _group.split("_")[0]
                                                    if _role == "tendereeORagency":   # 2022/3/9 新增不确定招标代理判断逻辑
                                                        print('p_entity_sentenceindex:', p_entity.sentence_index)
                                                        if p_entity.sentence_index>=1:  # 只在第一句进行这种模糊匹配
                                                            continue
                                                        if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', p_entity.entity_text)\
                                                            or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', p_entity.entity_text) == None:
                                                            _role = 'tenderee'
                                                        else:
                                                            _role = "agency"
                                                    _direct = _group.split("_")[1]
                                                    _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
                                                    # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                    #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|交易服务单位',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
                                                                                                        list_spans[
                                                                                                            0]) == None:  # 2021/12/22 修正错误中标召回 例子208668937
                                                        _flag = True
                                                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                                                        _prob_weight = 1.2 if _weight=='w1' else 1
                                                        # print('_v_group:',_group, _v_group, p_entity.entity_text)

                                                    if _i_span == 1 and _direct == "center":
                                                        _flag = True
                                                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                                                        _prob_weight = 1.2 if _weight == 'w1' else 1
                                                        # print('_v_group:', _group, _v_group, p_entity.entity_text)

                                                    if _i_span == 2 and _direct == "right":
                                                        _flag = True
                                                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                                                        _prob_weight = 1.2 if _weight == 'w1' else 1
                                                        # print('_v_group:', _group, _v_group, p_entity.entity_text)

                                    # 得到结果
                                    if _flag:
                                        p_entity.label = _label
                                        p_entity.values[int(_label)] = on_value*_prob_weight + p_entity.values[int(_label)] / 10
                                        # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group,  _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
                                        break

                # 其他金额通过正则召回可能是招标或中投标的金额
                if p_entity.entity_type in ["money"]:
                    if str(p_entity.label) == "2":
                        for _sentence in list_sentence:
                            if _sentence.sentence_index == p_entity.sentence_index:
                                _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                                   end_index=p_entity.end_index, size=20, center_include=True,
                                                   word_flag=True, text=p_entity.entity_text)
                                if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(
                                        self.pattern_money_other, _span[0]) is None:
                                    p_entity.values[0] = 0.8 + p_entity.values[0] / 10
                                    p_entity.label = 0
                                if re.search(self.pattern_money_tenderer, _span[0]) is not None:
                                    if re.search(self.pattern_money_other, _span[0]) is not None:
                                        if re.search(self.pattern_money_tenderer, _span[0]).span()[1] > \
                                                re.search(self.pattern_money_other, _span[0]).span()[1]:
                                            p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                            p_entity.label = 1
                                    else:
                                        p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                        p_entity.label = 1
                                if re.search(self.pattern_money_tenderer_whole,
                                             "".join(_span)) is not None and re.search(self.pattern_money_other,
                                                                                       _span[0]) is None:
                                    p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                    p_entity.label = 1

            # 增加招标金额扩展，招标金额+连续的未识别金额，并且都可以匹配到标段信息，则将为识别的金额设置为招标金额
            list_p = []
            state = 0
            for p_entity in list_entity:
                for _sentence in list_sentence:
                    if _sentence.sentence_index == p_entity.sentence_index:
                        _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                           end_index=p_entity.end_index, size=20, center_include=True, word_flag=True,
                                           text=p_entity.entity_text)

                        if state == 2:
                            for _p in list_p[1:]:
                                _p.values[0] = 0.8 + _p.values[0] / 10
                                _p.label = 0
                            state = 0
                            list_p = []

                        if state == 0:
                            if p_entity.entity_type in ["money"]:
                                if str(p_entity.label) == "0" and re.search(self.pattern_pack,
                                                                            _span[0] + "-" + _span[2]) is not None:
                                    state = 1
                                    list_p.append(p_entity)
                        elif state == 1:
                            if p_entity.entity_type in ["money"]:
                                if str(p_entity.label) in ["0", "2"] and re.search(self.pattern_pack,
                                                                                   _span[0] + "-" + _span[
                                                                                       2]) is not None and re.search(
                                        self.pattern_money_other,
                                        _span[0] + "-" + _span[2]) is None and p_entity.sentence_index == list_p[
                                    0].sentence_index:
                                    list_p.append(p_entity)
                                else:
                                    state = 2

            if len(list_p) > 1:
                for _p in list_p[1:]:
                    # print("==",_p.entity_text,_p.sentence_index,_p.label)
                    _p.values[0] = 0.8 + _p.values[0] / 10
                    _p.label = 0
                state = 0
                list_p = []

            for p_entity in list_entity:
                # 将属于集合中的不可能是中标人的标签置为无
                if p_entity.entity_text in self.SET_NOT_TENDERER:
                    p_entity.label = 5

'''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
class RoleRuleFinalAdd():
    """Final regex pass that promotes one unlabelled org/company entity.

    Only the first article, its entity list and its codename dict are used.
    An organisation name is taken from the article text (a signature followed
    by a date at the very end, an account-holder line, or a contact/delivery
    line). If no tenderee (label 0) or agency (label 1) exists yet, a matching
    entity with label 5 is given that role with probability 0.5. When the text
    yields no name, an entity contained in the project name is made tenderee.
    """

    # Words that mark an organisation name as a tenderee-like body.
    _ORG_KEYWORDS = '医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站'
    # Words that mark an organisation name as a tendering agency.
    _AGENCY_KEYWORDS = '(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)'

    @staticmethod
    def _scan_roles(entities):
        """Return ``(tenderee_found, agency_found, unlabelled)`` for org/company entities."""
        tenderee_found = False
        agency_found = False
        unlabelled = []
        for ent in entities:
            if ent.entity_type not in ['org', 'company']:
                continue
            if ent.label == 0:
                tenderee_found = True
            elif ent.label == 1:
                agency_found = True
            elif ent.label == 5:
                unlabelled.append(ent)
        return tenderee_found, agency_found, unlabelled

    @staticmethod
    def _assign_from_tail(candidates, ent_re, label, last_three_only):
        """Give ``label`` to the last candidate whose text matches ``ent_re``.

        A candidate matches when its text equals ``ent_re`` or is contained in
        it and covers more than 60% of its length. With ``last_three_only``
        only the last three candidates are inspected.
        """
        for rank, ent in enumerate(reversed(candidates), start=1):
            if rank > 3 and last_three_only:
                break
            if ent.entity_text == ent_re or (
                    ent.entity_text in ent_re and len(ent.entity_text) / len(ent_re) > 0.6):
                ent.label = label
                ent.values[label] = 0.5
                break

    def predict(self, list_articles, list_entitys, list_codenames):
        """Apply the final role rules to the first article; entities are updated in place.

        Args:
            list_articles: ``list_articles[0].content`` is searched.
            list_entitys: ``list_entitys[0]`` holds the candidate entities.
            list_codenames: ``list_codenames[0]['name']`` is the project name,
                read only when the text yields no organisation name.

        Returns:
            None.
        """
        content = list_articles[0].content
        text_end = content[-40:]
        # Signature followed by a date within the last 40 characters.
        sear_ent = re.search(r'[，。]([\u4e00-\u9fa5()（）]{5,20}(，?[\u4e00-\u9fa5]{,6}(分公司|部))?)，?\s*[0-9零一二三四五六七八九十]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
        # Account-holder name.
        sear_ent2 = re.search(r'(户名|开户名称)[:：]([\u4e00-\u9fa5()（）]{5,20})[，。]', content[:5000])
        # Registration contact / delivery location.
        sear_ent3 = re.search(r'(报名咨询|收货地点|送货地点)[，：]([\u4e00-\u9fa5()（）]{5,20})[0-9\-]*[，。]', content[:5000])

        if sear_ent or sear_ent2 or sear_ent3:
            # Priority: contact/delivery line, then account holder, then signature.
            if sear_ent3:
                ent_re = sear_ent3.group(2)
            elif sear_ent2:
                ent_re = sear_ent2.group(2)
            else:
                ent_re = sear_ent.group(1)
            ent_re = ent_re.replace('，', '').replace("(","（").replace(")","）")

            tenderee_found, agency_found, ents = self._scan_roles(list_entitys[0])
            # The "last three entities" limit applies whenever the signature
            # pattern matched, even if the name came from one of the other two.
            last_three_only = bool(sear_ent)
            looks_like_agency = re.search(self._AGENCY_KEYWORDS, ent_re) is not None
            if not tenderee_found and (re.search(self._ORG_KEYWORDS, ent_re) or not looks_like_agency):
                self._assign_from_tail(ents, ent_re, 0, last_three_only)
            elif not agency_found and looks_like_agency:
                self._assign_from_tail(ents, ent_re, 1, last_three_only)

        elif list_codenames[0]['name']:
            # No name found in the text: treat a company contained in the
            # project name as the tenderee. A missing (None) name is skipped
            # instead of failing on the substring test below.
            project_name = list_codenames[0]['name']
            tenderee_found, _, ents = self._scan_roles(list_entitys[0])
            if not tenderee_found:
                for ent in ents:
                    if ent.entity_text in project_name:
                        ent.label = 0
                        ent.values[0] = 0.5
                        break


# 时间类别
class TimePredictor():
    def __init__(self):
        """Create a dedicated TensorFlow session/graph and load the time model.

        ``input_shape`` is the per-sample embedding shape: ``input_shape[1]``
        is used as the context window size and the maximum number of tokens
        per context. (Presumably 2 = left/right context and 128 = embedding
        width - confirm against the model.)
        """
        self.input_shape = (2,40,128)
        # Filled in by load_model(); None means "not loaded yet".
        self.inputs_code = None
        self.outputs_code = None
        self.sess = tf.Session(graph=tf.Graph())
        self.load_model()

    def load_model(self):
        model_path = os.path.dirname(__file__)+'/timesplit_model'
        if self.inputs_code is None:
            log("get model of time")
            with self.sess.as_default():
                with self.sess.graph.as_default():
                    meta_graph_def = tf.saved_model.loader.load(self.sess, tags=["serve"], export_dir=model_path)
                    signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
                    signature_def = meta_graph_def.signature_def
                    self.inputs_code = []
                    self.inputs_code.append(
                        self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name))
                    self.inputs_code.append(
                        self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name))
                    self.outputs_code = self.sess.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
                    return self.inputs_code, self.outputs_code
        else:
            return self.inputs_code, self.outputs_code

    def search_time_data(self,list_sentences,list_entitys):
        data_x = []
        points_entitys = []
        for list_sentence, list_entity in zip(list_sentences, list_entitys):
            p_entitys = 0
            p_sentences = 0
            list_sentence.sort(key=lambda x: x.sentence_index)
            while(p_entitys<len(list_entity)):
                entity = list_entity[p_entitys]
                if entity.entity_type in ['time']:
                    while(p_sentences<len(list_sentence)):
                        sentence = list_sentence[p_sentences]
                        if entity.doc_id == sentence.doc_id and entity.sentence_index == sentence.sentence_index:
                            # left = sentence.sentence_text[max(0,entity.wordOffset_begin-self.input_shape[1]):entity.wordOffset_begin]
                            # right = sentence.sentence_text[entity.wordOffset_end:entity.wordOffset_end+self.input_shape[1]]
                            s = spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=self.input_shape[1])
                            left = s[0]
                            right = s[1]
                            context = [left, right]
                            x = self.embedding_words(context, shape=self.input_shape)
                            data_x.append(x)
                            points_entitys.append(entity)
                            break
                        p_sentences += 1
                p_entitys += 1
        if len(points_entitys)==0:
            return None
        data_x = np.transpose(np.array(data_x), (1, 0, 2, 3))
        return [data_x, points_entitys]

    def embedding_words(self, datas, shape):
        '''
        @summary: look up the word vector of every token
        @param:
            datas: list of token sequences, one per output row
            shape: shape of the returned array
        @return: array of the given shape; positions without a token stay zero
        '''
        w2v = getModel_w2v()
        embed = np.zeros(shape)
        max_tokens = shape[1]
        for row, tokens in enumerate(datas):
            for col, token in enumerate(tokens):
                stripped = re.sub(r"\s*", "", token)
                if col >= max_tokens:
                    break
                # Tokens missing from the vocabulary use the 'unk' vector.
                key = stripped if stripped in w2v.vocab else 'unk'
                embed[row][col] = w2v[key]
        return embed

    def predict(self, list_sentences,list_entitys):
        datas = self.search_time_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        with self.sess.as_default():
            predict_y = limitRun(self.sess,[self.outputs_code], feed_dict={self.inputs_code[0]:datas[0][0]
                ,self.inputs_code[1]:datas[0][1]})[0]
            for i in range(len(predict_y)):
                entity = points_entitys[i]
                label = np.argmax(predict_y[i])
                values = []
                for item in predict_y[i]:
                    values.append(item)
                if label != 0:
                    if not timeFormat(entity.entity_text):
                        label = 0
                        values[0] = 0.5
                entity.set_Role(label, values)

# 产品字段提取
class ProductPredictor():
    def __init__(self):
        '''Load the character vocabulary, build the char -> index table and load the frozen model.'''
        module_dir = os.path.dirname(__file__)
        self.vocab = load(module_dir + "/codename_vocab.pk")
        # The position of a character in the vocabulary is its model input id.
        self.word2index = {word: position for position, word in enumerate(np.array(self.vocab))}
        # The model lives in its own graph/session, separate from the default graph.
        self.sess = tf.Session(graph=tf.Graph())
        self.load_model()

    def load_model(self):
        '''
        Import the frozen product / fail-reason graph into self.sess and keep
        handles to the tensors that predict() feeds and fetches.
        '''
        model_path = os.path.dirname(__file__)+'/product_savedmodel/productAndfailreason.pb'
        # attribute on self -> tensor name inside the imported graph
        tensor_names = (
            ('char_input', 'CharInputs:0'),
            ('length', "Sum:0"),
            ('dropout', "Dropout:0"),
            ('logit', "logits/Reshape:0"),
            ('tran', "crf_loss/transitions:0"),
        )
        with self.sess.as_default(), self.sess.graph.as_default():
            output_graph_def = tf.GraphDef()
            with open(model_path, 'rb') as f:
                output_graph_def.ParseFromString(f.read())
            tf.import_graph_def(output_graph_def, name='')
            self.sess.run(tf.global_variables_initializer())
            graph = self.sess.graph
            for attribute, tensor_name in tensor_names:
                setattr(self, attribute, graph.get_tensor_by_name(tensor_name))

    def decode(self,logits, lengths, matrix):
        '''
        Viterbi-decode every sequence of a batch.

        :param logits: per-sequence score arrays, indexed [position][tag]
        :param lengths: number of valid positions of each sequence
        :param matrix: transition matrix handed to viterbi_decode; it is expected to
                       have one more row/column than there are tags (the extra start
                       tag added below) -- TODO confirm against the trained model
        :return: list with one tag path per sequence, the artificial start step removed
        '''
        paths = []
        small = -1000.0
        for score, length in zip(logits, lengths):
            score = np.asarray(score[:length])
            # Take the tag count from the scores instead of hard-coding it, so a
            # retrained model with a different tag set needs no edit here.
            num_tags = score.shape[1]
            # Artificial first step: only the extra "start" column is likely.
            start = np.asarray([[small] * num_tags + [0]])
            # Every real step gets a very unlikely score in the "start" column.
            pad = small * np.ones([length, 1])
            padded = np.concatenate([score, pad], axis=1)
            padded = np.concatenate([start, padded], axis=0)
            path, _ = viterbi_decode(padded, matrix)
            paths.append(path[1:])
        return paths

    def predict(self, list_sentences,list_entitys=None,list_articles=None, fail=False, MAX_AREA=5000):
        '''
        Tag product mentions (and, in fail mode, failure reasons) with the CRF model.

        Normal mode: every sentence is tagged (at most MAX_AREA characters each) and a
        "product" Entity is appended to the matching list in list_entitys for every
        tag run matching "12*3". Each sentence list is sorted in place, longest first.

        Fail mode (fail is true and list_articles is non-empty): the first MAX_AREA
        characters of list_articles[0].content are tagged as one sequence; "12*3" runs
        become product entities in list_entitys[0] (when such a list was supplied) and
        "45*6" runs are collected as failure reasons.

        :param list_sentences: sentences of several announcements, [[sentences of one announcement], ...]
        :param list_entitys: entity lists, one per announcement; extended in place
        :param list_articles: articles; only list_articles[0] is read, and only in fail mode
        :param fail: switch to fail mode
        :param MAX_AREA: maximum characters per sentence / article, also the character budget of one batch
        :return: {'fail_reason': str}; the string is '' in normal mode
        '''
        if list_articles is None:
            list_articles = []
        unk = self.word2index.get('<unk>')
        with self.sess.as_default() as sess:
            with self.sess.graph.as_default():
                if fail and list_articles:
                    article = list_articles[0]
                    text = article.content[:MAX_AREA]
                    chars = [[self.word2index.get(it, unk) for it in text]]
                    batch_paths, lengths = self._tag_batch(sess, chars)
                    result = []
                    for path, length in zip(batch_paths, lengths):
                        tags = ''.join([str(it) for it in path[:length]])
                        # Without a target list there is nowhere to store product entities.
                        if list_entitys:
                            for it in re.finditer("12*3", tags):
                                start = it.start()
                                end = it.end()
                                _entity = Entity(doc_id=article.doc_id, entity_id="%s_%s_%s_%s" % (
                                    article.doc_id, 0, start, end),
                                                 entity_text=text[start:end],
                                                 entity_type="product", sentence_index=0,
                                                 begin_index=0, end_index=0, wordOffset_begin=start,
                                                 wordOffset_end=end)
                                list_entitys[0].append(_entity)
                        for it in re.finditer("45*6", tags):
                            result.append(text[it.start():it.end()].replace('？', '').strip())
                    reasons = []
                    for it in result:
                        # A ticked option is taken as the one and only reason.
                        if "(√)" in it or "（√）" in it:
                            reasons = [it]
                            break
                        if not reasons:
                            reasons.append(it)
                        elif it not in reasons[-1] and it not in reasons:
                            # Skip exact repeats and fragments of the previous reason.
                            reasons.append(it)
                    return {'fail_reason':'；'.join(reasons)}

                if list_entitys is None:
                    list_entitys = [[] for _ in range(len(list_sentences))]
                for list_sentence, list_entity in zip(list_sentences,list_entitys):
                    if len(list_sentence)==0:
                        continue
                    # Longest first: sentences of similar length share a batch.
                    list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
                    _begin_index = 0
                    while _begin_index < len(list_sentence):
                        MAX_LEN = min(len(list_sentence[_begin_index].sentence_text), MAX_AREA)
                        if MAX_LEN <= 0:
                            # Only empty sentences are left (the list is sorted by length);
                            # there is nothing to tag and MAX_AREA//MAX_LEN would divide by zero.
                            break
                        # As many sentences as fit into MAX_AREA characters after padding.
                        _LEN = MAX_AREA//MAX_LEN
                        batch = list_sentence[_begin_index:_begin_index+_LEN]
                        chars = [[self.word2index.get(it, unk) for it in sentence.sentence_text[:MAX_LEN]]
                                 for sentence in batch]
                        chars = pad_sequences(chars, maxlen=MAX_LEN, padding="post", truncating="post")
                        batch_paths, lengths = self._tag_batch(sess, chars)
                        for sentence, path, length in zip(batch, batch_paths, lengths):
                            tags = ''.join([str(it) for it in path[:length]])
                            for it in re.finditer("12*3", tags):
                                start = it.start()
                                end = it.end()
                                _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
                                sentence.doc_id, sentence.sentence_index, start, end),
                                                 entity_text=sentence.sentence_text[start:end],
                                                 entity_type="product", sentence_index=sentence.sentence_index,
                                                 begin_index=0, end_index=0, wordOffset_begin=start,
                                                 wordOffset_end=end)
                                list_entity.append(_entity)
                        _begin_index += _LEN
                return {'fail_reason': ""}

    def _tag_batch(self, sess, chars):
        '''Run the model on a batch of character-id rows; return (decoded tag paths, sequence lengths).'''
        lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
                                          feed_dict={
                                              self.char_input: np.asarray(chars),
                                              self.dropout: 1.0
                                          })
        return self.decode(scores, lengths, tran_), lengths


# 产品数量单价品牌规格提取 #2021/11/10 添加表格中的项目、需求、预算、时间要素提取
class ProductAttributesPredictor():
    def __init__(self):
        '''Set up the header regexes and load the set of known table-header texts.'''
        # Preferred pattern for the "name" header: a subject word followed by 名称/内容/描述.
        # Raw string so that `\)` reaches the regex engine without an invalid-escape warning.
        self.p1 = r'(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\)）]?(名称|内容|描述)'
        # Fallback pattern, tried by find_header only when p1 matches nothing.
        self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称'
        # NOTE(review): pickle executes code on load; header_set.pkl must stay a trusted,
        # package-shipped file and never come from user input.
        with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
            self.header_set = pickle.load(f)
    def isTrueTable(self, table):
        '''真假表格规则：
        1、包含<caption>或<th>标签为真
        2、包含大量链接、表单、图片或嵌套表格为假
        3、表格尺寸太小为假
        4、外层<table>嵌套子<table>,一般子为真，外为假'''
        if table.find_all(['caption', 'th']) != []:
            return True
        elif len(table.find_all(['form', 'a', 'img'])) > 5:
            return False
        elif len(table.find_all(['tr'])) < 2:
            return False
        elif len(table.find_all(['table'])) >= 1:
            return False
        else:
            return True

    def getTrs(self, tbody):
        # 获取所有的tr
        trs = []
        objs = tbody.find_all(recursive=False)
        for obj in objs:
            if obj.name == "tr":
                trs.append(obj)
            if obj.name == "tbody":
                for tr in obj.find_all("tr", recursive=False):
                    trs.append(tr)
        return trs

    def getTable(self, tbody):
        trs = self.getTrs(tbody)
        inner_table = []
        if len(trs) < 2:
            return inner_table
        for tr in trs:
            tr_line = []
            tds = tr.findChildren(['td', 'th'], recursive=False)
            if len(tds) < 2:
                continue
            for td in tds:
                td_text = re.sub('\s', '', td.get_text())
                tr_line.append(td_text)
            inner_table.append(tr_line)
        return inner_table

    def fixSpan(self, tbody):
        '''Expand colspan / rowspan cells in place so that every row lists its cells explicitly.

        The table element is modified directly; nothing is returned. A spanning cell
        has its span attribute reset to 1 and copies of it are inserted into the
        positions it used to cover.

        :param tbody: table (or tbody) element whose rows are collected with getTrs
        '''
        trs = self.getTrs(tbody)
        # NOTE: ths_len, ths and trs_set are filled in both passes below but are not
        # read anywhere inside this method.
        ths_len = 0
        ths = list()
        trs_set = set()
        # Columns are expanded first and rows second; the other order could scramble
        # the parsed table.
        # First pass: walk every tr and expand colspan.

        for indtr, tr in enumerate(trs):
            ths_tmp = tr.findChildren('th', recursive=False)
            # Rows that contain a nested table are left untouched.
            if len(tr.findChildren('table')) > 0:
                continue
            if len(ths_tmp) > 0:
                ths_len = ths_len + len(ths_tmp)
                for th in ths_tmp:
                    ths.append(th)
                trs_set.add(tr)
            # Walk the direct children of the row (snapshot taken before any insertion).
            tds = tr.findChildren(recursive=False)
            if len(tds) < 3:
                continue  # rows with too few columns are not expanded
            for indtd, td in enumerate(tds):
                # A cell with a numeric colspan is duplicated right after itself.
                if 'colspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['colspan']))) != "":
                    col = int(re.sub("[^0-9]", "", str(td['colspan'])))
                    # Very wide spans and very long cells are skipped.
                    if col < 10 and len(td.get_text()) < 500:
                        td['colspan'] = 1
                        for i in range(1, col, 1):
                            td.insert_after(copy.copy(td))
        # Second pass: expand rowspan.
        for indtr, tr in enumerate(trs):
            ths_tmp = tr.findChildren('th', recursive=False)
            # Rows that contain a nested table are left untouched.
            if len(tr.findChildren('table')) > 0:
                continue
            if len(ths_tmp) > 0:
                ths_len = ths_len + len(ths_tmp)
                for th in ths_tmp:
                    ths.append(th)
                trs_set.add(tr)
            # Walk the direct children of the row.
            tds = tr.findChildren(recursive=False)
            same_span = 0
            # Count the cells that carry the same rowspan value as the first cell.
            if len(tds) > 1 and 'rowspan' in tds[0].attrs:
                span0 = tds[0].attrs['rowspan']
                for td in tds:
                    if 'rowspan' in td.attrs and td.attrs['rowspan'] == span0:
                        same_span += 1
            # When every cell shares that rowspan the row is skipped; this is also
            # the case for a row without children (0 == 0).
            if same_span == len(tds):
                continue

            for indtd, td in enumerate(tds):
                # A cell with a numeric rowspan is copied into the same position of the following rows.
                if 'rowspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['rowspan']))) != "":
                    row = int(re.sub("[^0-9]", "", str(td['rowspan'])))
                    td['rowspan'] = 1
                    for i in range(1, row, 1):
                        # Fetch the cells of the i-th following row and insert at the matching position.
                        if indtr + i < len(trs):
                            tds1 = trs[indtr + i].findChildren(['td', 'th'], recursive=False)
                            if len(tds1) >= (indtd) and len(tds1) > 0:
                                if indtd > 0:
                                    tds1[indtd - 1].insert_after(copy.copy(td))
                                else:
                                    tds1[0].insert_before(copy.copy(td))
                            # The target row is one cell short of the position: append after its last cell.
                            elif len(tds1) > 0 and len(tds1) == indtd - 1:
                                tds1[indtd - 2].insert_after(copy.copy(td))

    def get_monthlen(self, year, month):
        '''输入年份、月份 int类型 得到该月份天数'''
        try:
            weekday, num = calendar.monthrange(int(year), int(month))
        except:
            num = 30
        return str(num)
    def fix_time(self, text, html, page_time):
        '''输入日期字段返回格式化日期'''
        for it in [('十二', '12'),('十一', '11'),('十','10'),('九','9'),('八','8'),('七','7'),
                   ('六','6'),('五','5'),('四','4'),('三','3'),('二','2'),('一','1')]:
            if it[0] in text:
                text = text.replace(it[0], it[1])
        if re.search('^\d{1,2}月$', text):
            m = re.search('^(\d{1,2})月$', text).group(1)
            if len(m) < 2:
                m = '0' + m
            year = re.search('(\d{4})年(.{,12}采购意向)?', html)
            if year:
                y = year.group(1)
                num = self.get_monthlen(y, m)
                if len(num) < 2:
                    num = '0' + num
                order_begin = "%s-%s-01" % (y, m)
                order_end = "%s-%s-%s" % (y, m, num)
            elif page_time != "":
                year = re.search('\d{4}', page_time)
                if year:
                    y = year.group(0)
                    num = self.get_monthlen(y, m)
                    if len(num) < 2:
                        num = '0' + num
                    order_begin = "%s-%s-01" % (y, m)
                    order_end = "%s-%s-%s" % (y, m, num)
                else:
                    y = str(datetime.datetime.now().year)
                    num = self.get_monthlen(y, m)
                    if len(num) < 2:
                        num = '0' + num
                    order_begin = "%s-%s-01" % (y, m)
                    order_end = "%s-%s-%s" % (y, m, num)
            else:
                y = str(datetime.datetime.now().year)
                num = self.get_monthlen(y, m)
                if len(num) < 2:
                    num = '0' + num
                order_begin = "%s-%s-01" % (y, m)
                order_end = "%s-%s-%s" % (y, m, num)
            return order_begin, order_end

        t1 = re.search('^(\d{4})(年|/|.|-)(\d{1,2})月?$', text)
        if t1:
            year = t1.group(1)
            month = t1.group(3)
            num = self.get_monthlen(year, month)
            if len(month)<2:
                month = '0'+month
            if len(num) < 2:
                num = '0'+num
            order_begin = "%s-%s-01" % (year, month)
            order_end = "%s-%s-%s" % (year, month, num)
            return order_begin, order_end
        t2 = re.search('^(\d{4})(年|/|.|-)(\d{1,2})(月|/|.|-)(\d{1,2})日?$', text)
        if t2:
            y = t2.group(1)
            m = t2.group(3)
            d = t2.group(5)
            m = '0'+ m if len(m)<2 else m
            d = '0'+d if len(d)<2 else d
            order_begin = order_end = "%s-%s-%s"%(y,m,d)
            return order_begin, order_end
        all_match = re.finditer('^(?P<y1>\d{4})(年|/|.)(?P<m1>\d{1,2})(?:(月|/|.)(?:(?P<d1>\d{1,2})日)?)?'
                                '(到|至|-)(?:(?P<y2>\d{4})(年|/|.))?(?P<m2>\d{1,2})(?:(月|/|.)'
                                '(?:(?P<d2>\d{1,2})日)?)?$', text)
        y1 = m1 = d1 = y2 = m2 = d2 = ""
        found_math = False
        for _match in all_match:
            if len(_match.group()) > 0:
                found_math = True
                for k, v in _match.groupdict().items():
                    if v!="" and v is not None:
                        if k == 'y1':
                            y1 = v
                        elif k == 'm1':
                            m1 = v
                        elif k == 'd1':
                            d1 = v
                        elif k == 'y2':
                            y2 = v
                        elif k == 'm2':
                            m2 = v
                        elif k == 'd2':
                            d2 = v
        if not found_math:
            return "", ""
        y2 = y1 if y2 == "" else y2
        d1 = '1' if d1 == "" else d1
        d2 = self.get_monthlen(y2, m2) if d2 == "" else d2
        m1 = '0' + m1 if len(m1) < 2 else m1
        m2 = '0' + m2 if len(m2) < 2 else m2
        d1 = '0' + d1 if len(d1) < 2 else d1
        d2 = '0' + d2 if len(d2) < 2 else d2
        order_begin = "%s-%s-%s"%(y1,m1,d1)
        order_end = "%s-%s-%s"%(y2,m2,d2)
        return order_begin, order_end

    def find_header(self, items, p1, p2):
        '''
        inner_table 每行正则检查是否为表头，是则返回表头所在列序号，及表头内容
        :param items: 列表，内容为每个td 文本内容
        :param p1: 优先表头正则
        :param p2: 第二表头正则
        :return: 表头所在列序号，是否表头，表头内容
        '''
        flag = False
        header_dic = {'名称': '', '数量': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': ''}
        product = ""  # 产品
        quantity = ""  # 数量
        unitPrice = ""  # 单价
        brand = ""  # 品牌
        specs = ""  # 规格
        demand = "" # 采购需求
        budget = "" # 预算金额
        order_time = "" # 采购时间

        for i in range(min(4, len(items))):
            it = items[i]
            if len(it) < 15 and re.search(p1, it) != None:
                flag = True
                product = it
                header_dic['名称'] = i
                break
        if not flag:
            for i in range(min(4, len(items))):
                it = items[i]
                if len(it) < 15 and re.search(p2, it) and re.search(
                        '编号|编码|号|情况|报名|单位|位置|地址|数量|单价|价格|金额|品牌|规格类型|型号|公司|中标人|企业|供应商|候选人', it) == None:
                    flag = True
                    product = it
                    header_dic['名称'] = i
                    break
        if flag:
            for j in range(i + 1, len(items)):
                if len(items[j]) > 20 and len(re.sub('[\(（].*[）\)]|[^\u4e00-\u9fa5]', '', items[j])) > 10:
                    continue
                if header_dic['数量']=="" and re.search('数量', items[j]):
                    header_dic['数量'] = j
                    quantity = items[j]
                elif re.search('单价', items[j]):
                    header_dic['单价'] = j
                    unitPrice = items[j]
                elif re.search('品牌', items[j]):
                    header_dic['品牌'] = j
                    brand = items[j]
                elif re.search('规格', items[j]):
                    header_dic['规格'] = j
                    specs = items[j]

                elif re.search('需求', items[j]):
                    header_dic['需求'] = j
                    demand = items[j]
                elif re.search('预算', items[j]):
                    header_dic['预算'] = j
                    budget = items[j]
                elif re.search('时间|采购实施月份|采购月份', items[j]):
                    header_dic['时间'] = j
                    order_time = items[j]

            if header_dic.get('名称', "") != "" :
                num = 0
                for it in (quantity, unitPrice, brand, specs, product, demand, budget, order_time):
                    if it != "":
                        num  += 1
                if num >=2:
                    return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)
        flag = False
        return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)

    def predict(self, docid='', html='', page_time=""):
        '''
        Extract product attributes and purchase-demand records from the tables of an announcement.

        Tables are visited from the last one found to the first. Small tables nested in
        a td are flattened to text; tables rejected by isTrueTable are skipped. In every
        remaining table a header row is located with find_header and the rows below it
        are read column by column. For announcements mentioning 采购意向, two-column
        "label | value" tables are tried first.

        :param docid: document id; not used by this method
        :param html: raw HTML of the announcement
        :param page_time: publication time, passed to fix_time as a fallback source for the year
        :return: ([attr_dic, demand_dic], total_product_money)
                 attr_dic = {'product_attrs': {'data': [...], 'header': [...], 'header_col': [...]}}
                 demand_dic = {'demand_info': {'data': [...], 'header': [...], 'header_col': [...]}}
                 total_product_money is the sum of unit price * quantity over the product
                 rows where both could be read as numbers
        '''
        soup = BeautifulSoup(html, 'lxml')
        flag_yx = re.search('采购意向', html) is not None
        tables = soup.find_all(['table'])
        headers = []
        headers_demand = []
        header_col = []
        product_link = []
        demand_link = []
        total_product_money = 0
        for table in reversed(tables):
            if table.parent.name == 'td' and len(table.find_all('td')) <= 3:
                # Tiny table inside a cell: replace it by its text so that the
                # enclosing table sees a plain cell.
                table.string = table.get_text()
                table.name = 'turntable'
                continue
            if not self.isTrueTable(table):
                continue
            self.fixSpan(table)
            inner_table = self.getTable(table)
            found_header = False
            header_colnum = 0

            if flag_yx:
                # Key/value layout: rows made of exactly two cells, "label | value".
                col0_l = []
                col1_l = []
                for tds in inner_table:
                    if len(tds) == 2:
                        col0_l.append(re.sub('：', '', tds[0]))
                        col1_l.append(tds[1])
                if len(set(col0_l) & self.header_set) > len(col0_l) * 0.2:
                    kv_headers = []
                    product = demand = budget = order_begin = order_end = ""
                    for label, value in zip(col0_l, col1_l):
                        if re.search('项目名称', label):
                            kv_headers.append(label)
                            product = value
                        elif re.search('采购需求|需求概况', label):
                            kv_headers.append(label)
                            demand = value
                        elif re.search('采购预算|预算金额', label):
                            kv_headers.append(label)
                            budget = self._unify_money(value, label)
                        elif re.search('采购时间|采购实施月份|采购月份', label):
                            kv_headers.append(label)
                            order_begin, order_end = self.fix_time(value.strip(), html, page_time)
                    if product != "" and demand != "" and budget != "" and order_begin != "":
                        link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
                                'order_begin': order_begin, 'order_end': order_end}
                        if link not in demand_link:
                            demand_link.append(link)
                            headers_demand.append('_'.join(kv_headers))
                        continue

            # Row scan. It always starts at the first row: the key/value pass above
            # uses its own loop variables and must not move the starting position.
            for tds in inner_table:
                not_empty = [it for it in tds if it != ""]
                # Skip rows that mostly repeat the same text (e.g. expanded spans).
                if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds) < 2:
                    continue

                if len(set(tds) & self.header_set) > len(tds) * 0.2:
                    header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
                    if found_header:
                        headers.append('_'.join(header_list))
                        headers_demand.append('_'.join(header_list2))
                        header_colnum = len(tds)
                        header_col.append('_'.join(tds))
                    continue
                # Data rows need a header and the same number of columns as that header.
                if not found_header or len(tds) != header_colnum:
                    continue

                # Text of every attribute column present in the header; absent columns read as "".
                cells = {key: tds[idx] for key, idx in header_dic.items() if idx != ""}
                name_cell = tds[header_dic.get('名称', "")]
                if not (re.search(r'[a-zA-Z\u4e00-\u9fa5]', name_cell) and name_cell not in self.header_set
                        and re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', name_cell) is None):
                    continue
                product = name_cell
                quantity = unitPrice = brand = specs = demand = budget = order_time = ""
                order_begin = order_end = ""

                if re.search(r'\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', cells.get('数量', "")):
                    quantity = cells['数量']
                if re.search(r'\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', cells.get('单价', "")):
                    # header_list[2] is the unit-price header text (it may carry the 万元 unit).
                    unitPrice = self._unify_money(cells['单价'], header_list[2])
                if re.search(r'\w', cells.get('品牌', "")):
                    brand = cells['品牌']
                if re.search(r'\w', cells.get('规格', "")):
                    specs = cells['规格']
                if re.search(r'\w', cells.get('需求', "")):
                    demand = cells['需求']
                if re.search(r'\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', cells.get('预算', "")):
                    # header_list2[2] is the budget header text.
                    budget = self._unify_money(cells['预算'], header_list2[2])
                if re.search(r'\w', cells.get('时间', "")):
                    order_time = cells['时间'].strip()
                    order_begin, order_end = self.fix_time(order_time, html, page_time)

                if quantity != "" or unitPrice != "" or brand != "" or specs != "":
                    link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
                            'brand': brand[:50], 'specs': specs}
                    if link not in product_link:
                        product_link.append(link)
                        # Quantity such as '1,200' optionally followed by a short unit, e.g. '3(台)'.
                        mat = re.match(r'([0-9.,]+)[(（]?\w{,3}[)）]?$', link['quantity'])
                        if link['unitPrice'] != "" and mat:
                            try:
                                total_product_money += float(link['unitPrice'])*float(mat.group(1).replace(',', ''))
                            except (ValueError, TypeError):
                                log('产品属性单价数量相乘出错, 单价： %s, 数量： %s'%(link['unitPrice'], link['quantity']))
                if budget != "" and order_time != "":
                    link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
                            'order_begin': order_begin, 'order_end': order_end}
                    if link not in demand_link:
                        demand_link.append(link)

        if len(product_link) > 0:
            attr_dic = {'product_attrs': {'data': product_link, 'header': headers, 'header_col': header_col}}
        else:
            attr_dic = {'product_attrs': {'data': [], 'header': [], 'header_col': []}}
        if len(demand_link) > 0:
            demand_dic = {'demand_info': {'data': demand_link, 'header': headers_demand, 'header_col': header_col}}
        else:
            demand_dic = {'demand_info': {'data': [], 'header': [], 'header_col': []}}
        return [attr_dic, demand_dic], total_product_money

    def _unify_money(self, value, header_text):
        '''Return a money cell as the string form of getUnifyMoney's result; the 万元 unit
        is taken over from the column header when the cell itself names no 万 unit.'''
        if '万元' in header_text and '万' not in value:
            value += '万元'
        value = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", value)
        return str(getUnifyMoney(value))

# docchannel类型提取
class DocChannel():
  """Predict the document type and life-cycle channel of an announcement.

  Two frozen TensorFlow graphs are loaded when the object is created: one
  scores the document-type labels stored in ``id2type`` and the other scores
  the life-cycle labels stored in ``id2life``. ``predict`` runs the models and
  ``predict_rule`` post-corrects the predicted channel with title/content
  regular expressions.
  """

  def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb'):
    """Load both frozen graphs and set the preprocessing constants.

    :param life_model: path of the life-cycle graph, appended to this file's directory
    :param type_model: path of the document-type graph, appended to this file's directory
    """
    # The "lift_*" attribute names (sic) hold the life-cycle model handles; they are
    # kept unchanged because they are part of the instance's visible state.
    self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
    self.mask, self.mask_title = self.load_life(life_model)
    self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
    self.type_mask, self.type_mask_title = self.load_type(type_model)
    self.sequen_len = 200  # 150 200
    self.title_len = 30
    self.sentence_num = 10
    self.kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'

    lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
    lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
    self.id2type = dict(enumerate(lb_type))
    self.id2life = dict(enumerate(lb_life))

  def _load_graph(self, model_path):
    """Import a frozen graph and return its session plus input/output tensors.

    :param model_path: path of the ``.pb`` file, appended to this file's directory
    :return: ``(sess, title, inputs, prob, softmax, mask, mask_title)``
    """
    with tf.Graph().as_default() as graph:
      graph_def = graph.as_graph_def()
      # Read the serialized graph and release the file handle before building the session.
      with open(os.path.dirname(__file__) + model_path, 'rb') as f:
        graph_def.ParseFromString(f.read())
      tf.import_graph_def(graph_def, name='')
      print("%d ops in the final graph" % len(graph_def.node))
      del graph_def
      sess = tf.Session(graph=graph)
      sess.run(tf.global_variables_initializer())
      inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
      prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
      title = sess.graph.get_tensor_by_name('inputs/title:0')
      mask = sess.graph.get_tensor_by_name('inputs/mask:0')
      mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
      softmax = sess.graph.get_tensor_by_name('output/softmax:0')
      return sess, title, inputs, prob, softmax, mask, mask_title

  def load_life(self,life_model):
    """Load the life-cycle model; see ``_load_graph`` for the returned tuple."""
    return self._load_graph(life_model)

  def load_type(self,type_model):
    """Load the document-type model; see ``_load_graph`` for the returned tuple."""
    return self._load_graph(type_model)

  def predict_process(self, docid='', doctitle='', dochtmlcon=''):
    """Turn a title and whitespace-separated content into model token lists.

    :param docid: unused, kept for interface compatibility
    :param doctitle: raw title text; it is word-segmented with ``selffool.cut``
    :param dochtmlcon: content text, split on whitespace; any non-string
        value (for example a float or ``None``) is treated as empty
    :return: ``(datas, datas_title)``, each a one-element list holding a token list
    """
    def get_kw_senten(s, span=10):
      """Collect up to ``sentence_num`` windows of ``span`` words around keyword hits."""
      doc_sens = []
      tmp = 0
      num = 0
      end_idx = 0
      for it in re.finditer(self.kws, s):
        left = s[end_idx:it.end()].split()
        right = s[it.end():].split()
        tmp_seg = s[tmp:it.start()].split()
        # Keep a hit only if it is the first one or lies more than `span` words after the last kept hit.
        if len(tmp_seg) > span or tmp == 0:
          doc_sens.append(' '.join(left[-span:] + right[:span]))
          end_idx = it.end() + 1 + len(' '.join(right[:span]))
          tmp = it.end()
          num += 1
          if num >= self.sentence_num:
            break
      if doc_sens == []:
        doc_sens.append(s)
      return doc_sens

    try:
      segword_title = ' '.join(selffool.cut(doctitle)[0])
      segword_content = dochtmlcon
    except Exception:
      # Best effort: a title that cannot be segmented yields empty inputs instead of a crash.
      segword_content = ''
      segword_title = ''
    if not isinstance(segword_content, str):
      segword_content = ''
    if not isinstance(segword_title, str):
      segword_title = ''
    # Order matters: the replacements are applied one after another.
    replacements = (
      (' 中 选 ', ' 中选 '), (' 中 标 ', ' 中标 '), (' 补 遗 ', ' 补遗 '),
      (' 更 多', ''), (' 更多', ''), (' 中 号 ', ' 中标 '), (' 中 选人 ', ' 中选人 '),
      (' 点击 下载 查看', ''), (' 咨询 报价 请 点击', ''), ('终结', '终止'),
    )
    for old, new in replacements:
      segword_content = segword_content.replace(old, new)
    # Keep only whitespace and characters in the \u4e00-\u9fa5 range.
    segword_title = re.sub(r'[^\s\u4e00-\u9fa5]', '', segword_title)
    segword_content = re.sub(r'[^\s\u4e00-\u9fa5]', '', segword_content)
    doc_word_list = segword_content.split()
    if len(doc_word_list) > self.sequen_len / 2:
      # Long document: first 100 words plus keyword windows taken from words 100..500.
      doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
      doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
    else:
      doc_sens = ' '.join(doc_word_list[:self.sequen_len])
    return [doc_sens.split()], [segword_title.split()]

  def is_houxuan(self, title, content):
    '''
    Decide from the title and the text content whether the announcement is a
    candidate-publicity notice.
    :param title: announcement title
    :param content: announcement body text
    :return: 1 if it is a candidate-publicity notice, 0 otherwise
    '''
    if re.search('候选人的?公示|评标结果|评审结果|中标公示', title):
      if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title):
        return 0
      return 1
    # Only the first 100 characters of the content are inspected.
    head = content[:100]
    if re.search('候选人的?公示', head):
      if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', head):
        return 0
      return 1
    return 0

  def predict(self, title='', list_sentence='', web_source_no=''):
    """Predict ``doctype`` and ``docchannel`` for one announcement.

    :param title: announcement title
    :param list_sentence: list of sentence objects exposing ``.tokens``; a
        plain string is used as the content text as-is, and any other value
        is treated as empty content
    :param web_source_no: data-source id; source '02104-7' bypasses the models
    :return: ``{'docchannel': {'docchannel': <life-cycle label>, 'doctype': <type label>}}``
    """
    if web_source_no in ['02104-7']:
      return {'docchannel': {'docchannel':'', 'doctype':'采招数据'}}

    # `content` must always be bound: it is read again further down.
    if isinstance(list_sentence, list):
      tokens = [tok for sent in list_sentence for tok in sent.tokens]
      content = ' '.join(tokens[:500])
    elif isinstance(list_sentence, str):
      content = list_sentence
    else:
      content = ''

    title = re.sub('[^\u4e00-\u9fa5]', '', title)
    if len(title)>50:
      title = title[:20]+title[-30:]
    data_content, data_title = self.predict_process(docid='', doctitle=title[-50:], dochtmlcon=content)  # title is capped at 50 characters
    text_len = min(len(data_content[0]), self.sequen_len)
    title_len = min(len(data_title[0]), self.title_len)
    result = {'docchannel': {'docchannel':'', 'doctype':''}}

    array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
    array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
    # Masks are 0 over real tokens and 1 over padding.
    content_mask = [[0] * text_len + [1] * (self.sequen_len - text_len)]
    title_mask = [[0] * title_len + [1] * (self.title_len - title_len)]

    pred = self.type_sess.run(self.type_softmax,
                              feed_dict={self.type_title: array_title,
                                         self.type_content: array_content,
                                         self.type_mask: content_mask,
                                         self.type_mask_title: title_mask,
                                         self.type_prob: 1})
    type_id = np.argmax(pred, axis=1)[0]
    result['docchannel']['doctype'] = self.id2type[type_id]
    if result['docchannel']['doctype'] not in ['', '新闻资讯']:
      pred = self.lift_sess.run(self.lift_softmax,
                                feed_dict={self.lift_title: array_title,
                                           self.lift_content: array_content,
                                           self.mask: content_mask,
                                           self.mask_title: title_mask,
                                           self.lift_prob: 1})
      life_id = np.argmax(pred, axis=1)[0]
      result['docchannel']['docchannel'] = self.id2life[life_id]
      if result['docchannel']['docchannel'] == '中标信息':
        title_alpha = ''.join([ch for ch in title if ch.isalpha()])
        content_alpha = ''.join([ch for ch in content if ch.isalpha()])
        if self.is_houxuan(title_alpha, content_alpha):
          result['docchannel']['docchannel'] = '候选人公示'

    return result

  def predict_rule(self, title, content, channel_dic, prem_dic):
    '''Rule-based correction of the predicted channel (rules added 2022/2/10).

    Very short, non-news announcements are labelled from the title alone; for
    longer ones a predicted '招标公告' / '中标信息' is cross-checked against
    whether ``prem_dic`` contains a 'win_tenderer' entry.
    :param title: announcement title
    :param content: announcement text
    :param channel_dic: result of ``predict``; modified in place
    :param prem_dic: extraction result, serialized with ``json.dumps`` for the check
    :return: the same ``channel_dic`` object
    '''
    hetong = '(合同|验收|履约)(公告|公示)|合同号?$'  # contract-title pattern
    zhongbiao_t = '(中标|中选|成交|入选|入围|结果|确认)(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选)结果|开标(记录|信息|情况)|单一来源|直接(选取|选定)|中标通知书|中标$'
    zhongbiao_c = '(中标|中选|成交|拟选用|拟邀请|最终选定的?|拟定)(供应商|供货商|服务商|企业|公司|单位|(候选)?人)(名称)?[:：]|[，。：.](供应商|供货商|服务商)(名称)?：|指定的中介服务机构：|建设服务单位：'
    zhaobiao_t = '(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈)(公告|公示|$)'
    title_cn = re.sub('[^\u4e00-\u9fa5]', '', title)
    info = channel_dic['docchannel']

    def has_winner():
      # Evaluated lazily so prem_dic is only serialized when a branch needs it.
      return 'win_tenderer' in json.dumps(prem_dic, ensure_ascii=False)

    def looks_like_win():
      return (re.search(zhongbiao_t, title_cn) or re.search(zhongbiao_t, content[:200])
              or re.search(zhongbiao_c, content))

    if len(re.sub('[^\u4e00-\u9fa5]', "", content))<50 and info['doctype'] != '新闻资讯':
      if re.search(hetong, title_cn) is not None:
        info['docchannel'] = '合同公告'
      elif re.search(zhongbiao_t, title_cn):
        info['docchannel'] = '中标信息'
      elif re.search(zhaobiao_t, title_cn):
        info['docchannel'] = '招标公告'
      else:
        info['docchannel'] = ''
    elif info.get('docchannel', '') == '招标公告' and has_winner():
      if re.search(hetong, title_cn) is not None:
        info['docchannel'] = '合同公告'
        log('正则把招标公告修改为合同公告')
      elif looks_like_win():
        info['docchannel'] = '中标信息'
        log('正则把招标公告修改为中标信息')
    elif info.get('docchannel', '') == '中标信息' and not has_winner():
      if re.search(hetong, title_cn):
        info['docchannel'] = '合同公告'
        log('正则把中标信息修改为合同公告')
      elif looks_like_win():
        pass
      elif re.search(zhaobiao_t, title_cn):
        info['docchannel'] = '招标公告'
        log('正则把中标信息修改为招标公告')
      elif re.search('中标|成交|中选|入选|入围|结果|供应商|供货商|候选人', title_cn+content) is None:
        info['docchannel'] = ''
        log('正则把中标信息修改为空')
    return channel_dic

# 保证金支付方式提取
class DepositPaymentWay():
    """Extract how a deposit has to be paid from announcement text.

    Two phrase patterns locate the text describing the payment method; the
    payment keywords found inside that text are joined with '；'.
    """

    def __init__(self,):
        # pt: "<deposit> payment method: ..." -- the description is capture group 3.
        self.pt = '(保证金的?(交纳|缴纳|应按下列|入账|支付)方式)[:：]*([^，。]{,60})'
        # pt2: "<deposit> must be paid by ... method" -- the description is capture group 2.
        self.pt2 = '保证金(必?须以|必?须?通过|以)(.{,8})方式'
        candidates = ['银行转账', '公?对公方?式?转账', '对公转账', '柜台转账', '(线上|网上)自?行?(缴纳|交纳|缴退|收退)',
                      '网上银行支付', '现金存入', '直接缴纳', '支票', '汇票', '本票', '电汇', '转账', '汇款', '随机码',
                      '入账', '基本账户转出', '基本账户汇入', '诚信库中登记的账户转出',
                      '银行保函', '电子保函', '担保函', '保证保险', '合法担保机构出具的担保', '金融机构、担保机构出具的保函']
        # Longest pattern strings first, so they come first in the '|' alternation.
        self.kws = sorted(candidates, key=len, reverse=True)

    def predict(self,content):
        """Return ``{'deposit_patment_way': <keywords joined by '；'>}``.

        The first pattern that matches decides the result, even when no
        keyword is found inside its captured text; the value is '' when
        neither pattern matches.
        """
        pay_way = {'deposit_patment_way':''}
        keyword_pattern = '|'.join(self.kws)
        for pattern, group_no in ((self.pt, 3), (self.pt2, 2)):
            hit = re.search(pattern, content)
            if hit is None:
                continue
            found = [m.group(0) for m in re.finditer(keyword_pattern, hit.group(group_no))]
            pay_way['deposit_patment_way'] = '；'.join(found)
            break
        return pay_way


# 总价单价提取
class TotalUnitMoney:
    """Flag money entities as total prices or unit prices."""

    def __init__(self):
        pass

    def predict(self, list_sentences, list_entitys):
        """Set ``is_total_money`` / ``is_unit_money`` on money entities in place.

        :param list_sentences: per-document sentence lists, indexed by an
            entity's ``sentence_index``
        :param list_entitys: per-document entity lists
        :return: None
        """
        for doc_idx, doc_entities in enumerate(list_entitys):
            for _entity in doc_entities:
                if _entity.entity_type != 'money':
                    continue
                sentence_text = list_sentences[doc_idx][_entity.sentence_index].sentence_text
                if _entity.label == 1:
                    # Entities with label 1 are checked for being a total price.
                    if extract_total_money(sentence_text,
                                           _entity.entity_text,
                                           [_entity.wordOffset_begin, _entity.wordOffset_end]):
                        _entity.is_total_money = 1
                elif extract_unit_money(sentence_text,
                                        _entity.entity_text,
                                        [_entity.wordOffset_begin, _entity.wordOffset_end]):
                    # Every other money entity is checked for being a unit price.
                    _entity.is_unit_money = 1


def getSavedModel():
    """Load the form-item Keras model from its HDF5 file and export it to ./h5_savedmodel/."""
    metric_fns = {"precision": precision, "recall": recall, "f1_score": f1_score}
    with tf.Graph().as_default():
        form_model = tf.keras.models.load_model("../form/model/model_form.model_item.hdf5",
                                                custom_objects=metric_fns)
        # Export from the session Keras used while loading the model.
        keras_session = tf.keras.backend.get_session()
        tf.saved_model.simple_save(keras_session,
                                   "./h5_savedmodel/",
                                   inputs={"image": form_model.input},
                                   outputs={"scores": form_model.output})
        
def getBiLSTMCRFModel(MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
    """Build and compile an Embedding -> BiLSTM -> Dense -> CRF tagging model.

    :param MAX_LEN: unused; the input layer accepts variable-length sequences
    :param vocab: vocabulary; only its length is used (embedding input size)
    :param EMBED_DIM: embedding dimension
    :param BiRNN_UNITS: total BiLSTM width (each direction gets half)
    :param chunk_tags: tag set; only its length is used (number of classes)
    :param weights: optional initial embedding matrix, or None for the default initialization
    :return: the compiled Keras model
    """
    tag_count = len(chunk_tags)
    token_ids = layers.Input(shape=(None,),dtype="int32")

    embed_kwargs = {"mask_zero": True}
    if weights is not None:
        embed_kwargs["weights"] = [weights]
        embed_kwargs["trainable"] = True
    embedded = layers.embeddings.Embedding(len(vocab), EMBED_DIM, **embed_kwargs)(token_ids)

    encoded = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2, return_sequences=True))(embedded)
    emissions = layers.TimeDistributed(layers.Dense(tag_count))(encoded)
    crf = CRF(tag_count, sparse_target=True)
    tagged = crf(emissions)

    model = models.Model(input=[token_ids], output=[tagged])
    model.summary()
    model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])
    return model


import h5py
def h5_to_graph(sess,graph,h5file):
    """Copy every weight stored in a Keras HDF5 file into tensors of `graph`.

    The "model_weights" group of the file is walked recursively; each dataset
    is assigned to the graph tensor whose name equals the dataset's path.

    :param sess: TensorFlow session used to run the assign ops
    :param graph: graph that contains the target tensors
    :param h5file: path of the HDF5 weights file
    """
    def readGroup(gr, parent_name, data):
        """Append ``[tensor_name, value]`` for every dataset below `gr` to `data`."""
        for subkey in gr:
            print(subkey)
            if parent_name != subkey:
                _name = subkey if parent_name == "" else parent_name + "/" + subkey
            else:
                # A sub-group named like its parent does not add a path segment.
                _name = parent_name
            item = gr[subkey]
            if isinstance(item, h5py.Group):
                readGroup(item, _name, data)
            else:
                # item[()] reads the whole dataset into memory (replaces the removed `.value`).
                data.append([_name, item[()]])
                print(_name, item.shape)

    list_name_value = []
    # All values are materialized before the file is closed.
    with h5py.File(h5file, 'r') as f:
        readGroup(f["model_weights"], "", list_name_value)

    for name, value in list_name_value:
        target = graph.get_tensor_by_name(name)
        print(name, target, np.shape(value))
        sess.run(tf.assign(target, value))


def initialize_uninitialized(sess):
    """Initialize the not-yet-initialized global variables whose name contains "Adam".

    :param sess: TensorFlow session in which the variables are checked and initialized
    """
    all_vars = tf.global_variables()
    init_flags = sess.run([tf.is_variable_initialized(v) for v in all_vars])

    # Only uninitialized variables with "Adam" in their name are touched.
    pending_adam = [v for v, ready in zip(all_vars, init_flags)
                    if not ready and re.search("Adam", v.name) is not None]

    print([str(v.name) for v in pending_adam]) # only for testing
    if pending_adam:
        sess.run(tf.variables_initializer(pending_adam))
    
      
def save_codename_model():
    """Restore the code/name BiLSTM-CRF checkpoint and export it to ./codename_savedmodel_tf/."""
    checkpoint_path = "../projectCode/models_tf/59-L0.471516189943-F0.8802154826344823-P0.8789179683459191-R0.8815168335321886/model.ckpt"
    embedding_matrix = load('codename_w2v_matrix.pk')

    with tf.get_default_graph().as_default() as g:
        sess = tf.Session(graph=g)
        # Build the network first so the Saver below can see its variables.
        char_input, logits, target, keepprob, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, embedding_matrix)
        sess.run(tf.global_variables_initializer())
        tf.train.Saver().restore(sess, checkpoint_path)

        export_inputs = {"inputs": char_input,
                         "inputs_length": length,
                         'keepprob': keepprob}
        export_outputs = {"logits": logits,
                          "trans": trans}
        tf.saved_model.simple_save(sess,
                                   "./codename_savedmodel_tf/",
                                   inputs=export_inputs,
                                   outputs=export_outputs)
        
    
def save_role_model():
    '''
    @summary: Save the role model as a SavedModel so that it can be deployed
              and invoked on the PAI platform. The export is written to
              ./role_savedmodel/.
    '''
    model_role = PREMPredict().model_role
    with model_role.graph.as_default():
        model = model_role.getModel()
        sess = tf.Session(graph=model_role.graph)
        print(type(model.input))
        
        # Initialize every variable, then overwrite them with the weights from the HDF5 file.
        sess.run(tf.global_variables_initializer())
        h5_to_graph(sess, model_role.graph, model_role.model_role_file)
        # NOTE(review): getModel() is called a second time here; whether it returns the
        # same model or builds a new one depends on its implementation -- confirm.
        model = model_role.getModel()
        
        tf.saved_model.simple_save(sess,
                                   "./role_savedmodel/",
                                   inputs={"input0":model.input[0],
                                           "input1":model.input[1],
                                           "input2":model.input[2]},
                                   outputs={"outputs":model.output}
                                   )


def save_money_model():
    """Load the money Keras model from its .h5 file and export it to ./money_savedmodel2/."""
    model_file = os.path.dirname(__file__)+"/../money/models/model_money_word.h5"
    metric_fns = {'precision': precision, 'recall': recall, 'f1_score': f1_score}

    export_graph = tf.Graph()
    with export_graph.as_default():
        export_sess = tf.Session(graph=export_graph)
        # The model is loaded while export_sess is the default session.
        with export_sess.as_default():
            money_model = models.load_model(model_file, custom_objects=metric_fns)
            money_model.summary()
            print(money_model.weights)
            export_inputs = {"input0": money_model.input[0],
                             "input1": money_model.input[1],
                             "input2": money_model.input[2]}
            tf.saved_model.simple_save(export_sess,
                                       "./money_savedmodel2/",
                                       inputs=export_inputs,
                                       outputs={"outputs": money_model.output})
    

def save_person_model():
    """Load the person model weights, print a sample prediction and export to ./person_savedmodel/."""
    model_person = EPCPredict().model_person
    person_graph = model_person.graph
    with person_graph.as_default():
        # Sample input from person_x.pk; axes 0 and 1 are swapped so that
        # sample[0] and sample[1] feed the model's two inputs.
        sample = np.array(load("person_x.pk")).transpose((1, 0, 2, 3))
        model = model_person.getModel()

        sess = tf.Session(graph=person_graph)
        with sess.as_default():
            sess.run(tf.global_variables_initializer())
            model_person.load_weights()

        feeds = {model.input[0]: sample[0], model.input[1]: sample[1]}
        predict_y = sess.run(model.output, feed_dict=feeds)
        print(np.argmax(predict_y, -1))

        tf.saved_model.simple_save(sess,
                                   "./person_savedmodel/",
                                   inputs={"input0": model.input[0],
                                           "input1": model.input[1]},
                                   outputs={"outputs": model.output})
    
def save_form_model():
    """Build the form "item" model, load its HDF5 weights and export to ./form_savedmodel/."""
    model_form = FormPredictor()
    form_graph = model_form.graph
    with form_graph.as_default():
        item_model = model_form.getModel("item")
        sess = tf.Session(graph=form_graph)
        # Initialize first, then overwrite the variables with the stored weights.
        sess.run(tf.global_variables_initializer())
        h5_to_graph(sess, form_graph, model_form.model_file_item)
        tf.saved_model.simple_save(sess,
                                   "./form_savedmodel/",
                                   inputs={"inputs": item_model.input},
                                   outputs={"outputs": item_model.output})
    
def save_codesplit_model():
    """Load the project-code Keras model and export it to ./codesplit_savedmodel/."""
    filepath_code = "../projectCode/models/model_code.hdf5"
    metric_fns = {'precision': precision, 'recall': recall, 'f1_score': f1_score}

    export_graph = tf.Graph()
    with export_graph.as_default():
        model_code = models.load_model(filepath_code, custom_objects=metric_fns)
        # A fresh session on the same graph; the weights are copied into it from the HDF5 file.
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        h5_to_graph(sess, export_graph, filepath_code)
        export_inputs = {"input0": model_code.input[0],
                         "input1": model_code.input[1],
                         "input2": model_code.input[2]}
        tf.saved_model.simple_save(sess,
                                   "./codesplit_savedmodel/",
                                   inputs=export_inputs,
                                   outputs={"outputs": model_code.output})

def save_timesplit_model():
    """Load the time-classification Keras model and export it to ./timesplit_model/."""
    filepath = '../time/model_label_time_classify.model.hdf5'
    metric_fns = {'precision': precision, 'recall': recall, 'f1_score': f1_score}

    with tf.Graph().as_default() as graph:
        time_model = models.load_model(filepath, custom_objects=metric_fns)
        with tf.Session() as sess:
            # Initialize, then overwrite the variables with the weights stored in the HDF5 file.
            sess.run(tf.global_variables_initializer())
            h5_to_graph(sess, graph, filepath)
            export_inputs = {"input0": time_model.input[0],
                             "input1": time_model.input[1]}
            tf.saved_model.simple_save(sess,
                                       "./timesplit_model/",
                                       inputs=export_inputs,
                                       outputs={"outputs": time_model.output})


# Script entry point. Every export helper call below is commented out, so running
# the module as a script currently does nothing; uncomment the call for the model
# whose SavedModel should be regenerated.
if __name__=="__main__":
    #save_role_model()
    # save_codename_model()
    # save_money_model()
    #save_person_model()
    #save_form_model()
    #save_codesplit_model()
    # save_timesplit_model()
    # The string literal below holds a disabled snippet that loads a person
    # SavedModel and prints predictions for person_x.pk; as a bare string it has no effect.
    '''
    # with tf.Session(graph=tf.Graph()) as sess:
    #     from tensorflow.python.saved_model import tag_constants
    #     meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], "./person_savedModel")
    #     graph = tf.get_default_graph()
    #     signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
    #     signature = meta_graph_def.signature_def
    #     input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name)
    #     input1 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input1"].name)
    #     outputs = sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name)
    #     x = load("person_x.pk")
    #     _data = np.transpose(x,[1,0,2,3])
    #     y = sess.run(outputs,feed_dict={input0:_data[0],input1:_data[1]})
    #     print(np.argmax(y,-1))
    '''