12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
7327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442 |
- '''
- Created on 2018年12月26日
- @author: User
- '''
- import os
- import sys
- from BiddingKG.dl.common.nerUtils import *
- sys.path.append(os.path.abspath("../.."))
- # from keras.engine import topology
- # from keras import models
- # from keras import layers
- # from keras_contrib.layers.crf import CRF
- # from keras.preprocessing.sequence import pad_sequences
- # from keras import optimizers,losses,metrics
- from BiddingKG.dl.common.Utils import *
- from BiddingKG.dl.interface.modelFactory import *
- import tensorflow as tf
- from BiddingKG.dl.product.data_util import decode, process_data
- from BiddingKG.dl.interface.Entitys import Entity
- from BiddingKG.dl.complaint.punish_predictor import Punish_Extract
- from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
- from bs4 import BeautifulSoup
- import copy
- import calendar
- import datetime
- from threading import RLock
# Registry of lazily created predictor singletons, keyed by predictor type.
# Each slot starts empty ("predictor" is None) and owns its own re-entrant
# lock so that different predictor types can be initialised independently.
dict_predictor = {
    predictor_type: {"predictor": None, "Lock": RLock()}
    for predictor_type in (
        "codeName",
        "prem",
        "epc",
        "roleRule",
        "roleRuleFinal",
        "tendereeRuleRecall",
        "form",
        "time",
        "punish",
        "product",
        "product_attrs",
        "channel",
        "deposit_payment_way",
        "total_unit_money",
    )
}
def getPredictor(_type):
    """Return the shared predictor registered under ``_type``.

    The predictor is constructed on first request while holding that type's
    lock in ``dict_predictor`` and is cached there for later calls.

    Args:
        _type: one of the keys of ``dict_predictor``.

    Returns:
        The cached predictor instance for ``_type``.

    Raises:
        NameError: if ``_type`` is not a key of ``dict_predictor``.
    """
    if _type not in dict_predictor:
        raise NameError("no this type of predictor")

    # Factories are wrapped in lambdas so that a class name is only looked
    # up when its predictor is actually requested.
    factories = {
        "codeName": lambda: CodeNamePredict(),
        "prem": lambda: PREMPredict(),
        "epc": lambda: EPCPredict(),
        "roleRule": lambda: RoleRulePredictor(),
        "roleRuleFinal": lambda: RoleRuleFinalAdd(),
        "tendereeRuleRecall": lambda: TendereeRuleRecall(),
        "form": lambda: FormPredictor(),
        "time": lambda: TimePredictor(),
        "punish": lambda: Punish_Extract(),
        "product": lambda: ProductPredictor(),
        "product_attrs": lambda: ProductAttributesPredictor(),
        "channel": lambda: DocChannel(),
        'deposit_payment_way': lambda: DepositPaymentWay(),
        'total_unit_money': lambda: TotalUnitMoney(),
    }

    slot = dict_predictor[_type]
    with slot["Lock"]:
        if slot["predictor"] is None:
            factory = factories.get(_type)
            if factory is not None:
                slot["predictor"] = factory()
        return slot["predictor"]
- # 编号名称模型
- class CodeNamePredict():
-
def __init__(self,EMBED_DIM=None,BiRNN_UNITS=None,lazyLoad=getLazyLoad()):
    """Load the vocabulary and label set and prepare two TensorFlow sessions.

    Args:
        EMBED_DIM: embedding size used in the legacy weight file name;
            60 when None.
        BiRNN_UNITS: recurrent unit count used in the legacy weight file
            name; 200 when None.
        lazyLoad: when falsy, both saved models are loaded immediately;
            otherwise loading is deferred to the first ``getModel`` /
            ``getModel_code`` call.
    """
    module_dir = os.path.dirname(__file__)

    self.model = None
    self.MAX_LEN = None
    self.model_code = None
    self.EMBED_DIM = 60 if EMBED_DIM is None else EMBED_DIM
    self.BiRNN_UNITS = 200 if BiRNN_UNITS is None else BiRNN_UNITS

    # Paths of the .hdf5 weight files; they are only stored here.
    self.filepath = module_dir+"/../projectCode/models/model_project_"+str(self.EMBED_DIM)+"_"+str(self.BiRNN_UNITS)+".hdf5"
    self.filepath_code = module_dir+"/../projectCode/models/model_code.hdf5"

    self.vocab = load(module_dir+"/codename_vocab.pk")
    self.class_labels = load(module_dir+"/codename_classlabels.pk")

    # Build the regexes that pull code (PC) and name (PN) spans out of a
    # predicted tag string: one begin tag, any number of middle tags, one
    # end tag, each written as its index in class_labels.
    labels = self.class_labels
    pc_begin, pc_middle, pc_end = [labels.index(tag) for tag in ("PC_B", "PC_M", "PC_E")]
    pn_begin, pn_middle, pn_end = [labels.index(tag) for tag in ("PN_B", "PN_M", "PN_E")]
    self.PC_pattern = re.compile(str(pc_begin)+str(pc_middle)+"*"+str(pc_end))
    self.PN_pattern = re.compile(str(pn_begin)+str(pn_middle)+"*"+str(pn_end))

    self.word2index = {word: position for position, word in enumerate(np.array(self.vocab))}

    # Tensor handles stay None until the corresponding saved model is loaded.
    self.inputs = None
    self.outputs = None
    # Each model gets its own graph and session.
    self.sess_codename = tf.Session(graph=tf.Graph())
    self.sess_codesplit = tf.Session(graph=tf.Graph())
    self.inputs_code = None
    self.outputs_code = None

    if not lazyLoad:
        self.getModel()
        self.getModel_code()
-
-
-
def getModel(self):
    """Return the tensors of the code/name tagging model, loading it once.

    On the first call the SavedModel in ``codename_savedmodel_tf`` (next to
    this file) is loaded into ``self.sess_codename`` and the tensors named
    by its default serving signature are cached on the instance.

    Returns:
        Tuple ``(inputs, inputs_length, keepprob, logits, trans)``.
    """
    if self.inputs is None:
        log("get model of codename")
        session = self.sess_codename
        with session.as_default():
            with session.graph.as_default():
                meta_graph_def = tf.saved_model.loader.load(session, ["serve"], export_dir=os.path.dirname(__file__)+"/codename_savedmodel_tf")
                signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
                signature = meta_graph_def.signature_def[signature_key]
                tensor_by_name = session.graph.get_tensor_by_name
                self.inputs = tensor_by_name(signature.inputs["inputs"].name)
                self.inputs_length = tensor_by_name(signature.inputs["inputs_length"].name)
                self.keepprob = tensor_by_name(signature.inputs["keepprob"].name)
                self.logits = tensor_by_name(signature.outputs["logits"].name)
                self.trans = tensor_by_name(signature.outputs["trans"].name)
    # A former Keras path (getBiLSTMCRFModel + load_weights(self.filepath))
    # used to sit here as disabled code; the SavedModel above replaces it.
    return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans
-
def getModel_code(self):
    """Return the tensors of the code-classification model, loading it once.

    On the first call the SavedModel in ``codesplit_savedmodel`` (next to
    this file) is loaded into ``self.sess_codesplit``, its three input
    tensors and its output tensor are cached, and the graph is finalized.

    Returns:
        Tuple ``(inputs_code, outputs_code)`` where ``inputs_code`` is the
        list of the ``input0``, ``input1`` and ``input2`` tensors.
    """
    if self.inputs_code is None:
        log("get model of code")
        session = self.sess_codesplit
        with session.as_default():
            with session.graph.as_default():
                meta_graph_def = tf.saved_model.loader.load(session, ["serve"], export_dir=os.path.dirname(__file__)+"/codesplit_savedmodel")
                signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
                signature = meta_graph_def.signature_def[signature_key]
                tensor_by_name = session.graph.get_tensor_by_name
                self.inputs_code = []
                for input_key in ("input0", "input1", "input2"):
                    self.inputs_code.append(tensor_by_name(signature.inputs[input_key].name))
                self.outputs_code = tensor_by_name(signature.outputs["outputs"].name)
                session.graph.finalize()
    # A former Keras path (models.load_model(self.filepath_code, ...)) used
    # to sit here as disabled code; the SavedModel above replaces it.
    return self.inputs_code,self.outputs_code
-
def getBiLSTMCRFModel(self,MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
    """Build and compile an Embedding -> BiLSTM -> Dense -> CRF tagger.

    NOTE(review): the Keras / keras_contrib imports at the top of this
    module are commented out, so ``layers``, ``models`` and ``CRF`` must be
    supplied by one of the wildcard imports for this to run - TODO confirm.

    Args:
        MAX_LEN: not used by this method.
        vocab: vocabulary; only its length is used (embedding input size).
        EMBED_DIM: embedding dimension.
        BiRNN_UNITS: total bidirectional LSTM size (half per direction).
        chunk_tags: tag set; only its length is used (number of classes).
        weights: optional initial embedding matrix; when given, the
            embedding is initialised with it and marked trainable.

    Returns:
        The compiled model (adam optimizer, CRF loss and CRF accuracy).
    """
    tag_count = len(chunk_tags)

    token_ids = layers.Input(shape=(None,))

    embedding_options = {"mask_zero": True}
    if weights is not None:
        embedding_options["weights"] = [weights]
        embedding_options["trainable"] = True
    embedded = layers.embeddings.Embedding(len(vocab), EMBED_DIM, **embedding_options)(token_ids)

    recurrent = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2, return_sequences=True))(embedded)
    tag_scores = layers.TimeDistributed(layers.Dense(tag_count))(recurrent)

    crf = CRF(tag_count, sparse_target=True)
    crf_out = crf(tag_scores)

    model = models.Model(input=[token_ids], output=[crf_out])
    model.summary()
    model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])
    return model
-
- #根据规则补全编号或名称两边的符号
def fitDataByRule(self,data):
    """Repair one unbalanced bracket at the edge of an extracted code/name.

    The text is only changed when the number of opening and closing
    brackets differs by exactly one:

    * one extra opening bracket: drop the first character if it is itself
      a bracket, otherwise append the closer of the first opening bracket;
    * one extra closing bracket: drop the last character if it is itself
      a bracket, otherwise prepend the opener of the first closing bracket.

    Args:
        data: extracted text.

    Returns:
        The repaired text, or ``data`` unchanged in every other case.
    """
    # Bracket -> counterpart, in both directions.
    # NOTE(review): this literal used to list "(" and ")" twice, and the
    # character classes below still do; full-width parentheses were
    # presumably lost somewhere - confirm against the canonical source.
    symbol_dict = {"(":")",
                   "[":"]",
                   "【":"】",
                   ")":"(",
                   "]":"[",
                   "】":"【"}
    # Raw strings: "\(" and "\[" are invalid escapes in a normal literal.
    leftfinds = re.findall(r"[\((\[【]", data)
    rightfinds = re.findall(r"[\))\]】]", data)

    left_count = len(leftfinds)
    right_count = len(rightfinds)
    if abs(left_count-right_count) != 1:
        # Balanced, bracket-free, or too unbalanced to repair safely.
        return data

    if left_count > right_count:
        if data[0] in symbol_dict:
            return data[1:]
        return data+symbol_dict[leftfinds[0]]

    if data[-1] in symbol_dict:
        return data[:-1]
    return symbol_dict[rightfinds[0]]+data
def decode(self,logits, trans, sequence_lengths, tag_num):
    """Viterbi-decode every sequence of a batch.

    Args:
        logits: per-sequence score matrices; each is cut to its length.
        trans: transition matrix passed to ``viterbi_decode``.
        sequence_lengths: length of each sequence in ``logits``.
        tag_num: not used by this method.

    Returns:
        List with the best tag path of each sequence.
    """
    decoded = (
        viterbi_decode(logit[:length], trans)
        for logit, length in zip(logits, sequence_lengths)
    )
    # viterbi_decode yields (path, score); only the path is kept.
    return [best_path for best_path, _best_score in decoded]
-
def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000):
    """Extract the project codes and the project name of each document.

    Sentences are tagged in batches (longest first); code candidates are
    confirmed by the code classifier; names are ranked by frequency times a
    keyword/length score. When the tagger yields no name or no code, regex
    rules over the raw sentence text are used as a fallback.

    Args:
        list_sentences: one list of sentence objects per document. Each
            sentence is read through ``sentence_text``, ``sentence_index``,
            ``doc_id`` and ``in_attachment``. Every list is re-sorted in
            place during processing and restored to ``sentence_index``
            order afterwards.
        list_entitys: optional list, parallel to ``list_sentences``, of
            per-document entity lists. Each accepted code and each found
            name is appended to the matching list as an ``Entity``.
        MAX_AREA: maximum number of characters of a sentence that is
            tagged; also bounds batch size times padded length.

    Returns:
        One entry per document: ``{"code": [...], "name": str}`` with the
        codes sorted longest first.
        NOTE(review): a document with no sentences yields the one-element
        list ``[{"code": [], "name": ""}]`` rather than a bare dict; kept
        as is because callers may rely on it - confirm.
    """
    pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
    # A "<project> name:" style label right before a name doubles its score.
    name_label_pattern = re.compile(r'(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]')
    context_len = 40  # characters of context on each side of a code candidate

    result = []
    index_unk = self.word2index.get("<unk>")
    if list_entitys is None:
        list_entitys = [[] for _ in range(len(list_sentences))]

    for list_sentence,list_entity in zip(list_sentences,list_entitys):
        if len(list_sentence)==0:
            result.append([{"code":[],"name":""}])
            continue

        # Longest first, so that each batch is padded to its first sentence.
        list_sentence.sort(key=lambda x:len(x.sentence_text),reverse=True)
        _begin_index = 0

        item = {"code":[],"name":""}
        code_set = set()
        dict_name_freq_score = dict()  # name -> [frequency, score]

        while True:
            MAX_LEN = min(len(list_sentence[_begin_index].sentence_text), MAX_AREA)
            if MAX_LEN == 0:
                # Every remaining sentence is empty (list is sorted by
                # length); stop instead of dividing by zero below.
                break
            _LEN = MAX_AREA//MAX_LEN
            batch = list_sentence[_begin_index:_begin_index+_LEN]

            # Tag the batch.
            x = [[self.word2index.get(word,index_unk) for word in sentence.sentence_text[:MAX_AREA]] for sentence in batch]
            x_len = [min(len(_x), MAX_LEN) for _x in x]
            x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")

            predict_y = None
            if USE_PAI_EAS:
                request = tf_predict_pb2.PredictRequest()
                request.inputs["inputs"].dtype = tf_predict_pb2.DT_INT32
                request.inputs["inputs"].array_shape.dim.extend(np.shape(x))
                request.inputs["inputs"].int_val.extend(np.array(x,dtype=np.int32).reshape(-1))
                request_data = request.SerializeToString()
                list_outputs = ["outputs"]
                _result = vpc_requests(codename_url, codename_authorization, request_data, list_outputs)
                if _result is not None:
                    # NOTE(review): the remote output is used as-is as the
                    # tag sequences - confirm its format with the service.
                    predict_y = _result["outputs"]
            if predict_y is None:
                # Remote service disabled or unavailable: use the local model.
                predict_y = self._tag_with_local_model(x, x_len)

            # Rows are iterated directly: decoded paths may differ in
            # length, which np.array() rejects on recent NumPy versions.
            for sentence,predict in zip(batch,predict_y):
                pad_sentence = sentence.sentence_text[:MAX_LEN]
                # One character per tag, so regex offsets on this string
                # are character offsets in pad_sentence.
                join_predict = "".join([str(s) for s in predict])

                code_x = []
                code_text = []
                temp_entitys = []
                for match in re.finditer(self.PC_pattern,join_predict):
                    start, stop = match.span()
                    begin = max(0, start-context_len)
                    end = stop+context_len
                    candidate = pad_sentence[start:stop].replace(",","")
                    code_x.append(embedding_word([pad_sentence[begin:start],candidate,pad_sentence[stop:end]],shape=(3,context_len,60)))
                    code_text.append(candidate)
                    _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,start,stop),entity_text=candidate,entity_type="code",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=start,wordOffset_end=stop,in_attachment=sentence.in_attachment)
                    temp_entitys.append(_entity)

                if len(code_x)>0:
                    # (candidate, part, ...) -> (part, candidate, ...): one
                    # array per classifier input.
                    code_x = np.transpose(np.array(code_x,dtype=np.float32),(1,0,2,3))
                    predict_code = None
                    if USE_PAI_EAS:
                        request = tf_predict_pb2.PredictRequest()
                        request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
                        request.inputs["input0"].array_shape.dim.extend(np.shape(code_x[0]))
                        request.inputs["input0"].float_val.extend(np.array(code_x[0],dtype=np.float64).reshape(-1))
                        request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
                        request.inputs["input1"].array_shape.dim.extend(np.shape(code_x[1]))
                        request.inputs["input1"].float_val.extend(np.array(code_x[1],dtype=np.float64).reshape(-1))
                        request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
                        request.inputs["input2"].array_shape.dim.extend(np.shape(code_x[2]))
                        request.inputs["input2"].float_val.extend(np.array(code_x[2],dtype=np.float64).reshape(-1))
                        request_data = request.SerializeToString()
                        list_outputs = ["outputs"]
                        _result = vpc_requests(codeclasses_url, codeclasses_authorization, request_data, list_outputs)
                        if _result is not None:
                            predict_code = _result["outputs"]
                    if predict_code is None:
                        # Remote service disabled or unavailable.
                        predict_code = self._classify_codes_with_local_model(code_x)

                    for h in range(len(predict_code)):
                        if predict_code[h][0]>0.5:
                            the_code = self.fitDataByRule(code_text[h])
                            # Add the accepted code to the document entities.
                            list_entity.append(temp_entitys[h])
                            code_set.add(the_code)
                            item['code'] = list(code_set)

                for match in re.finditer(self.PN_pattern,join_predict):
                    start, stop = match.span()
                    _name = self.fitDataByRule(pad_sentence[start:stop])
                    # Add the name to the document entities.
                    _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,start,stop),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=start,wordOffset_end=stop,in_attachment=sentence.in_attachment)
                    list_entity.append(_entity)
                    # Look at the (up to) 10 characters before the name.
                    # The lower bound is clamped: a negative slice start
                    # would count from the end of the sentence instead.
                    label_window = pad_sentence[max(0, start-10):start]
                    w = 1 if name_label_pattern.search(label_window) is not None else 0.5
                    if _name not in dict_name_freq_score:
                        dict_name_freq_score[_name] = [1, (len(pattern_score.findall(_name)) + len(_name) * 0.05)*w]
                    else:
                        dict_name_freq_score[_name][0] += 1

            if _begin_index+_LEN>=len(list_sentence):
                break
            _begin_index += _LEN

        # Rule added 2020/11/23 for large sites: when the tagger found no
        # name, take the text following a "<project> name:" label.
        if not dict_name_freq_score:
            name_re1 = r'(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
            for sentence in list_sentence:
                othername = re.search(name_re1, sentence.sentence_text)
                if othername is not None:
                    project_name = othername.group(3)
                    beg = find_index([project_name], sentence.sentence_text)[0]
                    end = beg + len(project_name)
                    _name = self.fitDataByRule(sentence.sentence_text[beg:end])
                    # Add the name to the document entities.
                    _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
                        sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
                                     entity_type="name", sentence_index=sentence.sentence_index, begin_index=0,
                                     end_index=0, wordOffset_begin=beg, wordOffset_end=end,in_attachment=sentence.in_attachment)
                    list_entity.append(_entity)
                    w = 1
                    if _name not in dict_name_freq_score:
                        dict_name_freq_score[_name] = [1, (len(pattern_score.findall(_name)) + len(_name) * 0.05) * w]
                    else:
                        dict_name_freq_score[_name][0] += 1

        if dict_name_freq_score:
            # Highest frequency * score wins; ties go to the first name seen.
            best_name, _ = max(dict_name_freq_score.items(), key=lambda kv:kv[1][0]*kv[1][1])
            item['name'] = best_name

        # Rule added 2020/11/23 for large sites: when no code was accepted,
        # take the text following a "<project> number:" style label.
        if not item['code']:
            for sentence in list_sentence:
                othercode = re.search(r'(项目|采购|招标|品目|询价|竞价|询价单|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告)(单号|编号|标号|编码|代码|备案号|号)[::\s]+([^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。]', sentence.sentence_text)
                if othercode is not None:
                    item['code'].append(othercode.group(3))
        item['code'].sort(key=lambda x:len(x),reverse=True)
        result.append(item)

        # Restore the caller's sentence order.
        list_sentence.sort(key=lambda x: x.sentence_index,reverse=False)
    return result

def _tag_with_local_model(self, x, x_len):
    """Tag a padded batch with the locally loaded model and decode it."""
    with self.sess_codename.as_default():
        t_input,t_input_length,t_keepprob,t_logits,t_trans = self.getModel()
        _logits,_trans = self.sess_codename.run([t_logits,t_trans],feed_dict={t_input:x,
                                                                              t_input_length:x_len,
                                                                              t_keepprob:1.0})
        return self.decode(_logits,_trans,x_len,7)

def _classify_codes_with_local_model(self, code_x):
    """Score code candidates with the locally loaded classifier."""
    with self.sess_codesplit.as_default():
        with self.sess_codesplit.graph.as_default():
            inputs_code,outputs_code = self.getModel_code()
            return limitRun(self.sess_codesplit,[outputs_code],feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]})[0]
-
-
- '''
- #当数据量过大时会报错
- def predict(self,articles,MAX_LEN = None):
- sentences = []
- for article in articles:
- for sentence in article.content.split("。"):
- sentences.append([sentence,article.id])
- if MAX_LEN is None:
- sent_len = [len(sentence[0]) for sentence in sentences]
- MAX_LEN = max(sent_len)
- #print(MAX_LEN)
-
- #若为空,则直接返回空
- result = []
- if MAX_LEN==0:
- for article in articles:
- result.append([article.id,{"code":[],"name":""}])
- return result
-
- index_unk = self.word2index.get("<unk>")
- index_pad = self.word2index.get("<pad>")
-
- x = [[self.word2index.get(word,index_unk)for word in sentence[0]]for sentence in sentences]
- x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
-
- predict_y = self.getModel().predict(x)
-
-
- last_doc_id = ""
- item = []
- for sentence,predict in zip(sentences,np.argmax(predict_y,-1)):
- pad_sentence = sentence[0][:MAX_LEN]
- doc_id = sentence[1]
- join_predict = "".join([str(s) for s in predict])
- if doc_id!=last_doc_id:
- if last_doc_id!="":
- result.append(item)
- item = [doc_id,{"code":[],"name":""}]
- code_set = set()
- code_x = []
- code_text = []
- for iter in re.finditer(self.PC_pattern,join_predict):
- get_len = 40
- if iter.span()[0]<get_len:
- begin = 0
- else:
- begin = iter.span()[0]-get_len
- end = iter.span()[1]+get_len
- code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]],pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
- code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]])
- if len(code_x)>0:
- code_x = np.transpose(np.array(code_x),(1,0,2,3))
- predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
- for h in range(len(predict_code)):
- if predict_code[h][0]>0.5:
- the_code = self.fitDataByRule(code_text[h])
- if the_code not in code_set:
- code_set.add(the_code)
- item[1]['code'] = list(code_set)
- if item[1]['name']=="":
- for iter in re.finditer(self.PN_pattern,join_predict):
- #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
- item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
- break
-
- last_doc_id = doc_id
- result.append(item)
- return result
- '''
-
- #角色金额模型
class PREMPredict():
    """Role and money classification for the entities of parsed documents.

    ``model_role`` is applied to ``org``/``company`` entities and
    ``model_money`` to ``money`` entities. Results are written back onto the
    entity objects through ``set_Role`` / ``set_Money``; nothing is returned.
    """

    def __init__(self):
        self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5"
        self.model_role = Model_role_classify_word()
        self.model_money = Model_money_classify()

    @staticmethod
    def _serialize_eas_request(model_inputs):
        """Pack ``model_inputs`` into a serialized PAI-EAS ``PredictRequest``.

        The i-th array is written to the request input named ``input<i>`` as a
        float tensor (shape taken from the array, values flattened).
        """
        request = tf_predict_pb2.PredictRequest()
        for idx, array in enumerate(model_inputs):
            name = "input%d" % idx
            request.inputs[name].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs[name].array_shape.dim.extend(np.shape(array))
            request.inputs[name].float_val.extend(np.array(array, dtype=np.float64).reshape(-1))
        return request.SerializeToString()

    def search_role_data(self, list_sentences, list_entitys):
        """Collect role-model inputs for every ``org``/``company`` entity.

        Args:
            list_sentences: per-document lists of sentence objects.
            list_entitys: per-document lists of entity objects, aligned with
                ``list_sentences``.

        Returns:
            ``[data_x, points_entitys, text_list]`` (encoded inputs, the
            matching entities, and a text window of 10 characters on each
            side of the entity), or ``None`` when no entity qualified.

        Note:
            Both inner lists are sorted in place by ``sentence_index``.
        """
        text_list = []
        data_x = []
        points_entitys = []
        for list_entity, list_sentence in zip(list_entitys, list_sentences):
            list_entity.sort(key=lambda x: x.sentence_index)
            list_sentence.sort(key=lambda x: x.sentence_index)
            # Both lists are sorted, so one forward-only sentence cursor is
            # shared by all entities of the document.
            p_sentences = 0
            for entity in list_entity:
                if entity.entity_type not in ['org', 'company']:
                    continue
                while p_sentences < len(list_sentence):
                    sentence = list_sentence[p_sentences]
                    if entity.doc_id == sentence.doc_id and entity.sentence_index == sentence.sentence_index:
                        text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-10):entity.wordOffset_end+10])
                        item_x = self.model_role.encode(tokens=sentence.tokens, begin_index=entity.begin_index,
                                                        end_index=entity.end_index, entity_text=entity.entity_text)
                        data_x.append(item_x)
                        points_entitys.append(entity)
                        # Stay on this sentence: the next entity may live in it too.
                        break
                    p_sentences += 1

        if len(points_entitys) == 0:
            return None

        return [data_x, points_entitys, text_list]

    def search_money_data(self, list_sentences, list_entitys):
        """Collect money-model inputs for every ``money`` entity.

        Args:
            list_sentences: per-document lists of sentence objects.
            list_entitys: per-document lists of entity objects, aligned with
                ``list_sentences``.

        Returns:
            ``[data_x, points_entitys, text_list]`` (encoded inputs, the
            matching entities, and the entity text preceded by up to 8
            characters of left context), or ``None`` when no entity qualified.

        Note:
            Both inner lists are sorted in place by ``sentence_index``.
        """
        text_list = []
        data_x = []
        points_entitys = []
        for list_entity, list_sentence in zip(list_entitys, list_sentences):
            list_entity.sort(key=lambda x: x.sentence_index)
            list_sentence.sort(key=lambda x: x.sentence_index)
            for entity in list_entity:
                if entity.entity_type != "money":
                    continue
                # Unlike the role lookup, the sentence scan restarts per entity.
                for sentence in list_sentence:
                    if entity.doc_id == sentence.doc_id and entity.sentence_index == sentence.sentence_index:
                        text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin - 8):entity.wordOffset_end])
                        item_x = self.model_money.encode(tokens=sentence.tokens, begin_index=entity.begin_index,
                                                         end_index=entity.end_index)
                        data_x.append(item_x)
                        points_entitys.append(entity)
                        break

        if len(points_entitys) == 0:
            return None

        return [data_x, points_entitys, text_list]

    def predict_role(self, list_sentences, list_entitys):
        """Predict a role for each ``org``/``company`` entity and store it.

        Uses the remote PAI-EAS service when ``USE_PAI_EAS`` is set, falling
        back to the local model when the request yields no result. A few
        regex rules on the surrounding text then override the predicted label
        before ``entity.set_Role(label, values)`` is called.
        """
        datas = self.search_role_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        text_list = datas[2]
        if USE_PAI_EAS:
            _data = np.transpose(np.array(datas[0]), (1, 0, 2))
            request_data = self._serialize_eas_request([_data[0], _data[1], _data[2]])
            list_outputs = ["outputs"]
            _result = vpc_requests(role_url, role_authorization, request_data, list_outputs)
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                predict_y = self.model_role.predict(datas[0])
        else:
            predict_y = self.model_role.predict(np.array(datas[0], dtype=np.float64))
        # NOTE(review): label indices presumably follow the
        # tenderee=0 / agency=1 / winTenderer=2 mapping used by
        # RoleRulePredictor in this file -- confirm against the model.
        for i, values in enumerate(predict_y):
            entity = points_entitys[i]
            label = np.argmax(values)
            text = text_list[i]
            if label == 2:
                if re.search('中标单位和.{,25}签订合同', text):
                    label = 0
                    values[label] = 0.501
                elif re.search('尊敬的供应商:.{,25}我公司', text):
                    label = 0
                    values[label] = 0.801
            elif label == 1 and re.search('委托(单位|人|方)[是为:]+', text[:10]) and re.search('受委托(单位|人|方)[是为:]+', text[:10]) is None:
                label = 0
                values[label] = 0.501
            elif label == 1 and re.search('([,。:]|^)(服务|中选)机构(名称)?', text[:-10]):
                label = 2
                values[label] = 0.501
            entity.set_Role(label, values)

    def predict_money(self, list_sentences, list_entitys):
        """Predict a money class for each ``money`` entity and store it.

        Uses the remote PAI-EAS service when ``USE_PAI_EAS`` is set, falling
        back to the local model when the request yields no result. Two rules
        lower the winning probability to 0.49 before
        ``entity.set_Money(label, values)`` is called.
        """
        datas = self.search_money_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        _data = datas[0]
        text_list = datas[2]
        if USE_PAI_EAS:
            # Keep the transposed layout in its own name: the local model
            # fallback must receive the same untransposed samples as the
            # non-EAS branch (the original passed the transposed array).
            eas_inputs = np.transpose(np.array(_data), (1, 0, 2, 3))
            request_data = self._serialize_eas_request([eas_inputs[0], eas_inputs[1], eas_inputs[2]])
            list_outputs = ["outputs"]
            _result = vpc_requests(money_url, money_authorization, request_data, list_outputs)
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                predict_y = self.model_money.predict(_data)
        else:
            predict_y = self.model_money.predict(_data)
        for i, values in enumerate(predict_y):
            entity = points_entitys[i]
            label = np.argmax(values)
            text = text_list[i]
            if label == 1 and re.search('[::,。](总金额|总价|单价)', text):
                values[label] = 0.49
            elif label == 0 and entity.notes in ["投资", "工程造价"]:
                values[label] = 0.49
            entity.set_Money(label, values)

    def predict(self, list_sentences, list_entitys):
        """Run role prediction followed by money prediction."""
        self.predict_role(list_sentences, list_entitys)
        self.predict_money(list_sentences, list_entitys)
-
-
- #联系人模型
class EPCPredict():
    """Contact-person classification and person-to-phone matching.

    ``predict`` only runs the person classifier. ``person_search_phone`` is a
    separate step that is not invoked from ``predict``; callers run it
    themselves once person labels are available.
    """

    def __init__(self):
        self.model_person = Model_person_classify()

    @staticmethod
    def _serialize_eas_request(model_inputs):
        """Pack ``model_inputs`` into a serialized PAI-EAS ``PredictRequest``.

        The i-th array is written to the request input named ``input<i>`` as a
        float tensor (shape taken from the array, values flattened).
        """
        request = tf_predict_pb2.PredictRequest()
        for idx, array in enumerate(model_inputs):
            name = "input%d" % idx
            request.inputs[name].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs[name].array_shape.dim.extend(np.shape(array))
            request.inputs[name].float_val.extend(np.array(array, dtype=np.float64).reshape(-1))
        return request.SerializeToString()

    @staticmethod
    def _sentence_offsets(list_sentence):
        """Map each ``sentence_index`` to the character offset of its start.

        The offset is the summed ``sentence_text`` length of all sentences
        that sort before it. A running total is used so that documents whose
        indices do not start at 0, or have gaps, no longer raise ``KeyError``.
        """
        offsets = {}
        running_total = 0
        for sentence in sorted(list_sentence, key=lambda x: x.sentence_index):
            offsets[sentence.sentence_index] = running_total
            running_total += len(sentence.sentence_text)
        return offsets

    @staticmethod
    def _lookback_indexes(index, window=4):
        """Positions of up to ``window`` items before ``index``, nearest first.

        The stop bound is clamped to -1 (not 0) so position 0 is reachable.
        """
        return range(index - 1, max(-1, index - 1 - window), -1)

    def search_person_data(self, list_sentences, list_entitys):
        """Collect person-model inputs for every ``person`` entity.

        Args:
            list_sentences: per-document lists of sentence objects.
            list_entitys: per-document lists of entity objects, aligned with
                ``list_sentences``.

        Returns:
            ``[data_x, points_entitys]`` or ``None`` when there is no person
            entity.

        Raises:
            KeyError: if a person entity refers to a ``sentence_index`` that
                is absent from its document's sentence list.
        """
        data_x = []
        points_entitys = []
        for list_entity, list_sentence in zip(list_entitys, list_sentences):
            dict_index_sentence = {_sentence.sentence_index: _sentence for _sentence in list_sentence}
            for entity in list_entity:
                if entity.entity_type != "person":
                    continue
                sentence = dict_index_sentence[entity.sentence_index]
                item_x = self.model_person.encode(tokens=sentence.tokens, begin_index=entity.begin_index,
                                                  end_index=entity.end_index)
                data_x.append(item_x)
                points_entitys.append(entity)
        if len(points_entitys) == 0:
            return None

        return [data_x, points_entitys]

    def predict_person(self, list_sentences, list_entitys):
        """Classify every person entity and store the result on the entity.

        Uses the remote PAI-EAS service when ``USE_PAI_EAS`` is set, falling
        back to the local model when the request yields no result. Each entity
        receives ``set_Person(label, values, [])``; the phone list is left
        empty here (phone matching lives in ``person_search_phone``).
        """
        datas = self.search_person_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        if USE_PAI_EAS:
            _data = np.transpose(np.array(datas[0]), (1, 0, 2, 3))
            request_data = self._serialize_eas_request([_data[0], _data[1]])
            list_outputs = ["outputs"]
            _result = vpc_requests(person_url, person_authorization, request_data, list_outputs)
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                predict_y = self.model_person.predict(datas[0])
        else:
            predict_y = self.model_person.predict(datas[0])
        assert len(predict_y)==len(points_entitys)
        for i, scores in enumerate(predict_y):
            entity = points_entitys[i]
            label = np.argmax(scores)
            values = list(scores)
            entity.set_Person(label, values, [])

    def _collect_phone_entitys(self, list_sentence, phone, key_word):
        """Build ``phone`` entities for the phone numbers found in a document.

        Candidates come from the ``phone`` pattern and from digit runs that
        follow a keyword matched by ``key_word``. A candidate is dropped when
        its left context looks like a fax/mailbox (without "telephone"), an
        account/serial/price field, or when it is followed by a decimal part.
        """
        phone_entitys = []
        for _sentence in list_sentence:
            sentence_text = _sentence.sentence_text
            res_set = set()
            for i in re.finditer(phone, sentence_text):
                res_set.add((i.group(), i.start(), i.end()))
            for i in re.finditer(key_word, sentence_text):
                res_set.add((i.group(2), i.start() + len(i.group(1)), i.end()))
            for item in res_set:
                phone_left = sentence_text[max(0, item[1]-10):item[1]]
                phone_right = sentence_text[item[2]:item[2]+8]
                # Exclude fax numbers and other false positives.
                if re.search("传,?真|信,?箱|邮,?箱", phone_left):
                    if not re.search("电,?话", phone_left):
                        continue
                if re.search("帐,?号|编,?号|报,?价|证,?号|价,?格|[\((]万?元[\))]", phone_left):
                    continue
                if re.search("[.,]\d{2,}", phone_right):
                    continue
                _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, None, None,
                                 item[1], item[2], in_attachment=_sentence.in_attachment)
                phone_entitys.append(_entity)
        return phone_entitys

    def person_search_phone(self, list_sentences, list_entitys):
        """Assign a phone number to labelled contact persons.

        For every document, phone numbers are extracted with regexes and each
        person whose ``label`` is 1, 2 or 3 is paired with nearby phones: first
        the following ones (fewer than 2 sentences and 50 characters away),
        and, only if none qualified, the preceding ones (same sentence or
        earlier index, fewer than 30 characters away). A linear-sum assignment
        then picks at most one phone per person and the result is stored in
        ``person_phone``. Every person entity has ``person_phone`` reset to
        ``""`` first.
        """
        from scipy.optimize import linear_sum_assignment
        from BiddingKG.dl.interface.Entitys import Match

        def dispatch(match_list):
            """Pick the best one-to-one person/phone pairing (KM assignment)."""
            if not match_list:
                # Nothing to assign; avoid solving an empty cost matrix.
                return []
            main_roles = list(set([match.main_role for match in match_list]))
            attributes = list(set([match.attribute for match in match_list]))
            label = np.zeros(shape=(len(main_roles), len(attributes)))
            for match in match_list:
                # The +10000 offset is intended to keep candidate cells
                # non-zero so they differ from the "no candidate" cells.
                label[main_roles.index(match.main_role), attributes.index(match.attribute)] = match.value + 10000
            gragh = -label
            row, col = linear_sum_assignment(gragh)
            max_dispatch = [(i, j) for i, j, value in zip(row, col, gragh[row, col]) if value]
            return [Match(main_roles[r], attributes[c]) for r, c in max_dispatch]

        # Compiled once per call instead of once per document.
        key_word = re.compile(r'((?:电话|联系方式|联系人).{0,4}?)(\d{7,12})')
        phone = re.compile(r'1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
                           r'\+86.?1[3|4|5|6|7|8|9]\d{9}|'
                           r'0\d{2,3}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
                           r'0\d{2,3}[-—-―]\d{7,8}转\d{1,4}|'
                           r'0\d{2,3}[-—-―]?[1-9]\d{6,7}|'
                           r'[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|'
                           r'[1-9]\d{6,7}')

        for list_entity, list_sentence in zip(list_entitys, list_sentences):
            phone_entitys = self._collect_phone_entitys(list_sentence, phone, key_word)
            person_entitys = []
            for entity in list_entity:
                if entity.entity_type == "person":
                    entity.person_phone = ""
                    person_entitys.append(entity)
            _list_entity = sorted(phone_entitys + person_entitys,
                                  key=lambda x: (x.sentence_index, x.wordOffset_begin))
            words_num_dict = self._sentence_offsets(list_sentence)

            match_list = []
            for index, entity in enumerate(_list_entity):
                if not (entity.entity_type == "person" and entity.label in [1, 2, 3]):
                    continue
                match_nums = 0
                # Forward scan over the next (up to 4) entities.
                for after_index in range(index + 1, min(len(_list_entity), index + 5)):
                    after_entity = _list_entity[after_index]
                    if after_entity.entity_type == "phone":
                        sentence_distance = after_entity.sentence_index - entity.sentence_index
                        distance = (words_num_dict[after_entity.sentence_index] + after_entity.wordOffset_begin) - (
                                words_num_dict[entity.sentence_index] + entity.wordOffset_end)
                        if sentence_distance < 2 and distance < 50:
                            value = (-1 / 2 * (distance ** 2)) / 10000
                            match_list.append(Match(entity, after_entity, value))
                            match_nums += 1
                        else:
                            break
                    if after_entity.entity_type == "person":
                        if after_entity.label not in [1, 2, 3]:
                            break
                if match_nums:
                    continue
                # Backward scan, only when nothing was found after the person.
                # The original stop bound max(0, index-5) could never reach
                # list position 0; _lookback_indexes fixes that off-by-one.
                for previous_index in self._lookback_indexes(index):
                    previous_entity = _list_entity[previous_index]
                    if previous_entity.entity_type == "phone":
                        sentence_distance = entity.sentence_index - previous_entity.sentence_index
                        distance = (words_num_dict[entity.sentence_index] + entity.wordOffset_begin) - (
                                words_num_dict[previous_entity.sentence_index] + previous_entity.wordOffset_end)
                        if sentence_distance < 1 and distance < 30:
                            # Backward weights are deliberately not divided by 10000.
                            value = (-1 / 2 * (distance ** 2))
                            match_list.append(Match(entity, previous_entity, value))
                        else:
                            break

            for match in dispatch(match_list):
                entity = match.main_role
                # Update the entry held in list_entity.
                entity_index = list_entity.index(entity)
                list_entity[entity_index].person_phone = match.attribute.entity_text

    def predict(self, list_sentences, list_entitys):
        """Run the person classifier (phone matching is not triggered here)."""
        self.predict_person(list_sentences, list_entitys)
-
- #表格预测
class FormPredictor():
    """Access point for the form (table) ``item`` and ``context`` models."""

    def __init__(self,lazyLoad=getLazyLoad()):
        # NOTE(review): ``lazyLoad`` is accepted but not used in this class.
        self.model_file_line = os.path.dirname(__file__)+"/../form/model/model_form.model_line.hdf5"
        self.model_file_item = os.path.dirname(__file__)+"/../form/model/model_form.model_item.hdf5"
        self.model_form_item = Model_form_item()
        self.model_form_context = Model_form_context()
        # Only the file path of the "line" model is recorded; no loader for
        # it is visible in this class.
        self.model_dict = {"line":[None,self.model_file_line]}

    def getModel(self, type):
        """Return the model registered for ``type``.

        Args:
            type: ``"item"`` or ``"context"``.

        Raises:
            ValueError: for any other ``type``. The original fallback called
                ``getModel(type)`` again with the same argument and therefore
                recursed until ``RecursionError``.
        """
        if type == "item":
            return self.model_form_item
        if type == "context":
            return self.model_form_context
        raise ValueError("unsupported form model type: %r" % (type,))

    def encode(self, data, **kwargs):
        """Encode one form cell via ``encodeInput`` and return the single result.

        Extra keyword arguments are accepted and ignored.
        """
        return encodeInput([data], word_len=50, word_flag=True,userFool=False)[0]

    def predict(self, form_datas, type):
        """Run the model selected by ``type`` on ``form_datas``.

        Raises:
            ValueError: if ``type`` is neither ``"item"`` nor ``"context"``.
        """
        return self.getModel(type).predict(form_datas)
-
- #角色规则
- #依据正则给所有无角色的实体赋予角色,给予等于阈值的最低概率
- class RoleRulePredictor():
-
- def __init__(self):
- # (?P<tenderee_left_w1> 正则组名 后面的 w1 为概率权重关键词
- self.pattern_tenderee_left = "(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
- "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
- "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
- self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)" \
- "(人|公司|单位|组织|用户|业主|主体|方|部门))" \
- "(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
- self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托))"
- self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))])|^委托|^现委托|^的\w{2,10}正在进行)" #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
- self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
- self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
- self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)" # |^受托 会与 受托生产等冲突,代理表达一般会在后面有逗号
- # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
- self.pattern_winTenderer_left = "(?P<winTenderer_left>(乙|承做|施工|供货|承包|承建|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$|" \
- "(选定单位|指定的中介服务机构|实施主体|承制单位|供方)[::是为]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|" \
- "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|服务|实施)(机构|单位|商|方)(名称)?[::是为]+$)"
- self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)(名称)?([((]按综合排名排序[))])?[::是为]+$)" #取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系
- # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
- # self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
- self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
- "^(报价|价格)最低,确定为本项目成交供应商|^:贵公司参与|^:?你方于|^中标。|^成为[\w、()()]+项目的成交供应商))"
- self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|谈判结果:由.{5,20}供货)|中标通知书.{,15}你方" # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
- # self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
- self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名)[::]第?[二2]名?,?投标商名称[::]+$))"
- self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
-
- self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
- self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
- self.pattern_whole = [self.pattern_tenderee_left,
- self.pattern_tenderee_left_w1,
- self.pattern_tenderee_center,
- self.pattern_tenderee_right,
- self.pattern_tendereeORagency_right,
- self.pattern_agency_left,
- self.pattern_agency_right,
- self.pattern_winTenderer_left,
- self.pattern_winTenderer_left_w1,
- self.pattern_winTenderer_whole,
- self.pattern_winTenderer_right,
- self.pattern_secondTenderer_left,
- self.pattern_secondTenderer_right,
- self.pattern_thirdTenderer_left,
- self.pattern_thirdTenderer_right
- ] # 需按顺序排列, 第二、三中标要在中标正则后面
- self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
-
- self.pattern_money_tenderee = re.compile("投标最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|采购(单位|人)委托价|限价|拦标价|预算金额")
- self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况")
- self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
- self.pattern_money_other = re.compile("代理费|服务费")
- self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
-
def _check_input(self, text, ignore=False):
    """Normalise ``text`` to a list and optionally reject empty items.

    Args:
        text: a single value or a list of values; any falsy ``text`` yields
            an empty list.
        ignore: when true, falsy items inside the list are tolerated (they
            are kept in the returned list, not filtered out).

    Returns:
        ``text`` itself when it already is a list, otherwise ``[text]``.

    Raises:
        Exception: if the list contains a falsy item and ``ignore`` is false.
    """
    if not text:
        return []

    items = text if isinstance(text, list) else [text]

    if not ignore and any(not item for item in items):
        raise Exception("null text in input ")

    return items
- def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5):
- for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences,
- list_codenames):
- list_sentence.sort(key=lambda x: x.sentence_index) # 2022/1/5 按句子顺序排序
- # list_name = list_codename["name"]
- list_name = [] # 2022/1/5 改为实体列表内所有项目名称
- for entity in list_entity:
- if entity.entity_type == 'name':
- list_name.append(entity.entity_text)
- list_name = self._check_input(list_name) + [article.title]
- for p_entity in list_entity:
- if p_entity.entity_type in ["org", "company"]:
- # 只解析角色为无的或者概率低于阈值的
- if p_entity.label is None:
- continue
- # 将上下文包含标题的实体概率置为0.6,因为标题中的实体不一定是招标人
- if str(p_entity.label) == "0":
- find_flag = False
- for _sentence in list_sentence:
- if _sentence.sentence_index == p_entity.sentence_index:
- _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
- end_index=p_entity.end_index, size=20, center_include=True,
- word_flag=True, use_text=True,
- text=re.sub(")", ")", re.sub("(", "(", p_entity.entity_text)))
- for _name in list_name:
- if _name != "" and str(_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0:
- find_flag = True
- if p_entity.values[0] > on_value:
- p_entity.values[0] = 0.6 + (p_entity.values[0] - 0.6) / 10
- else:
- p_entity.values[0] = on_value # 2022/03/08 修正类似 223985179 公司在文章开头的项目名称概率又没达到0.5的情况
- if find_flag:
- continue
- # 正则从概率低于阈值或其他类别中召回角色
- role_prob = float(p_entity.values[int(p_entity.label)])
- if role_prob < on_value or str(p_entity.label) == "5":
- # 将标题中的实体置为招标人
- _list_name = self._check_input(list_name, ignore=True)
- find_flag = False
- for _name in _list_name: # 2022/1/5修正只要项目名称出现过的角色,所有位置都标注为招标人
- if str(_name).find(re.sub(")", ")", re.sub("(", "(",
- p_entity.entity_text))) >= 0 and p_entity.sentence_index < 4:
- for _sentence in list_sentence:
- if _sentence.sentence_index == p_entity.sentence_index:
- _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
- end_index=p_entity.end_index, size=20, center_include=True,
- word_flag=True, use_text=True, text=re.sub(")", ")",
- re.sub("(", "(",
- p_entity.entity_text)))
- if str(_span[1] + _span[2][:len(str(_name))]).find(
- _name) >= 0:
- find_flag = True
- _label = 0
- p_entity.label = _label
- p_entity.values[int(_label)] = on_value
- break
- if p_entity.sentence_index >= 4:
- break
- if find_flag:
- break
- # if str(_name).find(p_entity.entity_text)>=0:
- # find_flag = True
- # _label = 0
- # p_entity.label = _label
- # p_entity.values[int(_label)] = on_value
- # break
- # 若是实体在标题中,默认为招标人,不进行以下的规则匹配
- if find_flag:
- continue
- for s_index in range(len(list_sentence)):
- if p_entity.doc_id == list_sentence[s_index].doc_id and p_entity.sentence_index == \
- list_sentence[s_index].sentence_index:
- tokens = list_sentence[s_index].tokens
- begin_index = p_entity.begin_index
- end_index = p_entity.end_index
- size = 15
- spans = spanWindow(tokens, begin_index, end_index, size, center_include=True,
- word_flag=True, use_text=False)
- # _flag = False
- # 使用正则+距离解决冲突
- # 2021/6/11update center: spans[1] --> spans[0][-30:]+spans[1]
- list_spans = [spans[0][-30:], spans[0][-10:] + spans[1] + spans[2][:10], spans[2]] # 实体左、中、右 信息
- for _i_span in range(len(list_spans)):
- _flag = False
- _prob_weight = 1
- # print(list_spans[_i_span],p_entity.entity_text)
- for _pattern in self.pattern_whole:
- for _iter in re.finditer(_pattern, list_spans[_i_span]):
- for _group, _v_group in _iter.groupdict().items():
- if _v_group is not None and _v_group != "":
- _role = _group.split("_")[0]
- if _role == "tendereeORagency": # 2022/3/9 新增不确定招标代理判断逻辑
- # print('p_entity_sentenceindex:', p_entity.sentence_index)
- if p_entity.sentence_index>=1: # 只在第一句进行这种模糊匹配
- continue
- if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', p_entity.entity_text)\
- or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', p_entity.entity_text) == None:
- _role = 'tenderee'
- else:
- _role = "agency"
- _direct = _group.split("_")[1]
- _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
- # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
- # "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
- if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)', #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
- list_spans[0]) == None: # 2021/12/22 修正错误中标召回 例子208668937
- _flag = True
- _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
- "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
- _prob_weight = 1.2 if _weight=='w1' else 1
- # print('_v_group:',_group, _v_group, p_entity.entity_text)
- if _i_span == 1 and _direct == "center":
- _flag = True
- _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
- "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
- _prob_weight = 1.2 if _weight == 'w1' else 1
- # print('_v_group:', _group, _v_group, p_entity.entity_text)
- if _i_span == 2 and _direct == "right":
- _flag = True
- _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
- "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
- _prob_weight = 1.2 if _weight == 'w1' else 1
- # print('_v_group:', _group, _v_group, p_entity.entity_text)
- # 得到结果
- if _flag:
- p_entity.label = _label
- p_entity.values[int(_label)] = on_value*_prob_weight + p_entity.values[int(_label)] / 10
- # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group, _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
- break
- # 其他金额通过正则召回可能是招标或中投标的金额
- if p_entity.entity_type in ["money"]:
- if str(p_entity.label) == "2":
- for _sentence in list_sentence:
- if _sentence.sentence_index == p_entity.sentence_index:
- _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
- end_index=p_entity.end_index, size=20, center_include=True,
- word_flag=True, text=p_entity.entity_text)
- if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(
- self.pattern_money_other, _span[0]) is None:
- p_entity.values[0] = 0.8 + p_entity.values[0] / 10
- p_entity.label = 0
- if re.search(self.pattern_money_tenderer, _span[0]) is not None:
- if re.search(self.pattern_money_other, _span[0]) is not None:
- if re.search(self.pattern_money_tenderer, _span[0]).span()[1] > \
- re.search(self.pattern_money_other, _span[0]).span()[1]:
- p_entity.values[1] = 0.8 + p_entity.values[1] / 10
- p_entity.label = 1
- else:
- p_entity.values[1] = 0.8 + p_entity.values[1] / 10
- p_entity.label = 1
- if re.search(self.pattern_money_tenderer_whole,
- "".join(_span)) is not None and re.search(self.pattern_money_other,
- _span[0]) is None:
- p_entity.values[1] = 0.8 + p_entity.values[1] / 10
- p_entity.label = 1
- # 增加招标金额扩展,招标金额+连续的未识别金额,并且都可以匹配到标段信息,则将为识别的金额设置为招标金额
- list_p = []
- state = 0
- for p_entity in list_entity:
- for _sentence in list_sentence:
- if _sentence.sentence_index == p_entity.sentence_index:
- _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
- end_index=p_entity.end_index, size=20, center_include=True, word_flag=True,
- text=p_entity.entity_text)
- if state == 2:
- for _p in list_p[1:]:
- _p.values[0] = 0.8 + _p.values[0] / 10
- _p.label = 0
- state = 0
- list_p = []
- if state == 0:
- if p_entity.entity_type in ["money"]:
- if str(p_entity.label) == "0" and re.search(self.pattern_pack,
- _span[0] + "-" + _span[2]) is not None:
- state = 1
- list_p.append(p_entity)
- elif state == 1:
- if p_entity.entity_type in ["money"]:
- if str(p_entity.label) in ["0", "2"] and re.search(self.pattern_pack,
- _span[0] + "-" + _span[
- 2]) is not None and re.search(
- self.pattern_money_other,
- _span[0] + "-" + _span[2]) is None and p_entity.sentence_index == list_p[
- 0].sentence_index:
- list_p.append(p_entity)
- else:
- state = 2
- if len(list_p) > 1:
- for _p in list_p[1:]:
- # print("==",_p.entity_text,_p.sentence_index,_p.label)
- _p.values[0] = 0.8 + _p.values[0] / 10
- _p.label = 0
- state = 0
- list_p = []
- for p_entity in list_entity:
- # 将属于集合中的不可能是中标人的标签置为无
- if p_entity.entity_text in self.SET_NOT_TENDERER:
- p_entity.label = 5
- '''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
class RoleRuleFinalAdd():
    """Last-resort regex recall of a tenderee or agency among unlabelled organisations.

    Four textual cues are searched for an organisation name: a publisher line,
    a contact / delivery-address line, a bank-account-name line (all in the
    first 5000 characters of the article) and a "name + date" signature in the
    closing tokens of the main text.  When the document has no ``org``/``company``
    entity labelled 0 (tenderee) or 1 (agency) yet, an entity labelled 5 whose
    text matches the recalled name is promoted to the missing role with a
    probability of 0.5.  If no cue matches at all, an unlabelled entity whose
    text occurs in the announcement title is promoted to tenderee instead.
    """

    # Name fragments that make a recalled name count as a tenderee.
    _TENDEREE_WORDS = '医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站'
    # Name fragments that make a recalled name count as an agency.
    _AGENCY_WORDS = '(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)'

    @staticmethod
    def _promote(ents, ent_re, label, only_last_three):
        """Relabel the last entity in ``ents`` that matches ``ent_re``.

        Entities are scanned from the end.  An entity matches when its text
        equals ``ent_re`` or is a substring covering more than 60% of it.
        With ``only_last_three`` the scan stops once more than three
        main-text (non-attachment) entities have been visited.

        :return: True when an entity was relabelled.
        """
        main_seen = 0
        for ent in reversed(ents):
            if not ent.in_attachment:
                main_seen += 1
            if only_last_three and main_seen > 3:
                return False
            text = ent.entity_text
            if text == ent_re or (text in ent_re and len(text) / len(ent_re) > 0.6):
                ent.label = label
                ent.values[label] = 0.5
                return True
        return False

    def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
        """Promote one unlabelled entity to tenderee/agency when the document lacks that role.

        Only index 0 of every argument is used.

        :param list_articles: ``list_articles[0].content`` is the article text
        :param list_sentences: ``list_sentences[0]`` holds sentence objects exposing
            ``in_attachment`` and ``tokens``
        :param list_entitys: ``list_entitys[0]`` holds the entities; matching ones are
            modified in place (``label`` and ``values``)
        :param list_codenames: ``list_codenames[0]['name']`` is the title used by the fallback
        :return: None
        """
        # Closing text of the main body: last 30 tokens of its last 5 sentences.
        main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
        end_tokens = []
        for sentence in main_sentences[-5:]:
            end_tokens.extend(sentence.tokens)
        text_end = "".join(end_tokens[-30:])
        head = list_articles[0].content[:5000]

        # NOTE: backslashes are doubled so the literals no longer contain the invalid
        # escapes "\s" / "\-"; the compiled patterns are unchanged.
        by_signature = re.search('[,。;]([\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\\s*[0-9零一二三四五六七八九十〇]{2,4}[年\\-/][0-9零一二三四五六七八九十]{1,2}[月\\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
        by_account = re.search('(户名|开户名称)[::]([\u4e00-\u9fa5()()]{5,20})[,。]', head)
        by_address = re.search('(报名咨询|[收送交]货地点)[,:]([\u4e00-\u9fa5()()]{5,20})[0-9\\-]*[,。]', head)
        by_publisher = re.search('(发布(?:人|单位|机构))[::]([\u4e00-\u9fa5()()]{5,20})[,。]', head)

        tenderee_notfound = True
        agency_notfound = True
        ents = []  # org/company entities that still carry label 5
        for ent in list_entitys[0]:
            if ent.entity_type in ['org', 'company']:
                if ent.label == 0:
                    tenderee_notfound = False
                elif ent.label == 1:
                    agency_notfound = False
                elif ent.label == 5:
                    ents.append(ent)

        # Cues are tried in this priority order.
        cues = [m for m in (by_publisher, by_address, by_account, by_signature) if m]
        if cues:
            for cue in cues:
                is_signature = cue is by_signature
                # The signature pattern keeps the name in group 1, the others in group 2.
                ent_re = cue.group(1) if is_signature else cue.group(2)
                # The patterns only admit ASCII parentheses, so only commas need removing.
                ent_re = ent_re.replace(',', '')
                looks_like_tenderee = (re.search(self._TENDEREE_WORDS, ent_re)
                                       or re.search(self._AGENCY_WORDS, ent_re) is None)
                if tenderee_notfound and looks_like_tenderee:
                    # For the signature cue only the last three main-text entities are considered.
                    if self._promote(ents, ent_re, 0, is_signature):
                        tenderee_notfound = False
                elif agency_notfound and re.search(self._AGENCY_WORDS, ent_re):
                    if self._promote(ents, ent_re, 1, is_signature):
                        agency_notfound = False
                if not tenderee_notfound:
                    break
        elif list_codenames[0]['name'] != "":
            # No cue matched: treat an unlabelled entity contained in the title as the tenderee.
            if tenderee_notfound:
                title = list_codenames[0]['name']
                for ent in ents:
                    if ent.entity_text in title:
                        ent.label = 0
                        ent.values[0] = 0.5
                        break
- # 招标人角色召回规则
class TendereeRuleRecall():
    """Rule-based recall of the tenderee when no org/company entity is labelled 0.

    ``predict`` applies up to four rules in order and stops at the first one
    that finds a tenderee: entity context patterns, the announcement-subject
    rule, and two regex families that create new entities from text that was
    not recognised as an entity.
    """

    def __init__(self):
        # Text immediately to the left of an entity that marks it as the tenderee.
        self.tenderee_left = re.compile("(发布(人|单位|机构)|需求方(信息[,:])?(单位|公司)?名称|购买主体|收货单位|项目申请单位|发起组织|联系单位|"
                                        "询价(机构|企业)|联系(人|方式),?(单位|公司)(名称)?|联系(人|方式),名称)[::][^。;,]{,5}$")
        # Text immediately to the right of an entity that marks it as the tenderee.
        self.tenderee_right = re.compile("^[^。;::]{,5}[((](以?下简?称)?,?[,\"“]*[我本][\u4e00-\u9fa5]{1,2}[,\"”]*[))]|"
                                         "^[^。;::]{,10}[对就][^。;,]+,?[^。;,]{,20}进行[^。;,]*(采购|询比?价|遴选|招投?标|征集)|"
                                         "^[^。;::]{,10}关于[^。;,]+,?[^。;,]{,20}的[^。;,]{,20}公告|"
                                         "^[^。;,::]{,10}的[^。;,]+,?[^。;,]{,20}正在[^。;,]{,5}进行|"
                                         "^[^。;,::]{,10}的[^。;,]+,?[^。,;]{,20}已?[^。;,]{,20}批准|"
                                         "^[^。;,::]{,15}(选定|选取|征集|遴选)[^。;,]{,20}(供应商|(代理|咨询|设计)[^。;,]{,5}机构|代理人)")
        self.tenderee_right2 = re.compile("^[^。;,::]{,10}(招标办|采购部|办事处|采购小?组)")
        self.tenderee_right3 = re.compile("^[^。;,::]{,10}(对|就|关于|的)(?P<project>[^。;,?!::]+)")
        # Announcement-subject pattern ("our/this" + institution character).
        self.subject = re.compile("[我本][院校局]")
        # Patterns that recall text not recognised as an entity.
        self.unrecognized1 = re.compile("(?P<tenderee_left>((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)"
                                        "(人|商|公司|单位|组织|用户|业主|主体|方|部门))"
                                        "(信息[,:]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
        self.unrecognized2 = re.compile("(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)"
                                        "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"
                                        "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
        # Tail patterns used to cut the recalled text down to an organisation name.
        self.unrecognized_end1 = re.compile(".{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓|中心)")
        self.unrecognized_end2 = re.compile(".{4,}(?:署|局|厅|处|室|科|部|站|所|股|行)")

    @staticmethod
    def _to_halfwidth_parens(text):
        """Return ``text`` with full-width parentheses replaced by ASCII ones.

        The previous code called ``re.sub("(", "(", ...)`` / ``re.sub(")", ")", ...)``,
        which are invalid regular expressions and raise ``re.error``.
        NOTE(review): the replaced characters are presumed to have been the
        full-width forms originally - confirm against upstream history.
        """
        return text.replace("\uff08", "(").replace("\uff09", ")")

    def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
        """Run the recall rules in order until one of them yields a tenderee.

        Only index 0 of the list arguments is used; ``list_codenames`` is not
        used.  Entities in ``list_entitys[0]`` may be relabelled in place and
        new entities may be appended to it.

        :return: None; ``self.get_tenderee`` tells whether a tenderee exists afterwards
        """
        self.get_tenderee = False
        ents = []       # unlabelled (label 5) org/company entities
        list_name = []  # texts of entities whose type is 'name'
        for ent in list_entitys[0]:
            if ent.entity_type == 'name':
                list_name.append(ent.entity_text)
            if ent.entity_type in ['org', 'company']:
                if ent.label == 0:
                    self.get_tenderee = True
                elif ent.label == 5:
                    ents.append(ent)
        if not self.get_tenderee:
            self.entity_context_rule(ents,list_name,list_sentences)
        if not self.get_tenderee:
            self.subject_rule(ents,list_articles,list_sentences)
        if not self.get_tenderee:
            self.unrecognized_entity_rule(self.unrecognized1,list_sentences,list_entitys,0.55)
        if not self.get_tenderee:
            self.unrecognized_entity_rule(self.unrecognized2,list_sentences,list_entitys,0.5)

    def _mark_tenderee(self, ent, keep_prior=True):
        """Label ``ent`` as tenderee; with ``keep_prior`` a tenth of its old score is retained."""
        ent.label = 0
        ent.values[0] = 0.5 + ent.values[0] / 10 if keep_prior else 0.5
        self.get_tenderee = True

    def entity_context_rule(self,entitys,list_name,list_sentences):
        """Label entities as tenderee when their surrounding text matches a context pattern.

        Every entity in ``entitys`` is examined; the scan does not stop at the
        first hit.  ``_span[0]`` / ``_span[2]`` returned by ``spanWindow`` are
        matched as the text before / after the entity (presumably the left and
        right context - verify against ``spanWindow``).
        """
        for ent in entitys:
            _sentence = list_sentences[0][ent.sentence_index]
            _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index,
                               end_index=ent.end_index, size=40, center_include=True,
                               word_flag=True, use_text=True,
                               text=self._to_halfwidth_parens(ent.entity_text))
            if (re.search(self.tenderee_left, _span[0])
                    or re.search(self.tenderee_right, _span[2])
                    or re.search(self.tenderee_right2, _span[2])):
                self._mark_tenderee(ent)
            elif list_name:
                # The entity is followed by "对/就/关于/的 <project>" and the project text
                # contains one of the recognised 'name' entity texts.
                pj_name = re.search(self.tenderee_right3, _span[2])
                if pj_name:
                    pj_name = pj_name.groupdict()["project"]
                    if any(_name in pj_name for _name in list_name):
                        self._mark_tenderee(ent, keep_prior=False)

    def subject_rule(self, entitys,list_articles,list_sentences):
        """Label entities whose type of institution agrees with the announcement subject.

        The subject is the first match of ``self.subject`` in the article text
        before the ``##attachment##`` marker.
        """
        content = list_articles[0].content.split('##attachment##')[0]
        subject_match = re.search(self.subject, content)
        if not subject_match:
            return
        _subject = subject_match.group()
        for ent in entitys:
            if re.search("院",_subject) and re.search("医院|学院",ent.entity_text):
                self._mark_tenderee(ent)
            elif re.search("校",_subject) and re.search("学校|学院|大学|高中|初中|中学|小学",ent.entity_text):
                self._mark_tenderee(ent)
            elif re.search("局", _subject) and re.search("局", ent.entity_text):
                _sentence = list_sentences[0][ent.sentence_index]
                _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index,
                                   end_index=ent.end_index, size=20, center_include=True,
                                   word_flag=True, use_text=True,
                                   text=self._to_halfwidth_parens(ent.entity_text))
                # Skip bureaus introduced by "监督"/"投诉" in the ten characters before them.
                if not re.search("监督|投诉",_span[0][-10:]):
                    self._mark_tenderee(ent)

    def unrecognized_entity_rule(self,pattern,list_sentences,list_entitys,on_value=0.5):
        """Create tenderee entities from text matched by ``pattern`` that is not an entity yet.

        Main-text sentences are processed first; attachment sentences are only
        processed when the main text produced nothing.  Each accepted match is
        appended to ``list_entitys[0]`` as a 'company' entity with label 0 and
        ``values == [on_value, 0, 0, 0, 0, 0]``; afterwards the entity list is
        re-sorted by ``(sentence_index, begin_index)``.

        :param pattern: compiled regex exposing the groups ``tenderee_left`` and ``unrecognized``
        :param on_value: score stored for the tenderee class of a new entity
        """
        list_sentence = list_sentences[0]
        for attachment_pass in (False, True):
            for sentence in [s for s in list_sentence if s.in_attachment == attachment_pass]:
                sentence_text = sentence.sentence_text
                tokens = sentence.tokens
                doc_id = sentence.doc_id
                in_attachment = sentence.in_attachment
                # Character offset at which every token starts, plus a sentinel past the end.
                list_tokenbegin = []
                begin = 0
                for token in tokens:
                    list_tokenbegin.append(begin)
                    begin += len(str(token))
                list_tokenbegin.append(begin + 1)
                for _match in re.finditer(pattern,sentence_text):
                    _groupdict = _match.groupdict()
                    _unrecognized_text = _groupdict["unrecognized"]
                    # Keep only the part that ends like an organisation name.
                    _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
                    if not _unrecognized:
                        _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
                    if not _unrecognized:
                        continue
                    _unrecognized = _unrecognized.group()
                    if re.search("某",_unrecognized):
                        continue
                    begin_index_temp = _match.start()+len(_groupdict['tenderee_left'])
                    end_index_temp = begin_index_temp + len(_unrecognized)
                    # Map the character span onto token indices.  Both indices start as None so
                    # that a span the token offsets cannot locate is skipped instead of raising
                    # NameError or silently reusing the indices of an earlier match.
                    begin_index = None
                    for j, token_begin in enumerate(list_tokenbegin):
                        if token_begin == begin_index_temp:
                            begin_index = j
                            break
                        elif token_begin > begin_index_temp:
                            begin_index = j - 1
                            break
                    if begin_index is None:
                        continue
                    end_index = None
                    for j in range(begin_index, len(list_tokenbegin)):
                        if list_tokenbegin[j] >= end_index_temp:
                            end_index = j - 1
                            break
                    if end_index is None:
                        continue
                    entity_id = "%s_%d_%d_%d" % (doc_id, sentence.sentence_index, begin_index, end_index)
                    entity_text = _unrecognized
                    new_entity = Entity(doc_id, entity_id, entity_text, 'company', sentence.sentence_index, begin_index, end_index,
                                        begin_index_temp, end_index_temp, in_attachment=in_attachment)
                    new_entity.label = 0
                    new_entity.values = [on_value,0,0,0,0,0]
                    list_entitys[0].append(new_entity)
                    self.get_tenderee = True
            if self.get_tenderee:
                list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index))
                break
- # 时间类别
class TimePredictor():
    """Classifies ``time`` entities with a TensorFlow saved model.

    The model takes two inputs (embedded left and right context windows of a
    time entity) and its argmax over the output is stored on the entity as the
    label.
    """

    def __init__(self):
        self.input_shape = (2,40,128)
        self.inputs_code = None
        self.outputs_code = None
        self.sess = tf.Session(graph=tf.Graph())
        self.load_model()

    def load_model(self):
        """Load the saved model once and return ``(inputs_code, outputs_code)``.

        Later calls return the tensors cached on the instance.
        """
        model_path = os.path.dirname(__file__)+'/timesplit_model'
        if self.inputs_code is not None:
            return self.inputs_code, self.outputs_code
        log("get model of time")
        with self.sess.as_default(), self.sess.graph.as_default():
            meta_graph_def = tf.saved_model.loader.load(self.sess, tags=["serve"], export_dir=model_path)
            signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
            signature = meta_graph_def.signature_def[signature_key]
            tensor_of = self.sess.graph.get_tensor_by_name
            self.inputs_code = []
            for input_key in ("input0", "input1"):
                self.inputs_code.append(tensor_of(signature.inputs[input_key].name))
            self.outputs_code = tensor_of(signature.outputs["outputs"].name)
            return self.inputs_code, self.outputs_code

    def search_time_data(self,list_sentences,list_entitys):
        """Build the model input for every ``time`` entity.

        Each sentence list is sorted in place by ``sentence_index``.  The
        sentence cursor only moves forward within a document, so an entity is
        matched against the current or a later sentence.

        :return: ``[data_x, entities]`` where ``data_x`` is the stacked embeddings
            transposed with axes ``(1, 0, 2, 3)``, or None when no time entity was matched
        """
        data_x = []
        points_entitys = []
        for list_sentence, list_entity in zip(list_sentences, list_entitys):
            list_sentence.sort(key=lambda x: x.sentence_index)
            cursor = 0
            for entity in list_entity:
                if entity.entity_type not in ['time']:
                    continue
                while cursor < len(list_sentence):
                    sentence = list_sentence[cursor]
                    same_place = (entity.doc_id == sentence.doc_id
                                  and entity.sentence_index == sentence.sentence_index)
                    if same_place:
                        window = spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=self.input_shape[1])
                        context = [window[0], window[1]]
                        data_x.append(self.embedding_words(context, shape=self.input_shape))
                        points_entitys.append(entity)
                        break
                    cursor += 1
        if len(points_entitys)==0:
            return None
        data_x = np.transpose(np.array(data_x), (1, 0, 2, 3))
        return [data_x, points_entitys]

    def embedding_words(self, datas, shape):
        '''
        @summary: look up the word vector of every word
        @param:
            datas: list of word lists
            shape: shape of the result
        @return: array of the given shape holding the embeddings; words missing
            from the vocabulary use the 'unk' vector, rows beyond shape[1] words
            are truncated and unused positions stay zero
        '''
        model_w2v = getModel_w2v()
        embed = np.zeros(shape)
        length = shape[1]
        for out_index, data in enumerate(datas):
            for index, item in enumerate(data):
                if index >= length:
                    break
                item_not_space = re.sub("\\s*", "", item)
                key = item_not_space if item_not_space in model_w2v.vocab else 'unk'
                embed[out_index][index] = model_w2v[key]
        return embed

    def predict(self, list_sentences,list_entitys):
        """Predict and store the role of every time entity; returns None.

        A non-zero predicted label is reset to 0 (with ``values[0] = 0.5``)
        when ``timeFormat`` rejects the entity text.
        """
        datas = self.search_time_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        with self.sess.as_default():
            feed = {self.inputs_code[0]:datas[0][0], self.inputs_code[1]:datas[0][1]}
            predict_y = limitRun(self.sess,[self.outputs_code], feed_dict=feed)[0]
            for entity, scores in zip(points_entitys[:len(predict_y)], predict_y):
                label = np.argmax(scores)
                values = [item for item in scores]
                if label != 0 and not timeFormat(entity.entity_text):
                    label = 0
                    values[0] = 0.5
                entity.set_Role(label, values)
- # 产品字段提取
class ProductPredictor():
    """Character-level sequence tagger that extracts product names and failure reasons.

    A frozen TensorFlow graph produces per-character logits which are decoded
    with ``viterbi_decode``; tag runs ``1 2* 3`` are emitted as product
    entities and tag runs ``4 5* 6`` as failure-reason snippets.
    """

    def __init__(self):
        vocabpath = os.path.dirname(__file__) + "/codename_vocab.pk"
        self.vocab = load(vocabpath)
        self.word2index = dict((w, i) for i, w in enumerate(np.array(self.vocab)))
        self.sess = tf.Session(graph=tf.Graph())
        self.load_model()

    def load_model(self):
        """Import the frozen graph into the session and cache the tensors used for inference."""
        model_path = os.path.dirname(__file__)+'/product_savedmodel/productAndfailreason.pb'
        with self.sess.as_default():
            with self.sess.graph.as_default():
                output_graph_def = tf.GraphDef()
                with open(model_path, 'rb') as f:
                    output_graph_def.ParseFromString(f.read())
                    tf.import_graph_def(output_graph_def, name='')
                    self.sess.run(tf.global_variables_initializer())
                    self.char_input = self.sess.graph.get_tensor_by_name('CharInputs:0')
                    self.length = self.sess.graph.get_tensor_by_name("Sum:0")
                    self.dropout = self.sess.graph.get_tensor_by_name("Dropout:0")
                    self.logit = self.sess.graph.get_tensor_by_name("logits/Reshape:0")
                    self.tran = self.sess.graph.get_tensor_by_name("crf_loss/transitions:0")

    def decode(self,logits, lengths, matrix):
        """Viterbi-decode every sequence of the batch.

        Each score matrix is cut to its length, extended by one padding column
        and prefixed with a start row before decoding; the start step is
        dropped from the returned path.

        :param logits: per-sequence score matrices (7 tag columns are assumed by the start row)
        :param lengths: number of valid steps of each sequence
        :param matrix: transition matrix handed to ``viterbi_decode``
        :return: list of tag paths
        """
        paths = []
        small = -1000.0
        start = np.asarray([[small]*7+[0]])
        for score, length in zip(logits, lengths):
            score = score[:length]
            pad = small * np.ones([length, 1])
            # A separate name is used so the ``logits`` argument is not rebound mid-loop.
            padded = np.concatenate([score, pad], axis=1)
            padded = np.concatenate([start, padded], axis=0)
            path, _ = viterbi_decode(padded, matrix)
            paths.append(path[1:])
        return paths

    def predict(self, list_sentences,list_entitys=None,list_articles=[], fail=False, MAX_AREA=5000):
        '''
        Tag product entities; at most MAX_AREA characters of a sentence are used.

        With ``fail`` set and a non-empty ``list_articles`` the first MAX_AREA
        characters of the first article are tagged as a whole: product entities
        are appended to ``list_entitys[0]`` (when given) and the failure reasons
        are returned.  Otherwise the sentences of every document are tagged in
        batches and the product entities are appended to the matching entity list.

        :param list_sentences: sentence lists of several documents, [[sentences of one document], ...];
            every inner list is sorted in place by text length (longest first)
        :param list_entitys: entity lists of the documents, extended in place
        :param list_articles: articles; only the first one is read, and only when ``fail`` is set
        :param fail: extract failure reasons from the article text instead of tagging sentences
        :param MAX_AREA: maximum number of characters taken from a text / fed in one batch
        :return: ``{'fail_reason': str}``; the string is empty outside of the ``fail`` mode
        '''
        with self.sess.as_default() as sess:
            with self.sess.graph.as_default():
                if fail and list_articles!=[]:
                    result = []
                    text_list = [list_articles[0].content[:MAX_AREA]]
                    chars = [[self.word2index.get(it, self.word2index.get('<unk>')) for it in text] for text in text_list]
                    lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
                                                      feed_dict={
                                                          self.char_input: np.asarray(chars),
                                                          self.dropout: 1.0
                                                      })
                    batch_paths = self.decode(scores, lengths, tran_)
                    for text, path, length in zip(text_list, batch_paths, lengths):
                        tags = ''.join([str(it) for it in path[:length]])
                        for it in re.finditer("12*3", tags):
                            start = it.start()
                            end = it.end()
                            _entity = Entity(doc_id=list_articles[0].doc_id, entity_id="%s_%s_%s_%s" % (
                                list_articles[0].doc_id, 0, start, end),
                                             entity_text=text[start:end],
                                             entity_type="product", sentence_index=0,
                                             begin_index=0, end_index=0, wordOffset_begin=start,
                                             wordOffset_end=end)
                            # list_entitys defaults to None; only record the entity when a
                            # target list was actually supplied instead of crashing.
                            if list_entitys:
                                list_entitys[0].append(_entity)
                        for it in re.finditer("45*6", tags):
                            start = it.start()
                            end = it.end()
                            result.append(text[start:end].replace('?', '').strip())
                    reasons = []
                    for it in result:
                        # A ticked reason overrides everything collected so far.
                        # NOTE(review): the original tested "(√)" twice; the second test is
                        # presumed to have been the full-width form - confirm.
                        if "(√)" in it or "\uff08√\uff09" in it:
                            reasons = [it]
                            break
                        if reasons != [] and (it not in reasons[-1] and it not in reasons):
                            reasons.append(it)
                        elif reasons == []:
                            reasons.append(it)
                    return {'fail_reason':';'.join(reasons)}
                if list_entitys is None:
                    list_entitys = [[] for _ in range(len(list_sentences))]
                for list_sentence, list_entity in zip(list_sentences,list_entitys):
                    if len(list_sentence)==0:
                        continue
                    # Longest first, so every batch can be padded to its first sentence.
                    list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
                    _begin_index = 0
                    while True:
                        MAX_LEN = len(list_sentence[_begin_index].sentence_text)
                        if MAX_LEN > MAX_AREA:
                            MAX_LEN = MAX_AREA
                        if MAX_LEN == 0:
                            # Everything that is left is empty text (the list is sorted by
                            # length); stop here instead of dividing by zero below.
                            break
                        # Number of sentences per batch so that a batch holds about MAX_AREA characters.
                        _LEN = MAX_AREA//MAX_LEN
                        batch = list_sentence[_begin_index:_begin_index+_LEN]
                        chars = [sentence.sentence_text[:MAX_LEN] for sentence in batch]
                        chars = [[self.word2index.get(it, self.word2index.get('<unk>')) for it in l] for l in chars]
                        chars = pad_sequences(chars, maxlen=MAX_LEN, padding="post", truncating="post")
                        lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
                                                          feed_dict={
                                                              self.char_input: np.asarray(chars),
                                                              self.dropout: 1.0
                                                          })
                        batch_paths = self.decode(scores, lengths, tran_)
                        for sentence, path, length in zip(batch, batch_paths, lengths):
                            tags = ''.join([str(it) for it in path[:length]])
                            for it in re.finditer("12*3", tags):
                                start = it.start()
                                end = it.end()
                                _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
                                    sentence.doc_id, sentence.sentence_index, start, end),
                                                 entity_text=sentence.sentence_text[start:end],
                                                 entity_type="product", sentence_index=sentence.sentence_index,
                                                 begin_index=0, end_index=0, wordOffset_begin=start,
                                                 wordOffset_end=end,in_attachment=sentence.in_attachment)
                                list_entity.append(_entity)
                        if _begin_index+_LEN >= len(list_sentence):
                            break
                        _begin_index += _LEN
                return {'fail_reason': ""}
- # 产品数量单价品牌规格提取 #2021/11/10 添加表格中的项目、需求、预算、时间要素提取
- class ProductAttributesPredictor():
def __init__(self,):
    """Set the two header regexes and load the pickled header set stored next to this module."""
    module_dir = os.path.dirname(__file__)
    # p1: preferred header pattern, "<item word>" followed by 名称/内容/描述.
    self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\\))]?(名称|内容|描述)'
    # p2: fallback header pattern, a bare item word.
    self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称'
    header_file = module_dir + '/header_set.pkl'
    with open(header_file, 'rb') as handle:
        self.header_set = pickle.load(handle)
def isTrueTable(self, table):
    '''Decide whether a <table> element is a real data table.

    Rules, checked in this order:
    1. it contains a <caption> or <th> tag -> real
    2. it contains more than five <form>/<a>/<img> tags -> not real
    3. it has fewer than two <tr> tags -> not real
    4. it contains a nested <table> -> not real (the inner table is usually the real one)
    Anything else is treated as real.'''
    if table.find_all(['caption', 'th']) != []:
        return True
    too_many_widgets = len(table.find_all(['form', 'a', 'img'])) > 5
    if too_many_widgets:
        return False
    too_few_rows = len(table.find_all(['tr'])) < 2
    if too_few_rows:
        return False
    has_nested_table = len(table.find_all(['table'])) >= 1
    return not has_nested_table
def getTrs(self, tbody):
    """Collect the rows of a table element.

    Returns the direct <tr> children of ``tbody`` together with the direct
    <tr> children of its direct <tbody> children, in document order.
    """
    rows = []
    for child in tbody.find_all(recursive=False):
        tag_name = child.name
        if tag_name == "tr":
            rows.append(child)
        elif tag_name == "tbody":
            rows.extend(child.find_all("tr", recursive=False))
    return rows
def getTable(self, tbody):
    """Turn a table element into a list of rows of whitespace-free cell texts.

    A table with fewer than two rows yields an empty list, and rows with
    fewer than two direct <td>/<th> cells are left out.
    """
    rows = self.getTrs(tbody)
    if len(rows) < 2:
        return []
    inner_table = []
    for row in rows:
        cells = row.findChildren(['td', 'th'], recursive=False)
        if len(cells) >= 2:
            inner_table.append([re.sub('\\s', '', cell.get_text()) for cell in cells])
    return inner_table
def fixSpan(self, tbody):
    """Expand ``colspan`` / ``rowspan`` cells of a table in place.

    Columns are completed for every row first and rows afterwards (doing it
    the other way round can scramble the table).  A spanning cell gets its
    span attribute set to 1 and copies of it are inserted into the positions
    it covered.  Rows that contain a nested table are left untouched.

    :param tbody: table element whose rows are obtained through ``self.getTrs``
    :return: None; the element tree is modified in place
    """
    def span_count(cell, attr):
        """Return the digits of ``cell[attr]`` as an int, or None when absent or digit-free."""
        if attr not in cell.attrs:
            return None
        digits = re.sub("[^0-9]", "", str(cell[attr]))
        return int(digits) if digits != "" else None

    trs = self.getTrs(tbody)

    # Pass 1: complete columns - duplicate a colspan cell right after itself.
    for tr in trs:
        if len(tr.findChildren('table')) > 0:
            continue
        tds = tr.findChildren(recursive=False)
        if len(tds) < 3:
            continue  # too few columns to be worth completing
        for td in tds:
            col = span_count(td, 'colspan')
            if col is None:
                continue
            if col < 10 and len(td.get_text()) < 500:
                td['colspan'] = 1
                for _ in range(1, col):
                    td.insert_after(copy.copy(td))

    # Pass 2: complete rows - copy a rowspan cell into the same position of the rows below.
    for indtr, tr in enumerate(trs):
        if len(tr.findChildren('table')) > 0:
            continue
        tds = tr.findChildren(recursive=False)
        if len(tds) > 1 and 'rowspan' in tds[0].attrs:
            # A row whose cells all carry the same rowspan is skipped.
            span0 = tds[0].attrs['rowspan']
            same_span = sum(1 for td in tds
                            if 'rowspan' in td.attrs and td.attrs['rowspan'] == span0)
            if same_span == len(tds):
                continue
        rows_from_here = len(trs) - indtr
        for indtd, td in enumerate(tds):
            row = span_count(td, 'rowspan')
            if row is None:
                continue
            td['rowspan'] = 1
            # Only rows that exist can receive a copy, so the loop is bounded by the
            # table size; a huge rowspan value in the HTML no longer spins uselessly.
            for i in range(1, min(row, rows_from_here)):
                tds1 = trs[indtr + i].findChildren(['td', 'th'], recursive=False)
                if len(tds1) >= indtd and len(tds1) > 0:
                    if indtd > 0:
                        tds1[indtd - 1].insert_after(copy.copy(td))
                    else:
                        tds1[0].insert_before(copy.copy(td))
                elif len(tds1) > 0 and len(tds1) == indtd - 1:
                    tds1[indtd - 2].insert_after(copy.copy(td))
def get_monthlen(self, year, month):
    '''Return the number of days of the given month as a string.

    ``year`` and ``month`` may be ints or numeric strings.  When they cannot
    be converted or do not denote a valid month, '30' is returned as a
    best-effort fallback.'''
    try:
        _weekday, num = calendar.monthrange(int(year), int(month))
    except (TypeError, ValueError):
        # int() failures and calendar.IllegalMonthError (a ValueError) land here;
        # unlike a bare ``except`` this leaves KeyboardInterrupt/SystemExit alone.
        num = 30
    return str(num)
def fix_time(self, text, html, page_time):
    '''Normalise a date cell into an ``(order_begin, order_end)`` pair of ``YYYY-MM-DD`` strings.

    Supported inputs (after Chinese numerals are converted to digits):
    a bare month ("5月", the year comes from the first "NNNN年" in ``html``,
    else from the first four digits of ``page_time``, else from the current
    year), year-month, year-month-day, "202105", "20210510", and ranges such
    as "2021年5月至2021年7月".  Month-level inputs span the whole month.

    :param text: content of the date cell
    :param html: announcement HTML, searched for a year
    :param page_time: publication time string, second source for the year
    :return: ``(order_begin, order_end)``; ``("", "")`` when the text is not understood
    '''
    digit_of = {'一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9}

    def compound_number(match):
        # "十" alone is 10, "十二" is 12, "二十" is 20, "三十一" is 31.
        tens = digit_of.get(match.group(1), 1)
        ones = digit_of.get(match.group(2), 0)
        return str(tens * 10 + ones)

    def pad2(value):
        return '0' + value if len(value) < 2 else value

    def month_bounds(year, month):
        last_day = pad2(self.get_monthlen(year, month))
        month = pad2(month)
        return "%s-%s-01" % (year, month), "%s-%s-%s" % (year, month, last_day)

    # Compound numerals first: replacing character by character turned "二十" into "210".
    text = re.sub('([一二三四五六七八九])?十([一二三四五六七八九])?', compound_number, text)
    for cn_digit, value in digit_of.items():
        text = text.replace(cn_digit, str(value))

    # Bare month, e.g. "5月": the year has to come from somewhere else.
    month_only = re.search(r'^(\d{1,2})月$', text)
    if month_only:
        year_hit = re.search(r'(\d{4})年(.{,12}采购意向)?', html)
        if year_hit:
            year = year_hit.group(1)
        else:
            year_hit = re.search(r'\d{4}', page_time) if page_time != "" else None
            year = year_hit.group(0) if year_hit else str(datetime.datetime.now().year)
        return month_bounds(year, month_only.group(1))

    # Year and month, e.g. "2021年5月" or "2021-05".
    t1 = re.search(r'^(\d{4})(年|/|\.|-)(\d{1,2})月?$', text)
    if t1:
        return month_bounds(t1.group(1), t1.group(3))

    # Full date, e.g. "2021年5月10日" or "2021/5/10".
    t2 = re.search(r'^(\d{4})(年|/|\.|-)(\d{1,2})(月|/|\.|-)(\d{1,2})日?$', text)
    if t2:
        day = "%s-%s-%s" % (t2.group(1), pad2(t2.group(3)), pad2(t2.group(5)))
        return day, day

    # Compact year-month, e.g. "202105".
    t3 = re.search(r"^(20\d{2})(\d{1,2})$", text)
    if t3 and 0 < int(t3.group(2)) <= 12:
        return month_bounds(t3.group(1), t3.group(2))

    # Compact full date, e.g. "20210510".
    t4 = re.search(r"^(20\d{2})(\d{2})(\d{2})$", text)
    if t4 and 0 < int(t4.group(2)) <= 12 and 0 < int(t4.group(3)) <= 31:
        day = "%s-%s-%s" % (t4.group(1), t4.group(2), t4.group(3))
        return day, day

    # Range, e.g. "2021年5月至2021年7月"; the end year defaults to the start year,
    # the start day to 1 and the end day to the last day of the end month.
    span = re.search(r'^(?P<y1>\d{4})(年|/|\.)(?P<m1>\d{1,2})(?:(月|/|\.)(?:(?P<d1>\d{1,2})日)?)?'
                     r'(到|至|-)(?:(?P<y2>\d{4})(年|/|\.))?(?P<m2>\d{1,2})(?:(月|/|\.)'
                     r'(?:(?P<d2>\d{1,2})日)?)?$', text)
    if not span:
        return "", ""
    parts = dict((key, value or "") for key, value in span.groupdict().items())
    y1, m1, d1 = parts['y1'], parts['m1'], parts['d1']
    y2, m2, d2 = parts['y2'], parts['m2'], parts['d2']
    y2 = y1 if y2 == "" else y2
    d1 = '1' if d1 == "" else d1
    d2 = self.get_monthlen(y2, m2) if d2 == "" else d2
    order_begin = "%s-%s-%s" % (y1, pad2(m1), pad2(d1))
    order_end = "%s-%s-%s" % (y2, pad2(m2), pad2(d2))
    return order_begin, order_end
def find_header(self, items, p1, p2):
    '''
    Check whether one row of the inner table is a header row.

    The name column is looked for in the first four cells: first with p1,
    then with p2 (rejecting cells that contain an excluded word such as
    编号/单位/数量).  The cells after it are then assigned to the quantity,
    unit-price, brand, specification, demand, budget and time columns.
    The row counts as a header only when at least two header texts
    (including the name) were found.

    :param items: list with the text of every td of the row
    :param p1: preferred header regex
    :param p2: fallback header regex
    :return: (header_dic, flag, (product, quantity, unitPrice, brand, specs),
              (product, demand, budget, order_time)); header_dic maps a column
              key to its index ('' when missing) and flag tells whether the row is a header
    '''
    header_dic = {'名称': '', '数量': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': ''}
    # Header text found for every column key ('' = not found).
    found = dict.fromkeys(header_dic, "")
    excluded = '编号|编码|号|情况|报名|单位|位置|地址|数量|单价|价格|金额|品牌|规格类型|型号|公司|中标人|企业|供应商|候选人'
    leading = items[:4]

    name_col = None
    for idx, cell in enumerate(leading):
        if len(cell) < 15 and re.search(p1, cell) is not None:
            name_col = idx
            break
    if name_col is None:
        for idx, cell in enumerate(leading):
            if len(cell) < 15 and re.search(p2, cell) and re.search(excluded, cell) is None:
                name_col = idx
                break

    if name_col is not None:
        header_dic['名称'] = name_col
        found['名称'] = items[name_col]
        # Tried in this order; the first rule that applies wins for a cell.
        rules = (('数量', '数量'), ('单价', '单价'), ('品牌', '品牌'), ('规格', '规格'),
                 ('需求', '需求'), ('预算', '预算'), ('时间', '时间|采购实施月份|采购月份|采购日期'))
        for j in range(name_col + 1, len(items)):
            cell = items[j]
            # Long cells that are mostly Chinese text outside brackets are content, not headers.
            if len(cell) > 20 and len(re.sub('[\\((].*[)\\)]|[^\u4e00-\u9fa5]', '', cell)) > 10:
                continue
            for key, pattern in rules:
                if key == '数量' and header_dic['数量'] != "":
                    continue  # only the first quantity column is kept
                if re.search(pattern, cell):
                    header_dic[key] = j
                    found[key] = cell
                    break

    filled = sum(1 for value in found.values() if value != "")
    flag = name_col is not None and filled >= 2
    product_fields = (found['名称'], found['数量'], found['单价'], found['品牌'], found['规格'])
    demand_fields = (found['名称'], found['需求'], found['预算'], found['时间'])
    return header_dic, flag, product_fields, demand_fields
def predict(self, docid='', html='', page_time=""):
    """Extract product and purchase-demand records from the announcement's tables.

    Tables are visited from last to first.  Small tables nested inside a
    cell are flattened to text; tables rejected by ``self.isTrueTable`` are
    skipped.  For purchase-intention announcements (html contains 采购意向)
    two-column key/value tables are tried first.  Otherwise rows are
    scanned: a row recognised by ``self.find_header`` becomes the current
    header, and following rows with the same column count are read as data.

    :param docid: document id (not used by this method)
    :param html: original announcement HTML
    :param page_time: forwarded to ``self.fix_time``
    :return: ``([attr_dic, demand_dic], total_product_money)`` where
        ``attr_dic['product_attrs']`` and ``demand_dic['demand_info']`` each
        hold ``data``, ``header`` and ``header_col`` lists, and
        ``total_product_money`` is the sum of unit price * quantity over the
        distinct product records whose quantity could be parsed.
    """
    money_re = r"[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?"
    money_cell_re = r'\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}'
    time_header_re = '采购时间|采购实施月份|采购月份|采购日期'

    def unify_money(raw, header_text):
        """Return the first amount found in ``raw`` as a unified string, '' if none."""
        amounts = re.findall(money_re, raw)
        if not amounts:
            return ""
        amount = amounts[0]
        # The unit may only be stated in the column header.
        if '万元' in header_text and '万' not in amount:
            amount += '万元'
        return str(getUnifyMoney(amount))

    def drop_far_future(begin, end):
        """Blank both dates when either year is >= 2050 (misrecognised attachment text)."""
        if begin != "" and end != "":
            if int(begin.split("-")[0]) >= 2050 or int(end.split("-")[0]) >= 2050:
                return "", ""
        return begin, end

    soup = BeautifulSoup(html, 'lxml')
    flag_yx = True if re.search('采购意向', html) else False
    tables = soup.find_all(['table'])
    headers = []
    headers_demand = []
    header_col = []
    product_link = []
    demand_link = []
    total_product_money = 0
    for table_idx in range(len(tables) - 1, -1, -1):
        table = tables[table_idx]
        if table.parent.name == 'td' and len(table.find_all('td')) <= 3:
            # Tiny nested table: collapse it to plain text inside its parent cell.
            table.string = table.get_text()
            table.name = 'turntable'
            continue
        if not self.isTrueTable(table):
            continue
        self.fixSpan(table)
        inner_table = self.getTable(table)
        i = 0
        found_header = False
        header_colnum = 0
        if flag_yx:
            col0_l = []
            col1_l = []
            for tds in inner_table:
                if len(tds) == 2:
                    col0_l.append(re.sub(':', '', tds[0]))
                    col1_l.append(tds[1])
            if len(set(col0_l) & self.header_set) > len(col0_l) * 0.2:
                header_list2 = []
                product = demand = budget = order_begin = order_end = ""
                # NOTE(review): this loop reuses ``i``; if the record below is
                # incomplete the row scan further down starts at the last
                # key/value index, not at 0.  Kept as in the original —
                # confirm whether that is intended.
                for i in range(len(col0_l)):
                    if re.search('项目名称', col0_l[i]):
                        header_list2.append(col0_l[i])
                        product = col1_l[i]
                    elif re.search('采购需求|需求概况', col0_l[i]):
                        header_list2.append(col0_l[i])
                        demand = col1_l[i]
                    elif re.search('采购预算|预算金额', col0_l[i]):
                        header_list2.append(col0_l[i])
                        budget = unify_money(col1_l[i], col0_l[i])
                    elif re.search(time_header_re, col0_l[i]):
                        header_list2.append(col0_l[i])
                        order_time = col1_l[i].strip()
                        order_begin, order_end = self.fix_time(order_time, html, page_time)
                order_begin, order_end = drop_far_future(order_begin, order_end)
                if product != "" and demand != "" and budget != "" and order_begin != "":
                    link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
                            'order_begin': order_begin, 'order_end': order_end}
                    if link not in demand_link:
                        demand_link.append(link)
                        headers_demand.append('_'.join(header_list2))
                    continue
        while i < len(inner_table):
            tds = inner_table[i]
            not_empty = [it for it in tds if it != ""]
            # Skip rows that are mostly repeated cells or have a single column.
            if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds) < 2:
                i += 1
                continue
            product = ""
            quantity = ""
            unitPrice = ""
            brand = ""
            specs = ""
            demand = ""
            budget = ""
            order_time = ""
            order_begin = ""
            order_end = ""
            if len(set(tds) & self.header_set) > len(tds) * 0.2:
                header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
                if found_header:
                    headers.append('_'.join(header_list))
                    headers_demand.append('_'.join(header_list2))
                    header_colnum = len(tds)
                    header_col.append('_'.join(tds))
                i += 1
                continue
            elif found_header:
                if len(tds) != header_colnum:  # column count differs from the header row
                    i += 1
                    continue
                id1 = header_dic.get('名称', "")
                id2 = header_dic.get('数量', "")
                id3 = header_dic.get('单价', "")
                id4 = header_dic.get('品牌', "")
                id5 = header_dic.get('规格', "")
                id6 = header_dic.get('需求', "")
                id7 = header_dic.get('预算', "")
                id8 = header_dic.get('时间', "")
                if re.search(r'[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
                        re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id1]) is None:
                    product = tds[id1]
                    if id2 != "" and re.search(r'\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
                        quantity = tds[id2]
                    if id3 != "" and re.search(money_cell_re, tds[id3]):
                        unitPrice = unify_money(tds[id3], header_list[2])
                    if id4 != "" and re.search(r'\w', tds[id4]):
                        brand = tds[id4]
                    if id5 != "" and re.search(r'\w', tds[id5]):
                        specs = tds[id5]
                    if id6 != "" and re.search(r'\w', tds[id6]):
                        demand = tds[id6]
                    if id7 != "" and re.search(money_cell_re, tds[id7]):
                        # Fix: the unit must come from the budget header
                        # (header_list2[2]); the original read the unit-price
                        # header (header_list[2]) here.
                        budget = unify_money(tds[id7], header_list2[2])
                    if id8 != "" and re.search(r'\w', tds[id8]):
                        order_time = tds[id8].strip()
                        order_begin, order_end = self.fix_time(order_time, html, page_time)
                    if quantity != "" or unitPrice != "" or brand != "" or specs != "":
                        link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
                                'brand': brand[:50], 'specs': specs}
                        if link not in product_link:
                            product_link.append(link)
                            mat = re.match(r'([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
                            if link['unitPrice'] != "" and mat:
                                try:
                                    total_product_money += float(link['unitPrice']) * float(mat.group(1).replace(',', ''))
                                except Exception:
                                    # Best effort: an unparsable number must not abort extraction.
                                    log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (link['unitPrice'], link['quantity']))
                    order_begin, order_end = drop_far_future(order_begin, order_end)
                    if budget != "" and order_time != "":
                        link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
                                'order_begin': order_begin, 'order_end': order_end}
                        if link not in demand_link:
                            demand_link.append(link)
                i += 1
            else:
                i += 1
    if len(product_link) > 0:
        attr_dic = {'product_attrs': {'data': product_link, 'header': headers, 'header_col': header_col}}
    else:
        attr_dic = {'product_attrs': {'data': [], 'header': [], 'header_col': []}}
    if len(demand_link) > 0:
        demand_dic = {'demand_info': {'data': demand_link, 'header': headers_demand, 'header_col': header_col}}
    else:
        demand_dic = {'demand_info': {'data': [], 'header': [], 'header_col': []}}
    return [attr_dic, demand_dic], total_product_money
def predict_without_table(self,product_attrs,list_sentences,list_entitys,codeName,prem, html='', page_time=""):
    """Add a demand record built from running text when there is exactly one package.

    Time entities whose left context (20-token window) ends with a
    purchase-time label are collected and normalised with
    ``self.fix_time``.  When they all resolve to one and the same
    (begin, end) pair, a record using the project name, the package's
    ``tendereeMoney`` and ``prem[0]['product']`` is appended to
    ``product_attrs[1]['demand_info']['data']``.

    :param product_attrs: ``[attr_dic, demand_dic]``; updated in place
    :param list_sentences: sentence lists, only the first is used
    :param list_entitys: entity lists, only the first is used
    :param codeName: list whose first item provides ``'name'``
    :param prem: list whose first item provides ``'prem'`` and ``'product'``
    :param html: forwarded to ``self.fix_time``
    :param page_time: forwarded to ``self.fix_time``
    :return: ``product_attrs`` (the same object that was passed in)
    """
    if len(prem[0]['prem']) != 1:
        return product_attrs

    list_sentence = list_sentences[0]
    list_entity = list_entitys[0]
    _data = product_attrs[1]['demand_info']['data']
    # Both patterns are loop-invariant, so they are compiled once up front.
    re_bidding_time = re.compile("(采购时间|采购实施月份|采购月份|采购日期)[::,].{0,2}$")
    standard_time = re.compile(r"((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*(?P<day>\d{1,2})日?)")

    order_times = []
    for entity in list_entity:
        if entity.entity_type != 'time':
            continue
        sentence = list_sentence[entity.sentence_index]
        s = spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index,
                       end_index=entity.end_index, size=20)
        entity_left = "".join(s[0])
        if not re_bidding_time.search(entity_left):
            continue
        time_text = entity.entity_text.strip()
        time_match = standard_time.search(time_text)
        if time_match:
            # Keep only the year-month-day part when one is present.
            time_text = time_match.group()
        order_times.append(time_text)

    order_times = [tuple(self.fix_time(order_time, html, page_time)) for order_time in order_times]
    order_times = [order_time for order_time in order_times if order_time[0] != ""]
    # Only trust the result when every mention agrees on one period.
    if len(set(order_times)) == 1:
        order_begin, order_end = order_times[0]
        project_name = codeName[0]['name']
        pack_info = list(prem[0]['prem'].values())
        budget = pack_info[0].get('tendereeMoney', 0)
        product = prem[0]['product']
        link = {'project_name': project_name, 'product': product, 'demand': project_name, 'budget': budget,
                'order_begin': order_begin, 'order_end': order_end}
        _data.append(link)
        product_attrs[1]['demand_info']['data'] = _data
    return product_attrs
- # docchannel类型提取
- class DocChannel():
def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb'):
    """Load both frozen models and set up the label maps and rule patterns.

    :param life_model: path of the life-cycle model, appended to this
        module's directory by ``load_life``
    :param type_model: path of the document-type model, appended to this
        module's directory by ``load_type``
    """
    (self.lift_sess, self.lift_title, self.lift_content, self.lift_prob,
     self.lift_softmax, self.mask, self.mask_title) = self.load_life(life_model)
    (self.type_sess, self.type_title, self.type_content, self.type_prob,
     self.type_softmax, self.type_mask, self.type_mask_title) = self.load_type(type_model)

    self.sequen_len = 200  # original note lists the values 150 and 200
    self.title_len = 30
    self.sentence_num = 10
    # Keywords used by predict_process to pick context windows from long texts.
    self.kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'

    # Index -> label maps for the two softmax outputs.
    type_labels = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
    life_labels = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
    self.id2type = dict(enumerate(type_labels))
    self.id2life = dict(enumerate(life_labels))
    self.load_pattern()
def load_pattern(self):
    """Define the regex tables used by the rule-based type / life-cycle checks.

    Sets ``type_dic`` and ``title_type_dic`` (document type, matched on
    body text and on the title), ``life_dic`` and ``title_life_dic``
    (life-cycle stage, body and title) and ``wrong_win``.  Some
    ``life_dic`` values hold several patterns separated by ';'.

    Patterns that contain backslash escapes are raw strings so that
    ``\\w`` / ``\\s`` / ``\\d`` reach ``re`` unchanged without relying on
    Python's handling of invalid string escapes.
    """
    self.type_dic = {
        '土地矿产': '供地结果|(土地|用地|宗地|地块|海域|矿)的?(基本信息|基本情况|概况|信息|详情|来源|用途|性质|编号|位置|坐落|使用年限|出让年限)|(土地|山地|农田)(经营权)?(出让|出租|招租|租赁|承包|流转)|流转土地',
        '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|活动|信息|结果|成交|主体|标的|资产|财产|方式|类型|流程|程序|规则|价格|保证金|时间)|(公开|进行|密封)(拍卖|变卖|竞拍)|第[一二三]次拍卖|(资产|司法|网络)拍卖|交易方式.{,2}拍卖|拍卖会',
        '产权交易': r'(产权|资产|权证)的?(类型|信息|名称|编号|(基本)?情况)|(经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量)(挂牌|转让|出让)|竞价销售|销售结果|房屋所有权房产|免租期限|交易期限|(受让|转让|承租|出租|买受)(人|方)|(店面|店铺|商铺|铺位?|门面|门市|食堂|饭堂|校舍|车位|停车场|厂?房|仓?库|馆|资产|物业|房产|房屋|场地|农田|鱼?塘)\w{,4}(处置|招租|出租|续租|租赁|转让)|(出租|转让|产权|资产)(项目|中标|成交|流标|废标)|出租(用途|类型)|转让底价|租赁(标的物|情况)',
        '采招数据': '(采购|招标|代理)(人|机构|单位)|(采购|招标)(条件|范围|文件|内容)|(申请人|投标人|供应商|报价人|参选人)的?资格要求;'  # disabled alternatives: |变更|答疑|澄清|中标|成交|合同|废标|流标
    }
    self.title_type_dic = {
        '土地矿产': '(土地|用地|宗地|荒地|山地|海域|矿)(出让|出租|招租|租赁|承包|流转|使用权|经营权|征收|划拨|中标|成交)|供地结果|矿业权|探矿权|采矿权|(土地|用地|宗地|地块)(使用权)?(终止|中止|网上)?(挂牌|出让|拍卖|招拍|划拨)|征收土地',
        '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|公示)|拍卖|变卖|流拍|竞拍',
        '产权交易': '经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让',
        '采招数据': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判)的?(公告|公示|中标|成交|结果|$)|工程招标',  # disabled alternatives: 意向|需求|预公?告|报建|总承包|工程|施工|设计|勘察|代理|监理 |变更|答疑|澄清|中标|成交|合同|废标|流标
        '新闻资讯': r'(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)'
    }
    self.life_dic = {
        '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
        '招标预告': '预计(采购|招标)(时间|日期)',
        '招标公告': '(采购|招标|竞选|报名)条件;报名时间;报名流程;报名方法;报名需提供的材料;参加竞价采购交易资格;(申请人|投标人|供应商|报价人|参选人)的?资格要求;获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件;(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)',
        '资审结果': '招标资审公告|评审入围公示|资审及业绩公示|资格后审情况报告|资格后审结果公告|资格后审结果公示|资格预审结果公告|资格预审结果公示|预审公示|预审结果公示',
        '招标答疑': r'现澄清为|现澄清如下|第\d次澄清|答疑澄清公告|异议的回复|(最高(投标)?限价|控制价|拦标价)公示',
        '公告变更': r'原公告(主要)?(信息|内容)|变更[前后]内容|现在?(变更|更正|修改|更改)为|(变更|更正)内容为|更正理由|更正人名称|[、\s](更正信息|更正内容):',
        '候选人公示': '候选人公示|评标结果公示',
        '中标信息': r'供地结果信息|采用单源直接采购的?情况说明|现将\w{,4}(成交|中标|中选|选定结果|选取结果)\w{2,8}(进行公示|公示如下)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|(中标(候选人|人|成交)|成交)\w{,3}(信息|情况)[::\s]',
        '中标信息2': r'(成交|中标)(日期|时间)[::\s]|成交金额:',
        '中标信息3': r'(中标|中选|成交|拟定|拟选用|最终选定的?|受让|唯一)(供应商|供货商|服务商|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]',
        '合同公告': '合同(公告|公示)信息;合同(公告|公示)日期;合同(公告|公示)内容;合同编号;合同名称;合同签订日期;合同主体;供应商乙方',
        '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):废标|((本|该)项目|本标段|本次(招标)?)((采购|招标)?(失败|终止|流标|废标)|(按|做|作)(流标|废标)处理)',
    }
    self.title_life_dic = {
        '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
        '招标预告': '预公?告|预公示|报建公告|(批前|标前)公示|供应计划$|(论证|征求|征集)(供应商)?意见|意见征询|需求评审公告|需求(公告|公示|意见)',
        '公告变更': '(变更|更正(事项)?|更改|延期|暂停)的?(公告|公示|通知)|变更$|更正$',
        '招标答疑': '质疑|澄清|答疑(文件)?|补遗书?|(最高(投标)?限价|控制价|拦标价)(公示|公告)',
        '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销|取消成交)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)',
        '合同公告': '(合同(成交)?|履约验收|履约|验收结果)(公告|公示|信息|公式)|合同备案|合同书',  # disabled alternative: 合同$|
        '候选人公示': '候选人公示|评标(结果)?公示|中标前?公示|中标预公示',
        '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果)?(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)结果|开标(记录|信息|情况)|中标通知书|中标$',
        '资审结果': '((资格|资质)(审查|预审|后审|审核|入围项?目?)|资审|入围)结果(公告|公示)?|(资质|资格)(预审|后审|入围)(入围)?(公示|公告|报告)|(资质|资格)?(预审|后审)(入围)?(公示|公告|报告)|未?入围(公示|公告)|资审及业绩公示',
        '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)',
    }
    self.wrong_win = r'按项目控制价下浮\d%即为成交价|不得确定为(中标|成交)|招标人按下列原则选择中标人|确定成交供应商:|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)|竞拍起止时间:'
def load_life(self,life_model):
    """Load the frozen life-cycle graph and return its session and tensors.

    :param life_model: model path, appended to this module's directory
    :return: ``(sess, title, inputs, prob, softmax, mask, mask_title)`` —
        the session followed by the tensors looked up by name in the graph
    """
    model_path = os.path.dirname(__file__) + life_model
    with tf.Graph().as_default() as graph:
        graph_def = graph.as_graph_def()
        with open(model_path, 'rb') as model_file:
            graph_def.ParseFromString(model_file.read())
        tf.import_graph_def(graph_def, name='')
        del graph_def
        sess = tf.Session(graph=graph)
        sess.run(tf.global_variables_initializer())
        fetch = sess.graph.get_tensor_by_name
        inputs = fetch('inputs/inputs:0')
        prob = fetch('inputs/dropout:0')
        title = fetch('inputs/title:0')
        mask = fetch('inputs/mask:0')
        mask_title = fetch('inputs/mask_title:0')
        softmax = fetch('output/softmax:0')
        return sess, title, inputs, prob, softmax, mask, mask_title
def load_type(self,type_model):
    """Load the frozen document-type graph and return its session and tensors.

    :param type_model: model path, appended to this module's directory
    :return: ``(sess, title, inputs, prob, softmax, mask, mask_title)`` —
        the session followed by the tensors looked up by name in the graph
    """
    model_path = os.path.dirname(__file__) + type_model
    with tf.Graph().as_default() as graph:
        graph_def = graph.as_graph_def()
        with open(model_path, 'rb') as model_file:
            graph_def.ParseFromString(model_file.read())
        tf.import_graph_def(graph_def, name='')
        del graph_def
        sess = tf.Session(graph=graph)
        sess.run(tf.global_variables_initializer())
        fetch = sess.graph.get_tensor_by_name
        inputs = fetch('inputs/inputs:0')
        prob = fetch('inputs/dropout:0')
        title = fetch('inputs/title:0')
        mask = fetch('inputs/mask:0')
        mask_title = fetch('inputs/mask_title:0')
        softmax = fetch('output/softmax:0')
        return sess, title, inputs, prob, softmax, mask, mask_title
def predict_process(self, docid='', doctitle='', dochtmlcon=''):
    """Turn a title and a space-separated body text into model token lists.

    The title is segmented with ``selffool.cut``; everything except
    Chinese characters and whitespace is stripped from both texts.  When
    the body has more than ``self.sequen_len / 2`` tokens, the first 100
    tokens are kept and tokens 100-500 are reduced to context windows
    around the keywords in ``self.kws``; shorter bodies are truncated to
    ``self.sequen_len`` tokens.

    :param docid: document id (not used by this method)
    :param doctitle: announcement title
    :param dochtmlcon: body text, tokens separated by whitespace
    :return: ``(datas, datas_title)``, each a one-element list holding the
        token list of the body / title
    """
    def get_kw_senten(s, span=10):
        """Collect up to ``self.sentence_num`` keyword-centred windows from ``s``."""
        doc_sens = []
        tmp = 0
        num = 0
        end_idx = 0
        for it in re.finditer(self.kws, s):
            left = s[end_idx:it.end()].split()
            right = s[it.end():].split()
            tmp_seg = s[tmp:it.start()].split()
            # Take the first hit, then only hits more than ``span`` tokens
            # past the previous one.
            if len(tmp_seg) > span or tmp == 0:
                doc_sens.append(' '.join(left[-span:] + right[:span]))
                end_idx = it.end() + 1 + len(' '.join(right[:span]))
                tmp = it.end()
                num += 1
                if num >= self.sentence_num:
                    break
        if doc_sens == []:
            doc_sens.append(s)
        return doc_sens

    try:
        segword_title = ' '.join(selffool.cut(doctitle)[0])
        segword_content = dochtmlcon
    except Exception:
        # Best effort: a title that cannot be segmented yields empty inputs.
        segword_content = ''
        segword_title = ''
    # Non-string placeholders that arrive as floats are treated as empty.
    if isinstance(segword_content, float):
        segword_content = ''
    if isinstance(segword_title, float):
        segword_title = ''
    segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
        replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
        replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
    segword_title = re.sub(r'[^\s\u4e00-\u9fa5]', '', segword_title)
    segword_content = re.sub(r'[^\s\u4e00-\u9fa5]', '', segword_content)
    doc_word_list = segword_content.split()
    if len(doc_word_list) > self.sequen_len / 2:
        doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
        doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
    else:
        doc_sens = ' '.join(doc_word_list[:self.sequen_len])
    datas = [doc_sens.split()]
    datas_title = [segword_title.split()]
    return datas, datas_title
def is_houxuan(self, title, content):
    """Tell whether an announcement is a candidate-publicity notice.

    The title is checked first; if it gives no hit, the first 100
    characters of the content are checked.

    :param title: announcement title
    :param content: announcement body text
    :return: 1 for a candidate-publicity notice, otherwise 0
    """
    title_hit = re.search('候选人的?公示|评标结果|评审结果|中标公示', title)
    if title_hit:
        title_excluded = re.search('变更公告|更正公告|废标|终止|答疑|澄清', title)
        return 0 if title_excluded else 1

    lead = content[:100]
    if not re.search('候选人的?公示', lead):
        return 0
    lead_excluded = re.search(
        '公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', lead)
    return 0 if lead_excluded else 1
def predict(self, title='', list_sentence='', web_source_no='', original_docchannel=''):
    """Predict document type and life-cycle stage with the two models.

    :param title: announcement title
    :param list_sentence: list of sentence objects exposing ``tokens``;
        any other value (including the default '') is treated as empty
        content
    :param web_source_no: data-source id; '02104-7' is returned as
        采招数据 without running the models
    :param original_docchannel: original channel id; ids listed in
        ``not_extract_dic`` are returned without running the models
    :return: ``{'docchannel': {'docchannel': ..., 'doctype': ..., ...}}``
    """
    not_extract_dic = {
        104: '招标文件',
        106: '法律法规',
        107: '新闻资讯',
        108: '拟建项目',
        109: '展会推广',
        110: '企业名录',
        111: '企业资质',
        112: '全国工程人员',
        113: '业主采购'
    }
    if original_docchannel in not_extract_dic:
        return {'docchannel': {'docchannel': '', 'doctype': not_extract_dic[original_docchannel], "original_docchannel_id": str(original_docchannel)}}
    if web_source_no in ['02104-7']:
        return {'docchannel': {'docchannel': '', 'doctype': '采招数据'}}

    # Fix: ``content`` used to be assigned only for list input, so the
    # default ``list_sentence=''`` raised UnboundLocalError below.
    content = ''
    if isinstance(list_sentence, list):
        token_l = [it.tokens for it in list_sentence]
        tokens = [it for l in token_l for it in l]
        content = ' '.join(tokens[:500])
    title = re.sub(r'[^\u4e00-\u9fa5]', '', title)
    if len(title) > 50:
        title = title[:20] + title[-30:]
    # At most the last 50 characters of the title are used.
    data_content, data_title = self.predict_process(docid='', doctitle=title[-50:], dochtmlcon=content)
    text_len = min(len(data_content[0]), self.sequen_len)
    title_len = min(len(data_title[0]), self.title_len)
    # Mask rows: 0 for real tokens, 1 for padding positions.
    content_mask = [[0] * text_len + [1] * (self.sequen_len - text_len)]
    title_mask = [[0] * title_len + [1] * (self.title_len - title_len)]

    result = {'docchannel': {'docchannel': '', 'doctype': '', "original_docchannel_id": str(original_docchannel)}}
    array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
    array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
    pred = self.type_sess.run(self.type_softmax,
                              feed_dict={
                                  self.type_title: array_title,
                                  self.type_content: array_content,
                                  self.type_mask: content_mask,
                                  self.type_mask_title: title_mask,
                                  self.type_prob: 1}
                              )
    type_id = np.argmax(pred, axis=1)[0]
    result['docchannel']['doctype'] = self.id2type[type_id]

    # The life-cycle model is skipped for news documents.
    if result['docchannel']['doctype'] not in ['', '新闻资讯']:
        pred = self.lift_sess.run(self.lift_softmax,
                                  feed_dict={
                                      self.lift_title: array_title,
                                      self.lift_content: array_content,
                                      self.mask: content_mask,
                                      self.mask_title: title_mask,
                                      self.lift_prob: 1}
                                  )
        life_id = np.argmax(pred, axis=1)[0]
        result['docchannel']['docchannel'] = self.id2life[life_id]
        if result['docchannel']['docchannel'] == '中标信息':
            # The model has no candidate-publicity class; is_houxuan splits
            # it off from win notices using alphabetic characters only.
            title_alpha = ''.join([it for it in title if it.isalpha()])
            content_alpha = ''.join([it for it in content if it.isalpha()])
            if self.is_houxuan(title_alpha, content_alpha):
                result['docchannel']['docchannel'] = '候选人公示'
    return result
def predict_rule(self, title, content, channel_dic, prem_dic):
    """Correct the predicted life-cycle stage with title / content rules.

    * Content with fewer than 50 Chinese characters (and doctype not
      新闻资讯): the stage is decided from the title alone and is set to ''
      when no title rule matches.
    * Predicted 招标公告 while ``prem_dic`` contains ``win_tenderer``: may be
      changed to 合同公告 or 中标信息.
    * Predicted 中标信息 while ``prem_dic`` has no ``win_tenderer``: may be
      changed to 合同公告, 招标公告 or ''.

    :param title: announcement title
    :param content: announcement text
    :param channel_dic: ``{'docchannel': {'docchannel': ..., 'doctype': ...}}``;
        modified in place
    :param prem_dic: extracted roles; only its JSON dump is inspected
    :return: ``channel_dic``
    """
    hetong = '(合同|验收|履约)(公告|公示)|合同号?$'  # contract-title pattern
    zhongbiao_t = '(中标|中选|成交|入选|入围|结果|确认)(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选)结果|开标(记录|信息|情况)|单一来源|直接(选取|选定)|中标通知书|中标$'
    zhongbiao_c = '(中标|中选|成交|拟选用|拟邀请|最终选定的?|拟定)(供应商|供货商|服务商|企业|公司|单位|(候选)?人)(名称)?[::]|[,。:.](供应商|供货商|服务商)(名称)?:|指定的中介服务机构:|建设服务单位:'
    zhaobiao_t = '(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈)(公告|公示|$)'
    non_cn = r'[^\u4e00-\u9fa5]'
    title_cn = re.sub(non_cn, '', title)
    channel = channel_dic['docchannel']

    def looks_like_win():
        """True when the title, the content head or the content names a winner notice."""
        return bool(re.search(zhongbiao_t, title_cn) or re.search(zhongbiao_t, content[:200])
                    or re.search(zhongbiao_c, content))

    if len(re.sub(non_cn, "", content)) < 50 and channel['doctype'] != '新闻资讯':
        if re.search(hetong, title_cn) is not None:
            channel['docchannel'] = '合同公告'
        elif re.search(zhongbiao_t, title_cn):
            channel['docchannel'] = '中标信息'
        elif re.search(zhaobiao_t, title_cn):
            channel['docchannel'] = '招标公告'
        else:
            channel['docchannel'] = ''
        return channel_dic

    current = channel.get('docchannel', '')
    if current not in ('招标公告', '中标信息'):
        return channel_dic

    # Serialise once; both remaining rules only need this flag.
    has_winner = 'win_tenderer' in json.dumps(prem_dic, ensure_ascii=False)
    if current == '招标公告' and has_winner:
        if re.search(hetong, title_cn) is not None:
            channel['docchannel'] = '合同公告'
            log('正则把招标公告修改为合同公告')
        elif looks_like_win():
            channel['docchannel'] = '中标信息'
            log('正则把招标公告修改为中标信息')
    elif current == '中标信息' and not has_winner:
        if re.search(hetong, title_cn):
            channel['docchannel'] = '合同公告'
            log('正则把中标信息修改为合同公告')
        elif looks_like_win():
            pass  # win keywords present: keep the model's prediction
        elif re.search(zhaobiao_t, title_cn):
            channel['docchannel'] = '招标公告'
            log('正则把中标信息修改为招标公告')
        elif re.search('中标|成交|中选|入选|入围|结果|供应商|供货商|候选人', title_cn + content) is None:
            channel['docchannel'] = ''
            log('正则把中标信息修改为空')
    return channel_dic
- def predict_merge(self, title, list_sentence, html, bidway, prem, original_docchannel='', web_source_no=''):
- '''
- 正则,模型混合预测,返回公告类型及生命周期
- :param title: 公告标题
- :param content: 预处理后的返回的句子实体列表 list_sentence
- :param html: 公告原文 html 内容
- :param bidway: 招标方式
- :param prem: 提取的prem 字典
- :return: {'docchannel': {'docchannel':'中标信息', 'doctype':'采招数据'}} 字典格式
- '''
- def cut_single_cn_space(text):
- new_text = ""
- for w in text.split():
- if len(w) == 1 or re.search('^[\u4e00-\u9fa5][::]', w):
- new_text += w
- else:
- new_text += ' ' + w
- return new_text
- def html2text(html):
- ser = re.search('<div[^<>]*richTextFetch', html)
- if ser:
- html = html[:ser.start()]+'##richTextFetch##'
- text = re.sub('<[^<]*?>', '', html).replace(' ', ' ')
- text = re.sub('\s+', ' ', text)
- text = re.sub('[/|[()()]', '', text)
- text = cut_single_cn_space(text)
- return text[:20000]
- def count_diffser(pattern, text):
- num = 0
- kw = []
- for p in pattern.split(';'):
- if re.search(p, text):
- num += 1
- kw.append(re.search(p, text).group(0))
- return num, ';'.join(kw)
- def is_contain_winner(extract_json):
- if re.search('win_tenderer', extract_json):
- return True
- else:
- return False
- def is_single_source(bidway, title):
- if re.search('单一来源|单一性采购', title):
- return True
- elif bidway == '单一来源':
- return True
- else:
- return False
- def get_type(title, text):
- if re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'],
- text): # and re.search('(土地|用地|宗地|地块)(经营权)?(流转|承包|出租|招租|租赁|确权)', text)==None
- if re.search(self.title_type_dic['采招数据'], title + text[:50]):
- return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:50]).group(0)
- return '土地矿产', (re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'], text)).group(0)
- elif (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)):
- if re.search(self.title_type_dic['采招数据'], title + text[:50]):
- return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:50]).group(0)
- return '拍卖出让', (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)).group(0)
- elif re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text):
- if re.search(self.title_type_dic['采招数据'], title + text[:50]):
- return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:50]).group(0)
- return '产权交易', (re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text)).group(0)
- elif re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text):
- return '采招数据', (
- re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text)).group(
- 0)
- elif re.search(self.title_type_dic['新闻资讯'], title):
- if re.search(self.title_type_dic['采招数据'], title + text[:150]):
- return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:150]).group(0)
- return '新闻资讯', re.search(self.title_type_dic['新闻资讯'], title).group(0)
- else:
- return '', '没有公告类型关键词,返回空'
- def get_life(title, text, extract_json="", bidway="", original_docchannel=''):
- if re.search(self.title_life_dic['采购意向'], title) and re.search(self.life_dic['采购意向'], text[:100]):
- if re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text):
- return '公告变更', (re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text)).group(
- 0)
- elif re.search(self.title_life_dic['候选人公示'], title):
- return '候选人公示', re.search(self.title_life_dic['候选人公示'], title).group(0)
- elif re.search(self.title_life_dic['中标信息'], title):
- return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0)
- elif re.search('终止|废标|流标', title):
- return '废标公告', re.search('终止|废标|流标', title).group(0)
- elif is_single_source(bidway, title):
- return '中标信息', 'bidway单一来源'
- return '采购意向', (
- re.search(self.title_life_dic['采购意向'], title) and re.search(self.life_dic['采购意向'], text[:100])).group(0)
- elif re.search(self.title_life_dic['招标预告'], title) or re.search(self.life_dic['招标预告'], text):
- if re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text):
- return '公告变更', (re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text)).group(
- 0)
- elif re.search(self.title_life_dic['候选人公示'], title):
- return '候选人公示', re.search(self.title_life_dic['候选人公示'], title).group(0)
- elif re.search(self.title_life_dic['中标信息'], title):
- return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0)
- elif re.search('终止|废标|流标', title):
- return '废标公告', re.search('终止|废标|流标', title).group(0)
- elif is_single_source(extract_json, title):
- return '中标信息', 'bidway单一来源'
- return '招标预告', (re.search(self.title_life_dic['招标预告'], title) or re.search(self.life_dic['招标预告'], text)).group(0)
- elif re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text):
- if re.search(self.title_life_dic['废标公告'], title):
- return '废标公告', re.search(self.title_life_dic['废标公告'], title).group(0)
- # elif re.search('(中标|成交)结果', title[-8:]):
- # return '中标信息', re.search('(中标|成交)结果', title[-8:]).group(0)
- return '公告变更', (re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text)).group(0)
- elif re.search(self.title_life_dic['招标答疑'], title) or re.search(self.life_dic['招标答疑'], text) or len(
- re.findall('(答:|回复:)', text)) >= 2: # or re.search(self.title_life_dic['招标答疑'], text[:150])
- if re.search(self.title_life_dic['废标公告'], title):
- return '废标公告', re.search(self.title_life_dic['废标公告'], title).group(0)
- elif re.search('(中标|成交)结果', title[-8:]):
- return '中标信息', re.search('(中标|成交)结果', title[-8:]).group(0)
- return '招标答疑', (
- re.search(self.title_life_dic['招标答疑'], title) or re.search(self.life_dic['招标答疑'], text) or re.search(
- '(答:|回复:)', text)).group(0)
- elif re.search(self.title_life_dic['废标公告'], title+ text[:150]) or re.search(self.life_dic['废标公告'], text[:150]):
- return '废标公告', (
- re.search(self.title_life_dic['废标公告'], title+ text[:150]) or re.search(self.life_dic['废标公告'], text[:150])).group(0)
- elif re.search(self.title_life_dic['候选人公示'], title) or re.search(self.life_dic['候选人公示'], text[:150]):
- if re.search('候选人|公示期?(已?满|已经?结束)|中标(结果|公告)', text) == None:
- return '中标信息', '候选人公示排除,修改为中标信息'
- return '候选人公示', (
- re.search(self.title_life_dic['候选人公示'], title) or re.search(self.life_dic['候选人公示'], text[:150])).group(
- 0)
- elif re.search(self.title_life_dic['合同公告'], title) or re.search(self.title_life_dic['合同公告'], text[
- :150]):
- return '合同公告', (re.search(self.title_life_dic['合同公告'], title) or re.search(self.title_life_dic['合同公告'],
- text[:150]) or re.search(
- self.life_dic['合同公告'], text)).group(0)
- elif re.search(self.life_dic['合同公告'].replace(';', '|'), text): # or re.search(self.life_dic['合同公告'], text[:300]):
- num, kw = count_diffser(self.life_dic['合同公告'], text)
- if num >= 3:
- return '合同公告', kw
- elif re.search(self.title_life_dic['招标公告'], title[-8:]):
- return '招标公告', re.search(self.title_life_dic['招标公告'], title[-8:]).group(0)
- elif not is_contain_winner(extract_json):
- return '', '有合同关键词无中标角色返回空'
- return '合同公告', re.search(self.life_dic['合同公告'].replace(';', '|'), text).group(0)
- elif is_single_source(extract_json, title):
- return '中标信息', '单一来源采购'
- elif re.search(self.title_life_dic['中标信息'], title):
- if re.search(self.title_life_dic['资审结果'], title+text[:150]):
- return '资审结果', re.search(self.title_life_dic['资审结果'], title+text[:150]).group(0)
- return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0)
- elif re.search(self.title_life_dic['中标信息'], text[:100]) or re.search(self.life_dic['中标信息'], text[:]):
- if re.search(self.title_life_dic['资审结果'], title+text[:150]):
- return '资审结果', re.search(self.title_life_dic['资审结果'], title+text[:150]).group(0)
- # if re.search(self.wrong_win, text):
- # return '招标公告', re.search(self.wrong_win, text).group(0)
- return '中标信息', (
- re.search(self.title_life_dic['中标信息'], text[:100]) or re.search(self.life_dic['中标信息'], text[:])).group(
- 0)
- elif re.search(self.life_dic['中标信息2'], text[:]):
- if re.search(self.wrong_win, text):
- return '招标公告', re.search(self.wrong_win, text).group(0)
- return '中标信息', re.search(self.life_dic['中标信息2'], text[:]).group(0)
- elif re.search(self.life_dic['中标信息3'], text[:]) and is_contain_winner(extract_json):
- if re.search(self.wrong_win, text):
- return '招标公告', re.search(self.wrong_win, text).group(0)
- return '中标信息', re.search(self.life_dic['中标信息3'], text[:]).group(0)
- elif re.search('公开选取.{,20}机构的公告', title):
- if re.search('(中标|成交|中选)(中介|服务)?机构(名称)?[::\s]', text):
- return '中标信息', '机构选取有中选机构'
- else:
- return '招标公告', '公开选取机构'
- elif is_contain_winner(extract_json):
- num, kw = count_diffser(self.life_dic['招标公告'], text)
- if re.search(self.wrong_win, text):
- return '招标公告', re.search(self.wrong_win, text).group(0)
- elif num >= 2:
- return '招标公告', kw
- elif re.search('##richTextFetch##', text):
- return '', '提取到中标人但包含附件返回空'
- return '中标信息', '提取到中标人'
- elif re.search(self.title_life_dic['资审结果'], title+text[:150]) or re.search(self.life_dic['资审结果'], text[:]):
- return '资审结果', (re.search(self.title_life_dic['资审结果'], title+text[:150]) or re.search(self.life_dic['资审结果'], text[:])).group(0)
- elif re.search(self.title_life_dic['招标公告'], title) or re.search(self.life_dic['招标公告'].replace(';', '|'), text[:]):
- if re.search('意向|预告|变更|更正|中标|中选|成交|答疑|废标|流标|终止', title):
- return '', '招标正则召回标题有其他类别关键词,返回空'
- return '招标公告', (re.search(self.title_life_dic['招标公告'], title) or re.search(self.life_dic['招标公告'].replace(';', '|'),
- text[:])).group(0)
- else:
- return '', '未预测到关键词, 返回空'
- not_extract_dic = {
- 104: '招标文件',
- 106: '法律法规',
- 107: '新闻资讯',
- 108: '拟建项目',
- 109: '展会推广',
- 110: '企业名录',
- 111: '企业资质',
- 112: '全国工程人员',
- 113: '业主采购'
- }
- origin_dic = {51: '公告变更',
- 52: '招标公告',
- 101: '中标信息',
- 102: '招标预告',
- 103: '招标答疑',
- 104: '招标文件',
- 105: '资审结果',
- 106: '法律法规',
- 107: '新闻资讯',
- 108: '拟建项目',
- 109: '展会推广',
- 110: '企业名录',
- 111: '企业资质',
- 112: '全国工程',
- 113: '业主采购',
- 114: '采购意向',
- 115: '拍卖出让',
- 116: '土地矿产',
- 117: '产权交易',
- 118: '废标公告',
- 119: '候选人公示',
- 120: '合同公告'}
- if original_docchannel in not_extract_dic:
- return {'docchannel': {'docchannel': '', 'doctype': not_extract_dic[original_docchannel]}}
- if web_source_no in ['02104-7', '04733']: # 这些数据源无法识别
- return {'docchannel': {'docchannel': '', 'doctype': '采招数据'}}
- title = re.sub('[^\u4e00-\u9fa5]', '', title)
- if len(title) > 50:
- title = title[:20] + title[-30:]
- text = html2text(html)
- prem_json = json.dumps(prem, ensure_ascii=False)
- result = {'docchannel': {'docchannel': '', 'doctype': ''}}
- doc_type, type_kw = get_type(title, text)
- doc_life, life_kw = get_life(title, text, prem_json, bidway, original_docchannel)
- if doc_type in self.title_type_dic:
- result['docchannel']['doctype'] = doc_type
- if doc_life in self.title_life_dic:
- result['docchannel']['docchannel'] = doc_life
- if doc_type=="" or doc_life=="":
- list_sentence = sorted(list_sentence, key=lambda x:x.sentence_index)
- token_l = [it.tokens for it in list_sentence]
- tokens = [it for l in token_l for it in l]
- content = ' '.join(tokens[:500])
- data_content, data_title = self.predict_process(docid='', doctitle=title[-50:],
- dochtmlcon=content) # 标题最多取50字
- text_len = len(data_content[0]) if len(data_content[0]) < self.sequen_len else self.sequen_len
- title_len = len(data_title[0]) if len(data_title[0]) < self.title_len else self.title_len
- array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
- array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
- if doc_type == "":
- pred = self.type_sess.run(self.type_softmax,
- feed_dict={
- self.type_title: array_title,
- self.type_content: array_content,
- self.type_mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
- self.type_mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
- self.type_prob: 1}
- )
- id = np.argmax(pred, axis=1)[0]
- prob = pred[0][id]
- result['docchannel']['doctype'] = self.id2type[id]
- # print('公告类别:', self.id2type[id], '概率:',prob)
- # if id == 0:
- if doc_life=="" and result['docchannel']['doctype'] not in ['', '新闻资讯']:
- if len(text)>150 and re.search(self.kws, content):
- pred = self.lift_sess.run(self.lift_softmax,
- feed_dict={
- self.lift_title: array_title,
- self.lift_content: array_content,
- self.mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
- self.mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
- self.lift_prob: 1}
- )
- id = np.argmax(pred, axis=1)[0]
- prob = pred[0][id]
- if self.id2life[id] == '中标信息' and original_docchannel in [52, '52', '招标公告'] and not is_contain_winner(prem_json):
- result['docchannel']['docchannel'] = '招标公告'
- elif self.id2life[id] == '采购意向' and re.search('意向品牌|意向单位', text):
- result['docchannel']['docchannel'] = '招标公告'
- else:
- result['docchannel']['docchannel'] = self.id2life[id]
- # print('生命周期:',self.id2life[id], '概率:',prob)
- # if id == 6:
- if result['docchannel']['docchannel'] == '中标信息':
- if self.is_houxuan(''.join([it for it in title if it.isalpha()]),
- ''.join([it for it in content if it.isalpha()])):
- result['docchannel']['docchannel'] = '候选人公示'
- # return '候选人公示', prob
- # return [{'docchannel': '候选人公示'}]
- # print('公告类型:%s, 生命周期:%s, 关键词:%s '%(doc_type, doc_life, life_kw))
- # print('result: ', result)
- if result['docchannel']['docchannel'] == '废标公告' and is_contain_winner(prem_json) and re.search(self.title_life_dic['废标公告'], title)==None:
- result['docchannel']['docchannel'] = '中标信息'
- if result['docchannel']['docchannel'] != '': # 预测到生命周期的复制到life_docchannel,否则用数据源结果
- result['docchannel']['life_docchannel'] = result['docchannel']['docchannel']
- else:
- result['docchannel']['life_docchannel'] = origin_dic.get(original_docchannel, '原始类别')
- return result
# Deposit (bid bond) payment-method extraction
class DepositPaymentWay():
    """Extract the payment method(s) of a deposit from announcement text.

    Attributes:
        pt: primary regex; its 3rd group captures up to 60 characters that
            follow a "deposit ... payment method" phrase.
        pt2: fallback regex; its 2nd group captures up to 8 characters that sit
            between "deposit must be paid by/through" and the word "方式".
        kws: payment-method keyword patterns, longest first, so that the
            joined alternation prefers the longer pattern when several could
            match at the same position.
    """

    def __init__(self,):
        self.pt = '(保证金的?(交纳|缴纳|应按下列|入账|支付)方式)[::]*([^,。]{,60})'
        self.pt2 = '保证金(必?须以|必?须?通过|以)(.{,8})方式'
        kws = ['银行转账', '公?对公方?式?转账', '对公转账', '柜台转账', '(线上|网上)自?行?(缴纳|交纳|缴退|收退)',
               '网上银行支付', '现金存入', '直接缴纳', '支票', '汇票', '本票', '电汇', '转账', '汇款', '随机码',
               '入账', '基本账户转出', '基本账户汇入', '诚信库中登记的账户转出',
               '银行保函', '电子保函', '担保函', '保证保险', '合法担保机构出具的担保', '金融机构、担保机构出具的保函']
        self.kws = sorted(kws, key=lambda x: len(x), reverse=True)

    def predict(self,content):
        """Return ``{'deposit_patment_way': '<kw1>;<kw2>...'}`` for ``content``.

        The first of ``pt`` / ``pt2`` that matches decides the result: every
        keyword found in its captured span is joined with ';'. If that span
        contains no keyword the value stays '' and the other pattern is not
        tried. Empty or ``None`` content yields the empty result instead of
        raising.

        The misspelt key 'deposit_patment_way' is kept on purpose: it is the
        key callers already read.
        """
        pay_way = {'deposit_patment_way':''}
        if not content:
            return pay_way
        kw_pattern = '|'.join(self.kws)
        # (pattern, index of the group holding the text to scan for keywords)
        for pattern, group_idx in ((self.pt, 3), (self.pt2, 2)):
            hit = re.search(pattern, content)
            if hit:
                found = [m.group(0) for m in re.finditer(kw_pattern, hit.group(group_idx))]
                pay_way['deposit_patment_way'] = ';'.join(found)
                return pay_way
        return pay_way
# Total-price / unit-price extraction
class TotalUnitMoney:
    """Flag money entities as a total price or a unit price."""

    def __init__(self):
        pass

    def predict(self, list_sentences, list_entitys):
        """Set ``is_total_money`` / ``is_unit_money`` on money entities in place.

        ``list_entitys[i]`` holds the entities of document ``i`` and
        ``list_sentences[i]`` its sentences, indexed by the entity's
        ``sentence_index``. Entities whose ``entity_type`` is not 'money' are
        left untouched. Nothing is returned.
        """
        for doc_idx, doc_entities in enumerate(list_entitys):
            for _entity in doc_entities:
                if _entity.entity_type != 'money':
                    continue
                sentence_text = list_sentences[doc_idx][_entity.sentence_index].sentence_text
                if _entity.label == 1:
                    # label 1: per the original note, total prices are looked
                    # for among the bid / winning amounts
                    is_total = extract_total_money(sentence_text,
                                                   _entity.entity_text,
                                                   [_entity.wordOffset_begin, _entity.wordOffset_end])
                    if is_total:
                        _entity.is_total_money = 1
                else:
                    # any other label: per the original note, unit prices are
                    # looked for among ordinary amounts
                    is_unit = extract_unit_money(sentence_text,
                                                 _entity.entity_text,
                                                 [_entity.wordOffset_begin, _entity.wordOffset_end])
                    if is_unit:
                        _entity.is_unit_money = 1
def getSavedModel():
    """Load the form-item Keras HDF5 model and export it to ./h5_savedmodel/.

    The model is loaded inside a fresh graph and saved from the Keras backend
    session, with its input exposed as "image" and its output as "scores".
    """
    export_graph = tf.Graph()
    with export_graph.as_default():
        custom_metrics = {"precision":precision,"recall":recall,"f1_score":f1_score}
        model = tf.keras.models.load_model("../form/model/model_form.model_item.hdf5",
                                           custom_objects=custom_metrics)
        keras_session = tf.keras.backend.get_session()
        tf.saved_model.simple_save(
            keras_session,
            "./h5_savedmodel/",
            inputs={"image": model.input},
            outputs={"scores": model.output},
        )
-
def getBiLSTMCRFModel(MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
    """Build and compile an Embedding -> BiLSTM -> Dense -> CRF tagging model.

    Args:
        MAX_LEN: accepted but not read by this function.
        vocab: its length is the embedding vocabulary size.
        EMBED_DIM: embedding width.
        BiRNN_UNITS: total BiLSTM width; each direction gets half.
        chunk_tags: its length is the number of output tags.
        weights: optional initial embedding matrix; when given, the embedding
            is created with these weights and stays trainable.

    Returns:
        The compiled model (adam optimizer, CRF loss and CRF accuracy).
    """
    tag_count = len(chunk_tags)
    token_ids = layers.Input(shape=(None,),dtype="int32")

    embedding_options = {"mask_zero": True}
    if weights is not None:
        embedding_options.update(weights=[weights], trainable=True)
    embedded = layers.embeddings.Embedding(len(vocab), EMBED_DIM, **embedding_options)(token_ids)

    encoded = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2,return_sequences=True))(embedded)
    tag_scores = layers.TimeDistributed(layers.Dense(tag_count))(encoded)
    crf = CRF(tag_count,sparse_target=True)
    decoded = crf(tag_scores)

    model = models.Model(input=[token_ids],output=[decoded])
    model.summary()
    model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])
    return model
- import h5py
def h5_to_graph(sess,graph,h5file):
    """Assign every dataset under ``model_weights`` of an HDF5 file to the
    graph tensor of the same name.

    Args:
        sess: session used to run the assign ops.
        graph: graph whose tensors are looked up with ``get_tensor_by_name``.
        h5file: path of the HDF5 weight file.

    Each dataset's name is its slash-joined group path below ``model_weights``.
    A key equal to the path built so far is not appended again (presumably to
    collapse Keras' ``layer/layer/weight`` layout; confirm against the files
    in use). The name must match a tensor name in ``graph`` exactly, otherwise
    ``get_tensor_by_name`` raises.

    Changes from the previous version: the file is closed once read,
    ``Dataset.value`` (removed in h5py 3.0) is replaced by ``[()]``, groups
    are detected with ``isinstance``, and the unused ``getValue`` /
    ``layer_names`` code is dropped.
    """

    def readGroup(gr,parent_name,data):
        # Depth-first walk collecting [tensor_name, ndarray] pairs into data.
        for subkey in gr:
            print(subkey)
            if parent_name!=subkey:
                if parent_name=="":
                    _name = subkey
                else:
                    _name = parent_name+"/"+subkey
            else:
                _name = parent_name
            item = gr[subkey]
            if isinstance(item, h5py.Group):
                readGroup(item,_name,data)
            else:
                # [()] reads the whole dataset into memory, so the values
                # remain usable after the file is closed.
                data.append([_name,item[()]])
                print(_name,item.shape)

    list_name_value = []
    # open the h5 file read-only; the context manager closes it afterwards
    with h5py.File(h5file,'r') as f:
        readGroup(f["model_weights"], "", list_name_value)

    for name, value in list_name_value:
        target = graph.get_tensor_by_name(name)
        print(name,target,np.shape(value))
        sess.run(tf.assign(target,value))
def initialize_uninitialized(sess):
    """Initialise the uninitialised global variables whose name contains "Adam".

    Other uninitialised variables are left alone. The selected variable names
    are printed before the initializer runs.
    """
    all_vars = tf.global_variables()
    # One boolean per variable: True when it is already initialised.
    init_flags = sess.run([tf.is_variable_initialized(var) for var in all_vars])

    adam_vars = [
        var
        for var, already_set in zip(all_vars, init_flags)
        if not already_set and re.search("Adam", var.name) is not None
    ]

    print([str(var.name) for var in adam_vars])  # only for testing
    if adam_vars:
        sess.run(tf.variables_initializer(adam_vars))
-
-
def save_codename_model():
    """Rebuild the BiLSTM-CRF code/name model, restore its checkpoint and
    export it to ./codename_savedmodel_tf/.

    Exposed inputs: "inputs", "inputs_length", "keepprob".
    Exposed outputs: "logits", "trans".
    """
    filepath = "../projectCode/models_tf/59-L0.471516189943-F0.8802154826344823-P0.8789179683459191-R0.8815168335321886/model.ckpt"
    # The two paths below are not read anywhere in this function.
    vocabpath = "../projectCode/models/vocab.pk"
    classlabelspath = "../projectCode/models/classlabels.pk"

    w2v_matrix = load('codename_w2v_matrix.pk')
    default_graph = tf.get_default_graph()
    with default_graph.as_default() as g:
        sess = tf.Session(graph=g)
        built = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
        # Only five of the eight returned tensors are needed for the export.
        char_input, logits, _target, keepprob, length, _crf_loss, trans, _train_op = built

        sess.run(tf.global_variables_initializer())
        restorer = tf.train.Saver()
        restorer.restore(sess, filepath)

        export_inputs = {"inputs": char_input,
                         "inputs_length":length,
                         'keepprob':keepprob}
        export_outputs = {"logits": logits,
                          "trans":trans}
        tf.saved_model.simple_save(
            sess,
            "./codename_savedmodel_tf/",
            inputs=export_inputs,
            outputs=export_outputs,
        )
-
-
def save_role_model():
    '''
    @summary: save the role model as a SavedModel so that it can be deployed
              to and called on the PAI platform
    '''
    model_role = PREMPredict().model_role
    role_graph = model_role.graph
    with role_graph.as_default():
        model = model_role.getModel()
        sess = tf.Session(graph=role_graph)
        print(type(model.input))

        # Initialise everything first, then overwrite with the stored weights.
        sess.run(tf.global_variables_initializer())
        h5_to_graph(sess, role_graph, model_role.model_role_file)
        model = model_role.getModel()

        export_inputs = {key: model.input[idx]
                         for idx, key in enumerate(("input0", "input1", "input2"))}
        tf.saved_model.simple_save(sess,
                                   "./role_savedmodel/",
                                   inputs=export_inputs,
                                   outputs={"outputs":model.output}
                                   )
def save_money_model():
    """Load the money Keras model (.h5) and export it to ./money_savedmodel2/.

    The model is loaded while a dedicated session is the default session; that
    same session is then handed to ``simple_save``. The model summary and its
    weights are printed before the export.
    """
    model_file = os.path.dirname(__file__)+"/../money/models/model_money_word.h5"
    export_graph = tf.Graph()
    with export_graph.as_default():
        sess = tf.Session(graph=export_graph)
        with sess.as_default():
            metric_fns = {'precision':precision,'recall':recall,'f1_score':f1_score}
            model = models.load_model(model_file,custom_objects=metric_fns)
            model.summary()
            print(model.weights)
            export_inputs = {key: model.input[idx]
                             for idx, key in enumerate(("input0", "input1", "input2"))}
            tf.saved_model.simple_save(sess,
                                       "./money_savedmodel2/",
                                       inputs = export_inputs,
                                       outputs = {"outputs":model.output}
                                       )
-
def save_person_model():
    """Load the person model weights, print a sample prediction and export
    the model to ./person_savedmodel/.

    The sample data from person_x.pk is transposed with axes (1, 0, 2, 3) so
    that its first axis selects which of the two model inputs it feeds.
    """
    model_person = EPCPredict().model_person
    person_graph = model_person.graph
    with person_graph.as_default():
        sample = load("person_x.pk")
        _data = np.transpose(np.array(sample),(1,0,2,3))
        model = model_person.getModel()

        sess = tf.Session(graph=person_graph)
        with sess.as_default():
            sess.run(tf.global_variables_initializer())
            model_person.load_weights()

            feed = {model.input[0]:_data[0],model.input[1]:_data[1]}
            predict_y = sess.run(model.output,feed_dict=feed)
            print(np.argmax(predict_y,-1))

            export_inputs = {"input0":model.input[0],
                             "input1":model.input[1]}
            tf.saved_model.simple_save(sess,
                                       "./person_savedmodel/",
                                       inputs=export_inputs,
                                       outputs = {"outputs":model.output})
-
def save_form_model():
    """Build the "item" form model, copy its HDF5 weights into the graph and
    export it to ./form_savedmodel/.
    """
    model_form = FormPredictor()
    form_graph = model_form.graph
    with form_graph.as_default():
        model = model_form.getModel("item")
        sess = tf.Session(graph=form_graph)
        # Initialise everything first, then overwrite with the stored weights.
        sess.run(tf.global_variables_initializer())
        h5_to_graph(sess, form_graph, model_form.model_file_item)
        export_inputs = {"inputs":model.input}
        export_outputs = {"outputs":model.output}
        tf.saved_model.simple_save(sess,
                                   "./form_savedmodel/",
                                   inputs=export_inputs,
                                   outputs = export_outputs)
-
def save_codesplit_model():
    """Load the project-code Keras model, copy its HDF5 weights into a fresh
    graph and export it to ./codesplit_savedmodel/.
    """
    filepath_code = "../projectCode/models/model_code.hdf5"

    export_graph = tf.Graph()
    with export_graph.as_default():
        metric_fns = {'precision':precision,'recall':recall,'f1_score':f1_score}
        model_code = models.load_model(filepath_code, custom_objects=metric_fns)
        sess = tf.Session()
        # Initialise everything first, then overwrite with the stored weights.
        sess.run(tf.global_variables_initializer())
        h5_to_graph(sess, export_graph, filepath_code)
        export_inputs = {key: model_code.input[idx]
                         for idx, key in enumerate(("input0", "input1", "input2"))}
        tf.saved_model.simple_save(sess,
                                   "./codesplit_savedmodel/",
                                   inputs=export_inputs,
                                   outputs={"outputs":model_code.output})
def save_timesplit_model():
    """Load the time-classification Keras model, copy its HDF5 weights into a
    fresh graph and export it to ./timesplit_model/.
    """
    filepath = '../time/model_label_time_classify.model.hdf5'
    metric_fns = {'precision': precision, 'recall': recall, 'f1_score': f1_score}
    with tf.Graph().as_default() as export_graph:
        time_model = models.load_model(filepath, custom_objects=metric_fns)
        with tf.Session() as sess:
            # Initialise everything first, then overwrite with the stored weights.
            sess.run(tf.global_variables_initializer())
            h5_to_graph(sess, export_graph, filepath)
            export_inputs = {"input0":time_model.input[0],
                             "input1":time_model.input[1]}
            tf.saved_model.simple_save(sess,
                                       "./timesplit_model/",
                                       inputs=export_inputs,
                                       outputs={"outputs":time_model.output})
# Script entry point. Every export call below is commented out, so running this
# module directly currently does nothing; uncomment the export that is needed.
if __name__=="__main__":
    #save_role_model()
    # save_codename_model()
    # save_money_model()
    #save_person_model()
    #save_form_model()
    #save_codesplit_model()
    # save_timesplit_model()
    # The string literal below is never used. It holds a disabled snippet that
    # loads "./person_savedModel", feeds it person_x.pk and prints the argmax
    # of the outputs.
    '''
    # with tf.Session(graph=tf.Graph()) as sess:
    # from tensorflow.python.saved_model import tag_constants
    # meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], "./person_savedModel")
    # graph = tf.get_default_graph()
    # signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
    # signature = meta_graph_def.signature_def
    # input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name)
    # input1 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input1"].name)
    # outputs = sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name)
    # x = load("person_x.pk")
    # _data = np.transpose(x,[1,0,2,3])
    # y = sess.run(outputs,feed_dict={input0:_data[0],input1:_data[1]})
    # print(np.argmax(y,-1))
    '''
|