predictor.py 196 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
7327832793280328132823283328432853286328732883289329032913292329332943295329632973298329933003301330233033304330533063307330833093310331133123313331433153316331733183319332033213322332333243325332633273328332933303331333233333334333533363337333833393340334133423343334433453346334733483349335033513352335333543355335633573358335933603361336233633364336533663367336833693370337133723373337433753376337733783379338033813382338333843385338633873388338933903391339233933394339533963397339833993400340134023403340434053406340734083409341034113412341334143415341634173418341934203421342234233424342534263427342834293430343134323433343434353436343734383439344034413442
  1. '''
  2. Created on 2018年12月26日
  3. @author: User
  4. '''
  5. import os
  6. import sys
  7. from BiddingKG.dl.common.nerUtils import *
  8. sys.path.append(os.path.abspath("../.."))
  9. # from keras.engine import topology
  10. # from keras import models
  11. # from keras import layers
  12. # from keras_contrib.layers.crf import CRF
  13. # from keras.preprocessing.sequence import pad_sequences
  14. # from keras import optimizers,losses,metrics
  15. from BiddingKG.dl.common.Utils import *
  16. from BiddingKG.dl.interface.modelFactory import *
  17. import tensorflow as tf
  18. from BiddingKG.dl.product.data_util import decode, process_data
  19. from BiddingKG.dl.interface.Entitys import Entity
  20. from BiddingKG.dl.complaint.punish_predictor import Punish_Extract
  21. from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
  22. from bs4 import BeautifulSoup
  23. import copy
  24. import calendar
  25. import datetime
  26. from threading import RLock
# Global registry of lazily-constructed predictor singletons.
# Each entry holds the cached instance ("predictor", filled in on first use)
# and an RLock guarding its one-time construction.
dict_predictor = {"codeName": {"predictor": None, "Lock": RLock()},
                  "prem": {"predictor": None, "Lock": RLock()},
                  "epc": {"predictor": None, "Lock": RLock()},
                  "roleRule": {"predictor": None, "Lock": RLock()},
                  "roleRuleFinal": {"predictor": None, "Lock": RLock()},
                  "tendereeRuleRecall": {"predictor": None, "Lock": RLock()},
                  "form": {"predictor": None, "Lock": RLock()},
                  "time": {"predictor": None, "Lock": RLock()},
                  "punish": {"predictor": None, "Lock": RLock()},
                  "product": {"predictor": None, "Lock": RLock()},
                  "product_attrs": {"predictor": None, "Lock": RLock()},
                  "channel": {"predictor": None, "Lock": RLock()},
                  "deposit_payment_way": {"predictor": None, "Lock": RLock()},
                  "total_unit_money": {"predictor": None, "Lock": RLock()}
                  }
  42. def getPredictor(_type):
  43. if _type in dict_predictor:
  44. with dict_predictor[_type]["Lock"]:
  45. if dict_predictor[_type]["predictor"] is None:
  46. if _type == "codeName":
  47. dict_predictor[_type]["predictor"] = CodeNamePredict()
  48. if _type == "prem":
  49. dict_predictor[_type]["predictor"] = PREMPredict()
  50. if _type == "epc":
  51. dict_predictor[_type]["predictor"] = EPCPredict()
  52. if _type == "roleRule":
  53. dict_predictor[_type]["predictor"] = RoleRulePredictor()
  54. if _type == "roleRuleFinal":
  55. dict_predictor[_type]["predictor"] = RoleRuleFinalAdd()
  56. if _type == "tendereeRuleRecall":
  57. dict_predictor[_type]["predictor"] = TendereeRuleRecall()
  58. if _type == "form":
  59. dict_predictor[_type]["predictor"] = FormPredictor()
  60. if _type == "time":
  61. dict_predictor[_type]["predictor"] = TimePredictor()
  62. if _type == "punish":
  63. dict_predictor[_type]["predictor"] = Punish_Extract()
  64. if _type == "product":
  65. dict_predictor[_type]["predictor"] = ProductPredictor()
  66. if _type == "product_attrs":
  67. dict_predictor[_type]["predictor"] = ProductAttributesPredictor()
  68. if _type == "channel":
  69. dict_predictor[_type]["predictor"] = DocChannel()
  70. if _type == 'deposit_payment_way':
  71. dict_predictor[_type]["predictor"] = DepositPaymentWay()
  72. if _type == 'total_unit_money':
  73. dict_predictor[_type]["predictor"] = TotalUnitMoney()
  74. return dict_predictor[_type]["predictor"]
  75. raise NameError("no this type of predictor")
  76. # 编号名称模型
  77. class CodeNamePredict():
    def __init__(self, EMBED_DIM=None, BiRNN_UNITS=None, lazyLoad=getLazyLoad()):
        """Set up vocab/label resources and TF sessions for code/name extraction.

        :param EMBED_DIM: embedding dimension of the legacy keras checkpoint
            (default 60); only used here to build the .hdf5 file path.
        :param BiRNN_UNITS: BiLSTM units of the legacy checkpoint (default 200);
            likewise only used for the file path.
        :param lazyLoad: when False, both SavedModels are loaded immediately.
            NOTE(review): this default is evaluated once at class-definition
            time, so getLazyLoad()'s value is frozen then — confirm intended.
        """
        self.model = None
        self.MAX_LEN = None
        self.model_code = None
        if EMBED_DIM is None:
            self.EMBED_DIM = 60
        else:
            self.EMBED_DIM = EMBED_DIM
        if BiRNN_UNITS is None:
            self.BiRNN_UNITS = 200
        else:
            self.BiRNN_UNITS = BiRNN_UNITS
        # Paths of the legacy keras checkpoints (kept for reference; the TF
        # SavedModels loaded in getModel/getModel_code are what is used).
        self.filepath = os.path.dirname(__file__)+"/../projectCode/models/model_project_"+str(self.EMBED_DIM)+"_"+str(self.BiRNN_UNITS)+".hdf5"
        # self.filepath = "../projectCode/models/model_project_60_200_200ep017-loss6.456-val_loss7.852-val_acc0.969.hdf5"
        self.filepath_code = os.path.dirname(__file__)+"/../projectCode/models/model_code.hdf5"
        vocabpath = os.path.dirname(__file__)+"/codename_vocab.pk"
        classlabelspath = os.path.dirname(__file__)+"/codename_classlabels.pk"
        self.vocab = load(vocabpath)
        self.class_labels = load(classlabelspath)
        # Build regexes matching B(M*)E label-id runs for project codes (PC_*)
        # and project names (PN_*) in the decoded tag string.
        # NOTE(review): this assumes every label index is a single digit, i.e.
        # len(class_labels) <= 10 — confirm against codename_classlabels.pk.
        id_PC_B = self.class_labels.index("PC_B")
        id_PC_M = self.class_labels.index("PC_M")
        id_PC_E = self.class_labels.index("PC_E")
        id_PN_B = self.class_labels.index("PN_B")
        id_PN_M = self.class_labels.index("PN_M")
        id_PN_E = self.class_labels.index("PN_E")
        self.PC_pattern = re.compile(str(id_PC_B)+str(id_PC_M)+"*"+str(id_PC_E))
        self.PN_pattern = re.compile(str(id_PN_B)+str(id_PN_M)+"*"+str(id_PN_E))
        # print("pc",self.PC_pattern)
        # print("pn",self.PN_pattern)
        self.word2index = dict((w,i) for i,w in enumerate(np.array(self.vocab)))
        self.inputs = None
        self.outputs = None
        # One isolated graph+session per SavedModel so the two models never
        # collide in a default graph.
        self.sess_codename = tf.Session(graph=tf.Graph())
        self.sess_codesplit = tf.Session(graph=tf.Graph())
        self.inputs_code = None
        self.outputs_code = None
        if not lazyLoad:
            self.getModel()
            self.getModel_code()
  118. def getModel(self):
  119. '''
  120. @summary: 取得编号和名称模型
  121. '''
  122. if self.inputs is None:
  123. log("get model of codename")
  124. with self.sess_codename.as_default():
  125. with self.sess_codename.graph.as_default():
  126. meta_graph_def = tf.saved_model.loader.load(self.sess_codename, ["serve"], export_dir=os.path.dirname(__file__)+"/codename_savedmodel_tf")
  127. signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
  128. signature_def = meta_graph_def.signature_def
  129. self.inputs = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs"].name)
  130. self.inputs_length = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs_length"].name)
  131. self.keepprob = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["keepprob"].name)
  132. self.logits = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["logits"].name)
  133. self.trans = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["trans"].name)
  134. return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans
  135. else:
  136. return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans
  137. '''
  138. if self.model is None:
  139. self.model = self.getBiLSTMCRFModel(self.MAX_LEN, self.vocab, self.EMBED_DIM, self.BiRNN_UNITS, self.class_labels,weights=None)
  140. self.model.load_weights(self.filepath)
  141. return self.model
  142. '''
  143. def getModel_code(self):
  144. if self.inputs_code is None:
  145. log("get model of code")
  146. with self.sess_codesplit.as_default():
  147. with self.sess_codesplit.graph.as_default():
  148. meta_graph_def = tf.saved_model.loader.load(self.sess_codesplit, ["serve"], export_dir=os.path.dirname(__file__)+"/codesplit_savedmodel")
  149. signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
  150. signature_def = meta_graph_def.signature_def
  151. self.inputs_code = []
  152. self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name))
  153. self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name))
  154. self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name))
  155. self.outputs_code = self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
  156. self.sess_codesplit.graph.finalize()
  157. return self.inputs_code,self.outputs_code
  158. else:
  159. return self.inputs_code,self.outputs_code
  160. '''
  161. if self.model_code is None:
  162. log("get model of model_code")
  163. with self.sess_codesplit.as_default():
  164. with self.sess_codesplit.graph.as_default():
  165. self.model_code = models.load_model(self.filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
  166. return self.model_code
  167. '''
    def getBiLSTMCRFModel(self, MAX_LEN, vocab, EMBED_DIM, BiRNN_UNITS, chunk_tags, weights):
        # Legacy keras BiLSTM-CRF model builder.
        # NOTE(review): `layers`, `models` and `CRF` come from the keras
        # imports that are commented out at the top of this file, so calling
        # this method as-is raises NameError — it appears to be kept only for
        # reference/retraining. Confirm before relying on it.
        '''
        model = models.Sequential()
        model.add(layers.Embedding(len(vocab), EMBED_DIM, mask_zero=True)) # Random embedding
        model.add(layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True)))
        crf = CRF(len(chunk_tags), sparse_target=True)
        model.add(crf)
        model.summary()
        model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
        return model
        '''
        input = layers.Input(shape=(None,))
        # Use pretrained embedding weights when supplied, otherwise train from
        # a random initialization.
        if weights is not None:
            embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True,weights=[weights],trainable=True)(input)
        else:
            embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True)(input)
        bilstm = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2,return_sequences=True))(embedding)
        bilstm_dense = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(bilstm)
        crf = CRF(len(chunk_tags),sparse_target=True)
        crf_out = crf(bilstm_dense)
        model = models.Model(input=[input],output = [crf_out])
        model.summary()
        model.compile(optimizer = 'adam', loss = crf.loss_function, metrics = [crf.accuracy])
        return model
  192. #根据规则补全编号或名称两边的符号
  193. def fitDataByRule(self,data):
  194. symbol_dict = {"(":")",
  195. "(":")",
  196. "[":"]",
  197. "【":"】",
  198. ")":"(",
  199. ")":"(",
  200. "]":"[",
  201. "】":"【"}
  202. leftSymbol_pattern = re.compile("[\((\[【]")
  203. rightSymbol_pattern = re.compile("[\))\]】]")
  204. leftfinds = re.findall(leftSymbol_pattern,data)
  205. rightfinds = re.findall(rightSymbol_pattern,data)
  206. result = data
  207. if len(leftfinds)+len(rightfinds)==0:
  208. return data
  209. elif len(leftfinds)==len(rightfinds):
  210. return data
  211. elif abs(len(leftfinds)-len(rightfinds))==1:
  212. if len(leftfinds)>len(rightfinds):
  213. if symbol_dict.get(data[0]) is not None:
  214. result = data[1:]
  215. else:
  216. #print(symbol_dict.get(leftfinds[0]))
  217. result = data+symbol_dict.get(leftfinds[0])
  218. else:
  219. if symbol_dict.get(data[-1]) is not None:
  220. result = data[:-1]
  221. else:
  222. result = symbol_dict.get(rightfinds[0])+data
  223. return result
  224. def decode(self,logits, trans, sequence_lengths, tag_num):
  225. viterbi_sequences = []
  226. for logit, length in zip(logits, sequence_lengths):
  227. score = logit[:length]
  228. viterbi_seq, viterbi_score = viterbi_decode(score, trans)
  229. viterbi_sequences.append(viterbi_seq)
  230. return viterbi_sequences
  231. def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000):
  232. #@summary: 获取每篇文章的code和name
  233. pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
  234. result = []
  235. index_unk = self.word2index.get("<unk>")
  236. # index_pad = self.word2index.get("<pad>")
  237. if list_entitys is None:
  238. list_entitys = [[] for _ in range(len(list_sentences))]
  239. for list_sentence,list_entity in zip(list_sentences,list_entitys):
  240. if len(list_sentence)==0:
  241. result.append([{"code":[],"name":""}])
  242. continue
  243. doc_id = list_sentence[0].doc_id
  244. # sentences = []
  245. # for sentence in list_sentence:
  246. # if len(sentence.sentence_text)>MAX_AREA:
  247. # for _sentence_comma in re.split("[;;,\n]",sentence):
  248. # _comma_index = 0
  249. # while(_comma_index<len(_sentence_comma)):
  250. # sentences.append(_sentence_comma[_comma_index:_comma_index+MAX_AREA])
  251. # _comma_index += MAX_AREA
  252. # else:
  253. # sentences.append(sentence+"。")
  254. list_sentence.sort(key=lambda x:len(x.sentence_text),reverse=True)
  255. _begin_index = 0
  256. item = {"code":[],"name":""}
  257. code_set = set()
  258. dict_name_freq_score = dict()
  259. while(True):
  260. MAX_LEN = len(list_sentence[_begin_index].sentence_text)
  261. if MAX_LEN>MAX_AREA:
  262. MAX_LEN = MAX_AREA
  263. _LEN = MAX_AREA//MAX_LEN
  264. #预测
  265. x = [[self.word2index.get(word,index_unk)for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
  266. # x = [[getIndexOfWord(word) for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
  267. x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x]
  268. x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
  269. if USE_PAI_EAS:
  270. request = tf_predict_pb2.PredictRequest()
  271. request.inputs["inputs"].dtype = tf_predict_pb2.DT_INT32
  272. request.inputs["inputs"].array_shape.dim.extend(np.shape(x))
  273. request.inputs["inputs"].int_val.extend(np.array(x,dtype=np.int32).reshape(-1))
  274. request_data = request.SerializeToString()
  275. list_outputs = ["outputs"]
  276. _result = vpc_requests(codename_url, codename_authorization, request_data, list_outputs)
  277. if _result is not None:
  278. predict_y = _result["outputs"]
  279. else:
  280. with self.sess_codename.as_default():
  281. t_input,t_output = self.getModel()
  282. predict_y = self.sess_codename.run(t_output,feed_dict={t_input:x})
  283. else:
  284. with self.sess_codename.as_default():
  285. t_input,t_input_length,t_keepprob,t_logits,t_trans = self.getModel()
  286. _logits,_trans = self.sess_codename.run([t_logits,t_trans],feed_dict={t_input:x,
  287. t_input_length:x_len,
  288. t_keepprob:1.0})
  289. predict_y = self.decode(_logits,_trans,x_len,7)
  290. # print('==========',_logits)
  291. '''
  292. for item11 in np.argmax(predict_y,-1):
  293. print(item11)
  294. print(predict_y)
  295. '''
  296. # print(predict_y)
  297. for sentence,predict in zip(list_sentence[_begin_index:_begin_index+_LEN],np.array(predict_y)):
  298. pad_sentence = sentence.sentence_text[:MAX_LEN]
  299. join_predict = "".join([str(s) for s in predict])
  300. # print(pad_sentence)
  301. # print(join_predict)
  302. code_x = []
  303. code_text = []
  304. temp_entitys = []
  305. for iter in re.finditer(self.PC_pattern,join_predict):
  306. get_len = 40
  307. if iter.span()[0]<get_len:
  308. begin = 0
  309. else:
  310. begin = iter.span()[0]-get_len
  311. end = iter.span()[1]+get_len
  312. code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
  313. code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""))
  314. _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),entity_type="code",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
  315. temp_entitys.append(_entity)
  316. #print("code",code_text)
  317. if len(code_x)>0:
  318. code_x = np.transpose(np.array(code_x,dtype=np.float32),(1,0,2,3))
  319. if USE_PAI_EAS:
  320. request = tf_predict_pb2.PredictRequest()
  321. request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
  322. request.inputs["input0"].array_shape.dim.extend(np.shape(code_x[0]))
  323. request.inputs["input0"].float_val.extend(np.array(code_x[0],dtype=np.float64).reshape(-1))
  324. request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
  325. request.inputs["input1"].array_shape.dim.extend(np.shape(code_x[1]))
  326. request.inputs["input1"].float_val.extend(np.array(code_x[1],dtype=np.float64).reshape(-1))
  327. request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
  328. request.inputs["input2"].array_shape.dim.extend(np.shape(code_x[2]))
  329. request.inputs["input2"].float_val.extend(np.array(code_x[2],dtype=np.float64).reshape(-1))
  330. request_data = request.SerializeToString()
  331. list_outputs = ["outputs"]
  332. _result = vpc_requests(codeclasses_url, codeclasses_authorization, request_data, list_outputs)
  333. if _result is not None:
  334. predict_code = _result["outputs"]
  335. else:
  336. with self.sess_codesplit.as_default():
  337. with self.sess_codesplit.graph.as_default():
  338. predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
  339. else:
  340. with self.sess_codesplit.as_default():
  341. with self.sess_codesplit.graph.as_default():
  342. inputs_code,outputs_code = self.getModel_code()
  343. predict_code = limitRun(self.sess_codesplit,[outputs_code],feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]})[0]
  344. #predict_code = self.sess_codesplit.run(outputs_code,feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]})
  345. #predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
  346. for h in range(len(predict_code)):
  347. if predict_code[h][0]>0.5:
  348. the_code = self.fitDataByRule(code_text[h])
  349. #add code to entitys
  350. list_entity.append(temp_entitys[h])
  351. if the_code not in code_set:
  352. code_set.add(the_code)
  353. item['code'] = list(code_set)
  354. for iter in re.finditer(self.PN_pattern,join_predict):
  355. _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
  356. #add name to entitys
  357. _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
  358. list_entity.append(_entity)
  359. w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
  360. if _name not in dict_name_freq_score:
  361. # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
  362. dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05)*w]
  363. else:
  364. dict_name_freq_score[_name][0] += 1
  365. '''
  366. for iter in re.finditer(self.PN_pattern,join_predict):
  367. print("name-",self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]))
  368. if item[1]['name']=="":
  369. for iter in re.finditer(self.PN_pattern,join_predict):
  370. #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
  371. item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
  372. break
  373. '''
  374. if _begin_index+_LEN>=len(list_sentence):
  375. break
  376. _begin_index += _LEN
  377. list_name_freq_score = []
  378. # 2020/11/23 大网站规则调整
  379. if len(dict_name_freq_score) == 0:
  380. name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
  381. for sentence in list_sentence:
  382. # pad_sentence = sentence.sentence_text
  383. othername = re.search(name_re1, sentence.sentence_text)
  384. if othername != None:
  385. project_name = othername.group(3)
  386. beg = find_index([project_name], sentence.sentence_text)[0]
  387. end = beg + len(project_name)
  388. _name = self.fitDataByRule(sentence.sentence_text[beg:end])
  389. # add name to entitys
  390. _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
  391. sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
  392. entity_type="name", sentence_index=sentence.sentence_index, begin_index=0,
  393. end_index=0, wordOffset_begin=beg, wordOffset_end=end,in_attachment=sentence.in_attachment)
  394. list_entity.append(_entity)
  395. w = 1
  396. if _name not in dict_name_freq_score:
  397. # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
  398. dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w]
  399. else:
  400. dict_name_freq_score[_name][0] += 1
  401. # othername = re.search(name_re1, sentence.sentence_text)
  402. # if othername != None:
  403. # _name = othername.group(3)
  404. # if _name not in dict_name_freq_score:
  405. # dict_name_freq_score[_name] = [1, len(re.findall(pattern_score, _name)) + len(_name) * 0.1]
  406. # else:
  407. # dict_name_freq_score[_name][0] += 1
  408. for _name in dict_name_freq_score.keys():
  409. list_name_freq_score.append([_name,dict_name_freq_score[_name]])
  410. # print(list_name_freq_score)
  411. if len(list_name_freq_score)>0:
  412. list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1],reverse=True)
  413. item['name'] = list_name_freq_score[0][0]
  414. # if list_name_freq_score[0][1][0]>1:
  415. # item[1]['name'] = list_name_freq_score[0][0]
  416. # else:
  417. # list_name_freq_score.sort(key=lambda x:x[1][1],reverse=True)
  418. # item[1]["name"] = list_name_freq_score[0][0]
  419. #下面代码加上去用正则添加某些识别不到的项目编号
  420. if item['code'] == []:
  421. for sentence in list_sentence:
  422. # othercode = re.search('(采购计划编号|询价编号)[\))]?[::]?([\[\]a-zA-Z0-9\-]{5,30})', sentence.sentence_text)
  423. # if othercode != None:
  424. # item[1]['code'].append(othercode.group(2))
  425. # 2020/11/23 大网站规则调整
  426. othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价单|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告)(单号|编号|标号|编码|代码|备案号|号)[::\s]+([^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。]', sentence.sentence_text)
  427. if othercode != None:
  428. item['code'].append(othercode.group(3))
  429. item['code'].sort(key=lambda x:len(x),reverse=True)
  430. result.append(item)
  431. list_sentence.sort(key=lambda x: x.sentence_index,reverse=False)
  432. return result
  433. '''
  434. #当数据量过大时会报错
  435. def predict(self,articles,MAX_LEN = None):
  436. sentences = []
  437. for article in articles:
  438. for sentence in article.content.split("。"):
  439. sentences.append([sentence,article.id])
  440. if MAX_LEN is None:
  441. sent_len = [len(sentence[0]) for sentence in sentences]
  442. MAX_LEN = max(sent_len)
  443. #print(MAX_LEN)
  444. #若为空,则直接返回空
  445. result = []
  446. if MAX_LEN==0:
  447. for article in articles:
  448. result.append([article.id,{"code":[],"name":""}])
  449. return result
  450. index_unk = self.word2index.get("<unk>")
  451. index_pad = self.word2index.get("<pad>")
  452. x = [[self.word2index.get(word,index_unk)for word in sentence[0]]for sentence in sentences]
  453. x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
  454. predict_y = self.getModel().predict(x)
  455. last_doc_id = ""
  456. item = []
  457. for sentence,predict in zip(sentences,np.argmax(predict_y,-1)):
  458. pad_sentence = sentence[0][:MAX_LEN]
  459. doc_id = sentence[1]
  460. join_predict = "".join([str(s) for s in predict])
  461. if doc_id!=last_doc_id:
  462. if last_doc_id!="":
  463. result.append(item)
  464. item = [doc_id,{"code":[],"name":""}]
  465. code_set = set()
  466. code_x = []
  467. code_text = []
  468. for iter in re.finditer(self.PC_pattern,join_predict):
  469. get_len = 40
  470. if iter.span()[0]<get_len:
  471. begin = 0
  472. else:
  473. begin = iter.span()[0]-get_len
  474. end = iter.span()[1]+get_len
  475. code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]],pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
  476. code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]])
  477. if len(code_x)>0:
  478. code_x = np.transpose(np.array(code_x),(1,0,2,3))
  479. predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
  480. for h in range(len(predict_code)):
  481. if predict_code[h][0]>0.5:
  482. the_code = self.fitDataByRule(code_text[h])
  483. if the_code not in code_set:
  484. code_set.add(the_code)
  485. item[1]['code'] = list(code_set)
  486. if item[1]['name']=="":
  487. for iter in re.finditer(self.PN_pattern,join_predict):
  488. #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
  489. item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
  490. break
  491. last_doc_id = doc_id
  492. result.append(item)
  493. return result
  494. '''
  495. #角色金额模型
class PREMPredict():
    """Role / money classification predictor.

    Wraps two sub-models:
      * ``model_role``  -- classifies org/company entities into bidding roles.
      * ``model_money`` -- classifies money entities.

    ``predict`` runs both models over the per-document sentence/entity lists
    and writes the results back onto the entities via ``set_Role`` /
    ``set_Money``.
    """
    def __init__(self):
        #self.model_role_file = os.path.abspath("../role/models/model_role.model.hdf5")
        # NOTE(review): model_role_file is stored but never read in this class
        # (the Model_role_classify_word wrapper loads its own weights) -- confirm.
        self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5"
        self.model_role = Model_role_classify_word()
        self.model_money = Model_money_classify()
        return

    def search_role_data(self,list_sentences,list_entitys):
        '''
        @summary: build the role-model input data from the sentence/entity lists
        @param:
            list_sentences: sentences of the documents
            list_entitys: entities of the documents
        @return: [data_x, points_entitys, text_list] for the role model,
                 or None when no org/company entity exists
        '''
        text_list = []
        data_x = []
        points_entitys = []
        for list_entity,list_sentence in zip(list_entitys,list_sentences):
            list_entity.sort(key=lambda x:x.sentence_index)
            list_sentence.sort(key=lambda x:x.sentence_index)
            p_entitys = 0
            p_sentences = 0
            # Merge-style scan: both lists are sorted by sentence_index, so the
            # sentence pointer only ever moves forward across entities.
            while(p_entitys<len(list_entity)):
                entity = list_entity[p_entitys]
                if entity.entity_type in ['org','company']:
                    while(p_sentences<len(list_sentence)):
                        sentence = list_sentence[p_sentences]
                        if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
                            # raw-text context window: 10 chars on each side of the entity
                            text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-10):entity.wordOffset_end+10])
                            #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_ROLE_INPUT_SHAPE[1]),shape=settings.MODEL_ROLE_INPUT_SHAPE)
                            item_x = self.model_role.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,entity_text=entity.entity_text)
                            data_x.append(item_x)
                            points_entitys.append(entity)
                            break
                        p_sentences += 1
                p_entitys += 1
        if len(points_entitys)==0:
            return None
        return [data_x,points_entitys, text_list]

    def search_money_data(self,list_sentences,list_entitys):
        '''
        @summary: build the money-model input data from the sentence/entity lists
        @param:
            list_sentences: sentences of the documents
            list_entitys: entities of the documents
        @return: [data_x, points_entitys, text_list] for the money model,
                 or None when no money entity exists
        '''
        text_list = []
        data_x = []
        points_entitys = []
        for list_entity,list_sentence in zip(list_entitys,list_sentences):
            list_entity.sort(key=lambda x:x.sentence_index)
            list_sentence.sort(key=lambda x:x.sentence_index)
            p_entitys = 0
            while(p_entitys<len(list_entity)):
                entity = list_entity[p_entitys]
                if entity.entity_type=="money":
                    # unlike search_role_data, the sentence pointer restarts per entity
                    p_sentences = 0
                    while(p_sentences<len(list_sentence)):
                        sentence = list_sentence[p_sentences]
                        if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
                            # left-only context window: 8 chars before the money text
                            text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin - 8):entity.wordOffset_end])
                            #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_MONEY_INPUT_SHAPE[1]),shape=settings.MODEL_MONEY_INPUT_SHAPE)
                            #item_x = embedding_word(spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index, end_index=entity.end_index, size=10, center_include=True, word_flag=True),shape=settings.MODEL_MONEY_INPUT_SHAPE)
                            item_x = self.model_money.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
                            data_x.append(item_x)
                            points_entitys.append(entity)
                            break
                        p_sentences += 1
                p_entitys += 1
        if len(points_entitys)==0:
            return None
        return [data_x,points_entitys, text_list]

    def predict_role(self,list_sentences, list_entitys):
        """Run the role model and write label/probabilities onto each entity.

        When USE_PAI_EAS is set, inference goes through the remote PAI-EAS
        TF-serving endpoint, falling back to the local model on failure.
        """
        datas = self.search_role_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        text_list = datas[2]
        if USE_PAI_EAS:
            _data = datas[0]
            # reorder to (input_slot, sample, ...) so each model input can be sent separately
            _data = np.transpose(np.array(_data),(1,0,2))
            request = tf_predict_pb2.PredictRequest()
            request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
            request.inputs["input0"].float_val.extend(np.array(_data[0],dtype=np.float64).reshape(-1))
            request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
            request.inputs["input1"].float_val.extend(np.array(_data[1],dtype=np.float64).reshape(-1))
            request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input2"].array_shape.dim.extend(np.shape(_data[2]))
            request.inputs["input2"].float_val.extend(np.array(_data[2],dtype=np.float64).reshape(-1))
            request_data = request.SerializeToString()
            list_outputs = ["outputs"]
            _result = vpc_requests(role_url, role_authorization, request_data, list_outputs)
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                # remote call failed -- fall back to the local model
                predict_y = self.model_role.predict(datas[0])
        else:
            predict_y = self.model_role.predict(np.array(datas[0],dtype=np.float64))
        # Regex-based post-corrections of the model output. Label semantics
        # appear to be 0=tenderee, 1=agency, 2=win-tenderer (inferred from the
        # override rules below) -- TODO confirm against the label definition.
        for i in range(len(predict_y)):
            entity = points_entitys[i]
            label = np.argmax(predict_y[i])
            values = predict_y[i]
            text = text_list[i]
            if label == 2:
                # "winner signs contract with ..." context => actually the tenderee
                if re.search('中标单位和.{,25}签订合同', text):
                    label = 0
                    values[label] = 0.501
                # "Dear supplier: ... our company" context => the tenderee speaking
                elif re.search('尊敬的供应商:.{,25}我公司', text):
                    label = 0
                    values[label] = 0.801
            # "entrusting party:" (but not "entrusted party:") => tenderee
            elif label == 1 and re.search('委托(单位|人|方)[是为:]+', text[:10]) and re.search('受委托(单位|人|方)[是为:]+', text[:10])==None:
                label = 0
                values[label] = 0.501
            # "service/selected agency name" context => win-tenderer
            elif label == 1 and re.search('([,。:]|^)(服务|中选)机构(名称)?', text[:-10]):
                label = 2
                values[label] = 0.501
            entity.set_Role(label, values)

    def predict_money(self,list_sentences,list_entitys):
        """Run the money model and write label/probabilities onto each entity."""
        datas = self.search_money_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        _data = datas[0]
        text_list = datas[2]
        if USE_PAI_EAS:
            _data = np.transpose(np.array(_data),(1,0,2,3))
            request = tf_predict_pb2.PredictRequest()
            request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
            request.inputs["input0"].float_val.extend(np.array(_data[0],dtype=np.float64).reshape(-1))
            request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
            request.inputs["input1"].float_val.extend(np.array(_data[1],dtype=np.float64).reshape(-1))
            request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input2"].array_shape.dim.extend(np.shape(_data[2]))
            request.inputs["input2"].float_val.extend(np.array(_data[2],dtype=np.float64).reshape(-1))
            request_data = request.SerializeToString()
            list_outputs = ["outputs"]
            _result = vpc_requests(money_url, money_authorization, request_data, list_outputs)
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                # remote call failed -- fall back to the local model
                predict_y = self.model_money.predict(_data)
        else:
            predict_y = self.model_money.predict(_data)
        for i in range(len(predict_y)):
            entity = points_entitys[i]
            label = np.argmax(predict_y[i])
            values = predict_y[i]
            text = text_list[i]
            # demote the winning probability just below 0.5 for totals/unit
            # prices and for investment/construction-cost amounts
            if label == 1 and re.search('[::,。](总金额|总价|单价)', text):
                values[label] = 0.49
            elif label ==0 and entity.notes in ["投资", "工程造价"]:
                values[label] = 0.49
            entity.set_Money(label, values)

    def predict(self,list_sentences,list_entitys):
        """Run role and money classification over all documents."""
        self.predict_role(list_sentences,list_entitys)
        self.predict_money(list_sentences,list_entitys)
  658. #联系人模型
class EPCPredict():
    """Contact-person predictor.

    Classifies person entities with ``model_person`` and (optionally, via
    ``person_search_phone``) matches phone numbers to contact persons using
    an assignment-problem formulation.
    """
    def __init__(self):
        self.model_person = Model_person_classify()

    def search_person_data(self,list_sentences,list_entitys):
        '''
        @summary: build the contact-person model input data from the
                  sentence/entity lists
        @param:
            list_sentences: sentences of the documents
            list_entitys: entities of the documents
        @return: [data_x, points_entitys] for the person model, or None when
                 no person entity exists
        '''
        data_x = []
        points_entitys = []
        for list_entity,list_sentence in zip(list_entitys,list_sentences):
            p_entitys = 0
            # index sentences by sentence_index for O(1) lookup
            dict_index_sentence = {}
            for _sentence in list_sentence:
                dict_index_sentence[_sentence.sentence_index] = _sentence
            _list_entity = [entity for entity in list_entity if entity.entity_type=="person"]
            while(p_entitys<len(_list_entity)):
                entity = _list_entity[p_entitys]
                # always True after the filter above; kept from the original code
                if entity.entity_type=="person":
                    sentence = dict_index_sentence[entity.sentence_index]
                    item_x = self.model_person.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
                    data_x.append(item_x)
                    points_entitys.append(entity)
                p_entitys += 1
        if len(points_entitys)==0:
            return None
        # return [data_x,points_entitys,dianhua]
        return [data_x,points_entitys]

    def predict_person(self,list_sentences, list_entitys):
        """Run the person model and write label/probabilities via set_Person.

        When USE_PAI_EAS is set, inference goes through the remote PAI-EAS
        endpoint, falling back to the local model on failure.
        """
        datas = self.search_person_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        # phone = datas[2]
        if USE_PAI_EAS:
            _data = datas[0]
            _data = np.transpose(np.array(_data),(1,0,2,3))
            request = tf_predict_pb2.PredictRequest()
            request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
            request.inputs["input0"].float_val.extend(np.array(_data[0],dtype=np.float64).reshape(-1))
            request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
            request.inputs["input1"].float_val.extend(np.array(_data[1],dtype=np.float64).reshape(-1))
            request_data = request.SerializeToString()
            list_outputs = ["outputs"]
            _result = vpc_requests(person_url, person_authorization, request_data, list_outputs)
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                # remote call failed -- fall back to the local model
                predict_y = self.model_person.predict(datas[0])
        else:
            predict_y = self.model_person.predict(datas[0])
        # assert len(predict_y)==len(points_entitys)==len(phone)
        assert len(predict_y)==len(points_entitys)
        for i in range(len(predict_y)):
            entity = points_entitys[i]
            label = np.argmax(predict_y[i])
            values = []
            for item in predict_y[i]:
                values.append(item)
            # phone_number = phone[i]
            # entity.set_Person(label,values,phone_number)
            entity.set_Person(label,values,[])
        # match phone numbers to contact persons
        # self.person_search_phone(list_sentences, list_entitys)

    def person_search_phone(self,list_sentences, list_entitys):
        """Attach a phone number to each classified contact person.

        Phones are extracted per sentence by regex, turned into pseudo
        entities, and matched to person entities (label 1/2/3) by solving a
        linear assignment problem over character-distance costs.
        """
        def phoneFromList(phones):
            # strip the "电话:"/"联系方式:" prefix from the first candidate
            # (length-11 preference variants were removed -- see history)
            return re.sub('电话[:|:]|联系方式[:|:]', '', phones[0])
        for list_entity, list_sentence in zip(list_entitys, list_sentences):
            # NOTE: a ~140-line legacy implementation (fully commented out in
            # the original file) matched phones via spanWindow context regexes;
            # it was superseded by the assignment-based matching below.
            from scipy.optimize import linear_sum_assignment
            from BiddingKG.dl.interface.Entitys import Match
            def dispatch(match_list):
                # Solve a one-to-one person<->phone assignment maximizing the
                # summed match values (Hungarian / KM algorithm).
                main_roles = list(set([match.main_role for match in match_list]))
                attributes = list(set([match.attribute for match in match_list]))
                label = np.zeros(shape=(len(main_roles), len(attributes)))
                for match in match_list:
                    main_role = match.main_role
                    attribute = match.attribute
                    value = match.value
                    # +10000 offset keeps genuine pairs nonzero so they survive
                    # the "if value" filter below
                    label[main_roles.index(main_role), attributes.index(attribute)] = value + 10000
                # print(label)
                gragh = -label
                # linear_sum_assignment minimizes, hence the negation above
                row, col = linear_sum_assignment(gragh)
                max_dispatch = [(i, j) for i, j, value in zip(row, col, gragh[row, col]) if value]
                return [Match(main_roles[row], attributes[col]) for row, col in max_dispatch]
            # "电话/联系方式/联系人" keyword followed by 7-12 digits
            key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)(\d{7,12})')
            # mobile / landline / extension / +86 phone-number shapes
            phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
                               '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
                               '0\d{2,3}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
                               '0\d{2,3}[-—-―]\d{7,8}转\d{1,4}|'
                               '0\d{2,3}[-—-―]?[1-9]\d{6,7}|'
                               '[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|'
                               '[1-9]\d{6,7}')
            phone_entitys = []
            for _sentence in list_sentence:
                sentence_text = _sentence.sentence_text
                # de-duplicate overlapping hits from the two patterns
                res_set = set()
                for i in re.finditer(phone,sentence_text):
                    res_set.add((i.group(),i.start(),i.end()))
                for i in re.finditer(key_word,sentence_text):
                    res_set.add((i.group(2),i.start()+len(i.group(1)),i.end()))
                for item in list(res_set):
                    phone_left = sentence_text[max(0,item[1]-10):item[1]]
                    phone_right = sentence_text[item[2]:item[2]+8]
                    # exclude fax numbers, mailboxes and other false positives
                    if re.search("传,?真|信,?箱|邮,?箱",phone_left):
                        if not re.search("电,?话",phone_left):
                            continue
                    # exclude account/ID/price-like digit runs
                    if re.search("帐,?号|编,?号|报,?价|证,?号|价,?格|[\((]万?元[\))]",phone_left):
                        continue
                    # exclude numbers continued by decimals (amounts)
                    if re.search("[.,]\d{2,}",phone_right):
                        continue
                    _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, None, None,item[1], item[2],in_attachment=_sentence.in_attachment)
                    phone_entitys.append(_entity)
            person_entitys = []
            for entity in list_entity:
                if entity.entity_type == "person":
                    entity.person_phone = ""
                    person_entitys.append(entity)
            # interleave phones and persons in document order
            _list_entity = phone_entitys + person_entitys
            _list_entity = sorted(_list_entity,key=lambda x:(x.sentence_index,x.wordOffset_begin))
            # cumulative character offset of each sentence start, so distances
            # can be computed across sentence boundaries
            words_num_dict = dict()
            last_words_num = 0
            list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
            for sentence in list_sentence:
                _index = sentence.sentence_index
                if _index == 0:
                    words_num_dict[_index] = 0
                else:
                    words_num_dict[_index] = words_num_dict[_index - 1] + last_words_num
                last_words_num = len(sentence.sentence_text)
            match_list = []
            for index in range(len(_list_entity)):
                entity = _list_entity[index]
                # only contact-person labels 1/2/3 get a phone
                if entity.entity_type=="person" and entity.label in [1,2,3]:
                    match_nums = 0
                    # look ahead up to 4 entities for phones after the person
                    for after_index in range(index + 1, min(len(_list_entity), index + 5)):
                        after_entity = _list_entity[after_index]
                        if after_entity.entity_type=="phone":
                            sentence_distance = after_entity.sentence_index - entity.sentence_index
                            distance = (words_num_dict[after_entity.sentence_index] + after_entity.wordOffset_begin) - (
                                        words_num_dict[entity.sentence_index] + entity.wordOffset_end)
                            if sentence_distance < 2 and distance < 50:
                                # quadratic distance penalty, scaled down for backward matches
                                value = (-1 / 2 * (distance ** 2)) / 10000
                                match_list.append(Match(entity, after_entity, value))
                                match_nums += 1
                            else:
                                break
                        if after_entity.entity_type=="person":
                            if after_entity.label not in [1,2,3]:
                                break
                    if not match_nums:
                        # no phone found after -- look up to 4 entities before
                        # NOTE(review): range stop max(0,index-5) never visits
                        # index 0 when index<=5 -- possible off-by-one, confirm.
                        for previous_index in range(index-1, max(0,index-5), -1):
                            previous_entity = _list_entity[previous_index]
                            if previous_entity.entity_type == "phone":
                                sentence_distance = entity.sentence_index - previous_entity.sentence_index
                                distance = (words_num_dict[entity.sentence_index] + entity.wordOffset_begin) - (
                                            words_num_dict[previous_entity.sentence_index] + previous_entity.wordOffset_end)
                                if sentence_distance < 1 and distance<30:
                                    # forward matches are not scaled by /10000
                                    value = (-1 / 2 * (distance ** 2))
                                    match_list.append(Match(entity, previous_entity, value))
                                else:
                                    break
            result = dispatch(match_list)
            for match in result:
                entity = match.main_role
                # write the assigned phone back onto the entity in list_entity
                entity_index = list_entity.index(entity)
                list_entity[entity_index].person_phone = match.attribute.entity_text

    def predict(self,list_sentences,list_entitys):
        """Run contact-person classification over all documents."""
        self.predict_person(list_sentences,list_entitys)
  977. #表格预测
  978. class FormPredictor():
  979. def __init__(self,lazyLoad=getLazyLoad()):
  980. self.model_file_line = os.path.dirname(__file__)+"/../form/model/model_form.model_line.hdf5"
  981. self.model_file_item = os.path.dirname(__file__)+"/../form/model/model_form.model_item.hdf5"
  982. self.model_form_item = Model_form_item()
  983. self.model_form_context = Model_form_context()
  984. self.model_dict = {"line":[None,self.model_file_line]}
  985. def getModel(self,type):
  986. if type=="item":
  987. return self.model_form_item
  988. elif type=="context":
  989. return self.model_form_context
  990. else:
  991. return self.getModel(type)
  992. def encode(self,data,**kwargs):
  993. return encodeInput([data], word_len=50, word_flag=True,userFool=False)[0]
  994. return encodeInput_form(data)
  995. def predict(self,form_datas,type):
  996. if type=="item":
  997. return self.model_form_item.predict(form_datas)
  998. elif type=="context":
  999. return self.model_form_context.predict(form_datas)
  1000. else:
  1001. return self.getModel(type).predict(form_datas)
  1002. #角色规则
  1003. #依据正则给所有无角色的实体赋予角色,给予等于阈值的最低概率
  1004. class RoleRulePredictor():
def __init__(self):
    """Compile the regex patterns used for rule-based role assignment.

    Pattern names encode role and position relative to the entity text
    (``*_left`` = pattern must end right before the entity, ``*_right`` =
    must start right after it, ``*_center``/``*_whole`` = surrounds it).
    A ``_w1`` suffix on a group name marks a probability-weight keyword
    group.
    """
    # (?P<tenderee_left_w1>  -- the trailing "w1" in a group name marks a
    # probability-weight keyword group
    self.pattern_tenderee_left = "(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
                                 "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
                                 "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
    self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)" \
                                    "(人|公司|单位|组织|用户|业主|主体|方|部门))" \
                                    "(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
    self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托))"
    self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))])|^委托|^现委托|^的\w{2,10}正在进行)" #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
    # right-context that may indicate either tenderee or agency
    self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
    self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
    self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)" # |^受托 would clash with 受托生产 etc.; agency wording usually has a following comma
    # 2020/11/24 big-site rules: added 选定单位|指定的中介服务机构 to the winner keywords
    self.pattern_winTenderer_left = "(?P<winTenderer_left>(乙|承做|施工|供货|承包|承建|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$|" \
                                    "(选定单位|指定的中介服务机构|实施主体|承制单位|供方)[::是为]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|" \
                                    "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|服务|实施)(机构|单位|商|方)(名称)?[::是为]+$)"
    self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)(名称)?([((]按综合排名排序[))])?[::是为]+$)" # comma removed to reject e.g. "并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系"
    # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
    # self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
    self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
                                     "^(报价|价格)最低,确定为本项目成交供应商|^:贵公司参与|^:?你方于|^中标。|^成为[\w、()()]+项目的成交供应商))"
    self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|谈判结果:由.{5,20}供货)|中标通知书.{,15}你方" # 2020/11/24 big-site rules: added 谈判结果:由.{5,20}供货 to the winner keywords
    # self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
    self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名)[::]第?[二2]名?,?投标商名称[::]+$))"
    self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
    self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
    self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
    self.pattern_whole = [self.pattern_tenderee_left,
                          self.pattern_tenderee_left_w1,
                          self.pattern_tenderee_center,
                          self.pattern_tenderee_right,
                          self.pattern_tendereeORagency_right,
                          self.pattern_agency_left,
                          self.pattern_agency_right,
                          self.pattern_winTenderer_left,
                          self.pattern_winTenderer_left_w1,
                          self.pattern_winTenderer_whole,
                          self.pattern_winTenderer_right,
                          self.pattern_secondTenderer_left,
                          self.pattern_secondTenderer_right,
                          self.pattern_thirdTenderer_left,
                          self.pattern_thirdTenderer_right
                          ] # order matters: second/third-winner patterns must come after the winner patterns
    # organizations that can never be a tenderer
    self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
    # money-context keywords: tenderee-side budget/ceiling amounts
    self.pattern_money_tenderee = re.compile("投标最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|采购(单位|人)委托价|限价|拦标价|预算金额")
    # money-context keywords: tenderer-side bid/contract amounts
    self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况")
    self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
    # money contexts that belong to neither side (agency/service fees)
    self.pattern_money_other = re.compile("代理费|服务费")
    # bid-package (标段/包) number pattern
    self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
  1055. def _check_input(self,text, ignore=False):
  1056. if not text:
  1057. return []
  1058. if not isinstance(text, list):
  1059. text = [text]
  1060. null_index = [i for i, t in enumerate(text) if not t]
  1061. if null_index and not ignore:
  1062. raise Exception("null text in input ")
  1063. return text
def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5):
    """Rule-based re-labelling of org/company and money entities.

    For each article, entities whose model label is missing, "other" (5), or
    whose probability is below ``on_value`` are re-examined with the regex
    patterns built in ``__init__`` (``self.pattern_whole``,
    ``self.pattern_money_*``, ``self.pattern_pack``) and re-labelled in place.

    Args:
        list_articles: one Article object per document.
        list_sentences: per-document lists of Sentence objects.
        list_entitys: per-document lists of Entity objects (mutated in place).
        list_codenames: per-document code/name dicts (not read in this body).
        on_value: probability threshold below which regex recall is applied.
    """
    for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences,
                                                                  list_codenames):
        list_sentence.sort(key=lambda x: x.sentence_index)  # 2022/1/5 sort sentences by index
        # list_name = list_codename["name"]
        list_name = []  # 2022/1/5 changed to: collect every project-name entity of the document
        for entity in list_entity:
            if entity.entity_type == 'name':
                list_name.append(entity.entity_text)
        list_name = self._check_input(list_name) + [article.title]
        for p_entity in list_entity:
            if p_entity.entity_type in ["org", "company"]:
                # only re-examine entities whose role is "none" or below the threshold
                if p_entity.label is None:
                    continue
                # If the entity's context contains the title / a project name, dampen the
                # tenderee probability towards 0.6 — a title entity is not necessarily the tenderee.
                if str(p_entity.label) == "0":
                    find_flag = False
                    for _sentence in list_sentence:
                        if _sentence.sentence_index == p_entity.sentence_index:
                            _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                               end_index=p_entity.end_index, size=20, center_include=True,
                                               word_flag=True, use_text=True,
                                               text=re.sub(")", ")", re.sub("(", "(", p_entity.entity_text)))
                            for _name in list_name:
                                if _name != "" and str(_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0:
                                    find_flag = True
                                    if p_entity.values[0] > on_value:
                                        p_entity.values[0] = 0.6 + (p_entity.values[0] - 0.6) / 10
                                    else:
                                        p_entity.values[0] = on_value  # 2022/03/08 fix cases like 223985179: company sits in the opening project name but its probability never reached 0.5
                    if find_flag:
                        continue
                # Regex recall of roles from low-probability entities or the "other" class.
                role_prob = float(p_entity.values[int(p_entity.label)])
                if role_prob < on_value or str(p_entity.label) == "5":
                    # Mark entities appearing inside the title / a project name as the tenderee.
                    _list_name = self._check_input(list_name, ignore=True)
                    find_flag = False
                    for _name in _list_name:  # 2022/1/5 fix: a role occurring inside a project name is marked tenderee at every position
                        if str(_name).find(re.sub(")", ")", re.sub("(", "(",
                                                                   p_entity.entity_text))) >= 0 and p_entity.sentence_index < 4:
                            for _sentence in list_sentence:
                                if _sentence.sentence_index == p_entity.sentence_index:
                                    _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                                       end_index=p_entity.end_index, size=20, center_include=True,
                                                       word_flag=True, use_text=True, text=re.sub(")", ")",
                                                                                                  re.sub("(", "(",
                                                                                                         p_entity.entity_text)))
                                    if str(_span[1] + _span[2][:len(str(_name))]).find(
                                            _name) >= 0:
                                        find_flag = True
                                        _label = 0
                                        p_entity.label = _label
                                        p_entity.values[int(_label)] = on_value
                                        break
                        if p_entity.sentence_index >= 4:
                            break
                        if find_flag:
                            break
                    # if str(_name).find(p_entity.entity_text)>=0:
                    #     find_flag = True
                    #     _label = 0
                    #     p_entity.label = _label
                    #     p_entity.values[int(_label)] = on_value
                    #     break
                    # If the entity occurs in the title it defaults to tenderee; skip the rules below.
                    if find_flag:
                        continue
                    for s_index in range(len(list_sentence)):
                        if p_entity.doc_id == list_sentence[s_index].doc_id and p_entity.sentence_index == \
                                list_sentence[s_index].sentence_index:
                            tokens = list_sentence[s_index].tokens
                            begin_index = p_entity.begin_index
                            end_index = p_entity.end_index
                            size = 15
                            spans = spanWindow(tokens, begin_index, end_index, size, center_include=True,
                                               word_flag=True, use_text=False)
                            # _flag = False
                            # Resolve role conflicts with regex + distance.
                            # 2021/6/11 update center: spans[1] --> spans[0][-30:]+spans[1]
                            list_spans = [spans[0][-30:], spans[0][-10:] + spans[1] + spans[2][:10], spans[2]]  # left / center / right context of the entity
                            for _i_span in range(len(list_spans)):
                                _flag = False
                                _prob_weight = 1
                                # print(list_spans[_i_span],p_entity.entity_text)
                                for _pattern in self.pattern_whole:
                                    for _iter in re.finditer(_pattern, list_spans[_i_span]):
                                        for _group, _v_group in _iter.groupdict().items():
                                            if _v_group is not None and _v_group != "":
                                                _role = _group.split("_")[0]
                                                if _role == "tendereeORagency":  # 2022/3/9 new disambiguation for "tenderee or agency" matches
                                                    # print('p_entity_sentenceindex:', p_entity.sentence_index)
                                                    if p_entity.sentence_index>=1:  # this fuzzy match only applies in the first sentence
                                                        continue
                                                    # Institution-like names, or names without agency keywords, become tenderee.
                                                    if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', p_entity.entity_text)\
                                                            or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', p_entity.entity_text) == None:
                                                        _role = 'tenderee'
                                                    else:
                                                        _role = "agency"
                                                _direct = _group.split("_")[1]
                                                _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
                                                # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                                                # example doc 135463002: a left context containing "拟招..." must not trigger a match
                                                if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)',
                                                                                                    list_spans[0]) == None:  # 2021/12/22 fix wrong winTenderer recall, example 208668937
                                                    _flag = True
                                                    _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                              "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                                                    _prob_weight = 1.2 if _weight=='w1' else 1
                                                    # print('_v_group:',_group, _v_group, p_entity.entity_text)
                                                if _i_span == 1 and _direct == "center":
                                                    _flag = True
                                                    _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                              "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                                                    _prob_weight = 1.2 if _weight == 'w1' else 1
                                                    # print('_v_group:', _group, _v_group, p_entity.entity_text)
                                                if _i_span == 2 and _direct == "right":
                                                    _flag = True
                                                    _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                              "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                                                    _prob_weight = 1.2 if _weight == 'w1' else 1
                                                    # print('_v_group:', _group, _v_group, p_entity.entity_text)
                                # Commit the match found for this span position.
                                if _flag:
                                    p_entity.label = _label
                                    p_entity.values[int(_label)] = on_value*_prob_weight + p_entity.values[int(_label)] / 10
                                    # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group, _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
                                    break
            # "Other" money entities may be recalled as tender / winning-bid amounts by regex.
            if p_entity.entity_type in ["money"]:
                if str(p_entity.label) == "2":
                    for _sentence in list_sentence:
                        if _sentence.sentence_index == p_entity.sentence_index:
                            _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                               end_index=p_entity.end_index, size=20, center_include=True,
                                               word_flag=True, text=p_entity.entity_text)
                            if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(
                                    self.pattern_money_other, _span[0]) is None:
                                p_entity.values[0] = 0.8 + p_entity.values[0] / 10
                                p_entity.label = 0
                            if re.search(self.pattern_money_tenderer, _span[0]) is not None:
                                if re.search(self.pattern_money_other, _span[0]) is not None:
                                    # Both patterns match the left context: the later (closer) match wins.
                                    if re.search(self.pattern_money_tenderer, _span[0]).span()[1] > \
                                            re.search(self.pattern_money_other, _span[0]).span()[1]:
                                        p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                        p_entity.label = 1
                                else:
                                    p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                    p_entity.label = 1
                            if re.search(self.pattern_money_tenderer_whole,
                                         "".join(_span)) is not None and re.search(self.pattern_money_other,
                                                                                   _span[0]) is None:
                                p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                p_entity.label = 1
        # Tender-amount expansion: a tender amount followed by consecutive unlabelled amounts,
        # all matching pack/lot info, promotes those amounts to tender amounts as well.
        list_p = []
        state = 0
        for p_entity in list_entity:
            for _sentence in list_sentence:
                if _sentence.sentence_index == p_entity.sentence_index:
                    _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                       end_index=p_entity.end_index, size=20, center_include=True, word_flag=True,
                                       text=p_entity.entity_text)
                    if state == 2:
                        # Flush the collected run (keep the first as-is, relabel the rest).
                        for _p in list_p[1:]:
                            _p.values[0] = 0.8 + _p.values[0] / 10
                            _p.label = 0
                        state = 0
                        list_p = []
                    if state == 0:
                        if p_entity.entity_type in ["money"]:
                            if str(p_entity.label) == "0" and re.search(self.pattern_pack,
                                                                        _span[0] + "-" + _span[2]) is not None:
                                state = 1
                                list_p.append(p_entity)
                    elif state == 1:
                        if p_entity.entity_type in ["money"]:
                            if str(p_entity.label) in ["0", "2"] and re.search(self.pattern_pack,
                                                                               _span[0] + "-" + _span[
                                                                                   2]) is not None and re.search(
                                    self.pattern_money_other,
                                    _span[0] + "-" + _span[2]) is None and p_entity.sentence_index == list_p[
                                0].sentence_index:
                                list_p.append(p_entity)
                            else:
                                state = 2
        # Flush a run that reached the end of the entity list.
        if len(list_p) > 1:
            for _p in list_p[1:]:
                # print("==",_p.entity_text,_p.sentence_index,_p.label)
                _p.values[0] = 0.8 + _p.values[0] / 10
                _p.label = 0
        state = 0
        list_p = []
        for p_entity in list_entity:
            # Entities in the blacklist can never be a winning tenderer: force label to "other".
            if p_entity.entity_text in self.SET_NOT_TENDERER:
                p_entity.label = 5
  1262. '''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
class RoleRuleFinalAdd():
    """Final regex fallback for the tenderee/agency roles. 2021/12/30

    If no entity was labelled tenderee, search the article for a company
    name in typical "signature" positions — a name followed by a date at
    the end of the main text, an account-holder name, a registration /
    delivery contact, or the announcement publisher — and promote the
    matching "other" entity to tenderee (or, by keyword, to agency).
    As a last resort, promote an entity contained in the document name.
    """
    def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
        # text_end = list_articles[0].content.split('##attachment##')[0][-40:]
        # Tail of the main text: the last 30 tokens of the last five non-attachment sentences.
        main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
        end_tokens = []
        for sentence in main_sentences[-5:]:
            end_tokens.extend(sentence.tokens)
        text_end = "".join(end_tokens[-30:])
        # print(text_end)
        # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
        # Company name immediately followed by a date (signature line) at the end of the text.
        sear_ent = re.search('[,。;]([\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
        # Bank account-holder name.
        sear_ent2 = re.search('(户名|开户名称)[::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
        # Registration / delivery contact.
        sear_ent3 = re.search('(报名咨询|[收送交]货地点)[,:]([\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
        # Announcement publisher.
        sear_ent4 = re.search('(发布(?:人|单位|机构))[::]([\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
        sear_list = [sear_ent4 , sear_ent3 , sear_ent2 , sear_ent]  # priority: publisher > contact > account > signature
        tenderee_notfound = True
        agency_notfound = True
        ents = []  # org/company entities labelled "other" (5): promotion candidates
        for ent in list_entitys[0]:
            if ent.entity_type in ['org', 'company']:
                if ent.label == 0:
                    tenderee_notfound = False
                elif ent.label == 1:
                    agency_notfound = False
                elif ent.label == 5:
                    ents.append(ent)
        if sear_ent or sear_ent2 or sear_ent3 or sear_ent4:
            for _sear_ent in [_sear for _sear in sear_list if _sear]:
                # if sear_ent4:
                #     ent_re = sear_ent4.group(2)
                # elif sear_ent3:
                #     ent_re = sear_ent3.group(2)
                # elif sear_ent2:
                #     ent_re = sear_ent2.group(2)
                # else:
                #     ent_re = sear_ent.group(1)
                if _sear_ent==sear_ent4:
                    ent_re = _sear_ent.group(2)
                elif _sear_ent==sear_ent3:
                    ent_re = _sear_ent.group(2)
                elif _sear_ent==sear_ent2:
                    ent_re = _sear_ent.group(2)
                else:
                    ent_re = _sear_ent.group(1)
                # print('ent_re', ent_re)
                # Normalize: drop commas, convert half-width parentheses to full-width.
                ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
                # Institution-like names, or names without agency keywords, are treated as tenderee.
                if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent_re)
                                                  or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None):
                    n = 0
                    for i in range(len(ents) - 1, -1, -1):
                        if not ents[i].in_attachment:
                            n += 1
                            if n > 3 and _sear_ent==sear_ent:  # for the name+date (signature) rule only the last three entities are checked
                                break
                        if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
                            ents[i].label = 0
                            ents[i].values[0] = 0.5
                            tenderee_notfound = False
                            # log('正则最后补充实体: %s'%(ent_re))
                            break
                # Names carrying agency keywords are treated as the agency.
                elif agency_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re):
                    n = 0
                    for i in range(len(ents) - 1, -1, -1):
                        if not ents[i].in_attachment:
                            n += 1
                            if n > 3 and _sear_ent==sear_ent:  # for the name+date (signature) rule only the last three entities are checked
                                break
                        if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
                            ents[i].label = 1
                            ents[i].values[1] = 0.5
                            agency_notfound = False
                            # log('正则最后补充实体: %s'%(ent_re))
                            break
                if not tenderee_notfound:
                    break
        elif list_codenames[0]['name'] != "":  # fall back to a company entity contained in the document name
            # tenderee_notfound = True
            # ents = []
            # for ent in list_entitys[0]:
            #     if ent.entity_type in ['org', 'company']:
            #         if ent.label == 0:
            #             tenderee_notfound = False
            #         elif ent.label == 1:
            #             agency_notfound = False
            #         elif ent.label == 5:
            #             ents.append(ent)
            if tenderee_notfound == True:
                # print('list_codenames',list_codenames[0]['name'])
                for ent in ents:
                    if ent.entity_text in list_codenames[0]['name']:
                        ent.label = 0
                        ent.values[0] = 0.5
                        # log('正则召回标题中包含的实体:%s'%ent.entity_text)
                        break
# Tenderee role recall rules
class TendereeRuleRecall():
    """Regex recall of the tenderee when the model labelled none.

    Per document, tries in order: context patterns around org/company
    entities currently labelled "other" (5), an announcement-subject
    heuristic ("我院" / "本校" / "我局"), and finally regex extraction of
    brand-new company entities straight from the raw sentence text.
    """
    def __init__(self):
        # Left context that marks the following entity as the tenderee.
        self.tenderee_left = re.compile("(发布(人|单位|机构)|需求方(信息[,:])?(单位|公司)?名称|购买主体|收货单位|项目申请单位|发起组织|联系单位|"
                                        "询价(机构|企业)|联系(人|方式),?(单位|公司)(名称)?|联系(人|方式),名称)[::][^。;,]{,5}$")
        # Right context that marks the preceding entity as the tenderee.
        self.tenderee_right = re.compile("^[^。;::]{,5}[((](以?下简?称)?,?[,\"“]*[我本][\u4e00-\u9fa5]{1,2}[,\"”]*[))]|"
                                         "^[^。;::]{,10}[对就][^。;,]+,?[^。;,]{,20}进行[^。;,]*(采购|询比?价|遴选|招投?标|征集)|"
                                         "^[^。;::]{,10}关于[^。;,]+,?[^。;,]{,20}的[^。;,]{,20}公告|"
                                         "^[^。;,::]{,10}的[^。;,]+,?[^。;,]{,20}正在[^。;,]{,5}进行|"
                                         "^[^。;,::]{,10}的[^。;,]+,?[^。,;]{,20}已?[^。;,]{,20}批准|"
                                         "^[^。;,::]{,15}(选定|选取|征集|遴选)[^。;,]{,20}(供应商|(代理|咨询|设计)[^。;,]{,5}机构|代理人)")
        self.tenderee_right2 = re.compile("^[^。;,::]{,10}(招标办|采购部|办事处|采购小?组)")
        # Captures the project name following the entity, matched against known project names.
        self.tenderee_right3 = re.compile("^[^。;,::]{,10}(对|就|关于|的)(?P<project>[^。;,?!::]+)")
        # Announcement-subject heuristic pattern ("我院" = our hospital/institute, etc.).
        self.subject = re.compile("[我本][院校局]")
        # Patterns that recall entities the NER model missed entirely.
        self.unrecognized1 = re.compile("(?P<tenderee_left>((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)" \
                                        "(人|商|公司|单位|组织|用户|业主|主体|方|部门))" \
                                        "(信息[,:]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
        self.unrecognized2 = re.compile("(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
                                        "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
                                        "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
        # Suffix checks that the recalled text actually ends like an organisation name.
        self.unrecognized_end1 = re.compile(".{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓|中心)")
        self.unrecognized_end2 = re.compile(".{4,}(?:署|局|厅|处|室|科|部|站|所|股|行)")
    def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
        """Apply the recall rules in order until a tenderee is found (mutates entities in place)."""
        # tenderee_notfound = True
        # agency_notfound = True
        self.get_tenderee = False
        ents = []       # org/company entities labelled "other": promotion candidates
        list_name = []  # project-name entity texts
        for ent in list_entitys[0]:
            if ent.entity_type == 'name':
                list_name.append(ent.entity_text)
            if ent.entity_type in ['org', 'company']:
                if ent.label == 0:
                    # tenderee_notfound = False
                    self.get_tenderee = True
                # elif ent.label == 1:
                #     agency_notfound = False
                elif ent.label == 5:
                    ents.append(ent)
        if not self.get_tenderee:
            self.entity_context_rule(ents,list_name,list_sentences)
        if not self.get_tenderee:
            self.subject_rule(ents,list_articles,list_sentences)
        if not self.get_tenderee:
            self.unrecognized_entity_rule(self.unrecognized1,list_sentences,list_entitys,0.55)
        if not self.get_tenderee:
            self.unrecognized_entity_rule(self.unrecognized2,list_sentences,list_entitys,0.5)
    # Regex check of each candidate entity's left/right context.
    def entity_context_rule(self,entitys,list_name,list_sentences):
        """Promote a candidate to tenderee when its surrounding text matches a tenderee pattern."""
        for ent in entitys:
            _sentence = list_sentences[0][ent.sentence_index]
            _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index,
                               end_index=ent.end_index, size=40, center_include=True,
                               word_flag=True, use_text=True,
                               text=re.sub(")", ")", re.sub("(", "(", ent.entity_text)))
            if re.search(self.tenderee_left,_span[0]):
                ent.label = 0
                ent.values[0] = 0.5 + ent.values[0] / 10
                self.get_tenderee = True
            elif re.search(self.tenderee_right,_span[2]):
                ent.label = 0
                ent.values[0] = 0.5 + ent.values[0] / 10
                self.get_tenderee = True
            elif re.search(self.tenderee_right2, _span[2]):
                ent.label = 0
                ent.values[0] = 0.5 + ent.values[0] / 10
                self.get_tenderee = True
            elif list_name:
                # Entity followed by "对/就/关于/的 <project>": tenderee if <project> is a known project name.
                pj_name = re.search(self.tenderee_right3, _span[2])
                if pj_name:
                    pj_name = pj_name.groupdict()["project"]
                    for _name in list_name:
                        if _name in pj_name:
                            ent.label = 0
                            ent.values[0] = 0.5
                            self.get_tenderee = True
                            break
    # Announcement-subject heuristic.
    def subject_rule(self, entitys,list_articles,list_sentences):
        """Match a "我院/本校/我局"-style subject in the main text to a compatible entity."""
        content = list_articles[0].content.split('##attachment##')[0]  # main text only
        if re.search(self.subject,content):
            _subject = re.search(self.subject,content).group()
            for ent in entitys:
                if re.search("院",_subject) and re.search("医院|学院",ent.entity_text):
                    ent.label = 0
                    ent.values[0] = 0.5 + ent.values[0] / 10
                    self.get_tenderee = True
                elif re.search("校",_subject) and re.search("学校|学院|大学|高中|初中|中学|小学",ent.entity_text):
                    ent.label = 0
                    ent.values[0] = 0.5 + ent.values[0] / 10
                    self.get_tenderee = True
                elif re.search("局", _subject) and re.search("局", ent.entity_text):
                    _sentence = list_sentences[0][ent.sentence_index]
                    _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index,
                                       end_index=ent.end_index, size=20, center_include=True,
                                       word_flag=True, use_text=True,
                                       text=re.sub(")", ")", re.sub("(", "(", ent.entity_text)))
                    # Skip supervision/complaint contact bureaus.
                    if not re.search("监督|投诉",_span[0][-10:]):
                        ent.label = 0
                        ent.values[0] = 0.5 + ent.values[0] / 10
                        self.get_tenderee = True
    # Regex recall of entities the NER model missed entirely.
    def unrecognized_entity_rule(self,pattern,list_sentences,list_entitys,on_value=0.5):
        """Scan sentences (main text first, then attachments) with *pattern* and
        append a new 'company' Entity labelled tenderee for each accepted hit."""
        list_sentence = list_sentences[0]
        for in_attachment in [False,True]:
            for sentence in [sentence for sentence in list_sentence if sentence.in_attachment==in_attachment]:
                sentence_text = sentence.sentence_text
                tokens = sentence.tokens
                doc_id = sentence.doc_id
                in_attachment = sentence.in_attachment
                # Character offset of each token, to map char positions to token indexes.
                list_tokenbegin = []
                begin = 0
                for i in range(0, len(tokens)):
                    list_tokenbegin.append(begin)
                    begin += len(str(tokens[i]))
                list_tokenbegin.append(begin + 1)
                for _match in re.finditer(pattern,sentence_text):
                    _groupdict = _match.groupdict()
                    _match_text = _match.group()
                    _unrecognized_text = _groupdict["unrecognized"]
                    # print(_unrecognized_text)
                    # if _match_text[-1] in [':',':']:
                    #     _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
                    #     if not _unrecognized:
                    #         _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
                    #     if _unrecognized:
                    #         _unrecognized = _unrecognized.group()
                    #     else:
                    #         continue
                    # else:
                    #     _unrecognized = _unrecognized_text
                    # Accept only text that ends like an organisation name.
                    _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
                    if not _unrecognized:
                        _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
                    if _unrecognized:
                        _unrecognized = _unrecognized.group()
                    else:
                        continue
                    # print(_unrecognized)
                    # Reject anonymised names ("某" = "a certain ...").
                    if re.search("某",_unrecognized):
                        continue
                    # Map the char span of the recalled name back to token indexes.
                    begin_index_temp = _match.start()+len(_groupdict['tenderee_left'])
                    for j in range(len(list_tokenbegin)):
                        if list_tokenbegin[j] == begin_index_temp:
                            begin_index = j
                            break
                        elif list_tokenbegin[j] > begin_index_temp:
                            begin_index = j - 1
                            break
                    index = begin_index_temp + len(_unrecognized)
                    end_index_temp = index
                    for j in range(begin_index, len(list_tokenbegin)):
                        if list_tokenbegin[j] >= index:
                            end_index = j - 1
                            break
                    entity_id = "%s_%d_%d_%d" % (doc_id, sentence.sentence_index, begin_index, end_index)
                    entity_text = _unrecognized
                    new_entity = Entity(doc_id, entity_id, entity_text, 'company', sentence.sentence_index, begin_index, end_index,
                                        begin_index_temp, end_index_temp, in_attachment=in_attachment)
                    new_entity.label = 0
                    new_entity.values = [on_value,0,0,0,0,0]
                    list_entitys[0].append(new_entity)
                    self.get_tenderee = True
            if self.get_tenderee:
                # Keep the entity list ordered after the insertions; skip attachments once found.
                list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index))
                break
# Time-entity category classifier
class TimePredictor():
    """Classifies time entities with a TensorFlow SavedModel.

    The model consumes word2vec embeddings of the token windows to the left
    and right of each time entity and predicts a time sub-category; label 0
    is the "none" class.
    """
    def __init__(self):
        self.sess = tf.Session(graph=tf.Graph())
        self.inputs_code = None    # cached input tensors [input0, input1]
        self.outputs_code = None   # cached output tensor
        # (number of context windows, window length in tokens, embedding size)
        self.input_shape = (2,40,128)
        self.load_model()
    def load_model(self):
        """Load the SavedModel once and cache its input/output tensors.

        :return: (inputs_code, outputs_code) — the two input tensors and the output tensor.
        """
        model_path = os.path.dirname(__file__)+'/timesplit_model'
        if self.inputs_code is None:
            log("get model of time")
            with self.sess.as_default():
                with self.sess.graph.as_default():
                    meta_graph_def = tf.saved_model.loader.load(self.sess, tags=["serve"], export_dir=model_path)
                    signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
                    signature_def = meta_graph_def.signature_def
                    self.inputs_code = []
                    self.inputs_code.append(
                        self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name))
                    self.inputs_code.append(
                        self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name))
                    self.outputs_code = self.sess.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
                    return self.inputs_code, self.outputs_code
        else:
            return self.inputs_code, self.outputs_code
    def search_time_data(self,list_sentences,list_entitys):
        """Collect (left, right) context embeddings for every 'time' entity.

        :return: [data_x, points_entitys] where data_x is transposed to
            (context_window, entity, token, dim), or None when no time entity exists.
        """
        data_x = []
        points_entitys = []
        for list_sentence, list_entity in zip(list_sentences, list_entitys):
            p_entitys = 0
            p_sentences = 0
            list_sentence.sort(key=lambda x: x.sentence_index)
            while(p_entitys<len(list_entity)):
                entity = list_entity[p_entitys]
                if entity.entity_type in ['time']:
                    while(p_sentences<len(list_sentence)):
                        sentence = list_sentence[p_sentences]
                        if entity.doc_id == sentence.doc_id and entity.sentence_index == sentence.sentence_index:
                            # left = sentence.sentence_text[max(0,entity.wordOffset_begin-self.input_shape[1]):entity.wordOffset_begin]
                            # right = sentence.sentence_text[entity.wordOffset_end:entity.wordOffset_end+self.input_shape[1]]
                            s = spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=self.input_shape[1])
                            # NOTE(review): assumes spanWindow without center_include returns [left, right] — confirm against spanWindow's definition
                            left = s[0]
                            right = s[1]
                            context = [left, right]
                            x = self.embedding_words(context, shape=self.input_shape)
                            data_x.append(x)
                            points_entitys.append(entity)
                            break
                        p_sentences += 1
                p_entitys += 1
        if len(points_entitys)==0:
            return None
        # Reorder to (context_window, entity, token, dim) so data_x[0]/data_x[1] feed the two inputs.
        data_x = np.transpose(np.array(data_x), (1, 0, 2, 3))
        return [data_x, points_entitys]
    def embedding_words(self, datas, shape):
        '''
        Look up the word2vec embedding for each token.

        :param datas: list of token lists (one per context window)
        :param shape: target shape of the returned embedding array
        :return: zero-initialised array of the given shape filled with the embeddings
        '''
        model_w2v = getModel_w2v()
        embed = np.zeros(shape)
        length = shape[1]
        out_index = 0
        for data in datas:
            index = 0
            for item in data:
                item_not_space = re.sub("\s*", "", item)
                if index >= length:
                    break
                if item_not_space in model_w2v.vocab:
                    embed[out_index][index] = model_w2v[item_not_space]
                    index += 1
                else:
                    # Out-of-vocabulary tokens map to the 'unk' vector.
                    embed[out_index][index] = model_w2v['unk']
                    index += 1
            out_index += 1
        return embed
    def predict(self, list_sentences,list_entitys):
        """Run the model over all time entities and set each entity's role in place."""
        datas = self.search_time_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        with self.sess.as_default():
            predict_y = limitRun(self.sess,[self.outputs_code], feed_dict={self.inputs_code[0]:datas[0][0]
                                                                          ,self.inputs_code[1]:datas[0][1]})[0]
            for i in range(len(predict_y)):
                entity = points_entitys[i]
                label = np.argmax(predict_y[i])
                values = []
                for item in predict_y[i]:
                    values.append(item)
                if label != 0:
                    # Demote to the "none" class when the text does not parse as a valid time.
                    if not timeFormat(entity.entity_text):
                        label = 0
                        values[0] = 0.5
                entity.set_Role(label, values)
# Product field extraction
class ProductPredictor():
    """Character-level sequence tagger (frozen TF graph + CRF) that extracts
    product mentions and, for failed bids, the failure reason."""
    def __init__(self):
        vocabpath = os.path.dirname(__file__) + "/codename_vocab.pk"
        self.vocab = load(vocabpath)
        self.word2index = dict((w, i) for i, w in enumerate(np.array(self.vocab)))
        self.sess = tf.Session(graph=tf.Graph())
        self.load_model()
    def load_model(self):
        """Load the frozen graph and cache the input/output tensor handles."""
        # model_path = os.path.dirname(__file__)+'/product_savedmodel/product.pb'
        model_path = os.path.dirname(__file__)+'/product_savedmodel/productAndfailreason.pb'
        with self.sess.as_default():
            with self.sess.graph.as_default():
                output_graph_def = tf.GraphDef()
                with open(model_path, 'rb') as f:
                    output_graph_def.ParseFromString(f.read())
                    tf.import_graph_def(output_graph_def, name='')
                    self.sess.run(tf.global_variables_initializer())
                    self.char_input = self.sess.graph.get_tensor_by_name('CharInputs:0')
                    self.length = self.sess.graph.get_tensor_by_name("Sum:0")
                    self.dropout = self.sess.graph.get_tensor_by_name("Dropout:0")
                    self.logit = self.sess.graph.get_tensor_by_name("logits/Reshape:0")
                    self.tran = self.sess.graph.get_tensor_by_name("crf_loss/transitions:0")
    def decode(self,logits, lengths, matrix):
        """Viterbi-decode CRF emission scores.

        :param logits: (batch, max_len, num_tags) emission scores
        :param lengths: true sequence lengths, one per batch row
        :param matrix: CRF transition matrix
        :return: list of tag-id paths, one per sequence
        """
        paths = []
        small = -1000.0
        # start = np.asarray([[small] * 4 + [0]])
        # Virtual start step: 7 real tags plus the start tag.
        start = np.asarray([[small]*7+[0]])
        for score, length in zip(logits, lengths):
            score = score[:length]
            pad = small * np.ones([length, 1])
            # NOTE: rebinds the `logits` parameter inside the loop (intentionally shadows it).
            logits = np.concatenate([score, pad], axis=1)
            logits = np.concatenate([start, logits], axis=0)
            path, _ = viterbi_decode(logits, matrix)
            paths.append(path[1:])  # drop the virtual start step
        return paths
    def predict(self, list_sentences,list_entitys=None,list_articles=[], fail=False, MAX_AREA=5000):
        '''
        Tag product spans (and, when fail=True, failure-reason spans).
        Each sentence is truncated to MAX_AREA characters.

        :param list_sentences: sentences per document, [[Sentence, ...], ...]
        :param list_entitys: entities per document; product entities are appended in place
        :param list_articles: articles; only used when fail=True
        :param fail: when True, tag the raw article text and extract failure reasons
        :param MAX_AREA: maximum number of characters taken per sentence / article
        :return: {'fail_reason': str}; product results are delivered via list_entitys
        '''
        with self.sess.as_default() as sess:
            with self.sess.graph.as_default():
                result = []
                if fail and list_articles!=[]:
                    text_list = [list_articles[0].content[:MAX_AREA]]
                    chars = [[self.word2index.get(it, self.word2index.get('<unk>')) for it in text] for text in text_list]
                    lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
                                                      feed_dict={
                                                          self.char_input: np.asarray(chars),
                                                          self.dropout: 1.0
                                                      })
                    batch_paths = self.decode(scores, lengths, tran_)
                    for text, path, length in zip(text_list, batch_paths, lengths):
                        tags = ''.join([str(it) for it in path[:length]])
                        # Tag ids 1/2*/3 delimit a product span.
                        for it in re.finditer("12*3", tags):
                            start = it.start()
                            end = it.end()
                            _entity = Entity(doc_id=list_articles[0].doc_id, entity_id="%s_%s_%s_%s" % (
                                list_articles[0].doc_id, 0, start, end),
                                             entity_text=text[start:end],
                                             entity_type="product", sentence_index=0,
                                             begin_index=0, end_index=0, wordOffset_begin=start,
                                             wordOffset_end=end)
                            list_entitys[0].append(_entity)
                        # Tag ids 4/5*/6 delimit a failure-reason span.
                        for it in re.finditer("45*6", tags):
                            start = it.start()
                            end = it.end()
                            result.append(text[start:end].replace('?', '').strip())
                    # Deduplicate reasons; a line with a ticked checkbox "(√)" supersedes all others.
                    reasons = []
                    for it in result:
                        if "(√)" in it or "(√)" in it:
                            reasons = [it]
                            break
                        if reasons != [] and (it not in reasons[-1] and it not in reasons):
                            reasons.append(it)
                        elif reasons == []:
                            reasons.append(it)
                    return {'fail_reason':';'.join(reasons)}
                if list_entitys is None:
                    list_entitys = [[] for _ in range(len(list_sentences))]
                for list_sentence, list_entity in zip(list_sentences,list_entitys):
                    if len(list_sentence)==0:
                        result.append({"product":[]})
                        continue
                    # Sort longest-first so each batch shares one padded length.
                    list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
                    _begin_index = 0
                    item = {"product":[]}
                    temp_list = []
                    while True:
                        MAX_LEN = len(list_sentence[_begin_index].sentence_text)
                        if MAX_LEN > MAX_AREA:
                            MAX_LEN = MAX_AREA
                        # Batch size chosen so that batch_size * MAX_LEN <= MAX_AREA.
                        _LEN = MAX_AREA//MAX_LEN
                        chars = [sentence.sentence_text[:MAX_LEN] for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
                        chars = [[self.word2index.get(it, self.word2index.get('<unk>')) for it in l] for l in chars]
                        chars = pad_sequences(chars, maxlen=MAX_LEN, padding="post", truncating="post")
                        lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
                                                          feed_dict={
                                                              self.char_input: np.asarray(chars),
                                                              self.dropout: 1.0
                                                          })
                        batch_paths = self.decode(scores, lengths, tran_)
                        for sentence, path, length in zip(list_sentence[_begin_index:_begin_index+_LEN],batch_paths, lengths):
                            tags = ''.join([str(it) for it in path[:length]])
                            # Tag ids 1/2*/3 delimit a product span.
                            for it in re.finditer("12*3", tags):
                                start = it.start()
                                end = it.end()
                                _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
                                    sentence.doc_id, sentence.sentence_index, start, end),
                                                 entity_text=sentence.sentence_text[start:end],
                                                 entity_type="product", sentence_index=sentence.sentence_index,
                                                 begin_index=0, end_index=0, wordOffset_begin=start,
                                                 wordOffset_end=end,in_attachment=sentence.in_attachment)
                                list_entity.append(_entity)
                                temp_list.append(sentence.sentence_text[start:end])
                        # item["product"] = list(set(temp_list))
                        # result.append(item)
                        if _begin_index+_LEN >= len(list_sentence):
                            break
                        _begin_index += _LEN
                    item["product"] = list(set(temp_list))
                    result.append(item)  # bug fix: append once per document, not once per batch
                # NOTE(review): `result` (per-document product lists) is built but not
                # returned here; products reach callers via list_entitys.
                return {'fail_reason': ""}
  1753. # 产品数量单价品牌规格提取 #2021/11/10 添加表格中的项目、需求、预算、时间要素提取
class ProductAttributesPredictor():
    """Extract product attributes from HTML tables of a procurement notice.

    Two kinds of rows are extracted:
      * product rows  -> (product, quantity, unitPrice, brand, specs)
      * demand rows   -> (project_name, demand, budget, order_begin, order_end)

    Added 2021/11/10: also extracts project/demand/budget/time fields from
    tables (采购意向 "procurement intention" notices).
    """
    def __init__(self,):
        # p1: preferred header regex — an object noun optionally followed by ')'
        # and then 名称/内容/描述 ("name/content/description").
        self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)'
        # p2: fallback header regex — bare object nouns, used only when p1 fails.
        self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称'
        # Pre-built set of known table-header cell strings, pickled next to this module.
        with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
            self.header_set = pickle.load(f)

    def isTrueTable(self, table):
        '''Heuristics for deciding whether a <table> is a real data table:
        1. contains <caption> or <th>                     -> real
        2. contains many links/forms/images (> 5)         -> fake
        3. fewer than 2 rows                              -> fake
        4. a <table> nesting another <table>: the inner
           one is usually real, the outer one fake        -> fake (outer)
        '''
        if table.find_all(['caption', 'th']) != []:
            return True
        elif len(table.find_all(['form', 'a', 'img'])) > 5:
            return False
        elif len(table.find_all(['tr'])) < 2:
            return False
        elif len(table.find_all(['table'])) >= 1:
            return False
        else:
            return True

    def getTrs(self, tbody):
        # Collect all <tr> rows: direct children, plus rows of a direct <tbody> child.
        trs = []
        objs = tbody.find_all(recursive=False)
        for obj in objs:
            if obj.name == "tr":
                trs.append(obj)
            if obj.name == "tbody":
                for tr in obj.find_all("tr", recursive=False):
                    trs.append(tr)
        return trs

    def getTable(self, tbody):
        """Flatten a table element into a list of rows of whitespace-stripped cell texts.

        Rows with fewer than 2 cells are dropped; returns [] when the table has
        fewer than 2 rows.
        """
        trs = self.getTrs(tbody)
        inner_table = []
        if len(trs) < 2:
            return inner_table
        for tr in trs:
            tr_line = []
            tds = tr.findChildren(['td', 'th'], recursive=False)
            if len(tds) < 2:
                continue
            for td in tds:
                td_text = re.sub('\s', '', td.get_text())
                tr_line.append(td_text)
            inner_table.append(tr_line)
        return inner_table

    def fixSpan(self, tbody):
        # Expand colspan/rowspan by duplicating cells so the table becomes rectangular.
        trs = self.getTrs(tbody)
        ths_len = 0
        ths = list()
        trs_set = set()
        # Columns (colspan) are completed before rows (rowspan); doing it the
        # other way round can scramble the parsed table.
        # First pass: colspan. Iterate every tr.
        for indtr, tr in enumerate(trs):
            ths_tmp = tr.findChildren('th', recursive=False)
            # Skip rows that themselves contain a nested table.
            if len(tr.findChildren('table')) > 0:
                continue
            if len(ths_tmp) > 0:
                ths_len = ths_len + len(ths_tmp)
                for th in ths_tmp:
                    ths.append(th)
                trs_set.add(tr)
            # Walk the cells of this row.
            tds = tr.findChildren(recursive=False)
            if len(tds) < 3:
                continue  # too few columns — not worth completing
            for indtd, td in enumerate(tds):
                # A colspan cell is duplicated into the following positions of the same row.
                if 'colspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['colspan']))) != "":
                    col = int(re.sub("[^0-9]", "", str(td['colspan'])))
                    # Sanity limits: ignore absurd spans and very long cells.
                    if col < 10 and len(td.get_text()) < 500:
                        td['colspan'] = 1
                        for i in range(1, col, 1):
                            td.insert_after(copy.copy(td))
        # Second pass: rowspan.
        for indtr, tr in enumerate(trs):
            ths_tmp = tr.findChildren('th', recursive=False)
            # Skip rows that themselves contain a nested table.
            if len(tr.findChildren('table')) > 0:
                continue
            if len(ths_tmp) > 0:
                ths_len = ths_len + len(ths_tmp)
                for th in ths_tmp:
                    ths.append(th)
                trs_set.add(tr)
            # Walk the cells of this row.
            tds = tr.findChildren(recursive=False)
            same_span = 0
            # If every cell of the row carries the same rowspan, the row is
            # intentionally merged — leave it alone.
            if len(tds) > 1 and 'rowspan' in tds[0].attrs:
                span0 = tds[0].attrs['rowspan']
                for td in tds:
                    if 'rowspan' in td.attrs and td.attrs['rowspan'] == span0:
                        same_span += 1
            if same_span == len(tds):
                continue
            for indtd, td in enumerate(tds):
                # A rowspan cell is duplicated into the same position of the following rows.
                if 'rowspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['rowspan']))) != "":
                    row = int(re.sub("[^0-9]", "", str(td['rowspan'])))
                    td['rowspan'] = 1
                    for i in range(1, row, 1):
                        # Fetch the tds of the next row and insert at the matching position.
                        if indtr + i < len(trs):
                            tds1 = trs[indtr + i].findChildren(['td', 'th'], recursive=False)
                            if len(tds1) >= (indtd) and len(tds1) > 0:
                                if indtd > 0:
                                    tds1[indtd - 1].insert_after(copy.copy(td))
                                else:
                                    tds1[0].insert_before(copy.copy(td))
                            elif len(tds1) > 0 and len(tds1) == indtd - 1:
                                tds1[indtd - 2].insert_after(copy.copy(td))

    def get_monthlen(self, year, month):
        '''Return the number of days of (year, month) as a string.

        Falls back to "30" on any parse error (bare except is deliberate
        best-effort — NOTE(review): consider narrowing to ValueError).
        '''
        try:
            weekday, num = calendar.monthrange(int(year), int(month))
        except:
            num = 30
        return str(num)

    def fix_time(self, text, html, page_time):
        '''Normalize a raw date string into an ("YYYY-MM-DD", "YYYY-MM-DD")
        (order_begin, order_end) pair.

        :param text: raw date text found in a table cell
        :param html: full notice text, used to recover a missing year
        :param page_time: publication date of the page, second fallback for the year
        :return: (order_begin, order_end), or ("", "") when nothing parses
        '''
        # Map Chinese numeral months to digits first.
        for it in [('十二', '12'),('十一', '11'),('十','10'),('九','9'),('八','8'),('七','7'),
                   ('六','6'),('五','5'),('四','4'),('三','3'),('二','2'),('一','1')]:
            if it[0] in text:
                text = text.replace(it[0], it[1])
        # Case "N月" (month only): recover the year from html, then page_time,
        # then the current year.
        if re.search('^\d{1,2}月$', text):
            m = re.search('^(\d{1,2})月$', text).group(1)
            if len(m) < 2:
                m = '0' + m
            year = re.search('(\d{4})年(.{,12}采购意向)?', html)
            if year:
                y = year.group(1)
                num = self.get_monthlen(y, m)
                if len(num) < 2:
                    num = '0' + num
                order_begin = "%s-%s-01" % (y, m)
                order_end = "%s-%s-%s" % (y, m, num)
            elif page_time != "":
                year = re.search('\d{4}', page_time)
                if year:
                    y = year.group(0)
                    num = self.get_monthlen(y, m)
                    if len(num) < 2:
                        num = '0' + num
                    order_begin = "%s-%s-01" % (y, m)
                    order_end = "%s-%s-%s" % (y, m, num)
                else:
                    y = str(datetime.datetime.now().year)
                    num = self.get_monthlen(y, m)
                    if len(num) < 2:
                        num = '0' + num
                    order_begin = "%s-%s-01" % (y, m)
                    order_end = "%s-%s-%s" % (y, m, num)
            else:
                y = str(datetime.datetime.now().year)
                num = self.get_monthlen(y, m)
                if len(num) < 2:
                    num = '0' + num
                order_begin = "%s-%s-01" % (y, m)
                order_end = "%s-%s-%s" % (y, m, num)
            return order_begin, order_end
        # Case "YYYY年M月" — whole month.
        t1 = re.search('^(\d{4})(年|/|\.|-)(\d{1,2})月?$', text)
        if t1:
            year = t1.group(1)
            month = t1.group(3)
            num = self.get_monthlen(year, month)
            if len(month)<2:
                month = '0'+month
            if len(num) < 2:
                num = '0'+num
            order_begin = "%s-%s-01" % (year, month)
            order_end = "%s-%s-%s" % (year, month, num)
            return order_begin, order_end
        # Case "YYYY年M月D日" — single day.
        t2 = re.search('^(\d{4})(年|/|\.|-)(\d{1,2})(月|/|\.|-)(\d{1,2})日?$', text)
        if t2:
            y = t2.group(1)
            m = t2.group(3)
            d = t2.group(5)
            m = '0'+ m if len(m)<2 else m
            d = '0'+d if len(d)<2 else d
            order_begin = order_end = "%s-%s-%s"%(y,m,d)
            return order_begin, order_end
        # Case like "202105" (year+month, no separators).
        t3 = re.search("^(20\d{2})(\d{1,2})$",text)
        if t3:
            year = t3.group(1)
            month = t3.group(2)
            if int(month)>0 and int(month)<=12:
                num = self.get_monthlen(year, month)
                if len(month) < 2:
                    month = '0' + month
                if len(num) < 2:
                    num = '0' + num
                order_begin = "%s-%s-01" % (year, month)
                order_end = "%s-%s-%s" % (year, month, num)
                return order_begin, order_end
        # Case like "20210510" (compact full date).
        t4 = re.search("^(20\d{2})(\d{2})(\d{2})$", text)
        if t4:
            year = t4.group(1)
            month = t4.group(2)
            day = t4.group(3)
            if int(month) > 0 and int(month) <= 12 and int(day)>0 and int(day)<=31:
                order_begin = order_end = "%s-%s-%s"%(year,month,day)
                return order_begin, order_end
        # Last resort: a "from .. to .." range, e.g. "2021年3月到2021年5月".
        all_match = re.finditer('^(?P<y1>\d{4})(年|/|\.)(?P<m1>\d{1,2})(?:(月|/|\.)(?:(?P<d1>\d{1,2})日)?)?'
                                '(到|至|-)(?:(?P<y2>\d{4})(年|/|\.))?(?P<m2>\d{1,2})(?:(月|/|\.)'
                                '(?:(?P<d2>\d{1,2})日)?)?$', text)
        y1 = m1 = d1 = y2 = m2 = d2 = ""
        found_math = False
        for _match in all_match:
            if len(_match.group()) > 0:
                found_math = True
                for k, v in _match.groupdict().items():
                    if v!="" and v is not None:
                        if k == 'y1':
                            y1 = v
                        elif k == 'm1':
                            m1 = v
                        elif k == 'd1':
                            d1 = v
                        elif k == 'y2':
                            y2 = v
                        elif k == 'm2':
                            m2 = v
                        elif k == 'd2':
                            d2 = v
        if not found_math:
            return "", ""
        # Fill defaults: missing end-year = start-year; missing start-day = 1st;
        # missing end-day = last day of the end month. Then zero-pad.
        y2 = y1 if y2 == "" else y2
        d1 = '1' if d1 == "" else d1
        d2 = self.get_monthlen(y2, m2) if d2 == "" else d2
        m1 = '0' + m1 if len(m1) < 2 else m1
        m2 = '0' + m2 if len(m2) < 2 else m2
        d1 = '0' + d1 if len(d1) < 2 else d1
        d2 = '0' + d2 if len(d2) < 2 else d2
        order_begin = "%s-%s-%s"%(y1,m1,d1)
        order_end = "%s-%s-%s"%(y2,m2,d2)
        return order_begin, order_end

    def find_header(self, items, p1, p2):
        '''
        Decide whether a table row is a header row; if so, return where each
        attribute column sits.

        :param items: list of cell texts of one row
        :param p1: preferred header regex (see __init__)
        :param p2: fallback header regex
        :return: (header_dic mapping attribute -> column index,
                  flag (True when this is a header row),
                  (product, quantity, unitPrice, brand, specs) header texts,
                  (product, demand, budget, order_time) header texts)
        '''
        flag = False
        # Keys are attribute names (名称=name, 数量=quantity, 单价=unit price,
        # 品牌=brand, 规格=specs, 需求=demand, 预算=budget, 时间=time);
        # values become column indices once found.
        header_dic = {'名称': '', '数量': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': ''}
        product = ""    # product
        quantity = ""   # quantity
        unitPrice = ""  # unit price
        brand = ""      # brand
        specs = ""      # specifications
        demand = ""     # procurement demand
        budget = ""     # budget amount
        order_time = "" # procurement time
        # The name column must appear within the first 4 cells.
        for i in range(min(4, len(items))):
            it = items[i]
            if len(it) < 15 and re.search(p1, it) != None:
                flag = True
                product = it
                header_dic['名称'] = i
                break
        if not flag:
            # Fallback pattern, rejecting cells that look like other columns
            # (IDs, units, prices, companies, ...).
            for i in range(min(4, len(items))):
                it = items[i]
                if len(it) < 15 and re.search(p2, it) and re.search(
                        '编号|编码|号|情况|报名|单位|位置|地址|数量|单价|价格|金额|品牌|规格类型|型号|公司|中标人|企业|供应商|候选人', it) == None:
                    flag = True
                    product = it
                    header_dic['名称'] = i
                    break
        if flag:
            # Scan the remaining cells for the other attribute columns.
            for j in range(i + 1, len(items)):
                # Skip long prose cells (lots of CJK text) — not headers.
                if len(items[j]) > 20 and len(re.sub('[\((].*[)\)]|[^\u4e00-\u9fa5]', '', items[j])) > 10:
                    continue
                if header_dic['数量']=="" and re.search('数量', items[j]):
                    header_dic['数量'] = j
                    quantity = items[j]
                elif re.search('单价', items[j]):
                    header_dic['单价'] = j
                    unitPrice = items[j]
                elif re.search('品牌', items[j]):
                    header_dic['品牌'] = j
                    brand = items[j]
                elif re.search('规格', items[j]):
                    header_dic['规格'] = j
                    specs = items[j]
                elif re.search('需求', items[j]):
                    header_dic['需求'] = j
                    demand = items[j]
                elif re.search('预算', items[j]):
                    header_dic['预算'] = j
                    budget = items[j]
                elif re.search('时间|采购实施月份|采购月份|采购日期', items[j]):
                    header_dic['时间'] = j
                    order_time = items[j]
            # Only accept the row as a header when at least 2 attribute
            # columns (including the name) were recognised.
            if header_dic.get('名称', "") != "" :
                num = 0
                for it in (quantity, unitPrice, brand, specs, product, demand, budget, order_time):
                    if it != "":
                        num += 1
                if num >=2:
                    return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)
            flag = False
        return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)

    def predict(self, docid='', html='', page_time=""):
        '''
        Regex-based extraction of product information from the <table>s of a notice.

        :param docid: document id (unused here, kept for interface parity)
        :param html: raw HTML of the notice
        :param page_time: publication date, used as a year fallback by fix_time
        :return: ([product_attrs dict, demand_info dict], total_product_money)
        '''
        soup = BeautifulSoup(html, 'lxml')
        # 采购意向 = "procurement intention" notices get the key/value-table treatment.
        flag_yx = True if re.search('采购意向', html) else False
        tables = soup.find_all(['table'])
        headers = []
        headers_demand = []
        header_col = []
        product_link = []
        demand_link = []
        total_product_money = 0
        # Walk tables back to front so inner (usually real) tables are seen first.
        for i in range(len(tables)-1, -1, -1):
            table = tables[i]
            # A tiny table inside a td is layout, not data: flatten it to text.
            if table.parent.name == 'td' and len(table.find_all('td')) <= 3:
                table.string = table.get_text()
                table.name = 'turntable'
                continue
            if not self.isTrueTable(table):
                continue
            self.fixSpan(table)
            inner_table = self.getTable(table)
            # NOTE: `i` is reused below as the row index, shadowing the table
            # index — harmless because `table` was fetched above.
            i = 0
            found_header = False
            header_colnum = 0
            if flag_yx:
                # Key/value layout: two-column rows "label: value".
                col0_l = []
                col1_l = []
                for tds in inner_table:
                    if len(tds) == 2:
                        col0_l.append(re.sub(':', '', tds[0]))
                        col1_l.append(tds[1])
                # Accept only if enough labels are known header strings.
                if len(set(col0_l) & self.header_set) > len(col0_l) * 0.2:
                    header_list2 = []
                    product = demand = budget = order_begin = order_end = ""
                    for i in range(len(col0_l)):
                        if re.search('项目名称', col0_l[i]):
                            header_list2.append(col0_l[i])
                            product = col1_l[i]
                        elif re.search('采购需求|需求概况', col0_l[i]):
                            header_list2.append(col0_l[i])
                            demand = col1_l[i]
                        elif re.search('采购预算|预算金额', col0_l[i]):
                            header_list2.append(col0_l[i])
                            budget = col1_l[i]
                            re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?", budget)
                            if re_price:
                                budget = re_price[0]
                                # "万元" in the label means the value is in units of 10,000 yuan.
                                if '万元' in col0_l[i] and '万' not in budget:
                                    budget += '万元'
                                budget = str(getUnifyMoney(budget))
                            else:
                                budget = ""
                        elif re.search('采购时间|采购实施月份|采购月份|采购日期', col0_l[i]):
                            header_list2.append(col0_l[i])
                            order_time = col1_l[i].strip()
                            order_begin, order_end = self.fix_time(order_time, html, page_time)
                    if order_begin != "" and order_end!="":
                        order_begin_year = int(order_begin.split("-")[0])
                        order_end_year = int(order_end.split("-")[0])
                        # Guard against years mis-read from attachments.
                        if order_begin_year>=2050 or order_end_year>=2050:
                            order_begin = order_end = ""
                    if product!= "" and demand != "" and budget!="" and order_begin != "":
                        link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
                                'order_begin': order_begin, 'order_end': order_end}
                        if link not in demand_link:
                            demand_link.append(link)
                            headers_demand.append('_'.join(header_list2))
                    continue
            # Regular layout: one header row followed by data rows.
            while i < (len(inner_table)):
                tds = inner_table[i]
                not_empty = [it for it in tds if it != ""]
                # Skip rows that are mostly duplicates/empty or too narrow.
                if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds)<2:
                    i += 1
                    continue
                product = ""    # product
                quantity = ""   # quantity
                unitPrice = ""  # unit price
                brand = ""      # brand
                specs = ""      # specifications
                demand = ""     # procurement demand
                budget = ""     # budget amount
                order_time = "" # procurement time
                order_begin = ""
                order_end = ""
                if len(set(tds) & self.header_set) > len(tds) * 0.2:
                    # Candidate header row.
                    header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
                    if found_header:
                        headers.append('_'.join(header_list))
                        headers_demand.append('_'.join(header_list2))
                        header_colnum = len(tds)
                        header_col.append('_'.join(tds))
                    i += 1
                    continue
                elif found_header:
                    # Data row under a previously found header.
                    if len(tds) != header_colnum:  # column count mismatch with the header — skip
                        i += 1
                        continue
                    id1 = header_dic.get('名称', "")
                    id2 = header_dic.get('数量', "")
                    id3 = header_dic.get('单价', "")
                    id4 = header_dic.get('品牌', "")
                    id5 = header_dic.get('规格', "")
                    id6 = header_dic.get('需求', "")
                    id7 = header_dic.get('预算', "")
                    id8 = header_dic.get('时间', "")
                    # Accept the name cell only if it carries real text and is
                    # not itself a header/summary cell.
                    if re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
                            re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id1]) == None:
                        product = tds[id1]
                        if id2 != "":
                            if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
                                quantity = tds[id2]
                            else:
                                quantity = ""
                        if id3 != "":
                            if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
                                unitPrice = tds[id3]
                                re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?",unitPrice)
                                if re_price:
                                    unitPrice = re_price[0]
                                    # "万元" in the unit-price header scales the value.
                                    if '万元' in header_list[2] and '万' not in unitPrice:
                                        unitPrice += '万元'
                                    # unitPrice = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", unitPrice)
                                    unitPrice = str(getUnifyMoney(unitPrice))
                                else:
                                    unitPrice = ""
                            else:
                                unitPrice = ""
                        if id4 != "":
                            if re.search('\w', tds[id4]):
                                brand = tds[id4]
                            else:
                                brand = ""
                        if id5 != "":
                            if re.search('\w', tds[id5]):
                                specs = tds[id5]
                            else:
                                specs = ""
                        if id6 != "":
                            if re.search('\w', tds[id6]):
                                demand = tds[id6]
                            else:
                                demand = ""
                        if id7 != "":
                            if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id7]):
                                budget = tds[id7]
                                re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?", budget)
                                if re_price:
                                    budget = re_price[0]
                                    # NOTE(review): header_list[2] is the unit-price header;
                                    # header_list2[2] (the budget header) was probably
                                    # intended here — confirm before changing.
                                    if '万元' in header_list[2] and '万' not in budget:
                                        budget += '万元'
                                    budget = str(getUnifyMoney(budget))
                                else:
                                    budget = ""
                            else:
                                budget = ""
                        if id8 != "":
                            if re.search('\w', tds[id8]):
                                order_time = tds[id8].strip()
                                order_begin, order_end = self.fix_time(order_time, html, page_time)
                        if quantity != "" or unitPrice != "" or brand != "" or specs != "":
                            link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
                                    'brand': brand[:50], 'specs':specs}
                            if link not in product_link:
                                product_link.append(link)
                                # Accumulate unitPrice * quantity into the running total.
                                mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
                                if link['unitPrice'] != "" and mat:
                                    try:
                                        total_product_money += float(link['unitPrice'])*float(mat.group(1).replace(',', ''))
                                    except:
                                        log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
                        if order_begin != "" and order_end != "":
                            order_begin_year = int(order_begin.split("-")[0])
                            order_end_year = int(order_end.split("-")[0])
                            # Guard against years mis-read from attachments.
                            if order_begin_year >= 2050 or order_end_year >= 2050:
                                order_begin = order_end = ""
                        if budget != "" and order_time != "":
                            link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end}
                            if link not in demand_link:
                                demand_link.append(link)
                    i += 1
                else:
                    # Neither a header row nor under a known header — advance.
                    i += 1
        if len(product_link)>0:
            attr_dic = {'product_attrs':{'data':product_link, 'header':headers, 'header_col':header_col}}
        else:
            attr_dic = {'product_attrs': {'data': [], 'header': [], 'header_col': []}}
        if len(demand_link)>0:
            demand_dic = {'demand_info':{'data':demand_link, 'header':headers_demand, 'header_col':header_col}}
        else:
            demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}}
        return [attr_dic, demand_dic], total_product_money

    def predict_without_table(self,product_attrs,list_sentences,list_entitys,codeName,prem, html='', page_time=""):
        """Fallback demand extraction when no table yielded results and the
        notice has exactly one prem package: take time entities whose left
        context looks like a procurement-time label, and when all of them
        normalize to the same (begin, end) range, emit one demand record built
        from the project name, the package budget and the product list.
        Mutates and returns product_attrs.
        """
        if len(prem[0]['prem'])==1:
            list_sentence = list_sentences[0]
            list_entity = list_entitys[0]
            _data = product_attrs[1]['demand_info']['data']
            re_bidding_time = re.compile("(采购时间|采购实施月份|采购月份|采购日期)[::,].{0,2}$")
            order_times = []
            for entity in list_entity:
                if entity.entity_type=='time':
                    sentence = list_sentence[entity.sentence_index]
                    # 20-token window to the left of the entity.
                    s = spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index,
                                   end_index=entity.end_index,size=20)
                    entity_left = "".join(s[0])
                    if re.search(re_bidding_time,entity_left):
                        time_text = entity.entity_text.strip()
                        # Prefer a standard "Y-M-D"-like substring when present.
                        standard_time = re.compile("((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*(?P<day>\d{1,2})日?)")
                        time_match = re.search(standard_time,time_text)
                        if time_match:
                            time_text = time_match.group()
                        order_times.append(time_text)
            # print(order_times)
            order_times = [tuple(self.fix_time(order_time, html, page_time)) for order_time in order_times]
            order_times = [order_time for order_time in order_times if order_time[0]!=""]
            # Only trust the result when every candidate agrees.
            if len(set(order_times))==1:
                order_begin,order_end = order_times[0]
                project_name = codeName[0]['name']
                pack_info = [pack for pack in prem[0]['prem'].values()]
                budget = pack_info[0].get('tendereeMoney',0)
                product = prem[0]['product']
                link = {'project_name': project_name, 'product': product, 'demand': project_name, 'budget': budget,
                        'order_begin': order_begin, 'order_end': order_end}
                _data.append(link)
                product_attrs[1]['demand_info']['data'] = _data
        return product_attrs
  2294. # docchannel类型提取
  2295. class DocChannel():
    def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb'):
        """Load the two frozen-graph classifiers (life-cycle stage and doc type)
        and set up the label maps and preprocessing constants.

        :param life_model: path (relative to this file) of the life-cycle model
        :param type_model: path (relative to this file) of the doc-type model
        """
        # Session + input/output tensors for the life-cycle classifier.
        self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
        self.mask, self.mask_title = self.load_life(life_model)
        # Session + input/output tensors for the doc-type classifier.
        self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
        self.type_mask, self.type_mask_title = self.load_type(type_model)
        self.sequen_len = 200  # content token length (150 was also tried)
        self.title_len = 30    # title token length
        self.sentence_num = 10 # max keyword-centred windows per document
        # Keywords used to pick salient windows from the content.
        self.kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
        # Class-id -> label maps; order must match the trained models' outputs.
        lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
        lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
        self.id2type = {k: v for k, v in enumerate(lb_type)}
        self.id2life = {k: v for k, v in enumerate(lb_life)}
        self.load_pattern()
    def load_pattern(self):
        """Build the rule-based regex dictionaries used to post-correct the
        model predictions: content patterns and title patterns for the document
        type, and content/title patterns for the life-cycle stage, plus a
        pattern of phrases that look like a win announcement but are not.
        """
        # Content patterns per document type.
        self.type_dic = {
            '土地矿产': '供地结果|(土地|用地|宗地|地块|海域|矿)的?(基本信息|基本情况|概况|信息|详情|来源|用途|性质|编号|位置|坐落|使用年限|出让年限)|(土地|山地|农田)(经营权)?(出让|出租|招租|租赁|承包|流转)|流转土地',
            '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|活动|信息|结果|成交|主体|标的|资产|财产|方式|类型|流程|程序|规则|价格|保证金|时间)|(公开|进行|密封)(拍卖|变卖|竞拍)|第[一二三]次拍卖|(资产|司法|网络)拍卖|交易方式.{,2}拍卖|拍卖会',
            '产权交易': '(产权|资产|权证)的?(类型|信息|名称|编号|(基本)?情况)|(经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量)(挂牌|转让|出让)|竞价销售|销售结果|房屋所有权房产|免租期限|交易期限|(受让|转让|承租|出租|买受)(人|方)|(店面|店铺|商铺|铺位?|门面|门市|食堂|饭堂|校舍|车位|停车场|厂?房|仓?库|馆|资产|物业|房产|房屋|场地|农田|鱼?塘)\w{,4}(处置|招租|出租|续租|租赁|转让)|(出租|转让|产权|资产)(项目|中标|成交|流标|废标)|出租(用途|类型)|转让底价|租赁(标的物|情况)',
            '采招数据': '(采购|招标|代理)(人|机构|单位)|(采购|招标)(条件|范围|文件|内容)|(申请人|投标人|供应商|报价人|参选人)的?资格要求;'  # dropped alternatives: 变更|答疑|澄清|中标|成交|合同|废标|流标
        }
        # Title patterns per document type.
        self.title_type_dic = {
            '土地矿产': '(土地|用地|宗地|荒地|山地|海域|矿)(出让|出租|招租|租赁|承包|流转|使用权|经营权|征收|划拨|中标|成交)|供地结果|矿业权|探矿权|采矿权|(土地|用地|宗地|地块)(使用权)?(终止|中止|网上)?(挂牌|出让|拍卖|招拍|划拨)|征收土地',
            '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|公示)|拍卖|变卖|流拍|竞拍',
            '产权交易': '经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让',
            '采招数据': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判)的?(公告|公示|中标|成交|结果|$)|工程招标',  # dropped alternatives: 意向|需求|预公?告|报建|总承包|工程|施工|设计|勘察|代理|监理 |变更|答疑|澄清|中标|成交|合同|废标|流标
            '新闻资讯': '(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)'
        }
        # Content patterns per life-cycle stage; ';' separates alternatives
        # that are split and tested individually by the consumer.
        self.life_dic = {
            '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
            '招标预告': '预计(采购|招标)(时间|日期)',
            '招标公告': '(采购|招标|竞选|报名)条件;报名时间;报名流程;报名方法;报名需提供的材料;参加竞价采购交易资格;(申请人|投标人|供应商|报价人|参选人)的?资格要求;获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件;(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)',
            '资审结果': '招标资审公告|评审入围公示|资审及业绩公示|资格后审情况报告|资格后审结果公告|资格后审结果公示|资格预审结果公告|资格预审结果公示|预审公示|预审结果公示',
            '招标答疑': '现澄清为|现澄清如下|第\d次澄清|答疑澄清公告|异议的回复|(最高(投标)?限价|控制价|拦标价)公示',
            '公告变更': '原公告(主要)?(信息|内容)|变更[前后]内容|现在?(变更|更正|修改|更改)为|(变更|更正)内容为|更正理由|更正人名称|[、\s](更正信息|更正内容):',
            '候选人公示': '候选人公示|评标结果公示',
            '中标信息': '供地结果信息|采用单源直接采购的?情况说明|现将\w{,4}(成交|中标|中选|选定结果|选取结果)\w{2,8}(进行公示|公示如下)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|(中标(候选人|人|成交)|成交)\w{,3}(信息|情况)[::\s]',
            '中标信息2': '(成交|中标)(日期|时间)[::\s]|成交金额:',
            '中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让|唯一)(供应商|供货商|服务商|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]',
            '合同公告': '合同(公告|公示)信息;合同(公告|公示)日期;合同(公告|公示)内容;合同编号;合同名称;合同签订日期;合同主体;供应商乙方',
            '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):废标|((本|该)项目|本标段|本次(招标)?)((采购|招标)?(失败|终止|流标|废标)|(按|做|作)(流标|废标)处理)',
        }
        # Title patterns per life-cycle stage.
        self.title_life_dic = {
            '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
            '招标预告': '预公?告|预公示|报建公告|(批前|标前)公示|供应计划$|(论证|征求|征集)(供应商)?意见|意见征询|需求评审公告|需求(公告|公示|意见)',
            '公告变更': '(变更|更正(事项)?|更改|延期|暂停)的?(公告|公示|通知)|变更$|更正$',
            '招标答疑': '质疑|澄清|答疑(文件)?|补遗书?|(最高(投标)?限价|控制价|拦标价)(公示|公告)',
            '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销|取消成交)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)',
            '合同公告': '(合同(成交)?|履约验收|履约|验收结果)(公告|公示|信息|公式)|合同备案|合同书',  # dropped alternative: 合同$|
            '候选人公示': '候选人公示|评标(结果)?公示|中标前?公示|中标预公示',
            '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果)?(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)结果|开标(记录|信息|情况)|中标通知书|中标$',
            # Previous version of the 资审结果 pattern, kept for reference:
            # '资审结果': '(资质|资格)(预审|后审)(入围)?(公示|公告|报告)|(资质|资格)?(预审|后审)(入围)?(公示|公告|报告)|(资质|资格)(审查|预审)结果(公示)?|资审结果公示|未?入围(公示|公告)|资审及业绩公示',
            '资审结果': '((资格|资质)(审查|预审|后审|审核|入围项?目?)|资审|入围)结果(公告|公示)?|(资质|资格)(预审|后审|入围)(入围)?(公示|公告|报告)|(资质|资格)?(预审|后审)(入围)?(公示|公告|报告)|未?入围(公示|公告)|资审及业绩公示',
            '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)',
        }
        # Phrases that mention winning but do NOT mean this notice is a win
        # announcement (used to veto false 中标信息 predictions).
        self.wrong_win = '按项目控制价下浮\d%即为成交价|不得确定为(中标|成交)|招标人按下列原则选择中标人|确定成交供应商:|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)|竞拍起止时间:'
    def load_life(self,life_model):
        """Load a frozen TF1 classifier graph from disk and resolve its tensors.

        :param life_model: model file path relative to this file's directory
        :return: (sess, title, inputs, prob, softmax, mask, mask_title) — the
                 live session plus the input placeholders (title tokens, content
                 tokens, dropout keep-prob, masks) and the softmax output tensor.
        """
        with tf.Graph().as_default() as graph:
            output_graph_def = graph.as_graph_def()
            with open(os.path.dirname(__file__)+life_model, 'rb') as f:
                # Deserialize the frozen GraphDef and import it into this graph.
                output_graph_def.ParseFromString(f.read())
                tf.import_graph_def(output_graph_def, name='')
                # print("%d ops in the final graph" % len(output_graph_def.node))
                del output_graph_def
            sess = tf.Session(graph=graph)
            sess.run(tf.global_variables_initializer())
            # Tensor names are fixed by the export script of the frozen model.
            inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
            prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
            title = sess.graph.get_tensor_by_name('inputs/title:0')
            mask = sess.graph.get_tensor_by_name('inputs/mask:0')
            mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
            # logit = sess.graph.get_tensor_by_name('output/logit:0')
            softmax = sess.graph.get_tensor_by_name('output/softmax:0')
            return sess, title, inputs, prob, softmax, mask, mask_title
  2370. def load_type(self,type_model):
  2371. with tf.Graph().as_default() as graph:
  2372. output_graph_def = graph.as_graph_def()
  2373. with open(os.path.dirname(__file__)+type_model, 'rb') as f:
  2374. output_graph_def.ParseFromString(f.read())
  2375. tf.import_graph_def(output_graph_def, name='')
  2376. # print("%d ops in the final graph" % len(output_graph_def.node))
  2377. del output_graph_def
  2378. sess = tf.Session(graph=graph)
  2379. sess.run(tf.global_variables_initializer())
  2380. inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
  2381. prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
  2382. title = sess.graph.get_tensor_by_name('inputs/title:0')
  2383. mask = sess.graph.get_tensor_by_name('inputs/mask:0')
  2384. mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
  2385. # logit = sess.graph.get_tensor_by_name('output/logit:0')
  2386. softmax = sess.graph.get_tensor_by_name('output/softmax:0')
  2387. return sess, title, inputs, prob, softmax, mask, mask_title
  2388. def predict_process(self, docid='', doctitle='', dochtmlcon=''):
  2389. # print('准备预处理')
  2390. def get_kw_senten(s, span=10):
  2391. doc_sens = []
  2392. tmp = 0
  2393. num = 0
  2394. end_idx = 0
  2395. for it in re.finditer(self.kws, s): # '|'.join(keywordset)
  2396. left = s[end_idx:it.end()].split()
  2397. right = s[it.end():].split()
  2398. tmp_seg = s[tmp:it.start()].split()
  2399. if len(tmp_seg) > span or tmp == 0:
  2400. doc_sens.append(' '.join(left[-span:] + right[:span]))
  2401. end_idx = it.end() + 1 + len(' '.join(right[:span]))
  2402. tmp = it.end()
  2403. num += 1
  2404. if num >= self.sentence_num:
  2405. break
  2406. if doc_sens == []:
  2407. doc_sens.append(s)
  2408. return doc_sens
  2409. def word2id(wordlist, max_len=self.sequen_len):
  2410. ids = [getIndexOfWords(w) for w in wordlist]
  2411. ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
  2412. assert len(ids) == max_len
  2413. return ids
  2414. cost_time = dict()
  2415. datas = []
  2416. datas_title = []
  2417. try:
  2418. segword_title = ' '.join(selffool.cut(doctitle)[0])
  2419. segword_content = dochtmlcon
  2420. except:
  2421. segword_content = ''
  2422. segword_title = ''
  2423. if isinstance(segword_content, float):
  2424. segword_content = ''
  2425. if isinstance(segword_title, float):
  2426. segword_title = ''
  2427. segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
  2428. replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
  2429. replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
  2430. segword_title = re.sub('[^\s\u4e00-\u9fa5]', '', segword_title)
  2431. segword_content = re.sub('[^\s\u4e00-\u9fa5]', '', segword_content)
  2432. doc_word_list = segword_content.split()
  2433. if len(doc_word_list) > self.sequen_len / 2:
  2434. doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
  2435. doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
  2436. else:
  2437. doc_sens = ' '.join(doc_word_list[:self.sequen_len])
  2438. # print('标题:',segword_title)
  2439. # print('正文:',segword_content)
  2440. datas.append(doc_sens.split())
  2441. datas_title.append(segword_title.split())
  2442. # print('完成预处理')
  2443. return datas, datas_title
  2444. def is_houxuan(self, title, content):
  2445. '''
  2446. 通过标题和中文内容判断是否属于候选人公示类别
  2447. :param title: 公告标题
  2448. :param content: 公告正文文本内容
  2449. :return: 1 是候选人公示 ;0 不是
  2450. '''
  2451. if re.search('候选人的?公示|评标结果|评审结果|中标公示', title): # (中标|成交|中选|入围)
  2452. if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title):
  2453. return 0
  2454. return 1
  2455. if re.search('候选人的?公示', content[:100]):
  2456. if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]):
  2457. return 0
  2458. return 1
  2459. else:
  2460. return 0
    def predict(self, title='', list_sentence='', web_source_no='', original_docchannel=''):
        """Model-only prediction of announcement type (doctype) and lifecycle (docchannel).

        :param title: announcement title
        :param list_sentence: list of preprocessed sentence objects, each exposing .tokens
        :param web_source_no: source-site number; some sources are short-circuited
        :param original_docchannel: upstream channel id; ids in not_extract_dic skip prediction
        :return: {'docchannel': {'docchannel': ..., 'doctype': ..., 'original_docchannel_id': ...}}
        """
        # Upstream channel ids whose documents bypass the models entirely.
        not_extract_dic = {
            104: '招标文件',
            106: '法律法规',
            107: '新闻资讯',
            108: '拟建项目',
            109: '展会推广',
            110: '企业名录',
            111: '企业资质',
            112: '全国工程人员',
            113: '业主采购'
        }
        if original_docchannel in not_extract_dic:
            return {'docchannel': {'docchannel':'', 'doctype':not_extract_dic[original_docchannel], "original_docchannel_id": str(original_docchannel)}}
        if web_source_no in ['02104-7']:
            return {'docchannel': {'docchannel':'', 'doctype':'采招数据'}}
        # NOTE(review): indentation reconstructed — the remainder is assumed inside this
        # guard because `content` is only defined here; confirm against upstream history.
        if isinstance(list_sentence, list):
            # Flatten all sentence tokens and keep the first 500 as the content.
            token_l = [it.tokens for it in list_sentence]
            tokens = [it for l in token_l for it in l]
            content = ' '.join(tokens[:500])
            title = re.sub('[^\u4e00-\u9fa5]', '', title)  # keep Chinese characters only
            if len(title)>50:
                # Long titles: keep head 20 + tail 30 characters.
                title = title[:20]+title[-30:]
            data_content, data_title = self.predict_process(docid='', doctitle=title[-50:], dochtmlcon=content) # title capped at 50 chars
            # Effective (unpadded) lengths, clipped to the model input sizes.
            text_len = len(data_content[0]) if len(data_content[0])<self.sequen_len else self.sequen_len
            title_len = len(data_title[0]) if len(data_title[0])<self.title_len else self.title_len
            result = {'docchannel': {'docchannel':'', 'doctype':'', "original_docchannel_id": str(original_docchannel)}}
            array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
            array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
            # Doctype model; mask feeds are 0 on real tokens, 1 on padding positions.
            pred = self.type_sess.run(self.type_softmax,
                                      feed_dict={
                                          self.type_title: array_title,
                                          self.type_content: array_content,
                                          self.type_mask:[[0]*text_len+[1]*(self.sequen_len-text_len)],
                                          self.type_mask_title:[[0]*title_len+[1]*(self.title_len-title_len)],
                                          self.type_prob:1}
                                      )
            id = np.argmax(pred, axis=1)[0]
            prob = pred[0][id]
            result['docchannel']['doctype'] = self.id2type[id]
            # print('公告类别:', self.id2type[id], '概率:',prob)
            # if id == 0:
            # Lifecycle model runs only for non-empty, non-news doctypes.
            if result['docchannel']['doctype'] not in ['', '新闻资讯']:
                pred = self.lift_sess.run(self.lift_softmax,
                                          feed_dict={
                                              self.lift_title: array_title,
                                              self.lift_content: array_content,
                                              self.mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
                                              self.mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
                                              self.lift_prob:1}
                                          )
                id = np.argmax(pred, axis=1)[0]
                prob = pred[0][id]
                result['docchannel']['docchannel'] = self.id2life[id]
                # print('生命周期:纯模型预测',self.id2life[id], '概率:',prob)
                # if id == 6:
                # Award announcements are re-checked for candidate publicity.
                if result['docchannel']['docchannel'] == '中标信息':
                    if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
                        result['docchannel']['docchannel'] = '候选人公示'
                        # return '候选人公示', prob
                        # return [{'docchannel': '候选人公示'}]
            return result
            # return [{'docchannel':self.id2life[id]}]
        # else:
        #     # return self.id2type[id], prob
        #     return [{'docchannel':self.id2type[id]}]
  2527. def predict_rule(self, title, content, channel_dic, prem_dic):
  2528. '''2022/2/10加入规则去除某些数据源及内容过短且不包含类别关键词的公告不做预测'''
  2529. hetong = '(合同|验收|履约)(公告|公示)|合同号?$' # 合同标题正则
  2530. zhongbiao_t = '(中标|中选|成交|入选|入围|结果|确认)(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选)结果|开标(记录|信息|情况)|单一来源|直接(选取|选定)|中标通知书|中标$'
  2531. zhongbiao_c = '(中标|中选|成交|拟选用|拟邀请|最终选定的?|拟定)(供应商|供货商|服务商|企业|公司|单位|(候选)?人)(名称)?[::]|[,。:.](供应商|供货商|服务商)(名称)?:|指定的中介服务机构:|建设服务单位:'
  2532. zhaobiao_t = '(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈)(公告|公示|$)'
  2533. title_cn = re.sub('[^\u4e00-\u9fa5]', '', title)
  2534. if len(re.sub('[^\u4e00-\u9fa5]', "", content))<50 and channel_dic['docchannel']['doctype'] != '新闻资讯':
  2535. if re.search(hetong, title_cn) != None:
  2536. channel_dic['docchannel']['docchannel'] = '合同公告'
  2537. elif re.search(zhongbiao_t, title_cn):
  2538. channel_dic['docchannel']['docchannel'] = '中标信息'
  2539. elif re.search(zhaobiao_t, title_cn):
  2540. channel_dic['docchannel']['docchannel'] = '招标公告'
  2541. else:
  2542. channel_dic['docchannel']['docchannel'] = ''
  2543. elif channel_dic['docchannel'].get('docchannel', '') == '招标公告' and 'win_tenderer' in json.dumps(prem_dic,
  2544. ensure_ascii=False):
  2545. if re.search(hetong, title_cn) != None:
  2546. channel_dic['docchannel']['docchannel'] = '合同公告'
  2547. log('正则把招标公告修改为合同公告')
  2548. elif re.search(zhongbiao_t, title_cn) or re.search(zhongbiao_t, content[:200]) or re.search(zhongbiao_c,
  2549. content):
  2550. channel_dic['docchannel']['docchannel'] = '中标信息'
  2551. log('正则把招标公告修改为中标信息')
  2552. elif channel_dic['docchannel'].get('docchannel', '') == '中标信息' and 'win_tenderer' not in json.dumps(prem_dic,
  2553. ensure_ascii=False):
  2554. if re.search(hetong, title_cn):
  2555. channel_dic['docchannel']['docchannel'] = '合同公告'
  2556. log('正则把中标信息修改为合同公告')
  2557. elif re.search(zhongbiao_t, title_cn) or re.search(zhongbiao_t, content[:200]) or re.search(zhongbiao_c,
  2558. content):
  2559. pass
  2560. elif re.search(zhaobiao_t, title_cn):
  2561. channel_dic['docchannel']['docchannel'] = '招标公告'
  2562. log('正则把中标信息修改为招标公告')
  2563. elif re.search('中标|成交|中选|入选|入围|结果|供应商|供货商|候选人', title_cn+content)==None:
  2564. channel_dic['docchannel']['docchannel'] = ''
  2565. log('正则把中标信息修改为空')
  2566. return channel_dic
    def predict_merge(self, title, list_sentence, html, bidway, prem, original_docchannel='', web_source_no=''):
        '''
        Mixed rule + model prediction; returns announcement type (doctype) and lifecycle (docchannel).
        Rules (get_type/get_life) run first; the neural models fill in whatever the rules left empty.
        :param title: announcement title
        :param list_sentence: preprocessed sentence objects (each with .tokens and .sentence_index)
        :param html: original announcement HTML
        :param bidway: bidding method string (e.g. 单一来源)
        :param prem: extracted prem dict (roles/money)
        :return: dict like {'docchannel': {'docchannel': '中标信息', 'doctype': '采招数据'}}
        '''
        def cut_single_cn_space(text):
            # Re-join single Chinese characters (and "X:"-style tokens) that word
            # segmentation split apart, keeping a space before longer tokens.
            new_text = ""
            for w in text.split():
                if len(w) == 1 or re.search('^[\u4e00-\u9fa5][::]', w):
                    new_text += w
                else:
                    new_text += ' ' + w
            return new_text
        def html2text(html):
            # Strip tags; everything from the first richTextFetch div (attachment
            # text) onward is replaced by a marker so rules can detect attachments.
            ser = re.search('<div[^<>]*richTextFetch', html)
            if ser:
                html = html[:ser.start()]+'##richTextFetch##'
            text = re.sub('<[^<]*?>', '', html).replace('&nbsp;', ' ')
            text = re.sub('\s+', ' ', text)
            text = re.sub('[/|[()()]', '', text)
            text = cut_single_cn_space(text)
            return text[:20000]
        def count_diffser(pattern, text):
            # Count how many of the ';'-separated sub-patterns match; return the
            # count and the matched keywords joined by ';'.
            num = 0
            kw = []
            for p in pattern.split(';'):
                if re.search(p, text):
                    num += 1
                    kw.append(re.search(p, text).group(0))
            return num, ';'.join(kw)
        def is_contain_winner(extract_json):
            # True when the serialized prem dict contains a win_tenderer role.
            if re.search('win_tenderer', extract_json):
                return True
            else:
                return False
        def is_single_source(bidway, title):
            # True for single-source procurement, by title keyword or bidway value.
            if re.search('单一来源|单一性采购', title):
                return True
            elif bidway == '单一来源':
                return True
            else:
                return False
        def get_type(title, text):
            # Rule-based doctype; priority: land/mining > auction > property rights
            # > procurement > news. A procurement keyword near the top overrides.
            if re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'],
                                                                          text):  # and re.search('(土地|用地|宗地|地块)(经营权)?(流转|承包|出租|招租|租赁|确权)', text)==None
                if re.search(self.title_type_dic['采招数据'], title + text[:50]):
                    return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:50]).group(0)
                return '土地矿产', (re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'], text)).group(0)
            elif (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)):
                if re.search(self.title_type_dic['采招数据'], title + text[:50]):
                    return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:50]).group(0)
                return '拍卖出让', (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)).group(0)
            elif re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text):
                if re.search(self.title_type_dic['采招数据'], title + text[:50]):
                    return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:50]).group(0)
                return '产权交易', (re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text)).group(0)
            elif re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text):
                return '采招数据', (
                    re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text)).group(
                    0)
            elif re.search(self.title_type_dic['新闻资讯'], title):
                if re.search(self.title_type_dic['采招数据'], title + text[:150]):
                    return '采招数据', re.search(self.title_type_dic['采招数据'], title + text[:150]).group(0)
                return '新闻资讯', re.search(self.title_type_dic['新闻资讯'], title).group(0)
            else:
                return '', '没有公告类型关键词,返回空'
        def get_life(title, text, extract_json="", bidway="", original_docchannel=''):
            # Rule-based lifecycle classification. Returns (label, matched keyword /
            # reason); label '' means "no rule fired, let the model decide".
            if re.search(self.title_life_dic['采购意向'], title) and re.search(self.life_dic['采购意向'], text[:100]):
                # Procurement-intention title: first rule out stronger categories.
                if re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text):
                    return '公告变更', (re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text)).group(
                        0)
                elif re.search(self.title_life_dic['候选人公示'], title):
                    return '候选人公示', re.search(self.title_life_dic['候选人公示'], title).group(0)
                elif re.search(self.title_life_dic['中标信息'], title):
                    return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0)
                elif re.search('终止|废标|流标', title):
                    return '废标公告', re.search('终止|废标|流标', title).group(0)
                elif is_single_source(bidway, title):
                    return '中标信息', 'bidway单一来源'
                return '采购意向', (
                    re.search(self.title_life_dic['采购意向'], title) and re.search(self.life_dic['采购意向'], text[:100])).group(0)
            elif re.search(self.title_life_dic['招标预告'], title) or re.search(self.life_dic['招标预告'], text):
                if re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text):
                    return '公告变更', (re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text)).group(
                        0)
                elif re.search(self.title_life_dic['候选人公示'], title):
                    return '候选人公示', re.search(self.title_life_dic['候选人公示'], title).group(0)
                elif re.search(self.title_life_dic['中标信息'], title):
                    return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0)
                elif re.search('终止|废标|流标', title):
                    return '废标公告', re.search('终止|废标|流标', title).group(0)
                # NOTE(review): here extract_json is passed where is_single_source
                # expects bidway (the 采购意向 branch passes bidway) — confirm intent.
                elif is_single_source(extract_json, title):
                    return '中标信息', 'bidway单一来源'
                return '招标预告', (re.search(self.title_life_dic['招标预告'], title) or re.search(self.life_dic['招标预告'], text)).group(0)
            elif re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text):
                if re.search(self.title_life_dic['废标公告'], title):
                    return '废标公告', re.search(self.title_life_dic['废标公告'], title).group(0)
                # elif re.search('(中标|成交)结果', title[-8:]):
                #     return '中标信息', re.search('(中标|成交)结果', title[-8:]).group(0)
                return '公告变更', (re.search(self.title_life_dic['公告变更'], title) or re.search(self.life_dic['公告变更'], text)).group(0)
            elif re.search(self.title_life_dic['招标答疑'], title) or re.search(self.life_dic['招标答疑'], text) or len(
                    re.findall('(答:|回复:)', text)) >= 2:  # or re.search(self.title_life_dic['招标答疑'], text[:150])
                if re.search(self.title_life_dic['废标公告'], title):
                    return '废标公告', re.search(self.title_life_dic['废标公告'], title).group(0)
                elif re.search('(中标|成交)结果', title[-8:]):
                    return '中标信息', re.search('(中标|成交)结果', title[-8:]).group(0)
                return '招标答疑', (
                    re.search(self.title_life_dic['招标答疑'], title) or re.search(self.life_dic['招标答疑'], text) or re.search(
                        '(答:|回复:)', text)).group(0)
            elif re.search(self.title_life_dic['废标公告'], title + text[:150]) or re.search(self.life_dic['废标公告'], text[:150]):
                return '废标公告', (
                    re.search(self.title_life_dic['废标公告'], title + text[:150]) or re.search(self.life_dic['废标公告'], text[:150])).group(0)
            elif re.search(self.title_life_dic['候选人公示'], title) or re.search(self.life_dic['候选人公示'], text[:150]):
                # No candidate/publicity wording anywhere in the body → treat as award.
                if re.search('候选人|公示期?(已?满|已经?结束)|中标(结果|公告)', text) == None:
                    return '中标信息', '候选人公示排除,修改为中标信息'
                return '候选人公示', (
                    re.search(self.title_life_dic['候选人公示'], title) or re.search(self.life_dic['候选人公示'], text[:150])).group(
                    0)
            elif re.search(self.title_life_dic['合同公告'], title) or re.search(self.title_life_dic['合同公告'], text[
                                                                                                       :150]):
                return '合同公告', (re.search(self.title_life_dic['合同公告'], title) or re.search(self.title_life_dic['合同公告'],
                                                                                          text[:150]) or re.search(
                    self.life_dic['合同公告'], text)).group(0)
            elif re.search(self.life_dic['合同公告'].replace(';', '|'), text):  # or re.search(self.life_dic['合同公告'], text[:300]):
                # Body-only contract keywords: require at least 3 distinct
                # sub-patterns, otherwise fall back on title / winner evidence.
                num, kw = count_diffser(self.life_dic['合同公告'], text)
                if num >= 3:
                    return '合同公告', kw
                elif re.search(self.title_life_dic['招标公告'], title[-8:]):
                    return '招标公告', re.search(self.title_life_dic['招标公告'], title[-8:]).group(0)
                elif not is_contain_winner(extract_json):
                    return '', '有合同关键词无中标角色返回空'
                return '合同公告', re.search(self.life_dic['合同公告'].replace(';', '|'), text).group(0)
            elif is_single_source(extract_json, title):
                return '中标信息', '单一来源采购'
            elif re.search(self.title_life_dic['中标信息'], title):
                if re.search(self.title_life_dic['资审结果'], title+text[:150]):
                    return '资审结果', re.search(self.title_life_dic['资审结果'], title+text[:150]).group(0)
                return '中标信息', re.search(self.title_life_dic['中标信息'], title).group(0)
            elif re.search(self.title_life_dic['中标信息'], text[:100]) or re.search(self.life_dic['中标信息'], text[:]):
                if re.search(self.title_life_dic['资审结果'], title+text[:150]):
                    return '资审结果', re.search(self.title_life_dic['资审结果'], title+text[:150]).group(0)
                # if re.search(self.wrong_win, text):
                #     return '招标公告', re.search(self.wrong_win, text).group(0)
                return '中标信息', (
                    re.search(self.title_life_dic['中标信息'], text[:100]) or re.search(self.life_dic['中标信息'], text[:])).group(
                    0)
            elif re.search(self.life_dic['中标信息2'], text[:]):
                if re.search(self.wrong_win, text):
                    return '招标公告', re.search(self.wrong_win, text).group(0)
                return '中标信息', re.search(self.life_dic['中标信息2'], text[:]).group(0)
            elif re.search(self.life_dic['中标信息3'], text[:]) and is_contain_winner(extract_json):
                if re.search(self.wrong_win, text):
                    return '招标公告', re.search(self.wrong_win, text).group(0)
                return '中标信息', re.search(self.life_dic['中标信息3'], text[:]).group(0)
            elif re.search('公开选取.{,20}机构的公告', title):
                # Agency-selection announcements: award if a selected agency appears.
                if re.search('(中标|成交|中选)(中介|服务)?机构(名称)?[::\s]', text):
                    return '中标信息', '机构选取有中选机构'
                else:
                    return '招标公告', '公开选取机构'
            elif is_contain_winner(extract_json):
                # A winner was extracted but no award keyword matched.
                num, kw = count_diffser(self.life_dic['招标公告'], text)
                if re.search(self.wrong_win, text):
                    return '招标公告', re.search(self.wrong_win, text).group(0)
                elif num >= 2:
                    return '招标公告', kw
                elif re.search('##richTextFetch##', text):
                    return '', '提取到中标人但包含附件返回空'
                return '中标信息', '提取到中标人'
            elif re.search(self.title_life_dic['资审结果'], title+text[:150]) or re.search(self.life_dic['资审结果'], text[:]):
                return '资审结果', (re.search(self.title_life_dic['资审结果'], title+text[:150]) or re.search(self.life_dic['资审结果'], text[:])).group(0)
            elif re.search(self.title_life_dic['招标公告'], title) or re.search(self.life_dic['招标公告'].replace(';', '|'), text[:]):
                if re.search('意向|预告|变更|更正|中标|中选|成交|答疑|废标|流标|终止', title):
                    return '', '招标正则召回标题有其他类别关键词,返回空'
                return '招标公告', (re.search(self.title_life_dic['招标公告'], title) or re.search(self.life_dic['招标公告'].replace(';', '|'),
                                                                                          text[:])).group(0)
            else:
                return '', '未预测到关键词, 返回空'
        # Upstream channel ids whose documents bypass prediction entirely.
        not_extract_dic = {
            104: '招标文件',
            106: '法律法规',
            107: '新闻资讯',
            108: '拟建项目',
            109: '展会推广',
            110: '企业名录',
            111: '企业资质',
            112: '全国工程人员',
            113: '业主采购'
        }
        # Mapping from upstream channel id to its original label (fallback output).
        origin_dic = {51: '公告变更',
                      52: '招标公告',
                      101: '中标信息',
                      102: '招标预告',
                      103: '招标答疑',
                      104: '招标文件',
                      105: '资审结果',
                      106: '法律法规',
                      107: '新闻资讯',
                      108: '拟建项目',
                      109: '展会推广',
                      110: '企业名录',
                      111: '企业资质',
                      112: '全国工程',
                      113: '业主采购',
                      114: '采购意向',
                      115: '拍卖出让',
                      116: '土地矿产',
                      117: '产权交易',
                      118: '废标公告',
                      119: '候选人公示',
                      120: '合同公告'}
        if original_docchannel in not_extract_dic:
            return {'docchannel': {'docchannel': '', 'doctype': not_extract_dic[original_docchannel]}}
        if web_source_no in ['02104-7', '04733']:  # these sources cannot be recognized
            return {'docchannel': {'docchannel': '', 'doctype': '采招数据'}}
        title = re.sub('[^\u4e00-\u9fa5]', '', title)  # keep Chinese characters only
        if len(title) > 50:
            # Long titles: keep head 20 + tail 30 characters.
            title = title[:20] + title[-30:]
        text = html2text(html)
        prem_json = json.dumps(prem, ensure_ascii=False)
        result = {'docchannel': {'docchannel': '', 'doctype': ''}}
        # Rules first; the models only fill in what the rules left empty.
        doc_type, type_kw = get_type(title, text)
        doc_life, life_kw = get_life(title, text, prem_json, bidway, original_docchannel)
        if doc_type in self.title_type_dic:
            result['docchannel']['doctype'] = doc_type
        if doc_life in self.title_life_dic:
            result['docchannel']['docchannel'] = doc_life
        if doc_type=="" or doc_life=="":
            # Model fallback: rebuild the content from the ordered sentence tokens.
            list_sentence = sorted(list_sentence, key=lambda x:x.sentence_index)
            token_l = [it.tokens for it in list_sentence]
            tokens = [it for l in token_l for it in l]
            content = ' '.join(tokens[:500])
            data_content, data_title = self.predict_process(docid='', doctitle=title[-50:],
                                                            dochtmlcon=content) # title capped at 50 chars
            text_len = len(data_content[0]) if len(data_content[0]) < self.sequen_len else self.sequen_len
            title_len = len(data_title[0]) if len(data_title[0]) < self.title_len else self.title_len
            array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
            array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
            if doc_type == "":
                # Doctype model; mask feeds are 0 on real tokens, 1 on padding.
                pred = self.type_sess.run(self.type_softmax,
                                          feed_dict={
                                              self.type_title: array_title,
                                              self.type_content: array_content,
                                              self.type_mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
                                              self.type_mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
                                              self.type_prob: 1}
                                          )
                id = np.argmax(pred, axis=1)[0]
                prob = pred[0][id]
                result['docchannel']['doctype'] = self.id2type[id]
                # print('公告类别:', self.id2type[id], '概率:',prob)
                # if id == 0:
            if doc_life=="" and result['docchannel']['doctype'] not in ['', '新闻资讯']:
                # Lifecycle model only for long-enough text containing category keywords.
                if len(text)>150 and re.search(self.kws, content):
                    pred = self.lift_sess.run(self.lift_softmax,
                                              feed_dict={
                                                  self.lift_title: array_title,
                                                  self.lift_content: array_content,
                                                  self.mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
                                                  self.mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
                                                  self.lift_prob: 1}
                                              )
                    id = np.argmax(pred, axis=1)[0]
                    prob = pred[0][id]
                    # Post-corrections against extraction evidence:
                    if self.id2life[id] == '中标信息' and original_docchannel in [52, '52', '招标公告'] and not is_contain_winner(prem_json):
                        result['docchannel']['docchannel'] = '招标公告'
                    elif self.id2life[id] == '采购意向' and re.search('意向品牌|意向单位', text):
                        result['docchannel']['docchannel'] = '招标公告'
                    else:
                        result['docchannel']['docchannel'] = self.id2life[id]
                    # print('生命周期:',self.id2life[id], '概率:',prob)
                    # if id == 6:
                    if result['docchannel']['docchannel'] == '中标信息':
                        # Award predictions are re-checked for candidate publicity.
                        if self.is_houxuan(''.join([it for it in title if it.isalpha()]),
                                           ''.join([it for it in content if it.isalpha()])):
                            result['docchannel']['docchannel'] = '候选人公示'
                            # return '候选人公示', prob
                            # return [{'docchannel': '候选人公示'}]
        # print('公告类型:%s, 生命周期:%s, 关键词:%s '%(doc_type, doc_life, life_kw))
        # print('result: ', result)
        # A "failed bid" with an extracted winner and no failed-bid title keyword is an award.
        if result['docchannel']['docchannel'] == '废标公告' and is_contain_winner(prem_json) and re.search(self.title_life_dic['废标公告'], title)==None:
            result['docchannel']['docchannel'] = '中标信息'
        if result['docchannel']['docchannel'] != '':  # copy predicted lifecycle to life_docchannel, else use the source's original label
            result['docchannel']['life_docchannel'] = result['docchannel']['docchannel']
        else:
            result['docchannel']['life_docchannel'] = origin_dic.get(original_docchannel, '原始类别')
        return result
# Deposit payment-method extraction (保证金支付方式提取)
  2859. class DepositPaymentWay():
  2860. def __init__(self,):
  2861. self.pt = '(保证金的?(交纳|缴纳|应按下列|入账|支付)方式)[::]*([^,。]{,60})'
  2862. self.pt2 = '保证金(必?须以|必?须?通过|以)(.{,8})方式'
  2863. kws = ['银行转账', '公?对公方?式?转账', '对公转账', '柜台转账', '(线上|网上)自?行?(缴纳|交纳|缴退|收退)',
  2864. '网上银行支付', '现金存入', '直接缴纳', '支票', '汇票', '本票', '电汇', '转账', '汇款', '随机码',
  2865. '入账', '基本账户转出', '基本账户汇入', '诚信库中登记的账户转出',
  2866. '银行保函', '电子保函', '担保函', '保证保险', '合法担保机构出具的担保', '金融机构、担保机构出具的保函']
  2867. self.kws = sorted(kws, key=lambda x: len(x), reverse=True)
  2868. def predict(self,content):
  2869. pay_way = {'deposit_patment_way':''}
  2870. result = []
  2871. pay = re.search(self.pt, content)
  2872. if pay:
  2873. # print(pay.group(0))
  2874. pay = pay.group(3)
  2875. for it in re.finditer('|'.join(self.kws), pay):
  2876. result.append(it.group(0))
  2877. pay_way['deposit_patment_way'] = ';'.join(result)
  2878. return pay_way
  2879. pay = re.search(self.pt2, content)
  2880. if pay:
  2881. # print(pay.group(0))
  2882. pay = pay.group(2)
  2883. for it in re.finditer('|'.join(self.kws), pay):
  2884. result.append(it.group(0))
  2885. pay_way['deposit_patment_way'] = ';'.join(result)
  2886. return pay_way
  2887. else:
  2888. return pay_way
# Total-price / unit-price extraction (总价单价提取)
  2890. class TotalUnitMoney:
  2891. def __init__(self):
  2892. pass
  2893. def predict(self, list_sentences, list_entitys):
  2894. for i in range(len(list_entitys)):
  2895. list_entity = list_entitys[i]
  2896. # 总价单价
  2897. for _entity in list_entity:
  2898. if _entity.entity_type == 'money':
  2899. word_of_sentence = list_sentences[i][_entity.sentence_index].sentence_text
  2900. # 总价在中投标金额中
  2901. if _entity.label == 1:
  2902. result = extract_total_money(word_of_sentence,
  2903. _entity.entity_text,
  2904. [_entity.wordOffset_begin, _entity.wordOffset_end])
  2905. if result:
  2906. _entity.is_total_money = 1
  2907. # 单价在普通金额中
  2908. else:
  2909. result = extract_unit_money(word_of_sentence,
  2910. _entity.entity_text,
  2911. [_entity.wordOffset_begin, _entity.wordOffset_end])
  2912. if result:
  2913. _entity.is_unit_money = 1
  2914. # print("total_unit_money", _entity.entity_text,
  2915. # _entity.is_total_money, _entity.is_unit_money)
def getSavedModel():
    """Export the trained form-item Keras model (.hdf5) as a TF SavedModel
    under ./h5_savedmodel/ (one-off conversion utility; writes to disk)."""
    #predictor = FormPredictor()
    graph = tf.Graph()
    with graph.as_default():
        # Custom metrics must be supplied to deserialize the saved Keras model.
        model = tf.keras.models.load_model("../form/model/model_form.model_item.hdf5",custom_objects={"precision":precision,"recall":recall,"f1_score":f1_score})
        #print(tf.graph_util.remove_training_nodes(model))
        # Serialize the current Keras session graph with named input/output signatures.
        tf.saved_model.simple_save(
            tf.keras.backend.get_session(),
            "./h5_savedmodel/",
            inputs={"image": model.input},
            outputs={"scores": model.output}
        )
  2928. def getBiLSTMCRFModel(MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
  2929. '''
  2930. model = models.Sequential()
  2931. model.add(layers.Embedding(len(vocab), EMBED_DIM, mask_zero=True)) # Random embedding
  2932. model.add(layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True)))
  2933. crf = CRF(len(chunk_tags), sparse_target=True)
  2934. model.add(crf)
  2935. model.summary()
  2936. model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
  2937. return model
  2938. '''
  2939. input = layers.Input(shape=(None,),dtype="int32")
  2940. if weights is not None:
  2941. embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True,weights=[weights],trainable=True)(input)
  2942. else:
  2943. embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True)(input)
  2944. bilstm = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2,return_sequences=True))(embedding)
  2945. bilstm_dense = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(bilstm)
  2946. crf = CRF(len(chunk_tags),sparse_target=True)
  2947. crf_out = crf(bilstm_dense)
  2948. model = models.Model(input=[input],output = [crf_out])
  2949. model.summary()
  2950. model.compile(optimizer = 'adam', loss = crf.loss_function, metrics = [crf.accuracy])
  2951. return model
  2952. import h5py
def h5_to_graph(sess,graph,h5file):
    """Copy the weights stored in a Keras .h5 file into the matching variables of
    a TF graph, by assigning each dataset to the tensor with the same name.

    :param sess: tf.Session used to run the assign ops
    :param graph: tf.Graph whose tensors are looked up by name
    :param h5file: path to the Keras weights file (expects a 'model_weights' group)
    NOTE(review): relies on h5py Dataset `.value`, removed in h5py 3.x — confirm
    the pinned h5py version.
    """
    f = h5py.File(h5file,'r') # open the h5 file
    def getValue(v):
        # Walk the 'model_weights' group following the variable's slash-separated
        # name to its dataset. (Appears unused below — kept for manual debugging.)
        _value = f["model_weights"]
        list_names = str(v.name).split("/")
        for _index in range(len(list_names)):
            print(v.name)
            if _index==1:
                _value = _value[list_names[0]]
            _value = _value[list_names[_index]]
        return _value.value
    def _load_attributes_from_hdf5_group(group, name):
        """Loads attributes of the specified name from the HDF5 group.
        This method deals with an inherent problem
        of HDF5 file which is not able to store
        data larger than HDF5_OBJECT_HEADER_LIMIT bytes.
        # Arguments
            group: A pointer to a HDF5 group.
            name: A name of the attributes to load.
        # Returns
            data: Attributes data.
        """
        if name in group.attrs:
            data = [n.decode('utf8') for n in group.attrs[name]]
        else:
            # Chunked attributes: name0, name1, ... hold the overflow pieces.
            data = []
            chunk_id = 0
            while ('%s%d' % (name, chunk_id)) in group.attrs:
                data.extend([n.decode('utf8')
                             for n in group.attrs['%s%d' % (name, chunk_id)]])
                chunk_id += 1
        return data
    def readGroup(gr,parent_name,data):
        # Recursively collect [full_name, ndarray] pairs for every dataset,
        # skipping a redundant path segment when it repeats the parent name.
        for subkey in gr:
            print(subkey)
            if parent_name!=subkey:
                if parent_name=="":
                    _name = subkey
                else:
                    _name = parent_name+"/"+subkey
            else:
                _name = parent_name
            if str(type(gr[subkey]))=="<class 'h5py._hl.group.Group'>":
                readGroup(gr[subkey],_name,data)
            else:
                data.append([_name,gr[subkey].value])
                print(_name,gr[subkey].shape)
    layer_names = _load_attributes_from_hdf5_group(f["model_weights"], 'layer_names')
    list_name_value = []
    readGroup(f["model_weights"], "", list_name_value)
    '''
    for k, name in enumerate(layer_names):
        g = f["model_weights"][name]
        weight_names = _load_attributes_from_hdf5_group(g, 'weight_names')
        #weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
        for weight_name in weight_names:
            list_name_value.append([weight_name,np.asarray(g[weight_name])])
    '''
    # Assign each collected array to the graph tensor with the identical name.
    for name_value in list_name_value:
        name = name_value[0]
        '''
        if re.search("dense",name) is not None:
            name = name[:7]+"_1"+name[7:]
        '''
        value = name_value[1]
        print(name,graph.get_tensor_by_name(name),np.shape(value))
        sess.run(tf.assign(graph.get_tensor_by_name(name),value))
  3020. def initialize_uninitialized(sess):
  3021. global_vars = tf.global_variables()
  3022. is_not_initialized = sess.run([tf.is_variable_initialized(var) for var in global_vars])
  3023. not_initialized_vars = [v for (v, f) in zip(global_vars, is_not_initialized) if not f]
  3024. adam_vars = []
  3025. for _vars in not_initialized_vars:
  3026. if re.search("Adam",_vars.name) is not None:
  3027. adam_vars.append(_vars)
  3028. print([str(i.name) for i in adam_vars]) # only for testing
  3029. if len(adam_vars):
  3030. sess.run(tf.variables_initializer(adam_vars))
def save_codename_model():
    """Rebuild the BiLSTM-CRF code/name model, restore its checkpoint weights and
    export it as a TF SavedModel under ./codename_savedmodel_tf/ (one-off utility)."""
    # filepath = "../projectCode/models/model_project_"+str(60)+"_"+str(200)+".hdf5"
    filepath = "../projectCode/models_tf/59-L0.471516189943-F0.8802154826344823-P0.8789179683459191-R0.8815168335321886/model.ckpt"
    vocabpath = "../projectCode/models/vocab.pk"
    classlabelspath = "../projectCode/models/classlabels.pk"
    # vocab = load(vocabpath)
    # class_labels = load(classlabelspath)
    w2v_matrix = load('codename_w2v_matrix.pk')
    graph = tf.get_default_graph()
    with graph.as_default() as g:
        ''''''
        # model = getBiLSTMCRFModel(None, vocab, 60, 200, class_labels,weights=None)
        #model = models.load_model(filepath,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score,"CRF":CRF,"loss":CRF.loss_function})
        sess = tf.Session(graph=g)
        # sess = tf.keras.backend.get_session()
        # Build the raw-TF BiLSTM-CRF graph, then restore the checkpoint into it.
        char_input, logits, target, keepprob, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
        #with sess.as_default():
        sess.run(tf.global_variables_initializer())
        # print(sess.run("time_distributed_1/kernel:0"))
        # model.load_weights(filepath)
        saver = tf.train.Saver()
        saver.restore(sess, filepath)
        # print("logits",sess.run(logits))
        # print("#",sess.run("time_distributed_1/kernel:0"))
        # x = load("codename_x.pk")
        #y = model.predict(x)
        # y = sess.run(model.output,feed_dict={model.input:x})
        # for item in np.argmax(y,-1):
        #     print(item)
        # Export with named signatures; 'trans' is the CRF transition matrix.
        tf.saved_model.simple_save(
            sess,
            "./codename_savedmodel_tf/",
            inputs={"inputs": char_input,
                    "inputs_length":length,
                    'keepprob':keepprob},
            outputs={"logits": logits,
                     "trans":trans}
        )
def save_role_model():
    '''
    @summary: export the role model as a SavedModel for deployment on the PAI platform
    '''
    model_role = PREMPredict().model_role
    with model_role.graph.as_default():
        model = model_role.getModel()
        sess = tf.Session(graph=model_role.graph)
        print(type(model.input))
        sess.run(tf.global_variables_initializer())
        # Copy the .h5 weights into the live graph before exporting.
        h5_to_graph(sess, model_role.graph, model_role.model_role_file)
        model = model_role.getModel()
        # Three named inputs, one output signature.
        tf.saved_model.simple_save(sess,
                                   "./role_savedmodel/",
                                   inputs={"input0":model.input[0],
                                           "input1":model.input[1],
                                           "input2":model.input[2]},
                                   outputs={"outputs":model.output}
                                   )
def save_money_model():
    """Load the money-classification Keras model and export it as a TF SavedModel
    under ./money_savedmodel2/ (one-off conversion utility)."""
    model_file = os.path.dirname(__file__)+"/../money/models/model_money_word.h5"
    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session(graph=graph)
        with sess.as_default():
            # model = model_money.getModel()
            # model.summary()
            # sess.run(tf.global_variables_initializer())
            # h5_to_graph(sess, model_money.graph, model_money.model_money_file)
            # Custom metrics are required to deserialize the saved model.
            model = models.load_model(model_file,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
            model.summary()
            print(model.weights)
            # Three named inputs, one output signature.
            tf.saved_model.simple_save(sess,
                                       "./money_savedmodel2/",
                                       inputs = {"input0":model.input[0],
                                                 "input1":model.input[1],
                                                 "input2":model.input[2]},
                                       outputs = {"outputs":model.output}
                                       )
  3108. def save_person_model():
  3109. model_person = EPCPredict().model_person
  3110. with model_person.graph.as_default():
  3111. x = load("person_x.pk")
  3112. _data = np.transpose(np.array(x),(1,0,2,3))
  3113. model = model_person.getModel()
  3114. sess = tf.Session(graph=model_person.graph)
  3115. with sess.as_default():
  3116. sess.run(tf.global_variables_initializer())
  3117. model_person.load_weights()
  3118. #h5_to_graph(sess, model_person.graph, model_person.model_person_file)
  3119. predict_y = sess.run(model.output,feed_dict={model.input[0]:_data[0],model.input[1]:_data[1]})
  3120. #predict_y = model.predict([_data[0],_data[1]])
  3121. print(np.argmax(predict_y,-1))
  3122. tf.saved_model.simple_save(sess,
  3123. "./person_savedmodel/",
  3124. inputs={"input0":model.input[0],
  3125. "input1":model.input[1]},
  3126. outputs = {"outputs":model.output})
  3127. def save_form_model():
  3128. model_form = FormPredictor()
  3129. with model_form.graph.as_default():
  3130. model = model_form.getModel("item")
  3131. sess = tf.Session(graph=model_form.graph)
  3132. sess.run(tf.global_variables_initializer())
  3133. h5_to_graph(sess, model_form.graph, model_form.model_file_item)
  3134. tf.saved_model.simple_save(sess,
  3135. "./form_savedmodel/",
  3136. inputs={"inputs":model.input},
  3137. outputs = {"outputs":model.output})
  3138. def save_codesplit_model():
  3139. filepath_code = "../projectCode/models/model_code.hdf5"
  3140. graph = tf.Graph()
  3141. with graph.as_default():
  3142. model_code = models.load_model(filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
  3143. sess = tf.Session()
  3144. sess.run(tf.global_variables_initializer())
  3145. h5_to_graph(sess, graph, filepath_code)
  3146. tf.saved_model.simple_save(sess,
  3147. "./codesplit_savedmodel/",
  3148. inputs={"input0":model_code.input[0],
  3149. "input1":model_code.input[1],
  3150. "input2":model_code.input[2]},
  3151. outputs={"outputs":model_code.output})
  3152. def save_timesplit_model():
  3153. filepath = '../time/model_label_time_classify.model.hdf5'
  3154. with tf.Graph().as_default() as graph:
  3155. time_model = models.load_model(filepath, custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
  3156. with tf.Session() as sess:
  3157. sess.run(tf.global_variables_initializer())
  3158. h5_to_graph(sess, graph, filepath)
  3159. tf.saved_model.simple_save(sess,
  3160. "./timesplit_model/",
  3161. inputs={"input0":time_model.input[0],
  3162. "input1":time_model.input[1]},
  3163. outputs={"outputs":time_model.output})
if __name__=="__main__":
    # NOTE(review): every export entry point below is commented out, so running
    # this script as-is does nothing. Uncomment the exporter(s) you need.
    #save_role_model()
    # save_codename_model()
    # save_money_model()
    #save_person_model()
    #save_form_model()
    #save_codesplit_model()
    # save_timesplit_model()
    # The string below is a no-op; it preserves a manual snippet for loading a
    # SavedModel back and smoke-testing it.
    '''
    # with tf.Session(graph=tf.Graph()) as sess:
    # from tensorflow.python.saved_model import tag_constants
    # meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], "./person_savedModel")
    # graph = tf.get_default_graph()
    # signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
    # signature = meta_graph_def.signature_def
    # input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name)
    # input1 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input1"].name)
    # outputs = sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name)
    # x = load("person_x.pk")
    # _data = np.transpose(x,[1,0,2,3])
    # y = sess.run(outputs,feed_dict={input0:_data[0],input1:_data[1]})
    # print(np.argmax(y,-1))
    '''