# predictor.py
'''
Created on 2018年12月26日
@author: User
'''
import os
import sys
from BiddingKG.dl.common.nerUtils import *
# NOTE(review): this sys.path tweak runs AFTER the BiddingKG import above —
# confirm the ordering is intentional (the package must already be importable).
sys.path.append(os.path.abspath("../.."))
# Legacy Keras imports kept for reference by getBiLSTMCRFModel (currently disabled):
# from keras.engine import topology
# from keras import models
# from keras import layers
# from keras_contrib.layers.crf import CRF
# from keras.preprocessing.sequence import pad_sequences
# from keras import optimizers,losses,metrics
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.interface.modelFactory import *
import tensorflow as tf
import pandas as pd
from BiddingKG.dl.product.data_util import decode, process_data
from BiddingKG.dl.interface.Entitys import Entity
from BiddingKG.dl.complaint.punish_predictor import Punish_Extract
from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
from bs4 import BeautifulSoup
import copy
import calendar
import datetime
# import fool  # use selffool consistently -- the Aliyun host only has the selffool package
# Thread counts for TF session parallelism; CPU_NUM=0 lets TensorFlow decide.
cpu_num = int(os.environ.get("CPU_NUM",0))
sess_config = tf.ConfigProto(
    inter_op_parallelism_threads = cpu_num,
    intra_op_parallelism_threads = cpu_num,
    log_device_placement=True)
# NOTE(review): the ConfigProto built above is immediately discarded — every
# predictor session is created with config=None (TensorFlow defaults).
# Presumably a deliberate toggle; confirm before deleting the dead config.
sess_config = None
from threading import RLock
  35. dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
  36. "prem":{"predictor":None,"Lock":RLock()},
  37. "epc":{"predictor":None,"Lock":RLock()},
  38. "roleRule":{"predictor":None,"Lock":RLock()},
  39. "roleRuleFinal":{"predictor":None,"Lock":RLock()},
  40. "tendereeRuleRecall":{"predictor":None,"Lock":RLock()},
  41. "form":{"predictor":None,"Lock":RLock()},
  42. "time":{"predictor":None,"Lock":RLock()},
  43. "punish":{"predictor":None,"Lock":RLock()},
  44. "product":{"predictor":None,"Lock":RLock()},
  45. "product_attrs":{"predictor":None,"Lock":RLock()},
  46. "channel": {"predictor": None, "Lock": RLock()},
  47. "deposit_payment_way": {"predictor": None, "Lock": RLock()},
  48. "total_unit_money": {"predictor": None, "Lock": RLock()},
  49. "industry": {"predictor": None, "Lock": RLock()},
  50. "rolegrade": {"predictor": None, "Lock": RLock()},
  51. "moneygrade": {"predictor": None, "Lock": RLock()},
  52. "district": {"predictor": None, "Lock": RLock()}
  53. }
  54. def getPredictor(_type):
  55. if _type in dict_predictor:
  56. with dict_predictor[_type]["Lock"]:
  57. if dict_predictor[_type]["predictor"] is None:
  58. if _type == "codeName":
  59. dict_predictor[_type]["predictor"] = CodeNamePredict(config=sess_config)
  60. if _type == "prem":
  61. dict_predictor[_type]["predictor"] = PREMPredict(config=sess_config)
  62. if _type == "epc":
  63. dict_predictor[_type]["predictor"] = EPCPredict(config=sess_config)
  64. if _type == "roleRule":
  65. dict_predictor[_type]["predictor"] = RoleRulePredictor()
  66. if _type == "roleRuleFinal":
  67. dict_predictor[_type]["predictor"] = RoleRuleFinalAdd()
  68. if _type == "tendereeRuleRecall":
  69. dict_predictor[_type]["predictor"] = TendereeRuleRecall()
  70. if _type == "form":
  71. dict_predictor[_type]["predictor"] = FormPredictor(config=sess_config)
  72. if _type == "time":
  73. dict_predictor[_type]["predictor"] = TimePredictor(config=sess_config)
  74. if _type == "punish":
  75. dict_predictor[_type]["predictor"] = Punish_Extract()
  76. if _type == "product":
  77. dict_predictor[_type]["predictor"] = ProductPredictor(config=sess_config)
  78. if _type == "product_attrs":
  79. dict_predictor[_type]["predictor"] = ProductAttributesPredictor()
  80. if _type == "channel":
  81. dict_predictor[_type]["predictor"] = DocChannel(config=sess_config)
  82. if _type == 'deposit_payment_way':
  83. dict_predictor[_type]["predictor"] = DepositPaymentWay()
  84. if _type == 'total_unit_money':
  85. dict_predictor[_type]["predictor"] = TotalUnitMoney()
  86. if _type == 'industry':
  87. dict_predictor[_type]["predictor"] = IndustryPredictor()
  88. if _type == 'rolegrade':
  89. dict_predictor[_type]["predictor"] = RoleGrade()
  90. if _type == 'moneygrade':
  91. dict_predictor[_type]["predictor"] = MoneyGrade()
  92. if _type == 'district':
  93. dict_predictor[_type]["predictor"] = DistrictPredictor()
  94. return dict_predictor[_type]["predictor"]
  95. raise NameError("no this type of predictor")
# 编号名称模型 (model extracting the project code and project name)
  97. class CodeNamePredict():
  98. def __init__(self,EMBED_DIM=None,BiRNN_UNITS=None,lazyLoad=getLazyLoad(),config=None):
  99. self.model = None
  100. self.MAX_LEN = None
  101. self.model_code = None
  102. if EMBED_DIM is None:
  103. self.EMBED_DIM = 60
  104. else:
  105. self.EMBED_DIM = EMBED_DIM
  106. if BiRNN_UNITS is None:
  107. self.BiRNN_UNITS = 200
  108. else:
  109. self.BiRNN_UNITS = BiRNN_UNITS
  110. self.filepath = os.path.dirname(__file__)+"/../projectCode/models/model_project_"+str(self.EMBED_DIM)+"_"+str(self.BiRNN_UNITS)+".hdf5"
  111. #self.filepath = "../projectCode/models/model_project_60_200_200ep017-loss6.456-val_loss7.852-val_acc0.969.hdf5"
  112. self.filepath_code = os.path.dirname(__file__)+"/../projectCode/models/model_code.hdf5"
  113. vocabpath = os.path.dirname(__file__)+"/codename_vocab.pk"
  114. classlabelspath = os.path.dirname(__file__)+"/codename_classlabels.pk"
  115. self.vocab = load(vocabpath)
  116. self.class_labels = load(classlabelspath)
  117. #生成提取编号和名称的正则
  118. id_PC_B = self.class_labels.index("PC_B")
  119. id_PC_M = self.class_labels.index("PC_M")
  120. id_PC_E = self.class_labels.index("PC_E")
  121. id_PN_B = self.class_labels.index("PN_B")
  122. id_PN_M = self.class_labels.index("PN_M")
  123. id_PN_E = self.class_labels.index("PN_E")
  124. self.PC_pattern = re.compile(str(id_PC_B)+str(id_PC_M)+"*"+str(id_PC_E))
  125. self.PN_pattern = re.compile(str(id_PN_B)+str(id_PN_M)+"*"+str(id_PN_E))
  126. # print("pc",self.PC_pattern)
  127. # print("pn",self.PN_pattern)
  128. self.word2index = dict((w,i) for i,w in enumerate(np.array(self.vocab)))
  129. self.inputs = None
  130. self.outputs = None
  131. self.sess_codename = tf.Session(graph=tf.Graph(),config=config)
  132. self.sess_codesplit = tf.Session(graph=tf.Graph(),config=config)
  133. self.inputs_code = None
  134. self.outputs_code = None
  135. if not lazyLoad:
  136. self.getModel()
  137. self.getModel_code()
  138. def getModel(self):
  139. '''
  140. @summary: 取得编号和名称模型
  141. '''
  142. if self.inputs is None:
  143. log("get model of codename")
  144. with self.sess_codename.as_default():
  145. with self.sess_codename.graph.as_default():
  146. meta_graph_def = tf.saved_model.loader.load(self.sess_codename, ["serve"], export_dir=os.path.dirname(__file__)+"/codename_savedmodel_tf")
  147. signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
  148. signature_def = meta_graph_def.signature_def
  149. self.inputs = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs"].name)
  150. self.inputs_length = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs_length"].name)
  151. self.keepprob = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["keepprob"].name)
  152. self.logits = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["logits"].name)
  153. self.trans = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["trans"].name)
  154. return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans
  155. else:
  156. return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans
  157. '''
  158. if self.model is None:
  159. self.model = self.getBiLSTMCRFModel(self.MAX_LEN, self.vocab, self.EMBED_DIM, self.BiRNN_UNITS, self.class_labels,weights=None)
  160. self.model.load_weights(self.filepath)
  161. return self.model
  162. '''
  163. def getModel_code(self):
  164. if self.inputs_code is None:
  165. log("get model of code")
  166. with self.sess_codesplit.as_default():
  167. with self.sess_codesplit.graph.as_default():
  168. meta_graph_def = tf.saved_model.loader.load(self.sess_codesplit, ["serve"], export_dir=os.path.dirname(__file__)+"/codesplit_savedmodel")
  169. signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
  170. signature_def = meta_graph_def.signature_def
  171. self.inputs_code = []
  172. self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name))
  173. self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name))
  174. self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name))
  175. self.outputs_code = self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
  176. self.sess_codesplit.graph.finalize()
  177. return self.inputs_code,self.outputs_code
  178. else:
  179. return self.inputs_code,self.outputs_code
  180. '''
  181. if self.model_code is None:
  182. log("get model of model_code")
  183. with self.sess_codesplit.as_default():
  184. with self.sess_codesplit.graph.as_default():
  185. self.model_code = models.load_model(self.filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
  186. return self.model_code
  187. '''
    def getBiLSTMCRFModel(self,MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
        """Build a Keras BiLSTM-CRF sequence tagger (legacy training path).

        NOTE(review): this method depends on ``layers``, ``models`` and ``CRF``,
        whose imports are commented out at the top of this file — calling it as
        the file stands raises NameError. Restore the keras/keras_contrib
        imports before use.

        Args:
            MAX_LEN: unused here; sequence length is left dynamic (shape=(None,)).
            vocab: vocabulary; its length sizes the embedding.
            EMBED_DIM: embedding dimension.
            BiRNN_UNITS: total BiLSTM units (split across the two directions).
            chunk_tags: label set; its length sizes the CRF output.
            weights: optional pretrained embedding matrix.

        Returns:
            A compiled Keras Model.
        """
        # Earlier Sequential-API version, kept for reference:
        #   model = models.Sequential()
        #   model.add(layers.Embedding(len(vocab), EMBED_DIM, mask_zero=True))
        #   model.add(layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True)))
        #   crf = CRF(len(chunk_tags), sparse_target=True)
        #   model.add(crf)
        #   model.summary()
        #   model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
        #   return model
        input = layers.Input(shape=(None,))
        if weights is not None:
            # Seed the embedding with pretrained weights and keep it trainable.
            embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True,weights=[weights],trainable=True)(input)
        else:
            embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True)(input)
        bilstm = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2,return_sequences=True))(embedding)
        bilstm_dense = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(bilstm)
        crf = CRF(len(chunk_tags),sparse_target=True)
        crf_out = crf(bilstm_dense)
        model = models.Model(input=[input],output = [crf_out])
        model.summary()
        model.compile(optimizer = 'adam', loss = crf.loss_function, metrics = [crf.accuracy])
        return model
  212. #根据规则补全编号或名称两边的符号
  213. def fitDataByRule(self,data):
  214. symbol_dict = {"(":")",
  215. "(":")",
  216. "[":"]",
  217. "【":"】",
  218. ")":"(",
  219. ")":"(",
  220. "]":"[",
  221. "】":"【"}
  222. leftSymbol_pattern = re.compile("[\((\[【]")
  223. rightSymbol_pattern = re.compile("[\))\]】]")
  224. leftfinds = re.findall(leftSymbol_pattern,data)
  225. rightfinds = re.findall(rightSymbol_pattern,data)
  226. result = data
  227. if len(leftfinds)+len(rightfinds)==0:
  228. return data
  229. elif len(leftfinds)==len(rightfinds):
  230. return data
  231. elif abs(len(leftfinds)-len(rightfinds))==1:
  232. if len(leftfinds)>len(rightfinds):
  233. if symbol_dict.get(data[0]) is not None:
  234. result = data[1:]
  235. else:
  236. #print(symbol_dict.get(leftfinds[0]))
  237. result = data+symbol_dict.get(leftfinds[0])
  238. else:
  239. if symbol_dict.get(data[-1]) is not None:
  240. result = data[:-1]
  241. else:
  242. result = symbol_dict.get(rightfinds[0])+data
  243. return result
  244. def decode(self,logits, trans, sequence_lengths, tag_num):
  245. viterbi_sequences = []
  246. for logit, length in zip(logits, sequence_lengths):
  247. score = logit[:length]
  248. viterbi_seq, viterbi_score = viterbi_decode(score, trans)
  249. viterbi_sequences.append(viterbi_seq)
  250. return viterbi_sequences
  251. def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000):
  252. #@summary: 获取每篇文章的code和name
  253. pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
  254. result = []
  255. index_unk = self.word2index.get("<unk>")
  256. # index_pad = self.word2index.get("<pad>")
  257. if list_entitys is None:
  258. list_entitys = [[] for _ in range(len(list_sentences))]
  259. for list_sentence,list_entity in zip(list_sentences,list_entitys):
  260. if len(list_sentence)==0:
  261. result.append([{"code":[],"name":""}])
  262. continue
  263. doc_id = list_sentence[0].doc_id
  264. # sentences = []
  265. # for sentence in list_sentence:
  266. # if len(sentence.sentence_text)>MAX_AREA:
  267. # for _sentence_comma in re.split("[;;,\n]",sentence):
  268. # _comma_index = 0
  269. # while(_comma_index<len(_sentence_comma)):
  270. # sentences.append(_sentence_comma[_comma_index:_comma_index+MAX_AREA])
  271. # _comma_index += MAX_AREA
  272. # else:
  273. # sentences.append(sentence+"。")
  274. list_sentence.sort(key=lambda x:len(x.sentence_text),reverse=True)
  275. _begin_index = 0
  276. item = {"code":[],"name":""}
  277. code_set = set()
  278. dict_name_freq_score = dict()
  279. while(True):
  280. MAX_LEN = len(list_sentence[_begin_index].sentence_text)
  281. if MAX_LEN>MAX_AREA:
  282. MAX_LEN = MAX_AREA
  283. _LEN = MAX_AREA//MAX_LEN
  284. #预测
  285. x = [[self.word2index.get(word,index_unk)for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
  286. # x = [[getIndexOfWord(word) for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
  287. x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x]
  288. x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
  289. if USE_API:
  290. requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len},verify=True)
  291. predict_y = json.loads(requests_result.text)['result']
  292. # print("cost_time:", json.loads(requests_result.text)['cost_time'])
  293. # print(MAX_LEN,_LEN,_begin_index)
  294. else:
  295. with self.sess_codename.as_default():
  296. t_input,t_input_length,t_keepprob,t_logits,t_trans = self.getModel()
  297. _logits,_trans = self.sess_codename.run([t_logits,t_trans],feed_dict={t_input:x,
  298. t_input_length:x_len,
  299. t_keepprob:1.0})
  300. predict_y = self.decode(_logits,_trans,x_len,7)
  301. # print('==========',_logits)
  302. '''
  303. for item11 in np.argmax(predict_y,-1):
  304. print(item11)
  305. print(predict_y)
  306. '''
  307. # print(predict_y)
  308. for sentence,predict in zip(list_sentence[_begin_index:_begin_index+_LEN],np.array(predict_y)):
  309. pad_sentence = sentence.sentence_text[:MAX_LEN]
  310. join_predict = "".join([str(s) for s in predict])
  311. # print(pad_sentence)
  312. # print(join_predict)
  313. code_x = []
  314. code_text = []
  315. temp_entitys = []
  316. for iter in re.finditer(self.PC_pattern,join_predict):
  317. get_len = 40
  318. if iter.span()[0]<get_len:
  319. begin = 0
  320. else:
  321. begin = iter.span()[0]-get_len
  322. end = iter.span()[1]+get_len
  323. code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
  324. code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]].replace(",", ""))
  325. _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),entity_type="code",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
  326. temp_entitys.append(_entity)
  327. #print("code",code_text)
  328. if len(code_x)>0:
  329. code_x = np.transpose(np.array(code_x,dtype=np.float32),(1,0,2,3))
  330. if USE_PAI_EAS:
  331. request = tf_predict_pb2.PredictRequest()
  332. request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
  333. request.inputs["input0"].array_shape.dim.extend(np.shape(code_x[0]))
  334. request.inputs["input0"].float_val.extend(np.array(code_x[0],dtype=np.float64).reshape(-1))
  335. request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
  336. request.inputs["input1"].array_shape.dim.extend(np.shape(code_x[1]))
  337. request.inputs["input1"].float_val.extend(np.array(code_x[1],dtype=np.float64).reshape(-1))
  338. request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
  339. request.inputs["input2"].array_shape.dim.extend(np.shape(code_x[2]))
  340. request.inputs["input2"].float_val.extend(np.array(code_x[2],dtype=np.float64).reshape(-1))
  341. request_data = request.SerializeToString()
  342. list_outputs = ["outputs"]
  343. _result = vpc_requests(codeclasses_url, codeclasses_authorization, request_data, list_outputs)
  344. if _result is not None:
  345. predict_code = _result["outputs"]
  346. else:
  347. with self.sess_codesplit.as_default():
  348. with self.sess_codesplit.graph.as_default():
  349. predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
  350. else:
  351. with self.sess_codesplit.as_default():
  352. with self.sess_codesplit.graph.as_default():
  353. inputs_code,outputs_code = self.getModel_code()
  354. predict_code = limitRun(self.sess_codesplit,[outputs_code],feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]})[0]
  355. #predict_code = self.sess_codesplit.run(outputs_code,feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]})
  356. #predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
  357. for h in range(len(predict_code)):
  358. if predict_code[h][0]>0.5:
  359. the_code = self.fitDataByRule(code_text[h])
  360. #add code to entitys
  361. list_entity.append(temp_entitys[h])
  362. if the_code not in code_set:
  363. code_set.add(the_code)
  364. item['code'] = list(code_set)
  365. for iter in re.finditer(self.PN_pattern,join_predict):
  366. _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
  367. #add name to entitys
  368. _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
  369. list_entity.append(_entity)
  370. w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
  371. if _name not in dict_name_freq_score:
  372. # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
  373. dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05)*w]
  374. else:
  375. dict_name_freq_score[_name][0] += 1
  376. '''
  377. for iter in re.finditer(self.PN_pattern,join_predict):
  378. print("name-",self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]))
  379. if item[1]['name']=="":
  380. for iter in re.finditer(self.PN_pattern,join_predict):
  381. #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
  382. item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
  383. break
  384. '''
  385. if _begin_index+_LEN>=len(list_sentence):
  386. break
  387. _begin_index += _LEN
  388. list_name_freq_score = []
  389. # 2020/11/23 大网站规则调整
  390. if len(dict_name_freq_score) == 0:
  391. name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
  392. for sentence in list_sentence:
  393. # pad_sentence = sentence.sentence_text
  394. othername = re.search(name_re1, sentence.sentence_text)
  395. if othername != None:
  396. project_name = othername.group(3)
  397. beg = find_index([project_name], sentence.sentence_text)[0]
  398. end = beg + len(project_name)
  399. _name = self.fitDataByRule(sentence.sentence_text[beg:end])
  400. # add name to entitys
  401. _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
  402. sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
  403. entity_type="name", sentence_index=sentence.sentence_index, begin_index=0,
  404. end_index=0, wordOffset_begin=beg, wordOffset_end=end,in_attachment=sentence.in_attachment)
  405. list_entity.append(_entity)
  406. w = 1
  407. if _name not in dict_name_freq_score:
  408. # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
  409. dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w]
  410. else:
  411. dict_name_freq_score[_name][0] += 1
  412. # othername = re.search(name_re1, sentence.sentence_text)
  413. # if othername != None:
  414. # _name = othername.group(3)
  415. # if _name not in dict_name_freq_score:
  416. # dict_name_freq_score[_name] = [1, len(re.findall(pattern_score, _name)) + len(_name) * 0.1]
  417. # else:
  418. # dict_name_freq_score[_name][0] += 1
  419. for _name in dict_name_freq_score.keys():
  420. list_name_freq_score.append([_name,dict_name_freq_score[_name]])
  421. # print(list_name_freq_score)
  422. if len(list_name_freq_score)>0:
  423. list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1],reverse=True)
  424. item['name'] = list_name_freq_score[0][0]
  425. # if list_name_freq_score[0][1][0]>1:
  426. # item[1]['name'] = list_name_freq_score[0][0]
  427. # else:
  428. # list_name_freq_score.sort(key=lambda x:x[1][1],reverse=True)
  429. # item[1]["name"] = list_name_freq_score[0][0]
  430. #下面代码加上去用正则添加某些识别不到的项目编号
  431. if item['code'] == []:
  432. for sentence in list_sentence:
  433. # othercode = re.search('(采购计划编号|询价编号)[\))]?[::]?([\[\]a-zA-Z0-9\-]{5,30})', sentence.sentence_text)
  434. # if othercode != None:
  435. # item[1]['code'].append(othercode.group(2))
  436. # 2020/11/23 大网站规则调整
  437. othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价单|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告)(单号|编号|标号|编码|代码|备案号|号)[::\s]+([^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。]', sentence.sentence_text)
  438. if othercode != None:
  439. item['code'].append(othercode.group(3))
  440. item['code'] = [code for code in item['code'] if len(code)<500]
  441. item['code'].sort(key=lambda x:len(x),reverse=True)
  442. result.append(item)
  443. list_sentence.sort(key=lambda x: x.sentence_index,reverse=False)
  444. return result
  445. '''
  446. #当数据量过大时会报错
  447. def predict(self,articles,MAX_LEN = None):
  448. sentences = []
  449. for article in articles:
  450. for sentence in article.content.split("。"):
  451. sentences.append([sentence,article.id])
  452. if MAX_LEN is None:
  453. sent_len = [len(sentence[0]) for sentence in sentences]
  454. MAX_LEN = max(sent_len)
  455. #print(MAX_LEN)
  456. #若为空,则直接返回空
  457. result = []
  458. if MAX_LEN==0:
  459. for article in articles:
  460. result.append([article.id,{"code":[],"name":""}])
  461. return result
  462. index_unk = self.word2index.get("<unk>")
  463. index_pad = self.word2index.get("<pad>")
  464. x = [[self.word2index.get(word,index_unk)for word in sentence[0]]for sentence in sentences]
  465. x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
  466. predict_y = self.getModel().predict(x)
  467. last_doc_id = ""
  468. item = []
  469. for sentence,predict in zip(sentences,np.argmax(predict_y,-1)):
  470. pad_sentence = sentence[0][:MAX_LEN]
  471. doc_id = sentence[1]
  472. join_predict = "".join([str(s) for s in predict])
  473. if doc_id!=last_doc_id:
  474. if last_doc_id!="":
  475. result.append(item)
  476. item = [doc_id,{"code":[],"name":""}]
  477. code_set = set()
  478. code_x = []
  479. code_text = []
  480. for iter in re.finditer(self.PC_pattern,join_predict):
  481. get_len = 40
  482. if iter.span()[0]<get_len:
  483. begin = 0
  484. else:
  485. begin = iter.span()[0]-get_len
  486. end = iter.span()[1]+get_len
  487. code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]],pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
  488. code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]])
  489. if len(code_x)>0:
  490. code_x = np.transpose(np.array(code_x),(1,0,2,3))
  491. predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
  492. for h in range(len(predict_code)):
  493. if predict_code[h][0]>0.5:
  494. the_code = self.fitDataByRule(code_text[h])
  495. if the_code not in code_set:
  496. code_set.add(the_code)
  497. item[1]['code'] = list(code_set)
  498. if item[1]['name']=="":
  499. for iter in re.finditer(self.PN_pattern,join_predict):
  500. #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
  501. item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
  502. break
  503. last_doc_id = doc_id
  504. result.append(item)
  505. return result
  506. '''
# Role & money classification models
class PREMPredict():
    '''
    Role & money classifier for extracted entities.

    Runs the role model over org/company entities and the money model over
    money entities, then patches systematic model mistakes with regex rules
    applied to each entity's surrounding text.
    '''

    def __init__(self,config=None):
        # Path of a pretrained role model; kept as an attribute but not read
        # anywhere inside this class.
        #self.model_role_file = os.path.abspath("../role/models/model_role.model.hdf5")
        self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5"
        # `config` is forwarded to both model wrappers (presumably a TF
        # session/config object — TODO confirm).
        self.model_role = Model_role_classify_word(config=config)
        self.model_money = Model_money_classify(config=config)
        return

    def search_role_data(self,list_sentences,list_entitys):
        '''
        @summary: build the role-model inputs from the sentence and entity lists
        @param:
            list_sentences: per-document lists of Sentence objects
            list_entitys: per-document lists of Entity objects
        @return: [data_x, points_entitys, text_list] — encoded token windows,
            the matched org/company entities, and their raw context strings;
            None when no org/company entity could be matched
        '''
        text_list = []
        data_x = []
        points_entitys = []
        for list_entity,list_sentence in zip(list_entitys,list_sentences):
            # Both lists are sorted by sentence index so one forward pass with
            # two pointers can align each entity with its sentence.
            list_entity.sort(key=lambda x:x.sentence_index)
            list_sentence.sort(key=lambda x:x.sentence_index)
            p_entitys = 0
            # NOTE: p_sentences is intentionally NOT reset per entity — the
            # sentence pointer only moves forward (entities are sorted), and it
            # stays on a matched sentence so several entities in the same
            # sentence all match it.
            p_sentences = 0
            while(p_entitys<len(list_entity)):
                entity = list_entity[p_entitys]
                if entity.entity_type in ['org','company']:
                    while(p_sentences<len(list_sentence)):
                        sentence = list_sentence[p_sentences]
                        if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
                            # Keep a character window (13 before / 10 after the
                            # entity span) for the regex post-corrections below.
                            text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-13):entity.wordOffset_end+10])
                            #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_ROLE_INPUT_SHAPE[1]),shape=settings.MODEL_ROLE_INPUT_SHAPE)
                            item_x = self.model_role.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,entity_text=entity.entity_text)
                            data_x.append(item_x)
                            points_entitys.append(entity)
                            break
                        p_sentences += 1
                p_entitys += 1
        if len(points_entitys)==0:
            return None
        return [data_x,points_entitys, text_list]

    def search_money_data(self,list_sentences,list_entitys):
        '''
        @summary: build the money-model inputs from the sentence and entity lists
        @param:
            list_sentences: per-document lists of Sentence objects
            list_entitys: per-document lists of Entity objects
        @return: [data_x, points_entitys, text_list] for money entities, or
            None when no money entity could be matched
        '''
        text_list = []
        data_x = []
        points_entitys = []
        for list_entity,list_sentence in zip(list_entitys,list_sentences):
            list_entity.sort(key=lambda x:x.sentence_index)
            list_sentence.sort(key=lambda x:x.sentence_index)
            p_entitys = 0
            while(p_entitys<len(list_entity)):
                entity = list_entity[p_entitys]
                if entity.entity_type=="money":
                    # Unlike search_role_data, the sentence scan restarts from
                    # the beginning for every money entity.
                    p_sentences = 0
                    while(p_sentences<len(list_sentence)):
                        sentence = list_sentence[p_sentences]
                        if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
                            # 8 characters of left context feed the post-rules.
                            text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin - 8):entity.wordOffset_end])
                            #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_MONEY_INPUT_SHAPE[1]),shape=settings.MODEL_MONEY_INPUT_SHAPE)
                            #item_x = embedding_word(spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index, end_index=entity.end_index, size=10, center_include=True, word_flag=True),shape=settings.MODEL_MONEY_INPUT_SHAPE)
                            item_x = self.model_money.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
                            data_x.append(item_x)
                            points_entitys.append(entity)
                            break
                        p_sentences += 1
                p_entitys += 1
        if len(points_entitys)==0:
            return None
        return [data_x,points_entitys, text_list]

    def predict_role(self,list_sentences, list_entitys):
        '''
        @summary: classify org/company entities into role labels and attach
            (label, confidence vector) to each entity via set_Role.
        '''
        datas = self.search_role_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        text_list = datas[2]
        if USE_PAI_EAS:
            # Remote inference via PAI-EAS: pack the three encoded input
            # tensors into a protobuf PredictRequest.
            _data = datas[0]
            _data = np.transpose(np.array(_data),(1,0,2))
            request = tf_predict_pb2.PredictRequest()
            request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
            request.inputs["input0"].float_val.extend(np.array(_data[0],dtype=np.float64).reshape(-1))
            request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
            request.inputs["input1"].float_val.extend(np.array(_data[1],dtype=np.float64).reshape(-1))
            request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input2"].array_shape.dim.extend(np.shape(_data[2]))
            request.inputs["input2"].float_val.extend(np.array(_data[2],dtype=np.float64).reshape(-1))
            request_data = request.SerializeToString()
            list_outputs = ["outputs"]
            _result = vpc_requests(role_url, role_authorization, request_data, list_outputs)
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                # Fall back to local inference when the service call failed.
                predict_y = self.model_role.predict(datas[0])
        else:
            predict_y = self.model_role.predict(np.array(datas[0],dtype=np.float64))
        # Regex post-corrections: when a rule fires, the corrected confidence
        # is written into values[label] so the new label is also the arg-max.
        for i in range(len(predict_y)):
            entity = points_entitys[i]
            label = np.argmax(predict_y[i])
            values = predict_y[i]
            text = text_list[i]
            if label == 2:
                # From the rules below, label 2 looks like "winning bidder",
                # 0 like "tenderee", 5 like "bid candidate" — TODO confirm map.
                if re.search('中标单位和.{,25}签订合同', text):
                    label = 0
                    values[label] = 0.501
                elif re.search('尊敬的供应商:.{,25}我公司', text):
                    label = 0
                    values[label] = 0.801
                elif re.search('尊敬的供应商:', text):
                    label = 0
                    values[label] = 0.501
                elif re.search('[^\w]中标候选人', text[:15]) and re.search('[1一]', text[:15]) == None:
                    # fix: candidates ranked 4th or lower mispredicted as the winning bidder
                    label = 5
                    values[label] = 0.5
            elif re.search('是否中标:是,供应商', text) and label == 5:
                label = 2
                values[label] = 0.9
            elif label == 1 and re.search('委托(单位|人|方)[是为:]+', text[:10]) and re.search('受委托(单位|人|方)[是为:]+', text[:10])==None:
                label = 0
                values[label] = 0.501
            elif label == 1 and re.search('([,。:]|^)(服务|中选)机构(名称)?', text[:-10]):
                label = 2
                values[label] = 0.501
            entity.set_Role(label, values)

    def predict_money(self,list_sentences,list_entitys):
        '''
        @summary: classify money entities and attach (label, confidence
            vector) to each entity via set_Money.
        '''
        datas = self.search_money_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        _data = datas[0]
        text_list = datas[2]
        if USE_PAI_EAS:
            _data = np.transpose(np.array(_data),(1,0,2,3))
            request = tf_predict_pb2.PredictRequest()
            request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
            request.inputs["input0"].float_val.extend(np.array(_data[0],dtype=np.float64).reshape(-1))
            request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
            request.inputs["input1"].float_val.extend(np.array(_data[1],dtype=np.float64).reshape(-1))
            request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input2"].array_shape.dim.extend(np.shape(_data[2]))
            request.inputs["input2"].float_val.extend(np.array(_data[2],dtype=np.float64).reshape(-1))
            request_data = request.SerializeToString()
            list_outputs = ["outputs"]
            _result = vpc_requests(money_url, money_authorization, request_data, list_outputs)
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                # NOTE(review): this fallback receives the transposed _data,
                # while predict_role falls back on the untransposed datas[0] —
                # confirm which layout the local money model expects.
                predict_y = self.model_money.predict(_data)
        else:
            predict_y = self.model_money.predict(_data)
        for i in range(len(predict_y)):
            entity = points_entitys[i]
            label = np.argmax(predict_y[i])
            values = predict_y[i]
            text = text_list[i]
            # Demote amounts preceded by "total amount/total price/unit price"
            # keywords, and label-0 amounts whose notes mark them as
            # investment / construction-cost figures.
            if label == 1 and re.search('[::,。](总金额|总价|单价)', text):
                values[label] = 0.49
            elif label ==0 and entity.notes in ["投资", "工程造价"]:
                values[label] = 0.49
            entity.set_Money(label, values)

    def correct_money_by_rule(self, title, list_entitys, list_articles):
        '''
        @summary: when the title mentions exactly one of 监理/设计/勘察
            (supervision/design/survey) and no construction keyword, re-label
            matching fee amounts (label 2) as tender money (0) or winning-bid
            money (1), depending on whether the title / first 100 chars of the
            article read like an award announcement. (Per the original
            2021/11/18 note: reclassify the fee by announcement category.)
        '''
        if len(re.findall('监理|施工|设计|勘察', title)) == 1 and re.search('施工|总承包|epc|EPC', title) == None:
            keyword = re.search('监理|设计|勘察', title).group(0)
            for list_entity in list_entitys:
                for _entity in list_entity:
                    # print('keyword:',keyword, '_entity.notes :',_entity.notes)
                    if _entity.entity_type == "money" and _entity.notes == keyword and _entity.label == 2:
                        # if channel_dic['docchannel'] == "招标公告":
                        # No award-style wording found -> treat as tender money.
                        if re.search('中标|成交|中选|中价|中租|结果|入围', title + list_articles[0].content[:100]) == None:
                            _entity.values[0] = 0.51
                            _entity.set_Money(0, _entity.values)  # 2021/11/18 reclassify the fee as tender or winning-bid money by announcement category
                        else:
                            _entity.values[1] = 0.51
                            _entity.set_Money(1, _entity.values)

    def predict(self,list_sentences,list_entitys):
        '''Run role prediction then money prediction over the documents, in place.'''
        self.predict_role(list_sentences,list_entitys)
        self.predict_money(list_sentences,list_entitys)
# Contact-person model
  694. class EPCPredict():
    def __init__(self,config=None):
        # Build the contact-person classifier once; `config` is forwarded to
        # the model wrapper (presumably a TF session/config — TODO confirm).
        self.model_person = Model_person_classify(config=config)
  697. def search_person_data(self,list_sentences,list_entitys):
  698. '''
  699. @summary:根据句子list和实体list查询联系人模型的输入数据
  700. @param:
  701. list_sentences:文章的sentences
  702. list_entitys:文章的entitys
  703. @return:联系人模型的输入数据
  704. '''
  705. data_x = []
  706. points_entitys = []
  707. for list_entity,list_sentence in zip(list_entitys,list_sentences):
  708. p_entitys = 0
  709. dict_index_sentence = {}
  710. for _sentence in list_sentence:
  711. dict_index_sentence[_sentence.sentence_index] = _sentence
  712. _list_entity = [entity for entity in list_entity if entity.entity_type=="person"]
  713. while(p_entitys<len(_list_entity)):
  714. entity = _list_entity[p_entitys]
  715. if entity.entity_type=="person":
  716. sentence = dict_index_sentence[entity.sentence_index]
  717. item_x = self.model_person.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
  718. data_x.append(item_x)
  719. points_entitys.append(entity)
  720. p_entitys += 1
  721. if len(points_entitys)==0:
  722. return None
  723. # return [data_x,points_entitys,dianhua]
  724. return [data_x,points_entitys]
  725. def predict_person(self,list_sentences, list_entitys):
  726. datas = self.search_person_data(list_sentences, list_entitys)
  727. if datas is None:
  728. return
  729. points_entitys = datas[1]
  730. # phone = datas[2]
  731. if USE_PAI_EAS:
  732. _data = datas[0]
  733. _data = np.transpose(np.array(_data),(1,0,2,3))
  734. request = tf_predict_pb2.PredictRequest()
  735. request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
  736. request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
  737. request.inputs["input0"].float_val.extend(np.array(_data[0],dtype=np.float64).reshape(-1))
  738. request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
  739. request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
  740. request.inputs["input1"].float_val.extend(np.array(_data[1],dtype=np.float64).reshape(-1))
  741. request_data = request.SerializeToString()
  742. list_outputs = ["outputs"]
  743. _result = vpc_requests(person_url, person_authorization, request_data, list_outputs)
  744. if _result is not None:
  745. predict_y = _result["outputs"]
  746. else:
  747. predict_y = self.model_person.predict(datas[0])
  748. else:
  749. predict_y = self.model_person.predict(datas[0])
  750. # assert len(predict_y)==len(points_entitys)==len(phone)
  751. assert len(predict_y)==len(points_entitys)
  752. for i in range(len(predict_y)):
  753. entity = points_entitys[i]
  754. label = np.argmax(predict_y[i])
  755. values = []
  756. for item in predict_y[i]:
  757. values.append(item)
  758. # phone_number = phone[i]
  759. # entity.set_Person(label,values,phone_number)
  760. entity.set_Person(label,values,[])
  761. # 为联系人匹配电话
  762. # self.person_search_phone(list_sentences, list_entitys)
  763. def person_search_phone(self,list_sentences, list_entitys):
  764. def phoneFromList(phones):
  765. # for phone in phones:
  766. # if len(phone)==11:
  767. # return re.sub('电话[:|:]|联系方式[:|:]','',phone)
  768. return re.sub('电话[:|:]|联系方式[:|:]', '', phones[0])
  769. for list_entity, list_sentence in zip(list_entitys, list_sentences):
  770. # p_entitys = 0
  771. # p_sentences = 0
  772. #
  773. # key_word = re.compile('电话[:|:].{0,4}\d{7,12}|联系方式[:|:].{0,4}\d{7,12}')
  774. # # phone = re.compile('1[3|4|5|7|8][0-9][-—-]?\d{4}[-—-]?\d{4}|\d{3,4}[-—-]\d{7,8}/\d{3,8}|\d{3,4}[-—-]\d{7,8}转\d{1,4}|\d{3,4}[-—-]\d{7,8}|[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}') # 联系电话
  775. # # 2020/11/25 增加发现的号码段
  776. # phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-]?\d{4}[-—-]?\d{4}|'
  777. # '\d{3,4}[-—-][1-9]\d{6,7}/\d{3,8}|'
  778. # '\d{3,4}[-—-]\d{7,8}转\d{1,4}|'
  779. # '\d{3,4}[-—-]?[1-9]\d{6,7}|'
  780. # '[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|'
  781. # '[1-9]\d{6,7}') # 联系电话
  782. # dict_index_sentence = {}
  783. # for _sentence in list_sentence:
  784. # dict_index_sentence[_sentence.sentence_index] = _sentence
  785. #
  786. # dict_context_itemx = {}
  787. # last_person = "####****++++$$^"
  788. # last_person_phone = "####****++++$^"
  789. # _list_entity = [entity for entity in list_entity if entity.entity_type == "person"]
  790. # while (p_entitys < len(_list_entity)):
  791. # entity = _list_entity[p_entitys]
  792. # if entity.entity_type == "person" and entity.label in [1,2,3]:
  793. # sentence = dict_index_sentence[entity.sentence_index]
  794. # # item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_PERSON_INPUT_SHAPE[1]),shape=settings.MODEL_PERSON_INPUT_SHAPE)
  795. #
  796. # # s = spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=20)
  797. #
  798. # # 2021/5/8 取上下文的句子,解决表格处理的分句问题
  799. # left_sentence = dict_index_sentence.get(entity.sentence_index - 1)
  800. # left_sentence_tokens = left_sentence.tokens if left_sentence else []
  801. # right_sentence = dict_index_sentence.get(entity.sentence_index + 1)
  802. # right_sentence_tokens = right_sentence.tokens if right_sentence else []
  803. # entity_beginIndex = entity.begin_index + len(left_sentence_tokens)
  804. # entity_endIndex = entity.end_index + len(left_sentence_tokens)
  805. # context_sentences_tokens = left_sentence_tokens + sentence.tokens + right_sentence_tokens
  806. # s = spanWindow(tokens=context_sentences_tokens, begin_index=entity_beginIndex,
  807. # end_index=entity_endIndex, size=20)
  808. #
  809. # _key = "".join(["".join(x) for x in s])
  810. # if _key in dict_context_itemx:
  811. # _dianhua = dict_context_itemx[_key][0]
  812. # else:
  813. # s1 = ''.join(s[1])
  814. # # s1 = re.sub(',)', '-', s1)
  815. # s1 = re.sub('\s', '', s1)
  816. # have_key = re.findall(key_word, s1)
  817. # have_phone = re.findall(phone, s1)
  818. # s0 = ''.join(s[0])
  819. # # s0 = re.sub(',)', '-', s0)
  820. # s0 = re.sub('\s', '', s0)
  821. # have_key2 = re.findall(key_word, s0)
  822. # have_phone2 = re.findall(phone, s0)
  823. #
  824. # s3 = ''.join(s[1])
  825. # # s0 = re.sub(',)', '-', s0)
  826. # s3 = re.sub(',|,|\s', '', s3)
  827. # have_key3 = re.findall(key_word, s3)
  828. # have_phone3 = re.findall(phone, s3)
  829. #
  830. # s4 = ''.join(s[0])
  831. # # s0 = re.sub(',)', '-', s0)
  832. # s4 = re.sub(',|,|\s', '', s0)
  833. # have_key4 = re.findall(key_word, s4)
  834. # have_phone4 = re.findall(phone, s4)
  835. #
  836. # _dianhua = ""
  837. # if have_phone:
  838. # if entity.entity_text != last_person and s0.find(last_person) != -1 and s1.find(
  839. # last_person_phone) != -1:
  840. # if len(have_phone) > 1:
  841. # _dianhua = phoneFromList(have_phone[1:])
  842. # else:
  843. # _dianhua = phoneFromList(have_phone)
  844. # elif have_key:
  845. # if entity.entity_text != last_person and s0.find(last_person) != -1 and s1.find(
  846. # last_person_phone) != -1:
  847. # if len(have_key) > 1:
  848. # _dianhua = phoneFromList(have_key[1:])
  849. # else:
  850. # _dianhua = phoneFromList(have_key)
  851. # elif have_phone2:
  852. # if entity.entity_text != last_person and s0.find(last_person) != -1 and s0.find(
  853. # last_person_phone) != -1:
  854. # if len(have_phone2) > 1:
  855. # _dianhua = phoneFromList(have_phone2[1:])
  856. # else:
  857. # _dianhua = phoneFromList(have_phone2)
  858. # elif have_key2:
  859. # if entity.entity_text != last_person and s0.find(last_person) != -1 and s0.find(
  860. # last_person_phone) != -1:
  861. # if len(have_key2) > 1:
  862. # _dianhua = phoneFromList(have_key2[1:])
  863. # else:
  864. # _dianhua = phoneFromList(have_key2)
  865. # elif have_phone3:
  866. # if entity.entity_text != last_person and s4.find(last_person) != -1 and s3.find(
  867. # last_person_phone) != -1:
  868. # if len(have_phone3) > 1:
  869. # _dianhua = phoneFromList(have_phone3[1:])
  870. # else:
  871. # _dianhua = phoneFromList(have_phone3)
  872. # elif have_key3:
  873. # if entity.entity_text != last_person and s4.find(last_person) != -1 and s3.find(
  874. # last_person_phone) != -1:
  875. # if len(have_key3) > 1:
  876. # _dianhua = phoneFromList(have_key3[1:])
  877. # else:
  878. # _dianhua = phoneFromList(have_key3)
  879. # elif have_phone4:
  880. # if entity.entity_text != last_person and s4.find(last_person) != -1 and s4.find(
  881. # last_person_phone) != -1:
  882. # if len(have_phone4) > 1:
  883. # _dianhua = phoneFromList(have_phone4)
  884. # else:
  885. # _dianhua = phoneFromList(have_phone4)
  886. # elif have_key4:
  887. # if entity.entity_text != last_person and s4.find(last_person) != -1 and s4.find(
  888. # last_person_phone) != -1:
  889. # if len(have_key4) > 1:
  890. # _dianhua = phoneFromList(have_key4)
  891. # else:
  892. # _dianhua = phoneFromList(have_key4)
  893. # else:
  894. # _dianhua = ""
  895. # # dict_context_itemx[_key] = [item_x, _dianhua]
  896. # dict_context_itemx[_key] = [_dianhua]
  897. # # points_entitys.append(entity)
  898. # # dianhua.append(_dianhua)
  899. # last_person = entity.entity_text
  900. # if _dianhua:
  901. # # 更新联系人entity联系方式(person_phone)
  902. # entity.person_phone = _dianhua
  903. # last_person_phone = _dianhua
  904. # else:
  905. # last_person_phone = "####****++++$^"
  906. # p_entitys += 1
  907. from scipy.optimize import linear_sum_assignment
  908. from BiddingKG.dl.interface.Entitys import Match
  909. def dispatch(match_list):
  910. main_roles = list(set([match.main_role for match in match_list]))
  911. attributes = list(set([match.attribute for match in match_list]))
  912. label = np.zeros(shape=(len(main_roles), len(attributes)))
  913. for match in match_list:
  914. main_role = match.main_role
  915. attribute = match.attribute
  916. value = match.value
  917. label[main_roles.index(main_role), attributes.index(attribute)] = value + 10000
  918. # print(label)
  919. gragh = -label
  920. # km算法
  921. row, col = linear_sum_assignment(gragh)
  922. max_dispatch = [(i, j) for i, j, value in zip(row, col, gragh[row, col]) if value]
  923. return [Match(main_roles[row], attributes[col]) for row, col in max_dispatch]
  924. # km算法
  925. key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)(\d{7,12})')
  926. phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
  927. '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
  928. '0\d{2,3}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
  929. '0\d{2,3}[-—-―]\d{7,8}转\d{1,4}|'
  930. '0\d{2,3}[-—-―]?[1-9]\d{6,7}|'
  931. '[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|'
  932. '[1-9]\d{6,7}')
  933. phone_entitys = []
  934. for _sentence in list_sentence:
  935. sentence_text = _sentence.sentence_text
  936. res_set = set()
  937. for i in re.finditer(phone,sentence_text):
  938. res_set.add((i.group(),i.start(),i.end()))
  939. for i in re.finditer(key_word,sentence_text):
  940. res_set.add((i.group(2),i.start()+len(i.group(1)),i.end()))
  941. for item in list(res_set):
  942. phone_left = sentence_text[max(0,item[1]-10):item[1]]
  943. phone_right = sentence_text[item[2]:item[2]+8]
  944. # 排除传真号 和 其它错误项
  945. if re.search("传,?真|信,?箱|邮,?箱",phone_left):
  946. if not re.search("电,?话",phone_left):
  947. continue
  948. if re.search("帐,?号|编,?号|报,?价|证,?号|价,?格|[\((]万?元[\))]",phone_left):
  949. continue
  950. if re.search("[.,]\d{2,}",phone_right):
  951. continue
  952. _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, None, None,item[1], item[2],in_attachment=_sentence.in_attachment)
  953. phone_entitys.append(_entity)
  954. person_entitys = []
  955. for entity in list_entity:
  956. if entity.entity_type == "person":
  957. entity.person_phone = ""
  958. person_entitys.append(entity)
  959. _list_entity = phone_entitys + person_entitys
  960. _list_entity = sorted(_list_entity,key=lambda x:(x.sentence_index,x.wordOffset_begin))
  961. words_num_dict = dict()
  962. last_words_num = 0
  963. list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
  964. for sentence in list_sentence:
  965. _index = sentence.sentence_index
  966. if _index == 0:
  967. words_num_dict[_index] = 0
  968. else:
  969. words_num_dict[_index] = words_num_dict[_index - 1] + last_words_num
  970. last_words_num = len(sentence.sentence_text)
  971. match_list = []
  972. for index in range(len(_list_entity)):
  973. entity = _list_entity[index]
  974. if entity.entity_type=="person" and entity.label in [1,2,3]:
  975. match_nums = 0
  976. for after_index in range(index + 1, min(len(_list_entity), index + 5)):
  977. after_entity = _list_entity[after_index]
  978. if after_entity.entity_type=="phone":
  979. sentence_distance = after_entity.sentence_index - entity.sentence_index
  980. distance = (words_num_dict[after_entity.sentence_index] + after_entity.wordOffset_begin) - (
  981. words_num_dict[entity.sentence_index] + entity.wordOffset_end)
  982. if sentence_distance < 2 and distance < 50:
  983. value = (-1 / 2 * (distance ** 2)) / 10000
  984. match_list.append(Match(entity, after_entity, value))
  985. match_nums += 1
  986. else:
  987. break
  988. if after_entity.entity_type=="person":
  989. if after_entity.label not in [1,2,3]:
  990. break
  991. if not match_nums:
  992. for previous_index in range(index-1, max(0,index-5), -1):
  993. previous_entity = _list_entity[previous_index]
  994. if previous_entity.entity_type == "phone":
  995. sentence_distance = entity.sentence_index - previous_entity.sentence_index
  996. distance = (words_num_dict[entity.sentence_index] + entity.wordOffset_begin) - (
  997. words_num_dict[previous_entity.sentence_index] + previous_entity.wordOffset_end)
  998. if sentence_distance < 1 and distance<30:
  999. # 前向 没有 /10000
  1000. value = (-1 / 2 * (distance ** 2))
  1001. match_list.append(Match(entity, previous_entity, value))
  1002. else:
  1003. break
  1004. result = dispatch(match_list)
  1005. for match in result:
  1006. entity = match.main_role
  1007. # 更新 list_entity
  1008. entity_index = list_entity.index(entity)
  1009. list_entity[entity_index].person_phone = match.attribute.entity_text
  1010. def predict(self,list_sentences,list_entitys):
  1011. self.predict_person(list_sentences,list_entitys)
  1012. #表格预测
  1013. class FormPredictor():
  1014. def __init__(self,lazyLoad=getLazyLoad(),config=None):
  1015. self.model_file_line = os.path.dirname(__file__)+"/../form/model/model_form.model_line.hdf5"
  1016. self.model_file_item = os.path.dirname(__file__)+"/../form/model/model_form.model_item.hdf5"
  1017. self.model_form_item = Model_form_item(config=config)
  1018. self.model_dict = {"line":[None,self.model_file_line]}
  1019. self.model_form_context = Model_form_context(config=config)
  1020. def getModel(self,type):
  1021. if type=="item":
  1022. return self.model_form_item
  1023. elif type=="context":
  1024. return self.model_form_context
  1025. else:
  1026. return self.getModel(type)
  1027. def encode(self,data,**kwargs):
  1028. return encodeInput([data], word_len=50, word_flag=True,userFool=False)[0]
  1029. return encodeInput_form(data)
  1030. def predict(self,form_datas,type):
  1031. if type=="item":
  1032. return self.model_form_item.predict(form_datas)
  1033. elif type=="context":
  1034. return self.model_form_context.predict(form_datas)
  1035. else:
  1036. return self.getModel(type).predict(form_datas)
  1037. #角色规则
  1038. #依据正则给所有无角色的实体赋予角色,给予等于阈值的最低概率
class RoleRulePredictor():
    """Regex-rule fallback that assigns a role (tenderee / agency / winner /
    second / third candidate) to org/company entities the model left
    unlabeled or labeled below the threshold, giving them at least the
    threshold probability. Also reclassifies "other" money entities as
    tenderee/tenderer amounts via keyword patterns."""

    def __init__(self):
        # In group names such as (?P<tenderee_left_w1>...), the trailing "w1"
        # marks a higher-weight keyword (probability weight 1.2).
        self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|最终|建设|业主|甲|转让|招租|议标|合同主体|挂牌|出租|出让|买受|出售|标卖|处置)" \
                                     "(人|方|单位|组织|用户|业主|主体|部门|公司)|文章来源|委托机构|产权所有人|需求?方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|结算单位)"\
                                     "[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
        self.pattern_tenderee_left_w0 = "(?P<tenderee_left>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|最终|建设|业主|甲|转让|招租|议标|合同主体|挂牌|出租|出让|买受|出售|标卖|处置)" \
                                        "(人|方|单位|组织|用户|业主|主体|部门|公司)|文章来源|委托机构|产权所有人|需求?方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|结算单位)"\
                                        "[))]?(信息|联系方式|概况)?[,,。::]?([((]?(1|2|1.1|1.2)[))]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)"
        self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)" \
                                        "(人|公司|单位|组织|用户|业主|主体|方|部门))" \
                                        "(是|为|:|:|\s*)+$)"
        self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向|尊敬的供应商(伙伴)?:\w{5,20}(以下简称“\w{2,5}”)))"
        self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束))" #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
        self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
        self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)" # |^受托 would clash with phrases like 受托生产; agency wording usually has a trailing comma
        # 2020/11/24 big-site rules: added 选定单位|指定的中介服务机构 to the winner keywords
        self.pattern_winTenderer_left = "(?P<winTenderer_left>(乙|承做|施工|供货|承包|承建|承租|竞得|受让|签约)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)[::是为]+$|" \
                                        "(选定单位|指定的中介服务机构|实施主体|承制单位|供方)[::是为]+$|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|" \
                                        "单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(供应|供货|承销|承保|承包|承接|服务|实施|合作)(机构|单位|商|方)(名称)?[::是为]+$)"
        self.pattern_winTenderer_left_w0 = "(?P<winTenderer_left_w1>(,|。|^)((中标(投标)?|中选|中价|成交)(人|单位|机构|供应商|客户|方|公司|厂商|商)|第?[一1]名)(名称)?[,,]?([((]按综合排名排序[))])?[::,,]$)" # covers table headers the model misses when followed by a comma; must be preceded by ,。 or string start
        self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标|中选|中价|成交|入选)(人|单位|机构|供应商|客户|方|公司|厂商|商)(名称)?([((]按综合排名排序[))])?[::是为]+$)" # comma removed (avoids e.g. 并拒绝执行改进计划的供应商,…); 中标候选人 must not count as winner
        # self.pattern_winTenderer_center = "(?P<winTenderer_center>第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])"
        # self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)"
        self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为]((采购|中标)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))|" \
                                         "^(报价|价格)最低,确定为本项目成交供应商|^:贵公司参与|^:?你方于|^中标。|^[成作]?为([\w、()()]+|本|此|该)项目的?(成交|中选|中标|服务)(供应商|单位|人)|^[((](中标|成交|承包)人名?称?[))]))"
        self.pattern_winTenderer_whole = "(?P<winTenderer_center>贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|决定由.{5,20}承办|(谈判结果:|确定)由.{5,20}(向我单位)?供货|中标通知书.{,15}你方|单一来源从[()\w]{5,20}采购)" # 2020/11/24 big-site rules: added 谈判结果:由.{5,20}供货 to the winner keywords
        # self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)"
        self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名)[::]第?[二2]名?,?投标商名称[::]+$))"
        self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
        self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
        self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
        self.pattern_whole = [self.pattern_tenderee_left_w1,
                              self.pattern_tenderee_left,
                              self.pattern_tenderee_left_w0,
                              self.pattern_tenderee_center,
                              self.pattern_tenderee_right,
                              self.pattern_tendereeORagency_right,
                              self.pattern_agency_left,
                              self.pattern_agency_right,
                              self.pattern_winTenderer_left_w1,
                              self.pattern_winTenderer_left,
                              self.pattern_winTenderer_left_w0,
                              self.pattern_winTenderer_whole,
                              self.pattern_winTenderer_right,
                              self.pattern_secondTenderer_left,
                              self.pattern_secondTenderer_right,
                              self.pattern_thirdTenderer_left,
                              self.pattern_thirdTenderer_right
                              ] # order matters: second/third-winner patterns must come after the winner patterns
        # entities that can never be a winner; matched entities get label 5 ("none")
        self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
        self.pattern_money_tenderee = re.compile("投标最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|投资估算|采购(单位|人)委托价|限价|拦标价|预算金额|标底|总计|限额")
        self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况|承包价")
        self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
        self.pattern_money_other = re.compile("代理费|服务费")
        # package/lot markers (包/标段 numbers) used by the money-extension pass below
        self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"

    def _check_input(self, text, ignore=False):
        """Normalize ``text`` into a list of strings.

        Returns [] for falsy input; wraps a scalar into a list.
        Raises a generic Exception on empty items unless ``ignore`` is True.
        """
        if not text:
            return []
        if not isinstance(text, list):
            text = [text]
        null_index = [i for i, t in enumerate(text) if not t]
        if null_index and not ignore:
            raise Exception("null text in input ")
        return text

    def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5):
        """Apply the regex rules document by document.

        :param list_articles: article objects (title/content), one per doc
        :param list_sentences: per-doc lists of sentence objects
        :param list_entitys: per-doc lists of entity objects (mutated in place)
        :param list_codenames: per-doc code/name dicts (currently unused here)
        :param on_value: probability threshold; recalled roles get at least this value
        """
        for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences,
                                                                      list_codenames):
            list_sentence.sort(key=lambda x: x.sentence_index) # 2022/1/5 sort sentences by index
            # list_name = list_codename["name"]
            list_name = [] # 2022/1/5 changed: collect every project-name entity from the entity list
            for entity in list_entity:
                if entity.entity_type == 'name':
                    list_name.append(entity.entity_text)
            list_name = self._check_input(list_name) + [article.title]
            for p_entity in list_entity:
                if p_entity.entity_type in ["org", "company"]:
                    # only process entities whose role is "none" or whose probability is below the threshold
                    if p_entity.label is None:
                        continue
                    # push the probability of entities whose context contains the title towards 0.5/0.6,
                    # because an entity appearing in the title is not necessarily the tenderee
                    if str(p_entity.label) == "0":
                        find_flag = False
                        for _sentence in list_sentence:
                            if _sentence.sentence_index == p_entity.sentence_index:
                                _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                                   end_index=p_entity.end_index, size=20, center_include=True,
                                                   word_flag=True, use_text=True,
                                                   text=re.sub(")", ")", re.sub("(", "(", p_entity.entity_text)))
                                for _name in list_name:
                                    if _name != "" and str(_span[0][-10:]+_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0: # include some left context so companies that are not at the start of the project name are still detected
                                        find_flag = True
                                        if p_entity.values[0] > on_value:
                                            p_entity.values[0] = 0.5 + (p_entity.values[0] - 0.5) / 10
                                        else:
                                            p_entity.values[0] = on_value # 2022/03/08 fix cases like 223985179 where the company in the leading project name never reached 0.5
                        if find_flag:
                            continue
                    # recall roles by regex from below-threshold or "other"-class entities
                    role_prob = float(p_entity.values[int(p_entity.label)])
                    if role_prob < on_value or str(p_entity.label) == "5":
                        # treat entities found in the title as the tenderee
                        _list_name = self._check_input(list_name, ignore=True)
                        find_flag = False
                        for _name in _list_name: # 2022/1/5 fix: any role appearing inside a project name is marked as tenderee at every position
                            if str(_name).find(re.sub(")", ")", re.sub("(", "(",
                                                                       p_entity.entity_text))) >= 0 and p_entity.sentence_index < 4:
                                for _sentence in list_sentence:
                                    if _sentence.sentence_index == p_entity.sentence_index:
                                        _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                                           end_index=p_entity.end_index, size=20, center_include=True,
                                                           word_flag=True, use_text=True, text=re.sub(")", ")",
                                                                                                      re.sub("(", "(",
                                                                                                             p_entity.entity_text)))
                                        if str(_span[1] + _span[2][:len(str(_name))]).find(
                                                _name) >= 0:
                                            find_flag = True
                                            _label = 0
                                            p_entity.label = _label
                                            p_entity.values[int(_label)] = on_value
                                            break
                            if p_entity.sentence_index >= 4:
                                break
                            if find_flag:
                                break
                        # if str(_name).find(p_entity.entity_text)>=0:
                        #     find_flag = True
                        #     _label = 0
                        #     p_entity.label = _label
                        #     p_entity.values[int(_label)] = on_value
                        #     break
                        # if the entity occurred in the title it defaults to tenderee; skip the rules below
                        if find_flag:
                            continue
                        for s_index in range(len(list_sentence)):
                            if p_entity.doc_id == list_sentence[s_index].doc_id and p_entity.sentence_index == \
                                    list_sentence[s_index].sentence_index:
                                tokens = list_sentence[s_index].tokens
                                begin_index = p_entity.begin_index
                                end_index = p_entity.end_index
                                size = 15
                                spans = spanWindow(tokens, begin_index, end_index, size, center_include=True,
                                                   word_flag=True, use_text=False)
                                # _flag = False
                                # special handling for award-notice (中标通知书) documents
                                try:
                                    if s_index == 0 and re.search('中标通知书.{,30}[,:]%s:'%p_entity.entity_text.replace('(', '').replace(')', ''),
                                                                  list_sentence[s_index].sentence_text.replace('(', '').replace(')', '')[:100]):
                                        p_entity.label = 2
                                        p_entity.values[2] = 0.5
                                        # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group, _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
                                        break
                                except Exception as e:
                                    print('正则报错:', e)
                                # resolve conflicts with regex + distance
                                # 2021/6/11 update center: spans[1] --> spans[0][-30:]+spans[1]
                                list_spans = [spans[0][-30:], spans[0][-10:] + spans[1] + spans[2][:25], spans[2]] # left / middle / right context of the entity
                                for _i_span in range(len(list_spans)):
                                    _flag = False
                                    _prob_weight = 1
                                    # print(list_spans[_i_span],p_entity.entity_text)
                                    for _pattern in self.pattern_whole:
                                        for _iter in re.finditer(_pattern, list_spans[_i_span]):
                                            for _group, _v_group in _iter.groupdict().items():
                                                if _v_group is not None and _v_group != "":
                                                    _role = _group.split("_")[0]
                                                    if _role == "tendereeORagency": # 2022/3/9 new logic to disambiguate tenderee vs agency
                                                        # print('p_entity_sentenceindex:', p_entity.sentence_index)
                                                        if p_entity.sentence_index>=1: # only do this fuzzy match in the first sentence
                                                            continue
                                                        if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', p_entity.entity_text)\
                                                                or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', p_entity.entity_text) == None:
                                                            _role = 'tenderee'
                                                        else:
                                                            _role = "agency"
                                                    _direct = _group.split("_")[1]
                                                    _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
                                                    # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                    #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)|第[四五六七4567]|是否中标:否', # case 135463002: 拟招一家供应商… must not be recalled as winner
                                                                                                       list_spans[0]) == None: # 2021/12/22 fix wrong winner recalls, case 208668937
                                                        _flag = True
                                                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                                                        _prob_weight = 1.2 if _weight=='w1' else 1
                                                        # print('_v_group:',_group, _v_group, p_entity.entity_text)
                                                    if _i_span == 1 and _direct == "center":
                                                        _flag = True
                                                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                                                        _prob_weight = 1.2 if _weight == 'w1' else 1
                                                        # print('_v_group:', _group, _v_group, p_entity.entity_text)
                                                    if _i_span == 2 and _direct == "right":
                                                        _flag = True
                                                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                                                        _prob_weight = 1.2 if _weight == 'w1' else 1
                                                        # print('_v_group:', _group, _v_group, p_entity.entity_text)
                                    # apply the matched result
                                    if _flag:
                                        p_entity.label = _label
                                        p_entity.values[int(_label)] = on_value*_prob_weight + p_entity.values[int(_label)] / 10
                                        # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group, _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
                                        break
                # "other" money entities may be recalled by regex as tenderee or tenderer amounts
                if p_entity.entity_type in ["money"]:
                    if str(p_entity.label) == "2":
                        for _sentence in list_sentence:
                            if _sentence.sentence_index == p_entity.sentence_index:
                                _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                                   end_index=p_entity.end_index, size=10, center_include=True,
                                                   word_flag=True, text=p_entity.entity_text)
                                if re.search(',\w{2,}', _span[0]):
                                    _span[0] = _span[0].split(',')[-1] # avoid misjudging when several prices sit next to each other
                                if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(
                                        self.pattern_money_other, _span[0]) is None:
                                    p_entity.values[0] = 0.8 + p_entity.values[0] / 10
                                    p_entity.label = 0
                                if re.search(self.pattern_money_tenderer, _span[0]) is not None:
                                    if re.search(self.pattern_money_other, _span[0]) is not None:
                                        # both tenderer and "other" keywords present: the later match wins
                                        if re.search(self.pattern_money_tenderer, _span[0]).span()[1] > \
                                                re.search(self.pattern_money_other, _span[0]).span()[1]:
                                            p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                            p_entity.label = 1
                                    else:
                                        p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                        p_entity.label = 1
                                if re.search(self.pattern_money_tenderer_whole,
                                             "".join(_span)) is not None and re.search(self.pattern_money_other,
                                                                                       _span[0]) is None:
                                    p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                    p_entity.label = 1
            # tenderee-money extension: a tenderee money followed by consecutive unrecognized moneys,
            # all matching package/lot info, marks those unrecognized moneys as tenderee money too
            list_p = []
            state = 0
            for p_entity in list_entity:
                for _sentence in list_sentence:
                    if _sentence.sentence_index == p_entity.sentence_index:
                        _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                           end_index=p_entity.end_index, size=20, center_include=True, word_flag=True,
                                           text=p_entity.entity_text)
                if state == 2:
                    # flush: promote the collected follow-up moneys to tenderee money
                    for _p in list_p[1:]:
                        _p.values[0] = 0.8 + _p.values[0] / 10
                        _p.label = 0
                    state = 0
                    list_p = []
                if state == 0:
                    if p_entity.entity_type in ["money"]:
                        if str(p_entity.label) == "0" and re.search(self.pattern_pack,
                                                                    _span[0] + "-" + _span[2]) is not None:
                            state = 1
                            list_p.append(p_entity)
                elif state == 1:
                    if p_entity.entity_type in ["money"]:
                        if str(p_entity.label) in ["0", "2"] and re.search(self.pattern_pack,
                                                                           _span[0] + "-" + _span[
                                                                               2]) is not None and re.search(
                                self.pattern_money_other,
                                _span[0] + "-" + _span[2]) is None and p_entity.sentence_index == list_p[
                                0].sentence_index:
                            list_p.append(p_entity)
                        else:
                            state = 2
            # final flush if the loop ended while still collecting
            if len(list_p) > 1:
                for _p in list_p[1:]:
                    # print("==",_p.entity_text,_p.sentence_index,_p.label)
                    _p.values[0] = 0.8 + _p.values[0] / 10
                    _p.label = 0
                state = 0
                list_p = []
            for p_entity in list_entity:
                # entities in the "never a winner" set get label 5 ("none")
                if p_entity.entity_text in self.SET_NOT_TENDERER:
                    p_entity.label = 5
  1315. '''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
  1316. class RoleRuleFinalAdd():
  1317. def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
  1318. '''
  1319. 最终规则召回角色
  1320. :param list_articles:
  1321. :param list_sentences:
  1322. :param list_entitys:
  1323. :param list_codenames:
  1324. :return:
  1325. '''
  1326. # text_end = list_articles[0].content.split('##attachment##')[0][-40:]
  1327. main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
  1328. end_tokens = []
  1329. for sentence in main_sentences[-5:]:
  1330. end_tokens.extend(sentence.tokens)
  1331. text_end = "".join(end_tokens[-30:])
  1332. # print(text_end)
  1333. # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
  1334. sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
  1335. sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
  1336. sear_ent2 = re.search('[,:](户名|开户名称|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
  1337. sear_ent3 = re.search('(买家信息|所有权人|土地权属单位|报名咨询|[收送交]货地点|)[,:](?P<entity>[\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
  1338. sear_ent4 = re.search('(发布(?:人|单位|机构|企业)|项目业主|所属公司|寻源单位)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})[,。]', list_articles[0].content[:5000])
  1339. sear_list = [sear_ent4 , sear_ent3 , sear_ent2 ,sear_ent1, sear_ent]
  1340. tenderee_notfound = True
  1341. agency_notfound = True
  1342. tenderee_list = []
  1343. ents = []
  1344. for ent in list_entitys[0]:
  1345. if ent.entity_type in ['org', 'company']:
  1346. if ent.label == 0 and ent.values[ent.label]>=0.5:
  1347. if '公共资源交易中心' in ent.entity_text:
  1348. ent.label = 5
  1349. continue
  1350. tenderee_list.append(ent.entity_text)
  1351. tenderee_notfound = False
  1352. elif ent.label == 1:
  1353. agency_notfound = False
  1354. elif ent.label == 5:
  1355. if '公共资源交易中心' in ent.entity_text:
  1356. continue
  1357. ents.append(ent)
  1358. if sear_ent or sear_ent1 or sear_ent2 or sear_ent3 or sear_ent4:
  1359. for _sear_ent in [_sear for _sear in sear_list if _sear]:
  1360. ent_re = _sear_ent.group('entity')
  1361. ent_re = ent_re.replace(',', '').replace("(","(").replace(")",")")
  1362. if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent_re)
  1363. or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None):
  1364. n = 0
  1365. for i in range(len(ents) - 1, -1, -1):
  1366. if not ents[i].in_attachment:
  1367. n += 1
  1368. if n > 3 and _sear_ent==sear_ent: # 文章末尾角色加日期这种只找后三个实体
  1369. break
  1370. if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
  1371. ents[i].label = 0
  1372. ents[i].values[0] = 0.5
  1373. tenderee_notfound = False
  1374. # log('正则最后补充实体: %s'%(ent_re))
  1375. break
  1376. elif agency_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) and ent_re not in tenderee_list:
  1377. n = 0
  1378. for i in range(len(ents) - 1, -1, -1):
  1379. if not ents[i].in_attachment:
  1380. n += 1
  1381. if n > 3 and _sear_ent==sear_ent: # 文章末尾角色加日期这种只找后三个实体
  1382. break
  1383. if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text)/len(ent_re)>0.6):
  1384. ents[i].label = 1
  1385. ents[i].values[1] = 0.5
  1386. agency_notfound = False
  1387. # log('正则最后补充实体: %s'%(ent_re))
  1388. break
  1389. if not tenderee_notfound:
  1390. break
  1391. elif list_codenames[0]['name'] != "": #把标题包含的公司实体作为招标人
  1392. # tenderee_notfound = True
  1393. # ents = []
  1394. # for ent in list_entitys[0]:
  1395. # if ent.entity_type in ['org', 'company']:
  1396. # if ent.label == 0:
  1397. # tenderee_notfound = False
  1398. # elif ent.label == 1:
  1399. # agency_notfound = False
  1400. # elif ent.label == 5:
  1401. # ents.append(ent)
  1402. if tenderee_notfound == True:
  1403. # print('list_codenames',list_codenames[0]['name'])
  1404. for ent in ents:
  1405. if ent.entity_text in list_codenames[0]['name']:
  1406. ent.label = 0
  1407. ent.values[0] = 0.5
  1408. tenderee_notfound == False
  1409. # log('正则召回标题中包含的实体:%s'%ent.entity_text)
  1410. break
  1411. # 招标人角色召回规则
  1412. class TendereeRuleRecall():
    def __init__(self):
        """Compile all regex patterns used by the tenderee recall rules.

        Patterns are grouped by where they are matched relative to the
        entity: left context (tenderee_left_*), right context
        (tenderee_right*), whole-document subject, and patterns that recall
        entities the NER model did not recognize at all (unrecognized*).
        """
        # self.tenderee_left = re.compile("(发布(人|单位|机构)|需求方(信息[,:])?(单位|公司)?名称|购买主体|收货单位|项目申请单位|发起组织|联系单位|"
        #                                 "询价(机构|企业)|联系(人|方式),?(单位|公司)(名称)?|联系(人|方式),名称)[::是为][^。;,]{,5}$")
        # self.tenderee_left_1 = re.compile("采购商公司|询价单位|项目法人单位|项目法人|项目业主名称|申购单位|预算单位|预算单位名称|预算单位单位名称|买方单位|需求公司|寻源单位|项目业主|采购商|业主单位咨询电话|需用单位|采购工厂|征集单位")
        self.tenderee_left_1 = re.compile("((?:采购商|项目法人|项目业主)(名称)?|(?:采购商|询价|项目法人|项目业主|申购|预算|买方|需求|寻源|需用|征集)(单位|公司)((?:单位|公司)?名称)?|询价企业|"
                                          "业主单位咨询电话|购买主体|采购工厂|需求方(信息[,:])?(单位|公司)?名称|采购单位[\((].{1,6}[\))])[::是为][^。;,]{,2}$")
        self.tenderee_left_2 = re.compile("(招标承办单位|交易人(?:名称)?|招标人代表|(采购|招标)联系人|交易单位|发起(单位|组织)|收货单位|使用方|买家信息)[::是为][^。;,]{,2}$")
        self.tenderee_left_3 = re.compile("[本我](?:公司|单位)[\(\[(【]?$")
        # self.tenderee_left_4 = re.compile("(采购机构|组织机构|组织方|执行单位|采购组织单位|招标组织单位|招标组织部门|采购执行方|采购执行单位|询价执行组织|组织单位|联系单位|联系部门)[::是为][^。;,]{,2}$")
        self.tenderee_left_4 = re.compile("(采购机构|(?:采购|招标|询价)?(组织|执行)(机构|方|单位|部门|组织)|联系(单位|部门)|联系(人|方式),?(单位|公司)(名称)?|联系(人|方式),名称)[::是为][^。;,]{,2}$")
        self.tenderee_left_5 = re.compile("(撰写单位|发布(?:人|单位|机构|公司|部门|企业))[^。;,]{,2}$")
        self.tenderee_right = re.compile("^[^。;::]{,5}[((](以?下简?称)?,?[,\"“]*[我本][\u4e00-\u9fa5]{1,2}[,\"”]*[))]|"
                                         "^[\((][^。;::\))]{,5}称(?:招标|采购)(?:人|单位)|"
                                         "^[^。;::]{,10}[对就][^。;,]+,?[^。;,]{,20}进行[^。;,]*(采购|询比?价|遴选|招投?标|征集)|"
                                         "^[^。;::]{,10}关于[^。;,]+,?[^。;,]{,20}的[^。;,]{,20}公告|"
                                         "^[^。;,::]{,10}的[^。;,]+,?[^。;,]{,20}正在[^。;,]{,5}进行|"
                                         "^[^。;,::]{,10}的[^。;,]+,?[^。,;]{,20}已?[^。;,]{,20}批准|"
                                         "^[^。;,::]{,15}(选定|选取|征集|遴选)[^。;,]{,20}(供应商|(代理|咨询|设计)[^。;,]{,5}机构|代理人)")
        self.tenderee_right2 = re.compile("^[^。;,::]{,10}(招标办|采购部|办事处|采购小?组)")
        self.tenderee_right3 = re.compile("^[^。;,::]{,10}(对|就|关于|的)(?P<project>[^。;,?!::]{4,40})")
        # rules for judging the announcement's subject ("our hospital/school/bureau")
        self.subject = re.compile("[我本][院校局]")
        # recall patterns for entities the NER model did not recognize
        self.unrecognized1 = re.compile("(?P<tenderee_left>((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)" \
                                        "(人|商|公司|单位|组织|用户|业主|主体|方|部门))" \
                                        "(信息[,:]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
        self.unrecognized2 = re.compile("(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
                                        "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
                                        "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
        # suffix checks that a recalled "unrecognized" entity ends like a real organization name
        self.unrecognized_end1 = re.compile(
            "^[\u4e00-\u9fa5]{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓|中心|联合社|合作社)")
        self.unrecognized_end2 = re.compile("^[\u4e00-\u9fa5]{4,}(?:署|局|厅|处|室|科|部|站|所|股|行|园)")
  1446. def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
  1447. self.get_tenderee = False
  1448. ents = []
  1449. list_name = []
  1450. agency_set = set()
  1451. for ent in list_entitys[0]:
  1452. if ent.entity_type == 'name':
  1453. list_name.append(ent.entity_text)
  1454. if ent.entity_type in ['org', 'company']:
  1455. if ent.label == 0 and ent.values[ent.label]>=0.5:
  1456. self.get_tenderee = True
  1457. break
  1458. elif ent.label == 1:
  1459. if ent.values[ent.label]>0.5:
  1460. agency_set.add(ent.entity_text)
  1461. elif ent.label == 5:
  1462. if len(ent.entity_text)>=4:
  1463. ents.append(ent)
  1464. if not self.get_tenderee:
  1465. self.entity_context_rule(ents,list_name,list_sentences,list(agency_set))
  1466. if not self.get_tenderee:
  1467. self.subject_rule(ents,list_articles,list_sentences)
  1468. if not self.get_tenderee:
  1469. self.unrecognized_entity_rule(self.unrecognized1,list_sentences,list_entitys,0.55)
  1470. if not self.get_tenderee:
  1471. self.unrecognized_entity_rule(self.unrecognized2,list_sentences,list_entitys,0.5)
  1472. #entity上下文正则判断
    def entity_context_rule(self, entitys, list_name, list_sentences, list_agency):
        """Regex judgement on the left/right context of each candidate entity.

        Tries the tenderee_left_1..5 and tenderee_right* patterns in priority
        order; each pass runs over all candidates and stops further passes
        once ``self.get_tenderee`` has been set.

        :param entitys: candidate entities (label 5) to possibly promote
        :param list_name: project-name strings of the document
        :param list_sentences: per-doc sentence lists (doc 0 used)
        :param list_agency: confident agency names (used to avoid relabeling them)
        """
        list_sentences[0].sort(key=lambda x:x.sentence_index)
        entity_data = []
        for ent in entitys:
            # NOTE(review): indexes sentences by ent.sentence_index — assumes
            # sentence_index aligns with list position after the sort; confirm upstream.
            _sentence = list_sentences[0][ent.sentence_index]
            _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index,
                               end_index=ent.end_index, size=40, center_include=True,
                               word_flag=True, use_text=True,
                               text=re.sub(")", ")", re.sub("(", "(", ent.entity_text)))
            entity_data.append((ent,_span))
        # pass 1: strongest left-context keywords (采购商/项目法人/...)
        if not self.get_tenderee:
            for _data in entity_data:
                ent = _data[0]
                _span = _data[1]
                if re.search(self.tenderee_left_1,_span[0]):
                    ent.label = 0
                    ent.values[0] = 0.5 + ent.values[0] / 10
                    self.get_tenderee = True
        # pass 2: secondary left-context keywords (招标承办单位/交易人/...)
        if not self.get_tenderee:
            for _data in entity_data:
                ent = _data[0]
                _span = _data[1]
                if re.search(self.tenderee_left_2,_span[0]):
                    ent.label = 0
                    ent.values[0] = 0.5 + ent.values[0] / 10
                    self.get_tenderee = True
        # pass 3: "本公司/我单位" immediately before the entity
        if not self.get_tenderee:
            for _data in entity_data:
                ent = _data[0]
                _span = _data[1]
                if re.search(self.tenderee_left_3,_span[0]):
                    ent.label = 0
                    ent.values[0] = 0.5 + ent.values[0] / 10
                    self.get_tenderee = True
        # pass 4: organizer/executor keywords — ambiguous with agencies, so
        # only accept when the entity is not a known agency (or looks like a
        # government body / lacks agency-like keywords / text says 自行采购)
        if not self.get_tenderee:
            for _data in entity_data:
                ent = _data[0]
                _span = _data[1]
                if re.search(self.tenderee_left_4,_span[0]):
                    if len(list_agency)>0:
                        _same = False
                        for agency in list_agency:
                            if ent.entity_text in agency or agency in ent.entity_text:
                                _same = True
                                break
                        if not _same:
                            ent.label = 0
                            ent.values[0] = 0.5 + ent.values[0] / 10
                            self.get_tenderee = True
                    else:
                        if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent.entity_text
                                     ) or not re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent.entity_text) or re.search("自行.?采购",list_sentences[0][ent.sentence_index].sentence_text):
                            ent.label = 0
                            ent.values[0] = 0.5 + ent.values[0] / 10
                            self.get_tenderee = True
        # pass 5: publisher keywords (撰写单位/发布人...), same agency guard as pass 4
        if not self.get_tenderee:
            for _data in entity_data:
                ent = _data[0]
                _span = _data[1]
                if re.search(self.tenderee_left_5,_span[0]):
                    if len(list_agency)>0:
                        _same = False
                        for agency in list_agency:
                            if ent.entity_text in agency or agency in ent.entity_text:
                                _same = True
                                break
                        if not _same:
                            ent.label = 0
                            ent.values[0] = 0.5 + ent.values[0] / 10
                            self.get_tenderee = True
                    else:
                        if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent.entity_text
                                     ) or not re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent.entity_text):
                            ent.label = 0
                            ent.values[0] = 0.5 + ent.values[0] / 10
                            self.get_tenderee = True
        # pass 6: right-context patterns ("(以下简称我方)", "就…进行采购", ...)
        if not self.get_tenderee:
            for _data in entity_data:
                ent = _data[0]
                _span = _data[1]
                if re.search(self.tenderee_right, _span[2]):
                    ent.label = 0
                    ent.values[0] = 0.5 + ent.values[0] / 10
                    self.get_tenderee = True
        # pass 7: right context mentioning 招标办/采购部/...
        if not self.get_tenderee:
            for _data in entity_data:
                ent = _data[0]
                _span = _data[1]
                if re.search(self.tenderee_right2, _span[2]):
                    ent.label = 0
                    ent.values[0] = 0.5 + ent.values[0] / 10
                    self.get_tenderee = True
        # pass 8: right context of the form "对/就/关于 <project>" where
        # <project> contains one of the document's project names
        if not self.get_tenderee:
            if list_name:
                for _data in entity_data:
                    ent = _data[0]
                    _span = _data[1]
                    pj_name = re.search(self.tenderee_right3, _span[2])
                    if pj_name:
                        pj_name = pj_name.groupdict()["project"]
                        for _name in list_name:
                            if _name in pj_name:
                                ent.label = 0
                                ent.values[0] = 0.5
                                self.get_tenderee = True
                                break
        # for _data in entity_data:
        #     ent = _data[0]
        #     _span = _data[1]
        #     if re.search(self.tenderee_left,_span[0]):
        #         ent.label = 0
        #         ent.values[0] = 0.5 + ent.values[0] / 10
        #         self.get_tenderee = True
        #     elif re.search(self.tenderee_right,_span[2]):
        #         ent.label = 0
        #         ent.values[0] = 0.5 + ent.values[0] / 10
        #         self.get_tenderee = True
        #     elif re.search(self.tenderee_right2, _span[2]):
        #         ent.label = 0
        #         ent.values[0] = 0.5 + ent.values[0] / 10
        #         self.get_tenderee = True
        #     elif list_name:
        #         pj_name = re.search(self.tenderee_right3, _span[2])
        #         if pj_name:
        #             pj_name = pj_name.groupdict()["project"]
        #             for _name in list_name:
        #                 if _name in pj_name:
        #                     ent.label = 0
        #                     ent.values[0] = 0.5
        #                     self.get_tenderee = True
        #                     break
# Infer the tenderee from the announcement's grammatical subject (公告主语判断)
def subject_rule(self, entitys,list_articles,list_sentences):
    """Promote entities to tenderee (label 0) when they match the announcement subject.

    Looks for a subject phrase (``self.subject``) in the main body of the first
    article (text before the ``##attachment##`` marker).  Entities whose text is
    consistent with that subject (hospital/school/bureau keywords) are
    relabelled as tenderee with a boosted probability, and ``self.get_tenderee``
    is set so later fallback rules are skipped.

    :param entitys: candidate role entities, modified in place
    :param list_articles: articles; only the first one's content is used
    :param list_sentences: sentences of the first article, indexed by sentence_index
    """
    # Only the announcement body is considered; attachments are cut off.
    content = list_articles[0].content.split('##attachment##')[0]
    if re.search(self.subject,content):
        _subject = re.search(self.subject,content).group()
        for ent in entitys:
            # Subject mentions "院" -> hospital/college entities qualify.
            if re.search("院",_subject) and re.search("医院|学院",ent.entity_text):
                ent.label = 0
                # Boost: base 0.5 plus a tenth of the previous confidence.
                ent.values[0] = 0.5 + ent.values[0] / 10
                self.get_tenderee = True
            # Subject mentions "校" -> schools of any level qualify.
            elif re.search("校",_subject) and re.search("学校|学院|大学|高中|初中|中学|小学",ent.entity_text):
                ent.label = 0
                ent.values[0] = 0.5 + ent.values[0] / 10
                self.get_tenderee = True
            # Subject mentions "局" (bureau): check nearby context to exclude
            # supervision/complaint bureaus mentioned in passing.
            elif re.search("局", _subject) and re.search("局", ent.entity_text):
                _sentence = list_sentences[0][ent.sentence_index]
                # Half-width brackets are normalized to full-width before windowing.
                _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index,
                                   end_index=ent.end_index, size=20, center_include=True,
                                   word_flag=True, use_text=True,
                                   text=re.sub(")", ")", re.sub("(", "(", ent.entity_text)))
                # Skip bureaus preceded by 监督/投诉 (supervision/complaint).
                if not re.search("监督|投诉",_span[0][-10:]):
                    ent.label = 0
                    ent.values[0] = 0.5 + ent.values[0] / 10
                    self.get_tenderee = True
# Regex-based recall of entities the NER model missed (正则召回未识别实体)
def unrecognized_entity_rule(self,pattern,list_sentences,list_entitys,on_value=0.5):
    """Recall tenderee entities the NER model missed, using a regex.

    ``pattern`` must define named groups ``tenderee_left`` (context before the
    name) and ``unrecognized`` (candidate text after it).  A candidate that
    also matches ``self.unrecognized_end1`` / ``self.unrecognized_end2`` is
    converted into a new ``company`` Entity with label 0 (tenderee) and
    probability ``on_value``.  Body sentences are scanned before attachment
    sentences; as soon as one pass finds a tenderee the search stops.

    :param pattern: recall regex with the two named groups above
    :param list_sentences: sentences of the first document
    :param list_entitys: entity lists; new entities are appended to the first
    :param on_value: confidence assigned to recalled entities
    """
    list_sentence = list_sentences[0]
    for in_attachment in [False,True]:
        for sentence in [sentence for sentence in list_sentence if sentence.in_attachment==in_attachment]:
            sentence_text = sentence.sentence_text
            tokens = sentence.tokens
            doc_id = sentence.doc_id
            in_attachment = sentence.in_attachment
            # Character offset of each token start, used to map character
            # spans back onto token index spans.
            list_tokenbegin = []
            begin = 0
            for i in range(0, len(tokens)):
                list_tokenbegin.append(begin)
                begin += len(str(tokens[i]))
            list_tokenbegin.append(begin + 1)
            for _match in re.finditer(pattern,sentence_text):
                _groupdict = _match.groupdict()
                _match_text = _match.group()
                _unrecognized_text = _groupdict["unrecognized"]
                # Trim the candidate at the first recognized end marker.
                _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
                if not _unrecognized:
                    _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
                if _unrecognized:
                    _unrecognized = _unrecognized.group()
                else:
                    continue
                # Drop anonymized names ("某") and implausibly long candidates.
                if re.search("某",_unrecognized) or len(_unrecognized)>15:
                    continue
                # Find the token whose start offset covers the candidate start.
                begin_index_temp = _match.start()+len(_groupdict['tenderee_left'])
                for j in range(len(list_tokenbegin)):
                    if list_tokenbegin[j] == begin_index_temp:
                        begin_index = j
                        break
                    elif list_tokenbegin[j] > begin_index_temp:
                        begin_index = j - 1
                        break
                index = begin_index_temp + len(_unrecognized)
                end_index_temp = index
                for j in range(begin_index, len(list_tokenbegin)):
                    if list_tokenbegin[j] >= index:
                        end_index = j - 1
                        break
                entity_id = "%s_%d_%d_%d" % (doc_id, sentence.sentence_index, begin_index, end_index)
                entity_text = _unrecognized
                new_entity = Entity(doc_id, entity_id, entity_text, 'company', sentence.sentence_index, begin_index, end_index,
                                    begin_index_temp, end_index_temp, in_attachment=in_attachment)
                new_entity.label = 0  # tenderee
                new_entity.values = [on_value,0,0,0,0,0]
                list_entitys[0].append(new_entity)
                self.get_tenderee = True
        # Stop before scanning attachments once a tenderee has been found;
        # keep the entity list ordered by position.
        if self.get_tenderee:
            list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index))
            break
  1682. class RoleGrade():
  1683. def __init__(self):
  1684. self.tenderee_left_9 = "(?P<tenderee_left_9>(招标|采购|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|甲)(人|方|单位))"
  1685. self.tenderee_center_9 = "(?P<tenderee_center_9>受.{5,20}委托)"
  1686. self.tenderee_left_8 = "(?P<tenderee_left_8>(业主|转让方|尊敬的供应商|出租方|处置方|(需求|建设|最终|发包)(人|方|单位|组织|用户|业主|主体|部门|公司)))"
  1687. self.agency_left_9 = "(?P<agency_left_9>代理)"
  1688. self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得|乙方)|第[1一]|排名:1)"
  1689. self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商))"
  1690. self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排名:2))"
  1691. self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排名:3))"
  1692. self.pattern_list = [self.tenderee_left_9,self.tenderee_center_9, self.tenderee_left_8,self.agency_left_9, self.winTenderer_left_9,
  1693. self.winTenderer_left_8, self.secondTenderer_left_9, self.thirdTenderer_left_9]
  1694. def predict(self, list_sentences, list_entitys, span=10, min_prob=0.7):
  1695. '''
  1696. 根据规则给角色分配不同等级概率;分三级:0.9-1,0.8-0.9,0.7-0.8;附件0.7-0.8,0.6-0.7,0.5-0.6
  1697. :param list_articles:
  1698. :param list_sentences:
  1699. :param list_entitys:
  1700. :param codeName:
  1701. :return:
  1702. '''
  1703. sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
  1704. role2id = {"tenderee": 0, "agency": 1, "winTenderer": 2, "secondTenderer": 3, "thirdTenderer": 4}
  1705. for entity in list_entitys[0]:
  1706. if entity.entity_type in ['org', 'company'] and entity.label in [0, 1, 2, 3, 4] and entity.values[entity.label]> 0.5:
  1707. text = sentences[entity.sentence_index].sentence_text
  1708. in_att = sentences[entity.sentence_index].in_attachment
  1709. pre_prob = entity.values[entity.label]
  1710. b = entity.wordOffset_begin
  1711. e = entity.wordOffset_end
  1712. not_found = 1
  1713. for pattern in self.pattern_list:
  1714. if 'left' in pattern:
  1715. context = text[max(0, b-span):b]
  1716. elif 'right' in pattern:
  1717. context = text[e:e+span]
  1718. elif 'center' in pattern:
  1719. context = text[max(0, b-span):e+span]
  1720. else:
  1721. print('规则错误', pattern)
  1722. ser = re.search(pattern, context)
  1723. if ser:
  1724. groupdict = pattern.split('>')[0].replace('(?P<', '')
  1725. _role, _direct, _prob = groupdict.split('_')
  1726. _label = role2id.get(_role)
  1727. if _label != entity.label:
  1728. continue
  1729. _prob = int(_prob)*0.1
  1730. # print('规则修改角色概率前:', entity.entity_text, entity.label, entity.values)
  1731. if in_att:
  1732. _prob = _prob - 0.2
  1733. if pre_prob < _prob:
  1734. _prob = 0.65
  1735. entity.values[_label] = _prob + entity.values[_label] / 20
  1736. not_found = 0
  1737. # print('规则修改角色概率后:', entity.entity_text, entity.label, entity.values)
  1738. break
  1739. if not_found and entity.values[entity.label]> min_prob:
  1740. _prob = min_prob - 0.1 if in_att else min_prob
  1741. entity.values[entity.label] = _prob + entity.values[entity.label] / 20
  1742. # print('找不到规则修改角色概率:', entity.entity_text, entity.label, entity.values)
  1743. class MoneyGrade():
  1744. def __init__(self):
  1745. self.tenderee_money_left_9 = "(?P<tenderee_left_9>最高(投标)?限价)|控制价|拦标价"
  1746. self.tenderee_money_left_8 = "(?P<tenderee_left_8>预算|限价|起始|起拍|底价|标底)"
  1747. self.tenderer_money_left_9 = "(?P<tenderer_left_9>(中标|成交|合同|总报价))"
  1748. self.tenderer_money_left_8 = "(?P<tenderer_left_8>(投标|总价))"
  1749. self.pattern_list = [self.tenderee_money_left_9, self.tenderee_money_left_8, self.tenderer_money_left_9]
  1750. def predict(self, list_sentences, list_entitys, span=10, min_prob=0.7):
  1751. sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
  1752. role2id = {"tenderee": 0, "tenderer": 1}
  1753. for entity in list_entitys[0]:
  1754. if entity.entity_type in ['money'] and entity.label in [0, 1] and entity.values[entity.label]> 0.6:
  1755. text = sentences[entity.sentence_index].sentence_text
  1756. in_att = sentences[entity.sentence_index].in_attachment
  1757. b = entity.wordOffset_begin
  1758. e = entity.wordOffset_end
  1759. context = text[max(0, b - span):b]
  1760. not_found = 1
  1761. for pattern in self.pattern_list:
  1762. ser = re.search(pattern, context)
  1763. if ser:
  1764. groupdict = pattern.split('>')[0].replace('(?P<', '')
  1765. _role, _direct, _prob = groupdict.split('_')
  1766. _label = role2id.get(_role)
  1767. if _label != entity.label:
  1768. continue
  1769. _prob = int(_prob) * 0.1
  1770. # print('规则修改金额概率前:', entity.entity_text, entity.label, entity.values)
  1771. if in_att:
  1772. _prob = _prob - 0.2
  1773. entity.values[_label] = _prob + entity.values[_label] / 20
  1774. not_found = 0
  1775. # print('规则修改金额概率后:', entity.entity_text, entity.label, entity.values)
  1776. break
  1777. if not_found and entity.values[entity.label] > min_prob:
  1778. _prob = min_prob - 0.1 if in_att else min_prob
  1779. entity.values[entity.label] = _prob + entity.values[entity.label] / 20
  1780. # print('找不到规则修改金额概率:', entity.entity_text, entity.label, entity.values)
# Time-category classifier (时间类别)
class TimePredictor():
    """Classifies ``time`` entities with a TensorFlow saved model.

    NOTE(review): relies on module-level helpers defined elsewhere in this
    file (tf, log, getModel_w2v, spanWindow, limitRun, timeFormat, np).
    """
    def __init__(self,config=None):
        # Dedicated graph/session so this model does not clash with others.
        self.sess = tf.Session(graph=tf.Graph(),config=config)
        self.inputs_code = None
        self.outputs_code = None
        # (left/right context, sequence length, embedding size)
        self.input_shape = (2,40,128)
        self.load_model()

    def load_model(self):
        """Lazily load the saved model; return (input tensors, output tensor)."""
        model_path = os.path.dirname(__file__)+'/timesplit_model'
        if self.inputs_code is None:
            log("get model of time")
            with self.sess.as_default():
                with self.sess.graph.as_default():
                    meta_graph_def = tf.saved_model.loader.load(self.sess, tags=["serve"], export_dir=model_path)
                    signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
                    signature_def = meta_graph_def.signature_def
                    self.inputs_code = []
                    self.inputs_code.append(
                        self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name))
                    self.inputs_code.append(
                        self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name))
                    self.outputs_code = self.sess.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
                    return self.inputs_code, self.outputs_code
        else:
            # Already loaded: return the cached tensors.
            return self.inputs_code, self.outputs_code

    def search_time_data(self,list_sentences,list_entitys):
        """Build model inputs (left/right context embeddings) for time entities.

        :return: [data_x, matched entities], or None when there is no time entity
        """
        data_x = []
        points_entitys = []
        for list_sentence, list_entity in zip(list_sentences, list_entitys):
            p_entitys = 0
            p_sentences = 0
            list_sentence.sort(key=lambda x: x.sentence_index)
            # Two-pointer walk: entities and sentences are ordered by index.
            while(p_entitys<len(list_entity)):
                entity = list_entity[p_entitys]
                if entity.entity_type in ['time']:
                    while(p_sentences<len(list_sentence)):
                        sentence = list_sentence[p_sentences]
                        if entity.doc_id == sentence.doc_id and entity.sentence_index == sentence.sentence_index:
                            # Token window around the entity supplies the context.
                            s = spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=self.input_shape[1])
                            left = s[0]
                            right = s[1]
                            context = [left, right]
                            x = self.embedding_words(context, shape=self.input_shape)
                            data_x.append(x)
                            points_entitys.append(entity)
                            break
                        p_sentences += 1
                p_entitys += 1
        if len(points_entitys)==0:
            return None
        # Reorder to (context side, batch, seq, embed) for the two model inputs.
        data_x = np.transpose(np.array(data_x), (1, 0, 2, 3))
        return [data_x, points_entitys]

    def embedding_words(self, datas, shape):
        '''
        @summary: look up the word vector for each token.
        @param:
            datas: list of token lists (left context, right context)
            shape: shape of the resulting array
        @return: array of the given shape filled with word embeddings
        '''
        model_w2v = getModel_w2v()
        embed = np.zeros(shape)
        length = shape[1]
        out_index = 0
        for data in datas:
            index = 0
            for item in data:
                item_not_space = re.sub("\s*", "", item)
                if index >= length:
                    break
                if item_not_space in model_w2v.vocab:
                    embed[out_index][index] = model_w2v[item_not_space]
                    index += 1
                else:
                    # Out-of-vocabulary tokens fall back to the 'unk' vector.
                    embed[out_index][index] = model_w2v['unk']
                    index += 1
            out_index += 1
        return embed

    def predict(self, list_sentences,list_entitys):
        """Run the model and write (label, probabilities) onto each time entity."""
        datas = self.search_time_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        with self.sess.as_default():
            predict_y = limitRun(self.sess,[self.outputs_code], feed_dict={self.inputs_code[0]:datas[0][0]
                                                            ,self.inputs_code[1]:datas[0][1]})[0]
            for i in range(len(predict_y)):
                entity = points_entitys[i]
                label = np.argmax(predict_y[i])
                values = []
                for item in predict_y[i]:
                    values.append(item)
                if label != 0:
                    # Demote entities whose text does not parse as a time string.
                    if not timeFormat(entity.entity_text):
                        label = 0
                        values[0] = 0.5
                entity.set_Role(label, values)
# Product field extraction (产品字段提取)
class ProductPredictor():
    """Sequence tagger (frozen TF graph + CRF decode) extracting product names
    and, in failed-bid mode, the failure reason.

    NOTE(review): depends on module-level names defined elsewhere in this file
    (tf, load, USE_API, API_URL, Entity, pad_sequences, viterbi_decode,
    requests, json, np).
    """
    def __init__(self,config=None):
        vocabpath = os.path.dirname(__file__) + "/codename_vocab.pk"
        self.vocab = load(vocabpath)
        # token -> integer id for the tagger's input layer
        self.word2index = dict((w, i) for i, w in enumerate(np.array(self.vocab)))
        self.sess = tf.Session(graph=tf.Graph(),config=config)
        self.load_model()

    def load_model(self):
        """Load the frozen product/fail-reason graph and cache its tensors."""
        # model_path = os.path.dirname(__file__)+'/product_savedmodel/product.pb'
        model_path = os.path.dirname(__file__)+'/product_savedmodel/productAndfailreason.pb'
        with self.sess.as_default():
            with self.sess.graph.as_default():
                output_graph_def = tf.GraphDef()
                with open(model_path, 'rb') as f:
                    output_graph_def.ParseFromString(f.read())
                    tf.import_graph_def(output_graph_def, name='')
                    self.sess.run(tf.global_variables_initializer())
                    self.char_input = self.sess.graph.get_tensor_by_name('CharInputs:0')
                    self.length = self.sess.graph.get_tensor_by_name("Sum:0")
                    self.dropout = self.sess.graph.get_tensor_by_name("Dropout:0")
                    self.logit = self.sess.graph.get_tensor_by_name("logits/Reshape:0")
                    self.tran = self.sess.graph.get_tensor_by_name("crf_loss/transitions:0")

    def decode(self,logits, lengths, matrix):
        """Viterbi-decode CRF scores into tag-id paths.

        :param logits: per-character label scores, one row per sequence
        :param lengths: true sequence lengths (before padding)
        :param matrix: CRF transition matrix
        :return: list of tag-id paths (start state stripped)
        """
        paths = []
        small = -1000.0
        # 7 real labels + 1 padding label; the start state is pinned to padding.
        # start = np.asarray([[small] * 4 + [0]])
        start = np.asarray([[small]*7+[0]])
        for score, length in zip(logits, lengths):
            score = score[:length]
            pad = small * np.ones([length, 1])
            logits = np.concatenate([score, pad], axis=1)
            logits = np.concatenate([start, logits], axis=0)
            path, _ = viterbi_decode(logits, matrix)
            paths.append(path[1:])
        return paths

    def predict(self, list_sentences,list_entitys=None,list_articles=[], fail=False, MAX_AREA=5000):
        '''
        预测实体代码,每个句子最多取MAX_AREA个字,超过截断
        (Tag each sentence, truncated to MAX_AREA characters, and append the
        extracted product entities to the entity lists.)
        :param list_sentences: 多篇公告句子列表,[[一篇公告句子列表],[公告句子列表]]
        :param list_entitys: 多篇公告实体列表
        :param list_articles: articles; only used in ``fail`` mode
        :param fail: failed-bid mode — also extract the failure reason
        :param MAX_AREA: 每个句子最多截取多少字
        :return: ({'fail_reason': ...}, list of extracted product strings)
        '''
        with self.sess.as_default() as sess:
            with self.sess.graph.as_default():
                result = []
                product_list = []
                # Failed-bid mode: tag the raw article text once, pulling out
                # both products and the stated failure reason.
                if fail and list_articles!=[]:
                    text_list = [list_articles[0].content[:MAX_AREA]]
                    chars = [[self.word2index.get(it, self.word2index.get('<unk>')) for it in text] for text in text_list]
                    if USE_API:
                        # Remote inference service instead of the local graph.
                        requests_result = requests.post(API_URL + "/predict_product",
                                                        json={"inputs": chars}, verify=True)
                        batch_paths = json.loads(requests_result.text)['result']
                        lengths = json.loads(requests_result.text)['lengths']
                    else:
                        lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
                                                          feed_dict={
                                                              self.char_input: np.asarray(chars),
                                                              self.dropout: 1.0
                                                          })
                        batch_paths = self.decode(scores, lengths, tran_)
                    for text, path, length in zip(text_list, batch_paths, lengths):
                        tags = ''.join([str(it) for it in path[:length]])
                        # Extract products: tag run 1(2)*3 delimits a product span.
                        for it in re.finditer("12*3", tags):
                            start = it.start()
                            end = it.end()
                            _entity = Entity(doc_id=list_articles[0].doc_id, entity_id="%s_%s_%s_%s" % (
                                list_articles[0].doc_id, 0, start, end),
                                             entity_text=text[start:end],
                                             entity_type="product", sentence_index=0,
                                             begin_index=0, end_index=0, wordOffset_begin=start,
                                             wordOffset_end=end)
                            list_entitys[0].append(_entity)
                            product_list.append(text[start:end])
                        # Extract failure reasons: tag run 4(5)*6.
                        for it in re.finditer("45*6", tags):
                            start = it.start()
                            end = it.end()
                            result.append(text[start:end].replace('?', '').strip())
                    reasons = []
                    for it in result:
                        # A reason ticked with (√) wins outright.
                        if "(√)" in it or "(√)" in it:
                            reasons = [it]
                            break
                        # Otherwise collect distinct, non-duplicated reasons.
                        if reasons != [] and (it not in reasons[-1] and it not in reasons):
                            reasons.append(it)
                        elif reasons == []:
                            reasons.append(it)
                    return {'fail_reason':';'.join(reasons)}, product_list
                if list_entitys is None:
                    list_entitys = [[] for _ in range(len(list_sentences))]
                for list_sentence, list_entity in zip(list_sentences,list_entitys):
                    if len(list_sentence)==0:
                        result.append({"product":[]})
                        continue
                    # Longest sentences first so each batch shares one pad length.
                    list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
                    _begin_index = 0
                    item = {"product":[]}
                    temp_list = []
                    while True:
                        MAX_LEN = len(list_sentence[_begin_index].sentence_text)
                        if MAX_LEN > MAX_AREA:
                            MAX_LEN = MAX_AREA
                        # Batch size chosen so batch_size * seq_len stays near MAX_AREA.
                        _LEN = MAX_AREA//MAX_LEN
                        chars = [sentence.sentence_text[:MAX_LEN] for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
                        chars = [[self.word2index.get(it, self.word2index.get('<unk>')) for it in l] for l in chars]
                        chars = pad_sequences(chars, maxlen=MAX_LEN, padding="post", truncating="post")
                        if USE_API:
                            requests_result = requests.post(API_URL + "/predict_product",
                                                            json={"inputs": chars.tolist()}, verify=True)
                            batch_paths = json.loads(requests_result.text)['result']
                            lengths = json.loads(requests_result.text)['lengths']
                        else:
                            lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
                                                              feed_dict={
                                                                  self.char_input: np.asarray(chars),
                                                                  self.dropout: 1.0
                                                              })
                            batch_paths = self.decode(scores, lengths, tran_)
                        for sentence, path, length in zip(list_sentence[_begin_index:_begin_index+_LEN],batch_paths, lengths):
                            tags = ''.join([str(it) for it in path[:length]])
                            for it in re.finditer("12*3", tags):
                                start = it.start()
                                end = it.end()
                                _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
                                    sentence.doc_id, sentence.sentence_index, start, end),
                                                 entity_text=sentence.sentence_text[start:end],
                                                 entity_type="product", sentence_index=sentence.sentence_index,
                                                 begin_index=0, end_index=0, wordOffset_begin=start,
                                                 wordOffset_end=end,in_attachment=sentence.in_attachment)
                                list_entity.append(_entity)
                                temp_list.append(sentence.sentence_text[start:end])
                                product_list.append(sentence.sentence_text[start:end])
                        if _begin_index+_LEN >= len(list_sentence):
                            break
                        _begin_index += _LEN
                    # Deduplicated per-document product list (修正bug).
                    item["product"] = list(set(temp_list))
                    result.append(item) # 修正bug
                return {'fail_reason': ""},product_list
  2025. # 产品数量单价品牌规格提取 #2021/11/10 添加表格中的项目、需求、预算、时间要素提取
  2026. class ProductAttributesPredictor():
def __init__(self,):
    # Priority regex for "product name" style header cells (名称/内容/描述 suffix).
    self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)'
    # Broader fallback header keywords used when p1 matches nothing.
    self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称'
    # Pre-computed set of known header strings, pickled alongside this module.
    with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
        self.header_set = pickle.load(f)
  2032. def isTrueTable(self, table):
  2033. '''真假表格规则:
  2034. 1、包含<caption>或<th>标签为真
  2035. 2、包含大量链接、表单、图片或嵌套表格为假
  2036. 3、表格尺寸太小为假
  2037. 4、外层<table>嵌套子<table>,一般子为真,外为假'''
  2038. if table.find_all(['caption', 'th']) != []:
  2039. return True
  2040. elif len(table.find_all(['form', 'a', 'img'])) > 5:
  2041. return False
  2042. elif len(table.find_all(['tr'])) < 2:
  2043. return False
  2044. elif len(table.find_all(['table'])) >= 1:
  2045. return False
  2046. else:
  2047. return True
  2048. def getTrs(self, tbody):
  2049. # 获取所有的tr
  2050. trs = []
  2051. objs = tbody.find_all(recursive=False)
  2052. for obj in objs:
  2053. if obj.name == "tr":
  2054. trs.append(obj)
  2055. if obj.name == "tbody":
  2056. for tr in obj.find_all("tr", recursive=False):
  2057. trs.append(tr)
  2058. return trs
  2059. def getTable(self, tbody):
  2060. trs = self.getTrs(tbody)
  2061. inner_table = []
  2062. if len(trs) < 2:
  2063. return inner_table
  2064. for tr in trs:
  2065. tr_line = []
  2066. tds = tr.findChildren(['td', 'th'], recursive=False)
  2067. if len(tds) < 2:
  2068. continue
  2069. for td in tds:
  2070. td_text = re.sub('\s', '', td.get_text())
  2071. td_text = td_text.replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '/') # 修复272144312 # 产品单价数量提取结果有特殊符号\ 气动执行装置备件\密封组件\NBR+PT
  2072. tr_line.append(td_text)
  2073. inner_table.append(tr_line)
  2074. return inner_table
def fixSpan(self, tbody):
    """Expand colspan/rowspan cells in place so every row has a full set of cells.

    处理colspan, rowspan信息补全问题 — columns are filled first, then rows;
    doing it the other way round can scramble the parsed table.
    """
    trs = self.getTrs(tbody)
    ths_len = 0
    ths = list()
    trs_set = set()
    # Pass 1: colspan expansion. Walk every tr.
    for indtr, tr in enumerate(trs):
        ths_tmp = tr.findChildren('th', recursive=False)
        # Do not expand rows that themselves contain a table.
        if len(tr.findChildren('table')) > 0:
            continue
        if len(ths_tmp) > 0:
            ths_len = ths_len + len(ths_tmp)
            for th in ths_tmp:
                ths.append(th)
            trs_set.add(tr)
        # Walk every cell of the row.
        tds = tr.findChildren(recursive=False)
        if len(tds) < 3:
            continue  # too few columns — not worth expanding (列数太少的不补全)
        for indtd, td in enumerate(tds):
            # A colspan cell is duplicated into the following positions of the same row.
            if 'colspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['colspan']))) != "":
                col = int(re.sub("[^0-9]", "", str(td['colspan'])))
                # Guard against absurd spans and huge cells.
                if col < 10 and len(td.get_text()) < 500:
                    td['colspan'] = 1
                    for i in range(1, col, 1):
                        td.insert_after(copy.copy(td))
    # Pass 2: rowspan expansion.
    for indtr, tr in enumerate(trs):
        ths_tmp = tr.findChildren('th', recursive=False)
        # Do not expand rows that themselves contain a table.
        if len(tr.findChildren('table')) > 0:
            continue
        if len(ths_tmp) > 0:
            ths_len = ths_len + len(ths_tmp)
            for th in ths_tmp:
                ths.append(th)
            trs_set.add(tr)
        # Walk every cell of the row.
        tds = tr.findChildren(recursive=False)
        same_span = 0
        if len(tds) > 1 and 'rowspan' in tds[0].attrs:
            span0 = tds[0].attrs['rowspan']
            for td in tds:
                if 'rowspan' in td.attrs and td.attrs['rowspan'] == span0:
                    same_span += 1
            # Every cell shares the same rowspan: nothing to fill for this row.
            if same_span == len(tds):
                continue
        for indtd, td in enumerate(tds):
            # A rowspan cell is copied into the matching column of following rows.
            if 'rowspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['rowspan']))) != "":
                row = int(re.sub("[^0-9]", "", str(td['rowspan'])))
                td['rowspan'] = 1
                for i in range(1, row, 1):
                    # Fetch the cells of the next row and insert at the same position.
                    if indtr + i < len(trs):
                        tds1 = trs[indtr + i].findChildren(['td', 'th'], recursive=False)
                        if len(tds1) >= (indtd) and len(tds1) > 0:
                            if indtd > 0:
                                tds1[indtd - 1].insert_after(copy.copy(td))
                            else:
                                tds1[0].insert_before(copy.copy(td))
                        elif len(tds1) > 0 and len(tds1) == indtd - 1:
                            tds1[indtd - 2].insert_after(copy.copy(td))
  2141. def get_monthlen(self, year, month):
  2142. '''输入年份、月份 int类型 得到该月份天数'''
  2143. try:
  2144. weekday, num = calendar.monthrange(int(year), int(month))
  2145. except:
  2146. num = 30
  2147. return str(num)
def fix_time(self, text, html, page_time):
    '''Normalize a date-ish cell into an (order_begin, order_end) pair of
    "YYYY-MM-DD" strings (输入日期字段返回格式化日期); returns ("", "") when
    nothing parses.

    :param text: raw cell text, possibly containing Chinese numerals
    :param html: full announcement HTML, used to recover a missing year
    :param page_time: page date string, fallback source for the year
    '''
    # Chinese numerals -> digits; longest first so "十二" wins over "十".
    for it in [('十二', '12'),('十一', '11'),('十','10'),('九','9'),('八','8'),('七','7'),
               ('六','6'),('五','5'),('四','4'),('三','3'),('二','2'),('一','1')]:
        if it[0] in text:
            text = text.replace(it[0], it[1])
    # Case "N月" (month only): recover the year from the HTML, then the
    # page time, then fall back to the current year.
    if re.search('^\d{1,2}月$', text):
        m = re.search('^(\d{1,2})月$', text).group(1)
        if len(m) < 2:
            m = '0' + m
        year = re.search('(\d{4})年(.{,12}采购意向)?', html)
        if year:
            y = year.group(1)
            num = self.get_monthlen(y, m)
            if len(num) < 2:
                num = '0' + num
            order_begin = "%s-%s-01" % (y, m)
            order_end = "%s-%s-%s" % (y, m, num)
        elif page_time != "":
            year = re.search('\d{4}', page_time)
            if year:
                y = year.group(0)
                num = self.get_monthlen(y, m)
                if len(num) < 2:
                    num = '0' + num
                order_begin = "%s-%s-01" % (y, m)
                order_end = "%s-%s-%s" % (y, m, num)
            else:
                # Page time has no 4-digit year: use the current year.
                y = str(datetime.datetime.now().year)
                num = self.get_monthlen(y, m)
                if len(num) < 2:
                    num = '0' + num
                order_begin = "%s-%s-01" % (y, m)
                order_end = "%s-%s-%s" % (y, m, num)
        else:
            # No year anywhere: use the current year.
            y = str(datetime.datetime.now().year)
            num = self.get_monthlen(y, m)
            if len(num) < 2:
                num = '0' + num
            order_begin = "%s-%s-01" % (y, m)
            order_end = "%s-%s-%s" % (y, m, num)
        return order_begin, order_end
    # Case "YYYY年M[月]": a whole month.
    t1 = re.search('^(\d{4})(年|/|\.|-)(\d{1,2})月?$', text)
    if t1:
        year = t1.group(1)
        month = t1.group(3)
        num = self.get_monthlen(year, month)
        if len(month)<2:
            month = '0'+month
        if len(num) < 2:
            num = '0'+num
        order_begin = "%s-%s-01" % (year, month)
        order_end = "%s-%s-%s" % (year, month, num)
        return order_begin, order_end
    # Case "YYYY年M月D日": a single day.
    t2 = re.search('^(\d{4})(年|/|\.|-)(\d{1,2})(月|/|\.|-)(\d{1,2})日?$', text)
    if t2:
        y = t2.group(1)
        m = t2.group(3)
        d = t2.group(5)
        m = '0'+ m if len(m)<2 else m
        d = '0'+d if len(d)<2 else d
        order_begin = order_end = "%s-%s-%s"%(y,m,d)
        return order_begin, order_end
    # Compact year+month, e.g. "202105" (时间样式:"202105")
    t3 = re.search("^(20\d{2})(\d{1,2})$",text)
    if t3:
        year = t3.group(1)
        month = t3.group(2)
        if int(month)>0 and int(month)<=12:
            num = self.get_monthlen(year, month)
            if len(month) < 2:
                month = '0' + month
            if len(num) < 2:
                num = '0' + num
            order_begin = "%s-%s-01" % (year, month)
            order_end = "%s-%s-%s" % (year, month, num)
            return order_begin, order_end
    # Compact full date, e.g. "20210510" (时间样式:"20210510")
    t4 = re.search("^(20\d{2})(\d{2})(\d{2})$", text)
    if t4:
        year = t4.group(1)
        month = t4.group(2)
        day = t4.group(3)
        if int(month) > 0 and int(month) <= 12 and int(day)>0 and int(day)<=31:
            order_begin = order_end = "%s-%s-%s"%(year,month,day)
            return order_begin, order_end
    # Range "YYYY年M月[D日] 到/至/- [YYYY年]M月[D日]".
    all_match = re.finditer('^(?P<y1>\d{4})(年|/|\.)(?P<m1>\d{1,2})(?:(月|/|\.)(?:(?P<d1>\d{1,2})日)?)?'
                            '(到|至|-)(?:(?P<y2>\d{4})(年|/|\.))?(?P<m2>\d{1,2})(?:(月|/|\.)'
                            '(?:(?P<d2>\d{1,2})日)?)?$', text)
    y1 = m1 = d1 = y2 = m2 = d2 = ""
    found_math = False
    for _match in all_match:
        if len(_match.group()) > 0:
            found_math = True
            for k, v in _match.groupdict().items():
                if v!="" and v is not None:
                    if k == 'y1':
                        y1 = v
                    elif k == 'm1':
                        m1 = v
                    elif k == 'd1':
                        d1 = v
                    elif k == 'y2':
                        y2 = v
                    elif k == 'm2':
                        m2 = v
                    elif k == 'd2':
                        d2 = v
    if not found_math:
        return "", ""
    # Fill defaults: same year, first day of the start month, last day of
    # the end month; then zero-pad all fields.
    y2 = y1 if y2 == "" else y2
    d1 = '1' if d1 == "" else d1
    d2 = self.get_monthlen(y2, m2) if d2 == "" else d2
    m1 = '0' + m1 if len(m1) < 2 else m1
    m2 = '0' + m2 if len(m2) < 2 else m2
    d1 = '0' + d1 if len(d1) < 2 else d1
    d2 = '0' + d2 if len(d2) < 2 else d2
    order_begin = "%s-%s-%s"%(y1,m1,d1)
    order_end = "%s-%s-%s"%(y2,m2,d2)
    return order_begin, order_end
  2268. def find_header(self, items, p1, p2):
  2269. '''
  2270. inner_table 每行正则检查是否为表头,是则返回表头所在列序号,及表头内容
  2271. :param items: 列表,内容为每个td 文本内容
  2272. :param p1: 优先表头正则
  2273. :param p2: 第二表头正则
  2274. :return: 表头所在列序号,是否表头,表头内容
  2275. '''
  2276. flag = False
  2277. header_dic = {'名称': '', '数量': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': ''}
  2278. product = "" # 产品
  2279. quantity = "" # 数量
  2280. unitPrice = "" # 单价
  2281. brand = "" # 品牌
  2282. specs = "" # 规格
  2283. demand = "" # 采购需求
  2284. budget = "" # 预算金额
  2285. order_time = "" # 采购时间
  2286. for i in range(min(4, len(items))):
  2287. it = items[i]
  2288. if len(it) < 15 and re.search(p1, it) != None:
  2289. flag = True
  2290. product = it
  2291. header_dic['名称'] = i
  2292. break
  2293. if not flag:
  2294. for i in range(min(4, len(items))):
  2295. it = items[i]
  2296. if len(it) < 15 and re.search(p2, it) and re.search(
  2297. '编号|编码|号|情况|报名|单位|位置|地址|数量|单价|价格|金额|品牌|规格类型|型号|公司|中标人|企业|供应商|候选人', it) == None:
  2298. flag = True
  2299. product = it
  2300. header_dic['名称'] = i
  2301. break
  2302. if flag:
  2303. for j in range(i + 1, len(items)):
  2304. if len(items[j]) > 20 and len(re.sub('[\((].*[)\)]|[^\u4e00-\u9fa5]', '', items[j])) > 10:
  2305. continue
  2306. if header_dic['数量']=="" and re.search('数量|采购量', items[j]):
  2307. header_dic['数量'] = j
  2308. quantity = items[j]
  2309. elif re.search('单价', items[j]):
  2310. header_dic['单价'] = j
  2311. unitPrice = items[j]
  2312. elif re.search('品牌', items[j]):
  2313. header_dic['品牌'] = j
  2314. brand = items[j]
  2315. elif re.search('规格|型号', items[j]):
  2316. header_dic['规格'] = j
  2317. specs = items[j]
  2318. elif re.search('需求|服务要求|服务标准', items[j]):
  2319. header_dic['需求'] = j
  2320. demand = items[j]
  2321. elif re.search('预算|控制金额', items[j]):
  2322. header_dic['预算'] = j
  2323. budget = items[j]
  2324. elif re.search('时间|采购实施月份|采购月份|采购日期', items[j]):
  2325. header_dic['时间'] = j
  2326. order_time = items[j]
  2327. if header_dic.get('名称', "") != "" :
  2328. num = 0
  2329. for it in (quantity, unitPrice, brand, specs, product, demand, budget, order_time):
  2330. if it != "":
  2331. num += 1
  2332. if num >=2:
  2333. return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)
  2334. flag = False
  2335. return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time)
def predict(self, docid='', html='', page_time=""):
    '''
    Regex search over HTML <table> elements for product-related information.

    :param docid: document id (unused here; kept for interface uniformity)
    :param html: announcement HTML source
    :param page_time: publish date, used to normalize extracted times
    :return: ([{'product_attrs': ...}, {'demand_info': ...}], total_product_money)
    '''
    soup = BeautifulSoup(html, 'lxml')
    # flag_yx = True if re.search('采购意向', html) else False
    # True when the announcement looks like a procurement-intention notice
    flag_yx = True if re.search('采购意向|招标意向|选取意向|意向公告|意向公示|意向公开', html) else False
    tables = soup.find_all(['table'])
    headers = []            # product header strings (joined with '_')
    headers_demand = []     # demand header strings
    header_col = []         # raw header row cells (joined with '_')
    product_link = []       # extracted product records
    demand_link = []        # extracted demand records
    total_product_money = 0
    # iterate tables from last to first
    for i in range(len(tables)-1, -1, -1):
        table = tables[i]
        # flatten a tiny table nested inside a td so the outer table parses cleanly
        if table.parent.name == 'td' and len(table.find_all('td')) <= 3:
            table.string = table.get_text()
            table.name = 'turntable'
            continue
        if not self.isTrueTable(table):
            continue
        self.fixSpan(table)
        inner_table = self.getTable(table)
        # print(inner_table)
        # NOTE: `i` is reused below as the row index (shadows the table index;
        # harmless because the for-loop reassigns it each iteration)
        i = 0
        found_header = False
        header_colnum = 0
        if flag_yx:
            # intention notices often use a vertical key/value layout
            col0_l = []   # keys (colons stripped)
            col1_l = []   # values
            for tds in inner_table:
                if len(tds) == 2:
                    col0_l.append(re.sub('[::]', '', tds[0]))
                    col1_l.append(tds[1])
                elif len(tds) >= 4 and len(inner_table) == 2:
                    # two-row horizontal layout: first row keys, second row values
                    col0_l = inner_table[0]
                    col1_l = inner_table[1]
                    break
            # print(set(col0_l))
            # print('head: ',set(col0_l) & self.header_set)
            if len(set(col0_l) & self.header_set) > len(col0_l) * 0.2:
                header_list2 = []
                product = demand = budget = order_begin = order_end = ""
                for i in range(len(col0_l)):
                    if re.search('项目名称', col0_l[i]):
                        header_list2.append(col0_l[i])
                        product = col1_l[i]
                    elif re.search('采购需求|需求概况', col0_l[i]):
                        header_list2.append(col0_l[i])
                        demand = col1_l[i]
                    elif re.search('采购预算|预算金额|控制金额', col0_l[i]):
                        header_list2.append(col0_l[i])
                        _budget = col1_l[i]
                        # amount in Chinese numerals or digits (optionally with 万)
                        re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", _budget)
                        if re_price:
                            _budget = re_price[0]
                            # header says 万元 but value lacks the unit: append it
                            if '万元' in col0_l[i] and '万' not in _budget:
                                _budget += '万元'
                        budget = str(getUnifyMoney(_budget))
                    elif re.search('采购时间|采购实施月份|采购月份|采购日期', col0_l[i]):
                        header_list2.append(col0_l[i])
                        order_time = col1_l[i].strip()
                        order_begin, order_end = self.fix_time(order_time, html, page_time)
                if order_begin != "" and order_end != "":
                    order_begin_year = int(order_begin.split("-")[0])
                    order_end_year = int(order_end.split("-")[0])
                    # guard against bogus years mis-read from attachments
                    if order_begin_year >= 2050 or order_end_year >= 2050:
                        order_begin = order_end = ""
                # print(product,demand,budget,order_begin)
                if product != "" and demand != "" and budget != "" and order_begin != "" and len(budget) < 15:  # keep only budgets shorter than 15 digits
                    link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
                            'order_begin': order_begin, 'order_end': order_end}
                    if link not in demand_link:
                        demand_link.append(link)
                        headers_demand.append('_'.join(header_list2))
                    continue
        while i < (len(inner_table)):
            tds = inner_table[i]
            not_empty = [it for it in tds if it != ""]
            # skip rows that are mostly duplicated cells, or rows with < 2 columns
            if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds) < 2:
                i += 1
                continue
            product = ""     # product name
            quantity = ""    # quantity
            unitPrice = ""   # unit price
            brand = ""       # brand
            specs = ""       # specification
            demand = ""      # procurement demand
            budget = ""      # budget amount
            order_time = ""  # procurement time
            order_begin = ""
            order_end = ""
            # print(tds,set(tds) & self.header_set)
            # header row: enough cells (colons stripped) overlap the known header set
            if len(set([re.sub('[::]', '', td) for td in tds]) & self.header_set) > len(tds) * 0.2:
                # if len(set(tds) & self.header_set) > len(tds) * 0.2:
                header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
                if found_header and len(headers) < 1:  # keep only the first header encountered
                    headers.append('_'.join(header_list))
                    headers_demand.append('_'.join(header_list2))
                    header_colnum = len(tds)
                    header_col.append('_'.join(tds))
                i += 1
                continue
            elif found_header:
                if len(tds) != header_colnum:  # skip rows whose column count differs from the header's
                    i += 1
                    continue
                id1 = header_dic.get('名称', "")
                id2 = header_dic.get('数量', "")
                id3 = header_dic.get('单价', "")
                id4 = header_dic.get('品牌', "")
                id5 = header_dic.get('规格', "")
                id6 = header_dic.get('需求', "")
                id7 = header_dic.get('预算', "")
                id8 = header_dic.get('时间', "")
                # product cell must contain letters/CJK and not look like a summary row
                if re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
                        re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id1]) == None:
                    product = tds[id1]
                if id2 != "":
                    if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
                        quantity = tds[id2]
                    else:
                        quantity = ""
                if id3 != "":
                    if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
                        _unitPrice = tds[id3]
                        re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", _unitPrice)
                        if re_price:
                            _unitPrice = re_price[0]
                            # header column says 万元 but value lacks the unit
                            if '万元' in header_list[2] and '万' not in _unitPrice:
                                _unitPrice += '万元'
                        # unitPrice = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", unitPrice)
                        unitPrice = str(getUnifyMoney(_unitPrice))
                if id4 != "":
                    if re.search('\w', tds[id4]):
                        brand = tds[id4]
                    else:
                        brand = ""
                if id5 != "":
                    if re.search('\w', tds[id5]):
                        specs = tds[id5][:500]  # cap specification at 500 chars
                    else:
                        specs = ""
                if id6 != "":
                    if re.search('\w', tds[id6]):
                        demand = tds[id6]
                    else:
                        demand = ""
                if id7 != "":
                    if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id7]):
                        _budget = tds[id7]
                        re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", _budget)
                        if re_price:
                            _budget = re_price[0]
                            if '万元' in header_list2[2] and '万' not in _budget:
                                _budget += '万元'
                        budget = str(getUnifyMoney(_budget))
                if id8 != "":
                    if re.search('\w', tds[id8]):
                        order_time = tds[id8].strip()
                        order_begin, order_end = self.fix_time(order_time, html, page_time)
                if quantity != "" or unitPrice != "" or brand != "" or specs != "":
                    if len(unitPrice) > 15 or len(product) > 100:  # unit price over 15 digits or product name over 100 chars: drop row
                        i += 1
                        continue
                    link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
                            'brand': brand[:50], 'specs': specs}
                    if link not in product_link:
                        product_link.append(link)
                        # quantity like "12(个)": take the numeric part for totalling
                        mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
                        if link['unitPrice'] != "" and mat:
                            try:
                                # quantities >= 50000 are treated as implausible and contribute 0
                                total_product_money += float(link['unitPrice'])*float(mat.group(1).replace(',', '')) if float(mat.group(1).replace(',', '')) < 50000 else 0
                            except:
                                log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
                if order_begin != "" and order_end != "":
                    order_begin_year = int(order_begin.split("-")[0])
                    order_end_year = int(order_end.split("-")[0])
                    # guard against bogus years mis-read from attachments
                    if order_begin_year >= 2050 or order_end_year >= 2050:
                        order_begin = order_end = ""
                # print(budget,order_time)
                if budget != "" and order_time != "":
                    link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget, 'order_begin': order_begin, 'order_end': order_end}
                    if link not in demand_link:
                        demand_link.append(link)
                i += 1
            else:
                i += 1
    if len(product_link) > 0:
        attr_dic = {'product_attrs': {'data': product_link, 'header': headers, 'header_col': header_col}}
    else:
        attr_dic = {'product_attrs': {'data': [], 'header': [], 'header_col': []}}
    if len(demand_link) > 0:
        demand_dic = {'demand_info': {'data': demand_link, 'header': headers_demand, 'header_col': header_col}}
    else:
        demand_dic = {'demand_info': {'data': [], 'header': [], 'header_col': []}}
    return [attr_dic, demand_dic], total_product_money
def predict_without_table(self, product_attrs, list_sentences, list_entitys, codeName, prem, html='', page_time=""):
    '''
    Fallback for intention notices with no parsable table: when the announcement has
    exactly one prem package, look for a time entity whose left context matches a
    "采购时间:"-style label, and build one demand_info record from prem/codeName.
    Mutates and returns product_attrs.
    '''
    if len(prem[0]['prem']) == 1:
        list_sentences[0].sort(key=lambda x: x.sentence_index)
        list_sentence = list_sentences[0]
        list_entity = list_entitys[0]
        _data = product_attrs[1]['demand_info']['data']
        # label immediately (within ~2 chars) left of the time entity
        re_bidding_time = re.compile("(采购时间|采购实施月份|采购月份|采购日期)[::,].{0,2}$")
        order_times = []
        for entity in list_entity:
            if entity.entity_type == 'time':
                sentence = list_sentence[entity.sentence_index]
                # 20-token context window around the entity; s[0] is the left side
                s = spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index,
                               end_index=entity.end_index, size=20)
                entity_left = "".join(s[0])
                if re.search(re_bidding_time, entity_left):
                    time_text = entity.entity_text.strip()
                    # canonical yyyy-mm(-dd) shapes, with Chinese date separators
                    standard_time = re.compile("((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*((?P<day>\d{1,2})日?)?)")
                    time_match = re.search(standard_time, time_text)
                    # print(time_text, time_match)
                    if time_match:
                        time_text = time_match.group()
                    order_times.append(time_text)
        # print(order_times)
        # normalize each raw time into a (begin, end) pair, dropping failures
        order_times = [tuple(self.fix_time(order_time, html, page_time)) for order_time in order_times]
        order_times = [order_time for order_time in order_times if order_time[0] != ""]
        if len(set(order_times)) == 1:  # only when every found time agrees
            order_begin, order_end = order_times[0]
            project_name = codeName[0]['name']
            pack_info = [pack for pack in prem[0]['prem'].values()]
            budget = pack_info[0].get('tendereeMoney', 0)
            product = prem[0]['product']
            link = {'project_name': project_name, 'product': product, 'demand': project_name, 'budget': budget,
                    'order_begin': order_begin, 'order_end': order_end}
            _data.append(link)
            product_attrs[1]['demand_info']['data'] = _data
    return product_attrs
def predict_by_text(self, product_attrs, html, list_outlines, page_time=""):
    '''
    Fallback extractor over plain text: parse repeated "key:value" runs inside
    outline sections whose summary mentions 信息/情况/清单, and fill product_attrs[0]
    (and product_attrs[1] if it is still empty). Returns product_attrs.
    '''
    list_outline = list_outlines[0]
    get_product_attrs = False
    for _outline in list_outline:
        if re.search("信息|情况|清单", _outline.outline_summary):
            outline_text = _outline.outline_text
            outline_text = outline_text.replace(_outline.outline_summary, "")
            # keep only clauses that contain a colon, i.e. key:value pairs
            key_value_list = [_split for _split in re.split("[,。;]", outline_text) if re.search("[::]", _split)]
            head_list = []
            head_value_list = []
            for key_value in key_value_list:
                # strip ordinal prefixes such as "一、", "1.", "(三)"
                key_value = re.sub("^[一二三四五六七八九十]{1,3}[、.]|^[\d]{1,2}[、.]\d{,2}|^[\((]?[一二三四五六七八九十]{1,3}[\))][、]?", "", key_value)
                temp = re.split("[::]", key_value)
                key = temp[-2]
                value = temp[-1]
                head_list.append(key)
                head_value_list.append(value)
            head_set = set(head_list)
            # print('head_set',head_set)
            if len(head_set & self.header_set) > len(head_set) * 0.2:
                # find where the key sequence starts repeating: each repetition
                # marks the start of a new record
                loop_list = []
                begin_list = [0]
                for index, head in enumerate(head_list):
                    if head not in loop_list:
                        loop_list.append(head)
                    else:
                        begin_list.append(index)
                        loop_list = []
                        loop_list.append(head)
                headers = []
                headers_demand = []
                header_col = []
                product_link = []
                demand_link = []
                for idx in range(len(begin_list)):
                    # slice out one record's keys and values
                    if idx == len(begin_list)-1:
                        deal_list = head_value_list[begin_list[idx]:]
                        tmp_head_list = head_list[begin_list[idx]:]
                    else:
                        deal_list = head_value_list[begin_list[idx]:begin_list[idx+1]]
                        tmp_head_list = head_list[begin_list[idx]:begin_list[idx+1]]
                    product = ""     # product name
                    quantity = ""    # quantity
                    unitPrice = ""   # unit price
                    brand = ""       # brand
                    specs = ""       # specification
                    demand = ""      # procurement demand
                    budget = ""      # budget amount
                    order_time = ""  # procurement time
                    order_begin = ""
                    order_end = ""
                    header_dic, found_header, header_list, header_list2 = self.find_header(tmp_head_list, self.p1, self.p2)
                    if found_header:
                        headers.append('_'.join(header_list))
                        headers_demand.append('_'.join(header_list2))
                        header_col.append('_'.join(tmp_head_list))
                        # print('header_dic: ',header_dic)
                        id1 = header_dic.get('名称', "")
                        id2 = header_dic.get('数量', "")
                        id3 = header_dic.get('单价', "")
                        id4 = header_dic.get('品牌', "")
                        id5 = header_dic.get('规格', "")
                        id6 = header_dic.get('需求', "")
                        id7 = header_dic.get('预算', "")
                        id8 = header_dic.get('时间', "")
                        # product value must contain letters/CJK and not be a summary label
                        if re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id1]) and deal_list[id1] not in self.header_set and \
                                re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', deal_list[id1]) == None:
                            product = deal_list[id1]
                        if id2 != "":
                            if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
                                quantity = deal_list[id2]
                            else:
                                quantity = ""
                        if id3 != "":
                            if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id3]):
                                _unitPrice = deal_list[id3]
                                re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", _unitPrice)
                                if re_price:
                                    _unitPrice = re_price[0]
                                    # header mentions 万元 but value lacks the unit
                                    if '万元' in header_list[2] and '万' not in _unitPrice:
                                        _unitPrice += '万元'
                                unitPrice = str(getUnifyMoney(_unitPrice))
                        if id4 != "":
                            if re.search('\w', deal_list[id4]):
                                brand = deal_list[id4]
                            else:
                                brand = ""
                        if id5 != "":
                            if re.search('\w', deal_list[id5]):
                                specs = deal_list[id5]
                            else:
                                specs = ""
                        if id6 != "":
                            if re.search('\w', deal_list[id6]):
                                demand = deal_list[id6]
                            else:
                                demand = ""
                        if id7 != "":
                            if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id7]):
                                _budget = deal_list[id7]
                                re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", _budget)
                                if re_price:
                                    _budget = re_price[0]
                                    if '万元' in header_list2[2] and '万' not in _budget:
                                        _budget += '万元'
                                budget = str(getUnifyMoney(_budget))
                        if id8 != "":
                            if re.search('\w', deal_list[id8]):
                                order_time = deal_list[id8].strip()
                                order_begin, order_end = self.fix_time(order_time, html, page_time)
                        # print(quantity,unitPrice,brand,specs)
                        if quantity != "" or unitPrice != "" or brand != "" or specs != "":
                            link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice,
                                    'brand': brand[:50], 'specs': specs}
                            if link not in product_link:
                                product_link.append(link)
                                # mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
                                # if link['unitPrice'] != "" and mat:
                                #     try:
                                #         total_product_money += float(link['unitPrice']) * float(
                                #             mat.group(1).replace(',', ''))
                                #     except:
                                #         log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
                                #             link['unitPrice'], link['quantity']))
                        if order_begin != "" and order_end != "":
                            order_begin_year = int(order_begin.split("-")[0])
                            order_end_year = int(order_end.split("-")[0])
                            # guard against bogus years mis-read from attachments
                            if order_begin_year >= 2050 or order_end_year >= 2050:
                                order_begin = order_end = ""
                        # print(budget, order_time)
                        if budget != "" and order_time != "":
                            link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
                                    'order_begin': order_begin, 'order_end': order_end}
                            if link not in demand_link:
                                demand_link.append(link)
                if len(product_link) > 0:
                    attr_dic = {'product_attrs': {'data': product_link, 'header': list(set(headers)), 'header_col': list(set(header_col))}}
                    get_product_attrs = True
                else:
                    attr_dic = {'product_attrs': {'data': [], 'header': [], 'header_col': []}}
                if len(demand_link) > 0:
                    demand_dic = {'demand_info': {'data': demand_link, 'header': headers_demand, 'header_col': header_col}}
                else:
                    demand_dic = {'demand_info': {'data': [], 'header': [], 'header_col': []}}
                product_attrs[0] = attr_dic
                # only replace demand info when the table pass produced none
                if len(product_attrs[1]['demand_info']['data']) == 0:
                    product_attrs[1] = demand_dic
        if get_product_attrs:
            break
    return product_attrs
def add_product_attrs(self, channel_dic, product_attrs, list_sentences, list_entitys, list_outlines, codeName, prem, text, page_time):
    '''
    Orchestrate the fallback extractors and enrich demand records in place.
    Runs predict_without_table for intention notices lacking demand data, then
    predict_by_text when no product rows were found, and finally copies products
    that appear inside each demand record's project name into its product list.
    '''
    if channel_dic['docchannel']['docchannel'] == "采购意向" and len(product_attrs[1]['demand_info']['data']) == 0:
        product_attrs = self.predict_without_table(product_attrs, list_sentences, list_entitys, codeName, prem, text, page_time)
    if len(product_attrs[0]['product_attrs']['data']) == 0:
        product_attrs = self.predict_by_text(product_attrs, text, list_outlines, page_time)
    if len(product_attrs[1]['demand_info']['data']) > 0:
        for d in product_attrs[1]['demand_info']['data']:
            for product in set(prem[0]['product']):
                if product in d['project_name'] and product not in d['product']:
                    d['product'].append(product)  # add products that occur in the project name to the demand record
# docchannel type extraction (announcement category / lifecycle classification)
  2736. class DocChannel():
def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb', config=None):
    '''
    Load the two frozen TF graphs used by this classifier.

    :param life_model: path, relative to this module, of the lifecycle model .pb
    :param type_model: path, relative to this module, of the doc-type model .pb
    :param config: optional session config forwarded to the lifecycle session
    '''
    self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
        self.mask, self.mask_title = self.load_life(life_model, config)
    self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
        self.type_mask, self.type_mask_title = self.load_type(type_model)
    self.sequen_len = 200  # 150 200 -- max content token length fed to the models
    self.title_len = 30    # max title token length
    self.sentence_num = 10 # max number of keyword-centred snippets sampled from content
    # keywords used to sample salient sentences from the announcement body
    self.kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
    lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
    lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
    # model class index -> human-readable label
    self.id2type = {k: v for k, v in enumerate(lb_type)}
    self.id2life = {k: v for k, v in enumerate(lb_life)}
    self.load_pattern()
def load_pattern(self):
    '''
    Compile the rule dictionaries used to refine/override model predictions:
    type_dic / title_type_dic map doc-type labels to content/title regexes;
    life_dic / title_life_dic do the same for lifecycle labels (keys suffixed
    with 2/3 are alternative patterns, "neg" keys are negative evidence).
    '''
    self.type_dic = {
        '土地矿产': '供地结果|(土地|用地|宗地|地块|海域|矿)的?(基本信息|基本情况|概况|信息|详情|来源|用途|性质|编号|位置|坐落|使用年限|出让年限)|(土地|山地|农田)(经营权)?(出让|出租|招租|租赁|承包|流转)|流转土地',
        '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|活动|信息|结果|成交|主体|标的|资产|财产|方式|类型|流程|程序|规则|价格|保证金|时间)|(公开|进行|密封)(拍卖|变卖|竞拍)|第[一二三]次拍卖|(资产|司法|网络)拍卖|交易方式.{,2}拍卖|拍卖会',
        '产权交易': '(产权|资产|权证)的?(类型|信息|名称|编号|(基本)?情况)|(经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量)(挂牌|转让|出让)|竞价销售|销售结果|房屋所有权房产|免租期限|交易期限|(受让|转让|承租|出租|买受)(人|方)|(店面|店铺|商铺|铺位?|门面|门市|食堂|饭堂|校舍|车位|停车场|厂?房|仓?库|馆|资产|物业|房产|房屋|场地|农田|鱼?塘)\w{,4}(处置|招租|出租|续租|租赁|转让)|(出租|转让|产权|资产)(项目|中标|成交|流标|废标)|出租(用途|类型)|转让底价|租赁(标的物|情况)',
        '采招数据': '(采购|招标)(条件|范围|文件|内容)|(申请人|投标人|供应商|报价人|参选人)的?资格要求;'  # excluded alternatives: |变更|答疑|澄清|中标|成交|合同|废标|流标 |(采购|招标|代理)(人|机构|单位)|
    }
    self.title_type_dic = {
        '土地矿产': '(土地|用地|宗地|荒地|山地|海域|矿)(出让|出租|招租|租赁|承包|流转|使用权|经营权|征收|划拨|中标|成交)|供地结果|矿业权|探矿权|采矿权|(土地|用地|宗地|地块)(使用权)?(终止|中止|网上)?(挂牌|出让|拍卖|招拍|划拨)|征收土地',
        '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|公示)|拍卖|变卖|流拍|竞拍',
        '产权交易': '经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让',
        '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判)的?(公告|公示|中标|成交|结果|$)|工程招标',
        # excluded: |竞价 (both 采招 and 产权 announcements use 竞价); 意向|需求|预公?告|报建|总承包|工程|施工|设计|勘察|代理|监理 |变更|答疑|澄清|中标|成交|合同|废标|流标
        '新闻资讯': '(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)'
    }
    self.life_dic = {
        '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
        '招标预告': '(预计|计划)(采购|招标)(时间|日期)|采购(计划编号|需求方案|预告|预案)|(预|需求)公示|需求(方案|信息|论证|公告|公示)',
        '招标公告': '(采购|招标|竞选|报名)条件|报名(时间|流程|方法|要求|\w{,5}材料)[:\s]|参加竞价采购交易资格|(申请人|投标人|供应商|报价人|参选人)的?资格要求|获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)',
        '资审结果': '资审及业绩公示|资审结果及业绩|资格后审情况报告|资格(后审|预审|审查)结果(公告|公示)|(预审|审查)工作已经?结束|未通过原因',  # excluded: |资格
        '招标答疑': '现澄清(为|如下)|答疑补遗|澄清内容如下|第[0-9一二三四五]次澄清|答疑澄清|(最高(投标)?限价|控制价|拦标价)公示',  # excluded: |异议的回复
        '公告变更': '第[\d一二]次变更|(更正|变更)(公告|公示|信息|内容|事项|原因|理由|日期|时间|如下)|原公告((主要)?(信息|内容)|发布时间)|(变更|更正)[前后]内容|现?在?(变更|更正|修改|更改)(内容)?为|(公告|如下|信息|内容|事项|结果|文件|发布|时间|日期)(更正|变更)',
        '候选人公示': '候选人公示|评标结果公示|中标候选人名单公示',
        '中标信息': '供地结果信息|采用单源直接采购的?情况说明|[特现]?将\w{,4}(成交|中标|中选|选定结果|选取结果|入围结果)\w{,4}(进行公示|公[示布]如下)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|((中标|中选)(候选人|人|成交)|成交)\w{,3}(信息|情况)[::\s]',
        '中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源采购原因|拟采取单一来源方式采购|单一来源采购公示',
        '中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让|唯一)(供应商|供货商|服务商|机构|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]',
        '中标信息neg': '按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示|供应商\s*资格要求|成交情况:\s*[流废]标',
        # excluded: |确定成交供应商[:,\s]
        '合同公告': '合同(公告|公示|信息|内容)|合同(编号|名称|主体|基本情况|签订日期)|(供应商乙方|乙方供应商):|合同总?金额',
        '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):?废标|((本|该)(项目|标段|合同|合同包|采购包|次)\w{,5})((失败|终止|流标|废标)|予以废标|(按|做|作)?(流标|废标)处理)|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答|项目)(终止|中止|废标|流标|失败|作废|异常|撤销)',
        '废标公告2': '(无效|中止|终止|废标|流标|失败|作废|异常|撤销)的?(原因|理由)|本项目因故取消|本(项目|次)(公开)?\w{2}失败|已终止\s*原因:|(人|人数|供应商|单位)(不足|未达\w{,3}数量)|已终止|不足[3三]家|无(废标)|成交情况:\s*[流废]标',
        '废标公告neg': '超过此报价将作为[废流]标处理|否则按[废流]标处理|终止规则:|视为流标'
    }
    self.title_life_dic = {
        '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示|意向公开',
        '招标预告': '预公?告|预公示|报建公告|(批前|标前)公示|(供应|招标)计划表?$|(论证|征求|征集)(供应商)?意见|意见征询|需求评审公告|需求(公告|公示|意见)',
        '公告变更': '第[\d一二]次变更|(变更|更正(事项)?|更改|延期|暂停)(招标|采购)?的?(公告|公示|通知)|变更$|更正$',
        '招标答疑': '质疑|澄清|答疑(文件)?|补遗书?|(最高(投标)?限价|控制价|拦标价)(公示|公告|$)',
        '废标公告': '(终止|中止|废标|废除|流标|失败|作废|异常|撤销|取消成?交?|流拍)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)',
        '合同公告': '(合同(成交|变更)?|(履约|验收)(结果)?)(公告|公示|信息|公式|公开|签订)|合同备案|合同书|合同$',
        '候选人公示': '候选人(变更)?公示|评标(结果)?公示|中标前?公示|中标预公示',
        '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|开标(记录|信息|情况)|单一来源|中标通知书|中标$',
        '资审结果': '((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示',
        '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
    }
  2796. def load_life(self,life_model,config):
  2797. with tf.Graph().as_default() as graph:
  2798. output_graph_def = graph.as_graph_def()
  2799. with open(os.path.dirname(__file__)+life_model, 'rb') as f:
  2800. output_graph_def.ParseFromString(f.read())
  2801. tf.import_graph_def(output_graph_def, name='')
  2802. # print("%d ops in the final graph" % len(output_graph_def.node))
  2803. del output_graph_def
  2804. sess = tf.Session(graph=graph,config=config)
  2805. sess.run(tf.global_variables_initializer())
  2806. inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
  2807. prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
  2808. title = sess.graph.get_tensor_by_name('inputs/title:0')
  2809. mask = sess.graph.get_tensor_by_name('inputs/mask:0')
  2810. mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
  2811. # logit = sess.graph.get_tensor_by_name('output/logit:0')
  2812. softmax = sess.graph.get_tensor_by_name('output/softmax:0')
  2813. return sess, title, inputs, prob, softmax, mask, mask_title
  2814. def load_type(self,type_model):
  2815. with tf.Graph().as_default() as graph:
  2816. output_graph_def = graph.as_graph_def()
  2817. with open(os.path.dirname(__file__)+type_model, 'rb') as f:
  2818. output_graph_def.ParseFromString(f.read())
  2819. tf.import_graph_def(output_graph_def, name='')
  2820. # print("%d ops in the final graph" % len(output_graph_def.node))
  2821. del output_graph_def
  2822. sess = tf.Session(graph=graph)
  2823. sess.run(tf.global_variables_initializer())
  2824. inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
  2825. prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
  2826. title = sess.graph.get_tensor_by_name('inputs/title:0')
  2827. mask = sess.graph.get_tensor_by_name('inputs/mask:0')
  2828. mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
  2829. # logit = sess.graph.get_tensor_by_name('output/logit:0')
  2830. softmax = sess.graph.get_tensor_by_name('output/softmax:0')
  2831. return sess, title, inputs, prob, softmax, mask, mask_title
def predict_process(self, docid='', doctitle='', dochtmlcon=''):
    '''
    Turn a title and pre-segmented content into the token lists fed to the models:
    the first 100 content tokens plus keyword-centred snippets from tokens 100-500.
    :param docid: unused
    :param doctitle: raw title (segmented here with selffool)
    :param dochtmlcon: whitespace-segmented content string
    :return: (datas, datas_title) -- single-element lists of token lists
    '''
    # print('准备预处理')
    def get_kw_senten(s, span=10):
        # collect up to self.sentence_num windows of `span` words around each keyword hit
        doc_sens = []
        tmp = 0      # end offset of the previous keyword match
        num = 0      # number of windows collected so far
        end_idx = 0  # left boundary of the next window
        for it in re.finditer(self.kws, s):  # '|'.join(keywordset)
            left = s[end_idx:it.end()].split()
            right = s[it.end():].split()
            tmp_seg = s[tmp:it.start()].split()
            # only emit a window when far enough from the previous hit (or first hit)
            if len(tmp_seg) > span or tmp == 0:
                doc_sens.append(' '.join(left[-span:] + right[:span]))
                end_idx = it.end() + 1 + len(' '.join(right[:span]))
            tmp = it.end()
            num += 1
            if num >= self.sentence_num:
                break
        if doc_sens == []:
            doc_sens.append(s)  # no keyword hit: keep the whole text
        return doc_sens

    def word2id(wordlist, max_len=self.sequen_len):
        # map words to ids, padded/truncated to max_len (currently unused here)
        ids = [getIndexOfWords(w) for w in wordlist]
        ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
        assert len(ids) == max_len
        return ids

    cost_time = dict()  # NOTE(review): never used in this method
    datas = []
    datas_title = []
    try:
        segword_title = ' '.join(selffool.cut(doctitle)[0])
        segword_content = dochtmlcon
    except:
        segword_content = ''
        segword_title = ''
    # upstream may hand over NaN floats instead of strings
    if isinstance(segword_content, float):
        segword_content = ''
    if isinstance(segword_title, float):
        segword_title = ''
    # repair common segmentation splits and boilerplate
    segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
        replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
        replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
    # keep only whitespace and CJK characters
    segword_title = re.sub('[^\s\u4e00-\u9fa5]', '', segword_title)
    segword_content = re.sub('[^\s\u4e00-\u9fa5]', '', segword_content)
    doc_word_list = segword_content.split()
    if len(doc_word_list) > self.sequen_len / 2:
        # long doc: first 100 words verbatim + keyword snippets from words 100-500
        doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
        doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
    else:
        doc_sens = ' '.join(doc_word_list[:self.sequen_len])
    # print('标题:',segword_title)
    # print('正文:',segword_content)
    datas.append(doc_sens.split())
    datas_title.append(segword_title.split())
    # print('完成预处理')
    return datas, datas_title
  2888. def is_houxuan(self, title, content):
  2889. '''
  2890. 通过标题和中文内容判断是否属于候选人公示类别
  2891. :param title: 公告标题
  2892. :param content: 公告正文文本内容
  2893. :return: 1 是候选人公示 ;0 不是
  2894. '''
  2895. if re.search('候选人的?公示|评标结果|评审结果|中标公示', title): # (中标|成交|中选|入围)
  2896. if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title):
  2897. return 0
  2898. return 1
  2899. if re.search('候选人的?公示', content[:100]):
  2900. if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]):
  2901. return 0
  2902. return 1
  2903. else:
  2904. return 0
def predict(self, title='', list_sentence='', web_source_no='', original_docchannel=''):
    '''
    Predict doctype (category) and docchannel (lifecycle) with the two TF models,
    with hard-coded short-circuits for certain sources/channels.
    :param title: announcement title
    :param list_sentence: list of sentence objects whose tokens form the content
    :param web_source_no: data-source id; some sources are answered directly
    :param original_docchannel: upstream channel id; some ids skip prediction
    :return: {'docchannel': {'docchannel', 'doctype', 'original_docchannel_id'}}
    '''
    # channel ids whose documents bypass the models entirely
    not_extract_dic = {
        104: '招标文件',
        106: '法律法规',
        107: '新闻资讯',
        108: '拟建项目',
        109: '展会推广',
        110: '企业名录',
        111: '企业资质',
        112: '全国工程人员',
        113: '业主采购'
    }
    if original_docchannel in not_extract_dic:
        return {'docchannel': {'docchannel': '', 'doctype': not_extract_dic[original_docchannel], "original_docchannel_id": str(original_docchannel)}}
    if web_source_no in ['02104-7']:  # this source is always 采招数据
        return {'docchannel': {'docchannel': '', 'doctype': '采招数据'}}
    if isinstance(list_sentence, list):
        token_l = [it.tokens for it in list_sentence]
        tokens = [it for l in token_l for it in l]
        content = ' '.join(tokens[:500])
    # NOTE(review): `content` is only bound when list_sentence is a list --
    # confirm callers always pass a list, otherwise the lines below raise
    title = re.sub('[^\u4e00-\u9fa5]', '', title)
    if len(title) > 50:
        # overlong title: keep first 20 + last 30 characters
        title = title[:20]+title[-30:]
    data_content, data_title = self.predict_process(docid='', doctitle=title[-50:], dochtmlcon=content)  # title capped at 50 chars
    # actual (unpadded) lengths used to build the attention masks
    text_len = len(data_content[0]) if len(data_content[0]) < self.sequen_len else self.sequen_len
    title_len = len(data_title[0]) if len(data_title[0]) < self.title_len else self.title_len
    result = {'docchannel': {'docchannel': '', 'doctype': '', "original_docchannel_id": str(original_docchannel)}}
    array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
    array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
    # doc-type model: mask is 0 over real tokens, 1 over padding
    pred = self.type_sess.run(self.type_softmax,
                              feed_dict={
                                  self.type_title: array_title,
                                  self.type_content: array_content,
                                  self.type_mask: [[0]*text_len+[1]*(self.sequen_len-text_len)],
                                  self.type_mask_title: [[0]*title_len+[1]*(self.title_len-title_len)],
                                  self.type_prob: 1}
                              )
    id = np.argmax(pred, axis=1)[0]
    prob = pred[0][id]
    result['docchannel']['doctype'] = self.id2type[id]
    # print('公告类别:', self.id2type[id], '概率:',prob)
    # if id == 0:
    # lifecycle model only runs for non-news categories
    if result['docchannel']['doctype'] not in ['', '新闻资讯']:
        pred = self.lift_sess.run(self.lift_softmax,
                                  feed_dict={
                                      self.lift_title: array_title,
                                      self.lift_content: array_content,
                                      self.mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
                                      self.mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
                                      self.lift_prob: 1}
                                  )
        id = np.argmax(pred, axis=1)[0]
        prob = pred[0][id]
        result['docchannel']['docchannel'] = self.id2life[id]
        # print('生命周期:纯模型预测',self.id2life[id], '概率:',prob)
        # if id == 6:
        # rule-based refinement: a 中标信息 may really be a candidate publicity
        if result['docchannel']['docchannel'] == '中标信息':
            if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
                result['docchannel']['docchannel'] = '候选人公示'
                # return '候选人公示', prob
                # return [{'docchannel': '候选人公示'}]
    return result
    # return [{'docchannel':self.id2life[id]}]
    # else:
    #     # return self.id2type[id], prob
    #     return [{'docchannel':self.id2type[id]}]
def predict_rule(self, title, content, channel_dic, prem_dic):
    '''
    2022/2/10: rule layer that overrides the model's docchannel for certain
    data sources and for very short announcements lacking category keywords.
    :param title: announcement title
    :param content: announcement body text
    :param channel_dic: result dict from predict(); mutated and returned
    :param prem_dic: prem extraction result, checked for a win_tenderer
    :return: the (possibly modified) channel_dic
    '''
    hetong = '(合同|验收|履约)(公告|公示)|合同号?$'  # contract-title regex
    zhongbiao_t = '(中标|中选|成交|入选|入围|结果|确认)(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选)结果|开标(记录|信息|情况)|单一来源|直接(选取|选定)|中标通知书|中标$'
    zhongbiao_c = '(中标|中选|成交|拟选用|拟邀请|最终选定的?|拟定)(供应商|供货商|服务商|企业|公司|单位|(候选)?人)(名称)?[::]|[,。:.](供应商|供货商|服务商)(名称)?:|指定的中介服务机构:|建设服务单位:'
    zhaobiao_t = '(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈)(公告|公示|$)'
    title_cn = re.sub('[^\u4e00-\u9fa5]', '', title)
    # very short non-news announcements: classify from the title alone
    if len(re.sub('[^\u4e00-\u9fa5]', "", content)) < 50 and channel_dic['docchannel']['doctype'] != '新闻资讯':
        if re.search(hetong, title_cn) != None:
            channel_dic['docchannel']['docchannel'] = '合同公告'
        elif re.search(zhongbiao_t, title_cn):
            channel_dic['docchannel']['docchannel'] = '中标信息'
        elif re.search(zhaobiao_t, title_cn):
            channel_dic['docchannel']['docchannel'] = '招标公告'
        else:
            channel_dic['docchannel']['docchannel'] = ''
    # predicted 招标公告 but a winner was extracted: promote to 合同公告/中标信息
    elif channel_dic['docchannel'].get('docchannel', '') == '招标公告' and 'win_tenderer' in json.dumps(prem_dic,
                                                                                                  ensure_ascii=False):
        if re.search(hetong, title_cn) != None:
            channel_dic['docchannel']['docchannel'] = '合同公告'
            log('正则把招标公告修改为合同公告')
        elif re.search(zhongbiao_t, title_cn) or re.search(zhongbiao_t, content[:200]) or re.search(zhongbiao_c,
                                                                                                   content):
            channel_dic['docchannel']['docchannel'] = '中标信息'
            log('正则把招标公告修改为中标信息')
    # predicted 中标信息 but no winner was extracted: demote or clear
    elif channel_dic['docchannel'].get('docchannel', '') == '中标信息' and 'win_tenderer' not in json.dumps(prem_dic,
                                                                                                      ensure_ascii=False):
        if re.search(hetong, title_cn):
            channel_dic['docchannel']['docchannel'] = '合同公告'
            log('正则把中标信息修改为合同公告')
        elif re.search(zhongbiao_t, title_cn) or re.search(zhongbiao_t, content[:200]) or re.search(zhongbiao_c,
                                                                                                   content):
            pass  # award evidence present: keep 中标信息 despite missing winner
        elif re.search(zhaobiao_t, title_cn):
            channel_dic['docchannel']['docchannel'] = '招标公告'
            log('正则把中标信息修改为招标公告')
        elif re.search('中标|成交|中选|入选|入围|结果|供应商|供货商|候选人', title_cn+content) == None:
            channel_dic['docchannel']['docchannel'] = ''
            log('正则把中标信息修改为空')
    return channel_dic
    def predict_merge(self, title, list_sentence, html, bidway, prem, original_docchannel='', web_source_no=''):
        '''
        Hybrid regex + model prediction of announcement type (doctype) and
        lifecycle stage (docchannel).
        :param title: announcement title
        :param list_sentence: pre-processed sentence objects of the document
                              (each with .sentence_index and .tokens)
        :param html: original announcement HTML content
        :param bidway: bidding-method string (only referenced by the unused
                       helper is_single_source below)
        :param prem: extracted "prem" dict; serialized and scanned for 'win_tenderer'
        :param original_docchannel: numeric channel code supplied by the data source
                                    (a key of origin_dic / not_extract_dic)
        :param web_source_no: data-source id; a few known-unreliable sources short-circuit
        :return: tuple (result, msc) where result is
                 {'docchannel': {'docchannel': ..., 'doctype': ..., 'life_docchannel': ...}}
                 and msc is a human-readable trace of how the decision was made
        '''
        def cut_single_cn_space(text):
            # Re-join single CJK characters (and "X:"-style prefixes) that
            # whitespace splitting separated; keep a space before longer words.
            new_text = ""
            for w in text.split():
                if len(w) == 1 or re.search('^[\u4e00-\u9fa5][::]', w):
                    new_text += w
                else:
                    new_text += ' ' + w
            return new_text

        def html2text(html):
            # Strip tags; truncate at the 'richTextFetch' attachment div — keep a
            # placeholder when enough Chinese text precedes it, otherwise keep 500
            # extra characters past the marker.
            ser = re.search('<div[^<>]*richTextFetch', html)
            # if ser and len(re.sub('[^\u4e00-\u9fa5]', '', html[:ser.start()]))>500:
            #     html = html[:ser.start()]+'##richTextFetch##'
            if ser:
                if len(re.sub('[^\u4e00-\u9fa5]', '', html[:ser.start()])) > 200:
                    html = html[:ser.start()] + '##richTextFetch##'
                else:
                    html = html[:ser.start() + 500]
            text = re.sub('<[^<]*?>', '', html).replace('&nbsp;', ' ')
            # text = re.sub('http[0-9a-zA-Z-.:/]+|[0-9a-zA-Z-./@]+', '', text)
            text = re.sub('\s+', ' ', text)
            # text = re.sub('[/|[()()]', '', text)
            text = cut_single_cn_space(text)
            return text[:20000]  # cap the processed text length

        def count_diffser(pattern, text):
            # Count how many ';'-separated sub-patterns hit `text`.
            # NOTE(review): defined but not called anywhere in this method.
            num = 0
            kw = []
            for p in pattern.split(';'):
                if re.search(p, text):
                    num += 1
                    kw.append(re.search(p, text).group(0))
            return num, ';'.join(kw)

        def is_contain_winner(extract_json):
            # True when the serialized prem dict mentions a winning bidder.
            if re.search('win_tenderer', extract_json):
                return True
            else:
                return False

        def is_single_source(bidway, title):
            # Single-source procurement test.
            # NOTE(review): defined but not called anywhere in this method.
            if re.search('单一来源|单一性采购', title):
                return True
            elif bidway == '单一来源':
                return True
            else:
                return False

        def get_type(title, text):
            # Regex doctype decision. Each special category (land, auction,
            # property) is overridden back to '采招数据' when the procurement
            # pattern hits the title or the first whitespace-token of the text.
            # Returns (doctype, matched-keyword-or-reason).
            if re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'], text):  # and re.search('(土地|用地|宗地|地块)(经营权)?(流转|承包|出租|招租|租赁|确权)', text)==None
                if re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]):
                    return '采招数据', re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]).group(0)
                return '土地矿产', (re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'], text)).group(0)
            elif (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)):
                if re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]):
                    return '采招数据', re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]).group(0)
                return '拍卖出让', (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)).group(0)
            elif re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text):
                if re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]):
                    return '采招数据', re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]).group(0)
                return '产权交易', (re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text)).group(0)
            elif re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text):
                return '采招数据', (
                    re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text)).group(0)
            elif re.search(self.title_type_dic['新闻资讯'], title):
                if re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]):
                    return '采招数据', re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]).group(0)
                return '新闻资讯', re.search(self.title_type_dic['新闻资讯'], title).group(0)
            else:
                return '', '没有公告类型关键词,返回空'

        def get_life(title, text):
            # Regex lifecycle (docchannel) decision: score keyword hits per
            # category in title and body, then arbitrate with an ordered,
            # hand-tuned priority chain. Returns (docchannel, msc debug json).
            title = re.sub('[-()()0-9a-z]|第?[二三四]次公?告?', '', title)
            first_line = text.split()[0] if len(text.split()) > 2 else ''
            # Borrow the body's first line as a title suffix when the title
            # itself does not end in 公告/公示 but the first line does.
            if title.strip()[-2:] not in ['公告', '公示'] and 5 < len(first_line) < 30 and first_line[-2:] in ['公告', '公示']:
                # print('title: ', title, first_line)
                title += first_line
                # print('title: ', title)
            def count_score(l):
                # hits + 2 * distinct hits
                return len(l) + len(set(l)) * 2
            life_kw_title = {}
            life_kw_content = {}
            life_score = {}
            # msc = ""
            # Collect title keyword hits per lifecycle category.
            for k, v in self.title_life_dic.items():
                k2 = re.sub('[\da-z]', '', k)  # strip variant suffixes ('中标信息2' -> '中标信息')
                if k2 not in life_kw_title:
                    life_kw_title[k2] = []
                for it in re.finditer(v, title):
                    life_kw_title[k2].append(it.group(0))
            # Collect positive/negative body keyword hits per lifecycle category.
            for k, v in self.life_dic.items():
                k2 = re.sub('[\da-z]', '', k)
                if k2 not in life_kw_content:
                    life_kw_content[k2] = {'pos': [], 'neg': []}
                for it in re.finditer(v, text):
                    if 'neg' not in k:
                        life_kw_content[k2]['pos'].append(it.group(0))
                    else:
                        life_kw_content[k2]['neg'].append(it.group(0))
            for k2 in life_kw_content:
                life_score[k2] = count_score(life_kw_content[k2]['pos']) - count_score(
                    life_kw_content[k2]['neg'])
            life_kw_title = {k: v for k, v in life_kw_title.items() if v != []}
            life_kw_content = {k: v for k, v in life_kw_content.items() if life_score[k] > 0}
            msc = [life_kw_title, life_kw_content, life_score]
            msc = json.dumps(msc, ensure_ascii=False)
            # life_list: categories tied at the highest positive body score.
            max_score = 0
            life_list = []
            for k in life_score.keys():
                if life_score[k] > max_score:
                    max_score = life_score[k]
                    life_list = [k]
                elif life_score[k] == max_score and life_score[k] > 0:
                    life_list.append(k)
            # Ordered arbitration; thresholds below are hand-tuned magic numbers.
            if '采购意向' in life_kw_title or '采购意向' in life_list:
                return '采购意向', msc
            elif '招标预告' in life_kw_title or '招标预告' in life_list:
                if set(['中标信息', '候选人公示', '合同公告']) & set(life_kw_content) != set():
                    return '', msc
                return '招标预告', msc
            elif '公告变更' in life_kw_title or '公告变更' in life_list:
                if life_score.get('候选人公示', 0) > 3 or '候选人公示' in life_kw_title:
                    return '候选人公示', msc
                elif life_score.get('合同公告', 0) > 3 or '合同公告' in life_kw_title:
                    return '合同公告', msc
                elif life_score.get('中标信息', 0) > 3 or '中标信息' in life_kw_title:
                    return '中标信息', msc
                elif '招标公告' in life_kw_title and life_score.get('公告变更', 0) < 4:
                    return '招标公告', msc
                return '公告变更', msc
            elif '招标答疑' in life_kw_title or '招标答疑' in life_list:
                if '招标公告' in life_kw_title and life_score.get('招标答疑', 0) < 4:
                    return '招标公告', msc
                elif life_score.get('招标答疑', 0) < max_score:
                    if max_score > 3 and len(life_list) == 1:
                        return life_list[0], msc
                    return '', msc
                return '招标答疑', msc
            elif '候选人公示' in life_kw_title or '候选人公示' in life_list:
                if '招标公告' in life_kw_title and life_score.get('招标公告', 0) > 3:
                    return '招标公告', msc
                elif '废标公告' in life_kw_title or life_score.get('废标公告', 0) > 5:
                    return '废标公告', msc
                return '候选人公示', msc
            elif '合同公告' in life_kw_title or '合同公告' in life_list:
                if '招标公告' in life_kw_title and life_score.get('招标公告', 0) > 3:
                    return '招标公告', msc
                elif '废标公告' in life_kw_title or life_score.get('废标公告', 0) > 5:
                    return '废标公告', msc
                return '合同公告', msc
            elif '中标信息' in life_kw_title or '中标信息' in life_list:
                if '招标公告' in life_kw_title and life_score.get('招标公告', 0) > 2:  # (life_score.get('招标公告', 0)>2 or life_score.get('中标信息', 0)<4) 0.7886409793924245
                    return '招标公告', msc
                elif '废标公告' in life_kw_title or life_score.get('废标公告', 0) > 5:
                    return '废标公告', msc
                elif life_score.get('候选人公示', 0) > 3:
                    return '候选人公示', msc
                elif life_score.get('合同公告', 0) > 5:
                    return '合同公告', msc
                return '中标信息', msc
            elif '废标公告' in life_kw_title or '废标公告' in life_list:
                if life_score.get('招标公告', 0) > 3 and '废标公告' not in life_kw_title:
                    return '招标公告', msc
                return '废标公告', msc
            elif '资审结果' in life_kw_title or '资审结果' in life_list:
                return '资审结果', msc
            elif '招标公告' in life_kw_title or '招标公告' in life_list:
                return '招标公告', msc
            return '', msc

        def get_model_inputs(list_sentence):
            # Build embedded title/content arrays plus unpadded lengths for both
            # models; also returns the joined token text used for the kws gate.
            list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index)
            token_l = [it.tokens for it in list_sentence]
            tokens = [it for l in token_l for it in l]
            content = ' '.join(tokens[:500])
            data_content, data_title = self.predict_process(docid='', doctitle=title[-50:],
                                                            dochtmlcon=content)  # title capped at its last 50 chars
            text_len = len(data_content[0]) if len(data_content[0]) < self.sequen_len else self.sequen_len
            title_len = len(data_title[0]) if len(data_title[0]) < self.title_len else self.title_len
            array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
            array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
            return array_content, array_title, text_len, title_len, content

        def type_model_predict():
            # Run the doctype classifier session. Reads array_title/array_content/
            # text_len/title_len from the enclosing scope (set by get_model_inputs).
            pred = self.type_sess.run(self.type_softmax,
                                      feed_dict={
                                          self.type_title: array_title,
                                          self.type_content: array_content,
                                          self.type_mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
                                          self.type_mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
                                          self.type_prob: 1}
                                      )
            id = np.argmax(pred, axis=1)[0]
            prob = pred[0][id]
            return id, prob

        def life_model_predict():
            # Run the lifecycle classifier session; same closure inputs as above.
            pred = self.lift_sess.run(self.lift_softmax,
                                      feed_dict={
                                          self.lift_title: array_title,
                                          self.lift_content: array_content,
                                          self.mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
                                          self.mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
                                          self.lift_prob: 1}
                                      )
            id = np.argmax(pred, axis=1)[0]
            prob = pred[0][id]
            return id, prob

        def final_change(msc):
            '''
            Final reconciliation of the prediction with the source-supplied class:
            1. Won/contract prediction without a winner, non-winning origin -> keep origin
            2. Cancelled-bid prediction with a winner, no cancel keyword in title -> won
            3. Q&A prediction without a Q&A title keyword, tender-like origin -> keep origin
            4. Tender prediction with a winner, origin says won -> won
            5. Tender prediction, origin is pre-announcement/intent -> keep origin
            6. Both prediction and origin are change/Q&A -> keep origin
            7. Procurement doctype but property/land origin with keywords -> keep origin
            8. Cancelled-bid prediction, tender-like origin, no cancel keyword -> keep origin
            9. Non-procurement doctype, procurement origin, no property keyword in title -> procurement
            '''
            if result['docchannel']['docchannel'] in ['中标信息', '合同公告'] and origin_dic.get(
                    original_docchannel, '') in ['招标公告', '采购意向', '招标预告', '公告变更'] and is_contain_winner(prem_json) == False:
                result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
                msc += '最终规则修改:中标公告、合同公告无中标人且原始为非中标,返回原类型'
            elif result['docchannel']['docchannel'] == '废标公告' and is_contain_winner(prem_json) and re.search(
                    self.title_life_dic['废标公告'], title) == None:
                result['docchannel']['docchannel'] = '中标信息'
                msc += '最终规则修改:预测为废标却有中标人且标题无废标关键词改为中标信息;'
            elif result['docchannel']['docchannel'] in ['招标答疑'] and re.search(
                    self.title_life_dic['招标答疑'], title) == None and origin_dic.get(
                    original_docchannel, '') in ['招标公告', '采购意向', '招标预告']:
                result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
                msc += '最终规则修改:答疑公告标题无答疑关键且原始为招标,返回原始类别;'
            elif result['docchannel']['docchannel'] == '招标公告' and is_contain_winner(prem_json) and origin_dic.get(
                    original_docchannel, '') == '中标信息':
                result['docchannel']['docchannel'] = '中标信息'
                msc += '最终规则修改:预测为招标公告却有中标人且原始为中标改为中标信息;'
            elif result['docchannel']['docchannel'] in ['招标公告'] and origin_dic.get(
                    original_docchannel, '') in ['采购意向', '招标预告']:
                result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
                msc += '最终规则修改:预测及原始均在招标、预告、意向,返回原始类别'
            elif result['docchannel']['docchannel'] in ['招标答疑', '公告变更'] and origin_dic.get(
                    original_docchannel, '') in ['招标答疑', '公告变更']:
                result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
                msc += '最终规则修改:预测及原始均在答疑、变更,返回原始类别'
            elif result['docchannel']['doctype'] == '采招数据' and origin_dic.get(
                    original_docchannel, '') in ['产权交易', '土地矿产'] and re.search('产权|转让|受让|招租|出租|承租|竞价|资产', text):
                result['docchannel']['doctype'] = origin_dic.get(original_docchannel, '')
                msc += '最终规则修改:预测为采招数据,原始为产权且有关键词,返回原始类别'
            elif result['docchannel']['docchannel'] == '废标公告' and origin_dic.get(
                    original_docchannel, '') in ['招标公告', '采购意向', '招标预告'] and re.search(
                    self.title_life_dic['废标公告'], title) == None:
                result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
                msc += '最终规则修改:废标公告原始为招标、预告且标题无废标关键期,返回原始类别;'
            elif result['docchannel']['doctype'] != '采招数据' and origin_dic.get(
                    original_docchannel, '') not in ['产权交易', '土地矿产', '拍卖出让'] and re.search('产权|转让|受让|招租|出租|承租|竞价|资产|挂牌|出让|拍卖|招拍|划拨', title) == None:
                result['docchannel']['doctype'] = '采招数据'
                msc += '最终规则修改:预测为非采招数据,原始为采招数据且无关键词,返回采招数据'
            # New-format extra field (bare string below kept from original):
            '''下面是新格式增加返回字段'''
            if result['docchannel']['docchannel'] != '':  # copy the predicted lifecycle to life_docchannel, else fall back to the source's class
                result['docchannel']['life_docchannel'] = result['docchannel']['docchannel']
            else:
                result['docchannel']['life_docchannel'] = origin_dic.get(original_docchannel, '原始类别')
            return msc

        # Source channel codes that are excluded from extraction entirely.
        not_extract_dic = {
            104: '招标文件',
            106: '法律法规',
            107: '新闻资讯',
            108: '拟建项目',
            109: '展会推广',
            110: '企业名录',
            111: '企业资质',
            112: '全国工程人员',
            113: '业主采购'
        }
        # Mapping from source channel code to human-readable class name.
        origin_dic = {51: '公告变更',
                      52: '招标公告',
                      101: '中标信息',
                      102: '招标预告',
                      103: '招标答疑',
                      104: '招标文件',
                      105: '资审结果',
                      106: '法律法规',
                      107: '新闻资讯',
                      108: '拟建项目',
                      109: '展会推广',
                      110: '企业名录',
                      111: '企业资质',
                      112: '全国工程',
                      113: '业主采购',
                      114: '采购意向',
                      115: '拍卖出让',
                      116: '土地矿产',
                      117: '产权交易',
                      118: '废标公告',
                      119: '候选人公示',
                      120: '合同公告'}
        if original_docchannel in not_extract_dic:
            return {'docchannel': {'docchannel': '', 'doctype': not_extract_dic[original_docchannel], 'life_docchannel': origin_dic.get(original_docchannel, '原始类别')}}, '公告类别不在提取范围'
        if web_source_no in ['02104-7', '04733', 'DX007628-6']:  # these data sources cannot be classified reliably
            return {'docchannel': {'docchannel': '', 'doctype': '采招数据', 'life_docchannel': origin_dic.get(original_docchannel, '原始类别')}}, '此数据源公告分类不明确,返回数据源类别'
        # Keep only Chinese characters; long titles keep head 20 + tail 30 chars.
        title = re.sub('[^\u4e00-\u9fa5]', '', title)
        if len(title) > 50:
            title = title[:20] + title[-30:]
        text = html2text(html)
        prem_json = json.dumps(prem, ensure_ascii=False)
        result = {'docchannel': {'docchannel': '', 'doctype': ''}}
        doc_type, type_kw = get_type(title, text)
        # doc_life, life_kw = get_life(title, text, prem_json, bidway, original_docchannel)
        doc_life, life_kw = get_life(title, text)
        if doc_type in self.title_type_dic:
            result['docchannel']['doctype'] = doc_type
        if doc_life in self.title_life_dic:
            result['docchannel']['docchannel'] = doc_life
        # print('channel正则预测结果:', result)
        msc = '正则结果:类型:%s, 关键词:%s, 周期:%s, 关键词:%s' % (doc_type, type_kw, doc_life, life_kw) + '\n' + '模型结果:'
        # print('类型:%s, 关键词:%s, 周期:%s, 关键词:%s'%(doc_type, type_kw,doc_life, life_kw))
        if doc_type == "" or doc_life == "":
            # Fall back to the neural models only where the regex layer abstained.
            array_content, array_title, text_len, title_len, content = get_model_inputs(list_sentence)
            if doc_type == "":
                type_id, type_prob = type_model_predict()
                type_model = self.id2type[type_id]
                result['docchannel']['doctype'] = type_model
                msc += type_model + ' 概率:%.4f;' % type_prob
                # print('公告类别:', self.id2type[id], '概率:',prob)
                # if id == 0:
            if doc_life == "" and result['docchannel']['doctype'] not in ['', '新闻资讯']:
                # Only predict a lifecycle when the text is long enough and
                # contains at least one of the gating keywords (self.kws).
                if len(text) > 150 and re.search(self.kws, content):
                    life_id, life_prob = life_model_predict()
                    life_model = self.id2life[life_id]
                    result['docchannel']['docchannel'] = life_model
                    msc += life_model + ' 概率:%.4f;\n' % life_prob
        msc = final_change(msc)
        # print('channel ', msc)
        return result, msc
# Deposit payment-method extraction
  3354. class DepositPaymentWay():
  3355. def __init__(self,):
  3356. self.pt = '(保证金的?(交纳|缴纳|应按下列|入账|支付)方式)[::]*([^,。]{,60})'
  3357. self.pt2 = '保证金(必?须以|必?须?通过|以)(.{,8})方式'
  3358. kws = ['银行转账', '公?对公方?式?转账', '对公转账', '柜台转账', '(线上|网上)自?行?(缴纳|交纳|缴退|收退)',
  3359. '网上银行支付', '现金存入', '直接缴纳', '支票', '汇票', '本票', '电汇', '转账', '汇款', '随机码',
  3360. '入账', '基本账户转出', '基本账户汇入', '诚信库中登记的账户转出',
  3361. '银行保函', '电子保函', '担保函', '保证保险', '合法担保机构出具的担保', '金融机构、担保机构出具的保函']
  3362. self.kws = sorted(kws, key=lambda x: len(x), reverse=True)
  3363. def predict(self,content):
  3364. pay_way = {'deposit_patment_way':''}
  3365. result = []
  3366. pay = re.search(self.pt, content)
  3367. if pay:
  3368. # print(pay.group(0))
  3369. pay = pay.group(3)
  3370. for it in re.finditer('|'.join(self.kws), pay):
  3371. result.append(it.group(0))
  3372. pay_way['deposit_patment_way'] = ';'.join(result)
  3373. return pay_way
  3374. pay = re.search(self.pt2, content)
  3375. if pay:
  3376. # print(pay.group(0))
  3377. pay = pay.group(2)
  3378. for it in re.finditer('|'.join(self.kws), pay):
  3379. result.append(it.group(0))
  3380. pay_way['deposit_patment_way'] = ';'.join(result)
  3381. return pay_way
  3382. else:
  3383. return pay_way
# Total-price / unit-price extraction
  3385. class TotalUnitMoney:
  3386. def __init__(self):
  3387. pass
  3388. def predict(self, list_sentences, list_entitys):
  3389. for i in range(len(list_entitys)):
  3390. list_entity = list_entitys[i]
  3391. # 总价单价
  3392. for _entity in list_entity:
  3393. if _entity.entity_type == 'money':
  3394. word_of_sentence = list_sentences[i][_entity.sentence_index].sentence_text
  3395. # 总价在中投标金额中
  3396. if _entity.label == 1:
  3397. result = extract_total_money(word_of_sentence,
  3398. _entity.entity_text,
  3399. [_entity.wordOffset_begin, _entity.wordOffset_end])
  3400. if result:
  3401. _entity.is_total_money = 1
  3402. # 单价在普通金额中
  3403. else:
  3404. result = extract_unit_money(word_of_sentence,
  3405. _entity.entity_text,
  3406. [_entity.wordOffset_begin, _entity.wordOffset_end])
  3407. if result:
  3408. _entity.is_unit_money = 1
  3409. # print("total_unit_money", _entity.entity_text,
  3410. # _entity.is_total_money, _entity.is_unit_money)
# Industry classification
  3412. class IndustryPredictor():
  3413. def __init__(self,):
  3414. self.model_path = os.path.dirname(__file__)+ '/industry_model'
  3415. self.id2lb = {0: '专业施工', 1: '专用仪器仪表', 2: '专用设备修理', 3: '互联网信息服务', 4: '互联网安全服务', 5: '互联网平台', 6: '互联网接入及相关服务', 7: '人力资源服务',
  3416. 8: '人造原油', 9: '仓储业', 10: '仪器仪表', 11: '仪器仪表修理', 12: '会计、审计及税务服务', 13: '会议、展览及相关服务', 14: '住宅、商业用房',
  3417. 15: '体育场地设施管理', 16: '体育组织', 17: '体育设备', 18: '保险服务', 19: '信息处理和存储支持服务', 20: '信息技术咨询服务',
  3418. 21: '信息系统集成和物联网技术服务', 22: '修缮工程', 23: '健康咨询', 24: '公路旅客运输', 25: '其他专业咨询与调查', 26: '其他专业技术服务',
  3419. 27: '其他交通运输设备', 28: '其他公共设施管理', 29: '其他土木工程建筑', 30: '其他工程服务', 31: '其他建筑建材', 32: '其他运输业', 33: '农业和林业机械',
  3420. 34: '农业服务', 35: '农产品', 36: '农副食品,动、植物油制品', 37: '出版业', 38: '办公消耗用品及类似物品', 39: '办公设备', 40: '化学原料及化学制品',
  3421. 41: '化学纤维', 42: '化学药品和中药专用设备', 43: '医疗设备', 44: '医药品', 45: '卫星传输服务', 46: '卫生', 47: '印刷服务', 48: '图书和档案',
  3422. 49: '图书档案设备', 50: '图书馆与档案馆', 51: '土地管理业', 52: '地质勘查', 53: '地震服务', 54: '场馆、站港用房', 55: '城市公共交通运输',
  3423. 56: '塑料制品、半成品及辅料', 57: '天然石料', 58: '娱乐设备', 59: '婚姻服务', 60: '安全保护服务', 61: '安全生产设备', 62: '家具用具',
  3424. 63: '家用电器修理', 64: '工业、生产用房', 65: '工业与专业设计及其他专业技术服务', 66: '工矿工程建筑', 67: '工程技术与设计服务', 68: '工程机械',
  3425. 69: '工程监理服务', 70: '工程评价服务', 71: '工程造价服务', 72: '市场调查', 73: '广告业', 74: '广播', 75: '广播、电视、电影设备',
  3426. 76: '广播电视传输服务', 77: '废弃资源综合利用业', 78: '建筑涂料', 79: '建筑物、构筑物附属结构', 80: '建筑物拆除和场地准备活动', 81: '建筑装饰和装修业',
  3427. 82: '录音制作', 83: '影视节目制作', 84: '房地产中介服务', 85: '房地产开发经营', 86: '房地产租赁经营', 87: '房屋租赁', 88: '招标代理',
  3428. 89: '探矿、采矿、选矿和造块设备', 90: '政法、检测专用设备', 91: '教育服务', 92: '教育设备', 93: '文物及非物质文化遗产保护', 94: '文物和陈列品',
  3429. 95: '文艺创作与表演', 96: '文艺设备', 97: '新闻业', 98: '旅行社及相关服务', 99: '日杂用品', 100: '有色金属冶炼及压延产品', 101: '有色金属矿',
  3430. 102: '木材、板材等', 103: '木材采集和加工设备', 104: '机械设备', 105: '机械设备经营租赁', 106: '林业产品', 107: '林业服务', 108: '架线和管道工程建筑',
  3431. 109: '核工业专用设备', 110: '橡胶制品', 111: '殡葬服务', 112: '殡葬设备及用品', 113: '气象服务', 114: '水上交通运输设备', 115: '水上运输业',
  3432. 116: '水利和水运工程建筑', 117: '水工机械', 118: '水文服务', 119: '水资源管理', 120: '污水处理及其再生利用', 121: '汽车、摩托车修理与维护',
  3433. 122: '法律服务', 123: '洗染服务', 124: '测绘地理信息服务', 125: '海洋仪器设备', 126: '海洋工程建筑', 127: '海洋服务', 128: '消防设备',
  3434. 129: '清洁服务', 130: '渔业产品', 131: '渔业服务', 132: '炼焦和金属冶炼轧制设备', 133: '烟草加工设备', 134: '热力生产和供应', 135: '焦炭及其副产品',
  3435. 136: '煤炭采选产品', 137: '燃气生产和供应业', 138: '物业管理', 139: '特种用途动、植物', 140: '环保咨询', 141: '环境与生态监测检测服务',
  3436. 142: '环境污染防治设备', 143: '环境治理业', 144: '玻璃及其制品', 145: '理发及美容服务', 146: '生态保护', 147: '电信',
  3437. 148: '电力、城市燃气、蒸汽和热水、水', 149: '电力供应', 150: '电力工业专用设备', 151: '电力工程施工', 152: '电力生产', 153: '电子和通信测量仪器',
  3438. 154: '电工、电子专用生产设备', 155: '电影放映', 156: '电气安装', 157: '电气设备', 158: '电气设备修理', 159: '畜牧业服务', 160: '监控设备',
  3439. 161: '石油制品', 162: '石油和化学工业专用设备', 163: '石油和天然气开采产品', 164: '石油天然气开采专用设备', 165: '研究和试验发展', 166: '社会工作',
  3440. 167: '社会经济咨询', 168: '科技推广和应用服务业', 169: '科研、医疗、教育用房', 170: '管道和设备安装', 171: '粮油作物和饲料加工设备', 172: '纸、纸制品及印刷品',
  3441. 173: '纺织原料、毛皮、被服装具', 174: '纺织设备', 175: '绿化管理', 176: '缝纫、服饰、制革和毛皮加工设备', 177: '航空器及其配套设备', 178: '航空客货运输',
  3442. 179: '航空航天工业专用设备', 180: '节能环保工程施工', 181: '装卸搬运', 182: '计算机和办公设备维修', 183: '计算机设备', 184: '计量标准器具及量具、衡器',
  3443. 185: '货币处理专用设备', 186: '货币金融服务', 187: '质检技术服务', 188: '资本市场服务', 189: '车辆', 190: '边界勘界和联检专用设备', 191: '运行维护服务',
  3444. 192: '通信设备', 193: '通用设备修理', 194: '道路货物运输', 195: '邮政专用设备', 196: '邮政业', 197: '采矿业和制造业服务',
  3445. 198: '铁路、船舶、航空航天等运输设备修理', 199: '铁路、道路、隧道和桥梁工程建筑', 200: '铁路运输设备', 201: '防洪除涝设施管理', 202: '陶瓷制品',
  3446. 203: '雷达、无线电和卫星导航设备', 204: '非金属矿', 205: '非金属矿物制品工业专用设备', 206: '非金属矿物材料', 207: '食品加工专用设备', 208: '食品及加工盐',
  3447. 209: '餐饮业', 210: '饮料、酒精及精制茶', 211: '饮料加工设备', 212: '饲养动物及其产品', 213: '黑色金属冶炼及压延产品', 214: '黑色金属矿'}
  3448. self.industry_dic = {'专业施工': {'大类': '专业施工', '门类': '建筑业'},
  3449. '专用仪器仪表': {'大类': '专用设备', '门类': '零售批发'},
  3450. '专用设备修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'},
  3451. '互联网信息服务': {'大类': '互联网和相关服务', '门类': '信息传输、软件和信息技术服务业'},
  3452. '互联网安全服务': {'大类': '互联网和相关服务', '门类': '信息传输、软件和信息技术服务业'},
  3453. '互联网平台': {'大类': '互联网和相关服务', '门类': '信息传输、软件和信息技术服务业'},
  3454. '互联网接入及相关服务': {'大类': '互联网和相关服务', '门类': '信息传输、软件和信息技术服务业'},
  3455. '人力资源服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  3456. '人造原油': {'大类': '炼焦产品、炼油产品', '门类': '零售批发'},
  3457. '仓储业': {'大类': '装卸搬运和运输代理业', '门类': '交通运输、仓储和邮政业'},
  3458. '仪器仪表': {'大类': '通用设备', '门类': '零售批发'},
  3459. '仪器仪表修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'},
  3460. '会计、审计及税务服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  3461. '会议、展览及相关服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  3462. '住宅、商业用房': {'大类': '房屋建筑业', '门类': '建筑业'},
  3463. '体育场地设施管理': {'大类': '体育', '门类': '文化、体育和娱乐业'},
  3464. '体育组织': {'大类': '体育', '门类': '文化、体育和娱乐业'},
  3465. '体育设备': {'大类': '专用设备', '门类': '零售批发'},
  3466. '保险服务': {'大类': '保险业', '门类': '金融业'},
  3467. '信息处理和存储支持服务': {'大类': '软件和信息技术服务业', '门类': '信息传输、软件和信息技术服务业'},
  3468. '信息技术咨询服务': {'大类': '软件和信息技术服务业', '门类': '信息传输、软件和信息技术服务业'},
  3469. '信息系统集成和物联网技术服务': {'大类': '软件和信息技术服务业', '门类': '信息传输、软件和信息技术服务业'},
  3470. '修缮工程': {'大类': '修缮工程', '门类': '建筑业'},
  3471. '健康咨询': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  3472. '公路旅客运输': {'大类': '道路运输业', '门类': '交通运输、仓储和邮政业'},
  3473. '其他专业咨询与调查': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  3474. '其他专业技术服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  3475. '其他交通运输设备': {'大类': '专用设备', '门类': '零售批发'},
  3476. '其他公共设施管理': {'大类': '公共设施管理业', '门类': '水利、环境和公共设施管理业'},
  3477. '其他土木工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'},
  3478. '其他工程服务': {'大类': '工程服务', '门类': '科学研究和技术服务业'},
  3479. '其他建筑建材': {'大类': '建筑建材', '门类': '零售批发'},
  3480. '其他运输业': {'大类': '其他运输业', '门类': '交通运输、仓储和邮政业'},
  3481. '农业和林业机械': {'大类': '专用设备', '门类': '零售批发'},
  3482. '农业服务': {'大类': '农林牧副渔服务', '门类': '农林牧副渔服务'},
  3483. '农产品': {'大类': '农林牧渔业产品', '门类': '零售批发'},
  3484. '农副食品,动、植物油制品': {'大类': '食品、饮料和烟草原料', '门类': '零售批发'},
  3485. '出版业': {'大类': '新闻和出版业', '门类': '文化、体育和娱乐业'},
  3486. '办公消耗用品及类似物品': {'大类': '办公消耗用品及类似物品', '门类': '零售批发'},
  3487. '办公设备': {'大类': '通用设备', '门类': '零售批发'},
  3488. '化学原料及化学制品': {'大类': '基础化学品及相关产品', '门类': '零售批发'},
  3489. '化学纤维': {'大类': '基础化学品及相关产品', '门类': '零售批发'},
  3490. '化学药品和中药专用设备': {'大类': '专用设备', '门类': '零售批发'},
  3491. '医疗设备': {'大类': '专用设备', '门类': '零售批发'},
  3492. '医药品': {'大类': '医药品', '门类': '零售批发'},
  3493. '卫星传输服务': {'大类': '电信、广播电视和卫星传输服务', '门类': '信息传输、软件和信息技术服务业'},
  3494. '卫生': {'大类': '卫生', '门类': '卫生和社会工作'},
  3495. '印刷服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  3496. '图书和档案': {'大类': '图书和档案', '门类': '零售批发'},
  3497. '图书档案设备': {'大类': '通用设备', '门类': '零售批发'},
  3498. '图书馆与档案馆': {'大类': '文化艺术业', '门类': '文化、体育和娱乐业'},
  3499. '土地管理业': {'大类': '土地管理业', '门类': '水利、环境和公共设施管理业'},
  3500. '地质勘查': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  3501. '地震服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  3502. '场馆、站港用房': {'大类': '房屋建筑业', '门类': '建筑业'},
  3503. '城市公共交通运输': {'大类': '道路运输业', '门类': '交通运输、仓储和邮政业'},
  3504. '塑料制品、半成品及辅料': {'大类': '橡胶、塑料、玻璃和陶瓷制品', '门类': '零售批发'},
  3505. '天然石料': {'大类': '建筑建材', '门类': '零售批发'},
  3506. '娱乐设备': {'大类': '专用设备', '门类': '零售批发'},
  3507. '婚姻服务': {'大类': '居民服务业', '门类': '居民服务、修理和其他服务业'},
  3508. '安全保护服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  3509. '安全生产设备': {'大类': '专用设备', '门类': '零售批发'},
  3510. '家具用具': {'大类': '家具用具', '门类': '零售批发'},
  3511. '家用电器修理': {'大类': '机动车、电子产品和日用产品修理业', '门类': '居民服务、修理和其他服务业'},
  3512. '工业、生产用房': {'大类': '房屋建筑业', '门类': '建筑业'},
  3513. '工业与专业设计及其他专业技术服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  3514. '工矿工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'},
  3515. '工程技术与设计服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  3516. '工程机械': {'大类': '专用设备', '门类': '零售批发'},
  3517. '工程监理服务': {'大类': '工程服务', '门类': '科学研究和技术服务业'},
  3518. '工程评价服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  3519. '工程造价服务': {'大类': '工程服务', '门类': '科学研究和技术服务业'},
  3520. '市场调查': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  3521. '广告业': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  3522. '广播': {'大类': '广播、电视、电影和影视录音制作业', '门类': '文化、体育和娱乐业'},
  3523. '广播、电视、电影设备': {'大类': '通用设备', '门类': '零售批发'},
  3524. '广播电视传输服务': {'大类': '电信、广播电视和卫星传输服务', '门类': '信息传输、软件和信息技术服务业'},
  3525. '废弃资源综合利用业': {'大类': '废弃资源综合利用业', '门类': '废弃资源综合利用业'},
  3526. '建筑涂料': {'大类': '建筑建材', '门类': '零售批发'},
  3527. '建筑物、构筑物附属结构': {'大类': '建筑建材', '门类': '零售批发'},
  3528. '建筑物拆除和场地准备活动': {'大类': '建筑装饰和其他建筑业', '门类': '建筑业'},
  3529. '建筑装饰和装修业': {'大类': '建筑装饰和其他建筑业', '门类': '建筑业'},
  3530. '录音制作': {'大类': '广播、电视、电影和影视录音制作业', '门类': '文化、体育和娱乐业'},
  3531. '影视节目制作': {'大类': '广播、电视、电影和影视录音制作业', '门类': '文化、体育和娱乐业'},
  3532. '房地产中介服务': {'大类': '房地产业', '门类': '房地产业'},
  3533. '房地产开发经营': {'大类': '房地产业', '门类': '房地产业'},
  3534. '房地产租赁经营': {'大类': '房地产业', '门类': '房地产业'},
  3535. '房屋租赁': {'大类': '租赁业', '门类': '租赁和商务服务业'},
  3536. '招标代理': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  3537. '探矿、采矿、选矿和造块设备': {'大类': '专用设备', '门类': '零售批发'},
  3538. '政法、检测专用设备': {'大类': '专用设备', '门类': '零售批发'},
  3539. '教育服务': {'大类': '教育服务', '门类': '教育'},
  3540. '教育设备': {'大类': '专用设备', '门类': '零售批发'},
  3541. '文体设备和用品出租': {'大类': '租赁业', '门类': '租赁和商务服务业'},
  3542. '文物及非物质文化遗产保护': {'大类': '文化艺术业', '门类': '文化、体育和娱乐业'},
  3543. '文物和陈列品': {'大类': '文物和陈列品', '门类': '零售批发'},
  3544. '文艺创作与表演': {'大类': '文化艺术业', '门类': '文化、体育和娱乐业'},
  3545. '文艺设备': {'大类': '专用设备', '门类': '零售批发'},
  3546. '新闻业': {'大类': '新闻和出版业', '门类': '文化、体育和娱乐业'},
  3547. '旅行社及相关服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  3548. '日杂用品': {'大类': '日杂用品', '门类': '零售批发'},
  3549. '有色金属冶炼及压延产品': {'大类': '建筑建材', '门类': '零售批发'},
  3550. '有色金属矿': {'大类': '矿与矿物', '门类': '零售批发'},
  3551. '木材、板材等': {'大类': '建筑建材', '门类': '零售批发'},
  3552. '木材采集和加工设备': {'大类': '专用设备', '门类': '零售批发'},
  3553. '机械设备': {'大类': '通用设备', '门类': '零售批发'},
  3554. '机械设备经营租赁': {'大类': '租赁业', '门类': '租赁和商务服务业'},
  3555. '林业产品': {'大类': '农林牧渔业产品', '门类': '零售批发'},
  3556. '林业服务': {'大类': '农林牧副渔服务', '门类': '农林牧副渔服务'},
  3557. '架线和管道工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'},
  3558. '核工业专用设备': {'大类': '专用设备', '门类': '零售批发'},
  3559. '橡胶制品': {'大类': '橡胶、塑料、玻璃和陶瓷制品', '门类': '零售批发'},
  3560. '殡葬服务': {'大类': '居民服务业', '门类': '居民服务、修理和其他服务业'},
  3561. '殡葬设备及用品': {'大类': '专用设备', '门类': '零售批发'},
  3562. '气象服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  3563. '水上交通运输设备': {'大类': '专用设备', '门类': '零售批发'},
  3564. '水上运输业': {'大类': '水上运输业', '门类': '交通运输、仓储和邮政业'},
  3565. '水利和水运工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'},
  3566. '水工机械': {'大类': '专用设备', '门类': '零售批发'},
  3567. '水文服务': {'大类': '水利管理业', '门类': '水利、环境和公共设施管理业'},
  3568. '水资源管理': {'大类': '水利管理业', '门类': '水利、环境和公共设施管理业'},
  3569. '污水处理及其再生利用': {'大类': '水的生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'},
  3570. '汽车、摩托车修理与维护': {'大类': '机动车、电子产品和日用产品修理业', '门类': '居民服务、修理和其他服务业'},
  3571. '法律服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  3572. '洗染服务': {'大类': '居民服务业', '门类': '居民服务、修理和其他服务业'},
  3573. '测绘地理信息服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  3574. '海洋仪器设备': {'大类': '专用设备', '门类': '零售批发'},
  3575. '海洋工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'},
  3576. '海洋服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  3577. '消防设备': {'大类': '专用设备', '门类': '零售批发'},
  3578. '清洁服务': {'大类': '其他服务业', '门类': '居民服务、修理和其他服务业'},
  3579. '渔业产品': {'大类': '农林牧渔业产品', '门类': '零售批发'},
  3580. '渔业服务': {'大类': '农林牧副渔服务', '门类': '农林牧副渔服务'},
  3581. '炼焦和金属冶炼轧制设备': {'大类': '专用设备', '门类': '零售批发'},
  3582. '烟草加工设备': {'大类': '专用设备', '门类': '零售批发'},
  3583. '热力生产和供应': {'大类': '电力、热力生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'},
  3584. '焦炭及其副产品': {'大类': '炼焦产品、炼油产品', '门类': '零售批发'},
  3585. '煤炭采选产品': {'大类': '矿与矿物', '门类': '零售批发'},
  3586. '燃气生产和供应业': {'大类': '燃气生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'},
  3587. '物业管理': {'大类': '房地产业', '门类': '房地产业'},
  3588. '特种用途动、植物': {'大类': '农林牧渔业产品', '门类': '零售批发'},
  3589. '环保咨询': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  3590. '环境与生态监测检测服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  3591. '环境污染防治设备': {'大类': '专用设备', '门类': '零售批发'},
  3592. '环境治理业': {'大类': '生态保护和环境治理业', '门类': '水利、环境和公共设施管理业'},
  3593. '玻璃及其制品': {'大类': '橡胶、塑料、玻璃和陶瓷制品', '门类': '零售批发'},
  3594. '理发及美容服务': {'大类': '居民服务业', '门类': '居民服务、修理和其他服务业'},
  3595. '生态保护': {'大类': '生态保护和环境治理业', '门类': '水利、环境和公共设施管理业'},
  3596. '电信': {'大类': '电信、广播电视和卫星传输服务', '门类': '信息传输、软件和信息技术服务业'},
  3597. '电力、城市燃气、蒸汽和热水、水': {'大类': '电力、城市燃气、蒸汽和热水、水', '门类': '零售批发'},
  3598. '电力供应': {'大类': '电力、热力生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'},
  3599. '电力工业专用设备': {'大类': '专用设备', '门类': '零售批发'},
  3600. '电力工程施工': {'大类': '土木工程建筑业', '门类': '建筑业'},
  3601. '电力生产': {'大类': '电力、热力生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'},
  3602. '电子和通信测量仪器': {'大类': '通用设备', '门类': '零售批发'},
  3603. '电工、电子专用生产设备': {'大类': '专用设备', '门类': '零售批发'},
  3604. '电影放映': {'大类': '广播、电视、电影和影视录音制作业', '门类': '文化、体育和娱乐业'},
  3605. '电气安装': {'大类': '建筑安装业', '门类': '建筑业'},
  3606. '电气设备': {'大类': '通用设备', '门类': '零售批发'},
  3607. '电气设备修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'},
  3608. '畜牧业服务': {'大类': '农林牧副渔服务', '门类': '农林牧副渔服务'},
  3609. '监控设备': {'大类': '通用设备', '门类': '零售批发'},
  3610. '石油制品': {'大类': '炼焦产品、炼油产品', '门类': '零售批发'},
  3611. '石油和化学工业专用设备': {'大类': '专用设备', '门类': '零售批发'},
  3612. '石油和天然气开采产品': {'大类': '矿与矿物', '门类': '零售批发'},
  3613. '石油天然气开采专用设备': {'大类': '专用设备', '门类': '零售批发'},
  3614. '研究和试验发展': {'大类': '研究和试验发展', '门类': '科学研究和技术服务业'},
  3615. '社会工作': {'大类': '社会工作', '门类': '卫生和社会工作'},
  3616. '社会经济咨询': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
  3617. '科技推广和应用服务业': {'大类': '科技推广和应用服务业', '门类': '科学研究和技术服务业'},
  3618. '科研、医疗、教育用房': {'大类': '房屋建筑业', '门类': '建筑业'},
  3619. '管道和设备安装': {'大类': '建筑安装业', '门类': '建筑业'},
  3620. '粮油作物和饲料加工设备': {'大类': '专用设备', '门类': '零售批发'},
  3621. '纸、纸制品及印刷品': {'大类': '纸、纸制品及印刷品', '门类': '零售批发'},
  3622. '纺织原料、毛皮、被服装具': {'大类': '纺织原料、毛皮、被服装具', '门类': '零售批发'},
  3623. '纺织设备': {'大类': '专用设备', '门类': '零售批发'},
  3624. '绿化管理': {'大类': '公共设施管理业', '门类': '水利、环境和公共设施管理业'},
  3625. '缝纫、服饰、制革和毛皮加工设备': {'大类': '专用设备', '门类': '零售批发'},
  3626. '航空器及其配套设备': {'大类': '专用设备', '门类': '零售批发'},
  3627. '航空客货运输': {'大类': '航空运输业', '门类': '交通运输、仓储和邮政业'},
  3628. '航空航天工业专用设备': {'大类': '专用设备', '门类': '零售批发'},
  3629. '节能环保工程施工': {'大类': '土木工程建筑业', '门类': '建筑业'},
  3630. '装卸搬运': {'大类': '装卸搬运和运输代理业', '门类': '交通运输、仓储和邮政业'},
  3631. '计算机和办公设备维修': {'大类': '机动车、电子产品和日用产品修理业', '门类': '居民服务、修理和其他服务业'},
  3632. '计算机设备': {'大类': '通用设备', '门类': '零售批发'},
  3633. '计量标准器具及量具、衡器': {'大类': '通用设备', '门类': '零售批发'},
  3634. '货币处理专用设备': {'大类': '专用设备', '门类': '零售批发'},
  3635. '货币金融服务': {'大类': '货币金融服务', '门类': '金融业'},
  3636. '质检技术服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
  3637. '资本市场服务': {'大类': '资本市场服务', '门类': '金融业'},
  3638. '车辆': {'大类': '通用设备', '门类': '零售批发'},
  3639. '边界勘界和联检专用设备': {'大类': '专用设备', '门类': '零售批发'},
  3640. '运行维护服务': {'大类': '软件和信息技术服务业', '门类': '信息传输、软件和信息技术服务业'},
  3641. '通信设备': {'大类': '通用设备', '门类': '零售批发'},
  3642. '通用设备修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'},
  3643. '道路货物运输': {'大类': '道路运输业', '门类': '交通运输、仓储和邮政业'},
  3644. '邮政专用设备': {'大类': '专用设备', '门类': '零售批发'},
  3645. '邮政业': {'大类': '邮政业', '门类': '交通运输、仓储和邮政业'},
  3646. '采矿业和制造业服务': {'大类': '采矿业和制造业服务', '门类': '农林牧副渔服务'},
  3647. '铁路、船舶、航空航天等运输设备修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'},
  3648. '铁路、道路、隧道和桥梁工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'},
  3649. '铁路运输设备': {'大类': '专用设备', '门类': '零售批发'},
  3650. '防洪除涝设施管理': {'大类': '水利管理业', '门类': '水利、环境和公共设施管理业'},
  3651. '陶瓷制品': {'大类': '橡胶、塑料、玻璃和陶瓷制品', '门类': '零售批发'},
  3652. '雷达、无线电和卫星导航设备': {'大类': '通用设备', '门类': '零售批发'},
  3653. '非金属矿': {'大类': '矿与矿物', '门类': '零售批发'},
  3654. '非金属矿物制品工业专用设备': {'大类': '专用设备', '门类': '零售批发'},
  3655. '非金属矿物材料': {'大类': '建筑建材', '门类': '零售批发'},
  3656. '食品加工专用设备': {'大类': '专用设备', '门类': '零售批发'},
  3657. '食品及加工盐': {'大类': '食品、饮料和烟草原料', '门类': '零售批发'},
  3658. '餐饮业': {'大类': '餐饮业', '门类': '住宿和餐饮业'},
  3659. '饮料、酒精及精制茶': {'大类': '食品、饮料和烟草原料', '门类': '零售批发'},
  3660. '饮料加工设备': {'大类': '专用设备', '门类': '零售批发'},
  3661. '饲养动物及其产品': {'大类': '农林牧渔业产品', '门类': '零售批发'},
  3662. '黑色金属冶炼及压延产品': {'大类': '建筑建材', '门类': '零售批发'},
  3663. '黑色金属矿': {'大类': '矿与矿物', '门类': '零售批发'}}
  3664. self.sess = tf.Session(graph=tf.Graph())
  3665. self.get_model()
  3666. with open(os.path.dirname(__file__)+'/industry_rule_kw_json/tw_industry_keyword_org/tw_industry_keyword_org.json', 'r',
  3667. encoding='utf-8') as fp1:
  3668. self.json_data_industry = json.load(fp1)
  3669. with open(os.path.dirname(__file__)+'/industry_rule_kw_json/tw_company_classification_keyword/tw_company_classification_keyword.json', 'r',
  3670. encoding='utf-8') as fp2:
  3671. self.json_data_company = json.load(fp2)
  3672. with open(os.path.dirname(__file__)+'/industry_rule_kw_json/tw_custom_keyword/tw_custom_keyword.json', 'r', encoding='utf-8') as fp3:
  3673. self.json_data_custom = json.load(fp3)
    def get_model(self):
        """Load the industry-classification SavedModel into ``self.sess`` and bind its tensors.

        Side effects: sets ``self.title`` / ``self.project`` / ``self.product``
        (input placeholders) and ``self.outputs`` (prediction tensor), all
        resolved from the model's default serving signature.
        """
        with self.sess.as_default() as sess:
            with self.sess.graph.as_default():
                # Load the exported model tagged 'serve' from the sibling directory.
                meta_graph_def = tf.saved_model.loader.load(sess,
                                                            tags=['serve'],
                                                            export_dir=os.path.dirname(__file__)+'/industry_model')
                signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
                signature_def = meta_graph_def.signature_def
                # Resolve concrete tensors by the names recorded in the signature.
                self.title = sess.graph.get_tensor_by_name(signature_def[signature_key].inputs['title'].name)
                self.project = sess.graph.get_tensor_by_name(signature_def[signature_key].inputs['project'].name)
                self.product = sess.graph.get_tensor_by_name(signature_def[signature_key].inputs['product'].name)
                self.outputs = sess.graph.get_tensor_by_name(signature_def[signature_key].outputs['outputs'].name)
  3686. def text2array(self, text, tenderee='', maxSententLen=20):
  3687. tenderee = tenderee.replace('(', '(').replace(')', ')')
  3688. text = text.replace('(', '(').replace(')', ')')
  3689. text = re.sub(
  3690. '(废标|终止|综?合?评审|评标|开标|资审|履约|验收|成交|中标人?|中选人?|单一来源|合同|候选人|结果|变更|更正|答疑|澄清|意向|需求|采购|招标|询比?价|磋商|谈判|比选|比价|竞价|议价)的?(公告|预告|公示)?|关于为?|选取|定点|直接|邀请函?|通知书?|备案|公开|公示|公告|记录|竞争性',
  3691. '', text)
  3692. text = text.replace(tenderee, '')
  3693. text = ' ' if text=="" else text
  3694. words_docs_list = selffool.cut(text)
  3695. words_docs_list = [[it for it in l if re.search('^[\u4e00-\u9fa5]+$', it)][-maxSententLen:] for l in words_docs_list]
  3696. array = embedding(words_docs_list, shape=(len(words_docs_list), maxSententLen, 128))
  3697. return array
  3698. def process(self, title, project, product, tenderee):
  3699. return self.text2array(title, tenderee), self.text2array(project, tenderee), self.text2array(product)
  3700. def predict_model(self, title, project, product, tenderee=''):
  3701. title_array, project_array, product_array = self.process(title, project, product, tenderee)
  3702. rs = self.sess.run(self.outputs,
  3703. feed_dict={
  3704. self.title:title_array,
  3705. self.project:project_array,
  3706. self.product:product_array
  3707. }
  3708. )
  3709. pred = np.argmax(rs[0])
  3710. return self.id2lb[pred], rs[0][pred]
  3711. # # 返回top2 结果
  3712. # pred_list = np.argsort(-rs[0])
  3713. # return self.id2lb[pred_list[0]], self.id2lb[pred_list[1]], rs[0][pred_list[0]], rs[0][pred_list[1]]
    def predict_rule(self, doctitle, tenderee, win_tenderer, project_name, product):
        """Score industry categories with keyword rules.

        Three keyword tables are applied:
        1) industry keywords against the announcement text (title + project + product),
        2) company keywords against the winning bidder's name,
        3) custom keywords that only fire when a subject text (tenderee / winner /
           announcement text) matches a companion regex from ``custom_ind``.

        :param doctitle: announcement title (may be None)
        :param tenderee: tenderee name, stripped from the scored text (may be None)
        :param win_tenderer: winning bidder name (may be None)
        :param project_name: project name (may be None)
        :param product: product text (may be None)
        :return: (categories sorted by score desc, score strings, matched keywords);
                 ([""], [], []) when nothing matched
        """
        # Normalize None to '' so the concatenations below are safe.
        doctitle = doctitle if doctitle else ''
        tenderee = tenderee if tenderee else ''
        win_tenderer = win_tenderer if win_tenderer else ''
        project_name = project_name if project_name else ''
        product = product if product else ''
        # Announcement text with the tenderee name removed.
        text_ind = (doctitle + project_name + product).replace(tenderee, '')
        text_com = win_tenderer
        # +1 guards the divisions below against empty texts.
        length_ind_text = len(text_ind) + 1
        length_com_text = len(text_com) + 1
        # print(text)
        dic_res = {}    # industry category -> accumulated score
        score_lst = []  # score strings (filled after sorting)
        word_lst = []   # keywords that contributed to any score
        # --- pass 1: industry keywords over the announcement text ---
        if text_ind:
            # logging.info("data_ind%s"%str(_json_data_industry[0]))
            for data_industry in self.json_data_industry:
                industry = data_industry['xiaolei']
                key_word = data_industry['key_word']
                key_word_2 = data_industry['key_word2']
                power = float(data_industry['power']) if data_industry['power'] else 0
                # Score = rule weight * keyword coverage of the text.
                this_score = power * (text_ind.count(key_word) * len(key_word) / length_ind_text)
                if key_word_2:
                    # A secondary keyword, when configured, must also occur.
                    if text_ind.count(key_word_2) == 0:
                        this_score = 0
                if this_score > 0:
                    if industry in dic_res.keys():
                        dic_res[industry] += this_score
                    else:
                        dic_res[industry] = this_score
                    if key_word not in word_lst:
                        word_lst.append(key_word)
        # --- pass 2: company-name keywords over the winning bidder ---
        if text_com:
            for data_company in self.json_data_company:
                industry = data_company['industry_type']
                key_word = data_company['company_word']
                power = float(data_company['industry_rate']) if data_company['industry_rate'] else 0
                this_score = power * (text_com.count(key_word) * len(key_word) / length_com_text)
                if this_score > 0:
                    if industry in dic_res.keys():
                        dic_res[industry] += this_score
                    else:
                        dic_res[industry] = this_score
                    if key_word not in word_lst:
                        word_lst.append(key_word)
        # --- pass 3: custom keywords gated by (subject fields, regex, category set) ---
        if text_ind:
            # Each entry: [subject field names, subject regex, categories it may map, target category].
            custom_ind = [
                ['tenderee', '医院|疾病预防', ['设备', '系统', '器'], '医疗设备'],
                ['tenderee', '学校|大学|小学|中学|学院|幼儿园', ['设备', '器'], '教育设备'],
                ['tenderee', '学校|大学|小学|中学|学院|幼儿园|医院', ['工程'], '科研、医疗、教育用房'],
                ['tenderee', '供电局|电网|国网|电力|电厂|粤电', ['设备', '器', '物资'], '电力工业专用设备'],
                ['tenderee', '公安|法院|检察院', ['设备', '器'], '政法、检测专用设备'],
                ['tenderee', '^中铁|^中交|^中建|中国建筑', ['材料'], '其他建筑建材'],
                ['doctextcon', '信息技术服务|系统开发|信息化|信息系统', ['监理'], '信息技术咨询服务'],
                ['doctextcon', '工程', ['消防'], '专业施工'],
                ['doctextcon', '铁路|航空|船舶|航天|广铁', ['维修'], '铁路、船舶、航空航天等运输设备修理'],
                ['doctextcon', '设备|仪|器', ['租赁'], '机械设备经营租赁'],
                ['doctextcon', '交通|铁路|公路|道路|桥梁', ['工程'], '铁路、道路、隧道和桥梁工程建筑'],
                ['win_tenderer', '电力', ['设备', '器'], '电力工业专用设备'],
                ['win_tenderer', '信息|网络科技', ['系统'], '信息系统集成和物联网技术服务'],
                ['tenderee,doctextcon', '铁路|广铁|铁道', ['设备', '器', '物资', '材料', '铁路'], '铁路运输设备'],
            ]
            for data_custom in self.json_data_custom:
                industry_custom = data_custom['industry']
                key_word = data_custom['company_word']
                power = float(data_custom['industry_rate'])
                for k in range(len(custom_ind)):
                    # Assemble the subject text for this rule from the listed fields.
                    subject = ''
                    if 'tenderee' in custom_ind[k][0]:
                        subject += tenderee
                    if 'win_tenderer' in custom_ind[k][0]:
                        subject += win_tenderer
                    if 'doctextcon' in custom_ind[k][0]:
                        subject += text_ind
                    ptn = custom_ind[k][1]
                    # Rule fires only if the subject matches the regex AND the custom
                    # keyword's own category is in the rule's allowed set.
                    if re.search(ptn, subject) and industry_custom in custom_ind[k][2]:
                        industry = custom_ind[k][3]
                    else:
                        continue
                    this_score = power * (text_ind.count(key_word) * len(key_word) / len(subject))
                    if this_score > 0:
                        if industry in dic_res.keys():
                            dic_res[industry] += this_score
                        else:
                            dic_res[industry] = this_score
                        if key_word not in word_lst:
                            word_lst.append(key_word)
        # Rank categories by accumulated score, highest first.
        sort_res = sorted(dic_res.items(), key=lambda x: x[1], reverse=True)
        lst_res = [s[0] for s in sort_res]
        score_lst = [str(round(float(s[1]), 2)) for s in sort_res]
        if len(lst_res) > 0:
            return lst_res, score_lst, word_lst
        else:
            return [""], [], []
  3816. def predict_merge(self, pinmu_type, industry_lst):
  3817. '''
  3818. 通过一系列规则最终决定使用模型还是规则的结果
  3819. :param pinmu_type: 模型预测类别
  3820. :param industry_lst: 规则预测类别列表
  3821. :return:
  3822. '''
  3823. industry_type = industry_lst[0]
  3824. if industry_type == "":
  3825. return pinmu_type
  3826. if industry_type == '专用设备修理' and re.search('修理|维修|装修|修缮', pinmu_type):
  3827. final_type = pinmu_type
  3828. elif industry_type == '其他土木工程建筑' and re.search('工程|建筑|用房|施工|安装|质检|其他专业咨询与调查', pinmu_type):
  3829. final_type = pinmu_type
  3830. elif pinmu_type == '专用设备修理' and re.search('工程|修理', industry_type):
  3831. final_type = industry_type
  3832. elif pinmu_type == '信息系统集成和物联网技术服务' and re.search('卫星传输|信息处理和存储支持服务|信息技术咨询服务|运行维护服务|其他专业技术服务|医疗设备|医药品',
  3833. industry_type):
  3834. final_type = industry_type
  3835. elif industry_type == '仪器仪表' and re.search('仪器|器具|医疗设备', pinmu_type):
  3836. final_type = pinmu_type
  3837. elif industry_type == '医药品' and re.search('医疗设备', pinmu_type):
  3838. final_type = pinmu_type
  3839. elif industry_type == '医药品' and re.search('医疗设备', pinmu_type):
  3840. final_type = pinmu_type
  3841. elif re.search('设备', industry_type) and re.search('修理|维修', pinmu_type):
  3842. final_type = pinmu_type
  3843. elif industry_type == '社会工作' and re.search('工程', pinmu_type):
  3844. final_type = pinmu_type
  3845. elif industry_type == '信息系统集成和物联网技术服务' and re.search('信息处理|设备', pinmu_type):
  3846. final_type = pinmu_type
  3847. elif industry_type == '研究和试验发展' and re.search('其他专业咨询与调查|质检技术服务|信息系统集成|其他工程服务', pinmu_type):
  3848. final_type = pinmu_type
  3849. elif industry_type == '其他专业咨询与调查' and re.search('工程造价服务', pinmu_type):
  3850. final_type = pinmu_type
  3851. elif industry_type == '广告业' and re.search('印刷服务|影视节目制作|信息系统', pinmu_type):
  3852. final_type = pinmu_type
  3853. elif industry_type == '清洁服务' and re.search('工程|环境污染防治设备|修理', pinmu_type):
  3854. final_type = pinmu_type
  3855. elif industry_type == '其他公共设施管理' and re.search('信息系统', pinmu_type):
  3856. final_type = pinmu_type
  3857. elif industry_type == '其他专业技术服务' and re.search('工程技术与设计服务|质检技术服务|环境与生态监测检测服务', pinmu_type):
  3858. final_type = pinmu_type
  3859. elif industry_type == '机械设备经营租赁' and re.search('电信', pinmu_type):
  3860. final_type = pinmu_type
  3861. elif industry_type == '货币金融服务' and re.search('信息系统集成和物联网技术服务', pinmu_type):
  3862. final_type = pinmu_type
  3863. elif industry_type == '体育场地设施管理' and re.search('体育设备', pinmu_type):
  3864. final_type = pinmu_type
  3865. elif industry_type == '安全保护服务' and re.search('信息系统|监控设备|互联网安全服务', pinmu_type):
  3866. final_type = pinmu_type
  3867. elif industry_type == '互联网接入及相关服务' and re.search('通信设备', pinmu_type):
  3868. final_type = pinmu_type
  3869. elif industry_type == '卫生' and re.search('医疗设备|信息系统', pinmu_type):
  3870. final_type = pinmu_type
  3871. elif pinmu_type == '研究和试验发展' and re.search('其他工程服务', industry_type):
  3872. final_type = industry_type
  3873. elif pinmu_type == '办公设备' and re.search('教育设备', industry_type):
  3874. final_type = industry_type
  3875. elif re.search('车辆|机械设备经营租赁', pinmu_type) and re.search('公路旅客运输', industry_type):
  3876. final_type = industry_type
  3877. elif len(industry_lst) > 1 and pinmu_type == industry_lst[1] and re.search('会计|法律|物业|家具|印刷|互联网安全',
  3878. industry_type) == None \
  3879. and re.search('其他|人力资源服务', pinmu_type) == None:
  3880. final_type = pinmu_type
  3881. elif industry_type != "":
  3882. final_type = industry_type
  3883. else:
  3884. final_type = pinmu_type
  3885. return final_type
  3886. def predict(self, title, project, product, prem):
  3887. def get_ree_win(prem):
  3888. tenderee = ""
  3889. win_tenderer = ""
  3890. try:
  3891. for v in prem[0]['prem'].values():
  3892. for link in v['roleList']:
  3893. if link['role_name'] == 'tenderee' and tenderee == "":
  3894. tenderee = link['role_text']
  3895. elif link['role_name'] == 'win_tenderer' and win_tenderer == "":
  3896. win_tenderer = link['role_text']
  3897. except Exception as e:
  3898. print('解析prem 获取招标人、中标人出错')
  3899. return tenderee, win_tenderer
  3900. tenderee, win_tenderer = get_ree_win(prem)
  3901. result_model, prob = self.predict_model(title, project, product, tenderee)
  3902. industry_lst, score_lst, word_lst = self.predict_rule(title, tenderee, win_tenderer, project, product)
  3903. final_type = self.predict_merge(result_model, industry_lst)
  3904. # print('模型:%s;规则:%s;最终:%s'%(result_model, industry_lst[0], final_type))
  3905. # return {'industry': final_type}
  3906. return {'industry': {
  3907. 'class_name': final_type,
  3908. 'subclass': self.industry_dic[final_type]['大类'],
  3909. 'class': self.industry_dic[final_type]['门类']
  3910. }
  3911. }
class DistrictPredictor():
    """Predict the administrative district (province / city / district) of a document.

    A pickled dictionary of districts (short name, full name, weight, type,
    area path) drives regex matching over the document's text; candidate hits
    are scored and aggregated hierarchically with pandas.
    """
    def __init__(self):
        with open(os.path.dirname(__file__)+'/district_dic.pkl', 'rb') as f:
            dist_dic = pickle.load(f)
        # Alternation patterns of all short/full district names, longest first
        # so that regex matching prefers the most specific name.
        short_name = '|'.join(sorted(set([v['简称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True))
        full_name = '|'.join(sorted(set([v['全称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True))
        # Reverse indexes: name -> list of district ids (names are not unique).
        short2id = {}
        full2id = {}
        for k, v in dist_dic.items():
            if v['简称'] not in short2id:
                short2id[v['简称']] = [k]
            else:
                short2id[v['简称']].append(k)
            if v['全称'] not in full2id:
                full2id[v['全称']] = [k]
            else:
                full2id[v['全称']].append(k)
        self.dist_dic = dist_dic
        self.short_name = short_name
        self.full_name = full_name
        self.short2id = short2id
        self.full2id = full2id
    def predict(self, project_name, prem, title, list_articles, web_source_name = ""):
        '''
        First match on project_name + tenderee + tenderee_address; if the
        province or city is still missing, fall back to title + content.

        :param project_name: project name text
        :param prem: extraction result holding role entities (tenderee etc.)
        :param title: document title
        :param list_articles: articles; list_articles[0].content is the body text
        :param web_source_name: source-site name (weak evidence, low weight)
        :return: {'district': {'area', 'province', 'city', 'district'}}
        '''
        def get_ree_addr(prem):
            # Extract the first tenderee and its address from prem; best-effort.
            tenderee = ""
            tenderee_address = ""
            try:
                for v in prem[0]['prem'].values():
                    for link in v['roleList']:
                        if link['role_name'] == 'tenderee' and tenderee == "":
                            tenderee = link['role_text']
                            # NOTE(review): assumes every tenderee role carries an
                            # 'address' key — a missing key is swallowed by the except.
                            tenderee_address = link['address']
            except Exception as e:
                print('解析prem 获取招标人、及地址出错')
            return tenderee, tenderee_address
        def get_area(text, web_source_name):
            # Collect scored [id, score, province, city, district] rows, then
            # aggregate province -> city -> district with pandas.
            score_l = []
            id_set = set()
            if re.search(self.short_name, text):
                # Full-name hits: score by name coverage plus the district weight.
                for it in re.finditer(self.full_name, text):
                    name = it.group(0)
                    score = len(name) / len(text)
                    for _id in self.full2id[name]:
                        # Pad the area path to [province, city, district].
                        area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
                        # score_l.append([_id, score] + area)
                        w = self.dist_dic[_id]['权重']
                        score_l.append([_id, score + w] + area)
                flag = 0
                # Short-name hits: skip matches that are followed by street/river/
                # landmark suffixes (likely part of an address, not a district).
                for it in re.finditer(self.short_name, text):
                    if it.end() < len(text) and re.search('^(村|镇|街|路|江|河|湖|北路|南路|东路|大道|社区)', text[it.end():]) == None:
                        name = it.group(0)
                        # Later positions in the text score higher.
                        score = (it.start() + len(name)) / len(text)
                        for _id in self.short2id[name]:
                            score2 = 0
                            w = self.dist_dic[_id]['权重']
                            _type = self.dist_dic[_id]['类型']
                            area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
                            # NOTE(review): magic province codes and type constants
                            # (10/20/30) come from district_dic.pkl — semantics not
                            # visible here; verify against the data file.
                            if area[0] in ['2', '16', '20', '30']:
                                _type += 10
                                score2 += w
                            if _id not in id_set:
                                # First occurrence of this district gets a type bonus.
                                if _type == 20:
                                    type_w = 3
                                elif _type == 30:
                                    type_w = 2
                                else:
                                    type_w = 1
                                id_set.add(_id)
                                score2 += w * type_w
                            score_l.append([_id, score * w + score2] + area)
                # NOTE(review): 'flag' is never set to 1 — dead debug scaffolding.
                if flag == 1:
                    pass
                    # print('score', score)
            # Source-site name is weak evidence (0.2 weight factor); company-like
            # names are ignored entirely.
            if re.search('公司', web_source_name) == None:
                for it in re.finditer(self.short_name, web_source_name):
                    name = it.group(0)
                    for _id in self.short2id[name]:
                        area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
                        w = self.dist_dic[_id]['权重']
                        score = w * 0.2
                        score_l.append([_id, score] + area)
            area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知'}
            if len(score_l) == 0:
                return {'district': area_dic}
            else:
                df = pd.DataFrame(score_l, columns=['id', 'score', 'province', 'city', 'district'])
                # Aggregate scores per province; highest-scoring province wins.
                df_pro = df.groupby('province').sum().sort_values(by=['score'], ascending=False)
                pro_id = df_pro.index[0]
                if df_pro.loc[pro_id, 'score'] < 0.1:  # discard province scores below 0.1
                    # print('评分低于0.1', df_pro.loc[pro_id, 'score'], self.dist_dic[pro_id]['地区'])
                    return {'district': area_dic}
                area_dic['province'] = self.dist_dic[pro_id]['地区']
                area_dic['area'] = self.dist_dic[pro_id]['大区']
                # Drill down: city within the winning province, then district within city.
                df = df[df['city'] != ""]
                df = df[df['province'] == pro_id]
                if len(df) > 0:
                    df_city = df.groupby('city').sum().sort_values(by=['score'], ascending=False)
                    city_id = df_city.index[0]
                    area_dic['city'] = self.dist_dic[city_id]['地区']
                    df = df[df['district'] != ""]
                    df = df[df['city'] == city_id]
                    if len(df) > 0:
                        df_dist = df.groupby('district').sum().sort_values(by=['score'], ascending=False)
                        dist_id = df_dist.index[0]
                        area_dic['district'] = self.dist_dic[dist_id]['地区']
                # print(area_dic)
                return {'district': area_dic}
        tenderee, tenderee_address = get_ree_addr(prem)
        project_name = str(project_name).replace(str(tenderee), '')
        text1 = "{} {} {}".format(project_name, tenderee, tenderee_address)
        web_source_name = str(web_source_name)  # cast: some callers pass non-string values
        # Blank out words that merely contain district names (e.g. 合肥 inside 复合肥)
        # to avoid false region matches.
        text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)
        rs = get_area(text1, web_source_name)
        if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
            # Fall back to title + body (head and tail only for long documents).
            text2 = title + list_articles[0].content if len(list_articles[0].content)<2000 else title + list_articles[0].content[:1000] + list_articles[0].content[-1000:]
            text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
            rs2 = get_area(text2, web_source_name)
            # Only adopt the fallback if it adds information without contradicting.
            if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
                rs = rs2
            elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':
                rs = rs2
        return rs
def getSavedModel():
    """Load the keras form-classification model from h5 and export it as a TF SavedModel."""
    #predictor = FormPredictor()
    graph = tf.Graph()
    with graph.as_default():
        model = tf.keras.models.load_model("../form/model/model_form.model_item.hdf5",custom_objects={"precision":precision,"recall":recall,"f1_score":f1_score})
        #print(tf.graph_util.remove_training_nodes(model))
        # Export with named input/output signatures for serving.
        tf.saved_model.simple_save(
            tf.keras.backend.get_session(),
            "./h5_savedmodel/",
            inputs={"image": model.input},
            outputs={"scores": model.output}
        )
def getBiLSTMCRFModel(MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
    '''
    Build a BiLSTM-CRF sequence-labelling model (functional Keras API).

    :param MAX_LEN: unused here; the input accepts variable-length sequences
    :param vocab: vocabulary; len(vocab) sets the embedding input dimension
    :param EMBED_DIM: embedding dimension
    :param BiRNN_UNITS: total BiLSTM units (each direction gets half)
    :param chunk_tags: label set; len(chunk_tags) sets the output dimension
    :param weights: optional pretrained embedding matrix (kept trainable)
    :return: compiled Keras model using the CRF loss and accuracy
    '''
    input = layers.Input(shape=(None,),dtype="int32")
    # Use pretrained embeddings when provided; mask_zero skips padding tokens.
    if weights is not None:
        embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True,weights=[weights],trainable=True)(input)
    else:
        embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True)(input)
    bilstm = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2,return_sequences=True))(embedding)
    bilstm_dense = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(bilstm)
    crf = CRF(len(chunk_tags),sparse_target=True)
    crf_out = crf(bilstm_dense)
    # NOTE(review): input=/output= are legacy keras kwargs; newer keras requires inputs=/outputs=.
    model = models.Model(input=[input],output = [crf_out])
    model.summary()
    model.compile(optimizer = 'adam', loss = crf.loss_function, metrics = [crf.accuracy])
    return model
  4079. import h5py
def h5_to_graph(sess,graph,h5file):
    """Copy keras weights from *h5file* into identically-named tensors of *graph*.

    Walks the 'model_weights' group of the h5 file, collects every dataset as a
    (name, value) pair, and assigns each value to the graph tensor of the same
    name via ``tf.assign`` run in *sess*.
    """
    f = h5py.File(h5file,'r')  # open the h5 file
    def getValue(v):
        # Resolve a variable's value by walking the h5 group along its name path.
        _value = f["model_weights"]
        list_names = str(v.name).split("/")
        for _index in range(len(list_names)):
            print(v.name)
            if _index==1:
                # keras nests a group under the layer's own name — descend twice.
                _value = _value[list_names[0]]
            _value = _value[list_names[_index]]
        return _value.value
    def _load_attributes_from_hdf5_group(group, name):
        """Loads attributes of the specified name from the HDF5 group.

        This method deals with an inherent problem
        of HDF5 file which is not able to store
        data larger than HDF5_OBJECT_HEADER_LIMIT bytes.

        # Arguments
            group: A pointer to a HDF5 group.
            name: A name of the attributes to load.
        # Returns
            data: Attributes data.
        """
        if name in group.attrs:
            data = [n.decode('utf8') for n in group.attrs[name]]
        else:
            # Attribute was chunked ('name0', 'name1', ...) — reassemble it.
            data = []
            chunk_id = 0
            while ('%s%d' % (name, chunk_id)) in group.attrs:
                data.extend([n.decode('utf8')
                             for n in group.attrs['%s%d' % (name, chunk_id)]])
                chunk_id += 1
        return data
    def readGroup(gr,parent_name,data):
        # Recursively collect [path, value] for every dataset under gr.
        for subkey in gr:
            print(subkey)
            if parent_name!=subkey:
                if parent_name=="":
                    _name = subkey
                else:
                    _name = parent_name+"/"+subkey
            else:
                # Collapse the duplicated level keras nests under a layer's own name.
                _name = parent_name
            if str(type(gr[subkey]))=="<class 'h5py._hl.group.Group'>":
                readGroup(gr[subkey],_name,data)
            else:
                data.append([_name,gr[subkey].value])
                print(_name,gr[subkey].shape)
    layer_names = _load_attributes_from_hdf5_group(f["model_weights"], 'layer_names')
    list_name_value = []
    readGroup(f["model_weights"], "", list_name_value)
    '''
    for k, name in enumerate(layer_names):
        g = f["model_weights"][name]
        weight_names = _load_attributes_from_hdf5_group(g, 'weight_names')
        #weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
        for weight_name in weight_names:
            list_name_value.append([weight_name,np.asarray(g[weight_name])])
    '''
    for name_value in list_name_value:
        name = name_value[0]
        '''
        if re.search("dense",name) is not None:
            name = name[:7]+"_1"+name[7:]
        '''
        value = name_value[1]
        # Assign each stored value to the graph tensor with the same name.
        print(name,graph.get_tensor_by_name(name),np.shape(value))
        sess.run(tf.assign(graph.get_tensor_by_name(name),value))
  4147. def initialize_uninitialized(sess):
  4148. global_vars = tf.global_variables()
  4149. is_not_initialized = sess.run([tf.is_variable_initialized(var) for var in global_vars])
  4150. not_initialized_vars = [v for (v, f) in zip(global_vars, is_not_initialized) if not f]
  4151. adam_vars = []
  4152. for _vars in not_initialized_vars:
  4153. if re.search("Adam",_vars.name) is not None:
  4154. adam_vars.append(_vars)
  4155. print([str(i.name) for i in adam_vars]) # only for testing
  4156. if len(adam_vars):
  4157. sess.run(tf.variables_initializer(adam_vars))
def save_codename_model():
    """Rebuild the BiLSTM-CRF code/name model, restore checkpoint weights and export as SavedModel."""
    # filepath = "../projectCode/models/model_project_"+str(60)+"_"+str(200)+".hdf5"
    filepath = "../../dl_dev/projectCode/models_tf/59-L0.471516189943-F0.8802154826344823-P0.8789179683459191-R0.8815168335321886/model.ckpt"
    vocabpath = "../projectCode/models/vocab.pk"
    classlabelspath = "../projectCode/models/classlabels.pk"
    # vocab = load(vocabpath)
    # class_labels = load(classlabelspath)
    w2v_matrix = load('codename_w2v_matrix.pk')
    graph = tf.get_default_graph()
    with graph.as_default() as g:
        ''''''
        # model = getBiLSTMCRFModel(None, vocab, 60, 200, class_labels,weights=None)
        #model = models.load_model(filepath,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score,"CRF":CRF,"loss":CRF.loss_function})
        sess = tf.Session(graph=g)
        # sess = tf.keras.backend.get_session()
        # Rebuild the network so the checkpoint variables have tensors to restore into.
        char_input, logits, target, keepprob, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
        #with sess.as_default():
        sess.run(tf.global_variables_initializer())
        # print(sess.run("time_distributed_1/kernel:0"))
        # model.load_weights(filepath)
        saver = tf.train.Saver()
        saver.restore(sess, filepath)
        # print("logits",sess.run(logits))
        # print("#",sess.run("time_distributed_1/kernel:0"))
        # x = load("codename_x.pk")
        #y = model.predict(x)
        # y = sess.run(model.output,feed_dict={model.input:x})
        # for item in np.argmax(y,-1):
        #     print(item)
        # Export with the placeholders and outputs needed at serving time.
        tf.saved_model.simple_save(
            sess,
            "./codename_savedmodel_tf/",
            inputs={"inputs": char_input,
                    "inputs_length":length,
                    'keepprob':keepprob},
            outputs={"logits": logits,
                     "trans":trans}
        )
def save_role_model():
    '''
    @summary: export the role model as a SavedModel for deployment on the PAI platform
    '''
    model_role = PREMPredict().model_role
    with model_role.graph.as_default():
        model = model_role.getModel()
        sess = tf.Session(graph=model_role.graph)
        print(type(model.input))
        sess.run(tf.global_variables_initializer())
        # Copy the h5 weights into the live graph.
        h5_to_graph(sess, model_role.graph, model_role.model_role_file)
        # NOTE(review): getModel() is called a second time here — confirm whether
        # this rebuild is intentional or a leftover.
        model = model_role.getModel()
        tf.saved_model.simple_save(sess,
                                   "./role_savedmodel/",
                                   inputs={"input0":model.input[0],
                                           "input1":model.input[1],
                                           "input2":model.input[2]},
                                   outputs={"outputs":model.output}
                                   )
def save_money_model():
    """Load the money-extraction keras model and export it as a SavedModel."""
    model_file = os.path.dirname(__file__)+"/../money/models/model_money_word.h5"
    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session(graph=graph)
        with sess.as_default():
            # model = model_money.getModel()
            # model.summary()
            # sess.run(tf.global_variables_initializer())
            # h5_to_graph(sess, model_money.graph, model_money.model_money_file)
            model = models.load_model(model_file,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
            model.summary()
            print(model.weights)
            tf.saved_model.simple_save(sess,
                                       "./money_savedmodel2/",
                                       inputs = {"input0":model.input[0],
                                                 "input1":model.input[1],
                                                 "input2":model.input[2]},
                                       outputs = {"outputs":model.output}
                                       )
def save_person_model():
    """Export the person model as a SavedModel, sanity-checking predictions on pickled sample input."""
    model_person = EPCPredict().model_person
    with model_person.graph.as_default():
        # Sample batch for a smoke-test prediction before export.
        x = load("person_x.pk")
        _data = np.transpose(np.array(x),(1,0,2,3))
        model = model_person.getModel()
        sess = tf.Session(graph=model_person.graph)
        with sess.as_default():
            sess.run(tf.global_variables_initializer())
            model_person.load_weights()
            #h5_to_graph(sess, model_person.graph, model_person.model_person_file)
            predict_y = sess.run(model.output,feed_dict={model.input[0]:_data[0],model.input[1]:_data[1]})
            #predict_y = model.predict([_data[0],_data[1]])
            print(np.argmax(predict_y,-1))
            tf.saved_model.simple_save(sess,
                                       "./person_savedmodel/",
                                       inputs={"input0":model.input[0],
                                               "input1":model.input[1]},
                                       outputs = {"outputs":model.output})
def save_form_model():
    """Export the form 'item' model as a SavedModel after loading its h5 weights."""
    model_form = FormPredictor()
    with model_form.graph.as_default():
        model = model_form.getModel("item")
        sess = tf.Session(graph=model_form.graph)
        sess.run(tf.global_variables_initializer())
        # Copy the h5 weights into the live graph.
        h5_to_graph(sess, model_form.graph, model_form.model_file_item)
        tf.saved_model.simple_save(sess,
                                   "./form_savedmodel/",
                                   inputs={"inputs":model.input},
                                   outputs = {"outputs":model.output})
  4265. def save_codesplit_model():
  4266. filepath_code = "../../dl_dev/projectCode/models/model_code.hdf5"
  4267. graph = tf.Graph()
  4268. with graph.as_default():
  4269. model_code = models.load_model(filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
  4270. sess = tf.Session()
  4271. sess.run(tf.global_variables_initializer())
  4272. h5_to_graph(sess, graph, filepath_code)
  4273. tf.saved_model.simple_save(sess,
  4274. "./codesplit_savedmodel/",
  4275. inputs={"input0":model_code.input[0],
  4276. "input1":model_code.input[1],
  4277. "input2":model_code.input[2]},
  4278. outputs={"outputs":model_code.output})
  4279. def save_timesplit_model():
  4280. filepath = '../time/model_label_time_classify.model.hdf5'
  4281. with tf.Graph().as_default() as graph:
  4282. time_model = models.load_model(filepath, custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
  4283. with tf.Session() as sess:
  4284. sess.run(tf.global_variables_initializer())
  4285. h5_to_graph(sess, graph, filepath)
  4286. tf.saved_model.simple_save(sess,
  4287. "./timesplit_model/",
  4288. inputs={"input0":time_model.input[0],
  4289. "input1":time_model.input[1]},
  4290. outputs={"outputs":time_model.output})
  4291. if __name__=="__main__":
  4292. #save_role_model()
  4293. # save_codename_model()
  4294. # save_money_model()
  4295. #save_person_model()
  4296. #save_form_model()
  4297. #save_codesplit_model()
  4298. # save_timesplit_model()
  4299. '''
  4300. # with tf.Session(graph=tf.Graph()) as sess:
  4301. # from tensorflow.python.saved_model import tag_constants
  4302. # meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], "./person_savedModel")
  4303. # graph = tf.get_default_graph()
  4304. # signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
  4305. # signature = meta_graph_def.signature_def
  4306. # input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name)
  4307. # input1 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input1"].name)
  4308. # outputs = sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name)
  4309. # x = load("person_x.pk")
  4310. # _data = np.transpose(x,[1,0,2,3])
  4311. # y = sess.run(outputs,feed_dict={input0:_data[0],input1:_data[1]})
  4312. # print(np.argmax(y,-1))
  4313. '''
  4314. MAX_LEN = 1000
  4315. vocabpath = os.path.dirname(__file__) + "/codename_vocab.pk"
  4316. vocab = load(vocabpath)
  4317. word2index = dict((w, i) for i, w in enumerate(np.array(vocab)))
  4318. index_unk = word2index.get("<unk>")
  4319. sentence = "招标人:广州市重点公共建设项目管理中心,联系人:李工,联系方式:020-22905689,招标代理:广东重工建设监理有限公司," \
  4320. "代理联系人:薛家伟,代理联系方式:13535014481,招标监督机构:广州市重点公共建设项目管理中心,监督电话:020-22905690," \
  4321. "备注:以上为招标公告简要描述,招标公告详细信息请查看“招标公告”附件,"
  4322. sentence = sentence*5
  4323. list_sentence = [sentence]*200
  4324. # print(list_sentence)
  4325. x = [[word2index.get(word, index_unk) for word in sentence] for sentence in
  4326. list_sentence]
  4327. x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x]
  4328. # print(x_len)
  4329. x = pad_sequences(x, maxlen=MAX_LEN, padding="post", truncating="post")
  4330. requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len},
  4331. verify=True)
  4332. # predict_y = json.loads(requests_result.text)['result']
  4333. print("cost_time:", json.loads(requests_result.text)['cost_time'])
  4334. print(MAX_LEN, len(sentence), len(list_sentence))
  4335. requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len},
  4336. verify=True)
  4337. # predict_y = json.loads(requests_result.text)['result']
  4338. print("cost_time:", json.loads(requests_result.text)['cost_time'])
  4339. print(MAX_LEN, len(sentence), len(list_sentence))