12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
7577857795780578157825783578457855786578757885789579057915792579357945795579657975798579958005801580258035804580558065807580858095810581158125813581458155816581758185819582058215822582358245825582658275828582958305831583258335834583558365837583858395840584158425843584458455846584758485849585058515852 |
- '''
- Created on 2018年12月26日
- @author: User
- '''
- import os
- import sys
- from BiddingKG.dl.common.nerUtils import *
- sys.path.append(os.path.abspath("../.."))
- # from keras.engine import topology
- # from keras import models
- # from keras import layers
- # from keras_contrib.layers.crf import CRF
- # from keras.preprocessing.sequence import pad_sequences
- # from keras import optimizers,losses,metrics
- from BiddingKG.dl.common.Utils import *
- from BiddingKG.dl.interface.modelFactory import *
- import tensorflow as tf
- import pandas as pd
- from BiddingKG.dl.product.data_util import decode, process_data
- from BiddingKG.dl.interface.Entitys import Entity
- from BiddingKG.dl.complaint.punish_predictor import Punish_Extract
- from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money
- from bs4 import BeautifulSoup
- import copy
- import calendar
- import datetime
- # import fool # 统一用 selffool ,阿里云上只有selffool 包
# Number of TF op-parallelism threads, read from the environment
# (0 = let TensorFlow decide).  Kept at module level for other readers.
cpu_num = int(os.environ.get("CPU_NUM", 0))

# Session configuration handed to every TF-backed predictor.
# NOTE(review): the original built a tf.ConfigProto limiting inter/intra-op
# threads to cpu_num (with log_device_placement=True) and then immediately
# overwrote it with None; that dead construction is removed here.  Effective
# behavior is unchanged: predictors receive config=None (TF defaults).
sess_config = None

# Pre-built set of known agency names, loaded once at import time.
file = os.path.dirname(__file__) + '/agency_set.pkl'
with open(file, 'rb') as f:
    agency_set = pickle.load(f)
from threading import RLock

# One lazily-created predictor per extraction task.  Each entry holds the
# singleton instance (None until first use) and its own RLock so concurrent
# first-time construction happens only once.
_PREDICTOR_TYPES = [
    "codeName", "prem", "epc", "roleRule", "roleRuleFinal",
    "tendereeRuleRecall", "form", "time", "punish", "product",
    "product_attrs", "channel", "deposit_payment_way", "total_unit_money",
    "industry", "rolegrade", "moneygrade", "district", "tableprem",
    "candidate",
]
dict_predictor = {
    name: {"predictor": None, "Lock": RLock()} for name in _PREDICTOR_TYPES
}
def getPredictor(_type):
    """Return the process-wide singleton predictor for *_type*.

    The predictor is created lazily on first request, under the per-type
    lock in ``dict_predictor`` so concurrent callers construct it only once.

    Args:
        _type: one of the keys of ``dict_predictor``.

    Raises:
        NameError: if *_type* is unknown (exception type and message kept
            identical for backward compatibility with existing callers).
    """
    # Factory dispatch table replaces the original 20-branch if-chain (which
    # evaluated every branch even after a match).  TF-backed predictors share
    # the module-level session config.
    factories = {
        "codeName": lambda: CodeNamePredict(config=sess_config),
        "prem": lambda: PREMPredict(config=sess_config),
        "epc": lambda: EPCPredict(config=sess_config),
        "roleRule": RoleRulePredictor,
        "roleRuleFinal": RoleRuleFinalAdd,
        "tendereeRuleRecall": TendereeRuleRecall,
        "form": lambda: FormPredictor(config=sess_config),
        "time": lambda: TimePredictor(config=sess_config),
        "punish": Punish_Extract,
        "product": lambda: ProductPredictor(config=sess_config),
        "product_attrs": ProductAttributesPredictor,
        "channel": lambda: DocChannel(config=sess_config),
        "deposit_payment_way": DepositPaymentWay,
        "total_unit_money": TotalUnitMoney,
        "industry": IndustryPredictor,
        "rolegrade": RoleGrade,
        "moneygrade": MoneyGrade,
        "district": DistrictPredictor,
        "tableprem": TablePremExtractor,
        "candidate": CandidateExtractor,
    }
    if _type not in dict_predictor:
        raise NameError("no this type of predictor")
    entry = dict_predictor[_type]
    with entry["Lock"]:
        if entry["predictor"] is None:
            entry["predictor"] = factories[_type]()
    return entry["predictor"]
- # 编号名称模型
- class CodeNamePredict():
-
- def __init__(self,EMBED_DIM=None,BiRNN_UNITS=None,lazyLoad=getLazyLoad(),config=None):
-
- self.model = None
- self.MAX_LEN = None
- self.model_code = None
- if EMBED_DIM is None:
- self.EMBED_DIM = 60
- else:
- self.EMBED_DIM = EMBED_DIM
- if BiRNN_UNITS is None:
- self.BiRNN_UNITS = 200
- else:
- self.BiRNN_UNITS = BiRNN_UNITS
- self.filepath = os.path.dirname(__file__)+"/../projectCode/models/model_project_"+str(self.EMBED_DIM)+"_"+str(self.BiRNN_UNITS)+".hdf5"
- #self.filepath = "../projectCode/models/model_project_60_200_200ep017-loss6.456-val_loss7.852-val_acc0.969.hdf5"
- self.filepath_code = os.path.dirname(__file__)+"/../projectCode/models/model_code.hdf5"
- vocabpath = os.path.dirname(__file__)+"/codename_vocab.pk"
- classlabelspath = os.path.dirname(__file__)+"/codename_classlabels.pk"
- self.vocab = load(vocabpath)
- self.class_labels = load(classlabelspath)
-
- #生成提取编号和名称的正则
- id_PC_B = self.class_labels.index("PC_B")
- id_PC_M = self.class_labels.index("PC_M")
- id_PC_E = self.class_labels.index("PC_E")
- id_PN_B = self.class_labels.index("PN_B")
- id_PN_M = self.class_labels.index("PN_M")
- id_PN_E = self.class_labels.index("PN_E")
- self.PC_pattern = re.compile(str(id_PC_B)+str(id_PC_M)+"*"+str(id_PC_E))
- self.PN_pattern = re.compile(str(id_PN_B)+str(id_PN_M)+"*"+str(id_PN_E))
- # print("pc",self.PC_pattern)
- # print("pn",self.PN_pattern)
- self.word2index = dict((w,i) for i,w in enumerate(np.array(self.vocab)))
-
- self.inputs = None
- self.outputs = None
- self.sess_codename = tf.Session(graph=tf.Graph(),config=config)
- self.sess_codesplit = tf.Session(graph=tf.Graph(),config=config)
- self.inputs_code = None
- self.outputs_code = None
- if not lazyLoad:
- self.getModel()
- self.getModel_code()
-
-
-
- def getModel(self):
- '''
- @summary: 取得编号和名称模型
- '''
- if self.inputs is None:
- log("get model of codename")
- with self.sess_codename.as_default():
- with self.sess_codename.graph.as_default():
- meta_graph_def = tf.saved_model.loader.load(self.sess_codename, ["serve"], export_dir=os.path.dirname(__file__)+"/codename_savedmodel_tf")
- signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
- signature_def = meta_graph_def.signature_def
- self.inputs = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs"].name)
- self.inputs_length = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs_length"].name)
- self.keepprob = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["keepprob"].name)
- self.logits = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["logits"].name)
- self.trans = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["trans"].name)
- return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans
- else:
- return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans
- '''
- if self.model is None:
- self.model = self.getBiLSTMCRFModel(self.MAX_LEN, self.vocab, self.EMBED_DIM, self.BiRNN_UNITS, self.class_labels,weights=None)
- self.model.load_weights(self.filepath)
- return self.model
- '''
-
- def getModel_code(self):
- if self.inputs_code is None:
- log("get model of code")
- with self.sess_codesplit.as_default():
- with self.sess_codesplit.graph.as_default():
- meta_graph_def = tf.saved_model.loader.load(self.sess_codesplit, ["serve"], export_dir=os.path.dirname(__file__)+"/codesplit_savedmodel")
- signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
- signature_def = meta_graph_def.signature_def
- self.inputs_code = []
- self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name))
- self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name))
- self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name))
- self.outputs_code = self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
- self.sess_codesplit.graph.finalize()
- return self.inputs_code,self.outputs_code
- else:
- return self.inputs_code,self.outputs_code
- '''
- if self.model_code is None:
- log("get model of model_code")
- with self.sess_codesplit.as_default():
- with self.sess_codesplit.graph.as_default():
- self.model_code = models.load_model(self.filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
- return self.model_code
- '''
-
    def getBiLSTMCRFModel(self, MAX_LEN, vocab, EMBED_DIM, BiRNN_UNITS, chunk_tags, weights):
        '''
        model = models.Sequential()
        model.add(layers.Embedding(len(vocab), EMBED_DIM, mask_zero=True)) # Random embedding
        model.add(layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True)))
        crf = CRF(len(chunk_tags), sparse_target=True)
        model.add(crf)
        model.summary()
        model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
        return model
        '''
        # Build a BiLSTM-CRF sequence tagger: Embedding -> BiLSTM -> Dense -> CRF.
        # NOTE(review): `layers`, `models` and `CRF` come from the keras /
        # keras_contrib imports that are commented out at the top of this file,
        # so calling this method currently raises NameError — confirm it is
        # dead code (superseded by the SavedModel loaders above) before use.
        input = layers.Input(shape=(None,))
        # Optional pre-trained embedding weights; otherwise random init.
        if weights is not None:
            embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True,weights=[weights],trainable=True)(input)
        else:
            embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True)(input)
        bilstm = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2,return_sequences=True))(embedding)
        bilstm_dense = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(bilstm)
        crf = CRF(len(chunk_tags),sparse_target=True)
        crf_out = crf(bilstm_dense)
        model = models.Model(input=[input],output = [crf_out])
        model.summary()
        model.compile(optimizer = 'adam', loss = crf.loss_function, metrics = [crf.accuracy])
        return model
-
    # Rule-based completion of the bracket symbols around a code or name
- def fitDataByRule(self,data):
- symbol_dict = {"(":")",
- "(":")",
- "[":"]",
- "【":"】",
- ")":"(",
- ")":"(",
- "]":"[",
- "】":"【"}
- leftSymbol_pattern = re.compile("[\((\[【]")
- rightSymbol_pattern = re.compile("[\))\]】]")
- leftfinds = re.findall(leftSymbol_pattern,data)
- rightfinds = re.findall(rightSymbol_pattern,data)
- result = data
- if len(leftfinds)+len(rightfinds)==0:
- return data
- elif len(leftfinds)==len(rightfinds):
- return data
- elif abs(len(leftfinds)-len(rightfinds))==1:
- if len(leftfinds)>len(rightfinds):
- if symbol_dict.get(data[0]) is not None:
- result = data[1:]
- else:
- #print(symbol_dict.get(leftfinds[0]))
- result = data+symbol_dict.get(leftfinds[0])
- else:
- if symbol_dict.get(data[-1]) is not None:
- result = data[:-1]
- else:
- result = symbol_dict.get(rightfinds[0])+data
- return result
- def decode(self,logits, trans, sequence_lengths, tag_num):
- viterbi_sequences = []
- for logit, length in zip(logits, sequence_lengths):
- score = logit[:length]
- viterbi_seq, viterbi_score = viterbi_decode(score, trans)
- viterbi_sequences.append(viterbi_seq)
- return viterbi_sequences
-
    def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000):
        """Extract the project code(s) and project name of each document.

        For every document (a list of Sentence objects) this batches the
        sentences (longest first, at most MAX_AREA characters per batch),
        runs the BiLSTM-CRF tagger (remotely via USE_API or locally),
        validates tagged code candidates with the code-split model, scores
        name candidates by keyword/frequency, and finally falls back to
        regex recall when the models found nothing.  Accepted code/name
        entities are appended to the matching list in list_entitys.

        :param list_sentences: list of documents, each a list of Sentence objects
        :param list_entitys: parallel list of entity lists to append to (created if None)
        :param MAX_AREA: maximum number of characters processed per model batch
        :return: one {"code": [...], "name": str} dict per document
        """
        #@summary: extract the code list and name of every article
        # pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")
        # keyword pattern used to score name candidates (one point per hit)
        pattern_score = re.compile('建设项目|服务项目|工程项目|工程施工|建设工程|服务中心|基础设施|物业管理|工程设计|妇幼保健|咨询服务|管理系统|管理中心|改建工程|配套工程|公安局|幼儿园|管理局|使用权|办公楼|教育局|管理处|图书馆|经营权|项目|采购|工程|改造|服务|设备|中心|医院|系统|建设|监理|施工|维修|学院|安装|设计|关于|标段|招标|技术|询价|管理|学校|小学|中学|平台|提升|设施|检测|整治|社区|装修|政府|绿化|物资|租赁|地块|医疗|编制|公开|规划|监控|教育|维护|校区|治理|升级|安置|竞价|购置|评估|勘察|承包|实验|大学|材料|生产|耗材|招租|硬化|维保|用地|消防|审计|拍卖|物业|入围|养护|机关|企业|用房|出让|资产|分局|验收|宣传|处置|校园|研究|咨询|修缮|更换|装饰|劳务|保养|物流|出租|局|院')
        result = []
        index_unk = self.word2index.get("<unk>")
        # index_pad = self.word2index.get("<pad>")
        if list_entitys is None:
            list_entitys = [[] for _ in range(len(list_sentences))]
        for list_sentence,list_entity in zip(list_sentences,list_entitys):
            if len(list_sentence)==0:
                # NOTE(review): the empty-document result is wrapped in an extra
                # list ([{...}]) while the normal path appends the bare dict
                # (see result.append(item) below) — looks inconsistent; confirm
                # what callers expect before changing.
                result.append([{"code":[],"name":""}])
                continue
            doc_id = list_sentence[0].doc_id
            # sentences = []
            # for sentence in list_sentence:
            #     if len(sentence.sentence_text)>MAX_AREA:
            #         for _sentence_comma in re.split("[;;,\n]",sentence):
            #             _comma_index = 0
            #             while(_comma_index<len(_sentence_comma)):
            #                 sentences.append(_sentence_comma[_comma_index:_comma_index+MAX_AREA])
            #                 _comma_index += MAX_AREA
            #     else:
            #         sentences.append(sentence+"。")
            # longest sentences first so each batch has a tight MAX_LEN
            list_sentence.sort(key=lambda x:len(x.sentence_text),reverse=True)
            _begin_index = 0

            item = {"code":[],"name":""}
            code_set = set()
            dict_name_freq_score = dict()
            while(True):
                MAX_LEN = len(list_sentence[_begin_index].sentence_text)
                if MAX_LEN>MAX_AREA:
                    MAX_LEN = MAX_AREA
                # batch size: as many sentences as fit into MAX_AREA characters
                _LEN = MAX_AREA//MAX_LEN
                # run the BiLSTM-CRF tagger on this batch
                x = [[self.word2index.get(word,index_unk)for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
                # x = [[getIndexOfWord(word) for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
                x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x]
                x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
                if USE_API:
                    requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len},verify=True)
                    predict_y = json.loads(requests_result.text)['result']
                    # print("cost_time:", json.loads(requests_result.text)['cost_time'])
                    # print(MAX_LEN,_LEN,_begin_index)
                else:
                    with self.sess_codename.as_default():
                        t_input,t_input_length,t_keepprob,t_logits,t_trans = self.getModel()
                        _logits,_trans = self.sess_codename.run([t_logits,t_trans],feed_dict={t_input:x,
                                                                                              t_input_length:x_len,
                                                                                              t_keepprob:1.0})
                        predict_y = self.decode(_logits,_trans,x_len,7)
                # print('==========',_logits)
                '''
                for item11 in np.argmax(predict_y,-1):
                    print(item11)
                print(predict_y)
                '''
                # print(predict_y)
                for sentence,predict in zip(list_sentence[_begin_index:_begin_index+_LEN],np.array(predict_y)):
                    pad_sentence = sentence.sentence_text[:MAX_LEN]
                    # tag sequence as one string so spans can be found by regex
                    join_predict = "".join([str(s) for s in predict])
                    # print(pad_sentence)
                    # print(join_predict)
                    code_x = []
                    code_text = []
                    pre_text = []
                    temp_entitys = []
                    for iter in re.finditer(self.PC_pattern,join_predict):
                        # take up to 40 chars of left/right context for the code classifier
                        get_len = 40
                        if iter.span()[0]<get_len:
                            begin = 0
                        else:
                            begin = iter.span()[0]-get_len
                        end = iter.span()[1]+get_len
                        code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
                        code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]].replace(",", ""))
                        pre_text.append(pad_sentence[begin:iter.span()[0]])
                        _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=pad_sentence[iter.span()[0]:iter.span()[1]].replace(",",""),entity_type="code",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
                        temp_entitys.append(_entity)
                    #print("code",code_text)
                    if len(code_x)>0:
                        code_x = np.transpose(np.array(code_x,dtype=np.float32),(1,0,2,3))
                        if USE_PAI_EAS:
                            request = tf_predict_pb2.PredictRequest()
                            request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
                            request.inputs["input0"].array_shape.dim.extend(np.shape(code_x[0]))
                            request.inputs["input0"].float_val.extend(np.array(code_x[0],dtype=np.float64).reshape(-1))
                            request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
                            request.inputs["input1"].array_shape.dim.extend(np.shape(code_x[1]))
                            request.inputs["input1"].float_val.extend(np.array(code_x[1],dtype=np.float64).reshape(-1))
                            request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
                            request.inputs["input2"].array_shape.dim.extend(np.shape(code_x[2]))
                            request.inputs["input2"].float_val.extend(np.array(code_x[2],dtype=np.float64).reshape(-1))
                            request_data = request.SerializeToString()
                            list_outputs = ["outputs"]
                            _result = vpc_requests(codeclasses_url, codeclasses_authorization, request_data, list_outputs)
                            if _result is not None:
                                predict_code = _result["outputs"]
                            else:
                                # remote call failed — fall back to the local model
                                with self.sess_codesplit.as_default():
                                    with self.sess_codesplit.graph.as_default():
                                        predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
                        else:
                            with self.sess_codesplit.as_default():
                                with self.sess_codesplit.graph.as_default():
                                    inputs_code,outputs_code = self.getModel_code()
                                    predict_code = limitRun(self.sess_codesplit,[outputs_code],feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]})[0]
                                    #predict_code = self.sess_codesplit.run(outputs_code,feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]})
                                    #predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
                        for h in range(len(predict_code)):
                            if predict_code[h][0]>0.5:
                                the_code = self.fitDataByRule(code_text[h])
                                # print(the_code)
                                #add code to entitys
                                list_entity.append(temp_entitys[h])
                                # the second tuple element ranks the source prefix
                                # (0=项目/招标编号, 1=采购编号, 2=询价/合同编号, 3=other)
                                if re.search(',|/|;|、|,', the_code) and len(the_code)>25:
                                    # long separator-joined string: split and keep long fragments,
                                    # gluing short fragments back onto the previous code
                                    for it in re.split(',|/|;|、|,', the_code):
                                        if len(it) > 8:
                                            if it not in code_set:
                                                code_set.add(it)
                                                # item['code'].append(it)
                                                if re.search("(项目编号|招标编号):?$", pre_text[h]):
                                                    item['code'].append((it, 0))
                                                elif re.search('采购(计划)?编号:?$', pre_text[h]):
                                                    item['code'].append((it, 1))
                                                elif re.search('(询价|合同)编号:?$', pre_text[h]):
                                                    item['code'].append((it, 2))
                                                else:
                                                    item['code'].append((it, 3))
                                        elif len(item['code']) > 0:
                                            new_it = item['code'][-1][0] + re.search(',|/|;|、|,', the_code).group(0) + it
                                            if new_it not in code_set:
                                                code_set.add(new_it)
                                                # item['code'][-1] = new_it
                                                if re.search("(项目编号|招标编号):?$", pre_text[h]):
                                                    item['code'][-1] = (new_it, 0)
                                                elif re.search('采购(计划)?编号:?$', pre_text[h]):
                                                    item['code'][-1] = (new_it, 1)
                                                elif re.search('(询价|合同)编号:?$', pre_text[h]):
                                                    item['code'][-1] = (new_it, 2)
                                                else:
                                                    item['code'][-1] = (new_it, 3)
                                        else:
                                            if the_code not in code_set:
                                                code_set.add(the_code)
                                                # item['code'].append(the_code)
                                                if re.search("(项目编号|招标编号):?$", pre_text[h]):
                                                    item['code'].append((the_code, 0))
                                                elif re.search('采购(计划)?编号:?$', pre_text[h]):
                                                    item['code'].append((the_code, 1))
                                                elif re.search('(询价|合同)编号:?$', pre_text[h]):
                                                    item['code'].append((the_code, 2))
                                                else:
                                                    item['code'].append((the_code, 3))
                                            break
                                elif the_code not in code_set:
                                    code_set.add(the_code)
                                    # item['code'].append(the_code)
                                    if re.search("(项目编号|招标编号):?$", pre_text[h]):
                                        item['code'].append((the_code, 0))
                                    elif re.search('采购(计划)?编号:?$', pre_text[h]):
                                        item['code'].append((the_code, 1))
                                    elif re.search('(询价|合同)编号:?$', pre_text[h]):
                                        item['code'].append((the_code, 2))
                                    else:
                                        item['code'].append((the_code, 3))
                        # if the_code not in code_set:
                        #     code_set.add(the_code)
                        # item['code'] = list(code_set)
                    for iter in re.finditer(self.PN_pattern,join_predict):
                        _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                        if len(_name)>200: # skip absurdly long, repetitive bogus names (e.g. "202750503"-style junk)
                            continue
                        #add name to entitys
                        _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1],in_attachment=sentence.in_attachment)
                        list_entity.append(_entity)
                        # w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
                        # full weight when a "project name"-style label precedes the span
                        w = 1 if re.search('(项目|工程|招标|采购|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题|项目)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
                        if _name not in dict_name_freq_score:
                            # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
                            len_name = len(_name) if len(_name) <50 else 100-len(_name) # 2023/03/02 score decreases for names longer than 50 chars
                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len_name * 0.05)*w+(5-sentence.sentence_index)*0.2]
                        else:
                            dict_name_freq_score[_name][0] += 1
                '''
                for iter in re.finditer(self.PN_pattern,join_predict):
                    print("name-",self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]))
                if item[1]['name']=="":
                    for iter in re.finditer(self.PN_pattern,join_predict):
                        #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                        item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                        break
                '''
                if _begin_index+_LEN>=len(list_sentence):
                    break
                _begin_index += _LEN

            list_name_freq_score = []
            # print('模型预测项目名称:', dict_name_freq_score)
            # 2020/11/23 rule adjustment for large websites
            if len(dict_name_freq_score) == 0:
                # regex fallback: recall a name the model missed
                # name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
                name_re1 = '(项目|工程|招标|采购(条目)?|合同|标项|标的|计划|询价|询价单|询价通知书|申购单|申购)(名称|标名|标题|主题)[::\s]+(?P<name>[^,。:;]{2,60})[,。]'
                for sentence in list_sentence:
                    # pad_sentence = sentence.sentence_text
                    othername = re.search(name_re1, sentence.sentence_text)
                    if othername != None:
                        project_name = othername.group('name')
                        if re.search('[\u4e00-\u9fa5]+', project_name) == None: # drop names containing no Chinese characters
                            # log('没有中文的项目名称去除')
                            continue
                        beg = find_index([project_name], sentence.sentence_text)[0]
                        end = beg + len(project_name)
                        _name = self.fitDataByRule(sentence.sentence_text[beg:end])
                        # print('规则召回项目名称:', _name)
                        # add name to entitys
                        _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
                            sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name,
                                         entity_type="name", sentence_index=sentence.sentence_index, begin_index=0,
                                         end_index=0, wordOffset_begin=beg, wordOffset_end=end,in_attachment=sentence.in_attachment)
                        list_entity.append(_entity)
                        w = 1
                        if _name not in dict_name_freq_score:
                            # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w+(5-sentence.sentence_index)*0.2]
                        else:
                            dict_name_freq_score[_name][0] += 1
                    # othername = re.search(name_re1, sentence.sentence_text)
                    # if othername != None:
                    #     _name = othername.group(3)
                    #     if _name not in dict_name_freq_score:
                    #         dict_name_freq_score[_name] = [1, len(re.findall(pattern_score, _name)) + len(_name) * 0.1]
                    #     else:
                    #         dict_name_freq_score[_name][0] += 1
            for _name in dict_name_freq_score.keys():
                list_name_freq_score.append([_name,dict_name_freq_score[_name]])
            # print(list_name_freq_score)
            if len(list_name_freq_score)>0:
                # best name = highest frequency * score
                list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1],reverse=True)
                item['name'] = list_name_freq_score[0][0]
                # for it in list_name_freq_score:
                #     print('项目名称及分值:',it[0],it[1], it[1][0]*it[1][1])
                # if list_name_freq_score[0][1][0]>1:
                #     item[1]['name'] = list_name_freq_score[0][0]
                # else:
                #     list_name_freq_score.sort(key=lambda x:x[1][1],reverse=True)
                #     item[1]["name"] = list_name_freq_score[0][0]

            # regex fallback to recall project codes the model missed
            if item['code'] == []:
                for sentence in list_sentence:
                    # othercode = re.search('(采购计划编号|询价编号)[\))]?[::]?([\[\]a-zA-Z0-9\-]{5,30})', sentence.sentence_text)
                    # if othercode != None:
                    #     item[1]['code'].append(othercode.group(2))
                    # 2020/11/23 rule adjustment for large websites
                    othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价[单书]|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告|工程|寻源|标书|包件|谈判|申购)(单据?号|编号|标号|编码|代码|备案号|号)[::\s]+(?P<code>[^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。\u4e00-\u9fa5]', sentence.sentence_text)
                    if othercode != None:
                        # item['code'].append(othercode.group('code'))
                        if re.search("(项目编号|招标编号):?$", othercode.group(0)):
                            item['code'].append((othercode.group('code'), 0))
                        elif re.search('采购(计划)?编号:?$', othercode.group(0)):
                            item['code'].append((othercode.group('code'), 1))
                        elif re.search('(询价|合同)编号:?$', othercode.group(0)):
                            item['code'].append((othercode.group('code'), 2))
                        else:
                            item['code'].append((othercode.group('code'), 3))
                        # print('规则召回项目编号:', othercode.group('code'))
            # item['code'] = [code for code in item['code'] if len(code)<500]
            # item['code'].sort(key=lambda x:len(x),reverse=True)
            # drop oversized codes, order by source-prefix rank, strip the rank
            item['code'] = [code for code in item['code'] if len(code[0]) < 500]
            item['code'].sort(key=lambda x: x[1])
            item['code'] = [it[0] for it in item['code']]
            result.append(item)
            # restore the original sentence order mutated by the earlier sort
            list_sentence.sort(key=lambda x: x.sentence_index,reverse=False)
        return result
-
-
- '''
    #当数据量过大时会报错 (NOTE: this old implementation errored when the data volume was too large)
- def predict(self,articles,MAX_LEN = None):
- sentences = []
- for article in articles:
- for sentence in article.content.split("。"):
- sentences.append([sentence,article.id])
- if MAX_LEN is None:
- sent_len = [len(sentence[0]) for sentence in sentences]
- MAX_LEN = max(sent_len)
- #print(MAX_LEN)
-
- #若为空,则直接返回空
- result = []
- if MAX_LEN==0:
- for article in articles:
- result.append([article.id,{"code":[],"name":""}])
- return result
-
- index_unk = self.word2index.get("<unk>")
- index_pad = self.word2index.get("<pad>")
-
- x = [[self.word2index.get(word,index_unk)for word in sentence[0]]for sentence in sentences]
- x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
-
- predict_y = self.getModel().predict(x)
-
-
- last_doc_id = ""
- item = []
- for sentence,predict in zip(sentences,np.argmax(predict_y,-1)):
- pad_sentence = sentence[0][:MAX_LEN]
- doc_id = sentence[1]
- join_predict = "".join([str(s) for s in predict])
- if doc_id!=last_doc_id:
- if last_doc_id!="":
- result.append(item)
- item = [doc_id,{"code":[],"name":""}]
- code_set = set()
- code_x = []
- code_text = []
- for iter in re.finditer(self.PC_pattern,join_predict):
- get_len = 40
- if iter.span()[0]<get_len:
- begin = 0
- else:
- begin = iter.span()[0]-get_len
- end = iter.span()[1]+get_len
- code_x.append(embedding_word([pad_sentence[begin:iter.span()[0]],pad_sentence[iter.span()[0]:iter.span()[1]],pad_sentence[iter.span()[1]:end]],shape=(3,get_len,60)))
- code_text.append(pad_sentence[iter.span()[0]:iter.span()[1]])
- if len(code_x)>0:
- code_x = np.transpose(np.array(code_x),(1,0,2,3))
- predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
- for h in range(len(predict_code)):
- if predict_code[h][0]>0.5:
- the_code = self.fitDataByRule(code_text[h])
- if the_code not in code_set:
- code_set.add(the_code)
- item[1]['code'] = list(code_set)
- if item[1]['name']=="":
- for iter in re.finditer(self.PN_pattern,join_predict):
- #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
- item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
- break
-
- last_doc_id = doc_id
- result.append(item)
- return result
- '''
-
# Role & money classification models (角色金额模型)
class PREMPredict():
    """Role & money predictor.

    Classifies org/company entities into tender roles and money entities into
    money types with the two sub-models, then applies a battery of regex-based
    post-corrections before writing the label/probabilities onto each entity.
    """

    def __init__(self,config=None):
        """Load the role and money classifiers; *config* is passed through."""
        #self.model_role_file = os.path.abspath("../role/models/model_role.model.hdf5")
        self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5"
        self.model_role = Model_role_classify_word(config=config)
        self.model_money = Model_money_classify(config=config)

        return

    def search_role_data(self,list_sentences,list_entitys):
        '''
        @summary: build the role-model input data from the sentence and entity lists
        @param:
            list_sentences: sentences of each document
            list_entitys: entities of each document
        @return: [encoded inputs, matched entities, context windows], or None when empty
        '''
        text_list = []
        data_x = []
        points_entitys = []
        for list_entity,list_sentence in zip(list_entitys,list_sentences):
            list_entity.sort(key=lambda x:x.sentence_index)
            list_sentence.sort(key=lambda x:x.sentence_index)
            p_entitys = 0
            p_sentences = 0
            while(p_entitys<len(list_entity)):
                entity = list_entity[p_entitys]
                if entity.entity_type in ['org','company']:
                    # both lists are sorted, so the sentence pointer only moves forward
                    while(p_sentences<len(list_sentence)):
                        sentence = list_sentence[p_sentences]
                        if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
                            # text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin-13):entity.wordOffset_end+10])
                            text_sen = sentence.sentence_text
                            b = entity.wordOffset_begin
                            e = entity.wordOffset_end
                            # (13 chars of left context, the entity itself, 10 chars of right context)
                            text_list.append((text_sen[max(0, b-13):b], text_sen[b:e], text_sen[e:e+10]))
                            #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_ROLE_INPUT_SHAPE[1]),shape=settings.MODEL_ROLE_INPUT_SHAPE)
                            item_x = self.model_role.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,entity_text=entity.entity_text)
                            data_x.append(item_x)
                            points_entitys.append(entity)
                            break
                        p_sentences += 1

                p_entitys += 1

        if len(points_entitys)==0:
            return None

        return [data_x,points_entitys, text_list]


    def search_money_data(self,list_sentences,list_entitys):
        '''
        @summary: build the money-model input data from the sentence and entity lists
        @param:
            list_sentences: sentences of each document
            list_entitys: entities of each document
        @return: [encoded inputs, matched entities, context windows], or None when empty
        '''
        text_list = []
        data_x = []
        points_entitys = []
        for list_entity,list_sentence in zip(list_entitys,list_sentences):
            list_entity.sort(key=lambda x:x.sentence_index)
            list_sentence.sort(key=lambda x:x.sentence_index)
            p_entitys = 0

            while(p_entitys<len(list_entity)):
                entity = list_entity[p_entitys]
                if entity.entity_type=="money":
                    # NOTE: unlike search_role_data, the sentence pointer resets per entity
                    p_sentences = 0
                    while(p_sentences<len(list_sentence)):
                        sentence = list_sentence[p_sentences]
                        if entity.doc_id==sentence.doc_id and entity.sentence_index==sentence.sentence_index:
                            # text_list.append(sentence.sentence_text[max(0, entity.wordOffset_begin - 13):entity.wordOffset_begin])
                            text_sen = sentence.sentence_text
                            b = entity.wordOffset_begin
                            e = entity.wordOffset_end
                            text_list.append((text_sen[max(0, b - 13):b], text_sen[b:e], text_sen[e:e + 10]))
                            #item_x = embedding(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=settings.MODEL_MONEY_INPUT_SHAPE[1]),shape=settings.MODEL_MONEY_INPUT_SHAPE)
                            #item_x = embedding_word(spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index, end_index=entity.end_index, size=10, center_include=True, word_flag=True),shape=settings.MODEL_MONEY_INPUT_SHAPE)
                            item_x = self.model_money.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
                            data_x.append(item_x)
                            points_entitys.append(entity)
                            break
                        p_sentences += 1
                p_entitys += 1

        if len(points_entitys)==0:
            return None

        return [data_x,points_entitys, text_list]

    def predict_role(self,list_sentences, list_entitys):
        """Predict a role label for each org/company entity and apply regex corrections."""
        datas = self.search_role_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        text_list = datas[2]
        if USE_PAI_EAS:
            _data = datas[0]
            _data = np.transpose(np.array(_data),(1,0,2))
            request = tf_predict_pb2.PredictRequest()
            request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
            request.inputs["input0"].float_val.extend(np.array(_data[0],dtype=np.float64).reshape(-1))
            request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
            request.inputs["input1"].float_val.extend(np.array(_data[1],dtype=np.float64).reshape(-1))
            request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input2"].array_shape.dim.extend(np.shape(_data[2]))
            request.inputs["input2"].float_val.extend(np.array(_data[2],dtype=np.float64).reshape(-1))
            request_data = request.SerializeToString()
            list_outputs = ["outputs"]
            _result = vpc_requests(role_url, role_authorization, request_data, list_outputs)
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                # remote call failed — fall back to the local model
                predict_y = self.model_role.predict(datas[0])
        else:
            predict_y = self.model_role.predict(np.array(datas[0],dtype=np.float64))
        for i in range(len(predict_y)):
            entity = points_entitys[i]
            label = np.argmax(predict_y[i])
            values = predict_y[i]
            # text = text_list[i]
            text_tup = text_list[i]
            front, middle, behind = text_tup
            whole = "".join(text_tup)
            # print('模型预测角色:', front, entity.entity_text, behind,label, values)
            if label in [0, 1, 2, 3, 4] and values[label] < 0.5: # below-threshold predictions become "other" so the later rule recall re-decides
                label = 5
            elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', front):
                label = 5
            elif label == 2:
                if re.search('中标单位和.{,25}签订合同', whole):
                    label = 0
                    values[label] = 0.501
                elif re.search('尊敬的供应商:.{,25}我公司', whole):
                    label = 0
                    values[label] = 0.801
                elif re.search('尊敬的供应商:$', front):
                    label = 0
                    values[label] = 0.501
                elif re.search('第[4-9四五六]中标候选人', front): # fix candidates ranked 4th or lower being mispredicted as the winner
                    label = 5
                    values[label] = 0.5
                elif re.search('(序号|排名|排序|名次):([4-9]|\d{2,}),', front): # doc 293225236: ranking inside attachment was mispredicted
                    values[2] = 0.5
                    label = 5
            elif re.search('是否中标:是,供应商', front) and label == 5:
                label = 2
                values[label] = 0.9
            elif label == 1 and re.search('委托(单位|人|方)[是为:]+',front) and re.search('受委托(单位|人|方)[是为:]+', front)==None:
                label = 0
                values[label] = 0.501
            elif label == 1 and re.search('([,。:]|^)(第一)?(服务|中选|中标)(中介服务|代理)?(公司|机构)(名称)?', front):
                label = 2
                values[label] = 0.501
            elif label in [3,4] and re.search('第[二三]分(公司|店),中标(人|供应商|单位|公司):$', front):
                label = 2
                values[label] = 0.7
            elif label == 3 and re.search('决定选择第二名', front) and re.search('^作为(中标|成交)(人|供应商|单位|公司)', behind):
                label = 2
                values[label] = 0.8
            elif re.search('(中标|成交)通知书[,:]$', front) and re.search('^:', behind) and label != 2:
                label = 2
                values[label] = 0.8
            entity.set_Role(label, values)

    def predict_money(self,list_sentences,list_entitys):
        """Predict a money-type label for each money entity and apply regex corrections."""
        datas = self.search_money_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        _data = datas[0]
        text_list = datas[2]
        if USE_PAI_EAS:
            _data = np.transpose(np.array(_data),(1,0,2,3))
            request = tf_predict_pb2.PredictRequest()
            request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
            request.inputs["input0"].float_val.extend(np.array(_data[0],dtype=np.float64).reshape(-1))
            request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
            request.inputs["input1"].float_val.extend(np.array(_data[1],dtype=np.float64).reshape(-1))
            request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input2"].array_shape.dim.extend(np.shape(_data[2]))
            request.inputs["input2"].float_val.extend(np.array(_data[2],dtype=np.float64).reshape(-1))
            request_data = request.SerializeToString()
            list_outputs = ["outputs"]
            _result = vpc_requests(money_url, money_authorization, request_data, list_outputs)
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                # remote call failed — fall back to the local model
                predict_y = self.model_money.predict(_data)
        else:
            predict_y = self.model_money.predict(_data)
        for i in range(len(predict_y)):
            entity = points_entitys[i]
            label = np.argmax(predict_y[i])
            values = predict_y[i]
            # text = text_list[i]
            text_tup = text_list[i]
            front, middle, behind = text_tup
            whole = "".join(text_tup)
            # print('金额: ', entity.entity_text, label, values, front, middle, behind)
            if label in [0, 1] and values[label] < 0.5: # below-threshold predictions become "other" so the later rule recall re-decides
                # print('模型预测金额: ', entity.entity_text, label, values, front, middle, behind)
                label = 2
            elif label == 1: # handle wrongly-predicted winning amounts
                if re.search('[::,。](总金额|总价|单价)((万?元))?:?$', front) and re.search('(中标|投标|成交|中价)', front)==None:
                    values[label] = 0.49
                # BUGFIX: the original pattern was missing the final ')' closing the
                # group opened after [\+=], so re.search raised re.error here.
                elif re.search('[\+=]((中标|成交)(金?额|价格?)|[若如]果?(中标|成交)(金?额|价格?)为?)', front): # e.g. doc 241561780 "如中标金额为500-1000万元,则代理服务费=...+(中标金额-500)万元": fee-formula terms are not real win amounts
                    values[label] = 0.49
                elif re.search('^(以[上下])?按[\d.%]+收取|^以[上下]|^[()]?[+×*-][\d.%]+', behind):
                    values[label] = 0.49
                elif re.search('(含|在|包括|[大小等高低]于)$|[\d.%]+[+×*-]$', front):
                    values[label] = 0.49
            elif label ==0: # handle wrongly-predicted tender amounts
                if entity.notes in ["投资", "总投资","工程造价"] or re.search('最低限价:?$', front) or re.search('服务内容:([\d,.]+万?亿?元?-?)$', front):
                    values[label] = 0.49
                elif re.search('^(以[上下])?按[\d.%]+收取|^以[上下]|^[()]?[+×*-][\d.%]+', behind):
                    values[label] = 0.49
                elif re.search('(含|在|包括|[大小等高低]于)$|[\d.%]+[+×*-]$', front):
                    values[label] = 0.49
            elif re.search('报价:预估不?含税总价[为:]$', front) and (label != 1 or values[label]<0.5):
                label = 1
                values[label] = 0.8
            entity.set_Money(label, values)

    def correct_money_by_rule(self, title, list_entitys, list_articles):
        """Promote 'cost'-labelled money entities (label 2) when the title shows a
        single supervision/design/survey keyword, choosing tender vs win amount
        by whether the document looks like an award notice."""
        if len(re.findall('监理|施工|设计|勘察', title)) == 1 and re.search('施工|总承包|epc|EPC', title) == None:
            keyword = re.search('监理|设计|勘察', title).group(0)
            for list_entity in list_entitys:
                for _entity in list_entity:
                    # print('keyword:',keyword, '_entity.notes :',_entity.notes)
                    if _entity.entity_type == "money" and _entity.notes == keyword and _entity.label == 2:
                        # if channel_dic['docchannel'] == "招标公告":
                        if re.search('中标|成交|中选|中价|中租|结果|入围', title + list_articles[0].content[:100]) == None:
                            _entity.values[0] = 0.51
                            _entity.set_Money(0, _entity.values) # 2021/11/18 reclassify the fee as tender or win amount based on the notice type
                        else:
                            _entity.values[1] = 0.51
                            _entity.set_Money(1, _entity.values)

    def predict(self,list_sentences,list_entitys):
        """Run role classification, then money classification, in place."""
        self.predict_role(list_sentences,list_entitys)
        self.predict_money(list_sentences,list_entitys)
-
-
# Contact-person classification model (联系人模型)
- class EPCPredict():
-
    def __init__(self,config=None):
        # Single sub-model: classifier for "person" entities (contact persons).
        self.model_person = Model_person_classify(config=config)
-
- def search_person_data(self,list_sentences,list_entitys):
- '''
- @summary:根据句子list和实体list查询联系人模型的输入数据
- @param:
- list_sentences:文章的sentences
- list_entitys:文章的entitys
- @return:联系人模型的输入数据
- '''
- data_x = []
- points_entitys = []
- pre_texts = []
- for list_entity,list_sentence in zip(list_entitys,list_sentences):
-
- p_entitys = 0
- dict_index_sentence = {}
- for _sentence in list_sentence:
- dict_index_sentence[_sentence.sentence_index] = _sentence
- _list_entity = [entity for entity in list_entity if entity.entity_type=="person"]
- while(p_entitys<len(_list_entity)):
- entity = _list_entity[p_entitys]
- if entity.entity_type=="person":
- sentence = dict_index_sentence[entity.sentence_index]
- item_x = self.model_person.encode(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index)
- data_x.append(item_x)
- points_entitys.append(entity)
- pre_texts.append(spanWindow(tokens=sentence.tokens,begin_index=entity.begin_index,end_index=entity.end_index,size=20))
- p_entitys += 1
- if len(points_entitys)==0:
- return None
-
- # return [data_x,points_entitys,dianhua]
- return [data_x,points_entitys, pre_texts]
    def predict_person(self,list_sentences, list_entitys):
        """Classify each person entity and store label/probabilities on it.

        Runs the person model (via the PAI-EAS service when USE_PAI_EAS, with
        local fallback), then applies one regex override: names listed inside
        an evaluation-committee / expert-panel enumeration are forced to
        label 4 (not a contact person).
        """
        datas = self.search_person_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        pre_texts = datas[2]
        # phone = datas[2]
        if USE_PAI_EAS:
            _data = datas[0]
            _data = np.transpose(np.array(_data),(1,0,2,3))
            request = tf_predict_pb2.PredictRequest()
            request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input0"].array_shape.dim.extend(np.shape(_data[0]))
            request.inputs["input0"].float_val.extend(np.array(_data[0],dtype=np.float64).reshape(-1))
            request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["input1"].array_shape.dim.extend(np.shape(_data[1]))
            request.inputs["input1"].float_val.extend(np.array(_data[1],dtype=np.float64).reshape(-1))
            request_data = request.SerializeToString()
            list_outputs = ["outputs"]
            _result = vpc_requests(person_url, person_authorization, request_data, list_outputs)
            if _result is not None:
                predict_y = _result["outputs"]
            else:
                # remote call failed — fall back to the local model
                predict_y = self.model_person.predict(datas[0])
        else:
            predict_y = self.model_person.predict(datas[0])
        # assert len(predict_y)==len(points_entitys)==len(phone)
        assert len(predict_y)==len(points_entitys)
        for i in range(len(predict_y)):
            entity = points_entitys[i]
            label = np.argmax(predict_y[i])
            # left-context tokens joined into one string for the regex check
            pre_text = ''.join(pre_texts[i][0])
            # print('pre_text', pre_text)
            # committee / expert-panel member listings are not contact persons
            if label==0 and re.search('(谈判|磋商|询价|资格审查|评审专家|(评选|议标|评标|评审)委员会?|专家|评委)(小?组|小?组成员)?(成员|名单)[:,](\w{2,4}((组长)|(成员))?[、,,])*$', pre_text):
                # print(entity.entity_text, re.search('(谈判|磋商|询价|资格审查|评审专家|(评选|议标|评标|评审)委员会?|专家|评委)(小?组|小?组成员)?(成员|名单)[:,](\w{2,4}((组长)|(成员))?[、,,])*$', pre_text).group(0))
                label = 4
            values = []
            for item in predict_y[i]:
                values.append(item)
            # phone_number = phone[i]
            # entity.set_Person(label,values,phone_number)
            entity.set_Person(label,values,[])
        # match phone numbers to contact persons (currently disabled)
        # self.person_search_phone(list_sentences, list_entitys)
def person_search_phone(self, list_sentences, list_entitys):
    """Attach a phone number to each contact-person entity.

    For every document (one item of ``list_entitys`` / ``list_sentences``):
      1. extract phone-number candidates from the sentence text with regexes
         and wrap each one in a temporary ``Entity`` of type "phone";
      2. score person->phone pairs by their character distance in the document
         (looking ahead of the person first, and behind only when nothing was
         found ahead);
      3. solve the resulting assignment problem with the Hungarian (KM)
         algorithm so each phone goes to at most one person, and write the
         winning number into ``entity.person_phone``.

    :param list_sentences: per-document lists of Sentence objects
    :param list_entitys:   per-document lists of Entity objects; entities of
                           type "person" with label in {1,2,3} are updated
                           in place
    :return: None

    NOTE(review): the legacy span-window phone matcher that used to live here
    (commented out), and its then-only caller ``phoneFromList``, were removed;
    the assignment-based matching below supersedes them.
    """
    from scipy.optimize import linear_sum_assignment
    from BiddingKG.dl.interface.Entitys import Match

    def dispatch(match_list):
        """Resolve person/phone conflicts with the KM algorithm.

        ``match_list`` holds Match(person, phone, value) candidates; values are
        shifted by +10000 so every real candidate outweighs the zero entries of
        non-candidate pairs.  Returns the chosen Match(person, phone) pairs.
        """
        main_roles = list(set([match.main_role for match in match_list]))
        attributes = list(set([match.attribute for match in match_list]))
        label = np.zeros(shape=(len(main_roles), len(attributes)))
        for match in match_list:
            label[main_roles.index(match.main_role), attributes.index(match.attribute)] = match.value + 10000
        # linear_sum_assignment minimises cost; negate to maximise the score
        graph = -label
        row, col = linear_sum_assignment(graph)
        # keep only assignments that correspond to a real candidate (non-zero)
        chosen = [(i, j) for i, j, value in zip(row, col, graph[row, col]) if value]
        return [Match(main_roles[i], attributes[j]) for i, j in chosen]

    # Patterns are loop-invariant: compile once, outside the per-document loop.
    # "电话/联系方式/联系人" label followed by 7-12 digits.
    key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)(\d{7,12})')
    # Mobile / +86 mobile / area-code landlines (with extensions) / bare 7-8 digits.
    phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|'
                       '\+86.?1[3|4|5|6|7|8|9]\d{9}|'
                       '0\d{2,3}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|'
                       '0\d{2,3}[-—-―]\d{7,8}转\d{1,4}|'
                       '0\d{2,3}[-—-―]?[1-9]\d{6,7}|'
                       '[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|'
                       '[1-9]\d{6,7}')

    for list_entity, list_sentence in zip(list_entitys, list_sentences):
        # --- 1. collect phone candidates as pseudo-entities -----------------
        phone_entitys = []
        for _sentence in list_sentence:
            sentence_text = _sentence.sentence_text
            res_set = set()
            for i in re.finditer(phone, sentence_text):
                res_set.add((i.group(), i.start(), i.end()))
            for i in re.finditer(key_word, sentence_text):
                res_set.add((i.group(2), i.start() + len(i.group(1)), i.end()))
            for item in list(res_set):
                phone_left = sentence_text[max(0, item[1] - 10):item[1]]
                phone_right = sentence_text[item[2]:item[2] + 8]
                # reject fax/mailbox numbers unless "电话" also appears on the left
                if re.search("传,?真|信,?箱|邮,?箱", phone_left):
                    if not re.search("电,?话", phone_left):
                        continue
                # reject account numbers, document numbers, quotations, prices
                if re.search("帐,?号|编,?号|报,?价|证,?号|价,?格|[\((]万?元[\))]", phone_left):
                    continue
                # reject numbers continued by decimals (amounts, not phones)
                if re.search("[.,]\d{2,}", phone_right):
                    continue
                _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, None, None, item[1], item[2], in_attachment=_sentence.in_attachment)
                phone_entitys.append(_entity)

        person_entitys = []
        for entity in list_entity:
            if entity.entity_type == "person":
                entity.person_phone = ""
                person_entitys.append(entity)
        _list_entity = sorted(phone_entitys + person_entitys,
                              key=lambda x: (x.sentence_index, x.wordOffset_begin))

        # Cumulative character offset of each sentence within the document.
        # (Computed as a running sum so non-contiguous sentence indices cannot
        # raise KeyError as the old ``words_num_dict[_index - 1]`` lookup could.)
        words_num_dict = dict()
        total_words = 0
        for sentence in sorted(list_sentence, key=lambda x: x.sentence_index):
            words_num_dict[sentence.sentence_index] = total_words
            total_words += len(sentence.sentence_text)

        # --- 2. build scored person->phone candidates -----------------------
        match_list = []
        for index in range(len(_list_entity)):
            entity = _list_entity[index]
            if entity.entity_type == "person" and entity.label in [1, 2, 3]:
                match_nums = 0
                # look ahead: up to 4 following entities
                for after_index in range(index + 1, min(len(_list_entity), index + 5)):
                    after_entity = _list_entity[after_index]
                    if after_entity.entity_type == "phone":
                        sentence_distance = after_entity.sentence_index - entity.sentence_index
                        distance = (words_num_dict[after_entity.sentence_index] + after_entity.wordOffset_begin) - (
                                words_num_dict[entity.sentence_index] + entity.wordOffset_end)
                        if sentence_distance < 2 and distance < 50:
                            # closer phone => higher (less negative) score
                            value = (-1 / 2 * (distance ** 2)) / 10000
                            match_list.append(Match(entity, after_entity, value))
                            match_nums += 1
                        else:
                            break
                    if after_entity.entity_type == "person":
                        if after_entity.label not in [1, 2, 3]:
                            break
                if not match_nums:
                    # nothing ahead: look behind, up to 4 preceding entities.
                    # Stop bound is max(-1, index - 5) so entity 0 is reachable;
                    # the previous max(0, index - 5) skipped index 0 whenever
                    # the person was among the first five entities.
                    for previous_index in range(index - 1, max(-1, index - 5), -1):
                        previous_entity = _list_entity[previous_index]
                        if previous_entity.entity_type == "phone":
                            sentence_distance = entity.sentence_index - previous_entity.sentence_index
                            distance = (words_num_dict[entity.sentence_index] + entity.wordOffset_begin) - (
                                    words_num_dict[previous_entity.sentence_index] + previous_entity.wordOffset_end)
                            if sentence_distance < 1 and distance < 30:
                                # backward matches are NOT scaled by /10000, so a
                                # competing forward match always wins the phone
                                value = (-1 / 2 * (distance ** 2))
                                match_list.append(Match(entity, previous_entity, value))
                            else:
                                break

        # --- 3. solve the assignment and write results back -----------------
        result = dispatch(match_list)
        for match in result:
            entity = match.main_role
            # update the original entity object held in list_entity
            entity_index = list_entity.index(entity)
            list_entity[entity_index].person_phone = match.attribute.entity_text
def predict(self, list_sentences, list_entitys):
    """Entry point: delegate person-role prediction to ``predict_person``."""
    self.predict_person(list_sentences, list_entitys)
-
- #表格预测
class FormPredictor():
    """Table (form) prediction: dispatches to the "item" and "context" models."""

    def __init__(self, lazyLoad=None, config=None):
        # NOTE(review): ``lazyLoad`` is never read; its old default
        # ``getLazyLoad()`` was evaluated once at class-definition time (a
        # call-in-default-argument pitfall), so it was not lazy at all.  The
        # parameter is kept for backward compatibility with existing callers.
        self.model_file_line = os.path.dirname(__file__) + "/../form/model/model_form.model_line.hdf5"
        self.model_file_item = os.path.dirname(__file__) + "/../form/model/model_form.model_item.hdf5"
        self.model_form_item = Model_form_item(config=config)
        # paths only; the "line" model itself is not loaded here
        self.model_dict = {"line": [None, self.model_file_line]}
        self.model_form_context = Model_form_context(config=config)

    def getModel(self, type):
        """Return the model for ``type`` ("item" or "context").

        The previous fallback ``return self.getModel(type)`` recursed
        infinitely for any other type (e.g. "line"); raise a clear error
        instead.

        :raises ValueError: for an unknown model type
        """
        if type == "item":
            return self.model_form_item
        elif type == "context":
            return self.model_form_context
        raise ValueError("unknown form model type: %s" % type)

    def encode(self, data, **kwargs):
        """Encode one sample for the form models (char-level, length 50)."""
        # NOTE(review): an unreachable ``return encodeInput_form(data)`` used
        # to follow this return; it was dead code and has been removed.
        return encodeInput([data], word_len=50, word_flag=True, userFool=False)[0]

    def predict(self, form_datas, type):
        """Run the model selected by ``type`` on ``form_datas``."""
        if type == "item":
            return self.model_form_item.predict(form_datas)
        elif type == "context":
            return self.model_form_context.predict(form_datas)
        else:
            return self.getModel(type).predict(form_datas)
-
# Role rules
# Assign a role, via regex, to every entity that has none, at the lowest
# probability equal to the threshold.
class RoleRulePredictor():
    """Regex-based role recall for org/company and money entities.

    Complements the NN role classifier: entities whose role is missing or whose
    probability is below threshold are matched, via the text on their
    left / around / right, against per-role patterns.
    Role labels: 0=tenderee, 1=agency, 2=winner, 3=second, 4=third, 5=none.
    """

    def __init__(self):
        # A "w1" suffix in a group name (e.g. ?P<tenderee_left_w1>) marks a
        # strong keyword that gets probability weight 1.2 in predict().
        self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|最终|建设|业主|甲|转让|招租|议标|合同主体|挂牌|出租|出让|买受|出售|标卖|处置)" \
                                     "(人|方|单位|组织|用户|业主|主体|部门|公司)|文章来源|委托机构|产权所有人|需求?方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|结算单位)" \
                                     "[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
        self.pattern_tenderee_left_w0 = "(?P<tenderee_left>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|最终|建设|业主|甲|转让|招租|议标|合同主体|挂牌|出租|出让|买受|出售|标卖|处置)" \
                                        "(人|方|单位|组织|用户|业主|主体|部门|公司)|文章来源|委托机构|产权所有人|需求?方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|结算单位)" \
                                        "[))]?(信息|联系方式|概况)?[,,。::]?([((]?(1|2|1.1|1.2)[))]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)"
        self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)" \
                                        "(人|公司|单位|组织|用户|业主|主体|方|部门))" \
                                        "(是|为|:|:|\s*)+$)"
        self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向|尊敬的供应商(伙伴)?:\w{5,20}(以下简称“\w{2,5}”)))"
        self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束))"  # |(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
        self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
        self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
        self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)"  # a bare '^受托' would clash with e.g. 受托生产; agency phrasing usually carries a trailing comma
        # 2020/11/24 big-site rules: added 选定单位|指定的中介服务机构 to winner keywords
        self.pattern_winTenderer_left = "(?P<winTenderer_left>" \
                                        "(乙|竞得|受让|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承租((包))?)(候选)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?)(:?单位名称|:?名称|盖章)?[::是为]+$" \
                                        "|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书,致)[::是为]+$" \
                                        "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$" \
                                        "|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$" \
                                        "|现(公布|宣布|公示)中标单位如下:$|现将中标单位(公布|公示)如下:$|现宣布以下(企业|单位|公司)中标:$)"  # 承办单位: is not a winner keyword, 83914772
        self.pattern_winTenderer_left_w0 = "(?P<winTenderer_left_w0>" \
                                           "(,|。|:|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)|第?[一1]名|第一(中标)?候选人)" \
                                           "(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$)"  # handles unrecognised table headers followed by a comma; must be preceded by ",", "。" or start
        self.pattern_winTenderer_left_w1 = "(?P<winTenderer_left_w1>(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)" \
                                           "(:?单位名称|:?名称|盖章)?([((]按综合排名排序[))]|:择优选取)?[::是为]+$)"  # comma removed to reject e.g. "并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系"; 中标候选人 must not count as winner
        self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|厂商)))|" \
                                         "^((报价|价格)最低,|以\w{5,10}|\w{,20})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \
                                         "|^:贵公司参与|^:?你方于|^(胜出)?中标。|^取得中标(单位)?资格" \
                                         "|^通过(挂牌|拍卖)方式(以[\d.,]+万?元)?竞得|^[((](中标|成交|承包)人名?称?[))]))"
        self.pattern_winTenderer_whole = "(?P<winTenderer_center>(贵公司|由).{,15}以\w{,15}中标" \
                                         "|(谈判结果:|结果|最终|确定|决定)[以由为][^,。;]{5,25}(向我单位)?(供货|承担|承接|中标|竞买成功)|中标通知书.{,15}你方|单一来源方?式?[从向][()\w]{5,20}采购)"  # 2020/11/24 big-site rules: added "谈判结果:由.{5,20}供货" to winner keywords
        self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名)[::]第?[二2]名?,?投标商名称[::]+$))"
        self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"

        self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
        self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
        self.condadate_left = "(?P<candidate_left>((中标|成交|入围)候选(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)|服务单位)[::是为]+$)"
        self.pattern_whole = [self.pattern_tenderee_left_w1,
                              self.pattern_tenderee_left,
                              self.pattern_tenderee_left_w0,
                              self.pattern_tenderee_center,
                              self.pattern_tenderee_right,
                              self.pattern_tendereeORagency_right,
                              self.pattern_agency_left,
                              self.pattern_agency_right,
                              self.pattern_winTenderer_left_w1,
                              self.pattern_winTenderer_left,
                              self.pattern_winTenderer_left_w0,
                              self.pattern_winTenderer_whole,
                              self.pattern_winTenderer_right,
                              self.pattern_secondTenderer_left,
                              self.pattern_secondTenderer_right,
                              self.pattern_thirdTenderer_left,
                              self.pattern_thirdTenderer_right
                              ]  # order matters: second/third-winner patterns must come after the winner patterns
        self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])

        self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源为\w{2,4}资金")
        self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):")  # a bare 总价 must not count as winning amount — many tables have 单价/总价 columns
        self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
        self.pattern_money_other = re.compile("代理费|服务费")
        self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"

    def _check_input(self, text, ignore=False):
        """Normalise ``text`` to a list; raise on empty elements unless ``ignore``."""
        if not text:
            return []

        if not isinstance(text, list):
            text = [text]

        null_index = [i for i, t in enumerate(text) if not t]
        if null_index and not ignore:
            raise Exception("null text in input ")

        return text

    def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5):
        """Apply the regex role rules to every document (entities mutated in place).

        :param on_value: probability threshold — roles with a lower NN prob (or
                         label 5 "none") are candidates for regex recall, and a
                         recalled role gets roughly this probability.
        """
        for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences,
                                                                      list_codenames):
            list_sentence.sort(key=lambda x: x.sentence_index)  # 2022/1/5 sort sentences by index
            # list_name = list_codename["name"]
            list_name = []  # 2022/1/5 changed to: all project names found in the entity list
            candidates = []  # candidates whose rank cannot be determined 2023/04/14
            notfound_tenderer = True  # no top-3 winner/candidate found yet
            for entity in list_entity:
                if entity.entity_type == 'name':
                    list_name.append(entity.entity_text)
            list_name = self._check_input(list_name) + [article.title]
            for p_entity in list_entity:
                if p_entity.entity_type in ["org", "company"]:
                    # only process entities with no role or an unreliable one
                    if p_entity.label is None:
                        continue
                    # demote tenderee entities whose context contains the title/project
                    # name — an entity appearing in the title is not necessarily the tenderee
                    if str(p_entity.label) == "0":
                        find_flag = False
                        for _sentence in list_sentence:
                            if _sentence.sentence_index == p_entity.sentence_index:
                                _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                                   end_index=p_entity.end_index, size=20, center_include=True,
                                                   word_flag=True, use_text=True,
                                                   text=re.sub(")", ")", re.sub("(", "(", p_entity.entity_text)))
                                if re.search(self.pattern_tenderee_left, _span[0]) or re.search(self.pattern_tenderee_left_w0, _span[0]):  # entities with a keyword on the left are not checked against project names
                                    find_flag = True
                                    break
                                for _name in list_name:
                                    if _name != "" and str(_span[0][-10:]+_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0:  # prepend some left context to catch companies that are not at the start of the project name
                                        find_flag = True
                                        if p_entity.values[0] > on_value:
                                            p_entity.values[0] = 0.5 + (p_entity.values[0] - 0.5) / 10
                                        else:
                                            p_entity.values[0] = on_value  # 2022/03/08 fix for cases like 223985179 where the company appears in the leading project name but never reached prob 0.5
                        if find_flag:
                            continue
                    # regex recall of roles below threshold or from the "none" class
                    role_prob = float(p_entity.values[int(p_entity.label)])
                    if role_prob < on_value or str(p_entity.label) == "5":
                        # entities found inside a project name default to tenderee
                        _list_name = self._check_input(list_name, ignore=True)
                        find_flag = False
                        for _name in _list_name:  # 2022/1/5 fix: any role appearing in a project name is marked tenderee at every position
                            if str(_name).find(re.sub(")", ")", re.sub("(", "(",
                                                                       p_entity.entity_text))) >= 0 and p_entity.sentence_index < 4:
                                for _sentence in list_sentence:
                                    if _sentence.sentence_index == p_entity.sentence_index:
                                        _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                                           end_index=p_entity.end_index, size=20, center_include=True,
                                                           word_flag=True, use_text=True, text=re.sub(")", ")",
                                                                                                      re.sub("(", "(",
                                                                                                             p_entity.entity_text)))
                                        if str(_span[1] + _span[2][:len(str(_name))]).find(
                                                _name) >= 0:
                                            # NOTE(review): agency_set appears to be a module-level set of known
                                            # agency names defined elsewhere in this file — verify.
                                            if p_entity.entity_text in agency_set:  # members of the agency set become agency
                                                find_flag = True
                                                _label = 1
                                                p_entity.label = _label
                                                p_entity.values[int(_label)] = on_value
                                                break
                                            else:
                                                find_flag = True
                                                _label = 0
                                                p_entity.label = _label
                                                p_entity.values[int(_label)] = on_value
                                                break
                            if p_entity.sentence_index >= 4:
                                break
                            if find_flag:
                                break
                        # entity occurred in the title: tenderee by default, skip the rules below
                        if find_flag:
                            continue
                        for s_index in range(len(list_sentence)):
                            if p_entity.doc_id == list_sentence[s_index].doc_id and p_entity.sentence_index == \
                                    list_sentence[s_index].sentence_index:
                                tokens = list_sentence[s_index].tokens
                                begin_index = p_entity.begin_index
                                end_index = p_entity.end_index
                                size = 15
                                spans = spanWindow(tokens, begin_index, end_index, size, center_include=True,
                                                   word_flag=True, use_text=False)
                                # _flag = False
                                # special handling for 中标通知书 (award notice) documents
                                try:
                                    if s_index == 0 and re.search('中标通知书.{,30}[,:]%s:'%p_entity.entity_text.replace('(', '').replace(')', ''),
                                                                  list_sentence[s_index].sentence_text.replace('(', '').replace(')', '')[:100]):
                                        p_entity.label = 2
                                        p_entity.values[2] = 0.5
                                        notfound_tenderer = False
                                        # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group, _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
                                        break
                                except Exception as e:
                                    print('正则报错:', e)
                                # resolve conflicts with regex + distance
                                # 2021/6/11 update center: spans[1] --> spans[0][-30:]+spans[1]
                                list_spans = [spans[0][-30:], spans[0][-10:] + spans[1] + spans[2][:25], spans[2]]  # left / center / right context of the entity
                                for _i_span in range(len(list_spans)):
                                    _flag = False
                                    _prob_weight = 1
                                    # print(list_spans[_i_span],p_entity.entity_text)
                                    for _pattern in self.pattern_whole:
                                        for _iter in re.finditer(_pattern, list_spans[_i_span]):
                                            for _group, _v_group in _iter.groupdict().items():
                                                if _v_group is not None and _v_group != "":
                                                    _role = _group.split("_")[0]
                                                    if _role == "tendereeORagency":  # 2022/3/9 new ambiguous tenderee-or-agency logic
                                                        # print('p_entity_sentenceindex:', p_entity.sentence_index)
                                                        if p_entity.sentence_index>=1:  # only do this fuzzy match in the first sentence
                                                            continue
                                                        if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', p_entity.entity_text)\
                                                                or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', p_entity.entity_text) == None:
                                                            _role = 'tenderee'
                                                        else:
                                                            _role = "agency"
                                                    _direct = _group.split("_")[1]
                                                    _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
                                                    # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                    #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标|建设)服务(单位|机构)|第[四五六七4567]|是否中标:否|序号:\d+,\w{,2}候选|(排名|排序|名次):([4-9]|\d{2,})',  # 135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
                                                                                                        list_spans[0]) == None:  # 2021/12/22 fix wrong winner recall, e.g. 208668937
                                                        _flag = True
                                                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                                                        _prob_weight = 1.2 if _weight=='w1' else 1
                                                        # print('_v_group:',_group, _v_group, p_entity.entity_text)
                                                    if _i_span == 1 and _direct == "center" and _v_group.find(p_entity.entity_text) != -1 and re.search('以[^,。;]{10,30}为准', list_spans[1])==None:
                                                        _flag = True
                                                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                                                        _prob_weight = 1.2 if _weight == 'w1' else 1
                                                        # print('_v_group:', _group, _v_group, p_entity.entity_text)
                                                    if _i_span == 2 and _direct == "right":
                                                        _flag = True
                                                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
                                                        _prob_weight = 1.2 if _weight == 'w1' else 1
                                                        # print('_v_group:', _group, _v_group, p_entity.entity_text)
                                    # apply the matched role
                                    if _flag:
                                        if _label in [2, 3, 4]:
                                            notfound_tenderer = False
                                        p_entity.label = _label
                                        p_entity.values[int(_label)] = on_value*_prob_weight + p_entity.values[int(_label)] / 10
                                        # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group, _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
                                        break
                                    if _i_span == 0 and re.search(self.condadate_left, list_spans[_i_span]):
                                        candidates.append(p_entity)
                    elif str(p_entity.label) in ['2', '3', '4']:
                        notfound_tenderer = False
                # regex may recall "other" money entities as tenderee or winner amounts
                if p_entity.entity_type in ["money"]:
                    if str(p_entity.label) == "2":
                        for _sentence in list_sentence:
                            if _sentence.sentence_index == p_entity.sentence_index:
                                _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                                   end_index=p_entity.end_index, size=10, center_include=True,
                                                   word_flag=True, text=p_entity.entity_text)
                                if re.search('(含|在|包括)(\d+)?$', _span[0]):
                                    continue
                                if re.search(',\w{2,}', _span[0]):
                                    _span[0] = _span[0].split(',')[-1]  # avoid misjudging when several prices sit next to each other
                                if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(
                                        self.pattern_money_other, _span[0]) is None:
                                    p_entity.values[0] = 0.8 + p_entity.values[0] / 10
                                    p_entity.label = 0
                                    # print('规则召回预算金额:', p_entity.entity_text, _span[0])
                                if re.search(self.pattern_money_tenderer, _span[0]) is not None:
                                    if re.search(self.pattern_money_other, _span[0]) is not None:
                                        if re.search(self.pattern_money_tenderer, _span[0]).span()[1] > \
                                                re.search(self.pattern_money_other, _span[0]).span()[1]:
                                            p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                            p_entity.label = 1
                                    else:
                                        p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                        p_entity.label = 1
                                if re.search(self.pattern_money_tenderer_whole,
                                             "".join(_span)) is not None and re.search(self.pattern_money_other,
                                                                                       _span[0]) is None:
                                    p_entity.values[1] = 0.8 + p_entity.values[1] / 10
                                    p_entity.label = 1
                                # NOTE(review): the pattern below looks unbalanced ("...价))?") and
                                # would raise re.error when evaluated — verify against the canonical source.
                                elif re.search('(预算金额|最高(投标)?上?限[价额]?格?|招标控制价))?:?([\d.,]+万?元[,(]其中)?(第?[一二三四五0-9](标[段|包]|[分子]包):?[\d.,]+万?元,)*第?[一二三四五0-9](标[段|包]|[分子]包):?$'
                                        , _sentence.sentence_text[:p_entity.wordOffset_begin]):  # handles several adjacent per-lot amounts, e.g. 191705231
                                    p_entity.values[0] = 0.8 + p_entity.values[0] / 10
                                    p_entity.label = 0
                                    # print('规则召回预算金额2:', p_entity.entity_text, _sentence.sentence_text[:p_entity.wordOffset_begin])
            # a single undetermined candidate in a result-type announcement becomes the winner
            if notfound_tenderer and len(candidates) == 1 and re.search(
                    '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书',
                    article.content[:100]):
                for p_entity in candidates:
                    # print('只有一个候选人的作为中标人', p_entity.entity_text)
                    p_entity.label = 2
                    p_entity.values[2] = on_value
            # tenderee-amount extension: a tenderee amount followed by consecutive
            # unrecognised amounts, all matching lot/pack info, are all marked tenderee amounts
            list_p = []
            state = 0
            for p_entity in list_entity:
                for _sentence in list_sentence:
                    if _sentence.sentence_index == p_entity.sentence_index:
                        _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                           end_index=p_entity.end_index, size=20, center_include=True, word_flag=True,
                                           text=p_entity.entity_text)
                        if state == 2:
                            for _p in list_p[1:]:
                                _p.values[0] = 0.8 + _p.values[0] / 10
                                _p.label = 0
                            state = 0
                            list_p = []
                        if state == 0:
                            if p_entity.entity_type in ["money"]:
                                if str(p_entity.label) == "0" and re.search(self.pattern_pack,
                                                                            _span[0] + "-" + _span[2]) is not None:
                                    state = 1
                                    list_p.append(p_entity)
                        elif state == 1:
                            if p_entity.entity_type in ["money"]:
                                if str(p_entity.label) in ["0", "2"] and re.search(self.pattern_pack,
                                                                                   _span[0] + "-" + _span[
                                                                                       2]) is not None and re.search(
                                        self.pattern_money_other,
                                        _span[0] + "-" + _span[2]) is None and p_entity.sentence_index == list_p[
                                        0].sentence_index:
                                    list_p.append(p_entity)
                                else:
                                    state = 2
            if len(list_p) > 1:
                for _p in list_p[1:]:
                    # print("==",_p.entity_text,_p.sentence_index,_p.label)
                    _p.values[0] = 0.8 + _p.values[0] / 10
                    _p.label = 0
                state = 0
                list_p = []
            for p_entity in list_entity:
                # entities in the blacklist can never be a winner: reset label to "none"
                if p_entity.entity_text in self.SET_NOT_TENDERER:
                    p_entity.label = 5
- '''正则补充最后一句实体日期格式为招标或代理 2021/12/30'''
class RoleRuleFinalAdd():
    """Last-chance rule recall of tenderee/agency roles (2021/12/30).

    If no tenderee (label 0) was confidently found, try to promote a
    still-unlabelled org/company entity that appears in one of several strong
    textual contexts — the "<org>, <date>" signature at the end of the main
    text, "户名:/单位名称:", buyer-info fields, "发布单位:" etc. — or,
    failing those, one that appears in the announcement title.
    """

    def predict(self, list_articles, list_sentences, list_entitys, list_codenames):
        """
        Final rule-based role recall; mutates entity labels/values in place.
        :param list_articles: [Article]
        :param list_sentences: [[Sentence]]
        :param list_entitys: [[Entity]] — only the first document is processed
        :param list_codenames: [{"name": ...}]
        :return: None
        """
        # Last ~30 tokens of the last 5 non-attachment sentences: where Chinese
        # announcements carry their "<organisation>, <date>" signature.
        main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
        end_tokens = []
        for sentence in main_sentences[-5:]:
            end_tokens.extend(sentence.tokens)
        text_end = "".join(end_tokens[-30:])
        # Strip trailing department names so the signing org itself is matched,
        # e.g. 285264698 "...苏州卫生职业技术学院,国有资产管理处,2022年11月24日".
        text_end = re.sub(',?(招标办|招投标管理中心|国有资产管理处)', '', text_end)
        # signature + date at the end of the main text
        sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,6}(分公司|部))?),?\s*[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
        sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
        sear_ent2 = re.search('[,:](户名|开户名称|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
        sear_ent3 = re.search('(买家信息|所有权人|土地权属单位|报名咨询|[收送交]货地点)[,:](?P<entity>[\u4e00-\u9fa5()()]{5,20})[0-9\-]*[,。]', list_articles[0].content[:5000])
        sear_ent4 = re.search('(发布(?:人|单位|机构|企业)|项目业主|所属公司|寻源单位)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})[,。]', list_articles[0].content[:5000])
        # strongest source last-tried first-to-weakest? No — tried in this order:
        sear_list = [sear_ent4, sear_ent3, sear_ent2, sear_ent1, sear_ent]

        tenderee_notfound = True
        agency_notfound = True
        tenderee_list = []
        agency_list = []
        ents = []  # unlabelled (label 5) org/company entities, recall pool
        for ent in list_entitys[0]:
            if ent.entity_type in ['org', 'company']:
                if ent.label == 0 and ent.values[ent.label] >= 0.5:
                    if '公共资源交易中心' in ent.entity_text:  # a public resource trading centre is a platform, not tenderee/agency
                        ent.label = 5
                        continue
                    tenderee_list.append(ent.entity_text)
                    tenderee_notfound = False
                elif ent.label == 1:
                    agency_list.append(ent.entity_text)
                    agency_notfound = False
                elif ent.label == 5:
                    if '公共资源交易中心' in ent.entity_text:
                        continue
                    ents.append(ent)

        if sear_ent or sear_ent1 or sear_ent2 or sear_ent3 or sear_ent4:
            for _sear_ent in [_sear for _sear in sear_list if _sear]:
                ent_re = _sear_ent.group('entity')
                ent_re = ent_re.replace(',', '').replace("(", "(").replace(")", ")")
                # NOTE(review): agency_set appears to be a module-level set of known
                # agency names defined elsewhere in this file — verify.
                if tenderee_notfound == True and (re.search('医院|学校|大学|中学|小学|幼儿园|(政府|部|委员会|署|行|局|厅|处|室|科|股|站)$', ent_re)
                                                  or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None) \
                        and ent_re not in agency_list and ent_re not in agency_set:
                    n = 0
                    for i in range(len(ents) - 1, -1, -1):
                        if not ents[i].in_attachment:
                            n += 1
                            if n > 3 and _sear_ent == sear_ent:  # the end-signature rule only examines the last 3 entities
                                break
                        if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text) / len(ent_re) > 0.6):
                            ents[i].label = 0
                            ents[i].values[0] = 0.5
                            tenderee_notfound = False
                            # log('正则最后补充实体: %s'%(ent_re))
                            break
                elif agency_notfound == True and ent_re not in tenderee_list and (
                        re.search('(招投?标|采购|代理|咨询|管理)(服务)?(有限)?(责任)?公司|(采购|招投?标|交易|代理)(服务)?中心|(招标|代理|咨询|管理|监理)', ent_re) or ent_re in agency_set):
                    n = 0
                    for i in range(len(ents) - 1, -1, -1):
                        if not ents[i].in_attachment:
                            n += 1
                            if n > 3 and _sear_ent == sear_ent:  # the end-signature rule only examines the last 3 entities
                                break
                        if ents[i].entity_text == ent_re or (ents[i].entity_text in ent_re and len(ents[i].entity_text) / len(ent_re) > 0.6):
                            ents[i].label = 1
                            ents[i].values[1] = 0.5
                            agency_notfound = False
                            # log('正则最后补充实体: %s'%(ent_re))
                            break
                if not tenderee_notfound:
                    break
        elif list_codenames[0]['name'] != "":  # use a company entity contained in the title as tenderee
            if tenderee_notfound == True:
                # print('list_codenames',list_codenames[0]['name'])
                for ent in ents:
                    if ent.entity_text in list_codenames[0]['name']:
                        ent.label = 0
                        ent.values[0] = 0.5
                        # fix: the original read ``tenderee_notfound == False`` —
                        # a no-op comparison where an assignment was intended
                        tenderee_notfound = False
                        # log('正则召回标题中包含的实体:%s'%ent.entity_text)
                        break
- # 招标人角色召回规则
- class TendereeRuleRecall():
def __init__(self):
    """Compile the context patterns used to recall a missing tenderee."""
    # self.tenderee_left = re.compile("(发布(人|单位|机构)|需求方(信息[,:])?(单位|公司)?名称|购买主体|收货单位|项目申请单位|发起组织|联系单位|"
    #                                 "询价(机构|企业)|联系(人|方式),?(单位|公司)(名称)?|联系(人|方式),名称)[::是为][^。;,]{,5}$")
    # self.tenderee_left_1 = re.compile("采购商公司|询价单位|项目法人单位|项目法人|项目业主名称|申购单位|预算单位|预算单位名称|预算单位单位名称|买方单位|需求公司|寻源单位|项目业主|采购商|业主单位咨询电话|需用单位|采购工厂|征集单位")
    # Left-context keyword groups of decreasing strength (checked elsewhere in order):
    self.tenderee_left_1 = re.compile("((?:采购商|项目法人|项目业主)(名称)?|(?:采购商|询价|项目法人|项目业主|申购|预算|买方|需求|寻源|需用|征集)(单位|公司)((?:单位|公司)?名称)?|询价企业|"
                                      "业主单位咨询电话|购买主体|采购工厂|需求方(信息[,:])?(单位|公司)?名称|采购单位[\((].{1,6}[\))])[::是为][^。;,]{,2}$")
    self.tenderee_left_2 = re.compile("(招标承办单位|交易人(?:名称)?|招标人代表|(采购|招标)联系人|交易单位|发起(单位|组织)|收货单位|使用方|买家信息)[::是为][^。;,]{,2}$")
    # "本公司/我单位(" style self-references
    self.tenderee_left_3 = re.compile("[本我](?:公司|单位)[\(\[(【]?$")
    # self.tenderee_left_4 = re.compile("(采购机构|组织机构|组织方|执行单位|采购组织单位|招标组织单位|招标组织部门|采购执行方|采购执行单位|询价执行组织|组织单位|联系单位|联系部门)[::是为][^。;,]{,2}$")
    self.tenderee_left_4 = re.compile("(采购机构|(?:采购|招标|询价)?(组织|执行)(机构|方|单位|部门|组织)|联系(单位|部门)|联系(人|方式),?(单位|公司)(名称)?|联系(人|方式),名称)[::是为][^。;,]{,2}$")
    self.tenderee_left_5 = re.compile("(撰写单位|发布(?:人|单位|机构|公司|部门|企业))[^。;,]{,2}$")
    # Right-context patterns: "(以下简称我/本…)", "就…进行采购", etc.
    self.tenderee_right = re.compile("^[^。;::]{,5}[((](以?下简?称)?,?[,\"“]*[我本][\u4e00-\u9fa5]{1,2}[,\"”]*[))]|"
                                     "^[\((][^。;::\))]{,5}称(?:招标|采购)(?:人|单位)|"
                                     "^[^。;::]{,10}[对就][^。;,]+,?[^。;,]{,20}进行[^。;,]*(采购|询比?价|遴选|招投?标|征集)|"
                                     "^[^。;::]{,10}关于[^。;,]+,?[^。;,]{,20}的[^。;,]{,20}公告|"
                                     "^[^。;,::]{,10}的[^。;,]+,?[^。;,]{,20}正在[^。;,]{,5}进行|"
                                     "^[^。;,::]{,10}的[^。;,]+,?[^。,;]{,20}已?[^。;,]{,20}批准|"
                                     "^[^。;,::]{,15}(选定|选取|征集|遴选)[^。;,]{,20}(供应商|(代理|咨询|设计)[^。;,]{,5}机构|代理人)")
    self.tenderee_right2 = re.compile("^[^。;,::]{,10}(招标办|采购部|办事处|采购小?组)")
    # right context carrying the project name (captured as "project")
    self.tenderee_right3 = re.compile("^[^。;,::]{,10}(对|就|关于|的)(?P<project>[^。;,?!::]{4,40})")
    # rule to detect the announcement's subject, e.g. 我院 / 本校
    self.subject = re.compile("[我本][院校局]")
    # regexes to recall entities the NER missed (captured as "unrecognized")
    self.unrecognized1 = re.compile("(?P<tenderee_left>((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)" \
                                    "(人|商|公司|单位|组织|用户|业主|主体|方|部门))" \
                                    "(信息[,:]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
    self.unrecognized2 = re.compile("(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
                                    "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)" \
                                    "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
    # suffix checks validating an unrecognized-entity candidate (org-like endings)
    self.unrecognized_end1 = re.compile(
        "^[\u4e00-\u9fa5]{2,}?(?:公司|医院|学校|学院|大学|中学|小学|幼儿园|政府|指挥部|办公室|项目部|业主大会|监狱|教育局|委员会|研究所|招标办|采购部|办事处|水利局|公墓|中心|联合社|合作社)")
    self.unrecognized_end2 = re.compile("^[\u4e00-\u9fa5]{4,}(?:署|局|厅|处|室|科|部|站|所|股|行|园)")
- def predict(self, list_articles,list_sentences, list_entitys, list_codenames):
- self.get_tenderee = False
- ents = []
- list_name = []
- agency_set = set()
- for ent in list_entitys[0]:
- if ent.entity_type == 'name':
- list_name.append(ent.entity_text)
- if ent.entity_type in ['org', 'company']:
- if ent.label == 0 and ent.values[ent.label]>=0.5:
- self.get_tenderee = True
- break
- elif ent.label == 1:
- if ent.values[ent.label]>0.5:
- agency_set.add(ent.entity_text)
- elif ent.label == 5:
- if len(ent.entity_text)>=4:
- ents.append(ent)
- if not self.get_tenderee:
- self.entity_context_rule(ents,list_name,list_sentences,list(agency_set))
- if not self.get_tenderee:
- self.subject_rule(ents,list_articles,list_sentences)
- if not self.get_tenderee:
- self.unrecognized_entity_rule(self.unrecognized1,list_sentences,list_entitys,0.55)
- if not self.get_tenderee:
- self.unrecognized_entity_rule(self.unrecognized2,list_sentences,list_entitys,0.5)
- #entity上下文正则判断
- def entity_context_rule(self,entitys,list_name,list_sentences,list_agency):
- list_sentences[0].sort(key=lambda x:x.sentence_index)
- entity_data = []
- for ent in entitys:
- _sentence = list_sentences[0][ent.sentence_index]
- _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index,
- end_index=ent.end_index, size=40, center_include=True,
- word_flag=True, use_text=True,
- text=re.sub(")", ")", re.sub("(", "(", ent.entity_text)))
- entity_data.append((ent,_span))
- if not self.get_tenderee:
- for _data in entity_data:
- ent = _data[0]
- _span = _data[1]
- if re.search(self.tenderee_left_1,_span[0]):
- ent.label = 0
- ent.values[0] = 0.5 + ent.values[0] / 10
- self.get_tenderee = True
- if not self.get_tenderee:
- for _data in entity_data:
- ent = _data[0]
- _span = _data[1]
- if re.search(self.tenderee_left_2,_span[0]):
- ent.label = 0
- ent.values[0] = 0.5 + ent.values[0] / 10
- self.get_tenderee = True
- if not self.get_tenderee:
- for _data in entity_data:
- ent = _data[0]
- _span = _data[1]
- if re.search(self.tenderee_left_3,_span[0]):
- ent.label = 0
- ent.values[0] = 0.5 + ent.values[0] / 10
- self.get_tenderee = True
- if not self.get_tenderee:
- for _data in entity_data:
- ent = _data[0]
- _span = _data[1]
- if re.search(self.tenderee_left_4,_span[0]):
- if len(list_agency)>0:
- _same = False
- for agency in list_agency:
- if ent.entity_text in agency or agency in ent.entity_text:
- _same = True
- break
- if not _same:
- ent.label = 0
- ent.values[0] = 0.5 + ent.values[0] / 10
- self.get_tenderee = True
- else:
- if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent.entity_text
- ) or not re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent.entity_text) or re.search("自行.?采购",list_sentences[0][ent.sentence_index].sentence_text):
- ent.label = 0
- ent.values[0] = 0.5 + ent.values[0] / 10
- self.get_tenderee = True
- if not self.get_tenderee:
- for _data in entity_data:
- ent = _data[0]
- _span = _data[1]
- if re.search(self.tenderee_left_5,_span[0]):
- if len(list_agency)>0:
- _same = False
- for agency in list_agency:
- if ent.entity_text in agency or agency in ent.entity_text:
- _same = True
- break
- if not _same:
- ent.label = 0
- ent.values[0] = 0.5 + ent.values[0] / 10
- self.get_tenderee = True
- else:
- if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', ent.entity_text
- ) or not re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent.entity_text):
- ent.label = 0
- ent.values[0] = 0.5 + ent.values[0] / 10
- self.get_tenderee = True
- if not self.get_tenderee:
- for _data in entity_data:
- ent = _data[0]
- _span = _data[1]
- if re.search(self.tenderee_right, _span[2]):
- ent.label = 0
- ent.values[0] = 0.5 + ent.values[0] / 10
- self.get_tenderee = True
- if not self.get_tenderee:
- for _data in entity_data:
- ent = _data[0]
- _span = _data[1]
- if re.search(self.tenderee_right2, _span[2]):
- ent.label = 0
- ent.values[0] = 0.5 + ent.values[0] / 10
- self.get_tenderee = True
- if not self.get_tenderee:
- if list_name:
- for _data in entity_data:
- ent = _data[0]
- _span = _data[1]
- pj_name = re.search(self.tenderee_right3, _span[2])
- if pj_name:
- pj_name = pj_name.groupdict()["project"]
- for _name in list_name:
- if _name in pj_name:
- ent.label = 0
- ent.values[0] = 0.5
- self.get_tenderee = True
- break
- # for _data in entity_data:
- # ent = _data[0]
- # _span = _data[1]
- # if re.search(self.tenderee_left,_span[0]):
- # ent.label = 0
- # ent.values[0] = 0.5 + ent.values[0] / 10
- # self.get_tenderee = True
- # elif re.search(self.tenderee_right,_span[2]):
- # ent.label = 0
- # ent.values[0] = 0.5 + ent.values[0] / 10
- # self.get_tenderee = True
- # elif re.search(self.tenderee_right2, _span[2]):
- # ent.label = 0
- # ent.values[0] = 0.5 + ent.values[0] / 10
- # self.get_tenderee = True
- # elif list_name:
- # pj_name = re.search(self.tenderee_right3, _span[2])
- # if pj_name:
- # pj_name = pj_name.groupdict()["project"]
- # for _name in list_name:
- # if _name in pj_name:
- # ent.label = 0
- # ent.values[0] = 0.5
- # self.get_tenderee = True
- # break
- # 公告主语判断
- def subject_rule(self, entitys,list_articles,list_sentences):
- content = list_articles[0].content.split('##attachment##')[0]
- if re.search(self.subject,content):
- _subject = re.search(self.subject,content).group()
- for ent in entitys:
- if re.search("院",_subject) and re.search("医院|学院",ent.entity_text):
- ent.label = 0
- ent.values[0] = 0.5 + ent.values[0] / 10
- self.get_tenderee = True
- elif re.search("校",_subject) and re.search("学校|学院|大学|高中|初中|中学|小学",ent.entity_text):
- ent.label = 0
- ent.values[0] = 0.5 + ent.values[0] / 10
- self.get_tenderee = True
- elif re.search("局", _subject) and re.search("局", ent.entity_text):
- _sentence = list_sentences[0][ent.sentence_index]
- _span = spanWindow(tokens=_sentence.tokens, begin_index=ent.begin_index,
- end_index=ent.end_index, size=20, center_include=True,
- word_flag=True, use_text=True,
- text=re.sub(")", ")", re.sub("(", "(", ent.entity_text)))
- if not re.search("监督|投诉",_span[0][-10:]):
- ent.label = 0
- ent.values[0] = 0.5 + ent.values[0] / 10
- self.get_tenderee = True
- # 正则召回未识别实体
- def unrecognized_entity_rule(self,pattern,list_sentences,list_entitys,on_value=0.5):
- list_sentence = list_sentences[0]
- for in_attachment in [False,True]:
- for sentence in [sentence for sentence in list_sentence if sentence.in_attachment==in_attachment]:
- sentence_text = sentence.sentence_text
- tokens = sentence.tokens
- doc_id = sentence.doc_id
- in_attachment = sentence.in_attachment
- list_tokenbegin = []
- begin = 0
- for i in range(0, len(tokens)):
- list_tokenbegin.append(begin)
- begin += len(str(tokens[i]))
- list_tokenbegin.append(begin + 1)
- for _match in re.finditer(pattern,sentence_text):
- _groupdict = _match.groupdict()
- _match_text = _match.group()
- _unrecognized_text = _groupdict["unrecognized"]
- _unrecognized = re.search(self.unrecognized_end1,_unrecognized_text)
- if not _unrecognized:
- _unrecognized = re.search(self.unrecognized_end2, _unrecognized_text)
- if _unrecognized:
- _unrecognized = _unrecognized.group()
- else:
- continue
- # print(_unrecognized)
- if re.search("某",_unrecognized) or len(_unrecognized)>15:
- continue
- begin_index_temp = _match.start()+len(_groupdict['tenderee_left'])
- for j in range(len(list_tokenbegin)):
- if list_tokenbegin[j] == begin_index_temp:
- begin_index = j
- break
- elif list_tokenbegin[j] > begin_index_temp:
- begin_index = j - 1
- break
- index = begin_index_temp + len(_unrecognized)
- end_index_temp = index
- for j in range(begin_index, len(list_tokenbegin)):
- if list_tokenbegin[j] >= index:
- end_index = j - 1
- break
- entity_id = "%s_%d_%d_%d" % (doc_id, sentence.sentence_index, begin_index, end_index)
- entity_text = _unrecognized
- new_entity = Entity(doc_id, entity_id, entity_text, 'company', sentence.sentence_index, begin_index, end_index,
- begin_index_temp, end_index_temp, in_attachment=in_attachment)
- new_entity.label = 0
- new_entity.values = [on_value,0,0,0,0,0]
- list_entitys[0].append(new_entity)
- self.get_tenderee = True
- if self.get_tenderee:
- list_entitys[0] = sorted(list_entitys[0], key=lambda x: (x.sentence_index, x.begin_index))
- break
class RoleGrade():
    """Rule-based probability grading for role entities (tenderee/agency/winner...).

    Pattern names encode ``role_direction_grade``; the grade digit becomes the
    new probability tier.
    """
    def __init__(self):
        # Cue patterns; group name format is <role>_<direction>_<grade>.
        self.tenderee_left_9 = "(?P<tenderee_left_9>(招标|采购|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|方|单位))"
        self.tenderee_center_9 = "(?P<tenderee_center_9>受.{5,20}委托)"
        self.tenderee_left_8 = "(?P<tenderee_left_8>(业主|转让方|尊敬的供应商|出租方|处置方|(需求|建设|最终|发包|甲)(人|方|单位|组织|用户|业主|主体|部门|公司)))"
        self.agency_left_9 = "(?P<agency_left_9>代理)"
        self.winTenderer_left_9 = "(?P<winTenderer_left_9>(中标|中选|中价|成交|竞得)|第[1一]|排名:1)"
        self.winTenderer_left_8 = "(?P<winTenderer_left_8>(入选供应商|供货商|乙方))"
        self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排名:2))"
        self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排名:3))"
        self.pattern_list = [self.tenderee_left_9, self.tenderee_center_9, self.tenderee_left_8, self.agency_left_9, self.winTenderer_left_9,
                             self.winTenderer_left_8, self.secondTenderer_left_9, self.thirdTenderer_left_9]

    def predict(self, list_sentences, list_entitys, span=15, min_prob=0.7):
        '''
        Assign tiered probabilities to role entities by rule; three tiers
        0.9-1, 0.8-0.9, 0.7-0.8 (in attachments: 0.7-0.8, 0.6-0.7, 0.5-0.6).
        Also demotes tenderees with probability < 0.6 that appear in the
        big-data agency set to the agency role.
        :param list_sentences: sentences of each article
        :param list_entitys: entities of each article (modified in place)
        :param span: context window size in characters
        :param min_prob: minimum probability for an entity to be re-graded
        :return: None (entities are updated in place)
        '''
        sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
        role2id = {"tenderee": 0, "agency": 1, "winTenderer": 2, "secondTenderer": 3, "thirdTenderer": 4}
        org_winner = []
        company_winner = []
        org_tenderee = []
        for entity in list_entitys[0]:
            if entity.entity_type in ['org', 'company'] and entity.label in [0, 1, 2, 3, 4] and entity.values[entity.label] > min_prob:
                text = sentences[entity.sentence_index].sentence_text
                in_att = sentences[entity.sentence_index].in_attachment
                pre_prob = entity.values[entity.label]
                b = entity.wordOffset_begin
                e = entity.wordOffset_end
                not_found = 1
                for pattern in self.pattern_list:
                    # Direction is read from the pattern's name, not its match.
                    if 'left' in pattern:
                        context = text[max(0, b - span):b]
                    elif 'right' in pattern:
                        context = text[e:e + span]
                    elif 'center' in pattern:
                        context = text[max(0, b - span):e + span]
                    else:
                        # NOTE(review): if this branch is ever hit, `context` keeps its
                        # previous value (or is unbound on the first pattern). All
                        # current patterns contain 'left' or 'center', so it is dead.
                        print('规则错误', pattern)
                    ser = re.search(pattern, context)
                    if ser:
                        groupdict = pattern.split('>')[0].replace('(?P<', '')
                        _role, _direct, _prob = groupdict.split('_')
                        _label = role2id.get(_role)
                        if _label != entity.label:
                            continue
                        _prob = int(_prob) * 0.1
                        if in_att:
                            # Attachment evidence is weaker: drop one tier.
                            _prob = _prob - 0.2
                        if pre_prob < _prob:
                            _prob = 0.65
                        entity.values[_label] = _prob + entity.values[_label] / 20
                        not_found = 0
                        break
                if not_found and entity.values[entity.label] > min_prob:
                    # No cue found: cap at min_prob tier (lower in attachments).
                    _prob = min_prob - 0.1 if in_att else min_prob
                    entity.values[entity.label] = _prob + entity.values[entity.label] / 20
                if entity.label == 2 and entity.values[entity.label] > min_prob:
                    if entity.entity_type == 'org':
                        org_winner.append(entity)
                    elif entity.entity_type == 'company':
                        company_winner.append(entity)  # collect winner entities
                if entity.label == 0 and entity.values[entity.label] > min_prob:
                    org_tenderee.append(entity.entity_text)  # collect all tenderee names
            # NOTE(review): `agency_set` is not defined in this class or method —
            # presumably a module-level set of known agency names from big data;
            # verify it is in scope wherever predict() is called.
            if entity.entity_type in ['org', 'company'] and entity.label == 0 and entity.entity_text in agency_set and entity.values[entity.label] < 0.6:
                # Demote low-confidence tenderees that are known agencies.
                entity.label = 1
                entity.values[entity.label] = 0.5
        if org_winner != []:
            flag = 0
            if org_tenderee != []:
                for ent in org_winner:
                    if ent.entity_text in org_tenderee:
                        # An org that is also the tenderee is unlikely the winner.
                        ent.values[2] = 0.6
                        flag = 1
            if flag == 0 and company_winner != []:
                for ent in org_winner:
                    if ent.label == 2 and ent.values[2] > 0.6:
                        # When both org and company winners exist, prefer the company.
                        ent.values[2] = 0.6
class MoneyGrade():
    """Rule-based probability grading for money entities.

    Embedded group names encode ``role_direction_grade``; the grade digit
    becomes the new probability tier for the matching money role.
    """
    def __init__(self):
        # Left-context cue patterns for tenderee budgets vs tenderer bids.
        self.tenderee_money_left_9 = "(?P<tenderee_left_9>最高(投标)?限价)|控制价|拦标价"
        self.tenderee_money_left_8 = "(?P<tenderee_left_8>预算|限价|起始|起拍|底价|标底)"
        self.tenderer_money_left_9 = "(?P<tenderer_left_9>(中标|成交|合同|总报价))"
        self.tenderer_money_left_8 = "(?P<tenderer_left_8>(投标|总价))"
        # NOTE: tenderer_money_left_8 is deliberately kept out of the active list.
        self.pattern_list = [self.tenderee_money_left_9, self.tenderee_money_left_8, self.tenderer_money_left_9]

    def predict(self, list_sentences, list_entitys, span=10, min_prob=0.7):
        """Re-grade money-entity probabilities from left-context cues (in place)."""
        ordered_sentences = sorted(list_sentences[0], key=lambda s: s.sentence_index)
        role_to_label = {"tenderee": 0, "tenderer": 1}
        for money_ent in list_entitys[0]:
            # Only confident money entities labeled tenderee(0)/tenderer(1) qualify.
            if money_ent.entity_type not in ['money']:
                continue
            if money_ent.label not in [0, 1] or money_ent.values[money_ent.label] <= 0.6:
                continue
            sent = ordered_sentences[money_ent.sentence_index]
            sent_text = sent.sentence_text
            from_attachment = sent.in_attachment
            begin = money_ent.wordOffset_begin
            _end = money_ent.wordOffset_end  # read for parity with original; unused
            left_context = sent_text[max(0, begin - span):begin]
            matched = False
            for cue in self.pattern_list:
                if not re.search(cue, left_context):
                    continue
                # Decode role/direction/grade from the pattern's group name.
                group_name = cue.split('>')[0].replace('(?P<', '')
                role_name, _direction, grade = group_name.split('_')
                cue_label = role_to_label.get(role_name)
                if cue_label != money_ent.label:
                    continue
                new_prob = int(grade) * 0.1
                if from_attachment:
                    # Attachment evidence is weaker: drop one tier.
                    new_prob = new_prob - 0.2
                money_ent.values[cue_label] = new_prob + money_ent.values[cue_label] / 20
                matched = True
                break
            if not matched and money_ent.values[money_ent.label] > min_prob:
                # No cue found: cap at the min_prob tier (lower in attachments).
                fallback = min_prob - 0.1 if from_attachment else min_prob
                money_ent.values[money_ent.label] = fallback + money_ent.values[money_ent.label] / 20
# Time-category classifier
class TimePredictor():
    """Classifies 'time' entities into time categories with a saved TF model.

    The model consumes left/right token-context word embeddings of each time
    entity; predictions are written back via ``entity.set_Role``.
    """
    def __init__(self, config=None):
        self.sess = tf.Session(graph=tf.Graph(), config=config)
        self.inputs_code = None
        self.outputs_code = None
        # (n_contexts, context_len, embedding_dim)
        self.input_shape = (2, 40, 128)
        self.load_model()

    def load_model(self):
        """Lazily load the SavedModel and cache its input/output tensors."""
        model_path = os.path.dirname(__file__) + '/timesplit_model'
        if self.inputs_code is None:
            log("get model of time")
            with self.sess.as_default():
                with self.sess.graph.as_default():
                    meta_graph_def = tf.saved_model.loader.load(self.sess, tags=["serve"], export_dir=model_path)
                    signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
                    signature_def = meta_graph_def.signature_def
                    self.inputs_code = []
                    self.inputs_code.append(
                        self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name))
                    self.inputs_code.append(
                        self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name))
                    self.outputs_code = self.sess.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
                    return self.inputs_code, self.outputs_code
        else:
            return self.inputs_code, self.outputs_code

    def search_time_data(self, list_sentences, list_entitys):
        """Collect embedded left/right contexts for every time entity.

        Returns ``[data_x, points_entitys]`` where data_x has shape
        (2, n_entities, context_len, dim), or None when no time entity exists.
        """
        data_x = []
        points_entitys = []
        for list_sentence, list_entity in zip(list_sentences, list_entitys):
            p_entitys = 0
            p_sentences = 0
            list_sentence.sort(key=lambda x: x.sentence_index)
            while (p_entitys < len(list_entity)):
                entity = list_entity[p_entitys]
                if entity.entity_type in ['time']:
                    while (p_sentences < len(list_sentence)):
                        sentence = list_sentence[p_sentences]
                        if entity.doc_id == sentence.doc_id and entity.sentence_index == sentence.sentence_index:
                            # Token-level left/right windows around the entity.
                            s = spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index, end_index=entity.end_index, size=self.input_shape[1])
                            left = s[0]
                            right = s[1]
                            context = [left, right]
                            x = self.embedding_words(context, shape=self.input_shape)
                            data_x.append(x)
                            points_entitys.append(entity)
                            break
                        p_sentences += 1
                p_entitys += 1
        if len(points_entitys) == 0:
            return None
        # Move the context axis first: (entity, context, len, dim) -> (context, entity, len, dim).
        data_x = np.transpose(np.array(data_x), (1, 0, 2, 3))
        return [data_x, points_entitys]

    def embedding_words(self, datas, shape):
        '''
        Look up word vectors for the given token lists.
        :param datas: list of token lists (left context, right context)
        :param shape: output array shape
        :return: zero-padded embedding array of the given shape; unknown tokens
                 map to the 'unk' vector
        '''
        model_w2v = getModel_w2v()
        embed = np.zeros(shape)
        length = shape[1]
        out_index = 0
        for data in datas:
            index = 0
            for item in data:
                # Whitespace inside a token is stripped before lookup.
                item_not_space = re.sub("\s*", "", item)
                if index >= length:
                    break
                if item_not_space in model_w2v.vocab:
                    embed[out_index][index] = model_w2v[item_not_space]
                    index += 1
                else:
                    embed[out_index][index] = model_w2v['unk']
                    index += 1
            out_index += 1
        return embed

    def predict(self, list_sentences, list_entitys):
        """Run the model and write the predicted time label onto each entity."""
        datas = self.search_time_data(list_sentences, list_entitys)
        if datas is None:
            return
        points_entitys = datas[1]
        with self.sess.as_default():
            predict_y = limitRun(self.sess, [self.outputs_code], feed_dict={self.inputs_code[0]: datas[0][0],
                                                                            self.inputs_code[1]: datas[0][1]})[0]
            for i in range(len(predict_y)):
                entity = points_entitys[i]
                label = np.argmax(predict_y[i])
                values = []
                for item in predict_y[i]:
                    values.append(item)
                if label != 0:
                    # Reject labels whose text does not parse as a valid time.
                    if not timeFormat(entity.entity_text):
                        label = 0
                        values[0] = 0.5
                entity.set_Role(label, values)
# Product field extraction
class ProductPredictor():
    """Sequence-labeling extractor for products (and failure reasons) via a CRF model.

    Tag scheme (from the decode paths used below): 1-2-3 spans mark products,
    4-5-6 spans mark failure reasons.
    """
    def __init__(self, config=None):
        vocabpath = os.path.dirname(__file__) + "/codename_vocab.pk"
        self.vocab = load(vocabpath)
        self.word2index = dict((w, i) for i, w in enumerate(np.array(self.vocab)))
        self.sess = tf.Session(graph=tf.Graph(), config=config)
        self.load_model()

    def load_model(self):
        """Load the frozen graph and cache input/output tensors."""
        model_path = os.path.dirname(__file__) + '/product_savedmodel/productAndfailreason.pb'
        with self.sess.as_default():
            with self.sess.graph.as_default():
                output_graph_def = tf.GraphDef()
                with open(model_path, 'rb') as f:
                    output_graph_def.ParseFromString(f.read())
                    tf.import_graph_def(output_graph_def, name='')
                    self.sess.run(tf.global_variables_initializer())
                    self.char_input = self.sess.graph.get_tensor_by_name('CharInputs:0')
                    self.length = self.sess.graph.get_tensor_by_name("Sum:0")
                    self.dropout = self.sess.graph.get_tensor_by_name("Dropout:0")
                    self.logit = self.sess.graph.get_tensor_by_name("logits/Reshape:0")
                    self.tran = self.sess.graph.get_tensor_by_name("crf_loss/transitions:0")

    def decode(self, logits, lengths, matrix):
        """Viterbi-decode CRF logits into tag paths (one path per sequence)."""
        paths = []
        small = -1000.0
        # 7 real tags + start tag (the 4-tag start vector was the old model).
        start = np.asarray([[small] * 7 + [0]])
        for score, length in zip(logits, lengths):
            score = score[:length]
            pad = small * np.ones([length, 1])
            logits = np.concatenate([score, pad], axis=1)
            logits = np.concatenate([start, logits], axis=0)
            path, _ = viterbi_decode(logits, matrix)
            paths.append(path[1:])
        return paths

    def predict(self, list_sentences, list_entitys=None, list_articles=[], fail=False, MAX_AREA=5000):
        '''
        Predict product entities; each sentence is truncated at MAX_AREA chars.
        :param list_sentences: sentence lists per article
        :param list_entitys: entity lists per article (new entities appended in place)
        :param list_articles: article list, used only in failure mode
        :param fail: when True, also extract the bid-failure reason from the article body
        :param MAX_AREA: per-sentence character cap
        :return: ({'fail_reason': str}, product_list)
        NOTE: list_articles has a shared mutable default; it is only read here.
        '''
        with self.sess.as_default() as sess:
            with self.sess.graph.as_default():
                result = []
                product_list = []
                if fail and list_articles != []:
                    # Failure mode: run over the raw article text instead of sentences.
                    text_list = [list_articles[0].content[:MAX_AREA]]
                    chars = [[self.word2index.get(it, self.word2index.get('<unk>')) for it in text] for text in text_list]
                    if USE_API:
                        requests_result = requests.post(API_URL + "/predict_product",
                                                        json={"inputs": chars}, verify=True)
                        batch_paths = json.loads(requests_result.text)['result']
                        lengths = json.loads(requests_result.text)['lengths']
                    else:
                        lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
                                                          feed_dict={
                                                              self.char_input: np.asarray(chars),
                                                              self.dropout: 1.0
                                                          })
                        batch_paths = self.decode(scores, lengths, tran_)
                    for text, path, length in zip(text_list, batch_paths, lengths):
                        tags = ''.join([str(it) for it in path[:length]])
                        # Extract products (tag span 1-2*-3).
                        for it in re.finditer("12*3", tags):
                            start = it.start()
                            end = it.end()
                            _entity = Entity(doc_id=list_articles[0].id, entity_id="%s_%s_%s_%s" % (
                                list_articles[0].doc_id, 0, start, end),
                                             entity_text=text[start:end],
                                             entity_type="product", sentence_index=0,
                                             begin_index=0, end_index=0, wordOffset_begin=start,
                                             wordOffset_end=end)
                            list_entitys[0].append(_entity)
                            product_list.append(text[start:end])
                        # Extract failure reasons (tag span 4-5*-6).
                        for it in re.finditer("45*6", tags):
                            start = it.start()
                            end = it.end()
                            result.append(text[start:end].replace('?', '').strip())
                    # Deduplicate reasons; a checkbox-marked reason wins outright.
                    reasons = []
                    for it in result:
                        if "(√)" in it or "(√)" in it:
                            reasons = [it]
                            break
                        if reasons != [] and (it not in reasons[-1] and it not in reasons):
                            reasons.append(it)
                        elif reasons == []:
                            reasons.append(it)
                    if reasons == []:
                        # Model found nothing: fall back to regex extraction.
                        for text in text_list:
                            ser1 = re.search('\w{,4}(理由|原因):\s*((第\d+包|标项\d+|原因类型)?[::]?[\s*\w,]{2,30}((不满?足|少于|未达)((法定)?[123一二三两]家|(规定)?要求)|(项目|采购)(终止|废标)),?)+', text)
                            ser2 = re.search(
                                '\w{,4}(理由|原因):\s*(第\d+包|标项\d+|原因类型)?[::]?[\s*\w]{4,30},', text)
                            if ser1:
                                reasons.append(ser1.group(0))
                                break
                            elif ser2:
                                reasons.append(ser2.group(0))
                                break
                    return {'fail_reason': ';'.join(reasons)}, product_list
                if list_entitys is None:
                    list_entitys = [[] for _ in range(len(list_sentences))]
                for list_sentence, list_entity in zip(list_sentences, list_entitys):
                    if len(list_sentence) == 0:
                        result.append({"product": []})
                        continue
                    # Sort longest-first so each batch shares one padded length.
                    list_sentence.sort(key=lambda x: len(x.sentence_text), reverse=True)
                    _begin_index = 0
                    item = {"product": []}
                    temp_list = []
                    while True:
                        MAX_LEN = len(list_sentence[_begin_index].sentence_text)
                        if MAX_LEN > MAX_AREA:
                            MAX_LEN = MAX_AREA
                        # Batch size chosen so batch_size * seq_len stays <= MAX_AREA.
                        _LEN = MAX_AREA // MAX_LEN
                        chars = [sentence.sentence_text[:MAX_LEN] for sentence in list_sentence[_begin_index:_begin_index + _LEN]]
                        chars = [[self.word2index.get(it, self.word2index.get('<unk>')) for it in l] for l in chars]
                        chars = pad_sequences(chars, maxlen=MAX_LEN, padding="post", truncating="post")
                        if USE_API:
                            requests_result = requests.post(API_URL + "/predict_product",
                                                            json={"inputs": chars.tolist()}, verify=True)
                            batch_paths = json.loads(requests_result.text)['result']
                            lengths = json.loads(requests_result.text)['lengths']
                        else:
                            lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
                                                              feed_dict={
                                                                  self.char_input: np.asarray(chars),
                                                                  self.dropout: 1.0
                                                              })
                            batch_paths = self.decode(scores, lengths, tran_)
                        for sentence, path, length in zip(list_sentence[_begin_index:_begin_index + _LEN], batch_paths, lengths):
                            tags = ''.join([str(it) for it in path[:length]])
                            for it in re.finditer("12*3", tags):
                                start = it.start()
                                end = it.end()
                                _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (
                                    sentence.doc_id, sentence.sentence_index, start, end),
                                                 entity_text=sentence.sentence_text[start:end],
                                                 entity_type="product", sentence_index=sentence.sentence_index,
                                                 begin_index=0, end_index=0, wordOffset_begin=start,
                                                 wordOffset_end=end, in_attachment=sentence.in_attachment)
                                list_entity.append(_entity)
                                temp_list.append(sentence.sentence_text[start:end])
                                product_list.append(sentence.sentence_text[start:end])
                        if _begin_index + _LEN >= len(list_sentence):
                            break
                        _begin_index += _LEN
                    item["product"] = list(set(temp_list))
                    result.append(item)  # bug fix: append once per article, after all batches
                return {'fail_reason': ""}, product_list
# Extraction of product quantity / unit price / brand / specs; 2021/11/10: also extracts project, demand, budget and time fields from tables
- class ProductAttributesPredictor():
    def __init__(self,):
        # p1: strong "product-name" header regex (label + "名称/内容/描述" suffix);
        # p2: looser single-keyword fallback used when p1 finds nothing.
        self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)'
        self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称'
        # Pre-built set of known table-header strings (pickled alongside this module).
        with open(os.path.dirname(__file__) + '/header_set.pkl', 'rb') as f:
            self.header_set = pickle.load(f)
- def isTrueTable(self, table):
- '''真假表格规则:
- 1、包含<caption>或<th>标签为真
- 2、包含大量链接、表单、图片或嵌套表格为假
- 3、表格尺寸太小为假
- 4、外层<table>嵌套子<table>,一般子为真,外为假'''
- if table.find_all(['caption', 'th']) != []:
- return True
- elif len(table.find_all(['form', 'a', 'img'])) > 5:
- return False
- elif len(table.find_all(['tr'])) < 2:
- return False
- elif len(table.find_all(['table'])) >= 1:
- return False
- else:
- return True
- def getTrs(self, tbody):
- # 获取所有的tr
- trs = []
- objs = tbody.find_all(recursive=False)
- for obj in objs:
- if obj.name == "tr":
- trs.append(obj)
- if obj.name == "tbody":
- for tr in obj.find_all("tr", recursive=False):
- trs.append(tr)
- return trs
- def getTable(self, tbody):
- trs = self.getTrs(tbody)
- inner_table = []
- if len(trs) < 2:
- return inner_table
- for tr in trs:
- tr_line = []
- tds = tr.findChildren(['td', 'th'], recursive=False)
- if len(tds) < 2:
- continue
- for td in tds:
- td_text = re.sub('\s+|…', ' ', td.get_text()).strip()
- td_text = td_text.replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '/').replace('"', '') # 修复272144312 # 产品单价数量提取结果有特殊符号\ 气动执行装置备件\密封组件\NBR+PT
- td_text = td_text.replace("(", "(").replace(")", ")").replace(':', ':')
- tr_line.append(td_text)
- inner_table.append(tr_line)
- return inner_table
    def fixSpan(self, tbody):
        """Expand colspan/rowspan cells in place so every row has a full set of cells.

        Columns are completed before rows; doing it the other way round can
        scramble the parsed table.
        """
        trs = self.getTrs(tbody)
        ths_len = 0
        ths = list()
        trs_set = set()
        # Pass 1: colspan completion.
        for indtr, tr in enumerate(trs):
            ths_tmp = tr.findChildren('th', recursive=False)
            # Do not expand rows that contain a nested table.
            if len(tr.findChildren('table')) > 0:
                continue
            if len(ths_tmp) > 0:
                ths_len = ths_len + len(ths_tmp)
                for th in ths_tmp:
                    ths.append(th)
                trs_set.add(tr)
            tds = tr.findChildren(recursive=False)
            if len(tds) < 3:
                continue  # too few columns: not worth completing
            for indtd, td in enumerate(tds):
                # A colspan cell is duplicated into the following positions of the same row.
                if 'colspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['colspan']))) != "":
                    col = int(re.sub("[^0-9]", "", str(td['colspan'])))
                    # Guard against absurd spans / huge merged cells.
                    if col < 10 and len(td.get_text()) < 500:
                        td['colspan'] = 1
                        for i in range(1, col, 1):
                            td.insert_after(copy.copy(td))
        # Pass 2: rowspan completion.
        for indtr, tr in enumerate(trs):
            ths_tmp = tr.findChildren('th', recursive=False)
            # Do not expand rows that contain a nested table.
            if len(tr.findChildren('table')) > 0:
                continue
            if len(ths_tmp) > 0:
                ths_len = ths_len + len(ths_tmp)
                for th in ths_tmp:
                    ths.append(th)
                trs_set.add(tr)
            tds = tr.findChildren(recursive=False)
            # If every cell of the row shares the same rowspan, the row is a
            # uniformly merged band — skip it entirely.
            same_span = 0
            if len(tds) > 1 and 'rowspan' in tds[0].attrs:
                span0 = tds[0].attrs['rowspan']
                for td in tds:
                    if 'rowspan' in td.attrs and td.attrs['rowspan'] == span0:
                        same_span += 1
            if same_span == len(tds):
                continue
            for indtd, td in enumerate(tds):
                # A rowspan cell is duplicated into the same column of the following rows.
                if 'rowspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['rowspan']))) != "":
                    row = int(re.sub("[^0-9]", "", str(td['rowspan'])))
                    td['rowspan'] = 1
                    for i in range(1, row, 1):
                        # Insert a copy at the matching position of each later row.
                        if indtr + i < len(trs):
                            tds1 = trs[indtr + i].findChildren(['td', 'th'], recursive=False)
                            if len(tds1) >= (indtd) and len(tds1) > 0:
                                if indtd > 0:
                                    tds1[indtd - 1].insert_after(copy.copy(td))
                                else:
                                    tds1[0].insert_before(copy.copy(td))
                            elif len(tds1) > 0 and len(tds1) == indtd - 1:
                                tds1[indtd - 2].insert_after(copy.copy(td))
- def get_monthlen(self, year, month):
- '''输入年份、月份 int类型 得到该月份天数'''
- try:
- weekday, num = calendar.monthrange(int(year), int(month))
- except:
- num = 30
- return str(num)
    def fix_time(self, text, html, page_time):
        '''Normalize a raw date string into an (order_begin, order_end) pair
        of "YYYY-MM-DD" strings; returns ("", "") when nothing parses.

        :param text: raw date cell text (Chinese numerals are mapped to digits first)
        :param html: full document text, searched for a year when text has only a month
        :param page_time: page publication date, fallback year source
        '''
        # Map Chinese month numerals to digits (longest first so "十二" wins over "二").
        for it in [('十二', '12'), ('十一', '11'), ('十', '10'), ('九', '9'), ('八', '8'), ('七', '7'),
                   ('六', '6'), ('五', '5'), ('四', '4'), ('三', '3'), ('二', '2'), ('一', '1')]:
            if it[0] in text:
                text = text.replace(it[0], it[1])
        # Case 1: bare month ("5月") — the year comes from the html, then the
        # page time, then the current year; the span is the whole month.
        if re.search('^\d{1,2}月$', text):
            m = re.search('^(\d{1,2})月$', text).group(1)
            if len(m) < 2:
                m = '0' + m
            year = re.search('(\d{4})年(.{,12}采购意向)?', html)
            if year:
                y = year.group(1)
                num = self.get_monthlen(y, m)
                if len(num) < 2:
                    num = '0' + num
                order_begin = "%s-%s-01" % (y, m)
                order_end = "%s-%s-%s" % (y, m, num)
            elif page_time != "":
                year = re.search('\d{4}', page_time)
                if year:
                    y = year.group(0)
                    num = self.get_monthlen(y, m)
                    if len(num) < 2:
                        num = '0' + num
                    order_begin = "%s-%s-01" % (y, m)
                    order_end = "%s-%s-%s" % (y, m, num)
                else:
                    y = str(datetime.datetime.now().year)
                    num = self.get_monthlen(y, m)
                    if len(num) < 2:
                        num = '0' + num
                    order_begin = "%s-%s-01" % (y, m)
                    order_end = "%s-%s-%s" % (y, m, num)
            else:
                y = str(datetime.datetime.now().year)
                num = self.get_monthlen(y, m)
                if len(num) < 2:
                    num = '0' + num
                order_begin = "%s-%s-01" % (y, m)
                order_end = "%s-%s-%s" % (y, m, num)
            return order_begin, order_end
        # Case 2: year + month ("2021年5", "2021/5", "2021.5", "2021-5") — whole month.
        t1 = re.search('^(\d{4})(年|/|\.|-)(\d{1,2})月?$', text)
        if t1:
            year = t1.group(1)
            month = t1.group(3)
            num = self.get_monthlen(year, month)
            if len(month) < 2:
                month = '0' + month
            if len(num) < 2:
                num = '0' + num
            order_begin = "%s-%s-01" % (year, month)
            order_end = "%s-%s-%s" % (year, month, num)
            return order_begin, order_end
        # Case 3: full date ("2021年5月10日" and separator variants) — single day.
        t2 = re.search('^(\d{4})(年|/|\.|-)(\d{1,2})(月|/|\.|-)(\d{1,2})日?$', text)
        if t2:
            y = t2.group(1)
            m = t2.group(3)
            d = t2.group(5)
            m = '0' + m if len(m) < 2 else m
            d = '0' + d if len(d) < 2 else d
            order_begin = order_end = "%s-%s-%s" % (y, m, d)
            return order_begin, order_end
        # Case 4: compact year+month, e.g. "202105" — whole month.
        t3 = re.search("^(20\d{2})(\d{1,2})$", text)
        if t3:
            year = t3.group(1)
            month = t3.group(2)
            if int(month) > 0 and int(month) <= 12:
                num = self.get_monthlen(year, month)
                if len(month) < 2:
                    month = '0' + month
                if len(num) < 2:
                    num = '0' + num
                order_begin = "%s-%s-01" % (year, month)
                order_end = "%s-%s-%s" % (year, month, num)
                return order_begin, order_end
        # Case 5: compact full date, e.g. "20210510" — single day.
        t4 = re.search("^(20\d{2})(\d{2})(\d{2})$", text)
        if t4:
            year = t4.group(1)
            month = t4.group(2)
            day = t4.group(3)
            if int(month) > 0 and int(month) <= 12 and int(day) > 0 and int(day) <= 31:
                order_begin = order_end = "%s-%s-%s" % (year, month, day)
                return order_begin, order_end
        # Case 6: explicit range "Y1年M1月D1日 到/至/- [Y2年]M2月[D2日]".
        all_match = re.finditer('^(?P<y1>\d{4})(年|/|\.)(?P<m1>\d{1,2})(?:(月|/|\.)(?:(?P<d1>\d{1,2})日)?)?'
                                '(到|至|-)(?:(?P<y2>\d{4})(年|/|\.))?(?P<m2>\d{1,2})(?:(月|/|\.)'
                                '(?:(?P<d2>\d{1,2})日)?)?$', text)
        y1 = m1 = d1 = y2 = m2 = d2 = ""
        found_math = False
        for _match in all_match:
            if len(_match.group()) > 0:
                found_math = True
                for k, v in _match.groupdict().items():
                    if v != "" and v is not None:
                        if k == 'y1':
                            y1 = v
                        elif k == 'm1':
                            m1 = v
                        elif k == 'd1':
                            d1 = v
                        elif k == 'y2':
                            y2 = v
                        elif k == 'm2':
                            m2 = v
                        elif k == 'd2':
                            d2 = v
        if not found_math:
            return "", ""
        # Fill omitted range parts: same year, first day, last day of the month.
        y2 = y1 if y2 == "" else y2
        d1 = '1' if d1 == "" else d1
        d2 = self.get_monthlen(y2, m2) if d2 == "" else d2
        m1 = '0' + m1 if len(m1) < 2 else m1
        m2 = '0' + m2 if len(m2) < 2 else m2
        d1 = '0' + d1 if len(d1) < 2 else d1
        d2 = '0' + d2 if len(d2) < 2 else d2
        order_begin = "%s-%s-%s" % (y1, m1, d1)
        order_end = "%s-%s-%s" % (y2, m2, d2)
        return order_begin, order_end
def fix_quantity(self, quantity_text, header_quan_unit):
    '''
    Normalize a raw product-quantity string into a numeric string plus unit.

    :param quantity_text: raw quantity cell text
    :param header_quan_unit: unit parsed from the table header, used as fallback
                             when the cell itself carries no unit
    :return: (quantity, quantity_unit) as strings; both empty when no number found
    '''
    quantity = quantity_text
    # drop brackets/commas/"approx." markers; map 一/壹 to the digit 1
    quantity = re.sub('[()(),,约]', '', quantity)
    quantity = re.sub('[一壹]', '1', quantity)
    # (?:\.\d+)? instead of \.?\d* so a trailing dot ("3.") is never captured
    # as part of the number — same pattern as the text-based extractor.
    ser = re.search('^(\d+(?:\.\d+)?)([㎡\w/]{,5})', quantity)
    if ser:
        quantity = str(ser.group(1))
        quantity_unit = ser.group(2)
        if quantity_unit == "" and header_quan_unit != "":
            quantity_unit = header_quan_unit
    else:
        quantity = ""
        quantity_unit = ""
    return quantity, quantity_unit
def find_header(self, items, p1, p2):
    '''
    Check one row of inner_table against header regexes; if it is a header row,
    return the column index of each recognised header field.

    :param items: list of td cell texts for one table row
    :param p1: primary (preferred) product-name header regex
    :param p2: secondary product-name header regex
    :return: (header_dic mapping field name -> column index, flag whether this
             row is a usable header, product-attr header texts tuple,
             demand-info header texts tuple)
    '''
    flag = False
    header_dic = {'名称': '', '数量': '', '单位': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': ''}
    product = ""  # product name header text
    quantity = ""  # quantity header text
    quantity_unit = ""  # quantity-unit header text
    unitPrice = ""  # unit-price header text
    brand = ""  # brand header text
    specs = ""  # specification header text
    demand = ""  # procurement-demand header text
    budget = ""  # budget header text
    order_time = ""  # procurement-time header text
    # the product-name column must be among the first 4 columns
    for i in range(min(4, len(items))):
        it = items[i]
        if len(it) < 15 and re.search(p1, it) != None:
            flag = True
            product = it
            header_dic['名称'] = i
            break
    if not flag:
        # fall back to the secondary regex, excluding columns that look like
        # ids / companies / prices / other non-name headers
        for i in range(min(4, len(items))):
            it = items[i]
            if len(it) < 15 and re.search(p2, it) and re.search(
                    '编号|编码|号|情况|报名|单位|位置|地址|数量|单价|价格|金额|品牌|规格类型|型号|公司|中标人|企业|供应商|候选人', it) == None:
                flag = True
                product = it
                header_dic['名称'] = i
                break
    if flag:
        # scan the columns after the name column for the other header fields
        # (relies on the loop variable i surviving the for loop above)
        for j in range(i + 1, len(items)):
            # skip cells that are long free text rather than a header label
            if len(items[j]) > 20 and len(re.sub('[\((].*[)\)]|[^\u4e00-\u9fa5]', '', items[j])) > 10:
                continue
            if header_dic['数量']=="" and re.search('数量|采购量', items[j]) and re.search('单价|用途|要求|规格|型号|运输|承运', items[j])==None:
                header_dic['数量'] = j
                quantity = items[j]
            elif header_dic['单位']=="" and re.search('^(数量单位|计量单位|单位)$', items[j]):
                header_dic['单位'] = j
                quantity_unit = items[j]
            elif re.search('单价', items[j]):
                header_dic['单价'] = j
                unitPrice = items[j]
            elif re.search('品牌', items[j]):
                header_dic['品牌'] = j
                brand = items[j]
            elif re.search('规格|型号', items[j]):
                header_dic['规格'] = j
                specs = items[j]
            elif re.search('需求|服务要求|服务标准', items[j]):
                header_dic['需求'] = j
                demand = items[j]
            elif re.search('预算|控制金额', items[j]):
                header_dic['预算'] = j
                budget = items[j]
            elif re.search('时间|采购实施月份|采购月份|采购日期', items[j]):
                header_dic['时间'] = j
                order_time = items[j]
        if header_dic.get('名称', "") != "" :
            # require at least 2 recognised header fields besides the name
            num = 0
            for it in (quantity, unitPrice, brand, specs, product, demand, budget, order_time):
                if it != "":
                    num += 1
            if num >=2:
                return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs), (product, demand, budget, order_time)
        flag = False
    return header_dic, flag, (product, quantity, quantity_unit, unitPrice, brand, specs), (product, demand, budget, order_time)
def predict(self, docid='', html='', page_time=""):
    '''
    Regex-based extraction of product attributes from HTML tables.

    :param docid: document id (unused here; kept for interface compatibility)
    :param html: announcement HTML source
    :param page_time: page publish time, used when normalising dates
    :return: ([product_attrs dict, demand_info dict], total product money)
    '''
    soup = BeautifulSoup(html, 'lxml')
    # whether the announcement is a "procurement intention" document
    flag_yx = True if re.search('采购意向|招标意向|选取意向|意向公告|意向公示|意向公开', html) else False
    tables = soup.find_all(['table'])
    headers = []
    headers_demand = []
    header_col = []
    product_link = []
    demand_link = []
    total_product_money = 0
    # iterate tables from last to first
    for i in range(len(tables)-1, -1, -1):
        table = tables[i]
        # flatten tiny nested tables into plain text so the parent parses cleanly
        if table.parent.name == 'td' and len(table.find_all('td')) <= 3:
            table.string = table.get_text()
            table.name = 'turntable'
            continue
        if not self.isTrueTable(table):
            continue
        self.fixSpan(table)
        inner_table = self.getTable(table)
        i = 0  # NOTE: the outer loop variable is reused as the row index from here on
        found_header = False
        header_quan_unit = ""  # unit embedded in the quantity header cell
        header_colnum = 0
        if flag_yx:
            # intention documents often use a 2-column key/value layout
            col0_l = []
            col1_l = []
            for tds in inner_table:
                if len(tds) == 2:
                    col0_l.append(re.sub('[::]', '', tds[0]))  # two-column layout
                    col1_l.append(tds[1])
                elif len(tds)>=4 and len(inner_table)==2:  # two-row layout
                    col0_l = inner_table[0]
                    col1_l = inner_table[1]
                    break
            if len(set(col0_l) & self.header_set) > len(col0_l) * 0.2 and len(col0_l)==len(col1_l):  # require matching column counts
                header_list2 = []
                product = demand = budget = order_begin = order_end = ""
                for i in range(len(col0_l)):
                    if re.search('项目名称', col0_l[i]):
                        header_list2.append(col0_l[i])
                        product = col1_l[i]
                    elif re.search('采购需求|需求概况', col0_l[i]):
                        header_list2.append(col0_l[i])
                        demand = col1_l[i]
                    elif re.search('采购预算|预算金额|控制金额', col0_l[i]):
                        header_list2.append(col0_l[i])
                        _budget = col1_l[i]
                        re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", _budget)
                        if re_price:
                            _budget = re_price[0]
                            # header says 万元 but value lacks 万: append the unit
                            if '万元' in col0_l[i] and '万' not in _budget:
                                _budget += '万元'
                        budget = str(getUnifyMoney(_budget))
                    elif re.search('采购时间|采购实施月份|采购月份|采购日期', col0_l[i]):
                        header_list2.append(col0_l[i])
                        order_time = col1_l[i].strip()
                        order_begin, order_end = self.fix_time(order_time, html, page_time)
                        if order_begin != "" and order_end!="":
                            order_begin_year = int(order_begin.split("-")[0])
                            order_end_year = int(order_end.split("-")[0])
                            # guard against dates mis-read from attachments
                            if order_begin_year>=2050 or order_end_year>=2050:
                                order_begin = order_end = ""
                if product!= "" and demand != "" and budget!="" and order_begin != "" and len(budget)<15:  # keep budgets shorter than 15 digits only
                    link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
                            'order_begin': order_begin, 'order_end': order_end}
                    if link not in demand_link:
                        demand_link.append(link)
                        headers_demand.append('_'.join(header_list2))
                continue
        while i < (len(inner_table)):
            tds = inner_table[i]
            not_empty = [it for it in tds if it != ""]
            # skip rows with <2 columns or where half the cells are empty/duplicated
            if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds)<2:
                i += 1
                continue
            product = ""  # product
            quantity = ""  # quantity
            quantity_unit = ""  # quantity unit
            unitPrice = ""  # unit price
            brand = ""  # brand
            specs = ""  # specification
            demand = ""  # procurement demand
            budget = ""  # budget amount
            order_time = ""  # procurement time
            order_begin = ""
            order_end = ""
            if len(set([re.sub('[::]','',td) for td in tds]) & self.header_set) > len(tds) * 0.2:
                # row looks like a header row
                header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2)
                if found_header:
                    header_colnum = len(tds)  # remember the header row's column count
                if found_header and isinstance(header_list, tuple) and len(header_list) > 2:  # extract unit from the quantity header
                    quantity_header = header_list[1].replace('单位:', '')
                    if re.search('(([\w/]{,5}))', quantity_header):
                        header_quan_unit = re.search('(([\w/]{,5}))', quantity_header).group(1)
                    else:
                        header_quan_unit = ""
                if found_header and len(headers)<1:  # keep only the first header encountered
                    headers.append('_'.join(header_list))
                    headers_demand.append('_'.join(header_list2))
                    header_col.append('_'.join(tds))
                i += 1
                continue
            elif found_header:
                if len(tds) != header_colnum:  # row/header column counts differ: skip
                    i += 1
                    continue
                id1 = header_dic.get('名称', "")
                id2 = header_dic.get('数量', "")
                id2_2 = header_dic.get('单位', "")
                id3 = header_dic.get('单价', "")
                id4 = header_dic.get('品牌', "")
                id5 = header_dic.get('规格', "")
                id6 = header_dic.get('需求', "")
                id7 = header_dic.get('预算', "")
                id8 = header_dic.get('时间', "")
                not_attr = 0
                for k, v in header_dic.items():
                    if isinstance(v, int):
                        if v >= len(tds) or tds[v] in self.header_set:
                            not_attr = 1
                            break
                if not_attr:  # stop matching as soon as an attribute cell is itself a header
                    i += 1
                    found_header = False
                    continue
                if re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \
                        re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id1]) == None:
                    product = tds[id1]
                if id2 != "":
                    if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]):
                        quantity = tds[id2]
                if id2_2 != "":
                    if re.search('^\w{1,4}$', tds[id2_2]):
                        quantity_unit = tds[id2_2]
                if id3 != "":
                    if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]):
                        unitPrice = tds[id3]
                    elif re.search('^[\d,.亿万元人民币欧美日金额:()()]+$', tds[id3].strip()):
                        unitPrice = tds[id3]
                if id4 != "":
                    if re.search('\w', tds[id4]):
                        brand = tds[id4]
                    else:
                        brand = ""
                if id5 != "":
                    if re.search('\w', tds[id5]):
                        specs = tds[id5][:500]  # cap specification at 500 chars
                    else:
                        specs = ""
                if id6 != "":
                    if re.search('\w', tds[id6]):
                        demand = tds[id6]
                    else:
                        demand = ""
                if id7 != "":
                    if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id7]):
                        budget = tds[id7]
                if id8 != "":
                    if re.search('\w', tds[id8]):
                        order_time = tds[id8].strip()
                        order_begin, order_end = self.fix_time(order_time, html, page_time)
                if quantity != "" or unitPrice != "" or brand != "" or specs != "":
                    if quantity != "":
                        quantity, quantity_unit = self.fix_quantity(quantity, header_quan_unit)
                    if unitPrice != "":
                        unitPrice, _money_unit = money_process(unitPrice, header_list[2])
                        unitPrice = str(unitPrice) if unitPrice != 0 else ""
                    if budget != "":
                        budget, _money_unit = money_process(budget, header_list2[2])
                        budget = str(budget) if budget != 0 else ''
                    # one cell may pack several products, comma/space separated
                    # (e.g. docs 292846806, 292650743)
                    if id2 != "" and id3 != "" and len(re.split('[,,\s]', tds[id2])) > 1 and len(re.split('[,,\s]', tds[id1])) == len(re.split('[,,\s]', tds[id2])):
                        products = re.split('[,,\s]', tds[id1])
                        quantitys = re.split('[,,\s]', tds[id2])
                        unitPrices = re.split('[,,\s]', tds[id3])
                        brands = re.split('[,,\s]', brand)
                        specses = re.split('[,,\s]', specs)
                        if len(products) == len(quantitys) == len(unitPrices) == len(brands) == len(specses):
                            for product, quantity, unitPrice, brand, specs in zip(products,quantitys,unitPrices, brands, specses):
                                link = {'product': product, 'quantity': quantity,
                                        'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
                                        'brand': brand[:50], 'specs': specs}
                                if link not in product_link:
                                    product_link.append(link)
                                    mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
                                    if link['unitPrice'] != "" and mat:
                                        try:
                                            # accumulate price*quantity; quantities >= 50000 are ignored
                                            total_product_money += float(link['unitPrice']) * float(
                                                mat.group(1).replace(',', '')) if float(
                                                mat.group(1).replace(',', '')) < 50000 else 0
                                        except:
                                            log('产品属性单价数量相乘出错, 单价: %s, 数量: %s' % (
                                                link['unitPrice'], link['quantity']))
                    elif len(unitPrice) > 15 or len(product)>100:  # unit price > 15 digits or name > 100 chars: drop row
                        i += 1
                        continue
                    else:
                        link = {'product': product, 'quantity': quantity, 'quantity_unit': quantity_unit, 'unitPrice': unitPrice,
                                'brand': brand[:50], 'specs':specs}
                        if link not in product_link:
                            product_link.append(link)
                            mat = re.match('([0-9.,]+)[((]?\w{,3}[))]?$', link['quantity'])
                            if link['unitPrice'] != "" and mat:
                                try:
                                    total_product_money += float(link['unitPrice'])*float(mat.group(1).replace(',', '')) if float(mat.group(1).replace(',', ''))<50000 else 0
                                except:
                                    log('产品属性单价数量相乘出错, 单价: %s, 数量: %s'%(link['unitPrice'], link['quantity']))
                if order_begin != "" and order_end != "":
                    order_begin_year = int(order_begin.split("-")[0])
                    order_end_year = int(order_end.split("-")[0])
                    # guard against dates mis-read from attachments
                    if order_begin_year >= 2050 or order_end_year >= 2050:
                        order_begin = order_end = ""
                if budget != "" and order_time != "":
                    link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end}
                    if link not in demand_link:
                        demand_link.append(link)
                i += 1
            else:
                i += 1
    if len(product_link)>0:
        attr_dic = {'product_attrs':{'data':product_link, 'header':headers, 'header_col':header_col}}
    else:
        attr_dic = {'product_attrs': {'data': [], 'header': [], 'header_col': []}}
    if len(demand_link)>0:
        demand_dic = {'demand_info':{'data':demand_link, 'header':headers_demand, 'header_col':header_col}}
    else:
        demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}}
    return [attr_dic, demand_dic], total_product_money
def predict_without_table(self,product_attrs,list_sentences,list_entitys,codeName,prem, html='', page_time=""):
    '''
    Fallback demand-info extraction for intention documents without a usable
    table: looks for a single consistent procurement time near time entities,
    then builds one demand record from the project name / budget in prem.

    :param product_attrs: [product_attrs dict, demand_info dict]; demand data is appended in place
    :param list_sentences: per-document sentence lists (project objects)
    :param list_entitys: per-document entity lists (project objects)
    :param codeName: extracted project code/name results
    :param prem: extracted package/money results
    :param html: announcement text, used by fix_time
    :param page_time: page publish time fallback
    :return: product_attrs with demand_info possibly extended
    '''
    # only when there is exactly one package in prem — a single overall record
    if len(prem[0]['prem'])==1:
        list_sentences[0].sort(key=lambda x:x.sentence_index)
        list_sentence = list_sentences[0]
        list_entity = list_entitys[0]
        _data = product_attrs[1]['demand_info']['data']
        re_bidding_time = re.compile("(采购时间|采购实施月份|采购月份|采购日期)[::,].{0,2}$")
        order_times = []
        for entity in list_entity:
            if entity.entity_type=='time':
                sentence = list_sentence[entity.sentence_index]
                # left context window before the time entity
                s = spanWindow(tokens=sentence.tokens, begin_index=entity.begin_index,
                               end_index=entity.end_index,size=20)
                entity_left = "".join(s[0])
                # keep only time entities immediately preceded by a "procurement time" label
                if re.search(re_bidding_time,entity_left):
                    time_text = entity.entity_text.strip()
                    standard_time = re.compile("((?P<year>\d{4}|\d{2})\s*[-\/年\.]\s*(?P<month>\d{1,2})\s*[-\/月\.]\s*((?P<day>\d{1,2})日?)?)")
                    time_match = re.search(standard_time,time_text)
                    if time_match:
                        time_text = time_match.group()
                    order_times.append(time_text)
        # normalise and drop times fix_time could not parse
        order_times = [tuple(self.fix_time(order_time, html, page_time)) for order_time in order_times]
        order_times = [order_time for order_time in order_times if order_time[0]!=""]
        # require all found times to agree on a single (begin, end) pair
        if len(set(order_times))==1:
            order_begin,order_end = order_times[0]
            project_name = codeName[0]['name']
            pack_info = [pack for pack in prem[0]['prem'].values()]
            budget = pack_info[0].get('tendereeMoney',0)
            product = prem[0]['product']
            link = {'project_name': project_name, 'product': product, 'demand': project_name, 'budget': budget,
                    'order_begin': order_begin, 'order_end': order_end}
            _data.append(link)
        product_attrs[1]['demand_info']['data'] = _data
    return product_attrs
def predict_by_text(self,product_attrs,html,list_outlines,page_time=""):
    '''
    Fallback extraction of product attributes from outline (plain-text)
    "key: value" sections when no table yielded results.

    :param product_attrs: [product_attrs dict, demand_info dict]; updated in place
    :param html: announcement text, used by fix_time
    :param list_outlines: per-document outline section lists (project objects)
    :param page_time: page publish time fallback
    :return: updated product_attrs
    '''
    list_outline = list_outlines[0]
    get_product_attrs = False
    for _outline in list_outline:
        # only sections whose summary looks like a product/info listing
        if re.search("信息|情况|清单|概况",_outline.outline_summary):
            outline_text = _outline.outline_text
            outline_text = outline_text.replace(_outline.outline_summary,"")
            # keep clauses that contain a "key:value" colon
            key_value_list = [_split for _split in re.split("[,。;]",outline_text) if re.search("[::]",_split)]
            if not key_value_list:
                continue
            head_list = []
            head_value_list = []
            for key_value in key_value_list:
                # strip leading list numbering like "一、" / "1." / "(一)"
                key_value = re.sub("^[一二三四五六七八九十]{1,3}[、.]|^[\d]{1,2}[、.]\d{,2}|^[\((]?[一二三四五六七八九十]{1,3}[\))][、]?","",key_value)
                temp = re.split("[::]",key_value)
                key = temp[-2]
                key = re.sub("^[一二三四五六七八九十]{1,3}[、.]|^[\d]{1,2}[、.]\d{,2}|^[\((]?[一二三四五六七八九十]{1,3}[\))][、]?","",key)
                value = temp[-1]
                head_list.append(key)
                head_value_list.append(value)
            head_set = set(head_list)
            if len(head_set & self.header_set) > len(head_set)*0.2:
                # split the flat key list into repeating groups (one group per product):
                # a repeated key marks the start of the next group
                loop_list = []
                begin_list = [0]
                for index,head in enumerate(head_list):
                    if head not in loop_list:
                        loop_list.append(head)
                    else:
                        begin_list.append(index)
                        loop_list = []
                        loop_list.append(head)
                headers = []
                headers_demand = []
                header_col = []
                product_link = []
                demand_link = []
                for idx in range(len(begin_list)):
                    if idx==len(begin_list)-1:
                        deal_list = head_value_list[begin_list[idx]:]
                        tmp_head_list = head_list[begin_list[idx]:]
                    else:
                        deal_list = head_value_list[begin_list[idx]:begin_list[idx+1]]
                        tmp_head_list = head_list[begin_list[idx]:begin_list[idx+1]]
                    product = ""  # product
                    quantity = ""  # quantity
                    quantity_unit = ""  # quantity unit
                    unitPrice = ""  # unit price
                    brand = ""  # brand
                    specs = ""  # specification
                    demand = ""  # procurement demand
                    budget = ""  # budget amount
                    order_time = ""  # procurement time
                    order_begin = ""
                    order_end = ""
                    header_dic, found_header, header_list, header_list2 = self.find_header(tmp_head_list, self.p1,self.p2)
                    if found_header:
                        headers.append('_'.join(header_list))
                        headers_demand.append('_'.join(header_list2))
                        header_col.append('_'.join(tmp_head_list))
                        id1 = header_dic.get('名称', "")
                        id2 = header_dic.get('数量', "")
                        id2_2 = header_dic.get('单位', "")
                        id3 = header_dic.get('单价', "")
                        id4 = header_dic.get('品牌', "")
                        id5 = header_dic.get('规格', "")
                        id6 = header_dic.get('需求', "")
                        id7 = header_dic.get('预算', "")
                        id8 = header_dic.get('时间', "")
                        if re.search('[a-zA-Z\u4e00-\u9fa5]', deal_list[id1]) and deal_list[id1] not in self.header_set and \
                                re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', deal_list[id1]) == None:
                            product = deal_list[id1]
                        if id2 != "":
                            if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', deal_list[id2]):
                                quantity = deal_list[id2]
                                quantity = re.sub('[()(),,约]', '', quantity)
                                quantity = re.sub('[一壹]', '1', quantity)
                                ser = re.search('^(\d+(?:\.\d+)?)([㎡\w/]{,5})', quantity)
                                if ser:
                                    quantity = str(ser.group(1))
                                    quantity_unit = ser.group(2)
                                    # drop implausibly large quantities (>= 1e8)
                                    if float(quantity)>=10000*10000:
                                        quantity = ""
                                        quantity_unit = ""
                                else:
                                    quantity = ""
                                    quantity_unit = ""
                        if id2_2 != "":
                            if re.search('^\w{1,4}$', deal_list[id2_2]):
                                quantity_unit = deal_list[id2_2]
                            else:
                                quantity_unit = ""
                        if id3 != "":
                            if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id3]):
                                _unitPrice = deal_list[id3]
                                re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_unitPrice)
                                if re_price:
                                    _unitPrice = re_price[0]
                                    # header says 万元 but value lacks 万: append the unit
                                    if '万元' in header_list[2] and '万' not in _unitPrice:
                                        _unitPrice += '万元'
                                unitPrice = getUnifyMoney(_unitPrice)
                                # drop implausibly large unit prices (>= 1e8)
                                if unitPrice>=10000*10000:
                                    unitPrice = ""
                                unitPrice = str(unitPrice)
                        if id4 != "":
                            if re.search('\w', deal_list[id4]):
                                brand = deal_list[id4]
                            else:
                                brand = ""
                        if id5 != "":
                            if re.search('\w', deal_list[id5]):
                                specs = deal_list[id5]
                            else:
                                specs = ""
                        if id6 != "":
                            if re.search('\w', deal_list[id6]):
                                demand = deal_list[id6]
                            else:
                                demand = ""
                        if id7 != "":
                            if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', deal_list[id7]):
                                _budget = deal_list[id7]
                                re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?",_budget)
                                if re_price:
                                    _budget = re_price[0]
                                    if '万元' in header_list2[2] and '万' not in _budget:
                                        _budget += '万元'
                                budget = str(getUnifyMoney(_budget))
                                # drop implausibly large budgets (>= 1e9)
                                if float(budget)>= 100000*10000:
                                    budget = ""
                        if id8 != "":
                            if re.search('\w', deal_list[id8]):
                                order_time = deal_list[id8].strip()
                                order_begin, order_end = self.fix_time(order_time, html, page_time)
                        if quantity != "" or unitPrice != "" or brand != "" or specs != "":
                            link = {'product': product, 'quantity': quantity, 'quantity_unit':quantity_unit,'unitPrice': unitPrice,
                                    'brand': brand[:50], 'specs': specs}
                            if link not in product_link:
                                product_link.append(link)
                        if order_begin != "" and order_end != "":
                            order_begin_year = int(order_begin.split("-")[0])
                            order_end_year = int(order_end.split("-")[0])
                            # guard against dates mis-read from attachments
                            if order_begin_year >= 2050 or order_end_year >= 2050:
                                order_begin = order_end = ""
                        if budget != "" and order_time != "":
                            link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget,
                                    'order_begin': order_begin, 'order_end': order_end}
                            if link not in demand_link:
                                demand_link.append(link)
                if len(product_link) > 0:
                    attr_dic = {'product_attrs': {'data': product_link, 'header': list(set(headers)), 'header_col': list(set(header_col))}}
                    get_product_attrs = True
                else:
                    attr_dic = {'product_attrs': {'data': [], 'header': [], 'header_col': []}}
                if len(demand_link) > 0:
                    demand_dic = {'demand_info': {'data': demand_link, 'header': headers_demand, 'header_col': header_col}}
                else:
                    demand_dic = {'demand_info': {'data': [], 'header': [], 'header_col': []}}
                product_attrs[0] = attr_dic
                # only replace demand info when the table pass produced none
                if len(product_attrs[1]['demand_info']['data']) == 0:
                    product_attrs[1] = demand_dic
                if get_product_attrs:
                    break
    return product_attrs
def add_product_attrs(self,channel_dic, product_attrs, list_sentences,list_entitys,list_outlines,codeName,prem,text,page_time):
    '''
    Run the fallback extractors when table extraction found nothing, then
    enrich each demand record with products that appear in its project name.
    Mutates product_attrs in place; returns nothing.
    '''
    # intention documents with no demand rows: try the entity-based fallback
    if channel_dic['docchannel']['docchannel']=="采购意向" and not product_attrs[1]['demand_info']['data']:
        product_attrs = self.predict_without_table(product_attrs, list_sentences,list_entitys,codeName,prem,text,page_time)
    # no product rows at all: try the plain-text outline fallback
    if not product_attrs[0]['product_attrs']['data']:
        product_attrs = self.predict_by_text(product_attrs,text,list_outlines,page_time)
    demand_rows = product_attrs[1]['demand_info']['data']
    if demand_rows:
        known_products = set(prem[0]['product'])
        for row in demand_rows:
            for prod in known_products:
                # a product mentioned inside the project name belongs to this demand
                if prod in row['project_name'] and prod not in row['product']:
                    row['product'].append(prod)
- # docchannel 类型提取 — document channel / type classification
- class DocChannel():
def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb',config=None):
    '''
    Load the two frozen TF graphs (life-cycle and document-type classifiers)
    and prepare label maps and keyword/regex patterns.

    :param life_model: path (relative to this file) of the life-cycle .pb model
    :param type_model: path (relative to this file) of the doc-type .pb model
    :param config: optional tf session config passed to the life-cycle session
    '''
    self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
    self.mask, self.mask_title = self.load_life(life_model,config)
    self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
    self.type_mask, self.type_mask_title = self.load_type(type_model)
    self.sequen_len = 200  # content sequence length (earlier values: 150, 200)
    self.title_len = 30  # title sequence length
    self.sentence_num = 10  # max keyword-centred snippets taken from the body
    # keywords used to pick informative sentence windows from the body
    self.kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
    # class-id -> label mappings for the two classifiers
    lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
    lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
    self.id2type = {k: v for k, v in enumerate(lb_type)}
    self.id2life = {k: v for k, v in enumerate(lb_life)}
    self.load_pattern()
def load_pattern(self):
    '''
    Build the rule dictionaries used to post-correct / override the model
    predictions: regexes keyed by label, separately for body text and title.
    Keys ending in "neg" are negative patterns that veto the matching label.
    '''
    # body-text patterns for the document *type* classifier
    self.type_dic = {
        '土地矿产': '供地结果|(土地|用地|宗地|地块|海域|矿)的?(基本信息|基本情况|概况|信息|详情|来源|用途|性质|编号|位置|坐落|使用年限|出让年限)|(土地|山地|农田)(经营权)?(出让|出租|招租|租赁|承包|流转)|流转土地',
        '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|活动|信息|结果|成交|主体|标的|资产|财产|方式|类型|流程|程序|规则|价格|保证金|时间)|(公开|进行|密封)(拍卖|变卖|竞拍)|第[一二三]次拍卖|(资产|司法|网络)拍卖|交易方式.{,2}拍卖|拍卖会',
        '产权交易': '(产权|资产|权证)的?(类型|信息|名称|编号|(基本)?情况)|(经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量)(挂牌|转让|出让)|竞价销售|销售结果|房屋所有权房产|免租期限|交易期限|(受让|转让|承租|出租|买受)(人|方)|(店面|店铺|商铺|铺位?|门面|门市|食堂|饭堂|校舍|车位|停车场|厂?房|仓?库|馆|资产|物业|房产|房屋|场地|农田|鱼?塘)\w{,4}(处置|招租|出租|续租|租赁|转让)|(出租|转让|产权|资产)(项目|中标|成交|流标|废标)|出租(用途|类型)|转让底价|租赁(标的物|情况)',
        '采招数据': '(采购|招标)(条件|范围|文件|内容)|(申请人|投标人|供应商|报价人|参选人)的?资格要求;' # |变更|答疑|澄清|中标|成交|合同|废标|流标 |(采购|招标|代理)(人|机构|单位)|
    }
    # title patterns for the document *type* classifier
    self.title_type_dic = {
        '土地矿产': '(土地|用地|宗地|荒地|山地|海域|矿)(出让|出租|招租|租赁|承包|流转|使用权|经营权|征收|划拨|中标|成交)|供地结果|矿业权|探矿权|采矿权|(土地|用地|宗地|地块)(使用权)?(终止|中止|网上)?(挂牌|出让|拍卖|招拍|划拨)|征收土地',
        '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|公示)|拍卖|变卖|流拍|竞拍',
        '产权交易': '经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让',
        '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判)的?(公告|公示|中标|成交|结果|$)|工程招标',
        # |竞价 采招/产权都有竞价方式 # 意向|需求|预公?告|报建|总承包|工程|施工|设计|勘察|代理|监理 |变更|答疑|澄清|中标|成交|合同|废标|流标
        '新闻资讯': '(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)'
    }
    # body-text patterns for the *life-cycle* classifier
    self.life_dic = {
        '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
        '招标预告': '(预计|计划)(采购|招标)(时间|日期)|采购(计划编号|需求方案|预告|预案)|(预|需求)公示|需求(方案|信息|论证|公告|公示)',
        '招标公告': '(采购|招标|竞选|报名)条件|报名(时间|流程|方法|要求|\w{,5}材料)[:\s]|[^\w]成交规则|参加竞价采购交易资格|(申请人|投标人|供应商|报价人|参选人)的?资格要求|获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)',
        '资审结果': '资审及业绩公示|资审结果及业绩|资格后审情况报告|资格(后审|预审|审查)结果(公告|公示)|(预审|审查)工作已经?结束|未通过原因', #|资格
        '招标答疑': '现澄清(为|如下)|答疑补遗|澄清内容如下|第[0-9一二三四五]次澄清|答疑澄清|(最高(投标)?限价|控制价|拦标价)公示', # |异议的回复
        '公告变更': '第[\d一二]次变更|(更正|变更)(公告|公示|信息|内容|事项|原因|理由|日期|时间|如下)|原公告((主要)?(信息|内容)|发布时间)|(变更|更正)[前后]内容|现?在?(变更|更正|修改|更改)(内容)?为|(公告|如下|信息|内容|事项|结果|文件|发布|时间|日期)(更正|变更)',
        '公告变更neg': '履约变更内容',
        '候选人公示': '候选人公示|评标结果公示|中标候选人名单公示|现将中标候选人(进行公示|公[示布]如下)|(中标|中选)候选人(信息|情况)[::\s]',
        '候选人公示neg': '中标候选人公示期|中标候选人公示前',
        '中标信息': '供地结果信息|采用单源直接采购的?情况说明|[特现]?将\w{,4}(成交|中标|中选|选定结果|选取结果|入围结果|竞价结果)\w{,4}(进行公示|公[示布]如下)|(询价|竞价|遴选)(成交|中标|中选)(公告|公示)|(成交|中标|中选|选定|选取|入围|询价)结果(如下|公告|公示)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|((中标|中选)(人|成交)|成交)\w{,3}(信息|情况)[::\s]',
        '中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源(采购|招标)?的?(中标|成交|结果)', # |单一来源采购原因|拟采取单一来源方式采购|单一来源采购公示
        '中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让)(供应商|供货商|服务商|机构|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]', # |唯一
        '中标信息neg': '按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示|供应商\s*资格要求|成交情况:\s*[流废]标|中标单位:本次招标拟?中标单位\d家|通知中标单位|影响(成交|中标)结果',
        # |确定成交供应商[:,\s]
        '合同公告': '合同(公告|公示|信息|内容)|合同(编号|名称|主体|基本情况|完成(日期|时间))|(供应商乙方|乙方供应商):|合同总?金额|履约信息',
        '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):?废标|((本|该)(项目|标段|合同|合同包|采购包|次)\w{,5})((失败|终止|流标|废标)|予以废标|(按|做|作)?(流标|废标|废置)处理)|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答|项目)(终止|中止|废标|流标|失败|作废|异常|撤销)',
        '废标公告2': '(无效|中止|终止|废标|流标|失败|作废|异常|撤销)的?(原因|理由)|本项目因故取消|本(项目|次)(公开)?\w{2}失败|已终止\s*原因:|(人|人数|供应商|单位)(不足|未达\w{,3}数量)|已终止|不足[3三]家|无(废标)|成交情况:\s*[流废]标|现予以废置',
        '废标公告neg': '超过此报价将作为[废流]标处理|否则按[废流]标处理|终止规则:|成交规则:|视为流标|竞价失败的一切其他情形'
    }
    # title patterns for the *life-cycle* classifier
    self.title_life_dic = {
        '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示|意向公开',
        '招标预告': '预公?告|预公示|报建公告|(批前|标前)公示|(供应|招标)计划表?$|(论证|征求|征集)(供应商)?意见|意见征询|需求评审公告|需求(公告|公示|意见)',
        '公告变更': '第[\d一二]次变更|(变更|更正(事项)?|更改|延期|暂停)(招标|采购)?的?(公告|公示|通知)|变更$|更正$',
        '招标答疑': '质疑|澄清|答疑(文件)?|补遗书?|(最高(投标)?限价|控制价|拦标价)(公示|公告|$)',
        '废标公告': '(终止|中止|废标|废除|废置|流标|失败|作废|异常|撤销|撤回|取消成?交?|流拍)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)|关于废置',
        '合同公告': '(合同(成交|变更)?|(履约|验收)(结果)?)(公告|公示|信息|公式|公开|签订)|合同备案|合同书|合同$',
        '候选人公示': '候选人(变更)?公示|评标(结果)?公示|中标前?公示|中标预公示',
        '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|开标(记录|信息|情况)|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书|中标$',
        '资审结果': '((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示',
        '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
    }
def load_life(self,life_model,config):
    '''
    Load the frozen life-cycle classification graph and bind its tensors.

    :param life_model: model path relative to this source file
    :param config: optional tf session config
    :return: (session, title, content, dropout-prob, softmax, mask, mask_title)
    '''
    with tf.Graph().as_default() as graph:
        graph_def = graph.as_graph_def()
        model_path = os.path.dirname(__file__) + life_model
        with open(model_path, 'rb') as pb_file:
            graph_def.ParseFromString(pb_file.read())
        tf.import_graph_def(graph_def, name='')
        del graph_def  # the serialized proto is no longer needed
        sess = tf.Session(graph=graph, config=config)
        sess.run(tf.global_variables_initializer())
        get_tensor = sess.graph.get_tensor_by_name
        inputs = get_tensor('inputs/inputs:0')
        prob = get_tensor('inputs/dropout:0')
        title = get_tensor('inputs/title:0')
        mask = get_tensor('inputs/mask:0')
        mask_title = get_tensor('inputs/mask_title:0')
        softmax = get_tensor('output/softmax:0')
        return sess, title, inputs, prob, softmax, mask, mask_title
def load_type(self,type_model):
    '''
    Load the frozen document-type classification graph and bind its tensors.

    :param type_model: model path relative to this source file
    :return: (session, title, content, dropout-prob, softmax, mask, mask_title)
    '''
    with tf.Graph().as_default() as graph:
        graph_def = graph.as_graph_def()
        model_path = os.path.dirname(__file__) + type_model
        with open(model_path, 'rb') as pb_file:
            graph_def.ParseFromString(pb_file.read())
        tf.import_graph_def(graph_def, name='')
        del graph_def  # the serialized proto is no longer needed
        sess = tf.Session(graph=graph)
        sess.run(tf.global_variables_initializer())
        get_tensor = sess.graph.get_tensor_by_name
        inputs = get_tensor('inputs/inputs:0')
        prob = get_tensor('inputs/dropout:0')
        title = get_tensor('inputs/title:0')
        mask = get_tensor('inputs/mask:0')
        mask_title = get_tensor('inputs/mask_title:0')
        softmax = get_tensor('output/softmax:0')
        return sess, title, inputs, prob, softmax, mask, mask_title
def predict_process(self, docid='', doctitle='', dochtmlcon=''):
    '''
    Preprocess title and body into the token lists fed to the classifiers:
    segment the title, clean the body, and keep keyword-centred snippets.

    :param docid: document id (unused)
    :param doctitle: raw title text
    :param dochtmlcon: pre-segmented (space-separated) body text
    :return: (datas, datas_title) — one token list each, wrapped in a list
    '''
    def get_kw_senten(s, span=10):
        # collect up to self.sentence_num windows of `span` tokens on each
        # side of every keyword hit; overlapping hits share one window
        doc_sens = []
        tmp = 0
        num = 0
        end_idx = 0
        for it in re.finditer(self.kws, s):
            left = s[end_idx:it.end()].split()
            right = s[it.end():].split()
            tmp_seg = s[tmp:it.start()].split()
            # only open a new window when far enough from the previous hit
            if len(tmp_seg) > span or tmp == 0:
                doc_sens.append(' '.join(left[-span:] + right[:span]))
                end_idx = it.end() + 1 + len(' '.join(right[:span]))
            tmp = it.end()
            num += 1
            if num >= self.sentence_num:
                break
        if doc_sens == []:
            doc_sens.append(s)
        return doc_sens
    def word2id(wordlist, max_len=self.sequen_len):
        # map words to vocabulary ids, padded/truncated to max_len
        ids = [getIndexOfWords(w) for w in wordlist]
        ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
        assert len(ids) == max_len
        return ids
    cost_time = dict()
    datas = []
    datas_title = []
    try:
        segword_title = ' '.join(selffool.cut(doctitle)[0])
        segword_content = dochtmlcon
    except:
        # fall back to empty inputs if segmentation fails
        segword_content = ''
        segword_title = ''
    # NaN guard: pandas may hand over float('nan') instead of a string
    if isinstance(segword_content, float):
        segword_content = ''
    if isinstance(segword_title, float):
        segword_title = ''
    # repair common segmentation splits and strip boilerplate phrases
    segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
        replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
        replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
    # keep only whitespace and CJK characters
    segword_title = re.sub('[^\s\u4e00-\u9fa5]', '', segword_title)
    segword_content = re.sub('[^\s\u4e00-\u9fa5]', '', segword_content)
    doc_word_list = segword_content.split()
    if len(doc_word_list) > self.sequen_len / 2:
        # long body: first 100 tokens verbatim + keyword windows from tokens 100-500
        doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
        doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
    else:
        doc_sens = ' '.join(doc_word_list[:self.sequen_len])
    datas.append(doc_sens.split())
    datas_title.append(segword_title.split())
    return datas, datas_title
def is_houxuan(self, title, content):
    '''
    Decide from title and body whether the announcement is a candidate
    publicity (候选人公示) document.

    :param title: announcement title
    :param content: announcement body text
    :return: 1 if it is a candidate publicity, otherwise 0
    '''
    # title check first: a candidate-publicity title wins unless it is
    # actually a change/cancellation/clarification notice
    if re.search('候选人的?公示|评标结果|评审结果|中标公示', title):  # (中标|成交|中选|入围)
        return 0 if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title) else 1
    # otherwise look at the first 100 characters of the body
    head = content[:100]
    if re.search('候选人的?公示', head):
        negative = '公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清'
        return 0 if re.search(negative, head) else 1
    return 0
def predict(self, title='', list_sentence='', web_source_no='', original_docchannel=''):
    """Model-only prediction of the announcement doctype and lifecycle channel.

    :param title: announcement title
    :param list_sentence: list of preprocessed Sentence objects (each exposes .tokens)
    :param web_source_no: data-source number; some sources are short-circuited
    :param original_docchannel: original channel id supplied by the data source
    :return: {'docchannel': {'docchannel', 'doctype', 'original_docchannel_id'}}
        (implicitly None when list_sentence is not a list)
    """
    # Channel ids that are excluded from extraction entirely: echo the mapped
    # doctype back without running the models.
    not_extract_dic = {
        104: '招标文件',
        106: '法律法规',
        107: '新闻资讯',
        108: '拟建项目',
        109: '展会推广',
        110: '企业名录',
        111: '企业资质',
        112: '全国工程人员',
        113: '业主采购'
    }
    if original_docchannel in not_extract_dic:
        return {'docchannel': {'docchannel':'', 'doctype':not_extract_dic[original_docchannel], "original_docchannel_id": str(original_docchannel)}}
    if web_source_no in ['02104-7']:
        # This data source cannot be classified reliably; fall back to 采招数据.
        return {'docchannel': {'docchannel':'', 'doctype':'采招数据'}}
    if isinstance(list_sentence, list):
        # Flatten the tokens of all sentences; the first 500 become the model content.
        token_l = [it.tokens for it in list_sentence]
        tokens = [it for l in token_l for it in l]
        content = ' '.join(tokens[:500])
        # Keep only CJK characters; very long titles keep head 20 + tail 30 chars.
        title = re.sub('[^\u4e00-\u9fa5]', '', title)
        if len(title)>50:
            title = title[:20]+title[-30:]
        data_content, data_title = self.predict_process(docid='', doctitle=title[-50:], dochtmlcon=content)  # title capped at 50 chars
        # Effective (unpadded) lengths, clamped to the model input sizes.
        text_len = len(data_content[0]) if len(data_content[0])<self.sequen_len else self.sequen_len
        title_len = len(data_title[0]) if len(data_title[0])<self.title_len else self.title_len
        result = {'docchannel': {'docchannel':'', 'doctype':'', "original_docchannel_id": str(original_docchannel)}}
        array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
        array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
        # Doctype model: mask vectors flag padded positions with 1.
        pred = self.type_sess.run(self.type_softmax,
                                  feed_dict={
                                      self.type_title: array_title,
                                      self.type_content: array_content,
                                      self.type_mask:[[0]*text_len+[1]*(self.sequen_len-text_len)],
                                      self.type_mask_title:[[0]*title_len+[1]*(self.title_len-title_len)],
                                      self.type_prob:1}
                                  )
        id = np.argmax(pred, axis=1)[0]
        prob = pred[0][id]
        result['docchannel']['doctype'] = self.id2type[id]
        # print('公告类别:', self.id2type[id], '概率:',prob)
        # if id == 0:
        # Lifecycle model runs only for extractable doctypes.
        if result['docchannel']['doctype'] not in ['', '新闻资讯']:
            pred = self.lift_sess.run(self.lift_softmax,
                                      feed_dict={
                                          self.lift_title: array_title,
                                          self.lift_content: array_content,
                                          self.mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
                                          self.mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
                                          self.lift_prob:1}
                                      )
            id = np.argmax(pred, axis=1)[0]
            prob = pred[0][id]
            result['docchannel']['docchannel'] = self.id2life[id]
            # print('生命周期:纯模型预测',self.id2life[id], '概率:',prob)
            # if id == 6:
            # A predicted win notice may actually be a candidate publicity:
            # re-check with the rule classifier on letter-only strings.
            if result['docchannel']['docchannel'] == '中标信息':
                if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
                    result['docchannel']['docchannel'] = '候选人公示'
                    # return '候选人公示', prob
                    # return [{'docchannel': '候选人公示'}]
        return result
        # return [{'docchannel':self.id2life[id]}]
    # else:
    #     # return self.id2type[id], prob
    #     return [{'docchannel':self.id2type[id]}]
def predict_rule(self, title, content, channel_dic, prem_dic):
    '''Rule layer (added 2022/2/10): override the predicted channel for very
    short announcements and for predictions that contradict the extracted
    winner information in prem_dic. Mutates and returns channel_dic.'''
    # Title / body regexes, kept verbatim.
    hetong = '(合同|验收|履约)(公告|公示)|合同号?$'  # contract-title pattern
    zhongbiao_t = '(中标|中选|成交|入选|入围|结果|确认)(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选)结果|开标(记录|信息|情况)|单一来源|直接(选取|选定)|中标通知书|中标$'
    zhongbiao_c = '(中标|中选|成交|拟选用|拟邀请|最终选定的?|拟定)(供应商|供货商|服务商|企业|公司|单位|(候选)?人)(名称)?[::]|[,。:.](供应商|供货商|服务商)(名称)?:|指定的中介服务机构:|建设服务单位:'
    zhaobiao_t = '(遴选|采购|招标|竞价|议价|比选|询价|评选|谈判|邀标|邀请|洽谈|约谈)(公告|公示|$)'
    title_cn = re.sub('[^\u4e00-\u9fa5]', '', title)
    channel = channel_dic['docchannel']          # alias: mutating it mutates channel_dic
    prem_str = json.dumps(prem_dic, ensure_ascii=False)
    content_cn_len = len(re.sub('[^\u4e00-\u9fa5]', "", content))
    if content_cn_len < 50 and channel['doctype'] != '新闻资讯':
        # Body too short to trust the model: decide from title keywords only.
        if re.search(hetong, title_cn):
            channel['docchannel'] = '合同公告'
        elif re.search(zhongbiao_t, title_cn):
            channel['docchannel'] = '中标信息'
        elif re.search(zhaobiao_t, title_cn):
            channel['docchannel'] = '招标公告'
        else:
            channel['docchannel'] = ''
    elif channel.get('docchannel', '') == '招标公告' and 'win_tenderer' in prem_str:
        # A tender notice that already names a winner is mislabelled.
        if re.search(hetong, title_cn):
            channel['docchannel'] = '合同公告'
            log('正则把招标公告修改为合同公告')
        elif re.search(zhongbiao_t, title_cn) or re.search(zhongbiao_t, content[:200]) or re.search(zhongbiao_c, content):
            channel['docchannel'] = '中标信息'
            log('正则把招标公告修改为中标信息')
    elif channel.get('docchannel', '') == '中标信息' and 'win_tenderer' not in prem_str:
        # A win notice without any extracted winner is suspicious.
        if re.search(hetong, title_cn):
            channel['docchannel'] = '合同公告'
            log('正则把中标信息修改为合同公告')
        elif re.search(zhongbiao_t, title_cn) or re.search(zhongbiao_t, content[:200]) or re.search(zhongbiao_c, content):
            pass  # win keywords present anyway: keep the prediction
        elif re.search(zhaobiao_t, title_cn):
            channel['docchannel'] = '招标公告'
            log('正则把中标信息修改为招标公告')
        elif not re.search('中标|成交|中选|入选|入围|结果|供应商|供货商|候选人', title_cn + content):
            channel['docchannel'] = ''
            log('正则把中标信息修改为空')
    return channel_dic
def predict_merge(self, title, list_sentence, html, bidway, prem, original_docchannel='', web_source_no=''):
    '''
    Combined regex + model prediction of the announcement doctype and
    lifecycle channel.
    :param title: announcement title
    :param list_sentence: preprocessed sentence entity list (list_sentence)
    :param html: raw html of the announcement
    :param bidway: bidding method string
    :param prem: extracted prem dict
    :return: (result, msc): result like
        {'docchannel': {'docchannel':'中标信息', 'doctype':'采招数据'}},
        msc is a human-readable debug trace of the decision
    '''
    def cut_single_cn_space(text):
        # Re-attach single CJK characters (or "X:"-style tokens) to the
        # previous word, undoing spaces introduced by tokenisation.
        new_text = ""
        for w in text.split():
            if len(w) == 1 or re.search('^[\u4e00-\u9fa5][::]', w):
                new_text += w
            else:
                new_text += ' ' + w
        return new_text
    def html2text(html):
        # Strip tags and normalise whitespace. The "richTextFetch" div marks
        # attachment text: keep a placeholder when enough body text precedes
        # it, otherwise keep 500 extra chars of it.
        ser = re.search('<div[^<>]*richTextFetch', html)
        # if ser and len(re.sub('[^\u4e00-\u9fa5]', '', html[:ser.start()]))>500:
        #     html = html[:ser.start()]+'##richTextFetch##'
        if ser:
            if len(re.sub('[^\u4e00-\u9fa5]', '', html[:ser.start()])) > 200:
                html = html[:ser.start()] + '##richTextFetch##'
            else:
                html = html[:ser.start() + 500]
        # NOTE(review): the first replace() argument appears to be a
        # non-breaking space — confirm it survived copy/paste intact.
        text = re.sub('<[^<]*?>', '', html).replace(' ', ' ')
        # text = re.sub('http[0-9a-zA-Z-.:/]+|[0-9a-zA-Z-./@]+', '', text)
        text = re.sub('\s+', ' ', text)
        # text = re.sub('[/|[()()]', '', text)
        text = cut_single_cn_space(text)
        return text[:20000]
    def count_diffser(pattern, text):
        # Count how many of the ';'-separated sub-patterns hit the text and
        # collect the first match of each.
        num = 0
        kw = []
        for p in pattern.split(';'):
            if re.search(p, text):
                num += 1
                kw.append(re.search(p, text).group(0))
        return num, ';'.join(kw)
    def is_contain_winner(extract_json):
        # True if the serialized extraction result names a winning tenderer.
        if re.search('win_tenderer', extract_json):
            return True
        else:
            return False
    def is_single_source(bidway, title):
        # True if the announcement is single-source procurement.
        if re.search('单一来源|单一性采购', title):
            return True
        elif bidway == '单一来源':
            return True
        else:
            return False
    def get_type(title, text):
        # Regex doctype decision -> (doctype, matched keyword / reason).
        # A hit of the '采招数据' title keywords in the title or the first
        # body token overrides the land / auction / property categories.
        if re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'],
                text):  # and re.search('(土地|用地|宗地|地块)(经营权)?(流转|承包|出租|招租|租赁|确权)', text)==None
            if re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]):
                return '采招数据', re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]).group(0)
            return '土地矿产', (re.search(self.title_type_dic['土地矿产'], title) or re.search(self.type_dic['土地矿产'], text)).group(0)
        elif (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)):
            if re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]):
                return '采招数据', re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]).group(0)
            return '拍卖出让', (re.search(self.title_type_dic['拍卖出让'], title) or re.search(self.type_dic['拍卖出让'], text)).group(0)
        elif re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text):
            if re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]):
                return '采招数据', re.search(self.title_type_dic['采招数据'], title + text.strip().split(' ')[0]).group(0)
            return '产权交易', (re.search(self.title_type_dic['产权交易'], title) or re.search(self.type_dic['产权交易'], text)).group(0)
        elif re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text):
            return '采招数据', (
                re.search(self.title_type_dic['采招数据'], title) or re.search(self.type_dic['采招数据'], title + text)).group(
                0)
        elif re.search(self.title_type_dic['新闻资讯'], title):
            if re.search(self.title_type_dic['采招数据'], title +text.strip().split(' ')[0]):
                return '采招数据', re.search(self.title_type_dic['采招数据'], title +text.strip().split(' ')[0]).group(0)
            return '新闻资讯', re.search(self.title_type_dic['新闻资讯'], title).group(0)
        else:
            return '', '没有公告类型关键词,返回空'
    def get_life(title, text):
        # Regex lifecycle decision -> (channel, json debug message).
        # Drop bracketed/alnum noise and "second/third/fourth announcement"
        # suffixes from the title.
        title = re.sub('[-()()0-9a-z]|第?[二三四]次公?告?', '', title)
        first_line = text.split()[0] if len(text.split()) > 2 else ''
        # If the title does not end in 公告/公示 but the first body line does,
        # treat that line as part of the title.
        if title.strip()[-2:] not in ['公告', '公示'] and 5 < len(first_line) < 50 and first_line[-2:] in ['公告', '公示']:
            # print('title: ', title, first_line)
            title += first_line
        def count_score(l):
            # Keyword frequency plus a bonus for distinct keywords.
            return len(l) + len(set(l)) * 2
        life_kw_title = {}
        life_kw_content = {}
        life_score = {}
        # msc = ""
        # Collect title keywords per category (numeric/letter suffixes of the
        # dict keys are stripped to merge variants of one category).
        for k, v in self.title_life_dic.items():
            k2 = re.sub('[\da-z]', '', k)
            if k2 not in life_kw_title:
                life_kw_title[k2] = []
            for it in re.finditer(v, title):
                life_kw_title[k2].append(it.group(0))
        # Collect body keywords per category, split into positive and
        # negative hits by the 'neg' marker in the dict key.
        for k, v in self.life_dic.items():
            k2 = re.sub('[\da-z]', '', k)
            if k2 not in life_kw_content:
                life_kw_content[k2] = {'pos': [], 'neg': []}
            for it in re.finditer(v, text):
                if 'neg' not in k:
                    life_kw_content[k2]['pos'].append(it.group(0))
                else:
                    life_kw_content[k2]['neg'].append(it.group(0))
        for k2 in life_kw_content:
            life_score[k2] = count_score(life_kw_content[k2]['pos']) - count_score(
                life_kw_content[k2]['neg'])
        # Keep only categories that actually scored.
        life_kw_title = {k: v for k, v in life_kw_title.items() if v != []}
        life_kw_content = {k: v for k, v in life_kw_content.items() if life_score[k] > 0}
        msc = [life_kw_title, life_kw_content, life_score]
        msc = json.dumps(msc, ensure_ascii=False)
        # life_list collects every category tied at the (positive) top score.
        max_score = 0
        life_list = []
        for k in life_score.keys():
            if life_score[k] > max_score:
                max_score = life_score[k]
                life_list = [k]
            elif life_score[k] == max_score and life_score[k] > 0:
                life_list.append(k)
        # Decision chain, highest priority first; inner checks resolve
        # conflicts between the winning category and other strong signals.
        if '采购意向' in life_kw_title or '采购意向' in life_list:
            return '采购意向', msc
        elif '招标预告' in life_kw_title or '招标预告' in life_list:
            if '中标信息' in life_kw_title or '中标信息' in life_list:
                return '中标信息', msc
            elif set(['候选人公示', '合同公告']) & set(life_kw_title) != set():
                return '', msc
            return '招标预告', msc
        elif '公告变更' in life_kw_title or '公告变更' in life_list:
            if life_score.get('候选人公示', 0) > 3 or '候选人公示' in life_kw_title:
                return '候选人公示', msc
            elif life_score.get('合同公告', 0) > 3 or '合同公告' in life_kw_title:
                return '合同公告', msc
            elif life_score.get('中标信息', 0) > 3 or '中标信息' in life_kw_title:
                return '中标信息', msc
            elif '招标公告' in life_kw_title and re.search('变更|更正', title[-4:])==None and life_score.get('公告变更', 0) < 4:
                return '招标公告', msc
            return '公告变更', msc
        elif '招标答疑' in life_kw_title or '招标答疑' in life_list:
            if '招标公告' in life_kw_title and life_score.get('招标答疑', 0) < 4:
                return '招标公告', msc
            elif life_score.get('招标答疑', 0) < max_score:
                if max_score > 3 and len(life_list) == 1:
                    return life_list[0], msc
                return '', msc
            return '招标答疑', msc
        elif '候选人公示' in life_kw_title or '候选人公示' in life_list:
            if '招标公告' in life_kw_title and life_score.get('招标公告', 0) > 3:
                return '招标公告', msc
            elif '废标公告' in life_kw_title or life_score.get('废标公告', 0) > 5:
                return '废标公告', msc
            return '候选人公示', msc
        elif '合同公告' in life_kw_title or '合同公告' in life_list:
            if '招标公告' in life_kw_title and life_score.get('招标公告', 0) > 3:
                return '招标公告', msc
            elif '废标公告' in life_kw_title or life_score.get('废标公告', 0) > 5:
                return '废标公告', msc
            return '合同公告', msc
        elif '中标信息' in life_kw_title or '中标信息' in life_list:
            if '招标公告' in life_kw_title and life_score.get('招标公告',
                    0) > 2:  # (life_score.get('招标公告', 0)>2 or life_score.get('中标信息', 0)<4) 0.7886409793924245
                return '招标公告', msc
            elif '废标公告' in life_kw_title or life_score.get('废标公告', 0) > 5:
                return '废标公告', msc
            elif life_score.get('候选人公示', 0) > 3:
                return '候选人公示', msc
            elif life_score.get('合同公告', 0) > 5:
                return '合同公告', msc
            return '中标信息', msc
        elif '废标公告' in life_kw_title or '废标公告' in life_list:
            if life_score.get('招标公告', 0) > 3 and '废标公告' not in life_kw_title:
                return '招标公告', msc
            return '废标公告', msc
        elif '资审结果' in life_kw_title or '资审结果' in life_list:
            return '资审结果', msc
        elif '招标公告' in life_kw_title or '招标公告' in life_list:
            return '招标公告', msc
        return '', msc
    def get_model_inputs(list_sentence):
        # Build the embedded title/content arrays and their unpadded lengths
        # for the two neural models.
        list_sentence = sorted(list_sentence, key=lambda x:x.sentence_index)
        token_l = [it.tokens for it in list_sentence]
        tokens = [it for l in token_l for it in l]
        content = ' '.join(tokens[:500])
        data_content, data_title = self.predict_process(docid='', doctitle=title[-50:],
                                                        dochtmlcon=content)  # title capped at 50 chars
        text_len = len(data_content[0]) if len(data_content[0]) < self.sequen_len else self.sequen_len
        title_len = len(data_title[0]) if len(data_title[0]) < self.title_len else self.title_len
        array_content = embedding(data_content, shape=(len(data_content), self.sequen_len, 128))
        array_title = embedding(data_title, shape=(len(data_title), self.title_len, 128))
        return array_content, array_title ,text_len, title_len, content
    def type_model_predict():
        # Run the doctype model; returns (class id, probability).
        pred = self.type_sess.run(self.type_softmax,
                                  feed_dict={
                                      self.type_title: array_title,
                                      self.type_content: array_content,
                                      self.type_mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
                                      self.type_mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
                                      self.type_prob: 1}
                                  )
        id = np.argmax(pred, axis=1)[0]
        prob = pred[0][id]
        return id, prob
    def life_model_predict():
        # Run the lifecycle model; returns (class id, probability).
        pred = self.lift_sess.run(self.lift_softmax,
                                  feed_dict={
                                      self.lift_title: array_title,
                                      self.lift_content: array_content,
                                      self.mask: [[0] * text_len + [1] * (self.sequen_len - text_len)],
                                      self.mask_title: [[0] * title_len + [1] * (self.title_len - title_len)],
                                      self.lift_prob: 1}
                                  )
        id = np.argmax(pred, axis=1)[0]
        prob = pred[0][id]
        return id, prob
    def final_change(msc):
        '''
        Final override rules (applied after regex/model prediction):
        1. Win/contract notice without a winner whose original class is
           non-win -> keep the original class
        2. Fail notice that has a winner and no fail keyword in the title
           -> win notice
        3. Q&A notice without a Q&A keyword in the title, original is a
           tender class -> original class
        4. Tender notice with a winner, original is win -> win notice
        5. Predicted tender, original is forecast/intention -> original class
        6. Prediction and original both in change/Q&A -> original class
        7. Predicted 采招数据, original is property/land with keywords
           -> original class
        8. Fail notice whose original is tender/forecast and title has no
           fail keyword -> original class
        9. Non-采招数据 prediction, source says 采招数据 and tender keywords
           present -> 采招数据
        10. Tender notice with a winner and a direct-purchase keyword in the
            title -> win notice
        11. Predicted forecast, original is intention/tender and title has no
            forecast keyword -> original class
        '''
        if result['docchannel']['docchannel'] in ['中标信息', '合同公告'] and origin_dic.get(
                original_docchannel, '') in ['招标公告', '采购意向', '招标预告', '公告变更'] and is_contain_winner(prem_json)==False:
            result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
            msc += '最终规则修改:中标公告、合同公告无中标人且原始为非中标,返回原类型'
        elif result['docchannel']['docchannel'] == '废标公告' and is_contain_winner(prem_json) and re.search(
                self.title_life_dic['废标公告'], title) == None:
            result['docchannel']['docchannel'] = '中标信息'
            msc += '最终规则修改:预测为废标却有中标人且标题无废标关键词改为中标信息;'
        elif result['docchannel']['docchannel'] in ['招标答疑'] and re.search(
                self.title_life_dic['招标答疑'], title) == None and origin_dic.get(
                original_docchannel, '') in ['招标公告', '采购意向', '招标预告']:
            result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
            msc += '最终规则修改:答疑公告标题无答疑关键且原始为招标,返回原始类别;'
        elif result['docchannel']['docchannel'] == '招标公告' and is_contain_winner(prem_json) and origin_dic.get(
                original_docchannel, '') == '中标信息':
            result['docchannel']['docchannel'] = '中标信息'
            msc += '最终规则修改:预测为招标公告却有中标人且原始为中标改为中标信息;'
        elif result['docchannel']['docchannel'] in ['招标公告'] and origin_dic.get(
                original_docchannel, '') in ['采购意向', '招标预告']:
            result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
            msc += '最终规则修改:预测为招标,原始为预告、意向,返回原始类别'
        elif result['docchannel']['docchannel'] in ['招标预告'] and origin_dic.get(
                original_docchannel, '') in ['采购意向', '招标公告'] and re.search(
                self.title_life_dic['招标预告'], title)==None:
            result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
            msc += '最终规则修改:预测预告,原始为意向、招标且标题无预告关键词,返回原始类别'
        elif result['docchannel']['docchannel'] in ['招标答疑', '公告变更'] and origin_dic.get(
                original_docchannel, '') in ['招标答疑', '公告变更']:
            result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
            msc += '最终规则修改:预测及原始均在答疑、变更,返回原始类别'
        elif result['docchannel']['doctype'] == '采招数据' and origin_dic.get(
                original_docchannel, '') in ['产权交易', '土地矿产'] and re.search('产权|转让|受让|招租|出租|承租|竞价|资产', text):
            result['docchannel']['doctype'] = origin_dic.get(original_docchannel, '')
            msc += '最终规则修改:预测为采招数据,原始为产权且有关键词,返回原始类别'
        elif result['docchannel']['docchannel'] == '废标公告' and origin_dic.get(
                original_docchannel, '') in ['招标公告', '采购意向', '招标预告'] and re.search(
                self.title_life_dic['废标公告'], title) == None:
            result['docchannel']['docchannel'] = origin_dic.get(original_docchannel, '')
            msc += '最终规则修改:废标公告原始为招标、预告且标题无废标关键期,返回原始类别;'
        elif result['docchannel']['docchannel'] in ['招标公告', '招标预告'] and is_contain_winner(
                prem_json) and re.search('直购', title):
            result['docchannel']['docchannel'] = '中标信息'
            msc += "最终规则修改:预测为招标却有中标人且标题有直购关键词返回中标"
        if result['docchannel']['doctype'] in ['产权交易', '土地矿产', '拍卖出让'] and origin_dic.get(
                original_docchannel, '') not in ['产权交易', '土地矿产', '拍卖出让'] \
                and re.search('产权|转让|受让|招租|招商|出租|承租|竞价|资产|挂牌|出让|拍卖|招拍|划拨|销售', title) == None\
                and re.search('(采购|招投?标|投标)(信息|内容|项目|公告|数量|人|单位|方式)|(建设|工程|服务|施工|监理|勘察|设计)项目', text):
            result['docchannel']['doctype'] = '采招数据'
            msc += ' 最终规则修改:预测为非采招数据,原始为采招数据且有招标关键词,返回采招数据'
        '''下面是新格式增加返回字段'''
        # Copy the predicted channel to life_docchannel; fall back to the
        # source's original class when nothing was predicted.
        if result['docchannel']['docchannel'] != '':
            result['docchannel']['life_docchannel'] = result['docchannel']['docchannel']
        else:
            result['docchannel']['life_docchannel'] = origin_dic.get(original_docchannel, '原始类别')
        return msc
    # Channel ids that are excluded from extraction entirely.
    not_extract_dic = {
        104: '招标文件',
        106: '法律法规',
        107: '新闻资讯',
        108: '拟建项目',
        109: '展会推广',
        110: '企业名录',
        111: '企业资质',
        112: '全国工程人员',
        113: '业主采购'
    }
    # Map of original channel id -> channel name, used as fallback class.
    origin_dic = {51: '公告变更',
                  52: '招标公告',
                  101: '中标信息',
                  102: '招标预告',
                  103: '招标答疑',
                  104: '招标文件',
                  105: '资审结果',
                  106: '法律法规',
                  107: '新闻资讯',
                  108: '拟建项目',
                  109: '展会推广',
                  110: '企业名录',
                  111: '企业资质',
                  112: '全国工程',
                  113: '业主采购',
                  114: '采购意向',
                  115: '拍卖出让',
                  116: '土地矿产',
                  117: '产权交易',
                  118: '废标公告',
                  119: '候选人公示',
                  120: '合同公告'}
    if original_docchannel in not_extract_dic:
        return {'docchannel': {'docchannel': '', 'doctype': not_extract_dic[original_docchannel], 'life_docchannel': origin_dic.get(original_docchannel, '原始类别')}}, '公告类别不在提取范围'
    if web_source_no in ['02104-7', '04733', 'DX007628-6']:  # these data sources cannot be recognised
        return {'docchannel': {'docchannel': '', 'doctype': '采招数据', 'life_docchannel': origin_dic.get(original_docchannel, '原始类别')}}, '此数据源公告分类不明确,返回数据源类别'
    # Keep only CJK title chars; very long titles keep head 20 + tail 30.
    title = re.sub('[^\u4e00-\u9fa5]+|出租车', '', title)
    if len(title) > 50:
        title = title[:20] + title[-30:]
    text = html2text(html)
    prem_json = json.dumps(prem, ensure_ascii=False)
    result = {'docchannel': {'docchannel': '', 'doctype': ''}}
    doc_type, type_kw = get_type(title, text)
    # doc_life, life_kw = get_life(title, text, prem_json, bidway, original_docchannel)
    doc_life, life_kw = get_life(title, text)
    if doc_type in self.title_type_dic:
        result['docchannel']['doctype'] = doc_type
    if doc_life in self.title_life_dic:
        result['docchannel']['docchannel'] = doc_life
    # print('channel正则预测结果:', result)
    msc = '正则结果:类型:%s, 关键词:%s, 周期:%s, 关键词:%s'%(doc_type, type_kw,doc_life, life_kw)+'\n'+'模型结果:'
    # print('类型:%s, 关键词:%s, 周期:%s, 关键词:%s'%(doc_type, type_kw,doc_life, life_kw))
    # Fall back to the neural models for whichever part regex left empty.
    if doc_type == "" or doc_life == "":
        array_content, array_title, text_len, title_len, content = get_model_inputs(list_sentence)
        if doc_type =="":
            type_id, type_prob = type_model_predict()
            type_model = self.id2type[type_id]
            result['docchannel']['doctype'] = type_model
            msc += type_model + ' 概率:%.4f;'%type_prob
            # print('公告类别:', self.id2type[id], '概率:',prob)
            # if id == 0:
        if doc_life=="" and result['docchannel']['doctype'] not in ['', '新闻资讯']:
            # Lifecycle model only when the text is long enough and contains
            # at least one lifecycle keyword.
            if len(text)>150 and re.search(self.kws, content):
                life_id, life_prob = life_model_predict()
                life_model = self.id2life[life_id]
                result['docchannel']['docchannel'] = life_model
                msc += life_model + ' 概率:%.4f;\n'%life_prob
    msc = final_change(msc)
    # print('channel ', msc)
    return result, msc
- # 保证金支付方式提取
class DepositPaymentWay():
    """Extract the payment method of the bid deposit (保证金) from plain text."""

    def __init__(self,):
        # Pattern 1: "保证金...方式:" followed by up to 60 chars of description.
        self.pt = '(保证金的?(交纳|缴纳|应按下列|入账|支付)方式)[::]*([^,。]{,60})'
        # Pattern 2: "保证金(须以/须通过/以)<way>方式".
        self.pt2 = '保证金(必?须以|必?须?通过|以)(.{,8})方式'
        kws = ['银行转账', '公?对公方?式?转账', '对公转账', '柜台转账', '(线上|网上)自?行?(缴纳|交纳|缴退|收退)',
               '网上银行支付', '现金存入', '直接缴纳', '支票', '汇票', '本票', '电汇', '转账', '汇款', '随机码',
               '入账', '基本账户转出', '基本账户汇入', '诚信库中登记的账户转出',
               '银行保函', '电子保函', '担保函', '保证保险', '合法担保机构出具的担保', '金融机构、担保机构出具的保函']
        # Longest keywords first so the alternation prefers specific matches.
        self.kws = sorted(kws, key=len, reverse=True)

    def predict(self, content):
        """Return {'deposit_patment_way': 'kw1;kw2;...'} ('' when nothing matches)."""
        pay_way = {'deposit_patment_way': ''}
        kw_pattern = '|'.join(self.kws)
        # Try the explicit "payment way:" pattern first, then the looser one;
        # only the relevant capture group is scanned for keywords.
        for pattern, group_no in ((self.pt, 3), (self.pt2, 2)):
            found = re.search(pattern, content)
            if found:
                segment = found.group(group_no)
                hits = [m.group(0) for m in re.finditer(kw_pattern, segment)]
                pay_way['deposit_patment_way'] = ';'.join(hits)
                return pay_way
        return pay_way
- # 总价单价提取
class TotalUnitMoney:
    """Flag money entities as total price or unit price (in place)."""

    def __init__(self):
        pass

    def predict(self, list_sentences, list_entitys):
        """Walk every money entity and mark it as total or unit price.

        Entities with label == 1 (bid-amount candidates) are tested for being
        a total price; all other money entities are tested for being a unit
        price. Matching entities get is_total_money / is_unit_money set to 1.
        """
        for doc_idx, entity_list in enumerate(list_entitys):
            for _entity in entity_list:
                if _entity.entity_type != 'money':
                    continue
                word_of_sentence = list_sentences[doc_idx][_entity.sentence_index].sentence_text
                offsets = [_entity.wordOffset_begin, _entity.wordOffset_end]
                if _entity.label == 1:
                    # Bid-amount candidates: check for "total price" context.
                    if extract_total_money(word_of_sentence, _entity.entity_text, offsets):
                        _entity.is_total_money = 1
                else:
                    # Ordinary amounts: check for "unit price" context.
                    if extract_unit_money(word_of_sentence, _entity.entity_text, offsets):
                        _entity.is_unit_money = 1
                # print("total_unit_money", _entity.entity_text,
                #       _entity.is_total_money, _entity.is_unit_money)
- # 行业分类
- class IndustryPredictor():
- def __init__(self,):
- self.model_path = os.path.dirname(__file__)+ '/industry_model'
- self.id2lb = {0: '专业施工', 1: '专用仪器仪表', 2: '专用设备修理', 3: '互联网信息服务', 4: '互联网安全服务', 5: '互联网平台', 6: '互联网接入及相关服务', 7: '人力资源服务',
- 8: '人造原油', 9: '仓储业', 10: '仪器仪表', 11: '仪器仪表修理', 12: '会计、审计及税务服务', 13: '会议、展览及相关服务', 14: '住宅、商业用房',
- 15: '体育场地设施管理', 16: '体育组织', 17: '体育设备', 18: '保险服务', 19: '信息处理和存储支持服务', 20: '信息技术咨询服务',
- 21: '信息系统集成和物联网技术服务', 22: '修缮工程', 23: '健康咨询', 24: '公路旅客运输', 25: '其他专业咨询与调查', 26: '其他专业技术服务',
- 27: '其他交通运输设备', 28: '其他公共设施管理', 29: '其他土木工程建筑', 30: '其他工程服务', 31: '其他建筑建材', 32: '其他运输业', 33: '农业和林业机械',
- 34: '农业服务', 35: '农产品', 36: '农副食品,动、植物油制品', 37: '出版业', 38: '办公消耗用品及类似物品', 39: '办公设备', 40: '化学原料及化学制品',
- 41: '化学纤维', 42: '化学药品和中药专用设备', 43: '医疗设备', 44: '医药品', 45: '卫星传输服务', 46: '卫生', 47: '印刷服务', 48: '图书和档案',
- 49: '图书档案设备', 50: '图书馆与档案馆', 51: '土地管理业', 52: '地质勘查', 53: '地震服务', 54: '场馆、站港用房', 55: '城市公共交通运输',
- 56: '塑料制品、半成品及辅料', 57: '天然石料', 58: '娱乐设备', 59: '婚姻服务', 60: '安全保护服务', 61: '安全生产设备', 62: '家具用具',
- 63: '家用电器修理', 64: '工业、生产用房', 65: '工业与专业设计及其他专业技术服务', 66: '工矿工程建筑', 67: '工程技术与设计服务', 68: '工程机械',
- 69: '工程监理服务', 70: '工程评价服务', 71: '工程造价服务', 72: '市场调查', 73: '广告业', 74: '广播', 75: '广播、电视、电影设备',
- 76: '广播电视传输服务', 77: '废弃资源综合利用业', 78: '建筑涂料', 79: '建筑物、构筑物附属结构', 80: '建筑物拆除和场地准备活动', 81: '建筑装饰和装修业',
- 82: '录音制作', 83: '影视节目制作', 84: '房地产中介服务', 85: '房地产开发经营', 86: '房地产租赁经营', 87: '房屋租赁', 88: '招标代理',
- 89: '探矿、采矿、选矿和造块设备', 90: '政法、检测专用设备', 91: '教育服务', 92: '教育设备', 93: '文物及非物质文化遗产保护', 94: '文物和陈列品',
- 95: '文艺创作与表演', 96: '文艺设备', 97: '新闻业', 98: '旅行社及相关服务', 99: '日杂用品', 100: '有色金属冶炼及压延产品', 101: '有色金属矿',
- 102: '木材、板材等', 103: '木材采集和加工设备', 104: '机械设备', 105: '机械设备经营租赁', 106: '林业产品', 107: '林业服务', 108: '架线和管道工程建筑',
- 109: '核工业专用设备', 110: '橡胶制品', 111: '殡葬服务', 112: '殡葬设备及用品', 113: '气象服务', 114: '水上交通运输设备', 115: '水上运输业',
- 116: '水利和水运工程建筑', 117: '水工机械', 118: '水文服务', 119: '水资源管理', 120: '污水处理及其再生利用', 121: '汽车、摩托车修理与维护',
- 122: '法律服务', 123: '洗染服务', 124: '测绘地理信息服务', 125: '海洋仪器设备', 126: '海洋工程建筑', 127: '海洋服务', 128: '消防设备',
- 129: '清洁服务', 130: '渔业产品', 131: '渔业服务', 132: '炼焦和金属冶炼轧制设备', 133: '烟草加工设备', 134: '热力生产和供应', 135: '焦炭及其副产品',
- 136: '煤炭采选产品', 137: '燃气生产和供应业', 138: '物业管理', 139: '特种用途动、植物', 140: '环保咨询', 141: '环境与生态监测检测服务',
- 142: '环境污染防治设备', 143: '环境治理业', 144: '玻璃及其制品', 145: '理发及美容服务', 146: '生态保护', 147: '电信',
- 148: '电力、城市燃气、蒸汽和热水、水', 149: '电力供应', 150: '电力工业专用设备', 151: '电力工程施工', 152: '电力生产', 153: '电子和通信测量仪器',
- 154: '电工、电子专用生产设备', 155: '电影放映', 156: '电气安装', 157: '电气设备', 158: '电气设备修理', 159: '畜牧业服务', 160: '监控设备',
- 161: '石油制品', 162: '石油和化学工业专用设备', 163: '石油和天然气开采产品', 164: '石油天然气开采专用设备', 165: '研究和试验发展', 166: '社会工作',
- 167: '社会经济咨询', 168: '科技推广和应用服务业', 169: '科研、医疗、教育用房', 170: '管道和设备安装', 171: '粮油作物和饲料加工设备', 172: '纸、纸制品及印刷品',
- 173: '纺织原料、毛皮、被服装具', 174: '纺织设备', 175: '绿化管理', 176: '缝纫、服饰、制革和毛皮加工设备', 177: '航空器及其配套设备', 178: '航空客货运输',
- 179: '航空航天工业专用设备', 180: '节能环保工程施工', 181: '装卸搬运', 182: '计算机和办公设备维修', 183: '计算机设备', 184: '计量标准器具及量具、衡器',
- 185: '货币处理专用设备', 186: '货币金融服务', 187: '质检技术服务', 188: '资本市场服务', 189: '车辆', 190: '边界勘界和联检专用设备', 191: '运行维护服务',
- 192: '通信设备', 193: '通用设备修理', 194: '道路货物运输', 195: '邮政专用设备', 196: '邮政业', 197: '采矿业和制造业服务',
- 198: '铁路、船舶、航空航天等运输设备修理', 199: '铁路、道路、隧道和桥梁工程建筑', 200: '铁路运输设备', 201: '防洪除涝设施管理', 202: '陶瓷制品',
- 203: '雷达、无线电和卫星导航设备', 204: '非金属矿', 205: '非金属矿物制品工业专用设备', 206: '非金属矿物材料', 207: '食品加工专用设备', 208: '食品及加工盐',
- 209: '餐饮业', 210: '饮料、酒精及精制茶', 211: '饮料加工设备', 212: '饲养动物及其产品', 213: '黑色金属冶炼及压延产品', 214: '黑色金属矿'}
- self.industry_dic = {'专业施工': {'大类': '专业施工', '门类': '建筑业'},
- '专用仪器仪表': {'大类': '专用设备', '门类': '零售批发'},
- '专用设备修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'},
- '互联网信息服务': {'大类': '互联网和相关服务', '门类': '信息传输、软件和信息技术服务业'},
- '互联网安全服务': {'大类': '互联网和相关服务', '门类': '信息传输、软件和信息技术服务业'},
- '互联网平台': {'大类': '互联网和相关服务', '门类': '信息传输、软件和信息技术服务业'},
- '互联网接入及相关服务': {'大类': '互联网和相关服务', '门类': '信息传输、软件和信息技术服务业'},
- '人力资源服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
- '人造原油': {'大类': '炼焦产品、炼油产品', '门类': '零售批发'},
- '仓储业': {'大类': '装卸搬运和运输代理业', '门类': '交通运输、仓储和邮政业'},
- '仪器仪表': {'大类': '通用设备', '门类': '零售批发'},
- '仪器仪表修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'},
- '会计、审计及税务服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
- '会议、展览及相关服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
- '住宅、商业用房': {'大类': '房屋建筑业', '门类': '建筑业'},
- '体育场地设施管理': {'大类': '体育', '门类': '文化、体育和娱乐业'},
- '体育组织': {'大类': '体育', '门类': '文化、体育和娱乐业'},
- '体育设备': {'大类': '专用设备', '门类': '零售批发'},
- '保险服务': {'大类': '保险业', '门类': '金融业'},
- '信息处理和存储支持服务': {'大类': '软件和信息技术服务业', '门类': '信息传输、软件和信息技术服务业'},
- '信息技术咨询服务': {'大类': '软件和信息技术服务业', '门类': '信息传输、软件和信息技术服务业'},
- '信息系统集成和物联网技术服务': {'大类': '软件和信息技术服务业', '门类': '信息传输、软件和信息技术服务业'},
- '修缮工程': {'大类': '修缮工程', '门类': '建筑业'},
- '健康咨询': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
- '公路旅客运输': {'大类': '道路运输业', '门类': '交通运输、仓储和邮政业'},
- '其他专业咨询与调查': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
- '其他专业技术服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
- '其他交通运输设备': {'大类': '专用设备', '门类': '零售批发'},
- '其他公共设施管理': {'大类': '公共设施管理业', '门类': '水利、环境和公共设施管理业'},
- '其他土木工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'},
- '其他工程服务': {'大类': '工程服务', '门类': '科学研究和技术服务业'},
- '其他建筑建材': {'大类': '建筑建材', '门类': '零售批发'},
- '其他运输业': {'大类': '其他运输业', '门类': '交通运输、仓储和邮政业'},
- '农业和林业机械': {'大类': '专用设备', '门类': '零售批发'},
- '农业服务': {'大类': '农林牧副渔服务', '门类': '农林牧副渔服务'},
- '农产品': {'大类': '农林牧渔业产品', '门类': '零售批发'},
- '农副食品,动、植物油制品': {'大类': '食品、饮料和烟草原料', '门类': '零售批发'},
- '出版业': {'大类': '新闻和出版业', '门类': '文化、体育和娱乐业'},
- '办公消耗用品及类似物品': {'大类': '办公消耗用品及类似物品', '门类': '零售批发'},
- '办公设备': {'大类': '通用设备', '门类': '零售批发'},
- '化学原料及化学制品': {'大类': '基础化学品及相关产品', '门类': '零售批发'},
- '化学纤维': {'大类': '基础化学品及相关产品', '门类': '零售批发'},
- '化学药品和中药专用设备': {'大类': '专用设备', '门类': '零售批发'},
- '医疗设备': {'大类': '专用设备', '门类': '零售批发'},
- '医药品': {'大类': '医药品', '门类': '零售批发'},
- '卫星传输服务': {'大类': '电信、广播电视和卫星传输服务', '门类': '信息传输、软件和信息技术服务业'},
- '卫生': {'大类': '卫生', '门类': '卫生和社会工作'},
- '印刷服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
- '图书和档案': {'大类': '图书和档案', '门类': '零售批发'},
- '图书档案设备': {'大类': '通用设备', '门类': '零售批发'},
- '图书馆与档案馆': {'大类': '文化艺术业', '门类': '文化、体育和娱乐业'},
- '土地管理业': {'大类': '土地管理业', '门类': '水利、环境和公共设施管理业'},
- '地质勘查': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
- '地震服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
- '场馆、站港用房': {'大类': '房屋建筑业', '门类': '建筑业'},
- '城市公共交通运输': {'大类': '道路运输业', '门类': '交通运输、仓储和邮政业'},
- '塑料制品、半成品及辅料': {'大类': '橡胶、塑料、玻璃和陶瓷制品', '门类': '零售批发'},
- '天然石料': {'大类': '建筑建材', '门类': '零售批发'},
- '娱乐设备': {'大类': '专用设备', '门类': '零售批发'},
- '婚姻服务': {'大类': '居民服务业', '门类': '居民服务、修理和其他服务业'},
- '安全保护服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
- '安全生产设备': {'大类': '专用设备', '门类': '零售批发'},
- '家具用具': {'大类': '家具用具', '门类': '零售批发'},
- '家用电器修理': {'大类': '机动车、电子产品和日用产品修理业', '门类': '居民服务、修理和其他服务业'},
- '工业、生产用房': {'大类': '房屋建筑业', '门类': '建筑业'},
- '工业与专业设计及其他专业技术服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
- '工矿工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'},
- '工程技术与设计服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
- '工程机械': {'大类': '专用设备', '门类': '零售批发'},
- '工程监理服务': {'大类': '工程服务', '门类': '科学研究和技术服务业'},
- '工程评价服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
- '工程造价服务': {'大类': '工程服务', '门类': '科学研究和技术服务业'},
- '市场调查': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
- '广告业': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
- '广播': {'大类': '广播、电视、电影和影视录音制作业', '门类': '文化、体育和娱乐业'},
- '广播、电视、电影设备': {'大类': '通用设备', '门类': '零售批发'},
- '广播电视传输服务': {'大类': '电信、广播电视和卫星传输服务', '门类': '信息传输、软件和信息技术服务业'},
- '废弃资源综合利用业': {'大类': '废弃资源综合利用业', '门类': '废弃资源综合利用业'},
- '建筑涂料': {'大类': '建筑建材', '门类': '零售批发'},
- '建筑物、构筑物附属结构': {'大类': '建筑建材', '门类': '零售批发'},
- '建筑物拆除和场地准备活动': {'大类': '建筑装饰和其他建筑业', '门类': '建筑业'},
- '建筑装饰和装修业': {'大类': '建筑装饰和其他建筑业', '门类': '建筑业'},
- '录音制作': {'大类': '广播、电视、电影和影视录音制作业', '门类': '文化、体育和娱乐业'},
- '影视节目制作': {'大类': '广播、电视、电影和影视录音制作业', '门类': '文化、体育和娱乐业'},
- '房地产中介服务': {'大类': '房地产业', '门类': '房地产业'},
- '房地产开发经营': {'大类': '房地产业', '门类': '房地产业'},
- '房地产租赁经营': {'大类': '房地产业', '门类': '房地产业'},
- '房屋租赁': {'大类': '租赁业', '门类': '租赁和商务服务业'},
- '招标代理': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
- '探矿、采矿、选矿和造块设备': {'大类': '专用设备', '门类': '零售批发'},
- '政法、检测专用设备': {'大类': '专用设备', '门类': '零售批发'},
- '教育服务': {'大类': '教育服务', '门类': '教育'},
- '教育设备': {'大类': '专用设备', '门类': '零售批发'},
- '文体设备和用品出租': {'大类': '租赁业', '门类': '租赁和商务服务业'},
- '文物及非物质文化遗产保护': {'大类': '文化艺术业', '门类': '文化、体育和娱乐业'},
- '文物和陈列品': {'大类': '文物和陈列品', '门类': '零售批发'},
- '文艺创作与表演': {'大类': '文化艺术业', '门类': '文化、体育和娱乐业'},
- '文艺设备': {'大类': '专用设备', '门类': '零售批发'},
- '新闻业': {'大类': '新闻和出版业', '门类': '文化、体育和娱乐业'},
- '旅行社及相关服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
- '日杂用品': {'大类': '日杂用品', '门类': '零售批发'},
- '有色金属冶炼及压延产品': {'大类': '建筑建材', '门类': '零售批发'},
- '有色金属矿': {'大类': '矿与矿物', '门类': '零售批发'},
- '木材、板材等': {'大类': '建筑建材', '门类': '零售批发'},
- '木材采集和加工设备': {'大类': '专用设备', '门类': '零售批发'},
- '机械设备': {'大类': '通用设备', '门类': '零售批发'},
- '机械设备经营租赁': {'大类': '租赁业', '门类': '租赁和商务服务业'},
- '林业产品': {'大类': '农林牧渔业产品', '门类': '零售批发'},
- '林业服务': {'大类': '农林牧副渔服务', '门类': '农林牧副渔服务'},
- '架线和管道工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'},
- '核工业专用设备': {'大类': '专用设备', '门类': '零售批发'},
- '橡胶制品': {'大类': '橡胶、塑料、玻璃和陶瓷制品', '门类': '零售批发'},
- '殡葬服务': {'大类': '居民服务业', '门类': '居民服务、修理和其他服务业'},
- '殡葬设备及用品': {'大类': '专用设备', '门类': '零售批发'},
- '气象服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
- '水上交通运输设备': {'大类': '专用设备', '门类': '零售批发'},
- '水上运输业': {'大类': '水上运输业', '门类': '交通运输、仓储和邮政业'},
- '水利和水运工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'},
- '水工机械': {'大类': '专用设备', '门类': '零售批发'},
- '水文服务': {'大类': '水利管理业', '门类': '水利、环境和公共设施管理业'},
- '水资源管理': {'大类': '水利管理业', '门类': '水利、环境和公共设施管理业'},
- '污水处理及其再生利用': {'大类': '水的生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'},
- '汽车、摩托车修理与维护': {'大类': '机动车、电子产品和日用产品修理业', '门类': '居民服务、修理和其他服务业'},
- '法律服务': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
- '洗染服务': {'大类': '居民服务业', '门类': '居民服务、修理和其他服务业'},
- '测绘地理信息服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
- '海洋仪器设备': {'大类': '专用设备', '门类': '零售批发'},
- '海洋工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'},
- '海洋服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
- '消防设备': {'大类': '专用设备', '门类': '零售批发'},
- '清洁服务': {'大类': '其他服务业', '门类': '居民服务、修理和其他服务业'},
- '渔业产品': {'大类': '农林牧渔业产品', '门类': '零售批发'},
- '渔业服务': {'大类': '农林牧副渔服务', '门类': '农林牧副渔服务'},
- '炼焦和金属冶炼轧制设备': {'大类': '专用设备', '门类': '零售批发'},
- '烟草加工设备': {'大类': '专用设备', '门类': '零售批发'},
- '热力生产和供应': {'大类': '电力、热力生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'},
- '焦炭及其副产品': {'大类': '炼焦产品、炼油产品', '门类': '零售批发'},
- '煤炭采选产品': {'大类': '矿与矿物', '门类': '零售批发'},
- '燃气生产和供应业': {'大类': '燃气生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'},
- '物业管理': {'大类': '房地产业', '门类': '房地产业'},
- '特种用途动、植物': {'大类': '农林牧渔业产品', '门类': '零售批发'},
- '环保咨询': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
- '环境与生态监测检测服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
- '环境污染防治设备': {'大类': '专用设备', '门类': '零售批发'},
- '环境治理业': {'大类': '生态保护和环境治理业', '门类': '水利、环境和公共设施管理业'},
- '玻璃及其制品': {'大类': '橡胶、塑料、玻璃和陶瓷制品', '门类': '零售批发'},
- '理发及美容服务': {'大类': '居民服务业', '门类': '居民服务、修理和其他服务业'},
- '生态保护': {'大类': '生态保护和环境治理业', '门类': '水利、环境和公共设施管理业'},
- '电信': {'大类': '电信、广播电视和卫星传输服务', '门类': '信息传输、软件和信息技术服务业'},
- '电力、城市燃气、蒸汽和热水、水': {'大类': '电力、城市燃气、蒸汽和热水、水', '门类': '零售批发'},
- '电力供应': {'大类': '电力、热力生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'},
- '电力工业专用设备': {'大类': '专用设备', '门类': '零售批发'},
- '电力工程施工': {'大类': '土木工程建筑业', '门类': '建筑业'},
- '电力生产': {'大类': '电力、热力生产和供应业', '门类': '电力、热力、燃气及水生产和供应业'},
- '电子和通信测量仪器': {'大类': '通用设备', '门类': '零售批发'},
- '电工、电子专用生产设备': {'大类': '专用设备', '门类': '零售批发'},
- '电影放映': {'大类': '广播、电视、电影和影视录音制作业', '门类': '文化、体育和娱乐业'},
- '电气安装': {'大类': '建筑安装业', '门类': '建筑业'},
- '电气设备': {'大类': '通用设备', '门类': '零售批发'},
- '电气设备修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'},
- '畜牧业服务': {'大类': '农林牧副渔服务', '门类': '农林牧副渔服务'},
- '监控设备': {'大类': '通用设备', '门类': '零售批发'},
- '石油制品': {'大类': '炼焦产品、炼油产品', '门类': '零售批发'},
- '石油和化学工业专用设备': {'大类': '专用设备', '门类': '零售批发'},
- '石油和天然气开采产品': {'大类': '矿与矿物', '门类': '零售批发'},
- '石油天然气开采专用设备': {'大类': '专用设备', '门类': '零售批发'},
- '研究和试验发展': {'大类': '研究和试验发展', '门类': '科学研究和技术服务业'},
- '社会工作': {'大类': '社会工作', '门类': '卫生和社会工作'},
- '社会经济咨询': {'大类': '商务服务业', '门类': '租赁和商务服务业'},
- '科技推广和应用服务业': {'大类': '科技推广和应用服务业', '门类': '科学研究和技术服务业'},
- '科研、医疗、教育用房': {'大类': '房屋建筑业', '门类': '建筑业'},
- '管道和设备安装': {'大类': '建筑安装业', '门类': '建筑业'},
- '粮油作物和饲料加工设备': {'大类': '专用设备', '门类': '零售批发'},
- '纸、纸制品及印刷品': {'大类': '纸、纸制品及印刷品', '门类': '零售批发'},
- '纺织原料、毛皮、被服装具': {'大类': '纺织原料、毛皮、被服装具', '门类': '零售批发'},
- '纺织设备': {'大类': '专用设备', '门类': '零售批发'},
- '绿化管理': {'大类': '公共设施管理业', '门类': '水利、环境和公共设施管理业'},
- '缝纫、服饰、制革和毛皮加工设备': {'大类': '专用设备', '门类': '零售批发'},
- '航空器及其配套设备': {'大类': '专用设备', '门类': '零售批发'},
- '航空客货运输': {'大类': '航空运输业', '门类': '交通运输、仓储和邮政业'},
- '航空航天工业专用设备': {'大类': '专用设备', '门类': '零售批发'},
- '节能环保工程施工': {'大类': '土木工程建筑业', '门类': '建筑业'},
- '装卸搬运': {'大类': '装卸搬运和运输代理业', '门类': '交通运输、仓储和邮政业'},
- '计算机和办公设备维修': {'大类': '机动车、电子产品和日用产品修理业', '门类': '居民服务、修理和其他服务业'},
- '计算机设备': {'大类': '通用设备', '门类': '零售批发'},
- '计量标准器具及量具、衡器': {'大类': '通用设备', '门类': '零售批发'},
- '货币处理专用设备': {'大类': '专用设备', '门类': '零售批发'},
- '货币金融服务': {'大类': '货币金融服务', '门类': '金融业'},
- '质检技术服务': {'大类': '专业技术服务业', '门类': '科学研究和技术服务业'},
- '资本市场服务': {'大类': '资本市场服务', '门类': '金融业'},
- '车辆': {'大类': '通用设备', '门类': '零售批发'},
- '边界勘界和联检专用设备': {'大类': '专用设备', '门类': '零售批发'},
- '运行维护服务': {'大类': '软件和信息技术服务业', '门类': '信息传输、软件和信息技术服务业'},
- '通信设备': {'大类': '通用设备', '门类': '零售批发'},
- '通用设备修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'},
- '道路货物运输': {'大类': '道路运输业', '门类': '交通运输、仓储和邮政业'},
- '邮政专用设备': {'大类': '专用设备', '门类': '零售批发'},
- '邮政业': {'大类': '邮政业', '门类': '交通运输、仓储和邮政业'},
- '采矿业和制造业服务': {'大类': '采矿业和制造业服务', '门类': '农林牧副渔服务'},
- '铁路、船舶、航空航天等运输设备修理': {'大类': '金属制品、机械和设备修理业', '门类': '金属制品、机械和设备修理业'},
- '铁路、道路、隧道和桥梁工程建筑': {'大类': '土木工程建筑业', '门类': '建筑业'},
- '铁路运输设备': {'大类': '专用设备', '门类': '零售批发'},
- '防洪除涝设施管理': {'大类': '水利管理业', '门类': '水利、环境和公共设施管理业'},
- '陶瓷制品': {'大类': '橡胶、塑料、玻璃和陶瓷制品', '门类': '零售批发'},
- '雷达、无线电和卫星导航设备': {'大类': '通用设备', '门类': '零售批发'},
- '非金属矿': {'大类': '矿与矿物', '门类': '零售批发'},
- '非金属矿物制品工业专用设备': {'大类': '专用设备', '门类': '零售批发'},
- '非金属矿物材料': {'大类': '建筑建材', '门类': '零售批发'},
- '食品加工专用设备': {'大类': '专用设备', '门类': '零售批发'},
- '食品及加工盐': {'大类': '食品、饮料和烟草原料', '门类': '零售批发'},
- '餐饮业': {'大类': '餐饮业', '门类': '住宿和餐饮业'},
- '饮料、酒精及精制茶': {'大类': '食品、饮料和烟草原料', '门类': '零售批发'},
- '饮料加工设备': {'大类': '专用设备', '门类': '零售批发'},
- '饲养动物及其产品': {'大类': '农林牧渔业产品', '门类': '零售批发'},
- '黑色金属冶炼及压延产品': {'大类': '建筑建材', '门类': '零售批发'},
- '黑色金属矿': {'大类': '矿与矿物', '门类': '零售批发'}}
- self.sess = tf.Session(graph=tf.Graph())
- self.get_model()
- with open(os.path.dirname(__file__)+'/industry_rule_kw_json/tw_industry_keyword_org/tw_industry_keyword_org.json', 'r',
- encoding='utf-8') as fp1:
- self.json_data_industry = json.load(fp1)
- with open(os.path.dirname(__file__)+'/industry_rule_kw_json/tw_company_classification_keyword/tw_company_classification_keyword.json', 'r',
- encoding='utf-8') as fp2:
- self.json_data_company = json.load(fp2)
- with open(os.path.dirname(__file__)+'/industry_rule_kw_json/tw_custom_keyword/tw_custom_keyword.json', 'r', encoding='utf-8') as fp3:
- self.json_data_custom = json.load(fp3)
- def get_model(self):
- # Load the TF1 SavedModel (tag 'serve') from ./industry_model into self.sess
- # and cache the tensors resolved from the default serving signature.
- with self.sess.as_default() as sess:
- with self.sess.graph.as_default():
- meta_graph_def = tf.saved_model.loader.load(sess,
- tags=['serve'],
- export_dir=os.path.dirname(__file__)+'/industry_model')
- signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
- signature_def = meta_graph_def.signature_def
- # Input tensors: embedded title / project / product arrays.
- self.title = sess.graph.get_tensor_by_name(signature_def[signature_key].inputs['title'].name)
- self.project = sess.graph.get_tensor_by_name(signature_def[signature_key].inputs['project'].name)
- self.product = sess.graph.get_tensor_by_name(signature_def[signature_key].inputs['product'].name)
- # Output tensor: per-class score vector consumed by predict_model.
- self.outputs = sess.graph.get_tensor_by_name(signature_def[signature_key].outputs['outputs'].name)
- def text2array(self, text, tenderee='', maxSententLen=20):
- # Tokenize `text` and embed it into an array of shape
- # (num_sentences, maxSententLen, 128) for the industry model.
- # Full-width parentheses are normalised so tenderee removal matches.
- tenderee = tenderee.replace('(', '(').replace(')', ')')
- text = text.replace('(', '(').replace(')', ')')
- # Strip boilerplate announcement wording (tender/bid/result/notice phrases).
- text = re.sub(
- '(废标|终止|综?合?评审|评标|开标|资审|履约|验收|成交|中标人?|中选人?|单一来源|合同|候选人|结果|变更|更正|答疑|澄清|意向|需求|采购|招标|询比?价|磋商|谈判|比选|比价|竞价|议价)的?(公告|预告|公示)?|关于为?|选取|定点|直接|邀请函?|通知书?|备案|公开|公示|公告|记录|竞争性',
- '', text)
- # Remove the tenderee name from the text before embedding.
- text = text.replace(tenderee, '')
- text = ' ' if text=="" else text
- words_docs_list = selffool.cut(text)
- # Keep only pure-Chinese tokens, truncated to the last maxSententLen words per sentence.
- words_docs_list = [[it for it in l if re.search('^[\u4e00-\u9fa5]+$', it)][-maxSententLen:] for l in words_docs_list]
- array = embedding(words_docs_list, shape=(len(words_docs_list), maxSententLen, 128))
- return array
def process(self, title, project, product, tenderee):
    """Turn the three text fields into model-ready embedding arrays.

    The tenderee name is stripped from title and project inside
    text2array; product is embedded without tenderee removal.
    """
    title_arr = self.text2array(title, tenderee)
    project_arr = self.text2array(project, tenderee)
    product_arr = self.text2array(product)
    return title_arr, project_arr, product_arr
- def predict_model(self, title, project, product, tenderee=''):
- # Run the TF model on the embedded inputs and return
- # (predicted label text from self.id2lb, score of that label).
- title_array, project_array, product_array = self.process(title, project, product, tenderee)
- rs = self.sess.run(self.outputs,
- feed_dict={
- self.title:title_array,
- self.project:project_array,
- self.product:product_array
- }
- )
- pred = np.argmax(rs[0])
- return self.id2lb[pred], rs[0][pred]
- # # return top-2 results
- # pred_list = np.argsort(-rs[0])
- # return self.id2lb[pred_list[0]], self.id2lb[pred_list[1]], rs[0][pred_list[0]], rs[0][pred_list[1]]
- def predict_rule(self, doctitle, tenderee, win_tenderer, project_name, product):
- # Rule-based industry scoring: accumulate weighted keyword hits from three
- # keyword tables (industry, company, custom) and return
- # (industries sorted by score desc, their scores as strings, matched keywords).
- doctitle = doctitle if doctitle else ''
- tenderee = tenderee if tenderee else ''
- win_tenderer = win_tenderer if win_tenderer else ''
- project_name = project_name if project_name else ''
- product = product if product else ''
- # Main content with the tenderee name removed; winner name kept separate.
- text_ind = (doctitle + project_name + product).replace(tenderee, '')
- text_com = win_tenderer
- # +1 avoids division by zero for empty texts.
- length_ind_text = len(text_ind) + 1
- length_com_text = len(text_com) + 1
- # print(text)
- dic_res = {} # industry -> accumulated score
- score_lst = [] # scores aligned with the sorted industry list
- word_lst = [] # keywords that contributed a positive score
- # keywords matched against the main content
- if text_ind:
- # logging.info("data_ind%s"%str(_json_data_industry[0]))
- for data_industry in self.json_data_industry:
- industry = data_industry['xiaolei']
- key_word = data_industry['key_word']
- key_word_2 = data_industry['key_word2']
- power = float(data_industry['power']) if data_industry['power'] else 0
- # score = weight * (hit count * keyword length / text length)
- this_score = power * (text_ind.count(key_word) * len(key_word) / length_ind_text)
- if key_word_2:
- # key_word_compose = key_word + "+" + key_word_2
- # a secondary keyword, when present, must also occur in the text
- if text_ind.count(key_word_2) == 0:
- this_score = 0
- if this_score > 0:
- # print(industry,key_word,this_score)
- if industry in dic_res.keys():
- dic_res[industry] += this_score
- else:
- dic_res[industry] = this_score
- if key_word not in word_lst:
- word_lst.append(key_word)
- # supplier (winner) name keywords
- if text_com:
- for data_company in self.json_data_company:
- industry = data_company['industry_type']
- key_word = data_company['company_word']
- power = float(data_company['industry_rate']) if data_company['industry_rate'] else 0
- this_score = power * (text_com.count(key_word) * len(key_word) / length_com_text)
- if this_score > 0:
- # print(industry,key_word,this_score)
- if industry in dic_res.keys():
- dic_res[industry] += this_score
- else:
- dic_res[industry] = this_score
- if key_word not in word_lst:
- word_lst.append(key_word)
- # custom keyword rules: [subject fields, subject regex, allowed industries, forced industry]
- if text_ind:
- custom_ind = [
- ['tenderee', '医院|疾病预防', ['设备', '系统', '器'], '医疗设备'],
- ['tenderee', '学校|大学|小学|中学|学院|幼儿园', ['设备', '器'], '教育设备'],
- ['tenderee', '学校|大学|小学|中学|学院|幼儿园|医院', ['工程'], '科研、医疗、教育用房'],
- ['tenderee', '供电局|电网|国网|电力|电厂|粤电', ['设备', '器', '物资'], '电力工业专用设备'],
- ['tenderee', '公安|法院|检察院', ['设备', '器'], '政法、检测专用设备'],
- ['tenderee', '^中铁|^中交|^中建|中国建筑', ['材料'], '其他建筑建材'],
- ['doctextcon', '信息技术服务|系统开发|信息化|信息系统', ['监理'], '信息技术咨询服务'],
- ['doctextcon', '工程', ['消防'], '专业施工'],
- ['doctextcon', '铁路|航空|船舶|航天|广铁', ['维修'], '铁路、船舶、航空航天等运输设备修理'],
- ['doctextcon', '设备|仪|器', ['租赁'], '机械设备经营租赁'],
- ['doctextcon', '交通|铁路|公路|道路|桥梁', ['工程'], '铁路、道路、隧道和桥梁工程建筑'],
- ['win_tenderer', '电力', ['设备', '器'], '电力工业专用设备'],
- ['win_tenderer', '信息|网络科技', ['系统'], '信息系统集成和物联网技术服务'],
- ['tenderee,doctextcon', '铁路|广铁|铁道', ['设备', '器', '物资', '材料', '铁路'], '铁路运输设备'],
- ]
- for data_custom in self.json_data_custom:
- industry_custom = data_custom['industry']
- key_word = data_custom['company_word']
- power = float(data_custom['industry_rate'])
- for k in range(len(custom_ind)):
- # build the subject string from the fields named in the rule
- subject = ''
- if 'tenderee' in custom_ind[k][0]:
- subject += tenderee
- if 'win_tenderer' in custom_ind[k][0]:
- subject += win_tenderer
- if 'doctextcon' in custom_ind[k][0]:
- subject += text_ind
- ptn = custom_ind[k][1]
- # print('ptn',ptn)
- # the rule fires only when the subject matches and the custom
- # keyword's industry is in the rule's allowed list
- if re.search(ptn, subject) and industry_custom in custom_ind[k][2]:
- industry = custom_ind[k][3]
- else:
- continue
- this_score = power * (text_ind.count(key_word) * len(key_word) / len(subject))
- if this_score > 0:
- # print(industry,key_word,this_score)
- if industry in dic_res.keys():
- dic_res[industry] += this_score
- else:
- dic_res[industry] = this_score
- if key_word not in word_lst:
- word_lst.append(key_word)
- sort_res = sorted(dic_res.items(), key=lambda x: x[1], reverse=True)
- lst_res = [s[0] for s in sort_res]
- score_lst = [str(round(float(s[1]), 2)) for s in sort_res]
- if len(lst_res) > 0:
- return lst_res, score_lst, word_lst
- else:
- # no rule fired: empty industry placeholder
- return [""], [], []
def predict_merge(self, pinmu_type, industry_lst):
    '''
    Decide, through a series of precedence rules, whether the model
    prediction or the rule prediction becomes the final industry class.

    :param pinmu_type: class name predicted by the model
    :param industry_lst: class names predicted by the keyword rules, best first
    :return: the final class name (str)
    '''
    industry_type = industry_lst[0]
    if industry_type == "":
        # rules produced nothing — fall back to the model
        return pinmu_type
    # Each branch encodes one precedence rule between the two sources.
    if industry_type == '专用设备修理' and re.search('修理|维修|装修|修缮', pinmu_type):
        final_type = pinmu_type
    elif industry_type == '其他土木工程建筑' and re.search('工程|建筑|用房|施工|安装|质检|其他专业咨询与调查', pinmu_type):
        final_type = pinmu_type
    elif pinmu_type == '专用设备修理' and re.search('工程|修理', industry_type):
        final_type = industry_type
    elif pinmu_type == '信息系统集成和物联网技术服务' and re.search('卫星传输|信息处理和存储支持服务|信息技术咨询服务|运行维护服务|其他专业技术服务|医疗设备|医药品',
                                                 industry_type):
        final_type = industry_type
    elif industry_type == '仪器仪表' and re.search('仪器|器具|医疗设备', pinmu_type):
        final_type = pinmu_type
    # NOTE: the original code repeated this '医药品' branch twice verbatim;
    # the second copy was unreachable dead code and has been removed.
    elif industry_type == '医药品' and re.search('医疗设备', pinmu_type):
        final_type = pinmu_type
    elif re.search('设备', industry_type) and re.search('修理|维修', pinmu_type):
        final_type = pinmu_type
    elif industry_type == '社会工作' and re.search('工程', pinmu_type):
        final_type = pinmu_type
    elif industry_type == '信息系统集成和物联网技术服务' and re.search('信息处理|设备', pinmu_type):
        final_type = pinmu_type
    elif industry_type == '研究和试验发展' and re.search('其他专业咨询与调查|质检技术服务|信息系统集成|其他工程服务', pinmu_type):
        final_type = pinmu_type
    elif industry_type == '其他专业咨询与调查' and re.search('工程造价服务', pinmu_type):
        final_type = pinmu_type
    elif industry_type == '广告业' and re.search('印刷服务|影视节目制作|信息系统', pinmu_type):
        final_type = pinmu_type
    elif industry_type == '清洁服务' and re.search('工程|环境污染防治设备|修理', pinmu_type):
        final_type = pinmu_type
    elif industry_type == '其他公共设施管理' and re.search('信息系统', pinmu_type):
        final_type = pinmu_type
    elif industry_type == '其他专业技术服务' and re.search('工程技术与设计服务|质检技术服务|环境与生态监测检测服务', pinmu_type):
        final_type = pinmu_type
    elif industry_type == '机械设备经营租赁' and re.search('电信', pinmu_type):
        final_type = pinmu_type
    elif industry_type == '货币金融服务' and re.search('信息系统集成和物联网技术服务', pinmu_type):
        final_type = pinmu_type
    elif industry_type == '体育场地设施管理' and re.search('体育设备', pinmu_type):
        final_type = pinmu_type
    elif industry_type == '安全保护服务' and re.search('信息系统|监控设备|互联网安全服务', pinmu_type):
        final_type = pinmu_type
    elif industry_type == '互联网接入及相关服务' and re.search('通信设备', pinmu_type):
        final_type = pinmu_type
    elif industry_type == '卫生' and re.search('医疗设备|信息系统', pinmu_type):
        final_type = pinmu_type
    elif pinmu_type == '研究和试验发展' and re.search('其他工程服务', industry_type):
        final_type = industry_type
    elif pinmu_type == '办公设备' and re.search('教育设备', industry_type):
        final_type = industry_type
    elif re.search('车辆|机械设备经营租赁', pinmu_type) and re.search('公路旅客运输', industry_type):
        final_type = industry_type
    # when model and second-best rule agree, trust them unless the best rule
    # is one of the always-kept categories or the model class is generic
    elif len(industry_lst) > 1 and pinmu_type == industry_lst[1] and re.search('会计|法律|物业|家具|印刷|互联网安全',
                                                                              industry_type) == None \
            and re.search('其他|人力资源服务', pinmu_type) == None:
        final_type = pinmu_type
    elif industry_type != "":
        final_type = industry_type
    else:
        # unreachable (industry_type == "" returned above); kept defensively
        final_type = pinmu_type
    return final_type
- def predict(self, title, project, product, prem):
- # Combine the model prediction and the rule prediction into the final
- # industry classification, looked up in self.industry_dic for the
- # parent (大类) and top-level (门类) categories.
- def get_ree_win(prem):
- # Pull the first tenderee and win_tenderer role texts out of prem.
- tenderee = ""
- win_tenderer = ""
- try:
- for v in prem[0]['prem'].values():
- for link in v['roleList']:
- if link['role_name'] == 'tenderee' and tenderee == "":
- tenderee = link['role_text']
- elif link['role_name'] == 'win_tenderer' and win_tenderer == "":
- win_tenderer = link['role_text']
- except Exception as e:
- print('解析prem 获取招标人、中标人出错')
- return tenderee, win_tenderer
- tenderee, win_tenderer = get_ree_win(prem)
- result_model, prob = self.predict_model(title, project, product, tenderee)
- industry_lst, score_lst, word_lst = self.predict_rule(title, tenderee, win_tenderer, project, product)
- final_type = self.predict_merge(result_model, industry_lst)
- # print('模型:%s;规则:%s;最终:%s'%(result_model, industry_lst[0], final_type))
- # return {'industry': final_type}
- return {'industry': {
- 'class_name': final_type,
- 'subclass': self.industry_dic[final_type]['大类'],
- 'class': self.industry_dic[final_type]['门类']
- }
- }
- class DistrictPredictor():
- # Predict the administrative district (area/province/city/district) of an
- # announcement from a pickled gazetteer of short and full place names.
- def __init__(self):
- with open(os.path.dirname(__file__)+'/district_dic.pkl', 'rb') as f:
- dist_dic = pickle.load(f)
- # Alternation regexes over all short/full names, longest first so the
- # regex engine prefers the longest alternative.
- short_name = '|'.join(sorted(set([v['简称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True))
- full_name = '|'.join(sorted(set([v['全称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True))
- # name -> list of gazetteer ids (a name can belong to several places)
- short2id = {}
- full2id = {}
- for k, v in dist_dic.items():
- if v['简称'] not in short2id:
- short2id[v['简称']] = [k]
- else:
- short2id[v['简称']].append(k)
- if v['全称'] not in full2id:
- full2id[v['全称']] = [k]
- else:
- full2id[v['全称']].append(k)
- self.dist_dic = dist_dic
- self.short_name = short_name
- self.full_name = full_name
- self.short2id = short2id
- self.full2id = full2id
- # self.f = open(os.path.dirname(__file__)+'/../test/data/district_predict.txt', 'w', encoding='utf-8')
- def predict(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
- '''
- First match project_name+tenderee+tenderee_address; if the province or
- city is still missing, match title+content in a second pass.
- :param project_name:
- :param prem:
- :param title:
- :param list_articles:
- :param web_source_name:
- :return:
- '''
- def get_ree_addr(prem):
- # Extract the first tenderee name and its address from prem.
- tenderee = ""
- tenderee_address = ""
- try:
- for v in prem[0]['prem'].values():
- for link in v['roleList']:
- if link['role_name'] == 'tenderee' and tenderee == "":
- tenderee = link['role_text']
- tenderee_address = link['address']
- except Exception as e:
- print('解析prem 获取招标人、及地址出错')
- return tenderee, tenderee_address
- def get_area(text, web_source_name, not_in_content=True):
- # Score every place-name hit in `text` (plus the source-site name)
- # and aggregate the scores into province/city/district.
- score_l = []
- id_set = set()
- if re.search(self.short_name, text):
- # full-name matches: score by name coverage of the text, +1 bonus
- for it in re.finditer(self.full_name, text):
- name = it.group(0)
- score = len(name) / len(text)
- for _id in self.full2id[name]:
- area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
- # score_l.append([_id, score] + area)
- # w = self.dist_dic[_id]['权重']
- score_l.append([_id, score + 1] + area) # full-name matches get +1; raw weight not used here because some weight values are unreliable
- flag = 0
- for it in re.finditer(self.short_name, text):
- # skip hits that are followed by street/river/landmark suffixes
- if it.end() < len(text) and re.search('^(村|镇|街|路|江|河|湖|北路|南路|东路|大道|社区)', text[it.end():]) == None:
- name = it.group(0)
- score = (it.start() + len(name)) / len(text)
- for _id in self.short2id[name]:
- score2 = 0
- w = self.dist_dic[_id]['权重']
- _type = self.dist_dic[_id]['类型']
- area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
- if area[0] in ['2', '16', '20', '30']:
- _type += 10
- if w < 1 and it.end() < len(text) and text[it.end()] in ['省', '市', '县']: # raise weight to 1 when the short name is followed by 省/市/县
- w = 1
- score2 += w
- if _id not in id_set:
- if _type == 20:
- type_w = 3
- elif _type == 30:
- if it.start()>3 and text[it.start()-1] == '市': # a short name right after 市 cannot itself be a city
- type_w = 0
- else:
- type_w = 2
- else:
- if it.end()<len(text) and text[it.end()] == '市': # a short name followed by 市 is promoted to city level
- type_w = 2
- else:
- type_w = 0.5
- id_set.add(_id)
- score2 += w * type_w
- score_l.append([_id, score * w + score2] + area)
- if flag == 1:
- # NOTE(review): `flag` is never set to 1 above — dead branch.
- pass
- # print('score', score)
- # source-site names (unless they look like a company) add a small score
- if re.search('公司', web_source_name) == None:
- for it in re.finditer(self.short_name, web_source_name):
- name = it.group(0)
- for _id in self.short2id[name]:
- area = self.dist_dic[_id]['area'] + [''] * (3 - len(self.dist_dic[_id]['area']))
- w = self.dist_dic[_id]['权重']
- score = w * 0.2
- score_l.append([_id, score] + area)
- # default result when nothing (or nothing reliable) matched
- area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
- if len(score_l) == 0:
- return {'district': area_dic}
- else:
- # aggregate scores hierarchically: province, then city, then district
- df = pd.DataFrame(score_l, columns=['id', 'score', 'province', 'city', 'district'])
- df['简称'] = df['id'].apply(lambda x: self.dist_dic[x]['地区'])
- # print('地区评分:')
- # print(df)
- df_pro = df.groupby('province').sum().sort_values(by=['score'], ascending=False)
- pro_id = df_pro.index[0]
- if df_pro.loc[pro_id, 'score'] < 0.1 and not_in_content: # unless this is the second full-text pass, discard province scores below 0.1
- # print('评分低于0.1', df_pro.loc[pro_id, 'score'], self.dist_dic[pro_id]['地区'])
- return {'district': area_dic}
- area_dic['province'] = self.dist_dic[pro_id]['地区']
- area_dic['area'] = self.dist_dic[pro_id]['大区']
- df = df[df['city'] != ""]
- df = df[df['province'] == pro_id]
- if len(df) > 0:
- df_city = df.groupby('city').sum().sort_values(by=['score'], ascending=False)
- city_id = df_city.index[0]
- area_dic['city'] = self.dist_dic[city_id]['地区']
- df = df[df['district'] != ""]
- df = df[df['city'] == city_id]
- if len(df) > 0:
- df_dist = df.groupby('district').sum().sort_values(by=['score'], ascending=False)
- dist_id = df_dist.index[0]
- area_dic['district'] = self.dist_dic[dist_id]['地区']
- # print(area_dic)
- return {'district': area_dic}
- def get_role_address(text):
- '''Regex-extract the tenderee address.
- p3: address directly after the tenderee, e.g. 招标人:xxx,地址:xxx
- p4: tenderee and agent together, two addresses, e.g. 招标人:xxx, 代理人:xxx, 地址:xxx, 地址:xxx.
- '''
- p3 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
- p4 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(招标|采购)?代理(人|机构)(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
- p5 = '(采购|招标)(人|单位)(联系)?地址:(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
- if re.search(p3, text):
- return re.search(p3, text).group('addr')
- elif re.search(p4, text):
- return re.search(p4, text).group('addr')
- elif re.search(p5, text):
- return re.search(p5, text).group('addr')
- else:
- return ''
- def get_project_addr(text):
- # Regex-extract a project/delivery location phrase.
- p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货)(地址|地点|位置|所在地区?):(\w{2,8}[省市州区县][^\w]*)+'
- if re.search(p1, text):
- return re.search(p1, text).group(0)
- else:
- return ''
- def get_bid_addr(text):
- # Regex-extract a bid-opening / submission location phrase.
- p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?):(\w{2,8}[省市州区县][^\w]*)+'
- if re.search(p2, text):
- return re.search(p2, text).group(0)
- else:
- return ''
- def get_all_addr(list_entitys):
- # Collect all location entities and all tenderee/agent org names
- # (label 0 or 1) from the entity list.
- tenderee_l = []
- addr_l = []
- for ent in list_entitys[0]:
- if ent.entity_type == 'location' and len(ent.entity_text)>2:
- addr_l.append(ent.entity_text)
- elif ent.entity_type in ['org', 'company']:
- if ent.label in [0, 1]: # tenderee or agent roles
- tenderee_l.append(ent.entity_text)
- return ' '.join(addr_l), ' '.join(tenderee_l)
- def get_title_addr(text):
- # Regex-extract a place-name phrase from the title.
- p1 = '(\w{2,8}[省市州区县][^\w]*)+'
- if re.search(p1, text):
- return re.search(p1, text).group(0)
- else:
- return ''
- # Use the body before the attachment marker; fall back to the
- # attachment when the body is too short.
- if '##attachment##' in list_articles[0].content:
- content, attachment = list_articles[0].content.split('##attachment##')
- if len(content) < 200:
- content += attachment
- else:
- content = list_articles[0].content
- tenderee, tenderee_address = get_ree_addr(prem)
- msc = ""
- # address priority: project addr > role addr > title addr > bid addr
- pro_addr = get_project_addr(content)
- if pro_addr != "":
- msc += '使用规则提取的项目地址;'
- tenderee_address = pro_addr
- else:
- role_addr = get_role_address(content)
- if role_addr != "":
- msc += '使用规则提取的联系人地址;'
- tenderee_address = role_addr
- if tenderee_address == "":
- title_addr = get_title_addr(title)
- if title_addr != "":
- msc += '使用规则提取的标题地址;'
- tenderee_address = title_addr
- else:
- bid_addr = get_bid_addr(content)
- if bid_addr != "":
- msc += '使用规则提取的开标地址;'
- tenderee_address = bid_addr
- project_name = str(project_name)
- tenderee = str(tenderee)
- # print('招标人地址',role_addr, tenderee_address)
- project_name = project_name + title if project_name not in title else project_name
- project_name = project_name.replace(tenderee, '')
- text1 = "{0} {1} {2}".format(project_name, tenderee, tenderee_address)
- web_source_name = str(web_source_name) # guard against non-string values causing errors
- text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1) # avoid false place hits (合肥, 路南, 新会 ...) inside these words
- # print('text1:', text1)
- msc += '## 第一次预测输入:%s ##;'%text1
- rs = get_area(text1, web_source_name)
- msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % (
- rs['district']['province'], rs['district']['city'], rs['district']['district'])
- # self.f.write('%s %s \n' % (list_articles[0].id, msc))
- # print('地区匹配:', msc)
- # second pass over all entities + title when the first pass was inconclusive
- if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
- msc = ""
- all_addr, tenderees = get_all_addr(list_entitys)
- text2 = tenderees + " " + all_addr + ' ' + title
- msc += '使用实体列表所有招标人+所有地址;'
- # text2 += title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
- text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
- # print('text2:', text2)
- msc += '## 第二次预测输入:%s ##'%text2
- rs2 = get_area(text2, web_source_name, not_in_content=False)
- rs2['district']['is_in_text'] = True
- # keep the second pass only when it improves on the first
- if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
- rs = rs2
- elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':
- rs = rs2
- msc += '预测结果:省份:%s, 城市:%s,区县:%s'%(
- rs['district']['province'],rs['district']['city'],rs['district']['district'])
- # self.f.write('%s %s \n'%(list_articles[0].id, msc))
- # print('地区匹配:', msc)
- return rs
- class TableTag2List():
- '''Convert a soup <table> tag into a span-completed text grid [[td, td, td], [td, td, td]].'''
- def table2list(self, table, text_process=None):
- # Walk the <tr>/<td|th> cells, expanding rowspan/colspan so the result
- # is a rectangular grid of cell texts; returns [] for oversized tables.
- self._output = []
- row_ind = 0
- col_ind = 0
- for row in table.find_all('tr'):
- # record the smallest row_span, so that we know how many rows
- # we should skip
- smallest_row_span = 1
- if len(row.find_all(['td', 'th'], recursive=False)) > 20:
- log('未补全前表格列数大于20的不做表格处理')
- return []
- for cell in row.children:
- if cell.name in ('td', 'th'):
- # check multiple rows
- # pdb.set_trace()
- row_span = int(re.sub('[^0-9]', '', cell.get('rowspan'))) if cell.get('rowspan') and cell.get('rowspan').isdigit() else 1
- # try updating smallest_row_span
- smallest_row_span = min(smallest_row_span, row_span)
- # check multiple columns
- col_span = int(re.sub('[^0-9]', '', cell.get('colspan'))) if cell.get('colspan') and cell.get('colspan').isdigit() else 1
- # find the right index
- while True:
- if self._check_cell_validity(row_ind, col_ind):
- break
- col_ind += 1
- # insert into self._output
- try:
- if text_process != None:
- text = [re.sub('\xa0','',text_process(cell,final=False)),0]
- else:
- text = str(cell.get_text()).replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "(").replace(')', ')').replace('?', '')
- text = re.sub('\s', '', text)[:200] # only the first 200 chars are needed
- text = ' ' if text == "" else text
- self._insert(row_ind, col_ind, row_span, col_span, text)
- except UnicodeEncodeError:
- raise Exception( 'Failed to decode text; you might want to specify kwargs transformer=unicode' )
- # update col_ind
- col_ind += col_span
- if col_ind > 50 and text_process == None: # drop tables wider than 50 columns for element/candidate extraction
- return []
- # update row_ind
- row_ind += smallest_row_span
- col_ind = 0
- return self._output
- def _check_validity(self, i, j, height, width):
- """
- check if a rectangle (i, j, height, width) can be put into self.output
- """
- return all(self._check_cell_validity(ii, jj) for ii in range(i, i+height) for jj in range(j, j+width))
- def _check_cell_validity(self, i, j):
- """
- check if a cell (i, j) can be put into self._output
- """
- if i >= len(self._output):
- return True
- if j >= len(self._output[i]):
- return True
- if self._output[i][j] == "":
- return True
- return False
- def _insert(self, i, j, height, width, val):
- # pdb.set_trace()
- # Fill the height x width span with val via _insert_cell.
- for ii in range(i, i+height):
- for jj in range(j, j+width):
- self._insert_cell(ii, jj, val)
- def _insert_cell(self, i, j, val):
- # Grow the grid as needed, then set (i, j) only if it is still empty.
- while i >= len(self._output):
- self._output.append([])
- while j >= len(self._output[i]):
- self._output[i].append("")
- if self._output[i][j] == "":
- self._output[i][j] = val
- class TablePremExtractor(object):
- def __init__(self):
- '''Header-cell regex rules for each extractable field.'''
- self.head_rule_dic = {
- 'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
- 'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
- "project_name": "(包[段组件]|标[段包的]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|通用|主要标的)(名称?|内容)",
- "win_sort": "是否(中标|成交|中选)|排名|排序|名次|未(中标|成交)原因",
- "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源|邀请)?供应商(名称)?$",
- "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
- "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(单价|总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
- "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
- }
- # precomputed set of known header-cell texts used by find_header
- with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
- self.headerset = pickle.load(f)
- self.tb = TableTag2List()
- def find_header(self, td_list):
- # Map recognised header cells of one table row to (column index, text).
- # Returns (is_header_row, contains_some_header, header_dic).
- fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|/万?元', '', it) for it in td_list] # strip decorations irrelevant to header matching before deciding whether this is a header row
- header_dic = dict()
- flag = False
- contain_header = False
- # a row is a header row when >= 60% of its distinct cells are known headers
- if len(set(fix_td_list))>=2 and len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6:
- flag = True
- for i in range(len(td_list)) :
- text = td_list[i]
- if len(text) > 15: # cells longer than 15 chars are not header candidates
- continue
- if re.search('未(中标|成交)原因', text): # do not extract this kind of table
- return flag, contain_header, dict()
- num = 0
- for k, v in self.head_rule_dic.items():
- if re.search(v, text):
- if k in ['tenderer'] and re.search('是否', text):
- continue
- header_dic[k] = (i, text)
- num += 1
- if num>1:
- # one cell matched two different field rules — ambiguous, abort
- print('表头错误,一个td匹配到两个表头:', header_dic)
- return flag, contain_header, dict()
- if re.search(';金额((万?元))?;', ';'.join(td_list)): # recall tables whose header is just 金额, which by itself cannot be classified as budget or bid amount
- if 'tenderer' in header_dic and 'bid_amount' not in header_dic:
- for i in range(len(td_list)):
- text = td_list[i]
- if re.search('^金额((万?元))?$',text):
- header_dic['bid_amount'] = (i, text)
- break
- elif 'tenderee' in header_dic and 'budget' not in header_dic:
- for i in range(len(td_list)):
- text = td_list[i]
- if re.search('^金额((万?元))?$', text):
- header_dic['budget'] = (i, text)
- break
- if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and (
- 'tenderer' in header_dic or'budget' in header_dic): # extract when a package/code column plus a budget or winner column is present
- return flag, contain_header, header_dic
- elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # extract when both winner and bid amount columns are present
- return flag,contain_header, header_dic
- elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # two or more known headers, or a two-column row containing one known header
- contain_header = True
- return flag, contain_header, dict()
- def is_role(self, text):
- # Heuristic: does `text` look like a single organization/company name?
- # Rejects strings that are too short/long or contain two company suffixes;
- # otherwise accepts on a suffix regex or a single org/company NER hit.
- if len(text) > 25 or len(text)<4:
- return False
- elif len(re.findall('有限责?任?公司', text)) > 1:
- return False
- elif re.search('[\w()]{4,}(有限责?任?公司|学校|学院|大学|中学|小学|医院|管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园|厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处)$', text):
- return True
- else:
- ners = selffool.ner(text)
- if len(ners[0]) == 1 and ('company' in ners[0][0] or 'org' in ners[0][0]):
- return True
- return False
- def get_role(self, text, nlp_enterprise):
- '''
- Get the role entity contained in the string `text`.
- :param text: string to extract an entity from
- :param nlp_enterprise: list of role entities already found in the announcement
- :return: the extracted role name, or '' if none
- '''
- if text in nlp_enterprise:
- return text
- if len(text) > 25 or len(text)<4:
- return ''
- ners = getNers([text], useselffool=True)
- roles = []
- if ners:
- for ner in ners[0]:
- if ner[2] in ['org', 'company', 'location']:
- roles.append(ner[3])
- # accept only when NER entities cover most (>80%) of the text
- if roles and len(''.join(roles)) > len(text)*0.8:
- return roles[0]
- else:
- return ''
def extract_from_df(self, df, headers):
    """Extract per-package info (project code/name, roles, money) from one table.

    :param df: pandas DataFrame holding the table body rows (header row excluded)
    :param headers: dict mapping field name -> (column index, header text)
    :return: dict mapping package name -> {code, name, roleList, tendereeMoney, tendereeMoneyUnit}
    """
    prem_dic = {}
    previous_package = ""  # package code seen on the previous row
    multi_same_package = False  # duplicate package codes on NON-consecutive rows
    package_fix2raw = dict()  # normalized package code -> raw package code text
    link_set = set()
    # "not a package table": a goods/product-name column with no package/budget/amount columns
    not_package = True if 'project_name' in headers and re.search('(货物|商品|产品|通用|主要标的)(名称?|内容)', headers['project_name'][1]) and \
            'package_code' not in headers and 'budget' not in headers and "bid_amount" not in headers else False
    for i in df.index:
        same_package = False  # consecutive duplicate package code, usually caused by rowspan; one package, several purchase items
        project_code = df.loc[i, headers['project_code'][0]] if "project_code" in headers else ""
        package_code_raw = df.loc[i, headers['package_code'][0]] if "package_code" in headers else ""
        project_name = df.loc[i, headers['project_name'][0]] if "project_name" in headers else ""
        tenderee = df.loc[i, headers['tenderee'][0]] if "tenderee" in headers else ""
        tenderer = df.loc[i, headers['tenderer'][0]] if "tenderer" in headers else ""
        budget_ = df.loc[i, headers['budget'][0]] if "budget" in headers else ""
        bid_amount_ = df.loc[i, headers['bid_amount'][0]] if "bid_amount" in headers else ""
        win_sort = df.loc[i, headers['win_sort'][0]] if "win_sort" in headers else ""
        if set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset != set():  # stop matching as soon as any cell itself looks like a header
            # print('只要有一项为表头 停止匹配', set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort]) & self.headerset)
            break
        if len(set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort])- set(['', ' '])) < 2:  # stop matching: cells are all empty or all identical
            # print('内容为空或全部一样 停止匹配')
            break
        if re.search('详见', project_name):  # drop placeholder phrases such as "详见招标文件" ("see tender document")
            project_name = ""
        # a project-name cell that is really a package label ("第X标段" etc.) becomes the package code
        if package_code_raw == "" and re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))$|^(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}$', project_name):
            package_code_raw = project_name
            project_name = ""
        package_code = package_code_raw
        if re.search('合计|总计', package_code+project_code):  # skip subtotal/total rows
            continue
        if package_code != '' and package_code == previous_package:  # handle doc 208162730: one package purchasing several items
            same_package = True
            project_name = ''
        previous_package = package_code
        # a rank column that does not contain "1"/"一" means this row is not the winner
        if win_sort != "" and re.search('排名|排序|名次', headers['win_sort'][1]) and re.search('[一1]', win_sort) == None:
            continue
        # a win-or-not column explicitly saying "no" / "did not win"
        if win_sort != "" and re.search('是否(中标|成交|中选)', headers['win_sort'][1]) and re.search('否|未(中标|成交|中选)', win_sort):
            continue
        if "win_sort" in headers and win_sort == "":  # header has a win-or-not column but the cell is blank: drop the row
            continue
        # a "candidate" style tenderer column without a rank cannot be trusted as the winner
        if win_sort == "" and "tenderer" in headers and re.search('候选|入围', headers['tenderer'][1]) and re.search('推荐中标候选人', headers['tenderer'][1])==None:
            tenderer = ""
        # tenderee = tenderee if self.is_role(tenderee) else ""
        # tenderer = tenderer if self.is_role(tenderer) else ""
        tenderee = self.get_role(tenderee, self.nlp_enterprise)
        tenderer = self.get_role(tenderer, self.nlp_enterprise)
        if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
            break
        # de-duplicate rows; for non-package (goods) tables the project name is excluded from the key
        if not_package:
            if (project_code, package_code, tenderee, tenderer, budget_, bid_amount_) in link_set:
                continue
            link_set.add((project_code, package_code, tenderee, tenderer, budget_, bid_amount_))
        else:
            if (project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_) in link_set:
                continue
            link_set.add((project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_))
        package = package_code if package_code else str(len(prem_dic)+1)  #str(i+1) # auto-number unnumbered packages by how many were extracted so far; some rows may not be winning rows
        package = uniform_package_name(package)
        if multi_same_package == False and package not in package_fix2raw:  # if the normalized code is already taken, fall back to the raw package text
            package_fix2raw[package] = package_code_raw
        elif same_package == False:
            multi_same_package = True
        if multi_same_package:
            package = package_code_raw
        if package not in prem_dic or not same_package:
            prem_dic[package] = {
                'code': '',
                'name': '',
                'roleList': [],
                'tendereeMoney': 0,
                'tendereeMoneyUnit': ""
            }
        prem_dic[package]['code'] = project_code
        prem_dic[package]['name'] = project_name
        if budget_ != "":
            if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', budget_)) > 5:  # more than 5 non-money characters in a money cell: abort the table
                break
            budget_header = headers['budget'][1] if 'budget' in headers else ''
            budget, money_unit = money_process(budget_, budget_header)
            if budget > 0:
                if same_package and prem_dic[package]['tendereeMoney'] != budget:  # accumulate amounts of a rowspan-merged package
                    prem_dic[package]['tendereeMoney'] += budget
                else:
                    prem_dic[package]['tendereeMoney'] = budget
                prem_dic[package]['tendereeMoneyUnit'] = money_unit
        if tenderee and not same_package:
            prem_dic[package]['roleList'].append({
                "address": "",
                "linklist": [],
                "role_money": {
                    "discount_ratio": "",
                    "downward_floating_ratio": "",
                    "floating_ratio": "",
                    "money": 0,
                    "money_unit": ""
                },
                "role_name": "tenderee",
                "role_text": tenderee,
                "serviceTime": ""
            })
        if tenderer and not same_package:
            if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '',
                          bid_amount_)) > 5:  # more than 5 non-money characters in a money cell: abort the table
                break
            bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and 'bid_amount' in headers else (0, '')
            prem_dic[package]['roleList'].append({
                "address": "",
                "linklist": [],
                "role_money": {
                    "discount_ratio": "",
                    "downward_floating_ratio": "",
                    "floating_ratio": "",
                    "money": bid_amount,
                    "money_unit": money_unit
                },
                "role_name": "win_tenderer",
                "role_text": tenderer,
                "serviceTime": ""
            })
        if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # rows with only code/name: discard and stop matching further rows
            prem_dic.pop(package)
            break
    # when duplicate normalized package codes were detected, restore the raw code texts
    if multi_same_package:
        for k, v in package_fix2raw.items():
            if k in prem_dic:
                prem_dic[v] = prem_dic.pop(k)
    return prem_dic
def get_prem(self, soup):
    """Scan every <table> in *soup* (last table first), locate header rows and
    extract package/role info via :meth:`extract_from_df`.

    :param soup: BeautifulSoup document (tables are consumed/extracted as processed)
    :return: dict of merged package info from all tables
    """
    tables = soup.find_all('table')
    tables.reverse()
    rs_dic = {}
    for table in tables:
        text = table.text.strip()
        previous = table.findPreviousSibling()
        text2 = previous.text.strip() if previous else ""
        # text2 = table.findPreviousSibling().text.strip() if table.findPreviousSibling() != None else ""
        if re.search('项目业主|业\s*主', text) and re.search('业\s*绩', text+text2):  # skip tables that list past performance ("业绩") records
            tb_ex = table.extract()
            if previous:
                sib = previous.extract()
            continue
        trs = self.tb.table2list(table)
        # table.extract()
        i = 0
        headers = ""
        table_prem = {}
        while i < len(trs) - 1:
            flag_, contain_header_, headers_ = self.find_header(trs[i])
            if flag_ and headers_ != dict():
                table_items = []
                headers = headers_
                for j in range(i + 1, len(trs)):
                    if len(trs[j]) == len(trs[i]):
                        flag_, contain_header_, headers_ = self.find_header(trs[j])
                        if flag_ or contain_header_:
                            break
                        else:
                            table_items.append(trs[j])
                    else:
                        # print('表头,内容 列数不一致', len(trs[i]), len(trs[j]))
                        break
                if len(table_items) > 0:
                    df = pd.DataFrame(table_items)
                    prem_ = self.extract_from_df(df, headers)
                    # rs_dic.update(prem_)
                    table_prem.update(prem_)
                i = j - 1
            i += 1
        # a two-row table without a package column: look for the package label in the previous sibling tag
        if table_prem and len(trs) == 2 and 'package_code' not in headers and '1' in table_prem and table.find_previous_sibling():
            sib = table.find_previous_sibling()
            sib_text = sib.get_text()
            ser_sib = re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}', sib_text)
            if sib.name in ['p', 'div'] and len(sib_text) < 30 and ser_sib:
                package_sib = ser_sib.group(0)
                package_sib = uniform_package_name(package_sib)
                table_prem[package_sib] = table_prem.pop('1')
        if table_prem:
            rs_dic.update(table_prem)
        table.extract()
    return rs_dic
def predict(self, html, nlp_enterprise):
    """Parse the announcement html and return the extracted package dict.

    Attachment content (div.richTextFetch) is detached first and only used
    as a fallback when the main body yields nothing.
    """
    self.nlp_enterprise = nlp_enterprise
    document = BeautifulSoup(html, 'lxml')
    attachment = document.find(name='div', attrs={'class': 'richTextFetch'})
    if attachment:
        attachment = attachment.extract()  # keep attachments out of the main pass
    prem = self.get_prem(document)
    if not prem and attachment:
        prem = self.get_prem(attachment)
    # A single package named '1' (auto-numbered) or with a long key is most
    # likely not a real package id -> rename it to the whole 'Project'.
    if len(prem) == 1:
        only_key = next(iter(prem))
        if only_key == '1' or len(only_key) > 2:
            prem['Project'] = prem.pop(only_key)
    return prem
class CandidateExtractor(object):
    """Extract bid candidates (1st/2nd/3rd ranked bidders) from announcement tables,
    with a text-based fallback when no table matches."""

    def __init__(self):
        '''各要素表头规则'''
        # Header-matching regexes for each field (kept in Chinese: they match source documents).
        self.head_rule_dic = {
            'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
            "win_sort": "排名|排序|名次|推荐顺序",
            'win_or_not': '是否中标|是否入围|是否入库|入围结论',
            "candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单)(名称|名单|全称|\d)?$|^供应商(名称)?$",
            # NOTE(review): the pattern below appears to contain an unbalanced ')' before '?([金总]额' —
            # verify against the original file whether some parentheses are full-width literals and the
            # pattern actually compiles.
            "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
            "win_tenderer": "第一名|第一(中标|成交)?候选人",
            "second_tenderer": "第二名|第二(中标|成交)?候选人",
            "third_tenderer": "第三名|第三(中标|成交)?候选人",
        }
        '''非表格候选人正则'''
        # Regex marking a candidate list introduced in running text (non-table fallback).
        self.p = '((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单)(名称|名单|全称|\d)?:$'
        self.tb = TableTag2List()
        # set of known header texts, used to tell header rows from body rows
        with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
            self.headerset = pickle.load(f)

    def find_header(self, td_list):
        """Decide whether a row is a header row and map fields to columns.

        :param td_list: list of cell texts of one table row
        :return: (is_header_row, contains_header_cells, {field: (col_index, header_text)})
        """
        # strip header-irrelevant decoration (trailing colons, list numbers, parenthesized units)
        fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$', '', it) for it in td_list]
        header_dic = dict()
        flag = False
        contain_header = False
        # a row where >=60% of distinct cells are known headers is a header row
        if len(set(fix_td_list))>=2 and len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6:
            flag = True
        for i in range(len(td_list)):
            text = td_list[i]
            if len(text) > 15:  # cells longer than 15 chars are not header candidates
                continue
            if re.search('未(中标|成交)原因', text):  # tables of "reason for not winning": do not extract
                return flag, contain_header, dict()
            num = 0
            for k, v in self.head_rule_dic.items():
                if re.search(v, text):
                    if k in ['candidate', 'win_tenderer', 'second_tenderer', 'third_tenderer'] and re.search('是否', text):
                        continue
                    header_dic[k] = (i, text)
                    if k != 'candidate':  # 'candidate' may legitimately overlap the top-three columns
                        num += 1
            if num>1:
                print('表头错误,一个td匹配到两个表头:', header_dic)
                return flag, contain_header, dict()
        if 'candidate' in header_dic or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic):
            return flag, contain_header, header_dic
        elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(fix_td_list) & self.headerset) >= 1):  # two or more header cells, or a two-column row with at least one
            contain_header = True
        return flag, contain_header, dict()

    def is_role(self, text):
        """Rule + NER check of whether *text* looks like a single organization name."""
        if len(text) > 25 or len(text) < 4:
            return False
        elif len(re.findall('有限责?任?公司', text)) > 1:  # two company suffixes -> not a single entity
            return False
        elif re.search('[\w()]{4,}(有限责?任?公司|学校|学院|大学|中学|小学|医院|管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园|厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处)$', text):
            return True
        else:
            ners = selffool.ner(text)
            if len(ners[0]) == 1 and ('company' in ners[0][0] or 'org' in ners[0][0]):
                return True
        return False

    def get_role(self, text, nlp_enterprise):
        """Return the role entity contained in *text*.

        :param text: candidate string from a table cell
        :param nlp_enterprise: role entities already recognized in the announcement
        :return: entity text, or '' when none found
        """
        if text in nlp_enterprise:
            return text
        if len(text) > 25 or len(text)<4:
            return ''
        ners = getNers([text], useselffool=True)
        roles = []
        if ners:
            for ner in ners[0]:
                if ner[2] in ['org', 'company', 'location']:
                    roles.append(ner[3])
        # accept only when recognized entities cover most (>80%) of the cell text
        if roles and len(''.join(roles)) > len(text)*0.8:
            return roles[0]
        else:
            return ''

    def extract_from_df(self, df, headers):
        """Extract candidates and ranked roles from one table DataFrame.

        :param df: pandas DataFrame of the table body rows
        :param headers: dict field name -> (column index, header text)
        :return: (prem_dic, candidate_set)
        """
        prem_dic = {}
        link_set = set()
        candidate_set = set()
        role_dic = dict()  # holds the layout where 1st/2nd/3rd candidates sit side by side in one row
        for i in df.index:
            package_code_raw = df.loc[i, headers['package_code'][0]] if "package_code" in headers else ""
            candidate_ = df.loc[i, headers['candidate'][0]] if "candidate" in headers else ""
            win_or_not = df.loc[i, headers['win_or_not'][0]] if "win_or_not" in headers else ""
            # budget_ = df.loc[i, headers['budget'][0]] if "budget" in headers else ""
            bid_amount_ = df.loc[i, headers['bid_amount'][0]] if "bid_amount" in headers else ""
            win_sort = df.loc[i, headers['win_sort'][0]] if "win_sort" in headers else ""
            win_tenderer = df.loc[i, headers['win_tenderer'][0]] if "win_tenderer" in headers else ""
            second_tenderer = df.loc[i, headers['second_tenderer'][0]] if "second_tenderer" in headers else ""
            third_tenderer = df.loc[i, headers['third_tenderer'][0]] if "third_tenderer" in headers else ""
            if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set():  # a cell itself is a header: stop matching
                break
            if len(set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) - set(['', ' '])) < 2:  # all empty or identical: stop matching
                break
            # fix tables whose header omits the rank ("排名") and labels rows by candidate position instead
            if candidate_ != "" and win_sort == "" and headers['candidate'][0] > 0:
                col_indx = headers['candidate'][0] -1
                pre_col = df.loc[i, col_indx]
                if col_indx > 0 and pre_col == candidate_:
                    pre_col = df.loc[i, col_indx - 1]
                if re.search('第[一二三]名|第[一二三](中标)?候选人', pre_col):
                    win_sort = pre_col
            package_code = package_code_raw
            # candidate = candidate_ if self.is_role(candidate_) else ""
            # tenderer = tenderer if self.is_role(tenderer) else ""
            candidate = self.get_role(candidate_, self.nlp_enterprise)
            # if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
            #     break
            if (candidate_, win_tenderer, second_tenderer, third_tenderer, bid_amount_) in link_set:
                continue
            link_set.add((candidate_, win_tenderer, second_tenderer, third_tenderer, bid_amount_))
            package = package_code
            package = uniform_package_name(package) if package !="" else "Project"
            if candidate:
                if win_or_not and re.search('否|未入围', win_or_not):
                    pass  # explicitly not shortlisted
                else:
                    candidate_set.add(candidate)
            # layout A: one row carries all three ranked tenderers side by side
            if win_tenderer and second_tenderer and third_tenderer:
                if re.search("(候选人|投标人)名?称?$", df.loc[i, 0]) or re.search("(候选人|投标人)名?称?", df.loc[i, 1]):
                    for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
                                          [win_tenderer, second_tenderer, third_tenderer]):
                        text = self.get_role(text, self.nlp_enterprise)
                        if text:
                            # if self.is_role(text):
                            if type not in role_dic:
                                role_dic[type] = dict()
                            role_dic[type]['role_text'] = text
                            if type in ['second_tenderer', 'third_tenderer']:
                                candidate_set.add(text)
                elif re.search('投标报价|报价$', df.loc[i, 0]) or re.search('投标报价|报价$', df.loc[i, 1]):
                    header = df.loc[i, 0] if re.search('投标报价|报价$', df.loc[i, 0]) else df.loc[i, 1]
                    for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
                                          [win_tenderer, second_tenderer, third_tenderer]):
                        if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '',
                                      text)) > 5:  # more than 5 non-money characters in a money cell: abort
                            break
                        money, money_unit = money_process(text, header)
                        if money > 0:
                            if type not in role_dic:
                                role_dic[type] = dict()
                            role_dic[type]['money'] = money
                            role_dic[type]['money_unit'] = money_unit
                else:
                    break
            # layout B: one candidate per row with a rank column
            elif candidate and win_sort:
                role_type = ""
                if re.search('第[一1]|^[一1]$', win_sort):
                    role_type = "win_tenderer"
                elif re.search('第[二2]|^[二2]$', win_sort):
                    role_type = "second_tenderer"
                elif re.search('第[三3]|^[三3]$', win_sort):
                    role_type = "third_tenderer"
                if role_type != "":
                    if package not in prem_dic:
                        prem_dic[package] = {
                            'code': '',
                            'name': '',
                            'roleList': [],
                            'tendereeMoney': 0,
                            'tendereeMoneyUnit': ""
                        }
                    if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', bid_amount_))> 5:  # more than 5 non-money characters in a money cell: abort
                        break
                    bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if "bid_amount" in headers else (0, "")
                    prem_dic[package]['roleList'].append({
                        "address": "",
                        "linklist": [],
                        "role_money": {
                            "discount_ratio": "",
                            "downward_floating_ratio": "",
                            "floating_ratio": "",
                            "money": bid_amount,
                            "money_unit": money_unit
                        },
                        "role_name": role_type,
                        "role_text": candidate,
                        "serviceTime": ""
                    })
                    # NOTE(review): nesting reconstructed from a whitespace-mangled source; this guard
                    # cannot fire here because a role was appended just above — confirm the original indent.
                    if len(prem_dic[package]['roleList']) == 0:  # packages with no role info are discarded
                        prem_dic.pop(package)
        # layout A accumulates into role_dic across rows; flush it when nothing else matched
        if role_dic and prem_dic == dict():
            if package not in prem_dic:
                prem_dic[package] = {
                    'code': '',
                    'name': '',
                    'roleList': [],
                    'tendereeMoney': 0,
                    'tendereeMoneyUnit': ""
                }
            for role_type, v in role_dic.items():
                role_text = v.get('role_text', '')
                if role_text == "":
                    continue
                money = v.get('money', 0)
                money_unit = v.get('money_unit', '')
                prem_dic[package]['roleList'].append({
                    "address": "",
                    "linklist": [],
                    "role_money": {
                        "discount_ratio": "",
                        "downward_floating_ratio": "",
                        "floating_ratio": "",
                        "money": money,
                        "money_unit": money_unit
                    },
                    "role_name": role_type,
                    "role_text": role_text,
                    "serviceTime": ""
                })
            if len(prem_dic[package]['roleList']) == 0:  # packages with no role info are discarded
                prem_dic.pop(package)
        return prem_dic, candidate_set

    def get_prem(self, soup):
        """Scan every table (last first), find header rows and extract candidates.

        :param soup: BeautifulSoup document; processed tables are extracted
        :return: (merged prem dict, set of candidate names)
        """
        tables = soup.find_all('table')
        tables.reverse()
        rs_dic = {}
        candidate_set = set()
        for table in tables:
            trs = self.tb.table2list(table)
            table.extract()
            i = 0
            headers = ""
            while i < len(trs) - 1:
                flag_, contain_header_, headers_ = self.find_header(trs[i])
                if flag_ and headers_ != dict():
                    table_items = []
                    headers = headers_
                    for j in range(i + 1, len(trs)):
                        if len(trs[j]) == len(trs[i]):
                            flag_, contain_header_, headers_ = self.find_header(trs[j])
                            if flag_ or contain_header_:
                                break
                            else:
                                table_items.append(trs[j])
                        else:
                            # print('表头,内容 列数不一致', len(trs[i]), len(trs[j]))
                            break
                    if len(table_items) > 1:
                        df = pd.DataFrame(table_items)
                        prem_, candidate_set_ = self.extract_from_df(df, headers)
                        rs_dic.update(prem_)
                        candidate_set.update(candidate_set_)
                    i = j - 1
                i += 1
        return rs_dic, candidate_set

    def get_candidates_from_text(self, list_sentences, list_entitys):
        """Fallback: collect candidate names from sentence entities when tables yield nothing."""
        candidates = set()
        sentences = sorted(list_sentences[0], key=lambda x: x.sentence_index)
        for ent in list_entitys[0]:
            if ent.entity_type in ['org', 'company']:
                sen_index = ent.sentence_index
                text = sentences[sen_index].sentence_text
                b = ent.wordOffset_begin
                e = ent.wordOffset_end
                if ent.label in [2,3,4]:  # entities predicted as candidates are added directly; otherwise check the preceding text by rule
                    candidates.add(ent.entity_text)
                elif isinstance(b, int) and isinstance(e, int):
                    foreword = text[max(0, b - 10):b]
                    if re.search(self.p, foreword):
                        candidates.add(ent.entity_text)
        return candidates

    def predict(self, html, list_sentences, list_entitys, nlp_enterprise):
        """Extract candidate info from html tables, then fall back to text entities.

        :return: (prem dict, {'candidate': comma-joined candidate names})
        """
        self.nlp_enterprise = nlp_enterprise
        html = html.replace('比选申请单位', '中标候选人')  # 82347769
        soup = BeautifulSoup(html, 'lxml')
        richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
        if richText:
            richText = richText.extract()  # keep attachment content out of the main pass
        prem, candidate_set = self.get_prem(soup)
        if prem == {} and richText:
            prem, candidate_set = self.get_prem(richText)
        if prem == {} and candidate_set == set():
            candidate_set = self.get_candidates_from_text(list_sentences, list_entitys)
        return prem, {'candidate': ','.join(candidate_set)}
def getSavedModel():
    """Load the form-item keras model and export it as a TF SavedModel."""
    export_graph = tf.Graph()
    with export_graph.as_default():
        custom = {"precision": precision, "recall": recall, "f1_score": f1_score}
        model = tf.keras.models.load_model("../form/model/model_form.model_item.hdf5",
                                           custom_objects=custom)
        session = tf.keras.backend.get_session()
        tf.saved_model.simple_save(
            session,
            "./h5_savedmodel/",
            inputs={"image": model.input},
            outputs={"scores": model.output}
        )
def getBiLSTMCRFModel(MAX_LEN, vocab, EMBED_DIM, BiRNN_UNITS, chunk_tags, weights):
    """Build and compile a BiLSTM-CRF sequence-labelling model.

    :param MAX_LEN: unused; the input length is dynamic (shape=(None,))
    :param vocab: vocabulary; its length sets the embedding input dimension
    :param EMBED_DIM: embedding dimension
    :param BiRNN_UNITS: total BiLSTM units (split over both directions)
    :param chunk_tags: label set; its length sets the output dimension
    :param weights: optional pretrained embedding matrix (trainable when given)
    :return: compiled keras Model
    """
    tokens = layers.Input(shape=(None,), dtype="int32")
    if weights is None:
        embedded = layers.embeddings.Embedding(len(vocab), EMBED_DIM, mask_zero=True)(tokens)
    else:
        embedded = layers.embeddings.Embedding(len(vocab), EMBED_DIM, mask_zero=True,
                                               weights=[weights], trainable=True)(tokens)
    encoded = layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True))(embedded)
    projected = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(encoded)
    crf = CRF(len(chunk_tags), sparse_target=True)
    tags = crf(projected)
    model = models.Model(input=[tokens], output=[tags])
    model.summary()
    model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])
    return model
- import h5py
def h5_to_graph(sess, graph, h5file):
    """Copy weights from a keras .h5 file directly into the tensors of *graph*.

    :param sess: live tf.Session used to run the assign ops
    :param graph: tf.Graph whose variables are assigned by tensor name
    :param h5file: path of the keras hdf5 weight file
    """
    f = h5py.File(h5file, 'r')  # open the h5 file

    def getValue(v):
        # walk the "model_weights" group following the variable's name path
        _value = f["model_weights"]
        list_names = str(v.name).split("/")
        for _index in range(len(list_names)):
            print(v.name)
            if _index == 1:
                _value = _value[list_names[0]]
            _value = _value[list_names[_index]]
        return _value.value

    def _load_attributes_from_hdf5_group(group, name):
        """Loads attributes of the specified name from the HDF5 group.

        This method deals with an inherent problem
        of HDF5 file which is not able to store
        data larger than HDF5_OBJECT_HEADER_LIMIT bytes.

        # Arguments
            group: A pointer to a HDF5 group.
            name: A name of the attributes to load.

        # Returns
            data: Attributes data.
        """
        if name in group.attrs:
            data = [n.decode('utf8') for n in group.attrs[name]]
        else:
            data = []
            chunk_id = 0
            while ('%s%d' % (name, chunk_id)) in group.attrs:
                data.extend([n.decode('utf8')
                             for n in group.attrs['%s%d' % (name, chunk_id)]])
                chunk_id += 1
        return data

    def readGroup(gr, parent_name, data):
        # recursively flatten the h5 group tree into [name, value] pairs
        for subkey in gr:
            print(subkey)
            if parent_name != subkey:
                if parent_name == "":
                    _name = subkey
                else:
                    _name = parent_name + "/" + subkey
            else:
                _name = parent_name
            if str(type(gr[subkey])) == "<class 'h5py._hl.group.Group'>":
                readGroup(gr[subkey], _name, data)
            else:
                # NOTE(review): h5py Dataset.value was removed in h5py 3.x; ds[()] is the modern spelling
                data.append([_name, gr[subkey].value])
                print(_name, gr[subkey].shape)

    layer_names = _load_attributes_from_hdf5_group(f["model_weights"], 'layer_names')
    list_name_value = []
    readGroup(f["model_weights"], "", list_name_value)
    '''
    for k, name in enumerate(layer_names):
        g = f["model_weights"][name]
        weight_names = _load_attributes_from_hdf5_group(g, 'weight_names')
        #weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names]
        for weight_name in weight_names:
            list_name_value.append([weight_name,np.asarray(g[weight_name])])
    '''
    # assign every stored value onto the graph tensor of the same name
    for name_value in list_name_value:
        name = name_value[0]
        '''
        if re.search("dense",name) is not None:
            name = name[:7]+"_1"+name[7:]
        '''
        value = name_value[1]
        print(name, graph.get_tensor_by_name(name), np.shape(value))
        sess.run(tf.assign(graph.get_tensor_by_name(name), value))
def initialize_uninitialized(sess):
    """Initialize any still-uninitialized Adam optimizer variables in *sess*."""
    all_vars = tf.global_variables()
    init_flags = sess.run([tf.is_variable_initialized(v) for v in all_vars])
    pending = [var for var, ok in zip(all_vars, init_flags) if not ok]
    # only the optimizer's own slot variables are initialized here
    adam_vars = [var for var in pending if re.search("Adam", var.name) is not None]
    print([str(v.name) for v in adam_vars])  # only for testing
    if adam_vars:
        sess.run(tf.variables_initializer(adam_vars))
-
-
def save_codename_model():
    """Restore the project code/name BiLSTM-CRF checkpoint and export a SavedModel."""
    filepath = "../../dl_dev/projectCode/models_tf/59-L0.471516189943-F0.8802154826344823-P0.8789179683459191-R0.8815168335321886/model.ckpt"
    vocabpath = "../projectCode/models/vocab.pk"
    classlabelspath = "../projectCode/models/classlabels.pk"
    w2v_matrix = load('codename_w2v_matrix.pk')
    graph = tf.get_default_graph()
    with graph.as_default() as g:
        sess = tf.Session(graph=g)
        # build the tf model, then overwrite the fresh variables with the checkpoint
        char_input, logits, target, keepprob, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, filepath)
        tf.saved_model.simple_save(
            sess,
            "./codename_savedmodel_tf/",
            inputs={"inputs": char_input,
                    "inputs_length": length,
                    'keepprob': keepprob},
            outputs={"logits": logits,
                     "trans": trans}
        )
-
-
def save_role_model():
    """Export the role model as a SavedModel for deployment on the PAI platform."""
    model_role = PREMPredict().model_role
    with model_role.graph.as_default():
        model = model_role.getModel()
        sess = tf.Session(graph=model_role.graph)
        print(type(model.input))
        sess.run(tf.global_variables_initializer())
        # copy the h5 weights into the live graph, then re-fetch the model
        h5_to_graph(sess, model_role.graph, model_role.model_role_file)
        model = model_role.getModel()
        feed_inputs = {"input0": model.input[0],
                       "input1": model.input[1],
                       "input2": model.input[2]}
        tf.saved_model.simple_save(sess,
                                   "./role_savedmodel/",
                                   inputs=feed_inputs,
                                   outputs={"outputs": model.output})
def save_money_model():
    """Load the money-classification keras model and export it as a SavedModel."""
    model_file = os.path.dirname(__file__) + "/../money/models/model_money_word.h5"
    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session(graph=graph)
        with sess.as_default():
            custom = {'precision': precision, 'recall': recall, 'f1_score': f1_score}
            model = models.load_model(model_file, custom_objects=custom)
            model.summary()
            print(model.weights)
            tf.saved_model.simple_save(sess,
                                       "./money_savedmodel2/",
                                       inputs={"input0": model.input[0],
                                               "input1": model.input[1],
                                               "input2": model.input[2]},
                                       outputs={"outputs": model.output})
-
def save_person_model():
    """Export the person model; runs one sanity prediction before saving."""
    model_person = EPCPredict().model_person
    with model_person.graph.as_default():
        x = load("person_x.pk")
        _data = np.transpose(np.array(x), (1, 0, 2, 3))
        model = model_person.getModel()
        sess = tf.Session(graph=model_person.graph)
        with sess.as_default():
            sess.run(tf.global_variables_initializer())
            model_person.load_weights()
            # sanity check: one forward pass through the restored weights
            feed = {model.input[0]: _data[0], model.input[1]: _data[1]}
            predict_y = sess.run(model.output, feed_dict=feed)
            print(np.argmax(predict_y, -1))
            tf.saved_model.simple_save(sess,
                                       "./person_savedmodel/",
                                       inputs={"input0": model.input[0],
                                               "input1": model.input[1]},
                                       outputs={"outputs": model.output})
-
def save_form_model():
    """Export the form 'item' model as a SavedModel."""
    model_form = FormPredictor()
    with model_form.graph.as_default():
        model = model_form.getModel("item")
        sess = tf.Session(graph=model_form.graph)
        sess.run(tf.global_variables_initializer())
        # load weights straight from the h5 file into the live graph
        h5_to_graph(sess, model_form.graph, model_form.model_file_item)
        tf.saved_model.simple_save(sess,
                                   "./form_savedmodel/",
                                   inputs={"inputs": model.input},
                                   outputs={"outputs": model.output})
-
def save_codesplit_model():
    """Export the code-split model as a SavedModel."""
    filepath_code = "../../dl_dev/projectCode/models/model_code.hdf5"
    graph = tf.Graph()
    with graph.as_default():
        custom = {'precision': precision, 'recall': recall, 'f1_score': f1_score}
        model_code = models.load_model(filepath_code, custom_objects=custom)
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        h5_to_graph(sess, graph, filepath_code)
        tf.saved_model.simple_save(sess,
                                   "./codesplit_savedmodel/",
                                   inputs={"input0": model_code.input[0],
                                           "input1": model_code.input[1],
                                           "input2": model_code.input[2]},
                                   outputs={"outputs": model_code.output})
def save_timesplit_model():
    """Export the time-split classifier as a SavedModel."""
    filepath = '../time/model_label_time_classify.model.hdf5'
    with tf.Graph().as_default() as graph:
        custom = {'precision': precision, 'recall': recall, 'f1_score': f1_score}
        time_model = models.load_model(filepath, custom_objects=custom)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            h5_to_graph(sess, graph, filepath)
            tf.saved_model.simple_save(sess,
                                       "./timesplit_model/",
                                       inputs={"input0": time_model.input[0],
                                               "input1": time_model.input[1]},
                                       outputs={"outputs": time_model.output})
if __name__=="__main__":
    # Ad-hoc driver: benchmarks the /predict_codeName HTTP service with a
    # synthetic batch of 200 long sentences. The export helpers above are kept
    # commented out and run manually as needed.
    #save_role_model()
    # save_codename_model()
    # save_money_model()
    #save_person_model()
    #save_form_model()
    #save_codesplit_model()
    # save_timesplit_model()
    '''
    # with tf.Session(graph=tf.Graph()) as sess:
    #     from tensorflow.python.saved_model import tag_constants
    #     meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], "./person_savedModel")
    #     graph = tf.get_default_graph()
    #     signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
    #     signature = meta_graph_def.signature_def
    #     input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name)
    #     input1 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input1"].name)
    #     outputs = sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name)
    #     x = load("person_x.pk")
    #     _data = np.transpose(x,[1,0,2,3])
    #     y = sess.run(outputs,feed_dict={input0:_data[0],input1:_data[1]})
    #     print(np.argmax(y,-1))
    '''
    MAX_LEN = 1000
    vocabpath = os.path.dirname(__file__) + "/codename_vocab.pk"
    vocab = load(vocabpath)
    word2index = dict((w, i) for i, w in enumerate(np.array(vocab)))
    index_unk = word2index.get("<unk>")
    sentence = "招标人:广州市重点公共建设项目管理中心,联系人:李工,联系方式:020-22905689,招标代理:广东重工建设监理有限公司," \
               "代理联系人:薛家伟,代理联系方式:13535014481,招标监督机构:广州市重点公共建设项目管理中心,监督电话:020-22905690," \
               "备注:以上为招标公告简要描述,招标公告详细信息请查看“招标公告”附件,"
    sentence = sentence*5
    list_sentence = [sentence]*200
    # print(list_sentence)
    # encode each sentence to vocab indices (unknown words -> <unk>)
    x = [[word2index.get(word, index_unk) for word in sentence] for sentence in
         list_sentence]
    x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x]
    # print(x_len)
    x = pad_sequences(x, maxlen=MAX_LEN, padding="post", truncating="post")
    # call the service twice to measure warm-up vs warm latency
    requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len},
                                    verify=True)
    # predict_y = json.loads(requests_result.text)['result']
    print("cost_time:", json.loads(requests_result.text)['cost_time'])
    print(MAX_LEN, len(sentence), len(list_sentence))
    requests_result = requests.post(API_URL + "/predict_codeName", json={"inouts": x.tolist(), "inouts_len": x_len},
                                    verify=True)
    # predict_y = json.loads(requests_result.text)['result']
    print("cost_time:", json.loads(requests_result.text)['cost_time'])
    print(MAX_LEN, len(sentence), len(list_sentence))
|