- # -*- coding: utf-8 -*-
- import sys
- import os
- sys.path.append("..")
- print(sys.path)
- import pandas as pd
- from dataSource.source import *
- import json
- from utils.multiThread import MultiThreadHandler
- import queue
- from utils.Utils import *
- from dataSource.pool import ConnectorPool
- import re
- from tablestore import *
- import traceback
- from utils.hashUtil import aesCipher
- from uuid import uuid4
- from export.exportUtils import *
- from export.DoubaoUtils import chat_doubao,get_json_from_text,chat_doubao_bot,chat_doubao_messages
- data_path = "../data/"
- def getCompanyTenderer():
- def _handle(item,result_queue):
- company = item
- dict_result = {"company":company,"count":0,"competitor":"","project_name":""}
- dict_result["company"] = company
- graph = getConnect_neo4j()
- cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhao_biao_id,p.zhong_biao_id"%(company)
- finded = graph.run(cql)
- finded_ids = json.loads(json.dumps(finded.data()))
- dict_result["count"] = len(finded_ids)
- mongoDB = getConnect_mongodb()
- coll_zb = mongoDB.zhongbiao_extraction
- if len(finded_ids)>0:
- cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.project_name limit 3"%(company)
- finded = graph.run(cql)
- finded_names = json.loads(json.dumps(finded.data()))
- list_names = [_i["p.project_name"] for _i in finded_names]
- dict_result["project_name"] = str(list_names)
- cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN r.price"%(company)
- finded = graph.run(cql)
- finded_money = json.loads(json.dumps(finded.data()))
- whole_money = 0
- for item in finded_money:
- if item["r.price"] is not None:
- whole_money += getUnifyMoney(item["r.price"])
- dict_result["whole_money"] = str(whole_money)
- dict_competitor = {}
- for item in finded_ids:
- docId = item["p.zhong_biao_id"]
- if docId is not None:
- rows = coll_zb.find({"docId":docId})
- for row in rows:
- keys = ["second_tenderer","third_tenderer"]
- for _key in keys:
- if _key in row:
- if row[_key] not in dict_competitor:
- dict_competitor[row[_key]] = 0
- dict_competitor[row[_key]] += 1
- list_competitor = []
- for _key in dict_competitor:
- list_competitor.append([_key,dict_competitor[_key]])
- list_competitor.sort(key=lambda x:x[1],reverse=True)
- list_competitors = [i[0] for i in list_competitor[:10]]
- dict_result["competitor"] = str(list_competitors)
- result_queue.put(dict_result)
- # filename = "成交客户匹配中标项目的需求.xlsx"
- # df = pd.read_excel(filename)
- # list_company = df["公司名字"]
- # company = list_company[0]
- list_company = []
- filename = "../data/服务型客户.txt"
- with open(filename,"r",encoding="GBK") as f:
- while(True):
- line = f.readline()
- if not line:
- break
- list_company.append(line.strip())
- task_queue = queue.Queue()
- for company in list_company:
- task_queue.put(company)
- result_queue = queue.Queue()
- handler = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=10)
- handler.run()
- list_company = []
- list_zb = []
- list_count = []
- list_project = []
- list_money = []
- list_competitor = []
- while(True):
- try:
- _result = result_queue.get(False)
- list_company.append(_result.get("company",""))
- list_zb.append("是" if _result.get("count","")>0 else "否")
- list_count.append(_result.get("count",""))
- list_project.append(_result.get("project_name",""))
- list_money.append(_result.get("whole_money",""))
- list_competitor.append(_result.get("competitor",""))
- except queue.Empty:
- break
- df1 = pd.DataFrame({"公司名字":list_company,"是否中标":list_zb,"中标次数":list_count,"中标项目":list_project,"中标金额":list_money,"潜在竞争对手":list_competitor})
- df1.to_excel("%s_export.xls"%(filename),columns=["公司名字","是否中标","中标次数","中标项目","中标金额","潜在竞争对手"])
- def export_count_includeKeyword():
- filename = "../data/other/jc001.xlsx"
- list_name = []
- list_count = []
- df = pd.read_excel(filename)
- _index = 0
- for row in df["品目"]:
- _name = row
- data = solrQuery("document",{"q":'dochtmlcon:"%s"'%_name,"fq":'(publishtime:[2020-01-01T00:00:00Z%20TO%202020-08-12T23:59:59Z])',"fl":"city","rows":1})
- if data is not None:
- _count = data["response"]["numFound"]
- else:
- _count = 0
- list_name.append(_name)
- list_count.append(_count)
- _index += 1
- print(_index)
- df1 = pd.DataFrame({"品目":list_name,"数量":list_count})
- df1.to_excel("%s_export.xls"%filename)
- def export_count_includeKeyword_multiThread():
- def _handler(item,result_queue):
- data = solrQuery("document",{"q":'dochtmlcon:"%s"'%item,"fq":'(publishtime:[2020-01-01T00:00:00Z%20TO%202020-08-12T23:59:59Z])',"fl":"city","rows":1})
- if data is not None:
- _count = data["response"]["numFound"]
- else:
- _count = 0
- result_queue.put([item,_count])
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- filename = "../data/other/jc001.xlsx"
- list_name = []
- list_count = []
- df = pd.read_excel(filename)
- _index = 0
- for row in df["品目"]:
- _name = row
- task_queue.put(_name)
- _index += 1
- multHandler = MultiThreadHandler(task_queue,_handler,result_queue,thread_count=20)
- multHandler.run()
- while(True):
- try:
- item = result_queue.get(False)
- list_name.append(item[0])
- list_count.append(item[1])
- except queue.Empty as e:
- break
- df1 = pd.DataFrame({"品目":list_name,"数量":list_count})
- df1.to_excel("%s_export.xls"%filename)
- def exportKeywords():
- def _handle(item,result_queue,pool_mongo):
- docId = item["docId"]
- mongo = pool_mongo.getConnector()
- zhongbiao = mongo.zhongbiao_extraction
- zhaobiao = mongo.zhaobiao_extraction
- _project = ""
- rows = zhaobiao.find({"docId":docId},{"project_name":1})
- find_flag = False
- for row in rows:
- find_flag = True
- _project = row.get("project_name","")
- if not find_flag:
- rows = zhongbiao.find({"docId":docId},{"project_name":1})
- for row in rows:
- _project = row.get("project_name","")
- item["project_name"] = _project
- pool_mongo.putConnector(mongo)
- result_queue.put(item)
- list_key = []
- dict_key_ids = dict()
- with open("../data/品目.txt", "r", encoding="utf8") as f:
- while(True):
- row = f.readline()
- if not row:
- break
- row = row.strip()
- list_key.append(row)
- dict_key_ids[row] = []
- data = solrQuery("document",{"q":'dochtmlcon:"%s" AND dochtmlcon:"法院"'%row,"fq":'(publishtime:[2019-01-01T00:00:00Z TO 2019-12-31T23:59:59Z])',"fl":"id","rows":10000000})
- for item in data["response"]["docs"]:
- dict_key_ids[row].append(item["id"])
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for _key in dict_key_ids.keys():
- for item in dict_key_ids[_key]:
- task_queue.put({"docId":item,"project_name":""})
- pool_mongo = ConnectorPool(init_num=10,max_num=200,method_init=getConnect_mongodb)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=200,pool_mongo=pool_mongo)
- mt.run()
- dict_docId_projectname = {}
- while(True):
- try:
- item = result_queue.get(False)
- dict_docId_projectname[item["docId"]] = item["project_name"]
- except queue.Empty:
- break
- dict_key_count = dict()
- for _key in dict_key_ids.keys():
- set_key = set()
- for docId in dict_key_ids[_key]:
- set_key.add(dict_docId_projectname.get(docId,""))
- dict_key_count[_key] = len(set_key)
- print("==")
- for _key in list_key:
- print(len(dict_key_ids[_key]))
- print("==")
- for _key in list_key:
- print(dict_key_count[_key])
- print("==")
- def getIndustryCompany():
- def _handle(item,result_queue,pool_mongo,pool_neo4j,pool_mysql,pool_ots):
- # mongoDB = getConnect_mongodb()
- log(item["enterprise_name"])
- mongoDB = pool_mongo.getConnector()
- # coll_zb = mongoDB.enterprise_profile
- # rows = coll_zb.find({"enterprise_name":item["enterprise_name"]},{"enterprise_name":1,"legalPersonName":1,"actualCapital":1, "regCapital":1,"estiblishTime":1,"socialStaffNum":1,"legal_person":1,"phone":1,"businessScope":1,"industry":1 })
- # for row in rows:
- # item["regCapital"] = row.get("regCapital","")
- # item["legal_person"] = row.get("legal_person","")
- # item["phone"] = row.get("phone","")
- # item["actualCapital"] = row.get("actualCapital","")
- # item["industry"] = row.get("industry","")
- # item["estiblishTime"] = row.get("estiblishTime","")
- # item["socialStaffNum"] = row.get("socialStaffNum","")
- # item["businessScope"] = row.get("businessScope","")
- # graph = getConnect_neo4j()
- ots_client = pool_ots.getConnector()
- primary_key = [('name',item["enterprise_name"])]
- columns_to_get = ["reg_capital","legal_person","phone","actual_capital","industry","estiblishTime","social_staff_num","business_scope"]
- consumed, return_row, next_token = ots_client.get_row("enterprise",primary_key, columns_to_get, None, 1)
- if return_row is not None:
- for att in return_row.attribute_columns:
- item[att[0]] = att[1]
- list_same_industry_company = []
- if "industry" in item:
- bool_query = BoolQuery(must_queries=[TermQuery("industry",item["industry"])])
- col = ColumnsToGet(['enterprise_name'], ColumnReturnType.SPECIFIED)
- rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
- SearchQuery(bool_query, limit=10, get_total_count=True),
- col)
- for row in rows:
- for item1 in row[0]:
- list_same_industry_company.append(item1[1])
- # if "industry" in item:
- # rows = coll_zb.find({"industry":item["industry"]},{"enterprise_name":1}).limit(10)
- # for row in rows:
- # print(row)
- # list_same_industry_company.append(row.get("enterprise_name",""))
- item["same_industry_company"] = list_same_industry_company
- graph = pool_neo4j.getConnector()
- company_name = item["enterprise_name"]
- cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN count(p) as _c "%(company_name)
- finded = graph.run(cql)
- data = json.loads(json.dumps(finded.data()))
- _count = data[0]["_c"]
- # list_project = []
- # for _data in data:
- # if _count<=3:
- # if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
- # if _data["project_name"] is not None:
- # list_project.append(_data["project_name"])
- # _count += 1
- item["count"] = _count
- # item["project"] = str(list_project)
- result_queue.put(item)
- pool_mongo.putConnector(mongoDB)
- pool_neo4j.putConnector(graph)
- pool_ots.putConnector(ots_client)
- log_tofile("export.log")
- pool_mongo = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_mongodb)
- pool_neo4j = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_neo4j)
- pool_mysql = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_mysql)
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- # list_company = getCompanys()
- # filename = "".join(["环境","生态","再生","回收","环保"])
- list_company = []
- filename = "../data/同行客户匹配.xlsx"
- df = pd.read_excel(filename,sheet_name=0)
- for _com in df["公司名称"]:
- print(_com)
- if _com is not None and _com.strip()!="":
- _company = {"enterprise_name":""}
- _company["enterprise_name"] = _com
- list_company.append(_company)
- task_queue = queue.Queue()
- for item in list_company:
- task_queue.put(item)
- result_queue = queue.Queue()
- _muti = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_mongo=pool_mongo,pool_neo4j=pool_neo4j,pool_mysql=pool_mysql,pool_ots=pool_ots)
- _muti.run()
- df_company = {}
- set_key = set()
- if len(list_company)>0:
- for item in list_company:
- for _key in item.keys():
- set_key.add(_key)
- if _key not in df_company:
- df_company[_key] = []
- list_key = list(set_key)
- for item in list_company:
- for _key in list_key:
- df_company[_key].append(item.get(_key,""))
- df1 = pd.DataFrame(df_company)
- df1.to_excel("%s_export.xlsx"%(filename))
- def exportWin_tenderer(time_from,time_to):
- '''
- Export winning-bid documents whose page_time falls within [time_from,time_to].
- :return:
- '''
- ots_client = getConnect_ots()
- last_docid = 0
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time",time_from,time_to,include_lower=True,include_upper=True),
- TermQuery("docchannel",101),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- RangeQuery('docid', last_docid, include_lower=False)])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True),
- ColumnsToGet(["project_name","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
- list_project = []
- def _getRow(list_project,rows,last_docid):
- for row in rows:
- project_name = row[1][0][1]
- docid = row[0][1][1]
- last_docid = docid
- list_pack = json.loads(row[1][1][1])
- _set_tenderer = set()
- win_tenderer = ""
- for _pack in list_pack:
- if "win_tenderer" in _pack and win_tenderer=="":
- win_tenderer = _pack["win_tenderer"]
- if "second_tenderer" in _pack:
- _set_tenderer.add(_pack["second_tenderer"])
- if "third_tenderer" in _pack:
- _set_tenderer.add(_pack["third_tenderer"])
- list_project.append({"docid":docid,"project_name":project_name,"win_tenderer":win_tenderer,"tenderer":list(_set_tenderer)})
- return last_docid
- _getRow(list_project,rows,last_docid)
- while(next_token):
- print("%d/%d"%(len(list_project),total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(["project_name","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
- last_docid = _getRow(list_project,rows,last_docid)
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for item in list_project:
- task_queue.put(item)
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- def _handle(item,result_queue,pool_ots):
- if item["win_tenderer"]!="":
- ots_client = pool_ots.getConnector()
- consumed, return_row, next_token = ots_client.get_row("enterprise", [("name",item["win_tenderer"])], ["province","reg_capital","estiblish_time","business_scope"], None, 1)
- _dict = dict()
- for _item in return_row.attribute_columns:
- _dict[_item[0]] = _item[1]
- for _key in _dict.keys():
- item[_key] = _dict[_key]
- data = solrQuery("contact",{"q":'company_name:"%s"'%item["win_tenderer"],"fl":"contact_person,mobile_no,phone_no","rows":10})
- for _item in data["response"]["docs"]:
- for _key in _item.keys():
- item[_key] = _item[_key]
- break
- pool_ots.putConnector(ots_client)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
- mt.run()
- keys = ["docid","project_name","win_tenderer","tenderer","province","reg_capital","business_scope","estiblish_time","contact_person","mobile_no","phone_no"]
- df_data = {}
- for _key in keys:
- df_data[_key] = []
- for item in list_project:
- for _key in keys:
- if _key in item:
- df_data[_key].append(item[_key])
- else:
- df_data[_key].append("")
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s-%s中标信息.xlsx"%(time_from,time_to),columns=keys)
- def exportContact():
- time_from = "2021-01-14"
- time_to = "2021-01-15"
- filename = "../data/%s-%s中标信息.xlsx"%(time_from,time_to)
- df1 = pd.read_excel(filename)
- set_company = set()
- for item in df1["tenderer"]:
- list_company = re.split("\['|', '|'\]|\[\]",item)
- for _company in list_company:
- if _company!="":
- set_company.add(_company)
- companys = list(set_company)
- task_queue = queue.Queue()
- list_company = []
- for _company in companys:
- item = {"company_name":_company}
- list_company.append(item)
- task_queue.put(item)
- result_queue = queue.Queue()
- def _handle(item,result_queue):
- company = item["company_name"]
- data = solrQuery("contact",{"q":'company_name:"%s"'%company,"fl":"company_name,contact_person,mobile_no,phone_no","rows":10})
- for _item in data["response"]["docs"]:
- for _key in _item.keys():
- item[_key] = _item[_key]
- break
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
- mt.run()
- keys = ["company_name","contact_person","mobile_no","phone_no"]
- df_data = {}
- for _key in keys:
- df_data[_key] = []
- ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
- for item in list_company:
- for _key in keys:
- if _key in item:
- df_data[_key].append(ILLEGAL_CHARACTERS_RE.sub(r'', item[_key]))
- else:
- df_data[_key].append("")
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s-%s竞争对手信息.xlsx"%(time_from,time_to),columns=keys)
- def countKeyword():
- conn = getConnection_mysql()
- cursor = conn.cursor()
- print(0)
- sql = "select dochtmlcon from sys_document_22 where docchannel=52 and page_time>='2020-09-01' and page_time<='2020-09-07'"
- cursor.execute(sql)
- print(0.1)
- df = pd.read_excel("万郡绿建细分关键词.xls")
- list_keywords = df["细分类别"]
- dict_keywords = dict()
- for _key in list_keywords:
- dict_keywords[_key] = 0
- print(1)
- from bs4 import BeautifulSoup
- while(True):
- rows = cursor.fetchmany(10000)
- print("==")
- if not rows:
- break
- for row in rows:
- _html = BeautifulSoup(row[0],"lxml").getText()
- for _key in list_keywords:
- if re.search(re.escape(_key),_html) is not None:
- dict_keywords[_key] += 1
- print(dict_keywords)
- list_count = []
- for _key in list_keywords:
- list_count.append(dict_keywords[_key])
- df1 = pd.DataFrame({"关键字":list_keywords,"数量":list_count})
- df1.to_excel("关键词统计.xlsx")
- def countKeyword_solr():
- def _handle(item,result_queue):
- keyword = item["keyword"]
- data = solrQuery("document",{"q":'dochtmlcon:"%s" AND docchannel:101 AND dochtmlcon:"法院" '%keyword,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])',"fl":"id","rows":10})
- _num = data["response"]["numFound"]
- item["zhongbiao"] = _num
- data = solrQuery("document",{"q":'dochtmlcon:"%s" AND docchannel:52 AND dochtmlcon:"法院"'%keyword,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])',"fl":"id","rows":10})
- _num = data["response"]["numFound"]
- item["zhaobiao"] = _num
- result_queue.put(item)
- file = "../data/关键词11.xlsx"
- df = pd.read_excel(file)
- task_queue = queue.Queue()
- print(df.keys())
- for item in df["业务关键词"]:
- task_queue.put({"keyword":item,"zhaobiao":0,"zhongbiao":0})
- result_queue = queue.Queue()
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=10)
- mt.run()
- list_keyword = []
- list_zhaobiao = []
- list_zhongbiao = []
- while(True):
- try:
- item = result_queue.get(False)
- list_keyword.append(item["keyword"])
- list_zhaobiao.append(item["zhaobiao"])
- list_zhongbiao.append(item["zhongbiao"])
- except queue.Empty:
- break
- df1 = pd.DataFrame({"业务关键词":list_keyword,"招标公告":list_zhaobiao,"中标信息":list_zhongbiao})
- df1.to_excel("%s_export.xlsx"%file,columns=["业务关键词","招标公告","中标信息"])
- def query_from_solr():
- data = solrQuery("document",{"q":'dochtmlcon:"法律" AND (docchannel:51 OR docchannel:104 or docchannel:52 or docchannel:102) AND province:"湖南" ',"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-01-20T23:59:59Z])',"fl":"id","rows":10})
- _num = data["response"]["numFound"]
- print(_num)
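- # Counts 2018 招标 documents per province via OTS (matched with a wildcard on the city
- # field); the commented-out blocks preserve earlier variants that filtered by title keywords
- # and exported full row details.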
- def export_province_keyword_count():
- def _handle(item,result_queue,pool_ots):
- columns = ["doctitle","docchannel","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]
- ots_client = pool_ots.getConnector()
- _province = item["province"]
- print(item)
- # keywords = item["keyword"]+" "+item["key"]
- list_keyword = item["keyword"]
- # for _temp in keywords.split(" "):
- # if len(_temp)>0:
- # list_keyword.append(_temp)
- should_queries = []
- must_not_q = []
- for _temp in list_keyword:
- should_queries.append(MatchPhraseQuery("doctitle","%s"%_temp))
- must_not_q.append(WildcardQuery("tenderee","*%s*"%_temp))
- bool_query_keyword = BoolQuery(should_queries=should_queries,minimum_should_match=2)
- page_time = item["page_time"]
- bool_query = BoolQuery(must_queries=[#bool_query_keyword
- # ,WildcardQuery("publishtime","%s*"%page_time)
- # ,MatchPhraseQuery("doctitle","服务")
- RangeQuery("page_time","2018-01-01","2019-01-01",include_lower=True,include_upper=False),
- TermQuery("docchannel",52),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- WildcardQuery('city', '%s*'%_province)
- # ,NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_tenderer",0,include_lower=True))
- ]
- # ,must_not_queries=must_not_q
- )
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("sub_docs_json.bidding_budget",SortOrder.DESC)]), limit=20, get_total_count=True),
- ColumnsToGet(column_names=columns,return_type=ColumnReturnType.SPECIFIED))
- item["count"] = total_count
- list_data = getRow_ots(rows)
- item["list_data"] = list_data
- print(item)
- pool_ots.putConnector(ots_client)
- df = pd.read_excel("../data/省份信息.xlsx")
- list_province = []
- for _name,_type,_parent in zip(df["cname"],df["ctype"],df["parentid"]):
- if _type==30 and _parent==4:
- list_province.append(_name)
- # filename = "../data/2021-02关键词导出数据.xlsx"
- # dict_keyword = {}
- # df1 = pd.read_excel(filename,dtype=str)
- # for _key,_keyword in zip(df1["key1"],df1["keyword"]):
- # print("===",str(_keyword))
- # dict_keyword[_key] = "" if str(_keyword)=="nan" else _keyword
- # for _key in df1["关键词"]:
- # dict_keyword[_key] = ""
- keyword_str = '''
- 快递 物流 供应链 运输 配送
- 仓储 冷链 整车 服务
- '''
- list_key = []
- for _k in re.split("\s",keyword_str):
- _k1 = _k.strip()
- if len(_k1)>0:
- list_key.append(_k1)
- list_task = []
- page_time = "2020-11"
- for _province in list_province:
- list_task.append({"page_time":page_time,"province":_province,"key":list_key,"keyword":list_key,"count":0})
- task_queue = queue.Queue()
- for item in list_task:
- task_queue.put(item)
- result_queue = queue.Queue()
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
- mt.run()
- dict_key_data = dict()
- # list_data = []
- # for item in list_task:
- # list_data.extend(item["list_data"])
- # dict_channel = getDict_docchannel()
- # df_data= {}
- # print(list_data)
- # for row in list_data:
- # item = {}
- # _dict = row
- # set_dict_item(item,"docid",_dict.get("docid",""))
- # set_dict_item(item,"公告标题",_dict.get("doctitle",""))
- # set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
- # set_dict_item(item,"省份",_dict.get("province",""))
- # # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- # set_dict_item(item,"城市",_dict.get("city",""))
- # set_dict_item(item,"发布时间",_dict.get("page_time",""))
- #
- # set_dict_item(item,"项目编号",_dict.get("project_code",""))
- # set_dict_item(item,"招标单位",_dict.get("tenderee",""))
- # set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
- # set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
- # set_dict_item(item,"代理单位",_dict.get("agency",""))
- # set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
- # set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
- # set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
- #
- # sub_docs_json = _dict.get("sub_docs_json")
- # for _doc in json.loads(sub_docs_json):
- # if "win_tenderer" in _doc:
- # set_dict_item(item,"中标单位",_doc["win_tenderer"])
- # if "win_tenderee_manager" in _doc:
- # set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
- # if "win_tenderee_phone" in _doc:
- # set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
- # if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
- # set_dict_item(item,"中标金额",_doc["win_bid_price"])
- # if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
- # set_dict_item(item,"招标金额",_doc["bidding_budget"])
- # if "招标金额" not in item:
- # set_dict_item(item,"招标金额","")
- # if "中标金额" not in item:
- # set_dict_item(item,"中标金额","")
- # if "中标单位" not in item:
- # set_dict_item(item,"中标单位","")
- # if "中标单位联系人" not in item:
- # set_dict_item(item,"中标单位联系人","")
- # if "中标单位联系电话" not in item:
- # set_dict_item(item,"中标单位联系电话","")
- #
- #
- # _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
- # # if _line in set_line:
- # # continue
- # # if item["招标金额"]=="":
- # # continue
- # # set_line.add(_line)
- # for k,v in item.items():
- # if k not in df_data:
- # df_data[k] = []
- # df_data[k].append(v)
- # df1 = pd.DataFrame(df_data)
- # df1.to_excel("../data/%s_顺丰中标数据.xlsx"%getCurrent_date('%Y-%m-%d_%H%M%S'),columns=list_df_columns)
- for item in list_task:
- print("%s\t%d"%(item["province"],item["count"]))
- # for item in list_task:
- # dict_key_data[item["key"]][item["province"]] = item
- # dict_key_province = dict()
- # dict_key_province["关键词"] = []
- # for _province in list_province:
- # dict_key_province[_province] = []
- # for _key in dict_keyword.keys():
- # dict_key_province["关键词"].append(_key)
- # for _province in list_province:
- # dict_key_province[_province].append(dict_key_data[_key][_province]["count"])
- # columns = ["关键词"]
- # columns.extend(list_province)
- # df2 = pd.DataFrame(dict_key_province)
- # df2.to_excel("../data/%s_导出数据.xlsx"%filename,columns=columns)
- def export_keyword_count():
- def _handle(item,result_queue,ots_client):
- should_q_docchannel = BoolQuery(should_queries=[
- # RangeQuery("docchannel",51,105,True,True)
- TermQuery("docchannel",101),
- RangeQuery("docchannel",118,120,True,True)
- ]
- )
- should_q_keyword = BoolQuery(should_queries=[
- MatchPhraseQuery("doctitle",item["keyword"]),
- MatchPhraseQuery("doctextcon",item["keyword"]),
- MatchPhraseQuery("attachmenttextcon",item["keyword"])
- ])
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time",item["range_from"],item["range_to"],True,False),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- generateBoolShouldQuery(["docchannel"],[51, 52, 101, 118, 119, 120, 114, 51, 103],TermQuery),
- # TermQuery("docchannel",101),
- # should_q_docchannel,
- should_q_keyword
- # MatchPhraseQuery(item["type"], item["keyword"])
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=1, get_total_count=True),
- ColumnsToGet(return_type=ColumnReturnType.NONE))
- item["total_count"] = total_count
- if total_count>0:
- item["exists"] =1
- else:
- item["exists"] =0
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time",item["range_from"],item["range_to"],True,False),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- # TermQuery("docchannel",52),
- # should_q_docchannel,
- # should_q_keyword
- TermQuery("tenderee",item["keyword"])
- # MatchPhraseQuery(item["type"], item["keyword"])
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=1, get_total_count=True),
- ColumnsToGet(return_type=ColumnReturnType.NONE))
- item["zhaobiao_count"] = total_count
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time",item["range_from"],item["range_to"],True,False),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- # TermQuery("docchannel",101),
- # should_q_docchannel,
- # should_q_keyword,
- NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item["keyword"]))
- # MatchPhraseQuery(item["type"], item["keyword"])
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=1, get_total_count=True),
- ColumnsToGet(return_type=ColumnReturnType.NONE))
- item["zhongbiao_count"] = total_count
- range_from = "2024-08-01"
- range_to = "2024-11-01"
- _type = "doctitle"
- assert _type in ["doctitle","doctextcon"]
- list_dict_key_count = []
- filename = r"G:\新建文件夹\WeChat Files\wxid_kluerlj8cn3b21\FileStorage\File\2024-12\疑似少数据的中标企业名单.csv"
- with open(filename,"r",encoding="utf8") as f:
- while(True):
- line = f.readline()
- if not line:
- break
- line = line.strip()
- if line=="name":
- continue
- list_dict_key_count.append({"keyword":line,"count":0,"exists":0,"range_from":range_from,"range_to":range_to,"type":_type})
- # if len(list_dict_key_count)>=1000:
- # break
- # df = pd.read_csv(filename,encoding="ISO-8859-1")
- # for item in df["name"]:
- # list_dict_key_count.append({"keyword":item,"count":0,"exists":0,"range_from":range_from,"range_to":range_to,"type":_type})
- # str_keys = '''
- # 智慧税务
- # 发票管理
- #
- # '''
- # for item in re.split("\s|\r|\n|,|,|、",str_keys):
- # if item.strip()!="":
- # list_dict_key_count.append({"keyword":item,"total_count":0,"zhaobiao_count":0,"zhongbiao_count":0,"range_from":range_from,"range_to":range_to,"type":_type})
- task_queue = queue.Queue()
- for item in list_dict_key_count:
- task_queue.put(item)
- result_queue = queue.Queue()
- ots_client = getConnect_ots()
- # pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,ots_client=ots_client)
- mt.run()
- columns = ["keyword","total_count","exists","zhaobiao_count","zhongbiao_count","range_from","range_to","type"]
- df_data = {}
- for _c in columns:
- df_data[_c] = []
- for item in list_dict_key_count:
- for _c in columns:
- if _c in item:
- df_data[_c].append(item[_c])
- else:
- df_data[_c].append("")
- for k,v in df_data.items():
- print(k,len(v))
- df2 = pd.DataFrame(df_data)
- df2.to_excel("%s_数量导出全部类别.xlsx"%getCurrent_date("%Y-%m-%d_%H%M%S"),columns=columns)
- def export_keyword_title():
- ots_client = getConnect_ots()
- range_from = "2020-01-01"
- range_to = "2022-12-23"
- list_condition = [["医务室"],
- ["医院"],
- ["卫生院"],
- ["卫生所"],
- ["卫生室"],
- ["社区卫生服务中心"]]
- list_should_query = []
- for _c in list_condition:
- if len(_c)==1:
- list_should_query.append(MatchPhraseQuery("doctitle",_c[0]))
- else:
- _must_query = []
- for _q in _c:
- _must_query.append(MatchPhraseQuery("doctitle",_q))
- list_should_query.append(BoolQuery(must_queries=_must_query))
- keyword_query = BoolQuery(should_queries=list_should_query)
- bool_query = BoolQuery(must_queries=[RangeQuery("publishtime",range_from,range_to),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- keyword_query
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(["docid","doctitle","tenderee"],return_type=ColumnReturnType.SPECIFIED))
- df_data = {"docid":[],"doctitle":[],"tenderee":[]}
- def setData(df_data,rows):
- list_dict = getRow_ots(rows)
- for _dict in list_dict:
- docid = _dict.get("docid","")
- doctitle = _dict.get("doctitle","")
- tenderee = _dict.get("tenderee","")
- df_data["docid"].append(docid)
- df_data["doctitle"].append(doctitle)
- df_data["tenderee"].append(tenderee)
- setData(df_data,rows)
- _count = len(rows)
- while next_token:
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(["docid","doctitle","tenderee"],return_type=ColumnReturnType.SPECIFIED))
- setData(df_data,rows)
- _count += 100
- print(_count,total_count)
- file_begin = 0
- file_length = 100000
- _count = 0
- while file_begin<len(df_data["docid"]):
- _dict = dict()
- for _key,v in df_data.items():
- _dict[_key] = v[file_begin:file_begin+file_length]
- _count += 1
- file_begin += file_length
- df = pd.DataFrame(_dict)
- df.to_csv("../data/%s-%s_tenderee_doctitle_%d.csv"%(range_from,range_to,_count))
- def exportArticle_by_websource():
- # conn = getConnection_testmysql()
- # cursor = conn.cursor()
- # sql = "select web_source_no from web_source"
- # cursor.execute(sql)
- # rows = cursor.fetchmany(10)
- # dict_websource = dict()
- # while(rows):
- # for row in rows:
- # web_source_no = row[0]
- # dict_websource[web_source_no] = []
- # rows = cursor.fetchmany(1000)
- #
- # task_queue = queue.Queue()
- # for _key in dict_websource.keys():
- # task_queue.put({"key":_key,"list":dict_websource[_key]})
- #
- # pool_ots = ConnectorPool(init_num=100,max_num=1000,method_init=getConnect_ots)
- # result_queue = queue.Queue()
- # def _handle(item,result_queue,pool_ots):
- # _key = item["key"]
- # print(_key)
- # ots_client = pool_ots.getConnector()
- # bool_query = BoolQuery(must_queries=[RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- # TermQuery('web_source_no', '%s'%_key)
- # ])
- #
- # is_all_succeed = False
- #
- # while(not is_all_succeed):
- # try:
- # rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- # SearchQuery(bool_query, limit=100, get_total_count=True),
- # ColumnsToGet(["docid","docchannel","dochtmlcon"],return_type=ColumnReturnType.SPECIFIED))
- # list_zhaobiao = []
- # list_zhongbiao = []
- # for row in rows:
- # _dict = dict()
- # for values in row:
- # for _v in values:
- # _dict[_v[0]] = _v[1]
- # if _dict["docchannel"]==52:
- # list_zhaobiao.append(_dict)
- # elif _dict["docchannel"]==101:
- # list_zhongbiao.append(_dict)
- # item["list"].extend(list_zhaobiao[:5])
- # item["list"].extend(list_zhongbiao[:5])
- # except Exception as e:
- # print(str(e))
- #
- # pool_ots.putConnector(ots_client)
- #
- # mt = MultiThreadHandler(task_queue = task_queue,task_handler=_handle,result_queue=result_queue,thread_count=100,pool_ots=pool_ots)
- # mt.run()
- # df_data = {"docid":[],"web_source_no":[],"docchannel":[],"dochtmlcon":[]}
- # for k,v in dict_websource.items():
- # for item in v:
- # df_data["docid"].append(item["docid"])
- # df_data["web_source_no"].append(k)
- # df_data["docchannel"].append(item["docchannel"])
- # df_data["dochtmlcon"].append(item["dochtmlcon"])
- # df = pd.DataFrame(df_data)
- # df.to_csv("../data/websouce_doc.csv",columns=["docid","web_source_no","docchannel","dochtmlcon"],encoding="UTF8")
- df = pd.read_csv("../data/other/websouce_doc.csv")
- df_2000 = {"document_id":[],"document_text":[]}
- print("total_count",len(df["docid"]))
- begin = 230000
- end = 260000
- _count = 0
- for _id,_text in zip(df["docid"][begin:end],df["dochtmlcon"][begin:end]):
- if not isinstance(_text,str) or len(_text)>100000:
- continue
- df_2000["document_id"].append(_id)
- df_2000["document_text"].append(_text)
- df_2 = pd.DataFrame(df_2000)
- df_2.to_csv("../data/websouce_doc_%d-%d.csv"%(begin,end),columns=["document_id","document_text"],encoding="utf8",index=False)
- # save(dict_websource,"../data/dict_websource.pk")
- def getWinTenderer(sub_doc_json):
- if sub_doc_json is not None:
- sub_doc = json.loads(sub_doc_json)
- for _doc in sub_doc:
- if "win_tenderer" in _doc:
- return _doc["win_tenderer"]
- return ""
- def exportDocument_by_keywords(page_time,
- list_keyword = ["创客","STEAM","人工智能","课程服务","机器人中学","机器人小学","机器人幼儿园","机器人学校","Labplus","盛思","makeblock柴火","寓乐湾","美科科技","STEAM","能力风暴","优必选","蘑菇云","Dfrobot","中鸣","飞瑞敖","编程猫培生","八爪鱼","八爪鱼教育","童心制物"]):
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for _k in list_keyword:
- task_queue.put(_k)
- def _handle(keyword,result_queue):
- should_queries = []
- for _temp in [keyword]:
- should_queries.append(MatchPhraseQuery("doctitle",_temp))
- bool_query_keyword = BoolQuery(should_queries=should_queries)
- ots_client = getConnect_ots()
- bool_query = BoolQuery(must_queries=[RangeQuery('publishtime', range_from='2017-12-20'),
- MatchPhraseQuery("doctitle",keyword),
- TermQuery("docchannel","101")
- ])
- is_all_succeed = False
- _count = 0
- total_count = 1
- next_token = None
- def putRows(rows):
-     # parse each OTS row (primary key + attribute columns) into a dict and emit one result
-     for row in rows:
-         _dict = dict()
-         for values in row:
-             for _v in values:
-                 _dict[_v[0]] = _v[1]
-         result_queue.put({"docid":_dict.get("docid",""),"keyword":keyword,"tenderee":_dict.get("tenderee",""),"win_tenderer":getWinTenderer(_dict.get("sub_docs_json",None))})
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
-     SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
-     ColumnsToGet(["docid","tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
- putRows(rows)
- _count = len(rows)
- print(keyword,next_token,total_count)
- while(next_token):
-     try:
-         # fetch the next page first, then emit it: the original emitted the
-         # current page at the top of the loop, which duplicated the first
-         # page and dropped the last one
-         rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
-             SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
-             ColumnsToGet(["docid","tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
-         putRows(rows)
-         _count += len(rows)
-         print("%s:%d/%d"%(keyword,_count,total_count))
-     except Exception as e:
-         traceback.print_exc()
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
- mt.run()
- df_data = {"docid":[],"keyword":[],"tenderee":[],"win_tenderer":[]}
- while(True):
- try:
- item = result_queue.get(block=True,timeout=1)
- for _k in df_data.keys():
- if _k in item:
- df_data[_k].append(item[_k])
- else:
- df_data[_k].append("")
- except queue.Empty as e:
- break
- except Exception as e:
- traceback.print_exc()
- df = pd.DataFrame(df_data)
- df.to_csv("../data/exportArticle1_title.csv",columns=["docid","keyword","tenderee","win_tenderer"])
- def exportGovement():
- should_queries1 = []
- for _temp in ["教育局","地化所","税务局","国土局","学校","大学","中学","小学","幼儿园","医院"]:
- should_queries1.append(WildcardQuery("tenderee","*%s*"%_temp))
- should_queries2 = []
- for _temp in ["浙江","江苏","湖北","西北","陕西","甘肃","青海","宁夏","新疆","重庆","四川","云南","贵州"]:
- should_queries2.append(WildcardQuery("province","*%s*"%_temp))
- ots_client = getConnect_ots()
- page_time = "2020-12"
- bool_query = BoolQuery(must_queries=[BoolQuery(should_queries=should_queries1),
- BoolQuery(should_queries=should_queries2),
- TermQuery("docchannel","52"),
- RangeQuery("publishtime",page_time)])
- columns = ["tenderee","tenderee_contact","tenderee_phone"]
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=100, sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- print(total_count)
- def getRow(rows,df_data,columns):
- for row in rows:
- _dict = dict()
- for part in row:
- for item in part:
- _dict[item[0]] = item[1]
- if "tenderee_contact" in _dict and "tenderee_phone" in _dict:
- for key in columns:
- df_data[key].append(_dict.get(key,""))
- all_rows = 0
- df_data = {}
- for key in columns:
- df_data[key] = []
- getRow(rows,df_data,columns)
- _count = 100
- while(next_token):
- print(_count,total_count)
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100,get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- _count += 100
- getRow(rows,df_data,columns)
- df2 = pd.DataFrame(df_data)
- df2.to_excel("../data/%s政府招标人导出数据.xlsx"%page_time,columns=columns)
- def export_attachment():
- filename = "运营商20240417(1).xlsx"
- df = pd.read_excel(filename)
- auth = oss2.Auth(os.environ.get("OSS_ACCESS_KEY_ID"), os.environ.get("OSS_ACCESS_KEY_SECRET"))  # AccessKey pair read from the environment (hard-coded credentials removed)
- bucket_url = "http://oss-cn-hangzhou.aliyuncs.com"
- attachment_bucket_name = "attachment-hub"
- bucket = oss2.Bucket(auth,bucket_url,attachment_bucket_name)
- ots_client = getConnect_ots()
- list_query = []
- for _title,_no,tenderee,win_tenderer in zip(df["新标题"],df["项目编号"],df["新招采单位"],df["新中标单位"]):
- _dict = {"title":_title,
- "project_code":_no,
- "tenderee":tenderee,
- "win_tenderer":win_tenderer}
- list_query.append(_dict)
- def _handle(_dict,result_queue):
-     title = _dict["title"]
-     # try each secondary condition in turn (project code, then tenderee, then
-     # winner); the original repeated this block three times verbatim, so it is
-     # folded into one loop with identical behavior
-     for _cond in [_dict["project_code"],_dict["tenderee"],_dict["win_tenderer"]]:
-         if not isinstance(_cond,str):
-             continue
-         bool_query = BoolQuery(must_queries=[
-             generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[title],MatchPhraseQuery),
-             generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[_cond],MatchPhraseQuery),
-             NestedQuery("page_attachments",WildcardQuery("page_attachments.fileMd5","*")),
-             RangeQuery("page_time","2022-01-01")
-         ])
-         rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
-             SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]), limit=100, get_total_count=True),
-             ColumnsToGet(["page_attachments"],return_type=ColumnReturnType.SPECIFIED))
-         for _data in getRow_ots(rows):
-             docid = _data["docid"]
-             page_attachments = _data["page_attachments"]
-             for _attach in json.loads(page_attachments):
-                 filemd5 = _attach["fileMd5"]
-                 consumed, return_row, _next = ots_client.get_row("attachment",[("filemd5",filemd5)],columns_to_get=["classification","path"])
-                 dict_k = getRow_ots_primary(return_row)
-                 # keep the first document whose attachment is classified as a bid document
-                 if dict_k is not None and dict_k.get("classification")=="招标文件":
-                     _dict["docid"] = docid
-                     _dict["path"] = dict_k.get("path")
-                     break
-             if "docid" in _dict:
-                 break
-         if "docid" in _dict:
-             break
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for item in list_query:
- task_queue.put(item)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
- mt.run()
- list_data = []
- for item in list_query:
- list_data.append([item.get("title"),item.get("docid")])
- print(item.get("title"),item.get("docid"),"docid" in item)
- path = item.get("path")
- if path is not None:
- try:
- oss2.resumable_download(bucket,path,"附件/%s.%s"%(re.sub("[<>\[\]{}/\';,.‘、\"]",'',item.get("title")),path.split(".")[-1]))
- except Exception as e:
- print("download error %s %d"%(item.get("title"),item.get("docid")))
- df = pd.DataFrame(list_data)
- df.to_excel("a.xlsx")
- def exportIndustryCount():
- import codecs
- time_from = "2020-12-21"
- time_to = "2020-12-25"
- # dict_channel = {"51":{"type":"公告变更"},
- # "52":{"type":"招标公告"},
- # "101":{"type":"中标信息"},
- # "102":{"type":"招标预告"},
- # "103":{"type":"招标答疑"},
- # "104":{"type":"招标文件"},
- # "105":{"type":"资审结果"},
- # "103":{"type":"招标控制价"},
- # "100":{"type":"未知类型"}}
- dict_industry = {}
- meta_industry = load("../data/other/class2dalei_menlei.pkl")
- for _key in meta_industry.keys():
- dict_industry[_key] = {"type":_key}
- print(dict_industry.keys())
- return  # NOTE: early return; the industry count export below is currently unreachable
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for _key in dict_industry.keys():
- task_queue.put(dict_industry[_key])
- def _handle(item,result_queue,pool_ots):
- ots_client = pool_ots.getConnector()
- bool_query = BoolQuery(must_queries=[TermQuery("info_type",item["type"]),
- RangeQuery("publishtime",time_from,time_to,include_lower=True,include_upper=True)])
- columns = ["docid"]
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=1,get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- item["count"] = total_count
- columns = ["dochtmlcon"]
- bool_query = BoolQuery(must_queries=[TermQuery("info_type",item["type"])])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=10,sort=Sort(sorters=[FieldSort("publishtime",SortOrder.ASC)]),get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- for row in rows:
- _dict = dict()
- for part in row:
- for v in part:
- _dict[v[0]] = v[1]
- with codecs.open("../data/industry/%s_%d.html"%(item["type"],_dict["docid"]),"w",encoding="UTF8") as f:
- f.write(_dict["dochtmlcon"])
- pool_ots.putConnector(ots_client)
- pool_ots = ConnectorPool(init_num=20,max_num=30,method_init=getConnect_ots)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
- mt.run()
- columns = ["type","count"]
- df_data = {}
- for _c in columns:
- df_data[_c] = []
- for _indus in dict_industry.keys():
- for _c in columns:
- df_data[_c].append(dict_industry[_indus][_c])
- df = pd.DataFrame(df_data)
- # df.to_excel("../data/%s-%s_industry_count.xlsx"%(time_from,time_to),columns=columns)
- df.to_csv("../data/%s-%s_industry_count.xlsx"%(time_from,time_to),columns=columns)
- def exportDocument_By_time(time_from,time_to,columns=["docid","doctitle","project_name","dochtmlcon"]):
- '''
- export documents whose page_time falls within [time_from,time_to] to an excel file
- :return:
- '''
- ost_client = getConnect_ots()
- last_docid = 0
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time",time_from,time_to,include_lower=True,include_upper=True),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True)])
- rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- _count = len(rows)
- df_data = {}
- def getData(df_data,rows):
- list_dict = getRow_ots(rows)
- for _dict in list_dict:
- for _k,_v in _dict.items():
- if _k not in df_data:
- df_data[_k] = []
- df_data[_k].append(getLegal_str(_v))
- getData(df_data,rows)
- while(next_token):
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- _count += len(rows)
- getData(df_data,rows)
- df = pd.DataFrame(df_data)
- df.to_excel("%s/%s-%s公告信息.xlsx"%(data_path,time_from,time_to),columns=columns)
- def processDocument():
- filename = "../data/2021-01-29-2021-01-29公告信息.xlsx"
- df = pd.read_excel(filename)
- df.to_csv("../data/2021-01-29-2021-01-29公告信息.csv")
- return  # NOTE: early return; the mysql insert test below does not run
- list_dict = []
- for docid,doctitle,project_name,dochtmlcon in zip(df["docid"],df["doctitle"],df["project_name"],df["dochtmlcon"]):
- list_dict.append({"docid":docid,"doctitle":doctitle,"project_name":project_name,"dochtmlcon":dochtmlcon})
- task_queue = queue.Queue()
- for _dict in list_dict:
- task_queue.put(_dict)
- result_queue = queue.Queue()
- def _handle(_dict,result_queue,pool_mysql):
- conn = pool_mysql.getConnector()
- cursor = conn.cursor()
- sql = "insert into test_extract(docid,doctitle,page_time) values(%d,%s,%s)"%(_dict["docid"],_dict["doctitle"],_dict["dochtmlcon"])
- cursor.execute(sql)
- conn.commit()
- pool_mysql.putConnector(conn)
- # url = "http://192.168.2.101:15030"
- # myheaders = {'Content-Type': 'application/json'}
- # print(int(_dict["docid"]))
- # data = {"doc_id":int(_dict["docid"]),"title":_dict["doctitle"],"content":_dict["dochtmlcon"]}
- # resp = requests.post(url,json=data,headers=myheaders, verify=True)
- # result = json.loads(resp.content.decode("utf8"),"utf8")
- # _dict["product"] = result["product"]
- pool_mysql = ConnectorPool(init_num=20,max_num=30,method_init=getConnection_testmysql)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=5,pool_mysql=pool_mysql)
- mt.run()
- # columns = ["docid","doctitle","project_name","product"]
- #
- # df_data = {}
- # for _c in columns:
- # df_data[_c] = []
- # for _dict in list_dict:
- # for _c in columns:
- # df_data[_c].append(_dict.get(_c,""))
- # df = pd.DataFrame(df_data)
- # df.to_excel("%s.product.xlsx"%(filename),columns=columns)
- def export_extract_check():
- '''
- :return: export the extract-check results and generate a report
- '''
- conn = getConnection_testmysql()
- cursor = conn.cursor()
- sql = " select docid,json_result from exportdb.extract_check "
- cursor.execute(sql)
- dict_global = {}
- df_global = {"key_type":[],"online_count":[],"test_count":[],"diff_count":[],"diff_percent":[]}
- df_document = {"docid":[]}
- while True:
- rows = cursor.fetchmany(10000)
- if not rows:
- break
- for docid,json_result in rows:
- df_document["docid"].append(docid)
- _result = json.loads(json_result)
- for k,v in _result.items():
- key = k.split("_")
- _key = "_".join(key[:-1])
- if "punish" in _key or "complainants" in _key or "institutions" in _key:
- continue
- if k not in df_document:
- df_document[k] = []
- df_document[k].append(v)
- key_type = key[-1]
- if _key not in dict_global:
- dict_global[_key] = {}
- if key_type not in dict_global[_key]:
- dict_global[_key][key_type] = 0
- if key_type=="diff":
- dict_global[_key][key_type] += v
- if key_type in ("online","test"):
- if isinstance(v,str):
- if v!="":
- dict_global[_key][key_type] += 1
- elif isinstance(v,list):
- dict_global[_key][key_type] += len(v)
- for k,v in dict_global.items():
- df_global["key_type"].append(k)
- df_global["online_count"].append(v["online"])
- df_global["test_count"].append(v["test"])
- df_global["diff_count"].append(v["diff"])
- df_global["diff_percent"].append(v["diff"]/v["online"] if v["online"]>0 else 0)
- filename = "../data/%s_extract_check.xlsx"%(time.strftime("%Y-%m-%d"))
- with pd.ExcelWriter(filename) as writer:
- df1 = pd.DataFrame(df_global)
- df1.to_excel(writer,sheet_name="global")
- for k,v in df_document.items():
- print(k,len(v))
- df2 = pd.DataFrame(df_document)
- df2.to_excel(writer,sheet_name="document")
- # the with-block saves and closes the writer on exit, so explicit
- # writer.save()/writer.close() calls are not needed here
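- # Worked example of the aggregation above (illustrative numbers): for a field
- # whose accumulated counts are online=200, test=190, diff=15, the "global"
- # sheet row reads online_count=200, test_count=190, diff_count=15 and
- # diff_percent=15/200=0.075.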
- def exportDocument_dump():
- # filename = "../data/重复公告.xlsx"
- # df = pd.read_excel(filename)
- ots_client = getConnect_ots()
- columns = ["docid","docchannel","page_time","web_source_no","doctitle","tenderee","agency","project_code","project_name","sub_docs_json"]
- df_keys = ["docid","docchannel","page_time","web_source_no","doctitle","doctitle_refine","tenderee","agency","project_code","project_name","bidding_budget","win_bid_price","win_tenderer","URL"]
- df_data = {}
- for _key in df_keys:
- df_data[_key] = []
- bool_query = BoolQuery(must_queries=[TermQuery("page_time","2021-03-03"),
- RangeQuery("status",201,300,True,True)])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- def getData(df_data,rows):
- list_data = getRow_ots(rows)
- for row in list_data:
- dict_find = {}
- for _key in df_keys:
- dict_find[_key] = 0
- for _k,_v in row.items():
- if _k in df_keys:
- dict_find[_k] = 1
- if _k=="project_code":
- _v = '"%s"'%_v
- df_data[_k].append(_v)
- doctitle = row.get("doctitle","")
- df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '', doctitle))
- df_data["URL"].append("http://www.bidizhaobiao.com/info-%d.html"%(row["docid"]))
- dict_find["URL"] = 1
- dict_find["doctitle_refine"] = 1
- sub_docs_json = row.get("sub_docs_json","[{}]")
- doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
- if sub_docs_json is not None:
- for sub_docs in json.loads(sub_docs_json):
- for _key_sub_docs in sub_docs.keys():
- if _key_sub_docs in doc_columns:
- if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
- if _key_sub_docs in ["bidding_budget","win_bid_price"]:
- if float(sub_docs[_key_sub_docs])>0:
- doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
- else:
- doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
- for _k,_v in doc_columns.items():
- dict_find[_k] = 1
- df_data[_k].append(_v)
- for _k,_v in dict_find.items():
- if _v==0:
- df_data[_k].append("")
- _count = len(rows)
- getData(df_data,rows)
- while next_token:
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- getData(df_data,rows)
- _count += len(rows)
- # for docid in df["docid"]:
- # bool_query = BoolQuery(must_queries=[TermQuery("docid",int(docid))])
- #
- # rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- # SearchQuery(bool_query , limit=100, get_total_count=True),
- # ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- # list_data = getRow_ots(rows)
- # if len(list_data)>0:
- # dict_find = {}
- # for _key in df_keys:
- # dict_find[_key] = 0
- # for _k,_v in list_data[0].items():
- # if _k in df_keys:
- # dict_find[_k] = 1
- # df_data[_k].append(_v)
- # doctitle = list_data[0].get("doctitle","")
- # df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '', doctitle))
- # dict_find["doctitle_refine"] = 1
- # sub_docs_json = list_data[0].get("sub_docs_json","[{}]")
- # doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
- # if sub_docs_json is not None:
- # for sub_docs in json.loads(sub_docs_json):
- # for _key_sub_docs in sub_docs.keys():
- # if _key_sub_docs in doc_columns:
- # if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
- # if _key_sub_docs in ["bidding_budget","win_bid_price"]:
- # if float(sub_docs[_key_sub_docs])>0:
- # doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
- # else:
- # doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
- # for _k,_v in doc_columns.items():
- # dict_find[_k] = 1
- # df_data[_k].append(_v)
- # for _k,_v in dict_find.items():
- # if _v==0:
- # df_data[_k].append("")
- df1 = pd.DataFrame(df_data)
- df1.to_csv("../data/0303去重.csv",columns=df_keys)
- def exportDocument_dump_mysql():
- conn = getConnection_testmysql()
- cursor = conn.cursor()
- columns = ["project_code","doctitle","doctitle_refine","tenderee","agency","project_name","win_bid_price","bidding_budget","page_time","docchannel","web_source_no","win_tenderer","group_id","docid"]
- df_data = {}
- for _c in columns:
- df_data[_c] = []
- sql = " select "+",".join(columns)+" from run_dumplicate_document_his where group_id in (select group_id from run_dumplicate_document_his group by group_id having count(1)>1)"
- cursor.execute(sql)
- while True:
- rows = cursor.fetchmany(100000)
- if not rows:
- break
- for row in rows:
- for _i in range(len(columns)):
- df_data[columns[_i]].append(row[_i])
- df = pd.DataFrame(df_data)
- df.to_csv("../data/0304去重.csv",columns=["group_id","docid","project_code","doctitle","doctitle_refine","tenderee","agency","project_name","win_bid_price","bidding_budget","page_time","docchannel","web_source_no","win_tenderer"])
- print(cursor.description)
- def getDict_docchannel():
- filename = "docchannel.pk"
- if os.path.exists(filename):
- _dict = load(filename)
- return _dict
- conn = getConnection_mysql()
- cursor = conn.cursor()
- sql = "select channel_id,chnlname from sys_channel "
- cursor.execute(sql)
- rows = cursor.fetchall()
- _dict = dict()
- for row in rows:
- _dict[row[0]] = row[1]
- save(_dict,filename)
- return _dict
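- # Usage note: getDict_docchannel() memoizes the channel_id -> name mapping in
- # docchannel.pk via the save/load pickle helpers, so only the first call hits
- # MySQL; deleting the file refreshes the cache.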
- def exportDocument_by_doctitle():
- def timeAdd_minute(_time,minutes):
- a = time.mktime(time.strptime(_time,'%Y-%m-%d'))+60*minutes
- _time1 = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(a))
- return _time1
- columns = ["docid","attachmenttextcon","doctitle","docchannel","bidway","province","city","district","info_type","page_time","crtime","project_code","tenderee","project_name","agency","sub_docs_json","tenderee_contact","tenderee_phone","doctextcon","product","moneysource","time_bidclose","time_bidopen"]
- columns = ["doctitle","attachmenttextcon","doctextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no"]
- columns = ["doctitle","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no"]
- dict_channel = getDict_docchannel()
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- list_keyword = []
- # for _keyword in re.split("\s|/|、",str_keyword):
- # if len(_keyword.strip())==0 and isinstance(_keyword,(str)):
- # continue
- # print(_keyword)
- # item = {"keyword":_keyword.strip()}
- # list_keyword.append(item)
- # task_queue.put(item)
- filename = "有效调用次数统计_20220830_v1.xlsx"
- df = pd.read_excel(filename)
- for company in df["enterpriseName"]:
- _dict = {"keyword":company}
- task_queue.put(_dict)
- list_keyword.append(_dict)
- start_day = "2019-01-01"
- count_days = 90
- # for _i in range(count_days):
- #
- # current_date = timeAdd(start_day,_i)
- # for _j in range(24*6):
- # start_minute = timeAdd_minute(current_date,10*_j)
- # end_minute = timeAdd_minute(current_date,10*(_j+1))
- #
- # item = {"start_minute":start_minute,"end_minute":end_minute}
- # list_keyword.append(item)
- # task_queue.put(item)
- ots_client = getConnect_ots()
- def _handle(item,result_queue,ots_client):
- # should_q_keyword = BoolQuery(should_queries=[
- # # MatchPhraseQuery("tenderee",item["keyword"]),
- # # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item["keyword"])),
- # # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.second_tenderer",item["keyword"])),
- # # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.third_tenderer",item["keyword"]))
- # MatchPhraseQuery("doctextcon",item["keyword"]),
- # MatchPhraseQuery("doctitle",item["keyword"]),
- # MatchPhraseQuery("attachmenttextcon",item["keyword"])
- # ])
- #
- # should_q2 = BoolQuery(should_queries=[WildcardQuery('province', '%s*'%"广东")
- # # ,WildcardQuery('province', '%s*'%"湖南")
- # # ,WildcardQuery('province', '%s*'%"广西")
- # ])
- #
- # should_q_tenderee = BoolQuery(should_queries=[WildcardQuery("tenderee","*中学*"),
- # WildcardQuery("tenderee","*大学*"),
- # WildcardQuery("tenderee","*小学*"),
- # WildcardQuery("tenderee","*教育局*")])
- bool_query = BoolQuery(must_queries=[
- # RangeQuery("page_time","2019-01-01","2023-01-01"),
- # TermQuery("page_time","2022-02-18"),
- generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[item["keyword"]],MatchPhraseQuery),
- # RangeQuery("crtime",item["start_minute"],item["end_minute"])
- RangeQuery("status",151,300,True,True),
- # TermQuery("tenderee",item["keyword"])
- # ,TermQuery("docchannel",52)
- # ,should_q_keyword
- # ,should_q_tenderee
- # ,should_q2
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("crtime",SortOrder.DESC)]), limit=1, get_total_count=True),
- ColumnsToGet(return_type=ColumnReturnType.NONE))
- item["total_count"] = total_count
- list_data = getRow_ots(rows)
- for _data in list_data:
- _data["keyword"] = item["keyword"]
- result_queue.put(_data)
- _count = len(list_data)
- while next_token:
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- _count += len(list_data)
- for _data in list_data:
- _data["keyword"] = item["keyword"]
- result_queue.put(_data)
- mt = MultiThreadHandler(task_queue,_handle,result_queue,40,ots_client=ots_client)
- mt.run()
- set_docid = set()
- list_item = []
- # for item in list_keyword:
- # print(item["keyword"])
- total_count = 0
- for item in list_keyword:
- total_count += item["total_count"]
- print(item["total_count"])
- print("total_count:%d"%(total_count))
- keys = list_keyword[0].keys()
- df_data = {}
- for item in list_keyword:
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- dumplicate = False
- try:
- while True:
- _dict = result_queue.get(False)
- _docid = _dict.get("docid")
- if _docid in set_docid and not dumplicate:
- continue
- set_docid.add(_docid)
- list_item.append(_dict)
- except Exception as e:
- print(e)
- list_docid = list(set_docid)
- with open("list_docid.txt","w",encoding="utf8") as f:
- for _docid in list_docid:
- f.write(str(_docid))
- f.write("\n")
- f.flush()
- # log("get document taotal_count:%d"%len(list_item))
- # set_line = set()
- # getRowData(df_data,list_item,set_line,list_keyword,dict_channel,dumplicate)
- # set_enterprise = set()
- # for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
- # set_enterprise.add(_tenderee)
- # set_enterprise.add(_agency)
- # set_enterprise.add(_win_tenderer)
- # if "" in set_enterprise:
- # set_enterprise.remove("")
- # if None in set_enterprise:
- # set_enterprise.remove(None)
- # dict_enterprise = getDictEnterprise(list(set_enterprise))
- # if len(set_enterprise)>0:
- # for _i in range(len(df_data["招标单位"])):
- # # _enterprise_name = df_data["招标单位"][_i]
- # # if df_data["招标联系人电话"][_i]=="":
- # # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- # # if contacts is not None:
- # # _person,_phone = getOneContact(contacts)
- # # df_data["招标联系人"][_i] = _person
- # # df_data["招标联系人电话"][_i] = _phone
- #
- # _enterprise_name = df_data["代理单位"][_i]
- # if df_data["代理联系人电话"][_i]=="":
- # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- # if contacts is not None:
- # _person,_phone = getOneContact(contacts)
- # df_data["代理联系人"][_i] = _person
- # df_data["代理联系人电话"][_i] = _phone
- #
- # _enterprise_name = df_data["中标单位"][_i]
- # if df_data["中标单位联系电话"][_i]=="":
- # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- # if contacts is not None:
- # _person,_phone = getOneContact(contacts)
- # df_data["中标单位联系人"][_i] = _person
- # df_data["中标单位联系电话"][_i] = _phone
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s_export11.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")))
- set_columns = set()
- list_df_columns = []
- def set_dict_item(_dict,name,v):
- _dict[name] = getLegal_str(v)
- if name not in set_columns:
- set_columns.add(name)
- list_df_columns.append(getLegal_str(name))
- def set_dict_item_columns(set_columns1,list_df_columns1,_dict,name,v):
- _dict[name] = getLegal_str(v)
- if name not in set_columns1:
- set_columns1.add(name)
- list_df_columns1.append(name)
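- # Illustrative ordering contract: set_dict_item records each column name the
- # first time it is seen, so after
- #   item = {}; set_dict_item(item,"docid",1); set_dict_item(item,"公告标题","t")
- # list_df_columns == ["docid","公告标题"]; the to_excel(columns=list_df_columns)
- # calls below rely on this first-seen order.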
- def exportDocument_medicine(start_time,end_time):
- # filename = "../data/重复公告.xlsx"
- # df = pd.read_excel(filename)
- ots_client = getConnect_ots()
- # columns = ["doctitle","docchannel","time_bidopen","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]
- columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen"]
- set_enter = set()
- str_enter = '''
- 北京嘉和美康信息技术有限公司
- 阿里健康科技(中国)有限公司
- 北大医疗信息技术有限公司
- 创业慧康科技股份有限公司
- 东华医为科技有限公司
- 望海康信(北京)科技股份公司
- 国新健康保障服务有限公司
- 南京海泰医疗信息系统有限公司
- 南京海泰信息技术有限公司
- 浙江和仁科技股份有限公司
- 北京惠每科技有限公司
- 金蝶医疗软件科技有限公司
- 北京京东健康有限公司
- 四川久远银海软件股份有限公司
- 零氪科技(北京)有限公司
- 北京麦迪克斯科技有限公司
- 苏州麦迪斯顿医疗科技股份有限公司
- 江苏曼荼罗软件股份有限公司
- 北京平安联想智慧医疗信息技术有限公司
- 青岛百洋智能科技股份有限公司
- 上海森亿医疗科技有限公司
- 万达信息股份有限公司
- 微医集团(浙江)有限公司
- 卫宁健康科技集团股份有限公司
- 心医国际数字医疗系统(大连)有限公司
- 医渡云(北京)技术有限公司
- 医惠科技有限公司
- 易联众信息技术股份有限公司
- 智业软件股份有限公司
- 中电数据服务有限公司
- 重庆中联信息产业有限责任公司
- 杭州卓健信息科技股份有限公司
- 大连万达集团股份有限公司
- '''
- for a in re.split("\s+",str_enter):
- if a.strip()!="":
- set_enter.add(a.strip())
- dict_channel = getDict_docchannel()
- # list_province = ["江西","湖南","四川","安徽"]
- list_province = ["全国"]
- for _province in list_province:
- df_data = {}
- str_p = '''
- 智慧医疗系统 医院信息系统 临床路径 医院系统 医院管理软件
- 县域医共体 远程医疗 医院管理系统 医疗信息化 临床医疗
- 数据集成 云医院 智慧卫生 卫生信息系统 医疗数字化
- 临床应用
- '''
- list_prov = re.split("\s",str_p)
- list_mu = []
- for _p in list_prov:
- if _p.strip()=="":
- continue
- print(_p)
- list_mu.append(MatchPhraseQuery('doctextcon', '%s'%_p.strip()))
- s_tenderee = '医院、卫生院、疗养院、健康局、卫生局'
- list_should_ten = []
- for _p in re.split("、",s_tenderee):
- if _p.split()=="":
- continue
- list_should_ten.append(WildcardQuery("tenderee","*%s*"%_p.strip()))
- list_should_win = []
- for _win in list(set_enter):
- list_should_win.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_win)))
- list_should_chan = []
- list_should_chan.append(TermQuery("docchannel",52))
- list_should_chan.append(TermQuery("docchannel",101))
- list_should_chan.append(TermQuery("docchannel",102))
- should_q1 = BoolQuery(should_queries=list_mu)
- should_q2 = BoolQuery(should_queries=list_should_ten)
- should_q3 = BoolQuery(should_queries=list_should_chan)
- bool_query = BoolQuery(must_queries=[
- BoolQuery(should_queries=list_should_win),
- RangeQuery("page_time",start_time,end_time,include_lower=True,include_upper=True),
- RangeQuery("status",151,300,True,True),
- # should_q1,
- # should_q2,
- # should_q3,
- ],
- # must_not_queries=[
- # MatchPhraseQuery("doctextcon","器械"),
- # MatchPhraseQuery("doctextcon","仪器"),
- # ]
- )
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- set_line = set()
- _count = len(rows)
- # getData(df_data,rows,set_line)
- list_row = getRow_ots(rows)
- getRowData(df_data,list_row,set_line,[],dict_channel,False)
- while next_token:
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- list_row = getRow_ots(rows)
- getRowData(df_data,list_row,set_line,[],dict_channel,False)
- _count += len(rows)
- if len(list(df_data.keys()))>0:
- if len(df_data[list(df_data.keys())[0]])>=300:
- break
- set_enterprise = set()
- for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
- set_enterprise.add(_tenderee)
- set_enterprise.add(_agency)
- set_enterprise.add(_win_tenderer)
- if "" in set_enterprise:
- set_enterprise.remove("")
- if None in set_enterprise:
- set_enterprise.remove(None)
- dict_enterprise = getDictEnterprise(list(set_enterprise))
- if len(set_enterprise)>0:
- for _i in range(len(df_data["招标单位"])):
- _enterprise_name = df_data["招标单位"][_i]
- if df_data["招标联系人电话"][_i]=="":
- contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- if contacts is not None:
- _person,_phone = getOneContact(contacts)
- df_data["招标联系人"][_i] = _person
- df_data["招标联系人电话"][_i] = _phone
- _enterprise_name = df_data["代理单位"][_i]
- if df_data["代理联系人电话"][_i]=="":
- contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- if contacts is not None:
- _person,_phone = getOneContact(contacts)
- df_data["代理联系人"][_i] = _person
- df_data["代理联系人电话"][_i] = _phone
- _enterprise_name = df_data["中标单位"][_i]
- if df_data["中标单位联系电话"][_i]=="":
- contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- if contacts is not None:
- _person,_phone = getOneContact(contacts)
- df_data["中标单位联系人"][_i] = _person
- df_data["中标单位联系电话"][_i] = _phone
- return df_data  # NOTE: returns inside the province loop; the df1 export below never runs
- df1 = pd.DataFrame(df_data)
- print(len(df_data["docid"]))
- df1.to_excel("../data/%s_周五医疗数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
- def getRowDataWithKey(df_data,rows,columns):
- global list_df_columns
- list_df_columns = columns
- for row in rows:
- for c in columns:
- if c not in df_data:
- df_data[c] = []
- df_data[c].append(row.get(c))
- def getRowData(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
- dict_line = {}
- # list_data = getRow_ots(rows)
- _index = 0
- for row in rows:
- _index += 1
- item = {}
- _dict = row
- set_dict_item(item,"docid",_dict.get("docid",""))
- # set_dict_item(item,"attachment_extract_status",_dict.get("attachment_extract_status",""))
- set_dict_item(item,"crtime",_dict.get("crtime",""))
- # set_dict_item(item,"要素数",_dict.get("extract_count",0))
- set_dict_item(item,"公告标题",_dict.get("doctitle",""))
- set_dict_item(item,"web_source_no",_dict.get("web_source_no",""))
- set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
- set_dict_item(item,"正文实体",_dict.get("nlp_enterprise",""))
- set_dict_item(item,"附件实体",_dict.get("nlp_enterprise_attachment",""))
- # set_dict_item(item,"web_source_name",_dict.get("web_source_name",""))
- # set_dict_item(item,"原公告类别",dict_channel.get(_dict.get("original_docchannel",""),""))
- set_dict_item(item,"公告内容",getLegal_str(_dict.get("doctextcon","")))
- set_dict_item(item,"附件内容",getLegal_str(_dict.get("attachmenttextcon","")))
- if "keyword" in _dict:
- set_dict_item(item,"关键词",_dict["keyword"])
- else:
- _wholeword = re.sub("\s","",str(row.get("doctitle","")+row.get("doctextcon","")[:30000]+row.get("attachmenttextcon","")[:30000]).replace("(","(").replace(")",")"))
- _pattern = "|".join([re.escape(str(a).replace("(","(").replace(")",")")) for a in list_keyword])
- set_dict_item(item,"关键词",",".join(list(set(re.findall(_pattern,_wholeword)))))
- # set_dict_item(item,"关键词",_dict.get("keyword",""))
- set_dict_item(item,"产品",_dict.get("product",""))
- set_dict_item(item,"服务期限",_dict.get("service_time",""))
- set_dict_item(item,"省份",_dict.get("province",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item(item,"城市",_dict.get("city",""))
- set_dict_item(item,"区县",_dict.get("district",""))
- set_dict_item(item,"发布时间",_dict.get("page_time",""))
- set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
- set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
- # set_dict_item(item,"创建时间",_dict.get("crtime",""))
- set_dict_item(item,"招标方式",_dict.get("bidway",""))
- set_dict_item(item,"行业一级分类",_dict.get("industry",""))
- set_dict_item(item,"行业二级分类",_dict.get("info_type",""))
- set_dict_item(item,"uuid",_dict.get("uuid"))
- # set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
- set_dict_item(item,"项目编号",_dict.get("project_code",""))
- set_dict_item(item,"项目名称",_dict.get("project_name",""))
- set_dict_item(item,"招标单位",_dict.get("tenderee",""))
- set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
- set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
- set_dict_item(item,"代理单位",_dict.get("agency",""))
- set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
- set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
- set_dict_item(item,"评审专家",_dict.get("person_review",""))
- set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
- set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
- set_dict_item(item,"获取文件开始时间",_dict.get("time_get_file_start",""))
- set_dict_item(item,"获取文件结束时间",_dict.get("time_get_file_end",""))
- set_dict_item(item,"保证金递交开始时间",_dict.get("time_earnest_money_start",""))
- set_dict_item(item,"保证金递交结束时间",_dict.get("time_earnest_money_end",""))
- sub_docs_json = _dict.get("sub_docs_json")
- set_tenderer = set()
- multi_win = []
- multi_package_win = []
- win_joint = []
- if sub_docs_json is not None:
- docs = json.loads(sub_docs_json)
- docs.sort(key=lambda x:float(x.get("win_bid_price") or 0))
- for _doc in docs:
- if "win_tenderer" in _doc:
- set_dict_item(item,"中标单位",_doc["win_tenderer"])
- multi_package_win.append(_doc["win_tenderer"])
- if "second_tenderer" in _doc:
- set_dict_item(item,"第二候选单位",_doc["second_tenderer"])
- set_tenderer.add(_doc.get("second_tenderer"))
- if "third_tenderer" in _doc:
- set_dict_item(item,"第三候选单位",_doc["third_tenderer"])
- set_tenderer.add(_doc.get("third_tenderer"))
- if "win_tenderer_manager" in _doc:
- set_dict_item(item,"中标单位联系人",_doc["win_tenderer_manager"])
- if "win_tenderer_phone" in _doc:
- set_dict_item(item,"中标单位联系电话",_doc["win_tenderer_phone"])
- if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
- set_dict_item(item,"中标金额",_doc["win_bid_price"])
- if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
- set_dict_item(item,"招标金额",_doc["bidding_budget"])
- if "win_tenderer_joint" in _doc:
- win_tenderer_joints = _doc.get("win_tenderer_joint","").split(",")
- for _joints in win_tenderer_joints:
- win_joint.append(_joints)
- if "win_tenderer_joints" in _doc:
- win_tenderer_joints = json.loads(_doc["win_tenderer_joints"])
- for _win_joint in win_tenderer_joints:
- win_joint.append(_win_joint.get("name",""))
- if "multi_winner" in _doc:
- multi_winners = _doc.get("multi_winner","").split(",")
- for _mul in multi_winners:
- multi_win.append(_mul)
- if "multi_winners" in _doc:
- multi_winners = json.loads(_doc["multi_winners"])
- for multi_w in multi_winners:
- multi_win.append(multi_w.get("name",""))
- set_dict_item(item,"多标段中标人",",".join(list(multi_package_win)))
- set_dict_item(item,"多中标人",",".join(list(multi_win)))
- set_dict_item(item,"联合中标人",",".join(list(win_joint)))
- set_dict_item(item,"入围供应商",",".join(list(set_tenderer)))
- if "第二候选单位" not in item:
- set_dict_item(item,"第二候选单位","")
- if "第三候选单位" not in item:
- set_dict_item(item,"第三候选单位","")
- if "招标金额" not in item:
- set_dict_item(item,"招标金额","")
- if "中标金额" not in item:
- set_dict_item(item,"中标金额","")
- if "中标单位" not in item:
- set_dict_item(item,"中标单位","")
- if "中标单位联系人" not in item:
- set_dict_item(item,"中标单位联系人","")
- if "中标单位联系电话" not in item:
- set_dict_item(item,"中标单位联系电话","")
- set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
- set_dict_item(item,"detail_link",_dict.get("detail_link",""))
- # if item["中标单位"] not in set_enter:
- # continue
- if not dumplicate:
- if item["项目编号"]!="":
- _line = "%s-%s-%s-%s-%s-%s"%(item["公告类别"],item["项目编号"],item["招标单位"],item["中标单位"],str(item["招标金额"]),str(item["中标金额"]))
- if _line in dict_line:
- dict_line[_line].append(item)
- continue
- dict_line[_line] = [item]
- _line2 = "%s-%s-%s-%s-%s-%s"%(item["公告标题"],item["公告类别"],item["招标单位"],str(item["招标金额"]),item["中标单位"],str(item["中标金额"]))
- if _line2 in dict_line:
- dict_line[_line2].append(item)
- continue
- dict_line[_line2] = [item]
- # if re.search("[大中小]学|幼儿园|医院|公司",item["招标单位"]) is not None:
- # continue
- # if _dict.get("docid","") in set_ig_docid:
- # continue
- # if item["招标金额"]=="":
- # continue
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- if not dumplicate:
- dict_dump = {}
- columns = ["group_id"]
- columns.extend(list_df_columns)
- for k in columns:
- dict_dump[k] = []
- group_id = 1
- for k,v in dict_line.items():
- if len(v)==1:
- continue
- for item in v:
- dict_dump["group_id"].append(group_id)
- for k in list_df_columns:
- dict_dump[k].append(item.get(k))
- group_id += 1
- df_dump = pd.DataFrame(dict_dump)
- df_dump.to_excel("%s/../data/dumplicate/%s_重复数据.xlsx"%(os.path.dirname(__file__),getCurrent_date("%Y-%m-%d_%H%M%S")))
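- # Dedup key note (illustrative): with dumplicate=False, two rows collide when
- # they share either composite key, e.g.
- #   "中标信息-CODE-1-招标人A-中标人B-1000.0-980.0"
- # only the first row of a colliding group reaches df_data; the rest are written
- # to the *_重复数据.xlsx report above.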
- def getRowData_shenpi(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
- dict_line = {}
- # list_data = getRow_ots(rows)
- _index = 0
- for row in rows:
- _index += 1
- item = {}
- _dict = row
- extract_json = json.loads(_dict.get("extract_json","{}"))
- set_dict_item(item,"docid",_dict.get("docid",""))
- set_dict_item(item,"original_id",_dict.get("original_id",""))
- # set_dict_item(item,"attachment_extract_status",_dict.get("attachment_extract_status",""))
- set_dict_item(item,"crtime",_dict.get("crtime",""))
- set_dict_item(item,"province",_dict.get("province",""))
- set_dict_item(item,"city",_dict.get("city",""))
- set_dict_item(item,"district",_dict.get("district",""))
- set_dict_item(item,"单位集合",json.dumps(extract_json.get("dict_enterprise",""),ensure_ascii=False))
- # set_dict_item(item,"要素数",_dict.get("extract_count",0))
- set_dict_item(item,"公告标题",_dict.get("doctitle",""))
- set_dict_item(item,"web_source_no",_dict.get("web_source_no",""))
- set_dict_item(item,"web_source_name",_dict.get("web_source_name",""))
- # set_dict_item(item,"原公告类别",dict_channel.get(_dict.get("original_docchannel",""),""))
- # set_dict_item(item,"detail_link",_dict.get("detail_link",""))
- set_dict_item(item,"产品",_dict.get("product",""))
- set_dict_item(item,"发布时间",_dict.get("page_time",""))
- set_dict_item(item,"行业一级分类",_dict.get("industry",""))
- set_dict_item(item,"行业二级分类",_dict.get("info_type",""))
- # set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
- set_dict_item(item,"项目编号",_dict.get("project_code",""))
- set_dict_item(item,"项目名称",_dict.get("project_name",""))
- set_dict_item(item,"审批事项",_dict.get("approval_items",""))
- set_dict_item(item,"审批结果",_dict.get("approval_result",""))
- set_dict_item(item,"审批部门",_dict.get("approver",""))
- set_dict_item(item,"建设单位",_dict.get("construct_company",""))
- set_dict_item(item,"建设规模",_dict.get("construction_scale",""))
- set_dict_item(item,"申报单位",_dict.get("declare_company",""))
- set_dict_item(item,"审批文号",_dict.get("doc_num",""))
- set_dict_item(item,"环评机构",_dict.get("evaluation_agency",""))
- set_dict_item(item,"项目法人",_dict.get("legal_person",""))
- set_dict_item(item,"资金来源",_dict.get("moneysource",""))
- set_dict_item(item,"申报类型",_dict.get("pro_type",""))
- set_dict_item(item,"项目地址",_dict.get("project_addr",""))
- set_dict_item(item,"项目属性",_dict.get("properties",""))
- set_dict_item(item,"开工时间",_dict.get("time_commencement",""))
- set_dict_item(item,"竣工时间",_dict.get("time_completion",""))
- set_dict_item(item,"申报时间",_dict.get("time_declare",""))
- set_dict_item(item,"总投资",_dict.get("total_tenderee_money",""))
- set_dict_item(item,"建设年限",_dict.get("year_limit",""))
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- def getRowData_sp1(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
- dict_line = {}
- # list_data = getRow_ots(rows)
- _index = 0
- set_id = set()
- for row in rows:
- _index += 1
- item = {}
- _dict = row
- _id = _dict.get("id")
- if _id is not None and _id in set_id:
- continue
- set_id.add(_id)
- set_dict_item(item,"id",_dict.get("id",""))
- # set_dict_item(item,"attachment_extract_status",_dict.get("attachment_extract_status",""))
- # set_dict_item(item,"crtime",_dict.get("crtime",""))
- set_dict_item(item,"detaillink",_dict.get("detaillink",""))
- # set_dict_item(item,"web_source_no",_dict.get("web_source_no",""))
- set_dict_item(item,"公告类别",dict_sptype.get(str(_dict.get("sp_type","")),""))
- set_dict_item(item,"page_time",getLegal_str(_dict.get("page_time","")))
- # set_dict_item(item,"附件内容",getLegal_str(_dict.get("attachmenttextcon","")))
- set_dict_item(item,"page_title",_dict.get("page_title",""))
- set_dict_item(item,"record_id",_dict.get("record_id",""))
- set_dict_item(item,"web_source_name",_dict.get("web_source_name",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item(item,"web_source_no",_dict.get("web_source_no",""))
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- def getRowData_sp(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
- dict_line = {}
- # list_data = getRow_ots(rows)
- _index = 0
- set_id = set()
- for row in rows:
- _index += 1
- item = {}
- _dict = row
- set_dict_item(item,"docid",_dict.get("docid",""))
- # set_dict_item(item,"attachment_extract_status",_dict.get("attachment_extract_status",""))
- # set_dict_item(item,"crtime",_dict.get("crtime",""))
- set_dict_item(item,"公告标题",_dict.get("page_title",""))
- # set_dict_item(item,"web_source_no",_dict.get("web_source_no",""))
- set_dict_item(item,"公告类别",dict_sptype.get(str(_dict.get("sp_type","")),""))
- set_dict_item(item,"公告内容",getLegal_str(_dict.get("page_content","")))
- # set_dict_item(item,"附件内容",getLegal_str(_dict.get("attachmenttextcon","")))
- if "keyword" in _dict:
- set_dict_item(item,"关键词",_dict["keyword"])
- else:
- set_dict_item(item,"关键词",",".join(list(set(re.findall("|".join([re.escape(str(a).replace("(","(").replace(")",")")) for a in list_keyword]),re.sub("\s","",str(row.get("doctitle","")+row.get("doctextcon","")[:30000]+row.get("attachmenttextcon","")[:30000]).replace("(","(").replace(")",")")))))))
- # set_dict_item(item,"关键词",_dict.get("keyword",""))
- set_dict_item(item,"产品",_dict.get("product",""))
- set_dict_item(item,"服务期限",_dict.get("service_time",""))
- set_dict_item(item,"省份",_dict.get("province",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item(item,"城市",_dict.get("city",""))
- set_dict_item(item,"区县",_dict.get("district",""))
- set_dict_item(item,"发布时间",_dict.get("page_time",""))
- set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
- set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
- # set_dict_item(item,"创建时间",_dict.get("crtime",""))
- set_dict_item(item,"招标方式",_dict.get("bidway",""))
- set_dict_item(item,"行业一级分类",_dict.get("industry",""))
- set_dict_item(item,"行业二级分类",_dict.get("info_type",""))
- set_dict_item(item,"来源",_dict.get("web_source_name",""))
- set_dict_item(item,"uuid",_dict.get("uuid"))
- # set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
- set_dict_item(item,"项目编号",_dict.get("page_code",""))
- set_dict_item(item,"项目名称",_dict.get("project_name",""))
- set_dict_item(item,"招标单位",_dict.get("tenderee",""))
- set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
- set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
- set_dict_item(item,"代理单位",_dict.get("agency",""))
- set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
- set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
- # set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
- set_dict_item(item,"评审专家",_dict.get("person_review",""))
- set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
- set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
- set_dict_item(item,"获取文件开始时间",_dict.get("time_get_file_start",""))
- set_dict_item(item,"获取文件结束时间",_dict.get("time_get_file_end",""))
- set_dict_item(item,"保证金递交开始时间",_dict.get("time_earnest_money_start",""))
- set_dict_item(item,"保证金递交结束时间",_dict.get("time_earnest_money_end",""))
- sub_docs_json = _dict.get("sub_docs_json")
- set_tenderer = set()
- if sub_docs_json is not None:
- docs = json.loads(sub_docs_json)
- docs.sort(key=lambda x:float(x.get("win_bid_price") or 0))
- for _doc in docs:
- if "win_tenderer" in _doc:
- set_dict_item(item,"中标单位",_doc["win_tenderer"])
- if "second_tenderer" in _doc:
- set_dict_item(item,"第二候选单位",_doc["second_tenderer"])
- set_tenderer.add(_doc.get("second_tenderer"))
- if "third_tenderer" in _doc:
- set_dict_item(item,"第三候选单位",_doc["third_tenderer"])
- set_tenderer.add(_doc.get("third_tenderer"))
- if "win_tenderee_manager" in _doc:
- set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
- if "win_tenderee_phone" in _doc:
- set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
- if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
- set_dict_item(item,"中标金额",_doc["win_bid_price"])
- if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
- set_dict_item(item,"招标金额",_doc["bidding_budget"])
- set_dict_item(item,"入围供应商",",".join(list(set_tenderer)))
- if "第二候选单位" not in item:
- set_dict_item(item,"第二候选单位","")
- if "第三候选单位" not in item:
- set_dict_item(item,"第三候选单位","")
- if "招标金额" not in item:
- set_dict_item(item,"招标金额","")
- if "中标金额" not in item:
- set_dict_item(item,"中标金额","")
- if "中标单位" not in item:
- set_dict_item(item,"中标单位","")
- if "中标单位联系人" not in item:
- set_dict_item(item,"中标单位联系人","")
- if "中标单位联系电话" not in item:
- set_dict_item(item,"中标单位联系电话","")
- # if item["中标单位"] not in set_enter:
- # continue
- if not dumplicate:
- if item["项目编号"]!="":
- _line = "%s-%s-%s-%s-%s-%s"%(item["公告类别"],item["项目编号"],item["招标单位"],item["中标单位"],str(item["招标金额"]),str(item["中标金额"]))
- if _line in dict_line:
- dict_line[_line].append(item)
- continue
- dict_line[_line] = [item]
- _line2 = "%s-%s-%s-%s-%s-%s"%(item["公告标题"],item["公告类别"],item["招标单位"],str(item["招标金额"]),item["中标单位"],str(item["中标金额"]))
- if _line2 in dict_line:
- dict_line[_line2].append(item)
- continue
- dict_line[_line2] = [item]
- # if re.search("[大中小]学|幼儿园|医院|公司",item["招标单位"]) is not None:
- # continue
- # if _dict.get("docid","") in set_ig_docid:
- # continue
- # if item["招标金额"]=="":
- # continue
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- if not dumplicate:
- dict_dump = {}
- columns = ["group_id"]
- columns.extend(list_df_columns)
- for k in columns:
- dict_dump[k] = []
- group_id = 1
- for k,v in dict_line.items():
- if len(v)==1:
- continue
- for item in v:
- dict_dump["group_id"].append(group_id)
- for k in list_df_columns:
- dict_dump[k].append(item.get(k))
- group_id += 1
- df_dump = pd.DataFrame(dict_dump)
- df_dump.to_excel("%s/../data/dumplicate/%s_重复数据.xlsx"%(os.path.dirname(__file__),getCurrent_date("%Y-%m-%d_%H%M%S")))
- def filterRow(list_row,column,list_not_keywords):
- list_result = []
- for row in list_row:
- _product = row.get(column,"")
- sub_docs_json = row.get("sub_docs_json","")
- doctitle = row.get("doctitle","")
- tenderee = row.get("tenderee","")
- if tenderee!="":
- continue
- nlp_enterprise = row.get("nlp_enterprise")
- nlp_enterprise_attachment = row.get("nlp_enterprise_attachment")
- list_nlp_enterprise = []
- list_enterprise_attachment = []
- if nlp_enterprise is not None:
- list_nlp_enterprise = json.loads(nlp_enterprise)
- if nlp_enterprise_attachment is not None:
- list_enterprise_attachment = json.loads(nlp_enterprise_attachment)
- if max(len(list_nlp_enterprise),len(list_enterprise_attachment))==1:
- list_result.append(row)
- # if re.search("设计",sub_docs_json) is not None:
- # if re.search("装修",str(doctitle)+str(sub_docs_json)) is None:
- # list_result.append(row)
- # else:
- # print("===",_product)
- # if re.search("|".join([re.escape(i) for i in list_not_keywrods]),_product) is not None:
- # continue
- # list_result.append(row)
- # if row.get("关键词",1)==row.get("招标单位",2) or row.get("关键词",2)==row.get("中标单位",3):
- # list_result.append(row)
- # doctitle = row.get("doctitle")
- # doctextcon = row.get("doctextcon")
- # if len(re.sub('\s','',doctextcon))==len(doctitle)+4:
- # list_result.append(row)
- # tenderee_phone = row.get("tenderee_phone","")
- # if len(tenderee_phone)==11:
- # list_result.append(row)
- return list_result
- dict_sptype = {"2":"审批信息",
- "4":"审批结果",
- "8":"核准公示",
- "16":"核准结果",
- "32":"备案公示",
- "64":"备案结果",
- "128":"推介项目",
- "256":"推介结果",
- "512":"项目撤销",
- "1024":"筹备阶段"}
- def getKeywordByFile():
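- # Reads sheet 2 of the review spreadsheet and returns the title/web_source
- # pairs whose check result is "接口错" (interface error), for re-querying.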
- filename = "审批标题对比检查结果(20220831).xlsx"
- df = pd.read_excel(filename,sheet_name=1)
- list_data = []
- for _title,_no,_type in zip(df["标题"],df["编号"],df["检查结果"]):
- if _type not in ["接口错"]:
- continue
- _dict = {"title":_title,
- "web_source":_no}
- list_data.append(_dict)
- return list_data
- def exportDocument_by_pagetime():
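- # Ad-hoc export driver: builds OTS queries from the keyword/channel/date
- # filters below (most alternatives are kept commented out for reuse), fetches
- # the matching documents and writes them to an Excel file.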
- ots_client = getConnect_ots()
- ots_capacity = getConnect_capacity()
- # columns = ["doctitle","docchannel","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"]
- # columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"]
- # columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_name"]
- columns = ["doctitle","docchannel","nlp_enterprise","nlp_enterprise_attachment","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end","detail_link"]
- # columns = ["docchannel","docid","project_name","product","doctitle","page_time","province","city","time_get_file_end","time_bidclose","project_code","sub_docs_json","tenderee","info_type","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","detail_link","bidway","crtime","extract_count","products"]
- # columns = ["page_time","doctitle","crtime","web_source_no","web_source_name","detail_link","original_docchannel","uuid","docid"]
- # columns = ["time_bidclose","time_bidopen","time_bidstart","time_commencement","time_completion","time_earnest_money_end","time_earnest_money_start","time_get_file_end","time_get_file_start","time_publicity_end","time_publicity_start","time_registration_end","time_registration_start"]
- dict_channel = getDict_docchannel()
- # columns = ["doctitle","dochtmlcon","page_time","web_source_no","web_source_name","sub_docs_json"]
- # columns = ["tenderee","tenderee_contact","tenderee_phone"]
- # columns = ["extract_json","original_id","crtime","province","city","district","doctitle","web_source_no","web_source_name","product","page_time","industry","info_type","project_code","project_name","approval_items","approval_result","approver","construct_company","construction_scale","declare_company","doc_num","evaluation_agency","legal_person","moneysource","pro_type","project_addr","properties","time_commencement","time_completion","time_declare","total_tenderee_money","year_limit"]
- # columns = ["extract_json","doctitle","status","page_time","docchannel"]
- list_query = []
- str_keyword = '''
- 博物馆 、文物
- '''
- list_keyword = splitIntoList(str_keyword,r"[\s\n、,,|]")
- str_con_keyword = '''
- 博物馆 、文物
- '''
- con_keyword = splitIntoList(str_con_keyword,r'[\s\n、,,|]')
- print(con_keyword)
- should_q_win = []
- for _keyword in list_keyword:
- should_q_win.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_keyword)))
- q_win = BoolQuery(should_queries=should_q_win)
- str_not_keyword = '''
- 物业、保洁、家具、装修、维修、修缮、车辆、消防、广告、印刷、安防、防雷
- 地区:全国
- '''
- list_not_key = splitIntoList(str_not_keyword,r"[\s\n、,,]")
- tenderee_keyword = "医院、大学、高校、高中"
- list_t_key = splitIntoList(tenderee_keyword,r"[\s\n、,,]")
- should_q_system = [TermQuery("procurement_system","企业采购系统"),
- TermQuery("procurement_system","部队采购系统")]
- q_system = BoolQuery(should_queries=should_q_system)
- log(str(list_keyword))
- s_province = "北京,天津,深圳,上海,浙江,江苏,安徽"
- list_province = splitIntoList(s_province,r"[,,\s]")
- st = "环境监测中心、环境监测总站、环保局、水务局、水利局"
- list_tenderee = splitIntoList(st,r"、|\s")
- # list_title = getKeywordByFile()
- #
- # for _d in list_title:
- # _title = _d["title"]
- # web_source = _d["web_source"]
- # bool_query = BoolQuery(must_queries=[
- # generateBoolShouldQuery(["page_title"],[_title],MatchPhraseQuery),
- # TermQuery("web_source_no",web_source)
- # ])
- # list_query.append({"query":bool_query})
- for _keyword in list_keyword:
- bool_query = BoolQuery(must_queries=[
- # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_keyword)),
- # generateBoolShouldQuery(["doctitle"],list_keyword,MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle"],["院","交通","学"],MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle"],["智慧"],MatchPhraseQuery),
- # ExistsQuery("tenderee"),
- # generateBoolShouldQuery(["doctitle"],list_keyword,MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["学校","大学","中学","小学"],MatchPhraseQuery),
- # generateBoolShouldQuery(["web_source_name"],list_keyword,TermQuery),
- # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],list_keyword,MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],con_keyword,MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["服务期","服务时间","合同期限","服务范围","质保期","履行期限","履约期限","交货期"],MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["雪茄柜"],MatchPhraseQuery),
- # BoolQuery(should_queries=[
- # generateBoolShouldQuery(["doctitle"],["公告","公示","招标","中标","采购","工程","项目","询价","施工","比价","服务","监理","设计"],MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["项目名称","招标人","中标人","项目编号","采购组织机构","采购人","招标范围","投标保证金","报价地址","询价通知书"],MatchPhraseQuery)
- # ]),
- # generateBoolShouldQuery(["doctitle"],["中标"],MatchPhraseQuery),
- # generateBoolShouldQuery(["docid"],list_keyword,TermQuery),
- # q_win,
- # should_q,
- # generateBoolShouldQuery(["tenderee"],[company],TermQuery),
- # generateBoolShouldQuery(["province"],["安徽","江苏"],TermQuery),
- generateBoolShouldQuery(["docchannel"],[52,101,118,119,120,121,122],TermQuery),
- # generateBoolShouldQuery(["docchannel"],[52],TermQuery),
- # generateBoolShouldQuery(["docchannel"],[101,118,119,120,121,122],TermQuery),
- # generateBoolShouldQuery(["docchannel"],[302],TermQuery),
- # generateBoolShouldQuery(["docchannel"],[101,119,120,121,122],TermQuery),
- # generateBoolShouldQuery(["docchannel"],[51,52,101,102,103,104,105,114,118,119,120],TermQuery),
- # generateBoolShouldQuery(["docchannel"],[102,52,101,114,119,120],TermQuery),
- RangeQuery("page_time","2024-01-01","2025-12-12",True,False),
- # NestedQuery("page_attachments",WildcardQuery("page_attachments.fileMd5","*")),
- # TermQuery("web_source_name","专项债券信息网"),
- # generateBoolShouldQuery(["city"],["西安","渭南"],TermQuery),
- # TermQuery("info_type","物业管理"),
- # RangeQuery("crtime","2024-07-21 00:00:00","2024-06-21 13:00:00"),
- # TermQuery("save",0),
- RangeQuery("status",201,301,True,True),
- # RangeQuery("crtime","2024-01-01","2025-01-01",True,False),
- # RangeQuery("page_time",range_from="2024-08-01",range_to="2025-01-01"),
- # BoolQuery(should_queries=[TermQuery("page_time","2022-09-15"),
- # TermQuery("page_time","2022-10-20"),
- # TermQuery("page_time","2022-10-31")])
- # TermQuery("page_time","2025-01-07"),
- # RangeQuery("crtime","2025-05-19","2025-05-20"),
- # TermQuery("docid",237163857),
- # RangeQuery("tenderee","","1"),
- # WildcardQuery("tenderee","*雅居乐*"),
- # RangeQuery("crtime","2023-07-22 00:00:00"),
- # BoolQuery(should_queries=[NestedQuery("products",RangeQuery("products.unit_price",1)),
- # NestedQuery("products",RangeQuery("products.total_price",1)),])
- # NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_bid_price",1000000)),
- # NestedQuery("page_attachments",TermQuery("page_attachments.fileMd5","92775529171409a32513f134a61d73c8")),
- TermQuery("province","广东"),
- # TermQuery("city","上海"),
- # generateBoolShouldQuery(["tenderee"],list_tenderee,WildcardQuery),
- # generateBoolShouldQuery(["tenderee"],["应急管理局","城市管理局","大数据局","政务服务管理局","消防局"],WildcardQuery),
- WildcardQuery("tenderee","*中国电信*"),
- # BoolQuery(should_queries=[WildcardQuery("tenderee","*医院*"),
- # WildcardQuery("tenderee","*学校*")])
- # BoolQuery(should_queries=[
- # # NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.bidding_budget",100000000)),
- # # NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_bid_price",100000000)),
- # # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","奇安信网神信息技术(北京)股份有限公司")),
- # # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_keyword)),
- # ])
- # TermQuery("procurement_system","公安系统"),
- # generateBoolShouldQuery(["province"],["重庆"],WildcardQuery),
- # generateBoolShouldQuery(["tenderee"],list_t_key,WildcardQuery)
- # generateBoolShouldQuery(["docchannel"],[101,118,119],TermQuery),
- ],
- # should_queries=[NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*乐禾*")),
- # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*彩食鲜*")),
- # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*望家欢*")),
- # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*乐味*")),
- # ],
- # must_not_queries=[
- # # TermQuery("exist_table",1),
- # # WildcardQuery("tenderee","*"),
- # # TermQuery("attachment_extract_status",1),
- # # generateBoolShouldQuery(["tenderee"],["银行","集团","公司"],WildcardQuery),
- # # generateBoolShouldQuery(["doctitle"],list_not_key,MatchPhraseQuery),
- # # generateBoolShouldQuery(["province"],["湖南","广西","广东"],MatchPhraseQuery),
- # NestedQuery("sub_docs_json",ExistsQuery("sub_docs_json.win_tenderer"))
- # # q_system,
- # ]
- )
- list_query.append({"query":bool_query,"limit":10000})
- break
- # list_row = getDocument(list_query,["docid","service_time"],"document","document_index")
- # list_query = []
- # for row in list_row:
- # list_query.append({"query":TermQuery("docid",row["docid"]),"limit":1})
- list_row = getDocument(list_query,columns,"document","document_index")
- # list_row = getDocument(list_query,columns,"document_product","document_product_index")
- # list_row = getDocument(list_query,columns,"t_shen_pi_xiang_mu","t_shen_pi_xiang_mu_index")
- def judge_save(row,result_queue):
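- # For a deduplicated document, fetches its best_docid counterpart and, when
- # both share the same web_source_no, queues a side-by-side record (titles,
- # links, fingerprints) so same-source duplicates can be audited.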
- docid = row.get("docid")
- best_docid = row.get("best_docid")
- if best_docid is not None and best_docid!="":
- best_docid = int(best_docid)
- if best_docid == int(docid):
- return
- consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",best_docid%500+1),("docid",best_docid)],columns_to_get=["web_source_no","page_time","doctitle","crtime","detail_link","fingerprint"])
- _dict = getRow_ots_primary(return_row)
- if _dict is not None:
- if _dict.get("web_source_no")==row.get("web_source_no"):
- new_dict = {"dup_docid":row.get("docid"),
- "dup_page_time":row.get("page_time"),
- "dup_doctitle":row.get("doctitle"),
- "dup_web_source_no":row.get("web_source_no"),
- "dup_crtime":row.get("crtime"),
- "dup_detail_link":row.get("detail_link"),
- "dup_fingerprint":row.get("fingerprint"),
- "best_docid":_dict.get("docid"),
- "best_page_time":_dict.get("page_time"),
- "best_doctitle":_dict.get("doctitle"),
- "best_web_source_no":_dict.get("web_source_no"),
- "best_crtime":_dict.get("crtime"),
- "best_detail_link":_dict.get("detail_link"),
- "best_fingerprint":_dict.get("fingerprint"),
- "detail_link_same":row.get("detail_link")==_dict.get("detail_link"),
- "fingerprint_same":row.get("fingerprint")==_dict.get("fingerprint"),
- }
- result_queue.put(new_dict)
- from export.html2text import html2text_with_tablehtml
- def _judge_service_time(row,result_queue):
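- # Re-extracts the service period from the raw html with an LLM (doubao) and
- # queues a field-by-field comparison against the rule-based service_time
- # extraction; documents longer than 20000 characters are skipped.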
- docid = int(row.get("docid"))
- partition_key = docid%500+1
- consumed, return_row, next_token = ots_capacity.get_row("document",[("partitionkey",partition_key),("docid",docid)],columns_to_get=["dochtmlcon"])
- _dict = getRow_ots_primary(return_row)
- if _dict is not None:
- row_service_time = json.loads(row.get("service_time","{}"))
- _html = _dict.get("dochtmlcon","")
- _text = html2text_with_tablehtml(_html)
- if len(_text)>20000:
- return
- _prompt = '''
- 请从以下公告中提取服务期限,其中服务开始时间和服务结束时间是yyyy-mm-dd的格式,服务天数是数字天数,没有则给""
- service_start 服务开始时间
- service_end 服务结束时间
- service_days 服务天数
- 返回json格式{"service_end":"","service_start":"","service_days":""}
- '''
- _result = chat_doubao(_prompt+_text,model_name='ep-20250314164242-jd62g')
- _json = get_json_from_text(_result)
- if _json is not None:
- try:
- _dict = json.loads(_json)
- new_dict = {
- "docid":row.get("docid"),
- "service_start_extract":row_service_time.get("service_start",""),
- "service_end_extract":row_service_time.get("service_end",""),
- "service_days_extract":row_service_time.get("service_days",""),
- "service_start_ai":_dict.get("service_start",""),
- "service_end_ai":_dict.get("service_end",""),
- "service_days_ai":_dict.get("service_days",""),
- "extract_equal_ai":row_service_time.get("service_start","")[:7]==_dict.get("service_start","")[:7] and row_service_time.get("service_end","")[:7]==_dict.get("service_end","")[:7] and str(row_service_time.get("service_days",""))==str(_dict.get("service_days","")),
- }
- result_queue.put(new_dict)
- except:
- _dict = {}
- # task_queue = Queue()
- # for row in list_row:
- # task_queue.put(row)
- # result_queue = Queue()
- # mt = MultiThreadHandler(task_queue,_judge_service_time,result_queue,thread_count=10)
- # mt.run()
- # new_rows = []
- # while 1:
- # try:
- # data = result_queue.get(timeout=1)
- # new_rows.append(data)
- # except:
- # break
- # list_query = []
- #
- # for _row in list_row:
- # _uuid = uuid4().hex
- # page_attachments = json.loads(_row.get("page_attachments"))
- # l_s = []
- # for _at in page_attachments:
- # l_s.append(NestedQuery("page_attachments",TermQuery("page_attachments.fileMd5",_at.get("fileMd5"))))
- # list_query.append({"query":BoolQuery(should_queries=l_s),"limit":500,"keyword":_uuid})
- # list_row = getDocument(list_query,columns,"document","document_index")
- df_data = {}
- set_line = set()
- # # list_row = filterRow(list_row,"doctitle",list_not_key)
- # log("get document %d rows"%len(list_row))
- # # getRowDataWithKey(df_data,list_row,columns)
- getRowData(df_data,list_row,set_line,list_keyword,dict_channel,True)
- # # getRowData_shenpi(df_data,list_row,set_line,list_keyword,dict_channel,True)
- # # getRowData_sp1(df_data,list_row,set_line,list_keyword,dict_sptype,True)
- # fixContactPerson(df_data,list_df_columns,get_legal_person=False)
- df1 = pd.DataFrame(df_data)
- df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
- # print("get document %d rows"%len(new_rows))
- # df1 = pd.DataFrame(new_rows)
- # df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')))
- # keys = df_data.keys()
- # print("keys",keys)
- # dict_company = {}
- # set_dup_keys = set()
- # set_docid = set()
- # for _i in range(len(df_data[list(keys)[0]])):
- # if df_data["关键词"][_i]==df_data["招标单位"][_i] or df_data["关键词"][_i]==df_data["中标单位"][_i]:
- # company_name = df_data["关键词"][_i]
- # if company_name not in dict_company:
- # dict_company[company_name] = {"企业名称":company_name,"招标":[],"中标":[]}
- # if df_data["关键词"][_i]==df_data["招标单位"][_i]:
- # if str(df_data["招标金额"][_i])!="nan":
- # _key = "%s-%s"%(company_name,str(df_data["招标金额"][_i]))
- # if _key not in set_dup_keys:
- # set_docid.add(df_data["docid"][_i])
- # dict_company[company_name]["招标"].append({"标题":df_data["公告标题"][_i],
- # "招标方式":df_data["招标方式"][_i],
- # "招标单位":df_data["招标单位"][_i],
- # "招标金额":df_data["招标金额"][_i]})
- # set_dup_keys.add(_key)
- # if df_data["关键词"][_i]==df_data["中标单位"][_i]:
- # if str(df_data["中标金额"][_i])!="nan":
- # _key = "%s-%s"%(str(df_data["中标单位"][_i]),str(df_data["中标金额"][_i]))
- # if _key not in set_dup_keys:
- # set_docid.add(df_data["docid"][_i])
- # dict_company[company_name]["中标"].append({"标题1":df_data["公告标题"][_i],
- # "招标方式1":df_data["招标方式"][_i],
- # "中标单位1":df_data["中标单位"][_i],
- # "中标金额1":df_data["中标金额"][_i]})
- # set_dup_keys.add(_key)
- # df_keys = ["企业名称","标题","招标方式","招标单位","招标金额","标题1","招标方式1","中标单位1","中标金额1"]
- # df_da = {}
- # for k in df_keys:
- # df_da[k] = []
- # for k,v in dict_company.items():
- # list_zhaobiao = v["招标"]
- # list_zhongbiao = v["中标"]
- # _nums = max(min(len(list_zhaobiao),5),min(len(list_zhongbiao),5))
- # for i in range(_nums):
- # df_da["企业名称"].append(k)
- # if i>=len(list_zhaobiao):
- # df_da["标题"].append("")
- # df_da["招标方式"].append("")
- # df_da["招标单位"].append("")
- # df_da["招标金额"].append("")
- # else:
- # df_da["标题"].append(list_zhaobiao[i]["标题"])
- # df_da["招标方式"].append(list_zhaobiao[i]["招标方式"])
- # df_da["招标单位"].append(list_zhaobiao[i]["招标单位"])
- # df_da["招标金额"].append(list_zhaobiao[i]["招标金额"])
- #
- # if i>=len(list_zhongbiao):
- # df_da["标题1"].append("")
- # df_da["招标方式1"].append("")
- # df_da["中标单位1"].append("")
- # df_da["中标金额1"].append("")
- # else:
- # df_da["标题1"].append(list_zhongbiao[i]["标题1"])
- # df_da["招标方式1"].append(list_zhongbiao[i]["招标方式1"])
- # df_da["中标单位1"].append(list_zhongbiao[i]["中标单位1"])
- # df_da["中标金额1"].append(list_zhongbiao[i]["中标金额1"])
- # df2 = pd.DataFrame(df_da)
- # df2.to_excel("tmp333.xlsx",columns=df_keys)
- #
- # df_3 = {}
- # for k in keys:
- # df_3[k] = []
- # for _i in range(len(df_data[list(keys)[0]])):
- # docid = df_data["docid"][_i]
- # if docid in set_docid:
- # for k in keys:
- # df_3[k].append(df_data[k][_i])
- # df3 = pd.DataFrame(df_3)
- # df3.to_excel("tmp_333_mx.xlsx",columns=keys)
- # fixContactPerson(df_data,list_df_columns)
- #
- def findProjects():
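- # For annotated projects that lack a win price, searches for a matching result
- # announcement within 180 days of the tender notice and backfills the winner,
- # price and docid.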
- df = pd.read_excel("两广地区中标时间为空标注_预匹配1.xlsx",0)
- list_items = []
- for docids,project_code,project_name,tenderee,zhao_biao_page_time in zip(df["docids"],df["project_code"],df["project_name"],df["tenderee"],df["zhao_biao_page_time"]):
- if not isinstance(project_code,(str)):
- project_code = "$$$"
- if not isinstance(project_name,(str)):
- project_name = "$$$"
- if not isinstance(tenderee,(str)):
- tenderee = ""
- _dict = {"docids":docids,
- "project_code":project_code,
- "project_name":project_name,
- "tenderee":tenderee,
- "zhao_biao_page_time":zhao_biao_page_time.strftime("%Y-%m-%d"),
- "end_time":timeAdd(zhao_biao_page_time.strftime("%Y-%m-%d"),180)}
- list_items.append(_dict)
- task_queue = queue.Queue()
- for item in list_items:
- task_queue.put(item)
- def _handle(item,result_queue,ots_client):
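- # Three-pass fallback search: project_code OR project_name, then project_code
- # alone, then project_name alone, always excluding the already-known docids;
- # stops at the first row with a matching tenderee and a positive
- # win_bid_price, otherwise records the candidate docids in maybe_docids.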
- docids = item.get("docids")
- list_s_n = []
- for docid in re.split(",",str(docids)):
- list_s_n.append(TermQuery("docid",docid))
- query_not = BoolQuery(should_queries=list_s_n)
- # bool_query =BoolQuery(must_queries=[query_not])
- # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- # SearchQuery(bool_query,limit=50,get_total_count=True),
- # ColumnsToGet(["sub_docs_json"],ColumnReturnType.SPECIFIED))
- # if total_count>0:
- # dict_rows = getRow_ots(rows)
- # _find = False
- # for _row in dict_rows:
- # sub_docs_json = _row.get("sub_docs_json",'[]')
- # sub_docs = json.loads(sub_docs_json)
- # for _doc in sub_docs:
- # if "bidding_budget" in _doc and _doc.get("bidding_budget",0)>0:
- # item["new_bidding_budget"] = _doc.get("bidding_budget",0)
- # _find = True
- # break
- # if _find:
- # break
- #
- # return
- _find = False  # not found yet; the fallback queries below only run while this stays False
- bool_query =BoolQuery(must_queries=[
- generateBoolShouldQuery(["doctitle",'doctextcon','attachmenttextcon'],[item.get("project_code","$$$$"),item.get("project_name","$$$$")],MatchPhraseQuery),
- generateBoolShouldQuery(["docchannel"],[101,119,120],TermQuery),
- RangeQuery("status",151,301),
- NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*")),
- RangeQuery("page_time",item.get("zhao_biao_page_time"),item.get("end_time"))
- ],must_not_queries=[query_not])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,limit=50,get_total_count=True),
- ColumnsToGet(["doctitle","tenderee","sub_docs_json"],ColumnReturnType.SPECIFIED))
- if total_count>0:
- dict_rows = getRow_ots(rows)
- str_docid = ""
- for _row in dict_rows:
- str_docid+="%d,"%_row.get("docid")
- sub_docs_json = _row.get("sub_docs_json",'[]')
- sub_docs = json.loads(sub_docs_json)
- if item.get("tenderee","--")!=_row.get("tenderee","-#"):
- continue
- for _doc in sub_docs:
- if "win_bid_price" in _doc and _doc.get("win_bid_price",0)>0:
- item["new_win_bid_price"] = _doc.get("win_bid_price")
- item["new_win_tenderer"] = _doc.get("win_tenderer")
- item["new_finded_docid"] = _row.get("docid")
- _find = True
- break
- if _find:
- return
- item["maybe_docids"] = str_docid
- bool_query =BoolQuery(must_queries=[
- generateBoolShouldQuery(["doctitle",'doctextcon','attachmenttextcon'],[item.get("project_code","$$$$")],MatchPhraseQuery),
- generateBoolShouldQuery(["docchannel"],[101,119,120],TermQuery),
- RangeQuery("status",151,301),
- NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*")),
- RangeQuery("page_time",item.get("zhao_biao_page_time"),item.get("end_time"))
- ],must_not_queries=[query_not])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,limit=50,get_total_count=True),
- ColumnsToGet(["doctitle","tenderee","sub_docs_json"],ColumnReturnType.SPECIFIED))
- if total_count>0:
- dict_rows = getRow_ots(rows)
- str_docid = ""
- for _row in dict_rows:
- str_docid+="%d,"%_row.get("docid")
- sub_docs_json = _row.get("sub_docs_json",'[]')
- sub_docs = json.loads(sub_docs_json)
- if item.get("tenderee","--")!=_row.get("tenderee","-#"):
- continue
- for _doc in sub_docs:
- if "win_bid_price" in _doc and _doc.get("win_bid_price",0)>0:
- item["new_win_bid_price"] = _doc.get("win_bid_price")
- item["new_win_tenderer"] = _doc.get("win_tenderer")
- item["new_finded_docid"] = _row.get("docid")
- _find = True
- break
- if _find:
- return
- item["maybe_docids"] = str_docid
- bool_query =BoolQuery(must_queries=[
- generateBoolShouldQuery(["doctitle",'doctextcon','attachmenttextcon'],[item.get("project_name","$$$$")],MatchPhraseQuery),
- generateBoolShouldQuery(["docchannel"],[101,119,120],TermQuery),
- RangeQuery("status",151,301),
- NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*")),
- RangeQuery("page_time",item.get("zhao_biao_page_time"),item.get("end_time"))
- ],must_not_queries=[query_not])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,limit=50,get_total_count=True),
- ColumnsToGet(["doctitle"],ColumnReturnType.SPECIFIED))
- if total_count>0:
- dict_rows = getRow_ots(rows)
- str_docid = ""
- for _row in dict_rows:
- str_docid+="%d,"%_row.get("docid")
- sub_docs_json = _row.get("sub_docs_json",'[]')
- sub_docs = json.loads(sub_docs_json)
- if item.get("tenderee","--")!=_row.get("tenderee","-#"):
- continue
- for _doc in sub_docs:
- if "win_bid_price" in _doc and _doc.get("win_bid_price",0)>0:
- item["new_win_bid_price"] = _doc.get("win_bid_price")
- item["new_win_tenderer"] = _doc.get("win_tenderer")
- item["new_finded_docid"] = _row.get("docid")
- _find = True
- break
- if _find:
- return
- item["maybe_docids"] = str_docid
- return
- mt = MultiThreadHandler(task_queue,_handle,None,30,ots_client=getConnect_ots())
- mt.run()
- df_data = {"docids":[],
- "project_code":[],
- "project_name":[],
- "maybe_docids":[],
- "new_bidding_budget":[],
- "new_win_bid_price":[],
- "new_win_tenderer":[],
- "new_finded_docid":[]}
- keys = df_data.keys()
- for item in list_items:
- for k in keys:
- df_data[k].append(item.get(k))
- df2 = pd.DataFrame(df_data)
- df2.to_excel("两广补充数据.xlsx")
- def attachCompanyContact():
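- # Fills in missing contact person/phone columns of the exported spreadsheets
- # by looking the tenderee/winner/agency companies up in the enterprise table.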
- files = ["../data/2021-03-17_四川_关键词导出.csv",
- "../data/2021-03-17_安徽_关键词导出.csv",
- "../data/2021-03-17_江西_关键词导出.csv",
- "../data/2021-03-17_湖南_关键词导出.csv"]
- files = ["../data/20210609(最新).xlsx"]
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- set_enter = set()
- for file in files:
- df = pd.read_excel(file)
- columns = ["招标单位","中标单位","代理单位"]
- for _c in columns:
- for item in df[_c]:
- if isinstance(item,str):
- item = item.strip()
- if item!="":
- set_enter.add(item)
- dict_enter = getDictEnterprise(list(set_enter))
- for file in files:
- task_queue = queue.Queue()
- df = pd.read_excel(file)  # modern pandas read_excel takes no encoding argument
- keys = df.keys()[1:]
- list_item = []
- for row in df.itertuples():
- _dict = {}
- for _key in keys:
- _dict[_key] = getattr(row, _key, '')
- if str(_dict["招标联系人"]) in ("","nan") or str(_dict["招标联系人电话"]) in ("","nan"):
- contact_person,mobile = getOneContact(dict_enter.get(_dict["招标单位"],{}).get("contacts","[]"))
- if contact_person!="":
- _dict["招标联系人"] = contact_person
- _dict["招标联系人电话"] = mobile
- if str(_dict["中标联系人"]) in ("","nan") or str(_dict["中标联系人电话"]) in ("","nan"):
- contact_person,mobile = getOneContact(dict_enter.get(_dict["中标单位"],{}).get("contacts","[]"))
- if contact_person!="":
- _dict["中标联系人"] = contact_person
- _dict["中标联系人电话"] = mobile
- if str(_dict["代理联系人"]) in ("","nan") or str(_dict["代理联系人电话"]) in ("","nan"):
- contact_person,mobile = getOneContact(dict_enter.get(_dict["代理单位"],{}).get("contacts","[]"))
- if contact_person!="":
- _dict["代理联系人"] = contact_person
- _dict["代理联系人电话"] = mobile
- list_item.append(_dict)
- for item in list_item:
- task_queue.put(item)
- df_data = {}
- for _k in keys:
- df_data[_k] = []
- for item in list_item:
- for _k in keys:
- df_data[_k].append(getLegal_str(item.get(_k,"-")))
- df1 = pd.DataFrame(df_data)
- df1.to_excel("%s_attach.xlsx"%file,columns=keys)
- def dumpWebSourceNo():
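- # Copies the web-source code/name mapping from the Oracle source table into
- # the test MySQL webSource table, row by row.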
- conn_oracle = getConnection_oracle()
- cursor_oracle = conn_oracle.cursor()
- sql = " select source_encode,source_name from bxkc.T_WEBSOURCENUM_INFO "
- cursor_oracle.execute(sql)
- rows = cursor_oracle.fetchall()
- conn_mysql = getConnection_testmysql()
- cursor_mysql = conn_mysql.cursor()
- for row in rows:
- # parameterised so quotes in source names cannot break the statement
- sql = " insert into webSource(web_source_no,web_source_name) values(%s,%s) "
- print(sql,row[0],row[1])
- cursor_mysql.execute(sql,(row[0],row[1]))
- conn_mysql.commit()
- def exportNzj():
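- # Exports the designed_project (拟在建, proposed-construction) rows that carry
- # docids to Excel, paging through the index 100 rows at a time.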
- # filename = "../data/重复公告.xlsx"
- # df = pd.read_excel(filename)
- ots_client = getConnect_ots()
- columns = ["contacts","covered_area","follows","docids","page_time","progress","project_description","project_follow","project_code","project_name","project_type"]
- def getData(df_data,rows,set_line):
- list_data = getRow_ots(rows)
- for row in list_data:
- item = {}
- _dict = row
- set_dict_item(item,"docids",_dict.get("docids",""))
- set_dict_item(item,"contacts",_dict.get("contacts",""))
- set_dict_item(item,"covered_area",_dict.get("covered_area",""))
- set_dict_item(item,"follows",_dict.get("follows",""))
- set_dict_item(item,"project_type",_dict.get("project_type",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item(item,"page_time",_dict.get("page_time",""))
- set_dict_item(item,"progress",_dict.get("progress",""))
- set_dict_item(item,"project_description",_dict.get("project_description",""))
- set_dict_item(item,"project_follow",_dict.get("project_follow",""))
- set_dict_item(item,"project_code",_dict.get("project_code",""))
- set_dict_item(item,"project_name",_dict.get("project_name",""))
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- df_data = {}
- bool_query = BoolQuery(must_queries=[ExistsQuery("docids")])
- rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("id",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- set_line = set()
- _count = len(rows)
- getData(df_data,rows,set_line)
- while next_token:
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- getData(df_data,rows,set_line)
- _count += len(rows)
- df1 = pd.DataFrame(df_data)
- df1.to_excel("../data/2021-03-31_拟在建数据导出1.xlsx",columns=list_df_columns)
- def turn_status():
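- # Queues the spreadsheet's docids into the MySQL turn_status table (with the
- # usual partitionkey = docid%500+1) so their status can be flipped downstream.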
- df = pd.read_excel("../data/欧科自然资源5w以上数据.xlsx")
- conn = getConnection_testmysql()
- cursor = conn.cursor()
- for docid in df["公告id"]:
- partitionkey = int(docid)%500+1
- sql = " insert into turn_status(partitionkey,docid) values(%d,%d)"%(partitionkey,docid)
- cursor.execute(sql)
- conn.commit()
- def attachBidding_budget():
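- # Backfills analysis_r2.bidding_budget in MySQL from the project2 OTS table,
- # matched by docid, using pooled connections and 30 worker threads.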
- conn_mysql = getConnection_testmysql()
- cursor = conn_mysql.cursor()
- sql = "select docid from analysis_r2 where bidding_budget=''"
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- cursor.execute(sql)
- rows = cursor.fetchmany(10000)
- while(rows):
- for row in rows:
- task_queue.put(row[0])
- rows = cursor.fetchmany(10000)
- pool_mysql = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_testmysql)
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- def _handle(item,result_queue,pool_mysql,pool_ots):
- ots_client = pool_ots.getConnector()
- bool_query = BoolQuery(must_queries=[TermQuery("docids",item)])
- rows, next_token, total_count, is_all_succeed = ots_client.search("project2", "project2_index",
- SearchQuery(bool_query , limit=1, get_total_count=True),
- ColumnsToGet(["bidding_budget"],return_type=ColumnReturnType.SPECIFIED))
- list_dict = getRow_ots(rows)
- if len(list_dict)>0:
- conn = pool_mysql.getConnector()
- cursor = conn.cursor()
- sql = " update analysis_r2 set bidding_budget='%s' where docid=%d"%(str(list_dict[0].get("bidding_budget","")),item)
- cursor.execute(sql)
- conn.commit()
- pool_mysql.putConnector(conn)
- pool_ots.putConnector(ots_client)
- mt = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_mysql=pool_mysql,pool_ots=pool_ots)
- mt.run()
- def debug_documentMerge():
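- # Sanity check for the merge log: for each merged docid group, verifies that a
- # project2 row containing all of those docids still exists and prints the
- # groups that can no longer be found.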
- conn = getConnection_testmysql()
- cursor = conn.cursor()
- sql = "select merge_docids from project_group_final_log "
- cursor.execute(sql)
- task_queue = queue.Queue()
- for row in cursor.fetchall():
- task_queue.put(row[0])
- print(task_queue.qsize())
- def _handle(item,result_queue,pool_ots):
- ots_client = pool_ots.getConnector()
- list_docids = item.split(",")
- must_q = []
- for _docid in list_docids:
- must_q.append(TermQuery("docids",_docid))
- bool_query = BoolQuery(must_queries=must_q)
- rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
- SearchQuery(bool_query,limit=1,get_total_count=True),
- ColumnsToGet(column_names=["docids"],return_type=ColumnReturnType.SPECIFIED))
- if total_count==0:
- print(item)
- result_queue.put(item)
- pool_ots.putConnector(ots_client)
- result_queue = queue.Queue()
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- mt = MultiThreadHandler(task_queue,_handle,result_queue,30,pool_ots=pool_ots)
- mt.run()
- while(True):
- try:
- item = result_queue.get(timeout=1)  # a plain blocking get would hang once the queue drains
- print(item)
- except Exception as e:
- print(str(e))
- break
- def signDocument():
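- # Tags each row's segword text with "采购失败"/"招标失败" (procurement/tender
- # failed) when either phrase appears, otherwise "无", and writes a _sign copy.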
- filename = "C:\\Users\\Administrator\\Desktop\\中标信息1.xlsx"
- sign_filename = "%s_sign.xlsx"%filename
- df = pd.read_excel(filename)
- df_data = {"sign":[]}
- for item in df["segword"]:
- content = re.sub(r"\s+","",item)
- _find = re.search("(?P<key>采购失败|招标失败)",content)
- if _find is not None:
- df_data["sign"].append(_find.groupdict().get("key"))
- else:
- df_data["sign"].append("无")
- df1 = pd.DataFrame(df_data)
- df1.to_excel(sign_filename)
- def exportWin_tenderer_count():
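- # For each company in the hard-coded list below, counts the announcements
- # since 2020-01-01 where it appears as win_tenderer and exports the totals.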
- ots_client = getConnect_ots()
- str_enter = '''
- 红旗渠建设集团有限公司
- 河南瑞华建筑集团有限公司
- 林州瑞达工程管理有限公司
- 河南鸿盛建筑工程有限公司
- 天一建设发展有限公司
- 河南省大成建设工程有限公司
- 中润昌弘建工集团有限公司
- 河南省中创建筑工程有限公司
- 河南锦达建设有限公司
- 林州宏基建筑工程有限公司
- 河南富世建筑工程有限公司
- 中恒方圆建筑工程有限公司
- 河南华隆建设工程有限公司
- 河南昊锦建设集团有限公司
- 河南新隆建工集团有限公司
- 中城华安建设集团有限公司
- 河南恒通公路桥梁建设有限公司
- 林州二建集团建设有限公司
- 河南华安建设集团有限公司
- 河南裕鸿建筑工程有限公司
- 中商建投建设有限公司
- 河南鑫利恒工程有限公司
- 林州市永盛建筑有限公司
- 林州市顺鑫建筑有限公司
- 中水京林建设有限公司
- 河南宏海建设有限公司
- 河南宏岳建设有限公司
- 河南元筑实业有限公司
- 河南基兆建筑工程有限公司
- 林州金瑞建筑工程有限公司
- 林州建工集团有限公司
- 河南万融建筑工程有限公司
- 林州东风建设有限公司
- 河南鸿泰建筑安装有限公司
- 河南源泰建筑有限公司
- 河南优德建筑工程有限公司
- 安阳鸿盛建设劳务有限公司
- 河南省安阳市安装工程有限责任公司
- 河南港城建设工程有限公司
- 河南天汇建筑工程有限公司
- 河南省惠浦建设发展有限公司
- 林州市建筑工程有限公司
- 河南正天建筑工程有限公司
- 河南颂邦建筑工程有限公司
- 林州市华源建设有限公司
- 河南中投建设有限公司
- 林州华林建筑劳务有限公司
- 河南基祥建设工程有限公司
- 河南文水水电工程有限公司
- 林州兴业建筑工程有限公司
- 河南中州建筑有限公司
- 河南省佳禾园林建设有限公司
- 林州万亚建筑工程有限公司
- 河南林正建设工程有限公司
- 河南鼎兴建设工程有限公司
- 河南平原建工集团有限公司
- 河南林九建设工程有限公司
- 林州市三才建筑工程有限公司
- 安阳建设(集团)有限责任公司
- 盛世恒达建设有限公司
- 河南城洲建设工程有限公司
- 河南国埔建筑工程有限公司
- 中创市政建设发展有限公司
- 河南正祥建筑工程有限公司
- 河南宏九建筑工程有限公司
- 河南金阳建筑工程有限公司
- 河南天容建设工程有限责任公司
- 河南聚宇建筑工程有限公司
- 河南瑞旗建设工程有限公司
- 河南利盛建设工程有限公司
- 林州四海建设有限公司
- 林州市建工城建集团有限公司
- 河南众佑建设工程有限公司
- 河南德诚建设有限公司
- 河南景华建筑工程有限公司
- 河南华江建筑工程有限公司
- 林州永丰建设集团有限公司
- 林州福东建设工程有限公司
- 河南恒森建筑工程有限公司
- 河南朝泓建设工程有限公司
- 河南润京建设有限公司
- 林州市红旗渠公路工程有限公司
- 林州中宇建设工程有限公司
- 河南长锦建设工程有限公司
- 河南汇商建筑工程有限公司
- 河南省豫鹤建设工程有限公司
- 河南江城建筑工程有限公司
- 中海华祥建设发展有限公司
- 河南宁中路桥建筑有限公司
- 河南天河建设工程有限公司
- 林州市路桥建筑工程有限公司
- 河南省中北建设有限公司
- 河南汇亿建筑工程有限公司
- 河南金帝建筑安装有限公司
- 河南省望安建筑工程有限公司
- 泰欣建设有限公司
- 河南筑鑫建筑工程有限公司
- 元熙建设工程有限公司
- 旭隆建设集团有限公司
- 河南省城控建工集团有限公司
- 河南晨丰建筑工程有限公司
- 河南嘉丰建设有限公司
- 林州市合众建筑劳务有限公司
- 河南金瓦刀建筑劳务有限公司
- 河南中实建筑工程有限公司
- 畅通路桥工程建设有限责任公司
- 河南军恒建设有限公司
- 中钊建设集团有限公司
- 河南德宁建设集团有限公司
- 林州兴鸿建筑工程有限公司
- 林州市明泽建筑工程有限公司
- 河南紫光建筑工程有限公司
- 河南誉天建筑工程有限公司
- 林州景丰建筑劳务有限公司
- 河南江瀚建筑劳务有限公司
- 河南弘之昌建筑工程有限公司
- 河南祥泰钻井工程有限公司
- 河南迅阳建筑劳务有限公司
- 河南嘉成建筑工程有限公司
- 河南兴锦建设工程有限公司
- 河南邦坤建设工程有限公司
- 河南锦毅市政工程建筑有限公司
- 河南广益建筑工程有限公司
- 河南创胜建筑工程有限公司
- 河南勤铭建筑工程有限公司
- 河南铭锋建设工程有限公司
- 平源建设有限公司
- 河南隆通建筑工程有限公司
- 河南省基本建设有限公司
- 河南丰茂建筑劳务有限公司
- 河南城安建筑工程有限公司
- 林州市富源建筑劳务有限公司
- 德方建设有限公司
- 河南泰联建筑工程有限公司
- 河南新建投工程有限公司
- 河南省鲁班建工集团有限公司
- 林州方超建筑劳务有限公司
- 林州市采桑建筑劳务输出有限公司
- 河南省仁昱建筑工程有限公司
- 河南鸾林建设工程有限公司
- 宜民建设集团有限公司
- 林州聚兴建筑工程有限公司
- 河南省聚国建筑工程有限公司
- 林州市大东建筑劳务有限公司
- 河南欣东劳务有限公司
- 中建润德景观建筑工程有限公司
- 河南辰宇建设工程有限公司
- 号东建设工程有限公司
- 河南润北建筑工程有限公司
- 河南邦昊建设工程有限公司
- 林州市建设投资有限责任公司
- 林州市太行建设工程有限公司
- 河南峡安建筑工程有限公司
- 河南安疆建筑工程有限公司
- 河南淇河建设工程有限公司
- 河南晶品建设有限公司
- 河南翔固建筑工程有限公司
- 纵横九州路桥建设有限公司
- 河南青林建筑工程有限公司
- 合久建设有限公司
- 河南明昊建筑工程有限公司
- 河南滨河建设工程有限公司
- 河南群腾建筑工程有限公司
- 河南隆亨建筑工程有限公司
- 骏达建设有限公司
- 河南仁安建设工程有限公司
- 河南旻尚园林建筑工程有限公司
- 河南省匡正建设工程有限公司
- 河南金凡建筑工程有限公司
- 河南佰丰建筑工程有限公司
- 德普建设有限公司
- 国润新天地工程技术有限公司
- 中潮建设发展有限公司
- 河南捷正建筑工程有限公司
- 林州百万工匠建筑劳务有限公司
- 河南祥彬建筑工程有限公司
- 河南林祥工程建设有限公司
- 河南唐尧建筑劳务有限公司
- 河南汇祥建设有限公司
- 河南友信建设有限公司
- 林州市鼎昇建筑工程有限公司
- 林州市富兴建筑劳务有限公司
- 林州厚德建筑劳务有限公司
- 河南振亚工程建设有限公司
- 河南英茂建筑工程有限公司
- 河南丰岩建设工程有限公司
- 林州市昌都建筑工程有限公司
- 林州四建建筑工程有限公司
- 林州和兴建筑劳务有限公司
- 林州市鸿升建筑工程有限公司
- 河南润泰建设工程有限公司
- 河南鑫路通建筑劳务有限公司
- 河南信守建筑劳务有限公司
- 林州安达鸿昌建筑劳务有限公司
- 河南意达建设有限公司
- 河南金穗来建筑工程有限公司
- 河南东风建筑工程有限公司
- 河南筑胜建筑劳务有限公司
- 河南民润建筑工程有限公司
- 林州市中锦路桥建设工程有限公司
- 林州一建建筑工程有限公司
- 林州市宏瑞建筑劳务有限公司
- 林州鸿恩建筑劳务有限公司
- 河南晟元建筑工程有限公司
- 中国建筑第六工程局有限公司
- 河南筑泰建筑工程有限公司
- 河南省亚建建筑工程有限公司
- 河南辰弘建筑工程有限公司
- 河南先创建筑工程有限公司
- 林豫建工集团有限公司
- 河南省盛民建筑工程有限公司
- 河南泓发市政工程有限公司
- 河南帝恩建筑劳务有限公司
- 河南天泉建设工程有限公司
- 河南恒升工程建设有限公司
- 林州市浩远电力建筑工程有限公司
- 河南友瑞建筑工程有限公司
- 河南冠州路桥工程有限公司
- 三角鼎建设工程有限公司
- 河南富坤建筑工程有限公司
- 林州市恒源建筑工程有限公司
- 河南广汇建筑工程有限公司
- 河南隆豫建设有限公司
- 林州市九洲工程劳务有限公司
- 林州瑜辉建筑工程有限公司
- 河南福恩建筑工程有限公司
- 河南通盛路桥建设有限公司
- 河南央泰建设工程有限公司
- 林州市红旗渠公路养护工程有限公司
- 林州大兴建设工程有限公司
- 河南锐丰建设工程有限公司
- 林州市中泰建筑劳务有限公司
- 林州成业建筑工程有限公司
- 河南建创建筑工程有限公司
- 河南宏兴建设工程有限公司
- 河南隆鼎建筑工程有限公司
- 林州市天罡建筑劳务有限公司
- 汇聚建设发展有限公司
- 中铁中城工程有限公司
- 河南景天建筑劳务有限公司
- 林州蒙建建设工程有限公司
- 富华建设工程有限公司
- 河南殿轩建筑劳务有限公司
- 河南瑞通建设工程有限公司
- 林州金桥劳务工程有限公司
- 河南省景隆实业有限公司
- 河南升洲建筑工程有限公司
- 河南里程建筑劳务有限公司
- 林州市润景建设工程有限公司
- 河南巨坤建筑工程有限公司
- 河南九牛建设劳务有限公司
- 吉修建设工程有限公司
- 河南图润建筑工程有限公司
- 河南鼎鑫建筑劳务有限公司
- 河南港航建设工程有限公司
- 河南省盛飞建设工程有限公司
- 林州市兴义建筑劳务有限公司
- 河南秉程建筑工程有限公司
- 河南硕亚水电路桥工程有限公司
- 河南科才建筑劳务有限公司
- 河南荣泰建筑安装工程有限公司
- 河南省天丰建筑工程有限公司
- 河南方元建筑工程有限公司
- 恒上建设有限公司
- 河南省德信建筑工程有限公司
- 河南诚宸建设工程有限公司
- 河南置信建筑工程有限公司
- 河南省鑫河建设有限公司
- 河南成兴建设工程有限公司
- 林州中港建筑工程有限公司
- 河南富春建设工程有限公司
- 中科豫资建设发展有限公司
- 河南京都建筑安装有限公司
- 安阳市宇豪爆破工程有限公司
- 河南华特建筑工程有限公司
- 河南颍淮建工有限公司
- 林州市八建工程有限公司
- 河南展辉建筑工程有限公司
- 河南中博建筑有限公司
- 河南方圆建设有限公司
- 河南大鼎建筑工程有限公司
- 林州中天建设有限公司
- 河南久东建筑工程有限公司
- 河南九一建设工程有限公司
- 九州水文建设集团有限公司
- 河南省建安防水防腐工程有限公司
- 中建宏图建设发展有限公司
- 筑宇建设有限公司
- 林州市宏图建设工程有限公司
- 河南林润建设工程有限公司
- 嘉泰建设发展有限公司
- 河南丰茂建筑安装工程有限公司
- 河南万泰建设工程有限公司
- 林州市红旗渠市政工程有限公司
- 林州建总建筑工程有限公司
- 河南聚之祥建设有限公司
- 河南鼎之信建设工程有限公司
- 河南省华瑞建设工程有限公司
- 河南世光电力工程有限公司
- 河南地远建筑工程有限公司
- 河南鑫品建筑工程有限公司
- 河南省东旗建筑工程有限公司
- 润华建设有限公司
- 林州富民建筑劳务有限公司
- 林州市晨诚建筑劳务有限公司
- 河南万胜建设有限公司
- 河南龙磐建筑工程有限公司
- 河南顺昌建筑劳务有限公司
- 林州恒瑞建设工程有限公司
- 河南大成建设劳务有限公司
- 河南大一建筑劳务有限公司
- 河南盛威建筑工程有限公司
- 河南坤之宇建筑工程有限公司
- 众信电力工程有限公司
- 河南昱佛建筑工程有限公司
- 河南淇源建筑工程有限公司
- 林州凤宝建筑安装有限公司
- 河南中发岩土工程有限公司
- 河南中都建设工程有限公司
- 河南祥凯建筑工程有限公司
- 河南乐泰建筑工程有限公司
- 林州宏达建筑劳务有限公司
- 河南华盛建设集团有限公司
- 河南凯通建设工程有限公司
- 国腾路桥工程有限公司
- 中建方达建设工程有限公司
- 河南省天都建设工程有限公司
- 昌隆建设工程有限公司
- 河南洹上村园林绿化工程有限公司
- 河南双锦建设工程有限公司
- 河南子丰市政工程有限公司
- 林州首创建筑工程有限公司
- 河南众鑫建筑工程有限公司
- 河南宁崴建筑工程有限公司
- 林州市航安建筑劳务有限公司
- 林州益成建设工程有限公司
- 林州市昌弘建筑工程有限公司
- 河南正耀建设有限公司
- 河南鑫鹏建设工程有限公司
- 林州恒泰建筑工程有限公司
- 林竣建设有限公司
- 河南朝众建筑工程有限公司
- 林州科鸿建筑工程有限公司
- 东辰建设发展有限公司
- 河南创新新能源科技有限公司
- 河南省永业建筑工程有限公司
- 林州市煜凯建筑工程有限公司
- 宝鼎建设工程有限公司
- 林州市航安建筑工程有限公司
- 河南业展建设工程有限公司
- 河南联竣建筑工程有限公司
- 河南聚超建筑工程有限公司
- 林州远方电力工程有限公司
- 河南蒙寅建筑劳务有限公司
- 方元建筑劳务有限公司
- 龙兴建设工程有限公司
- 河南春谦建设工程有限公司
- 河南正博公路工程有限公司
- 林州市汇鑫安装工程有限公司
- 林州市祥隆劳务有限公司
- 河南胜杰建筑工程有限公司
- 河南恩普建筑工程有限公司
- 河南港津建筑工程有限公司
- 河南昌明建筑工程有限公司
- 中豫城控建设集团有限公司
- 林州晨宇建设工程有限公司
- 河南豫柯建筑工程有限公司
- 河南捷润建筑工程有限公司
- 中方通建设工程有限公司
- 河南多果建筑工程有限公司
- 河南尚伟建筑工程有限公司
- 林州新航程建筑工程有限公司
- 河南金华建筑工程有限公司
- 国云工程技术有限公司
- 河南路威路桥工程有限公司
- 林州中盛建设工程有限公司
- 林州市恒基建设有限公司
- 河南润恒建筑工程有限公司
- 河南华安水利工程有限公司
- 中城易通建设发展有限公司
- 河南浚洲建筑工程有限公司
- 林州市锦晟建筑劳务有限公司
- 河南省北安建筑工程有限公司
- 林州泰岳建设工程有限公司
- 河南联洋建筑工程有限公司
- 河南港大市政建筑工程有限公司
- 林州东盛建筑劳务有限公司
- 河南省天鉴建设工程有限公司
- 河南瑞凝建筑工程有限公司
- 林州市东瑞建筑劳务有限公司
- 河南众达建筑劳务有限公司
- 河南省帝增建筑工程有限公司
- 河南省升灿建筑工程有限公司
- 河南苑景建筑劳务分包有限公司
- 林州众立建设工程有限公司
- 红旺建筑工程有限公司
- 林州市圣兴建筑劳务有限公司
- 林州诚林建筑劳务有限公司
- 林州建工劳务有限公司
- 河南巨业建筑工程有限公司
- 中科华夏建设开发有限公司
- 君晟建筑工程有限公司
- 郑州新动力建筑劳务分包有限公司
- 河南省福德建筑工程有限公司
- 林州源大建筑工程有限公司
- 河南大瑞园林建设有限公司
- 河南秋禾建筑劳务有限公司
- 河南腾翔建筑工程有限公司
- 河南天之华建设工程有限公司
- 河南祥和建筑安装有限公司
- 河南省鼎文建设工程有限公司
- 河南周城建设发展有限公司
- 河南庆泰建筑工程有限公司
- 中科信合建设发展有限公司
- 林州恒隆建设工程有限公司
- 河南省力恒建筑工程有限公司
- 林州市四季青绿化有限责任公司
- 林州市景盛建筑工程有限公司
- 河南建基建设工程有限公司
- 河南宝凯建筑工程有限公司
- 林州市四合建筑劳务有限公司
- 河南和耀建筑工程有限公司
- 林州市凯达建筑劳务有限公司
- 林州市恒信建筑劳务有限公司
- 开翔建设工程有限公司
- 河南省新创达建设工程有限公司
- 林州鑫龙建筑工程有限公司
- 河南省昌博建筑工程有限公司
- 河南君利泰建筑工程有限公司
- 林州杏林建筑工程有限公司
- 河南千禧建设工程有限公司
- 中建诚正建筑工程有限公司
- 河南省聚千建筑工程有限公司
- 林州海之鸿建筑工程有限公司
- 河南振鼎建筑工程有限公司
- 林州方成建筑劳务有限公司
- 河南众众建设工程有限公司
- 林州市万润建筑劳务有限公司
- 启创建设工程有限公司
- 河南子明建筑工程有限公司
- 安阳市兴鼎路桥工程有限公司
- 河南智擎建筑劳务有限公司
- 河南鼎平市政工程有限公司
- 林州宏阳建筑工程有限公司
- 河南豫泰建筑工程有限公司
- 林州市鸿浩建筑劳务有限公司
- 林州市锦华建筑工程有限公司
- 河南瑞锋建设有限公司
- 河南欧信建筑劳务有限公司
- 林州市中兴建筑劳务有限公司
- 林州市大德建设工程有限公司
- 河南华文建设有限公司
- 河南凌焜建筑工程有限公司
- 河南安居建设有限公司
- 林州鲲鹏建筑工程有限公司
- 林州经纬建筑工程有限公司
- 林州祥川建筑工程有限公司
- 林州市鑫淼建筑劳务有限公司
- 河南祥泰路桥有限公司
- 景祥建设工程有限公司
- 河南省兴华建安工程有限公司
- 河南古森建筑劳务有限公司
- 平祥建设工程有限公司
- 河南大博建设工程有限公司
- 河南华普建设工程有限公司
- 河南东邦建设工程有限公司
- 卓冠建设工程有限公司
- 河南品瑞建筑工程有限公司
- 河南宝金建设工程有限公司
- 中城鑫邦建设有限公司
- 河南省鸿运建设工程有限公司
- 林州明奥建筑工程有限公司
- 河南金手指建设工程有限公司
- 林州市弘顺建筑劳务有限公司
- 林州市林海建筑劳务有限公司
- 河南艺兆市政工程有限公司
- 林州誉峰建筑工程有限公司
- 河南卓骏建筑工程有限公司
- 林州众成建筑工程有限公司
- 河南城通市政工程有限公司
- 林州市晋源建筑工程有限公司
- 河南飞越建筑工程有限公司
- 林州鑫泰建筑工程有限公司
- 林州市太行建筑劳务有限公司
- 河南筑丰建设发展有限公司
- 林州一帆建筑劳务有限公司
- 林州宏久建筑工程有限公司
- 林州市盛祥建筑劳务有限公司
- 河南黎润建设工程有限公司
- 林州市永安建筑劳务有限公司
- 河南省长江建设实业有限公司
- 河南腾润建设工程有限公司
- 河南国梁建设工程有限公司
- 河南诚聚建筑工程有限公司
- 河南德邦市政工程有限公司
- 河南安德建设工程有限公司
- 河南森川建筑工程有限公司
- 林州市顺通公路工程有限公司
- 河南领邦建筑工程有限公司
- 河南博兴建设工程有限公司
- 东泽消防工程有限公司
- '''
- list_enter = []
- for _p in re.split(r"\s+",str_enter):
- if _p.strip()=="":
- continue
- list_enter.append({"name":_p.strip()})
- def _handle(item,result_queue,pool_ots):
- ots_client = pool_ots.getConnector()
- try:
- bool_query = BoolQuery(must_queries=[
- NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item["name"]))
- ,RangeQuery("status",201,300,include_lower=True,include_upper=True)
- ,RangeQuery("page_time","2020-01-01")
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=1, get_total_count=True),
- ColumnsToGet(['docid'], ColumnReturnType.SPECIFIED))
- item["total_count"] = total_count
- # bool_query = BoolQuery(must_queries=[
- # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item["name"]))
- # ,RangeQuery("status",201,300,include_lower=True,include_upper=True)
- # ,NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_bid_price",0,1000000,include_upper=True))
- # ])
- #
- # rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- # SearchQuery(bool_query, limit=1, get_total_count=True),
- # ColumnsToGet(['docid'], ColumnReturnType.SPECIFIED))
- # item["l_total_count"] = total_count
- except Exception as e:
- pass
- pool_ots.putConnector(ots_client)
- pool_ots = ConnectorPool(init_num=30,max_num=40,method_init=getConnect_ots)
- task_queue = queue.Queue()
- for item in list_enter:
- task_queue.put(item)
- mt = MultiThreadHandler(task_queue,_handle,None,30,pool_ots=pool_ots)
- mt.run()
- df_data = {}
- for item in list_enter:
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s.xls"%getCurrent_date("%Y-%m-%d_%H%M%S"))
- from bs4 import BeautifulSoup
- def downloadAttach(_url,_path):
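- # Streams a single attachment url to disk; non-200 responses and request
- # errors are only logged, not retried.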
- try:
- result = requests.get(_url,stream=True,timeout=20)
- if result.status_code==200:
- with open(_path,"wb") as f:
- f.write(result.content)
- else:
- log("download failed with code %d of url:%s"%(result.status_code,_url))
- except Exception:
- log("download failed of url:%s"%(_url))
- def extract_pageAttachments(_html):
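- # Scans <a> and <img> tags for links whose text or url carries a known file
- # suffix and returns them as {fileLink, fileTitle} dicts, skipping links that
- # point back to bidizhaobiao.com itself.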
- fileSuffix = [".zip", ".rar", ".tar", ".7z", ".wim", ".docx", ".doc", ".xlsx", ".xls", ".pdf", ".txt", ".hnzf", ".bmp", ".jpg", ".jpeg", ".png", ".tif", ".swf"]
- _soup = BeautifulSoup(_html,"lxml")
- list_a = _soup.find_all("a")
- list_img = _soup.find_all("img")
- page_attachments = []
- for _a in list_a:
- _text =_a.get_text()
- _url = _a.attrs.get("href","")
- if _url.find("http://www.bidizhaobiao.com")>=0:
- continue
- is_attach = False
- for suf in fileSuffix:
- if _text.find(suf)>=0 or _url.find(suf)>=0:
- is_attach = True
- if is_attach:
- page_attachments.append({"fileLink":_url,"fileTitle":_text})
- for _a in list_img:
- _text =_a.get_text()
- _url = _a.attrs.get("src","")
- if _url.find("http://www.bidizhaobiao.com")>=0:
- continue
- is_attach = False
- for suf in fileSuffix:
- if _text.find(suf)>=0 or _url.find(suf)>=0:
- is_attach = True
- if is_attach:
- page_attachments.append({"fileLink":_url,"fileTitle":_text})
- return page_attachments
- def exportDocument_attachment():
- ots_client = getConnect_ots()
- bool_query = BoolQuery(must_queries=[TermQuery("docid",165528701)])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
- columns_to_get=ColumnsToGet(["dochtmlcon"],ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- for _data in list_data:
- extract_pageAttachments(_data["dochtmlcon"])
- def transUUid():
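- # Copies announcement ids (uuid + page_time) for 2021-06-01..2021-08-31 from
- # the Oracle source tables into MySQL fix_document, 10000 rows per batch.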
- conn_oracle = getConnection_oracle()
- cursor = conn_oracle.cursor()
- tables = ['T_ZHAO_BIAO_GONG_GAO','T_ZHONG_BIAO_XIN_XI']
- conn_mysql = getConnection_testmysql()
- cursor_mysql = conn_mysql.cursor()
- for _t in tables:
- sql = " select id,page_time,'%s' from bxkc.%s where page_time>='%s' and page_time<='%s' order by page_time "%(_t,_t,"2021-06-01","2021-08-31")
- print(sql)
- cursor.execute(sql)
- _count = 0
- while(True):
- insert_sql = "insert into fix_document(uuid,page_time,table_name) values"
- rows = cursor.fetchmany(10000)
- if not rows:
- break
- _count += len(rows)
- print(_count)
- for row in rows:
- _uuid = row[0]
- page_time = row[1]
- table_name = row[2]
- insert_sql += "('%s','%s','%s'),"%(_uuid,page_time,table_name)
- insert_sql = insert_sql[:-1]
- cursor_mysql.execute(insert_sql)
- conn_mysql.commit()
- def fix_document():
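- # Pushes the reviewed fix_document_final rows from MySQL back to Oracle,
- # batching 1000 inserts into a single anonymous begin...end block per round.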
- conn_oracle = getConnection_oracle()
- cursor_oracle = conn_oracle.cursor()
- conn_mysql = getConnection_testmysql()
- cursor_mysql = conn_mysql.cursor()
- sql = "select uuid,page_time,table_name from fix_document_final where page_time>='2021-06-24' "
- cursor_mysql.execute(sql)
- _count = 0
- while True:
- rows = cursor_mysql.fetchmany(1000)
- if not rows:
- break
- _count += len(rows)
- print(_count)
- insert_sql = ""
- for row in rows:
- _uuid = row[0]
- page_time = row[1]
- table_name = row[2]
- insert_sql += " insert into BXKC.fix_document_final(id,page_time,TABLENAME) values('%s','%s','%s');"%(_uuid,page_time,table_name)
- insert_sql = "begin %s end;"%(insert_sql)
- cursor_oracle.execute(insert_sql)
- conn_oracle.commit()
- def exportDocument_forRecommen():
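- # For each company in the recommendation csv, resolves the recommended docids
- # per recommendation path and exports the document details to Excel.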
- filename = "../data/推荐 (1).csv"
- df = pd.read_csv(filename,encoding="GBK")
- ots_client = getConnect_ots()
- columns = ["province","city","page_time","doctitle","product"]
- current_date = getCurrent_date("%Y-%m-%d")
- adict_data = []
- _index = 0
- for company,json_docid in zip(df["company"][:10000],df["json_docid"][:10000]):
- _index += 1
- _province = ""
- # consumed, return_row, next_token = ots_client.get_row("enterprise",[("name",company)],columns_to_get=["province"])
- # dict_k = getRow_ots_primary(return_row)
- # _province = dict_k.get("province","")
- print("序号:%d,%s,%s"%(_index,company,_province))
- dict_recommen = json.loads(json_docid)
- for str_way,str_docid in dict_recommen.items():
- should_q = []
- for _docid in str_docid.split(","):
- should_q.append(TermQuery("docid",_docid))
- bool_query = BoolQuery(must_queries=[
- # TermQuery("province",_province)
- # ,RangeQuery("page_time",timeAdd(current_date,-7),current_date,True,True)
- # ,
- BoolQuery(should_queries=should_q)]
- )
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,get_total_count=True,limit=100),
- ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- adict_row = getRow_ots(rows)
- for dict_row in adict_row:
- dict_item = dict()
- set_dict_item(dict_item,"公司名称",company)
- set_dict_item(dict_item,"推荐路径",str_way)
- set_dict_item(dict_item,"公告id",dict_row.get("docid",""))
- set_dict_item(dict_item,"省份",dict_row.get("province",""))
- set_dict_item(dict_item,"城市",dict_row.get("city",""))
- set_dict_item(dict_item,"page_time",dict_row.get("page_time",""))
- set_dict_item(dict_item,"doctitle",dict_row.get("doctitle",""))
- set_dict_item(dict_item,"product",dict_row.get("product",""))
- adict_data.append(dict_item)
- dict_data = {}
- for dict_item in adict_data:
- for k in list_df_columns:
- if k not in dict_data:
- dict_data[k] = []
- dict_data[k].append(dict_item.get(k,""))
- df1 = pd.DataFrame(dict_data)
- df1.to_excel("../data/%s_推荐.xlsx"%getCurrent_date("%Y-%m-%d_%H%M%S"),columns=list_df_columns)
- def exportDocument_by_days(page_time):
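- # Exports every document of the given page_time to Excel; getData below
- # flattens each OTS row (including the sub_docs_json winner fields) into
- # named columns in insertion order.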
- dict_channel = getDict_docchannel()
- ots_client = getConnect_ots()
- filename = "供货贷含[建筑]企业名单.xlsx"
- df = pd.read_excel(filename)
- bool_query = BoolQuery(must_queries=[TermQuery("page_time",page_time),
- # RangeQuery("status",201,301),
- ])
- # columns = ["doctitle","docchannel","product","province","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","original_docchannel","detail_link"]
- columns = ["doctitle","docchannel","product","bidway","moneysource","province","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","original_docchannel","detail_link","page_attachments","service_time"]
- dict_channel = getDict_docchannel()
- def hidePhone(phone):
- if phone is None or phone=="":
- return ""
- return "*"*(len(phone)-4)+phone[-4:]
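- # e.g. hidePhone("13812345678") -> "*******5678"; inputs of 4 characters or
- # fewer pass through unchanged because "*"*(len-4) is empty.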
- def getData(df_data,rows,set_line,list_keyword,set_columns,df_columns):
- list_data = getRow_ots(rows)
- for row in list_data:
- item = {}
- _dict = row
- set_dict_item_columns(set_columns,df_columns,item,"docid",_dict.get("docid",""))
- set_dict_item_columns(set_columns,df_columns,item,"公告标题",_dict.get("doctitle",""))
- # set_dict_item_columns(set_columns,df_columns,item,"公告内容",_dict.get("doctextcon",""))
- set_dict_item_columns(set_columns,df_columns,item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
- # set_dict_item_columns(set_columns,df_columns,item,"关键词",",".join(list(set(re.findall("|".join(list_keyword),_dict.get("doctextcon",""))))))
- set_dict_item_columns(set_columns,df_columns,item,"产品",_dict.get("product",""))
- set_dict_item_columns(set_columns,df_columns,item,"省份",_dict.get("province",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item_columns(set_columns,df_columns,item,"资金来源",_dict.get("moneysource",""))
- set_dict_item_columns(set_columns,df_columns,item,"招标方式",_dict.get("bidway",""))
- set_dict_item_columns(set_columns,df_columns,item,"服务期限",_dict.get("service_time",""))
- set_dict_item_columns(set_columns,df_columns,item,"城市",_dict.get("city",""))
- set_dict_item_columns(set_columns,df_columns,item,"区县",_dict.get("district",""))
- set_dict_item_columns(set_columns,df_columns,item,"发布时间",_dict.get("page_time",""))
- set_dict_item_columns(set_columns,df_columns,item,"创建时间",_dict.get("crtime",""))
- set_dict_item_columns(set_columns,df_columns,item,"行业一级分类",_dict.get("industry",""))
- set_dict_item_columns(set_columns,df_columns,item,"行业二级分类",_dict.get("info_type",""))
- # set_dict_item_columns(set_columns,df_columns,item,"uuid",_dict.get("uuid"))
- # set_dict_item_columns(set_columns,df_columns,item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
- set_dict_item_columns(set_columns,df_columns,item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
- set_dict_item_columns(set_columns,df_columns,item,"原网公告类别",dict_channel.get(_dict.get("original_docchannel",""),""))
- set_dict_item_columns(set_columns,df_columns,item,"status","正常" if _dict.get("status",201) <=300 else "去重")
- set_dict_item_columns(set_columns,df_columns,item,"detail_link",_dict.get("detail_link"))
- set_dict_item_columns(set_columns,df_columns,item,"web_source_no",_dict.get("web_source_no",""))
- set_dict_item_columns(set_columns,df_columns,item,"web_source_name",_dict.get("web_source_name",""))
- set_dict_item_columns(set_columns,df_columns,item,"项目名称",_dict.get("project_name",""))
- set_dict_item_columns(set_columns,df_columns,item,"项目编号",_dict.get("project_code",""))
- set_dict_item_columns(set_columns,df_columns,item,"招标单位",_dict.get("tenderee",""))
- set_dict_item_columns(set_columns,df_columns,item,"招标联系人",_dict.get("tenderee_contact",""))
- set_dict_item_columns(set_columns,df_columns,item,"招标联系人电话",_dict.get("tenderee_phone",""))
- set_dict_item_columns(set_columns,df_columns,item,"代理单位",_dict.get("agency",""))
- set_dict_item_columns(set_columns,df_columns,item,"代理联系人",_dict.get("agency_contact",""))
- set_dict_item_columns(set_columns,df_columns,item,"代理联系人电话",_dict.get("agency_phone",""))
- set_dict_item_columns(set_columns,df_columns,item,"url","http://www.bidizhaobiao.com/info-%d.html"%(_dict.get("docid","")))
- set_dict_item_columns(set_columns,df_columns,item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
- set_dict_item_columns(set_columns,df_columns,item,"截标时间",_dict.get("time_bidclose",""))
- set_dict_item_columns(set_columns,df_columns,item,"page_attachments",_dict.get("page_attachments","[]"))
- sub_docs_json = _dict.get("sub_docs_json")
- set_tenderer = set()
- if sub_docs_json is not None:
- docs = json.loads(sub_docs_json)
- docs.sort(key=lambda x:x.get("win_bid_price",0))
- for _doc in docs:
- if "win_tenderer" in _doc:
- set_dict_item_columns(set_columns,df_columns,item,"中标单位",_doc["win_tenderer"])
- if "second_tenderer" in _doc:
- set_dict_item_columns(set_columns,df_columns,item,"第二候选单位",_doc["second_tenderer"])
- set_tenderer.add(_doc.get("second_tenderer"))
- if "third_tenderer" in _doc:
- set_dict_item_columns(set_columns,df_columns,item,"第三候选单位",_doc["third_tenderer"])
- set_tenderer.add(_doc.get("third_tenderer"))
- if "win_tenderee_manager" in _doc:
- set_dict_item_columns(set_columns,df_columns,item,"中标单位联系人",_doc["win_tenderee_manager"])
- if "win_tenderee_phone" in _doc:
- set_dict_item_columns(set_columns,df_columns,item,"中标单位联系电话",_doc["win_tenderee_phone"])
- if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
- set_dict_item_columns(set_columns,df_columns,item,"中标金额",_doc["win_bid_price"])
- if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
- set_dict_item_columns(set_columns,df_columns,item,"招标金额",_doc["bidding_budget"])
- set_dict_item_columns(set_columns,df_columns,item,"入围供应商",",".join(list(set_tenderer)))
- if "第二候选单位" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"第二候选单位","")
- if "第三候选单位" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"第三候选单位","")
- if "招标金额" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"招标金额","")
- if "中标金额" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"中标金额","")
- if "中标单位" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"中标单位","")
- if "中标单位联系人" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"中标单位联系人","")
- if "中标单位联系电话" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"中标单位联系电话","")
- # if item["中标单位"] not in set_enter:
- # continue
- _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
- # if _line in set_line:
- # continue
- # if item["招标金额"]=="":
- # continue
- # set_line.add(_line)
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- df_data = {}
- set_columns = set()
- df_columns = []
- # for name in df["ent_name_real"]:
- # if isinstance(name,str) and name!="":
- # list_should_q = []
- # # list_should_q.append(MatchPhraseQuery("doctextcon",name))
- # # list_should_q.append(MatchPhraseQuery("attachmenttextcon",name))
- # NestedQuery("sub_docs_json","sub_docs_json.win_tenderer",name)
- # bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2018-01-01"),
- # RangeQuery("status",201,301),
- # # BoolQuery(should_queries=list_should_q),
- # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",name))
- # ])
- #
- # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- # SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True),
- # ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- #
- #
- # while True:
- # getData(df_data,rows,set(),"",set_columns,df_columns)
- # if not next_token:
- # break
- # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- # SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
- # ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- # if len(df_data.keys())>0:
- # print(len(df_data[list(df_data.keys())[0]]),total_count)
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True),
- ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- while True:
- getData(df_data,rows,set(),"",set_columns,df_columns)
- if not next_token:
- break
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
- ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- if len(df_data.keys())>0:
- print(len(df_data[list(df_data.keys())[0]]),total_count)
- # appendAttachmentPath(df_data,"page_attachments","附件链接")
- # df_columns.append("附件链接")
- # df_columns = ["docid","公告标题","公告类别","发布时间","公告内容","省份","城市","项目编号","招标单位","招标金额","资金来源","招标方式","代理单位","中标单位","中标金额","第二候选单位","第三候选单位","url","附件链接"]
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s_%s.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S"),page_time),columns=df_columns)
- def appendAttachmentPath(df_data,key,new_key):
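- # Resolve each row's page_attachments JSON: look up every fileMd5 in the "attachment"
- # table, sign a temporary OSS GET URL for its stored path, and keep the URL list in
- # df_data[new_key].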
- list_data = []
- for _attach in df_data[key]:
- _dict = {key:_attach}
- list_data.append(_dict)
- task_queue = Queue()
- for _d in list_data:
- task_queue.put(_d)
- auth = oss2.Auth("LTAI5tFuoxHm8Uxrr5nT8wTZ", "Yp01bylJFx0al6teCaccY8hbtllBGg")
- bucket_url = "http://oss-cn-hangzhou.aliyuncs.com"
- attachment_bucket_name = "attachment-hub"
- bucket = oss2.Bucket(auth,bucket_url,attachment_bucket_name)
- ots_client = getConnect_ots()
- def search(ots_client,table_name,key_tuple,columns_to_get):
- try:
- # Query with get_row; the final argument 1 means only one version of each value is returned.
- consumed, return_row, next_token = ots_client.get_row(table_name, key_tuple, columns_to_get, None, 1)
- if return_row is not None:
- _dict = getRow_ots_primary(return_row)
- return _dict
- return None
- # Client-side exception, usually a parameter error or a network failure.
- except OTSClientError as e:
- traceback.print_exc()
- log("get row failed, http_status:%d, error_message:%s" % (e.get_http_status(), e.get_error_message()))
- # Server-side exception, usually a parameter error or throttling.
- except OTSServiceError as e:
- traceback.print_exc()
- log("get row failed, http_status:%d, error_code:%s, error_message:%s, request_id:%s" % (e.get_http_status(), e.get_error_code(), e.get_error_message(), e.get_request_id()))
- def _handle(item,result_queue):
- page_attachments = json.loads(item.get(key,"[]"))
- list_url = []
- for _a in page_attachments:
- fileMd5 = _a.get("fileMd5")
- print("==",fileMd5)
- _s_dict = search(ots_client,"attachment",[("filemd5",fileMd5)],["path"])
- if _s_dict is not None:
- _path = _s_dict.get("path")
- if _path is not None:
- _url = bucket.sign_url("GET",_path,86500*5)  # signed GET URL valid for 86500*5 seconds (~5 days)
- list_url.append(_url)
- item[new_key] = json.dumps(list_url)
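- # MultiThreadHandler drains task_queue with 30 worker threads, calling
- # _handle(item,result_queue) on each queued dict; results are read back from the mutated dicts.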
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- df_data[new_key] = []
- for _d in list_data:
- df_data[new_key].append(_d.get(new_key))
- def export_competition():
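- # For every winning entity listed per province/industry, search notices whose text mentions
- # the company and collect the win/second/third tenderers seen alongside it into a
- # "competition" column.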
- file = "select___from_province_indus_entity_top1.xlsx"
- df1 = pd.read_excel(file)
- ots_client = getConnect_ots()
- task_queue = queue.Queue()
- list_entity = []
- for province,industry,entitys in zip(df1["province"],df1["industry"],df1["entitys"]):
- l_e = json.loads(entitys)
- for l in l_e:
- list_entity.append({"province":province,
- "industry":industry,
- "win_tenderer":l.get("win_tenderee","")})
- for item in list_entity:
- task_queue.put(item)
- def _handle(item,result_queue):
- def getData(rows,_set):
- dict_rows = getRow_ots(rows)
- for _dict in dict_rows:
- sub_docs_json = _dict.get("sub_docs_json")
- if sub_docs_json is not None:
- for sub_docs in json.loads(sub_docs_json):
- if sub_docs.get("win_tenderer") is not None:
- _set.add(sub_docs.get("win_tenderer"))
- if sub_docs.get("second_tenderer") is not None:
- _set.add(sub_docs.get("second_tenderer"))
- if sub_docs.get("third_tenderer") is not None:
- _set.add(sub_docs.get("third_tenderer"))
- columns = ["sub_docs_json"]
- _company = item.get("win_tenderer")
- should_q = BoolQuery(should_queries=[NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_company)),
- NestedQuery("sub_docs_json",TermQuery("sub_docs_json.second_tenderer",_company)),
- NestedQuery("sub_docs_json",TermQuery("sub_docs_json.third_tenderer",_company))])
- bool_query = BoolQuery(must_queries=[
- # should_q,
- MatchPhraseQuery("doctextcon",_company),
- RangeQuery("docchannel",101)])
- _set = set()
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True),
- ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- getData(rows,_set)
- _count = 0
- _page = 0
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
- ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- getData(rows,_set)
- _count += 1
- _page += 1
- if len(_set)>20 or _page>20:
- break
- if item["win_tenderer"] in _set:
- _set.remove(item["win_tenderer"])
- item["competition"] = ",".join(list(_set))
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- df_data = {}
- keys = ["province","industry","win_tenderer","competition"]
- for key in keys:
- if key not in df_data:
- df_data[key] = []
- for item in list_entity:
- for key in keys:
- df_data[key].append(item.get(key))
- df2 = pd.DataFrame(df_data)
- df2.to_excel("competition.xlsx",columns=keys)
- def document_dumplicate():
- df = pd.read_excel("../data/2022-01-19_214329_export11.xlsx")
- print(df.keys())
- def export_document_no_price():
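- # Join per-source no-price statistics with source names from Oracle and lay the
- # 2020 and 2021 figures side by side in one sheet.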
- df = pd.read_csv("select___from_document_no_price_tmp.csv",encoding="gbk")
- conn_oracle = getConnection_oracle()
- cursor = conn_oracle.cursor()
- sql = " select source_encode,source_name from bxkc.T_WEBSOURCENUM_INFO"
- cursor.execute(sql)
- dict_source = {}
- while True:
- rows = cursor.fetchmany(10000)
- if not rows:
- break
- for row in rows:
- dict_source[row[0]] = row[1]
- dict_source[row[0].split("(")[0]] = row[1]
- list_name = []
- set_web_source = set()
- for web_source_no in df["web_source_no"]:
- set_web_source.add(web_source_no)
- list_name.append(dict_source.get(web_source_no,""))
- dict_source_year = {}
- for web_source_no,year,counts_no_price,counts_all,rate in zip(df["web_source_no"],df["year"],df["counts_no_price"],df["counts_all"],df["rate"]):
- dict_source_year["%s&%s"%(web_source_no,year)] = {"counts_no_price":counts_no_price,"counts_all":counts_all,"rate":rate}
- new_data = {"web_source_no":[],
- "web_source_name":[],
- "counts_no_price":[],
- "counts_all":[],
- "rate":[],
- "counts_no_price1":[],
- "counts_all1":[],
- "rate1":[]}
- for web_source_no in list(set_web_source):
- new_data["web_source_no"].append(web_source_no)
- new_data["web_source_name"].append(dict_source.get(web_source_no,""))
- d_2020 = dict_source_year.get("%s&%s"%(web_source_no,"2020"),{})
- d_2021 = dict_source_year.get("%s&%s"%(web_source_no,"2021"),{})
- new_data["counts_no_price"].append(d_2020.get("counts_no_price"))
- new_data["counts_all"].append(d_2020.get("counts_all"))
- new_data["rate"].append(d_2020.get("rate"))
- new_data["counts_no_price1"].append(d_2021.get("counts_no_price"))
- new_data["counts_all1"].append( d_2021.get("counts_all"))
- new_data["rate1"].append(d_2021.get("rate"))
- # new_data = {"year":df["year"],
- # "web_source_no":df["web_source_no"],
- # "web_source_name":list_name,
- # "counts_no_price":df["counts_no_price"],
- # "counts_all":df["counts_all"],
- # "rate":df["rate"]}
- df2 = pd.DataFrame(new_data)
- df2.to_excel("websource_no_price1.xlsx",columns=["web_source_no","web_source_name","counts_no_price","counts_all","rate","counts_no_price1","counts_all1","rate1"])
- def exportDetailLink():
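- # Fill in detail_link per docid: use the value on the document row when present,
- # otherwise fall back to the Oracle source table mapped from original_docchannel.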
- df = pd.read_excel("招投标数据测试反馈表3.xlsx")
- list_item = []
- for docid in df["docid"]:
- list_item.append({"docid":docid})
- task_queue = queue.Queue()
- for item in list_item:
- task_queue.put(item)
- def _handle(item,result_queue,ots_client,pool_oracle):
- try:
- conn = pool_oracle.getConnector()
- docid = int(item["docid"])
- partitionkey = int(docid%500+1)
- consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",partitionkey),("docid",int(docid))],["original_docchannel","detail_link","uuid"])
- _dict = getRow_ots_primary(return_row)
- if _dict.get("detail_link") is not None and len(_dict.get("detail_link"))>0:
- item["detail_link"] = _dict.get("detail_link")
- else:
- original_docchannel = _dict.get("original_docchannel")
- _uuid = _dict.get("uuid")
- d_tablename = {"51":"T_GONG_GAO_BIAN_GENG",
- "52":"T_ZHAO_BIAO_GONG_GAO",
- "101":"T_ZHONG_BIAO_XIN_XI",
- "102":"T_ZHAO_BIAO_YU_GAO",
- "103":"T_ZHAO_BIAO_DA_YI",
- "104":"T_ZHAO_BIAO_WEN_JIAN",
- "114":"T_CAI_GOU_YI_XIANG"
- }
- _tablename = d_tablename.get(str(original_docchannel))
- if _tablename is not None:
- cursor = conn.cursor()
- sql = "select detail_link from bxkc.%s where id='%s'"%(_tablename,_uuid)
- print(sql)
- cursor.execute(sql)
- rows = cursor.fetchall()
- if len(rows)>0:
- item["detail_link"] = rows[0][0]
- cursor.close()
- except Exception as e:
- traceback.print_exc()
- finally:
- pool_oracle.putConnector(conn)
- ots_client = getConnect_ots()
- pool_oracle = ConnectorPool(10,30,getConnection_oracle)
- mt = MultiThreadHandler(task_queue,_handle,None,30,ots_client=ots_client,pool_oracle=pool_oracle)
- mt.run()
- df_data = {"docid":[],
- "detail_link":[]}
- for item in list_item:
- for k,v in df_data.items():
- v.append(item.get(k,""))
- df2 = pd.DataFrame(df_data)
- df2.to_excel("222.xlsx")
- def process_doc():
- df = pd.read_excel("../data/2022-03-16_154617_数据导出.xlsx",1)
- list_check = []
- set_process_docid = set()
- for docid in df["process_docid"]:
- set_process_docid.add(docid)
- df = pd.read_excel("../data/2022-03-16_154617_数据导出.xlsx",0)
- for docid in df["docid"]:
- if docid in set_process_docid:
- list_check.append("1")
- else:
- list_check.append("0")
- df["check"] = list_check
- df.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')))
- def export_extract2():
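- # Export document_extract2 rows whose extract_json contains "false", i.e. extractions
- # flagged as failed.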
- ots_client = getConnect_ots()
- df_keys = ["docid","extract_json","status"]
- df_data = {}
- for _key in df_keys:
- df_data[_key] = []
- bool_query = BoolQuery(must_queries=[
- RangeQuery("status",1,1000,True,True)])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document_extract2", "document_extract2_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(df_keys,return_type=ColumnReturnType.SPECIFIED))
- list_dict = getRow_ots(rows)
- for _dict in list_dict:
- if re.search("false",_dict.get("extract_json","")) is None:
- continue
- for k in df_keys:
- df_data[k].append(_dict.get(k))
- _count = len(list_dict)
- while next_token:
- rows, next_token, total_count, is_all_succeed = ots_client.search("document_extract2", "document_extract2_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(df_keys,return_type=ColumnReturnType.SPECIFIED))
- list_dict = getRow_ots(rows)
- for _dict in list_dict:
- if re.search("false",_dict.get("extract_json","")) is None:
- continue
- for k in df_keys:
- df_data[k].append(_dict.get(k))
- _count += len(list_dict)
- print("%d/%d"%(_count,total_count))
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s_extract2.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")))
- def export_by_file():
- df = pd.read_csv("../data/2022-04-01_121315_数据导出.csv",encoding="gbk")
- keys = df.keys()
- df_data = {}
- set_win = set()
- set_ruwei = set()
- for k in keys:
- df_data[k] = []
- for _i in range(len(df["产品"])):
- product = df["产品"][_i]
- if product is None or not isinstance(product,str):
- continue
- print(product)
- win_tenderer = df["中标单位"][_i]
- if win_tenderer is not None and isinstance(win_tenderer,str):
- set_win.add(win_tenderer)
- set_ruwei.add(win_tenderer)
- ruwei = df["入围供应商"][_i]
- if ruwei is not None and isinstance(ruwei,str):
- l_s = ruwei.split(",")
- for _s in l_s:
- set_ruwei.add(_s)
- if re.search("公路|道路|路基|路面|快速通道|高速|隧道|飞机跑道|桥梁|养护|路段|市政|照明工程|照明设施|亮灯|灯光改造|灯光工程|管道|架线|园林|景观|绿化|排水|河道整治|环境治理|交通|地铁|跌路|高铁|桥梁|大桥|桥段",product) is not None:
- for k in keys:
- df_data[k].append(df[k][_i])
- print("win count:%d ruwei:%d"%(len(set_win),len(set_ruwei)))
- # df1 = pd.DataFrame(df_data)
- # df1.to_excel("../data/%s_文件导出.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")),columns=keys)
- def export_dump():
- import pandas as pd
- df = pd.read_excel("NotIn家具中标 去除注销企业 31410(3)(1)(1).xlsx",sheetname=0)
- _set_number = set()
- _set_number |= set(df["号码"])
- print(len(_set_number))
- df = pd.read_excel("NotIn家具中标 去除注销企业 31410(3)(1)(1).xlsx",sheetname=1)
- _set_number |= set(df["号码"])
- print(len(_set_number))
- df = pd.read_excel("NotIn家具中标 去除注销企业 31410(3)(1)(1).xlsx",sheetname=2)
- keys = df.keys()
- df_data = {}
- for k in keys:
- df_data[k] = []
- for _i in range(len(df[keys[0]])):
- if df["号码"][_i] not in _set_number:
- for k in keys:
- df_data[k].append(df[k][_i])
- _set_number.add(df["号码"][_i])
- df2 = pd.DataFrame(df_data)
- df2.to_excel("tmp222.xlsx")
- def check_data_synchronization():
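- # Pull uuids out of to_check.log and record, for each one, how many matching rows
- # exist in the document index.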
- filepath = "C:\\Users\\Administrator\\Desktop\\to_check.log"
- list_uuid = []
- _regrex = "ID='(?P<uuid>.+)'"
- with open(filepath,"r",encoding="utf8") as f:
- while True:
- _line = f.readline()
- if not _line:
- break
- _match = re.search(_regrex,_line)
- if _match is not None:
- _uuid = _match.groupdict().get("uuid")
- if _uuid is not None:
- list_uuid.append(_uuid)
- print(len(list_uuid))
- task_queue = Queue()
- list_data = []
- for _uuid in list_uuid:
- _dict = {"uuid":_uuid}
- list_data.append(_dict)
- task_queue.put(_dict)
- ots_client = getConnect_ots()
- def _handle(_item,result_queue):
- bool_query = BoolQuery(must_queries=[TermQuery("uuid",_item.get("uuid"))])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,get_total_count=True),
- columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
- _item["exists"] = total_count
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- df_data = {"uuid":[],
- "exists":[]}
- for _data in list_data:
- for k,v in df_data.items():
- v.append(_data.get(k))
- import pandas as pd
- df2 = pd.DataFrame(df_data)
- df2.to_excel("check.xlsx")
- def group_xlsx():
- filename = "厂商&赛道列表.xlsx"
- df0 = pd.read_excel(filename,0)
- df1 = pd.read_excel(filename,1)
- df2 = pd.read_excel(filename,2)
- set_1 = set(df0["中国厂商"]) | set(df1["国际厂商"])
- set_2 = set(df2["a"]) | set(df2["b"]) | set(df2["c"])
- filename = "../data/2022-05-24_185801_数据导出.xlsx"
- df = pd.read_excel(filename)
- dict_docid = {}
- for docid,keyword in zip(df["docid"],df["关键词"]):
- if docid not in dict_docid:
- dict_docid[docid] = [[],[]]
- if keyword in set_1:
- dict_docid[docid][0].append(keyword)
- else:
- dict_docid[docid][1].append(keyword)
- set_docid = set()
- for k,v in dict_docid.items():
- if len(v[0])>=1 and len(v[1])>=1:
- set_docid.add(k)
- keys = df.keys()
- print(keys)
- df_data = {}
- for i in range(len(df["docid"])):
- print(i)
- docid = df["docid"][i]
- if docid in set_docid:
- for k in keys:
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(df[k][i])
- df_data["关键词"][-1] = str(dict_docid[docid][0][0])+"+"+str(dict_docid[docid][1][0])
- df1 = pd.DataFrame(df_data)
- df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=keys)
- def static_process_time():
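- # Measure crtime -> opertime processing latency for one day's documents, split into
- # rows with and without attachments.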
- ots_client = getConnect_ots()
- bool_query = BoolQuery(must_queries=[
- RangeQuery("crtime","2022-05-26","2022-05-27"),
- TermQuery("page_time","2022-05-26")
- ])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),get_total_count=True,limit=100),
- ColumnsToGet(column_names=["crtime","opertime","publishtime","page_attachments"],return_type=ColumnReturnType.SPECIFIED))
- list_data = []
- _l = getRow_ots(rows)
- list_data.extend(_l)
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
- SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
- ColumnsToGet(column_names=["crtime","opertime","publishtime","page_attachments"],return_type=ColumnReturnType.SPECIFIED))
- _l = getRow_ots(rows)
- list_data.extend(_l)
- print("%d/%d"%(len(list_data),total_count))
- list_dis = []
- list_dis_a = []
- list_dis_n_a = []
- for _data in list_data:
- crtime = _data.get("crtime")
- opertime = _data.get("opertime")
- page_attachments = _data.get("page_attachments","[]")
- _d = time.mktime(time.strptime(opertime,"%Y-%m-%d %H:%M:%S"))-time.mktime(time.strptime(crtime,"%Y-%m-%d %H:%M:%S"))
- list_dis.append(_d)
- if page_attachments=="[]":
- list_dis_n_a.append(_d)
- else:
- list_dis_a.append(_d)
- print("avg_time:",sum(list_dis)/len(list_dis),max(list_dis),min(list_dis))
- print("avg_time:",sum(list_dis_a)/len(list_dis_a),max(list_dis_a),min(list_dis_a))
- print("avg_time:",sum(list_dis_n_a)/len(list_dis_n_a),max(list_dis_n_a),min(list_dis_n_a))
- def export_dump_by_id():
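- # Parse main/duplicate docids out of the two url columns, fetch every referenced
- # document, and export one row per document tagged with its main_docid.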
- filename = "遗漏待验证1.csv"
- df = pd.read_csv(filename)
- list_k = []
- ots_client = getConnect_ots()
- for _main_url,_other_url in zip(df["_c0"],df["_c1"]):
- _d = {}
- main_docid = re.split("[-.]",_main_url)[3]
- l_other = []
- for _l in _other_url.split(","):
- _docid = re.split("[-.]",_l)[3]
- l_other.append(_docid)
- _d["main_docid"] = main_docid
- _d["other_docid"] = l_other
- list_k.append(_d)
- task_queue = Queue()
- for _q in list_k:
- task_queue.put(_q)
- def _handle(item,result_queue):
- columns = ["doctitle","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","service_time","page_attachments"]
- main_docid = item["main_docid"]
- other_docid = item["other_docid"]
- list_should_q = []
- list_should_q.append(TermQuery("docid",main_docid))
- for _d in other_docid:
- list_should_q.append(TermQuery("docid",_d))
- _query = BoolQuery(should_queries=list_should_q)
- l_rows = []
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- dict_row = getRow_ots(rows)
- l_rows.extend(dict_row)
- log("total count:%d"%total_count)
- _count = len(dict_row)
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(_query,next_token=next_token,limit=100,get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- dict_row = getRow_ots(rows)
- l_rows.extend(dict_row)
- _count += len(dict_row)
- item["data"] = l_rows
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- df_data = {"main_docid":[]}
- set_line = set()
- dict_channel = getDict_docchannel()
- for _d in list_k:
- list_row = _d.get("data")
- if list_row is not None:
- main_docid = _d.get("main_docid")
- getRowData(df_data,list_row,set_line,[],dict_channel,True)
- for _ in list_row:
- df_data["main_docid"].append(main_docid)
- df1 = pd.DataFrame(df_data)
- list_df_columns1 = ["main_docid"].extend(list_df_columns)
- df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns1)
- def count_product():
- filename = "../data/2022-06-24_152201_数据导出.xlsx"
- df = pd.read_excel(filename)
- # _product = df["产品"]
- # dict_p_c = {}
- # for _p in _product:
- # if isinstance(_p,str) and _p!="":
- # l_p = _p.split(",")
- # for _p1 in l_p:
- # if _p1 not in dict_p_c:
- # dict_p_c[_p1] = 0
- # dict_p_c[_p1] += 1
- # df_data = {"产品":[],
- # "次数":[]}
- # for k,v in dict_p_c.items():
- # df_data["产品"].append(k)
- # df_data["次数"].append(v)
- # df1 = pd.DataFrame(df_data)
- # df1.to_excel("222.xlsx")
- keys = df.keys()
- df_data = {}
- for k in keys:
- df_data[k] = []
- product_pattern = "电脑|台式机|电脑|主机|网络|软件|开发|通信|系统|信息技术"
- df1 = pd.read_excel("222.xlsx")
- list_p = []
- for _p,_n in zip(df1["产品"],df1["need"]):
- if _n==1:
- list_p.append(_p)
- product_pattern = product_pattern+"|"+"|".join(list_p)
- _product = df["产品"]
- for _i in range(len(_product)):
- if re.search(product_pattern,str(_product[_i])) is not None:
- for k in keys:
- df_data[k].append(df[k][_i])
- df2 = pd.DataFrame(df_data)
- df2.to_excel("333.xlsx",columns=keys)
- from dataSource.source import getConnect_capacity
- def exportHonors_item_info():
- ots_capacity = getConnect_capacity()
- bool_query = BoolQuery(should_queries=[
- # TermQuery("ryjx","海河杯"),
- WildcardQuery("hjdw","*合肥建工集团有限公司*")
- ])
- rows,next_token,total_count,is_all_succeed = ots_capacity.search("honors_item_info","honors_item_info_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("create_time")]),get_total_count=True,limit=100),
- columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
- list_data = getRow_ots(rows)
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_capacity.search("honors_item_info","honors_item_info_index",
- SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
- columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
- list_data.extend(getRow_ots(rows))
- df_data = {}
- set_columns1 = set()
- list_df_columns1 = []
- for _data in list_data:
- _dict = {}
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"record_id",_data.get("record_id"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"bfdw",_data.get("bfdw"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"create_time",_data.get("create_time"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"cs",_data.get("cs"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"detail_link",_data.get("detail_link"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"fbsj",_data.get("fbsj"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"hjdw",_data.get("hjdw"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"hjdwjs",_data.get("hjdwjs"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"hjxm",_data.get("hjxm"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"jxjb",_data.get("jxjb"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"jxlx",_data.get("jxlx"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"ryjx",_data.get("ryjx"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"sf",_data.get("sf"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"xmfzr",_data.get("xmfzr"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"zgdw",_data.get("zgdw"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"zxj",_data.get("zxj"))
- for k,v in _dict.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- df = pd.DataFrame(df_data)
- df.to_excel("honor_export.xlsx",columns=list_df_columns1)
- def check_dump_data():
- ots_client = getConnect_ots()
- bool_query = BoolQuery(must_queries=[
- generateBoolShouldQuery(["docchannel"],[52,102,114],TermQuery),
- RangeQuery("crtime","2022-09-07 20:00:00","2022-09-08 06:00:00"),
- RangeQuery("page_time","2022-09-07","2022-09-08"),
- RangeQuery("status",201,301)
- ])
- list_data = getDocument([{"query":bool_query}],["docid"],table_name="document",table_index="document_index")
- bool_query1 = BoolQuery(must_queries=[
- generateBoolShouldQuery(["docchannel"],[52,102,114],TermQuery),
- RangeQuery("crtime","2022-09-07 20:00:00","2022-09-08 06:00:00"),
- RangeQuery("page_time","2022-09-07","2022-09-08"),
- RangeQuery("status",81,100),
- TermQuery("save",1)
- ])
- list_data1 = getDocument([{"query":bool_query1}],["docid"],table_name="document_tmp",table_index="document_tmp_index")
- set_docid = set()
- set_docid_tmp = set()
- for _data in list_data:
- set_docid.add(_data.get("docid"))
- for _data in list_data1:
- set_docid_tmp.add(_data.get("docid"))
- print("document - tmp",set_docid-set_docid_tmp)
- print("tmp - document",set_docid_tmp-set_docid)
- def search_title_count():
- filename = "数据样例.xlsx"
- df = pd.read_excel(filename)
- list_title_dict = []
- for _title in df["标题"]:
- _dict = {"标题":_title}
- list_title_dict.append(_dict)
- task_queue = Queue()
- for _d in list_title_dict:
- task_queue.put(_d)
- ots_client = getConnect_ots()
- def _handle(item,result_queue):
- columns = ["status","tenderee","agency","sub_docs_json"]
- _title = item.get("标题","")
- if _title!="":
- bool_query = BoolQuery(must_queries=[MatchPhraseQuery("doctitle",_title)])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,get_total_count=True,limit=10),
- columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- item["比地数量"] = total_count
- if len(list_data)>0:
- _str_docid = ",".join([str(a.get("docid")) for a in list_data])
- item["比地_docid"] = _str_docid
- tenderee = list_data[0].get("tenderee")
- item["比地_招标人"] = tenderee
- agency = list_data[0].get("agency")
- item["比地_代理人"] = agency
- sub_docs_json = list_data[0].get("sub_docs_json")
- if sub_docs_json is not None:
- sub_docs = json.loads(sub_docs_json)
- win_tenderer = ""
- win_bid_price = ""
- for _doc in sub_docs:
- if _doc.get("win_tenderer","")!="":
- win_tenderer = _doc.get("win_tenderer")
- win_bid_price = _doc.get("win_bid_price")
- item["比地_中标人"] = win_tenderer
- item["比地_中标金额"] = win_bid_price
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- df_data = {}
- keys = ["标题","比地数量","比地_docid","比地_招标人","比地_代理人","比地_中标人","比地_中标金额"]
- for _d in list_title_dict:
- for k in keys:
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(_d.get(k,""))
- df1 = pd.DataFrame(df_data)
- df1.to_excel("比地对比数据.xlsx",columns=keys)
- def getDumplicate_docid():
- filename = "2022-11-02_154222_数据导出.xlsx"
- df = pd.read_excel(filename)
- list_docid = df["docid"]
- task_queue = Queue()
- list_d = []
- for _docid in list_docid:
- _dict = {"docid":_docid}
- list_d.append(_dict)
- task_queue.put(_dict)
- ots_client = getConnect_ots()
- def _handle(item,result_queue):
- _docid = item.get("docid")
- bool_query = BoolQuery(must_queries=[TermQuery("docid",int(_docid))])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
- SearchQuery(bool_query),
- columns_to_get=ColumnsToGet(["save"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- if len(list_data)>0:
- _save = list_data[0].get("save")
- item["save"] = _save
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- list_d_docid = []
- for _data in list_d:
- docid = _data.get("docid")
- save = _data.get("save")
- if save==0:
- list_d_docid.append(str(docid))
- print(",".join(list_d_docid))
- def getDocumentHtml():
- filename = "../data/2023-02-20_154118_数据导出.xlsx"
- df = pd.read_excel(filename)
- ots_client = getConnect_capacity()
- list_html_data = []
- _count = 0
- for docid in df["docid"][:10000]:
- partitionkey = int(docid)%500+1
- docid = int(docid)
- try:
- consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",partitionkey),("docid",docid)],["dochtmlcon"])
- _dict = getRow_ots_primary(return_row)
- list_html_data.append(_dict)
- _count += 1
- print("%d/%d"%(_count,len(df["docid"])))
- except Exception as e:
- pass
- save(list_html_data,"list_html_data.pk")
- def exportAgencyCount():
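- # For each agency in the sheet, query its 2022 Guangzhou tender notices within the
- # construction/supervision/cost-consulting scope and export the matching documents.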
- filename = "广州招标协会.xlsx"
- df = pd.read_excel(filename)
- a = df["a"]
- df_data = {}
- set_c = set()
- for line in a:
- list_c = line.split(" ")
- for _i in range(len(list_c)):
- _key = "c_%s"%(str(_i).rjust(2,'0'))
- if _key not in df_data:
- df_data[_key] = []
- set_c.add(_key)
- df_data[_key].append(list_c[_i])
- list_data = []
- list_query = []
- ots_client = getConnect_ots()
- for _agency in df_data["c_00"]:
- query = BoolQuery(must_queries=[TermQuery("city","广州"),
- TermQuery("docchannel",52),
- RangeQuery("status",201,301),
- RangeQuery("page_time","2022-01-01","2023-01-01"),
- TermQuery("agency",_agency),
- BoolQuery(should_queries=[
- BoolQuery(should_queries=[MatchPhraseQuery("doctitle","工程施工"),
- MatchPhraseQuery("doctextcon","建造师"),
- MatchPhraseQuery("attachmenttextcon","建造师")]),
- BoolQuery(should_queries=[MatchPhraseQuery("doctitle","监理"),
- MatchPhraseQuery("doctextcon","监理工程师"),
- MatchPhraseQuery("attachmenttextcon","监理工程师")]),
- BoolQuery(should_queries=[MatchPhraseQuery("doctitle","造价咨询"),
- MatchPhraseQuery("doctitle","预算"),
- MatchPhraseQuery("doctitle","造价审核"),
- MatchPhraseQuery("doctitle","结算"),
- MatchPhraseQuery("doctitle","概算")]),
- ])
- ],
- must_not_queries=[generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["广州公共资源交易中心"],MatchPhraseQuery)])
- # list_row = getDocument([{"query":query}],["agency","page_time","sub_docs_json"],thread_count=1)
- list_query.append({"query":query})
- df_data = {}
- set_line = set()
- columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"]
- list_row = getDocument(list_query,columns,thread_count=30)
- # list_row = filterRow(list_row,"product",list_not_key)
- log("get document %d rows"%len(list_row))
- # getRowDataWithKey(df_data,list_row,columns)
- dict_channel = getDict_docchannel()
- getRowData(df_data,list_row,set_line,[''],dict_channel,True)
- # getRowData_sp1(df_data,list_row,set_line,list_keyword,dict_sptype,True)
- # fixContactPerson(df_data,list_df_columns,get_legal_person=False)
- df1 = pd.DataFrame(df_data)
- df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
- return
- # _dict = {"一季度预算":0,"一季度总数":0,"一季度有金额占比":0,
- # "二季度预算":0,"二季度总数":0,"二季度有金额占比":0,
- # "三季度预算":0,"三季度总数":0,"三季度有金额占比":0,
- # "四季度预算":0,"四季度总数":0,"四季度有金额占比":0,
- # }
- # print(_agency,len(list_row))
- #
- #
- # for _row in list_row:
- # print(_row.get("docid"))
- # page_time = _row.get("page_time","")
- # sub_docs_json = _row.get("sub_docs_json","")
- # _t = None
- # if page_time!="":
- # if page_time<="2022-03-31":
- # _t = "一季度"
- # elif page_time<="2022-06-31":
- # _t = "二季度"
- # elif page_time<="2022-09-31":
- # _t = "三季度"
- # elif page_time<="2022-12-31":
- # _t = "四季度"
- # if sub_docs_json != "":
- # sub_docs = json.loads(sub_docs_json)
- # for _doc in sub_docs:
- # bidding_budget = _doc.get("bidding_budget",0)
- # _dict["%s预算"%_t] += bidding_budget
- # _dict["%s总数"%_t] += 1
- # if bidding_budget>0:
- # _dict["%s有金额占比"%_t] += 1
- # print(_dict)
- # _sum = 0
- # _sum_n = 0
- # _sum_hm = 0
- # for k in ["一季度","二季度","三季度","四季度"]:
- # km = "%s预算"%k
- # kn = "%s总数"%k
- # khm = "%s有金额占比"%k
- # _sum += _dict[km]
- # _sum_n += _dict[kn]
- # _sum_hm += _dict[khm]
- # _dict["全年预算"] = _sum
- # _dict["全年总数"] = _sum_n
- # _dict["全年有金额占比"] = _sum_hm
- # for k in ["一季度","二季度","三季度","四季度","全年"]:
- # km = "%s预算"%k
- # kn = "%s总数"%k
- # khm = "%s有金额占比"%k
- # _dict[khm] = _dict[khm]/_dict[kn] if _dict[kn]>0 else 0
- # for k,v in _dict.items():
- # if k not in df_data:
- # df_data[k] = []
- # df_data[k].append(v)
- #
- # list_c = list(set_c)
- # list_c.sort(key=lambda x:x)
- # for k,v in df_data.items():
- # print(k,len(v))
- # df1 = pd.DataFrame(df_data)
- # list_c.append("一季度预算")
- # list_c.append("一季度总数")
- # list_c.append("一季度有金额占比")
- # list_c.append("二季度预算")
- # list_c.append("二季度总数")
- # list_c.append("二季度有金额占比")
- # list_c.append("三季度预算")
- # list_c.append("三季度总数")
- # list_c.append("三季度有金额占比")
- # list_c.append("四季度预算")
- # list_c.append("四季度总数")
- # list_c.append("四季度有金额占比")
- # list_c.append("全年预算")
- # list_c.append("全年总数")
- # list_c.append("全年有金额占比")
- # df1.to_excel("%s_1.xlsx"%(filename),columns=list_c)
- def attachAttachment():
- filename = "北京电信ICT样例(2023一季度)v1.1(2).xlsx"
- df = pd.read_excel(filename,1)
- list_data = []
- task_queue = Queue()
- for _docid in df["docid"]:
- _d = {"docid":_docid}
- list_data.append(_d)
- task_queue.put(_d)
- print("len_docid",len(df["docid"]),len(list_data))
- capacity = getConnect_capacity()
- def _handle(item,result_queue):
- docid = item["docid"]
- consumed, return_row, next_token = capacity.get_row("document",[("partitionkey",int(docid)%500+1),("docid",int(docid))],["dochtmlcon"])
- _d = getRow_ots_primary(return_row)
- _dochtmlcon = _d["dochtmlcon"]
- _dochtmlcon = re.sub("<html>|</html>|<body>|</body>","",_dochtmlcon)
- _soup = BeautifulSoup(_dochtmlcon,"lxml")
- _div = _soup.find("div",attrs={"class":"richTextFetch"})
- if _div is None:
- _div = ""
- item["attachment"] = _div
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- list_attachment = []
- for _d in list_data:
- list_attachment.append(getLegal_str(_d.get("attachment","")))
- df_data = {}
- df_data["附件html"] = list_attachment
- df_1 = pd.DataFrame(df_data)
- df_1.to_excel("附加html_"+filename)
- def compareData():
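- # Re-check each CSV row against the document index by querying docid together with its
- # tenderee/winner/price fields; rows with no exact match are re-fetched and written to
- # bid_check_result.csv for review.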
- filename = "D:\\BaiduNetdiskDownload\\bidi_check.csv"
- list_data = []
- with open(filename,"r",encoding="utf8") as f:
- list_lines = f.readlines()
- for _line in list_lines:
- docid,docchannel,win_tenderer,tenderee,win_bid_price,bidding_budget = [None if a[:2]=='\\N' else a for a in _line.split("\t")]
- _d = {"docid":int(docid),
- "docchannel":docchannel,
- "win_tenderer":win_tenderer,
- "tenderee":tenderee,
- "win_bid_price":float(win_bid_price) if win_bid_price is not None else None,
- "bidding_budget":float(bidding_budget) if bidding_budget is not None else None}
- list_data.append(_d)
- del list_lines
- # for _i in range(len(list_data)):
- # print(list_lines[_i])
- # print(list_data[_i])
- ots_client = getConnect_ots()
- task_queue = Queue()
- for _d in list_data:
- task_queue.put(_d)
- def _handle(item,result_queue):
- docid = item.get("docid")
- win_tenderer = item.get("win_tenderer")
- win_bid_price = item.get("win_bid_price")
- tenderee = item.get("tenderee")
- bidding_budget = item.get("bidding_budget")
- must_q = [TermQuery("docid",int(docid))]
- if tenderee is not None:
- must_q.append(TermQuery("tenderee",tenderee))
- if win_tenderer is not None:
- must_q.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",win_tenderer)))
- if win_bid_price is not None:
- must_q.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_bid_price",win_bid_price)))
- if bidding_budget is not None:
- must_q.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.bidding_budget",bidding_budget)))
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(BoolQuery(must_queries=must_q),get_total_count=True),
- columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
- item["total_count"] = total_count
- if total_count==0:
- print("docid %d total_count is 0",docid)
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(BoolQuery(must_queries=[TermQuery("docid",docid)]),get_total_count=True),
- columns_to_get=ColumnsToGet(["tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
- l_d = getRow_ots(rows)
- item["return_row"] = l_d
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- list_new_data = []
- for data in list_data:
- if data.get("total_count")==0:
- new_d = {"docid":data.get("docid"),
- "docchannel":data.get("docchannel")}
- return_row = data.get("return_row")
- if len(return_row)>0:
- # print(return_row)
- _row = return_row[0]
- tenderee = _row.get("tenderee")
- sub_docs_json = _row.get("sub_docs_json")
- bidding_budget = None
- win_tenderer = None
- win_bid_price = None
- if sub_docs_json is not None:
- sub_docs = json.loads(sub_docs_json)
- for _doc in sub_docs:
- if _doc.get("bidding_budget") is not None:
- bidding_budget = _doc.get("bidding_budget")
- if _doc.get("win_tenderer") is not None:
- win_tenderer = _doc.get("win_tenderer")
- win_bid_price = _doc.get("win_bid_price")
- new_d["tenderee"] = tenderee
- new_d["bidding_budget"] = bidding_budget
- new_d["win_tenderer"] = win_tenderer
- new_d["win_bid_price"] = win_bid_price
- list_new_data.append(new_d)
- df_data_c = ["docid","docchannel","win_tenderer","tenderee","win_bid_price","bidding_budget"]
- df_data = {}
- for c in df_data_c:
- df_data[c] = []
- for _d in list_new_data:
- for c in df_data_c:
- df_data[c].append(_d.get(c))
- df = pd.DataFrame(df_data)
- df.to_csv("bid_check_result.csv",columns=df_data_c)
- def exportProducts():
- filename = "货物关键词.xlsx"
- dict_channel = getDict_docchannel()
- df = pd.read_excel(filename)
- list_products = df["货物关键词"]
- list_q = []
- list_result = []
- ots_client = getConnect_ots()
- columns = ["产品","总数","匹配模式"]
- _index = 0
- task_queue = Queue()
- for _product in list_products:
- _index += 1
- print(_product,"%d/%d"%(_index,len(list_products)))
- bool_query = BoolQuery(must_queries=[NestedQuery("products",TermQuery("products.product",_product)),
- # RangeQuery("page_time","2021-01-01"),
- RangeQuery("status",201,301),
- TermQuery("docchannel",101)])
- _q = {"query":bool_query,"product":_product,"匹配模式":"精准"}
- task_queue.put(_q)
- bool_query = BoolQuery(must_queries=[NestedQuery("products",WildcardQuery("products.product","*%s*"%_product)),
- # RangeQuery("page_time","2021-01-01"),
- RangeQuery("status",201,301),
- TermQuery("docchannel",101)])
- _q = {"query":bool_query,"product":_product,"匹配模式":"包括"}
- task_queue.put(_q)
- def _handle(item,result_queue):
- bool_query = item["query"]
- _product = item["product"]
- _type = item["匹配模式"]
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,get_total_count=True),
- columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
- list_result.append({"产品":_product,"总数":total_count,"匹配模式":_type})
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- print("done result length:%d"%(len(list_result)))
- df_data = {}
- for _d in list_result:
- for c in columns:
- if c not in df_data:
- df_data[c] = []
- df_data[c].append(_d.get(c))
- df1 = pd.DataFrame(df_data)
- df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=columns)
- def statics_attachment_counts():
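- # Sum attachment sizes for one week of construction-industry notices and print
- # per industry/channel totals, averaged to MB per day.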
- bool_query = BoolQuery(must_queries=[
- generateBoolShouldQuery(["industry"],["土木工程建筑业","建筑装饰和其他建筑业","房屋建筑业","专业施工","修缮工程","建筑安装业"],TermQuery),
- RangeQuery("page_time","2023-08-07","2023-08-14"),
- RangeQuery("status",201,301),
- NestedQuery("page_attachments",ExistsQuery("page_attachments.fileMd5"))
- ])
- ots_client = getConnect_ots()
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
- ColumnsToGet(["industry","docchannel","page_attachments"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
- ColumnsToGet(["industry","docchannel","page_attachments"],return_type=ColumnReturnType.SPECIFIED))
- list_data.extend(getRow_ots(rows))
- # if len(list_data)>1000:
- # break
- task_queue = Queue()
- for _data in list_data:
- task_queue.put(_data)
- def _handle(item,result_queue):
- page_attachments = item.get("page_attachments")
- _size = 0
- if page_attachments is not None and page_attachments!="":
- list_attach = json.loads(page_attachments)
- for _attach in list_attach:
- _md5 = _attach.get("fileMd5")
- if _md5 is not None:
- consumed, return_row, next_token = ots_client.get_row("attachment",[("filemd5",_md5)],["size"])
- _d = getRow_ots_primary(return_row)
- _size += _d.get("size",0)
- item["size"] = _size
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- dict_result = {}
- for data in list_data:
- industry = data.get("industry")
- docchannel = data.get("docchannel")
- _type = ""
- if docchannel==52:
- _type = "招标"
- elif docchannel in (101,118,119,120):
- _type = "中标"
- else:
- _type = "其他"
- _key = "%s-%s"%(industry,_type)
- if _key not in dict_result:
- dict_result[_key] = 0
- dict_result[_key] += data.get("size",0)
- print(dict_result)
- for k,v in dict_result.items():
- print(k,"%.2fM"%(v/7/1024/1024))
- def static_dump():
- import pandas as pd
- filename = "select___from_bxkc_bxkc_delete_document_.csv"
- df = pd.read_csv(filename)
- print(df.keys())
- list_docid = df["docid"]
- list_dup_docid = df["dup_docid"]
- list_operate_time = df["operate_time"]
- list_a = []
- for docid,dup_docid in zip(list_docid,list_dup_docid):
- docid = int(docid)
- _flag = False
- if isinstance(dup_docid,str) and dup_docid!="":
- _l = dup_docid.split(",")
- for _i in _l:
- if _i.strip()!="":
- docid1 = int(_i)
- if docid1>docid:
- _flag = True
- break
- if _flag:
- list_a.append("是")
- else:
- list_a.append("否")
- df_data = {"被去重docid":list_docid,
- "重复id":list_dup_docid,
- "是否展示后删除":list_a}
- df1 = pd.DataFrame(df_data)
- df1.to_csv("16号去重统计.csv")
- def append_title():
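- # Enrich each dedup pair with the titles and extract_count of both documents, plus the
- # kept document's status rendered as a human-readable label.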
- import pandas as pd
- filename = "去重记录.xlsx"
- df = pd.read_excel(filename)
- list_docid = df["被去重id"]
- list_keep_id = df["保留id"]
- list_data = []
- task_queue = Queue()
- for _docid,keep_docid in zip(list_docid,list_keep_id):
- _d = {"dup_docid":int(_docid),
- "keep_docid":int(keep_docid)}
- list_data.append(_d)
- task_queue.put(_d)
- ots_client = getConnect_ots()
- def _handle(item,result_queue):
- dup_docid = item.get("dup_docid")
- keep_docid = item.get("keep_docid")
- dup_partitionkey = dup_docid%500+1
- keep_partitionkey = keep_docid%500+1
- consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",dup_partitionkey),("docid",dup_docid)],["status","doctitle","extract_count"])
- _d = getRow_ots_primary(return_row)
- if _d is not None:
- doctitle = _d.get("doctitle")
- item["dup_title"] = doctitle
- extract_count = _d.get("extract_count")
- item["dup_extract_count"] = extract_count
- consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",keep_partitionkey),("docid",keep_docid)],["status","doctitle","extract_count","extract_count"])
- _d = getRow_ots_primary(return_row)
- if _d is not None:
- doctitle = _d.get("doctitle")
- item["keep_title"] = doctitle
- status = _d.get("status")
- extract_count = _d.get("extract_count")
- item["keep_extract_count"] = extract_count
- if status>=201 and status<=300:
- item["保留id状态"] = "正常"
- elif status>=401:
- item["保留id状态"] = "去重"
- else:
- item["保留id状态"] = ""
- mt = MultiThreadHandler(task_queue,_handle,None,39)
- mt.run()
- keys = ["dup_docid","keep_docid","dup_title","keep_title","保留id状态","dup_extract_count","keep_extract_count"]
- df_data = {}
- for data in list_data:
- for k in keys:
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(data.get(k))
- df1 = pd.DataFrame(df_data)
- df1.to_excel("%s.xlsx"%(filename),columns=keys)
- def get_follows():
- _json = '''
- [
- ]
- '''
- ots_client = getConnect_ots()
- list_follows = json.loads(_json)
- new_list = []
- for follow in list_follows:
- docid = follow.get("docid")
- partitionkey = docid%500+1
- consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",partitionkey),("docid",docid)],["tenderee"])
- _d = getRow_ots_primary(return_row)
- print("docid",_d.get("tenderee"))
- if _d.get("tenderee")=="泗阳意杨产业科技园实业有限公司":
- new_list.append(follow)
- print(json.dumps(new_list,ensure_ascii=False))
- def validateTitle(title):
- rstr = r"[\/\\\:\*\?\"\<\>\|\r\n]" # '/ \ : * ? " < > |'
- new_title = re.sub(rstr, "_", title)  # replace illegal filename characters with underscores
- return new_title
- def exportParameters():
- from glob import glob
- attach_path = "F:/Workspace2016/BaseDataMaintenance/BaseDataMaintenance/maintenance/product/download"
- ots_client = getConnect_ots()
- bool_query = BoolQuery(must_queries=[TermQuery("parameter_status",1)])
- save_dir = "product"
- if not os.path.exists(save_dir):
- os.mkdir(save_dir)
- rows,next_token,total_count,is_all_succeed = ots_client.search("document_product2","document_product2_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("parameter_status")]),limit=100,get_total_count=True),
- ColumnsToGet(["parameter","bid_filemd5s","name","original_name"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- write_count = 0
- for _data in list_data:
- bid_filemd5s = _data["bid_filemd5s"]
- parameter = _data["parameter"]
- name = _data["name"]
- original_name = _data["original_name"]
- list_md5s = bid_filemd5s.split(",")
- if len(list_md5s)==1:
- list_path = glob(os.path.join(attach_path,bid_filemd5s)+"*")
- for _path in list_path:
- if not _path.endswith(".html"):
- filename = _path.split("\\")[-1]
- with open(os.path.join(save_dir,filename),"wb") as f:
- f.write(open(_path,"rb").read())
- pname = "%s_name%s_original_name%s.html"%(bid_filemd5s,name,original_name[:10])
- pname = validateTitle(pname)
- with open(os.path.join(save_dir,pname),"w",encoding="utf8") as f:
- f.write(parameter)
- write_count += 1
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_client.search("document_product2","document_product2_index",
- SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
- ColumnsToGet(["parameter","bid_filemd5s","name","original_name"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- for _data in list_data:
- bid_filemd5s = _data["bid_filemd5s"]
- parameter = _data["parameter"]
- name = _data["name"]
- original_name = _data["original_name"]
- list_md5s = bid_filemd5s.split(",")
- if len(list_md5s)==1:
- list_path = glob(os.path.join(attach_path,bid_filemd5s)+"*")
- for _path in list_path:
- if not _path.endswith(".html"):
- filename = _path.split("\\")[-1]
- with open(os.path.join(save_dir,filename),"wb") as f:
- f.write(open(_path,"rb").read())
- pname = "%s_name%s_original_name%s.html"%(bid_filemd5s,name,original_name[:10])
- pname = validateTitle(pname)
- with open(os.path.join(save_dir,pname),"w",encoding="utf8") as f:
- f.write(parameter)
- write_count += 1
- if write_count>=2000:
- return
- def exportProjects():
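- # Export recent single-document projects that have a win date set, then search for other
- # live notices quoting the same project codes as merge candidates.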
- bool_query = BoolQuery(must_queries=[
- TermQuery("docid_number",1),
- ExistsQuery("zhong_biao_page_time"),
- RangeQuery("page_time","2023-01-01","2023-10-10")
- ],
- must_not_queries=[
- MatchPhraseQuery("doctitles","网上超市")
- ])
- ots_client = getConnect_ots()
- rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]),limit=100,get_total_count=True),
- ColumnsToGet(["docids","doctitles","project_codes"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
- SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
- ColumnsToGet(["docids","doctitles","project_codes"],return_type=ColumnReturnType.SPECIFIED))
- list_data.extend(getRow_ots(rows))
- if len(list_data)>10000:
- break
- task_queue = Queue()
- for data in list_data:
- task_queue.put(data)
- def _handle(item,result_queue):
- docids = item["docids"]
- project_codes = item.get("project_codes","")
- if len(project_codes)>0:
- list_codes = project_codes.split(",")
- should_q = []
- for code in list_codes:
- should_q.append(MatchPhraseQuery("doctextcon",code))
- should_q.append(MatchPhraseQuery("attachmenttextcon",code))
- _query = BoolQuery(must_queries=[BoolQuery(should_queries=should_q),RangeQuery("status",201,301)],
- must_not_queries=[TermQuery("docid",int(docids))])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(_query,limit=100),
- ColumnsToGet(["doctitle"],return_type=ColumnReturnType.SPECIFIED))
- item["result"] = json.dumps(getRow_ots(rows),ensure_ascii=False)
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- columns = ["docids","doctitles","project_codes","result"]
- df_data = {}
- for data in list_data:
- for c in columns:
- if c not in df_data:
- df_data[c] = []
- df_data[c].append(data.get(c,""))
- df = pd.DataFrame(df_data)
- df.to_excel("toMerge.xlsx",columns=columns)
- def match_contact():
- filename = r"C:\Users\Administrator\联系电话需求.xlsx"
- df = pd.read_excel(filename)
-
- ots_client = getConnect_ots()
- list_row = []
- for index,row in df.iterrows():
- a = row["乙方"]
- b = row["联系人"]
- c = row['联系电话(手机号)']
- d = row["想打听甲方联系人的项目"].strip()
- e = row['联系人.1']
- f = row['联系电话(手机号).1']
- print(a,b,c,d,e,f)
- bool_query = BoolQuery(must_queries=[
- generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[a,d],MatchPhraseQuery),
- TermQuery("tenderee",d),
- RangeQuery("status",201,301)
- ])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]),get_total_count=True,limit=10),
- ColumnsToGet(["tenderee_contact","tenderee_phone"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- _find = False
- # if len(list_data)>0:
- # for data in list_data:
- # if re.search('^1\d{10}',data.get("tenderee_phone","")) is not None:
- # df["联系人.1"][index] = data.get("tenderee_contact","")
- # df['联系电话(手机号).1'][index] = data.get("tenderee_phone","")
- # print("===",data)
- # _find = True
- # break
- bool_query = BoolQuery(must_queries=[
- TermQuery("enterprise_name",a),
- TermQuery("is_mobile",1)
- ])
- rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise_contact","enterprise_contact_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("score",SortOrder.DESC),FieldSort("update_time",SortOrder.DESC)]),get_total_count=True,limit=10),
- ColumnsToGet(["contact_person","phone_no"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- if len(list_data)>0:
- _str = ""
- for _data in list_data:
- _str += _data.get("contact_person","")+"|"+_data.get("phone_no","")+"|\n"
- df["联系人"][index] = _str
- df['联系电话(手机号)'][index] = _str
- if not _find:
- bool_query = BoolQuery(must_queries=[
- TermQuery("enterprise_name",d),
- TermQuery("is_mobile",1)
- ])
- rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise_contact","enterprise_contact_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("score",SortOrder.DESC),FieldSort("update_time",SortOrder.DESC)]),get_total_count=True,limit=10),
- ColumnsToGet(["contact_person","phone_no"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- if len(list_data)>0:
- _str = ""
- for _data in list_data:
- _str += _data.get("contact_person","")+"|"+_data.get("phone_no","")+"|\n"
- df["联系人.1"][index] = _str
- df['联系电话(手机号).1'][index] = _str
- print(total_count)
- df.to_excel("result.xlsx")
- def match_products():
- # filename = "未订阅用户查看公告记录.xlsx"
- # df = pd.read_excel(filename)
- # list_user_id = df["user_id"]
- # list_doc_id = df["doc_id"]
- # ots_client = getConnect_ots()
- # list_product = []
- # _c = 0
- # for docid in list_doc_id:
- # _c += 1
- # print(_c,len(list_doc_id))
- # bool_query = BoolQuery(must_queries=[TermQuery("docid",int(docid))])
- # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- # SearchQuery(bool_query),
- # columns_to_get=ColumnsToGet(["product"],return_type=ColumnReturnType.SPECIFIED))
- # list_data = getRow_ots(rows)
- # if len(list_data)>0:
- # list_product.append(list_data[0].get("product",""))
- # else:
- # list_product.append("")
- # u_id = None
- # u_product = []
- # list_product_group = []
- # for user_id,product in zip(list_user_id,list_product):
- # if u_id is None:
- # u_id = user_id
- # u_product.append(product)
- # else:
- # if user_id==u_id:
- # u_product.append(product)
- # else:
- # l_p = []
- # for _p in u_product:
- # s = _p.split(",")
- # l_p.extend(s)
- # _dict = {}
- # for p in l_p:
- # if p not in _dict:
- # _dict[p] = 0
- # _dict[p] += 1
- # for _ in u_product:
- # list_product_group.append(json.dumps(_dict,ensure_ascii=False))
- # u_id = user_id
- # u_product = [product]
- #
- #
- # if len(u_product)>0:
- # l_p = []
- # for _p in u_product:
- # s = _p.split(",")
- # l_p.extend(s)
- # _dict = {}
- # for p in l_p:
- # if p not in _dict:
- # _dict[p] = 0
- # _dict[p] += 1
- # for _ in u_product:
- # list_product_group.append(json.dumps(_dict,ensure_ascii=False))
- #
- # _d = {"user_id":list_user_id,
- # "doc_id":list_doc_id,
- # "product":list_product,
- # "product_group":list_product_group}
- # for k,v in _d.items():
- # print(k,len(v))
- # df1 = pd.DataFrame(_d)
- # df1.to_excel(filename+"_1.xlsx",columns=["user_id","doc_id","product","product_group"])
- filename = "大单未订阅用户关键词.xlsx"
- df = pd.read_excel(filename)
- list_userid = df["userid"]
- list_kw = df["关键词"]
- list_kw1 = []
- for kw in list_kw:
- list_word = kw.split(",")
- list_word.sort(key=lambda x:len(x))
- list_words = []
- for w_i in range(len(list_word)):
- _find = False
- _w = list_word[w_i]
- if len(_w)>8:
- _find = True
- else:
- for w_j in range(w_i):
- _wj = list_word[w_j]
- if str(_w).find(_wj)>=0:
- _find = True
- break
- if not _find:
- list_words.append(_w)
- if len(list_words)==6:
- break
- list_kw1.append(",".join(list_words))
- _dict = {"userid":list_userid,
- "关键词":list_kw1}
- df1 = pd.DataFrame(_dict)
- df1.to_excel(filename+"_1.xlsx",columns=["userid","关键词"])
- def export_columns():
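- # Backfill web_source_no/web_source_name and a rough text-length ("tokens") column per
- # docid by fetching each row and flattening its HTML.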
- filename = r"F:\Workspace2016\DataMining\data\2025-03-14_161851_数据导出.xlsx"
- df = pd.read_excel(filename)
- task_queue = Queue()
- result_queue = Queue()
- ots_client = getConnect_ots()
- ots_capacity = getConnect_capacity()
- for i in range(len(df)):
- docid = df.iloc[i]["docid"]
- docid = int(docid)
- task_queue.put(docid)
- from bs4 import BeautifulSoup
- import re
- def html2text_with_tablehtml(_html):
- # parse with BeautifulSoup when given a raw HTML string
- if isinstance(_html, str):
- _soup = BeautifulSoup(_html, "lxml")
- else:
- _soup = _html
- # accumulates the converted text fragments
- result_parts = []
- _find = False
- # walk the direct children; recursion handles deeper levels
- for child in _soup.find_all(recursive=False):
- if child.name in ["table", "tbody"]:
- # keep tables and table bodies as raw HTML
- result_parts.append("\n"+str(child)+"\n")
- else:
- # recurse into other elements and convert them to plain text
- text = html2text_with_tablehtml(child)
- result_parts.append(text)
- _find = True
- if not _find:
- _text = str(_soup.get_text())
- if len(_text)>0:
- if _soup.name in {"p","div","li"}:
- _text += "\n"
- result_parts.append(_text)
- # join all fragments into a single string
- result = "".join(result_parts)
- return result
- def _handle(item,result_queue):
- consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",int(item%500+1)),("docid",int(item))],["web_source_no","web_source_name"])
- _dict = getRow_ots_primary(return_row)
- web_source_no = _dict.get("web_source_no","")
- web_source_name = _dict.get("web_source_name","")
- consumed, return_row, next_token = ots_capacity.get_row("document",[("partitionkey",int(item%500+1)),("docid",int(item))],["dochtmlcon"])
- _dict = getRow_ots_primary(return_row)
- dochtmlcon = _dict.get("dochtmlcon","")
- _text = html2text_with_tablehtml(dochtmlcon)
- result_queue.put((item,web_source_no,web_source_name,_text))
- mt = MultiThreadHandler(task_queue,_handle,result_queue,30)
- mt.run()
- _dict_docid = {}
- while True:
- try:
- item,web_source_no,web_source_name,_text = result_queue.get(False)
- _dict_docid[item] = (web_source_no,web_source_name,_text)
- except Exception as e:
- break
- for i in range(len(df)):
- docid = df.iloc[i]["docid"]
- docid = int(docid)
- if docid in _dict_docid:
- web_source_no,web_source_name,_text = _dict_docid[docid]
- df.loc[i,"web_source_no"] = web_source_no
- df.loc[i,"web_source_name"] = web_source_name
- df.loc[i,"tokens"] = len(_text)
- else:
- df.loc[i,"web_source_no"] = ""
- df.loc[i,"web_source_name"] = ""
- df.loc[i,"tokens"] = 0
- df.to_excel(filename+"_1.xlsx")
- def clean_subcription(s_subcriptions,s_exclude_subcriptions):
- '''
- Automatically clean subscription keywords: sample recent notices hit by each
- subscription word, have an LLM score their relevance, and collect exclusion
- words from the low-interest hits.
- :param s_subcriptions: comma-separated subscription keywords
- :param s_exclude_subcriptions: comma-separated exclusion keywords
- :return:
- '''
- from export.html2text import html2text_with_tablehtml
- ots_client = getConnect_ots()
- ots_capacity = getConnect_capacity()
- def get_content(subcriptions,exclude_subcriptions,current_date):
- must_not_queries=[
- generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],exclude_subcriptions[:80],MatchPhraseQuery)
- ] if len(exclude_subcriptions)>0 else []
- bool_query = BoolQuery(must_queries=[
- TermsQuery("docchannel",[52,101,118,119,120,121,122,51]),
- TermQuery("page_time",current_date),
- generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],subcriptions[:80],MatchPhraseQuery),
- ],
- must_not_queries=must_not_queries)
- list_row = getDocument([{"query":bool_query,"limit":10}],["docid","doctitle"],thread_count=1)
- for row in list_row:
- docid = row["docid"]
- partitionkey = row["partitionkey"]
- consumed, return_row, next_token = ots_capacity.get_row("document",[("partitionkey",partitionkey),("docid",docid)],["dochtmlcon"])
- _dict = getRow_ots_primary(return_row)
- dochtmlcon = _dict.get("dochtmlcon","")
- _text = html2text_with_tablehtml(dochtmlcon)
- row["text"] = _text[:10000]
- return list_row
- def _handle(item,result_queue):
- list_row = get_content(item["subcriptions"],item["exclude_subcriptions"],item["current_date"])
- item["list_row"] = list_row
- for row in list_row:
- result_queue.put(row)
    def clean_exclude_set(list_exclude_set):
        # repeatedly merge any two overlapping sets into their intersection
        # until no two sets share an element
        while True:
            new_list_exclude_set = []
            pop_index = set()
            for _i in range(len(list_exclude_set)):
                if _i in pop_index:
                    continue
                _exclude_set = list_exclude_set[_i]
                _find = False
                for _j in range(_i+1,len(list_exclude_set)):
                    if _j in pop_index:
                        continue
                    _exclude_set_j = list_exclude_set[_j]
                    if len(_exclude_set & _exclude_set_j)>0:
                        new_list_exclude_set.append(_exclude_set & _exclude_set_j)
                        pop_index.add(_j)
                        _find = True
                        break
                if not _find:
                    new_list_exclude_set.append(_exclude_set)
                    pop_index.add(_i)
            if len(new_list_exclude_set)==len(list_exclude_set):
                # fixed point: nothing was merged in this pass
                return new_list_exclude_set
            list_exclude_set = new_list_exclude_set
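    # A small worked example of clean_exclude_set (hypothetical values, for
    # illustration only): overlapping sets collapse to their intersection, so
    # only exclusion words that co-occur across announcements survive.
    #   clean_exclude_set([{"a","b"}, {"b","c"}, {"d"}])
    #   pass 1: {"a","b"} & {"b","c"} -> {"b"}, giving [{"b"}, {"d"}]
    #   pass 2: nothing merges, so [{"b"}, {"d"}] is returned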
    def _handle1(row,result_queue):
        # ask the model to rate how interesting this announcement is for the
        # subscriber (0-10) and, when interest is low, to propose exclusion words
        _prompt = '''
客户对招投标信息感兴趣
订阅词如下:施工围挡,防眩板,标志牌,标示牌,标识牌,指示牌,路名牌,道路标线,道路标志,热熔标线,交通标线,交通标识工程,马路划线,道路标志制作,道路标志设置,车库划线
根据如上订阅词,识别客户真正感兴趣的内容,判断以下公告的兴趣度,范围是0-10,如果兴趣度不高(即内容不是客户感兴趣的),请生成排除词,排除词针对这篇公告的核心内容且内容不是客户感兴趣的
返回格式如下:
{"兴趣度":"","排除词":[]}
%s
'''%(row["text"])
        _result = chat_doubao(_prompt,model_name = "ep-20250314164242-jd62g")
        _json = get_json_from_text(_result)
        try:
            _dict = json.loads(_json)
        except Exception as e:
            _dict = {}
        row.update(_dict)
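    # Sketch of the reply _handle1 expects (hypothetical model output): the
    # JSON is pulled out of the chat text and merged into the row, e.g.
    #   _result = '...{"兴趣度":"3","排除词":["监理"]}...'
    #   json.loads(get_json_from_text(_result))
    #   -> {"兴趣度": "3", "排除词": ["监理"]}   # read back as row["兴趣度"] etc.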
    subcriptions = s_subcriptions.split(",")
    exclude_subcriptions = s_exclude_subcriptions.split(",") if s_exclude_subcriptions!="" else []
    original_exclude_subcriptions = exclude_subcriptions.copy()
    task_queue = Queue()
    result_queue = Queue()
    _index = 0
    dict_count = {}
    current_date = getCurrent_date(format="%Y-%m-%d")
    list_exclude_set = []
    while True:
        list_data = []
        if _index>=10:
            break
        _index += 1
        # step one week back per iteration, sampling up to 10 weeks
        current_date = timeAdd(current_date,-7)
        for subcription in subcriptions:
            _d = {"subcriptions":[subcription],
                  "exclude_subcriptions":exclude_subcriptions,
                  "current_date":current_date}
            list_data.append(_d)
            task_queue.put(_d)
        # fetch the matching announcements, then score each one with the model
        mt = MultiThreadHandler(task_queue,_handle,result_queue,10)
        mt.run()
        mt = MultiThreadHandler(result_queue,_handle1,None,30)
        mt.run()
        for row1 in list_data:
            total_interest = 0
            total_count = 0
            list_row = row1.get("list_row",[])
            for row in list_row:
                try:
                    interest = int(row["兴趣度"])
                except Exception as e:
                    interest = 0
                if interest>0:
                    total_interest += interest
                    total_count += 1
                if interest<=5:
                    # low-interest announcement: collect its exclusion words
                    list_exclude = row.get("排除词",[])
                    if len(list_exclude)>0:
                        list_exclude_set.append(set(list_exclude))
                    for _exclude in list_exclude:
                        if _exclude in dict_count:
                            dict_count[_exclude] += 1
                        else:
                            dict_count[_exclude] = 1
            if total_count>0:
                row1["avg_interest"] = round(total_interest/total_count,2)
                row1["total_interest"] = total_interest
    # keep only exclusion words that co-occur across announcements and were
    # proposed at least twice overall
    list_exclude_set = clean_exclude_set(list_exclude_set)
    exclude_set = {_exclude for _exclude_set in list_exclude_set for _exclude in _exclude_set}
    list_count = []
    for k,v in dict_count.items():
        if v>=2 and k in exclude_set:
            list_count.append((v,k))
    list_count.sort(key=lambda x:x[0],reverse=True)
    exclude_subcriptions = original_exclude_subcriptions.copy()
    # for v,k in list_count:
    #     if k not in exclude_subcriptions:
    #         exclude_subcriptions.append(k)
    # TODO: use exclusion-word co-occurrence to further shrink the exclusion list
    print("exclude_subcriptions",exclude_subcriptions)
    for row1 in list_data:
        print("interest")
        print(row1.get("subcriptions"))
        print(row1.get("avg_interest"))
        print(row1.get("total_interest"))
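# Usage sketch for clean_subcription (mirrors the call kept commented out in
# __main__ below):
#   clean_subcription("施工围挡,防眩板,标志牌", "")
# This samples up to 10 weeks of matching announcements, collects candidate
# exclusion words in list_count (applying them is still commented out above),
# and prints the average interest per subscription keyword.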
def fix_subcription():
    ots_client = getConnect_ots()

    def _handle(item,result_queue):
        # find recent winning-bid announcements where this company is the winner
        bool_query = BoolQuery(must_queries=[
            NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item.get("公司名称"))),
            TermsQuery("docchannel",[101,118,119,120,121,122]),
            RangeQuery("status",201,301)
        ])
        list_row = getDocument([{"query":bool_query,"limit":50}],["docid","doctitle","product","products","province"],thread_count=1)
        remain_products = []
        total_products = {}
        total_titles = []
        total_province = set()
        for row in list_row:
            product = row.get("product","")
            products = row.get("products")
            doctitle = row.get("doctitle")
            province = row.get("province")
            print("===============")
            print(product)
            print(products)
            if province!="" and province not in ("全国","未知"):
                total_province.add(province)
            total_titles.append(doctitle)
            # count product words from the comma-separated "product" column,
            # skipping words longer than 8 characters
            list_products = product.split(",")
            for p in list_products:
                if len(p)>8:
                    continue
                if p not in total_products:
                    total_products[p] = 0
                total_products[p] += 1
            # also count product words from the JSON "products" column,
            # skipping missing entries
            if products is not None and products!="":
                products_json = json.loads(products)
                list_products = [a.get("product") for a in products_json]
                for p in list_products:
                    if p is None or len(p)>8:
                        continue
                    if p not in total_products:
                        total_products[p] = 0
                    total_products[p] += 1
            # total_products.extend(list_products)
            # total_products = [a for a in total_products if a is not None and a!=""]
        # sort the counted product words by frequency, most common first
        list_products = []
        for k,v in total_products.items():
            list_products.append((k,v))
        list_products.sort(key=lambda x:x[1],reverse=True)
        # (an earlier substring-based dedup of product words, kept for reference)
        # for product in total_products:
        #     _find = False
        #     for p_i in range(len(remain_products)):
        #         p = remain_products[p_i]
        #         if p.find(product)>=0:
        #             remain_products[p_i] = product
        #             _find = True
        #         if product.find(p)>=0:
        #             _find = True
        #     if not _find:
        #         remain_products.append(product)
        # keep the 20 most frequent product words
        for p,_ in list_products[:20]:
            remain_products.append(p)
        prompt = "%s的主营产品是什么"%item.get("公司名称")
        # _business = chat_doubao_bot(prompt,"bot-20250725150712-pzfls")
        _business = ""
        item["搜索主营"] = _business
        messages = [
            {"role": "system", "content": "你是豆包,是由字节跳动开发的 AI 人工智能助手"},
        ]
        # ask the bot to distill at most 20 subscription keywords (each at most
        # 8 characters) from the mined product words plus the user's own searches
        prompt1 = '''
您是一个招投标数据平台的数据专家,你的客户“【%s】”希望和贵司的招投标数据平台合作,需要您为他设定订阅词,目前通过该公司的中标公告中提取的产品词有“【%s】”(注意,这些词里面有一些是提取错误的,也有一些是因为框架标被混杂进来的产品词,请你结合你对这个公司的了解剔除无关关键词),客户自己在平台上自己搜索的关键词有“【%s】”(注意,客户自己搜索的关键词也不一定是业务有关,有些客户使用的时候不熟悉平台所以可能输入有误),请搜索客户公司有关信息,再结合你对客户公司业务的了解,帮客户总结出一些有效的关键词,可以用来进行招投标信息的订阅,关键词数量不超过20个,每个词不超过8个字,用json 格式{"keywords":[]} 返回(注意有些关键词之间是相互包含关系的,只保留最小范围的, 如“种子光选机”和“光选机”, 只保留“光选机”一个就够了)
'''%(item.get("公司名称"),",".join(remain_products),item.get("搜索词"))
        print(prompt1)
        messages.append({"role": "user", "content": prompt1})
        _result = chat_doubao_bot(messages,"bot-20250725150712-pzfls")
        # record the reply as the assistant turn so the follow-up prompt has context
        messages.append({"role": "assistant", "content": _result})
        _json = get_json_from_text(_result)
        try:
            _dict = json.loads(_json)
        except Exception as e:
            _dict = {}
        if len(total_province)>3 or len(total_province)==0:
            # wins spread over many provinces (or none known): default to 全国
            item["地区"] = "全国"
        else:
            item["地区"] = ",".join(list(total_province))
        item["AI订阅词"] = ",".join(_dict.get("keywords",[]))
        # first try a title-only search over the last 30 days
        must_queries=[
            generateBoolShouldQuery(["doctitle"],item["AI订阅词"].split(","),MatchPhraseQuery),
            RangeQuery("page_time",timeAdd(getCurrent_date("%Y-%m-%d"),-30)),
            TermQuery("docchannel",52)
        ]
        if item["地区"]!="全国":
            must_queries.append(generateBoolShouldQuery(["province"],item["地区"].split(","),TermQuery))
        bool_query = BoolQuery(must_queries=must_queries)
        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                       SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]),limit=1,get_total_count=True),
                                                                       ColumnsToGet(["doctitle"],return_type=ColumnReturnType.SPECIFIED))
        if total_count<500:
            # too few title hits: widen the search to full text and attachments
            must_queries[0] = generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],item["AI订阅词"].split(","),MatchPhraseQuery)
            bool_query = BoolQuery(must_queries=must_queries)
            item["搜索范围"] = "全文"
        list_row = getDocument([{"query":bool_query,"limit":50}],["docid","doctitle",],thread_count=1)
        total_titles = []
        for row in list_row:
            doctitle = row.get("doctitle")
            total_titles.append(doctitle)
        # ask the model to propose generic exclusion words from the sampled titles
        prompt2 = '''
订阅后获取到的招标公告标题如下:
【%s】

请你筛选出不符合业务的公告标题,并总结出一些有效通用的标题排除词(排除词也是尽量可泛化一下, 例如“消防监理”属于监理服务,排除词应该直接用“监理”,也尽量用在上述标题中出现的表达搭配,随意省略中间部分词汇可能导致无法有效排除,因为排除词用的是全匹配逻辑),请把这些排除词用 json格式{"exclude_words":[]},json上下文符合正则```json(?P<json>.*)``` 输出(不超过 20 个,每个排除词单个词不超过 6 个字)
'''%("\r\n".join(total_titles))
        messages.append({"role": "user", "content": prompt2})
        # _result = chat_doubao(prompt2,model_name = "ep-20250314164242-jd62g")
        # _result = chat_doubao_bot(messages,"bot-20250725150712-pzfls")
        _result = chat_doubao_messages(messages,model_name = "ep-20250212111145-fflr7")
        print(prompt2)
        _json = get_json_from_text(_result)
        print(_json)
        try:
            _dict = json.loads(_json)
        except Exception as e:
            _dict = {}
        print(_dict)
        print(_dict.get("exclude_words"))
        item["AI排除词"] = _dict.get("exclude_words")
        item["最终产品词"] = ",".join(remain_products)
        item["标题"] = "\n".join(total_titles)
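    # Sketch of the two replies _handle expects (hypothetical model output):
    #   prompt1 -> '```json{"keywords":["光选机","防眩板"]}```'
    #   prompt2 -> '```json{"exclude_words":["监理"]}```'
    # get_json_from_text extracts the JSON payload from each chat reply before
    # json.loads parses it.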
    filename = "所有注册用户并且关注微信的数据信息.xlsx"
    df = pd.read_excel(filename)
    companynames = ["爱威科技股份有限公司","上海百若试验仪器有限公司","宁联电缆集团有限公司"]
    task_queue = Queue()
    list_data = []
    # keep only companies that have won bids and have usable search keywords
    for company,_b,search in zip(df["公司名"],df["是否中标"],df["搜索词"]):
        print(_b,type(_b))
        if not _b:
            continue
        if not isinstance(search,str):
            continue
        # keep the 20 shortest of the user's own search keywords
        list_search = search.split(",")
        list_search.sort(key=lambda x:len(x))
        _d = {"公司名称":company,
              "搜索词":",".join(list_search[:20]),
              "搜索主营":"",
              "搜索范围":"标题",
              "AI订阅词":"",
              "AI排除词":"",
              "最终产品词":"",
              "标题":"",
              "公告类型":"招标公告",
              "地区":"全国"}
        list_data.append(_d)
        if len(list_data)>100:
            break
    print("list_data",len(list_data))
    for data in list_data:
        task_queue.put(data)
    mt = MultiThreadHandler(task_queue,_handle,None,5)
    mt.run()
    df = pd.DataFrame(list_data)
    df.to_excel("../data/%s_extract2.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")))
if __name__=="__main__":
    # compareData()
    # attachAttachment()
    # exportDocument_By_time(time_from="2021-01-29",time_to="2021-01-29",columns=["docid","doctitle","project_name","dochtmlcon"])
    # processDocument()
    # export_extract_check()
    # exportArticle_by_websource()
    # export_keyword_count()
    # export_province_keyword_count()
    # exportDocument_dump()
    # exportDocument_dump_mysql()
    # export_attachment()
    # statics_attachment_counts()
    # get_follows()
    # append_title()
    # exportDocument_by_doctitle()
    # exportIndustryCount()
    # exportDocument_by_pagetime()
    # export_columns()
    # match_products()
    # match_contact()
    # exportProjects()
    # exportProducts()
    # exportParameters()
    # exportAgencyCount()
    # getDocumentHtml()
    # getDumplicate_docid()
    # exportHonors_item_info()
    # check_dump_data()
    # search_title_count()
    # count_product()
    # export_dump_by_id()
    # group_xlsx()
    # static_process_time()
    # check_data_synchronization()
    # process_doc()
    # export_competition()
    # for page_time in ["2022-08-01"]:
    #     exportDocument_by_days(page_time)
    # exportDocument_forRecommen()
    # exportDocument_attachment()
    # exportWin_tenderer_count()
    # attachCompanyContact()
    # dumpWebSourceNo()
    # print("http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%103571618))))
    # exportNzj()
    # turn_status()
    # attachBidding_budget()
    # debug_documentMerge()
    # exportDocument_medicine("2021-05-24","2021-05-30")
    # signDocument()
    # transUUid()
    # fix_document()
    # export_document_no_price()
    # findProjects()
    # exportDetailLink()
    # export_extract_check()
    # export_extract2()
    # export_by_file()
    # export_dump()
    # clean_subcription("施工围挡,防眩板,标志牌","")
    fix_subcription()