exportDocument.py 285 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231
  1. #encoding:UTF8
  2. import sys
  3. import os
  4. sys.path.append("..")
  5. print(sys.path)
  6. import pandas as pd
  7. from dataSource.source import *
  8. import json
  9. from utils.multiThread import MultiThreadHandler
  10. import queue
  11. from utils.Utils import *
  12. from dataSource.pool import ConnectorPool
  13. import re
  14. from tablestore import *
  15. import traceback
  16. from utils.hashUtil import aesCipher
  17. from uuid import uuid4
  18. from export.exportUtils import *
  19. from export.DoubaoUtils import chat_doubao,get_json_from_text,chat_doubao_bot,chat_doubao_messages
  20. data_path = "../data/"
def getCompanyTenderer():
    """Batch report: for each company listed in ../data/服务型客户.txt, query the
    neo4j graph for its ZhongBiaoRelation (winning-bid) projects, total the award
    prices, collect the 10 most frequent runner-up competitors from MongoDB, and
    write all columns to "<input filename>_export.xls".
    """
    def _handle(item,result_queue):
        # `item` is one company name pulled off the task queue by MultiThreadHandler.
        company = item
        dict_result = {"company":company,"count":0,"competitor":"","project_name":""}
        dict_result["company"] = company
        graph = getConnect_neo4j()
        # Every project this organization won, with the related document ids.
        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhao_biao_id,p.zhong_biao_id"%(company)
        finded = graph.run(cql)
        # dumps/loads round-trip coerces the neo4j record objects into plain dicts.
        finded_ids = json.loads(json.dumps(finded.data()))
        dict_result["count"] = len(finded_ids)
        mongoDB = getConnect_mongodb()
        coll_zb = mongoDB.zhongbiao_extraction
        if len(finded_ids)>0:
            # Sample up to 3 project names for the report column.
            cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.project_name limit 3"%(company)
            finded = graph.run(cql)
            finded_names = json.loads(json.dumps(finded.data()))
            list_names = [_i["p.project_name"] for _i in finded_names]
            dict_result["project_name"] = str(list_names)
            # Sum the award price over every winning relation.
            cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN r.price"%(company)
            finded = graph.run(cql)
            finded_money = json.loads(json.dumps(finded.data()))
            whole_money = 0
            # NOTE(review): the loop variable shadows the `item` parameter; safe only
            # because `company` was copied out above.
            for item in finded_money:
                if item["r.price"] is not None:
                    whole_money += getUnifyMoney(item["r.price"])
            dict_result["whole_money"] = str(whole_money)
            # Tally how often each other company appears as 2nd/3rd tenderer on the
            # same winning documents.
            dict_competitor = {}
            for item in finded_ids:
                docId = item["p.zhong_biao_id"]
                if docId is not None:
                    rows = coll_zb.find({"docId":docId})
                    for row in rows:
                        keys = ["second_tenderer","third_tenderer"]
                        for _key in keys:
                            if _key in row:
                                if row[_key] not in dict_competitor:
                                    dict_competitor[row[_key]] = 0
                                dict_competitor[row[_key]] += 1
            # Keep only the 10 most frequent competitors, by descending count.
            list_competitor = []
            for _key in dict_competitor:
                list_competitor.append([_key,dict_competitor[_key]])
            list_competitor.sort(key=lambda x:x[1],reverse=True)
            list_competitors = [i[0] for i in list_competitor[:10]]
            dict_result["competitor"] = str(list_competitors)
        # Always emit a row, even for companies with zero wins.
        result_queue.put(dict_result)
    # filename = "成交客户匹配中标项目的需求.xlsx"
    # df = pd.read_excel(filename)
    # list_company = df["公司名字"]
    # company = list_company[0]
    # Input: one company name per line, GBK-encoded.
    list_company = []
    filename = "../data/服务型客户.txt"
    with open(filename,"r",encoding="GBK") as f:
        while(True):
            line = f.readline()
            if not line:
                break
            list_company.append(line.strip())
    task_queue = queue.Queue()
    for company in list_company:
        task_queue.put(company)
    result_queue = queue.Queue()
    handler = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=10)
    handler.run()
    # Drain the result queue into parallel column lists for the DataFrame.
    list_company = []
    list_zb = []
    list_count = []
    list_project = []
    list_money = []
    list_competitor = []
    while(True):
        try:
            _result = result_queue.get(False)
            list_company.append(_result.get("company",""))
            # NOTE(review): `count` is always set by _handle; if it were ever
            # missing, the "" default would make `>0` raise a TypeError.
            list_zb.append("是" if _result.get("count","")>0 else "否")
            list_count.append(_result.get("count",""))
            list_project.append(_result.get("project_name",""))
            list_money.append(_result.get("whole_money",""))
            list_competitor.append(_result.get("competitor",""))
        except Exception as e:
            # queue.Empty ends the drain loop (any other error also stops it).
            print(e)
            break
    df1 = pd.DataFrame({"公司名字":list_company,"是否中标":list_zb,"中标次数":list_count,"中标项目":list_project,"中标金额":list_money,"潜在竞争对手":list_competitor})
    df1.to_excel("%s_export.xls"%(filename),columns=["公司名字","是否中标","中标次数","中标项目","中标金额","潜在竞争对手"])
  104. def export_count_includeKeyword():
  105. filename = "../data/other/jc001.xlsx"
  106. list_name = []
  107. list_count = []
  108. df = pd.read_excel(filename)
  109. _index = 0
  110. for row in df["品目"]:
  111. _name = row
  112. data = solrQuery("document",{"q":'dochtmlcon:"%s"'%_name,"fq":'(publishtime:[2020-01-01T00:00:00Z%20TO%202020-08-12T23:59:59Z])',"fl":"city","rows":1})
  113. if data is not None:
  114. _count = data["response"]["numFound"]
  115. else:
  116. _count = 0
  117. list_name.append(_name)
  118. list_count.append(_count)
  119. _index += 1
  120. print(_index)
  121. df1 = pd.DataFrame({"品目":list_name,"数量":list_count})
  122. df1.to_excel("%s_export.xls"%filename)
  123. def export_count_includeKeyword_multiThread():
  124. def _handler(item,result_queue):
  125. data = solrQuery("document",{"q":'dochtmlcon:"%s"'%item,"fq":'(publishtime:[2020-01-01T00:00:00Z%20TO%202020-08-12T23:59:59Z])',"fl":"city","rows":1})
  126. if data is not None:
  127. _count = data["response"]["numFound"]
  128. else:
  129. _count = 0
  130. result_queue.put([item,_count])
  131. task_queue = queue.Queue()
  132. result_queue = queue.Queue()
  133. filename = "../data/other/jc001.xlsx"
  134. list_name = []
  135. list_count = []
  136. df = pd.read_excel(filename)
  137. _index = 0
  138. for row in df["品目"]:
  139. _name = row
  140. task_queue.put(_name)
  141. _index += 1
  142. multHandler = MultiThreadHandler(task_queue,_handler,result_queue,thread_count=20)
  143. multHandler.run()
  144. while(True):
  145. try:
  146. item = result_queue.get(False)
  147. list_name.append(item[0])
  148. list_count.append(item[1])
  149. except queue.Empty as e:
  150. break
  151. df1 = pd.DataFrame({"品目":list_name,"数量":list_count})
  152. df1.to_excel("%s_export.xls"%filename)
def exportKeywords():
    """For each keyword in ../data/品目.txt, fetch the ids of 2019 documents that
    mention both the keyword and 法院 from Solr, resolve each document's
    project_name from MongoDB (zhaobiao first, then zhongbiao), and print
    per-keyword document counts followed by distinct-project counts.
    """
    def _handle(item,result_queue,pool_mongo):
        # item: {"docId": ..., "project_name": ""}; this fills in project_name.
        docId = item["docId"]
        mongo = pool_mongo.getConnector()
        zhongbiao = mongo.zhongbiao_extraction
        zhaobiao = mongo.zhaobiao_extraction
        _project = ""
        # Prefer the project name from the zhaobiao (tender) extraction...
        rows = zhaobiao.find({"docId":docId},{"project_name":1})
        find_flag = False
        for row in rows:
            find_flag = True
            _project = row.get("project_name","")
        if not find_flag:
            # ...falling back to the zhongbiao (award) extraction.
            rows = zhongbiao.find({"docId":docId},{"project_name":1})
            for row in rows:
                _project = row.get("project_name","")
        item["project_name"] = _project
        # Return the connector to the pool before emitting the result.
        pool_mongo.putConnector(mongo)
        result_queue.put(item)
    list_key = []
    dict_key_ids = dict()  # keyword -> list of matching Solr document ids
    with open("../data/品目.txt", "r", encoding="utf8") as f:
        while(True):
            row = f.readline()
            if not row:
                break
            # NOTE(review): `row` keeps its trailing newline (no strip); it is
            # used both as the dict key and inside the Solr query string below —
            # confirm that is intended.
            list_key.append(row)
            dict_key_ids[row] = []
            data = solrQuery("document",{"q":'dochtmlcon:"%s" AND dochtmlcon:"法院"'%row,"fq":'(publishtime:[2019-01-01T00:00:00Z TO 2019-12-31T23:59:59Z])',"fl":"id","rows":10000000})
            for item in data["response"]["docs"]:
                dict_key_ids[row].append(item["id"])
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for _key in dict_key_ids.keys():
        for item in dict_key_ids[_key]:
            task_queue.put({"docId":item,"project_name":""})
    pool_mongo = ConnectorPool(init_num=10,max_num=200,method_init=getConnect_mongodb)
    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=200,pool_mongo=pool_mongo)
    mt.run()
    # docId -> resolved project name.
    dict_docId_projectname = {}
    while(True):
        try:
            item = result_queue.get(False)
            dict_docId_projectname[item["docId"]] = item["project_name"]
        except Exception:
            # queue.Empty: all results drained.
            break
    # Distinct project names per keyword (collapses multiple docs of one project;
    # unresolved docs all collapse into the "" entry).
    dict_key_count = dict()
    for _key in dict_key_ids.keys():
        set_key = set()
        for docId in dict_key_ids[_key]:
            set_key.add(dict_docId_projectname.get(docId,""))
        dict_key_count[_key] = len(set_key)
    print("==")
    for _key in list_key:
        print(len(dict_key_ids[_key]))
    print("==")
    for _key in list_key:
        print(dict_key_count[_key])
    print("==")
def getIndustryCompany():
    # Enrich a list of company names (read from an Excel sheet) with
    # enterprise profile fields from OTS, peers in the same industry, and the
    # count of won projects from neo4j; then dump everything back to Excel.
    def _handle(item,result_queue,pool_mongo,pool_neo4j,pool_mysql,pool_ots):
        # Worker run by MultiThreadHandler for one company dict `item`.
        # NOTE(review): pool_mysql is accepted but never used in this body.
        # mongoDB = getConnect_mongodb()
        log(item["enterprise_name"])
        # Mongo connector is checked out but only returned at the end; the
        # earlier mongo-based enrichment is the commented block below.
        mongoDB = pool_mongo.getConnector()
        # coll_zb = mongoDB.enterprise_profile
        # rows = coll_zb.find({"enterprise_name":item["enterprise_name"]},{"enterprise_name":1,"legalPersonName":1,"actualCapital":1, "regCapital":1,"estiblishTime":1,"socialStaffNum":1,"legal_person":1,"phone":1,"businessScope":1,"industry":1 })
        # for row in rows:
        #     item["regCapital"] = row.get("regCapital","")
        #     item["legal_person"] = row.get("legal_person","")
        #     item["phone"] = row.get("phone","")
        #     item["actualCapital"] = row.get("actualCapital","")
        #     item["industry"] = row.get("industry","")
        #     item["estiblishTime"] = row.get("estiblishTime","")
        #     item["socialStaffNum"] = row.get("socialStaffNum","")
        #     item["businessScope"] = row.get("businessScope","")
        # graph = getConnect_neo4j()
        ots_client = pool_ots.getConnector()
        # Point read of the enterprise profile row keyed by company name.
        primary_key = [('name',item["enterprise_name"])]
        columns_to_get = ["reg_capital","legal_person","phone","actual_capital","industry","estiblishTime","social_staff_num","business_scope"]
        consumed, return_row, next_token = ots_client.get_row("enterprise",primary_key, columns_to_get, None, 1)
        if return_row is not None:
            # attribute_columns entries are (name, value, ...) tuples; copy
            # each returned attribute straight into the item dict.
            for att in return_row.attribute_columns:
                item[att[0]] = att[1]
        # Up to 10 other companies sharing the same industry label.
        list_same_industry_company = []
        if "industry" in item:
            bool_query = BoolQuery(must_queries=[TermQuery("industry",item["industry"])])
            col = ColumnsToGet(['enterprise_name'], ColumnReturnType.SPECIFIED)
            rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
                                                                              SearchQuery(bool_query, limit=10, get_total_count=True),
                                                                              col)
            for row in rows:
                # row[0] holds the primary-key columns; item1[1] is the value.
                for item1 in row[0]:
                    list_same_industry_company.append(item1[1])
        # if "industry" in item:
        #     rows = coll_zb.find({"industry":item["industry"]},{"enterprise_name":1}).limit(10)
        #     for row in rows:
        #         print(row)
        #         list_same_industry_company.append(row.get("enterprise_name",""))
        item["same_industry_company"] = list_same_industry_company
        # Count of projects this organization has won, via neo4j.
        graph = pool_neo4j.getConnector()
        company_name = item["enterprise_name"]
        # NOTE(review): company_name is interpolated into the cypher string
        # unescaped; a name containing a quote would break the query.
        cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN count(p) as _c "%(company_name)
        finded = graph.run(cql)
        # dumps/loads round-trip normalizes the driver's record objects into
        # plain dicts.
        data = json.loads(json.dumps(finded.data()))
        _count = data[0]["_c"]
        # list_project = []
        # for _data in data:
        #     if _count<=3:
        #         if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
        #             if _data["project_name"] is not None:
        #                 list_project.append(_data["project_name"])
        #     _count += 1
        item["count"] = _count
        # item["project"] = str(list_project)
        result_queue.put(item)
        # Return all checked-out connectors to their pools.
        pool_mongo.putConnector(mongoDB)
        pool_neo4j.putConnector(graph)
        pool_ots.putConnector(ots_client)
    log_tofile("export.log")
    pool_mongo = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_mongodb)
    pool_neo4j = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_neo4j)
    pool_mysql = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_mysql)
    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
    # list_company = getCompanys()
    # filename = "".join(["环境","生态","再生","回收","环保"])
    list_company = []
    filename = "../data/同行客户匹配.xlsx"
    # NOTE(review): `sheetname` is the pre-pandas-0.21 keyword (newer pandas
    # expects `sheet_name`) — confirm the pinned pandas version supports it.
    df = pd.read_excel(filename,sheetname=0)
    for _com in df["公司名称"]:
        print(_com)
        if _com is not None and _com.strip()!="":
            _company = {"enterprise_name":""}
            _company["enterprise_name"] = _com
            list_company.append(_company)
    task_queue = queue.Queue()
    for item in list_company:
        task_queue.put(item)
    result_queue = queue.Queue()
    # Workers mutate the item dicts in place, so list_company itself ends up
    # enriched once the handler pool drains the queue.
    _muti = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_mongo=pool_mongo,pool_neo4j=pool_neo4j,pool_mysql=pool_mysql,pool_ots=pool_ots)
    _muti.run()
    # Collect the union of all keys across items, then build a rectangular
    # dict-of-lists ("" for missing cells) for the DataFrame export.
    df_company = {}
    set_key = set()
    if len(list_company)>0:
        for item in list_company:
            for _key in item.keys():
                set_key.add(_key)
                if _key not in df_company:
                    df_company[_key] = []
        list_key = list(set_key)
        for item in list_company:
            for _key in list_key:
                df_company[_key].append(item.get(_key,""))
        df1 = pd.DataFrame(df_company)
        df1.to_excel("%s_export.xlsx"%(filename))
  307. def exportWin_tenderer(time_from,time_to):
  308. '''
  309. :return:
  310. '''
  311. ost_client = getConnect_ots()
  312. last_docid = 0
  313. bool_query = BoolQuery(must_queries=[RangeQuery("page_time",time_from,time_to,include_lower=True,include_upper=True),
  314. TermQuery("docchannel",101),
  315. RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
  316. RangeQuery('docid', last_docid, include_lower=False)])
  317. rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
  318. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True),
  319. ColumnsToGet(["project_name","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
  320. list_project = []
  321. def _getRow(list_project,rows,last_docid):
  322. for row in rows:
  323. project_name = row[1][0][1]
  324. docid = row[0][1][1]
  325. last_docid = docid
  326. list_pack = json.loads(row[1][1][1])
  327. _set_tenderer = set()
  328. win_tenderer = ""
  329. for _pack in list_pack:
  330. if "win_tenderer" in _pack and win_tenderer=="":
  331. win_tenderer = _pack["win_tenderer"]
  332. if "second_tenderer" in _pack:
  333. _set_tenderer.add(_pack["second_tenderer"])
  334. if "third_tenderer" in _pack:
  335. _set_tenderer.add(_pack["third_tenderer"])
  336. list_project.append({"docid":docid,"project_name":project_name,"win_tenderer":win_tenderer,"tenderer":list(_set_tenderer)})
  337. return last_docid
  338. _getRow(list_project,rows,last_docid)
  339. while(next_token):
  340. print("%d/%d"%(len(list_project),total_count))
  341. rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
  342. SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
  343. ColumnsToGet(["project_name","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
  344. last_docid = _getRow(list_project,rows,last_docid)
  345. task_queue = queue.Queue()
  346. result_queue = queue.Queue()
  347. for item in list_project:
  348. task_queue.put(item)
  349. pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
  350. def _handle(item,result_queue,pool_ots):
  351. if item["win_tenderer"]!="":
  352. ots_client = pool_ots.getConnector()
  353. consumed, return_row, next_token = ost_client.get_row("enterprise", [("name",item["win_tenderer"])], ["province","reg_capital","estiblish_time","business_scope"], None, 1)
  354. _dict = dict()
  355. for _item in return_row.attribute_columns:
  356. _dict[_item[0]] = _item[1]
  357. for _key in _dict.keys():
  358. item[_key] = _dict[_key]
  359. data = solrQuery("contact",{"q":'company_name:"%s"'%item["win_tenderer"],"fl":"contact_person,mobile_no,phone_no","rows":10})
  360. for _item in data["response"]["docs"]:
  361. for _key in _item.keys():
  362. item[_key] = _item[_key]
  363. break
  364. pool_ots.putConnector(ots_client)
  365. mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots);
  366. mt.run()
  367. keys = ["docid","project_name","win_tenderer","tenderer","province","reg_capital","business_scope","estiblish_time","contact_person","mobile_no","phone_no"]
  368. df_data = {}
  369. for _key in keys:
  370. df_data[_key] = []
  371. for item in list_project:
  372. for _key in keys:
  373. if _key in item:
  374. df_data[_key].append(item[_key])
  375. else:
  376. df_data[_key].append("")
  377. df = pd.DataFrame(df_data)
  378. df.to_excel("../data/%s-%s中标信息.xlsx"%(time_from,time_to),columns=keys)
def exportContact():
    # Read the previously exported win-bid Excel file, collect every
    # runner-up company from its "tenderer" column, fetch their contact info
    # from solr, and write a competitor-contact Excel file.
    time_from = "2021-01-14"
    time_to = "2021-01-15"
    filename = "../data/%s-%s中标信息.xlsx"%(time_from,time_to)
    df1 = pd.read_excel(filename)
    set_company = set()
    for item in df1["tenderer"]:
        # "tenderer" cells hold stringified Python lists like "['a', 'b']";
        # split on the list punctuation to recover the names.
        list_company = re.split("\['|', '|'\]|\[\]",item)
        for _company in list_company:
            if _company!="":
                set_company.add(_company)
    companys = list(set_company)
    task_queue = queue.Queue()
    list_company = []
    for _company in companys:
        # The same dict object goes into both list_company and the queue, so
        # worker mutations are visible in list_company afterwards.
        item = {"company_name":_company}
        list_company.append(item)
        task_queue.put(item)
    result_queue = queue.Queue()
    def _handle(item,result_queue):
        # Copy the first solr contact hit into the item (note the break).
        company = item["company_name"]
        data = solrQuery("contact",{"q":'company_name:"%s"'%company,"fl":"company_name,contact_person,mobile_no,phone_no","rows":10})
        for _item in data["response"]["docs"]:
            for _key in _item.keys():
                item[_key] = _item[_key]
            break
    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30);
    mt.run()
    keys = ["company_name","contact_person","mobile_no","phone_no"]
    df_data = {}
    for _key in keys:
        df_data[_key] = []
    # Strip control characters that openpyxl rejects when writing xlsx.
    ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
    for item in list_company:
        for _key in keys:
            if _key in item:
                # NOTE(review): assumes item[_key] is a str; a numeric
                # phone value from solr would make .sub raise — confirm.
                df_data[_key].append(ILLEGAL_CHARACTERS_RE.sub(r'', item[_key]))
            else:
                df_data[_key].append("")
    df = pd.DataFrame(df_data)
    df.to_excel("../data/%s-%s竞争对手信息.xlsx"%(time_from,time_to),columns=keys)
def countKeyword():
    # Count, per keyword, how many 2020-09-01..07 tender announcements
    # (docchannel 52) contain it, scanning MySQL rows in 10k batches.
    conn = getConnection_mysql()
    cursor = conn.cursor()
    print(0)
    sql = "select dochtmlcon from sys_document_22 where docchannel=52 and page_time>='2020-09-01' and page_time<='2020-09-07'"
    cursor.execute(sql)
    print(0.1)
    # Keyword list comes from the "细分类别" (sub-category) column.
    df = pd.read_excel("万郡绿建细分关键词.xls")
    list_keywords = df["细分类别"]
    dict_keywords = dict()
    for _key in list_keywords:
        dict_keywords[_key] = 0
    print(1)
    from bs4 import BeautifulSoup
    while(True):
        rows = cursor.fetchmany(10000)
        print("==")
        if not rows:
            break
        for row in rows:
            # Strip HTML to plain text before the regex search.
            _html = BeautifulSoup(row[0],"lxml").getText()
            for _key in list_keywords:
                # NOTE(review): _key is used as a regex pattern; keywords
                # containing regex metacharacters would need re.escape.
                if re.search(_key,_html) is not None:
                    dict_keywords[_key] += 1
        print(dict_keywords)
    list_count = []
    for _key in list_keywords:
        list_count.append(dict_keywords[_key])
    df1 = pd.DataFrame({"关键字":list_keywords,"数量":list_count})
    df1.to_excel("关键词统计.xlsx")
def countKeyword_solr():
    # For each keyword in an Excel column, count 2020 solr hits for win-bid
    # (docchannel 101) and tender (docchannel 52) announcements, then export.
    def _handle(item,result_queue):
        keyword = item["keyword"]
        # rows=10: only numFound is used, the docs themselves are ignored.
        data = solrQuery("document",{"q":'dochtmlcon:"%s" AND docchannel:101 AND dochtmlcon:"法院" '%keyword,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])',"fl":"id","rows":10})
        _num = data["response"]["numFound"]
        item["zhongbiao"] = _num
        data = solrQuery("document",{"q":'dochtmlcon:"%s" AND docchannel:52 AND dochtmlcon:"法院"'%keyword,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])',"fl":"id","rows":10})
        _num = data["response"]["numFound"]
        item["zhaobiao"] = _num
        result_queue.put(item)
    file = "../data/关键词11.xlsx"
    df = pd.read_excel(file)
    task_queue = queue.Queue()
    print(df.keys())
    for item in df["业务关键词"]:
        task_queue.put({"keyword":item,"zhaobiao":0,"zhongbiao":0})
    result_queue = queue.Queue()
    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=10)
    mt.run()
    list_keyword = []
    list_zhaobiao = []
    list_zhongbiao = []
    # Drain the result queue; non-blocking get raises Empty when done.
    while(True):
        try:
            item = result_queue.get(False)
            list_keyword.append(item["keyword"])
            list_zhaobiao.append(item["zhaobiao"])
            list_zhongbiao.append(item["zhongbiao"])
        except Exception:
            break
    df1 = pd.DataFrame({"业务关键词":list_keyword,"招标公告":list_zhaobiao,"中标信息":list_zhongbiao})
    df1.to_excel("%s_export.xlsx"%file,columns=["业务关键词","招标公告","中标信息"])
  482. def query_from_solr():
  483. data = solrQuery("document",{"q":'dochtmlcon:"法律" AND (docchannel:51 OR docchannel:104 or docchannel:52 or docchannel:102) AND province:"湖南" ',"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-01-20T23:59:59Z])',"fl":"id","rows":10})
  484. _num = data["response"]["numFound"]
  485. print(_num)
def export_province_keyword_count():
    # Count 2018 tender announcements (docchannel 52) per province and print
    # a province/count table. The keyword filter is currently disabled in
    # the query (see commented entries inside bool_query).
    def _handle(item,result_queue,pool_ots):
        columns = ["doctitle","docchannel","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]
        ots_client = pool_ots.getConnector()
        _province = item["province"]
        print(item)
        # keywords = item["keyword"]+" "+item["key"]
        list_keyword = item["keyword"]
        # for _temp in keywords.split(" "):
        #     if len(_temp)>0:
        #         list_keyword.append(_temp)
        # Built but unused below: title-match shoulds and tenderee excludes.
        should_queries = []
        must_not_q = []
        for _temp in list_keyword:
            should_queries.append(MatchPhraseQuery("doctitle","%s"%_temp))
            must_not_q.append(WildcardQuery("tenderee","*%s*"%_temp))
        bool_query_keyword = BoolQuery(should_queries=should_queries,minimum_should_match=2)
        page_time = item["page_time"]
        bool_query = BoolQuery(must_queries=[#bool_query_keyword
                                             # ,WildcardQuery("publishtime","%s*"%page_time)
                                             # ,MatchPhraseQuery("doctitle","服务")
                                             RangeQuery("page_time","2018-01-01","2019-01-01",include_lower=True,include_upper=False),
                                             TermQuery("docchannel",52),
                                             RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
                                             # NOTE(review): the province name is matched against the
                                             # *city* field with a wildcard — confirm this is intended.
                                             WildcardQuery('city', '%s*'%_province)
                                             # ,NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_tenderer",0,include_lower=True))
                                             ]
                               # ,must_not_queries=must_not_q
                               )
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("sub_docs_json.bidding_budget",SortOrder.DESC)]), limit=20, get_total_count=True),
                                                                          ColumnsToGet(column_names=columns,return_type=ColumnReturnType.SPECIFIED))
        # Only the total count and the first 20 rows are kept per province.
        item["count"] = total_count
        list_data = getRow_ots(rows)
        item["list_data"] = list_data
        print(item)
        pool_ots.putConnector(ots_client)
    # Provinces: rows of 省份信息.xlsx with ctype==30 under parentid==4.
    df = pd.read_excel("../data/省份信息.xlsx")
    list_province = []
    for _name,_type,_parent in zip(df["cname"],df["ctype"],df["parentid"]):
        if _type==30 and _parent==4:
            list_province.append(_name)
    # (dead code: earlier keyword-from-Excel loading variant, removed for
    # readability — recover from VCS history if needed)
    # Whitespace-separated keyword list, parsed below.
    keyword_str = '''
    快递 物流 供应链 运输 配送
    仓储 冷链 整车 服务
    '''
    list_key = []
    for _k in re.split("\s",keyword_str):
        _k1 = _k.strip()
        if len(_k1)>0:
            list_key.append(_k1)
    list_task = []
    page_time = "2020-11"
    for _province in list_province:
        list_task.append({"page_time":page_time,"province":_province,"key":list_key,"keyword":list_key,"count":0})
    task_queue = queue.Queue()
    for item in list_task:
        task_queue.put(item)
    result_queue = queue.Queue()
    pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
    mt.run()
    dict_key_data = dict()
    # (dead code: large commented-out Excel export of the per-document rows
    # — docid/title/channel/contacts/prices — removed for readability;
    # recover from VCS history if needed)
    # Workers mutated the task dicts in place; print the per-province counts.
    for item in list_task:
        print("%s\t%d"%(item["province"],item["count"]))
    # (dead code: commented-out keyword-by-province pivot export, removed
    # for readability — recover from VCS history if needed)
def export_keyword_count():
    # For each company name read from a CSV, count (within a date range):
    # any-mention hits, documents where it is the tenderee, and documents
    # where it is the winning bidder; export the counts to Excel.
    def _handle(item,result_queue,ots_client):
        # Channel filter variant kept for reference but unused below.
        shoud_q_docchannel = BoolQuery(should_queries=[
            # RangeQuery("docchannel",51,105,True,True)
            TermQuery("docchannel",101),
            RangeQuery("docchannel",118,120,True,True)
        ]
        )
        # Keyword may appear in the title, the body text, or attachments.
        should_q_keyword = BoolQuery(should_queries=[
            MatchPhraseQuery("doctitle",item["keyword"]),
            MatchPhraseQuery("doctextcon",item["keyword"]),
            MatchPhraseQuery("attachmenttextcon",item["keyword"])
        ])
        # Query 1: any mention in the selected channels -> total_count/exists.
        bool_query = BoolQuery(must_queries=[RangeQuery("page_time",item["range_from"],item["range_to"],True,False),
                                             RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
                                             generateBoolShouldQuery(["docchannel"],[51, 52, 101, 118, 119, 120, 114, 51, 103],TermQuery),
                                             # TermQuery("docchannel",101),
                                             # shoud_q_docchannel,
                                             should_q_keyword
                                             # MatchPhraseQuery(item["type"], item["keyword"])
                                             ])
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query, limit=1, get_total_count=True),
                                                                          ColumnsToGet(return_type=ColumnReturnType.NONE))
        item["total_count"] = total_count
        if total_count>0:
            item["exists"] =1
        else:
            item["exists"] =0
        # Query 2: documents where the keyword is the tenderee.
        bool_query = BoolQuery(must_queries=[RangeQuery("page_time",item["range_from"],item["range_to"],True,False),
                                             RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
                                             # TermQuery("docchannel",52),
                                             # shoud_q_docchannel,
                                             # should_q_keyword
                                             TermQuery("tenderee",item["keyword"])
                                             # MatchPhraseQuery(item["type"], item["keyword"])
                                             ])
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query, limit=1, get_total_count=True),
                                                                          ColumnsToGet(return_type=ColumnReturnType.NONE))
        item["zhaobiao_count"] = total_count
        # Query 3: documents where the keyword is the winning bidder.
        bool_query = BoolQuery(must_queries=[RangeQuery("page_time",item["range_from"],item["range_to"],True,False),
                                             RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
                                             # TermQuery("docchannel",101),
                                             # shoud_q_docchannel,
                                             # should_q_keyword,
                                             NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item["keyword"]))
                                             # MatchPhraseQuery(item["type"], item["keyword"])
                                             ])
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query, limit=1, get_total_count=True),
                                                                          ColumnsToGet(return_type=ColumnReturnType.NONE))
        item["zhongbiao_count"] = total_count
    range_from = "2024-08-01"
    range_to = "2024-11-01"
    _type = "doctitle"
    assert _type in ["doctitle","doctextcon"]
    list_dict_key_count = []
    # One company name per line; a literal "name" header line is skipped.
    filename = r"G:\新建文件夹\WeChat Files\wxid_kluerlj8cn3b21\FileStorage\File\2024-12\疑似少数据的中标企业名单.csv"
    with open(filename,"r",encoding="utf8") as f:
        while 1:
            line = f.readline()
            if not line:
                break
            line = line.strip()
            if line=="name":
                continue
            list_dict_key_count.append({"keyword":line,"count":0,"exists":0,"range_from":range_from,"range_to":range_to,"type":_type})
            # if len(list_dict_key_count)>=1000:
            #     break
    # (dead code: pandas-CSV and inline-string keyword-loading variants,
    # removed for readability — recover from VCS history if needed)
    task_queue = queue.Queue()
    for item in list_dict_key_count:
        task_queue.put(item)
    result_queue = queue.Queue()
    # A single shared OTS client is handed to all worker threads.
    ots_client = getConnect_ots()
    # pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,ots_client=ots_client)
    mt.run()
    # Workers mutated the dicts in place; build rectangular columns and dump.
    columns = ["keyword","total_count","exists","zhaobiao_count","zhongbiao_count","range_from","range_to","type"]
    df_data = {}
    for _c in columns:
        df_data[_c] = []
    for item in list_dict_key_count:
        for _c in columns:
            if _c in item:
                df_data[_c].append(item[_c])
            else:
                df_data[_c].append("")
    for k,v in df_data.items():
        print(k,len(v))
    df2 = pd.DataFrame(df_data)
    df2.to_excel("%s_数量导出全部类别.xlsx"%getCurrent_date("%Y-%m-%d_%H%M%S"),columns=columns)
def export_keyword_title():
    # Export docid/doctitle/tenderee for all documents whose title matches
    # any of the medical-facility keywords, chunked into 100k-row CSV files.
    ots_client = getConnect_ots()
    range_from = "2020-01-01"
    range_to = "2022-12-23"
    # Each inner list is an AND-group; groups are OR-ed together.
    list_condition = [["医务室"],
                      ["医院"],
                      ["卫生院"],
                      ["卫生所"],
                      ["卫生室"],
                      ["社区卫生服务中心"]]
    list_should_query = []
    for _c in list_condition:
        if len(_c)==1:
            list_should_query.append(MatchPhraseQuery("doctitle",_c[0]))
        else:
            _must_query = []
            for _q in _c:
                _must_query.append(MatchPhraseQuery("doctitle",_q))
            list_should_query.append(BoolQuery(must_queries=_must_query))
    keyword_query = BoolQuery(should_queries=list_should_query)
    bool_query = BoolQuery(must_queries=[RangeQuery("publishtime",range_from,range_to),
                                         RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
                                         keyword_query
                                         ])
    rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                      SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
                                                                      ColumnsToGet(["docid","doctitle","tenderee"],return_type=ColumnReturnType.SPECIFIED))
    df_data = {"docid":[],"doctitle":[],"tenderee":[]}
    def setData(df_data,rows):
        # Append one page of OTS rows into the accumulating columns.
        list_dict = getRow_ots(rows)
        for _dict in list_dict:
            docid = _dict.get("docid","")
            doctitle = _dict.get("doctitle","")
            tenderee = _dict.get("tenderee","")
            df_data["docid"].append(docid)
            df_data["doctitle"].append(doctitle)
            df_data["tenderee"].append(tenderee)
    setData(df_data,rows)
    _count = len(rows)
    # Page through the rest of the result set.
    while next_token:
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
                                                                          ColumnsToGet(["docid","doctitle","tenderee"],return_type=ColumnReturnType.SPECIFIED))
        setData(df_data,rows)
        _count += 100
        print(_count,total_count)
    # Write the collected columns in 100k-row CSV chunks.
    file_begin = 0
    file_length = 100000
    _count = 0
    while file_begin<len(df_data["docid"]):
        _dict = dict()
        for _key,v in df_data.items():
            _dict[_key] = v[file_begin:file_begin+file_length]
        _count += 1
        file_begin += file_length
        df = pd.DataFrame(_dict)
        df.to_csv("../data/%s-%s_tenderee_doctitle_%d.csv"%(range_from,range_to,_count))
def exportArticle_by_websource():
    # Slice a window of rows out of a previously exported per-websource
    # document CSV and re-export them as (document_id, document_text) pairs,
    # skipping documents whose HTML exceeds 100k characters.
    # (dead code: the original per-web_source_no OTS sampling pipeline —
    # MySQL source list, threaded OTS fetch of 5 zhaobiao + 5 zhongbiao
    # docs per source, CSV export — is commented out; recover from VCS
    # history if needed)
    df = pd.read_csv("../data/other/websouce_doc.csv")
    df_2000 = {"document_id":[],"document_text":[]}
    print("total_count",len(df["docid"]))
    # Window of source rows to re-export on this run.
    begin = 230000
    end = 260000
    _count = 0
    for _id,_text in zip(df["docid"][begin:end],df["dochtmlcon"][begin:end]):
        # Skip oversized documents.
        if len(_text)>100000:
            continue
        df_2000["document_id"].append(_id)
        df_2000["document_text"].append(_text)
    df_2 = pd.DataFrame(df_2000)
    df_2.to_csv("../data/websouce_doc_%d-%d.csv"%(begin,end),columns=["document_id","document_text"],encoding="utf8",index=False)
    # save(dict_websource,"../data/dict_websource.pk")
  872. def getWinTenderer(sub_doc_json):
  873. if sub_doc_json is not None:
  874. sub_doc = json.loads(sub_doc_json)
  875. for _doc in sub_doc:
  876. if "win_tenderer" in _doc:
  877. return _doc["win_tenderer"]
  878. return ""
  879. def exportDocument_by_keywords(page_time,
  880. list_keyword = ["创客","STEAM","人工智能","课程服务","机器人中学","机器人小学","机器人幼儿园","机器人学校","Labplus","盛思","makeblock柴火","寓乐湾","美科科技","STEAM","能力风暴","优必选","蘑菇云","Dfrobot","中鸣","飞瑞敖","编程猫培生","八爪鱼","八爪鱼教育","童心制物"]):
  881. task_queue = queue.Queue()
  882. result_queue = queue.Queue()
  883. for _k in list_keyword:
  884. task_queue.put(_k)
  885. def _handle(keyword,result_queue):
  886. should_queries = []
  887. for _temp in [keyword]:
  888. should_queries.append(MatchPhraseQuery("doctitle",_temp))
  889. bool_query_keyword = BoolQuery(should_queries=should_queries)
  890. ots_client = getConnect_ots()
  891. bool_query = BoolQuery(must_queries=[RangeQuery('publishtime', range_from='2017-12-20'),
  892. MatchPhraseQuery("doctitle",keyword),
  893. TermQuery("docchannel","101")
  894. ])
  895. is_all_succeed = False
  896. _count = 0
  897. total_count = 1
  898. next_token = None
  899. rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
  900. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True),
  901. ColumnsToGet(["docid","tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
  902. for row in rows:
  903. _dict = dict()
  904. for values in row:
  905. for _v in values:
  906. _dict[_v[0]] = _v[1]
  907. result_queue.put({"docid":_dict.get("docid",""),"keyword":keyword,"tenderee":_dict.get("tenderee",""),"win_tenderer":getWinTenderer(_dict.get("sub_docs_json",None))})
  908. print(keyword,next_token,total_count)
  909. while(next_token):
  910. try:
  911. # print(next_token)
  912. _count += len(rows)
  913. print("%s:%d/%d"%(keyword,_count,total_count))
  914. for row in rows:
  915. _dict = dict()
  916. for values in row:
  917. for _v in values:
  918. _dict[_v[0]] = _v[1]
  919. result_queue.put({"docid":_dict.get("docid",""),"keyword":keyword,"tenderee":_dict.get("tenderee",""),"win_tenderer":getWinTenderer(_dict.get("sub_docs_json",None))})
  920. rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
  921. SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
  922. ColumnsToGet(["docid","tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
  923. except Exception as e:
  924. traceback.print_exc()
  925. mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
  926. mt.run()
  927. df_data = {"docid":[],"keyword":[],"tenderee":[],"win_tenderer":[]}
  928. while(True):
  929. try:
  930. item = result_queue.get(block=True,timeout=1)
  931. for _k in df_data.keys():
  932. if _k in item:
  933. df_data[_k].append(item[_k])
  934. else:
  935. df_data[_k].append("")
  936. except queue.Empty as e:
  937. break
  938. except Exception as e:
  939. traceback.print_exc()
  940. df = pd.DataFrame(df_data)
  941. df.to_csv("../data/exportArticle1_title.csv",columns=["docid","keyword","tenderee","win_tenderer"])
  942. def exportGovement():
  943. should_queries1 = []
  944. for _temp in ["教育局","地化所","税务局","国土局","学校","大学","中学","小学","幼儿园","医院"]:
  945. should_queries1.append(WildcardQuery("tenderee","*%s*"%_temp))
  946. should_queries2 = []
  947. for _temp in ["浙江","江苏","湖北","西北","陕西","甘肃","青海","宁夏","新疆","重庆","四川","云南","贵州"]:
  948. should_queries2.append(WildcardQuery("province","*%s*"%_temp))
  949. ots_client = getConnect_ots()
  950. page_time = "2020-12"
  951. bool_query = BoolQuery(must_queries=[BoolQuery(should_queries=should_queries1),
  952. BoolQuery(should_queries=should_queries2),
  953. TermQuery("docchannel","52"),
  954. RangeQuery("publishtime",page_time)])
  955. columns = ["tenderee","tenderee_contact","tenderee_phone"]
  956. rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
  957. SearchQuery(bool_query, limit=100, sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),get_total_count=True),
  958. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  959. print(total_count)
  960. def getRow(rows,df_data,columns):
  961. for row in rows:
  962. _dict = dict()
  963. for part in row:
  964. for item in part:
  965. _dict[item[0]] = item[1]
  966. if "tenderee_contact" in _dict and "tenderee_phone" in _dict:
  967. for key in columns:
  968. df_data[key].append(_dict.get(key,""))
  969. all_rows = 0
  970. df_data = {}
  971. for key in columns:
  972. df_data[key] = []
  973. getRow(rows,df_data,columns)
  974. _count = 100
  975. while(next_token):
  976. print(_count,total_count)
  977. rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
  978. SearchQuery(bool_query,next_token=next_token, limit=100,get_total_count=True),
  979. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  980. _count += 100
  981. getRow(rows,df_data,columns)
  982. df2 = pd.DataFrame(df_data)
  983. df2.to_excel("../data/%s政府招标人导出数据.xlsx"%page_time,columns=columns)
  984. def export_attachment():
  985. filename = "运营商20240417(1).xlsx"
  986. df = pd.read_excel(filename)
  987. auth = oss2.Auth("LTAI5tFuoxHm8Uxrr5nT8wTZ", "Yp01bylJFx0al6teCaccY8hbtllBGg")
  988. bucket_url = "http://oss-cn-hangzhou.aliyuncs.com"
  989. attachment_bucket_name = "attachment-hub"
  990. bucket = oss2.Bucket(auth,bucket_url,attachment_bucket_name)
  991. ots_client = getConnect_ots()
  992. list_query = []
  993. for _title,_no,tenderee,win_tenderer in zip(df["新标题"],df["项目编号"],df["新招采单位"],df["新中标单位"]):
  994. _dict = {"title":_title,
  995. "project_code":_no,
  996. "tenderee":tenderee,
  997. "win_tenderer":win_tenderer}
  998. list_query.append(_dict)
  999. def _handle(_dict,result_queue):
  1000. title = _dict["title"]
  1001. project_code = _dict["project_code"]
  1002. tenderee = _dict["tenderee"]
  1003. win_tenderer = _dict["win_tenderer"]
  1004. if isinstance(project_code,str):
  1005. bool_query = BoolQuery(must_queries=[
  1006. generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[title],MatchPhraseQuery),
  1007. generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[project_code],MatchPhraseQuery),
  1008. NestedQuery("page_attachments",WildcardQuery("page_attachments.fileMd5","*")),
  1009. RangeQuery("page_time","2022-01-01")
  1010. ])
  1011. rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
  1012. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]), limit=100, get_total_count=True),
  1013. ColumnsToGet(["page_attachments"],return_type=ColumnReturnType.SPECIFIED))
  1014. list_data = getRow_ots(rows)
  1015. for _data in list_data:
  1016. docid = _data["docid"]
  1017. page_attachments = _data["page_attachments"]
  1018. for _attach in json.loads(page_attachments):
  1019. filemd5 = _attach["fileMd5"]
  1020. consumed, return_row, next_token = ots_client.get_row("attachment",[("filemd5",filemd5)],columns_to_get=["classification","path"])
  1021. dict_k = getRow_ots_primary(return_row)
  1022. if dict_k is not None and dict_k.get("classification")=="招标文件":
  1023. _dict["docid"] = docid
  1024. _dict["path"] = dict_k.get("path")
  1025. break
  1026. if "docid" in _dict:
  1027. break
  1028. if "docid" not in _dict:
  1029. if isinstance(tenderee,str):
  1030. bool_query = BoolQuery(must_queries=[
  1031. generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[title],MatchPhraseQuery),
  1032. generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[tenderee],MatchPhraseQuery),
  1033. NestedQuery("page_attachments",WildcardQuery("page_attachments.fileMd5","*")),
  1034. RangeQuery("page_time","2022-01-01")
  1035. ])
  1036. rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
  1037. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]), limit=100, get_total_count=True),
  1038. ColumnsToGet(["page_attachments"],return_type=ColumnReturnType.SPECIFIED))
  1039. list_data = getRow_ots(rows)
  1040. for _data in list_data:
  1041. docid = _data["docid"]
  1042. page_attachments = _data["page_attachments"]
  1043. for _attach in json.loads(page_attachments):
  1044. filemd5 = _attach["fileMd5"]
  1045. consumed, return_row, next_token = ots_client.get_row("attachment",[("filemd5",filemd5)],columns_to_get=["classification","path"])
  1046. dict_k = getRow_ots_primary(return_row)
  1047. if dict_k is not None and dict_k.get("classification")=="招标文件":
  1048. _dict["docid"] = docid
  1049. _dict["path"] = dict_k.get("path")
  1050. break
  1051. if "docid" in _dict:
  1052. break
  1053. if "docid" not in _dict:
  1054. if isinstance(win_tenderer,str):
  1055. bool_query = BoolQuery(must_queries=[
  1056. generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[title],MatchPhraseQuery),
  1057. generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[win_tenderer],MatchPhraseQuery),
  1058. NestedQuery("page_attachments",WildcardQuery("page_attachments.fileMd5","*")),
  1059. RangeQuery("page_time","2022-01-01")
  1060. ])
  1061. rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
  1062. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]), limit=100, get_total_count=True),
  1063. ColumnsToGet(["page_attachments"],return_type=ColumnReturnType.SPECIFIED))
  1064. list_data = getRow_ots(rows)
  1065. for _data in list_data:
  1066. docid = _data["docid"]
  1067. page_attachments = _data["page_attachments"]
  1068. for _attach in json.loads(page_attachments):
  1069. filemd5 = _attach["fileMd5"]
  1070. consumed, return_row, next_token = ots_client.get_row("attachment",[("filemd5",filemd5)],columns_to_get=["classification","path"])
  1071. dict_k = getRow_ots_primary(return_row)
  1072. if dict_k is not None and dict_k.get("classification")=="招标文件":
  1073. _dict["docid"] = docid
  1074. _dict["path"] = dict_k.get("path")
  1075. break
  1076. if "docid" in _dict:
  1077. break
  1078. task_queue = queue.Queue()
  1079. result_queue = queue.Queue()
  1080. for item in list_query:
  1081. task_queue.put(item)
  1082. mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
  1083. mt.run()
  1084. list_data = []
  1085. for item in list_query:
  1086. list_data.append([item.get("title"),item.get("docid")])
  1087. print(item.get("title"),item.get("docid"),"docid" in item)
  1088. path = item.get("path")
  1089. if path is not None:
  1090. try:
  1091. oss2.resumable_download(bucket,path,"附件/%s.%s"%(re.sub("[<>\[\]{}/\';,.‘、\"]",'',item.get("title")),path.split(".")[-1]))
  1092. except Exception as e:
  1093. print("download error %s %d"%(item.get("title"),item.get("docid")))
  1094. df = pd.DataFrame(list_data)
  1095. df.to_excel("a.xlsx")
def exportIndustryCount():
    """Count documents per industry type in [time_from, time_to] and dump up to
    10 sample HTML pages per industry to ../data/industry/.

    NOTE(review): the early ``return`` right after printing the industry keys
    short-circuits the function — everything below it is currently unreachable
    and looks like a leftover debugging state.
    """
    import codecs
    time_from = "2020-12-21"
    time_to = "2020-12-25"
    # superseded channel-id -> name mapping, kept for reference
    # dict_channel = {"51":{"type":"公告变更"},
    # "52":{"type":"招标公告"},
    # "101":{"type":"中标信息"},
    # "102":{"type":"招标预告"},
    # "103":{"type":"招标答疑"},
    # "104":{"type":"招标文件"},
    # "105":{"type":"资审结果"},
    # "103":{"type":"招标控制价"},
    # "100":{"type":"未知类型"}}
    dict_industry = {}
    # load the industry taxonomy; one task item per industry name
    meta_industry = load("../data/other/class2dalei_menlei.pkl")
    for _key in meta_industry.keys():
        dict_industry[_key] = {"type":_key}
    print(dict_industry.keys())
    return  # NOTE(review): debugging short-circuit — the export below never runs
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    for _key in dict_industry.keys():
        task_queue.put(dict_industry[_key])
    def _handle(item,result_queue,pool_ots):
        # Per industry: store the date-ranged total in item["count"], then
        # write up to 10 sample HTML documents of that industry to disk.
        ots_client = pool_ots.getConnector()
        bool_query = BoolQuery(must_queries=[TermQuery("info_type",item["type"]),
                                             RangeQuery("publishtime",time_from,time_to,include_lower=True,include_upper=True)])
        columns = ["docid"]
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query, limit=1,get_total_count=True),
                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
        item["count"] = total_count
        columns = ["dochtmlcon"]
        # second query: not date-restricted, oldest 10 documents of the type
        bool_query = BoolQuery(must_queries=[TermQuery("info_type",item["type"])])
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query, limit=10,sort=Sort(sorters=[FieldSort("publishtime",SortOrder.ASC)]),get_total_count=True),
                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
        for row in rows:
            _dict = dict()
            for part in row:
                for v in part:
                    _dict[v[0]] = v[1]
            with codecs.open("../data/industry/%s_%d.html"%(item["type"],_dict["docid"]),"w",encoding="UTF8") as f:
                f.write(_dict["dochtmlcon"])
        pool_ots.putConnector(ots_client)
    pool_ots = ConnectorPool(init_num=20,max_num=30,method_init=getConnect_ots)
    mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
    mt.run()
    # collect the per-industry counters into a frame
    columns = ["type","count"]
    df_data = {}
    for _c in columns:
        df_data[_c] = []
    for _indus in dict_industry.keys():
        for _c in columns:
            df_data[_c].append(dict_industry[_indus][_c])
    df = pd.DataFrame(df_data)
    # df.to_excel("../data/%s-%s_industry_count.xlsx"%(time_from,time_to),columns=columns)
    # NOTE(review): writes CSV content to a .xlsx-named path — confirm intended
    df.to_csv("../data/%s-%s_industry_count.xlsx"%(time_from,time_to),columns=columns)
  1154. def exportDocument_By_time(time_from,time_to,columns=["docid","doctitle","project_name","dochtmlcon"]):
  1155. '''
  1156. :return:
  1157. '''
  1158. ost_client = getConnect_ots()
  1159. last_docid = 0
  1160. bool_query = BoolQuery(must_queries=[RangeQuery("page_time",time_from,time_to,include_lower=True,include_upper=True),
  1161. RangeQuery('status', '201', '300', include_lower=True, include_upper=True)])
  1162. rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
  1163. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True),
  1164. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  1165. _count = len(rows)
  1166. df_data = {}
  1167. def getData(df_data,rows):
  1168. list_dict = getRow_ots(rows)
  1169. for _dict in list_dict:
  1170. for _k,_v in _dict.items():
  1171. if _k not in df_data:
  1172. df_data[_k] = []
  1173. df_data[_k].append(getLegal_str(_v))
  1174. getData(df_data,rows)
  1175. while(next_token):
  1176. print("%d/%d"%(_count,total_count))
  1177. rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
  1178. SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
  1179. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  1180. _count += len(rows)
  1181. getData(df_data,rows)
  1182. df = pd.DataFrame(df_data)
  1183. df.to_excel("%s/%s-%s公告信息.xlsx"%(data_path,time_from,time_to),columns=columns)
  1184. def processDocument():
  1185. filename = "../data/2021-01-29-2021-01-29公告信息.xlsx"
  1186. df = pd.read_excel(filename)
  1187. df.to_csv("../data/2021-01-29-2021-01-29公告信息.csv")
  1188. return
  1189. list_dict = []
  1190. for docid,doctitle,project_name,dochtmlcon in zip(df["docid"],df["doctitle"],df["project_name"],df["dochtmlcon"]):
  1191. list_dict.append({"docid":docid,"doctitle":doctitle,"project_name":project_name,"dochtmlcon":dochtmlcon})
  1192. task_queue = queue.Queue()
  1193. for _dict in list_dict:
  1194. task_queue.put(_dict)
  1195. result_queue = queue.Queue()
  1196. def _handle(_dict,result_queue,pool_mysql):
  1197. conn = pool_mysql.getConnector()
  1198. cursor = conn.cursor()
  1199. sql = "insert into test_extract(docid,doctitle,page_time) values(%d,%s,%s)"%(_dict["docid"],_dict["doctitle"],_dict["dochtmlcon"])
  1200. cursor.execute(sql)
  1201. conn.commit()
  1202. pool_mysql.putConnector(conn)
  1203. # url = "http://192.168.2.101:15030"
  1204. # myheaders = {'Content-Type': 'application/json'}
  1205. # print(int(_dict["docid"]))
  1206. # data = {"doc_id":int(_dict["docid"]),"title":_dict["doctitle"],"content":_dict["dochtmlcon"]}
  1207. # resp = requests.post(url,json=data,headers=myheaders, verify=True)
  1208. # result = json.loads(resp.content.decode("utf8"),"utf8")
  1209. # _dict["product"] = result["product"]
  1210. pool_mysql = ConnectorPool(init_num=20,max_num=30,method_init=getConnection_testmysql)
  1211. mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=5,pool_mysql=pool_mysql)
  1212. mt.run()
  1213. # columns = ["docid","doctitle","project_name","product"]
  1214. #
  1215. # df_data = {}
  1216. # for _c in columns:
  1217. # df_data[_c] = []
  1218. # for _dict in list_dict:
  1219. # for _c in columns:
  1220. # df_data[_c].append(_dict.get(_c,""))
  1221. # df = pd.DataFrame(df_data)
  1222. # df.to_excel("%s.product.xlsx"%(filename),columns=columns)
  1223. def export_extract_check():
  1224. '''
  1225. :return:导出数据提取校验的结果并生成报告
  1226. '''
  1227. conn = getConnection_testmysql()
  1228. cursor = conn.cursor()
  1229. sql = " select docid,json_result from exportdb.extract_check "
  1230. cursor.execute(sql)
  1231. dict_global = {}
  1232. df_global = {"key_type":[],"online_count":[],"test_count":[],"diff_count":[],"diff_percent":[]}
  1233. df_document = {"docid":[]}
  1234. while True:
  1235. rows = cursor.fetchmany(10000)
  1236. if not rows:
  1237. break
  1238. for docid,json_result in rows:
  1239. df_document["docid"].append(docid)
  1240. _result = json.loads(json_result)
  1241. for k,v in _result.items():
  1242. key = k.split("_")
  1243. _key = "_".join(key[:-1])
  1244. if "punish" in _key or "complainants" in _key or "institutions" in _key:
  1245. continue
  1246. if k not in df_document:
  1247. df_document[k] = []
  1248. df_document[k].append(v)
  1249. key_type = key[-1]
  1250. if _key not in dict_global:
  1251. dict_global[_key] = {}
  1252. if key_type not in dict_global[_key]:
  1253. dict_global[_key][key_type] = 0
  1254. if key_type=="diff":
  1255. dict_global[_key][key_type] += v
  1256. if key_type in ("online","test"):
  1257. if isinstance(v,str):
  1258. if v!="":
  1259. dict_global[_key][key_type] += 1
  1260. elif isinstance(v,list):
  1261. dict_global[_key][key_type] += len(v)
  1262. for k,v in dict_global.items():
  1263. df_global["key_type"].append(k)
  1264. df_global["online_count"].append(v["online"])
  1265. df_global["test_count"].append(v["test"])
  1266. df_global["diff_count"].append(v["diff"])
  1267. df_global["diff_percent"].append(v["diff"]/v["online"] if v["online"]>0 else 0)
  1268. filename = "../data/%s_extract_check.xlsx"%(time.strftime("%Y-%m-%d"))
  1269. with pd.ExcelWriter(filename) as writer:
  1270. df1 = pd.DataFrame(df_global)
  1271. df1.to_excel(writer,sheet_name="global")
  1272. for k,v in df_document.items():
  1273. print(k,len(v))
  1274. df2 = pd.DataFrame(df_document)
  1275. df2.to_excel(writer,sheet_name="document")
  1276. writer.save()
  1277. writer.close()
def exportDocument_dump():
    """Export all status-201..300 documents of page_time 2021-03-03, flattening
    win_tenderer / bidding_budget / win_bid_price out of sub_docs_json, into
    ../data/0303去重.csv (a deduplication-review export).
    """
    # filename = "../data/重复公告.xlsx"
    # df = pd.read_excel(filename)
    ots_client = getConnect_ots()
    # columns fetched from OTS vs. columns written to the CSV (df_keys adds the
    # derived doctitle_refine / URL and the flattened sub_docs_json fields)
    columns = ["docid","docchannel","page_time","web_source_no","doctitle","tenderee","agency","project_code","project_name","sub_docs_json"]
    df_keys = ["docid","docchannel","page_time","web_source_no","doctitle","doctitle_refine","tenderee","agency","project_code","project_name","bidding_budget","win_bid_price","win_tenderer","URL"]
    df_data = {}
    for _key in df_keys:
        df_data[_key] = []
    bool_query = BoolQuery(must_queries=[TermQuery("page_time","2021-03-03"),
                                         RangeQuery("status",201,300,True,True)])
    rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                      SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
                                                                      ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
    def getData(df_data,rows):
        # Append one value per df_keys column for every returned row.
        # dict_find records which columns were actually filled so the rest can
        # be padded with "" at the end, keeping all columns the same length.
        list_data = getRow_ots(rows)
        for row in list_data:
            dict_find = {}
            for _key in df_keys:
                dict_find[_key] = 0
            for _k,_v in row.items():
                if _k in df_keys:
                    dict_find[_k] = 1
                    if _k=="project_code":
                        # quote so spreadsheet tools keep the code as text
                        _v = '"%s"'%_v
                    df_data[_k].append(_v)
            doctitle = row.get("doctitle","")
            # strip boilerplate announcement words to get a comparable title
            df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '', doctitle))
            df_data["URL"].append("http://www.bidizhaobiao.com/info-%d.html"%(row["docid"]))
            dict_find["URL"] = 1
            dict_find["doctitle_refine"] = 1
            sub_docs_json = row.get("sub_docs_json","[{}]")
            doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
            if sub_docs_json is not None:
                # take the first non-empty/non-zero value of each field across
                # the sub-documents; money fields must also parse as > 0
                for sub_docs in json.loads(sub_docs_json):
                    for _key_sub_docs in sub_docs.keys():
                        if _key_sub_docs in doc_columns:
                            if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
                                if _key_sub_docs in ["bidding_budget","win_bid_price"]:
                                    if float(sub_docs[_key_sub_docs])>0:
                                        doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
                                else:
                                    doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
            for _k,_v in doc_columns.items():
                dict_find[_k] = 1
                df_data[_k].append(_v)
            # pad every column that received no value for this row
            for _k,_v in dict_find.items():
                if _v==0:
                    df_data[_k].append("")
    _count = len(rows)
    getData(df_data,rows)
    while next_token:
        print("%d/%d"%(_count,total_count))
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
        getData(df_data,rows)
        _count += len(rows)
    # (older variant kept for reference: look up specific docids from a sheet)
    # for docid in df["docid"]:
    # bool_query = BoolQuery(must_queries=[TermQuery("docid",int(docid))])
    #
    # rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
    # SearchQuery(bool_query , limit=100, get_total_count=True),
    # ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
    # list_data = getRow_ots(rows)
    # if len(list_data)>0:
    # dict_find = {}
    # for _key in df_keys:
    # dict_find[_key] = 0
    # for _k,_v in list_data[0].items():
    # if _k in df_keys:
    # dict_find[_k] = 1
    # df_data[_k].append(_v)
    # doctitle = list_data[0].get("doctitle","")
    # df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '', doctitle))
    # dict_find["doctitle_refine"] = 1
    # sub_docs_json = list_data[0].get("sub_docs_json","[{}]")
    # doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
    # if sub_docs_json is not None:
    # for sub_docs in json.loads(sub_docs_json):
    # for _key_sub_docs in sub_docs.keys():
    # if _key_sub_docs in doc_columns:
    # if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
    # if _key_sub_docs in ["bidding_budget","win_bid_price"]:
    # if float(sub_docs[_key_sub_docs])>0:
    # doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
    # else:
    # doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
    # for _k,_v in doc_columns.items():
    # dict_find[_k] = 1
    # df_data[_k].append(_v)
    # for _k,_v in dict_find.items():
    # if _v==0:
    # df_data[_k].append("")
    df1 = pd.DataFrame(df_data)
    df1.to_csv("../data/0303去重.csv",columns=df_keys)
  1374. def exportDocument_dump_mysql():
  1375. conn = getConnection_testmysql()
  1376. cursor = conn.cursor()
  1377. columns = ["project_code","doctitle","doctitle_refine","tenderee","agency","project_name","win_bid_price","bidding_budget","page_time","docchannel","web_source_no","win_tenderer","group_id","docid"]
  1378. df_data = {}
  1379. for _c in columns:
  1380. df_data[_c] = []
  1381. sql = " select "+",".join(columns)+" from run_dumplicate_document_his where group_id in (select group_id from run_dumplicate_document_his group by group_id having count(1)>1)"
  1382. cursor.execute(sql)
  1383. while True:
  1384. rows = cursor.fetchmany(100000)
  1385. if not rows:
  1386. break
  1387. for row in rows:
  1388. for _i in range(len(columns)):
  1389. df_data[columns[_i]].append(row[_i])
  1390. df = pd.DataFrame(df_data)
  1391. df.to_csv("../data/0304去重.csv",columns=["group_id","docid","project_code","doctitle","doctitle_refine","tenderee","agency","project_name","win_bid_price","bidding_budget","page_time","docchannel","web_source_no","win_tenderer"])
  1392. print(cursor.description)
  1393. def getDict_docchannel():
  1394. filename = "docchannel.pk"
  1395. if os.path.exists(filename):
  1396. _dict = load(filename)
  1397. return _dict
  1398. conn = getConnection_mysql()
  1399. cursor = conn.cursor()
  1400. sql = "select channel_id,chnlname from sys_channel "
  1401. cursor.execute(sql)
  1402. rows = cursor.fetchall()
  1403. _dict = dict()
  1404. for row in rows:
  1405. _dict[row[0]] = row[1]
  1406. save(_dict,filename)
  1407. return _dict
def exportDocument_by_doctitle():
    """Count documents matching each company name from a spreadsheet.

    Reads company names from 有效调用次数统计_20220830_v1.xlsx, runs one OTS
    full-text query per company on 40 worker threads, prints per-keyword and
    overall totals, writes the keyword statistics to
    ../data/<timestamp>_export11.xlsx and the distinct docids to list_docid.txt.
    """
    def timeAdd_minute(_time,minutes):
        # Shift a '%Y-%m-%d' date string by `minutes` minutes and return a
        # '%Y-%m-%d %H:%M:%S' local-time string.  Only used by the
        # commented-out per-10-minute task generation below.
        a = time.mktime(time.strptime(_time,'%Y-%m-%d'))+60*minutes
        _time1 = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(a))
        return _time1
    # NOTE(review): `columns` is assigned three times — only the last list takes
    # effect, and it is only used by the paged fetches inside _handle.
    columns = ["docid","attachmenttextcon","doctitle","docchannel","bidway","province","city","district","info_type","page_time","crtime","project_code","tenderee","project_name","agency","sub_docs_json","tenderee_contact","tenderee_phone","doctextcon","product","moneysource","time_bidclose","time_bidopen"]
    columns = ["doctitle","attachmenttextcon","doctextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no"]
    columns = ["doctitle","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no"]
    # NOTE(review): dict_channel is unused in the active path (only referenced
    # by the commented-out post-processing below)
    dict_channel = getDict_docchannel()
    task_queue = queue.Queue()
    result_queue = queue.Queue()
    list_keyword = []
    # (alternative input kept for reference: split a keyword string)
    # for _keyword in re.split("\s|/|、",str_keyword):
    # if len(_keyword.strip())==0 and isinstance(_keyword,(str)):
    # continue
    # print(_keyword)
    # item = {"keyword":_keyword.strip()}
    # list_keyword.append(item)
    # task_queue.put(item)
    # one task per company name read from the spreadsheet
    filename = "有效调用次数统计_20220830_v1.xlsx"
    df = pd.read_excel(filename)
    for company in df["enterpriseName"]:
        _dict = {"keyword":company}
        task_queue.put(_dict)
        list_keyword.append(_dict)
    # NOTE(review): start_day/count_days are unused in the active path
    start_day = "2019-01-01"
    count_days = 90
    # (alternative input kept for reference: one task per 10-minute crtime window)
    # for _i in range(count_days):
    #
    # current_date = timeAdd(start_day,_i)
    # for _j in range(24*6):
    # start_minute = timeAdd_minute(current_date,10*_j)
    # end_minute = timeAdd_minute(current_date,10*(_j+1))
    #
    # item = {"start_minute":start_minute,"end_minute":end_minute}
    # list_keyword.append(item)
    # task_queue.put(item)
    ots_client = getConnect_ots()
    def _handle(item,result_queue,ots_client):
        # (previously-used query fragments kept for reference)
        # should_q_keyword = BoolQuery(should_queries=[
        # # MatchPhraseQuery("tenderee",item["keyword"]),
        # # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item["keyword"])),
        # # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.second_tenderer",item["keyword"])),
        # # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.third_tenderer",item["keyword"]))
        # MatchPhraseQuery("doctextcon",item["keyword"]),
        # MatchPhraseQuery("doctitle",item["keyword"]),
        # MatchPhraseQuery("attachmenttextcon",item["keyword"])
        # ])
        #
        # should_q2 = BoolQuery(should_queries=[WildcardQuery('province', '%s*'%"广东")
        # # ,WildcardQuery('province', '%s*'%"湖南")
        # # ,WildcardQuery('province', '%s*'%"广西")
        # ])
        #
        # should_q_tenderee = BoolQuery(should_queries=[WildcardQuery("tenderee","*中学*"),
        # WildcardQuery("tenderee","*大学*"),
        # WildcardQuery("tenderee","*小学*"),
        # WildcardQuery("tenderee","*教育局*")])
        bool_query = BoolQuery(must_queries=[
            # RangeQuery("page_time","2019-01-01","2023-01-01"),
            # TermQuery("page_time","2022-02-18"),
            generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[item["keyword"]],MatchPhraseQuery),
            # RangeQuery("crtime",item["start_minute"],item["end_minute"])
            RangeQuery("status",151,300,True,True),
            # TermQuery("tenderee",item["keyword"])
            # ,TermQuery("docchannel",52)
            # ,should_q_keyword
            # ,should_q_tenderee
            # ,should_q2
        ])
        # First call fetches limit=1 with no columns — it mainly retrieves
        # total_count (and the next_token that drives the paging below).
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                          SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("crtime",SortOrder.DESC)]), limit=1, get_total_count=True),
                                                                          ColumnsToGet(return_type=ColumnReturnType.NONE))
        item["total_count"] = total_count
        list_data = getRow_ots(rows)
        for _data in list_data:
            _data["keyword"] = item["keyword"]
            result_queue.put(_data)
        _count = len(list_data)
        while next_token:
            print("%d/%d"%(_count,total_count))
            rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                                                                              SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
                                                                              ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
            list_data = getRow_ots(rows)
            _count += len(list_data)
            for _data in list_data:
                _data["keyword"] = item["keyword"]
                result_queue.put(_data)
    mt = MultiThreadHandler(task_queue,_handle,result_queue,40,ots_client=ots_client)
    mt.run()
    set_docid = set()
    list_item = []
    # for item in list_keyword:
    # print(item["keyword"])
    # NOTE(review): item["total_count"] would raise KeyError for any task whose
    # _handle call failed before the first search returned — confirm acceptable.
    total_count = 0
    for item in list_keyword:
        total_count += item["total_count"]
        print(item["total_count"])
    print("total_count:%d"%(total_count))
    keys = list_keyword[0].keys()  # NOTE(review): unused
    # pivot the per-keyword dicts into column lists for the export frame
    df_data = {}
    for item in list_keyword:
        for k,v in item.items():
            if k not in df_data:
                df_data[k] = []
            df_data[k].append(v)
    # when False, docids already seen are skipped (sic: "duplicate")
    dumplicate = False
    try:
        # drain the result queue; queue.Empty ends the loop via the except
        while True:
            _dict = result_queue.get(False)
            _docid = _dict.get("docid")
            if _docid in set_docid and not dumplicate:
                continue
            set_docid.add(_docid)
            list_item.append(_dict)
    except Exception as e:
        print(e)
    list_docid = list(set_docid)
    with open("list_docid.txt","w",encoding="utf8") as f:
        for _docid in list_docid:
            f.write(str(_docid))
            f.write("\n")
            f.flush()
    # (disabled post-processing kept for reference: build full row data and
    # fill missing contacts from enterprise records)
    # log("get document taotal_count:%d"%len(list_item))
    # set_line = set()
    # getRowData(df_data,list_item,set_line,list_keyword,dict_channel,dumplicate)
    # set_enterprise = set()
    # for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
    # set_enterprise.add(_tenderee)
    # set_enterprise.add(_agency)
    # set_enterprise.add(_win_tenderer)
    # if "" in set_enterprise:
    # set_enterprise.remove("")
    # if None in set_enterprise:
    # set_enterprise.remove(None)
    # dict_enterprise = getDictEnterprise(list(set_enterprise))
    # if len(set_enterprise)>0:
    # for _i in range(len(df_data["招标单位"])):
    # # _enterprise_name = df_data["招标单位"][_i]
    # # if df_data["招标联系人电话"][_i]=="":
    # # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
    # # if contacts is not None:
    # # _person,_phone = getOneContact(contacts)
    # # df_data["招标联系人"][_i] = _person
    # # df_data["招标联系人电话"][_i] = _phone
    #
    # _enterprise_name = df_data["代理单位"][_i]
    # if df_data["代理联系人电话"][_i]=="":
    # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
    # if contacts is not None:
    # _person,_phone = getOneContact(contacts)
    # df_data["代理联系人"][_i] = _person
    # df_data["代理联系人电话"][_i] = _phone
    #
    # _enterprise_name = df_data["中标单位"][_i]
    # if df_data["中标单位联系电话"][_i]=="":
    # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
    # if contacts is not None:
    # _person,_phone = getOneContact(contacts)
    # df_data["中标单位联系人"][_i] = _person
    # df_data["中标单位联系电话"][_i] = _phone
    df = pd.DataFrame(df_data)
    df.to_excel("../data/%s_export11.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")))
# Module-level registry of export columns: set_columns tracks which column
# names have already been registered, list_df_columns preserves the
# first-seen order so the Excel writers below emit columns in insertion order.
set_columns = set()
list_df_columns = []
  1574. def set_dict_item(_dict,name,v):
  1575. _dict[name] = getLegal_str(v)
  1576. if name not in set_columns:
  1577. set_columns.add(name)
  1578. list_df_columns.append(getLegal_str(name))
  1579. def set_dict_item_columns(set_columns1,list_df_columns1,_dict,name,v):
  1580. _dict[name] = getLegal_str(v)
  1581. if name not in set_columns1:
  1582. set_columns1.add(name)
  1583. list_df_columns1.append(name)
  1584. def exportDocument_medicine(start_time,end_time):
  1585. # filename = "../data/重复公告.xlsx"
  1586. # df = pd.read_excel(filename)
  1587. ots_client = getConnect_ots()
  1588. # columns = ["doctitle","docchannel","time_bidopen","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]
  1589. columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen"]
  1590. set_enter = set()
  1591. str_enter = '''
  1592. 北京嘉和美康信息技术有限公司
  1593. 阿里健康科技(中国)有限公司
  1594. 北大医疗信息技术有限公司
  1595. 创业慧康科技股份有限公司
  1596. 东华医为科技有限公司
  1597. 望海康信(北京)科技股份公司
  1598. 国新健康保障服务有限公司
  1599. 南京海泰医疗信息系统有限公司
  1600. 南京海泰信息技术有限公司
  1601. 浙江和仁科技股份有限公司
  1602. 北京惠每科技有限公司
  1603. 金蝶医疗软件科技有限公司
  1604. 北京京东健康有限公司
  1605. 四川久远银海软件股份有限公司
  1606. 零氪科技(北京)有限公司
  1607. 北京麦迪克斯科技有限公司
  1608. 苏州麦迪斯顿医疗科技股份有限公司
  1609. 江苏曼荼罗软件股份有限公司
  1610. 北京平安联想智慧医疗信息技术有限公司
  1611. 青岛百洋智能科技股份有限公司
  1612. 上海森亿医疗科技有限公司
  1613. 万达信息股份有限公司
  1614. 微医集团(浙江)有限公司
  1615. 卫宁健康科技集团股份有限公司
  1616. 心医国际数字医疗系统(大连)有限公司
  1617. 医渡云(北京)技术有限公司
  1618. 医惠科技有限公司
  1619. 易联众信息技术股份有限公司
  1620. 智业软件股份有限公司
  1621. 中电数据服务有限公司
  1622. 重庆中联信息产业有限责任公司
  1623. 杭州卓健信息科技股份有限公司
  1624. 大连万达集团股份有限公司
  1625. '''
  1626. for a in re.split("\s+",str_enter):
  1627. if a.strip()!="":
  1628. set_enter.add(a.strip())
  1629. dict_channel = getDict_docchannel()
  1630. # list_province = ["江西","湖南","四川","安徽"]
  1631. list_province = ["全国"]
  1632. for _province in list_province:
  1633. df_data = {}
  1634. str_p = '''
  1635. 智慧医疗系统 医院信息系统 临床路径 医院系统 医院管理软件
  1636. 县域医共体 远程医疗 医院管理系统 医疗信息化 临床医疗
  1637. 数据集成 云医院 智慧卫生 卫生信息系统 医疗数字化
  1638. 临床应用
  1639. '''
  1640. list_prov = re.split("\s",str_p)
  1641. list_mu = []
  1642. for _p in list_prov:
  1643. if _p.strip()=="":
  1644. continue
  1645. print(_p)
  1646. list_mu.append(MatchPhraseQuery('doctextcon', '%s'%_p.strip()))
  1647. s_tenderee = '医院、卫生院、疗养院、健康局、卫生局'
  1648. list_should_ten = []
  1649. for _p in re.split("、",s_tenderee):
  1650. if _p.split()=="":
  1651. continue
  1652. list_should_ten.append(WildcardQuery("tenderee","*%s*"%_p.strip()))
  1653. list_should_win = []
  1654. for _win in list(set_enter):
  1655. list_should_win.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_win)))
  1656. list_should_chan = []
  1657. list_should_chan.append(TermQuery("docchannel",52))
  1658. list_should_chan.append(TermQuery("docchannel",101))
  1659. list_should_chan.append(TermQuery("docchannel",102))
  1660. should_q1 = BoolQuery(should_queries=list_mu)
  1661. should_q2 = BoolQuery(should_queries=list_should_ten)
  1662. should_q3 = BoolQuery(should_queries=list_should_chan)
  1663. bool_query = BoolQuery(must_queries=[
  1664. BoolQuery(should_queries=list_should_win),
  1665. RangeQuery("page_time",start_time,end_time,include_lower=True,include_upper=True),
  1666. RangeQuery("status",151,300,True,True),
  1667. # should_q1,
  1668. # should_q2,
  1669. # should_q3,
  1670. ],
  1671. # must_not_queries=[
  1672. # MatchPhraseQuery("doctextcon","器械"),
  1673. # MatchPhraseQuery("doctextcon","仪器"),
  1674. # ]
  1675. )
  1676. rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
  1677. SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
  1678. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  1679. set_line = set()
  1680. _count = len(rows)
  1681. # getData(df_data,rows,set_line)
  1682. list_row = getRow_ots(rows)
  1683. getRowData(df_data,list_row,set_line,[],dict_channel,False)
  1684. while next_token:
  1685. print("%d/%d"%(_count,total_count))
  1686. rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
  1687. SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
  1688. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  1689. list_row = getRow_ots(rows)
  1690. getRowData(df_data,list_row,set_line,[],dict_channel,False)
  1691. _count += len(rows)
  1692. if len(list(df_data.keys()))>0:
  1693. if len(df_data[list(df_data.keys())[0]])>=300:
  1694. break
  1695. set_enterprise = set()
  1696. for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
  1697. set_enterprise.add(_tenderee)
  1698. set_enterprise.add(_agency)
  1699. set_enterprise.add(_win_tenderer)
  1700. if "" in set_enterprise:
  1701. set_enterprise.remove("")
  1702. if None in set_enterprise:
  1703. set_enterprise.remove(None)
  1704. dict_enterprise = getDictEnterprise(list(set_enterprise))
  1705. if len(set_enterprise)>0:
  1706. for _i in range(len(df_data["招标单位"])):
  1707. _enterprise_name = df_data["招标单位"][_i]
  1708. if df_data["招标联系人电话"][_i]=="":
  1709. contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
  1710. if contacts is not None:
  1711. _person,_phone = getOneContact(contacts)
  1712. df_data["招标联系人"][_i] = _person
  1713. df_data["招标联系人电话"][_i] = _phone
  1714. _enterprise_name = df_data["代理单位"][_i]
  1715. if df_data["代理联系人电话"][_i]=="":
  1716. contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
  1717. if contacts is not None:
  1718. _person,_phone = getOneContact(contacts)
  1719. df_data["代理联系人"][_i] = _person
  1720. df_data["代理联系人电话"][_i] = _phone
  1721. _enterprise_name = df_data["中标单位"][_i]
  1722. if df_data["中标单位联系电话"][_i]=="":
  1723. contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
  1724. if contacts is not None:
  1725. _person,_phone = getOneContact(contacts)
  1726. df_data["中标单位联系人"][_i] = _person
  1727. df_data["中标单位联系电话"][_i] = _phone
  1728. return df_data
  1729. df1 = pd.DataFrame(df_data)
  1730. print(len(df_data["docid"]))
  1731. df1.to_excel("../data/%s_周五医疗数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
  1732. def getRowDataWithKey(df_data,rows,columns):
  1733. global list_df_columns
  1734. list_df_columns = columns
  1735. for row in rows:
  1736. for c in columns:
  1737. if c not in df_data:
  1738. df_data[c] = []
  1739. df_data[c].append(row.get(c))
def getRowData(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
    # Flatten extracted OTS document rows into df_data (column name -> list of
    # values).  Column registration/order is a side effect of set_dict_item
    # (module-level set_columns / list_df_columns).  When dumplicate is False,
    # rows sharing a project-code key or a title key are treated as duplicates:
    # only the first is kept and the duplicate groups are dumped to a separate
    # "重复数据" Excel file at the end.
    # NOTE(review): indentation was reconstructed from a whitespace-mangled
    # paste; nesting of the post-loop sections follows the most plausible
    # reading -- confirm against the original file.
    dict_line = {}  # dedup key -> items sharing that key (first one is kept)
    # list_data = getRow_ots(rows)
    _index = 0
    for row in rows:
        _index += 1
        item = {}
        _dict = row
        set_dict_item(item,"docid",_dict.get("docid",""))
        # set_dict_item(item,"attachment_extract_status",_dict.get("attachment_extract_status",""))
        set_dict_item(item,"crtime",_dict.get("crtime",""))
        # set_dict_item(item,"要素数",_dict.get("extract_count",0))
        set_dict_item(item,"公告标题",_dict.get("doctitle",""))
        set_dict_item(item,"web_source_no",_dict.get("web_source_no",""))
        set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
        set_dict_item(item,"正文实体",_dict.get("nlp_enterprise",""))
        set_dict_item(item,"附件实体",_dict.get("nlp_enterprise_attachment",""))
        # set_dict_item(item,"web_source_name",_dict.get("web_source_name",""))
        # set_dict_item(item,"原公告类别",dict_channel.get(_dict.get("original_docchannel",""),""))
        set_dict_item(item,"公告内容",getLegal_str(_dict.get("doctextcon","")))
        set_dict_item(item,"附件内容",getLegal_str(_dict.get("attachmenttextcon","")))
        if "keyword" in _dict:
            set_dict_item(item,"关键词",_dict["keyword"])
        else:
            # Rebuild the matched-keyword column: scan title + first 30k chars
            # of body and attachment (full-width parens normalised to ASCII)
            # for any of the supplied keywords.
            _wholeword = re.sub("\s","",str(row.get("doctitle","")+row.get("doctextcon","")[:30000]+row.get("attachmenttextcon","")[:30000]).replace("(","(").replace(")",")"))
            _pattern = "|".join([re.escape(str(a).replace("(","(").replace(")",")")) for a in list_keyword])
            set_dict_item(item,"关键词",",".join(list(set(re.findall(_pattern,_wholeword)))))
        # set_dict_item(item,"关键词",_dict.get("keyword",""))
        set_dict_item(item,"产品",_dict.get("product",""))
        set_dict_item(item,"服务期限",_dict.get("service_time",""))
        set_dict_item(item,"省份",_dict.get("province",""))
        # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
        set_dict_item(item,"城市",_dict.get("city",""))
        set_dict_item(item,"区县",_dict.get("district",""))
        set_dict_item(item,"发布时间",_dict.get("page_time",""))
        set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
        set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
        # set_dict_item(item,"创建时间",_dict.get("crtime",""))
        set_dict_item(item,"招标方式",_dict.get("bidway",""))
        set_dict_item(item,"行业一级分类",_dict.get("industry",""))
        set_dict_item(item,"行业二级分类",_dict.get("info_type",""))
        set_dict_item(item,"uuid",_dict.get("uuid"))
        # set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
        set_dict_item(item,"项目编号",_dict.get("project_code",""))
        set_dict_item(item,"项目名称",_dict.get("project_name",""))
        set_dict_item(item,"招标单位",_dict.get("tenderee",""))
        set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
        set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
        set_dict_item(item,"代理单位",_dict.get("agency",""))
        set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
        set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
        set_dict_item(item,"评审专家",_dict.get("person_review",""))
        set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
        set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
        set_dict_item(item,"获取文件开始时间",_dict.get("time_get_file_start",""))
        set_dict_item(item,"获取文件结束时间",_dict.get("time_get_file_end",""))
        set_dict_item(item,"保证金递交开始时间",_dict.get("time_earnest_money_start",""))
        set_dict_item(item,"保证金递交结束时间",_dict.get("time_earnest_money_end",""))
        # Unpack per-package award information from sub_docs_json.
        sub_docs_json = _dict.get("sub_docs_json")
        set_tenderer = set()
        multi_win = []
        multi_package_win = []
        win_joint = []
        if sub_docs_json is not None:
            docs = json.loads(sub_docs_json)
            # Sort ascending by winning price so the last write (highest price)
            # wins when several packages set the same single-valued column.
            # NOTE(review): float(x.get("win_bid_price",0)) raises ValueError if
            # the price is an empty string -- confirm upstream never emits "".
            docs.sort(key=lambda x:float(x.get("win_bid_price",0)))
            for _doc in docs:
                if "win_tenderer" in _doc:
                    set_dict_item(item,"中标单位",_doc["win_tenderer"])
                    multi_package_win.append(_doc["win_tenderer"])
                if "second_tenderer" in _doc:
                    set_dict_item(item,"第二候选单位",_doc["second_tenderer"])
                    set_tenderer.add(_doc.get("second_tenderer"))
                if "third_tenderer" in _doc:
                    set_dict_item(item,"第三候选单位",_doc["third_tenderer"])
                    set_tenderer.add(_doc.get("third_tenderer"))
                if "win_tenderer_manager" in _doc:
                    set_dict_item(item,"中标单位联系人",_doc["win_tenderer_manager"])
                if "win_tenderer_phone" in _doc:
                    set_dict_item(item,"中标单位联系电话",_doc["win_tenderer_phone"])
                if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
                    set_dict_item(item,"中标金额",_doc["win_bid_price"])
                if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
                    set_dict_item(item,"招标金额",_doc["bidding_budget"])
                # Joint winners: comma-separated string form ...
                if "win_tenderer_joint" in _doc:
                    win_tenderer_joints = _doc.get("win_tenderer_joint","").split(",")
                    for _joints in win_tenderer_joints:
                        win_joint.append(_joints)
                # ... or JSON list-of-objects form.
                if "win_tenderer_joints" in _doc:
                    win_tenderer_joints = json.loads(_doc["win_tenderer_joints"])
                    for _win_joint in win_tenderer_joints:
                        win_joint.append(_win_joint.get("name",""))
                # Multiple winners: same dual representation.
                if "multi_winner" in _doc:
                    multi_winners = _doc.get("multi_winner","").split(",")
                    for _mul in multi_winners:
                        multi_win.append(_mul)
                if "multi_winners" in _doc:
                    multi_winners = json.loads(_doc["multi_winners"])
                    for multi_w in multi_winners:
                        multi_win.append(multi_w.get("name",""))
        set_dict_item(item,"多标段中标人",",".join(list(multi_package_win)))
        set_dict_item(item,"多中标人",",".join(list(multi_win)))
        set_dict_item(item,"联合中标人",",".join(list(win_joint)))
        set_dict_item(item,"入围供应商",",".join(list(set_tenderer)))
        # Guarantee every award column exists so the df_data lists stay aligned.
        if "第二候选单位" not in item:
            set_dict_item(item,"第二候选单位","")
        if "第三候选单位" not in item:
            set_dict_item(item,"第三候选单位","")
        if "招标金额" not in item:
            set_dict_item(item,"招标金额","")
        if "中标金额" not in item:
            set_dict_item(item,"中标金额","")
        if "中标单位" not in item:
            set_dict_item(item,"中标单位","")
        if "中标单位联系人" not in item:
            set_dict_item(item,"中标单位联系人","")
        if "中标单位联系电话" not in item:
            set_dict_item(item,"中标单位联系电话","")
        set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
        set_dict_item(item,"detail_link",_dict.get("detail_link",""))
        # if item["中标单位"] not in set_enter:
        #     continue
        if not dumplicate:
            # Dedup key 1: channel + project code + parties + amounts.
            if item["项目编号"]!="":
                _line = "%s-%s-%s-%s-%s-%s"%(item["公告类别"],item["项目编号"],item["招标单位"],item["中标单位"],str(item["招标金额"]),str(item["中标金额"]))
                if _line in dict_line:
                    dict_line[_line].append(item)
                    continue
                dict_line[_line] = [item]
            # Dedup key 2: title + channel + parties + amounts.
            _line2 = "%s-%s-%s-%s-%s-%s"%(item["公告标题"],item["公告类别"],item["招标单位"],str(item["招标金额"]),item["中标单位"],str(item["中标金额"]))
            if _line2 in dict_line:
                dict_line[_line2].append(item)
                continue
            dict_line[_line2] = [item]
        # if re.search("[大中小]学|幼儿园|医院|公司",item["招标单位"]) is not None:
        #     continue
        # if _dict.get("docid","") in set_ig_docid:
        #     continue
        # if item["招标金额"]=="":
        #     continue
        for k,v in item.items():
            if k not in df_data:
                df_data[k] = []
            df_data[k].append(v)
    if not dumplicate:
        # Dump duplicate groups (keys matched by more than one item) to an
        # Excel file for manual review; group_id numbers the groups.
        dict_dump = {}
        columns = ["group_id"]
        columns.extend(list_df_columns)
        for k in columns:
            dict_dump[k] = []
        group_id = 1
        for k,v in dict_line.items():
            if len(v)==1:
                continue
            for item in v:
                dict_dump["group_id"].append(group_id)
                for k in list_df_columns:
                    dict_dump[k].append(item.get(k))
            group_id += 1
        df_dump = pd.DataFrame(dict_dump)
        df_dump.to_excel("%s/../data/dumplicate/%s_重复数据.xlsx"%(os.path.dirname(__file__),getCurrent_date("%Y-%m-%d_%H%M%S")))
  1901. def getRowData_shenpi(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
  1902. dict_line = {}
  1903. # list_data = getRow_ots(rows)
  1904. _index = 0
  1905. for row in rows:
  1906. _index += 1
  1907. item = {}
  1908. _dict = row
  1909. extract_json = json.loads(_dict.get("extract_json","{}"))
  1910. set_dict_item(item,"docid",_dict.get("docid",""))
  1911. set_dict_item(item,"original_id",_dict.get("original_id",""))
  1912. # set_dict_item(item,"attachment_extract_status",_dict.get("attachment_extract_status",""))
  1913. set_dict_item(item,"crtime",_dict.get("crtime",""))
  1914. set_dict_item(item,"province",_dict.get("province",""))
  1915. set_dict_item(item,"city",_dict.get("city",""))
  1916. set_dict_item(item,"district",_dict.get("district",""))
  1917. set_dict_item(item,"单位集合",json.dumps(extract_json.get("dict_enterprise",""),ensure_ascii=False))
  1918. # set_dict_item(item,"要素数",_dict.get("extract_count",0))
  1919. set_dict_item(item,"公告标题",_dict.get("doctitle",""))
  1920. set_dict_item(item,"web_source_no",_dict.get("web_source_no",""))
  1921. set_dict_item(item,"web_source_name",_dict.get("web_source_name",""))
  1922. # set_dict_item(item,"原公告类别",dict_channel.get(_dict.get("original_docchannel",""),""))
  1923. # set_dict_item(item,"detail_link",_dict.get("detail_link",""))
  1924. set_dict_item(item,"产品",_dict.get("product",""))
  1925. set_dict_item(item,"发布时间",_dict.get("page_time",""))
  1926. set_dict_item(item,"行业一级分类",_dict.get("industry",""))
  1927. set_dict_item(item,"行业二级分类",_dict.get("info_type",""))
  1928. # set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
  1929. set_dict_item(item,"项目编号",_dict.get("project_code",""))
  1930. set_dict_item(item,"项目名称",_dict.get("project_name",""))
  1931. set_dict_item(item,"审批事项",_dict.get("approval_items",""))
  1932. set_dict_item(item,"审批结果",_dict.get("approval_result",""))
  1933. set_dict_item(item,"审批部门",_dict.get("approver",""))
  1934. set_dict_item(item,"建设单位",_dict.get("construct_company",""))
  1935. set_dict_item(item,"建设规模",_dict.get("construction_scale",""))
  1936. set_dict_item(item,"申报单位",_dict.get("declare_company",""))
  1937. set_dict_item(item,"审批文号",_dict.get("doc_num",""))
  1938. set_dict_item(item,"环评机构",_dict.get("evaluation_agency",""))
  1939. set_dict_item(item,"项目法人",_dict.get("legal_person",""))
  1940. set_dict_item(item,"资金来源",_dict.get("moneysource",""))
  1941. set_dict_item(item,"申报类型",_dict.get("pro_type",""))
  1942. set_dict_item(item,"项目地址",_dict.get("project_addr",""))
  1943. set_dict_item(item,"项目属性",_dict.get("properties",""))
  1944. set_dict_item(item,"开工时间",_dict.get("time_commencement",""))
  1945. set_dict_item(item,"竣工时间",_dict.get("time_completion",""))
  1946. set_dict_item(item,"申报时间",_dict.get("time_declare",""))
  1947. set_dict_item(item,"总投资",_dict.get("total_tenderee_money",""))
  1948. set_dict_item(item,"建设年限",_dict.get("year_limit",""))
  1949. for k,v in item.items():
  1950. if k not in df_data:
  1951. df_data[k] = []
  1952. df_data[k].append(v)
  1953. def getRowData_sp1(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
  1954. dict_line = {}
  1955. # list_data = getRow_ots(rows)
  1956. _index = 0
  1957. set_id = set()
  1958. for row in rows:
  1959. _index += 1
  1960. item = {}
  1961. _dict = row
  1962. _id = _dict.get("id")
  1963. if _id is not None and _id in set_id:
  1964. continue
  1965. set_id.add(_id)
  1966. set_dict_item(item,"id",_dict.get("id",""))
  1967. # set_dict_item(item,"attachment_extract_status",_dict.get("attachment_extract_status",""))
  1968. # set_dict_item(item,"crtime",_dict.get("crtime",""))
  1969. set_dict_item(item,"detaillink",_dict.get("detaillink",""))
  1970. # set_dict_item(item,"web_source_no",_dict.get("web_source_no",""))
  1971. set_dict_item(item,"公告类别",dict_sptype.get(str(_dict.get("sp_type","")),""))
  1972. set_dict_item(item,"page_time",getLegal_str(_dict.get("page_time","")))
  1973. # set_dict_item(item,"附件内容",getLegal_str(_dict.get("attachmenttextcon","")))
  1974. set_dict_item(item,"page_title",_dict.get("page_title",""))
  1975. set_dict_item(item,"record_id",_dict.get("record_id",""))
  1976. set_dict_item(item,"web_source_name",_dict.get("web_source_name",""))
  1977. # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
  1978. set_dict_item(item,"web_source_no",_dict.get("web_source_no",""))
  1979. for k,v in item.items():
  1980. if k not in df_data:
  1981. df_data[k] = []
  1982. df_data[k].append(v)
def getRowData_sp(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
    # Approval-source variant of getRowData: flattens rows into df_data
    # (column -> list) using set_dict_item, with the same two-key duplicate
    # detection and "重复数据" Excel dump when dumplicate is False.
    # NOTE(review): indentation was reconstructed from a whitespace-mangled
    # paste -- confirm nesting against the original file.
    dict_line = {}  # dedup key -> items sharing that key
    # list_data = getRow_ots(rows)
    _index = 0
    set_id = set()  # NOTE(review): populated nowhere here (cf. getRowData_sp1) -- apparently dead
    for row in rows:
        _index += 1
        item = {}
        _dict = row
        set_dict_item(item,"docid",_dict.get("docid",""))
        # set_dict_item(item,"attachment_extract_status",_dict.get("attachment_extract_status",""))
        # set_dict_item(item,"crtime",_dict.get("crtime",""))
        set_dict_item(item,"公告标题",_dict.get("page_title",""))
        # set_dict_item(item,"web_source_no",_dict.get("web_source_no",""))
        set_dict_item(item,"公告类别",dict_sptype.get(str(_dict.get("sp_type","")),""))
        set_dict_item(item,"公告内容",getLegal_str(_dict.get("page_content","")))
        # set_dict_item(item,"附件内容",getLegal_str(_dict.get("attachmenttextcon","")))
        if "keyword" in _dict:
            set_dict_item(item,"关键词",_dict["keyword"])
        else:
            # Rebuild keyword column by scanning title/body/attachment text
            # (full-width parens normalised) for any of list_keyword.
            set_dict_item(item,"关键词",",".join(list(set(re.findall("|".join([re.escape(str(a).replace("(","(").replace(")",")")) for a in list_keyword]),re.sub("\s","",str(row.get("doctitle","")+row.get("doctextcon","")[:30000]+row.get("attachmenttextcon","")[:30000]).replace("(","(").replace(")",")")))))))
        # set_dict_item(item,"关键词",_dict.get("keyword",""))
        set_dict_item(item,"产品",_dict.get("product",""))
        set_dict_item(item,"服务期限",_dict.get("service_time",""))
        set_dict_item(item,"省份",_dict.get("province",""))
        # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
        set_dict_item(item,"城市",_dict.get("city",""))
        set_dict_item(item,"区县",_dict.get("district",""))
        set_dict_item(item,"发布时间",_dict.get("page_time",""))
        set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
        set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
        # set_dict_item(item,"创建时间",_dict.get("crtime",""))
        set_dict_item(item,"招标方式",_dict.get("bidway",""))
        set_dict_item(item,"行业一级分类",_dict.get("industry",""))
        set_dict_item(item,"行业二级分类",_dict.get("info_type",""))
        set_dict_item(item,"来源",_dict.get("web_source_name",""))
        set_dict_item(item,"uuid",_dict.get("uuid"))
        # set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
        set_dict_item(item,"项目编号",_dict.get("page_code",""))
        set_dict_item(item,"项目名称",_dict.get("project_name",""))
        set_dict_item(item,"招标单位",_dict.get("tenderee",""))
        set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
        set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
        set_dict_item(item,"代理单位",_dict.get("agency",""))
        set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
        set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
        # set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
        set_dict_item(item,"评审专家",_dict.get("person_review",""))
        set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
        set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
        set_dict_item(item,"获取文件开始时间",_dict.get("time_get_file_start",""))
        set_dict_item(item,"获取文件结束时间",_dict.get("time_get_file_end",""))
        set_dict_item(item,"保证金递交开始时间",_dict.get("time_earnest_money_start",""))
        set_dict_item(item,"保证金递交结束时间",_dict.get("time_earnest_money_end",""))
        # Unpack per-package award information from sub_docs_json.
        sub_docs_json = _dict.get("sub_docs_json")
        set_tenderer = set()
        if sub_docs_json is not None:
            docs = json.loads(sub_docs_json)
            # NOTE(review): unlike getRowData this sorts on the raw value
            # (no float()); mixed str/number prices would raise TypeError on
            # Python 3 -- confirm the field type is uniform here.
            docs.sort(key=lambda x:x.get("win_bid_price",0))
            for _doc in docs:
                if "win_tenderer" in _doc:
                    set_dict_item(item,"中标单位",_doc["win_tenderer"])
                if "second_tenderer" in _doc:
                    set_dict_item(item,"第二候选单位",_doc["second_tenderer"])
                    set_tenderer.add(_doc.get("second_tenderer"))
                if "third_tenderer" in _doc:
                    set_dict_item(item,"第三候选单位",_doc["third_tenderer"])
                    set_tenderer.add(_doc.get("third_tenderer"))
                # NOTE(review): keys spelled "win_tenderee_*" here but
                # "win_tenderer_*" in getRowData -- confirm which spelling
                # sub_docs_json actually carries; this may be a typo.
                if "win_tenderee_manager" in _doc:
                    set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
                if "win_tenderee_phone" in _doc:
                    set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
                if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
                    set_dict_item(item,"中标金额",_doc["win_bid_price"])
                if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
                    set_dict_item(item,"招标金额",_doc["bidding_budget"])
        set_dict_item(item,"入围供应商",",".join(list(set_tenderer)))
        # Guarantee every award column exists so the df_data lists stay aligned.
        if "第二候选单位" not in item:
            set_dict_item(item,"第二候选单位","")
        if "第三候选单位" not in item:
            set_dict_item(item,"第三候选单位","")
        if "招标金额" not in item:
            set_dict_item(item,"招标金额","")
        if "中标金额" not in item:
            set_dict_item(item,"中标金额","")
        if "中标单位" not in item:
            set_dict_item(item,"中标单位","")
        if "中标单位联系人" not in item:
            set_dict_item(item,"中标单位联系人","")
        if "中标单位联系电话" not in item:
            set_dict_item(item,"中标单位联系电话","")
        # if item["中标单位"] not in set_enter:
        #     continue
        if not dumplicate:
            # Dedup key 1: channel + project code + parties + amounts.
            if item["项目编号"]!="":
                _line = "%s-%s-%s-%s-%s-%s"%(item["公告类别"],item["项目编号"],item["招标单位"],item["中标单位"],str(item["招标金额"]),str(item["中标金额"]))
                if _line in dict_line:
                    dict_line[_line].append(item)
                    continue
                dict_line[_line] = [item]
            # Dedup key 2: title + channel + parties + amounts.
            _line2 = "%s-%s-%s-%s-%s-%s"%(item["公告标题"],item["公告类别"],item["招标单位"],str(item["招标金额"]),item["中标单位"],str(item["中标金额"]))
            if _line2 in dict_line:
                dict_line[_line2].append(item)
                continue
            dict_line[_line2] = [item]
        # if re.search("[大中小]学|幼儿园|医院|公司",item["招标单位"]) is not None:
        #     continue
        # if _dict.get("docid","") in set_ig_docid:
        #     continue
        # if item["招标金额"]=="":
        #     continue
        for k,v in item.items():
            if k not in df_data:
                df_data[k] = []
            df_data[k].append(v)
    if not dumplicate:
        # Dump duplicate groups (keys matched by more than one item) for review.
        dict_dump = {}
        columns = ["group_id"]
        columns.extend(list_df_columns)
        for k in columns:
            dict_dump[k] = []
        group_id = 1
        for k,v in dict_line.items():
            if len(v)==1:
                continue
            for item in v:
                dict_dump["group_id"].append(group_id)
                for k in list_df_columns:
                    dict_dump[k].append(item.get(k))
            group_id += 1
        df_dump = pd.DataFrame(dict_dump)
        df_dump.to_excel("%s/../data/dumplicate/%s_重复数据.xlsx"%(os.path.dirname(__file__),getCurrent_date("%Y-%m-%d_%H%M%S")))
  2115. def filterRow(list_row,column,list_not_keywrods):
  2116. list_result = []
  2117. for row in list_row:
  2118. _product = row.get(column,"")
  2119. sub_docs_json = row.get("sub_docs_json","")
  2120. doctitle = row.get("doctitle","")
  2121. tenderee = row.get("tenderee","")
  2122. if tenderee!="":
  2123. continue
  2124. nlp_enterprise = row.get("nlp_enterprise")
  2125. nlp_enterprise_attachment = row.get("nlp_enterprise_attachment")
  2126. list_nlp_enterprise = []
  2127. list_enterprise_attachment = []
  2128. if nlp_enterprise is not None:
  2129. list_nlp_enterprise = json.loads(nlp_enterprise)
  2130. if nlp_enterprise_attachment is not None:
  2131. list_enterprise_attachment = json.loads(nlp_enterprise_attachment)
  2132. if max(len(list_nlp_enterprise),len(list_enterprise_attachment))==1:
  2133. list_result.append(row)
  2134. # if re.search("设计",sub_docs_json) is not None:
  2135. # if re.search("装修",str(doctitle)+str(sub_docs_json)) is None:
  2136. # list_result.append(row)
  2137. # else:
  2138. # print("===",_product)
  2139. # if re.search("|".join([re.escape(i) for i in list_not_keywrods]),_product) is not None:
  2140. # continue
  2141. # list_result.append(row)
  2142. # if row.get("关键词",1)==row.get("招标单位",2) or row.get("关键词",2)==row.get("中标单位",3):
  2143. # list_result.append(row)
  2144. # doctitle = row.get("doctitle")
  2145. # doctextcon = row.get("doctextcon")
  2146. # if len(re.sub('\s','',doctextcon))==len(doctitle)+4:
  2147. # list_result.append(row)
  2148. # tenderee_phone = row.get("tenderee_phone","")
  2149. # if len(tenderee_phone)==11:
  2150. # list_result.append(row)
  2151. return list_result
# Maps the bit-flag style sp_type code (as a string) to its human-readable
# approval category label; used by getRowData_sp / getRowData_sp1 for the
# "公告类别" column.
dict_sptype = {"2":"审批信息",
               "4":"审批结果",
               "8":"核准公示",
               "16":"核准结果",
               "32":"备案公示",
               "64":"备案结果",
               "128":"推介项目",
               "256":"推介结果",
               "512":"项目撤销",
               "1024":"筹备阶段"}
  2162. def getKeywordByFile():
  2163. filename = "审批标题对比检查结果(20220831).xlsx"
  2164. df = pd.read_excel(filename,sheetname=1)
  2165. list_data = []
  2166. for _title,_no,_type in zip(df["标题"],df["编号"],df["检查结果"]):
  2167. if _type not in ["接口错"]:
  2168. continue
  2169. _dict = {"title":_title,
  2170. "web_source":_no}
  2171. list_data.append(_dict)
  2172. return list_data
def exportDocument_by_pagetime():
    """Export OTS ``document`` rows matching hand-tuned filters to an Excel file.

    Scratchpad-style export entry point: builds one BoolQuery (the keyword loop
    breaks after its first iteration), fetches rows via ``getDocument``,
    flattens them with ``getRowData`` and writes
    ``../data/<timestamp>_数据导出.xlsx``.  The many commented-out lines are
    alternative filters / post-processing steps kept so they can be toggled
    per export run.
    """
    ots_client = getConnect_ots()
    ots_capacity = getConnect_capacity()
    # Alternative column sets for other export variants (toggled by hand):
    # columns = ["doctitle","docchannel","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"]
    # columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"]
    # columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_name"]
    columns = ["doctitle","docchannel","nlp_enterprise","nlp_enterprise_attachment","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end","detail_link"]
    # columns = ["docchannel","docid","project_name","product","doctitle","page_time","province","city","time_get_file_end","time_bidclose","project_code","sub_docs_json","tenderee","info_type","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","detail_link","bidway","crtime","extract_count","products"]
    # columns = ["page_time","doctitle","crtime","web_source_no","web_source_name","detail_link","original_docchannel","uuid","docid"]
    # columns = ["time_bidclose","time_bidopen","time_bidstart","time_commencement","time_completion","time_earnest_money_end","time_earnest_money_start","time_get_file_end","time_get_file_start","time_publicity_end","time_publicity_start","time_registration_end","time_registration_start"]
    dict_channel = getDict_docchannel()
    # columns = ["doctitle","dochtmlcon","page_time","web_source_no","web_source_name","sub_docs_json"]
    # columns = ["tenderee","tenderee_contact","tenderee_phone"]
    # columns = ["extract_json","original_id","crtime","province","city","district","doctitle","web_source_no","web_source_name","product","page_time","industry","info_type","project_code","project_name","approval_items","approval_result","approver","construct_company","construction_scale","declare_company","doc_num","evaluation_agency","legal_person","moneysource","pro_type","project_addr","properties","time_commencement","time_completion","time_declare","total_tenderee_money","year_limit"]
    # columns = ["extract_json","doctitle","status","page_time","docchannel"]
    list_query = []
    # Keyword list driving the query loop below (split on whitespace and
    # Chinese/ASCII separators by splitIntoList).
    str_keyword = '''
    博物馆 、文物
    '''
    list_keyword = splitIntoList(str_keyword,"[\s\n、,,|]")
    str_con_keyword = '''
    博物馆 、文物
    '''
    con_keyword = splitIntoList(str_con_keyword,'[\s\n、,,|]')
    print(con_keyword)
    # Optional sub-query: any keyword appearing as win_tenderer.
    # Built unconditionally but only used if ``q_win`` is uncommented below.
    should_q_win = []
    for _keyword in list_keyword:
        should_q_win.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_keyword)))
    q_win = BoolQuery(should_queries=should_q_win)
    # Exclusion keywords (used only by commented-out must_not filters).
    str_not_keyword = '''
    物业、保洁、家具、装修、维修、修缮、车辆、消防、广告、印刷、安防、防雷
    地区:全国
    '''
    list_not_key = splitIntoList(str_not_keyword,"[\s\n、,,]")
    tenderee_keywrod = "医院、大学、高校、高中"
    list_t_key = splitIntoList(tenderee_keywrod,"[\s\n、,,]")
    should_q_system = [TermQuery("procurement_system","企业采购系统"),
                       TermQuery("procurement_system","部队采购系统")]
    q_system = BoolQuery(should_queries=should_q_system)
    log(str(list_keyword))
    s_province = "北京,天津,深圳,上海,浙江,江苏,安徽"
    list_province = splitIntoList(s_province,"[,,\s]")
    st = "环境监测中心、环境监测总站、环保局、水务局、水利局"
    list_tenderee = splitIntoList(st,"、|\s")
    # list_title = getKeywordByFile()
    #
    # for _d in list_title:
    #     _title = _d["title"]
    #     web_source = _d["web_source"]
    #     bool_query = BoolQuery(must_queries=[
    #         generateBoolShouldQuery(["page_title"],[_title],MatchPhraseQuery),
    #         TermQuery("web_source_no",web_source)
    #     ])
    #     list_query.append({"query":bool_query})
    # NOTE: the loop breaks after the first keyword, so exactly one query is
    # appended to list_query; the keyword itself is unused in the active filters.
    for _keyword in list_keyword:
        bool_query = BoolQuery(must_queries=[
            # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_keyword)),
            # generateBoolShouldQuery(["doctitle"],list_keyword,MatchPhraseQuery),
            # generateBoolShouldQuery(["doctitle"],["院","交通","学"],MatchPhraseQuery),
            # generateBoolShouldQuery(["doctitle"],["智慧"],MatchPhraseQuery),
            # ExistsQuery("tenderee"),
            # generateBoolShouldQuery(["doctitle"],list_keyword,MatchPhraseQuery),
            # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["学校","大学","中学","小学"],MatchPhraseQuery),
            # generateBoolShouldQuery(["web_source_name"],list_keyword,TermQuery),
            # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],list_keyword,MatchPhraseQuery),
            # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],con_keyword,MatchPhraseQuery),
            # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["服务期","服务时间","合同期限","服务范围","质保期","履行期限","履约期限","交货期"],MatchPhraseQuery),
            # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["雪茄柜"],MatchPhraseQuery),
            # BoolQuery(should_queries=[
            #     generateBoolShouldQuery(["doctitle"],["公告","公示","招标","中标","采购","工程","项目","询价","施工","比价","服务","监理","设计"],MatchPhraseQuery),
            #     generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["项目名称","招标人","中标人","项目编号","采购组织机构","采购人","招标范围","投标保证金","报价地址","询价通知书"],MatchPhraseQuery)
            # ]),
            # generateBoolShouldQuery(["doctitle"],["中标"],MatchPhraseQuery),
            # generateBoolShouldQuery(["docid"],list_keyword,TermQuery),
            # q_win,
            # should_q,
            # generateBoolShouldQuery(["tenderee"],[company],TermQuery),
            # generateBoolShouldQuery(["province"],["安徽","江苏"],TermQuery),
            # active filter: award/result channels
            generateBoolShouldQuery(["docchannel"],[52,101,118,119,120,121,122],TermQuery),
            # generateBoolShouldQuery(["docchannel"],[52],TermQuery),
            # generateBoolShouldQuery(["docchannel"],[101,118,119,120,121,122],TermQuery),
            # generateBoolShouldQuery(["docchannel"],[302],TermQuery),
            # generateBoolShouldQuery(["docchannel"],[101,119,120,121,122],TermQuery),
            # generateBoolShouldQuery(["docchannel"],[51,52,101,102,103,104,105,114,118,119,120],TermQuery),
            # generateBoolShouldQuery(["docchannel"],[102,52,101,114,119,120],TermQuery),
            # active filter: publication date window [2024-01-01, 2025-12-12)
            RangeQuery("page_time","2024-01-01","2025-12-12",True,False),
            # NestedQuery("page_attachments",WildcardQuery("page_attachments.fileMd5","*")),
            # TermQuery("web_source_name","专项债券信息网"),
            # generateBoolShouldQuery(["city"],["西安","渭南"],TermQuery),
            # TermQuery("info_type","物业管理"),
            # RangeQuery("crtime","2024-07-21 00:00:00","2024-06-21 13:00:00"),
            # TermQuery("save",0),
            # active filter: document status in [201, 301]
            RangeQuery("status",201,301,True,True),
            # RangeQuery("crtime","2024-01-01","2025-01-01",True,False),
            # RangeQuery("page_time",range_from="2024-08-01",range_to="2025-01-01"),
            # BoolQuery(should_queries=[TermQuery("page_time","2022-09-15"),
            #                           TermQuery("page_time","2022-10-20"),
            #                           TermQuery("page_time","2022-10-31")])
            # TermQuery("page_time","2025-01-07"),
            # RangeQuery("crtime","2025-05-19","2025-05-20"),
            # TermQuery("docid",237163857),
            # RangeQuery("tenderee","","1"),
            # WildcardQuery("tenderee","*雅居乐*"),
            # RangeQuery("crtime","2023-07-22 00:00:00"),
            # BoolQuery(should_queries=[NestedQuery("products",RangeQuery("products.unit_price",1)),
            #                           NestedQuery("products",RangeQuery("products.total_price",1)),])
            # NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_bid_price",1000000)),
            # NestedQuery("page_attachments",TermQuery("page_attachments.fileMd5","92775529171409a32513f134a61d73c8")),
            # active filter: province / tenderee
            TermQuery("province","广东"),
            # TermQuery("city","上海"),
            # generateBoolShouldQuery(["tenderee"],list_tenderee,WildcardQuery),
            # generateBoolShouldQuery(["tenderee"],["应急管理局","城市管理局","大数据局","政务服务管理局","消防局"],WildcardQuery),
            WildcardQuery("tenderee","*中国电信*"),
            # BoolQuery(should_queries=[WildcardQuery("tenderee","*医院*"),
            #                           WildcardQuery("tenderee","*学校*")])
            # BoolQuery(should_queries=[
            #     # NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.bidding_budget",100000000)),
            #     # NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_bid_price",100000000)),
            #     # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","奇安信网神信息技术(北京)股份有限公司")),
            #     # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_keyword)),
            # ])
            # TermQuery("procurement_system","公安系统"),
            # generateBoolShouldQuery(["province"],["重庆"],WildcardQuery),
            # generateBoolShouldQuery(["tenderee"],list_t_key,WildcardQuery)
            # generateBoolShouldQuery(["docchannel"],[101,118,119],TermQuery),
            ],
            # should_queries=[NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*乐禾*")),
            #                 NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*彩食鲜*")),
            #                 NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*望家欢*")),
            #                 ],
            # must_not_queries=[
            #     # TermQuery("exist_table",1),
            #     # WildcardQuery("tenderee","*"),
            #     # TermQuery("attachment_extract_status",1),
            #     # generateBoolShouldQuery(["tenderee"],["银行","集团","公司"],WildcardQuery),
            #     # generateBoolShouldQuery(["doctitle"],list_not_key,MatchPhraseQuery),
            #     # generateBoolShouldQuery(["province"],["湖南","广西","广东"],MatchPhraseQuery),
            #     NestedQuery("sub_docs_json",ExistsQuery("sub_docs_json.win_tenderer"))
            #     # q_system,
            #     ]
            )
        list_query.append({"query":bool_query,"limit":10000})
        break
    # list_row = getDocument(list_query,["docid","service_time"],"document","document_index")
    # list_query = []
    # for row in list_row:
    #     list_query.append({"query":TermQuery("docid",row["docid"]),"limit":1})
    list_row = getDocument(list_query,columns,"document","document_index")
    # list_row = getDocument(list_query,columns,"document_product","document_product_index")
    # list_row = getDocument(list_query,columns,"t_shen_pi_xiang_mu","t_shen_pi_xiang_mu_index")
    def judge_save(row,result_queue):
        """Compare a duplicate row with its ``best_docid`` master row.

        Fetches the master document from OTS and, when both rows share the
        same web_source_no, pushes a dict of side-by-side fields (page_time,
        doctitle, detail_link, fingerprint, ...) onto result_queue.
        NOTE: currently unused — the driver loop below is commented out.
        """
        docid = row.get("docid")
        best_docid = row.get("best_docid")
        if best_docid is not None and best_docid!="":
            best_docid = int(best_docid)
            if best_docid == int(docid):
                return
            # partition key layout for the document table: docid % 500 + 1
            consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",best_docid%500+1),("docid",best_docid)],columns_to_get=["web_source_no","page_time","doctitle","crtime","detail_link","fingerprint"])
            _dict = getRow_ots_primary(return_row)
            if _dict is not None:
                if _dict.get("web_source_no")==row.get("web_source_no"):
                    new_dict = {"dup_docid":row.get("docid"),
                                "dup_page_time":row.get("page_time"),
                                "dup_doctitle":row.get("doctitle"),
                                "dup_web_source_no":row.get("web_source_no"),
                                "dup_crtime":row.get("crtime"),
                                "dup_detail_link":row.get("detail_link"),
                                "dup_fingerprint":row.get("fingerprint"),
                                "best_docid":_dict.get("docid"),
                                "best_page_time":_dict.get("page_time"),
                                "best_doctitle":_dict.get("doctitle"),
                                "best_web_source_no":_dict.get("web_source_no"),
                                "best_crtime":_dict.get("crtime"),
                                "best_detail_link":_dict.get("detail_link"),
                                "best_fingerprint":_dict.get("fingerprint"),
                                "detail_link_same":row.get("detail_link")==_dict.get("detail_link"),
                                "fingerprint_same":row.get("fingerprint")==_dict.get("fingerprint"),
                                }
                    result_queue.put(new_dict)
    from export.html2text import html2text_with_tablehtml
    def _judge_service_time(row,result_queue):
        """Cross-check the extracted service_time of a row against an LLM answer.

        Fetches the row's dochtmlcon from the capacity instance, converts it to
        text, asks the doubao model to extract service start/end/days, and
        queues a comparison dict (``extract_equal_ai`` compares year-month
        prefixes of start/end plus exact day counts).
        NOTE: currently unused — the driver loop below is commented out.
        """
        docid = int(row.get("docid"))
        partition_key = docid%500+1
        consumed, return_row, next_token = ots_capacity.get_row("document",[("partitionkey",partition_key),("docid",docid)],columns_to_get=["dochtmlcon"])
        _dict = getRow_ots_primary(return_row)
        if _dict is not None:
            row_service_time = json.loads(row.get("service_time","{}"))
            _html = _dict.get("dochtmlcon","")
            _text = html2text_with_tablehtml(_html)
            # skip very long documents to keep the prompt within model limits
            if len(_text)>20000:
                return
            _prompt = '''
            请从以下公告中提取服务期限,其中服务开始时间和服务结束时间是yyyy-mm-dd的格式,服务天数是数字天数,没有则给""
            service_start 服务开始时间
            service_end 服务结束时间
            service_days 服务天数
            返回json格式{"service_end":"","service_start":"","service_days":""}
            '''
            _result = chat_doubao(_prompt+_text,model_name='ep-20250314164242-jd62g')
            _json = get_json_from_text(_result)
            if _json is not None:
                try:
                    _dict = json.loads(_json)
                    new_dict = {
                        "docid":row.get("docid"),
                        "service_start_extract":row_service_time.get("service_start",""),
                        "service_end_extract":row_service_time.get("service_end",""),
                        "service_days_extract":row_service_time.get("service_days",""),
                        "service_start_ai":_dict.get("service_start",""),
                        "service_end_ai":_dict.get("service_end",""),
                        "service_days_ai":_dict.get("service_days",""),
                        "extract_equal_ai":row_service_time.get("service_start","")[:7]==_dict.get("service_start","")[:7] and row_service_time.get("service_end","")[:7]==_dict.get("service_end","")[:7] and str(row_service_time.get("service_days",""))==str(_dict.get("service_days","")),
                    }
                    result_queue.put(new_dict)
                except:
                    _dict = {}
    # task_queue = Queue()
    # for row in list_row:
    #     task_queue.put(row)
    # result_queue = Queue()
    # mt = MultiThreadHandler(task_queue,_judge_service_time,result_queue,thread_count=10)
    # mt.run()
    # new_rows = []
    # while 1:
    #     try:
    #         data = result_queue.get(timeout=1)
    #         new_rows.append(data)
    #     except:
    #         break
    # list_query = []
    #
    # for _row in list_row:
    #     _uuid = uuid4().hex
    #     page_attachments = json.loads(_row.get("page_attachments"))
    #     l_s = []
    #     for _at in page_attachments:
    #         l_s.append(NestedQuery("page_attachments",TermQuery("page_attachments.fileMd5",_at.get("fileMd5"))))
    #     list_query.append({"query":BoolQuery(should_queries=l_s),"limit":500,"keyword":_uuid})
    # list_row = getDocument(list_query,columns,"document","document_index")
    # Flatten the fetched rows into a column-oriented dict and export to Excel.
    df_data = {}
    set_line = set()
    # # list_row = filterRow(list_row,"doctitle",list_not_key)
    # log("get document %d rows"%len(list_row))
    # # getRowDataWithKey(df_data,list_row,columns)
    getRowData(df_data,list_row,set_line,list_keyword,dict_channel,True)
    # # getRowData_shenpi(df_data,list_row,set_line,list_keyword,dict_channel,True)
    # # getRowData_sp1(df_data,list_row,set_line,list_keyword,dict_sptype,True)
    # fixContactPerson(df_data,list_df_columns,get_legal_person=False)
    df1 = pd.DataFrame(df_data)
    df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
    # print("get document %d rows"%len(new_rows))
    # df1 = pd.DataFrame(new_rows)
    # df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')))
    # keys = df_data.keys()
    # print("keys",keys)
    # dict_company = {}
    # set_dup_keys = set()
    # set_docid = set()
    # for _i in range(len(df_data[list(keys)[0]])):
    #     if df_data["关键词"][_i]==df_data["招标单位"][_i] or df_data["关键词"][_i]==df_data["中标单位"][_i]:
    #         company_name = df_data["关键词"][_i]
    #         if company_name not in dict_company:
    #             dict_company[company_name] = {"企业名称":company_name,"招标":[],"中标":[]}
    #         if df_data["关键词"][_i]==df_data["招标单位"][_i]:
    #             if str(df_data["招标金额"][_i])!="nan":
    #                 _key = "%s-%s"%(company_name,str(df_data["招标金额"][_i]))
    #                 if _key not in set_dup_keys:
    #                     set_docid.add(df_data["docid"][_i])
    #                     dict_company[company_name]["招标"].append({"标题":df_data["公告标题"][_i],
    #                                                              "招标方式":df_data["招标方式"][_i],
    #                                                              "招标单位":df_data["招标单位"][_i],
    #                                                              "招标金额":df_data["招标金额"][_i]})
    #                     set_dup_keys.add(_key)
    #         if df_data["关键词"][_i]==df_data["中标单位"][_i]:
    #             if str(df_data["中标金额"][_i])!="nan":
    #                 _key = "%s-%s"%(str(df_data["中标单位"][_i]),str(df_data["中标金额"][_i]))
    #                 if _key not in set_dup_keys:
    #                     set_docid.add(df_data["docid"][_i])
    #                     dict_company[company_name]["中标"].append({"标题1":df_data["公告标题"][_i],
    #                                                              "招标方式1":df_data["招标方式"][_i],
    #                                                              "中标单位1":df_data["中标单位"][_i],
    #                                                              "中标金额1":df_data["中标金额"][_i]})
    #                     set_dup_keys.add(_key)
    # df_keys = ["企业名称","标题","招标方式","招标单位","招标金额","标题1","招标方式1","中标单位1","中标金额1"]
    # df_da = {}
    # for k in df_keys:
    #     df_da[k] = []
    # for k,v in dict_company.items():
    #     list_zhaobiao = v["招标"]
    #     list_zhongbiao = v["中标"]
    #     _nums = max(min(len(list_zhaobiao),5),min(len(list_zhongbiao),5))
    #     for i in range(_nums):
    #         df_da["企业名称"].append(k)
    #         if i>=len(list_zhaobiao):
    #             df_da["标题"].append("")
    #             df_da["招标方式"].append("")
    #             df_da["招标单位"].append("")
    #             df_da["招标金额"].append("")
    #         else:
    #             df_da["标题"].append(list_zhaobiao[i]["标题"])
    #             df_da["招标方式"].append(list_zhaobiao[i]["招标方式"])
    #             df_da["招标单位"].append(list_zhaobiao[i]["招标单位"])
    #             df_da["招标金额"].append(list_zhaobiao[i]["招标金额"])
    #
    #         if i>=len(list_zhongbiao):
    #             df_da["标题1"].append("")
    #             df_da["招标方式1"].append("")
    #             df_da["中标单位1"].append("")
    #             df_da["中标金额1"].append("")
    #         else:
    #             df_da["标题1"].append(list_zhongbiao[i]["标题1"])
    #             df_da["招标方式1"].append(list_zhongbiao[i]["招标方式1"])
    #             df_da["中标单位1"].append(list_zhongbiao[i]["中标单位1"])
    #             df_da["中标金额1"].append(list_zhongbiao[i]["中标金额1"])
    # df2 = pd.DataFrame(df_da)
    # df2.to_excel("tmp333.xlsx",columns=df_keys)
    #
    # df_3 = {}
    # for k in keys:
    #     df_3[k] = []
    # for _i in range(len(df_data[list(keys)[0]])):
    #     docid = df_data["docid"][_i]
    #     if docid in set_docid:
    #         for k in keys:
    #             df_3[k].append(df_data[k][_i])
    # df3 = pd.DataFrame(df_3)
    # df3.to_excel("tmp_333_mx.xlsx",columns=keys)
    # fixContactPerson(df_data,list_df_columns)
    #
def findProjects():
    """Back-fill win info for labelled rows whose award time is missing.

    Reads 两广地区中标时间为空标注_预匹配1.xlsx, and for each row searches OTS
    ``document`` for an award notice published within 180 days of the tender
    page_time, trying project_code+project_name, then project_code only, then
    project_name only.  Matched win price / tenderer / docid are written back
    onto each item; the merged items are exported to 两广补充数据.xlsx.
    """
    df = pd.read_excel("两广地区中标时间为空标注_预匹配1.xlsx",0)
    list_items = []
    for docids,project_code,project_name,tenderee,zhao_biao_page_time in zip(df["docids"],df["project_code"],df["project_name"],df["tenderee"],df["zhao_biao_page_time"]):
        # normalize NaN cells: "$$$" acts as a never-matching placeholder keyword
        if not isinstance(project_code,(str)):
            project_code = "$$$"
        if not isinstance(project_name,(str)):
            project_name = "$$$"
        if not isinstance(tenderee,(str)):
            tenderee = ""
        print(dir(zhao_biao_page_time))
        _dict = {"docids":docids,
                 "project_code":project_code,
                 "project_name":project_name,
                 "tenderee":tenderee,
                 "zhao_biao_page_time":zhao_biao_page_time.strftime("%Y-%m-%d"),
                 # search window: 180 days after the tender announcement
                 "end_time":timeAdd(zhao_biao_page_time.strftime("%Y-%m-%d"),180)}
        list_items.append(_dict)
    task_queue = queue.Queue()
    for item in list_items:
        task_queue.put(item)
    def _handle(item,result_queue,ots_client):
        """Worker: try three increasingly loose searches; mutates ``item`` in place."""
        # exclude the docids already known for this item from every search
        docids = item.get("docids")
        list_s_n = []
        for docid in re.split(",",str(docids)):
            list_s_n.append(TermQuery("docid",docid))
        query_not = BoolQuery(should_queries=list_s_n)
        # bool_query =BoolQuery(must_queries=[query_not])
        # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
        #                                                                SearchQuery(bool_query,limit=50,get_total_count=True),
        #                                                                ColumnsToGet(["sub_docs_json"],ColumnReturnType.SPECIFIED))
        # if total_count>0:
        #     dict_rows = getRow_ots(rows)
        #     _find = False
        #     for _row in dict_rows:
        #         sub_docs_json = _row.get("sub_docs_json",'[]')
        #         sub_docs = json.loads(sub_docs_json)
        #         for _doc in sub_docs:
        #             if "bidding_budget" in _doc and _doc.get("bidding_budget",0)>0:
        #                 item["new_budding_budget"] = _doc.get("bidding_budget",0)
        #                 _find = True
        #                 break
        #         if _find:
        #             break
        #
        #     return
        # NOTE(review): _find starts as True, so the first row of the first
        # non-empty result set triggers the early return even when no
        # win_bid_price was actually matched — this looks like it should start
        # as False (matching the commented block above); confirm before reuse.
        _find = True
        # pass 1: match on project_code OR project_name
        bool_query =BoolQuery(must_queries=[
            generateBoolShouldQuery(["doctitle",'doctextcon','attachmenttextcon'],[item.get("project_code","$$$$"),item.get("project_name","$$$$")],MatchPhraseQuery),
            generateBoolShouldQuery(["docchannel"],[101,119,120],TermQuery),
            RangeQuery("status",151,301),
            NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*")),
            RangeQuery("page_time",item.get("zhao_biao_page_time"),item.get("end_time"))
            ],must_not_queries=[query_not])
        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                       SearchQuery(bool_query,limit=50,get_total_count=True),
                                                                       ColumnsToGet(["doctitle","tenderee","sub_docs_json"],ColumnReturnType.SPECIFIED))
        if total_count>0:
            dict_rows = getRow_ots(rows)
            str_docid = ""
            for _row in dict_rows:
                str_docid+="%d,"%_row.get("docid")
                sub_docs_json = _row.get("sub_docs_json",'[]')
                sub_docs = json.loads(sub_docs_json)
                # only accept candidates whose tenderee matches exactly
                if item.get("tenderee","--")!=_row.get("tenderee","-#"):
                    continue
                for _doc in sub_docs:
                    if "win_bid_price" in _doc and _doc.get("win_bid_price",0)>0:
                        item["new_win_bid_price"] = _doc.get("win_bid_price")
                        item["new_win_tenderer"] = _doc.get("win_tenderer")
                        item["new_finded_docid"] = _row.get("docid")
                        _find = True
                        break
                if _find:
                    return
            item["maybe_docids"] = str_docid
        # pass 2: match on project_code only
        bool_query =BoolQuery(must_queries=[
            generateBoolShouldQuery(["doctitle",'doctextcon','attachmenttextcon'],[item.get("project_code","$$$$")],MatchPhraseQuery),
            generateBoolShouldQuery(["docchannel"],[101,119,120],TermQuery),
            RangeQuery("status",151,301),
            NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*")),
            RangeQuery("page_time",item.get("zhao_biao_page_time"),item.get("end_time"))
            ],must_not_queries=[query_not])
        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                       SearchQuery(bool_query,limit=50,get_total_count=True),
                                                                       ColumnsToGet(["doctitle","tenderee","sub_docs_json"],ColumnReturnType.SPECIFIED))
        if total_count>0:
            dict_rows = getRow_ots(rows)
            str_docid = ""
            for _row in dict_rows:
                str_docid+="%d,"%_row.get("docid")
                sub_docs_json = _row.get("sub_docs_json",'[]')
                sub_docs = json.loads(sub_docs_json)
                if item.get("tenderee","--")!=_row.get("tenderee","-#"):
                    continue
                for _doc in sub_docs:
                    if "win_bid_price" in _doc and _doc.get("win_bid_price",0)>0:
                        item["new_win_bid_price"] = _doc.get("win_bid_price")
                        item["new_win_tenderer"] = _doc.get("win_tenderer")
                        item["new_finded_docid"] = _row.get("docid")
                        _find = True
                        break
                if _find:
                    return
            item["maybe_docids"] = str_docid
        # pass 3: match on project_name only
        bool_query =BoolQuery(must_queries=[
            generateBoolShouldQuery(["doctitle",'doctextcon','attachmenttextcon'],[item.get("project_name","$$$$")],MatchPhraseQuery),
            generateBoolShouldQuery(["docchannel"],[101,119,120],TermQuery),
            RangeQuery("status",151,301),
            NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*")),
            RangeQuery("page_time",item.get("zhao_biao_page_time"),item.get("end_time"))
            ],must_not_queries=[query_not])
        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                       SearchQuery(bool_query,limit=50,get_total_count=True),
                                                                       ColumnsToGet(["doctitle"],ColumnReturnType.SPECIFIED))
        if total_count>0:
            dict_rows = getRow_ots(rows)
            str_docid = ""
            for _row in dict_rows:
                str_docid+="%d,"%_row.get("docid")
                # NOTE(review): this pass only fetched "doctitle", so
                # sub_docs_json/tenderee fall back to their defaults here —
                # the tenderee check below will almost always 'continue'; confirm
                # whether the ColumnsToGet list was meant to match passes 1-2.
                sub_docs_json = _row.get("sub_docs_json",'[]')
                sub_docs = json.loads(sub_docs_json)
                if item.get("tenderee","--")!=_row.get("tenderee","-#"):
                    continue
                for _doc in sub_docs:
                    if "win_bid_price" in _doc and _doc.get("win_bid_price",0)>0:
                        item["new_win_bid_price"] = _doc.get("win_bid_price")
                        item["new_win_tenderer"] = _doc.get("win_tenderer")
                        item["new_finded_docid"] = _row.get("docid")
                        _find = True
                        break
                if _find:
                    return
            item["maybe_docids"] = str_docid
        return
    mt = MultiThreadHandler(task_queue,_handle,None,30,ots_client=getConnect_ots())
    mt.run()
    # collect the (mutated) items into columns; missing keys become None
    df_data = {"docids":[],
               "project_code":[],
               "project_name":[],
               "maybe_docids":[],
               "new_budding_budget":[],
               "new_win_bid_price":[],
               "new_win_tenderer":[],
               "new_finded_docid":[]}
    keys = df_data.keys()
    for item in list_items:
        for k in keys:
            df_data[k].append(item.get(k))
    df2 = pd.DataFrame(df_data)
    df2.to_excel("两广补充数据.xlsx")
  2654. def attachCompanyContact():
  2655. files = ["../data/2021-03-17_四川_关键词导出.csv",
  2656. "../data/2021-03-17_安徽_关键词导出.csv",
  2657. "../data/2021-03-17_江西_关键词导出.csv",
  2658. "../data/2021-03-17_湖南_关键词导出.csv"]
  2659. files = ["../data/20210609(最新).xlsx"]
  2660. pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
  2661. set_enter = set()
  2662. for file in files:
  2663. df = pd.read_excel(file)
  2664. columns = ["招标单位","中标单位","代理单位"]
  2665. for _c in columns:
  2666. for item in df[_c]:
  2667. if isinstance(item,str):
  2668. item = item.strip()
  2669. if item!="":
  2670. set_enter.add(item)
  2671. dict_enter = getDictEnterprise(list(set_enter))
  2672. for file in files:
  2673. task_queue = queue.Queue()
  2674. df = pd.read_excel(file,encoding="UTF8")
  2675. keys = df.keys()[1:]
  2676. list_item = []
  2677. for row in df.itertuples():
  2678. _dict = {}
  2679. for _key in keys:
  2680. if _key in dir(row):
  2681. _v = row.__getattribute__(_key)
  2682. else:
  2683. _v = ''
  2684. _dict[_key] = _v
  2685. if str(_dict["招标联系人"]) in ("","nan") or str(_dict["招标联系人电话"]) in ("","nan"):
  2686. contact_person,mobile = getOneContact(dict_enter.get(_dict["招标单位"],{}).get("contacts","[]"))
  2687. if contact_person!="":
  2688. _dict["招标联系人"] = contact_person
  2689. _dict["招标联系人电话"] = mobile
  2690. if str(_dict["中标联系人"]) in ("","nan") or str(_dict["中标联系人电话"]) in ("","nan"):
  2691. contact_person,mobile = getOneContact(dict_enter.get(_dict["中标单位"],{}).get("contacts","[]"))
  2692. if contact_person!="":
  2693. _dict["中标联系人"] = contact_person
  2694. _dict["中标联系人电话"] = mobile
  2695. if str(_dict["代理联系人"]) in ("","nan") or str(_dict["代理联系人电话"]) in ("","nan"):
  2696. contact_person,mobile = getOneContact(dict_enter.get(_dict["代理单位"],{}).get("contacts","[]"))
  2697. if contact_person!="":
  2698. _dict["代理联系人"] = contact_person
  2699. _dict["代理联系人电话"] = mobile
  2700. list_item.append(_dict)
  2701. for item in list_item:
  2702. task_queue.put(item)
  2703. df_data = {}
  2704. for _k in keys:
  2705. df_data[_k] = []
  2706. for item in list_item:
  2707. for _k in keys:
  2708. df_data[_k].append(getLegal_str(item.get(_k,"-")))
  2709. df1 = pd.DataFrame(df_data)
  2710. df1.to_excel("%s_attach.xlsx"%file,columns=keys)
  2711. def dumpWebSourceNo():
  2712. conn_oracle = getConnection_oracle()
  2713. cursor_oracle = conn_oracle.cursor()
  2714. sql = " select source_encode,source_name from bxkc.T_WEBSOURCENUM_INFO "
  2715. cursor_oracle.execute(sql)
  2716. rows = cursor_oracle.fetchall()
  2717. conn_mysql = getConnection_testmysql()
  2718. cursor_mysql = conn_mysql.cursor()
  2719. for row in rows:
  2720. sql = " insert into webSource(web_source_no,web_source_name) values('%s','%s')"%(row[0],row[1])
  2721. print(sql)
  2722. cursor_mysql.execute(sql)
  2723. conn_mysql.commit()
  2724. def exportNzj():
  2725. # filename = "../data/重复公告.xlsx"
  2726. # df = pd.read_excel(filename)
  2727. ots_client = getConnect_ots()
  2728. columns = ["contacts","covered_area","follows","docids","page_time","progress","project_description","project_follow","project_code","project_name","project_type"]
  2729. def getData(df_data,rows,set_line):
  2730. list_data = getRow_ots(rows)
  2731. for row in list_data:
  2732. item = {}
  2733. _dict = row
  2734. set_dict_item(item,"docids",_dict.get("docids",""))
  2735. set_dict_item(item,"contacts",_dict.get("contacts",""))
  2736. set_dict_item(item,"covered_area",_dict.get("covered_area",""))
  2737. set_dict_item(item,"follows",_dict.get("follows",""))
  2738. set_dict_item(item,"project_type",_dict.get("project_type",""))
  2739. # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
  2740. set_dict_item(item,"page_time",_dict.get("page_time",""))
  2741. set_dict_item(item,"progress",_dict.get("progress",""))
  2742. set_dict_item(item,"project_description",_dict.get("project_description",""))
  2743. set_dict_item(item,"project_follow",_dict.get("project_follow",""))
  2744. set_dict_item(item,"project_code",_dict.get("project_code",""))
  2745. set_dict_item(item,"project_name",_dict.get("project_name",""))
  2746. for k,v in item.items():
  2747. if k not in df_data:
  2748. df_data[k] = []
  2749. df_data[k].append(v)
  2750. df_data = {}
  2751. bool_query = BoolQuery(must_queries=[ExistsQuery("docids")])
  2752. rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
  2753. SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("id",SortOrder.ASC)]), limit=100, get_total_count=True),
  2754. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  2755. set_line = set()
  2756. _count = len(rows)
  2757. getData(df_data,rows,set_line)
  2758. while next_token:
  2759. print("%d/%d"%(_count,total_count))
  2760. rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
  2761. SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
  2762. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  2763. getData(df_data,rows,set_line)
  2764. _count += len(rows)
  2765. df1 = pd.DataFrame(df_data)
  2766. df1.to_excel("../data/2021-03-31_拟在建数据导出1.xlsx",columns=list_df_columns)
  2767. def turn_status():
  2768. df = pd.read_excel("../data/欧科自然资源5w以上数据.xlsx")
  2769. conn = getConnection_testmysql()
  2770. cursor = conn.cursor()
  2771. for docid in df["公告id"]:
  2772. partitionkey = int(docid)%500+1
  2773. sql = " insert into turn_status(partitionkey,docid) values(%d,%d)"%(partitionkey,docid)
  2774. cursor.execute(sql)
  2775. conn.commit()
  2776. def attachBidding_budget():
  2777. conn_mysql = getConnection_testmysql()
  2778. cursor = conn_mysql.cursor()
  2779. sql = "select docid from analysis_r2 where bidding_budget=''"
  2780. task_queue = queue.Queue()
  2781. result_queue = queue.Queue()
  2782. cursor.execute(sql)
  2783. rows = cursor.fetchmany(10000)
  2784. while(rows):
  2785. for row in rows:
  2786. task_queue.put(row[0])
  2787. rows = cursor.fetchmany(10000)
  2788. pool_mysql = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_testmysql)
  2789. pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
  2790. def _handle(item,result_queue,pool_mysql,pool_ots):
  2791. ots_client = pool_ots.getConnector()
  2792. bool_query = BoolQuery(must_queries=[TermQuery("docids",item)])
  2793. rows, next_token, total_count, is_all_succeed = ots_client.search("project2", "project2_index",
  2794. SearchQuery(bool_query , limit=1, get_total_count=True),
  2795. ColumnsToGet(["bidding_budget"],return_type=ColumnReturnType.SPECIFIED))
  2796. list_dict = getRow_ots(rows)
  2797. if len(list_dict)>0:
  2798. conn = pool_mysql.getConnector()
  2799. cursor = conn.cursor()
  2800. sql = " update analysis_r2 set bidding_budget='%s' where docid=%d"%(str(list_dict[0].get("bidding_budget","")),item)
  2801. cursor.execute(sql)
  2802. conn.commit()
  2803. pool_mysql.putConnector(conn)
  2804. pool_ots.putConnector(ots_client)
  2805. mt = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_mysql=pool_mysql,pool_ots=pool_ots)
  2806. mt.run()
  2807. def debug_documentMerge():
  2808. conn = getConnection_testmysql()
  2809. cursor = conn.cursor()
  2810. sql = "select merge_docids from project_group_final_log "
  2811. cursor.execute(sql)
  2812. task_queue = queue.Queue()
  2813. for row in cursor.fetchall():
  2814. task_queue.put(row[0])
  2815. print(task_queue.qsize())
  2816. def _handle(item,result_queue,pool_ots):
  2817. ots_client = pool_ots.getConnector()
  2818. list_docids = item.split(",")
  2819. must_q = []
  2820. for _docid in list_docids:
  2821. must_q.append(TermQuery("docids",_docid))
  2822. bool_query = BoolQuery(must_queries=must_q)
  2823. rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
  2824. SearchQuery(bool_query,limit=1,get_total_count=True),
  2825. ColumnsToGet(column_names=["docids"],return_type=ColumnReturnType.SPECIFIED))
  2826. if total_count==0:
  2827. print(item)
  2828. result_queue.put(item)
  2829. pool_ots.putConnector(ots_client)
  2830. result_queue = queue.Queue()
  2831. pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
  2832. mt = MultiThreadHandler(task_queue,_handle,result_queue,30,pool_ots=pool_ots)
  2833. mt.run()
  2834. while(True):
  2835. try:
  2836. item = result_queue.get(True)
  2837. print(item)
  2838. except Exception as e:
  2839. print(str(e))
  2840. break
  2841. def signDocument():
  2842. filename = "C:\\Users\\Administrator\\Desktop\\中标信息1.xlsx"
  2843. sign_filename = "%s_sign.xlsx"%filename
  2844. df = pd.read_excel(filename)
  2845. df_data = {"sign":[]}
  2846. for item in df["segword"]:
  2847. content = re.sub("\s*","",item)
  2848. _find = re.search("(?P<key>采购失败|招标失败)",content)
  2849. if _find is not None:
  2850. df_data["sign"].append(_find.groupdict().get("key"))
  2851. else:
  2852. df_data["sign"].append("无")
  2853. df1 = pd.DataFrame(df_data)
  2854. df1.to_excel(sign_filename)
def exportWin_tenderer_count():
    """Count winning-bid documents per enterprise.

    For every company name in the hard-coded str_enter list, query the
    document index for documents since 2020-01-01 with status 201..300 whose
    sub_docs_json.win_tenderer equals the name, store the hit count on the
    item as "total_count", and dump the results to a timestamped excel file.
    """
    ots_client = getConnect_ots()
    # newline-separated enterprise names; parsed below with re.split("\s",...)
    # so indentation/blank lines inside the literal are irrelevant
    str_enter = '''
红旗渠建设集团有限公司
河南瑞华建筑集团有限公司
林州瑞达工程管理有限公司
河南鸿盛建筑工程有限公司
天一建设发展有限公司
河南省大成建设工程有限公司
中润昌弘建工集团有限公司
河南省中创建筑工程有限公司
河南锦达建设有限公司
林州宏基建筑工程有限公司
河南富世建筑工程有限公司
中恒方圆建筑工程有限公司
河南华隆建设工程有限公司
河南昊锦建设集团有限公司
河南新隆建工集团有限公司
中城华安建设集团有限公司
河南恒通公路桥梁建设有限公司
林州二建集团建设有限公司
河南华安建设集团有限公司
河南裕鸿建筑工程有限公司
中商建投建设有限公司
河南鑫利恒工程有限公司
林州市永盛建筑有限公司
林州市顺鑫建筑有限公司
中水京林建设有限公司
河南宏海建设有限公司
河南宏岳建设有限公司
河南元筑实业有限公司
河南基兆建筑工程有限公司
林州金瑞建筑工程有限公司
林州建工集团有限公司
河南万融建筑工程有限公司
林州东风建设有限公司
河南鸿泰建筑安装有限公司
河南源泰建筑有限公司
河南优德建筑工程有限公司
安阳鸿盛建设劳务有限公司
河南省安阳市安装工程有限责任公司
河南港城建设工程有限公司
河南天汇建筑工程有限公司
河南省惠浦建设发展有限公司
林州市建筑工程有限公司
河南正天建筑工程有限公司
河南颂邦建筑工程有限公司
林州市华源建设有限公司
河南中投建设有限公司
林州华林建筑劳务有限公司
河南基祥建设工程有限公司
河南文水水电工程有限公司
林州兴业建筑工程有限公司
河南中州建筑有限公司
河南省佳禾园林建设有限公司
林州万亚建筑工程有限公司
河南林正建设工程有限公司
河南鼎兴建设工程有限公司
河南平原建工集团有限公司
河南林九建设工程有限公司
林州市三才建筑工程有限公司
安阳建设(集团)有限责任公司
盛世恒达建设有限公司
河南城洲建设工程有限公司
河南国埔建筑工程有限公司
中创市政建设发展有限公司
河南正祥建筑工程有限公司
河南宏九建筑工程有限公司
河南金阳建筑工程有限公司
河南天容建设工程有限责任公司
河南聚宇建筑工程有限公司
河南瑞旗建设工程有限公司
河南利盛建设工程有限公司
林州四海建设有限公司
林州市建工城建集团有限公司
河南众佑建设工程有限公司
河南德诚建设有限公司
河南景华建筑工程有限公司
河南华江建筑工程有限公司
林州永丰建设集团有限公司
林州福东建设工程有限公司
河南恒森建筑工程有限公司
河南朝泓建设工程有限公司
河南润京建设有限公司
林州市红旗渠公路工程有限公司
林州中宇建设工程有限公司
河南长锦建设工程有限公司
河南汇商建筑工程有限公司
河南省豫鹤建设工程有限公司
河南江城建筑工程有限公司
中海华祥建设发展有限公司
河南宁中路桥建筑有限公司
河南天河建设工程有限公司
林州市路桥建筑工程有限公司
河南省中北建设有限公司
河南汇亿建筑工程有限公司
河南金帝建筑安装有限公司
河南省望安建筑工程有限公司
泰欣建设有限公司
河南筑鑫建筑工程有限公司
元熙建设工程有限公司
旭隆建设集团有限公司
河南省城控建工集团有限公司
河南晨丰建筑工程有限公司
河南嘉丰建设有限公司
林州市合众建筑劳务有限公司
河南金瓦刀建筑劳务有限公司
河南中实建筑工程有限公司
畅通路桥工程建设有限责任公司
河南军恒建设有限公司
中钊建设集团有限公司
河南德宁建设集团有限公司
林州兴鸿建筑工程有限公司
林州市明泽建筑工程有限公司
河南紫光建筑工程有限公司
河南誉天建筑工程有限公司
林州景丰建筑劳务有限公司
河南江瀚建筑劳务有限公司
河南弘之昌建筑工程有限公司
河南祥泰钻井工程有限公司
河南迅阳建筑劳务有限公司
河南嘉成建筑工程有限公司
河南兴锦建设工程有限公司
河南邦坤建设工程有限公司
河南锦毅市政工程建筑有限公司
河南广益建筑工程有限公司
河南创胜建筑工程有限公司
河南勤铭建筑工程有限公司
河南铭锋建设工程有限公司
平源建设有限公司
河南隆通建筑工程有限公司
河南省基本建设有限公司
河南丰茂建筑劳务有限公司
河南城安建筑工程有限公司
林州市富源建筑劳务有限公司
德方建设有限公司
河南泰联建筑工程有限公司
河南新建投工程有限公司
河南省鲁班建工集团有限公司
林州方超建筑劳务有限公司
林州市采桑建筑劳务输出有限公司
河南省仁昱建筑工程有限公司
河南鸾林建设工程有限公司
宜民建设集团有限公司
林州聚兴建筑工程有限公司
河南省聚国建筑工程有限公司
林州市大东建筑劳务有限公司
河南欣东劳务有限公司
中建润德景观建筑工程有限公司
河南辰宇建设工程有限公司
号东建设工程有限公司
河南润北建筑工程有限公司
河南邦昊建设工程有限公司
林州市建设投资有限责任公司
林州市太行建设工程有限公司
河南峡安建筑工程有限公司
河南安疆建筑工程有限公司
河南淇河建设工程有限公司
河南晶品建设有限公司
河南翔固建筑工程有限公司
纵横九州路桥建设有限公司
河南青林建筑工程有限公司
合久建设有限公司
河南明昊建筑工程有限公司
河南滨河建设工程有限公司
河南群腾建筑工程有限公司
河南隆亨建筑工程有限公司
骏达建设有限公司
河南仁安建设工程有限公司
河南旻尚园林建筑工程有限公司
河南省匡正建设工程有限公司
河南金凡建筑工程有限公司
河南佰丰建筑工程有限公司
德普建设有限公司
国润新天地工程技术有限公司
中潮建设发展有限公司
河南捷正建筑工程有限公司
林州百万工匠建筑劳务有限公司
河南祥彬建筑工程有限公司
河南林祥工程建设有限公司
河南唐尧建筑劳务有限公司
河南汇祥建设有限公司
河南友信建设有限公司
林州市鼎昇建筑工程有限公司
林州市富兴建筑劳务有限公司
林州厚德建筑劳务有限公司
河南振亚工程建设有限公司
河南英茂建筑工程有限公司
河南丰岩建设工程有限公司
林州市昌都建筑工程有限公司
林州四建建筑工程有限公司
林州和兴建筑劳务有限公司
林州市鸿升建筑工程有限公司
河南润泰建设工程有限公司
河南鑫路通建筑劳务有限公司
河南信守建筑劳务有限公司
林州安达鸿昌建筑劳务有限公司
河南意达建设有限公司
河南金穗来建筑工程有限公司
河南东风建筑工程有限公司
河南筑胜建筑劳务有限公司
河南民润建筑工程有限公司
林州市中锦路桥建设工程有限公司
林州一建建筑工程有限公司
林州市宏瑞建筑劳务有限公司
林州鸿恩建筑劳务有限公司
河南晟元建筑工程有限公司
中国建筑第六工程局有限公司
河南筑泰建筑工程有限公司
河南省亚建建筑工程有限公司
河南辰弘建筑工程有限公司
河南先创建筑工程有限公司
林豫建工集团有限公司
河南省盛民建筑工程有限公司
河南泓发市政工程有限公司
河南帝恩建筑劳务有限公司
河南天泉建设工程有限公司
河南恒升工程建设有限公司
林州市浩远电力建筑工程有限公司
河南友瑞建筑工程有限公司
河南冠州路桥工程有限公司
三角鼎建设工程有限公司
河南富坤建筑工程有限公司
林州市恒源建筑工程有限公司
河南广汇建筑工程有限公司
河南隆豫建设有限公司
林州市九洲工程劳务有限公司
林州瑜辉建筑工程有限公司
河南福恩建筑工程有限公司
河南通盛路桥建设有限公司
河南央泰建设工程有限公司
林州市红旗渠公路养护工程有限公司
林州大兴建设工程有限公司
河南锐丰建设工程有限公司
林州市中泰建筑劳务有限公司
林州成业建筑工程有限公司
河南建创建筑工程有限公司
河南宏兴建设工程有限公司
河南隆鼎建筑工程有限公司
林州市天罡建筑劳务有限公司
汇聚建设发展有限公司
中铁中城工程有限公司
河南景天建筑劳务有限公司
林州蒙建建设工程有限公司
富华建设工程有限公司
河南殿轩建筑劳务有限公司
河南瑞通建设工程有限公司
林州金桥劳务工程有限公司
河南省景隆实业有限公司
河南升洲建筑工程有限公司
河南里程建筑劳务有限公司
林州市润景建设工程有限公司
河南巨坤建筑工程有限公司
河南九牛建设劳务有限公司
吉修建设工程有限公司
河南图润建筑工程有限公司
河南鼎鑫建筑劳务有限公司
河南港航建设工程有限公司
河南省盛飞建设工程有限公司
林州市兴义建筑劳务有限公司
河南秉程建筑工程有限公司
河南硕亚水电路桥工程有限公司
河南科才建筑劳务有限公司
河南荣泰建筑安装工程有限公司
河南省天丰建筑工程有限公司
河南方元建筑工程有限公司
恒上建设有限公司
河南省德信建筑工程有限公司
河南诚宸建设工程有限公司
河南置信建筑工程有限公司
河南省鑫河建设有限公司
河南成兴建设工程有限公司
林州中港建筑工程有限公司
河南富春建设工程有限公司
中科豫资建设发展有限公司
河南京都建筑安装有限公司
安阳市宇豪爆破工程有限公司
河南华特建筑工程有限公司
河南颍淮建工有限公司
林州市八建工程有限公司
河南展辉建筑工程有限公司
河南中博建筑有限公司
河南方圆建设有限公司
河南大鼎建筑工程有限公司
林州中天建设有限公司
河南久东建筑工程有限公司
河南九一建设工程有限公司
九州水文建设集团有限公司
河南省建安防水防腐工程有限公司
中建宏图建设发展有限公司
筑宇建设有限公司
林州市宏图建设工程有限公司
河南林润建设工程有限公司
嘉泰建设发展有限公司
河南丰茂建筑安装工程有限公司
河南万泰建设工程有限公司
林州市红旗渠市政工程有限公司
林州建总建筑工程有限公司
河南聚之祥建设有限公司
河南鼎之信建设工程有限公司
河南省华瑞建设工程有限公司
河南世光电力工程有限公司
河南地远建筑工程有限公司
河南鑫品建筑工程有限公司
河南省东旗建筑工程有限公司
润华建设有限公司
林州富民建筑劳务有限公司
林州市晨诚建筑劳务有限公司
河南万胜建设有限公司
河南龙磐建筑工程有限公司
河南顺昌建筑劳务有限公司
林州恒瑞建设工程有限公司
河南大成建设劳务有限公司
河南大一建筑劳务有限公司
河南盛威建筑工程有限公司
河南坤之宇建筑工程有限公司
众信电力工程有限公司
河南昱佛建筑工程有限公司
河南淇源建筑工程有限公司
林州凤宝建筑安装有限公司
河南中发岩土工程有限公司
河南中都建设工程有限公司
河南祥凯建筑工程有限公司
河南乐泰建筑工程有限公司
林州宏达建筑劳务有限公司
河南华盛建设集团有限公司
河南凯通建设工程有限公司
国腾路桥工程有限公司
中建方达建设工程有限公司
河南省天都建设工程有限公司
昌隆建设工程有限公司
河南洹上村园林绿化工程有限公司
河南双锦建设工程有限公司
河南子丰市政工程有限公司
林州首创建筑工程有限公司
河南众鑫建筑工程有限公司
河南宁崴建筑工程有限公司
林州市航安建筑劳务有限公司
林州益成建设工程有限公司
林州市昌弘建筑工程有限公司
河南正耀建设有限公司
河南鑫鹏建设工程有限公司
林州恒泰建筑工程有限公司
林竣建设有限公司
河南朝众建筑工程有限公司
林州科鸿建筑工程有限公司
东辰建设发展有限公司
河南创新新能源科技有限公司
河南省永业建筑工程有限公司
林州市煜凯建筑工程有限公司
宝鼎建设工程有限公司
林州市航安建筑工程有限公司
河南业展建设工程有限公司
河南联竣建筑工程有限公司
河南聚超建筑工程有限公司
林州远方电力工程有限公司
河南蒙寅建筑劳务有限公司
方元建筑劳务有限公司
龙兴建设工程有限公司
河南春谦建设工程有限公司
河南正博公路工程有限公司
林州市汇鑫安装工程有限公司
林州市祥隆劳务有限公司
河南胜杰建筑工程有限公司
河南恩普建筑工程有限公司
河南港津建筑工程有限公司
河南昌明建筑工程有限公司
中豫城控建设集团有限公司
林州晨宇建设工程有限公司
河南豫柯建筑工程有限公司
河南捷润建筑工程有限公司
中方通建设工程有限公司
河南多果建筑工程有限公司
河南尚伟建筑工程有限公司
林州新航程建筑工程有限公司
河南金华建筑工程有限公司
国云工程技术有限公司
河南路威路桥工程有限公司
林州中盛建设工程有限公司
林州市恒基建设有限公司
河南润恒建筑工程有限公司
河南华安水利工程有限公司
中城易通建设发展有限公司
河南浚洲建筑工程有限公司
林州市锦晟建筑劳务有限公司
河南省北安建筑工程有限公司
林州泰岳建设工程有限公司
河南联洋建筑工程有限公司
河南港大市政建筑工程有限公司
林州东盛建筑劳务有限公司
河南省天鉴建设工程有限公司
河南瑞凝建筑工程有限公司
林州市东瑞建筑劳务有限公司
河南众达建筑劳务有限公司
河南省帝增建筑工程有限公司
河南省升灿建筑工程有限公司
河南苑景建筑劳务分包有限公司
林州众立建设工程有限公司
红旺建筑工程有限公司
林州市圣兴建筑劳务有限公司
林州诚林建筑劳务有限公司
林州建工劳务有限公司
河南巨业建筑工程有限公司
中科华夏建设开发有限公司
君晟建筑工程有限公司
郑州新动力建筑劳务分包有限公司
河南省福德建筑工程有限公司
林州源大建筑工程有限公司
河南大瑞园林建设有限公司
河南秋禾建筑劳务有限公司
河南腾翔建筑工程有限公司
河南天之华建设工程有限公司
河南祥和建筑安装有限公司
河南省鼎文建设工程有限公司
河南周城建设发展有限公司
河南庆泰建筑工程有限公司
中科信合建设发展有限公司
林州恒隆建设工程有限公司
河南省力恒建筑工程有限公司
林州市四季青绿化有限责任公司
林州市景盛建筑工程有限公司
河南建基建设工程有限公司
河南宝凯建筑工程有限公司
林州市四合建筑劳务有限公司
河南和耀建筑工程有限公司
林州市凯达建筑劳务有限公司
林州市恒信建筑劳务有限公司
开翔建设工程有限公司
河南省新创达建设工程有限公司
林州鑫龙建筑工程有限公司
河南省昌博建筑工程有限公司
河南君利泰建筑工程有限公司
林州杏林建筑工程有限公司
河南千禧建设工程有限公司
中建诚正建筑工程有限公司
河南省聚千建筑工程有限公司
林州海之鸿建筑工程有限公司
河南振鼎建筑工程有限公司
林州方成建筑劳务有限公司
河南众众建设工程有限公司
林州市万润建筑劳务有限公司
启创建设工程有限公司
河南子明建筑工程有限公司
安阳市兴鼎路桥工程有限公司
河南智擎建筑劳务有限公司
河南鼎平市政工程有限公司
林州宏阳建筑工程有限公司
河南豫泰建筑工程有限公司
林州市鸿浩建筑劳务有限公司
林州市锦华建筑工程有限公司
河南瑞锋建设有限公司
河南欧信建筑劳务有限公司
林州市中兴建筑劳务有限公司
林州市大德建设工程有限公司
河南华文建设有限公司
河南凌焜建筑工程有限公司
河南安居建设有限公司
林州鲲鹏建筑工程有限公司
林州经纬建筑工程有限公司
林州祥川建筑工程有限公司
林州市鑫淼建筑劳务有限公司
河南祥泰路桥有限公司
景祥建设工程有限公司
河南省兴华建安工程有限公司
河南古森建筑劳务有限公司
平祥建设工程有限公司
河南大博建设工程有限公司
河南华普建设工程有限公司
河南东邦建设工程有限公司
卓冠建设工程有限公司
河南品瑞建筑工程有限公司
河南宝金建设工程有限公司
中城鑫邦建设有限公司
河南省鸿运建设工程有限公司
林州明奥建筑工程有限公司
河南金手指建设工程有限公司
林州市弘顺建筑劳务有限公司
林州市林海建筑劳务有限公司
河南艺兆市政工程有限公司
林州誉峰建筑工程有限公司
河南卓骏建筑工程有限公司
林州众成建筑工程有限公司
河南城通市政工程有限公司
林州市晋源建筑工程有限公司
河南飞越建筑工程有限公司
林州鑫泰建筑工程有限公司
林州市太行建筑劳务有限公司
河南筑丰建设发展有限公司
林州一帆建筑劳务有限公司
林州宏久建筑工程有限公司
林州市盛祥建筑劳务有限公司
河南黎润建设工程有限公司
林州市永安建筑劳务有限公司
河南省长江建设实业有限公司
河南腾润建设工程有限公司
河南国梁建设工程有限公司
河南诚聚建筑工程有限公司
河南德邦市政工程有限公司
河南安德建设工程有限公司
河南森川建筑工程有限公司
林州市顺通公路工程有限公司
河南领邦建筑工程有限公司
河南博兴建设工程有限公司
东泽消防工程有限公司
    '''
    # parse the literal into [{"name": ...}] skipping blank fragments
    list_enter = []
    for _p in re.split("\s", str_enter):
        if _p.strip() == "":
            continue
        list_enter.append({"name": _p.strip()})

    def _handle(item, result_queue, pool_ots):
        # mutate item in place: count documents won by item["name"]
        ots_client = pool_ots.getConnector()
        try:
            bool_query = BoolQuery(must_queries=[
                NestedQuery("sub_docs_json", TermQuery("sub_docs_json.win_tenderer", item["name"]))
                , RangeQuery("status", 201, 300, include_lower=True, include_upper=True)
                , RangeQuery("page_time", "2020-01-01")
            ])
            rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                SearchQuery(bool_query, limit=1, get_total_count=True),
                ColumnsToGet(['docid'], ColumnReturnType.SPECIFIED))
            item["total_count"] = total_count
            # bool_query = BoolQuery(must_queries=[
            #     NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item["name"]))
            #     ,RangeQuery("status",201,300,include_lower=True,include_upper=True)
            #     ,NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_bid_price",0,1000000,include_upper=True))
            #     ])
            #
            # rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
            #     SearchQuery(bool_query, limit=1, get_total_count=True),
            #     ColumnsToGet(['docid'], ColumnReturnType.SPECIFIED))
            # item["l_total_count"] = total_count
        except Exception as e:
            # best-effort: a failed lookup simply leaves total_count unset
            pass
        pool_ots.putConnector(ots_client)

    pool_ots = ConnectorPool(init_num=30, max_num=40, method_init=getConnect_ots)
    task_queue = queue.Queue()
    for item in list_enter:
        task_queue.put(item)
    mt = MultiThreadHandler(task_queue, _handle, None, 30, pool_ots=pool_ots)
    mt.run()
    # items were mutated in place by _handle; pivot them into a column dict
    df_data = {}
    for item in list_enter:
        for k, v in item.items():
            if k not in df_data:
                df_data[k] = []
            df_data[k].append(v)
    df = pd.DataFrame(df_data)
    df.to_excel("../data/%s.xls" % getCurrent_date("%Y-%m-%d_%H%M%S"))
  3404. from bs4 import BeautifulSoup
  3405. def downloadAttach(_url,_path):
  3406. try:
  3407. result = requests.get(_url,stream=True,timeout=20)
  3408. if result.status_code==200:
  3409. with open(_path,"wb") as f:
  3410. f.write(result.content)
  3411. else:
  3412. log("download failed with code %d of url:%s"%(result.status_code,_url))
  3413. except Exception:
  3414. log("download failed of url:%s"%(_url))
  3415. def extract_pageAttachments(_html):
  3416. fileSuffix = [".zip", ".rar", ".tar", ".7z", ".wim", ".docx", ".doc", ".xlsx", ".xls", ".pdf", ".txt", ".hnzf", ".bmp", ".jpg", ".jpeg", ".png", ".tif", ".swf"]
  3417. _soup = BeautifulSoup(_html,"lxml")
  3418. list_a = _soup.find_all("a")
  3419. list_img = _soup.find_all("img")
  3420. page_attachments = []
  3421. for _a in list_a:
  3422. _text =_a.get_text()
  3423. _url = _a.attrs.get("href","")
  3424. if _url.find("http://www.bidizhaobiao.com")>=0:
  3425. continue
  3426. is_attach = False
  3427. for suf in fileSuffix:
  3428. if _text.find(suf)>=0 or _url.find(suf)>=0:
  3429. is_attach = True
  3430. if is_attach:
  3431. page_attachments.append({"fileLink":_url,"fileTitle":_text})
  3432. for _a in list_img:
  3433. _text =_a.get_text()
  3434. _url = _a.attrs.get("src","")
  3435. if _url.find("http://www.bidizhaobiao.com")>=0:
  3436. continue
  3437. is_attach = False
  3438. for suf in fileSuffix:
  3439. if _text.find(suf)>=0 or _url.find(suf)>=0:
  3440. is_attach = True
  3441. if is_attach:
  3442. page_attachments.append({"fileLink":_url,"fileTitle":_text})
  3443. return page_attachments
def exportDocument_attachment():
    # Debug helper: fetch one known document (docid 165528701) and run the
    # page-attachment extractor over its stored html.  Results are discarded;
    # useful only for stepping through extract_pageAttachments.
    ots_client = getConnect_ots()
    bool_query = BoolQuery(must_queries=[TermQuery("docid", 165528701)])
    rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
        SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("docid")]), limit=100, get_total_count=True),
        columns_to_get=ColumnsToGet(["dochtmlcon"], ColumnReturnType.SPECIFIED))
    list_data = getRow_ots(rows)
    for _data in list_data:
        extract_pageAttachments(_data["dochtmlcon"])
  3453. def transUUid():
  3454. conn_oracle = getConnection_oracle()
  3455. cursor = conn_oracle.cursor()
  3456. tables = ['T_ZHAO_BIAO_GONG_GAO','T_ZHONG_BIAO_XIN_XI']
  3457. conn_mysql = getConnection_testmysql()
  3458. cursor_mysql = conn_mysql.cursor()
  3459. for _t in tables:
  3460. sql = " select id,page_time,'%s' from bxkc.%s where page_time>='%s' and page_time<='%s' order by page_time "%(_t,_t,"2021-06-01","2021-08-31")
  3461. print(sql)
  3462. cursor.execute(sql)
  3463. _count = 0
  3464. while(True):
  3465. insert_sql = "insert into fix_document(uuid,page_time,table_name) values"
  3466. rows = cursor.fetchmany(10000)
  3467. if not rows:
  3468. break
  3469. _count += len(rows)
  3470. print(_count)
  3471. for row in rows:
  3472. _uuid = row[0]
  3473. page_time = row[1]
  3474. table_name = row[2]
  3475. insert_sql += "('%s','%s','%s'),"%(_uuid,page_time,table_name)
  3476. insert_sql = insert_sql[:-1]
  3477. cursor_mysql.execute(insert_sql)
  3478. conn_mysql.commit()
def fix_document():
    # Push (uuid, page_time, table_name) rows from the mysql table
    # fix_document_final (page_time >= 2021-06-24) back into the oracle table
    # BXKC.fix_document_final, 1000 rows per round-trip.  Each batch is sent
    # as a single anonymous "begin ... end;" PL/SQL block of concatenated
    # INSERT statements and committed per batch.
    # NOTE(review): values are spliced into the SQL with %-formatting; a
    # quote inside uuid/table_name would break the statement - confirm the
    # source data can never contain quotes.
    conn_oracle = getConnection_oracle()
    cursor_oracle = conn_oracle.cursor()
    conn_mysql = getConnection_testmysql()
    cursor_mysql = conn_mysql.cursor()
    sql = "select uuid,page_time,table_name from fix_document_final where page_time>='2021-06-24' "
    cursor_mysql.execute(sql)
    _count = 0
    while True:
        rows = cursor_mysql.fetchmany(1000)
        if not rows:
            break
        _count += len(rows)
        print(_count)  # progress: total rows transferred so far
        insert_sql = ""
        for row in rows:
            _uuid = row[0]
            page_time = row[1]
            table_name = row[2]
            insert_sql += " insert into BXKC.fix_document_final(id,page_time,TABLENAME) values('%s','%s','%s');" % (_uuid, page_time, table_name)
        # wrap the batch in one PL/SQL block so it executes in a single call
        insert_sql = "begin %s end;" % (insert_sql)
        cursor_oracle.execute(insert_sql)
        conn_oracle.commit()
def exportDocument_forRecommen():
    # Export recommendation spot-check data.  The input csv has one row per
    # company with json_docid mapping recommendation-way -> comma-joined
    # docids; each docid group is looked up in the document index and the
    # matched rows are flattened into a timestamped excel file.
    filename = "../data/推荐 (1).csv"
    df = pd.read_csv(filename, encoding="GBK")
    ots_client = getConnect_ots()
    columns = ["province", "city", "page_time", "doctitle", "product"]
    current_date = getCurrent_date("%Y-%m-%d")
    adict_data = []
    _index = 0
    # only the first 10000 csv rows are processed
    for company, json_docid in zip(df["company"][:10000], df["json_docid"][:10000]):
        _index += 1
        _province = ""
        # province lookup disabled; _province stays "" and the province
        # filter below is commented out accordingly
        # consumed, return_row, next_token = ots_client.get_row("enterprise",[("name",company)],columns_to_get=["province"])
        # dict_k = getRow_ots_primary(return_row)
        # _province = dict_k.get("province","")
        print("序号:%d,%s,%s" % (_index, company, _province))
        dict_recommen = json.loads(json_docid)
        for str_way, str_docid in dict_recommen.items():
            should_q = []
            for _docid in str_docid.split(","):
                should_q.append(TermQuery("docid", _docid))
            bool_query = BoolQuery(must_queries=[
                # TermQuery("province",_province)
                # ,RangeQuery("page_time",timeAdd(current_date,-7),current_date,True,True)
                # ,
                BoolQuery(should_queries=should_q)]
            )
            rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
                SearchQuery(bool_query, get_total_count=True, limit=100),
                ColumnsToGet(columns, ColumnReturnType.SPECIFIED))
            adict_row = getRow_ots(rows)
            for dict_row in adict_row:
                dict_item = dict()
                set_dict_item(dict_item, "公司名称", company)
                set_dict_item(dict_item, "推荐路径", str_way)
                set_dict_item(dict_item, "公告id", dict_row.get("docid", ""))
                set_dict_item(dict_item, "省份", dict_row.get("province", ""))
                set_dict_item(dict_item, "城市", dict_row.get("city", ""))
                set_dict_item(dict_item, "page_time", dict_row.get("page_time", ""))
                set_dict_item(dict_item, "doctitle", dict_row.get("doctitle", ""))
                set_dict_item(dict_item, "product", dict_row.get("product", ""))
                adict_data.append(dict_item)
    # pivot the per-row dicts into a column dict ordered by list_df_columns
    # (module-level column order filled by set_dict_item)
    dict_data = {}
    for dict_item in adict_data:
        for k in list_df_columns:
            if k not in dict_data:
                dict_data[k] = []
            dict_data[k].append(dict_item.get(k, ""))
    df1 = pd.DataFrame(dict_data)
    df1.to_excel("../data/%s_推荐.xlsx" % getCurrent_date("%Y-%m-%d_%H%M%S"), columns=list_df_columns)
def exportDocument_by_days(page_time):
    """Export every document of one page_time day to excel.

    page_time: day string like "2021-03-31", used both in the OTS query and
    in the output filename.  Output columns are collected in first-seen order
    via set_dict_item_columns into df_columns.
    """
    dict_channel = getDict_docchannel()
    ots_client = getConnect_ots()
    # NOTE(review): df is loaded but the enterprise-filter loop that used it
    # is commented out below - the whole day is exported unfiltered
    filename = "供货贷含[建筑]企业名单.xlsx"
    df = pd.read_excel(filename)
    bool_query = BoolQuery(must_queries=[TermQuery("page_time", page_time),
                                         # RangeQuery("status",201,301),
                                         ])
    # columns = ["doctitle","docchannel","product","province","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","original_docchannel","detail_link"]
    columns = ["doctitle", "docchannel", "product", "bidway", "moneysource", "province", "city", "district", "page_time", "industry", "info_type", "tenderee", "project_code", "project_name", "sub_docs_json", "tenderee_contact", "tenderee_phone", "agency", "agency_contact", "agency_phone", "uuid", "time_bidclose", "web_source_no", "web_source_name", "original_docchannel", "detail_link", "page_attachments", "service_time"]
    dict_channel = getDict_docchannel()

    def hidePhone(phone):
        # mask all but the last 4 characters; currently not called anywhere
        # in this function
        if phone is None or phone == "":
            return ""
        return "*" * (len(phone) - 4) + phone[-4:]

    def getData(df_data, rows, set_line, list_keyword, set_columns, df_columns):
        # Flatten one page of OTS rows into df_data (column -> value list).
        # set_line/list_keyword are kept for the disabled dedup/keyword logic.
        list_data = getRow_ots(rows)
        for row in list_data:
            item = {}
            _dict = row
            set_dict_item_columns(set_columns, df_columns, item, "docid", _dict.get("docid", ""))
            set_dict_item_columns(set_columns, df_columns, item, "公告标题", _dict.get("doctitle", ""))
            # set_dict_item_columns(set_columns,df_columns,item,"公告内容",_dict.get("doctextcon",""))
            set_dict_item_columns(set_columns, df_columns, item, "公告类别", dict_channel.get(_dict.get("docchannel", ""), ""))
            # set_dict_item_columns(set_columns,df_columns,item,"关键词",",".join(list(set(re.findall("|".join(list_keyword),_dict.get("doctextcon",""))))))
            set_dict_item_columns(set_columns, df_columns, item, "产品", _dict.get("product", ""))
            set_dict_item_columns(set_columns, df_columns, item, "省份", _dict.get("province", ""))
            # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
            set_dict_item_columns(set_columns, df_columns, item, "资金来源", _dict.get("moneysource", ""))
            set_dict_item_columns(set_columns, df_columns, item, "招标方式", _dict.get("bidway", ""))
            set_dict_item_columns(set_columns, df_columns, item, "服务期限", _dict.get("service_time", ""))
            set_dict_item_columns(set_columns, df_columns, item, "城市", _dict.get("city", ""))
            set_dict_item_columns(set_columns, df_columns, item, "区县", _dict.get("district", ""))
            set_dict_item_columns(set_columns, df_columns, item, "发布时间", _dict.get("page_time", ""))
            # NOTE(review): "crtime" is not in the requested columns list, so
            # this export column is always empty - confirm intent
            set_dict_item_columns(set_columns, df_columns, item, "创建时间", _dict.get("crtime", ""))
            set_dict_item_columns(set_columns, df_columns, item, "行业一级分类", _dict.get("industry", ""))
            set_dict_item_columns(set_columns, df_columns, item, "行业二级分类", _dict.get("info_type", ""))
            # set_dict_item_columns(set_columns,df_columns,item,"uuid",_dict.get("uuid"))
            # set_dict_item_columns(set_columns,df_columns,item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
            set_dict_item_columns(set_columns, df_columns, item, "公告类别", dict_channel.get(_dict.get("docchannel", ""), ""))
            set_dict_item_columns(set_columns, df_columns, item, "原网公告类别", dict_channel.get(_dict.get("original_docchannel", ""), ""))
            set_dict_item_columns(set_columns, df_columns, item, "status", "正常" if _dict.get("status", 201) <= 300 else "去重")
            set_dict_item_columns(set_columns, df_columns, item, "detail_link", _dict.get("detail_link"))
            set_dict_item_columns(set_columns, df_columns, item, "web_source_no", _dict.get("web_source_no", ""))
            set_dict_item_columns(set_columns, df_columns, item, "web_source_name", _dict.get("web_source_name", ""))
            set_dict_item_columns(set_columns, df_columns, item, "项目名称", _dict.get("project_name", ""))
            set_dict_item_columns(set_columns, df_columns, item, "项目编号", _dict.get("project_code", ""))
            set_dict_item_columns(set_columns, df_columns, item, "招标单位", _dict.get("tenderee", ""))
            set_dict_item_columns(set_columns, df_columns, item, "招标联系人", _dict.get("tenderee_contact", ""))
            set_dict_item_columns(set_columns, df_columns, item, "招标联系人电话", _dict.get("tenderee_phone", ""))
            set_dict_item_columns(set_columns, df_columns, item, "代理单位", _dict.get("agency", ""))
            set_dict_item_columns(set_columns, df_columns, item, "代理联系人", _dict.get("agency_contact", ""))
            set_dict_item_columns(set_columns, df_columns, item, "代理联系人电话", _dict.get("agency_phone", ""))
            set_dict_item_columns(set_columns, df_columns, item, "url", "http://www.bidizhaobiao.com/info-%d.html" % (_dict.get("docid", "")))
            # encrypted deep link into the bidizhaobiao detail page
            set_dict_item_columns(set_columns, df_columns, item, "比地招标公告地址", "http://www.bidizhaobiao.com/excel_detail.do?code=%s" % (str(aesCipher.encrypt('{"docid":%d}' % _dict.get("docid")))))
            set_dict_item_columns(set_columns, df_columns, item, "截标时间", _dict.get("time_bidclose", ""))
            set_dict_item_columns(set_columns, df_columns, item, "page_attachments", _dict.get("page_attachments", "[]"))
            # sub_docs_json: per-package award details (winner, prices, ...)
            sub_docs_json = _dict.get("sub_docs_json")
            set_tenderer = set()
            if sub_docs_json is not None:
                docs = json.loads(sub_docs_json)
                # NOTE(review): sorting on win_bid_price assumes the values
                # are mutually comparable; a mix of "" and numbers would
                # raise TypeError here - confirm upstream guarantees
                docs.sort(key=lambda x: x.get("win_bid_price", 0))
                for _doc in docs:
                    if "win_tenderer" in _doc:
                        set_dict_item_columns(set_columns, df_columns, item, "中标单位", _doc["win_tenderer"])
                    if "second_tenderer" in _doc:
                        set_dict_item_columns(set_columns, df_columns, item, "第二候选单位", _doc["second_tenderer"])
                        set_tenderer.add(_doc.get("second_tenderer"))
                    if "third_tenderer" in _doc:
                        set_dict_item_columns(set_columns, df_columns, item, "第三候选单位", _doc["third_tenderer"])
                        set_tenderer.add(_doc.get("third_tenderer"))
                    if "win_tenderee_manager" in _doc:
                        set_dict_item_columns(set_columns, df_columns, item, "中标单位联系人", _doc["win_tenderee_manager"])
                    if "win_tenderee_phone" in _doc:
                        set_dict_item_columns(set_columns, df_columns, item, "中标单位联系电话", _doc["win_tenderee_phone"])
                    # only positive amounts are exported
                    if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"] == "" else _doc["win_bid_price"]) > 0:
                        set_dict_item_columns(set_columns, df_columns, item, "中标金额", _doc["win_bid_price"])
                    if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"] == "" else _doc["bidding_budget"]) > 0:
                        set_dict_item_columns(set_columns, df_columns, item, "招标金额", _doc["bidding_budget"])
            set_dict_item_columns(set_columns, df_columns, item, "入围供应商", ",".join(list(set_tenderer)))
            # fill defaults so every exported row has the same columns
            if "第二候选单位" not in item:
                set_dict_item_columns(set_columns, df_columns, item, "第二候选单位", "")
            if "第三候选单位" not in item:
                set_dict_item_columns(set_columns, df_columns, item, "第三候选单位", "")
            if "招标金额" not in item:
                set_dict_item_columns(set_columns, df_columns, item, "招标金额", "")
            if "中标金额" not in item:
                set_dict_item_columns(set_columns, df_columns, item, "中标金额", "")
            if "中标单位" not in item:
                set_dict_item_columns(set_columns, df_columns, item, "中标单位", "")
            if "中标单位联系人" not in item:
                set_dict_item_columns(set_columns, df_columns, item, "中标单位联系人", "")
            if "中标单位联系电话" not in item:
                set_dict_item_columns(set_columns, df_columns, item, "中标单位联系电话", "")
            # if item["中标单位"] not in set_enter:
            #     continue
            _line = "%s-%s-%s-%s-%s-%s" % (item["省份"], item["城市"], item["项目编号"], item["招标单位"], item["招标联系人"], str(item["招标金额"]))
            # dedup via set_line is currently disabled:
            # if _line in set_line:
            #     continue
            # if item["招标金额"]=="":
            #     continue
            # set_line.add(_line)
            for k, v in item.items():
                if k not in df_data:
                    df_data[k] = []
                df_data[k].append(v)

    df_data = {}
    set_columns = set()
    df_columns = []
    # --- disabled variant: export only documents won by enterprises listed
    # --- in the excel sheet loaded above
    # for name in df["ent_name_real"]:
    #     if isinstance(name,str) and name!="":
    #         list_should_q = []
    #         # list_should_q.append(MatchPhraseQuery("doctextcon",name))
    #         # list_should_q.append(MatchPhraseQuery("attachmenttextcon",name))
    #         NestedQuery("sub_docs_json","sub_docs_json.win_tenderer",name)
    #         bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2018-01-01"),
    #                                              RangeQuery("status",201,301),
    #                                              # BoolQuery(should_queries=list_should_q),
    #                                              NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",name))
    #                                              ])
    #
    #         rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
    #                                                                        SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True),
    #                                                                        ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
    #
    #         while True:
    #             getData(df_data,rows,set(),"",set_columns,df_columns)
    #             if not next_token:
    #                 break
    #             rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
    #                                                                            SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
    #                                                                            ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
    #         if len(df_data.keys())>0:
    #             print(len(df_data[list(df_data.keys())[0]]),total_count)
    # paginate through the whole day with next_token
    rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
        SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("docid", SortOrder.ASC)]), limit=100, get_total_count=True),
        ColumnsToGet(columns, ColumnReturnType.SPECIFIED))
    while True:
        getData(df_data, rows, set(), "", set_columns, df_columns)
        if not next_token:
            break
        rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
            SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
            ColumnsToGet(columns, ColumnReturnType.SPECIFIED))
        if len(df_data.keys()) > 0:
            # progress: rows collected so far vs total hits
            print(len(df_data[list(df_data.keys())[0]]), total_count)
    # appendAttachmentPath(df_data,"page_attachments","附件链接")
    # df_columns.append("附件链接")
    # df_columns = ["docid","公告标题","公告类别","发布时间","公告内容","省份","城市","项目编号","招标单位","招标金额","资金来源","招标方式","代理单位","中标单位","中标金额","第二候选单位","第三候选单位","url","附件链接"]
    df = pd.DataFrame(df_data)
    df.to_excel("../data/%s_%s.xlsx" % (getCurrent_date("%Y-%m-%d_%H%M%S"), page_time), columns=df_columns)
def appendAttachmentPath(df_data,key,new_key):
    # Resolve attachment download URLs for every row of an export table.
    #
    # df_data: dict of column-name -> list of cell values; mutated in place by
    #          adding a new column ``new_key``.
    # key:     existing column whose cells are JSON strings like
    #          ``[{"fileMd5": "..."}, ...]`` — assumed format, TODO confirm.
    # new_key: name of the new column; each cell becomes a JSON list of signed
    #          OSS GET URLs for the row's attachments.
    list_data = []
    for _attach in df_data[key]:
        _dict = {key:_attach}
        list_data.append(_dict)
    task_queue = Queue()
    for _d in list_data:
        task_queue.put(_d)
    # NOTE(review): hardcoded OSS access key/secret — should live in config or a
    # secrets store, not in source control.
    auth = oss2.Auth("LTAI5tFuoxHm8Uxrr5nT8wTZ", "Yp01bylJFx0al6teCaccY8hbtllBGg")
    bucket_url = "http://oss-cn-hangzhou.aliyuncs.com"
    attachment_bucket_name = "attachment-hub"
    bucket = oss2.Bucket(auth,bucket_url,attachment_bucket_name)
    ots_client = getConnect_ots()
    def search(ots_client,table_name,key_tuple,columns_to_get):
        # Point-read one row by primary key; returns the row as a dict, or None
        # if the row is missing or the read fails.
        try:
            # Query via get_row; the trailing argument 1 requests only a single
            # version of each column.
            consumed, return_row, next_token = ots_client.get_row(table_name, key_tuple, columns_to_get, None, 1)
            if return_row is not None:
                _dict = getRow_ots_primary(return_row)
                return _dict
            return None
        # Client-side error: usually bad parameters or a network problem.
        except OTSClientError as e:
            traceback.print_exc()
            log("get row failed, http_status:%d, error_message:%s" % (e.get_http_status(), e.get_error_message()))
        # Server-side error: usually bad parameters or throttling.
        except OTSServiceError as e:
            traceback.print_exc()
            log("get row failed, http_status:%d, error_code:%s, error_message:%s, request_id:%s" % (e.get_http_status(), e.get_error_code(), e.get_error_message(), e.get_request_id()))
    def _handle(item,result_queue):
        # Worker: look up each attachment's OSS path by its md5 in the
        # "attachment" table and collect signed URLs (86500*5 s ≈ 5 days).
        page_attachments = json.loads(item.get(key,"[]"))
        list_url = []
        for _a in page_attachments:
            fileMd5 = _a.get("fileMd5")
            print("==",fileMd5)
            _s_dict = search(ots_client,"attachment",[("filemd5",fileMd5)],["path"])
            if _s_dict is not None:
                _path = _s_dict.get("path")
                if _path is not None:
                    _url = bucket.sign_url("GET",_path,86500*5)
                    list_url.append(_url)
        item[new_key] = json.dumps(list_url)
    mt = MultiThreadHandler(task_queue,_handle,None,30)
    mt.run()
    df_data[new_key] = []
    for _d in list_data:
        df_data[new_key].append(_d.get(new_key))
  3750. def export_competition():
  3751. file = "select___from_province_indus_entity_top1.xlsx"
  3752. df1 = pd.read_excel(file)
  3753. ots_client = getConnect_ots()
  3754. task_queue = queue.Queue()
  3755. list_entity = []
  3756. for province,industry,entitys in zip(df1["province"],df1["industry"],df1["entitys"]):
  3757. l_e = json.loads(entitys)
  3758. for l in l_e:
  3759. list_entity.append({"province":province,
  3760. "industry":industry,
  3761. "win_tenderer":l.get("win_tenderee","")})
  3762. for item in list_entity:
  3763. task_queue.put(item)
  3764. def _handle(item,result_queue):
  3765. def getData(rows,_set):
  3766. dict_rows = getRow_ots(rows)
  3767. for _dict in dict_rows:
  3768. sub_docs_json = _dict.get("sub_docs_json")
  3769. if sub_docs_json is not None:
  3770. for sub_docs in json.loads(sub_docs_json):
  3771. if sub_docs.get("win_tenderer") is not None:
  3772. _set.add(sub_docs.get("win_tenderer"))
  3773. if sub_docs.get("second_tenderer") is not None:
  3774. _set.add(sub_docs.get("second_tenderer"))
  3775. if sub_docs.get("third_tenderer") is not None:
  3776. _set.add(sub_docs.get("third_tenderer"))
  3777. columns = ["sub_docs_json"]
  3778. _company = item.get("win_tenderer")
  3779. should_q = BoolQuery(should_queries=[NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_company)),
  3780. NestedQuery("sub_docs_json",TermQuery("sub_docs_json.second_tenderer",_company)),
  3781. NestedQuery("sub_docs_json",TermQuery("sub_docs_json.third_tenderer",_company))])
  3782. bool_query = BoolQuery(must_queries=[
  3783. # should_q,
  3784. MatchPhraseQuery("doctextcon",_company),
  3785. RangeQuery("docchannel",101)])
  3786. _set = set()
  3787. rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
  3788. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True),
  3789. ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
  3790. getData(rows,_set)
  3791. _count = 0
  3792. _page = 0
  3793. while next_token:
  3794. rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
  3795. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True),
  3796. ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
  3797. getData(rows,_set)
  3798. _count += 1
  3799. _page += 1
  3800. if len(_set)>20 or _page>20:
  3801. break
  3802. if item["win_tenderer"] in _set:
  3803. _set.remove(item["win_tenderer"])
  3804. item["competition"] = ",".join(list(_set))
  3805. mt = MultiThreadHandler(task_queue,_handle,None,30)
  3806. mt.run()
  3807. df_data = {}
  3808. keys = ["province","industry","win_tenderer","competition"]
  3809. for key in keys:
  3810. if key not in df_data:
  3811. df_data[key] = []
  3812. for item in list_entity:
  3813. for key in keys:
  3814. df_data[key].append(item.get(key))
  3815. df2 = pd.DataFrame(df_data)
  3816. df2.to_excel("competition.xlsx",columns=keys)
  3817. def document_dumplicate():
  3818. df = pd.read_excel("../data/2022-01-19_214329_export11.xlsx")
  3819. print(df.keys())
def export_document_no_price():
    # Join per-year "documents without price" statistics (from a CSV) with the
    # web-source name table in Oracle, producing one row per web_source_no with
    # 2020 stats (counts_no_price/counts_all/rate) and 2021 stats (…1 columns),
    # written to websource_no_price1.xlsx.
    df = pd.read_csv("select___from_document_no_price_tmp.csv",encoding="gbk")
    conn_oracle = getConnection_oracle()
    cursor = conn_oracle.cursor()
    sql = " select source_encode,source_name from bxkc.T_WEBSOURCENUM_INFO"
    cursor.execute(sql)
    dict_source = {}
    # Map source_encode -> source_name; also index by the prefix before "(" so
    # encodes carrying a parenthesised suffix still resolve.
    while 1:
        rows = cursor.fetchmany(10000)
        if not rows:
            break
        for row in rows:
            dict_source[row[0]] = row[1]
            dict_source[row[0].split("(")[0]] = row[1]
    list_name = []
    set_web_source = set()
    for web_source_no in df["web_source_no"]:
        set_web_source.add(web_source_no)
        list_name.append(dict_source.get(web_source_no,""))
    # Key the CSV stats by "<web_source_no>&<year>" for per-year lookup below.
    dict_source_year = {}
    for web_source_no,year,counts_no_price,counts_all,rate in zip(df["web_source_no"],df["year"],df["counts_no_price"],df["counts_all"],df["rate"]):
        dict_source_year["%s&%s"%(web_source_no,year)] = {"counts_no_price":counts_no_price,"counts_all":counts_all,"rate":rate}
    new_data = {"web_source_no":[],
                "web_source_name":[],
                "counts_no_price":[],
                "counts_all":[],
                "rate":[],
                "counts_no_price1":[],
                "counts_all1":[],
                "rate1":[]}
    for web_source_no in list(set_web_source):
        new_data["web_source_no"].append(web_source_no)
        new_data["web_source_name"].append(dict_source.get(web_source_no,""))
        # Unsuffixed columns = 2020, "...1" columns = 2021.
        d_2020 = dict_source_year.get("%s&%s"%(web_source_no,"2020"),{})
        d_2021 = dict_source_year.get("%s&%s"%(web_source_no,"2021"),{})
        new_data["counts_no_price"].append(d_2020.get("counts_no_price"))
        new_data["counts_all"].append(d_2020.get("counts_all"))
        new_data["rate"].append(d_2020.get("rate"))
        new_data["counts_no_price1"].append(d_2021.get("counts_no_price"))
        new_data["counts_all1"].append( d_2021.get("counts_all"))
        new_data["rate1"].append(d_2021.get("rate"))
    # new_data = {"year":df["year"],
    #             "web_source_no":df["web_source_no"],
    #             "web_source_name":list_name,
    #             "counts_no_price":df["counts_no_price"],
    #             "counts_all":df["counts_all"],
    #             "rate":df["rate"]}
    df2 = pd.DataFrame(new_data)
    df2.to_excel("websource_no_price1.xlsx",columns=["web_source_no","web_source_name","counts_no_price","counts_all","rate","counts_no_price1","counts_all1","rate1"])
  3869. def exportDetailLink():
  3870. df = pd.read_excel("招投标数据测试反馈表3.xlsx")
  3871. list_item = []
  3872. for docid in df["docid"]:
  3873. list_item.append({"docid":docid})
  3874. task_queue = queue.Queue()
  3875. for item in list_item:
  3876. task_queue.put(item)
  3877. def _handle(item,result_queue,ots_client,pool_oracle):
  3878. try:
  3879. conn = pool_oracle.getConnector()
  3880. docid = int(item["docid"])
  3881. partitionkey = int(docid%500+1)
  3882. consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",partitionkey),("docid",int(docid))],["original_docchannel","detail_link","uuid"])
  3883. _dict = getRow_ots_primary(return_row)
  3884. if _dict.get("detail_link") is not None and len(_dict.get("detail_link"))>0:
  3885. item["detail_link"] = _dict.get("detail_link")
  3886. else:
  3887. original_docchannel = _dict.get("original_docchannel")
  3888. _uuid = _dict.get("uuid")
  3889. d_tablename = {"51":"T_GONG_GAO_BIAN_GENG",
  3890. "52":"T_ZHAO_BIAO_GONG_GAO",
  3891. "101":"T_ZHONG_BIAO_XIN_XI",
  3892. "102":"T_ZHAO_BIAO_YU_GAO",
  3893. "103":"T_ZHAO_BIAO_DA_YI",
  3894. "104":"T_ZHAO_BIAO_WEN_JIAN",
  3895. "114":"T_CAI_GOU_YI_XIANG"
  3896. }
  3897. _tablename = d_tablename.get(str(original_docchannel))
  3898. if _tablename is not None:
  3899. cursor = conn.cursor()
  3900. sql = "select detail_link from bxkc.%s where id='%s'"%(_tablename,_uuid)
  3901. print(sql)
  3902. cursor.execute(sql)
  3903. rows = cursor.fetchall()
  3904. if len(rows)>0:
  3905. item["detail_link"] = rows[0][0]
  3906. cursor.close()
  3907. except Exception as e:
  3908. traceback.print_exc()
  3909. finally:
  3910. pool_oracle.putConnector(conn)
  3911. ots_client = getConnect_ots()
  3912. pool_oracle = ConnectorPool(10,30,getConnection_oracle)
  3913. mt = MultiThreadHandler(task_queue,_handle,None,30,ots_client=ots_client,pool_oracle=pool_oracle)
  3914. mt.run()
  3915. df_data = {"docid":[],
  3916. "detail_link":[]}
  3917. for item in list_item:
  3918. for k,v in df_data.items():
  3919. v.append(item.get(k,""))
  3920. df2 = pd.DataFrame(df_data)
  3921. df2.to_excel("222.xlsx")
def process_doc():
    # Flag rows of sheet 0 whose docid also appears in sheet 1's
    # "process_docid" column: a new "check" column is written with "1" when
    # the docid was processed, else "0", and the result saved to a new xlsx.
    df = pd.read_excel("../data/2022-03-16_154617_数据导出.xlsx",1)
    list_check = []
    set_process_docid = set()
    for docid in df["process_docid"]:
        set_process_docid.add(docid)
    df = pd.read_excel("../data/2022-03-16_154617_数据导出.xlsx",0)
    for docid in df["docid"]:
        if docid in set_process_docid:
            list_check.append("1")
        else:
            list_check.append("0")
    df["check"] = list_check
    df.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')))
def export_extract2():
    # Export rows from document_extract2 (status 1..1000) whose extract_json
    # CONTAINS the substring "false" — i.e. only records with a falsy flag in
    # the extraction result are kept — into a timestamped xlsx.
    ots_client = getConnect_ots()
    df_keys = ["docid","extract_json","status"]
    df_data = {}
    for _key in df_keys:
        df_data[_key] = []
    bool_query = BoolQuery(must_queries=[
        RangeQuery("status",1,1000,True,True)])
    rows, next_token, total_count, is_all_succeed = ots_client.search("document_extract2", "document_extract2_index",
                                                                      SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
                                                                      ColumnsToGet(df_keys,return_type=ColumnReturnType.SPECIFIED))
    list_dict = getRow_ots((rows))
    for _dict in list_dict:
        # Skip records whose extract_json does not mention "false".
        if re.search("false",_dict.get("extract_json","")) is None:
            continue
        for k in df_keys:
            df_data[k].append(_dict.get(k))
    _count = len(list_dict)
    # Page through the remaining results with next_token.
    while next_token:
        rows, next_token, total_count, is_all_succeed = ots_client.search("document_extract2", "document_extract2_index",
                                                                          SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
                                                                          ColumnsToGet(df_keys,return_type=ColumnReturnType.SPECIFIED))
        list_dict = getRow_ots((rows))
        for _dict in list_dict:
            if re.search("false",_dict.get("extract_json","")) is None:
                continue
            for k in df_keys:
                df_data[k].append(_dict.get(k))
        _count += len(list_dict)
        print("%d/%d"%(_count,total_count))
    df = pd.DataFrame(df_data)
    df.to_excel("../data/%s_extract2.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")))
def export_by_file():
    # Filter an exported CSV down to infrastructure-related rows: keep rows
    # whose "产品" (product) matches the road/bridge/landscaping/... pattern.
    # Also counts distinct winners ("中标单位") and shortlisted suppliers
    # ("入围供应商"). The actual export at the bottom is commented out; only
    # the counts are printed.
    df = pd.read_csv("../data/2022-04-01_121315_数据导出.csv",encoding="gbk")
    keys = df.keys()
    df_data = {}
    set_win = set()
    set_ruwei = set()
    for k in keys:
        df_data[k] = []
    for _i in range(len(df["产品"])):
        product = df["产品"][_i]
        # Non-string cells (NaN from pandas) are skipped entirely.
        if product is None or not isinstance(product,str):
            continue
        print(product)
        win_tenderer = df["中标单位"][_i]
        if win_tenderer is not None and isinstance(win_tenderer,str):
            set_win.add(win_tenderer)
            set_ruwei.add(win_tenderer)
        ruwei = df["入围供应商"][_i]
        if ruwei is not None and isinstance(ruwei,str):
            l_s = ruwei.split(",")
            for _s in l_s:
                set_ruwei.add(_s)
        if re.search("公路|道路|路基|路面|快速通道|高速|隧道|飞机跑道|桥梁|养护|路段|市政|照明工程|照明设施|亮灯|灯光改造|灯光工程|管道|架线|园林|景观|绿化|排水|河道整治|环境治理|交通|地铁|跌路|高铁|桥梁|大桥|桥段",product) is not None:
            for k in keys:
                df_data[k].append(df[k][_i])
    print("win count:%d ruwei:%d"%(len(set_win),len(set_ruwei)))
    # df1 = pd.DataFrame(df_data)
    # df1.to_excel("../data/%s_文件导出.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")),columns=keys)
  3996. def export_dump():
  3997. import pandas as pd
  3998. df = pd.read_excel("NotIn家具中标 去除注销企业 31410(3)(1)(1).xlsx",sheetname=0)
  3999. _set_number = set()
  4000. _set_number |= set(df["号码"])
  4001. print(len(_set_number))
  4002. df = pd.read_excel("NotIn家具中标 去除注销企业 31410(3)(1)(1).xlsx",sheetname=1)
  4003. _set_number |= set(df["号码"])
  4004. print(len(_set_number))
  4005. df = pd.read_excel("NotIn家具中标 去除注销企业 31410(3)(1)(1).xlsx",sheetname=2)
  4006. keys = df.keys()
  4007. df_data = {}
  4008. for k in keys:
  4009. df_data[k] = []
  4010. for _i in range(len(df[keys[0]])):
  4011. if df["号码"][_i] not in _set_number:
  4012. for k in keys:
  4013. df_data[k].append(df[k][_i])
  4014. _set_number.add(df["号码"][_i])
  4015. df2 = pd.DataFrame(df_data)
  4016. df2.to_excel("tmp222.xlsx")
def check_data_synchronization():
    # Verify that every uuid mentioned in a sync log ("ID='<uuid>'" lines)
    # actually exists in the OTS document index; writes uuid + match count to
    # check.xlsx.
    filepath = "C:\\Users\\Administrator\\Desktop\\to_check.log"
    list_uuid = []
    # Capture the uuid between the quotes of ID='...'
    _regrex = "ID='(?P<uuid>.+)'"
    with open(filepath,"r",encoding="utf8") as f:
        while 1:
            _line = f.readline()
            if not _line:
                break
            _match = re.search(_regrex,_line)
            if _match is not None:
                _uuid = _match.groupdict().get("uuid")
                if _uuid is not None:
                    list_uuid.append(_uuid)
    print(len(list_uuid))
    task_queue = Queue()
    list_data = []
    for _uuid in list_uuid:
        _dict = {"uuid":_uuid}
        list_data.append(_dict)
        task_queue.put(_dict)
    ots_client = getConnect_ots()
    def _handle(_item,result_queue):
        # Worker: count how many documents carry this uuid (0 = not synced).
        bool_query = BoolQuery(must_queries=[TermQuery("uuid",_item.get("uuid"))])
        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                       SearchQuery(bool_query,get_total_count=True),
                                                                       columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
        _item["exists"] = total_count
    mt = MultiThreadHandler(task_queue,_handle,None,30)
    mt.run()
    df_data = {"uuid":[],
               "exists":[]}
    for _data in list_data:
        for k,v in df_data.items():
            v.append(_data.get(k))
    import pandas as pd
    df2 = pd.DataFrame(df_data)
    df2.to_excel("check.xlsx")
def group_xlsx():
    # Keep only documents whose keyword hits cover BOTH groups: at least one
    # vendor name (sheet 0 "中国厂商" + sheet 1 "国际厂商") AND at least one
    # non-vendor keyword. The keyword cell of each kept row is rewritten as
    # "<vendor>+<other>".
    filename = "厂商&赛道列表.xlsx"
    df0 = pd.read_excel(filename,0)
    df1 = pd.read_excel(filename,1)
    df2 = pd.read_excel(filename,2)
    set_1 = set(df0["中国厂商"]) | set(df1["国际厂商"])
    # NOTE(review): set_2 (track keywords from sheet 2) is built but never
    # used below — the "other" bucket is simply "not a vendor".
    set_2 = set(df2["a"]) | set(df2["b"]) | set(df2["c"])
    filename = "../data/2022-05-24_185801_数据导出.xlsx"
    df = pd.read_excel(filename)
    dict_docid = {}
    # Bucket each docid's keywords: index 0 = vendor hits, index 1 = the rest.
    for docid,keyword in zip(df["docid"],df["关键词"]):
        if docid not in dict_docid:
            dict_docid[docid] = [[],[]]
        if keyword in set_1:
            dict_docid[docid][0].append(keyword)
        else:
            dict_docid[docid][1].append(keyword)
    set_docid = set()
    for k,v in dict_docid.items():
        if len(v[0])>=1 and len(v[1])>=1:
            set_docid.add(k)
    keys = df.keys()
    print(keys)
    df_data = {}
    for i in range(len(df["docid"])):
        print(i)
        docid = df["docid"][i]
        if docid in set_docid:
            for k in keys:
                if k not in df_data:
                    df_data[k] = []
                df_data[k].append(df[k][i])
            # Overwrite the just-appended keyword cell with the combined
            # "first vendor + first other keyword" pair for this docid.
            df_data["关键词"][-1] = str(dict_docid[docid][0][0])+"+"+str(dict_docid[docid][1][0])
    df1 = pd.DataFrame(df_data)
    df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=keys)
def static_process_time():
    # Measure processing latency (opertime - crtime, in seconds) for documents
    # of 2022-05-26, reporting overall / with-attachment / without-attachment
    # averages plus max and min.
    ots_client = getConnect_ots()
    bool_query = BoolQuery(must_queries=[
        RangeQuery("crtime","2022-05-26","2022-05-27"),
        TermQuery("page_time","2022-05-26")
    ])
    rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),get_total_count=True,limit=100),
                                                                   ColumnsToGet(column_names=["crtime","opertime","publishtime","page_attachments"],return_type=ColumnReturnType.SPECIFIED))
    list_data = []
    _l = getRow_ots(rows)
    list_data.extend(_l)
    while next_token:
        rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
                                                                       SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
                                                                       ColumnsToGet(column_names=["crtime","opertime","publishtime","page_attachments"],return_type=ColumnReturnType.SPECIFIED))
        _l = getRow_ots(rows)
        list_data.extend(_l)
        print("%d/%d"%(len(list_data),total_count))
    list_dis = []      # all latencies
    list_dis_a = []    # latencies for documents with attachments
    list_dis_n_a = []  # latencies for documents without attachments
    for _data in list_data:
        crtime = _data.get("crtime")
        opertime = _data.get("opertime")
        page_attachments = _data.get("page_attachments","[]")
        # Seconds between creation and last operation.
        _d = time.mktime(time.strptime(opertime,"%Y-%m-%d %H:%M:%S"))-time.mktime(time.strptime(crtime,"%Y-%m-%d %H:%M:%S"))
        list_dis.append(_d)
        if page_attachments=="[]":
            list_dis_n_a.append(_d)
        else:
            list_dis_a.append(_d)
    # NOTE(review): each print below raises ZeroDivisionError/ValueError when
    # its list is empty (e.g. no documents matched) — acceptable for a one-off
    # stats script, but worth guarding if reused.
    print("avg_time:",sum(list_dis)/len(list_dis),max(list_dis),min(list_dis))
    print("avg_time:",sum(list_dis_a)/len(list_dis_a),max(list_dis_a),min(list_dis_a))
    print("avg_time:",sum(list_dis_n_a)/len(list_dis_n_a),max(list_dis_n_a),min(list_dis_n_a))
  4125. def export_dump_by_id():
  4126. filename = "遗漏待验证1.csv"
  4127. df = pd.read_csv(filename)
  4128. list_k = []
  4129. ots_client = getConnect_ots()
  4130. for _main_url,_other_url in zip(df["_c0"],df["_c1"]):
  4131. _d = {}
  4132. main_docid = re.split("[-.]",_main_url)[3]
  4133. l_other = []
  4134. for _l in _other_url.split(","):
  4135. _docid = re.split("[-.]",_l)[3]
  4136. l_other.append(_docid)
  4137. _d["main_docid"] = main_docid
  4138. _d["other_docid"] = l_other
  4139. list_k.append(_d)
  4140. task_queue = Queue()
  4141. for _q in list_k:
  4142. task_queue.put(_q)
  4143. def _handle(item,result_queue):
  4144. columns = ["doctitle","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","service_time","page_attachments"]
  4145. main_docid = item["main_docid"]
  4146. other_docid = item["other_docid"]
  4147. list_should_q = []
  4148. list_should_q.append(TermQuery("docid",main_docid))
  4149. for _d in other_docid:
  4150. list_should_q.append(TermQuery("docid",_d))
  4151. _query = BoolQuery(should_queries=list_should_q)
  4152. l_rows = []
  4153. rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
  4154. SearchQuery(_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True),
  4155. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  4156. dict_row = getRow_ots(rows)
  4157. l_rows.extend(dict_row)
  4158. log("total count:%d"%total_count)
  4159. _count = len(dict_row)
  4160. while next_token:
  4161. rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
  4162. SearchQuery(_query,next_token=next_token,limit=100,get_total_count=True),
  4163. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  4164. dict_row = getRow_ots(rows)
  4165. l_rows.extend(dict_row)
  4166. _count += len(dict_row)
  4167. item["data"] = l_rows
  4168. mt = MultiThreadHandler(task_queue,_handle,None,30)
  4169. mt.run()
  4170. df_data = {"main_docid":[]}
  4171. set_line = set()
  4172. dict_channel = getDict_docchannel()
  4173. for _d in list_k:
  4174. list_row = _d.get("data")
  4175. if list_row is not None:
  4176. main_docid = _d.get("main_docid")
  4177. getRowData(df_data,list_row,set_line,[],dict_channel,True)
  4178. for _ in list_row:
  4179. df_data["main_docid"].append(main_docid)
  4180. df1 = pd.DataFrame(df_data)
  4181. list_df_columns1 = ["main_docid"].extend(list_df_columns)
  4182. df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns1)
def count_product():
    # Filter an exported sheet down to IT-related rows: a row is kept when its
    # "产品" (product) cell matches the base IT keyword pattern, extended by
    # every product marked need==1 in 222.xlsx. Result written to 333.xlsx.
    filename = "../data/2022-06-24_152201_数据导出.xlsx"
    df = pd.read_excel(filename)
    # _product = df["产品"]
    # dict_p_c = {}
    # for _p in _product:
    #     if isinstance(_p,str) and _p!="":
    #         l_p = _p.split(",")
    #         for _p1 in l_p:
    #             if _p1 not in dict_p_c:
    #                 dict_p_c[_p1] = 0
    #             dict_p_c[_p1] += 1
    # df_data = {"产品":[],
    #            "次数":[]}
    # for k,v in dict_p_c.items():
    #     df_data["产品"].append(k)
    #     df_data["次数"].append(v)
    # df1 = pd.DataFrame(df_data)
    # df1.to_excel("222.xlsx")
    keys = df.keys()
    df_data = {}
    for k in keys:
        df_data[k] = []
    product_pattern = "电脑|台式机|电脑|主机|网络|软件|开发|通信|系统|信息技术"
    df1 = pd.read_excel("222.xlsx")
    list_p = []
    # Products manually reviewed (need==1) extend the match pattern.
    for _p,_n in zip(df1["产品"],df1["need"]):
        if _n==1:
            list_p.append(_p)
    product_pattern = product_pattern+"|"+"|".join(list_p)
    _product = df["产品"]
    for _i in range(len(_product)):
        if re.search(product_pattern,str(_product[_i])) is not None:
            for k in keys:
                df_data[k].append(df[k][_i])
    df2 = pd.DataFrame(df_data)
    df2.to_excel("333.xlsx",columns=keys)
  4220. from dataSource.source import getConnect_capacity
  4221. def exportHonors_item_info():
  4222. ots_capacity = getConnect_capacity()
  4223. bool_query = BoolQuery(should_queries=[
  4224. # TermQuery("ryjx","海河杯"),
  4225. WildcardQuery("hjdw","*合肥建工集团有限公司*")
  4226. ])
  4227. rows,next_token,total_count,is_all_succeed = ots_capacity.search("honors_item_info","honors_item_info_index",
  4228. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("create_time")]),get_total_count=True,limit=100),
  4229. columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
  4230. list_data = getRow_ots(rows)
  4231. while next_token:
  4232. rows,next_token,total_count,is_all_succeed = ots_capacity.search("honors_item_info","honors_item_info_index",
  4233. SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
  4234. columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
  4235. list_data.extend(getRow_ots(rows))
  4236. df_data = {}
  4237. set_columns1 = set()
  4238. list_df_columns1 = []
  4239. for _data in list_data:
  4240. _dict = {}
  4241. set_dict_item_columns(set_columns1,list_df_columns1,_dict,"record_id",_data.get("record_id"))
  4242. set_dict_item_columns(set_columns1,list_df_columns1,_dict,"bfdw",_data.get("bfdw"))
  4243. set_dict_item_columns(set_columns1,list_df_columns1,_dict,"create_time",_data.get("create_time"))
  4244. set_dict_item_columns(set_columns1,list_df_columns1,_dict,"cs",_data.get("cs"))
  4245. set_dict_item_columns(set_columns1,list_df_columns1,_dict,"detail_link",_data.get("detail_link"))
  4246. set_dict_item_columns(set_columns1,list_df_columns1,_dict,"fbsj",_data.get("fbsj"))
  4247. set_dict_item_columns(set_columns1,list_df_columns1,_dict,"hjdw",_data.get("hjdw"))
  4248. set_dict_item_columns(set_columns1,list_df_columns1,_dict,"hjdwjs",_data.get("hjdwjs"))
  4249. set_dict_item_columns(set_columns1,list_df_columns1,_dict,"hjxm",_data.get("hjxm"))
  4250. set_dict_item_columns(set_columns1,list_df_columns1,_dict,"jxjb",_data.get("jxjb"))
  4251. set_dict_item_columns(set_columns1,list_df_columns1,_dict,"jxlx",_data.get("jxlx"))
  4252. set_dict_item_columns(set_columns1,list_df_columns1,_dict,"ryjx",_data.get("ryjx"))
  4253. set_dict_item_columns(set_columns1,list_df_columns1,_dict,"sf",_data.get("sf"))
  4254. set_dict_item_columns(set_columns1,list_df_columns1,_dict,"xmfzr",_data.get("xmfzr"))
  4255. set_dict_item_columns(set_columns1,list_df_columns1,_dict,"zgdw",_data.get("zgdw"))
  4256. set_dict_item_columns(set_columns1,list_df_columns1,_dict,"zxj",_data.get("zxj"))
  4257. for k,v in _dict.items():
  4258. if k not in df_data:
  4259. df_data[k] = []
  4260. df_data[k].append(v)
  4261. df = pd.DataFrame(df_data)
  4262. df.to_excel("honor_export.xlsx",columns=list_df_columns1)
def check_dump_data():
    # Compare docids between the production "document" table (status 201..301)
    # and the staging "document_tmp" table (status 81..100, save=1) for
    # announcements crawled overnight 2022-09-07 20:00 to 2022-09-08 06:00,
    # printing the docids present on only one side.
    ots_client = getConnect_ots()
    bool_query = BoolQuery(must_queries=[
        generateBoolShouldQuery(["docchannel"],[52,102,114],TermQuery),
        RangeQuery("crtime","2022-09-07 20:00:00","2022-09-08 06:00:00"),
        RangeQuery("page_time","2022-09-07","2022-09-08"),
        RangeQuery("status",201,301)
    ])
    list_data = getDocument([{"query":bool_query}],["docid"],table_name="document",table_index="document_index")
    bool_query1 = BoolQuery(must_queries=[
        generateBoolShouldQuery(["docchannel"],[52,102,114],TermQuery),
        RangeQuery("crtime","2022-09-07 20:00:00","2022-09-08 06:00:00"),
        RangeQuery("page_time","2022-09-07","2022-09-08"),
        RangeQuery("status",81,100),
        TermQuery("save",1)
    ])
    list_data1 = getDocument([{"query":bool_query1}],["docid"],table_name="document_tmp",table_index="document_tmp_index")
    set_docid = set()
    set_docid_tmp = set()
    for _data in list_data:
        set_docid.add(_data.get("docid"))
    for _data in list_data1:
        set_docid_tmp.add(_data.get("docid"))
    print("document - tmp",set_docid-set_docid_tmp)
    print("tmp - document",set_docid_tmp-set_docid)
def search_title_count():
    # For every title in 数据样例.xlsx, search our document index by exact
    # title phrase and record: match count, matched docids, and the first
    # match's tenderee / agency / winner / win price. Output: 比地对比数据.xlsx.
    filename = "数据样例.xlsx"
    df = pd.read_excel(filename)
    list_title_dict = []
    for _title in df["标题"]:
        _dict = {"标题":_title}
        list_title_dict.append(_dict)
    task_queue = Queue()
    for _d in list_title_dict:
        task_queue.put(_d)
    ots_client = getConnect_ots()
    def _handle(item,result_queue):
        columns = ["status","tenderee","agency","sub_docs_json"]
        _title = item.get("标题","")
        if _title!="":
            bool_query = BoolQuery(must_queries=[MatchPhraseQuery("doctitle",_title)])
            rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                           SearchQuery(bool_query,get_total_count=True,limit=10),
                                                                           columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
            list_data = getRow_ots(rows)
            item["比地数量"] = total_count
            if len(list_data)>0:
                _str_docid = ",".join([str(a.get("docid")) for a in list_data])
                item["比地_docid"] = _str_docid
                # Detail fields come from the first hit only.
                tenderee = list_data[0].get("tenderee")
                item["比地_招标人"] = tenderee
                agency = list_data[0].get("agency")
                item["比地_代理人"] = agency
                sub_docs_json = list_data[0].get("sub_docs_json")
                if sub_docs_json is not None:
                    sub_docs = json.loads(sub_docs_json)
                    win_tenderer = ""
                    win_bid_price = ""
                    # Last sub-doc with a winner wins (later entries override).
                    for _doc in sub_docs:
                        if _doc.get("win_tenderer","")!="":
                            win_tenderer = _doc.get("win_tenderer")
                            win_bid_price = _doc.get("win_bid_price")
                    item["比地_中标人"] = win_tenderer
                    item["比地_中标金额"] = win_bid_price
    mt = MultiThreadHandler(task_queue,_handle,None,30)
    mt.run()
    df_data = {}
    keys = ["标题","比地数量","比地_docid","比地_招标人","比地_代理人","比地_中标人","比地_中标金额"]
    for _d in list_title_dict:
        for k in keys:
            if k not in df_data:
                df_data[k] = []
            df_data[k].append(_d.get(k,""))
    df1 = pd.DataFrame(df_data)
    df1.to_excel("比地对比数据.xlsx",columns=keys)
def getDumplicate_docid():
    # For each docid in the exported sheet, read its "save" flag from
    # document_tmp and print the comma-joined docids whose save==0, i.e. the
    # documents dropped by deduplication.
    filename = "2022-11-02_154222_数据导出.xlsx"
    df = pd.read_excel(filename)
    list_docid = df["docid"]
    task_queue = Queue()
    list_d = []
    for _docid in list_docid:
        _dict = {"docid":_docid}
        list_d.append(_dict)
        task_queue.put(_dict)
    ots_client = getConnect_ots()
    def _handle(item,result_queue):
        # Worker: fetch the "save" flag of one docid from the tmp index.
        _docid = item.get("docid")
        bool_query = BoolQuery(must_queries=[TermQuery("docid",int(_docid))])
        rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
                                                                       SearchQuery(bool_query),
                                                                       columns_to_get=ColumnsToGet(["save"],return_type=ColumnReturnType.SPECIFIED))
        list_data = getRow_ots(rows)
        if len(list_data)>0:
            _save = list_data[0].get("save")
            item["save"] = _save
    mt = MultiThreadHandler(task_queue,_handle,None,30)
    mt.run()
    list_d_docid = []
    for _data in list_d:
        docid = _data.get("docid")
        save = _data.get("save")
        if save==0:
            list_d_docid.append(str(docid))
    print(",".join(list_d_docid))
  4368. def getDocumentHtml():
  4369. filename = "../data/2023-02-20_154118_数据导出.xlsx"
  4370. df = pd.read_excel(filename)
  4371. ots_client = getConnect_capacity()
  4372. list_html_data = []
  4373. _count = 0
  4374. for docid in df["docid"][:10000]:
  4375. partitionkey = int(docid)%500+1
  4376. docid = int(docid)
  4377. try:
  4378. consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",partitionkey),("docid",docid)],["dochtmlcon"])
  4379. _dict = getRow_ots_primary(return_row)
  4380. list_html_data.append(_dict)
  4381. _count += 1
  4382. print("%d/%d"%(_count,len(df["docid"])))
  4383. except Exception as e:
  4384. pass
  4385. save(list_html_data,"list_html_data.pk")
def exportAgencyCount():
    """Export 2022 Guangzhou tender announcements (docchannel 52) whose agency
    is listed in 广州招标协会.xlsx, restricted to construction / supervision /
    cost-consulting related notices, into a timestamped Excel dump.

    NOTE(review): ``list_df_columns`` used by ``to_excel`` is not defined in
    this function — presumably a module-level name; confirm.  The large
    commented-out section after ``return`` is an earlier per-quarter budget
    statistic kept for reference.
    """
    filename = "广州招标协会.xlsx"
    df = pd.read_excel(filename)
    a = df["a"]
    df_data = {}
    set_c = set()
    # Split every line of column "a" into positional sub-columns c_00, c_01, ...
    for line in a:
        list_c = line.split(" ")
        for _i in range(len(list_c)):
            _key = "c_%s"%(str(_i).rjust(2,'0'))
            if _key not in df_data:
                df_data[_key] = []
                set_c.add(_key)
            df_data[_key].append(list_c[_i])
    list_data = []
    list_query = []
    ots_client = getConnect_ots()
    # c_00 holds the agency names; build one OTS query per agency
    for _agency in df_data["c_00"]:
        query = BoolQuery(must_queries=[TermQuery("city","广州"),
                                        TermQuery("docchannel",52),
                                        RangeQuery("status",201,301),
                                        RangeQuery("page_time","2022-01-01","2023-01-01"),
                                        TermQuery("agency",_agency),
                                        # construction OR supervision OR cost-consulting notices
                                        BoolQuery(should_queries=[
                                            BoolQuery(should_queries=[MatchPhraseQuery("doctitle","工程施工"),
                                                                      MatchPhraseQuery("doctextcon","建造师"),
                                                                      MatchPhraseQuery("attachmenttextcon","建造师")]),
                                            BoolQuery(should_queries=[MatchPhraseQuery("doctitle","监理"),
                                                                      MatchPhraseQuery("doctextcon","监理工程师"),
                                                                      MatchPhraseQuery("attachmenttextcon","监理工程师")]),
                                            BoolQuery(should_queries=[MatchPhraseQuery("doctitle","造价咨询"),
                                                                      MatchPhraseQuery("doctitle","预算"),
                                                                      MatchPhraseQuery("doctitle","造价审核"),
                                                                      MatchPhraseQuery("doctitle","结算"),
                                                                      MatchPhraseQuery("doctitle","概算")]),
                                        ])
                                        ],
                          must_not_queries=[generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["广州公共资源交易中心"],MatchPhraseQuery)])
        # list_row = getDocument([{"query":query}],["agency","page_time","sub_docs_json"],thread_count=1)
        list_query.append({"query":query})
    # df_data is reused here as the spreadsheet accumulator (the sub-column
    # dict above is no longer needed)
    df_data = {}
    set_line = set()
    columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"]
    list_row = getDocument(list_query,columns,thread_count=30)
    # list_row = filterRow(list_row,"product",list_not_key)
    log("get document %d rows"%len(list_row))
    # getRowDataWithKey(df_data,list_row,columns)
    dict_channel = getDict_docchannel()
    getRowData(df_data,list_row,set_line,[''],dict_channel,True)
    # getRowData_sp1(df_data,list_row,set_line,list_keyword,dict_sptype,True)
    # fixContactPerson(df_data,list_df_columns,get_legal_person=False)
    df1 = pd.DataFrame(df_data)
    df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
    return
    # _dict = {"一季度预算":0,"一季度总数":0,"一季度有金额占比":0,
    #          "二季度预算":0,"二季度总数":0,"二季度有金额占比":0,
    #          "三季度预算":0,"三季度总数":0,"三季度有金额占比":0,
    #          "四季度预算":0,"四季度总数":0,"四季度有金额占比":0,
    #          }
    # print(_agency,len(list_row))
    #
    #
    # for _row in list_row:
    #     print(_row.get("docid"))
    #     page_time = _row.get("page_time","")
    #     sub_docs_json = _row.get("sub_docs_json","")
    #     _t = None
    #     if page_time!="":
    #         if page_time<="2022-03-31":
    #             _t = "一季度"
    #         elif page_time<="2022-06-31":
    #             _t = "二季度"
    #         elif page_time<="2022-09-31":
    #             _t = "三季度"
    #         elif page_time<="2022-12-31":
    #             _t = "四季度"
    #     if sub_docs_json != "":
    #         sub_docs = json.loads(sub_docs_json)
    #         for _doc in sub_docs:
    #             bidding_budget = _doc.get("bidding_budget",0)
    #             _dict["%s预算"%_t] += bidding_budget
    #             _dict["%s总数"%_t] += 1
    #             if bidding_budget>0:
    #                 _dict["%s有金额占比"%_t] += 1
    # print(_dict)
    # _sum = 0
    # _sum_n = 0
    # _sum_hm = 0
    # for k in ["一季度","二季度","三季度","四季度"]:
    #     km = "%s预算"%k
    #     kn = "%s总数"%k
    #     khm = "%s有金额占比"%k
    #     _sum += _dict[km]
    #     _sum_n += _dict[kn]
    #     _sum_hm += _dict[khm]
    # _dict["全年预算"] = _sum
    # _dict["全年总数"] = _sum_n
    # _dict["全年有金额占比"] = _sum_hm
    # for k in ["一季度","二季度","三季度","四季度","全年"]:
    #     km = "%s预算"%k
    #     kn = "%s总数"%k
    #     khm = "%s有金额占比"%k
    #     _dict[khm] = _dict[khm]/_dict[kn] if _dict[kn]>0 else 0
    # for k,v in _dict.items():
    #     if k not in df_data:
    #         df_data[k] = []
    #     df_data[k].append(v)
    #
    # list_c = list(set_c)
    # list_c.sort(key=lambda x:x)
    # for k,v in df_data.items():
    #     print(k,len(v))
    # df1 = pd.DataFrame(df_data)
    # list_c.append("一季度预算")
    # list_c.append("一季度总数")
    # list_c.append("一季度有金额占比")
    # list_c.append("二季度预算")
    # list_c.append("二季度总数")
    # list_c.append("二季度有金额占比")
    # list_c.append("三季度预算")
    # list_c.append("三季度总数")
    # list_c.append("三季度有金额占比")
    # list_c.append("四季度预算")
    # list_c.append("四季度总数")
    # list_c.append("四季度有金额占比")
    # list_c.append("全年预算")
    # list_c.append("全年总数")
    # list_c.append("全年有金额占比")
    # df1.to_excel("%s_1.xlsx"%(filename),columns=list_c)
def attachAttachment():
    """For each docid on sheet 1 of the Excel file, fetch the document html
    from the capacity OTS instance, extract the parsed-attachment block
    (``div.richTextFetch``) and export those blocks to a new Excel file.
    """
    filename = "北京电信ICT样例(2023一季度)v1.1(2).xlsx"
    df = pd.read_excel(filename,1)  # second sheet (index 1)
    list_data = []
    task_queue = Queue()
    # the same dicts are queued and kept, so worker results land in list_data
    for _docid in df["docid"]:
        _d = {"docid":_docid}
        list_data.append(_d)
        task_queue.put(_d)
    print("len_docid",len(df["docid"]),len(list_data))
    capacity = getConnect_capacity()
    def _handle(item,result_queue):
        # Worker: pull dochtmlcon and keep only the rich-text attachment div.
        docid = item["docid"]
        consumed, return_row, next_token = capacity.get_row("document",[("partitionkey",int(docid)%500+1),("docid",int(docid))],["dochtmlcon"])
        _d = getRow_ots_primary(return_row)
        _dochtmlcon = _d["dochtmlcon"]
        # strip outer html/body wrappers before re-parsing
        _dochtmlcon = re.sub("<html>|</html>|<body>|</body>","",_dochtmlcon)
        _soup = BeautifulSoup(_dochtmlcon,"lxml")
        _div = _soup.find("div",attrs={"class":"richTextFetch"})
        if _div is None:
            _div = ""
        item["attachment"] = _div
    mt = MultiThreadHandler(task_queue,_handle,None,30)
    mt.run()
    list_attachment = []
    for _d in list_data:
        list_attachment.append(getLegal_str(_d.get("attachment","")))
    df_data = {}
    df_data["附件html"] = list_attachment
    df_1 = pd.DataFrame(df_data)
    df_1.to_excel("附加html_"+filename)
  4546. def compareData():
  4547. filename = "D:\\BaiduNetdiskDownload\\bidi_check.csv"
  4548. list_data = []
  4549. with open(filename,"r",encoding="utf8") as f:
  4550. list_lines = f.readlines()
  4551. for _line in list_lines:
  4552. docid,docchannel,win_tenderer,tenderee,win_bid_price,bidding_budget = [None if a[:2]=='\\N' else a for a in _line.split("\t")]
  4553. _d = {"docid":int(docid),
  4554. "docchannel":docchannel,
  4555. "win_tenderer":win_tenderer,
  4556. "tenderee":tenderee,
  4557. "win_bid_price":float(win_bid_price) if win_bid_price is not None else None,
  4558. "bidding_budget":float(bidding_budget) if bidding_budget is not None else None}
  4559. list_data.append(_d)
  4560. del list_lines
  4561. # for _i in range(len(list_data)):
  4562. # print(list_lines[_i])
  4563. # print(list_data[_i])
  4564. ots_client = getConnect_ots()
  4565. task_queue = Queue()
  4566. for _d in list_data:
  4567. task_queue.put(_d)
  4568. def _handle(item,result_queue):
  4569. docid = item.get("docid")
  4570. win_tenderer = item.get("win_tenderer")
  4571. win_bid_price = item.get("win_bid_price")
  4572. tenderee = item.get("tenderee")
  4573. bidding_budget = item.get("bidding_budget")
  4574. must_q = [TermQuery("docid",int(docid))]
  4575. if tenderee is not None:
  4576. must_q.append(TermQuery("tenderee",tenderee))
  4577. if win_tenderer is not None:
  4578. must_q.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",win_tenderer)))
  4579. if win_bid_price is not None:
  4580. must_q.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_bid_price",win_bid_price)))
  4581. if bidding_budget is not None:
  4582. must_q.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.bidding_budget",bidding_budget)))
  4583. rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
  4584. SearchQuery(BoolQuery(must_queries=must_q),get_total_count=True),
  4585. columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
  4586. item["total_count"] = total_count
  4587. if total_count==0:
  4588. print("docid %d total_count is 0",docid)
  4589. rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
  4590. SearchQuery(BoolQuery(must_queries=[TermQuery("docid",docid)]),get_total_count=True),
  4591. columns_to_get=ColumnsToGet(["tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
  4592. l_d = getRow_ots(rows)
  4593. item["return_row"] = l_d
  4594. mt = MultiThreadHandler(task_queue,_handle,None,30)
  4595. mt.run()
  4596. list_new_data = []
  4597. for data in list_data:
  4598. if data.get("total_count")==0:
  4599. new_d = {"docid":data.get("docid"),
  4600. "docchannel":data.get("docchannel")}
  4601. return_row = data.get("return_row")
  4602. if len(return_row)>0:
  4603. # print(return_row)
  4604. _row = return_row[0]
  4605. tenderee = _row.get("tenderee")
  4606. sub_docs_json = _row.get("sub_docs_json")
  4607. bidding_budget = None
  4608. win_tenderer = None
  4609. win_bid_price = None
  4610. if sub_docs_json is not None:
  4611. sub_docs = json.loads(sub_docs_json)
  4612. for _doc in sub_docs:
  4613. if _doc.get("bidding_budget") is not None:
  4614. bidding_budget = _doc.get("bidding_budget")
  4615. if _doc.get("win_tenderer") is not None:
  4616. win_tenderer = _doc.get("win_tenderer")
  4617. win_bid_price = _doc.get("win_bid_price")
  4618. new_d["tenderee"] = tenderee
  4619. new_d["bidding_budget"] = bidding_budget
  4620. new_d["win_tenderer"] = win_tenderer
  4621. new_d["win_bid_price"] = win_bid_price
  4622. list_new_data.append(new_d)
  4623. df_data_c = ["docid","docchannel","win_tenderer","tenderee","win_bid_price","bidding_budget"]
  4624. df_data = {}
  4625. for c in df_data_c:
  4626. df_data[c] = []
  4627. for _d in list_new_data:
  4628. for c in df_data_c:
  4629. df_data[c].append(_d.get(c))
  4630. df = pd.DataFrame(df_data)
  4631. df.to_csv("bid_check_result.csv",columns=df_data_c)
  4632. def exportProducts():
  4633. filename = "货物关键词.xlsx"
  4634. dict_channel = getDict_docchannel()
  4635. df = pd.read_excel(filename)
  4636. list_products = df["货物关键词"]
  4637. list_q = []
  4638. list_result = []
  4639. ots_client = getConnect_ots()
  4640. columns = ["产品","总数","匹配模式"]
  4641. _index = 0
  4642. task_queue = Queue()
  4643. for _product in list_products:
  4644. _index += 1
  4645. print(_product,"%d/%d"%(_index,len(list_products)))
  4646. bool_query = BoolQuery(must_queries=[NestedQuery("products",TermQuery("products.product",_product)),
  4647. # RangeQuery("page_time","2021-01-01"),
  4648. RangeQuery("status",201,301),
  4649. TermQuery("docchannel",101)])
  4650. _q = {"query":bool_query,"product":_product,"匹配模式":"精准"}
  4651. task_queue.put(_q)
  4652. bool_query = BoolQuery(must_queries=[NestedQuery("products",WildcardQuery("products.product","*%s*"%_product)),
  4653. # RangeQuery("page_time","2021-01-01"),
  4654. RangeQuery("status",201,301),
  4655. TermQuery("docchannel",101)])
  4656. _q = {"query":bool_query,"product":_product,"匹配模式":"包括"}
  4657. task_queue.put(_q)
  4658. def _handle(item,result_queue):
  4659. bool_query = item["query"]
  4660. _product = item["product"]
  4661. _type = item["匹配模式"]
  4662. rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
  4663. SearchQuery(bool_query,get_total_count=True),
  4664. columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
  4665. list_result.append({"产品":_product,"总数":total_count,"匹配模式":_type})
  4666. mt = MultiThreadHandler(task_queue,_handle,None,30)
  4667. mt.run()
  4668. print("done result length:%d"%(len(list_result)))
  4669. df_data = {}
  4670. for _d in list_result:
  4671. for c in columns:
  4672. if c not in df_data:
  4673. df_data[c] = []
  4674. df_data[c].append(_d.get(c))
  4675. df1 = pd.DataFrame(df_data)
  4676. df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=columns)
def statics_attachment_counts():
    """Sum attachment byte sizes per (industry, channel-type) bucket for one
    week of construction-industry documents and print per-day averages in MB.
    """
    # documents in construction-related industries, one-week window, that
    # reference at least one attachment (page_attachments.fileMd5 exists)
    bool_query = BoolQuery(must_queries=[
        generateBoolShouldQuery(["industry"],["土木工程建筑业","建筑装饰和其他建筑业","房屋建筑业","专业施工","修缮工程","建筑安装业"],TermQuery),
        RangeQuery("page_time","2023-08-07","2023-08-14"),
        RangeQuery("status",201,301),
        NestedQuery("page_attachments",ExistsQuery("page_attachments.fileMd5"))
    ])
    ots_client = getConnect_ots()
    rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
                                                                   ColumnsToGet(["industry","docchannel","page_attachments"],return_type=ColumnReturnType.SPECIFIED))
    list_data = getRow_ots(rows)
    # page through the remaining results
    while next_token:
        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                       SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                       ColumnsToGet(["industry","docchannel","page_attachments"],return_type=ColumnReturnType.SPECIFIED))
        list_data.extend(getRow_ots(rows))
        # if len(list_data)>1000:
        #     break
    task_queue = Queue()
    for _data in list_data:
        task_queue.put(_data)
    def _handle(item,result_queue):
        # Worker: sum the byte sizes of all attachments this document references.
        page_attachments = item.get("page_attachments")
        _size = 0
        if page_attachments is not None and page_attachments!="":
            list_attach = json.loads(page_attachments)
            for _attach in list_attach:
                _md5 = _attach.get("fileMd5")
                if _md5 is not None:
                    consumed, return_row, next_token = ots_client.get_row("attachment",[("filemd5",_md5)],["size"])
                    _d = getRow_ots_primary(return_row)
                    _size += _d.get("size",0)
        item["size"] = _size
    mt = MultiThreadHandler(task_queue,_handle,None,30)
    mt.run()
    dict_result = {}
    for data in list_data:
        industry = data.get("industry")
        docchannel = data.get("docchannel")
        _type = ""
        if docchannel==52:
            _type = "招标"
        elif docchannel in (101,118,119,120):
            _type = "中标"
        else:
            _type = "其他"
        _key = "%s-%s"%(industry,_type)
        if _key not in dict_result:
            dict_result[_key] = 0
        dict_result[_key] += data.get("size",0)
    print(dict_result)
    for k,v in dict_result.items():
        # /7 presumably averages over the 7-day page_time window above — confirm
        print(k,"%.2fM"%(v/7/1024/1024))
  4731. def static_dump():
  4732. import pandas as pd
  4733. filename = "select___from_bxkc_bxkc_delete_document_.csv"
  4734. df = pd.read_csv(filename)
  4735. print(df.keys())
  4736. list_docid = df["docid"]
  4737. list_dup_docid = df["dup_docid"]
  4738. list_operate_time = df["operate_time"]
  4739. list_a = []
  4740. for docid,dup_docid in zip(list_docid,list_dup_docid):
  4741. docid = int(docid)
  4742. _flag = False
  4743. if isinstance(dup_docid,str) and dup_docid is not None and dup_docid!="":
  4744. _l = dup_docid.split(",")
  4745. for _i in _l:
  4746. if _i.strip()!="":
  4747. docid1 = int(_i)
  4748. if docid1>docid:
  4749. _flag = True
  4750. break
  4751. if _flag:
  4752. list_a.append("是")
  4753. else:
  4754. list_a.append("否")
  4755. df_data = {"被去重docid":list_docid,
  4756. "重复id":list_dup_docid,
  4757. "是否展示后删除":list_a}
  4758. df1 = pd.DataFrame(df_data)
  4759. df1.to_csv("16号去重统计.csv")
  4760. def append_title():
  4761. import pandas as pd
  4762. filename = "去重记录.xlsx"
  4763. df = pd.read_excel(filename)
  4764. list_docid = df["被去重id"]
  4765. list_keep_id = df["保留id"]
  4766. list_data = []
  4767. task_queue = Queue()
  4768. for _docid,keep_docid in zip(list_docid,list_keep_id):
  4769. _d = {"dup_docid":int(_docid),
  4770. "keep_docid":int(keep_docid)}
  4771. list_data.append(_d)
  4772. task_queue.put(_d)
  4773. ots_client = getConnect_ots()
  4774. def _handle(item,result_queue):
  4775. dup_docid = item.get("dup_docid")
  4776. keep_docid = item.get("keep_docid")
  4777. dup_partitionkey = dup_docid%500+1
  4778. keep_partitionkey = keep_docid%500+1
  4779. consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",dup_partitionkey),("docid",dup_docid)],["status","doctitle","extract_count"])
  4780. _d = getRow_ots_primary(return_row)
  4781. if _d is not None:
  4782. doctitle = _d.get("doctitle")
  4783. item["dup_title"] = doctitle
  4784. extract_count = _d.get("extract_count")
  4785. item["dup_extract_count"] = extract_count
  4786. consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",keep_partitionkey),("docid",keep_docid)],["status","doctitle","extract_count","extract_count"])
  4787. _d = getRow_ots_primary(return_row)
  4788. if _d is not None:
  4789. doctitle = _d.get("doctitle")
  4790. item["keep_title"] = doctitle
  4791. status = _d.get("status")
  4792. extract_count = _d.get("extract_count")
  4793. item["keep_extract_count"] = extract_count
  4794. if status>=201 and status<=300:
  4795. item["保留id状态"] = "正常"
  4796. elif status>=401:
  4797. item["保留id状态"] = "去重"
  4798. else:
  4799. item["保留id状态"] = ""
  4800. mt = MultiThreadHandler(task_queue,_handle,None,39)
  4801. mt.run()
  4802. keys = ["dup_docid","keep_docid","dup_title","keep_title","保留id状态","dup_extract_count","keep_extract_count"]
  4803. df_data = {}
  4804. for data in list_data:
  4805. for k in keys:
  4806. if k not in df_data:
  4807. df_data[k] = []
  4808. df_data[k].append(data.get(k))
  4809. df1 = pd.DataFrame(df_data)
  4810. df1.to_excel("%s.xlsx"%(filename),columns=keys)
def get_follows():
    """Filter a pasted-in JSON list of follow records (dicts with "docid"),
    keeping only those whose document tenderee is
    泗阳意杨产业科技园实业有限公司, and print the filtered list as JSON.
    """
    # paste the follow records to filter into this literal before running
    _json = '''
[
]
'''
    ots_client = getConnect_ots()
    list_follows = json.loads(_json)
    new_list = []
    for follow in list_follows:
        docid = follow.get("docid")
        # partitionkey derivation used throughout this file
        partitionkey = docid%500+1
        consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",partitionkey),("docid",docid)],["tenderee"])
        _d = getRow_ots_primary(return_row)
        print("docid",_d.get("tenderee"))
        if _d.get("tenderee")=="泗阳意杨产业科技园实业有限公司":
            new_list.append(follow)
    print(json.dumps(new_list,ensure_ascii=False))
  4828. def validateTitle(title):
  4829. rstr = r"[\/\\\:\*\?\"\<\>\|\r\n]" # '/ \ : * ? " < > |'
  4830. new_title = re.sub(rstr, "_", title) # 替换为下划线
  4831. return new_title
  4832. def exportParameters():
  4833. from glob import glob
  4834. attach_path = "F:/Workspace2016/BaseDataMaintenance/BaseDataMaintenance/maintenance/product/download"
  4835. ots_client = getConnect_ots()
  4836. bool_query = BoolQuery(must_queries=[TermQuery("parameter_status",1)])
  4837. save_dir = "product"
  4838. if not os.path.exists(save_dir):
  4839. os.mkdir(save_dir)
  4840. rows,next_token,total_count,is_all_succeed = ots_client.search("document_product2","document_product2_index",
  4841. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("parameter_status")]),limit=100,get_total_count=True),
  4842. ColumnsToGet(["parameter","bid_filemd5s","name","original_name"],return_type=ColumnReturnType.SPECIFIED))
  4843. list_data = getRow_ots(rows)
  4844. write_count = 0
  4845. for _data in list_data:
  4846. bid_filemd5s = _data["bid_filemd5s"]
  4847. parameter = _data["parameter"]
  4848. name = _data["name"]
  4849. original_name = _data["original_name"]
  4850. list_md5s = bid_filemd5s.split(",")
  4851. if len(list_md5s)==1:
  4852. list_path = glob(os.path.join(attach_path,bid_filemd5s)+"*")
  4853. for _path in list_path:
  4854. if not _path.endswith(".html"):
  4855. filename = _path.split("\\")[-1]
  4856. with open(os.path.join(save_dir,filename),"wb") as f:
  4857. f.write(open(_path,"rb").read())
  4858. pname = "%s_name%s_original_name%s.html"%(bid_filemd5s,name,original_name[:10])
  4859. pname = validateTitle(pname)
  4860. with open(os.path.join(save_dir,pname),"w",encoding="utf8") as f:
  4861. f.write(parameter)
  4862. write_count += 1
  4863. while next_token:
  4864. rows,next_token,total_count,is_all_succeed = ots_client.search("document_product2","document_product2_index",
  4865. SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  4866. ColumnsToGet(["parameter","bid_filemd5s","name","original_name"],return_type=ColumnReturnType.SPECIFIED))
  4867. list_data = getRow_ots(rows)
  4868. for _data in list_data:
  4869. bid_filemd5s = _data["bid_filemd5s"]
  4870. parameter = _data["parameter"]
  4871. name = _data["name"]
  4872. original_name = _data["original_name"]
  4873. list_md5s = bid_filemd5s.split(",")
  4874. if len(list_md5s)==1:
  4875. list_path = glob(os.path.join(attach_path,bid_filemd5s)+"*")
  4876. for _path in list_path:
  4877. if not _path.endswith(".html"):
  4878. filename = _path.split("\\")[-1]
  4879. with open(os.path.join(save_dir,filename),"wb") as f:
  4880. f.write(open(_path,"rb").read())
  4881. pname = "%s_name%s_original_name%s.html"%(bid_filemd5s,name,original_name[:10])
  4882. pname = validateTitle(pname)
  4883. with open(os.path.join(save_dir,pname),"w",encoding="utf8") as f:
  4884. f.write(parameter)
  4885. write_count += 1
  4886. if write_count>=2000:
  4887. return
def exportProjects():
    """Export 2023 single-document projects that have a win record and whose
    project codes also appear in other normal-status documents — candidates
    for further merging — to ``toMerge.xlsx``.
    """
    bool_query = BoolQuery(must_queries=[
        TermQuery("docid_number",1),
        ExistsQuery("zhong_biao_page_time"),
        RangeQuery("page_time","2023-01-01","2023-10-10")
    ],
    must_not_queries=[
        MatchPhraseQuery("doctitles","网上超市")
    ])
    ots_client = getConnect_ots()
    rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
                                                                   SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]),limit=100,get_total_count=True),
                                                                   ColumnsToGet(["docids","doctitles","project_codes"],return_type=ColumnReturnType.SPECIFIED))
    list_data = getRow_ots(rows)
    # page through results, capping the export at ~10000 projects
    while next_token:
        rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
                                                                       SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                       ColumnsToGet(["docids","doctitles","project_codes"],return_type=ColumnReturnType.SPECIFIED))
        list_data.extend(getRow_ots(rows))
        if len(list_data)>10000:
            break
    task_queue = Queue()
    for data in list_data:
        task_queue.put(data)
    def _handle(item,result_queue):
        # Worker: find other normal-status documents that cite any of this
        # project's codes, excluding the project's own document.
        docids = item["docids"]
        project_codes = item.get("project_codes","")
        if len(project_codes)>0:
            list_codes = project_codes.split(",")
            should_q = []
            for code in list_codes:
                should_q.append(MatchPhraseQuery("doctextcon",code))
                should_q.append(MatchPhraseQuery("attachmenttextcon",code))
            # NOTE(review): int(docids) assumes docids holds a single id here
            # (docid_number==1 in the main query) — confirm
            _query = BoolQuery(must_queries=[BoolQuery(should_queries=should_q),RangeQuery("status",201,301)],
                               must_not_queries=[TermQuery("docid",int(docids))])
            rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                           SearchQuery(_query,limit=100),
                                                                           ColumnsToGet(["doctitle"],return_type=ColumnReturnType.SPECIFIED))
            item["result"] = json.dumps(getRow_ots(rows),ensure_ascii=False)
    mt = MultiThreadHandler(task_queue,_handle,None,30)
    mt.run()
    columns = ["docids","doctitles","project_codes","result"]
    df_data = {}
    for data in list_data:
        for c in columns:
            if c not in df_data:
                df_data[c] = []
            df_data[c].append(data.get(c,""))
    df = pd.DataFrame(df_data)
    df.to_excel("toMerge.xlsx",columns=columns)
  4938. def match_contact():
  4939. filename = r"C:\Users\Administrator\联系电话需求.xlsx"
  4940. df = pd.read_excel(filename)
  4941. ots_client = getConnect_ots()
  4942. list_row = []
  4943. for index,row in df.iterrows():
  4944. a = row["乙方"]
  4945. b = row["联系人"]
  4946. c = row['联系电话(手机号)']
  4947. d = row["想打听甲方联系人的项目"].strip()
  4948. e = row['联系人.1']
  4949. f = row['联系电话(手机号).1']
  4950. print(a,b,c,d,e,f)
  4951. bool_query = BoolQuery(must_queries=[
  4952. generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[a,d],MatchPhraseQuery),
  4953. TermQuery("tenderee",d),
  4954. RangeQuery("status",201,301)
  4955. ])
  4956. rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
  4957. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]),get_total_count=True,limit=10),
  4958. ColumnsToGet(["tenderee_contact","tenderee_phone"],return_type=ColumnReturnType.SPECIFIED))
  4959. list_data = getRow_ots(rows)
  4960. _find = False
  4961. # if len(list_data)>0:
  4962. # for data in list_data:
  4963. # if re.search('^1\d{10}',data.get("tenderee_phone","")) is not None:
  4964. # df["联系人.1"][index] = data.get("tenderee_contact","")
  4965. # df['联系电话(手机号).1'][index] = data.get("tenderee_phone","")
  4966. # print("===",data)
  4967. # _find = True
  4968. # break
  4969. bool_query = BoolQuery(must_queries=[
  4970. TermQuery("enterprise_name",a),
  4971. TermQuery("is_mobile",1)
  4972. ])
  4973. rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise_contact","enterprise_contact_index",
  4974. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("score",SortOrder.DESC),FieldSort("update_time",SortOrder.DESC)]),get_total_count=True,limit=10),
  4975. ColumnsToGet(["contact_person","phone_no"],return_type=ColumnReturnType.SPECIFIED))
  4976. list_data = getRow_ots(rows)
  4977. if len(list_data)>0:
  4978. _str = ""
  4979. for _data in list_data:
  4980. _str += _data.get("contact_person","")+"|"+_data.get("phone_no","")+"|\n"
  4981. df["联系人"][index] = _str
  4982. df['联系电话(手机号)'][index] = _str
  4983. if not _find:
  4984. bool_query = BoolQuery(must_queries=[
  4985. TermQuery("enterprise_name",d),
  4986. TermQuery("is_mobile",1)
  4987. ])
  4988. rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise_contact","enterprise_contact_index",
  4989. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("score",SortOrder.DESC),FieldSort("update_time",SortOrder.DESC)]),get_total_count=True,limit=10),
  4990. ColumnsToGet(["contact_person","phone_no"],return_type=ColumnReturnType.SPECIFIED))
  4991. list_data = getRow_ots(rows)
  4992. if len(list_data)>0:
  4993. _str = ""
  4994. for _data in list_data:
  4995. _str += _data.get("contact_person","")+"|"+_data.get("phone_no","")+"|\n"
  4996. df["联系人.1"][index] = _str
  4997. df['联系电话(手机号).1'][index] = _str
  4998. print(total_count)
  4999. df.to_excel("result.xlsx")
  5000. def match_products():
  5001. # filename = "未订阅用户查看公告记录.xlsx"
  5002. # df = pd.read_excel(filename)
  5003. # list_user_id = df["user_id"]
  5004. # list_doc_id = df["doc_id"]
  5005. # ots_client = getConnect_ots()
  5006. # list_product = []
  5007. # _c = 0
  5008. # for docid in list_doc_id:
  5009. # _c += 1
  5010. # print(_c,len(list_doc_id))
  5011. # bool_query = BoolQuery(must_queries=[TermQuery("docid",int(docid))])
  5012. # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
  5013. # SearchQuery(bool_query),
  5014. # columns_to_get=ColumnsToGet(["product"],return_type=ColumnReturnType.SPECIFIED))
  5015. # list_data = getRow_ots(rows)
  5016. # if len(list_data)>0:
  5017. # list_product.append(list_data[0].get("product",""))
  5018. # else:
  5019. # list_product.append("")
  5020. # u_id = None
  5021. # u_product = []
  5022. # list_product_group = []
  5023. # for user_id,product in zip(list_user_id,list_product):
  5024. # if u_id is None:
  5025. # u_id = user_id
  5026. # u_product.append(product)
  5027. # else:
  5028. # if user_id==u_id:
  5029. # u_product.append(product)
  5030. # else:
  5031. # l_p = []
  5032. # for _p in u_product:
  5033. # s = _p.split(",")
  5034. # l_p.extend(s)
  5035. # _dict = {}
  5036. # for p in l_p:
  5037. # if p not in _dict:
  5038. # _dict[p] = 0
  5039. # _dict[p] += 1
  5040. # for _ in u_product:
  5041. # list_product_group.append(json.dumps(_dict,ensure_ascii=False))
  5042. # u_id = user_id
  5043. # u_product = [product]
  5044. #
  5045. #
  5046. # if len(u_product)>0:
  5047. # l_p = []
  5048. # for _p in u_product:
  5049. # s = _p.split(",")
  5050. # l_p.extend(s)
  5051. # _dict = {}
  5052. # for p in l_p:
  5053. # if p not in _dict:
  5054. # _dict[p] = 0
  5055. # _dict[p] += 1
  5056. # for _ in u_product:
  5057. # list_product_group.append(json.dumps(_dict,ensure_ascii=False))
  5058. #
  5059. # _d = {"user_id":list_user_id,
  5060. # "doc_id":list_doc_id,
  5061. # "product":list_product,
  5062. # "product_group":list_product_group}
  5063. # for k,v in _d.items():
  5064. # print(k,len(v))
  5065. # df1 = pd.DataFrame(_d)
  5066. # df1.to_excel(filename+"_1.xlsx",columns=["user_id","doc_id","product","product_group"])
  5067. filename = "大单未订阅用户关键词.xlsx"
  5068. df = pd.read_excel(filename)
  5069. list_userid = df["userid"]
  5070. list_kw = df["关键词"]
  5071. list_kw1 = []
  5072. for kw in list_kw:
  5073. list_word = kw.split(",")
  5074. list_word.sort(key=lambda x:len(x))
  5075. list_words = []
  5076. for w_i in range(len(list_word)):
  5077. _find = False
  5078. _w = list_word[w_i]
  5079. if len(_w)>8:
  5080. _find = True
  5081. else:
  5082. for w_j in range(w_i):
  5083. _wj = list_word[w_j]
  5084. if str(_w).find(_wj)>=0:
  5085. _find = True
  5086. break
  5087. if not _find:
  5088. list_words.append(_w)
  5089. if len(list_words)==6:
  5090. break
  5091. list_kw1.append(",".join(list_words))
  5092. _dict = {"userid":list_userid,
  5093. "关键词":list_kw1}
  5094. df1 = pd.DataFrame(_dict)
  5095. df1.to_excel(filename+"_1.xlsx",columns=["userid","关键词"])
def export_columns():
    """Augment an exported spreadsheet with ``web_source_no`` /
    ``web_source_name`` and a rough token count (flattened-text length) of
    each document's html body, then write ``<filename>_1.xlsx``.
    """
    filename = r"F:\Workspace2016\DataMining\data\2025-03-14_161851_数据导出.xlsx"
    df = pd.read_excel(filename)
    task_queue = Queue()
    result_queue = Queue()
    ots_client = getConnect_ots()
    ots_capacity = getConnect_capacity()
    for i in range(len(df)):
        docid = df.iloc[i]["docid"]
        docid = int(docid)
        task_queue.put(docid)
    from bs4 import BeautifulSoup
    import re
    def html2text_with_tablehtml(_html):
        # Flatten html to text while keeping <table>/<tbody> as raw HTML.
        # If the input is a string, parse it with BeautifulSoup first
        if isinstance(_html, str):
            _soup = BeautifulSoup(_html, "lxml")
        else:
            _soup = _html
        # collected output fragments
        result_parts = []
        _find = False
        # iterate over the direct children only; recursion covers deeper levels
        for child in _soup.find_all(recursive=False):
            if child.name in ["table", "tbody"]:
                # keep tables / table bodies as raw HTML
                result_parts.append("\n"+str(child)+"\n")
            else:
                # recurse into any other element and convert it to text
                text = html2text_with_tablehtml(child)
                result_parts.append(text)
                _find = True
        if not _find:
            # leaf (no non-table children handled recursively): take the
            # node's text, newline-terminating block-level elements
            _text = str(_soup.get_text())
            if len(_text)>0:
                if _soup.name in {"p","div","li"}:
                    _text += "\n"
                result_parts.append(_text)
        # join all fragments into a single string
        result = "".join(result_parts)
        return result
    def _handle(item,result_queue):
        # Worker: item is a docid.  Source info comes from the main instance,
        # the full html from the capacity instance.
        consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",int(item%500+1)),("docid",int(item))],["web_source_no","web_source_name"])
        _dict = getRow_ots_primary(return_row)
        web_source_no = _dict.get("web_source_no","")
        web_source_name = _dict.get("web_source_name","")
        consumed, return_row, next_token = ots_capacity.get_row("document",[("partitionkey",int(item%500+1)),("docid",int(item))],["dochtmlcon"])
        _dict = getRow_ots_primary(return_row)
        dochtmlcon = _dict.get("dochtmlcon","")
        _text = html2text_with_tablehtml(dochtmlcon)
        result_queue.put((item,web_source_no,web_source_name,_text))
    mt = MultiThreadHandler(task_queue,_handle,result_queue,30)
    mt.run()
    _dict_docid = {}
    # drain the result queue (non-blocking get; Empty ends the loop)
    while True:
        try:
            item,web_source_no,web_source_name,_text = result_queue.get(False)
            _dict_docid[item] = (web_source_no,web_source_name,_text)
        except Exception as e:
            break
    for i in range(len(df)):
        docid = df.iloc[i]["docid"]
        docid = int(docid)
        if docid in _dict_docid:
            web_source_no,web_source_name,_text = _dict_docid[docid]
            df.loc[i,"web_source_no"] = web_source_no
            df.loc[i,"web_source_name"] = web_source_name
            # "tokens" is approximated by text length, not a real tokenizer
            df.loc[i,"tokens"] = len(_text)
        else:
            df.loc[i,"web_source_no"] = ""
            df.loc[i,"web_source_name"] = ""
            df.loc[i,"tokens"] = 0
    df.to_excel(filename+"_1.xlsx")
def clean_subcription(s_subcriptions,s_exclude_subcriptions):
    '''
    Automatically clean subscription keywords.

    For up to 10 weekly snapshots going back in time, samples announcements
    matched by each subscription word, asks an LLM to score the interest level
    (0-10) and to propose exclusion words for low-interest hits, then merges
    and ranks the candidate exclusion words. Results are printed.

    :param s_subcriptions: comma-separated subscription keywords
    :param s_exclude_subcriptions: comma-separated exclusion keywords ("" for none)
    :return: None
    '''
    from export.html2text import html2text_with_tablehtml
    ots_client = getConnect_ots()
    ots_capacity = getConnect_capacity()

    def get_content(subcriptions,exclude_subcriptions,current_date):
        # Search one day's documents matching the subscription words
        # (phrase match over title/body/attachment) minus the exclusion words,
        # then attach the extracted text (first 10k chars) to each row.
        must_not_queries=[
            generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],exclude_subcriptions[:80],MatchPhraseQuery)
        ] if len(exclude_subcriptions)>0 else []
        bool_query = BoolQuery(must_queries=[
            TermsQuery("docchannel",[52,101,118,119,120,121,122,51]),
            TermQuery("page_time",current_date),
            generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],subcriptions[:80],MatchPhraseQuery),
        ],
        must_not_queries=must_not_queries)
        list_row = getDocument([{"query":bool_query,"limit":10}],["docid","doctitle"],thread_count=1)
        for row in list_row:
            docid = row["docid"]
            partitionkey = row["partitionkey"]
            consumed, return_row, next_token = ots_capacity.get_row("document",[("partitionkey",partitionkey),("docid",docid)],["dochtmlcon"])
            _dict = getRow_ots_primary(return_row)
            dochtmlcon = _dict.get("dochtmlcon","")
            _text = html2text_with_tablehtml(dochtmlcon)
            row["text"] = _text[:10000]
        return list_row

    def _handle(item,result_queue):
        # Stage-1 worker: fetch matching rows for one subscription word/date
        # and fan the rows out to the result queue for stage 2.
        list_row = get_content(item["subcriptions"],item["exclude_subcriptions"],item["current_date"])
        item["list_row"] = list_row
        for row in list_row:
            result_queue.put(row)

    def clean_exclude_set(list_exclude_set):
        # Iteratively intersect overlapping exclusion-word sets until stable:
        # each pass merges the first overlapping pair into its intersection,
        # keeping only words that co-occur across documents.
        while True:
            new_list_exclude_set = []
            pop_index = set()
            for _i in range(len(list_exclude_set)):
                if _i in pop_index:
                    continue
                _exclude_set = list_exclude_set[_i]
                _find = False
                for _j in range(_i+1,len(list_exclude_set)):
                    if _j in pop_index:
                        continue
                    _exclude_set_j = list_exclude_set[_j]
                    if len(_exclude_set & _exclude_set_j)>0:
                        new_list_exclude_set.append(_exclude_set & _exclude_set_j)
                        pop_index.add(_j)
                        _find = True
                        break
                if not _find:
                    new_list_exclude_set.append(_exclude_set)
                    pop_index.add(_i)
            # Fixed point reached when no pair was merged this pass.
            if len(new_list_exclude_set)==len(list_exclude_set):
                return new_list_exclude_set
            list_exclude_set = new_list_exclude_set

    def _handle1(row,result_queue):
        # Stage-2 worker: ask the LLM to rate interest and propose exclusions.
        # NOTE: the prompt (including its hard-coded subscription list) is sent
        # verbatim to the model — do not alter it casually.
        _prompt = '''
客户对招投标信息感心趣
订阅词如下:施工围挡,防眩板,标志牌,标示牌,标识牌,指示牌,路名牌,道路标线,道路标志,热熔标线,交通标线,交通标识工程,马路划线,道路标志制作,道路标志设置,车库划线
根据如上订阅词,识别客户真正感兴趣的内容,判断以下公告的兴趣度,范围是0-10,如果兴趣度不高(即内容不是客户感兴趣的),请生成排除词,排除词针对这篇公告的核心内容且内容不是客户感兴趣的
返回格式如下:
{"兴趣度":"","排除词":[]}
%s
'''%(row["text"])
        _result = chat_doubao(_prompt,model_name = "ep-20250314164242-jd62g")
        _json = get_json_from_text(_result)
        try:
            _dict = json.loads(_json)
        except Exception as e:
            # Model returned malformed JSON; treat as "no rating".
            _dict = {}
        row.update(_dict)

    subcriptions = s_subcriptions.split(",")
    exclude_subcriptions = s_exclude_subcriptions.split(",") if s_exclude_subcriptions!="" else []
    original_exclude_subcriptions = exclude_subcriptions.copy()
    task_queue = Queue()
    result_queue = Queue()
    _index = 0
    dict_count = {}  # exclusion word -> number of documents proposing it
    current_date = getCurrent_date(format="%Y-%m-%d")
    list_exclude_set = []
    # Sample up to 10 snapshots, stepping back one week each iteration.
    while True:
        list_data = []
        if _index>=10:
            break
        _index += 1
        current_date = timeAdd(current_date,-7)
        for subcription in subcriptions:
            _d = {"subcriptions":[subcription],
                  "exclude_subcriptions":exclude_subcriptions,
                  "current_date":current_date}
            list_data.append(_d)
            task_queue.put(_d)
        mt = MultiThreadHandler(task_queue,_handle,result_queue,10)
        mt.run()
        mt = MultiThreadHandler(result_queue,_handle1,None,30)
        mt.run()
        # Aggregate the LLM ratings per subscription word.
        for row1 in list_data:
            total_interest = 0
            total_count = 0
            list_row = row1.get("list_row",[])
            for row in list_row:
                try:
                    interest = int(row["兴趣度"])
                except Exception as e:
                    interest = 0
                if interest>0:
                    total_interest += interest
                    total_count += 1
                if interest<=5:
                    # Low interest: collect the proposed exclusion words.
                    list_exclude = row.get("排除词",[])
                    if len(list_exclude)>0:
                        list_exclude_set.append(set(list_exclude))
                        for _exclude in list_exclude:
                            if _exclude in dict_count:
                                dict_count[_exclude] += 1
                            else:
                                dict_count[_exclude] = 1
            if total_count>0:
                row1["avg_interest"] = round(total_interest/total_count,2)
                row1["total_interest"] = total_interest
        list_exclude_set = clean_exclude_set(list_exclude_set)
        exclude_set = {_exclude for _exclude_set in list_exclude_set for _exclude in _exclude_set}
        # Rank exclusion words that appeared at least twice and survived merging.
        list_count = []
        for k,v in dict_count.items():
            if v>=2 and k in exclude_set:
                list_count.append((v,k))
        list_count.sort(key=lambda x:x[0],reverse=True)
        # Accumulation into the live exclusion list is currently disabled;
        # each iteration restarts from the caller-supplied exclusions.
        exclude_subcriptions = original_exclude_subcriptions.copy()
        # for v,k in list_count:
        #     if k not in exclude_subcriptions:
        #         exclude_subcriptions.append(k)
        # TODO: reduce the exclusion words based on their co-occurrence
        print("exclude_subcriptions",exclude_subcriptions)
        for row1 in list_data:
            print("interest")
            print(row1.get("subcriptions"))
            print(row1.get("avg_interest"))
            print(row1.get("total_interest"))
def fix_subcription():
    # Build AI-suggested subscription/exclusion keyword lists for registered
    # companies: mine product words from each company's winning-bid
    # announcements, ask a Doubao bot to distill subscription keywords,
    # sample matching tender announcements, ask for title-based exclusion
    # words, and dump everything to an Excel file.
    ots_client = getConnect_ots()

    def _handle(item,result_queue):
        # Worker: item is a per-company dict (keys "公司名称", "搜索词", ...)
        # that is enriched in place with the AI suggestions.
        # Step 1: fetch up to 50 announcements this company has won.
        bool_query = BoolQuery(must_queries=[
            NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item.get("公司名称"))),
            TermsQuery("docchannel",[101,118,119,120,121,122]),
            RangeQuery("status",201,301)
        ])
        list_row = getDocument([{"query":bool_query,"limit":50}],["docid","doctitle","product","products","province"],thread_count=1)
        remain_products = []
        total_products = {}   # product word -> occurrence count
        total_titles = []
        total_province = set()
        for row in list_row:
            product = row.get("product","")
            products = row.get("products")
            doctitle = row.get("doctitle")
            province = row.get("province")
            print("===============")
            print(product)
            print(products)
            if province!="" and province not in ("全国","未知"):
                total_province.add(province)
            total_titles.append(doctitle)
            # Count words from the comma-separated "product" column
            # (words longer than 8 chars are treated as noise and skipped).
            list_products = product.split(",")
            for p in list_products:
                if len(p)>8:
                    continue
                if p not in total_products:
                    total_products[p] = 0
                total_products[p] += 1
            # Also count words from the JSON "products" column.
            if products is not None and products!="":
                products_json = json.loads(products)
                list_products = [a.get("product") for a in products_json]
                # NOTE(review): a.get("product") may return None, which would
                # make len(p) raise TypeError — confirm every "products" entry
                # carries a "product" key.
                for p in list_products:
                    if len(p)>8:
                        continue
                    if p not in total_products:
                        total_products[p] = 0
                    total_products[p] += 1
            # total_products.extend(list_products)
            # total_products = [a for a in total_products if a is not None and a!=""]
        # Step 2: rank product words by frequency and keep the top 20.
        list_products = []
        for k,v in total_products.items():
            list_products.append((k,v))
        list_products.sort(key=lambda x:x[1],reverse=True)
        # for product in total_products:
        #     _find = False
        #     for p_i in range(len(remain_products)):
        #         p = remain_products[p_i]
        #         if p.find(product)>=0:
        #             remain_products[p_i] = product
        #             _find = True
        #         if product.find(p)>=0:
        #             _find = True
        #     if not _find:
        #         remain_products.append(product)
        for p,_ in list_products[:20]:
            remain_products.append(p)
        prompt = "%s的主营产品是什么"%item.get("公司名称")
        # The "main business" lookup is currently disabled.
        # _business = chat_doubao_bot(prompt,"bot-20250725150712-pzfls")
        _business = ""
        item["搜索主营"] = _business
        # Step 3: ask the bot to distill subscription keywords from the mined
        # product words and the user's own search terms.
        messages = [
            {"role": "system", "content": "你是豆包,是由字节跳动开发的 AI 人工智能助手"},
        ]
        prompt1 = '''
您是一个招投标数据平台的数据专家,你的客户“【%s】”希望和贵司的招投标数据平台合作,需要您为他设定订阅词,目前通过该公司的中标公告中提取的产品词有“【%s】”(注意,这些词里面有一些是提取错误的,也有一些是因为框架标被混杂进来的产品词,请你结合你对这个公司的了解剔除无关关键词),客户自己在平台上自己搜索的关键词有“【%s】”(注意,客户自己搜索的关键词也不一定是业务有关,有些客户使用的时候不熟悉平台所以可能输入有误),请搜索客户公司有关信息,再结合你对客户公司业务的了解,再结合你对客户公司业务的了解,帮客户总结出一些有效的关键词,可以用来进行招投标信息的订阅,关键词数量不超过20个,每个词不超过8个字,用json 格式{"keywords":[]} 返回(注意有些关键词之间是相互包含关系的,只保留最小范围的, 如“种子光选机”和“光选机”, 只保留“光选机”一个就够了)
'''%(item.get("公司名称"),",".join(remain_products),item.get("搜索词"))
        print(prompt1)
        messages.append({"role": "user", "content": prompt1})
        _result = chat_doubao_bot(messages,"bot-20250725150712-pzfls")
        messages.append({"role": "system", "content": _result})
        _json = get_json_from_text(_result)
        try:
            _dict = json.loads(_json)
        except Exception as e:
            # Malformed JSON from the model -> fall back to no keywords.
            _dict = {}
        # Region: "全国" (nationwide) when provinces are scattered or absent.
        if len(total_province)>3 or len(total_province)==0:
            item["地区"] = "全国"
        else:
            item["地区"] = ",".join(list(total_province))
        item["AI订阅词"] = ",".join(_dict.get("keywords",[]))
        # Step 4: probe how many tender announcements the keywords hit in the
        # last 30 days (title-only first); widen to full text when under 500.
        must_queries=[
            generateBoolShouldQuery(["doctitle"],item["AI订阅词"].split(","),MatchPhraseQuery),
            RangeQuery("page_time",timeAdd(getCurrent_date("%Y-%m-%d"),-30)),
            TermQuery("docchannel",52)
        ]
        if item["地区"]!="全国":
            must_queries.append(generateBoolShouldQuery(["province"],item["地区"].split(","),TermQuery))
        bool_query = BoolQuery(must_queries=must_queries)
        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
            SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]),limit=1,get_total_count=True),
            ColumnsToGet(["doctitle"],return_type=ColumnReturnType.SPECIFIED))
        if total_count<500:
            must_queries[0] = generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],item["AI订阅词"].split(","),MatchPhraseQuery)
            bool_query = BoolQuery(must_queries=must_queries)
            item["搜索范围"] = "全文"
        list_row = getDocument([{"query":bool_query,"limit":50}],["docid","doctitle",],thread_count=1)
        total_titles = []
        for row in list_row:
            doctitle = row.get("doctitle")
            total_titles.append(doctitle)
        # Step 5: ask the model for title-based exclusion words.
        prompt2 = '''
订阅后获取到的招标公告标题如下:
【%s】
请你筛选出不符合业务的公告标题,并总结出一些有效通用的标题排除词(排除词也是尽量可泛化一下, 例如”消防监理“属于监理服务,排除词应该直接用”监理“,也尽量用在上述标题中出现的表达搭配,随意省略中间部分词汇可能导致无法有效排除,因为排除词用的是全匹配逻辑),请把这些排除词用 json格式{"exclude_words":[]},json上下文符合正则```json(?P<json>.*)``` 输出(不超过 20 个,每个排除词单个词不超过 6 个字)
'''%("\r\n".join(total_titles))
        messages.append({"role": "user", "content": prompt2})
        # _result = chat_doubao(prompt2,model_name = "ep-20250314164242-jd62g")
        # _result = chat_doubao_bot(messages,"bot-20250725150712-pzfls")
        _result = chat_doubao_messages(messages,model_name = "ep-20250212111145-fflr7")
        print(prompt2)
        _json = get_json_from_text(_result)
        print(_json)
        try:
            _dict = json.loads(_json)
        except Exception as e:
            _dict = {}
        print(_dict)
        print(_dict.get("exclude_words"))
        item["AI排除词"] = _dict.get("exclude_words")
        item["最终产品词"] = ",".join(remain_products)
        item["标题"] = "\n".join(total_titles)

    # Driver: load the registered-user sheet and process winning companies
    # that have search terms, capped at ~100 rows.
    filename = "所有注册用户并且关注微信的数据信息.xlsx"
    df = pd.read_excel(filename)
    companynames = ["爱威科技股份有限公司","上海百若试验仪器有限公司","宁联电缆集团有限公司"]  # NOTE(review): currently unused
    task_queue = Queue()
    list_data = []
    for company,_b,search in zip(df["公司名"],df["是否中标"],df["搜索词"]):
        print(_b,type(_b))
        if not _b:
            continue
        if not isinstance(search,str):
            # Empty cells come back as NaN (float) -> skip.
            continue
        # Keep at most the 20 shortest search terms.
        list_search = search.split(",")
        list_search.sort(key=lambda x:len(x))
        _d = {"公司名称":company,
              "搜索词":",".join(list_search[:20]),
              "搜索主营":"",
              "搜索范围":"标题",
              "AI订阅词":"",
              "AI排除词":"",
              "最终产品词":"",
              "标题":"",
              "公告类型":"招标公告",
              "地区":"全国"}
        list_data.append(_d)
        if len(list_data)>100:
            break
    print("list_data",len(list_data))
    for data in list_data:
        task_queue.put(data)
    mt = MultiThreadHandler(task_queue,_handle,None,5)
    mt.run()
    # Persist the enriched per-company dicts.
    df = pd.DataFrame(list_data)
    df.to_excel("../data/%s_extract2.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")))
if __name__=="__main__":
    # Ad-hoc entry point: this file is a grab-bag of one-off export/maintenance
    # tasks. Uncomment the task to run; currently only fix_subcription() is live.
    # compareData()
    # attachAttachment()
    # exportDocument_By_time(time_from="2021-01-29",time_to="2021-01-29",columns=["docid","doctitle","project_name","dochtmlcon"])
    # processDocument()
    # export_extract_check()
    # exportArticle_by_websource()
    # export_keyword_count()
    # export_province_keyword_count()
    # exportDocument_dump()
    # exportDocument_dump_mysql()
    # export_attachment()
    # statics_attachment_counts()
    # get_follows()
    # append_title()
    # exportDocument_by_doctitle()
    # exportIndustryCount()
    # exportDocument_by_pagetime()
    # export_columns()
    # match_products()
    # match_contact()
    # exportProjects()
    # exportProducts()
    # exportParameters()
    # exportAgencyCount()
    # getDocumentHtml()
    # getDumplicate_docid()
    # exportHonors_item_info()
    # check_dump_data()
    # search_title_count()
    # count_product()
    # export_dump_by_id()
    # group_xlsx()
    # static_process_time()
    # check_data_synchronization()
    # process_doc()
    # export_competition()
    # for page_time in ["2022-08-01"]:
    #     exportDocument_by_days(page_time)
    # exportDocument_forRecommen()
    # exportDocument_attachment()
    # exportWin_tenderer_count()
    # attachCompanyContact()
    # dumpWebSourceNo()
    # print("http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%103571618))))
    # exportNzj()
    # turn_status()
    # attachBidding_budget()
    # debug_documentMerge()
    # exportDocument_medicine("2021-05-24","2021-05-30")
    # signDocument()
    # transUUid()
    # fix_document()
    # export_document_no_price()
    # findProjects()
    # exportDetailLink()
    # export_extract_check()
    # export_extract2()
    # export_by_file()
    # export_dump()
    # clean_subcription("施工围挡,防眩板,标志牌","")
    fix_subcription()