- # -*- coding: utf-8 -*-
- import sys
- import os
- sys.path.append("..")
- print(sys.path)
- import pandas as pd
- from dataSource.source import *
- import json
- from utils.multiThread import MultiThreadHandler
- import queue
- from utils.Utils import *
- from dataSource.pool import ConnectorPool
- import re
- from tablestore import *
- import traceback
- from utils.hashUtil import aesCipher
- from uuid import uuid4
- from export.exportUtils import *
- from export.DoubaoUtils import chat_doubao,get_json_from_text,chat_doubao_bot,chat_doubao_messages
- data_path = "../data/"
- def getCompanyTenderer():
- def _handle(item,result_queue):
- company = item
- dict_result = {"company":company,"count":0,"competitor":"","project_name":""}
- dict_result["company"] = company
- graph = getConnect_neo4j()
- cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhao_biao_id,p.zhong_biao_id"%(company)
- finded = graph.run(cql)
- finded_ids = json.loads(json.dumps(finded.data()))
- dict_result["count"] = len(finded_ids)
- mongoDB = getConnect_mongodb()
- coll_zb = mongoDB.zhongbiao_extraction
- if len(finded_ids)>0:
- cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.project_name limit 3"%(company)
- finded = graph.run(cql)
- finded_names = json.loads(json.dumps(finded.data()))
- list_names = [_i["p.project_name"] for _i in finded_names]
- dict_result["project_name"] = str(list_names)
- cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN r.price"%(company)
- finded = graph.run(cql)
- finded_money = json.loads(json.dumps(finded.data()))
- whole_money = 0
- for item in finded_money:
- if item["r.price"] is not None:
- whole_money += getUnifyMoney(item["r.price"])
- dict_result["whole_money"] = str(whole_money)
- dict_competitor = {}
- for item in finded_ids:
- docId = item["p.zhong_biao_id"]
- if docId is not None:
- rows = coll_zb.find({"docId":docId})
- for row in rows:
- keys = ["second_tenderer","third_tenderer"]
- for _key in keys:
- if _key in row:
- if row[_key] not in dict_competitor:
- dict_competitor[row[_key]] = 0
- dict_competitor[row[_key]] += 1
- list_competitor = []
- for _key in dict_competitor:
- list_competitor.append([_key,dict_competitor[_key]])
- list_competitor.sort(key=lambda x:x[1],reverse=True)
- list_competitors = [i[0] for i in list_competitor[:10]]
- dict_result["competitor"] = str(list_competitors)
- result_queue.put(dict_result)
- # filename = "成交客户匹配中标项目的需求.xlsx"
- # df = pd.read_excel(filename)
- # list_company = df["公司名字"]
- # company = list_company[0]
- list_company = []
- filename = "../data/服务型客户.txt"
- with open(filename,"r",encoding="GBK") as f:
- while(True):
- line = f.readline()
- if not line:
- break
- list_company.append(line.strip())
- task_queue = queue.Queue()
- for company in list_company:
- task_queue.put(company)
- result_queue = queue.Queue()
- handler = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=10)
- handler.run()
- list_company = []
- list_zb = []
- list_count = []
- list_project = []
- list_money = []
- list_competitor = []
- while(True):
- try:
- _result = result_queue.get(False)
- list_company.append(_result.get("company",""))
- list_zb.append("是" if _result.get("count","")>0 else "否")
- list_count.append(_result.get("count",""))
- list_project.append(_result.get("project_name",""))
- list_money.append(_result.get("whole_money",""))
- list_competitor.append(_result.get("competitor",""))
- except queue.Empty:
- break
- df1 = pd.DataFrame({"公司名字":list_company,"是否中标":list_zb,"中标次数":list_count,"中标项目":list_project,"中标金额":list_money,"潜在竞争对手":list_competitor})
- df1.to_excel("%s_export.xls"%(filename),columns=["公司名字","是否中标","中标次数","中标项目","中标金额","潜在竞争对手"])
- def export_count_includeKeyword():
- filename = "../data/other/jc001.xlsx"
- list_name = []
- list_count = []
- df = pd.read_excel(filename)
- _index = 0
- for row in df["品目"]:
- _name = row
- data = solrQuery("document",{"q":'dochtmlcon:"%s"'%_name,"fq":'(publishtime:[2020-01-01T00:00:00Z%20TO%202020-08-12T23:59:59Z])',"fl":"city","rows":1})
- if data is not None:
- _count = data["response"]["numFound"]
- else:
- _count = 0
- list_name.append(_name)
- list_count.append(_count)
- _index += 1
- print(_index)
- df1 = pd.DataFrame({"品目":list_name,"数量":list_count})
- df1.to_excel("%s_export.xls"%filename)
- def export_count_includeKeyword_multiThread():
- def _handler(item,result_queue):
- data = solrQuery("document",{"q":'dochtmlcon:"%s"'%item,"fq":'(publishtime:[2020-01-01T00:00:00Z%20TO%202020-08-12T23:59:59Z])',"fl":"city","rows":1})
- if data is not None:
- _count = data["response"]["numFound"]
- else:
- _count = 0
- result_queue.put([item,_count])
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- filename = "../data/other/jc001.xlsx"
- list_name = []
- list_count = []
- df = pd.read_excel(filename)
- _index = 0
- for row in df["品目"]:
- _name = row
- task_queue.put(_name)
- _index += 1
- multHandler = MultiThreadHandler(task_queue,_handler,result_queue,thread_count=20)
- multHandler.run()
- while(True):
- try:
- item = result_queue.get(False)
- list_name.append(item[0])
- list_count.append(item[1])
- except queue.Empty as e:
- break
- df1 = pd.DataFrame({"品目":list_name,"数量":list_count})
- df1.to_excel("%s_export.xls"%filename)
- def exportKeywords():
- def _handle(item,result_queue,pool_mongo):
- docId = item["docId"]
- mongo = pool_mongo.getConnector()
- zhongbiao = mongo.zhongbiao_extraction
- zhaobiao = mongo.zhaobiao_extraction
- _project = ""
- rows = zhaobiao.find({"docId":docId},{"project_name":1})
- find_flag = False
- for row in rows:
- find_flag = True
- _project = row.get("project_name","")
- if not find_flag:
- rows = zhongbiao.find({"docId":docId},{"project_name":1})
- for row in rows:
- _project = row.get("project_name","")
- item["project_name"] = _project
- pool_mongo.putConnector(mongo)
- result_queue.put(item)
- list_key = []
- dict_key_ids = dict()
- with open("../data/品目.txt", "r", encoding="utf8") as f:
- while(True):
- row = f.readline()
- if not row:
- break
- row = row.strip()
- list_key.append(row)
- dict_key_ids[row] = []
- data = solrQuery("document",{"q":'dochtmlcon:"%s" AND dochtmlcon:"法院"'%row,"fq":'(publishtime:[2019-01-01T00:00:00Z TO 2019-12-31T23:59:59Z])',"fl":"id","rows":10000000})
- for item in data["response"]["docs"]:
- dict_key_ids[row].append(item["id"])
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for _key in dict_key_ids.keys():
- for item in dict_key_ids[_key]:
- task_queue.put({"docId":item,"project_name":""})
- pool_mongo = ConnectorPool(init_num=10,max_num=200,method_init=getConnect_mongodb)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=200,pool_mongo=pool_mongo)
- mt.run()
- dict_docId_projectname = {}
- while(True):
- try:
- item = result_queue.get(False)
- dict_docId_projectname[item["docId"]] = item["project_name"]
- except queue.Empty:
- break
- dict_key_count = dict()
- for _key in dict_key_ids.keys():
- set_key = set()
- for docId in dict_key_ids[_key]:
- set_key.add(dict_docId_projectname.get(docId,""))
- dict_key_count[_key] = len(set_key)
- print("==")
- for _key in list_key:
- print(len(dict_key_ids[_key]))
- print("==")
- for _key in list_key:
- print(dict_key_count[_key])
- print("==")
- def getIndustryCompany():
- def _handle(item,result_queue,pool_mongo,pool_neo4j,pool_mysql,pool_ots):
- # mongoDB = getConnect_mongodb()
- log(item["enterprise_name"])
- mongoDB = pool_mongo.getConnector()
- # coll_zb = mongoDB.enterprise_profile
- # rows = coll_zb.find({"enterprise_name":item["enterprise_name"]},{"enterprise_name":1,"legalPersonName":1,"actualCapital":1, "regCapital":1,"estiblishTime":1,"socialStaffNum":1,"legal_person":1,"phone":1,"businessScope":1,"industry":1 })
- # for row in rows:
- # item["regCapital"] = row.get("regCapital","")
- # item["legal_person"] = row.get("legal_person","")
- # item["phone"] = row.get("phone","")
- # item["actualCapital"] = row.get("actualCapital","")
- # item["industry"] = row.get("industry","")
- # item["estiblishTime"] = row.get("estiblishTime","")
- # item["socialStaffNum"] = row.get("socialStaffNum","")
- # item["businessScope"] = row.get("businessScope","")
- # graph = getConnect_neo4j()
- ots_client = pool_ots.getConnector()
- primary_key = [('name',item["enterprise_name"])]
- columns_to_get = ["reg_capital","legal_person","phone","actual_capital","industry","estiblishTime","social_staff_num","business_scope"]
- consumed, return_row, next_token = ots_client.get_row("enterprise",primary_key, columns_to_get, None, 1)
- if return_row is not None:
- for att in return_row.attribute_columns:
- item[att[0]] = att[1]
- list_same_industry_company = []
- if "industry" in item:
- bool_query = BoolQuery(must_queries=[TermQuery("industry",item["industry"])])
- col = ColumnsToGet(['enterprise_name'], ColumnReturnType.SPECIFIED)
- rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
- SearchQuery(bool_query, limit=10, get_total_count=True),
- col)
- for row in rows:
- for item1 in row[0]:
- list_same_industry_company.append(item1[1])
- # if "industry" in item:
- # rows = coll_zb.find({"industry":item["industry"]},{"enterprise_name":1}).limit(10)
- # for row in rows:
- # print(row)
- # list_same_industry_company.append(row.get("enterprise_name",""))
- item["same_industry_company"] = list_same_industry_company
- graph = pool_neo4j.getConnector()
- company_name = item["enterprise_name"]
- cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN count(p) as _c "%(company_name)
- finded = graph.run(cql)
- data = json.loads(json.dumps(finded.data()))
- _count = data[0]["_c"]
- # list_project = []
- # for _data in data:
- # if _count<=3:
- # if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
- # if _data["project_name"] is not None:
- # list_project.append(_data["project_name"])
- # _count += 1
- item["count"] = _count
- # item["project"] = str(list_project)
- result_queue.put(item)
- pool_mongo.putConnector(mongoDB)
- pool_neo4j.putConnector(graph)
- pool_ots.putConnector(ots_client)
- log_tofile("export.log")
- pool_mongo = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_mongodb)
- pool_neo4j = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_neo4j)
- pool_mysql = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_mysql)
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- # list_company = getCompanys()
- # filename = "".join(["环境","生态","再生","回收","环保"])
- list_company = []
- filename = "../data/同行客户匹配.xlsx"
- df = pd.read_excel(filename,sheet_name=0)
- for _com in df["公司名称"]:
- print(_com)
- if _com is not None and _com.strip()!="":
- _company = {"enterprise_name":""}
- _company["enterprise_name"] = _com
- list_company.append(_company)
- task_queue = queue.Queue()
- for item in list_company:
- task_queue.put(item)
- result_queue = queue.Queue()
- _muti = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_mongo=pool_mongo,pool_neo4j=pool_neo4j,pool_mysql=pool_mysql,pool_ots=pool_ots)
- _muti.run()
- df_company = {}
- set_key = set()
- if len(list_company)>0:
- for item in list_company:
- for _key in item.keys():
- set_key.add(_key)
- if _key not in df_company:
- df_company[_key] = []
- list_key = list(set_key)
- for item in list_company:
- for _key in list_key:
- df_company[_key].append(item.get(_key,""))
- df1 = pd.DataFrame(df_company)
- df1.to_excel("%s_export.xlsx"%(filename))
- def exportWin_tenderer(time_from,time_to):
- '''
- Export winning-bid documents whose page_time falls within [time_from,time_to].
- :return:
- '''
- ots_client = getConnect_ots()
- last_docid = 0
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time",time_from,time_to,include_lower=True,include_upper=True),
- TermQuery("docchannel",101),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- RangeQuery('docid', last_docid, include_lower=False)])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True),
- ColumnsToGet(["project_name","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
- list_project = []
- def _getRow(list_project,rows,last_docid):
- for row in rows:
- project_name = row[1][0][1]
- docid = row[0][1][1]
- last_docid = docid
- list_pack = json.loads(row[1][1][1])
- _set_tenderer = set()
- win_tenderer = ""
- for _pack in list_pack:
- if "win_tenderer" in _pack and win_tenderer=="":
- win_tenderer = _pack["win_tenderer"]
- if "second_tenderer" in _pack:
- _set_tenderer.add(_pack["second_tenderer"])
- if "third_tenderer" in _pack:
- _set_tenderer.add(_pack["third_tenderer"])
- list_project.append({"docid":docid,"project_name":project_name,"win_tenderer":win_tenderer,"tenderer":list(_set_tenderer)})
- return last_docid
- _getRow(list_project,rows,last_docid)
- while(next_token):
- print("%d/%d"%(len(list_project),total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(["project_name","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
- last_docid = _getRow(list_project,rows,last_docid)
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for item in list_project:
- task_queue.put(item)
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- def _handle(item,result_queue,pool_ots):
- if item["win_tenderer"]!="":
- ots_client = pool_ots.getConnector()
- consumed, return_row, next_token = ots_client.get_row("enterprise", [("name",item["win_tenderer"])], ["province","reg_capital","estiblish_time","business_scope"], None, 1)
- _dict = dict()
- for _item in return_row.attribute_columns:
- _dict[_item[0]] = _item[1]
- for _key in _dict.keys():
- item[_key] = _dict[_key]
- data = solrQuery("contact",{"q":'company_name:"%s"'%item["win_tenderer"],"fl":"contact_person,mobile_no,phone_no","rows":10})
- for _item in data["response"]["docs"]:
- for _key in _item.keys():
- item[_key] = _item[_key]
- break
- pool_ots.putConnector(ots_client)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
- mt.run()
- keys = ["docid","project_name","win_tenderer","tenderer","province","reg_capital","business_scope","estiblish_time","contact_person","mobile_no","phone_no"]
- df_data = {}
- for _key in keys:
- df_data[_key] = []
- for item in list_project:
- for _key in keys:
- if _key in item:
- df_data[_key].append(item[_key])
- else:
- df_data[_key].append("")
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s-%s中标信息.xlsx"%(time_from,time_to),columns=keys)
- def exportContact():
- time_from = "2021-01-14"
- time_to = "2021-01-15"
- filename = "../data/%s-%s中标信息.xlsx"%(time_from,time_to)
- df1 = pd.read_excel(filename)
- set_company = set()
- for item in df1["tenderer"]:
- list_company = re.split("\['|', '|'\]|\[\]",item)
- for _company in list_company:
- if _company!="":
- set_company.add(_company)
- companys = list(set_company)
- task_queue = queue.Queue()
- list_company = []
- for _company in companys:
- item = {"company_name":_company}
- list_company.append(item)
- task_queue.put(item)
- result_queue = queue.Queue()
- def _handle(item,result_queue):
- company = item["company_name"]
- data = solrQuery("contact",{"q":'company_name:"%s"'%company,"fl":"company_name,contact_person,mobile_no,phone_no","rows":10})
- for _item in data["response"]["docs"]:
- for _key in _item.keys():
- item[_key] = _item[_key]
- break
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
- mt.run()
- keys = ["company_name","contact_person","mobile_no","phone_no"]
- df_data = {}
- for _key in keys:
- df_data[_key] = []
- ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
- for item in list_company:
- for _key in keys:
- if _key in item:
- df_data[_key].append(ILLEGAL_CHARACTERS_RE.sub(r'', item[_key]))
- else:
- df_data[_key].append("")
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s-%s竞争对手信息.xlsx"%(time_from,time_to),columns=keys)
- def countKeyword():
- conn = getConnection_mysql()
- cursor = conn.cursor()
- print(0)
- sql = "select dochtmlcon from sys_document_22 where docchannel=52 and page_time>='2020-09-01' and page_time<='2020-09-07'"
- cursor.execute(sql)
- print(0.1)
- df = pd.read_excel("万郡绿建细分关键词.xls")
- list_keywords = df["细分类别"]
- dict_keywords = dict()
- for _key in list_keywords:
- dict_keywords[_key] = 0
- print(1)
- from bs4 import BeautifulSoup
- while(True):
- rows = cursor.fetchmany(10000)
- print("==")
- if not rows:
- break
- for row in rows:
- _html = BeautifulSoup(row[0],"lxml").getText()
- for _key in list_keywords:
- if re.search(re.escape(_key),_html) is not None:
- dict_keywords[_key] += 1
- print(dict_keywords)
- list_count = []
- for _key in list_keywords:
- list_count.append(dict_keywords[_key])
- df1 = pd.DataFrame({"关键字":list_keywords,"数量":list_count})
- df1.to_excel("关键词统计.xlsx")
- def countKeyword_solr():
- def _handle(item,result_queue):
- keyword = item["keyword"]
- data = solrQuery("document",{"q":'dochtmlcon:"%s" AND docchannel:101 AND dochtmlcon:"法院" '%keyword,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])',"fl":"id","rows":10})
- _num = data["response"]["numFound"]
- item["zhongbiao"] = _num
- data = solrQuery("document",{"q":'dochtmlcon:"%s" AND docchannel:52 AND dochtmlcon:"法院"'%keyword,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])',"fl":"id","rows":10})
- _num = data["response"]["numFound"]
- item["zhaobiao"] = _num
- result_queue.put(item)
- file = "../data/关键词11.xlsx"
- df = pd.read_excel(file)
- task_queue = queue.Queue()
- print(df.keys())
- for item in df["业务关键词"]:
- task_queue.put({"keyword":item,"zhaobiao":0,"zhongbiao":0})
- result_queue = queue.Queue()
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=10)
- mt.run()
- list_keyword = []
- list_zhaobiao = []
- list_zhongbiao = []
- while(True):
- try:
- item = result_queue.get(False)
- list_keyword.append(item["keyword"])
- list_zhaobiao.append(item["zhaobiao"])
- list_zhongbiao.append(item["zhongbiao"])
- except queue.Empty:
- break
- df1 = pd.DataFrame({"业务关键词":list_keyword,"招标公告":list_zhaobiao,"中标信息":list_zhongbiao})
- df1.to_excel("%s_export.xlsx"%file,columns=["业务关键词","招标公告","中标信息"])
- def query_from_solr():
- data = solrQuery("document",{"q":'dochtmlcon:"法律" AND (docchannel:51 OR docchannel:104 or docchannel:52 or docchannel:102) AND province:"湖南" ',"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-01-20T23:59:59Z])',"fl":"id","rows":10})
- _num = data["response"]["numFound"]
- print(_num)
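- # Counts 2018 招标 documents per province via OTS (matched with a wildcard on the city
- # field); the commented-out blocks preserve earlier variants that filtered by title keywords
- # and exported full row details.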
- def export_province_keyword_count():
- def _handle(item,result_queue,pool_ots):
- columns = ["doctitle","docchannel","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]
- ots_client = pool_ots.getConnector()
- _province = item["province"]
- print(item)
- # keywords = item["keyword"]+" "+item["key"]
- list_keyword = item["keyword"]
- # for _temp in keywords.split(" "):
- # if len(_temp)>0:
- # list_keyword.append(_temp)
- should_queries = []
- must_not_q = []
- for _temp in list_keyword:
- should_queries.append(MatchPhraseQuery("doctitle","%s"%_temp))
- must_not_q.append(WildcardQuery("tenderee","*%s*"%_temp))
- bool_query_keyword = BoolQuery(should_queries=should_queries,minimum_should_match=2)
- page_time = item["page_time"]
- bool_query = BoolQuery(must_queries=[#bool_query_keyword
- # ,WildcardQuery("publishtime","%s*"%page_time)
- # ,MatchPhraseQuery("doctitle","服务")
- RangeQuery("page_time","2018-01-01","2019-01-01",include_lower=True,include_upper=False),
- TermQuery("docchannel",52),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- WildcardQuery('city', '%s*'%_province)
- # ,NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_tenderer",0,include_lower=True))
- ]
- # ,must_not_queries=must_not_q
- )
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("sub_docs_json.bidding_budget",SortOrder.DESC)]), limit=20, get_total_count=True),
- ColumnsToGet(column_names=columns,return_type=ColumnReturnType.SPECIFIED))
- item["count"] = total_count
- list_data = getRow_ots(rows)
- item["list_data"] = list_data
- print(item)
- pool_ots.putConnector(ots_client)
- df = pd.read_excel("../data/省份信息.xlsx")
- list_province = []
- for _name,_type,_parent in zip(df["cname"],df["ctype"],df["parentid"]):
- if _type==30 and _parent==4:
- list_province.append(_name)
- # filename = "../data/2021-02关键词导出数据.xlsx"
- # dict_keyword = {}
- # df1 = pd.read_excel(filename,dtype=str)
- # for _key,_keyword in zip(df1["key1"],df1["keyword"]):
- # print("===",str(_keyword))
- # dict_keyword[_key] = "" if str(_keyword)=="nan" else _keyword
- # for _key in df1["关键词"]:
- # dict_keyword[_key] = ""
- keyword_str = '''
- 快递 物流 供应链 运输 配送
- 仓储 冷链 整车 服务
- '''
- list_key = []
- for _k in re.split("\s",keyword_str):
- _k1 = _k.strip()
- if len(_k1)>0:
- list_key.append(_k1)
- list_task = []
- page_time = "2020-11"
- for _province in list_province:
- list_task.append({"page_time":page_time,"province":_province,"key":list_key,"keyword":list_key,"count":0})
- task_queue = queue.Queue()
- for item in list_task:
- task_queue.put(item)
- result_queue = queue.Queue()
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
- mt.run()
- dict_key_data = dict()
- # list_data = []
- # for item in list_task:
- # list_data.extend(item["list_data"])
- # dict_channel = getDict_docchannel()
- # df_data= {}
- # print(list_data)
- # for row in list_data:
- # item = {}
- # _dict = row
- # set_dict_item(item,"docid",_dict.get("docid",""))
- # set_dict_item(item,"公告标题",_dict.get("doctitle",""))
- # set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
- # set_dict_item(item,"省份",_dict.get("province",""))
- # # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- # set_dict_item(item,"城市",_dict.get("city",""))
- # set_dict_item(item,"发布时间",_dict.get("page_time",""))
- #
- # set_dict_item(item,"项目编号",_dict.get("project_code",""))
- # set_dict_item(item,"招标单位",_dict.get("tenderee",""))
- # set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
- # set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
- # set_dict_item(item,"代理单位",_dict.get("agency",""))
- # set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
- # set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
- # set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
- #
- # sub_docs_json = _dict.get("sub_docs_json")
- # for _doc in json.loads(sub_docs_json):
- # if "win_tenderer" in _doc:
- # set_dict_item(item,"中标单位",_doc["win_tenderer"])
- # if "win_tenderee_manager" in _doc:
- # set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
- # if "win_tenderee_phone" in _doc:
- # set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
- # if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
- # set_dict_item(item,"中标金额",_doc["win_bid_price"])
- # if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
- # set_dict_item(item,"招标金额",_doc["bidding_budget"])
- # if "招标金额" not in item:
- # set_dict_item(item,"招标金额","")
- # if "中标金额" not in item:
- # set_dict_item(item,"中标金额","")
- # if "中标单位" not in item:
- # set_dict_item(item,"中标单位","")
- # if "中标单位联系人" not in item:
- # set_dict_item(item,"中标单位联系人","")
- # if "中标单位联系电话" not in item:
- # set_dict_item(item,"中标单位联系电话","")
- #
- #
- # _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
- # # if _line in set_line:
- # # continue
- # # if item["招标金额"]=="":
- # # continue
- # # set_line.add(_line)
- # for k,v in item.items():
- # if k not in df_data:
- # df_data[k] = []
- # df_data[k].append(v)
- # df1 = pd.DataFrame(df_data)
- # df1.to_excel("../data/%s_顺丰中标数据.xlsx"%getCurrent_date('%Y-%m-%d_%H%M%S'),columns=list_df_columns)
- for item in list_task:
- print("%s\t%d"%(item["province"],item["count"]))
- # for item in list_task:
- # dict_key_data[item["key"]][item["province"]] = item
- # dict_key_province = dict()
- # dict_key_province["关键词"] = []
- # for _province in list_province:
- # dict_key_province[_province] = []
- # for _key in dict_keyword.keys():
- # dict_key_province["关键词"].append(_key)
- # for _province in list_province:
- # dict_key_province[_province].append(dict_key_data[_key][_province]["count"])
- # columns = ["关键词"]
- # columns.extend(list_province)
- # df2 = pd.DataFrame(dict_key_province)
- # df2.to_excel("../data/%s_导出数据.xlsx"%filename,columns=columns)
- def export_keyword_count():
- def _handle(item,result_queue,ots_client):
- should_q_docchannel = BoolQuery(should_queries=[
- # RangeQuery("docchannel",51,105,True,True)
- TermQuery("docchannel",101),
- RangeQuery("docchannel",118,120,True,True)
- ]
- )
- should_q_keyword = BoolQuery(should_queries=[
- MatchPhraseQuery("doctitle",item["keyword"]),
- MatchPhraseQuery("doctextcon",item["keyword"]),
- MatchPhraseQuery("attachmenttextcon",item["keyword"])
- ])
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time",item["range_from"],item["range_to"],True,False),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- generateBoolShouldQuery(["docchannel"],[51, 52, 101, 118, 119, 120, 114, 51, 103],TermQuery),
- # TermQuery("docchannel",101),
- # should_q_docchannel,
- should_q_keyword
- # MatchPhraseQuery(item["type"], item["keyword"])
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=1, get_total_count=True),
- ColumnsToGet(return_type=ColumnReturnType.NONE))
- item["total_count"] = total_count
- if total_count>0:
- item["exists"] =1
- else:
- item["exists"] =0
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time",item["range_from"],item["range_to"],True,False),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- # TermQuery("docchannel",52),
- # should_q_docchannel,
- # should_q_keyword
- TermQuery("tenderee",item["keyword"])
- # MatchPhraseQuery(item["type"], item["keyword"])
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=1, get_total_count=True),
- ColumnsToGet(return_type=ColumnReturnType.NONE))
- item["zhaobiao_count"] = total_count
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time",item["range_from"],item["range_to"],True,False),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- # TermQuery("docchannel",101),
- # should_q_docchannel,
- # should_q_keyword,
- NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item["keyword"]))
- # MatchPhraseQuery(item["type"], item["keyword"])
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=1, get_total_count=True),
- ColumnsToGet(return_type=ColumnReturnType.NONE))
- item["zhongbiao_count"] = total_count
- range_from = "2024-08-01"
- range_to = "2024-11-01"
- _type = "doctitle"
- assert _type in ["doctitle","doctextcon"]
- list_dict_key_count = []
- filename = r"G:\新建文件夹\WeChat Files\wxid_kluerlj8cn3b21\FileStorage\File\2024-12\疑似少数据的中标企业名单.csv"
- with open(filename,"r",encoding="utf8") as f:
- while(True):
- line = f.readline()
- if not line:
- break
- line = line.strip()
- if line=="name":
- continue
- list_dict_key_count.append({"keyword":line,"count":0,"exists":0,"range_from":range_from,"range_to":range_to,"type":_type})
- # if len(list_dict_key_count)>=1000:
- # break
- # df = pd.read_csv(filename,encoding="ISO-8859-1")
- # for item in df["name"]:
- # list_dict_key_count.append({"keyword":item,"count":0,"exists":0,"range_from":range_from,"range_to":range_to,"type":_type})
- # str_keys = '''
- # 智慧税务
- # 发票管理
- #
- # '''
- # for item in re.split("\s|\r|\n|,|,|、",str_keys):
- # if item.strip()!="":
- # list_dict_key_count.append({"keyword":item,"total_count":0,"zhaobiao_count":0,"zhongbiao_count":0,"range_from":range_from,"range_to":range_to,"type":_type})
- task_queue = queue.Queue()
- for item in list_dict_key_count:
- task_queue.put(item)
- result_queue = queue.Queue()
- ots_client = getConnect_ots()
- # pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,ots_client=ots_client)
- mt.run()
- columns = ["keyword","total_count","exists","zhaobiao_count","zhongbiao_count","range_from","range_to","type"]
- df_data = {}
- for _c in columns:
- df_data[_c] = []
- for item in list_dict_key_count:
- for _c in columns:
- if _c in item:
- df_data[_c].append(item[_c])
- else:
- df_data[_c].append("")
- for k,v in df_data.items():
- print(k,len(v))
- df2 = pd.DataFrame(df_data)
- df2.to_excel("%s_数量导出全部类别.xlsx"%getCurrent_date("%Y-%m-%d_%H%M%S"),columns=columns)
- def export_keyword_title():
- ots_client = getConnect_ots()
- range_from = "2020-01-01"
- range_to = "2022-12-23"
- list_condition = [["医务室"],
- ["医院"],
- ["卫生院"],
- ["卫生所"],
- ["卫生室"],
- ["社区卫生服务中心"]]
- list_should_query = []
- for _c in list_condition:
- if len(_c)==1:
- list_should_query.append(MatchPhraseQuery("doctitle",_c[0]))
- else:
- _must_query = []
- for _q in _c:
- _must_query.append(MatchPhraseQuery("doctitle",_q))
- list_should_query.append(BoolQuery(must_queries=_must_query))
- keyword_query = BoolQuery(should_queries=list_should_query)
- bool_query = BoolQuery(must_queries=[RangeQuery("publishtime",range_from,range_to),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- keyword_query
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(["docid","doctitle","tenderee"],return_type=ColumnReturnType.SPECIFIED))
- df_data = {"docid":[],"doctitle":[],"tenderee":[]}
- def setData(df_data,rows):
- list_dict = getRow_ots(rows)
- for _dict in list_dict:
- docid = _dict.get("docid","")
- doctitle = _dict.get("doctitle","")
- tenderee = _dict.get("tenderee","")
- df_data["docid"].append(docid)
- df_data["doctitle"].append(doctitle)
- df_data["tenderee"].append(tenderee)
- setData(df_data,rows)
- _count = len(rows)
- while next_token:
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(["docid","doctitle","tenderee"],return_type=ColumnReturnType.SPECIFIED))
- setData(df_data,rows)
- _count += 100
- print(_count,total_count)
- file_begin = 0
- file_length = 100000
- _count = 0
- while file_begin<len(df_data["docid"]):
- _dict = dict()
- for _key,v in df_data.items():
- _dict[_key] = v[file_begin:file_begin+file_length]
- _count += 1
- file_begin += file_length
- df = pd.DataFrame(_dict)
- df.to_csv("../data/%s-%s_tenderee_doctitle_%d.csv"%(range_from,range_to,_count))
- def exportArticle_by_websource():
- # conn = getConnection_testmysql()
- # cursor = conn.cursor()
- # sql = "select web_source_no from web_source"
- # cursor.execute(sql)
- # rows = cursor.fetchmany(10)
- # dict_websource = dict()
- # while(rows):
- # for row in rows:
- # web_source_no = row[0]
- # dict_websource[web_source_no] = []
- # rows = cursor.fetchmany(1000)
- #
- # task_queue = queue.Queue()
- # for _key in dict_websource.keys():
- # task_queue.put({"key":_key,"list":dict_websource[_key]})
- #
- # pool_ots = ConnectorPool(init_num=100,max_num=1000,method_init=getConnect_ots)
- # result_queue = queue.Queue()
- # def _handle(item,result_queue,pool_ots):
- # _key = item["key"]
- # print(_key)
- # ots_client = pool_ots.getConnector()
- # bool_query = BoolQuery(must_queries=[RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- # TermQuery('web_source_no', '%s'%_key)
- # ])
- #
- # is_all_succeed = False
- #
- # while(not is_all_succeed):
- # try:
- # rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- # SearchQuery(bool_query, limit=100, get_total_count=True),
- # ColumnsToGet(["docid","docchannel","dochtmlcon"],return_type=ColumnReturnType.SPECIFIED))
- # list_zhaobiao = []
- # list_zhongbiao = []
- # for row in rows:
- # _dict = dict()
- # for values in row:
- # for _v in values:
- # _dict[_v[0]] = _v[1]
- # if _dict["docchannel"]==52:
- # list_zhaobiao.append(_dict)
- # elif _dict["docchannel"]==101:
- # list_zhongbiao.append(_dict)
- # item["list"].extend(list_zhaobiao[:5])
- # item["list"].extend(list_zhongbiao[:5])
- # except Exception as e:
- # print(str(e))
- #
- # pool_ots.putConnector(ots_client)
- #
- # mt = MultiThreadHandler(task_queue = task_queue,task_handler=_handle,result_queue=result_queue,thread_count=100,pool_ots=pool_ots)
- # mt.run()
- # df_data = {"docid":[],"web_source_no":[],"docchannel":[],"dochtmlcon":[]}
- # for k,v in dict_websource.items():
- # for item in v:
- # df_data["docid"].append(item["docid"])
- # df_data["web_source_no"].append(k)
- # df_data["docchannel"].append(item["docchannel"])
- # df_data["dochtmlcon"].append(item["dochtmlcon"])
- # df = pd.DataFrame(df_data)
- # df.to_csv("../data/websouce_doc.csv",columns=["docid","web_source_no","docchannel","dochtmlcon"],encoding="UTF8")
- df = pd.read_csv("../data/other/websouce_doc.csv")
- df_2000 = {"document_id":[],"document_text":[]}
- print("total_count",len(df["docid"]))
- begin = 230000
- end = 260000
- _count = 0
- for _id,_text in zip(df["docid"][begin:end],df["dochtmlcon"][begin:end]):
- if not isinstance(_text,str) or len(_text)>100000:
- continue
- df_2000["document_id"].append(_id)
- df_2000["document_text"].append(_text)
- df_2 = pd.DataFrame(df_2000)
- df_2.to_csv("../data/websouce_doc_%d-%d.csv"%(begin,end),columns=["document_id","document_text"],encoding="utf8",index=False)
- # save(dict_websource,"../data/dict_websource.pk")
- def getWinTenderer(sub_doc_json):
- if sub_doc_json is not None:
- sub_doc = json.loads(sub_doc_json)
- for _doc in sub_doc:
- if "win_tenderer" in _doc:
- return _doc["win_tenderer"]
- return ""
- def exportDocument_by_keywords(page_time,
- list_keyword = ["创客","STEAM","人工智能","课程服务","机器人中学","机器人小学","机器人幼儿园","机器人学校","Labplus","盛思","makeblock柴火","寓乐湾","美科科技","STEAM","能力风暴","优必选","蘑菇云","Dfrobot","中鸣","飞瑞敖","编程猫培生","八爪鱼","八爪鱼教育","童心制物"]):
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for _k in list_keyword:
- task_queue.put(_k)
- def _handle(keyword,result_queue):
- should_queries = []
- for _temp in [keyword]:
- should_queries.append(MatchPhraseQuery("doctitle",_temp))
- bool_query_keyword = BoolQuery(should_queries=should_queries)
- ots_client = getConnect_ots()
- bool_query = BoolQuery(must_queries=[RangeQuery('publishtime', range_from='2017-12-20'),
- MatchPhraseQuery("doctitle",keyword),
- TermQuery("docchannel","101")
- ])
- is_all_succeed = False
- _count = 0
- total_count = 1
- next_token = None
- def putRows(rows):
-     # parse each OTS row (primary key + attribute columns) into a dict and emit one result
-     for row in rows:
-         _dict = dict()
-         for values in row:
-             for _v in values:
-                 _dict[_v[0]] = _v[1]
-         result_queue.put({"docid":_dict.get("docid",""),"keyword":keyword,"tenderee":_dict.get("tenderee",""),"win_tenderer":getWinTenderer(_dict.get("sub_docs_json",None))})
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
-     SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
-     ColumnsToGet(["docid","tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
- putRows(rows)
- _count = len(rows)
- print(keyword,next_token,total_count)
- while(next_token):
-     try:
-         # fetch the next page first, then emit it: the original emitted the
-         # current page at the top of the loop, which duplicated the first
-         # page and dropped the last one
-         rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
-             SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
-             ColumnsToGet(["docid","tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
-         putRows(rows)
-         _count += len(rows)
-         print("%s:%d/%d"%(keyword,_count,total_count))
-     except Exception as e:
-         traceback.print_exc()
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
- mt.run()
- df_data = {"docid":[],"keyword":[],"tenderee":[],"win_tenderer":[]}
- while(True):
- try:
- item = result_queue.get(block=True,timeout=1)
- for _k in df_data.keys():
- if _k in item:
- df_data[_k].append(item[_k])
- else:
- df_data[_k].append("")
- except queue.Empty as e:
- break
- except Exception as e:
- traceback.print_exc()
- df = pd.DataFrame(df_data)
- df.to_csv("../data/exportArticle1_title.csv",columns=["docid","keyword","tenderee","win_tenderer"])
- def exportGovement():
- should_queries1 = []
- for _temp in ["教育局","地化所","税务局","国土局","学校","大学","中学","小学","幼儿园","医院"]:
- should_queries1.append(WildcardQuery("tenderee","*%s*"%_temp))
- should_queries2 = []
- for _temp in ["浙江","江苏","湖北","西北","陕西","甘肃","青海","宁夏","新疆","重庆","四川","云南","贵州"]:
- should_queries2.append(WildcardQuery("province","*%s*"%_temp))
- ots_client = getConnect_ots()
- page_time = "2020-12"
- bool_query = BoolQuery(must_queries=[BoolQuery(should_queries=should_queries1),
- BoolQuery(should_queries=should_queries2),
- TermQuery("docchannel","52"),
- RangeQuery("publishtime",page_time)])
- columns = ["tenderee","tenderee_contact","tenderee_phone"]
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=100, sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- print(total_count)
- def getRow(rows,df_data,columns):
- for row in rows:
- _dict = dict()
- for part in row:
- for item in part:
- _dict[item[0]] = item[1]
- if "tenderee_contact" in _dict and "tenderee_phone" in _dict:
- for key in columns:
- df_data[key].append(_dict.get(key,""))
- all_rows = 0
- df_data = {}
- for key in columns:
- df_data[key] = []
- getRow(rows,df_data,columns)
- _count = 100
- while(next_token):
- print(_count,total_count)
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100,get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- _count += 100
- getRow(rows,df_data,columns)
- df2 = pd.DataFrame(df_data)
- df2.to_excel("../data/%s政府招标人导出数据.xlsx"%page_time,columns=columns)
- def export_attachment():
- filename = "运营商20240417(1).xlsx"
- df = pd.read_excel(filename)
- auth = oss2.Auth(os.environ.get("OSS_ACCESS_KEY_ID"), os.environ.get("OSS_ACCESS_KEY_SECRET"))  # AccessKey pair read from the environment (hard-coded credentials removed)
- bucket_url = "http://oss-cn-hangzhou.aliyuncs.com"
- attachment_bucket_name = "attachment-hub"
- bucket = oss2.Bucket(auth,bucket_url,attachment_bucket_name)
- ots_client = getConnect_ots()
- list_query = []
- for _title,_no,tenderee,win_tenderer in zip(df["新标题"],df["项目编号"],df["新招采单位"],df["新中标单位"]):
- _dict = {"title":_title,
- "project_code":_no,
- "tenderee":tenderee,
- "win_tenderer":win_tenderer}
- list_query.append(_dict)
- def _handle(_dict,result_queue):
-     title = _dict["title"]
-     # try each secondary condition in turn (project code, then tenderee, then
-     # winner); the original repeated this block three times verbatim, so it is
-     # folded into one loop with identical behavior
-     for _cond in [_dict["project_code"],_dict["tenderee"],_dict["win_tenderer"]]:
-         if not isinstance(_cond,str):
-             continue
-         bool_query = BoolQuery(must_queries=[
-             generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[title],MatchPhraseQuery),
-             generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[_cond],MatchPhraseQuery),
-             NestedQuery("page_attachments",WildcardQuery("page_attachments.fileMd5","*")),
-             RangeQuery("page_time","2022-01-01")
-         ])
-         rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
-             SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]), limit=100, get_total_count=True),
-             ColumnsToGet(["page_attachments"],return_type=ColumnReturnType.SPECIFIED))
-         for _data in getRow_ots(rows):
-             docid = _data["docid"]
-             page_attachments = _data["page_attachments"]
-             for _attach in json.loads(page_attachments):
-                 filemd5 = _attach["fileMd5"]
-                 consumed, return_row, _next = ots_client.get_row("attachment",[("filemd5",filemd5)],columns_to_get=["classification","path"])
-                 dict_k = getRow_ots_primary(return_row)
-                 # keep the first document whose attachment is classified as a bid document
-                 if dict_k is not None and dict_k.get("classification")=="招标文件":
-                     _dict["docid"] = docid
-                     _dict["path"] = dict_k.get("path")
-                     break
-             if "docid" in _dict:
-                 break
-         if "docid" in _dict:
-             break
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for item in list_query:
- task_queue.put(item)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
- mt.run()
- list_data = []
- for item in list_query:
- list_data.append([item.get("title"),item.get("docid")])
- print(item.get("title"),item.get("docid"),"docid" in item)
- path = item.get("path")
- if path is not None:
- try:
- oss2.resumable_download(bucket,path,"附件/%s.%s"%(re.sub("[<>\[\]{}/\';,.‘、\"]",'',item.get("title")),path.split(".")[-1]))
- except Exception as e:
- print("download error %s %d"%(item.get("title"),item.get("docid")))
- df = pd.DataFrame(list_data)
- df.to_excel("a.xlsx")
- def exportIndustryCount():
- import codecs
- time_from = "2020-12-21"
- time_to = "2020-12-25"
- # dict_channel = {"51":{"type":"公告变更"},
- # "52":{"type":"招标公告"},
- # "101":{"type":"中标信息"},
- # "102":{"type":"招标预告"},
- # "103":{"type":"招标答疑"},
- # "104":{"type":"招标文件"},
- # "105":{"type":"资审结果"},
- # "103":{"type":"招标控制价"},
- # "100":{"type":"未知类型"}}
- dict_industry = {}
- meta_industry = load("../data/other/class2dalei_menlei.pkl")
- for _key in meta_industry.keys():
- dict_industry[_key] = {"type":_key}
- print(dict_industry.keys())
- return  # NOTE: early return; the industry count export below is currently unreachable
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for _key in dict_industry.keys():
- task_queue.put(dict_industry[_key])
- def _handle(item,result_queue,pool_ots):
- ots_client = pool_ots.getConnector()
- bool_query = BoolQuery(must_queries=[TermQuery("info_type",item["type"]),
- RangeQuery("publishtime",time_from,time_to,include_lower=True,include_upper=True)])
- columns = ["docid"]
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=1,get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- item["count"] = total_count
- columns = ["dochtmlcon"]
- bool_query = BoolQuery(must_queries=[TermQuery("info_type",item["type"])])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=10,sort=Sort(sorters=[FieldSort("publishtime",SortOrder.ASC)]),get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- for row in rows:
- _dict = dict()
- for part in row:
- for v in part:
- _dict[v[0]] = v[1]
- with codecs.open("../data/industry/%s_%d.html"%(item["type"],_dict["docid"]),"w",encoding="UTF8") as f:
- f.write(_dict["dochtmlcon"])
- pool_ots.putConnector(ots_client)
- pool_ots = ConnectorPool(init_num=20,max_num=30,method_init=getConnect_ots)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
- mt.run()
- columns = ["type","count"]
- df_data = {}
- for _c in columns:
- df_data[_c] = []
- for _indus in dict_industry.keys():
- for _c in columns:
- df_data[_c].append(dict_industry[_indus][_c])
- df = pd.DataFrame(df_data)
- # df.to_excel("../data/%s-%s_industry_count.xlsx"%(time_from,time_to),columns=columns)
- df.to_csv("../data/%s-%s_industry_count.xlsx"%(time_from,time_to),columns=columns)
- def exportDocument_By_time(time_from,time_to,columns=["docid","doctitle","project_name","dochtmlcon"]):
- '''
- export documents whose page_time falls within [time_from,time_to] to an excel file
- :return:
- '''
- ost_client = getConnect_ots()
- last_docid = 0
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time",time_from,time_to,include_lower=True,include_upper=True),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True)])
- rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- _count = len(rows)
- df_data = {}
- def getData(df_data,rows):
- list_dict = getRow_ots(rows)
- for _dict in list_dict:
- for _k,_v in _dict.items():
- if _k not in df_data:
- df_data[_k] = []
- df_data[_k].append(getLegal_str(_v))
- getData(df_data,rows)
- while(next_token):
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- _count += len(rows)
- getData(df_data,rows)
- df = pd.DataFrame(df_data)
- df.to_excel("%s/%s-%s公告信息.xlsx"%(data_path,time_from,time_to),columns=columns)
- def processDocument():
- filename = "../data/2021-01-29-2021-01-29公告信息.xlsx"
- df = pd.read_excel(filename)
- df.to_csv("../data/2021-01-29-2021-01-29公告信息.csv")
- return  # NOTE: early return; the mysql insert test below does not run
- list_dict = []
- for docid,doctitle,project_name,dochtmlcon in zip(df["docid"],df["doctitle"],df["project_name"],df["dochtmlcon"]):
- list_dict.append({"docid":docid,"doctitle":doctitle,"project_name":project_name,"dochtmlcon":dochtmlcon})
- task_queue = queue.Queue()
- for _dict in list_dict:
- task_queue.put(_dict)
- result_queue = queue.Queue()
- def _handle(_dict,result_queue,pool_mysql):
- conn = pool_mysql.getConnector()
- cursor = conn.cursor()
- sql = "insert into test_extract(docid,doctitle,page_time) values(%d,%s,%s)"%(_dict["docid"],_dict["doctitle"],_dict["dochtmlcon"])
- cursor.execute(sql)
- conn.commit()
- pool_mysql.putConnector(conn)
- # url = "http://192.168.2.101:15030"
- # myheaders = {'Content-Type': 'application/json'}
- # print(int(_dict["docid"]))
- # data = {"doc_id":int(_dict["docid"]),"title":_dict["doctitle"],"content":_dict["dochtmlcon"]}
- # resp = requests.post(url,json=data,headers=myheaders, verify=True)
- # result = json.loads(resp.content.decode("utf8"),"utf8")
- # _dict["product"] = result["product"]
- pool_mysql = ConnectorPool(init_num=20,max_num=30,method_init=getConnection_testmysql)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=5,pool_mysql=pool_mysql)
- mt.run()
- # columns = ["docid","doctitle","project_name","product"]
- #
- # df_data = {}
- # for _c in columns:
- # df_data[_c] = []
- # for _dict in list_dict:
- # for _c in columns:
- # df_data[_c].append(_dict.get(_c,""))
- # df = pd.DataFrame(df_data)
- # df.to_excel("%s.product.xlsx"%(filename),columns=columns)
- def export_extract_check():
- '''
- :return: export the extract-check results and generate a report
- '''
- conn = getConnection_testmysql()
- cursor = conn.cursor()
- sql = " select docid,json_result from exportdb.extract_check "
- cursor.execute(sql)
- dict_global = {}
- df_global = {"key_type":[],"online_count":[],"test_count":[],"diff_count":[],"diff_percent":[]}
- df_document = {"docid":[]}
- while True:
- rows = cursor.fetchmany(10000)
- if not rows:
- break
- for docid,json_result in rows:
- df_document["docid"].append(docid)
- _result = json.loads(json_result)
- for k,v in _result.items():
- key = k.split("_")
- _key = "_".join(key[:-1])
- if "punish" in _key or "complainants" in _key or "institutions" in _key:
- continue
- if k not in df_document:
- df_document[k] = []
- df_document[k].append(v)
- key_type = key[-1]
- if _key not in dict_global:
- dict_global[_key] = {}
- if key_type not in dict_global[_key]:
- dict_global[_key][key_type] = 0
- if key_type=="diff":
- dict_global[_key][key_type] += v
- if key_type in ("online","test"):
- if isinstance(v,str):
- if v!="":
- dict_global[_key][key_type] += 1
- elif isinstance(v,list):
- dict_global[_key][key_type] += len(v)
- for k,v in dict_global.items():
- df_global["key_type"].append(k)
- df_global["online_count"].append(v["online"])
- df_global["test_count"].append(v["test"])
- df_global["diff_count"].append(v["diff"])
- df_global["diff_percent"].append(v["diff"]/v["online"] if v["online"]>0 else 0)
- filename = "../data/%s_extract_check.xlsx"%(time.strftime("%Y-%m-%d"))
- with pd.ExcelWriter(filename) as writer:
- df1 = pd.DataFrame(df_global)
- df1.to_excel(writer,sheet_name="global")
- for k,v in df_document.items():
- print(k,len(v))
- df2 = pd.DataFrame(df_document)
- df2.to_excel(writer,sheet_name="document")
- # the with-block saves and closes the writer on exit, so explicit
- # writer.save()/writer.close() calls are not needed here
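- # Worked example of the aggregation above (illustrative numbers): for a field
- # whose accumulated counts are online=200, test=190, diff=15, the "global"
- # sheet row reads online_count=200, test_count=190, diff_count=15 and
- # diff_percent=15/200=0.075.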
- def exportDocument_dump():
- # filename = "../data/重复公告.xlsx"
- # df = pd.read_excel(filename)
- ots_client = getConnect_ots()
- columns = ["docid","docchannel","page_time","web_source_no","doctitle","tenderee","agency","project_code","project_name","sub_docs_json"]
- df_keys = ["docid","docchannel","page_time","web_source_no","doctitle","doctitle_refine","tenderee","agency","project_code","project_name","bidding_budget","win_bid_price","win_tenderer","URL"]
- df_data = {}
- for _key in df_keys:
- df_data[_key] = []
- bool_query = BoolQuery(must_queries=[TermQuery("page_time","2021-03-03"),
- RangeQuery("status",201,300,True,True)])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- def getData(df_data,rows):
- list_data = getRow_ots(rows)
- for row in list_data:
- dict_find = {}
- for _key in df_keys:
- dict_find[_key] = 0
- for _k,_v in row.items():
- if _k in df_keys:
- dict_find[_k] = 1
- if _k=="project_code":
- _v = '"%s"'%_v
- df_data[_k].append(_v)
- doctitle = row.get("doctitle","")
- df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '', doctitle))
- df_data["URL"].append("http://www.bidizhaobiao.com/info-%d.html"%(row["docid"]))
- dict_find["URL"] = 1
- dict_find["doctitle_refine"] = 1
- sub_docs_json = row.get("sub_docs_json","[{}]")
- doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
- if sub_docs_json is not None:
- for sub_docs in json.loads(sub_docs_json):
- for _key_sub_docs in sub_docs.keys():
- if _key_sub_docs in doc_columns:
- if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
- if _key_sub_docs in ["bidding_budget","win_bid_price"]:
- if float(sub_docs[_key_sub_docs])>0:
- doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
- else:
- doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
- for _k,_v in doc_columns.items():
- dict_find[_k] = 1
- df_data[_k].append(_v)
- for _k,_v in dict_find.items():
- if _v==0:
- df_data[_k].append("")
- _count = len(rows)
- getData(df_data,rows)
- while next_token:
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- getData(df_data,rows)
- _count += len(rows)
- # for docid in df["docid"]:
- # bool_query = BoolQuery(must_queries=[TermQuery("docid",int(docid))])
- #
- # rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- # SearchQuery(bool_query , limit=100, get_total_count=True),
- # ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- # list_data = getRow_ots(rows)
- # if len(list_data)>0:
- # dict_find = {}
- # for _key in df_keys:
- # dict_find[_key] = 0
- # for _k,_v in list_data[0].items():
- # if _k in df_keys:
- # dict_find[_k] = 1
- # df_data[_k].append(_v)
- # doctitle = list_data[0].get("doctitle","")
- # df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '', doctitle))
- # dict_find["doctitle_refine"] = 1
- # sub_docs_json = list_data[0].get("sub_docs_json","[{}]")
- # doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
- # if sub_docs_json is not None:
- # for sub_docs in json.loads(sub_docs_json):
- # for _key_sub_docs in sub_docs.keys():
- # if _key_sub_docs in doc_columns:
- # if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
- # if _key_sub_docs in ["bidding_budget","win_bid_price"]:
- # if float(sub_docs[_key_sub_docs])>0:
- # doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
- # else:
- # doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
- # for _k,_v in doc_columns.items():
- # dict_find[_k] = 1
- # df_data[_k].append(_v)
- # for _k,_v in dict_find.items():
- # if _v==0:
- # df_data[_k].append("")
- df1 = pd.DataFrame(df_data)
- df1.to_csv("../data/0303去重.csv",columns=df_keys)
- def exportDocument_dump_mysql():
- conn = getConnection_testmysql()
- cursor = conn.cursor()
- columns = ["project_code","doctitle","doctitle_refine","tenderee","agency","project_name","win_bid_price","bidding_budget","page_time","docchannel","web_source_no","win_tenderer","group_id","docid"]
- df_data = {}
- for _c in columns:
- df_data[_c] = []
- sql = " select "+",".join(columns)+" from run_dumplicate_document_his where group_id in (select group_id from run_dumplicate_document_his group by group_id having count(1)>1)"
- cursor.execute(sql)
- while True:
- rows = cursor.fetchmany(100000)
- if not rows:
- break
- for row in rows:
- for _i in range(len(columns)):
- df_data[columns[_i]].append(row[_i])
- df = pd.DataFrame(df_data)
- df.to_csv("../data/0304去重.csv",columns=["group_id","docid","project_code","doctitle","doctitle_refine","tenderee","agency","project_name","win_bid_price","bidding_budget","page_time","docchannel","web_source_no","win_tenderer"])
- print(cursor.description)
- def getDict_docchannel():
- filename = "docchannel.pk"
- if os.path.exists(filename):
- _dict = load(filename)
- return _dict
- conn = getConnection_mysql()
- cursor = conn.cursor()
- sql = "select channel_id,chnlname from sys_channel "
- cursor.execute(sql)
- rows = cursor.fetchall()
- _dict = dict()
- for row in rows:
- _dict[row[0]] = row[1]
- save(_dict,filename)
- return _dict
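- # Usage note: getDict_docchannel() memoizes the channel_id -> name mapping in
- # docchannel.pk via the save/load pickle helpers, so only the first call hits
- # MySQL; deleting the file refreshes the cache.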
- def exportDocument_by_doctitle():
- def timeAdd_minute(_time,minutes):
- a = time.mktime(time.strptime(_time,'%Y-%m-%d'))+60*minutes
- _time1 = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(a))
- return _time1
- columns = ["docid","attachmenttextcon","doctitle","docchannel","bidway","province","city","district","info_type","page_time","crtime","project_code","tenderee","project_name","agency","sub_docs_json","tenderee_contact","tenderee_phone","doctextcon","product","moneysource","time_bidclose","time_bidopen"]
- columns = ["doctitle","attachmenttextcon","doctextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no"]
- columns = ["doctitle","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no"]
- dict_channel = getDict_docchannel()
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- list_keyword = []
- # for _keyword in re.split("\s|/|、",str_keyword):
- # if len(_keyword.strip())==0 and isinstance(_keyword,(str)):
- # continue
- # print(_keyword)
- # item = {"keyword":_keyword.strip()}
- # list_keyword.append(item)
- # task_queue.put(item)
- filename = "有效调用次数统计_20220830_v1.xlsx"
- df = pd.read_excel(filename)
- for company in df["enterpriseName"]:
- _dict = {"keyword":company}
- task_queue.put(_dict)
- list_keyword.append(_dict)
- start_day = "2019-01-01"
- count_days = 90
- # for _i in range(count_days):
- #
- # current_date = timeAdd(start_day,_i)
- # for _j in range(24*6):
- # start_minute = timeAdd_minute(current_date,10*_j)
- # end_minute = timeAdd_minute(current_date,10*(_j+1))
- #
- # item = {"start_minute":start_minute,"end_minute":end_minute}
- # list_keyword.append(item)
- # task_queue.put(item)
- ots_client = getConnect_ots()
- def _handle(item,result_queue,ots_client):
- # should_q_keyword = BoolQuery(should_queries=[
- # # MatchPhraseQuery("tenderee",item["keyword"]),
- # # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item["keyword"])),
- # # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.second_tenderer",item["keyword"])),
- # # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.third_tenderer",item["keyword"]))
- # MatchPhraseQuery("doctextcon",item["keyword"]),
- # MatchPhraseQuery("doctitle",item["keyword"]),
- # MatchPhraseQuery("attachmenttextcon",item["keyword"])
- # ])
- #
- # should_q2 = BoolQuery(should_queries=[WildcardQuery('province', '%s*'%"广东")
- # # ,WildcardQuery('province', '%s*'%"湖南")
- # # ,WildcardQuery('province', '%s*'%"广西")
- # ])
- #
- # should_q_tenderee = BoolQuery(should_queries=[WildcardQuery("tenderee","*中学*"),
- # WildcardQuery("tenderee","*大学*"),
- # WildcardQuery("tenderee","*小学*"),
- # WildcardQuery("tenderee","*教育局*")])
- bool_query = BoolQuery(must_queries=[
- # RangeQuery("page_time","2019-01-01","2023-01-01"),
- # TermQuery("page_time","2022-02-18"),
- generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[item["keyword"]],MatchPhraseQuery),
- # RangeQuery("crtime",item["start_minute"],item["end_minute"])
- RangeQuery("status",151,300,True,True),
- # TermQuery("tenderee",item["keyword"])
- # ,TermQuery("docchannel",52)
- # ,should_q_keyword
- # ,should_q_tenderee
- # ,should_q2
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("crtime",SortOrder.DESC)]), limit=1, get_total_count=True),
- ColumnsToGet(return_type=ColumnReturnType.NONE))
- item["total_count"] = total_count
- list_data = getRow_ots(rows)
- for _data in list_data:
- _data["keyword"] = item["keyword"]
- result_queue.put(_data)
- _count = len(list_data)
- while next_token:
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- _count += len(list_data)
- for _data in list_data:
- _data["keyword"] = item["keyword"]
- result_queue.put(_data)
- mt = MultiThreadHandler(task_queue,_handle,result_queue,40,ots_client=ots_client)
- mt.run()
- set_docid = set()
- list_item = []
- # for item in list_keyword:
- # print(item["keyword"])
- total_count = 0
- for item in list_keyword:
- total_count += item["total_count"]
- print(item["total_count"])
- print("total_count:%d"%(total_count))
- keys = list_keyword[0].keys()
- df_data = {}
- for item in list_keyword:
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- dumplicate = False
- try:
- while True:
- _dict = result_queue.get(False)
- _docid = _dict.get("docid")
- if _docid in set_docid and not dumplicate:
- continue
- set_docid.add(_docid)
- list_item.append(_dict)
- except Exception as e:
- print(e)
- list_docid = list(set_docid)
- with open("list_docid.txt","w",encoding="utf8") as f:
- for _docid in list_docid:
- f.write(str(_docid))
- f.write("\n")
- f.flush()
- # log("get document taotal_count:%d"%len(list_item))
- # set_line = set()
- # getRowData(df_data,list_item,set_line,list_keyword,dict_channel,dumplicate)
- # set_enterprise = set()
- # for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
- # set_enterprise.add(_tenderee)
- # set_enterprise.add(_agency)
- # set_enterprise.add(_win_tenderer)
- # if "" in set_enterprise:
- # set_enterprise.remove("")
- # if None in set_enterprise:
- # set_enterprise.remove(None)
- # dict_enterprise = getDictEnterprise(list(set_enterprise))
- # if len(set_enterprise)>0:
- # for _i in range(len(df_data["招标单位"])):
- # # _enterprise_name = df_data["招标单位"][_i]
- # # if df_data["招标联系人电话"][_i]=="":
- # # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- # # if contacts is not None:
- # # _person,_phone = getOneContact(contacts)
- # # df_data["招标联系人"][_i] = _person
- # # df_data["招标联系人电话"][_i] = _phone
- #
- # _enterprise_name = df_data["代理单位"][_i]
- # if df_data["代理联系人电话"][_i]=="":
- # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- # if contacts is not None:
- # _person,_phone = getOneContact(contacts)
- # df_data["代理联系人"][_i] = _person
- # df_data["代理联系人电话"][_i] = _phone
- #
- # _enterprise_name = df_data["中标单位"][_i]
- # if df_data["中标单位联系电话"][_i]=="":
- # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- # if contacts is not None:
- # _person,_phone = getOneContact(contacts)
- # df_data["中标单位联系人"][_i] = _person
- # df_data["中标单位联系电话"][_i] = _phone
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s_export11.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")))
- set_columns = set()
- list_df_columns = []
- def set_dict_item(_dict,name,v):
- _dict[name] = getLegal_str(v)
- if name not in set_columns:
- set_columns.add(name)
- list_df_columns.append(getLegal_str(name))
- def set_dict_item_columns(set_columns1,list_df_columns1,_dict,name,v):
- _dict[name] = getLegal_str(v)
- if name not in set_columns1:
- set_columns1.add(name)
- list_df_columns1.append(name)
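- # Illustrative ordering contract: set_dict_item records each column name the
- # first time it is seen, so after
- #   item = {}; set_dict_item(item,"docid",1); set_dict_item(item,"公告标题","t")
- # list_df_columns == ["docid","公告标题"]; the to_excel(columns=list_df_columns)
- # calls below rely on this first-seen order.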
- def exportDocument_medicine(start_time,end_time):
- # filename = "../data/重复公告.xlsx"
- # df = pd.read_excel(filename)
- ots_client = getConnect_ots()
- # columns = ["doctitle","docchannel","time_bidopen","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]
- columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen"]
- set_enter = set()
- str_enter = '''
- 北京嘉和美康信息技术有限公司
- 阿里健康科技(中国)有限公司
- 北大医疗信息技术有限公司
- 创业慧康科技股份有限公司
- 东华医为科技有限公司
- 望海康信(北京)科技股份公司
- 国新健康保障服务有限公司
- 南京海泰医疗信息系统有限公司
- 南京海泰信息技术有限公司
- 浙江和仁科技股份有限公司
- 北京惠每科技有限公司
- 金蝶医疗软件科技有限公司
- 北京京东健康有限公司
- 四川久远银海软件股份有限公司
- 零氪科技(北京)有限公司
- 北京麦迪克斯科技有限公司
- 苏州麦迪斯顿医疗科技股份有限公司
- 江苏曼荼罗软件股份有限公司
- 北京平安联想智慧医疗信息技术有限公司
- 青岛百洋智能科技股份有限公司
- 上海森亿医疗科技有限公司
- 万达信息股份有限公司
- 微医集团(浙江)有限公司
- 卫宁健康科技集团股份有限公司
- 心医国际数字医疗系统(大连)有限公司
- 医渡云(北京)技术有限公司
- 医惠科技有限公司
- 易联众信息技术股份有限公司
- 智业软件股份有限公司
- 中电数据服务有限公司
- 重庆中联信息产业有限责任公司
- 杭州卓健信息科技股份有限公司
- 大连万达集团股份有限公司
- '''
- for a in re.split("\s+",str_enter):
- if a.strip()!="":
- set_enter.add(a.strip())
- dict_channel = getDict_docchannel()
- # list_province = ["江西","湖南","四川","安徽"]
- list_province = ["全国"]
- for _province in list_province:
- df_data = {}
- str_p = '''
- 智慧医疗系统 医院信息系统 临床路径 医院系统 医院管理软件
- 县域医共体 远程医疗 医院管理系统 医疗信息化 临床医疗
- 数据集成 云医院 智慧卫生 卫生信息系统 医疗数字化
- 临床应用
- '''
- list_prov = re.split("\s",str_p)
- list_mu = []
- for _p in list_prov:
- if _p.strip()=="":
- continue
- print(_p)
- list_mu.append(MatchPhraseQuery('doctextcon', '%s'%_p.strip()))
- s_tenderee = '医院、卫生院、疗养院、健康局、卫生局'
- list_should_ten = []
- for _p in re.split("、",s_tenderee):
- if _p.split()=="":
- continue
- list_should_ten.append(WildcardQuery("tenderee","*%s*"%_p.strip()))
- list_should_win = []
- for _win in list(set_enter):
- list_should_win.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_win)))
- list_should_chan = []
- list_should_chan.append(TermQuery("docchannel",52))
- list_should_chan.append(TermQuery("docchannel",101))
- list_should_chan.append(TermQuery("docchannel",102))
- should_q1 = BoolQuery(should_queries=list_mu)
- should_q2 = BoolQuery(should_queries=list_should_ten)
- should_q3 = BoolQuery(should_queries=list_should_chan)
- bool_query = BoolQuery(must_queries=[
- BoolQuery(should_queries=list_should_win),
- RangeQuery("page_time",start_time,end_time,include_lower=True,include_upper=True),
- RangeQuery("status",151,300,True,True),
- # should_q1,
- # should_q2,
- # should_q3,
- ],
- # must_not_queries=[
- # MatchPhraseQuery("doctextcon","器械"),
- # MatchPhraseQuery("doctextcon","仪器"),
- # ]
- )
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- set_line = set()
- _count = len(rows)
- # getData(df_data,rows,set_line)
- list_row = getRow_ots(rows)
- getRowData(df_data,list_row,set_line,[],dict_channel,False)
- while next_token:
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- list_row = getRow_ots(rows)
- getRowData(df_data,list_row,set_line,[],dict_channel,False)
- _count += len(rows)
- if len(list(df_data.keys()))>0:
- if len(df_data[list(df_data.keys())[0]])>=300:
- break
- set_enterprise = set()
- for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
- set_enterprise.add(_tenderee)
- set_enterprise.add(_agency)
- set_enterprise.add(_win_tenderer)
- if "" in set_enterprise:
- set_enterprise.remove("")
- if None in set_enterprise:
- set_enterprise.remove(None)
- dict_enterprise = getDictEnterprise(list(set_enterprise))
- if len(set_enterprise)>0:
- for _i in range(len(df_data["招标单位"])):
- _enterprise_name = df_data["招标单位"][_i]
- if df_data["招标联系人电话"][_i]=="":
- contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- if contacts is not None:
- _person,_phone = getOneContact(contacts)
- df_data["招标联系人"][_i] = _person
- df_data["招标联系人电话"][_i] = _phone
- _enterprise_name = df_data["代理单位"][_i]
- if df_data["代理联系人电话"][_i]=="":
- contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- if contacts is not None:
- _person,_phone = getOneContact(contacts)
- df_data["代理联系人"][_i] = _person
- df_data["代理联系人电话"][_i] = _phone
- _enterprise_name = df_data["中标单位"][_i]
- if df_data["中标单位联系电话"][_i]=="":
- contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- if contacts is not None:
- _person,_phone = getOneContact(contacts)
- df_data["中标单位联系人"][_i] = _person
- df_data["中标单位联系电话"][_i] = _phone
- return df_data  # NOTE: returns inside the province loop; the df1 export below never runs
- df1 = pd.DataFrame(df_data)
- print(len(df_data["docid"]))
- df1.to_excel("../data/%s_周五医疗数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
- def getRowDataWithKey(df_data,rows,columns):
- global list_df_columns
- list_df_columns = columns
- for row in rows:
- for c in columns:
- if c not in df_data:
- df_data[c] = []
- df_data[c].append(row.get(c))
- def getRowData(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
- dict_line = {}
- # list_data = getRow_ots(rows)
- _index = 0
- for row in rows:
- _index += 1
- item = {}
- _dict = row
- set_dict_item(item,"docid",_dict.get("docid",""))
- # set_dict_item(item,"attachment_extract_status",_dict.get("attachment_extract_status",""))
- set_dict_item(item,"crtime",_dict.get("crtime",""))
- # set_dict_item(item,"要素数",_dict.get("extract_count",0))
- set_dict_item(item,"公告标题",_dict.get("doctitle",""))
- set_dict_item(item,"web_source_no",_dict.get("web_source_no",""))
- set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
- set_dict_item(item,"正文实体",_dict.get("nlp_enterprise",""))
- set_dict_item(item,"附件实体",_dict.get("nlp_enterprise_attachment",""))
- # set_dict_item(item,"web_source_name",_dict.get("web_source_name",""))
- # set_dict_item(item,"原公告类别",dict_channel.get(_dict.get("original_docchannel",""),""))
- set_dict_item(item,"公告内容",getLegal_str(_dict.get("doctextcon","")))
- set_dict_item(item,"附件内容",getLegal_str(_dict.get("attachmenttextcon","")))
- if "keyword" in _dict:
- set_dict_item(item,"关键词",_dict["keyword"])
- else:
- _wholeword = re.sub("\s","",str(row.get("doctitle","")+row.get("doctextcon","")[:30000]+row.get("attachmenttextcon","")[:30000]).replace("(","(").replace(")",")"))
- _pattern = "|".join([re.escape(str(a).replace("(","(").replace(")",")")) for a in list_keyword])
- set_dict_item(item,"关键词",",".join(list(set(re.findall(_pattern,_wholeword)))))
- # set_dict_item(item,"关键词",_dict.get("keyword",""))
- set_dict_item(item,"产品",_dict.get("product",""))
- set_dict_item(item,"服务期限",_dict.get("service_time",""))
- set_dict_item(item,"省份",_dict.get("province",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item(item,"城市",_dict.get("city",""))
- set_dict_item(item,"区县",_dict.get("district",""))
- set_dict_item(item,"发布时间",_dict.get("page_time",""))
- set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
- set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
- # set_dict_item(item,"创建时间",_dict.get("crtime",""))
- set_dict_item(item,"招标方式",_dict.get("bidway",""))
- set_dict_item(item,"行业一级分类",_dict.get("industry",""))
- set_dict_item(item,"行业二级分类",_dict.get("info_type",""))
- set_dict_item(item,"uuid",_dict.get("uuid"))
- # set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
- set_dict_item(item,"项目编号",_dict.get("project_code",""))
- set_dict_item(item,"项目名称",_dict.get("project_name",""))
- set_dict_item(item,"招标单位",_dict.get("tenderee",""))
- set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
- set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
- set_dict_item(item,"代理单位",_dict.get("agency",""))
- set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
- set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
- set_dict_item(item,"评审专家",_dict.get("person_review",""))
- set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
- set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
- set_dict_item(item,"获取文件开始时间",_dict.get("time_get_file_start",""))
- set_dict_item(item,"获取文件结束时间",_dict.get("time_get_file_end",""))
- set_dict_item(item,"保证金递交开始时间",_dict.get("time_earnest_money_start",""))
- set_dict_item(item,"保证金递交结束时间",_dict.get("time_earnest_money_end",""))
- sub_docs_json = _dict.get("sub_docs_json")
- set_tenderer = set()
- multi_win = []
- multi_package_win = []
- win_joint = []
- if sub_docs_json is not None:
- docs = json.loads(sub_docs_json)
- docs.sort(key=lambda x:float(x.get("win_bid_price") or 0))
- for _doc in docs:
- if "win_tenderer" in _doc:
- set_dict_item(item,"中标单位",_doc["win_tenderer"])
- multi_package_win.append(_doc["win_tenderer"])
- if "second_tenderer" in _doc:
- set_dict_item(item,"第二候选单位",_doc["second_tenderer"])
- set_tenderer.add(_doc.get("second_tenderer"))
- if "third_tenderer" in _doc:
- set_dict_item(item,"第三候选单位",_doc["third_tenderer"])
- set_tenderer.add(_doc.get("third_tenderer"))
- if "win_tenderer_manager" in _doc:
- set_dict_item(item,"中标单位联系人",_doc["win_tenderer_manager"])
- if "win_tenderer_phone" in _doc:
- set_dict_item(item,"中标单位联系电话",_doc["win_tenderer_phone"])
- if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
- set_dict_item(item,"中标金额",_doc["win_bid_price"])
- if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
- set_dict_item(item,"招标金额",_doc["bidding_budget"])
- if "win_tenderer_joint" in _doc:
- win_tenderer_joints = _doc.get("win_tenderer_joint","").split(",")
- for _joints in win_tenderer_joints:
- win_joint.append(_joints)
- if "win_tenderer_joints" in _doc:
- win_tenderer_joints = json.loads(_doc["win_tenderer_joints"])
- for _win_joint in win_tenderer_joints:
- win_joint.append(_win_joint.get("name",""))
- if "multi_winner" in _doc:
- multi_winners = _doc.get("multi_winner","").split(",")
- for _mul in multi_winners:
- multi_win.append(_mul)
- if "multi_winners" in _doc:
- multi_winners = json.loads(_doc["multi_winners"])
- for multi_w in multi_winners:
- multi_win.append(multi_w.get("name",""))
- set_dict_item(item,"多标段中标人",",".join(list(multi_package_win)))
- set_dict_item(item,"多中标人",",".join(list(multi_win)))
- set_dict_item(item,"联合中标人",",".join(list(win_joint)))
- set_dict_item(item,"入围供应商",",".join(list(set_tenderer)))
- if "第二候选单位" not in item:
- set_dict_item(item,"第二候选单位","")
- if "第三候选单位" not in item:
- set_dict_item(item,"第三候选单位","")
- if "招标金额" not in item:
- set_dict_item(item,"招标金额","")
- if "中标金额" not in item:
- set_dict_item(item,"中标金额","")
- if "中标单位" not in item:
- set_dict_item(item,"中标单位","")
- if "中标单位联系人" not in item:
- set_dict_item(item,"中标单位联系人","")
- if "中标单位联系电话" not in item:
- set_dict_item(item,"中标单位联系电话","")
- set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
- set_dict_item(item,"detail_link",_dict.get("detail_link",""))
- # if item["中标单位"] not in set_enter:
- # continue
- if not dumplicate:
- if item["项目编号"]!="":
- _line = "%s-%s-%s-%s-%s-%s"%(item["公告类别"],item["项目编号"],item["招标单位"],item["中标单位"],str(item["招标金额"]),str(item["中标金额"]))
- if _line in dict_line:
- dict_line[_line].append(item)
- continue
- dict_line[_line] = [item]
- _line2 = "%s-%s-%s-%s-%s-%s"%(item["公告标题"],item["公告类别"],item["招标单位"],str(item["招标金额"]),item["中标单位"],str(item["中标金额"]))
- if _line2 in dict_line:
- dict_line[_line2].append(item)
- continue
- dict_line[_line2] = [item]
- # if re.search("[大中小]学|幼儿园|医院|公司",item["招标单位"]) is not None:
- # continue
- # if _dict.get("docid","") in set_ig_docid:
- # continue
- # if item["招标金额"]=="":
- # continue
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- if not dumplicate:
- dict_dump = {}
- columns = ["group_id"]
- columns.extend(list_df_columns)
- for k in columns:
- dict_dump[k] = []
- group_id = 1
- for k,v in dict_line.items():
- if len(v)==1:
- continue
- for item in v:
- dict_dump["group_id"].append(group_id)
- for k in list_df_columns:
- dict_dump[k].append(item.get(k))
- group_id += 1
- df_dump = pd.DataFrame(dict_dump)
- df_dump.to_excel("%s/../data/dumplicate/%s_重复数据.xlsx"%(os.path.dirname(__file__),getCurrent_date("%Y-%m-%d_%H%M%S")))
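- # Dedup key note (illustrative): with dumplicate=False, two rows collide when
- # they share either composite key, e.g.
- #   "中标信息-CODE-1-招标人A-中标人B-1000.0-980.0"
- # only the first row of a colliding group reaches df_data; the rest are written
- # to the *_重复数据.xlsx report above.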
- def getRowData_shenpi(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
- dict_line = {}
- # list_data = getRow_ots(rows)
- _index = 0
- for row in rows:
- _index += 1
- item = {}
- _dict = row
- extract_json = json.loads(_dict.get("extract_json","{}"))
- set_dict_item(item,"docid",_dict.get("docid",""))
- set_dict_item(item,"original_id",_dict.get("original_id",""))
- # set_dict_item(item,"attachment_extract_status",_dict.get("attachment_extract_status",""))
- set_dict_item(item,"crtime",_dict.get("crtime",""))
- set_dict_item(item,"province",_dict.get("province",""))
- set_dict_item(item,"city",_dict.get("city",""))
- set_dict_item(item,"district",_dict.get("district",""))
- set_dict_item(item,"单位集合",json.dumps(extract_json.get("dict_enterprise",""),ensure_ascii=False))
- # set_dict_item(item,"要素数",_dict.get("extract_count",0))
- set_dict_item(item,"公告标题",_dict.get("doctitle",""))
- set_dict_item(item,"web_source_no",_dict.get("web_source_no",""))
- set_dict_item(item,"web_source_name",_dict.get("web_source_name",""))
- # set_dict_item(item,"原公告类别",dict_channel.get(_dict.get("original_docchannel",""),""))
- # set_dict_item(item,"detail_link",_dict.get("detail_link",""))
- set_dict_item(item,"产品",_dict.get("product",""))
- set_dict_item(item,"发布时间",_dict.get("page_time",""))
- set_dict_item(item,"行业一级分类",_dict.get("industry",""))
- set_dict_item(item,"行业二级分类",_dict.get("info_type",""))
- # set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
- set_dict_item(item,"项目编号",_dict.get("project_code",""))
- set_dict_item(item,"项目名称",_dict.get("project_name",""))
- set_dict_item(item,"审批事项",_dict.get("approval_items",""))
- set_dict_item(item,"审批结果",_dict.get("approval_result",""))
- set_dict_item(item,"审批部门",_dict.get("approver",""))
- set_dict_item(item,"建设单位",_dict.get("construct_company",""))
- set_dict_item(item,"建设规模",_dict.get("construction_scale",""))
- set_dict_item(item,"申报单位",_dict.get("declare_company",""))
- set_dict_item(item,"审批文号",_dict.get("doc_num",""))
- set_dict_item(item,"环评机构",_dict.get("evaluation_agency",""))
- set_dict_item(item,"项目法人",_dict.get("legal_person",""))
- set_dict_item(item,"资金来源",_dict.get("moneysource",""))
- set_dict_item(item,"申报类型",_dict.get("pro_type",""))
- set_dict_item(item,"项目地址",_dict.get("project_addr",""))
- set_dict_item(item,"项目属性",_dict.get("properties",""))
- set_dict_item(item,"开工时间",_dict.get("time_commencement",""))
- set_dict_item(item,"竣工时间",_dict.get("time_completion",""))
- set_dict_item(item,"申报时间",_dict.get("time_declare",""))
- set_dict_item(item,"总投资",_dict.get("total_tenderee_money",""))
- set_dict_item(item,"建设年限",_dict.get("year_limit",""))
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- def getRowData_sp1(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
- dict_line = {}
- # list_data = getRow_ots(rows)
- _index = 0
- set_id = set()
- for row in rows:
- _index += 1
- item = {}
- _dict = row
- _id = _dict.get("id")
- if _id is not None and _id in set_id:
- continue
- set_id.add(_id)
- set_dict_item(item,"id",_dict.get("id",""))
- # set_dict_item(item,"attachment_extract_status",_dict.get("attachment_extract_status",""))
- # set_dict_item(item,"crtime",_dict.get("crtime",""))
- set_dict_item(item,"detaillink",_dict.get("detaillink",""))
- # set_dict_item(item,"web_source_no",_dict.get("web_source_no",""))
- set_dict_item(item,"公告类别",dict_sptype.get(str(_dict.get("sp_type","")),""))
- set_dict_item(item,"page_time",getLegal_str(_dict.get("page_time","")))
- # set_dict_item(item,"附件内容",getLegal_str(_dict.get("attachmenttextcon","")))
- set_dict_item(item,"page_title",_dict.get("page_title",""))
- set_dict_item(item,"record_id",_dict.get("record_id",""))
- set_dict_item(item,"web_source_name",_dict.get("web_source_name",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item(item,"web_source_no",_dict.get("web_source_no",""))
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- def getRowData_sp(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
- dict_line = {}
- # list_data = getRow_ots(rows)
- _index = 0
- set_id = set()
- for row in rows:
- _index += 1
- item = {}
- _dict = row
- set_dict_item(item,"docid",_dict.get("docid",""))
- # set_dict_item(item,"attachment_extract_status",_dict.get("attachment_extract_status",""))
- # set_dict_item(item,"crtime",_dict.get("crtime",""))
- set_dict_item(item,"公告标题",_dict.get("page_title",""))
- # set_dict_item(item,"web_source_no",_dict.get("web_source_no",""))
- set_dict_item(item,"公告类别",dict_sptype.get(str(_dict.get("sp_type","")),""))
- set_dict_item(item,"公告内容",getLegal_str(_dict.get("page_content","")))
- # set_dict_item(item,"附件内容",getLegal_str(_dict.get("attachmenttextcon","")))
- if "keyword" in _dict:
- set_dict_item(item,"关键词",_dict["keyword"])
- else:
- set_dict_item(item,"关键词",",".join(list(set(re.findall("|".join([re.escape(str(a).replace("(","(").replace(")",")")) for a in list_keyword]),re.sub("\s","",str(row.get("doctitle","")+row.get("doctextcon","")[:30000]+row.get("attachmenttextcon","")[:30000]).replace("(","(").replace(")",")")))))))
- # set_dict_item(item,"关键词",_dict.get("keyword",""))
- set_dict_item(item,"产品",_dict.get("product",""))
- set_dict_item(item,"服务期限",_dict.get("service_time",""))
- set_dict_item(item,"省份",_dict.get("province",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item(item,"城市",_dict.get("city",""))
- set_dict_item(item,"区县",_dict.get("district",""))
- set_dict_item(item,"发布时间",_dict.get("page_time",""))
- set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
- set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
- # set_dict_item(item,"创建时间",_dict.get("crtime",""))
- set_dict_item(item,"招标方式",_dict.get("bidway",""))
- set_dict_item(item,"行业一级分类",_dict.get("industry",""))
- set_dict_item(item,"行业二级分类",_dict.get("info_type",""))
- set_dict_item(item,"来源",_dict.get("web_source_name",""))
- set_dict_item(item,"uuid",_dict.get("uuid"))
- # set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
- set_dict_item(item,"项目编号",_dict.get("page_code",""))
- set_dict_item(item,"项目名称",_dict.get("project_name",""))
- set_dict_item(item,"招标单位",_dict.get("tenderee",""))
- set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
- set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
- set_dict_item(item,"代理单位",_dict.get("agency",""))
- set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
- set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
- # set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
- set_dict_item(item,"评审专家",_dict.get("person_review",""))
- set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
- set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
- set_dict_item(item,"获取文件开始时间",_dict.get("time_get_file_start",""))
- set_dict_item(item,"获取文件结束时间",_dict.get("time_get_file_end",""))
- set_dict_item(item,"保证金递交开始时间",_dict.get("time_earnest_money_start",""))
- set_dict_item(item,"保证金递交结束时间",_dict.get("time_earnest_money_end",""))
- sub_docs_json = _dict.get("sub_docs_json")
- set_tenderer = set()
- if sub_docs_json is not None:
- docs = json.loads(sub_docs_json)
- docs.sort(key=lambda x:float(x.get("win_bid_price") or 0))
- for _doc in docs:
- if "win_tenderer" in _doc:
- set_dict_item(item,"中标单位",_doc["win_tenderer"])
- if "second_tenderer" in _doc:
- set_dict_item(item,"第二候选单位",_doc["second_tenderer"])
- set_tenderer.add(_doc.get("second_tenderer"))
- if "third_tenderer" in _doc:
- set_dict_item(item,"第三候选单位",_doc["third_tenderer"])
- set_tenderer.add(_doc.get("third_tenderer"))
- if "win_tenderee_manager" in _doc:
- set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
- if "win_tenderee_phone" in _doc:
- set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
- if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
- set_dict_item(item,"中标金额",_doc["win_bid_price"])
- if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
- set_dict_item(item,"招标金额",_doc["bidding_budget"])
- set_dict_item(item,"入围供应商",",".join(list(set_tenderer)))
- if "第二候选单位" not in item:
- set_dict_item(item,"第二候选单位","")
- if "第三候选单位" not in item:
- set_dict_item(item,"第三候选单位","")
- if "招标金额" not in item:
- set_dict_item(item,"招标金额","")
- if "中标金额" not in item:
- set_dict_item(item,"中标金额","")
- if "中标单位" not in item:
- set_dict_item(item,"中标单位","")
- if "中标单位联系人" not in item:
- set_dict_item(item,"中标单位联系人","")
- if "中标单位联系电话" not in item:
- set_dict_item(item,"中标单位联系电话","")
- # if item["中标单位"] not in set_enter:
- # continue
- if not dumplicate:
- if item["项目编号"]!="":
- _line = "%s-%s-%s-%s-%s-%s"%(item["公告类别"],item["项目编号"],item["招标单位"],item["中标单位"],str(item["招标金额"]),str(item["中标金额"]))
- if _line in dict_line:
- dict_line[_line].append(item)
- continue
- dict_line[_line] = [item]
- _line2 = "%s-%s-%s-%s-%s-%s"%(item["公告标题"],item["公告类别"],item["招标单位"],str(item["招标金额"]),item["中标单位"],str(item["中标金额"]))
- if _line2 in dict_line:
- dict_line[_line2].append(item)
- continue
- dict_line[_line2] = [item]
- # if re.search("[大中小]学|幼儿园|医院|公司",item["招标单位"]) is not None:
- # continue
- # if _dict.get("docid","") in set_ig_docid:
- # continue
- # if item["招标金额"]=="":
- # continue
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- if not dumplicate:
- dict_dump = {}
- columns = ["group_id"]
- columns.extend(list_df_columns)
- for k in columns:
- dict_dump[k] = []
- group_id = 1
- for k,v in dict_line.items():
- if len(v)==1:
- continue
- for item in v:
- dict_dump["group_id"].append(group_id)
- for k in list_df_columns:
- dict_dump[k].append(item.get(k))
- group_id += 1
- df_dump = pd.DataFrame(dict_dump)
- df_dump.to_excel("%s/../data/dumplicate/%s_重复数据.xlsx"%(os.path.dirname(__file__),getCurrent_date("%Y-%m-%d_%H%M%S")))
- def filterRow(list_row,column,list_not_keywords):
- list_result = []
- for row in list_row:
- _product = row.get(column,"")
- sub_docs_json = row.get("sub_docs_json","")
- doctitle = row.get("doctitle","")
- tenderee = row.get("tenderee","")
- if tenderee!="":
- continue
- nlp_enterprise = row.get("nlp_enterprise")
- nlp_enterprise_attachment = row.get("nlp_enterprise_attachment")
- list_nlp_enterprise = []
- list_enterprise_attachment = []
- if nlp_enterprise is not None:
- list_nlp_enterprise = json.loads(nlp_enterprise)
- if nlp_enterprise_attachment is not None:
- list_enterprise_attachment = json.loads(nlp_enterprise_attachment)
- if max(len(list_nlp_enterprise),len(list_enterprise_attachment))==1:
- list_result.append(row)
- # if re.search("设计",sub_docs_json) is not None:
- # if re.search("装修",str(doctitle)+str(sub_docs_json)) is None:
- # list_result.append(row)
- # else:
- # print("===",_product)
- # if re.search("|".join([re.escape(i) for i in list_not_keywrods]),_product) is not None:
- # continue
- # list_result.append(row)
- # if row.get("关键词",1)==row.get("招标单位",2) or row.get("关键词",2)==row.get("中标单位",3):
- # list_result.append(row)
- # doctitle = row.get("doctitle")
- # doctextcon = row.get("doctextcon")
- # if len(re.sub('\s','',doctextcon))==len(doctitle)+4:
- # list_result.append(row)
- # tenderee_phone = row.get("tenderee_phone","")
- # if len(tenderee_phone)==11:
- # list_result.append(row)
- return list_result
- dict_sptype = {"2":"审批信息",
- "4":"审批结果",
- "8":"核准公示",
- "16":"核准结果",
- "32":"备案公示",
- "64":"备案结果",
- "128":"推介项目",
- "256":"推介结果",
- "512":"项目撤销",
- "1024":"筹备阶段"}
- def getKeywordByFile():
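- # Reads sheet 2 of the review spreadsheet and returns the title/web_source
- # pairs whose check result is "接口错" (interface error), for re-querying.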
- filename = "审批标题对比检查结果(20220831).xlsx"
- df = pd.read_excel(filename,sheet_name=1)
- list_data = []
- for _title,_no,_type in zip(df["标题"],df["编号"],df["检查结果"]):
- if _type not in ["接口错"]:
- continue
- _dict = {"title":_title,
- "web_source":_no}
- list_data.append(_dict)
- return list_data
- def exportDocument_by_pagetime():
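- # Ad-hoc export driver: builds OTS queries from the keyword/channel/date
- # filters below (most alternatives are kept commented out for reuse), fetches
- # the matching documents and writes them to an Excel file.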
- ots_client = getConnect_ots()
- ots_capacity = getConnect_capacity()
- # columns = ["doctitle","docchannel","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"]
- # columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"]
- # columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_name"]
- columns = ["doctitle","docchannel","nlp_enterprise","nlp_enterprise_attachment","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end","detail_link"]
- # columns = ["docchannel","docid","project_name","product","doctitle","page_time","province","city","time_get_file_end","time_bidclose","project_code","sub_docs_json","tenderee","info_type","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","detail_link","bidway","crtime","extract_count","products"]
- # columns = ["page_time","doctitle","crtime","web_source_no","web_source_name","detail_link","original_docchannel","uuid","docid"]
- # columns = ["time_bidclose","time_bidopen","time_bidstart","time_commencement","time_completion","time_earnest_money_end","time_earnest_money_start","time_get_file_end","time_get_file_start","time_publicity_end","time_publicity_start","time_registration_end","time_registration_start"]
- dict_channel = getDict_docchannel()
- # columns = ["doctitle","dochtmlcon","page_time","web_source_no","web_source_name","sub_docs_json"]
- # columns = ["tenderee","tenderee_contact","tenderee_phone"]
- # columns = ["extract_json","original_id","crtime","province","city","district","doctitle","web_source_no","web_source_name","product","page_time","industry","info_type","project_code","project_name","approval_items","approval_result","approver","construct_company","construction_scale","declare_company","doc_num","evaluation_agency","legal_person","moneysource","pro_type","project_addr","properties","time_commencement","time_completion","time_declare","total_tenderee_money","year_limit"]
- # columns = ["extract_json","doctitle","status","page_time","docchannel"]
- list_query = []
- str_keyword = '''
- 博物馆 、文物
- '''
- list_keyword = splitIntoList(str_keyword,r"[\s\n、,,|]")
- str_con_keyword = '''
- 博物馆 、文物
- '''
- con_keyword = splitIntoList(str_con_keyword,r'[\s\n、,,|]')
- print(con_keyword)
- should_q_win = []
- for _keyword in list_keyword:
- should_q_win.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_keyword)))
- q_win = BoolQuery(should_queries=should_q_win)
- str_not_keyword = '''
- 物业、保洁、家具、装修、维修、修缮、车辆、消防、广告、印刷、安防、防雷
- 地区:全国
- '''
- list_not_key = splitIntoList(str_not_keyword,r"[\s\n、,,]")
- tenderee_keyword = "医院、大学、高校、高中"
- list_t_key = splitIntoList(tenderee_keyword,r"[\s\n、,,]")
- should_q_system = [TermQuery("procurement_system","企业采购系统"),
- TermQuery("procurement_system","部队采购系统")]
- q_system = BoolQuery(should_queries=should_q_system)
- log(str(list_keyword))
- s_province = "北京,天津,深圳,上海,浙江,江苏,安徽"
- list_province = splitIntoList(s_province,r"[,,\s]")
- st = "环境监测中心、环境监测总站、环保局、水务局、水利局"
- list_tenderee = splitIntoList(st,r"、|\s")
- # list_title = getKeywordByFile()
- #
- # for _d in list_title:
- # _title = _d["title"]
- # web_source = _d["web_source"]
- # bool_query = BoolQuery(must_queries=[
- # generateBoolShouldQuery(["page_title"],[_title],MatchPhraseQuery),
- # TermQuery("web_source_no",web_source)
- # ])
- # list_query.append({"query":bool_query})
- for _keyword in list_keyword:
- bool_query = BoolQuery(must_queries=[
- # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_keyword)),
- # generateBoolShouldQuery(["doctitle"],list_keyword,MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle"],["院","交通","学"],MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle"],["智慧"],MatchPhraseQuery),
- # ExistsQuery("tenderee"),
- # generateBoolShouldQuery(["doctitle"],list_keyword,MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["学校","大学","中学","小学"],MatchPhraseQuery),
- # generateBoolShouldQuery(["web_source_name"],list_keyword,TermQuery),
- # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],list_keyword,MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],con_keyword,MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["服务期","服务时间","合同期限","服务范围","质保期","履行期限","履约期限","交货期"],MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["雪茄柜"],MatchPhraseQuery),
- # BoolQuery(should_queries=[
- # generateBoolShouldQuery(["doctitle"],["公告","公示","招标","中标","采购","工程","项目","询价","施工","比价","服务","监理","设计"],MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["项目名称","招标人","中标人","项目编号","采购组织机构","采购人","招标范围","投标保证金","报价地址","询价通知书"],MatchPhraseQuery)
- # ]),
- # generateBoolShouldQuery(["doctitle"],["中标"],MatchPhraseQuery),
- # generateBoolShouldQuery(["docid"],list_keyword,TermQuery),
- # q_win,
- # should_q,
- # generateBoolShouldQuery(["tenderee"],[company],TermQuery),
- # generateBoolShouldQuery(["province"],["安徽","江苏"],TermQuery),
- generateBoolShouldQuery(["docchannel"],[52,101,118,119,120,121,122],TermQuery),
- # generateBoolShouldQuery(["docchannel"],[52],TermQuery),
- # generateBoolShouldQuery(["docchannel"],[101,118,119,120,121,122],TermQuery),
- # generateBoolShouldQuery(["docchannel"],[302],TermQuery),
- # generateBoolShouldQuery(["docchannel"],[101,119,120,121,122],TermQuery),
- # generateBoolShouldQuery(["docchannel"],[51,52,101,102,103,104,105,114,118,119,120],TermQuery),
- # generateBoolShouldQuery(["docchannel"],[102,52,101,114,119,120],TermQuery),
- RangeQuery("page_time","2024-01-01","2025-12-12",True,False),
- # NestedQuery("page_attachments",WildcardQuery("page_attachments.fileMd5","*")),
- # TermQuery("web_source_name","专项债券信息网"),
- # generateBoolShouldQuery(["city"],["西安","渭南"],TermQuery),
- # TermQuery("info_type","物业管理"),
- # RangeQuery("crtime","2024-07-21 00:00:00","2024-06-21 13:00:00"),
- # TermQuery("save",0),
- RangeQuery("status",201,301,True,True),
- # RangeQuery("crtime","2024-01-01","2025-01-01",True,False),
- # RangeQuery("page_time",range_from="2024-08-01",range_to="2025-01-01"),
- # BoolQuery(should_queries=[TermQuery("page_time","2022-09-15"),
- # TermQuery("page_time","2022-10-20"),
- # TermQuery("page_time","2022-10-31")])
- # TermQuery("page_time","2025-01-07"),
- # RangeQuery("crtime","2025-05-19","2025-05-20"),
- # TermQuery("docid",237163857),
- # RangeQuery("tenderee","","1"),
- # WildcardQuery("tenderee","*雅居乐*"),
- # RangeQuery("crtime","2023-07-22 00:00:00"),
- # BoolQuery(should_queries=[NestedQuery("products",RangeQuery("products.unit_price",1)),
- # NestedQuery("products",RangeQuery("products.total_price",1)),])
- # NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_bid_price",1000000)),
- # NestedQuery("page_attachments",TermQuery("page_attachments.fileMd5","92775529171409a32513f134a61d73c8")),
- TermQuery("province","广东"),
- # TermQuery("city","上海"),
- # generateBoolShouldQuery(["tenderee"],list_tenderee,WildcardQuery),
- # generateBoolShouldQuery(["tenderee"],["应急管理局","城市管理局","大数据局","政务服务管理局","消防局"],WildcardQuery),
- WildcardQuery("tenderee","*中国电信*"),
- # BoolQuery(should_queries=[WildcardQuery("tenderee","*医院*"),
- # WildcardQuery("tenderee","*学校*")])
- # BoolQuery(should_queries=[
- # # NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.bidding_budget",100000000)),
- # # NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_bid_price",100000000)),
- # # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","奇安信网神信息技术(北京)股份有限公司")),
- # # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_keyword)),
- # ])
- # TermQuery("procurement_system","公安系统"),
- # generateBoolShouldQuery(["province"],["重庆"],WildcardQuery),
- # generateBoolShouldQuery(["tenderee"],list_t_key,WildcardQuery)
- # generateBoolShouldQuery(["docchannel"],[101,118,119],TermQuery),
- ],
- # should_queries=[NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*乐禾*")),
- # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*彩食鲜*")),
- # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*望家欢*")),
- # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*乐味*")),
- # ],
- # must_not_queries=[
- # # TermQuery("exist_table",1),
- # # WildcardQuery("tenderee","*"),
- # # TermQuery("attachment_extract_status",1),
- # # generateBoolShouldQuery(["tenderee"],["银行","集团","公司"],WildcardQuery),
- # # generateBoolShouldQuery(["doctitle"],list_not_key,MatchPhraseQuery),
- # # generateBoolShouldQuery(["province"],["湖南","广西","广东"],MatchPhraseQuery),
- # NestedQuery("sub_docs_json",ExistsQuery("sub_docs_json.win_tenderer"))
- # # q_system,
- # ]
- )
- list_query.append({"query":bool_query,"limit":10000})
- break
- # list_row = getDocument(list_query,["docid","service_time"],"document","document_index")
- # list_query = []
- # for row in list_row:
- # list_query.append({"query":TermQuery("docid",row["docid"]),"limit":1})
- list_row = getDocument(list_query,columns,"document","document_index")
- # list_row = getDocument(list_query,columns,"document_product","document_product_index")
- # list_row = getDocument(list_query,columns,"t_shen_pi_xiang_mu","t_shen_pi_xiang_mu_index")
- def judge_save(row,result_queue):
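- # For a deduplicated document, fetches its best_docid counterpart and, when
- # both share the same web_source_no, queues a side-by-side record (titles,
- # links, fingerprints) so same-source duplicates can be audited.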
- docid = row.get("docid")
- best_docid = row.get("best_docid")
- if best_docid is not None and best_docid!="":
- best_docid = int(best_docid)
- if best_docid == int(docid):
- return
- consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",best_docid%500+1),("docid",best_docid)],columns_to_get=["web_source_no","page_time","doctitle","crtime","detail_link","fingerprint"])
- _dict = getRow_ots_primary(return_row)
- if _dict is not None:
- if _dict.get("web_source_no")==row.get("web_source_no"):
- new_dict = {"dup_docid":row.get("docid"),
- "dup_page_time":row.get("page_time"),
- "dup_doctitle":row.get("doctitle"),
- "dup_web_source_no":row.get("web_source_no"),
- "dup_crtime":row.get("crtime"),
- "dup_detail_link":row.get("detail_link"),
- "dup_fingerprint":row.get("fingerprint"),
- "best_docid":_dict.get("docid"),
- "best_page_time":_dict.get("page_time"),
- "best_doctitle":_dict.get("doctitle"),
- "best_web_source_no":_dict.get("web_source_no"),
- "best_crtime":_dict.get("crtime"),
- "best_detail_link":_dict.get("detail_link"),
- "best_fingerprint":_dict.get("fingerprint"),
- "detail_link_same":row.get("detail_link")==_dict.get("detail_link"),
- "fingerprint_same":row.get("fingerprint")==_dict.get("fingerprint"),
- }
- result_queue.put(new_dict)
- from export.html2text import html2text_with_tablehtml
- def _judge_service_time(row,result_queue):
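- # Re-extracts the service period from the raw html with an LLM (doubao) and
- # queues a field-by-field comparison against the rule-based service_time
- # extraction; documents longer than 20000 characters are skipped.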
- docid = int(row.get("docid"))
- partition_key = docid%500+1
- consumed, return_row, next_token = ots_capacity.get_row("document",[("partitionkey",partition_key),("docid",docid)],columns_to_get=["dochtmlcon"])
- _dict = getRow_ots_primary(return_row)
- if _dict is not None:
- row_service_time = json.loads(row.get("service_time","{}"))
- _html = _dict.get("dochtmlcon","")
- _text = html2text_with_tablehtml(_html)
- if len(_text)>20000:
- return
- _prompt = '''
- 请从以下公告中提取服务期限,其中服务开始时间和服务结束时间是yyyy-mm-dd的格式,服务天数是数字天数,没有则给""
- service_start 服务开始时间
- service_end 服务结束时间
- service_days 服务天数
- 返回json格式{"service_end":"","service_start":"","service_days":""}
- '''
- _result = chat_doubao(_prompt+_text,model_name='ep-20250314164242-jd62g')
- _json = get_json_from_text(_result)
- if _json is not None:
- try:
- _dict = json.loads(_json)
- new_dict = {
- "docid":row.get("docid"),
- "service_start_extract":row_service_time.get("service_start",""),
- "service_end_extract":row_service_time.get("service_end",""),
- "service_days_extract":row_service_time.get("service_days",""),
- "service_start_ai":_dict.get("service_start",""),
- "service_end_ai":_dict.get("service_end",""),
- "service_days_ai":_dict.get("service_days",""),
- "extract_equal_ai":row_service_time.get("service_start","")[:7]==_dict.get("service_start","")[:7] and row_service_time.get("service_end","")[:7]==_dict.get("service_end","")[:7] and str(row_service_time.get("service_days",""))==str(_dict.get("service_days","")),
- }
- result_queue.put(new_dict)
- except:
- _dict = {}
- # task_queue = Queue()
- # for row in list_row:
- # task_queue.put(row)
- # result_queue = Queue()
- # mt = MultiThreadHandler(task_queue,_judge_service_time,result_queue,thread_count=10)
- # mt.run()
- # new_rows = []
- # while 1:
- # try:
- # data = result_queue.get(timeout=1)
- # new_rows.append(data)
- # except:
- # break
- # list_query = []
- #
- # for _row in list_row:
- # _uuid = uuid4().hex
- # page_attachments = json.loads(_row.get("page_attachments"))
- # l_s = []
- # for _at in page_attachments:
- # l_s.append(NestedQuery("page_attachments",TermQuery("page_attachments.fileMd5",_at.get("fileMd5"))))
- # list_query.append({"query":BoolQuery(should_queries=l_s),"limit":500,"keyword":_uuid})
- # list_row = getDocument(list_query,columns,"document","document_index")
- df_data = {}
- set_line = set()
- # # list_row = filterRow(list_row,"doctitle",list_not_key)
- # log("get document %d rows"%len(list_row))
- # # getRowDataWithKey(df_data,list_row,columns)
- getRowData(df_data,list_row,set_line,list_keyword,dict_channel,True)
- # # getRowData_shenpi(df_data,list_row,set_line,list_keyword,dict_channel,True)
- # # getRowData_sp1(df_data,list_row,set_line,list_keyword,dict_sptype,True)
- # fixContactPerson(df_data,list_df_columns,get_legal_person=False)
- df1 = pd.DataFrame(df_data)
- df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
- # print("get document %d rows"%len(new_rows))
- # df1 = pd.DataFrame(new_rows)
- # df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')))
- # keys = df_data.keys()
- # print("keys",keys)
- # dict_company = {}
- # set_dup_keys = set()
- # set_docid = set()
- # for _i in range(len(df_data[list(keys)[0]])):
- # if df_data["关键词"][_i]==df_data["招标单位"][_i] or df_data["关键词"][_i]==df_data["中标单位"][_i]:
- # company_name = df_data["关键词"][_i]
- # if company_name not in dict_company:
- # dict_company[company_name] = {"企业名称":company_name,"招标":[],"中标":[]}
- # if df_data["关键词"][_i]==df_data["招标单位"][_i]:
- # if str(df_data["招标金额"][_i])!="nan":
- # _key = "%s-%s"%(company_name,str(df_data["招标金额"][_i]))
- # if _key not in set_dup_keys:
- # set_docid.add(df_data["docid"][_i])
- # dict_company[company_name]["招标"].append({"标题":df_data["公告标题"][_i],
- # "招标方式":df_data["招标方式"][_i],
- # "招标单位":df_data["招标单位"][_i],
- # "招标金额":df_data["招标金额"][_i]})
- # set_dup_keys.add(_key)
- # if df_data["关键词"][_i]==df_data["中标单位"][_i]:
- # if str(df_data["中标金额"][_i])!="nan":
- # _key = "%s-%s"%(str(df_data["中标单位"][_i]),str(df_data["中标金额"][_i]))
- # if _key not in set_dup_keys:
- # set_docid.add(df_data["docid"][_i])
- # dict_company[company_name]["中标"].append({"标题1":df_data["公告标题"][_i],
- # "招标方式1":df_data["招标方式"][_i],
- # "中标单位1":df_data["中标单位"][_i],
- # "中标金额1":df_data["中标金额"][_i]})
- # set_dup_keys.add(_key)
- # df_keys = ["企业名称","标题","招标方式","招标单位","招标金额","标题1","招标方式1","中标单位1","中标金额1"]
- # df_da = {}
- # for k in df_keys:
- # df_da[k] = []
- # for k,v in dict_company.items():
- # list_zhaobiao = v["招标"]
- # list_zhongbiao = v["中标"]
- # _nums = max(min(len(list_zhaobiao),5),min(len(list_zhongbiao),5))
- # for i in range(_nums):
- # df_da["企业名称"].append(k)
- # if i>=len(list_zhaobiao):
- # df_da["标题"].append("")
- # df_da["招标方式"].append("")
- # df_da["招标单位"].append("")
- # df_da["招标金额"].append("")
- # else:
- # df_da["标题"].append(list_zhaobiao[i]["标题"])
- # df_da["招标方式"].append(list_zhaobiao[i]["招标方式"])
- # df_da["招标单位"].append(list_zhaobiao[i]["招标单位"])
- # df_da["招标金额"].append(list_zhaobiao[i]["招标金额"])
- #
- # if i>=len(list_zhongbiao):
- # df_da["标题1"].append("")
- # df_da["招标方式1"].append("")
- # df_da["中标单位1"].append("")
- # df_da["中标金额1"].append("")
- # else:
- # df_da["标题1"].append(list_zhongbiao[i]["标题1"])
- # df_da["招标方式1"].append(list_zhongbiao[i]["招标方式1"])
- # df_da["中标单位1"].append(list_zhongbiao[i]["中标单位1"])
- # df_da["中标金额1"].append(list_zhongbiao[i]["中标金额1"])
- # df2 = pd.DataFrame(df_da)
- # df2.to_excel("tmp333.xlsx",columns=df_keys)
- #
- # df_3 = {}
- # for k in keys:
- # df_3[k] = []
- # for _i in range(len(df_data[list(keys)[0]])):
- # docid = df_data["docid"][_i]
- # if docid in set_docid:
- # for k in keys:
- # df_3[k].append(df_data[k][_i])
- # df3 = pd.DataFrame(df_3)
- # df3.to_excel("tmp_333_mx.xlsx",columns=keys)
- # fixContactPerson(df_data,list_df_columns)
- #
- def findProjects():
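- # For annotated projects that lack a win price, searches for a matching result
- # announcement within 180 days of the tender notice and backfills the winner,
- # price and docid.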
- df = pd.read_excel("两广地区中标时间为空标注_预匹配1.xlsx",0)
- list_items = []
- for docids,project_code,project_name,tenderee,zhao_biao_page_time in zip(df["docids"],df["project_code"],df["project_name"],df["tenderee"],df["zhao_biao_page_time"]):
- if not isinstance(project_code,(str)):
- project_code = "$$$"
- if not isinstance(project_name,(str)):
- project_name = "$$$"
- if not isinstance(tenderee,(str)):
- tenderee = ""
- _dict = {"docids":docids,
- "project_code":project_code,
- "project_name":project_name,
- "tenderee":tenderee,
- "zhao_biao_page_time":zhao_biao_page_time.strftime("%Y-%m-%d"),
- "end_time":timeAdd(zhao_biao_page_time.strftime("%Y-%m-%d"),180)}
- list_items.append(_dict)
- task_queue = queue.Queue()
- for item in list_items:
- task_queue.put(item)
- def _handle(item,result_queue,ots_client):
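- # Three-pass fallback search: project_code OR project_name, then project_code
- # alone, then project_name alone, always excluding the already-known docids;
- # stops at the first row with a matching tenderee and a positive
- # win_bid_price, otherwise records the candidate docids in maybe_docids.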
- docids = item.get("docids")
- list_s_n = []
- for docid in re.split(",",str(docids)):
- list_s_n.append(TermQuery("docid",docid))
- query_not = BoolQuery(should_queries=list_s_n)
- # bool_query =BoolQuery(must_queries=[query_not])
- # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- # SearchQuery(bool_query,limit=50,get_total_count=True),
- # ColumnsToGet(["sub_docs_json"],ColumnReturnType.SPECIFIED))
- # if total_count>0:
- # dict_rows = getRow_ots(rows)
- # _find = False
- # for _row in dict_rows:
- # sub_docs_json = _row.get("sub_docs_json",'[]')
- # sub_docs = json.loads(sub_docs_json)
- # for _doc in sub_docs:
- # if "bidding_budget" in _doc and _doc.get("bidding_budget",0)>0:
- # item["new_bidding_budget"] = _doc.get("bidding_budget",0)
- # _find = True
- # break
- # if _find:
- # break
- #
- # return
- _find = False  # not found yet; the fallback queries below only run while this stays False
- bool_query =BoolQuery(must_queries=[
- generateBoolShouldQuery(["doctitle",'doctextcon','attachmenttextcon'],[item.get("project_code","$$$$"),item.get("project_name","$$$$")],MatchPhraseQuery),
- generateBoolShouldQuery(["docchannel"],[101,119,120],TermQuery),
- RangeQuery("status",151,301),
- NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*")),
- RangeQuery("page_time",item.get("zhao_biao_page_time"),item.get("end_time"))
- ],must_not_queries=[query_not])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,limit=50,get_total_count=True),
- ColumnsToGet(["doctitle","tenderee","sub_docs_json"],ColumnReturnType.SPECIFIED))
- if total_count>0:
- dict_rows = getRow_ots(rows)
- str_docid = ""
- for _row in dict_rows:
- str_docid+="%d,"%_row.get("docid")
- sub_docs_json = _row.get("sub_docs_json",'[]')
- sub_docs = json.loads(sub_docs_json)
- if item.get("tenderee","--")!=_row.get("tenderee","-#"):
- continue
- for _doc in sub_docs:
- if "win_bid_price" in _doc and _doc.get("win_bid_price",0)>0:
- item["new_win_bid_price"] = _doc.get("win_bid_price")
- item["new_win_tenderer"] = _doc.get("win_tenderer")
- item["new_finded_docid"] = _row.get("docid")
- _find = True
- break
- if _find:
- return
- item["maybe_docids"] = str_docid
- bool_query =BoolQuery(must_queries=[
- generateBoolShouldQuery(["doctitle",'doctextcon','attachmenttextcon'],[item.get("project_code","$$$$")],MatchPhraseQuery),
- generateBoolShouldQuery(["docchannel"],[101,119,120],TermQuery),
- RangeQuery("status",151,301),
- NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*")),
- RangeQuery("page_time",item.get("zhao_biao_page_time"),item.get("end_time"))
- ],must_not_queries=[query_not])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,limit=50,get_total_count=True),
- ColumnsToGet(["doctitle","tenderee","sub_docs_json"],ColumnReturnType.SPECIFIED))
- if total_count>0:
- dict_rows = getRow_ots(rows)
- str_docid = ""
- for _row in dict_rows:
- str_docid+="%d,"%_row.get("docid")
- sub_docs_json = _row.get("sub_docs_json",'[]')
- sub_docs = json.loads(sub_docs_json)
- if item.get("tenderee","--")!=_row.get("tenderee","-#"):
- continue
- for _doc in sub_docs:
- if "win_bid_price" in _doc and _doc.get("win_bid_price",0)>0:
- item["new_win_bid_price"] = _doc.get("win_bid_price")
- item["new_win_tenderer"] = _doc.get("win_tenderer")
- item["new_finded_docid"] = _row.get("docid")
- _find = True
- break
- if _find:
- return
- item["maybe_docids"] = str_docid
- bool_query =BoolQuery(must_queries=[
- generateBoolShouldQuery(["doctitle",'doctextcon','attachmenttextcon'],[item.get("project_name","$$$$")],MatchPhraseQuery),
- generateBoolShouldQuery(["docchannel"],[101,119,120],TermQuery),
- RangeQuery("status",151,301),
- NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*")),
- RangeQuery("page_time",item.get("zhao_biao_page_time"),item.get("end_time"))
- ],must_not_queries=[query_not])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,limit=50,get_total_count=True),
- ColumnsToGet(["doctitle"],ColumnReturnType.SPECIFIED))
- if total_count>0:
- dict_rows = getRow_ots(rows)
- str_docid = ""
- for _row in dict_rows:
- str_docid+="%d,"%_row.get("docid")
- sub_docs_json = _row.get("sub_docs_json",'[]')
- sub_docs = json.loads(sub_docs_json)
- if item.get("tenderee","--")!=_row.get("tenderee","-#"):
- continue
- for _doc in sub_docs:
- if "win_bid_price" in _doc and _doc.get("win_bid_price",0)>0:
- item["new_win_bid_price"] = _doc.get("win_bid_price")
- item["new_win_tenderer"] = _doc.get("win_tenderer")
- item["new_finded_docid"] = _row.get("docid")
- _find = True
- break
- if _find:
- return
- item["maybe_docids"] = str_docid
- return
- mt = MultiThreadHandler(task_queue,_handle,None,30,ots_client=getConnect_ots())
- mt.run()
- df_data = {"docids":[],
- "project_code":[],
- "project_name":[],
- "maybe_docids":[],
- "new_bidding_budget":[],
- "new_win_bid_price":[],
- "new_win_tenderer":[],
- "new_finded_docid":[]}
- keys = df_data.keys()
- for item in list_items:
- for k in keys:
- df_data[k].append(item.get(k))
- df2 = pd.DataFrame(df_data)
- df2.to_excel("两广补充数据.xlsx")
- def attachCompanyContact():
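- # Fills in missing contact person/phone columns of the exported spreadsheets
- # by looking the tenderee/winner/agency companies up in the enterprise table.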
- files = ["../data/2021-03-17_四川_关键词导出.csv",
- "../data/2021-03-17_安徽_关键词导出.csv",
- "../data/2021-03-17_江西_关键词导出.csv",
- "../data/2021-03-17_湖南_关键词导出.csv"]
- files = ["../data/20210609(最新).xlsx"]
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- set_enter = set()
- for file in files:
- df = pd.read_excel(file)
- columns = ["招标单位","中标单位","代理单位"]
- for _c in columns:
- for item in df[_c]:
- if isinstance(item,str):
- item = item.strip()
- if item!="":
- set_enter.add(item)
- dict_enter = getDictEnterprise(list(set_enter))
- for file in files:
- task_queue = queue.Queue()
- df = pd.read_excel(file)  # modern pandas read_excel takes no encoding argument
- keys = df.keys()[1:]
- list_item = []
- for row in df.itertuples():
- _dict = {}
- for _key in keys:
- _dict[_key] = getattr(row, _key, '')
- if str(_dict["招标联系人"]) in ("","nan") or str(_dict["招标联系人电话"]) in ("","nan"):
- contact_person,mobile = getOneContact(dict_enter.get(_dict["招标单位"],{}).get("contacts","[]"))
- if contact_person!="":
- _dict["招标联系人"] = contact_person
- _dict["招标联系人电话"] = mobile
- if str(_dict["中标联系人"]) in ("","nan") or str(_dict["中标联系人电话"]) in ("","nan"):
- contact_person,mobile = getOneContact(dict_enter.get(_dict["中标单位"],{}).get("contacts","[]"))
- if contact_person!="":
- _dict["中标联系人"] = contact_person
- _dict["中标联系人电话"] = mobile
- if str(_dict["代理联系人"]) in ("","nan") or str(_dict["代理联系人电话"]) in ("","nan"):
- contact_person,mobile = getOneContact(dict_enter.get(_dict["代理单位"],{}).get("contacts","[]"))
- if contact_person!="":
- _dict["代理联系人"] = contact_person
- _dict["代理联系人电话"] = mobile
- list_item.append(_dict)
- for item in list_item:
- task_queue.put(item)
- df_data = {}
- for _k in keys:
- df_data[_k] = []
- for item in list_item:
- for _k in keys:
- df_data[_k].append(getLegal_str(item.get(_k,"-")))
- df1 = pd.DataFrame(df_data)
- df1.to_excel("%s_attach.xlsx"%file,columns=keys)
- def dumpWebSourceNo():
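- # Copies the web-source code/name mapping from the Oracle source table into
- # the test MySQL webSource table, row by row.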
- conn_oracle = getConnection_oracle()
- cursor_oracle = conn_oracle.cursor()
- sql = " select source_encode,source_name from bxkc.T_WEBSOURCENUM_INFO "
- cursor_oracle.execute(sql)
- rows = cursor_oracle.fetchall()
- conn_mysql = getConnection_testmysql()
- cursor_mysql = conn_mysql.cursor()
- for row in rows:
- # parameterised so quotes in source names cannot break the statement
- sql = " insert into webSource(web_source_no,web_source_name) values(%s,%s) "
- print(sql,row[0],row[1])
- cursor_mysql.execute(sql,(row[0],row[1]))
- conn_mysql.commit()
- def exportNzj():
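- # Exports the designed_project (拟在建, proposed-construction) rows that carry
- # docids to Excel, paging through the index 100 rows at a time.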
- # filename = "../data/重复公告.xlsx"
- # df = pd.read_excel(filename)
- ots_client = getConnect_ots()
- columns = ["contacts","covered_area","follows","docids","page_time","progress","project_description","project_follow","project_code","project_name","project_type"]
- def getData(df_data,rows,set_line):
- list_data = getRow_ots(rows)
- for row in list_data:
- item = {}
- _dict = row
- set_dict_item(item,"docids",_dict.get("docids",""))
- set_dict_item(item,"contacts",_dict.get("contacts",""))
- set_dict_item(item,"covered_area",_dict.get("covered_area",""))
- set_dict_item(item,"follows",_dict.get("follows",""))
- set_dict_item(item,"project_type",_dict.get("project_type",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item(item,"page_time",_dict.get("page_time",""))
- set_dict_item(item,"progress",_dict.get("progress",""))
- set_dict_item(item,"project_description",_dict.get("project_description",""))
- set_dict_item(item,"project_follow",_dict.get("project_follow",""))
- set_dict_item(item,"project_code",_dict.get("project_code",""))
- set_dict_item(item,"project_name",_dict.get("project_name",""))
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- df_data = {}
- bool_query = BoolQuery(must_queries=[ExistsQuery("docids")])
- rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("id",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- set_line = set()
- _count = len(rows)
- getData(df_data,rows,set_line)
- while next_token:
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- getData(df_data,rows,set_line)
- _count += len(rows)
- df1 = pd.DataFrame(df_data)
- df1.to_excel("../data/2021-03-31_拟在建数据导出1.xlsx",columns=list_df_columns)
- def turn_status():
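- # Queues the spreadsheet's docids into the MySQL turn_status table (with the
- # usual partitionkey = docid%500+1) so their status can be flipped downstream.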
- df = pd.read_excel("../data/欧科自然资源5w以上数据.xlsx")
- conn = getConnection_testmysql()
- cursor = conn.cursor()
- for docid in df["公告id"]:
- partitionkey = int(docid)%500+1
- sql = " insert into turn_status(partitionkey,docid) values(%d,%d)"%(partitionkey,docid)
- cursor.execute(sql)
- conn.commit()
- def attachBidding_budget():
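- # Backfills analysis_r2.bidding_budget in MySQL from the project2 OTS table,
- # matched by docid, using pooled connections and 30 worker threads.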
- conn_mysql = getConnection_testmysql()
- cursor = conn_mysql.cursor()
- sql = "select docid from analysis_r2 where bidding_budget=''"
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- cursor.execute(sql)
- rows = cursor.fetchmany(10000)
- while(rows):
- for row in rows:
- task_queue.put(row[0])
- rows = cursor.fetchmany(10000)
- pool_mysql = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_testmysql)
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- def _handle(item,result_queue,pool_mysql,pool_ots):
- ots_client = pool_ots.getConnector()
- bool_query = BoolQuery(must_queries=[TermQuery("docids",item)])
- rows, next_token, total_count, is_all_succeed = ots_client.search("project2", "project2_index",
- SearchQuery(bool_query , limit=1, get_total_count=True),
- ColumnsToGet(["bidding_budget"],return_type=ColumnReturnType.SPECIFIED))
- list_dict = getRow_ots(rows)
- if len(list_dict)>0:
- conn = pool_mysql.getConnector()
- cursor = conn.cursor()
- sql = " update analysis_r2 set bidding_budget='%s' where docid=%d"%(str(list_dict[0].get("bidding_budget","")),item)
- cursor.execute(sql)
- conn.commit()
- pool_mysql.putConnector(conn)
- pool_ots.putConnector(ots_client)
- mt = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_mysql=pool_mysql,pool_ots=pool_ots)
- mt.run()
- def debug_documentMerge():
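- # Sanity check for the merge log: for each merged docid group, verifies that a
- # project2 row containing all of those docids still exists and prints the
- # groups that can no longer be found.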
- conn = getConnection_testmysql()
- cursor = conn.cursor()
- sql = "select merge_docids from project_group_final_log "
- cursor.execute(sql)
- task_queue = queue.Queue()
- for row in cursor.fetchall():
- task_queue.put(row[0])
- print(task_queue.qsize())
- def _handle(item,result_queue,pool_ots):
- ots_client = pool_ots.getConnector()
- list_docids = item.split(",")
- must_q = []
- for _docid in list_docids:
- must_q.append(TermQuery("docids",_docid))
- bool_query = BoolQuery(must_queries=must_q)
- rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
- SearchQuery(bool_query,limit=1,get_total_count=True),
- ColumnsToGet(column_names=["docids"],return_type=ColumnReturnType.SPECIFIED))
- if total_count==0:
- print(item)
- result_queue.put(item)
- pool_ots.putConnector(ots_client)
- result_queue = queue.Queue()
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- mt = MultiThreadHandler(task_queue,_handle,result_queue,30,pool_ots=pool_ots)
- mt.run()
- while(True):
- try:
- item = result_queue.get(timeout=1)  # a plain blocking get would hang once the queue drains
- print(item)
- except Exception as e:
- print(str(e))
- break
- def signDocument():
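- # Tags each row's segword text with "采购失败"/"招标失败" (procurement/tender
- # failed) when either phrase appears, otherwise "无", and writes a _sign copy.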
- filename = "C:\\Users\\Administrator\\Desktop\\中标信息1.xlsx"
- sign_filename = "%s_sign.xlsx"%filename
- df = pd.read_excel(filename)
- df_data = {"sign":[]}
- for item in df["segword"]:
- content = re.sub(r"\s+","",item)
- _find = re.search("(?P<key>采购失败|招标失败)",content)
- if _find is not None:
- df_data["sign"].append(_find.groupdict().get("key"))
- else:
- df_data["sign"].append("无")
- df1 = pd.DataFrame(df_data)
- df1.to_excel(sign_filename)
- def exportWin_tenderer_count():
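- # For each company in the hard-coded list below, counts the announcements
- # since 2020-01-01 where it appears as win_tenderer and exports the totals.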
- ots_client = getConnect_ots()
- str_enter = '''
- 红旗渠建设集团有限公司
- 河南瑞华建筑集团有限公司
- 林州瑞达工程管理有限公司
- 河南鸿盛建筑工程有限公司
- 天一建设发展有限公司
- 河南省大成建设工程有限公司
- 中润昌弘建工集团有限公司
- 河南省中创建筑工程有限公司
- 河南锦达建设有限公司
- 林州宏基建筑工程有限公司
- 河南富世建筑工程有限公司
- 中恒方圆建筑工程有限公司
- 河南华隆建设工程有限公司
- 河南昊锦建设集团有限公司
- 河南新隆建工集团有限公司
- 中城华安建设集团有限公司
- 河南恒通公路桥梁建设有限公司
- 林州二建集团建设有限公司
- 河南华安建设集团有限公司
- 河南裕鸿建筑工程有限公司
- 中商建投建设有限公司
- 河南鑫利恒工程有限公司
- 林州市永盛建筑有限公司
- 林州市顺鑫建筑有限公司
- 中水京林建设有限公司
- 河南宏海建设有限公司
- 河南宏岳建设有限公司
- 河南元筑实业有限公司
- 河南基兆建筑工程有限公司
- 林州金瑞建筑工程有限公司
- 林州建工集团有限公司
- 河南万融建筑工程有限公司
- 林州东风建设有限公司
- 河南鸿泰建筑安装有限公司
- 河南源泰建筑有限公司
- 河南优德建筑工程有限公司
- 安阳鸿盛建设劳务有限公司
- 河南省安阳市安装工程有限责任公司
- 河南港城建设工程有限公司
- 河南天汇建筑工程有限公司
- 河南省惠浦建设发展有限公司
- 林州市建筑工程有限公司
- 河南正天建筑工程有限公司
- 河南颂邦建筑工程有限公司
- 林州市华源建设有限公司
- 河南中投建设有限公司
- 林州华林建筑劳务有限公司
- 河南基祥建设工程有限公司
- 河南文水水电工程有限公司
- 林州兴业建筑工程有限公司
- 河南中州建筑有限公司
- 河南省佳禾园林建设有限公司
- 林州万亚建筑工程有限公司
- 河南林正建设工程有限公司
- 河南鼎兴建设工程有限公司
- 河南平原建工集团有限公司
- 河南林九建设工程有限公司
- 林州市三才建筑工程有限公司
- 安阳建设(集团)有限责任公司
- 盛世恒达建设有限公司
- 河南城洲建设工程有限公司
- 河南国埔建筑工程有限公司
- 中创市政建设发展有限公司
- 河南正祥建筑工程有限公司
- 河南宏九建筑工程有限公司
- 河南金阳建筑工程有限公司
- 河南天容建设工程有限责任公司
- 河南聚宇建筑工程有限公司
- 河南瑞旗建设工程有限公司
- 河南利盛建设工程有限公司
- 林州四海建设有限公司
- 林州市建工城建集团有限公司
- 河南众佑建设工程有限公司
- 河南德诚建设有限公司
- 河南景华建筑工程有限公司
- 河南华江建筑工程有限公司
- 林州永丰建设集团有限公司
- 林州福东建设工程有限公司
- 河南恒森建筑工程有限公司
- 河南朝泓建设工程有限公司
- 河南润京建设有限公司
- 林州市红旗渠公路工程有限公司
- 林州中宇建设工程有限公司
- 河南长锦建设工程有限公司
- 河南汇商建筑工程有限公司
- 河南省豫鹤建设工程有限公司
- 河南江城建筑工程有限公司
- 中海华祥建设发展有限公司
- 河南宁中路桥建筑有限公司
- 河南天河建设工程有限公司
- 林州市路桥建筑工程有限公司
- 河南省中北建设有限公司
- 河南汇亿建筑工程有限公司
- 河南金帝建筑安装有限公司
- 河南省望安建筑工程有限公司
- 泰欣建设有限公司
- 河南筑鑫建筑工程有限公司
- 元熙建设工程有限公司
- 旭隆建设集团有限公司
- 河南省城控建工集团有限公司
- 河南晨丰建筑工程有限公司
- 河南嘉丰建设有限公司
- 林州市合众建筑劳务有限公司
- 河南金瓦刀建筑劳务有限公司
- 河南中实建筑工程有限公司
- 畅通路桥工程建设有限责任公司
- 河南军恒建设有限公司
- 中钊建设集团有限公司
- 河南德宁建设集团有限公司
- 林州兴鸿建筑工程有限公司
- 林州市明泽建筑工程有限公司
- 河南紫光建筑工程有限公司
- 河南誉天建筑工程有限公司
- 林州景丰建筑劳务有限公司
- 河南江瀚建筑劳务有限公司
- 河南弘之昌建筑工程有限公司
- 河南祥泰钻井工程有限公司
- 河南迅阳建筑劳务有限公司
- 河南嘉成建筑工程有限公司
- 河南兴锦建设工程有限公司
- 河南邦坤建设工程有限公司
- 河南锦毅市政工程建筑有限公司
- 河南广益建筑工程有限公司
- 河南创胜建筑工程有限公司
- 河南勤铭建筑工程有限公司
- 河南铭锋建设工程有限公司
- 平源建设有限公司
- 河南隆通建筑工程有限公司
- 河南省基本建设有限公司
- 河南丰茂建筑劳务有限公司
- 河南城安建筑工程有限公司
- 林州市富源建筑劳务有限公司
- 德方建设有限公司
- 河南泰联建筑工程有限公司
- 河南新建投工程有限公司
- 河南省鲁班建工集团有限公司
- 林州方超建筑劳务有限公司
- 林州市采桑建筑劳务输出有限公司
- 河南省仁昱建筑工程有限公司
- 河南鸾林建设工程有限公司
- 宜民建设集团有限公司
- 林州聚兴建筑工程有限公司
- 河南省聚国建筑工程有限公司
- 林州市大东建筑劳务有限公司
- 河南欣东劳务有限公司
- 中建润德景观建筑工程有限公司
- 河南辰宇建设工程有限公司
- 号东建设工程有限公司
- 河南润北建筑工程有限公司
- 河南邦昊建设工程有限公司
- 林州市建设投资有限责任公司
- 林州市太行建设工程有限公司
- 河南峡安建筑工程有限公司
- 河南安疆建筑工程有限公司
- 河南淇河建设工程有限公司
- 河南晶品建设有限公司
- 河南翔固建筑工程有限公司
- 纵横九州路桥建设有限公司
- 河南青林建筑工程有限公司
- 合久建设有限公司
- 河南明昊建筑工程有限公司
- 河南滨河建设工程有限公司
- 河南群腾建筑工程有限公司
- 河南隆亨建筑工程有限公司
- 骏达建设有限公司
- 河南仁安建设工程有限公司
- 河南旻尚园林建筑工程有限公司
- 河南省匡正建设工程有限公司
- 河南金凡建筑工程有限公司
- 河南佰丰建筑工程有限公司
- 德普建设有限公司
- 国润新天地工程技术有限公司
- 中潮建设发展有限公司
- 河南捷正建筑工程有限公司
- 林州百万工匠建筑劳务有限公司
- 河南祥彬建筑工程有限公司
- 河南林祥工程建设有限公司
- 河南唐尧建筑劳务有限公司
- 河南汇祥建设有限公司
- 河南友信建设有限公司
- 林州市鼎昇建筑工程有限公司
- 林州市富兴建筑劳务有限公司
- 林州厚德建筑劳务有限公司
- 河南振亚工程建设有限公司
- 河南英茂建筑工程有限公司
- 河南丰岩建设工程有限公司
- 林州市昌都建筑工程有限公司
- 林州四建建筑工程有限公司
- 林州和兴建筑劳务有限公司
- 林州市鸿升建筑工程有限公司
- 河南润泰建设工程有限公司
- 河南鑫路通建筑劳务有限公司
- 河南信守建筑劳务有限公司
- 林州安达鸿昌建筑劳务有限公司
- 河南意达建设有限公司
- 河南金穗来建筑工程有限公司
- 河南东风建筑工程有限公司
- 河南筑胜建筑劳务有限公司
- 河南民润建筑工程有限公司
- 林州市中锦路桥建设工程有限公司
- 林州一建建筑工程有限公司
- 林州市宏瑞建筑劳务有限公司
- 林州鸿恩建筑劳务有限公司
- 河南晟元建筑工程有限公司
- 中国建筑第六工程局有限公司
- 河南筑泰建筑工程有限公司
- 河南省亚建建筑工程有限公司
- 河南辰弘建筑工程有限公司
- 河南先创建筑工程有限公司
- 林豫建工集团有限公司
- 河南省盛民建筑工程有限公司
- 河南泓发市政工程有限公司
- 河南帝恩建筑劳务有限公司
- 河南天泉建设工程有限公司
- 河南恒升工程建设有限公司
- 林州市浩远电力建筑工程有限公司
- 河南友瑞建筑工程有限公司
- 河南冠州路桥工程有限公司
- 三角鼎建设工程有限公司
- 河南富坤建筑工程有限公司
- 林州市恒源建筑工程有限公司
- 河南广汇建筑工程有限公司
- 河南隆豫建设有限公司
- 林州市九洲工程劳务有限公司
- 林州瑜辉建筑工程有限公司
- 河南福恩建筑工程有限公司
- 河南通盛路桥建设有限公司
- 河南央泰建设工程有限公司
- 林州市红旗渠公路养护工程有限公司
- 林州大兴建设工程有限公司
- 河南锐丰建设工程有限公司
- 林州市中泰建筑劳务有限公司
- 林州成业建筑工程有限公司
- 河南建创建筑工程有限公司
- 河南宏兴建设工程有限公司
- 河南隆鼎建筑工程有限公司
- 林州市天罡建筑劳务有限公司
- 汇聚建设发展有限公司
- 中铁中城工程有限公司
- 河南景天建筑劳务有限公司
- 林州蒙建建设工程有限公司
- 富华建设工程有限公司
- 河南殿轩建筑劳务有限公司
- 河南瑞通建设工程有限公司
- 林州金桥劳务工程有限公司
- 河南省景隆实业有限公司
- 河南升洲建筑工程有限公司
- 河南里程建筑劳务有限公司
- 林州市润景建设工程有限公司
- 河南巨坤建筑工程有限公司
- 河南九牛建设劳务有限公司
- 吉修建设工程有限公司
- 河南图润建筑工程有限公司
- 河南鼎鑫建筑劳务有限公司
- 河南港航建设工程有限公司
- 河南省盛飞建设工程有限公司
- 林州市兴义建筑劳务有限公司
- 河南秉程建筑工程有限公司
- 河南硕亚水电路桥工程有限公司
- 河南科才建筑劳务有限公司
- 河南荣泰建筑安装工程有限公司
- 河南省天丰建筑工程有限公司
- 河南方元建筑工程有限公司
- 恒上建设有限公司
- 河南省德信建筑工程有限公司
- 河南诚宸建设工程有限公司
- 河南置信建筑工程有限公司
- 河南省鑫河建设有限公司
- 河南成兴建设工程有限公司
- 林州中港建筑工程有限公司
- 河南富春建设工程有限公司
- 中科豫资建设发展有限公司
- 河南京都建筑安装有限公司
- 安阳市宇豪爆破工程有限公司
- 河南华特建筑工程有限公司
- 河南颍淮建工有限公司
- 林州市八建工程有限公司
- 河南展辉建筑工程有限公司
- 河南中博建筑有限公司
- 河南方圆建设有限公司
- 河南大鼎建筑工程有限公司
- 林州中天建设有限公司
- 河南久东建筑工程有限公司
- 河南九一建设工程有限公司
- 九州水文建设集团有限公司
- 河南省建安防水防腐工程有限公司
- 中建宏图建设发展有限公司
- 筑宇建设有限公司
- 林州市宏图建设工程有限公司
- 河南林润建设工程有限公司
- 嘉泰建设发展有限公司
- 河南丰茂建筑安装工程有限公司
- 河南万泰建设工程有限公司
- 林州市红旗渠市政工程有限公司
- 林州建总建筑工程有限公司
- 河南聚之祥建设有限公司
- 河南鼎之信建设工程有限公司
- 河南省华瑞建设工程有限公司
- 河南世光电力工程有限公司
- 河南地远建筑工程有限公司
- 河南鑫品建筑工程有限公司
- 河南省东旗建筑工程有限公司
- 润华建设有限公司
- 林州富民建筑劳务有限公司
- 林州市晨诚建筑劳务有限公司
- 河南万胜建设有限公司
- 河南龙磐建筑工程有限公司
- 河南顺昌建筑劳务有限公司
- 林州恒瑞建设工程有限公司
- 河南大成建设劳务有限公司
- 河南大一建筑劳务有限公司
- 河南盛威建筑工程有限公司
- 河南坤之宇建筑工程有限公司
- 众信电力工程有限公司
- 河南昱佛建筑工程有限公司
- 河南淇源建筑工程有限公司
- 林州凤宝建筑安装有限公司
- 河南中发岩土工程有限公司
- 河南中都建设工程有限公司
- 河南祥凯建筑工程有限公司
- 河南乐泰建筑工程有限公司
- 林州宏达建筑劳务有限公司
- 河南华盛建设集团有限公司
- 河南凯通建设工程有限公司
- 国腾路桥工程有限公司
- 中建方达建设工程有限公司
- 河南省天都建设工程有限公司
- 昌隆建设工程有限公司
- 河南洹上村园林绿化工程有限公司
- 河南双锦建设工程有限公司
- 河南子丰市政工程有限公司
- 林州首创建筑工程有限公司
- 河南众鑫建筑工程有限公司
- 河南宁崴建筑工程有限公司
- 林州市航安建筑劳务有限公司
- 林州益成建设工程有限公司
- 林州市昌弘建筑工程有限公司
- 河南正耀建设有限公司
- 河南鑫鹏建设工程有限公司
- 林州恒泰建筑工程有限公司
- 林竣建设有限公司
- 河南朝众建筑工程有限公司
- 林州科鸿建筑工程有限公司
- 东辰建设发展有限公司
- 河南创新新能源科技有限公司
- 河南省永业建筑工程有限公司
- 林州市煜凯建筑工程有限公司
- 宝鼎建设工程有限公司
- 林州市航安建筑工程有限公司
- 河南业展建设工程有限公司
- 河南联竣建筑工程有限公司
- 河南聚超建筑工程有限公司
- 林州远方电力工程有限公司
- 河南蒙寅建筑劳务有限公司
- 方元建筑劳务有限公司
- 龙兴建设工程有限公司
- 河南春谦建设工程有限公司
- 河南正博公路工程有限公司
- 林州市汇鑫安装工程有限公司
- 林州市祥隆劳务有限公司
- 河南胜杰建筑工程有限公司
- 河南恩普建筑工程有限公司
- 河南港津建筑工程有限公司
- 河南昌明建筑工程有限公司
- 中豫城控建设集团有限公司
- 林州晨宇建设工程有限公司
- 河南豫柯建筑工程有限公司
- 河南捷润建筑工程有限公司
- 中方通建设工程有限公司
- 河南多果建筑工程有限公司
- 河南尚伟建筑工程有限公司
- 林州新航程建筑工程有限公司
- 河南金华建筑工程有限公司
- 国云工程技术有限公司
- 河南路威路桥工程有限公司
- 林州中盛建设工程有限公司
- 林州市恒基建设有限公司
- 河南润恒建筑工程有限公司
- 河南华安水利工程有限公司
- 中城易通建设发展有限公司
- 河南浚洲建筑工程有限公司
- 林州市锦晟建筑劳务有限公司
- 河南省北安建筑工程有限公司
- 林州泰岳建设工程有限公司
- 河南联洋建筑工程有限公司
- 河南港大市政建筑工程有限公司
- 林州东盛建筑劳务有限公司
- 河南省天鉴建设工程有限公司
- 河南瑞凝建筑工程有限公司
- 林州市东瑞建筑劳务有限公司
- 河南众达建筑劳务有限公司
- 河南省帝增建筑工程有限公司
- 河南省升灿建筑工程有限公司
- 河南苑景建筑劳务分包有限公司
- 林州众立建设工程有限公司
- 红旺建筑工程有限公司
- 林州市圣兴建筑劳务有限公司
- 林州诚林建筑劳务有限公司
- 林州建工劳务有限公司
- 河南巨业建筑工程有限公司
- 中科华夏建设开发有限公司
- 君晟建筑工程有限公司
- 郑州新动力建筑劳务分包有限公司
- 河南省福德建筑工程有限公司
- 林州源大建筑工程有限公司
- 河南大瑞园林建设有限公司
- 河南秋禾建筑劳务有限公司
- 河南腾翔建筑工程有限公司
- 河南天之华建设工程有限公司
- 河南祥和建筑安装有限公司
- 河南省鼎文建设工程有限公司
- 河南周城建设发展有限公司
- 河南庆泰建筑工程有限公司
- 中科信合建设发展有限公司
- 林州恒隆建设工程有限公司
- 河南省力恒建筑工程有限公司
- 林州市四季青绿化有限责任公司
- 林州市景盛建筑工程有限公司
- 河南建基建设工程有限公司
- 河南宝凯建筑工程有限公司
- 林州市四合建筑劳务有限公司
- 河南和耀建筑工程有限公司
- 林州市凯达建筑劳务有限公司
- 林州市恒信建筑劳务有限公司
- 开翔建设工程有限公司
- 河南省新创达建设工程有限公司
- 林州鑫龙建筑工程有限公司
- 河南省昌博建筑工程有限公司
- 河南君利泰建筑工程有限公司
- 林州杏林建筑工程有限公司
- 河南千禧建设工程有限公司
- 中建诚正建筑工程有限公司
- 河南省聚千建筑工程有限公司
- 林州海之鸿建筑工程有限公司
- 河南振鼎建筑工程有限公司
- 林州方成建筑劳务有限公司
- 河南众众建设工程有限公司
- 林州市万润建筑劳务有限公司
- 启创建设工程有限公司
- 河南子明建筑工程有限公司
- 安阳市兴鼎路桥工程有限公司
- 河南智擎建筑劳务有限公司
- 河南鼎平市政工程有限公司
- 林州宏阳建筑工程有限公司
- 河南豫泰建筑工程有限公司
- 林州市鸿浩建筑劳务有限公司
- 林州市锦华建筑工程有限公司
- 河南瑞锋建设有限公司
- 河南欧信建筑劳务有限公司
- 林州市中兴建筑劳务有限公司
- 林州市大德建设工程有限公司
- 河南华文建设有限公司
- 河南凌焜建筑工程有限公司
- 河南安居建设有限公司
- 林州鲲鹏建筑工程有限公司
- 林州经纬建筑工程有限公司
- 林州祥川建筑工程有限公司
- 林州市鑫淼建筑劳务有限公司
- 河南祥泰路桥有限公司
- 景祥建设工程有限公司
- 河南省兴华建安工程有限公司
- 河南古森建筑劳务有限公司
- 平祥建设工程有限公司
- 河南大博建设工程有限公司
- 河南华普建设工程有限公司
- 河南东邦建设工程有限公司
- 卓冠建设工程有限公司
- 河南品瑞建筑工程有限公司
- 河南宝金建设工程有限公司
- 中城鑫邦建设有限公司
- 河南省鸿运建设工程有限公司
- 林州明奥建筑工程有限公司
- 河南金手指建设工程有限公司
- 林州市弘顺建筑劳务有限公司
- 林州市林海建筑劳务有限公司
- 河南艺兆市政工程有限公司
- 林州誉峰建筑工程有限公司
- 河南卓骏建筑工程有限公司
- 林州众成建筑工程有限公司
- 河南城通市政工程有限公司
- 林州市晋源建筑工程有限公司
- 河南飞越建筑工程有限公司
- 林州鑫泰建筑工程有限公司
- 林州市太行建筑劳务有限公司
- 河南筑丰建设发展有限公司
- 林州一帆建筑劳务有限公司
- 林州宏久建筑工程有限公司
- 林州市盛祥建筑劳务有限公司
- 河南黎润建设工程有限公司
- 林州市永安建筑劳务有限公司
- 河南省长江建设实业有限公司
- 河南腾润建设工程有限公司
- 河南国梁建设工程有限公司
- 河南诚聚建筑工程有限公司
- 河南德邦市政工程有限公司
- 河南安德建设工程有限公司
- 河南森川建筑工程有限公司
- 林州市顺通公路工程有限公司
- 河南领邦建筑工程有限公司
- 河南博兴建设工程有限公司
- 东泽消防工程有限公司
- '''
- list_enter = []
- for _p in re.split(r"\s+",str_enter):
- if _p.strip()=="":
- continue
- list_enter.append({"name":_p.strip()})
- def _handle(item,result_queue,pool_ots):
- ots_client = pool_ots.getConnector()
- try:
- bool_query = BoolQuery(must_queries=[
- NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item["name"]))
- ,RangeQuery("status",201,300,include_lower=True,include_upper=True)
- ,RangeQuery("page_time","2020-01-01")
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=1, get_total_count=True),
- ColumnsToGet(['docid'], ColumnReturnType.SPECIFIED))
- item["total_count"] = total_count
- # bool_query = BoolQuery(must_queries=[
- # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item["name"]))
- # ,RangeQuery("status",201,300,include_lower=True,include_upper=True)
- # ,NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_bid_price",0,1000000,include_upper=True))
- # ])
- #
- # rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- # SearchQuery(bool_query, limit=1, get_total_count=True),
- # ColumnsToGet(['docid'], ColumnReturnType.SPECIFIED))
- # item["l_total_count"] = total_count
- except Exception as e:
- pass
- pool_ots.putConnector(ots_client)
- pool_ots = ConnectorPool(init_num=30,max_num=40,method_init=getConnect_ots)
- task_queue = queue.Queue()
- for item in list_enter:
- task_queue.put(item)
- mt = MultiThreadHandler(task_queue,_handle,None,30,pool_ots=pool_ots)
- mt.run()
- df_data = {}
- for item in list_enter:
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s.xls"%getCurrent_date("%Y-%m-%d_%H%M%S"))
- from bs4 import BeautifulSoup
- def downloadAttach(_url,_path):
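- # Streams a single attachment url to disk; non-200 responses and request
- # errors are only logged, not retried.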
- try:
- result = requests.get(_url,stream=True,timeout=20)
- if result.status_code==200:
- with open(_path,"wb") as f:
- f.write(result.content)
- else:
- log("download failed with code %d of url:%s"%(result.status_code,_url))
- except Exception:
- log("download failed of url:%s"%(_url))
- def extract_pageAttachments(_html):
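- # Scans <a> and <img> tags for links whose text or url carries a known file
- # suffix and returns them as {fileLink, fileTitle} dicts, skipping links that
- # point back to bidizhaobiao.com itself.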
- fileSuffix = [".zip", ".rar", ".tar", ".7z", ".wim", ".docx", ".doc", ".xlsx", ".xls", ".pdf", ".txt", ".hnzf", ".bmp", ".jpg", ".jpeg", ".png", ".tif", ".swf"]
- _soup = BeautifulSoup(_html,"lxml")
- list_a = _soup.find_all("a")
- list_img = _soup.find_all("img")
- page_attachments = []
- for _a in list_a:
- _text =_a.get_text()
- _url = _a.attrs.get("href","")
- if _url.find("http://www.bidizhaobiao.com")>=0:
- continue
- is_attach = False
- for suf in fileSuffix:
- if _text.find(suf)>=0 or _url.find(suf)>=0:
- is_attach = True
- if is_attach:
- page_attachments.append({"fileLink":_url,"fileTitle":_text})
- for _a in list_img:
- _text =_a.get_text()
- _url = _a.attrs.get("src","")
- if _url.find("http://www.bidizhaobiao.com")>=0:
- continue
- is_attach = False
- for suf in fileSuffix:
- if _text.find(suf)>=0 or _url.find(suf)>=0:
- is_attach = True
- if is_attach:
- page_attachments.append({"fileLink":_url,"fileTitle":_text})
- return page_attachments
- def exportDocument_attachment():
- ots_client = getConnect_ots()
- bool_query = BoolQuery(must_queries=[TermQuery("docid",165528701)])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
- columns_to_get=ColumnsToGet(["dochtmlcon"],ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- for _data in list_data:
- extract_pageAttachments(_data["dochtmlcon"])
- def transUUid():
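- # Copies announcement ids (uuid + page_time) for 2021-06-01..2021-08-31 from
- # the Oracle source tables into MySQL fix_document, 10000 rows per batch.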
- conn_oracle = getConnection_oracle()
- cursor = conn_oracle.cursor()
- tables = ['T_ZHAO_BIAO_GONG_GAO','T_ZHONG_BIAO_XIN_XI']
- conn_mysql = getConnection_testmysql()
- cursor_mysql = conn_mysql.cursor()
- for _t in tables:
- sql = " select id,page_time,'%s' from bxkc.%s where page_time>='%s' and page_time<='%s' order by page_time "%(_t,_t,"2021-06-01","2021-08-31")
- print(sql)
- cursor.execute(sql)
- _count = 0
- while(True):
- insert_sql = "insert into fix_document(uuid,page_time,table_name) values"
- rows = cursor.fetchmany(10000)
- if not rows:
- break
- _count += len(rows)
- print(_count)
- for row in rows:
- _uuid = row[0]
- page_time = row[1]
- table_name = row[2]
- insert_sql += "('%s','%s','%s'),"%(_uuid,page_time,table_name)
- insert_sql = insert_sql[:-1]
- cursor_mysql.execute(insert_sql)
- conn_mysql.commit()
- def fix_document():
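- # Pushes the reviewed fix_document_final rows from MySQL back to Oracle,
- # batching 1000 inserts into a single anonymous begin...end block per round.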
- conn_oracle = getConnection_oracle()
- cursor_oracle = conn_oracle.cursor()
- conn_mysql = getConnection_testmysql()
- cursor_mysql = conn_mysql.cursor()
- sql = "select uuid,page_time,table_name from fix_document_final where page_time>='2021-06-24' "
- cursor_mysql.execute(sql)
- _count = 0
- while True:
- rows = cursor_mysql.fetchmany(1000)
- if not rows:
- break
- _count += len(rows)
- print(_count)
- insert_sql = ""
- for row in rows:
- _uuid = row[0]
- page_time = row[1]
- table_name = row[2]
- insert_sql += " insert into BXKC.fix_document_final(id,page_time,TABLENAME) values('%s','%s','%s');"%(_uuid,page_time,table_name)
- insert_sql = "begin %s end;"%(insert_sql)
- cursor_oracle.execute(insert_sql)
- conn_oracle.commit()
- def exportDocument_forRecommen():
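- # For each company in the recommendation csv, resolves the recommended docids
- # per recommendation path and exports the document details to Excel.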
- filename = "../data/推荐 (1).csv"
- df = pd.read_csv(filename,encoding="GBK")
- ots_client = getConnect_ots()
- columns = ["province","city","page_time","doctitle","product"]
- current_date = getCurrent_date("%Y-%m-%d")
- adict_data = []
- _index = 0
- for company,json_docid in zip(df["company"][:10000],df["json_docid"][:10000]):
- _index += 1
- _province = ""
- # consumed, return_row, next_token = ots_client.get_row("enterprise",[("name",company)],columns_to_get=["province"])
- # dict_k = getRow_ots_primary(return_row)
- # _province = dict_k.get("province","")
- print("序号:%d,%s,%s"%(_index,company,_province))
- dict_recommen = json.loads(json_docid)
- for str_way,str_docid in dict_recommen.items():
- should_q = []
- for _docid in str_docid.split(","):
- should_q.append(TermQuery("docid",_docid))
- bool_query = BoolQuery(must_queries=[
- # TermQuery("province",_province)
- # ,RangeQuery("page_time",timeAdd(current_date,-7),current_date,True,True)
- # ,
- BoolQuery(should_queries=should_q)]
- )
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,get_total_count=True,limit=100),
- ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- adict_row = getRow_ots(rows)
- for dict_row in adict_row:
- dict_item = dict()
- set_dict_item(dict_item,"公司名称",company)
- set_dict_item(dict_item,"推荐路径",str_way)
- set_dict_item(dict_item,"公告id",dict_row.get("docid",""))
- set_dict_item(dict_item,"省份",dict_row.get("province",""))
- set_dict_item(dict_item,"城市",dict_row.get("city",""))
- set_dict_item(dict_item,"page_time",dict_row.get("page_time",""))
- set_dict_item(dict_item,"doctitle",dict_row.get("doctitle",""))
- set_dict_item(dict_item,"product",dict_row.get("product",""))
- adict_data.append(dict_item)
- dict_data = {}
- for dict_item in adict_data:
- for k in list_df_columns:
- if k not in dict_data:
- dict_data[k] = []
- dict_data[k].append(dict_item.get(k,""))
- df1 = pd.DataFrame(dict_data)
- df1.to_excel("../data/%s_推荐.xlsx"%getCurrent_date("%Y-%m-%d_%H%M%S"),columns=list_df_columns)
- def exportDocument_by_days(page_time):
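- # Exports every document of the given page_time to Excel; getData below
- # flattens each OTS row (including the sub_docs_json winner fields) into
- # named columns in insertion order.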
- dict_channel = getDict_docchannel()
- ots_client = getConnect_ots()
- filename = "供货贷含[建筑]企业名单.xlsx"
- df = pd.read_excel(filename)
- bool_query = BoolQuery(must_queries=[TermQuery("page_time",page_time),
- # RangeQuery("status",201,301),
- ])
- # columns = ["doctitle","docchannel","product","province","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","original_docchannel","detail_link"]
- columns = ["doctitle","docchannel","product","bidway","moneysource","province","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","original_docchannel","detail_link","page_attachments","service_time"]
- dict_channel = getDict_docchannel()
- def hidePhone(phone):
- if phone is None or phone=="":
- return ""
- return "*"*(len(phone)-4)+phone[-4:]
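- # e.g. hidePhone("13812345678") -> "*******5678"; inputs of 4 characters or
- # fewer pass through unchanged because "*"*(len-4) is empty.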
- def getData(df_data,rows,set_line,list_keyword,set_columns,df_columns):
- list_data = getRow_ots(rows)
- for row in list_data:
- item = {}
- _dict = row
- set_dict_item_columns(set_columns,df_columns,item,"docid",_dict.get("docid",""))
- set_dict_item_columns(set_columns,df_columns,item,"公告标题",_dict.get("doctitle",""))
- # set_dict_item_columns(set_columns,df_columns,item,"公告内容",_dict.get("doctextcon",""))
- set_dict_item_columns(set_columns,df_columns,item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
- # set_dict_item_columns(set_columns,df_columns,item,"关键词",",".join(list(set(re.findall("|".join(list_keyword),_dict.get("doctextcon",""))))))
- set_dict_item_columns(set_columns,df_columns,item,"产品",_dict.get("product",""))
- set_dict_item_columns(set_columns,df_columns,item,"省份",_dict.get("province",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item_columns(set_columns,df_columns,item,"资金来源",_dict.get("moneysource",""))
- set_dict_item_columns(set_columns,df_columns,item,"招标方式",_dict.get("bidway",""))
- set_dict_item_columns(set_columns,df_columns,item,"服务期限",_dict.get("service_time",""))
- set_dict_item_columns(set_columns,df_columns,item,"城市",_dict.get("city",""))
- set_dict_item_columns(set_columns,df_columns,item,"区县",_dict.get("district",""))
- set_dict_item_columns(set_columns,df_columns,item,"发布时间",_dict.get("page_time",""))
- set_dict_item_columns(set_columns,df_columns,item,"创建时间",_dict.get("crtime",""))
- set_dict_item_columns(set_columns,df_columns,item,"行业一级分类",_dict.get("industry",""))
- set_dict_item_columns(set_columns,df_columns,item,"行业二级分类",_dict.get("info_type",""))
- # set_dict_item_columns(set_columns,df_columns,item,"uuid",_dict.get("uuid"))
- # set_dict_item_columns(set_columns,df_columns,item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
- set_dict_item_columns(set_columns,df_columns,item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
- set_dict_item_columns(set_columns,df_columns,item,"原网公告类别",dict_channel.get(_dict.get("original_docchannel",""),""))
- set_dict_item_columns(set_columns,df_columns,item,"status","正常" if _dict.get("status",201) <=300 else "去重")
- set_dict_item_columns(set_columns,df_columns,item,"detail_link",_dict.get("detail_link"))
- set_dict_item_columns(set_columns,df_columns,item,"web_source_no",_dict.get("web_source_no",""))
- set_dict_item_columns(set_columns,df_columns,item,"web_source_name",_dict.get("web_source_name",""))
- set_dict_item_columns(set_columns,df_columns,item,"项目名称",_dict.get("project_name",""))
- set_dict_item_columns(set_columns,df_columns,item,"项目编号",_dict.get("project_code",""))
- set_dict_item_columns(set_columns,df_columns,item,"招标单位",_dict.get("tenderee",""))
- set_dict_item_columns(set_columns,df_columns,item,"招标联系人",_dict.get("tenderee_contact",""))
- set_dict_item_columns(set_columns,df_columns,item,"招标联系人电话",_dict.get("tenderee_phone",""))
- set_dict_item_columns(set_columns,df_columns,item,"代理单位",_dict.get("agency",""))
- set_dict_item_columns(set_columns,df_columns,item,"代理联系人",_dict.get("agency_contact",""))
- set_dict_item_columns(set_columns,df_columns,item,"代理联系人电话",_dict.get("agency_phone",""))
- set_dict_item_columns(set_columns,df_columns,item,"url","http://www.bidizhaobiao.com/info-%d.html"%(_dict.get("docid","")))
- set_dict_item_columns(set_columns,df_columns,item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
- set_dict_item_columns(set_columns,df_columns,item,"截标时间",_dict.get("time_bidclose",""))
- set_dict_item_columns(set_columns,df_columns,item,"page_attachments",_dict.get("page_attachments","[]"))
- sub_docs_json = _dict.get("sub_docs_json")
- set_tenderer = set()
- if sub_docs_json is not None:
- docs = json.loads(sub_docs_json)
- docs.sort(key=lambda x:x.get("win_bid_price",0))
- for _doc in docs:
- if "win_tenderer" in _doc:
- set_dict_item_columns(set_columns,df_columns,item,"中标单位",_doc["win_tenderer"])
- if "second_tenderer" in _doc:
- set_dict_item_columns(set_columns,df_columns,item,"第二候选单位",_doc["second_tenderer"])
- set_tenderer.add(_doc.get("second_tenderer"))
- if "third_tenderer" in _doc:
- set_dict_item_columns(set_columns,df_columns,item,"第三候选单位",_doc["third_tenderer"])
- set_tenderer.add(_doc.get("third_tenderer"))
- if "win_tenderee_manager" in _doc:
- set_dict_item_columns(set_columns,df_columns,item,"中标单位联系人",_doc["win_tenderee_manager"])
- if "win_tenderee_phone" in _doc:
- set_dict_item_columns(set_columns,df_columns,item,"中标单位联系电话",_doc["win_tenderee_phone"])
- if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
- set_dict_item_columns(set_columns,df_columns,item,"中标金额",_doc["win_bid_price"])
- if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
- set_dict_item_columns(set_columns,df_columns,item,"招标金额",_doc["bidding_budget"])
- set_dict_item_columns(set_columns,df_columns,item,"入围供应商",",".join(list(set_tenderer)))
- if "第二候选单位" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"第二候选单位","")
- if "第三候选单位" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"第三候选单位","")
- if "招标金额" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"招标金额","")
- if "中标金额" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"中标金额","")
- if "中标单位" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"中标单位","")
- if "中标单位联系人" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"中标单位联系人","")
- if "中标单位联系电话" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"中标单位联系电话","")
- # if item["中标单位"] not in set_enter:
- # continue
- _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
- # if _line in set_line:
- # continue
- # if item["招标金额"]=="":
- # continue
- # set_line.add(_line)
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- df_data = {}
- set_columns = set()
- df_columns = []
- # for name in df["ent_name_real"]:
- # if isinstance(name,str) and name!="":
- # list_should_q = []
- # # list_should_q.append(MatchPhraseQuery("doctextcon",name))
- # # list_should_q.append(MatchPhraseQuery("attachmenttextcon",name))
- # NestedQuery("sub_docs_json","sub_docs_json.win_tenderer",name)
- # bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2018-01-01"),
- # RangeQuery("status",201,301),
- # # BoolQuery(should_queries=list_should_q),
- # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",name))
- # ])
- #
- # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- # SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True),
- # ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- #
- #
- # while True:
- # getData(df_data,rows,set(),"",set_columns,df_columns)
- # if not next_token:
- # break
- # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- # SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
- # ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- # if len(df_data.keys())>0:
- # print(len(df_data[list(df_data.keys())[0]]),total_count)
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True),
- ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- while True:
- getData(df_data,rows,set(),"",set_columns,df_columns)
- if not next_token:
- break
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
- ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- if len(df_data.keys())>0:
- print(len(df_data[list(df_data.keys())[0]]),total_count)
- # appendAttachmentPath(df_data,"page_attachments","附件链接")
- # df_columns.append("附件链接")
- # df_columns = ["docid","公告标题","公告类别","发布时间","公告内容","省份","城市","项目编号","招标单位","招标金额","资金来源","招标方式","代理单位","中标单位","中标金额","第二候选单位","第三候选单位","url","附件链接"]
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s_%s.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S"),page_time),columns=df_columns)
- def appendAttachmentPath(df_data,key,new_key):
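- # Resolve each row's page_attachments JSON: look up every fileMd5 in the "attachment"
- # table, sign a temporary OSS GET URL for its stored path, and keep the URL list in
- # df_data[new_key].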
- list_data = []
- for _attach in df_data[key]:
- _dict = {key:_attach}
- list_data.append(_dict)
- task_queue = Queue()
- for _d in list_data:
- task_queue.put(_d)
- auth = oss2.Auth("LTAI5tFuoxHm8Uxrr5nT8wTZ", "Yp01bylJFx0al6teCaccY8hbtllBGg")
- bucket_url = "http://oss-cn-hangzhou.aliyuncs.com"
- attachment_bucket_name = "attachment-hub"
- bucket = oss2.Bucket(auth,bucket_url,attachment_bucket_name)
- ots_client = getConnect_ots()
- def search(ots_client,table_name,key_tuple,columns_to_get):
- try:
- # Query with get_row; the final argument 1 means only one version of each value is returned.
- consumed, return_row, next_token = ots_client.get_row(table_name, key_tuple, columns_to_get, None, 1)
- if return_row is not None:
- _dict = getRow_ots_primary(return_row)
- return _dict
- return None
- # Client-side exception, usually a parameter error or a network failure.
- except OTSClientError as e:
- traceback.print_exc()
- log("get row failed, http_status:%d, error_message:%s" % (e.get_http_status(), e.get_error_message()))
- # Server-side exception, usually a parameter error or throttling.
- except OTSServiceError as e:
- traceback.print_exc()
- log("get row failed, http_status:%d, error_code:%s, error_message:%s, request_id:%s" % (e.get_http_status(), e.get_error_code(), e.get_error_message(), e.get_request_id()))
- def _handle(item,result_queue):
- page_attachments = json.loads(item.get(key,"[]"))
- list_url = []
- for _a in page_attachments:
- fileMd5 = _a.get("fileMd5")
- print("==",fileMd5)
- _s_dict = search(ots_client,"attachment",[("filemd5",fileMd5)],["path"])
- if _s_dict is not None:
- _path = _s_dict.get("path")
- if _path is not None:
- _url = bucket.sign_url("GET",_path,86500*5)  # signed GET URL valid for 86500*5 seconds (~5 days)
- list_url.append(_url)
- item[new_key] = json.dumps(list_url)
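- # MultiThreadHandler drains task_queue with 30 worker threads, calling
- # _handle(item,result_queue) on each queued dict; results are read back from the mutated dicts.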
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- df_data[new_key] = []
- for _d in list_data:
- df_data[new_key].append(_d.get(new_key))
- def export_competition():
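- # For every winning entity listed per province/industry, search notices whose text mentions
- # the company and collect the win/second/third tenderers seen alongside it into a
- # "competition" column.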
- file = "select___from_province_indus_entity_top1.xlsx"
- df1 = pd.read_excel(file)
- ots_client = getConnect_ots()
- task_queue = queue.Queue()
- list_entity = []
- for province,industry,entitys in zip(df1["province"],df1["industry"],df1["entitys"]):
- l_e = json.loads(entitys)
- for l in l_e:
- list_entity.append({"province":province,
- "industry":industry,
- "win_tenderer":l.get("win_tenderee","")})
- for item in list_entity:
- task_queue.put(item)
- def _handle(item,result_queue):
- def getData(rows,_set):
- dict_rows = getRow_ots(rows)
- for _dict in dict_rows:
- sub_docs_json = _dict.get("sub_docs_json")
- if sub_docs_json is not None:
- for sub_docs in json.loads(sub_docs_json):
- if sub_docs.get("win_tenderer") is not None:
- _set.add(sub_docs.get("win_tenderer"))
- if sub_docs.get("second_tenderer") is not None:
- _set.add(sub_docs.get("second_tenderer"))
- if sub_docs.get("third_tenderer") is not None:
- _set.add(sub_docs.get("third_tenderer"))
- columns = ["sub_docs_json"]
- _company = item.get("win_tenderer")
- should_q = BoolQuery(should_queries=[NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_company)),
- NestedQuery("sub_docs_json",TermQuery("sub_docs_json.second_tenderer",_company)),
- NestedQuery("sub_docs_json",TermQuery("sub_docs_json.third_tenderer",_company))])
- bool_query = BoolQuery(must_queries=[
- # should_q,
- MatchPhraseQuery("doctextcon",_company),
- RangeQuery("docchannel",101)])
- _set = set()
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True),
- ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- getData(rows,_set)
- _count = 0
- _page = 0
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
- ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- getData(rows,_set)
- _count += 1
- _page += 1
- if len(_set)>20 or _page>20:
- break
- if item["win_tenderer"] in _set:
- _set.remove(item["win_tenderer"])
- item["competition"] = ",".join(list(_set))
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- df_data = {}
- keys = ["province","industry","win_tenderer","competition"]
- for key in keys:
- if key not in df_data:
- df_data[key] = []
- for item in list_entity:
- for key in keys:
- df_data[key].append(item.get(key))
- df2 = pd.DataFrame(df_data)
- df2.to_excel("competition.xlsx",columns=keys)
- def document_dumplicate():
- df = pd.read_excel("../data/2022-01-19_214329_export11.xlsx")
- print(df.keys())
- def export_document_no_price():
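- # Join per-source no-price statistics with source names from Oracle and lay the
- # 2020 and 2021 figures side by side in one sheet.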
- df = pd.read_csv("select___from_document_no_price_tmp.csv",encoding="gbk")
- conn_oracle = getConnection_oracle()
- cursor = conn_oracle.cursor()
- sql = " select source_encode,source_name from bxkc.T_WEBSOURCENUM_INFO"
- cursor.execute(sql)
- dict_source = {}
- while True:
- rows = cursor.fetchmany(10000)
- if not rows:
- break
- for row in rows:
- dict_source[row[0]] = row[1]
- dict_source[row[0].split("(")[0]] = row[1]
- list_name = []
- set_web_source = set()
- for web_source_no in df["web_source_no"]:
- set_web_source.add(web_source_no)
- list_name.append(dict_source.get(web_source_no,""))
- dict_source_year = {}
- for web_source_no,year,counts_no_price,counts_all,rate in zip(df["web_source_no"],df["year"],df["counts_no_price"],df["counts_all"],df["rate"]):
- dict_source_year["%s&%s"%(web_source_no,year)] = {"counts_no_price":counts_no_price,"counts_all":counts_all,"rate":rate}
- new_data = {"web_source_no":[],
- "web_source_name":[],
- "counts_no_price":[],
- "counts_all":[],
- "rate":[],
- "counts_no_price1":[],
- "counts_all1":[],
- "rate1":[]}
- for web_source_no in list(set_web_source):
- new_data["web_source_no"].append(web_source_no)
- new_data["web_source_name"].append(dict_source.get(web_source_no,""))
- d_2020 = dict_source_year.get("%s&%s"%(web_source_no,"2020"),{})
- d_2021 = dict_source_year.get("%s&%s"%(web_source_no,"2021"),{})
- new_data["counts_no_price"].append(d_2020.get("counts_no_price"))
- new_data["counts_all"].append(d_2020.get("counts_all"))
- new_data["rate"].append(d_2020.get("rate"))
- new_data["counts_no_price1"].append(d_2021.get("counts_no_price"))
- new_data["counts_all1"].append( d_2021.get("counts_all"))
- new_data["rate1"].append(d_2021.get("rate"))
- # new_data = {"year":df["year"],
- # "web_source_no":df["web_source_no"],
- # "web_source_name":list_name,
- # "counts_no_price":df["counts_no_price"],
- # "counts_all":df["counts_all"],
- # "rate":df["rate"]}
- df2 = pd.DataFrame(new_data)
- df2.to_excel("websource_no_price1.xlsx",columns=["web_source_no","web_source_name","counts_no_price","counts_all","rate","counts_no_price1","counts_all1","rate1"])
- def exportDetailLink():
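- # Fill in detail_link per docid: use the value on the document row when present,
- # otherwise fall back to the Oracle source table mapped from original_docchannel.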
- df = pd.read_excel("招投标数据测试反馈表3.xlsx")
- list_item = []
- for docid in df["docid"]:
- list_item.append({"docid":docid})
- task_queue = queue.Queue()
- for item in list_item:
- task_queue.put(item)
- def _handle(item,result_queue,ots_client,pool_oracle):
- try:
- conn = pool_oracle.getConnector()
- docid = int(item["docid"])
- partitionkey = int(docid%500+1)
- consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",partitionkey),("docid",int(docid))],["original_docchannel","detail_link","uuid"])
- _dict = getRow_ots_primary(return_row)
- if _dict.get("detail_link") is not None and len(_dict.get("detail_link"))>0:
- item["detail_link"] = _dict.get("detail_link")
- else:
- original_docchannel = _dict.get("original_docchannel")
- _uuid = _dict.get("uuid")
- d_tablename = {"51":"T_GONG_GAO_BIAN_GENG",
- "52":"T_ZHAO_BIAO_GONG_GAO",
- "101":"T_ZHONG_BIAO_XIN_XI",
- "102":"T_ZHAO_BIAO_YU_GAO",
- "103":"T_ZHAO_BIAO_DA_YI",
- "104":"T_ZHAO_BIAO_WEN_JIAN",
- "114":"T_CAI_GOU_YI_XIANG"
- }
- _tablename = d_tablename.get(str(original_docchannel))
- if _tablename is not None:
- cursor = conn.cursor()
- sql = "select detail_link from bxkc.%s where id='%s'"%(_tablename,_uuid)
- print(sql)
- cursor.execute(sql)
- rows = cursor.fetchall()
- if len(rows)>0:
- item["detail_link"] = rows[0][0]
- cursor.close()
- except Exception as e:
- traceback.print_exc()
- finally:
- pool_oracle.putConnector(conn)
- ots_client = getConnect_ots()
- pool_oracle = ConnectorPool(10,30,getConnection_oracle)
- mt = MultiThreadHandler(task_queue,_handle,None,30,ots_client=ots_client,pool_oracle=pool_oracle)
- mt.run()
- df_data = {"docid":[],
- "detail_link":[]}
- for item in list_item:
- for k,v in df_data.items():
- v.append(item.get(k,""))
- df2 = pd.DataFrame(df_data)
- df2.to_excel("222.xlsx")
- def process_doc():
- df = pd.read_excel("../data/2022-03-16_154617_数据导出.xlsx",1)
- list_check = []
- set_process_docid = set()
- for docid in df["process_docid"]:
- set_process_docid.add(docid)
- df = pd.read_excel("../data/2022-03-16_154617_数据导出.xlsx",0)
- for docid in df["docid"]:
- if docid in set_process_docid:
- list_check.append("1")
- else:
- list_check.append("0")
- df["check"] = list_check
- df.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')))
- def export_extract2():
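- # Export document_extract2 rows whose extract_json contains "false", i.e. extractions
- # flagged as failed.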
- ots_client = getConnect_ots()
- df_keys = ["docid","extract_json","status"]
- df_data = {}
- for _key in df_keys:
- df_data[_key] = []
- bool_query = BoolQuery(must_queries=[
- RangeQuery("status",1,1000,True,True)])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document_extract2", "document_extract2_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(df_keys,return_type=ColumnReturnType.SPECIFIED))
- list_dict = getRow_ots(rows)
- for _dict in list_dict:
- if re.search("false",_dict.get("extract_json","")) is None:
- continue
- for k in df_keys:
- df_data[k].append(_dict.get(k))
- _count = len(list_dict)
- while next_token:
- rows, next_token, total_count, is_all_succeed = ots_client.search("document_extract2", "document_extract2_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(df_keys,return_type=ColumnReturnType.SPECIFIED))
- list_dict = getRow_ots(rows)
- for _dict in list_dict:
- if re.search("false",_dict.get("extract_json","")) is None:
- continue
- for k in df_keys:
- df_data[k].append(_dict.get(k))
- _count += len(list_dict)
- print("%d/%d"%(_count,total_count))
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s_extract2.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")))
- def export_by_file():
- df = pd.read_csv("../data/2022-04-01_121315_数据导出.csv",encoding="gbk")
- keys = df.keys()
- df_data = {}
- set_win = set()
- set_ruwei = set()
- for k in keys:
- df_data[k] = []
- for _i in range(len(df["产品"])):
- product = df["产品"][_i]
- if product is None or not isinstance(product,str):
- continue
- print(product)
- win_tenderer = df["中标单位"][_i]
- if win_tenderer is not None and isinstance(win_tenderer,str):
- set_win.add(win_tenderer)
- set_ruwei.add(win_tenderer)
- ruwei = df["入围供应商"][_i]
- if ruwei is not None and isinstance(ruwei,str):
- l_s = ruwei.split(",")
- for _s in l_s:
- set_ruwei.add(_s)
- if re.search("公路|道路|路基|路面|快速通道|高速|隧道|飞机跑道|桥梁|养护|路段|市政|照明工程|照明设施|亮灯|灯光改造|灯光工程|管道|架线|园林|景观|绿化|排水|河道整治|环境治理|交通|地铁|跌路|高铁|桥梁|大桥|桥段",product) is not None:
- for k in keys:
- df_data[k].append(df[k][_i])
- print("win count:%d ruwei:%d"%(len(set_win),len(set_ruwei)))
- # df1 = pd.DataFrame(df_data)
- # df1.to_excel("../data/%s_文件导出.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")),columns=keys)
- def export_dump():
- import pandas as pd
- df = pd.read_excel("NotIn家具中标 去除注销企业 31410(3)(1)(1).xlsx",sheetname=0)
- _set_number = set()
- _set_number |= set(df["号码"])
- print(len(_set_number))
- df = pd.read_excel("NotIn家具中标 去除注销企业 31410(3)(1)(1).xlsx",sheetname=1)
- _set_number |= set(df["号码"])
- print(len(_set_number))
- df = pd.read_excel("NotIn家具中标 去除注销企业 31410(3)(1)(1).xlsx",sheetname=2)
- keys = df.keys()
- df_data = {}
- for k in keys:
- df_data[k] = []
- for _i in range(len(df[keys[0]])):
- if df["号码"][_i] not in _set_number:
- for k in keys:
- df_data[k].append(df[k][_i])
- _set_number.add(df["号码"][_i])
- df2 = pd.DataFrame(df_data)
- df2.to_excel("tmp222.xlsx")
- def check_data_synchronization():
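- # Pull uuids out of to_check.log and record, for each one, how many matching rows
- # exist in the document index.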
- filepath = "C:\\Users\\Administrator\\Desktop\\to_check.log"
- list_uuid = []
- _regrex = "ID='(?P<uuid>.+)'"
- with open(filepath,"r",encoding="utf8") as f:
- while True:
- _line = f.readline()
- if not _line:
- break
- _match = re.search(_regrex,_line)
- if _match is not None:
- _uuid = _match.groupdict().get("uuid")
- if _uuid is not None:
- list_uuid.append(_uuid)
- print(len(list_uuid))
- task_queue = Queue()
- list_data = []
- for _uuid in list_uuid:
- _dict = {"uuid":_uuid}
- list_data.append(_dict)
- task_queue.put(_dict)
- ots_client = getConnect_ots()
- def _handle(_item,result_queue):
- bool_query = BoolQuery(must_queries=[TermQuery("uuid",_item.get("uuid"))])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,get_total_count=True),
- columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
- _item["exists"] = total_count
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- df_data = {"uuid":[],
- "exists":[]}
- for _data in list_data:
- for k,v in df_data.items():
- v.append(_data.get(k))
- import pandas as pd
- df2 = pd.DataFrame(df_data)
- df2.to_excel("check.xlsx")
- def group_xlsx():
- filename = "厂商&赛道列表.xlsx"
- df0 = pd.read_excel(filename,0)
- df1 = pd.read_excel(filename,1)
- df2 = pd.read_excel(filename,2)
- set_1 = set(df0["中国厂商"]) | set(df1["国际厂商"])
- set_2 = set(df2["a"]) | set(df2["b"]) | set(df2["c"])
- filename = "../data/2022-05-24_185801_数据导出.xlsx"
- df = pd.read_excel(filename)
- dict_docid = {}
- for docid,keyword in zip(df["docid"],df["关键词"]):
- if docid not in dict_docid:
- dict_docid[docid] = [[],[]]
- if keyword in set_1:
- dict_docid[docid][0].append(keyword)
- else:
- dict_docid[docid][1].append(keyword)
- set_docid = set()
- for k,v in dict_docid.items():
- if len(v[0])>=1 and len(v[1])>=1:
- set_docid.add(k)
- keys = df.keys()
- print(keys)
- df_data = {}
- for i in range(len(df["docid"])):
- print(i)
- docid = df["docid"][i]
- if docid in set_docid:
- for k in keys:
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(df[k][i])
- df_data["关键词"][-1] = str(dict_docid[docid][0][0])+"+"+str(dict_docid[docid][1][0])
- df1 = pd.DataFrame(df_data)
- df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=keys)
- def static_process_time():
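- # Measure crtime -> opertime processing latency for one day's documents, split into
- # rows with and without attachments.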
- ots_client = getConnect_ots()
- bool_query = BoolQuery(must_queries=[
- RangeQuery("crtime","2022-05-26","2022-05-27"),
- TermQuery("page_time","2022-05-26")
- ])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),get_total_count=True,limit=100),
- ColumnsToGet(column_names=["crtime","opertime","publishtime","page_attachments"],return_type=ColumnReturnType.SPECIFIED))
- list_data = []
- _l = getRow_ots(rows)
- list_data.extend(_l)
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
- SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
- ColumnsToGet(column_names=["crtime","opertime","publishtime","page_attachments"],return_type=ColumnReturnType.SPECIFIED))
- _l = getRow_ots(rows)
- list_data.extend(_l)
- print("%d/%d"%(len(list_data),total_count))
- list_dis = []
- list_dis_a = []
- list_dis_n_a = []
- for _data in list_data:
- crtime = _data.get("crtime")
- opertime = _data.get("opertime")
- page_attachments = _data.get("page_attachments","[]")
- _d = time.mktime(time.strptime(opertime,"%Y-%m-%d %H:%M:%S"))-time.mktime(time.strptime(crtime,"%Y-%m-%d %H:%M:%S"))
- list_dis.append(_d)
- if page_attachments=="[]":
- list_dis_n_a.append(_d)
- else:
- list_dis_a.append(_d)
- print("avg_time:",sum(list_dis)/len(list_dis),max(list_dis),min(list_dis))
- print("avg_time:",sum(list_dis_a)/len(list_dis_a),max(list_dis_a),min(list_dis_a))
- print("avg_time:",sum(list_dis_n_a)/len(list_dis_n_a),max(list_dis_n_a),min(list_dis_n_a))
- def export_dump_by_id():
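- # Parse main/duplicate docids out of the two url columns, fetch every referenced
- # document, and export one row per document tagged with its main_docid.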
- filename = "遗漏待验证1.csv"
- df = pd.read_csv(filename)
- list_k = []
- ots_client = getConnect_ots()
- for _main_url,_other_url in zip(df["_c0"],df["_c1"]):
- _d = {}
- main_docid = re.split("[-.]",_main_url)[3]
- l_other = []
- for _l in _other_url.split(","):
- _docid = re.split("[-.]",_l)[3]
- l_other.append(_docid)
- _d["main_docid"] = main_docid
- _d["other_docid"] = l_other
- list_k.append(_d)
- task_queue = Queue()
- for _q in list_k:
- task_queue.put(_q)
- def _handle(item,result_queue):
- columns = ["doctitle","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","service_time","page_attachments"]
- main_docid = item["main_docid"]
- other_docid = item["other_docid"]
- list_should_q = []
- list_should_q.append(TermQuery("docid",main_docid))
- for _d in other_docid:
- list_should_q.append(TermQuery("docid",_d))
- _query = BoolQuery(should_queries=list_should_q)
- l_rows = []
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- dict_row = getRow_ots(rows)
- l_rows.extend(dict_row)
- log("total count:%d"%total_count)
- _count = len(dict_row)
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(_query,next_token=next_token,limit=100,get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- dict_row = getRow_ots(rows)
- l_rows.extend(dict_row)
- _count += len(dict_row)
- item["data"] = l_rows
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- df_data = {"main_docid":[]}
- set_line = set()
- dict_channel = getDict_docchannel()
- for _d in list_k:
- list_row = _d.get("data")
- if list_row is not None:
- main_docid = _d.get("main_docid")
- getRowData(df_data,list_row,set_line,[],dict_channel,True)
- for _ in list_row:
- df_data["main_docid"].append(main_docid)
- df1 = pd.DataFrame(df_data)
- list_df_columns1 = ["main_docid"].extend(list_df_columns)
- df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns1)
- def count_product():
- filename = "../data/2022-06-24_152201_数据导出.xlsx"
- df = pd.read_excel(filename)
- # _product = df["产品"]
- # dict_p_c = {}
- # for _p in _product:
- # if isinstance(_p,str) and _p!="":
- # l_p = _p.split(",")
- # for _p1 in l_p:
- # if _p1 not in dict_p_c:
- # dict_p_c[_p1] = 0
- # dict_p_c[_p1] += 1
- # df_data = {"产品":[],
- # "次数":[]}
- # for k,v in dict_p_c.items():
- # df_data["产品"].append(k)
- # df_data["次数"].append(v)
- # df1 = pd.DataFrame(df_data)
- # df1.to_excel("222.xlsx")
- keys = df.keys()
- df_data = {}
- for k in keys:
- df_data[k] = []
- product_pattern = "电脑|台式机|电脑|主机|网络|软件|开发|通信|系统|信息技术"
- df1 = pd.read_excel("222.xlsx")
- list_p = []
- for _p,_n in zip(df1["产品"],df1["need"]):
- if _n==1:
- list_p.append(_p)
- product_pattern = product_pattern+"|"+"|".join(list_p)
- _product = df["产品"]
- for _i in range(len(_product)):
- if re.search(product_pattern,str(_product[_i])) is not None:
- for k in keys:
- df_data[k].append(df[k][_i])
- df2 = pd.DataFrame(df_data)
- df2.to_excel("333.xlsx",columns=keys)
- from dataSource.source import getConnect_capacity
- def exportHonors_item_info():
- ots_capacity = getConnect_capacity()
- bool_query = BoolQuery(should_queries=[
- # TermQuery("ryjx","海河杯"),
- WildcardQuery("hjdw","*合肥建工集团有限公司*")
- ])
- rows,next_token,total_count,is_all_succeed = ots_capacity.search("honors_item_info","honors_item_info_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("create_time")]),get_total_count=True,limit=100),
- columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
- list_data = getRow_ots(rows)
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_capacity.search("honors_item_info","honors_item_info_index",
- SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
- columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
- list_data.extend(getRow_ots(rows))
- df_data = {}
- set_columns1 = set()
- list_df_columns1 = []
- for _data in list_data:
- _dict = {}
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"record_id",_data.get("record_id"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"bfdw",_data.get("bfdw"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"create_time",_data.get("create_time"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"cs",_data.get("cs"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"detail_link",_data.get("detail_link"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"fbsj",_data.get("fbsj"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"hjdw",_data.get("hjdw"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"hjdwjs",_data.get("hjdwjs"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"hjxm",_data.get("hjxm"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"jxjb",_data.get("jxjb"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"jxlx",_data.get("jxlx"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"ryjx",_data.get("ryjx"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"sf",_data.get("sf"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"xmfzr",_data.get("xmfzr"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"zgdw",_data.get("zgdw"))
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,"zxj",_data.get("zxj"))
- for k,v in _dict.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- df = pd.DataFrame(df_data)
- df.to_excel("honor_export.xlsx",columns=list_df_columns1)
- def check_dump_data():
- ots_client = getConnect_ots()
- bool_query = BoolQuery(must_queries=[
- generateBoolShouldQuery(["docchannel"],[52,102,114],TermQuery),
- RangeQuery("crtime","2022-09-07 20:00:00","2022-09-08 06:00:00"),
- RangeQuery("page_time","2022-09-07","2022-09-08"),
- RangeQuery("status",201,301)
- ])
- list_data = getDocument([{"query":bool_query}],["docid"],table_name="document",table_index="document_index")
- bool_query1 = BoolQuery(must_queries=[
- generateBoolShouldQuery(["docchannel"],[52,102,114],TermQuery),
- RangeQuery("crtime","2022-09-07 20:00:00","2022-09-08 06:00:00"),
- RangeQuery("page_time","2022-09-07","2022-09-08"),
- RangeQuery("status",81,100),
- TermQuery("save",1)
- ])
- list_data1 = getDocument([{"query":bool_query1}],["docid"],table_name="document_tmp",table_index="document_tmp_index")
- set_docid = set()
- set_docid_tmp = set()
- for _data in list_data:
- set_docid.add(_data.get("docid"))
- for _data in list_data1:
- set_docid_tmp.add(_data.get("docid"))
- print("document - tmp",set_docid-set_docid_tmp)
- print("tmp - document",set_docid_tmp-set_docid)
- def search_title_count():
- filename = "数据样例.xlsx"
- df = pd.read_excel(filename)
- list_title_dict = []
- for _title in df["标题"]:
- _dict = {"标题":_title}
- list_title_dict.append(_dict)
- task_queue = Queue()
- for _d in list_title_dict:
- task_queue.put(_d)
- ots_client = getConnect_ots()
- def _handle(item,result_queue):
- columns = ["status","tenderee","agency","sub_docs_json"]
- _title = item.get("标题","")
- if _title!="":
- bool_query = BoolQuery(must_queries=[MatchPhraseQuery("doctitle",_title)])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,get_total_count=True,limit=10),
- columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- item["比地数量"] = total_count
- if len(list_data)>0:
- _str_docid = ",".join([str(a.get("docid")) for a in list_data])
- item["比地_docid"] = _str_docid
- tenderee = list_data[0].get("tenderee")
- item["比地_招标人"] = tenderee
- agency = list_data[0].get("agency")
- item["比地_代理人"] = agency
- sub_docs_json = list_data[0].get("sub_docs_json")
- if sub_docs_json is not None:
- sub_docs = json.loads(sub_docs_json)
- win_tenderer = ""
- win_bid_price = ""
- for _doc in sub_docs:
- if _doc.get("win_tenderer","")!="":
- win_tenderer = _doc.get("win_tenderer")
- win_bid_price = _doc.get("win_bid_price")
- item["比地_中标人"] = win_tenderer
- item["比地_中标金额"] = win_bid_price
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- df_data = {}
- keys = ["标题","比地数量","比地_docid","比地_招标人","比地_代理人","比地_中标人","比地_中标金额"]
- for _d in list_title_dict:
- for k in keys:
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(_d.get(k,""))
- df1 = pd.DataFrame(df_data)
- df1.to_excel("比地对比数据.xlsx",columns=keys)
- def getDumplicate_docid():
- filename = "2022-11-02_154222_数据导出.xlsx"
- df = pd.read_excel(filename)
- list_docid = df["docid"]
- task_queue = Queue()
- list_d = []
- for _docid in list_docid:
- _dict = {"docid":_docid}
- list_d.append(_dict)
- task_queue.put(_dict)
- ots_client = getConnect_ots()
- def _handle(item,result_queue):
- _docid = item.get("docid")
- bool_query = BoolQuery(must_queries=[TermQuery("docid",int(_docid))])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
- SearchQuery(bool_query),
- columns_to_get=ColumnsToGet(["save"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- if len(list_data)>0:
- _save = list_data[0].get("save")
- item["save"] = _save
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- list_d_docid = []
- for _data in list_d:
- docid = _data.get("docid")
- save = _data.get("save")
- if save==0:
- list_d_docid.append(str(docid))
- print(",".join(list_d_docid))
- def getDocumentHtml():
- filename = "../data/2023-02-20_154118_数据导出.xlsx"
- df = pd.read_excel(filename)
- ots_client = getConnect_capacity()
- list_html_data = []
- _count = 0
- for docid in df["docid"][:10000]:
- partitionkey = int(docid)%500+1
- docid = int(docid)
- try:
- consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",partitionkey),("docid",docid)],["dochtmlcon"])
- _dict = getRow_ots_primary(return_row)
- list_html_data.append(_dict)
- _count += 1
- print("%d/%d"%(_count,len(df["docid"])))
- except Exception as e:
- pass
- save(list_html_data,"list_html_data.pk")
- def exportAgencyCount():
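- # For each agency in the sheet, query its 2022 Guangzhou tender notices within the
- # construction/supervision/cost-consulting scope and export the matching documents.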
- filename = "广州招标协会.xlsx"
- df = pd.read_excel(filename)
- a = df["a"]
- df_data = {}
- set_c = set()
- for line in a:
- list_c = line.split(" ")
- for _i in range(len(list_c)):
- _key = "c_%s"%(str(_i).rjust(2,'0'))
- if _key not in df_data:
- df_data[_key] = []
- set_c.add(_key)
- df_data[_key].append(list_c[_i])
- list_data = []
- list_query = []
- ots_client = getConnect_ots()
- for _agency in df_data["c_00"]:
- query = BoolQuery(must_queries=[TermQuery("city","广州"),
- TermQuery("docchannel",52),
- RangeQuery("status",201,301),
- RangeQuery("page_time","2022-01-01","2023-01-01"),
- TermQuery("agency",_agency),
- BoolQuery(should_queries=[
- BoolQuery(should_queries=[MatchPhraseQuery("doctitle","工程施工"),
- MatchPhraseQuery("doctextcon","建造师"),
- MatchPhraseQuery("attachmenttextcon","建造师")]),
- BoolQuery(should_queries=[MatchPhraseQuery("doctitle","监理"),
- MatchPhraseQuery("doctextcon","监理工程师"),
- MatchPhraseQuery("attachmenttextcon","监理工程师")]),
- BoolQuery(should_queries=[MatchPhraseQuery("doctitle","造价咨询"),
- MatchPhraseQuery("doctitle","预算"),
- MatchPhraseQuery("doctitle","造价审核"),
- MatchPhraseQuery("doctitle","结算"),
- MatchPhraseQuery("doctitle","概算")]),
- ])
- ],
- must_not_queries=[generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["广州公共资源交易中心"],MatchPhraseQuery)])
- # list_row = getDocument([{"query":query}],["agency","page_time","sub_docs_json"],thread_count=1)
- list_query.append({"query":query})
- df_data = {}
- set_line = set()
- columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"]
- list_row = getDocument(list_query,columns,thread_count=30)
- # list_row = filterRow(list_row,"product",list_not_key)
- log("get document %d rows"%len(list_row))
- # getRowDataWithKey(df_data,list_row,columns)
- dict_channel = getDict_docchannel()
- getRowData(df_data,list_row,set_line,[''],dict_channel,True)
- # getRowData_sp1(df_data,list_row,set_line,list_keyword,dict_sptype,True)
- # fixContactPerson(df_data,list_df_columns,get_legal_person=False)
- df1 = pd.DataFrame(df_data)
- df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
- return
- # _dict = {"一季度预算":0,"一季度总数":0,"一季度有金额占比":0,
- # "二季度预算":0,"二季度总数":0,"二季度有金额占比":0,
- # "三季度预算":0,"三季度总数":0,"三季度有金额占比":0,
- # "四季度预算":0,"四季度总数":0,"四季度有金额占比":0,
- # }
- # print(_agency,len(list_row))
- #
- #
- # for _row in list_row:
- # print(_row.get("docid"))
- # page_time = _row.get("page_time","")
- # sub_docs_json = _row.get("sub_docs_json","")
- # _t = None
- # if page_time!="":
- # if page_time<="2022-03-31":
- # _t = "一季度"
- # elif page_time<="2022-06-31":
- # _t = "二季度"
- # elif page_time<="2022-09-31":
- # _t = "三季度"
- # elif page_time<="2022-12-31":
- # _t = "四季度"
- # if sub_docs_json != "":
- # sub_docs = json.loads(sub_docs_json)
- # for _doc in sub_docs:
- # bidding_budget = _doc.get("bidding_budget",0)
- # _dict["%s预算"%_t] += bidding_budget
- # _dict["%s总数"%_t] += 1
- # if bidding_budget>0:
- # _dict["%s有金额占比"%_t] += 1
- # print(_dict)
- # _sum = 0
- # _sum_n = 0
- # _sum_hm = 0
- # for k in ["一季度","二季度","三季度","四季度"]:
- # km = "%s预算"%k
- # kn = "%s总数"%k
- # khm = "%s有金额占比"%k
- # _sum += _dict[km]
- # _sum_n += _dict[kn]
- # _sum_hm += _dict[khm]
- # _dict["全年预算"] = _sum
- # _dict["全年总数"] = _sum_n
- # _dict["全年有金额占比"] = _sum_hm
- # for k in ["一季度","二季度","三季度","四季度","全年"]:
- # km = "%s预算"%k
- # kn = "%s总数"%k
- # khm = "%s有金额占比"%k
- # _dict[khm] = _dict[khm]/_dict[kn] if _dict[kn]>0 else 0
- # for k,v in _dict.items():
- # if k not in df_data:
- # df_data[k] = []
- # df_data[k].append(v)
- #
- # list_c = list(set_c)
- # list_c.sort(key=lambda x:x)
- # for k,v in df_data.items():
- # print(k,len(v))
- # df1 = pd.DataFrame(df_data)
- # list_c.append("一季度预算")
- # list_c.append("一季度总数")
- # list_c.append("一季度有金额占比")
- # list_c.append("二季度预算")
- # list_c.append("二季度总数")
- # list_c.append("二季度有金额占比")
- # list_c.append("三季度预算")
- # list_c.append("三季度总数")
- # list_c.append("三季度有金额占比")
- # list_c.append("四季度预算")
- # list_c.append("四季度总数")
- # list_c.append("四季度有金额占比")
- # list_c.append("全年预算")
- # list_c.append("全年总数")
- # list_c.append("全年有金额占比")
- # df1.to_excel("%s_1.xlsx"%(filename),columns=list_c)
- def attachAttachment():
- filename = "北京电信ICT样例(2023一季度)v1.1(2).xlsx"
- df = pd.read_excel(filename,1)
- list_data = []
- task_queue = Queue()
- for _docid in df["docid"]:
- _d = {"docid":_docid}
- list_data.append(_d)
- task_queue.put(_d)
- print("len_docid",len(df["docid"]),len(list_data))
- capacity = getConnect_capacity()
- def _handle(item,result_queue):
- docid = item["docid"]
- consumed, return_row, next_token = capacity.get_row("document",[("partitionkey",int(docid)%500+1),("docid",int(docid))],["dochtmlcon"])
- _d = getRow_ots_primary(return_row)
- _dochtmlcon = _d["dochtmlcon"]
- _dochtmlcon = re.sub("<html>|</html>|<body>|</body>","",_dochtmlcon)
- _soup = BeautifulSoup(_dochtmlcon,"lxml")
- _div = _soup.find("div",attrs={"class":"richTextFetch"})
- if _div is None:
- _div = ""
- item["attachment"] = _div
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- list_attachment = []
- for _d in list_data:
- list_attachment.append(getLegal_str(_d.get("attachment","")))
- df_data = {}
- df_data["附件html"] = list_attachment
- df_1 = pd.DataFrame(df_data)
- df_1.to_excel("附加html_"+filename)
- def compareData():
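- # Re-check each CSV row against the document index by querying docid together with its
- # tenderee/winner/price fields; rows with no exact match are re-fetched and written to
- # bid_check_result.csv for review.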
- filename = "D:\\BaiduNetdiskDownload\\bidi_check.csv"
- list_data = []
- with open(filename,"r",encoding="utf8") as f:
- list_lines = f.readlines()
- for _line in list_lines:
- docid,docchannel,win_tenderer,tenderee,win_bid_price,bidding_budget = [None if a[:2]=='\\N' else a for a in _line.split("\t")]
- _d = {"docid":int(docid),
- "docchannel":docchannel,
- "win_tenderer":win_tenderer,
- "tenderee":tenderee,
- "win_bid_price":float(win_bid_price) if win_bid_price is not None else None,
- "bidding_budget":float(bidding_budget) if bidding_budget is not None else None}
- list_data.append(_d)
- del list_lines
- # for _i in range(len(list_data)):
- # print(list_lines[_i])
- # print(list_data[_i])
- ots_client = getConnect_ots()
- task_queue = Queue()
- for _d in list_data:
- task_queue.put(_d)
- def _handle(item,result_queue):
- docid = item.get("docid")
- win_tenderer = item.get("win_tenderer")
- win_bid_price = item.get("win_bid_price")
- tenderee = item.get("tenderee")
- bidding_budget = item.get("bidding_budget")
- must_q = [TermQuery("docid",int(docid))]
- if tenderee is not None:
- must_q.append(TermQuery("tenderee",tenderee))
- if win_tenderer is not None:
- must_q.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",win_tenderer)))
- if win_bid_price is not None:
- must_q.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_bid_price",win_bid_price)))
- if bidding_budget is not None:
- must_q.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.bidding_budget",bidding_budget)))
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(BoolQuery(must_queries=must_q),get_total_count=True),
- columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
- item["total_count"] = total_count
- if total_count==0:
- print("docid %d total_count is 0",docid)
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(BoolQuery(must_queries=[TermQuery("docid",docid)]),get_total_count=True),
- columns_to_get=ColumnsToGet(["tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
- l_d = getRow_ots(rows)
- item["return_row"] = l_d
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- list_new_data = []
- for data in list_data:
- if data.get("total_count")==0:
- new_d = {"docid":data.get("docid"),
- "docchannel":data.get("docchannel")}
- return_row = data.get("return_row")
- if len(return_row)>0:
- # print(return_row)
- _row = return_row[0]
- tenderee = _row.get("tenderee")
- sub_docs_json = _row.get("sub_docs_json")
- bidding_budget = None
- win_tenderer = None
- win_bid_price = None
- if sub_docs_json is not None:
- sub_docs = json.loads(sub_docs_json)
- for _doc in sub_docs:
- if _doc.get("bidding_budget") is not None:
- bidding_budget = _doc.get("bidding_budget")
- if _doc.get("win_tenderer") is not None:
- win_tenderer = _doc.get("win_tenderer")
- win_bid_price = _doc.get("win_bid_price")
- new_d["tenderee"] = tenderee
- new_d["bidding_budget"] = bidding_budget
- new_d["win_tenderer"] = win_tenderer
- new_d["win_bid_price"] = win_bid_price
- list_new_data.append(new_d)
- df_data_c = ["docid","docchannel","win_tenderer","tenderee","win_bid_price","bidding_budget"]
- df_data = {}
- for c in df_data_c:
- df_data[c] = []
- for _d in list_new_data:
- for c in df_data_c:
- df_data[c].append(_d.get(c))
- df = pd.DataFrame(df_data)
- df.to_csv("bid_check_result.csv",columns=df_data_c)
- def exportProducts():
- filename = "货物关键词.xlsx"
- dict_channel = getDict_docchannel()
- df = pd.read_excel(filename)
- list_products = df["货物关键词"]
- list_q = []
- list_result = []
- ots_client = getConnect_ots()
- columns = ["产品","总数","匹配模式"]
- _index = 0
- task_queue = Queue()
- for _product in list_products:
- _index += 1
- print(_product,"%d/%d"%(_index,len(list_products)))
- bool_query = BoolQuery(must_queries=[NestedQuery("products",TermQuery("products.product",_product)),
- # RangeQuery("page_time","2021-01-01"),
- RangeQuery("status",201,301),
- TermQuery("docchannel",101)])
- _q = {"query":bool_query,"product":_product,"匹配模式":"精准"}
- task_queue.put(_q)
- bool_query = BoolQuery(must_queries=[NestedQuery("products",WildcardQuery("products.product","*%s*"%_product)),
- # RangeQuery("page_time","2021-01-01"),
- RangeQuery("status",201,301),
- TermQuery("docchannel",101)])
- _q = {"query":bool_query,"product":_product,"匹配模式":"包括"}
- task_queue.put(_q)
- def _handle(item,result_queue):
- bool_query = item["query"]
- _product = item["product"]
- _type = item["匹配模式"]
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,get_total_count=True),
- columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
- list_result.append({"产品":_product,"总数":total_count,"匹配模式":_type})
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- print("done result length:%d"%(len(list_result)))
- df_data = {}
- for _d in list_result:
- for c in columns:
- if c not in df_data:
- df_data[c] = []
- df_data[c].append(_d.get(c))
- df1 = pd.DataFrame(df_data)
- df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=columns)
- def statics_attachment_counts():
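- # Sum attachment sizes for one week of construction-industry notices and print
- # per industry/channel totals, averaged to MB per day.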
- bool_query = BoolQuery(must_queries=[
- generateBoolShouldQuery(["industry"],["土木工程建筑业","建筑装饰和其他建筑业","房屋建筑业","专业施工","修缮工程","建筑安装业"],TermQuery),
- RangeQuery("page_time","2023-08-07","2023-08-14"),
- RangeQuery("status",201,301),
- NestedQuery("page_attachments",ExistsQuery("page_attachments.fileMd5"))
- ])
- ots_client = getConnect_ots()
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
- ColumnsToGet(["industry","docchannel","page_attachments"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
- ColumnsToGet(["industry","docchannel","page_attachments"],return_type=ColumnReturnType.SPECIFIED))
- list_data.extend(getRow_ots(rows))
- # if len(list_data)>1000:
- # break
- task_queue = Queue()
- for _data in list_data:
- task_queue.put(_data)
- def _handle(item,result_queue):
- page_attachments = item.get("page_attachments")
- _size = 0
- if page_attachments is not None and page_attachments!="":
- list_attach = json.loads(page_attachments)
- for _attach in list_attach:
- _md5 = _attach.get("fileMd5")
- if _md5 is not None:
- consumed, return_row, next_token = ots_client.get_row("attachment",[("filemd5",_md5)],["size"])
- _d = getRow_ots_primary(return_row)
- _size += _d.get("size",0)
- item["size"] = _size
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- dict_result = {}
- for data in list_data:
- industry = data.get("industry")
- docchannel = data.get("docchannel")
- _type = ""
- if docchannel==52:
- _type = "招标"
- elif docchannel in (101,118,119,120):
- _type = "中标"
- else:
- _type = "其他"
- _key = "%s-%s"%(industry,_type)
- if _key not in dict_result:
- dict_result[_key] = 0
- dict_result[_key] += data.get("size",0)
- print(dict_result)
- for k,v in dict_result.items():
- print(k,"%.2fM"%(v/7/1024/1024))
- def static_dump():
- import pandas as pd
- filename = "select___from_bxkc_bxkc_delete_document_.csv"
- df = pd.read_csv(filename)
- print(df.keys())
- list_docid = df["docid"]
- list_dup_docid = df["dup_docid"]
- list_operate_time = df["operate_time"]
- list_a = []
- for docid,dup_docid in zip(list_docid,list_dup_docid):
- docid = int(docid)
- _flag = False
- if isinstance(dup_docid,str) and dup_docid!="":
- _l = dup_docid.split(",")
- for _i in _l:
- if _i.strip()!="":
- docid1 = int(_i)
- if docid1>docid:
- _flag = True
- break
- if _flag:
- list_a.append("是")
- else:
- list_a.append("否")
- df_data = {"被去重docid":list_docid,
- "重复id":list_dup_docid,
- "是否展示后删除":list_a}
- df1 = pd.DataFrame(df_data)
- df1.to_csv("16号去重统计.csv")
- def append_title():
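- # Enrich each dedup pair with the titles and extract_count of both documents, plus the
- # kept document's status rendered as a human-readable label.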
- import pandas as pd
- filename = "去重记录.xlsx"
- df = pd.read_excel(filename)
- list_docid = df["被去重id"]
- list_keep_id = df["保留id"]
- list_data = []
- task_queue = Queue()
- for _docid,keep_docid in zip(list_docid,list_keep_id):
- _d = {"dup_docid":int(_docid),
- "keep_docid":int(keep_docid)}
- list_data.append(_d)
- task_queue.put(_d)
- ots_client = getConnect_ots()
- def _handle(item,result_queue):
- dup_docid = item.get("dup_docid")
- keep_docid = item.get("keep_docid")
- dup_partitionkey = dup_docid%500+1
- keep_partitionkey = keep_docid%500+1
- consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",dup_partitionkey),("docid",dup_docid)],["status","doctitle","extract_count"])
- _d = getRow_ots_primary(return_row)
- if _d is not None:
- doctitle = _d.get("doctitle")
- item["dup_title"] = doctitle
- extract_count = _d.get("extract_count")
- item["dup_extract_count"] = extract_count
- consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",keep_partitionkey),("docid",keep_docid)],["status","doctitle","extract_count","extract_count"])
- _d = getRow_ots_primary(return_row)
- if _d is not None:
- doctitle = _d.get("doctitle")
- item["keep_title"] = doctitle
- status = _d.get("status")
- extract_count = _d.get("extract_count")
- item["keep_extract_count"] = extract_count
- if status>=201 and status<=300:
- item["保留id状态"] = "正常"
- elif status>=401:
- item["保留id状态"] = "去重"
- else:
- item["保留id状态"] = ""
- mt = MultiThreadHandler(task_queue,_handle,None,39)
- mt.run()
- keys = ["dup_docid","keep_docid","dup_title","keep_title","保留id状态","dup_extract_count","keep_extract_count"]
- df_data = {}
- for data in list_data:
- for k in keys:
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(data.get(k))
- df1 = pd.DataFrame(df_data)
- df1.to_excel("%s.xlsx"%(filename),columns=keys)
- def get_follows():
- _json = '''
- [
- ]
- '''
- ots_client = getConnect_ots()
- list_follows = json.loads(_json)
- new_list = []
- for follow in list_follows:
- docid = follow.get("docid")
- partitionkey = docid%500+1
- consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",partitionkey),("docid",docid)],["tenderee"])
- _d = getRow_ots_primary(return_row)
- print("docid",_d.get("tenderee"))
- if _d.get("tenderee")=="泗阳意杨产业科技园实业有限公司":
- new_list.append(follow)
- print(json.dumps(new_list,ensure_ascii=False))
- def validateTitle(title):
- rstr = r"[\/\\\:\*\?\"\<\>\|\r\n]" # '/ \ : * ? " < > |'
- new_title = re.sub(rstr, "_", title)  # replace illegal filename characters with underscores
- return new_title
- def exportParameters():
- from glob import glob
- attach_path = "F:/Workspace2016/BaseDataMaintenance/BaseDataMaintenance/maintenance/product/download"
- ots_client = getConnect_ots()
- bool_query = BoolQuery(must_queries=[TermQuery("parameter_status",1)])
- save_dir = "product"
- if not os.path.exists(save_dir):
- os.mkdir(save_dir)
- rows,next_token,total_count,is_all_succeed = ots_client.search("document_product2","document_product2_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("parameter_status")]),limit=100,get_total_count=True),
- ColumnsToGet(["parameter","bid_filemd5s","name","original_name"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- write_count = 0
- for _data in list_data:
- bid_filemd5s = _data["bid_filemd5s"]
- parameter = _data["parameter"]
- name = _data["name"]
- original_name = _data["original_name"]
- list_md5s = bid_filemd5s.split(",")
- if len(list_md5s)==1:
- list_path = glob(os.path.join(attach_path,bid_filemd5s)+"*")
- for _path in list_path:
- if not _path.endswith(".html"):
- filename = _path.split("\\")[-1]
- with open(os.path.join(save_dir,filename),"wb") as f:
- f.write(open(_path,"rb").read())
- pname = "%s_name%s_original_name%s.html"%(bid_filemd5s,name,original_name[:10])
- pname = validateTitle(pname)
- with open(os.path.join(save_dir,pname),"w",encoding="utf8") as f:
- f.write(parameter)
- write_count += 1
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_client.search("document_product2","document_product2_index",
- SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
- ColumnsToGet(["parameter","bid_filemd5s","name","original_name"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- for _data in list_data:
- bid_filemd5s = _data["bid_filemd5s"]
- parameter = _data["parameter"]
- name = _data["name"]
- original_name = _data["original_name"]
- list_md5s = bid_filemd5s.split(",")
- if len(list_md5s)==1:
- list_path = glob(os.path.join(attach_path,bid_filemd5s)+"*")
- for _path in list_path:
- if not _path.endswith(".html"):
- filename = _path.split("\\")[-1]
- with open(os.path.join(save_dir,filename),"wb") as f:
- f.write(open(_path,"rb").read())
- pname = "%s_name%s_original_name%s.html"%(bid_filemd5s,name,original_name[:10])
- pname = validateTitle(pname)
- with open(os.path.join(save_dir,pname),"w",encoding="utf8") as f:
- f.write(parameter)
- write_count += 1
- if write_count>=2000:
- return
- def exportProjects():
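- # Export recent single-document projects that have a win date set, then search for other
- # live notices quoting the same project codes as merge candidates.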
- bool_query = BoolQuery(must_queries=[
- TermQuery("docid_number",1),
- ExistsQuery("zhong_biao_page_time"),
- RangeQuery("page_time","2023-01-01","2023-10-10")
- ],
- must_not_queries=[
- MatchPhraseQuery("doctitles","网上超市")
- ])
- ots_client = getConnect_ots()
- rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]),limit=100,get_total_count=True),
- ColumnsToGet(["docids","doctitles","project_codes"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
- SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
- ColumnsToGet(["docids","doctitles","project_codes"],return_type=ColumnReturnType.SPECIFIED))
- list_data.extend(getRow_ots(rows))
- if len(list_data)>10000:
- break
- task_queue = Queue()
- for data in list_data:
- task_queue.put(data)
- def _handle(item,result_queue):
- docids = item["docids"]
- project_codes = item.get("project_codes","")
- if len(project_codes)>0:
- list_codes = project_codes.split(",")
- should_q = []
- for code in list_codes:
- should_q.append(MatchPhraseQuery("doctextcon",code))
- should_q.append(MatchPhraseQuery("attachmenttextcon",code))
- _query = BoolQuery(must_queries=[BoolQuery(should_queries=should_q),RangeQuery("status",201,301)],
- must_not_queries=[TermQuery("docid",int(docids))])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(_query,limit=100),
- ColumnsToGet(["doctitle"],return_type=ColumnReturnType.SPECIFIED))
- item["result"] = json.dumps(getRow_ots(rows),ensure_ascii=False)
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- columns = ["docids","doctitles","project_codes","result"]
- df_data = {}
- for data in list_data:
- for c in columns:
- if c not in df_data:
- df_data[c] = []
- df_data[c].append(data.get(c,""))
- df = pd.DataFrame(df_data)
- df.to_excel("toMerge.xlsx",columns=columns)
- def match_contact():
- filename = r"C:\Users\Administrator\联系电话需求.xlsx"
- df = pd.read_excel(filename)
-
- ots_client = getConnect_ots()
- list_row = []
- for index,row in df.iterrows():
- a = row["乙方"]
- b = row["联系人"]
- c = row['联系电话(手机号)']
- d = row["想打听甲方联系人的项目"].strip()
- e = row['联系人.1']
- f = row['联系电话(手机号).1']
- print(a,b,c,d,e,f)
- bool_query = BoolQuery(must_queries=[
- generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[a,d],MatchPhraseQuery),
- TermQuery("tenderee",d),
- RangeQuery("status",201,301)
- ])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]),get_total_count=True,limit=10),
- ColumnsToGet(["tenderee_contact","tenderee_phone"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- _find = False
- # if len(list_data)>0:
- # for data in list_data:
- # if re.search('^1\d{10}',data.get("tenderee_phone","")) is not None:
- # df["联系人.1"][index] = data.get("tenderee_contact","")
- # df['联系电话(手机号).1'][index] = data.get("tenderee_phone","")
- # print("===",data)
- # _find = True
- # break
- bool_query = BoolQuery(must_queries=[
- TermQuery("enterprise_name",a),
- TermQuery("is_mobile",1)
- ])
- rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise_contact","enterprise_contact_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("score",SortOrder.DESC),FieldSort("update_time",SortOrder.DESC)]),get_total_count=True,limit=10),
- ColumnsToGet(["contact_person","phone_no"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- if len(list_data)>0:
- _str = ""
- for _data in list_data:
- _str += _data.get("contact_person","")+"|"+_data.get("phone_no","")+"|\n"
- df["联系人"][index] = _str
- df['联系电话(手机号)'][index] = _str
- if not _find:
- bool_query = BoolQuery(must_queries=[
- TermQuery("enterprise_name",d),
- TermQuery("is_mobile",1)
- ])
- rows,next_token,total_count,is_all_succeed = ots_client.search("enterprise_contact","enterprise_contact_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("score",SortOrder.DESC),FieldSort("update_time",SortOrder.DESC)]),get_total_count=True,limit=10),
- ColumnsToGet(["contact_person","phone_no"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- if len(list_data)>0:
- _str = ""
- for _data in list_data:
- _str += _data.get("contact_person","")+"|"+_data.get("phone_no","")+"|\n"
- df["联系人.1"][index] = _str
- df['联系电话(手机号).1'][index] = _str
- print(total_count)
- df.to_excel("result.xlsx")
- def match_products():
- # filename = "未订阅用户查看公告记录.xlsx"
- # df = pd.read_excel(filename)
- # list_user_id = df["user_id"]
- # list_doc_id = df["doc_id"]
- # ots_client = getConnect_ots()
- # list_product = []
- # _c = 0
- # for docid in list_doc_id:
- # _c += 1
- # print(_c,len(list_doc_id))
- # bool_query = BoolQuery(must_queries=[TermQuery("docid",int(docid))])
- # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- # SearchQuery(bool_query),
- # columns_to_get=ColumnsToGet(["product"],return_type=ColumnReturnType.SPECIFIED))
- # list_data = getRow_ots(rows)
- # if len(list_data)>0:
- # list_product.append(list_data[0].get("product",""))
- # else:
- # list_product.append("")
- # u_id = None
- # u_product = []
- # list_product_group = []
- # for user_id,product in zip(list_user_id,list_product):
- # if u_id is None:
- # u_id = user_id
- # u_product.append(product)
- # else:
- # if user_id==u_id:
- # u_product.append(product)
- # else:
- # l_p = []
- # for _p in u_product:
- # s = _p.split(",")
- # l_p.extend(s)
- # _dict = {}
- # for p in l_p:
- # if p not in _dict:
- # _dict[p] = 0
- # _dict[p] += 1
- # for _ in u_product:
- # list_product_group.append(json.dumps(_dict,ensure_ascii=False))
- # u_id = user_id
- # u_product = [product]
- #
- #
- # if len(u_product)>0:
- # l_p = []
- # for _p in u_product:
- # s = _p.split(",")
- # l_p.extend(s)
- # _dict = {}
- # for p in l_p:
- # if p not in _dict:
- # _dict[p] = 0
- # _dict[p] += 1
- # for _ in u_product:
- # list_product_group.append(json.dumps(_dict,ensure_ascii=False))
- #
- # _d = {"user_id":list_user_id,
- # "doc_id":list_doc_id,
- # "product":list_product,
- # "product_group":list_product_group}
- # for k,v in _d.items():
- # print(k,len(v))
- # df1 = pd.DataFrame(_d)
- # df1.to_excel(filename+"_1.xlsx",columns=["user_id","doc_id","product","product_group"])
- filename = "大单未订阅用户关键词.xlsx"
- df = pd.read_excel(filename)
- list_userid = df["userid"]
- list_kw = df["关键词"]
- list_kw1 = []
- for kw in list_kw:
- list_word = kw.split(",")
- list_word.sort(key=lambda x:len(x))
- list_words = []
- for w_i in range(len(list_word)):
- _find = False
- _w = list_word[w_i]
- if len(_w)>8:
- _find = True
- else:
- for w_j in range(w_i):
- _wj = list_word[w_j]
- if str(_w).find(_wj)>=0:
- _find = True
- break
- if not _find:
- list_words.append(_w)
- if len(list_words)==6:
- break
- list_kw1.append(",".join(list_words))
- _dict = {"userid":list_userid,
- "关键词":list_kw1}
- df1 = pd.DataFrame(_dict)
- df1.to_excel(filename+"_1.xlsx",columns=["userid","关键词"])
- def export_columns():
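- # Backfill web_source_no/web_source_name and a rough text-length ("tokens") column per
- # docid by fetching each row and flattening its HTML.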
- filename = r"F:\Workspace2016\DataMining\data\2025-03-14_161851_数据导出.xlsx"
- df = pd.read_excel(filename)
- task_queue = Queue()
- result_queue = Queue()
- ots_client = getConnect_ots()
- ots_capacity = getConnect_capacity()
- for i in range(len(df)):
- docid = df.iloc[i]["docid"]
- docid = int(docid)
- task_queue.put(docid)
- from bs4 import BeautifulSoup
- import re
- def html2text_with_tablehtml(_html):
- # parse with BeautifulSoup when given a raw HTML string
- if isinstance(_html, str):
- _soup = BeautifulSoup(_html, "lxml")
- else:
- _soup = _html
- # accumulates the converted text fragments
- result_parts = []
- _find = False
- # walk the direct children; recursion handles deeper levels
- for child in _soup.find_all(recursive=False):
- if child.name in ["table", "tbody"]:
- # keep tables and table bodies as raw HTML
- result_parts.append("\n"+str(child)+"\n")
- else:
- # recurse into other elements and convert them to plain text
- text = html2text_with_tablehtml(child)
- result_parts.append(text)
- _find = True
- if not _find:
- _text = str(_soup.get_text())
- if len(_text)>0:
- if _soup.name in {"p","div","li"}:
- _text += "\n"
- result_parts.append(_text)
- # join all fragments into a single string
- result = "".join(result_parts)
- return result
- def _handle(item,result_queue):
- consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",int(item%500+1)),("docid",int(item))],["web_source_no","web_source_name"])
- _dict = getRow_ots_primary(return_row)
- web_source_no = _dict.get("web_source_no","")
- web_source_name = _dict.get("web_source_name","")
- consumed, return_row, next_token = ots_capacity.get_row("document",[("partitionkey",int(item%500+1)),("docid",int(item))],["dochtmlcon"])
- _dict = getRow_ots_primary(return_row)
- dochtmlcon = _dict.get("dochtmlcon","")
- _text = html2text_with_tablehtml(dochtmlcon)
- result_queue.put((item,web_source_no,web_source_name,_text))
- mt = MultiThreadHandler(task_queue,_handle,result_queue,30)
- mt.run()
- _dict_docid = {}
- while True:
- try:
- item,web_source_no,web_source_name,_text = result_queue.get(False)
- _dict_docid[item] = (web_source_no,web_source_name,_text)
- except Exception as e:
- break
- for i in range(len(df)):
- docid = df.iloc[i]["docid"]
- docid = int(docid)
- if docid in _dict_docid:
- web_source_no,web_source_name,_text = _dict_docid[docid]
- df.loc[i,"web_source_no"] = web_source_no
- df.loc[i,"web_source_name"] = web_source_name
- df.loc[i,"tokens"] = len(_text)
- else:
- df.loc[i,"web_source_no"] = ""
- df.loc[i,"web_source_name"] = ""
- df.loc[i,"tokens"] = 0
- df.to_excel(filename+"_1.xlsx")
- def clean_subcription(s_subcriptions,s_exclude_subcriptions):
- '''
- Automatically clean subscription keywords: sample recent notices hit by each
- subscription word, have an LLM score their relevance, and collect exclusion
- words from the low-interest hits.
- :param s_subcriptions: comma-separated subscription keywords
- :param s_exclude_subcriptions: comma-separated exclusion keywords
- :return:
- '''
- from export.html2text import html2text_with_tablehtml
- ots_client = getConnect_ots()
- ots_capacity = getConnect_capacity()
- def get_content(subcriptions,exclude_subcriptions,current_date):
- must_not_queries=[
- generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],exclude_subcriptions[:80],MatchPhraseQuery)
- ] if len(exclude_subcriptions)>0 else []
- bool_query = BoolQuery(must_queries=[
- TermsQuery("docchannel",[52,101,118,119,120,121,122,51]),
- TermQuery("page_time",current_date),
- generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],subcriptions[:80],MatchPhraseQuery),
- ],
- must_not_queries=must_not_queries)
- list_row = getDocument([{"query":bool_query,"limit":10}],["docid","doctitle"],thread_count=1)
- for row in list_row:
- docid = row["docid"]
- partitionkey = row["partitionkey"]
- consumed, return_row, next_token = ots_capacity.get_row("document",[("partitionkey",partitionkey),("docid",docid)],["dochtmlcon"])
- _dict = getRow_ots_primary(return_row)
- dochtmlcon = _dict.get("dochtmlcon","")
- _text = html2text_with_tablehtml(dochtmlcon)
- row["text"] = _text[:10000]
- return list_row
- def _handle(item,result_queue):
- list_row = get_content(item["subcriptions"],item["exclude_subcriptions"],item["current_date"])
- item["list_row"] = list_row
- for row in list_row:
- result_queue.put(row)
    def clean_exclude_set(list_exclude_set):
        # repeatedly merge any two overlapping sets into their intersection
        # until no two sets share an element
        while True:
            new_list_exclude_set = []
            pop_index = set()
            for _i in range(len(list_exclude_set)):
                if _i in pop_index:
                    continue
                _exclude_set = list_exclude_set[_i]
                _find = False
                for _j in range(_i+1,len(list_exclude_set)):
                    if _j in pop_index:
                        continue
                    _exclude_set_j = list_exclude_set[_j]
                    if len(_exclude_set & _exclude_set_j)>0:
                        new_list_exclude_set.append(_exclude_set & _exclude_set_j)
                        pop_index.add(_j)
                        _find = True
                        break
                if not _find:
                    new_list_exclude_set.append(_exclude_set)
                    pop_index.add(_i)
            if len(new_list_exclude_set)==len(list_exclude_set):
                # fixed point: nothing was merged in this pass
                return new_list_exclude_set
            list_exclude_set = new_list_exclude_set
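    # A small worked example of clean_exclude_set (hypothetical values, for
    # illustration only): overlapping sets collapse to their intersection, so
    # only exclusion words that co-occur across announcements survive.
    #   clean_exclude_set([{"a","b"}, {"b","c"}, {"d"}])
    #   pass 1: {"a","b"} & {"b","c"} -> {"b"}, giving [{"b"}, {"d"}]
    #   pass 2: nothing merges, so [{"b"}, {"d"}] is returned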
    def _handle1(row,result_queue):
        # ask the model to rate how interesting this announcement is for the
        # subscriber (0-10) and, when interest is low, to propose exclusion words
        _prompt = '''
客户对招投标信息感兴趣
订阅词如下:施工围挡,防眩板,标志牌,标示牌,标识牌,指示牌,路名牌,道路标线,道路标志,热熔标线,交通标线,交通标识工程,马路划线,道路标志制作,道路标志设置,车库划线
根据如上订阅词,识别客户真正感兴趣的内容,判断以下公告的兴趣度,范围是0-10,如果兴趣度不高(即内容不是客户感兴趣的),请生成排除词,排除词针对这篇公告的核心内容且内容不是客户感兴趣的
返回格式如下:
{"兴趣度":"","排除词":[]}
%s
'''%(row["text"])
        _result = chat_doubao(_prompt,model_name = "ep-20250314164242-jd62g")
        _json = get_json_from_text(_result)
        try:
            _dict = json.loads(_json)
        except Exception as e:
            _dict = {}
        row.update(_dict)
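    # Sketch of the reply _handle1 expects (hypothetical model output): the
    # JSON is pulled out of the chat text and merged into the row, e.g.
    #   _result = '...{"兴趣度":"3","排除词":["监理"]}...'
    #   json.loads(get_json_from_text(_result))
    #   -> {"兴趣度": "3", "排除词": ["监理"]}   # read back as row["兴趣度"] etc.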
    subcriptions = s_subcriptions.split(",")
    exclude_subcriptions = s_exclude_subcriptions.split(",") if s_exclude_subcriptions!="" else []
    original_exclude_subcriptions = exclude_subcriptions.copy()
    task_queue = Queue()
    result_queue = Queue()
    _index = 0
    dict_count = {}
    current_date = getCurrent_date(format="%Y-%m-%d")
    list_exclude_set = []
    while True:
        list_data = []
        if _index>=10:
            break
        _index += 1
        # step one week back per iteration, sampling up to 10 weeks
        current_date = timeAdd(current_date,-7)
        for subcription in subcriptions:
            _d = {"subcriptions":[subcription],
                  "exclude_subcriptions":exclude_subcriptions,
                  "current_date":current_date}
            list_data.append(_d)
            task_queue.put(_d)
        # fetch the matching announcements, then score each one with the model
        mt = MultiThreadHandler(task_queue,_handle,result_queue,10)
        mt.run()
        mt = MultiThreadHandler(result_queue,_handle1,None,30)
        mt.run()
        for row1 in list_data:
            total_interest = 0
            total_count = 0
            list_row = row1.get("list_row",[])
            for row in list_row:
                try:
                    interest = int(row["兴趣度"])
                except Exception as e:
                    interest = 0
                if interest>0:
                    total_interest += interest
                    total_count += 1
                if interest<=5:
                    # low-interest announcement: collect its exclusion words
                    list_exclude = row.get("排除词",[])
                    if len(list_exclude)>0:
                        list_exclude_set.append(set(list_exclude))
                    for _exclude in list_exclude:
                        if _exclude in dict_count:
                            dict_count[_exclude] += 1
                        else:
                            dict_count[_exclude] = 1
            if total_count>0:
                row1["avg_interest"] = round(total_interest/total_count,2)
                row1["total_interest"] = total_interest
    # keep only exclusion words that co-occur across announcements and were
    # proposed at least twice overall
    list_exclude_set = clean_exclude_set(list_exclude_set)
    exclude_set = {_exclude for _exclude_set in list_exclude_set for _exclude in _exclude_set}
    list_count = []
    for k,v in dict_count.items():
        if v>=2 and k in exclude_set:
            list_count.append((v,k))
    list_count.sort(key=lambda x:x[0],reverse=True)
    exclude_subcriptions = original_exclude_subcriptions.copy()
    # for v,k in list_count:
    #     if k not in exclude_subcriptions:
    #         exclude_subcriptions.append(k)
    # TODO: use exclusion-word co-occurrence to further shrink the exclusion list
    print("exclude_subcriptions",exclude_subcriptions)
    for row1 in list_data:
        print("interest")
        print(row1.get("subcriptions"))
        print(row1.get("avg_interest"))
        print(row1.get("total_interest"))
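# Usage sketch for clean_subcription (mirrors the call kept commented out in
# __main__ below):
#   clean_subcription("施工围挡,防眩板,标志牌", "")
# This samples up to 10 weeks of matching announcements, collects candidate
# exclusion words in list_count (applying them is still commented out above),
# and prints the average interest per subscription keyword.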
def fix_subcription():
    ots_client = getConnect_ots()

    def _handle(item,result_queue):
        # find recent winning-bid announcements where this company is the winner
        bool_query = BoolQuery(must_queries=[
            NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item.get("公司名称"))),
            TermsQuery("docchannel",[101,118,119,120,121,122]),
            RangeQuery("status",201,301)
        ])
        list_row = getDocument([{"query":bool_query,"limit":50}],["docid","doctitle","product","products","province"],thread_count=1)
        remain_products = []
        total_products = {}
        total_titles = []
        total_province = set()
        for row in list_row:
            product = row.get("product","")
            products = row.get("products")
            doctitle = row.get("doctitle")
            province = row.get("province")
            print("===============")
            print(product)
            print(products)
            if province!="" and province not in ("全国","未知"):
                total_province.add(province)
            total_titles.append(doctitle)
            # count product words from the comma-separated "product" column,
            # skipping words longer than 8 characters
            list_products = product.split(",")
            for p in list_products:
                if len(p)>8:
                    continue
                if p not in total_products:
                    total_products[p] = 0
                total_products[p] += 1
            # also count product words from the JSON "products" column,
            # skipping missing entries
            if products is not None and products!="":
                products_json = json.loads(products)
                list_products = [a.get("product") for a in products_json]
                for p in list_products:
                    if p is None or len(p)>8:
                        continue
                    if p not in total_products:
                        total_products[p] = 0
                    total_products[p] += 1
            # total_products.extend(list_products)
            # total_products = [a for a in total_products if a is not None and a!=""]
        # sort the counted product words by frequency, most common first
        list_products = []
        for k,v in total_products.items():
            list_products.append((k,v))
        list_products.sort(key=lambda x:x[1],reverse=True)
        # (an earlier substring-based dedup of product words, kept for reference)
        # for product in total_products:
        #     _find = False
        #     for p_i in range(len(remain_products)):
        #         p = remain_products[p_i]
        #         if p.find(product)>=0:
        #             remain_products[p_i] = product
        #             _find = True
        #         if product.find(p)>=0:
        #             _find = True
        #     if not _find:
        #         remain_products.append(product)
        # keep the 20 most frequent product words
        for p,_ in list_products[:20]:
            remain_products.append(p)
        prompt = "%s的主营产品是什么"%item.get("公司名称")
        # _business = chat_doubao_bot(prompt,"bot-20250725150712-pzfls")
        _business = ""
        item["搜索主营"] = _business
        messages = [
            {"role": "system", "content": "你是豆包,是由字节跳动开发的 AI 人工智能助手"},
        ]
        # ask the bot to distill at most 20 subscription keywords (each at most
        # 8 characters) from the mined product words plus the user's own searches
        prompt1 = '''
您是一个招投标数据平台的数据专家,你的客户“【%s】”希望和贵司的招投标数据平台合作,需要您为他设定订阅词,目前通过该公司的中标公告中提取的产品词有“【%s】”(注意,这些词里面有一些是提取错误的,也有一些是因为框架标被混杂进来的产品词,请你结合你对这个公司的了解剔除无关关键词),客户自己在平台上自己搜索的关键词有“【%s】”(注意,客户自己搜索的关键词也不一定是业务有关,有些客户使用的时候不熟悉平台所以可能输入有误),请搜索客户公司有关信息,再结合你对客户公司业务的了解,帮客户总结出一些有效的关键词,可以用来进行招投标信息的订阅,关键词数量不超过20个,每个词不超过8个字,用json 格式{"keywords":[]} 返回(注意有些关键词之间是相互包含关系的,只保留最小范围的, 如“种子光选机”和“光选机”, 只保留“光选机”一个就够了)
'''%(item.get("公司名称"),",".join(remain_products),item.get("搜索词"))
        print(prompt1)
        messages.append({"role": "user", "content": prompt1})
        _result = chat_doubao_bot(messages,"bot-20250725150712-pzfls")
        # record the reply as the assistant turn so the follow-up prompt has context
        messages.append({"role": "assistant", "content": _result})
        _json = get_json_from_text(_result)
        try:
            _dict = json.loads(_json)
        except Exception as e:
            _dict = {}
        if len(total_province)>3 or len(total_province)==0:
            # wins spread over many provinces (or none known): default to 全国
            item["地区"] = "全国"
        else:
            item["地区"] = ",".join(list(total_province))
        item["AI订阅词"] = ",".join(_dict.get("keywords",[]))
        # first try a title-only search over the last 30 days
        must_queries=[
            generateBoolShouldQuery(["doctitle"],item["AI订阅词"].split(","),MatchPhraseQuery),
            RangeQuery("page_time",timeAdd(getCurrent_date("%Y-%m-%d"),-30)),
            TermQuery("docchannel",52)
        ]
        if item["地区"]!="全国":
            must_queries.append(generateBoolShouldQuery(["province"],item["地区"].split(","),TermQuery))
        bool_query = BoolQuery(must_queries=must_queries)
        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
                                                                       SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]),limit=1,get_total_count=True),
                                                                       ColumnsToGet(["doctitle"],return_type=ColumnReturnType.SPECIFIED))
        if total_count<500:
            # too few title hits: widen the search to full text and attachments
            must_queries[0] = generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],item["AI订阅词"].split(","),MatchPhraseQuery)
            bool_query = BoolQuery(must_queries=must_queries)
            item["搜索范围"] = "全文"
        list_row = getDocument([{"query":bool_query,"limit":50}],["docid","doctitle",],thread_count=1)
        total_titles = []
        for row in list_row:
            doctitle = row.get("doctitle")
            total_titles.append(doctitle)
        # ask the model to propose generic exclusion words from the sampled titles
        prompt2 = '''
订阅后获取到的招标公告标题如下:
【%s】

请你筛选出不符合业务的公告标题,并总结出一些有效通用的标题排除词(排除词也是尽量可泛化一下, 例如“消防监理”属于监理服务,排除词应该直接用“监理”,也尽量用在上述标题中出现的表达搭配,随意省略中间部分词汇可能导致无法有效排除,因为排除词用的是全匹配逻辑),请把这些排除词用 json格式{"exclude_words":[]},json上下文符合正则```json(?P<json>.*)``` 输出(不超过 20 个,每个排除词单个词不超过 6 个字)
'''%("\r\n".join(total_titles))
        messages.append({"role": "user", "content": prompt2})
        # _result = chat_doubao(prompt2,model_name = "ep-20250314164242-jd62g")
        # _result = chat_doubao_bot(messages,"bot-20250725150712-pzfls")
        _result = chat_doubao_messages(messages,model_name = "ep-20250212111145-fflr7")
        print(prompt2)
        _json = get_json_from_text(_result)
        print(_json)
        try:
            _dict = json.loads(_json)
        except Exception as e:
            _dict = {}
        print(_dict)
        print(_dict.get("exclude_words"))
        item["AI排除词"] = _dict.get("exclude_words")
        item["最终产品词"] = ",".join(remain_products)
        item["标题"] = "\n".join(total_titles)
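    # Sketch of the two replies _handle expects (hypothetical model output):
    #   prompt1 -> '```json{"keywords":["光选机","防眩板"]}```'
    #   prompt2 -> '```json{"exclude_words":["监理"]}```'
    # get_json_from_text extracts the JSON payload from each chat reply before
    # json.loads parses it.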
    filename = "所有注册用户并且关注微信的数据信息.xlsx"
    df = pd.read_excel(filename)
    companynames = ["爱威科技股份有限公司","上海百若试验仪器有限公司","宁联电缆集团有限公司"]
    task_queue = Queue()
    list_data = []
    # keep only companies that have won bids and have usable search keywords
    for company,_b,search in zip(df["公司名"],df["是否中标"],df["搜索词"]):
        print(_b,type(_b))
        if not _b:
            continue
        if not isinstance(search,str):
            continue
        # keep the 20 shortest of the user's own search keywords
        list_search = search.split(",")
        list_search.sort(key=lambda x:len(x))
        _d = {"公司名称":company,
              "搜索词":",".join(list_search[:20]),
              "搜索主营":"",
              "搜索范围":"标题",
              "AI订阅词":"",
              "AI排除词":"",
              "最终产品词":"",
              "标题":"",
              "公告类型":"招标公告",
              "地区":"全国"}
        list_data.append(_d)
        if len(list_data)>100:
            break
    print("list_data",len(list_data))
    for data in list_data:
        task_queue.put(data)
    mt = MultiThreadHandler(task_queue,_handle,None,5)
    mt.run()
    df = pd.DataFrame(list_data)
    df.to_excel("../data/%s_extract2.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")))
if __name__=="__main__":
    # compareData()
    # attachAttachment()
    # exportDocument_By_time(time_from="2021-01-29",time_to="2021-01-29",columns=["docid","doctitle","project_name","dochtmlcon"])
    # processDocument()
    # export_extract_check()
    # exportArticle_by_websource()
    # export_keyword_count()
    # export_province_keyword_count()
    # exportDocument_dump()
    # exportDocument_dump_mysql()
    # export_attachment()
    # statics_attachment_counts()
    # get_follows()
    # append_title()
    # exportDocument_by_doctitle()
    # exportIndustryCount()
    # exportDocument_by_pagetime()
    # export_columns()
    # match_products()
    # match_contact()
    # exportProjects()
    # exportProducts()
    # exportParameters()
    # exportAgencyCount()
    # getDocumentHtml()
    # getDumplicate_docid()
    # exportHonors_item_info()
    # check_dump_data()
    # search_title_count()
    # count_product()
    # export_dump_by_id()
    # group_xlsx()
    # static_process_time()
    # check_data_synchronization()
    # process_doc()
    # export_competition()
    # for page_time in ["2022-08-01"]:
    #     exportDocument_by_days(page_time)
    # exportDocument_forRecommen()
    # exportDocument_attachment()
    # exportWin_tenderer_count()
    # attachCompanyContact()
    # dumpWebSourceNo()
    # print("http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%103571618))))
    # exportNzj()
    # turn_status()
    # attachBidding_budget()
    # debug_documentMerge()
    # exportDocument_medicine("2021-05-24","2021-05-30")
    # signDocument()
    # transUUid()
    # fix_document()
    # export_document_no_price()
    # findProjects()
    # exportDetailLink()
    # export_extract_check()
    # export_extract2()
    # export_by_file()
    # export_dump()
    # clean_subcription("施工围挡,防眩板,标志牌","")
    fix_subcription()