- # -*- coding: utf-8 -*-
- import sys
- import os
- sys.path.append("..")
- print(sys.path)
- import pandas as pd
- from dataSource.source import *
- import json
- from utils.multiThread import MultiThreadHandler
- import queue
- from utils.Utils import *
- from dataSource.pool import ConnectorPool
- import re
- from tablestore import *
- import traceback
- import requests
- from utils.hashUtil import aesCipher
- from uuid import uuid4
- from export.exportUtils import *
- data_path = "../data/"
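- # For each company name read from ../data/服务型客户.txt (GBK), query Neo4j for its
- # winning-bid relations and MongoDB for the runner-up bidders, then export the win
- # count, up to 3 project names, the total winning amount and the 10 most frequent
- # competitors to an Excel sheet.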
- def getCompanyTenderer():
- def _handle(item,result_queue):
- company = item
- dict_result = {"company":company,"count":0,"competitor":"","project_name":""}
- graph = getConnect_neo4j()
- cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.zhao_biao_id,p.zhong_biao_id"%(company)
- finded = graph.run(cql)
- finded_ids = json.loads(json.dumps(finded.data()))
- dict_result["count"] = len(finded_ids)
- mongoDB = getConnect_mongodb()
- coll_zb = mongoDB.zhongbiao_extraction
- if len(finded_ids)>0:
- cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN p.project_name limit 3"%(company)
- finded = graph.run(cql)
- finded_names = json.loads(json.dumps(finded.data()))
- list_names = [_i["p.project_name"] for _i in finded_names]
- dict_result["project_name"] = str(list_names)
- cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN r.price"%(company)
- finded = graph.run(cql)
- finded_money = json.loads(json.dumps(finded.data()))
- whole_money = 0
- for _money in finded_money:
- if _money["r.price"] is not None:
- whole_money += getUnifyMoney(_money["r.price"])
- dict_result["whole_money"] = str(whole_money)
- dict_competitor = {}
- for _id in finded_ids:
- docId = _id["p.zhong_biao_id"]
- if docId is not None:
- rows = coll_zb.find({"docId":docId})
- for row in rows:
- keys = ["second_tenderer","third_tenderer"]
- for _key in keys:
- if _key in row:
- if row[_key] not in dict_competitor:
- dict_competitor[row[_key]] = 0
- dict_competitor[row[_key]] += 1
- list_competitor = []
- for _key in dict_competitor:
- list_competitor.append([_key,dict_competitor[_key]])
- list_competitor.sort(key=lambda x:x[1],reverse=True)
- list_competitors = [i[0] for i in list_competitor[:10]]
- dict_result["competitor"] = str(list_competitors)
- result_queue.put(dict_result)
- # filename = "成交客户匹配中标项目的需求.xlsx"
- # df = pd.read_excel(filename)
- # list_company = df["公司名字"]
- # company = list_company[0]
- list_company = []
- filename = "../data/服务型客户.txt"
- with open(filename,"r",encoding="GBK") as f:
- while(True):
- line = f.readline()
- if not line:
- break
- list_company.append(line.strip())
- task_queue = queue.Queue()
- for company in list_company:
- task_queue.put(company)
- result_queue = queue.Queue()
- handler = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=10)
- handler.run()
- list_company = []
- list_zb = []
- list_count = []
- list_project = []
- list_money = []
- list_competitor = []
- while(True):
- try:
- _result = result_queue.get(False)
- list_company.append(_result.get("company",""))
- list_zb.append("是" if _result.get("count",0)>0 else "否")
- list_count.append(_result.get("count",""))
- list_project.append(_result.get("project_name",""))
- list_money.append(_result.get("whole_money",""))
- list_competitor.append(_result.get("competitor",""))
- except queue.Empty:
- break
- df1 = pd.DataFrame({"公司名字":list_company,"是否中标":list_zb,"中标次数":list_count,"中标项目":list_project,"中标金额":list_money,"潜在竞争对手":list_competitor})
- df1.to_excel("%s_export.xls"%(filename),columns=["公司名字","是否中标","中标次数","中标项目","中标金额","潜在竞争对手"])
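- # Single-threaded count of Solr documents whose dochtmlcon contains each "品目"
- # (category) keyword from an Excel sheet; writes the keyword/count pairs to Excel.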
- def export_count_includeKeyword():
- filename = "../data/other/jc001.xlsx"
- list_name = []
- list_count = []
- df = pd.read_excel(filename)
- _index = 0
- for row in df["品目"]:
- _name = row
- data = solrQuery("document",{"q":'dochtmlcon:"%s"'%_name,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-08-12T23:59:59Z])',"fl":"city","rows":1})
- if data is not None:
- _count = data["response"]["numFound"]
- else:
- _count = 0
- list_name.append(_name)
- list_count.append(_count)
- _index += 1
- print(_index)
- df1 = pd.DataFrame({"品目":list_name,"数量":list_count})
- df1.to_excel("%s_export.xls"%filename)
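- # Multi-threaded variant of export_count_includeKeyword: the same per-keyword Solr
- # count, fanned out over a task queue with 20 worker threads.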
- def export_count_includeKeyword_multiThread():
- def _handler(item,result_queue):
- data = solrQuery("document",{"q":'dochtmlcon:"%s"'%item,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-08-12T23:59:59Z])',"fl":"city","rows":1})
- if data is not None:
- _count = data["response"]["numFound"]
- else:
- _count = 0
- result_queue.put([item,_count])
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- filename = "../data/other/jc001.xlsx"
- list_name = []
- list_count = []
- df = pd.read_excel(filename)
- _index = 0
- for row in df["品目"]:
- _name = row
- task_queue.put(_name)
- _index += 1
- multHandler = MultiThreadHandler(task_queue,_handler,result_queue,thread_count=20)
- multHandler.run()
- while(True):
- try:
- item = result_queue.get(False)
- list_name.append(item[0])
- list_count.append(item[1])
- except queue.Empty as e:
- break
- df1 = pd.DataFrame({"品目":list_name,"数量":list_count})
- df1.to_excel("%s_export.xls"%filename)
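- # For each "品目" keyword, collect the matching document ids from Solr, resolve each
- # document's project_name from MongoDB (zhaobiao first, zhongbiao as fallback), and
- # print the number of distinct project names per keyword as a de-duplicated count.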
- def exportKeywords():
- def _handle(item,result_queue,pool_mongo):
- docId = item["docId"]
- mongo = pool_mongo.getConnector()
- zhongbiao = mongo.zhongbiao_extraction
- zhaobiao = mongo.zhaobiao_extraction
- _project = ""
- rows = zhaobiao.find({"docId":docId},{"project_name":1})
- find_flag = False
- for row in rows:
- find_flag = True
- _project = row.get("project_name","")
- if not find_flag:
- rows = zhongbiao.find({"docId":docId},{"project_name":1})
- for row in rows:
- _project = row.get("project_name","")
- item["project_name"] = _project
- pool_mongo.putConnector(mongo)
- result_queue.put(item)
- list_key = []
- dict_key_ids = dict()
- with open("../data/品目.txt", "r", encoding="utf8") as f:
- while(True):
- row = f.readline()
- if not row:
- break
- row = row.strip()
- list_key.append(row)
- dict_key_ids[row] = []
- data = solrQuery("document",{"q":'dochtmlcon:"%s" AND dochtmlcon:"法院"'%row,"fq":'(publishtime:[2019-01-01T00:00:00Z TO 2019-12-31T23:59:59Z])',"fl":"id","rows":10000000})
- if data is not None:
- for item in data["response"]["docs"]:
- dict_key_ids[row].append(item["id"])
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for _key in dict_key_ids.keys():
- for item in dict_key_ids[_key]:
- task_queue.put({"docId":item,"project_name":""})
- pool_mongo = ConnectorPool(init_num=10,max_num=200,method_init=getConnect_mongodb)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=200,pool_mongo=pool_mongo)
- mt.run()
- dict_docId_projectname = {}
- while(True):
- try:
- item = result_queue.get(False)
- dict_docId_projectname[item["docId"]] = item["project_name"]
- except queue.Empty:
- break
- dict_key_count = dict()
- for _key in dict_key_ids.keys():
- set_key = set()
- for docId in dict_key_ids[_key]:
- set_key.add(dict_docId_projectname.get(docId,""))
- dict_key_count[_key] = len(set_key)
- print("==")
- for _key in list_key:
- print(len(dict_key_ids[_key]))
- print("==")
- for _key in list_key:
- print(dict_key_count[_key])
- print("==")
- def getIndustryCompany():
- def _handle(item,result_queue,pool_mongo,pool_neo4j,pool_mysql,pool_ots):
- # mongoDB = getConnect_mongodb()
- log(item["enterprise_name"])
- mongoDB = pool_mongo.getConnector()
- # coll_zb = mongoDB.enterprise_profile
- # rows = coll_zb.find({"enterprise_name":item["enterprise_name"]},{"enterprise_name":1,"legalPersonName":1,"actualCapital":1, "regCapital":1,"estiblishTime":1,"socialStaffNum":1,"legal_person":1,"phone":1,"businessScope":1,"industry":1 })
- # for row in rows:
- # item["regCapital"] = row.get("regCapital","")
- # item["legal_person"] = row.get("legal_person","")
- # item["phone"] = row.get("phone","")
- # item["actualCapital"] = row.get("actualCapital","")
- # item["industry"] = row.get("industry","")
- # item["estiblishTime"] = row.get("estiblishTime","")
- # item["socialStaffNum"] = row.get("socialStaffNum","")
- # item["businessScope"] = row.get("businessScope","")
- # graph = getConnect_neo4j()
- ots_client = pool_ots.getConnector()
- primary_key = [('name',item["enterprise_name"])]
- columns_to_get = ["reg_capital","legal_person","phone","actual_capital","industry","estiblishTime","social_staff_num","business_scope"]
- consumed, return_row, next_token = ots_client.get_row("enterprise",primary_key, columns_to_get, None, 1)
- if return_row is not None:
- for att in return_row.attribute_columns:
- item[att[0]] = att[1]
- list_same_industry_company = []
- if "industry" in item:
- bool_query = BoolQuery(must_queries=[TermQuery("industry",item["industry"])])
- col = ColumnsToGet(['enterprise_name'], ColumnReturnType.SPECIFIED)
- rows, next_token, total_count, is_all_succeed = ots_client.search("enterprise", "enterprise_index",
- SearchQuery(bool_query, limit=10, get_total_count=True),
- col)
- for row in rows:
- for item1 in row[0]:
- list_same_industry_company.append(item1[1])
- # if "industry" in item:
- # rows = coll_zb.find({"industry":item["industry"]},{"enterprise_name":1}).limit(10)
- # for row in rows:
- # print(row)
- # list_same_industry_company.append(row.get("enterprise_name",""))
- item["same_industry_company"] = list_same_industry_company
- graph = pool_neo4j.getConnector()
- company_name = item["enterprise_name"]
- cql = "MATCH (n:Organization)-[r:ZhongBiaoRelation]->(p:Project) where n.name='%s' RETURN count(p) as _c "%(company_name)
- finded = graph.run(cql)
- data = json.loads(json.dumps(finded.data()))
- _count = data[0]["_c"]
- # list_project = []
- # for _data in data:
- # if _count<=3:
- # if "zhong_biao_page_time" in _data and _data["zhong_biao_page_time"]>"2019-01-01":
- # if _data["project_name"] is not None:
- # list_project.append(_data["project_name"])
- # _count += 1
- item["count"] = _count
- # item["project"] = str(list_project)
- result_queue.put(item)
- pool_mongo.putConnector(mongoDB)
- pool_neo4j.putConnector(graph)
- pool_ots.putConnector(ots_client)
- log_tofile("export.log")
- pool_mongo = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_mongodb)
- pool_neo4j = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_neo4j)
- pool_mysql = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_mysql)
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- # list_company = getCompanys()
- # filename = "".join(["环境","生态","再生","回收","环保"])
- list_company = []
- filename = "../data/同行客户匹配.xlsx"
- df = pd.read_excel(filename,sheet_name=0)
- for _com in df["公司名称"]:
- print(_com)
- if _com is not None and _com.strip()!="":
- _company = {"enterprise_name":_com}
- list_company.append(_company)
- task_queue = queue.Queue()
- for item in list_company:
- task_queue.put(item)
- result_queue = queue.Queue()
- _muti = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_mongo=pool_mongo,pool_neo4j=pool_neo4j,pool_mysql=pool_mysql,pool_ots=pool_ots)
- _muti.run()
- df_company = {}
- set_key = set()
- if len(list_company)>0:
- for item in list_company:
- for _key in item.keys():
- set_key.add(_key)
- if _key not in df_company:
- df_company[_key] = []
- list_key = list(set_key)
- for item in list_company:
- for _key in list_key:
- df_company[_key].append(item.get(_key,""))
- df1 = pd.DataFrame(df_company)
- df1.to_excel("%s_export.xlsx"%(filename))
- def exportWin_tenderer(time_from,time_to):
- '''
- export winning-bid documents (docchannel 101) whose page_time falls in [time_from,time_to],
- enrich each winner with enterprise and contact info, and write an Excel file
- :return:
- '''
- ots_client = getConnect_ots()
- last_docid = 0
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time",time_from,time_to,include_lower=True,include_upper=True),
- TermQuery("docchannel",101),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- RangeQuery('docid', last_docid, include_lower=False)])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True),
- ColumnsToGet(["project_name","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
- list_project = []
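- # each OTS row is a [primary_keys, attribute_columns] pair of (name,value) tuples;
- # the positional indexing below assumes project_name and sub_docs_json are returned
- # in that fixed order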
- def _getRow(list_project,rows,last_docid):
- for row in rows:
- project_name = row[1][0][1]
- docid = row[0][1][1]
- last_docid = docid
- list_pack = json.loads(row[1][1][1])
- _set_tenderer = set()
- win_tenderer = ""
- for _pack in list_pack:
- if "win_tenderer" in _pack and win_tenderer=="":
- win_tenderer = _pack["win_tenderer"]
- if "second_tenderer" in _pack:
- _set_tenderer.add(_pack["second_tenderer"])
- if "third_tenderer" in _pack:
- _set_tenderer.add(_pack["third_tenderer"])
- list_project.append({"docid":docid,"project_name":project_name,"win_tenderer":win_tenderer,"tenderer":list(_set_tenderer)})
- return last_docid
- last_docid = _getRow(list_project,rows,last_docid)
- while(next_token):
- print("%d/%d"%(len(list_project),total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(["project_name","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
- last_docid = _getRow(list_project,rows,last_docid)
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for item in list_project:
- task_queue.put(item)
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- def _handle(item,result_queue,pool_ots):
- if item["win_tenderer"]!="":
- ots_client = pool_ots.getConnector()
- consumed, return_row, next_token = ots_client.get_row("enterprise", [("name",item["win_tenderer"])], ["province","reg_capital","estiblish_time","business_scope"], None, 1)
- _dict = dict()
- for _item in return_row.attribute_columns:
- _dict[_item[0]] = _item[1]
- for _key in _dict.keys():
- item[_key] = _dict[_key]
- data = solrQuery("contact",{"q":'company_name:"%s"'%item["win_tenderer"],"fl":"contact_person,mobile_no,phone_no","rows":10})
- for _item in data["response"]["docs"]:
- for _key in _item.keys():
- item[_key] = _item[_key]
- break
- pool_ots.putConnector(ots_client)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
- mt.run()
- keys = ["docid","project_name","win_tenderer","tenderer","province","reg_capital","business_scope","estiblish_time","contact_person","mobile_no","phone_no"]
- df_data = {}
- for _key in keys:
- df_data[_key] = []
- for item in list_project:
- for _key in keys:
- if _key in item:
- df_data[_key].append(item[_key])
- else:
- df_data[_key].append("")
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s-%s中标信息.xlsx"%(time_from,time_to),columns=keys)
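- # Split the tenderer lists out of the exported 中标信息 workbook, de-duplicate the
- # company names, fetch contact_person/mobile_no/phone_no from Solr, strip characters
- # Excel cannot store, and write a 竞争对手信息 workbook.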
- def exportContact():
- time_from = "2021-01-14"
- time_to = "2021-01-15"
- filename = "../data/%s-%s中标信息.xlsx"%(time_from,time_to)
- df1 = pd.read_excel(filename)
- set_company = set()
- for item in df1["tenderer"]:
- list_company = re.split(r"\['|', '|'\]|\[\]",item)
- for _company in list_company:
- if _company!="":
- set_company.add(_company)
- companys = list(set_company)
- task_queue = queue.Queue()
- list_company = []
- for _company in companys:
- item = {"company_name":_company}
- list_company.append(item)
- task_queue.put(item)
- result_queue = queue.Queue()
- def _handle(item,result_queue):
- company = item["company_name"]
- data = solrQuery("contact",{"q":'company_name:"%s"'%company,"fl":"company_name,contact_person,mobile_no,phone_no","rows":10})
- if data is not None:
- for _item in data["response"]["docs"]:
- for _key in _item.keys():
- item[_key] = _item[_key]
- break
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
- mt.run()
- keys = ["company_name","contact_person","mobile_no","phone_no"]
- df_data = {}
- for _key in keys:
- df_data[_key] = []
- ILLEGAL_CHARACTERS_RE = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')
- for item in list_company:
- for _key in keys:
- if _key in item:
- df_data[_key].append(ILLEGAL_CHARACTERS_RE.sub(r'', item[_key]))
- else:
- df_data[_key].append("")
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s-%s竞争对手信息.xlsx"%(time_from,time_to),columns=keys)
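- # Stream one week of zhaobiao articles from MySQL, strip the HTML with BeautifulSoup,
- # and count how many articles mention each "细分类别" keyword via regex search.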
- def countKeyword():
- conn = getConnection_mysql()
- cursor = conn.cursor()
- print(0)
- sql = "select dochtmlcon from sys_document_22 where docchannel=52 and page_time>='2020-09-01' and page_time<='2020-09-07'"
- cursor.execute(sql)
- print(0.1)
- df = pd.read_excel("万郡绿建细分关键词.xls")
- list_keywords = df["细分类别"]
- dict_keywords = dict()
- for _key in list_keywords:
- dict_keywords[_key] = 0
- print(1)
- from bs4 import BeautifulSoup
- while(True):
- rows = cursor.fetchmany(10000)
- print("==")
- if not rows:
- break
- for row in rows:
- _html = BeautifulSoup(row[0],"lxml").getText()
- for _key in list_keywords:
- if re.search(_key,_html) is not None:
- dict_keywords[_key] += 1
- print(dict_keywords)
- list_count = []
- for _key in list_keywords:
- list_count.append(dict_keywords[_key])
- df1 = pd.DataFrame({"关键字":list_keywords,"数量":list_count})
- df1.to_excel("关键词统计.xlsx")
- def countKeyword_solr():
- def _handle(item,result_queue):
- keyword = item["keyword"]
- data = solrQuery("document",{"q":'dochtmlcon:"%s" AND docchannel:101 AND dochtmlcon:"法院" '%keyword,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])',"fl":"id","rows":10})
- _num = data["response"]["numFound"]
- item["zhongbiao"] = _num
- data = solrQuery("document",{"q":'dochtmlcon:"%s" AND docchannel:52 AND dochtmlcon:"法院"'%keyword,"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-12-31T23:59:59Z])',"fl":"id","rows":10})
- _num = data["response"]["numFound"]
- item["zhaobiao"] = _num
- result_queue.put(item)
- file = "../data/关键词11.xlsx"
- df = pd.read_excel(file)
- task_queue = queue.Queue()
- print(df.keys())
- for item in df["业务关键词"]:
- task_queue.put({"keyword":item,"zhaobiao":0,"zhongbiao":0})
- result_queue = queue.Queue()
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=10)
- mt.run()
- list_keyword = []
- list_zhaobiao = []
- list_zhongbiao = []
- while(True):
- try:
- item = result_queue.get(False)
- list_keyword.append(item["keyword"])
- list_zhaobiao.append(item["zhaobiao"])
- list_zhongbiao.append(item["zhongbiao"])
- except queue.Empty:
- break
- df1 = pd.DataFrame({"业务关键词":list_keyword,"招标公告":list_zhaobiao,"中标信息":list_zhongbiao})
- df1.to_excel("%s_export.xlsx"%file,columns=["业务关键词","招标公告","中标信息"])
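- # One-off sanity check: print how many Hunan documents across several channels
- # mention "法律" in early 2020.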
- def query_from_solr():
- data = solrQuery("document",{"q":'dochtmlcon:"法律" AND (docchannel:51 OR docchannel:104 or docchannel:52 or docchannel:102) AND province:"湖南" ',"fq":'(publishtime:[2020-01-01T00:00:00Z TO 2020-01-20T23:59:59Z])',"fl":"id","rows":10})
- _num = data["response"]["numFound"]
- print(_num)
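- # Count 2018 zhaobiao documents per province for a fixed logistics keyword list and
- # print a province/count table; note the live query prefix-matches the city field
- # with the province name, which only works where city values start with the province.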
- def export_province_keyword_count():
- def _handle(item,result_queue,pool_ots):
- columns = ["doctitle","docchannel","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]
- ots_client = pool_ots.getConnector()
- _province = item["province"]
- print(item)
- # keywords = item["keyword"]+" "+item["key"]
- list_keyword = item["keyword"]
- # for _temp in keywords.split(" "):
- # if len(_temp)>0:
- # list_keyword.append(_temp)
- should_queries = []
- must_not_q = []
- for _temp in list_keyword:
- should_queries.append(MatchPhraseQuery("doctitle","%s"%_temp))
- must_not_q.append(WildcardQuery("tenderee","*%s*"%_temp))
- bool_query_keyword = BoolQuery(should_queries=should_queries,minimum_should_match=2)
- page_time = item["page_time"]
- bool_query = BoolQuery(must_queries=[#bool_query_keyword
- # ,WildcardQuery("publishtime","%s*"%page_time)
- # ,MatchPhraseQuery("doctitle","服务")
- RangeQuery("page_time","2018-01-01","2019-01-01",include_lower=True,include_upper=False),
- TermQuery("docchannel",52),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- WildcardQuery('city', '%s*'%_province)
- # ,NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_tenderer",0,include_lower=True))
- ]
- # ,must_not_queries=must_not_q
- )
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("sub_docs_json.bidding_budget",SortOrder.DESC)]), limit=20, get_total_count=True),
- ColumnsToGet(column_names=columns,return_type=ColumnReturnType.SPECIFIED))
- item["count"] = total_count
- list_data = getRow_ots(rows)
- item["list_data"] = list_data
- print(item)
- pool_ots.putConnector(ots_client)
- df = pd.read_excel("../data/省份信息.xlsx")
- list_province = []
- for _name,_type,_parent in zip(df["cname"],df["ctype"],df["parentid"]):
- if _type==30 and _parent==4:
- list_province.append(_name)
- # filename = "../data/2021-02关键词导出数据.xlsx"
- # dict_keyword = {}
- # df1 = pd.read_excel(filename,dtype=str)
- # for _key,_keyword in zip(df1["key1"],df1["keyword"]):
- # print("===",str(_keyword))
- # dict_keyword[_key] = "" if str(_keyword)=="nan" else _keyword
- # for _key in df1["关键词"]:
- # dict_keyword[_key] = ""
- keyword_str = '''
- 快递 物流 供应链 运输 配送
- 仓储 冷链 整车 服务
- '''
- list_key = []
- for _k in re.split(r"\s",keyword_str):
- _k1 = _k.strip()
- if len(_k1)>0:
- list_key.append(_k1)
- list_task = []
- page_time = "2020-11"
- for _province in list_province:
- list_task.append({"page_time":page_time,"province":_province,"key":list_key,"keyword":list_key,"count":0})
- task_queue = queue.Queue()
- for item in list_task:
- task_queue.put(item)
- result_queue = queue.Queue()
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
- mt.run()
- dict_key_data = dict()
- # list_data = []
- # for item in list_task:
- # list_data.extend(item["list_data"])
- # dict_channel = getDict_docchannel()
- # df_data= {}
- # print(list_data)
- # for row in list_data:
- # item = {}
- # _dict = row
- # set_dict_item(item,"docid",_dict.get("docid",""))
- # set_dict_item(item,"公告标题",_dict.get("doctitle",""))
- # set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
- # set_dict_item(item,"省份",_dict.get("province",""))
- # # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- # set_dict_item(item,"城市",_dict.get("city",""))
- # set_dict_item(item,"发布时间",_dict.get("page_time",""))
- #
- # set_dict_item(item,"项目编号",_dict.get("project_code",""))
- # set_dict_item(item,"招标单位",_dict.get("tenderee",""))
- # set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
- # set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
- # set_dict_item(item,"代理单位",_dict.get("agency",""))
- # set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
- # set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
- # set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
- #
- # sub_docs_json = _dict.get("sub_docs_json")
- # for _doc in json.loads(sub_docs_json):
- # if "win_tenderer" in _doc:
- # set_dict_item(item,"中标单位",_doc["win_tenderer"])
- # if "win_tenderee_manager" in _doc:
- # set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
- # if "win_tenderee_phone" in _doc:
- # set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
- # if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
- # set_dict_item(item,"中标金额",_doc["win_bid_price"])
- # if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
- # set_dict_item(item,"招标金额",_doc["bidding_budget"])
- # if "招标金额" not in item:
- # set_dict_item(item,"招标金额","")
- # if "中标金额" not in item:
- # set_dict_item(item,"中标金额","")
- # if "中标单位" not in item:
- # set_dict_item(item,"中标单位","")
- # if "中标单位联系人" not in item:
- # set_dict_item(item,"中标单位联系人","")
- # if "中标单位联系电话" not in item:
- # set_dict_item(item,"中标单位联系电话","")
- #
- #
- # _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
- # # if _line in set_line:
- # # continue
- # # if item["招标金额"]=="":
- # # continue
- # # set_line.add(_line)
- # for k,v in item.items():
- # if k not in df_data:
- # df_data[k] = []
- # df_data[k].append(v)
- # df1 = pd.DataFrame(df_data)
- # df1.to_excel("../data/%s_顺丰中标数据.xlsx"%getCurrent_date('%Y-%m-%d_%H%M%S'),columns=list_df_columns)
- for item in list_task:
- print("%s\t%d"%(item["province"],item["count"]))
- # for item in list_task:
- # dict_key_data[item["key"]][item["province"]] = item
- # dict_key_province = dict()
- # dict_key_province["关键词"] = []
- # for _province in list_province:
- # dict_key_province[_province] = []
- # for _key in dict_keyword.keys():
- # dict_key_province["关键词"].append(_key)
- # for _province in list_province:
- # dict_key_province[_province].append(dict_key_data[_key][_province]["count"])
- # columns = ["关键词"]
- # columns.extend(list_province)
- # df2 = pd.DataFrame(dict_key_province)
- # df2.to_excel("../data/%s_导出数据.xlsx"%filename,columns=columns)
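- # For each company name from a CSV, count documents in [range_from,range_to) matching
- # the name in title/body/attachment: once across several channels, once zhaobiao-only
- # (52), and once for zhongbiao-like channels (101,118-120); export the counts to Excel.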
- def export_keyword_count():
- def _handle(item,result_queue,ots_client):
- should_q_docchannel = BoolQuery(should_queries=[
- # RangeQuery("docchannel",51,105,True,True)
- TermQuery("docchannel",101),
- RangeQuery("docchannel",118,120,True,True)
- ]
- )
- should_q_keyword = BoolQuery(should_queries=[
- MatchPhraseQuery("doctitle",item["keyword"]),
- MatchPhraseQuery("doctextcon",item["keyword"]),
- MatchPhraseQuery("attachmenttextcon",item["keyword"])
- ])
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time",item["range_from"],item["range_to"],True,False),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- generateBoolShouldQuery(["docchannel"],[51, 52, 101, 118, 119, 120, 114, 103],TermQuery),
- # TermQuery("docchannel",101),
- # should_q_docchannel,
- should_q_keyword
- # MatchPhraseQuery(item["type"], item["keyword"])
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=1, get_total_count=True),
- ColumnsToGet(return_type=ColumnReturnType.ALL))
- item["total_count"] = total_count
- item["exists"] = 1 if total_count>0 else 0
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time",item["range_from"],item["range_to"],True,False),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- TermQuery("docchannel",52),
- # should_q_docchannel,
- should_q_keyword
- # MatchPhraseQuery(item["type"], item["keyword"])
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=1, get_total_count=True),
- ColumnsToGet(return_type=ColumnReturnType.ALL))
- item["zhaobiao_count"] = total_count
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time",item["range_from"],item["range_to"],True,False),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- # TermQuery("docchannel",101),
- should_q_docchannel,
- should_q_keyword,
- # MatchPhraseQuery(item["type"], item["keyword"])
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=1, get_total_count=True),
- ColumnsToGet(return_type=ColumnReturnType.ALL))
- item["zhongbiao_count"] = total_count
- range_from = "2022-02-21"
- range_to = "2022-02-22"
- _type = "doctitle"
- assert _type in ["doctitle","doctextcon"]
- list_dict_key_count = []
- filename = "注册资金大于3kw.csv"
- df = pd.read_csv(filename)
- for item in df["name"]:
- list_dict_key_count.append({"keyword":item,"count":0,"exists":0,"range_from":range_from,"range_to":range_to,"type":_type})
- # str_keys = '''
- # 智慧税务
- # 发票管理
- #
- # '''
- # for item in re.split("\s|\r|\n|,|,|、",str_keys):
- # if item.strip()!="":
- # list_dict_key_count.append({"keyword":item,"total_count":0,"zhaobiao_count":0,"zhongbiao_count":0,"range_from":range_from,"range_to":range_to,"type":_type})
- task_queue = queue.Queue()
- for item in list_dict_key_count:
- task_queue.put(item)
- result_queue = queue.Queue()
- ots_client = getConnect_ots()
- # pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=60,ots_client=ots_client)
- mt.run()
- columns = ["keyword","total_count","exists","zhaobiao_count","zhongbiao_count","range_from","range_to","type"]
- df_data = {}
- for _c in columns:
- df_data[_c] = []
- for item in list_dict_key_count:
- print(item)
- for _c in columns:
- if _c in item:
- df_data[_c].append(item[_c])
- else:
- df_data[_c].append("")
- for k,v in df_data.items():
- print(k,len(v))
- df2 = pd.DataFrame(df_data)
- df2.to_excel("%s_数量导出全部类别.xlsx"%getCurrent_date("%Y-%m-%d_%H%M%S"),columns=columns)
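- # Export docid/doctitle/tenderee for documents whose title mentions any of several
- # medical-institution terms, paging through OTS and splitting the output into
- # 100k-row CSV chunks.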
- def export_keyword_title():
- ots_client = getConnect_ots()
- range_from = "2020-01-01"
- range_to = "2022-12-23"
- list_condition = [["医务室"],
- ["医院"],
- ["卫生院"],
- ["卫生所"],
- ["卫生室"],
- ["社区卫生服务中心"]]
- list_should_query = []
- for _c in list_condition:
- if len(_c)==1:
- list_should_query.append(MatchPhraseQuery("doctitle",_c[0]))
- else:
- _must_query = []
- for _q in _c:
- _must_query.append(MatchPhraseQuery("doctitle",_q))
- list_should_query.append(BoolQuery(must_queries=_must_query))
- keyword_query = BoolQuery(should_queries=list_should_query)
- bool_query = BoolQuery(must_queries=[RangeQuery("publishtime",range_from,range_to),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- keyword_query
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(["docid","doctitle","tenderee"],return_type=ColumnReturnType.SPECIFIED))
- df_data = {"docid":[],"doctitle":[],"tenderee":[]}
- def setData(df_data,rows):
- list_dict = getRow_ots(rows)
- for _dict in list_dict:
- docid = _dict.get("docid","")
- doctitle = _dict.get("doctitle","")
- tenderee = _dict.get("tenderee","")
- df_data["docid"].append(docid)
- df_data["doctitle"].append(doctitle)
- df_data["tenderee"].append(tenderee)
- setData(df_data,rows)
- _count = len(rows)
- while next_token:
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(["docid","doctitle","tenderee"],return_type=ColumnReturnType.SPECIFIED))
- setData(df_data,rows)
- _count += 100
- print(_count,total_count)
- file_begin = 0
- file_length = 100000
- _count = 0
- while file_begin<len(df_data["docid"]):
- _dict = dict()
- for _key,v in df_data.items():
- _dict[_key] = v[file_begin:file_begin+file_length]
- _count += 1
- file_begin += file_length
- df = pd.DataFrame(_dict)
- df.to_csv("../data/%s-%s_tenderee_doctitle_%d.csv"%(range_from,range_to,_count))
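- # The commented-out block sampled up to 5 zhaobiao and 5 zhongbiao articles per
- # web_source_no into websouce_doc.csv; the live code slices that CSV into smaller
- # document_id/document_text CSVs, skipping texts longer than 100k characters.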
- def exportArticle_by_websource():
- # conn = getConnection_testmysql()
- # cursor = conn.cursor()
- # sql = "select web_source_no from web_source"
- # cursor.execute(sql)
- # rows = cursor.fetchmany(10)
- # dict_websource = dict()
- # while(rows):
- # for row in rows:
- # web_source_no = row[0]
- # dict_websource[web_source_no] = []
- # rows = cursor.fetchmany(1000)
- #
- # task_queue = queue.Queue()
- # for _key in dict_websource.keys():
- # task_queue.put({"key":_key,"list":dict_websource[_key]})
- #
- # pool_ots = ConnectorPool(init_num=100,max_num=1000,method_init=getConnect_ots)
- # result_queue = queue.Queue()
- # def _handle(item,result_queue,pool_ots):
- # _key = item["key"]
- # print(_key)
- # ots_client = pool_ots.getConnector()
- # bool_query = BoolQuery(must_queries=[RangeQuery('status', '201', '300', include_lower=True, include_upper=True),
- # TermQuery('web_source_no', '%s'%_key)
- # ])
- #
- # is_all_succeed = False
- #
- # while(not is_all_succeed):
- # try:
- # rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- # SearchQuery(bool_query, limit=100, get_total_count=True),
- # ColumnsToGet(["docid","docchannel","dochtmlcon"],return_type=ColumnReturnType.SPECIFIED))
- # list_zhaobiao = []
- # list_zhongbiao = []
- # for row in rows:
- # _dict = dict()
- # for values in row:
- # for _v in values:
- # _dict[_v[0]] = _v[1]
- # if _dict["docchannel"]==52:
- # list_zhaobiao.append(_dict)
- # elif _dict["docchannel"]==101:
- # list_zhongbiao.append(_dict)
- # item["list"].extend(list_zhaobiao[:5])
- # item["list"].extend(list_zhongbiao[:5])
- # except Exception as e:
- # print(str(e))
- #
- # pool_ots.putConnector(ots_client)
- #
- # mt = MultiThreadHandler(task_queue = task_queue,task_handler=_handle,result_queue=result_queue,thread_count=100,pool_ots=pool_ots)
- # mt.run()
- # df_data = {"docid":[],"web_source_no":[],"docchannel":[],"dochtmlcon":[]}
- # for k,v in dict_websource.items():
- # for item in v:
- # df_data["docid"].append(item["docid"])
- # df_data["web_source_no"].append(k)
- # df_data["docchannel"].append(item["docchannel"])
- # df_data["dochtmlcon"].append(item["dochtmlcon"])
- # df = pd.DataFrame(df_data)
- # df.to_csv("../data/websouce_doc.csv",columns=["docid","web_source_no","docchannel","dochtmlcon"],encoding="UTF8")
- df = pd.read_csv("../data/other/websouce_doc.csv")
- df_2000 = {"document_id":[],"document_text":[]}
- print("total_count",len(df["docid"]))
- begin = 230000
- end = 260000
- _count = 0
- for _id,_text in zip(df["docid"][begin:end],df["dochtmlcon"][begin:end]):
- if len(_text)>100000:
- continue
- df_2000["document_id"].append(_id)
- df_2000["document_text"].append(_text)
- df_2 = pd.DataFrame(df_2000)
- df_2.to_csv("../data/websouce_doc_%d-%d.csv"%(begin,end),columns=["document_id","document_text"],encoding="utf8",index=False)
- # save(dict_websource,"../data/dict_websource.pk")
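- # Return the first win_tenderer found in a sub_docs_json string, or "" if absent,
- # e.g. getWinTenderer('[{"win_tenderer":"A公司"}]') returns "A公司".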
- def getWinTenderer(sub_doc_json):
- if sub_doc_json is not None:
- sub_doc = json.loads(sub_doc_json)
- for _doc in sub_doc:
- if "win_tenderer" in _doc:
- return _doc["win_tenderer"]
- return ""
- def exportDocument_by_keywords(page_time,
- list_keyword = ["创客","STEAM","人工智能","课程服务","机器人中学","机器人小学","机器人幼儿园","机器人学校","Labplus","盛思","makeblock柴火","寓乐湾","美科科技","STEAM","能力风暴","优必选","蘑菇云","Dfrobot","中鸣","飞瑞敖","编程猫培生","八爪鱼","八爪鱼教育","童心制物"]):
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for _k in list_keyword:
- task_queue.put(_k)
- def _handle(keyword,result_queue):
- should_queries = []
- for _temp in [keyword]:
- should_queries.append(MatchPhraseQuery("doctitle",_temp))
- bool_query_keyword = BoolQuery(should_queries=should_queries)
- ots_client = getConnect_ots()
- bool_query = BoolQuery(must_queries=[RangeQuery('publishtime', range_from='2017-12-20'),
- MatchPhraseQuery("doctitle",keyword),
- TermQuery("docchannel","101")
- ])
- is_all_succeed = False
- _count = 0
- total_count = 1
- next_token = None
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True),
- ColumnsToGet(["docid","tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
- # fetch-then-process inside the loop so the first page is not emitted twice and the
- # last page is not dropped
- def _putRows(rows):
- for row in rows:
- _dict = dict()
- for values in row:
- for _v in values:
- _dict[_v[0]] = _v[1]
- result_queue.put({"docid":_dict.get("docid",""),"keyword":keyword,"tenderee":_dict.get("tenderee",""),"win_tenderer":getWinTenderer(_dict.get("sub_docs_json",None))})
- _putRows(rows)
- print(keyword,next_token,total_count)
- while(next_token):
- try:
- _count += len(rows)
- print("%s:%d/%d"%(keyword,_count,total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(["docid","tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
- _putRows(rows)
- except Exception as e:
- traceback.print_exc()
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30)
- mt.run()
- df_data = {"docid":[],"keyword":[],"tenderee":[],"win_tenderer":[]}
- while(True):
- try:
- item = result_queue.get(block=True,timeout=1)
- for _k in df_data.keys():
- if _k in item:
- df_data[_k].append(item[_k])
- else:
- df_data[_k].append("")
- except queue.Empty as e:
- break
- except Exception as e:
- traceback.print_exc()
- df = pd.DataFrame(df_data)
- df.to_csv("../data/exportArticle1_title.csv",columns=["docid","keyword","tenderee","win_tenderer"])
- def exportGovement():
- should_queries1 = []
- for _temp in ["教育局","地化所","税务局","国土局","学校","大学","中学","小学","幼儿园","医院"]:
- should_queries1.append(WildcardQuery("tenderee","*%s*"%_temp))
- should_queries2 = []
- for _temp in ["浙江","江苏","湖北","西北","陕西","甘肃","青海","宁夏","新疆","重庆","四川","云南","贵州"]:
- should_queries2.append(WildcardQuery("province","*%s*"%_temp))
- ots_client = getConnect_ots()
- page_time = "2020-12"
- bool_query = BoolQuery(must_queries=[BoolQuery(should_queries=should_queries1),
- BoolQuery(should_queries=should_queries2),
- TermQuery("docchannel","52"),
- RangeQuery("publishtime",page_time)])
- columns = ["tenderee","tenderee_contact","tenderee_phone"]
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=100, sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- print(total_count)
- def getRow(rows,df_data,columns):
- for row in rows:
- _dict = dict()
- for part in row:
- for item in part:
- _dict[item[0]] = item[1]
- if "tenderee_contact" in _dict and "tenderee_phone" in _dict:
- for key in columns:
- df_data[key].append(_dict.get(key,""))
- all_rows = 0
- df_data = {}
- for key in columns:
- df_data[key] = []
- getRow(rows,df_data,columns)
- _count = 100
- while(next_token):
- print(_count,total_count)
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100,get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- _count += 100
- getRow(rows,df_data,columns)
- df2 = pd.DataFrame(df_data)
- df2.to_excel("../data/%s政府招标人导出数据.xlsx"%page_time,columns=columns)
- def export_attachment():
- filename = "../data/销售部-字段名.xlsx"
- df = pd.read_excel(filename)
- list_dict = []
- for _key in df["关键词"]:
- if len(_key)>0:
- list_dict.append({"keyword":_key})
- def _handle(_dict,result_queue,pool_ots):
- _keyword = _dict["keyword"]
- ots_client = pool_ots.getConnector()
- keyword_query = BoolQuery(should_queries=[MatchPhraseQuery("doctextcon",_keyword),
- MatchPhraseQuery("doctitle",_keyword)])
- bool_query = BoolQuery(must_queries=[RangeQuery("status","201","300",include_upper=True,include_lower=True),
- keyword_query])
- columns = ["dochtmlcon","docid"]
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]), limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- print(_keyword,total_count)
- hit_count = 0
- def getData(rows,hit_count):
- list_fields = getRow_ots(rows)
- for _fields in list_fields:
- dochtmlcon = _fields["dochtmlcon"]
- docid = _fields["docid"]
- _flag = False
- for url,suffix in re.findall("(http://[^\'\";;\n]+?\.(zip|rar|doc|xls|xlsx|pdf|txt))",dochtmlcon):
- try:
- result = requests.get(url,stream=True,timeout=100)
- if result.status_code==200:
- filename = get_file_name(url,result.headers)
- with open("../data/attachment/%s_%d_%s"%(_keyword,docid,filename),"wb") as f:
- f.write(result.content)
- _flag = True
- except Exception:
- pass
- if _flag:
- hit_count += 1
- return hit_count
- hit_count = getData(rows,hit_count)
- _count = len(rows)
- while next_token:
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- _count += len(rows)
- if _count>20000:
- break
- hit_count = getData(rows,hit_count)
- if hit_count>20:
- break
- pool_ots.putConnector(ots_client)
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for item in list_dict:
- task_queue.put(item)
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
- mt.run()
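- # A quick check of what the attachment regex in export_attachment captures;
- # the sample snippet is made up for illustration:
- # >>> re.findall(r"(http://[^\'\";;\n]+?\.(zip|rar|doc|xlsx|xls|pdf|txt))",
- # ...            '<a href="http://example.com/files/bid.pdf">附件</a>')
- # [('http://example.com/files/bid.pdf', 'pdf')]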
- def exportIndustryCount():
- import codecs
- time_from = "2020-12-21"
- time_to = "2020-12-25"
- # dict_channel = {"51":{"type":"公告变更"},
- # "52":{"type":"招标公告"},
- # "101":{"type":"中标信息"},
- # "102":{"type":"招标预告"},
- # "103":{"type":"招标答疑"},
- # "104":{"type":"招标文件"},
- # "105":{"type":"资审结果"},
- # "103":{"type":"招标控制价"},
- # "100":{"type":"未知类型"}}
- dict_industry = {}
- meta_industry = load("../data/other/class2dalei_menlei.pkl")
- for _key in meta_industry.keys():
- dict_industry[_key] = {"type":_key}
- print(dict_industry.keys())
- return  # NOTE: early return for key inspection; everything below in this function is currently unreachable
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- for _key in dict_industry.keys():
- task_queue.put(dict_industry[_key])
- def _handle(item,result_queue,pool_ots):
- ots_client = pool_ots.getConnector()
- bool_query = BoolQuery(must_queries=[TermQuery("info_type",item["type"]),
- RangeQuery("publishtime",time_from,time_to,include_lower=True,include_upper=True)])
- columns = ["docid"]
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=1,get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- item["count"] = total_count
- columns = ["dochtmlcon"]
- bool_query = BoolQuery(must_queries=[TermQuery("info_type",item["type"])])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=10,sort=Sort(sorters=[FieldSort("publishtime",SortOrder.ASC)]),get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- for row in rows:
- _dict = dict()
- for part in row:
- for v in part:
- _dict[v[0]] = v[1]
- with codecs.open("../data/industry/%s_%d.html"%(item["type"],_dict["docid"]),"w",encoding="UTF8") as f:
- f.write(_dict["dochtmlcon"])
- pool_ots.putConnector(ots_client)
- pool_ots = ConnectorPool(init_num=20,max_num=30,method_init=getConnect_ots)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=30,pool_ots=pool_ots)
- mt.run()
- columns = ["type","count"]
- df_data = {}
- for _c in columns:
- df_data[_c] = []
- for _indus in dict_industry.keys():
- for _c in columns:
- df_data[_c].append(dict_industry[_indus][_c])
- df = pd.DataFrame(df_data)
- # df.to_excel("../data/%s-%s_industry_count.xlsx"%(time_from,time_to),columns=columns)
- df.to_csv("../data/%s-%s_industry_count.xlsx"%(time_from,time_to),columns=columns)
- def exportDocument_By_time(time_from,time_to,columns=["docid","doctitle","project_name","dochtmlcon"]):
- '''
- export documents whose page_time falls within [time_from,time_to] to an excel file
- :return:
- '''
- ost_client = getConnect_ots()
- last_docid = 0
- bool_query = BoolQuery(must_queries=[RangeQuery("page_time",time_from,time_to,include_lower=True,include_upper=True),
- RangeQuery('status', '201', '300', include_lower=True, include_upper=True)])
- rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]) , limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- _count = len(rows)
- df_data = {}
- def getData(df_data,rows):
- list_dict = getRow_ots(rows)
- for _dict in list_dict:
- for _k,_v in _dict.items():
- if _k not in df_data:
- df_data[_k] = []
- df_data[_k].append(getLegal_str(_v))
- getData(df_data,rows)
- while(next_token):
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ost_client.search("document", "document_index",
- SearchQuery(bool_query,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- _count += len(rows)
- getData(df_data,rows)
- df = pd.DataFrame(df_data)
- df.to_excel("%s/%s-%s公告信息.xlsx"%(data_path,time_from,time_to),columns=columns)
- def processDocument():
- filename = "../data/2021-01-29-2021-01-29公告信息.xlsx"
- df = pd.read_excel(filename)
- df.to_csv("../data/2021-01-29-2021-01-29公告信息.csv")
- return
- list_dict = []
- for docid,doctitle,project_name,dochtmlcon in zip(df["docid"],df["doctitle"],df["project_name"],df["dochtmlcon"]):
- list_dict.append({"docid":docid,"doctitle":doctitle,"project_name":project_name,"dochtmlcon":dochtmlcon})
- task_queue = queue.Queue()
- for _dict in list_dict:
- task_queue.put(_dict)
- result_queue = queue.Queue()
- def _handle(_dict,result_queue,pool_mysql):
- conn = pool_mysql.getConnector()
- cursor = conn.cursor()
- sql = "insert into test_extract(docid,doctitle,page_time) values(%d,%s,%s)"%(_dict["docid"],_dict["doctitle"],_dict["dochtmlcon"])
- cursor.execute(sql)
- conn.commit()
- pool_mysql.putConnector(conn)
- # url = "http://192.168.2.101:15030"
- # myheaders = {'Content-Type': 'application/json'}
- # print(int(_dict["docid"]))
- # data = {"doc_id":int(_dict["docid"]),"title":_dict["doctitle"],"content":_dict["dochtmlcon"]}
- # resp = requests.post(url,json=data,headers=myheaders, verify=True)
- # result = json.loads(resp.content.decode("utf8"),"utf8")
- # _dict["product"] = result["product"]
- pool_mysql = ConnectorPool(init_num=20,max_num=30,method_init=getConnection_testmysql)
- mt = MultiThreadHandler(task_queue=task_queue,task_handler=_handle,result_queue=result_queue,thread_count=5,pool_mysql=pool_mysql)
- mt.run()
- # columns = ["docid","doctitle","project_name","product"]
- #
- # df_data = {}
- # for _c in columns:
- # df_data[_c] = []
- # for _dict in list_dict:
- # for _c in columns:
- # df_data[_c].append(_dict.get(_c,""))
- # df = pd.DataFrame(df_data)
- # df.to_excel("%s.product.xlsx"%(filename),columns=columns)
- def export_extract_check():
- '''
- :return: export the extraction-check results and generate a report
- '''
- conn = getConnection_testmysql()
- cursor = conn.cursor()
- sql = " select docid,json_result from exportdb.extract_check "
- cursor.execute(sql)
- dict_global = {}
- df_global = {"key_type":[],"online_count":[],"test_count":[],"diff_count":[],"diff_percent":[]}
- df_document = {"docid":[]}
- while True:
- rows = cursor.fetchmany(10000)
- if not rows:
- break
- for docid,json_result in rows:
- df_document["docid"].append(docid)
- _result = json.loads(json_result)
- for k,v in _result.items():
- key = k.split("_")
- _key = "_".join(key[:-1])
- if "punish" in _key or "complainants" in _key or "institutions" in _key:
- continue
- if k not in df_document:
- df_document[k] = []
- df_document[k].append(v)
- key_type = key[-1]
- if _key not in dict_global:
- dict_global[_key] = {}
- if key_type not in dict_global[_key]:
- dict_global[_key][key_type] = 0
- if key_type=="diff":
- dict_global[_key][key_type] += v
- if key_type in ("online","test"):
- if isinstance(v,str):
- if v!="":
- dict_global[_key][key_type] += 1
- elif isinstance(v,list):
- dict_global[_key][key_type] += len(v)
- for k,v in dict_global.items():
- df_global["key_type"].append(k)
- df_global["online_count"].append(v.get("online",0))
- df_global["test_count"].append(v.get("test",0))
- df_global["diff_count"].append(v.get("diff",0))
- df_global["diff_percent"].append(v.get("diff",0)/v.get("online",0) if v.get("online",0)>0 else 0)
- filename = "../data/%s_extract_check.xlsx"%(time.strftime("%Y-%m-%d"))
- with pd.ExcelWriter(filename) as writer:
- df1 = pd.DataFrame(df_global)
- df1.to_excel(writer,sheet_name="global")
- for k,v in df_document.items():
- print(k,len(v))
- df2 = pd.DataFrame(df_document)
- df2.to_excel(writer,sheet_name="document")
- # the ExcelWriter context manager saves and closes the file on exit,
- # so explicit save()/close() calls are unnecessary here
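- # export_extract_check assumes json_result keys of the form "<field>_online",
- # "<field>_test" and "<field>_diff"; a made-up example of one row's payload:
- # {"doctitle_online":"标题A","doctitle_test":"标题A","doctitle_diff":0,
- #  "product_online":["水泥"],"product_test":["水泥","砂石"],"product_diff":1}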
- def exportDocument_dump():
- # filename = "../data/重复公告.xlsx"
- # df = pd.read_excel(filename)
- ots_client = getConnect_ots()
- columns = ["docid","docchannel","page_time","web_source_no","doctitle","tenderee","agency","project_code","project_name","sub_docs_json"]
- df_keys = ["docid","docchannel","page_time","web_source_no","doctitle","doctitle_refine","tenderee","agency","project_code","project_name","bidding_budget","win_bid_price","win_tenderer","URL"]
- df_data = {}
- for _key in df_keys:
- df_data[_key] = []
- bool_query = BoolQuery(must_queries=[TermQuery("page_time","2021-03-03"),
- RangeQuery("status",201,300,True,True)])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- def getData(df_data,rows):
- list_data = getRow_ots(rows)
- for row in list_data:
- dict_find = {}
- for _key in df_keys:
- dict_find[_key] = 0
- for _k,_v in row.items():
- if _k in df_keys:
- dict_find[_k] = 1
- if _k=="project_code":
- _v = '"%s"'%_v
- df_data[_k].append(_v)
- doctitle = row.get("doctitle","")
- df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '', doctitle))
- df_data["URL"].append("http://www.bidizhaobiao.com/info-%d.html"%(row["docid"]))
- dict_find["URL"] = 1
- dict_find["doctitle_refine"] = 1
- sub_docs_json = row.get("sub_docs_json","[{}]")
- doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
- if sub_docs_json is not None:
- for sub_docs in json.loads(sub_docs_json):
- for _key_sub_docs in sub_docs.keys():
- if _key_sub_docs in doc_columns:
- if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
- if _key_sub_docs in ["bidding_budget","win_bid_price"]:
- if float(sub_docs[_key_sub_docs])>0:
- doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
- else:
- doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
- for _k,_v in doc_columns.items():
- dict_find[_k] = 1
- df_data[_k].append(_v)
- for _k,_v in dict_find.items():
- if _v==0:
- df_data[_k].append("")
- _count = len(rows)
- getData(df_data,rows)
- while next_token:
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- getData(df_data,rows)
- _count += len(rows)
- # for docid in df["docid"]:
- # bool_query = BoolQuery(must_queries=[TermQuery("docid",int(docid))])
- #
- # rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- # SearchQuery(bool_query , limit=100, get_total_count=True),
- # ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- # list_data = getRow_ots(rows)
- # if len(list_data)>0:
- # dict_find = {}
- # for _key in df_keys:
- # dict_find[_key] = 0
- # for _k,_v in list_data[0].items():
- # if _k in df_keys:
- # dict_find[_k] = 1
- # df_data[_k].append(_v)
- # doctitle = list_data[0].get("doctitle","")
- # df_data["doctitle_refine"].append(re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价', '', doctitle))
- # dict_find["doctitle_refine"] = 1
- # sub_docs_json = list_data[0].get("sub_docs_json","[{}]")
- # doc_columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
- # if sub_docs_json is not None:
- # for sub_docs in json.loads(sub_docs_json):
- # for _key_sub_docs in sub_docs.keys():
- # if _key_sub_docs in doc_columns:
- # if doc_columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
- # if _key_sub_docs in ["bidding_budget","win_bid_price"]:
- # if float(sub_docs[_key_sub_docs])>0:
- # doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
- # else:
- # doc_columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
- # for _k,_v in doc_columns.items():
- # dict_find[_k] = 1
- # df_data[_k].append(_v)
- # for _k,_v in dict_find.items():
- # if _v==0:
- # df_data[_k].append("")
- df1 = pd.DataFrame(df_data)
- df1.to_csv("../data/0303去重.csv",columns=df_keys)
- def exportDocument_dump_mysql():
- conn = getConnection_testmysql()
- cursor = conn.cursor()
- columns = ["project_code","doctitle","doctitle_refine","tenderee","agency","project_name","win_bid_price","bidding_budget","page_time","docchannel","web_source_no","win_tenderer","group_id","docid"]
- df_data = {}
- for _c in columns:
- df_data[_c] = []
- sql = " select "+",".join(columns)+" from run_dumplicate_document_his where group_id in (select group_id from run_dumplicate_document_his group by group_id having count(1)>1)"
- cursor.execute(sql)
- while True:
- rows = cursor.fetchmany(100000)
- if not rows:
- break
- for row in rows:
- for _i in range(len(columns)):
- df_data[columns[_i]].append(row[_i])
- df = pd.DataFrame(df_data)
- df.to_csv("../data/0304去重.csv",columns=["group_id","docid","project_code","doctitle","doctitle_refine","tenderee","agency","project_name","win_bid_price","bidding_budget","page_time","docchannel","web_source_no","win_tenderer"])
- print(cursor.description)
- def getDict_docchannel():
- filename = "docchannel.pk"
- if os.path.exists(filename):
- _dict = load(filename)
- return _dict
- conn = getConnection_mysql()
- cursor = conn.cursor()
- sql = "select channel_id,chnlname from sys_channel "
- cursor.execute(sql)
- rows = cursor.fetchall()
- _dict = dict()
- for row in rows:
- _dict[row[0]] = row[1]
- save(_dict,filename)
- return _dict
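- # getDict_docchannel caches the channel mapping in a local pickle so the mysql
- # query only runs once. The same pattern as a reusable decorator, a sketch
- # relying on the save/load helpers used above (pickled is a hypothetical name):
- def pickled(filename):
-     def _wrap(func):
-         def _inner(*args, **kwargs):
-             if os.path.exists(filename):
-                 return load(filename)
-             result = func(*args, **kwargs)
-             save(result, filename)
-             return result
-         return _inner
-     return _wrap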
- def exportDocument_by_doctitle():
- def timeAdd_minute(_time,minutes):
- a = time.mktime(time.strptime(_time,'%Y-%m-%d'))+60*minutes
- _time1 = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(a))
- return _time1
- columns = ["docid","attachmenttextcon","doctitle","docchannel","bidway","province","city","district","info_type","page_time","crtime","project_code","tenderee","project_name","agency","sub_docs_json","tenderee_contact","tenderee_phone","doctextcon","product","moneysource","time_bidclose","time_bidopen"]
- columns = ["doctitle","attachmenttextcon","doctextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no"]
- columns = ["doctitle","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no"]
- dict_channel = getDict_docchannel()
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- list_keyword = []
- # for _keyword in re.split("\s|/|、",str_keyword):
- # if len(_keyword.strip())==0 and isinstance(_keyword,(str)):
- # continue
- # print(_keyword)
- # item = {"keyword":_keyword.strip()}
- # list_keyword.append(item)
- # task_queue.put(item)
- filename = "有效调用次数统计_20220830_v1.xlsx"
- df = pd.read_excel(filename)
- for company in df["enterpriseName"]:
- _dict = {"keyword":company}
- task_queue.put(_dict)
- list_keyword.append(_dict)
- start_day = "2019-01-01"
- count_days = 90
- # for _i in range(count_days):
- #
- # current_date = timeAdd(start_day,_i)
- # for _j in range(24*6):
- # start_minute = timeAdd_minute(current_date,10*_j)
- # end_minute = timeAdd_minute(current_date,10*(_j+1))
- #
- # item = {"start_minute":start_minute,"end_minute":end_minute}
- # list_keyword.append(item)
- # task_queue.put(item)
- ots_client = getConnect_ots()
- def _handle(item,result_queue,ots_client):
- # should_q_keyword = BoolQuery(should_queries=[
- # # MatchPhraseQuery("tenderee",item["keyword"]),
- # # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item["keyword"])),
- # # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.second_tenderer",item["keyword"])),
- # # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.third_tenderer",item["keyword"]))
- # MatchPhraseQuery("doctextcon",item["keyword"]),
- # MatchPhraseQuery("doctitle",item["keyword"]),
- # MatchPhraseQuery("attachmenttextcon",item["keyword"])
- # ])
- #
- # should_q2 = BoolQuery(should_queries=[WildcardQuery('province', '%s*'%"广东")
- # # ,WildcardQuery('province', '%s*'%"湖南")
- # # ,WildcardQuery('province', '%s*'%"广西")
- # ])
- #
- # should_q_tenderee = BoolQuery(should_queries=[WildcardQuery("tenderee","*中学*"),
- # WildcardQuery("tenderee","*大学*"),
- # WildcardQuery("tenderee","*小学*"),
- # WildcardQuery("tenderee","*教育局*")])
- bool_query = BoolQuery(must_queries=[
- # RangeQuery("page_time","2019-01-01","2023-01-01"),
- # TermQuery("page_time","2022-02-18"),
- generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],[item["keyword"]],MatchPhraseQuery),
- # RangeQuery("crtime",item["start_minute"],item["end_minute"])
- RangeQuery("status",151,300,True,True),
- # TermQuery("tenderee",item["keyword"])
- # ,TermQuery("docchannel",52)
- # ,should_q_keyword
- # ,should_q_tenderee
- # ,should_q2
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("crtime",SortOrder.DESC)]), limit=1, get_total_count=True),
- ColumnsToGet(return_type=ColumnReturnType.NONE))
- item["total_count"] = total_count
- list_data = getRow_ots(rows)
- for _data in list_data:
- _data["keyword"] = item["keyword"]
- result_queue.put(_data)
- _count = len(list_data)
- while next_token:
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- _count += len(list_data)
- for _data in list_data:
- _data["keyword"] = item["keyword"]
- result_queue.put(_data)
- mt = MultiThreadHandler(task_queue,_handle,result_queue,40,ots_client=ots_client)
- mt.run()
- set_docid = set()
- list_item = []
- # for item in list_keyword:
- # print(item["keyword"])
- total_count = 0
- for item in list_keyword:
- total_count += item["total_count"]
- print(item["total_count"])
- print("total_count:%d"%(total_count))
- keys = list_keyword[0].keys()
- df_data = {}
- for item in list_keyword:
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- dumplicate = False
- try:
- while True:
- _dict = result_queue.get(False)
- _docid = _dict.get("docid")
- if _docid in set_docid and not dumplicate:
- continue
- set_docid.add(_docid)
- list_item.append(_dict)
- except queue.Empty:
- pass  # result queue drained
- list_docid = list(set_docid)
- with open("list_docid.txt","w",encoding="utf8") as f:
- for _docid in list_docid:
- f.write(str(_docid))
- f.write("\n")
- f.flush()
- # log("get document taotal_count:%d"%len(list_item))
- # set_line = set()
- # getRowData(df_data,list_item,set_line,list_keyword,dict_channel,dumplicate)
- # set_enterprise = set()
- # for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
- # set_enterprise.add(_tenderee)
- # set_enterprise.add(_agency)
- # set_enterprise.add(_win_tenderer)
- # if "" in set_enterprise:
- # set_enterprise.remove("")
- # if None in set_enterprise:
- # set_enterprise.remove(None)
- # dict_enterprise = getDictEnterprise(list(set_enterprise))
- # if len(set_enterprise)>0:
- # for _i in range(len(df_data["招标单位"])):
- # # _enterprise_name = df_data["招标单位"][_i]
- # # if df_data["招标联系人电话"][_i]=="":
- # # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- # # if contacts is not None:
- # # _person,_phone = getOneContact(contacts)
- # # df_data["招标联系人"][_i] = _person
- # # df_data["招标联系人电话"][_i] = _phone
- #
- # _enterprise_name = df_data["代理单位"][_i]
- # if df_data["代理联系人电话"][_i]=="":
- # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- # if contacts is not None:
- # _person,_phone = getOneContact(contacts)
- # df_data["代理联系人"][_i] = _person
- # df_data["代理联系人电话"][_i] = _phone
- #
- # _enterprise_name = df_data["中标单位"][_i]
- # if df_data["中标单位联系电话"][_i]=="":
- # contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- # if contacts is not None:
- # _person,_phone = getOneContact(contacts)
- # df_data["中标单位联系人"][_i] = _person
- # df_data["中标单位联系电话"][_i] = _phone
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s_export11.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")))
- set_columns = set()
- list_df_columns = []
- def set_dict_item(_dict,name,v):
- _dict[name] = getLegal_str(v)
- if name not in set_columns:
- set_columns.add(name)
- list_df_columns.append(getLegal_str(name))
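- # set_dict_item doubles as a column-order recorder: the first time a name is
- # seen it is appended to list_df_columns, so the export column order follows
- # insertion order. A small sketch of the effect (values are made up):
- # item = {}
- # set_dict_item(item, "docid", 1)
- # set_dict_item(item, "公告标题", "示例")
- # list_df_columns  ->  ["docid", "公告标题"]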
- def set_dict_item_columns(set_columns1,list_df_columns1,_dict,name,v):
- _dict[name] = getLegal_str(v)
- if name not in set_columns1:
- set_columns1.add(name)
- list_df_columns1.append(name)
- def exportDocument_medicine(start_time,end_time):
- # filename = "../data/重复公告.xlsx"
- # df = pd.read_excel(filename)
- ots_client = getConnect_ots()
- # columns = ["doctitle","docchannel","time_bidopen","province","city","district","page_time","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone"]
- columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen"]
- set_enter = set()
- str_enter = '''
- 北京嘉和美康信息技术有限公司
- 阿里健康科技(中国)有限公司
- 北大医疗信息技术有限公司
- 创业慧康科技股份有限公司
- 东华医为科技有限公司
- 望海康信(北京)科技股份公司
- 国新健康保障服务有限公司
- 南京海泰医疗信息系统有限公司
- 南京海泰信息技术有限公司
- 浙江和仁科技股份有限公司
- 北京惠每科技有限公司
- 金蝶医疗软件科技有限公司
- 北京京东健康有限公司
- 四川久远银海软件股份有限公司
- 零氪科技(北京)有限公司
- 北京麦迪克斯科技有限公司
- 苏州麦迪斯顿医疗科技股份有限公司
- 江苏曼荼罗软件股份有限公司
- 北京平安联想智慧医疗信息技术有限公司
- 青岛百洋智能科技股份有限公司
- 上海森亿医疗科技有限公司
- 万达信息股份有限公司
- 微医集团(浙江)有限公司
- 卫宁健康科技集团股份有限公司
- 心医国际数字医疗系统(大连)有限公司
- 医渡云(北京)技术有限公司
- 医惠科技有限公司
- 易联众信息技术股份有限公司
- 智业软件股份有限公司
- 中电数据服务有限公司
- 重庆中联信息产业有限责任公司
- 杭州卓健信息科技股份有限公司
- 大连万达集团股份有限公司
- '''
- for a in re.split("\s+",str_enter):
- if a.strip()!="":
- set_enter.add(a.strip())
- dict_channel = getDict_docchannel()
- # list_province = ["江西","湖南","四川","安徽"]
- list_province = ["全国"]
- for _province in list_province:
- df_data = {}
- str_p = '''
- 智慧医疗系统 医院信息系统 临床路径 医院系统 医院管理软件
- 县域医共体 远程医疗 医院管理系统 医疗信息化 临床医疗
- 数据集成 云医院 智慧卫生 卫生信息系统 医疗数字化
- 临床应用
- '''
- list_prov = re.split("\s",str_p)
- list_mu = []
- for _p in list_prov:
- if _p.strip()=="":
- continue
- print(_p)
- list_mu.append(MatchPhraseQuery('doctextcon', '%s'%_p.strip()))
- s_tenderee = '医院、卫生院、疗养院、健康局、卫生局'
- list_should_ten = []
- for _p in re.split("、",s_tenderee):
- if _p.split()=="":
- continue
- list_should_ten.append(WildcardQuery("tenderee","*%s*"%_p.strip()))
- list_should_win = []
- for _win in list(set_enter):
- list_should_win.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_win)))
- list_should_chan = []
- list_should_chan.append(TermQuery("docchannel",52))
- list_should_chan.append(TermQuery("docchannel",101))
- list_should_chan.append(TermQuery("docchannel",102))
- should_q1 = BoolQuery(should_queries=list_mu)
- should_q2 = BoolQuery(should_queries=list_should_ten)
- should_q3 = BoolQuery(should_queries=list_should_chan)
- bool_query = BoolQuery(must_queries=[
- BoolQuery(should_queries=list_should_win),
- RangeQuery("page_time",start_time,end_time,include_lower=True,include_upper=True),
- RangeQuery("status",151,300,True,True),
- # should_q1,
- # should_q2,
- # should_q3,
- ],
- # must_not_queries=[
- # MatchPhraseQuery("doctextcon","器械"),
- # MatchPhraseQuery("doctextcon","仪器"),
- # ]
- )
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- set_line = set()
- _count = len(rows)
- # getData(df_data,rows,set_line)
- list_row = getRow_ots(rows)
- getRowData(df_data,list_row,set_line,[],dict_channel,False)
- while next_token:
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- list_row = getRow_ots(rows)
- getRowData(df_data,list_row,set_line,[],dict_channel,False)
- _count += len(rows)
- if len(list(df_data.keys()))>0:
- if len(df_data[list(df_data.keys())[0]])>=300:
- break
- set_enterprise = set()
- for _tenderee,_agency,_win_tenderer in zip(df_data["招标单位"],df_data["代理单位"],df_data["中标单位"]):
- set_enterprise.add(_tenderee)
- set_enterprise.add(_agency)
- set_enterprise.add(_win_tenderer)
- if "" in set_enterprise:
- set_enterprise.remove("")
- if None in set_enterprise:
- set_enterprise.remove(None)
- dict_enterprise = getDictEnterprise(list(set_enterprise))
- if len(set_enterprise)>0:
- for _i in range(len(df_data["招标单位"])):
- _enterprise_name = df_data["招标单位"][_i]
- if df_data["招标联系人电话"][_i]=="":
- contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- if contacts is not None:
- _person,_phone = getOneContact(contacts)
- df_data["招标联系人"][_i] = _person
- df_data["招标联系人电话"][_i] = _phone
- _enterprise_name = df_data["代理单位"][_i]
- if df_data["代理联系人电话"][_i]=="":
- contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- if contacts is not None:
- _person,_phone = getOneContact(contacts)
- df_data["代理联系人"][_i] = _person
- df_data["代理联系人电话"][_i] = _phone
- _enterprise_name = df_data["中标单位"][_i]
- if df_data["中标单位联系电话"][_i]=="":
- contacts = dict_enterprise.get(_enterprise_name,{}).get("contacts")
- if contacts is not None:
- _person,_phone = getOneContact(contacts)
- df_data["中标单位联系人"][_i] = _person
- df_data["中标单位联系电话"][_i] = _phone
- return df_data  # NOTE: returns on the first province, so the excel export below never runs
- df1 = pd.DataFrame(df_data)
- print(len(df_data["docid"]))
- df1.to_excel("../data/%s_周五医疗数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
- def getRowDataWithKey(df_data,rows,columns):
- global list_df_columns
- list_df_columns = columns
- for row in rows:
- for c in columns:
- if c not in df_data:
- df_data[c] = []
- df_data[c].append(row.get(c))
- def getRowData(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
- dict_line = {}
- # list_data = getRow_ots(rows)
- _index = 0
- for row in rows:
- _index += 1
- item = {}
- _dict = row
- set_dict_item(item,"docid",_dict.get("docid",""))
- # set_dict_item(item,"attachment_extract_status",_dict.get("attachment_extract_status",""))
- set_dict_item(item,"crtime",_dict.get("crtime",""))
- # set_dict_item(item,"要素数",_dict.get("extract_count",0))
- set_dict_item(item,"公告标题",_dict.get("doctitle",""))
- set_dict_item(item,"web_source_no",_dict.get("web_source_no",""))
- set_dict_item(item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
- # set_dict_item(item,"web_source_name",_dict.get("web_source_name",""))
- # set_dict_item(item,"原公告类别",dict_channel.get(_dict.get("original_docchannel",""),""))
- # set_dict_item(item,"detail_link",_dict.get("detail_link",""))
- set_dict_item(item,"公告内容",getLegal_str(_dict.get("doctextcon","")))
- set_dict_item(item,"附件内容",getLegal_str(_dict.get("attachmenttextcon","")))
- if "keyword" in _dict:
- set_dict_item(item,"关键词",_dict["keyword"])
- else:
- _wholeword = re.sub("\s","",str(row.get("doctitle","")+row.get("doctextcon","")[:30000]+row.get("attachmenttextcon","")[:30000]).replace("(","(").replace(")",")"))
- _pattern = "|".join([re.escape(str(a).replace("(","(").replace(")",")")) for a in list_keyword])
- set_dict_item(item,"关键词",",".join(list(set(re.findall(_pattern,_wholeword)))))
- # set_dict_item(item,"关键词",_dict.get("keyword",""))
- set_dict_item(item,"产品",_dict.get("product",""))
- set_dict_item(item,"服务期限",_dict.get("service_time",""))
- set_dict_item(item,"省份",_dict.get("province",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item(item,"城市",_dict.get("city",""))
- set_dict_item(item,"区县",_dict.get("district",""))
- set_dict_item(item,"发布时间",_dict.get("page_time",""))
- set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
- set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
- # set_dict_item(item,"创建时间",_dict.get("crtime",""))
- set_dict_item(item,"招标方式",_dict.get("bidway",""))
- set_dict_item(item,"行业一级分类",_dict.get("industry",""))
- set_dict_item(item,"行业二级分类",_dict.get("info_type",""))
- set_dict_item(item,"uuid",_dict.get("uuid"))
- # set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
- set_dict_item(item,"项目编号",_dict.get("project_code",""))
- set_dict_item(item,"项目名称",_dict.get("project_name",""))
- set_dict_item(item,"招标单位",_dict.get("tenderee",""))
- set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
- set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
- set_dict_item(item,"代理单位",_dict.get("agency",""))
- set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
- set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
- set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
- set_dict_item(item,"评审专家",_dict.get("person_review",""))
- set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
- set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
- set_dict_item(item,"获取文件开始时间",_dict.get("time_get_file_start",""))
- set_dict_item(item,"获取文件结束时间",_dict.get("time_get_file_end",""))
- set_dict_item(item,"保证金递交开始时间",_dict.get("time_earnest_money_start",""))
- set_dict_item(item,"保证金递交结束时间",_dict.get("time_earnest_money_end",""))
- sub_docs_json = _dict.get("sub_docs_json")
- set_tenderer = set()
- if sub_docs_json is not None:
- docs = json.loads(sub_docs_json)
- docs.sort(key=lambda x:float(x.get("win_bid_price") or 0))  # guard against "" values
- for _doc in docs:
- if "win_tenderer" in _doc:
- set_dict_item(item,"中标单位",_doc["win_tenderer"])
- if "second_tenderer" in _doc:
- set_dict_item(item,"第二候选单位",_doc["second_tenderer"])
- set_tenderer.add(_doc.get("second_tenderer"))
- if "third_tenderer" in _doc:
- set_dict_item(item,"第三候选单位",_doc["third_tenderer"])
- set_tenderer.add(_doc.get("third_tenderer"))
- if "win_tenderee_manager" in _doc:
- set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
- if "win_tenderee_phone" in _doc:
- set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
- if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
- set_dict_item(item,"中标金额",_doc["win_bid_price"])
- if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
- set_dict_item(item,"招标金额",_doc["bidding_budget"])
- set_dict_item(item,"入围供应商",",".join(list(set_tenderer)))
- if "第二候选单位" not in item:
- set_dict_item(item,"第二候选单位","")
- if "第三候选单位" not in item:
- set_dict_item(item,"第三候选单位","")
- if "招标金额" not in item:
- set_dict_item(item,"招标金额","")
- if "中标金额" not in item:
- set_dict_item(item,"中标金额","")
- if "中标单位" not in item:
- set_dict_item(item,"中标单位","")
- if "中标单位联系人" not in item:
- set_dict_item(item,"中标单位联系人","")
- if "中标单位联系电话" not in item:
- set_dict_item(item,"中标单位联系电话","")
- # if item["中标单位"] not in set_enter:
- # continue
- if not dumplicate:
- if item["项目编号"]!="":
- _line = "%s-%s-%s-%s-%s-%s"%(item["公告类别"],item["项目编号"],item["招标单位"],item["中标单位"],str(item["招标金额"]),str(item["中标金额"]))
- if _line in dict_line:
- dict_line[_line].append(item)
- continue
- dict_line[_line] = [item]
- _line2 = "%s-%s-%s-%s-%s-%s"%(item["公告标题"],item["公告类别"],item["招标单位"],str(item["招标金额"]),item["中标单位"],str(item["中标金额"]))
- if _line2 in dict_line:
- dict_line[_line2].append(item)
- continue
- dict_line[_line2] = [item]
- # if re.search("[大中小]学|幼儿园|医院|公司",item["招标单位"]) is not None:
- # continue
- # if _dict.get("docid","") in set_ig_docid:
- # continue
- # if item["招标金额"]=="":
- # continue
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- if not dumplicate:
- dict_dump = {}
- columns = ["group_id"]
- columns.extend(list_df_columns)
- for k in columns:
- dict_dump[k] = []
- group_id = 1
- for k,v in dict_line.items():
- if len(v)==1:
- continue
- for item in v:
- dict_dump["group_id"].append(group_id)
- for k in list_df_columns:
- dict_dump[k].append(item.get(k))
- group_id += 1
- df_dump = pd.DataFrame(dict_dump)
- df_dump.to_excel("%s/../data/dumplicate/%s_重复数据.xlsx"%(os.path.dirname(__file__),getCurrent_date("%Y-%m-%d_%H%M%S")))
- def getRowData_sp1(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
- dict_line = {}
- # list_data = getRow_ots(rows)
- _index = 0
- set_id = set()
- for row in rows:
- _index += 1
- item = {}
- _dict = row
- _id = _dict.get("id")
- if _id is not None and _id in set_id:
- continue
- set_id.add(_id)
- set_dict_item(item,"id",_dict.get("id",""))
- # set_dict_item(item,"attachment_extract_status",_dict.get("attachment_extract_status",""))
- # set_dict_item(item,"crtime",_dict.get("crtime",""))
- set_dict_item(item,"detaillink",_dict.get("detaillink",""))
- # set_dict_item(item,"web_source_no",_dict.get("web_source_no",""))
- set_dict_item(item,"公告类别",dict_sptype.get(str(_dict.get("sp_type","")),""))
- set_dict_item(item,"page_time",getLegal_str(_dict.get("page_time","")))
- # set_dict_item(item,"附件内容",getLegal_str(_dict.get("attachmenttextcon","")))
- set_dict_item(item,"page_title",_dict.get("page_title",""))
- set_dict_item(item,"record_id",_dict.get("record_id",""))
- set_dict_item(item,"web_source_name",_dict.get("web_source_name",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item(item,"web_source_no",_dict.get("web_source_no",""))
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- def getRowData_sp(df_data,rows,set_line,list_keyword,dict_channel,dumplicate):
- dict_line = {}
- # list_data = getRow_ots(rows)
- _index = 0
- set_id = set()
- for row in rows:
- _index += 1
- item = {}
- _dict = row
- set_dict_item(item,"docid",_dict.get("docid",""))
- # set_dict_item(item,"attachment_extract_status",_dict.get("attachment_extract_status",""))
- # set_dict_item(item,"crtime",_dict.get("crtime",""))
- set_dict_item(item,"公告标题",_dict.get("page_title",""))
- # set_dict_item(item,"web_source_no",_dict.get("web_source_no",""))
- set_dict_item(item,"公告类别",dict_sptype.get(str(_dict.get("sp_type","")),""))
- set_dict_item(item,"公告内容",getLegal_str(_dict.get("page_content","")))
- # set_dict_item(item,"附件内容",getLegal_str(_dict.get("attachmenttextcon","")))
- if "keyword" in _dict:
- set_dict_item(item,"关键词",_dict["keyword"])
- else:
- set_dict_item(item,"关键词",",".join(list(set(re.findall("|".join([re.escape(str(a).replace("(","(").replace(")",")")) for a in list_keyword]),re.sub("\s","",str(row.get("doctitle","")+row.get("doctextcon","")[:30000]+row.get("attachmenttextcon","")[:30000]).replace("(","(").replace(")",")")))))))
- # set_dict_item(item,"关键词",_dict.get("keyword",""))
- set_dict_item(item,"产品",_dict.get("product",""))
- set_dict_item(item,"服务期限",_dict.get("service_time",""))
- set_dict_item(item,"省份",_dict.get("province",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item(item,"城市",_dict.get("city",""))
- set_dict_item(item,"区县",_dict.get("district",""))
- set_dict_item(item,"发布时间",_dict.get("page_time",""))
- set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
- set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
- # set_dict_item(item,"创建时间",_dict.get("crtime",""))
- set_dict_item(item,"招标方式",_dict.get("bidway",""))
- set_dict_item(item,"行业一级分类",_dict.get("industry",""))
- set_dict_item(item,"行业二级分类",_dict.get("info_type",""))
- set_dict_item(item,"来源",_dict.get("web_source_name",""))
- set_dict_item(item,"uuid",_dict.get("uuid"))
- # set_dict_item(item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
- set_dict_item(item,"项目编号",_dict.get("page_code",""))
- set_dict_item(item,"项目名称",_dict.get("project_name",""))
- set_dict_item(item,"招标单位",_dict.get("tenderee",""))
- set_dict_item(item,"招标联系人",_dict.get("tenderee_contact",""))
- set_dict_item(item,"招标联系人电话",_dict.get("tenderee_phone",""))
- set_dict_item(item,"代理单位",_dict.get("agency",""))
- set_dict_item(item,"代理联系人",_dict.get("agency_contact",""))
- set_dict_item(item,"代理联系人电话",_dict.get("agency_phone",""))
- # set_dict_item(item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
- set_dict_item(item,"评审专家",_dict.get("person_review",""))
- set_dict_item(item,"开标时间",_dict.get("time_bidopen",""))
- set_dict_item(item,"截标时间",_dict.get("time_bidclose",""))
- set_dict_item(item,"获取文件开始时间",_dict.get("time_get_file_start",""))
- set_dict_item(item,"获取文件结束时间",_dict.get("time_get_file_end",""))
- set_dict_item(item,"保证金递交开始时间",_dict.get("time_earnest_money_start",""))
- set_dict_item(item,"保证金递交结束时间",_dict.get("time_earnest_money_end",""))
- sub_docs_json = _dict.get("sub_docs_json")
- set_tenderer = set()
- if sub_docs_json is not None:
- docs = json.loads(sub_docs_json)
- docs.sort(key=lambda x:float(x.get("win_bid_price") or 0))  # cast to float to avoid str/int comparisons
- for _doc in docs:
- if "win_tenderer" in _doc:
- set_dict_item(item,"中标单位",_doc["win_tenderer"])
- if "second_tenderer" in _doc:
- set_dict_item(item,"第二候选单位",_doc["second_tenderer"])
- set_tenderer.add(_doc.get("second_tenderer"))
- if "third_tenderer" in _doc:
- set_dict_item(item,"第三候选单位",_doc["third_tenderer"])
- set_tenderer.add(_doc.get("third_tenderer"))
- if "win_tenderee_manager" in _doc:
- set_dict_item(item,"中标单位联系人",_doc["win_tenderee_manager"])
- if "win_tenderee_phone" in _doc:
- set_dict_item(item,"中标单位联系电话",_doc["win_tenderee_phone"])
- if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
- set_dict_item(item,"中标金额",_doc["win_bid_price"])
- if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
- set_dict_item(item,"招标金额",_doc["bidding_budget"])
- set_dict_item(item,"入围供应商",",".join(list(set_tenderer)))
- if "第二候选单位" not in item:
- set_dict_item(item,"第二候选单位","")
- if "第三候选单位" not in item:
- set_dict_item(item,"第三候选单位","")
- if "招标金额" not in item:
- set_dict_item(item,"招标金额","")
- if "中标金额" not in item:
- set_dict_item(item,"中标金额","")
- if "中标单位" not in item:
- set_dict_item(item,"中标单位","")
- if "中标单位联系人" not in item:
- set_dict_item(item,"中标单位联系人","")
- if "中标单位联系电话" not in item:
- set_dict_item(item,"中标单位联系电话","")
- # if item["中标单位"] not in set_enter:
- # continue
- if not dumplicate:
- if item["项目编号"]!="":
- _line = "%s-%s-%s-%s-%s-%s"%(item["公告类别"],item["项目编号"],item["招标单位"],item["中标单位"],str(item["招标金额"]),str(item["中标金额"]))
- if _line in dict_line:
- dict_line[_line].append(item)
- continue
- dict_line[_line] = [item]
- _line2 = "%s-%s-%s-%s-%s-%s"%(item["公告标题"],item["公告类别"],item["招标单位"],str(item["招标金额"]),item["中标单位"],str(item["中标金额"]))
- if _line2 in dict_line:
- dict_line[_line2].append(item)
- continue
- dict_line[_line2] = [item]
- # if re.search("[大中小]学|幼儿园|医院|公司",item["招标单位"]) is not None:
- # continue
- # if _dict.get("docid","") in set_ig_docid:
- # continue
- # if item["招标金额"]=="":
- # continue
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- if not dumplicate:
- dict_dump = {}
- columns = ["group_id"]
- columns.extend(list_df_columns)
- for k in columns:
- dict_dump[k] = []
- group_id = 1
- for k,v in dict_line.items():
- if len(v)==1:
- continue
- for item in v:
- dict_dump["group_id"].append(group_id)
- for k in list_df_columns:
- dict_dump[k].append(item.get(k))
- group_id += 1
- df_dump = pd.DataFrame(dict_dump)
- df_dump.to_excel("%s/../data/dumplicate/%s_重复数据.xlsx"%(os.path.dirname(__file__),getCurrent_date("%Y-%m-%d_%H%M%S")))
- def filterRow(list_row,column,list_not_keywords):
- list_result = []
- for row in list_row:
- _product = row.get(column,"")
- sub_docs_json = row.get("sub_docs_json","")
- doctitle = row.get("doctitle","")
- if re.search("设计",sub_docs_json) is not None:
- if re.search("装修",str(doctitle)+str(sub_docs_json)) is None:
- list_result.append(row)
- else:
- print("===",_product)
- # if re.search("|".join([re.escape(i) for i in list_not_keywrods]),_product) is not None:
- # continue
- # list_result.append(row)
- # if row.get("关键词",1)==row.get("招标单位",2) or row.get("关键词",2)==row.get("中标单位",3):
- # list_result.append(row)
- # doctitle = row.get("doctitle")
- # doctextcon = row.get("doctextcon")
- # if len(re.sub('\s','',doctextcon))==len(doctitle)+4:
- # list_result.append(row)
- # tenderee_phone = row.get("tenderee_phone","")
- # if len(tenderee_phone)==11:
- # list_result.append(row)
- return list_result
- dict_sptype = {"2":"审批信息",
- "4":"审批结果",
- "8":"核准公示",
- "16":"核准结果",
- "32":"备案公示",
- "64":"备案结果",
- "128":"推介项目",
- "256":"推介结果",
- "512":"项目撤销",
- "1024":"筹备阶段"}
- def getKeywordByFile():
- filename = "审批标题对比检查结果(20220831).xlsx"
- df = pd.read_excel(filename,sheet_name=1)
- list_data = []
- for _title,_no,_type in zip(df["标题"],df["编号"],df["检查结果"]):
- if _type not in ["接口错"]:
- continue
- _dict = {"title":_title,
- "web_source":_no}
- list_data.append(_dict)
- return list_data
- def exportDocument_by_pagetime():
- ots_client = getConnect_ots()
- # columns = ["doctitle","docchannel","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"]
- # columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"]
- # columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_name"]
- columns = ["doctitle","docchannel","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"]
- # columns = ["docchannel","docid","project_name","product","doctitle","page_time","province","city","time_get_file_end","time_bidclose","project_code","sub_docs_json","tenderee","info_type","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","detail_link","bidway","crtime","extract_count","products"]
- # columns = ["page_time","doctitle","crtime","web_source_no","web_source_name","detail_link","original_docchannel","uuid"]
- dict_channel = getDict_docchannel()
- # columns = ["doctitle","dochtmlcon","page_time","web_source_no","web_source_name","sub_docs_json"]
- # columns = ["tenderee","tenderee_contact","tenderee_phone"]
- list_query = []
- str_keyword = '''
- 重大安全风险防控 大范围速扫设备 气体泄露监测
- '''
- list_keyword = splitIntoList(str_keyword,"[\s\n、,,|]")
- str_con_keyword = '''
- 信息化、弱电、pis、通讯
- '''
- con_keyword = splitIntoList(str_con_keyword,'[\s\n、,,|]')
- print(con_keyword)
- should_q_win = []
- for _keyword in list_keyword:
- should_q_win.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_keyword)))
- q_win = BoolQuery(should_queries=should_q_win)
- str_not_keyword = '''
-
- '''
- list_not_key = splitIntoList(str_not_keyword,"[\s\n、,,]")
- tenderee_keywrod = "医院、大学、高校、高中"
- list_t_key = splitIntoList(tenderee_keywrod,"[\s\n、,,]")
- should_q_system = [TermQuery("procurement_system","企业采购系统"),
- TermQuery("procurement_system","部队采购系统")]
- q_system = BoolQuery(should_queries=should_q_system)
- log(str(list_keyword))
- s_province = "北京,天津,深圳,上海,浙江,江苏,安徽"
- list_province = splitIntoList(s_province,"[,,\s]")
- st = "环境监测中心、环境监测总站、环保局、水务局、水利局"
- list_tenderee = splitIntoList(st,"、|\s")
- # list_title = getKeywordByFile()
- #
- # for _d in list_title:
- # _title = _d["title"]
- # web_source = _d["web_source"]
- # bool_query = BoolQuery(must_queries=[
- # generateBoolShouldQuery(["page_title"],[_title],MatchPhraseQuery),
- # TermQuery("web_source_no",web_source)
- # ])
- # list_query.append({"query":bool_query})
- for _keyword in list_keyword:
- bool_query = BoolQuery(must_queries=[
- # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_keyword)),
- # generateBoolShouldQuery(["doctitle"],list_keyword,MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle"],["实景三维"],MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle"],["电子站牌","智慧公交站"],MatchPhraseQuery),
- # ExistsQuery("tenderee"),
- # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["食堂","饭堂"],MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["学校","大学","中学","小学"],MatchPhraseQuery),
- # generateBoolShouldQuery(["web_source_name"],list_keyword,TermQuery),
- generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],list_keyword,MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],con_keyword,MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["上海泛微网络科技股份有限公司"],MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["广州政通信息科技有限公司"],MatchPhraseQuery),
- # BoolQuery(should_queries=[
- # generateBoolShouldQuery(["doctitle"],["公告","公示","招标","中标","采购","工程","项目","询价","施工","比价","服务","监理","设计"],MatchPhraseQuery),
- # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["项目名称","招标人","中标人","项目编号","采购组织机构","采购人","招标范围","投标保证金","报价地址","询价通知书"],MatchPhraseQuery)
- # ]),
- # generateBoolShouldQuery(["doctitle"],["中标"],MatchPhraseQuery),
- # generateBoolShouldQuery(["docid"],list_keyword,TermQuery),
- # q_win,
- # should_q,
- # generateBoolShouldQuery(["tenderee"],[company],TermQuery),
- # generateBoolShouldQuery(["doctitle"],["实验室"],MatchPhraseQuery),
- # generateBoolShouldQuery(["docchannel"],[52,51,102,103,104],TermQuery),
- # generateBoolShouldQuery(["docchannel"],[101,118,119,120,121,122],TermQuery),
- generateBoolShouldQuery(["docchannel"],[101,119,120,121,122],TermQuery),
- # generateBoolShouldQuery(["docchannel"],[51,52,101,102,103,104,105,114,118,119,120],TermQuery),
- # generateBoolShouldQuery(["docchannel"],[102,52,101,114,119,120],TermQuery),
- # RangeQuery("page_time","2020-12-05","2023-12-12",True,False),
- # NestedQuery("page_attachments",WildcardQuery("page_attachments.fileMd5","*"))
- # WildcardQuery("web_source_no","00219*"),
- # generateBoolShouldQuery(["district"],["南沙","中山","江门","珠海"],TermQuery),
- # TermQuery("info_type","物业管理"),
- RangeQuery("page_time","2023-01-01","2025-01-01"),
- RangeQuery("status",201,300,True,True),
- # RangeQuery("crtime","2020-05-01 09:00:00","2022-05-31 10:00:00",True,True),
- # RangeQuery("page_time",range_from="2022-01-01",range_to="2023-01-01"),
- # BoolQuery(should_queries=[TermQuery("page_time","2022-09-15"),
- # TermQuery("page_time","2022-10-20"),
- # TermQuery("page_time","2022-10-31")])
- # TermQuery("page_time","2023-07-21"),
- # RangeQuery("page_time","2023-05-16","2024-10-14"),
- # TermQuery("docid",237163857),
- # WildcardQuery("tenderee","*雅居乐*"),
- # RangeQuery("crtime","2023-07-22 00:00:00"),
- # BoolQuery(should_queries=[NestedQuery("products",RangeQuery("products.unit_price",1)),
- # NestedQuery("products",RangeQuery("products.total_price",1)),])
- # NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_bid_price",1000)),
- # NestedQuery("page_attachments",TermQuery("page_attachments.fileMd5","92775529171409a32513f134a61d73c8")),
- # TermQuery("province","广东"),
- # TermQuery("city","无锡"),
- # generateBoolShouldQuery(["tenderee"],list_tenderee,WildcardQuery),
- # generateBoolShouldQuery(["tenderee"],["应急管理局","城市管理局","大数据局","政务服务管理局","消防局"],WildcardQuery),
- # WildcardQuery("tenderee","公安*"),
- # BoolQuery(should_queries=[WildcardQuery("tenderee","*地铁*"),
- # WildcardQuery("tenderee","*轨道交通*")])
- # BoolQuery(should_queries=[NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*乐禾*")),
- # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*彩食鲜*")),
- # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*望家欢*")),
- # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*乐味*")),
- # ])
- # TermQuery("procurement_system","公安系统"),
- # generateBoolShouldQuery(["province"],["重庆"],WildcardQuery),
- # generateBoolShouldQuery(["tenderee"],list_t_key,WildcardQuery)
- # generateBoolShouldQuery(["docchannel"],[101,118,119],TermQuery),
- ],
- # should_queries=[NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*乐禾*")),
- # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*彩食鲜*")),
- # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*望家欢*")),
- # NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*乐味*")),
- # ],
- # must_not_queries=[
- # # TermQuery("exist_table",1),
- # # WildcardQuery("tenderee","公安县*"),
- # # TermQuery("attachment_extract_status",1),
- # # generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],list_not_key,MatchPhraseQuery),
- # generateBoolShouldQuery(["province"],["湖南","广西","广东"],MatchPhraseQuery),
- # # q_system,
- # ]
- )
- list_query.append({"query":bool_query,"limit":300000})
- break  # all keywords are combined into a single should-query above, so one query suffices
- list_row = getDocument(list_query,columns,"document","document_index")
- # list_row = getDocument(list_query,columns,"t_shen_pi_xiang_mu","t_shen_pi_xiang_mu_index")
- log("get document %d rows"%len(list_row))
- # list_query = []
- #
- # for _row in list_row:
- # _uuid = uuid4().hex
- # page_attachments = json.loads(_row.get("page_attachments"))
- # l_s = []
- # for _at in page_attachments:
- # l_s.append(NestedQuery("page_attachments",TermQuery("page_attachments.fileMd5",_at.get("fileMd5"))))
- # list_query.append({"query":BoolQuery(should_queries=l_s),"limit":500,"keyword":_uuid})
- # list_row = getDocument(list_query,columns,"document","document_index")
- df_data = {}
- set_line = set()
- # list_row = filterRow(list_row,"doctitle",list_not_key)
- log("get document %d rows"%len(list_row))
- # getRowDataWithKey(df_data,list_row,columns)
- getRowData(df_data,list_row,set_line,list_keyword,dict_channel,True)
- # getRowData_sp1(df_data,list_row,set_line,list_keyword,dict_sptype,True)
- # fixContactPerson(df_data,list_df_columns,get_legal_person=False)
- df1 = pd.DataFrame(df_data)
- df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
- # keys = df_data.keys()
- # print("keys",keys)
- # dict_company = {}
- # set_dup_keys = set()
- # set_docid = set()
- # for _i in range(len(df_data[list(keys)[0]])):
- # if df_data["关键词"][_i]==df_data["招标单位"][_i] or df_data["关键词"][_i]==df_data["中标单位"][_i]:
- # company_name = df_data["关键词"][_i]
- # if company_name not in dict_company:
- # dict_company[company_name] = {"企业名称":company_name,"招标":[],"中标":[]}
- # if df_data["关键词"][_i]==df_data["招标单位"][_i]:
- # if str(df_data["招标金额"][_i])!="nan":
- # _key = "%s-%s"%(company_name,str(df_data["招标金额"][_i]))
- # if _key not in set_dup_keys:
- # set_docid.add(df_data["docid"][_i])
- # dict_company[company_name]["招标"].append({"标题":df_data["公告标题"][_i],
- # "招标方式":df_data["招标方式"][_i],
- # "招标单位":df_data["招标单位"][_i],
- # "招标金额":df_data["招标金额"][_i]})
- # set_dup_keys.add(_key)
- # if df_data["关键词"][_i]==df_data["中标单位"][_i]:
- # if str(df_data["中标金额"][_i])!="nan":
- # _key = "%s-%s"%(str(df_data["中标单位"][_i]),str(df_data["中标金额"][_i]))
- # if _key not in set_dup_keys:
- # set_docid.add(df_data["docid"][_i])
- # dict_company[company_name]["中标"].append({"标题1":df_data["公告标题"][_i],
- # "招标方式1":df_data["招标方式"][_i],
- # "中标单位1":df_data["中标单位"][_i],
- # "中标金额1":df_data["中标金额"][_i]})
- # set_dup_keys.add(_key)
- # df_keys = ["企业名称","标题","招标方式","招标单位","招标金额","标题1","招标方式1","中标单位1","中标金额1"]
- # df_da = {}
- # for k in df_keys:
- # df_da[k] = []
- # for k,v in dict_company.items():
- # list_zhaobiao = v["招标"]
- # list_zhongbiao = v["中标"]
- # _nums = max(min(len(list_zhaobiao),5),min(len(list_zhongbiao),5))
- # for i in range(_nums):
- # df_da["企业名称"].append(k)
- # if i>=len(list_zhaobiao):
- # df_da["标题"].append("")
- # df_da["招标方式"].append("")
- # df_da["招标单位"].append("")
- # df_da["招标金额"].append("")
- # else:
- # df_da["标题"].append(list_zhaobiao[i]["标题"])
- # df_da["招标方式"].append(list_zhaobiao[i]["招标方式"])
- # df_da["招标单位"].append(list_zhaobiao[i]["招标单位"])
- # df_da["招标金额"].append(list_zhaobiao[i]["招标金额"])
- #
- # if i>=len(list_zhongbiao):
- # df_da["标题1"].append("")
- # df_da["招标方式1"].append("")
- # df_da["中标单位1"].append("")
- # df_da["中标金额1"].append("")
- # else:
- # df_da["标题1"].append(list_zhongbiao[i]["标题1"])
- # df_da["招标方式1"].append(list_zhongbiao[i]["招标方式1"])
- # df_da["中标单位1"].append(list_zhongbiao[i]["中标单位1"])
- # df_da["中标金额1"].append(list_zhongbiao[i]["中标金额1"])
- # df2 = pd.DataFrame(df_da)
- # df2.to_excel("tmp333.xlsx",columns=df_keys)
- #
- # df_3 = {}
- # for k in keys:
- # df_3[k] = []
- # for _i in range(len(df_data[list(keys)[0]])):
- # docid = df_data["docid"][_i]
- # if docid in set_docid:
- # for k in keys:
- # df_3[k].append(df_data[k][_i])
- # df3 = pd.DataFrame(df_3)
- # df3.to_excel("tmp_333_mx.xlsx",columns=keys)
- # fixContactPerson(df_data,list_df_columns)
- #
- def findProjects():
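- # Backfills missing win information for the pre-matched Guangdong/Guangxi rows:
- # for each project, searches document_index by project code/name within 180 days
- # of the tender-notice date, keeps the first tenderee-matched hit that carries a
- # win_bid_price, and writes the results to 两广补充数据.xlsx.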
- df = pd.read_excel("两广地区中标时间为空标注_预匹配1.xlsx",0)
- list_items = []
- for docids,project_code,project_name,tenderee,zhao_biao_page_time in zip(df["docids"],df["project_code"],df["project_name"],df["tenderee"],df["zhao_biao_page_time"]):
- if not isinstance(project_code,(str)):
- project_code = "$$$"
- if not isinstance(project_name,(str)):
- project_name = "$$$"
- if not isinstance(tenderee,(str)):
- tenderee = ""
- _dict = {"docids":docids,
- "project_code":project_code,
- "project_name":project_name,
- "tenderee":tenderee,
- "zhao_biao_page_time":zhao_biao_page_time.strftime("%Y-%m-%d"),
- "end_time":timeAdd(zhao_biao_page_time.strftime("%Y-%m-%d"),180)}
- list_items.append(_dict)
- task_queue = queue.Queue()
- for item in list_items:
- task_queue.put(item)
- def _handle(item,result_queue,ots_client):
- docids = item.get("docids")
- list_s_n = []
- for docid in re.split(",",str(docids)):
- list_s_n.append(TermQuery("docid",docid))
- query_not = BoolQuery(should_queries=list_s_n)
- # bool_query =BoolQuery(must_queries=[query_not])
- # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- # SearchQuery(bool_query,limit=50,get_total_count=True),
- # ColumnsToGet(["sub_docs_json"],ColumnReturnType.SPECIFIED))
- # if total_count>0:
- # dict_rows = getRow_ots(rows)
- # _find = False
- # for _row in dict_rows:
- # sub_docs_json = _row.get("sub_docs_json",'[]')
- # sub_docs = json.loads(sub_docs_json)
- # for _doc in sub_docs:
- # if "bidding_budget" in _doc and _doc.get("bidding_budget",0)>0:
- # item["new_budding_budget"] = _doc.get("bidding_budget",0)
- # _find = True
- # break
- # if _find:
- # break
- #
- # return
- _find = False  # was True, which made the first search return before any price match; the fallback searches below set it on success
- bool_query =BoolQuery(must_queries=[
- generateBoolShouldQuery(["doctitle",'doctextcon','attachmenttextcon'],[item.get("project_code","$$$$"),item.get("project_name","$$$$")],MatchPhraseQuery),
- generateBoolShouldQuery(["docchannel"],[101,119,120],TermQuery),
- RangeQuery("status",151,301),
- NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*")),
- RangeQuery("page_time",item.get("zhao_biao_page_time"),item.get("end_time"))
- ],must_not_queries=[query_not])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,limit=50,get_total_count=True),
- ColumnsToGet(["doctitle","tenderee","sub_docs_json"],ColumnReturnType.SPECIFIED))
- if total_count>0:
- dict_rows = getRow_ots(rows)
- str_docid = ""
- for _row in dict_rows:
- str_docid+="%d,"%_row.get("docid")
- sub_docs_json = _row.get("sub_docs_json",'[]')
- sub_docs = json.loads(sub_docs_json)
- if item.get("tenderee","--")!=_row.get("tenderee","-#"):
- continue
- for _doc in sub_docs:
- if "win_bid_price" in _doc and _doc.get("win_bid_price",0)>0:
- item["new_win_bid_price"] = _doc.get("win_bid_price")
- item["new_win_tenderer"] = _doc.get("win_tenderer")
- item["new_finded_docid"] = _row.get("docid")
- _find = True
- break
- if _find:
- return
- item["maybe_docids"] = str_docid
- bool_query =BoolQuery(must_queries=[
- generateBoolShouldQuery(["doctitle",'doctextcon','attachmenttextcon'],[item.get("project_code","$$$$")],MatchPhraseQuery),
- generateBoolShouldQuery(["docchannel"],[101,119,120],TermQuery),
- RangeQuery("status",151,301),
- NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*")),
- RangeQuery("page_time",item.get("zhao_biao_page_time"),item.get("end_time"))
- ],must_not_queries=[query_not])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,limit=50,get_total_count=True),
- ColumnsToGet(["doctitle","tenderee","sub_docs_json"],ColumnReturnType.SPECIFIED))
- if total_count>0:
- dict_rows = getRow_ots(rows)
- str_docid = ""
- for _row in dict_rows:
- str_docid+="%d,"%_row.get("docid")
- sub_docs_json = _row.get("sub_docs_json",'[]')
- sub_docs = json.loads(sub_docs_json)
- if item.get("tenderee","--")!=_row.get("tenderee","-#"):
- continue
- for _doc in sub_docs:
- if "win_bid_price" in _doc and _doc.get("win_bid_price",0)>0:
- item["new_win_bid_price"] = _doc.get("win_bid_price")
- item["new_win_tenderer"] = _doc.get("win_tenderer")
- item["new_finded_docid"] = _row.get("docid")
- _find = True
- break
- if _find:
- return
- item["maybe_docids"] = str_docid
- bool_query =BoolQuery(must_queries=[
- generateBoolShouldQuery(["doctitle",'doctextcon','attachmenttextcon'],[item.get("project_name","$$$$")],MatchPhraseQuery),
- generateBoolShouldQuery(["docchannel"],[101,119,120],TermQuery),
- RangeQuery("status",151,301),
- NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*")),
- RangeQuery("page_time",item.get("zhao_biao_page_time"),item.get("end_time"))
- ],must_not_queries=[query_not])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,limit=50,get_total_count=True),
- ColumnsToGet(["doctitle"],ColumnReturnType.SPECIFIED))
- if total_count>0:
- dict_rows = getRow_ots(rows)
- str_docid = ""
- for _row in dict_rows:
- str_docid+="%d,"%_row.get("docid")
- sub_docs_json = _row.get("sub_docs_json",'[]')
- sub_docs = json.loads(sub_docs_json)
- if item.get("tenderee","--")!=_row.get("tenderee","-#"):
- continue
- for _doc in sub_docs:
- if "win_bid_price" in _doc and _doc.get("win_bid_price",0)>0:
- item["new_win_bid_price"] = _doc.get("win_bid_price")
- item["new_win_tenderer"] = _doc.get("win_tenderer")
- item["new_finded_docid"] = _row.get("docid")
- _find = True
- break
- if _find:
- return
- item["maybe_docids"] = str_docid
- return
- mt = MultiThreadHandler(task_queue,_handle,None,30,ots_client=getConnect_ots())
- mt.run()
- df_data = {"docids":[],
- "project_code":[],
- "project_name":[],
- "maybe_docids":[],
- "new_budding_budget":[],
- "new_win_bid_price":[],
- "new_win_tenderer":[],
- "new_finded_docid":[]}
- keys = df_data.keys()
- for item in list_items:
- for k in keys:
- df_data[k].append(item.get(k))
- df2 = pd.DataFrame(df_data)
- df2.to_excel("两广补充数据.xlsx")
- def attachCompanyContact():
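- # Fills in missing tenderee/winner/agency contact person and phone columns from
- # the enterprise contact store, then writes each file back as <file>_attach.xlsx.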
- files = ["../data/2021-03-17_四川_关键词导出.csv",
- "../data/2021-03-17_安徽_关键词导出.csv",
- "../data/2021-03-17_江西_关键词导出.csv",
- "../data/2021-03-17_湖南_关键词导出.csv"]
- files = ["../data/20210609(最新).xlsx"]
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- set_enter = set()
- for file in files:
- df = pd.read_excel(file)
- columns = ["招标单位","中标单位","代理单位"]
- for _c in columns:
- for item in df[_c]:
- if isinstance(item,str):
- item = item.strip()
- if item!="":
- set_enter.add(item)
- dict_enter = getDictEnterprise(list(set_enter))
- for file in files:
- task_queue = queue.Queue()
- df = pd.read_excel(file)  # pandas.read_excel does not accept an encoding argument
- keys = df.keys()[1:]
- list_item = []
- for row in df.itertuples():
- _dict = {}
- for _key in keys:
- if _key in dir(row):
- _v = row.__getattribute__(_key)
- else:
- _v = ''
- _dict[_key] = _v
- if str(_dict["招标联系人"]) in ("","nan") or str(_dict["招标联系人电话"]) in ("","nan"):
- contact_person,mobile = getOneContact(dict_enter.get(_dict["招标单位"],{}).get("contacts","[]"))
- if contact_person!="":
- _dict["招标联系人"] = contact_person
- _dict["招标联系人电话"] = mobile
- if str(_dict["中标联系人"]) in ("","nan") or str(_dict["中标联系人电话"]) in ("","nan"):
- contact_person,mobile = getOneContact(dict_enter.get(_dict["中标单位"],{}).get("contacts","[]"))
- if contact_person!="":
- _dict["中标联系人"] = contact_person
- _dict["中标联系人电话"] = mobile
- if str(_dict["代理联系人"]) in ("","nan") or str(_dict["代理联系人电话"]) in ("","nan"):
- contact_person,mobile = getOneContact(dict_enter.get(_dict["代理单位"],{}).get("contacts","[]"))
- if contact_person!="":
- _dict["代理联系人"] = contact_person
- _dict["代理联系人电话"] = mobile
- list_item.append(_dict)
- for item in list_item:
- task_queue.put(item)
- df_data = {}
- for _k in keys:
- df_data[_k] = []
- for item in list_item:
- for _k in keys:
- df_data[_k].append(getLegal_str(item.get(_k,"-")))
- df1 = pd.DataFrame(df_data)
- df1.to_excel("%s_attach.xlsx"%file,columns=keys)
- def dumpWebSourceNo():
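- # Copies the web-source code/name mapping from Oracle T_WEBSOURCENUM_INFO into
- # the webSource table of the test MySQL instance.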
- conn_oracle = getConnection_oracle()
- cursor_oracle = conn_oracle.cursor()
- sql = " select source_encode,source_name from bxkc.T_WEBSOURCENUM_INFO "
- cursor_oracle.execute(sql)
- rows = cursor_oracle.fetchall()
- conn_mysql = getConnection_testmysql()
- cursor_mysql = conn_mysql.cursor()
- for row in rows:
- sql = " insert into webSource(web_source_no,web_source_name) values('%s','%s')"%(row[0],row[1])
- print(sql)
- cursor_mysql.execute(sql)
- conn_mysql.commit()
- def exportNzj():
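- # Pages through designed_project rows that have docids and exports the
- # proposed-construction (拟在建) fields to Excel.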
- # filename = "../data/重复公告.xlsx"
- # df = pd.read_excel(filename)
- ots_client = getConnect_ots()
- columns = ["contacts","covered_area","follows","docids","page_time","progress","project_description","project_follow","project_code","project_name","project_type"]
- def getData(df_data,rows,set_line):
- list_data = getRow_ots(rows)
- for row in list_data:
- item = {}
- _dict = row
- set_dict_item(item,"docids",_dict.get("docids",""))
- set_dict_item(item,"contacts",_dict.get("contacts",""))
- set_dict_item(item,"covered_area",_dict.get("covered_area",""))
- set_dict_item(item,"follows",_dict.get("follows",""))
- set_dict_item(item,"project_type",_dict.get("project_type",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item(item,"page_time",_dict.get("page_time",""))
- set_dict_item(item,"progress",_dict.get("progress",""))
- set_dict_item(item,"project_description",_dict.get("project_description",""))
- set_dict_item(item,"project_follow",_dict.get("project_follow",""))
- set_dict_item(item,"project_code",_dict.get("project_code",""))
- set_dict_item(item,"project_name",_dict.get("project_name",""))
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- df_data = {}
- bool_query = BoolQuery(must_queries=[ExistsQuery("docids")])
- rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("id",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- set_line = set()
- _count = len(rows)
- getData(df_data,rows,set_line)
- while next_token:
- print("%d/%d"%(_count,total_count))
- rows, next_token, total_count, is_all_succeed = ots_client.search("designed_project", "designed_project_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- getData(df_data,rows,set_line)
- _count += len(rows)
- df1 = pd.DataFrame(df_data)
- df1.to_excel("../data/2021-03-31_拟在建数据导出1.xlsx",columns=list_df_columns)
- def turn_status():
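- # Queues the docids of an Excel export into the turn_status MySQL table
- # (partitionkey = docid % 500 + 1).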
- df = pd.read_excel("../data/欧科自然资源5w以上数据.xlsx")
- conn = getConnection_testmysql()
- cursor = conn.cursor()
- for docid in df["公告id"]:
- partitionkey = int(docid)%500+1
- sql = " insert into turn_status(partitionkey,docid) values(%d,%d)"%(partitionkey,docid)
- cursor.execute(sql)
- conn.commit()
- def attachBidding_budget():
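- # Backfills empty bidding_budget values in analysis_r2 by looking each docid up
- # in the project2 index.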
- conn_mysql = getConnection_testmysql()
- cursor = conn_mysql.cursor()
- sql = "select docid from analysis_r2 where bidding_budget=''"
- task_queue = queue.Queue()
- result_queue = queue.Queue()
- cursor.execute(sql)
- rows = cursor.fetchmany(10000)
- while(rows):
- for row in rows:
- task_queue.put(row[0])
- rows = cursor.fetchmany(10000)
- pool_mysql = ConnectorPool(init_num=10,max_num=30,method_init=getConnection_testmysql)
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- def _handle(item,result_queue,pool_mysql,pool_ots):
- ots_client = pool_ots.getConnector()
- bool_query = BoolQuery(must_queries=[TermQuery("docids",item)])
- rows, next_token, total_count, is_all_succeed = ots_client.search("project2", "project2_index",
- SearchQuery(bool_query , limit=1, get_total_count=True),
- ColumnsToGet(["bidding_budget"],return_type=ColumnReturnType.SPECIFIED))
- list_dict = getRow_ots(rows)
- if len(list_dict)>0:
- conn = pool_mysql.getConnector()
- cursor = conn.cursor()
- sql = " update analysis_r2 set bidding_budget='%s' where docid=%d"%(str(list_dict[0].get("bidding_budget","")),item)
- cursor.execute(sql)
- conn.commit()
- pool_mysql.putConnector(conn)
- pool_ots.putConnector(ots_client)
- mt = MultiThreadHandler(task_queue,_handle,result_queue,thread_count=30,pool_mysql=pool_mysql,pool_ots=pool_ots)
- mt.run()
- def debug_documentMerge():
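- # Sanity check for document merging: every merge_docids group logged in
- # project_group_final_log should match one project2 record containing all of
- # its docids; groups without a match are printed.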
- conn = getConnection_testmysql()
- cursor = conn.cursor()
- sql = "select merge_docids from project_group_final_log "
- cursor.execute(sql)
- task_queue = queue.Queue()
- for row in cursor.fetchall():
- task_queue.put(row[0])
- print(task_queue.qsize())
- def _handle(item,result_queue,pool_ots):
- ots_client = pool_ots.getConnector()
- list_docids = item.split(",")
- must_q = []
- for _docid in list_docids:
- must_q.append(TermQuery("docids",_docid))
- bool_query = BoolQuery(must_queries=must_q)
- rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
- SearchQuery(bool_query,limit=1,get_total_count=True),
- ColumnsToGet(column_names=["docids"],return_type=ColumnReturnType.SPECIFIED))
- if total_count==0:
- print(item)
- result_queue.put(item)
- pool_ots.putConnector(ots_client)
- result_queue = queue.Queue()
- pool_ots = ConnectorPool(init_num=10,max_num=30,method_init=getConnect_ots)
- mt = MultiThreadHandler(task_queue,_handle,result_queue,30,pool_ots=pool_ots)
- mt.run()
- while(True):
- try:
- item = result_queue.get(False)  # non-blocking get; raises queue.Empty once drained, which breaks the loop below
- print(item)
- except Exception as e:
- print(str(e))
- break
- def signDocument():
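- # Flags documents whose segmented text mentions 采购失败/招标失败 (procurement or
- # tender failure) and writes the flag column to a _sign.xlsx copy.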
- filename = "C:\\Users\\Administrator\\Desktop\\中标信息1.xlsx"
- sign_filename = "%s_sign.xlsx"%filename
- df = pd.read_excel(filename)
- df_data = {"sign":[]}
- for item in df["segword"]:
- content = re.sub(r"\s+","",item)  # raw string avoids the invalid-escape warning; strips all whitespace as before
- _find = re.search("(?P<key>采购失败|招标失败)",content)
- if _find is not None:
- df_data["sign"].append(_find.groupdict().get("key"))
- else:
- df_data["sign"].append("无")
- df1 = pd.DataFrame(df_data)
- df1.to_excel(sign_filename)
- def exportWin_tenderer_count():
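- # Counts winning-bid documents (status 201-300, page_time since 2020-01-01) for
- # each company in the inline list below and exports the counts to Excel.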
- ots_client = getConnect_ots()
- str_enter = '''
- 红旗渠建设集团有限公司
- 河南瑞华建筑集团有限公司
- 林州瑞达工程管理有限公司
- 河南鸿盛建筑工程有限公司
- 天一建设发展有限公司
- 河南省大成建设工程有限公司
- 中润昌弘建工集团有限公司
- 河南省中创建筑工程有限公司
- 河南锦达建设有限公司
- 林州宏基建筑工程有限公司
- 河南富世建筑工程有限公司
- 中恒方圆建筑工程有限公司
- 河南华隆建设工程有限公司
- 河南昊锦建设集团有限公司
- 河南新隆建工集团有限公司
- 中城华安建设集团有限公司
- 河南恒通公路桥梁建设有限公司
- 林州二建集团建设有限公司
- 河南华安建设集团有限公司
- 河南裕鸿建筑工程有限公司
- 中商建投建设有限公司
- 河南鑫利恒工程有限公司
- 林州市永盛建筑有限公司
- 林州市顺鑫建筑有限公司
- 中水京林建设有限公司
- 河南宏海建设有限公司
- 河南宏岳建设有限公司
- 河南元筑实业有限公司
- 河南基兆建筑工程有限公司
- 林州金瑞建筑工程有限公司
- 林州建工集团有限公司
- 河南万融建筑工程有限公司
- 林州东风建设有限公司
- 河南鸿泰建筑安装有限公司
- 河南源泰建筑有限公司
- 河南优德建筑工程有限公司
- 安阳鸿盛建设劳务有限公司
- 河南省安阳市安装工程有限责任公司
- 河南港城建设工程有限公司
- 河南天汇建筑工程有限公司
- 河南省惠浦建设发展有限公司
- 林州市建筑工程有限公司
- 河南正天建筑工程有限公司
- 河南颂邦建筑工程有限公司
- 林州市华源建设有限公司
- 河南中投建设有限公司
- 林州华林建筑劳务有限公司
- 河南基祥建设工程有限公司
- 河南文水水电工程有限公司
- 林州兴业建筑工程有限公司
- 河南中州建筑有限公司
- 河南省佳禾园林建设有限公司
- 林州万亚建筑工程有限公司
- 河南林正建设工程有限公司
- 河南鼎兴建设工程有限公司
- 河南平原建工集团有限公司
- 河南林九建设工程有限公司
- 林州市三才建筑工程有限公司
- 安阳建设(集团)有限责任公司
- 盛世恒达建设有限公司
- 河南城洲建设工程有限公司
- 河南国埔建筑工程有限公司
- 中创市政建设发展有限公司
- 河南正祥建筑工程有限公司
- 河南宏九建筑工程有限公司
- 河南金阳建筑工程有限公司
- 河南天容建设工程有限责任公司
- 河南聚宇建筑工程有限公司
- 河南瑞旗建设工程有限公司
- 河南利盛建设工程有限公司
- 林州四海建设有限公司
- 林州市建工城建集团有限公司
- 河南众佑建设工程有限公司
- 河南德诚建设有限公司
- 河南景华建筑工程有限公司
- 河南华江建筑工程有限公司
- 林州永丰建设集团有限公司
- 林州福东建设工程有限公司
- 河南恒森建筑工程有限公司
- 河南朝泓建设工程有限公司
- 河南润京建设有限公司
- 林州市红旗渠公路工程有限公司
- 林州中宇建设工程有限公司
- 河南长锦建设工程有限公司
- 河南汇商建筑工程有限公司
- 河南省豫鹤建设工程有限公司
- 河南江城建筑工程有限公司
- 中海华祥建设发展有限公司
- 河南宁中路桥建筑有限公司
- 河南天河建设工程有限公司
- 林州市路桥建筑工程有限公司
- 河南省中北建设有限公司
- 河南汇亿建筑工程有限公司
- 河南金帝建筑安装有限公司
- 河南省望安建筑工程有限公司
- 泰欣建设有限公司
- 河南筑鑫建筑工程有限公司
- 元熙建设工程有限公司
- 旭隆建设集团有限公司
- 河南省城控建工集团有限公司
- 河南晨丰建筑工程有限公司
- 河南嘉丰建设有限公司
- 林州市合众建筑劳务有限公司
- 河南金瓦刀建筑劳务有限公司
- 河南中实建筑工程有限公司
- 畅通路桥工程建设有限责任公司
- 河南军恒建设有限公司
- 中钊建设集团有限公司
- 河南德宁建设集团有限公司
- 林州兴鸿建筑工程有限公司
- 林州市明泽建筑工程有限公司
- 河南紫光建筑工程有限公司
- 河南誉天建筑工程有限公司
- 林州景丰建筑劳务有限公司
- 河南江瀚建筑劳务有限公司
- 河南弘之昌建筑工程有限公司
- 河南祥泰钻井工程有限公司
- 河南迅阳建筑劳务有限公司
- 河南嘉成建筑工程有限公司
- 河南兴锦建设工程有限公司
- 河南邦坤建设工程有限公司
- 河南锦毅市政工程建筑有限公司
- 河南广益建筑工程有限公司
- 河南创胜建筑工程有限公司
- 河南勤铭建筑工程有限公司
- 河南铭锋建设工程有限公司
- 平源建设有限公司
- 河南隆通建筑工程有限公司
- 河南省基本建设有限公司
- 河南丰茂建筑劳务有限公司
- 河南城安建筑工程有限公司
- 林州市富源建筑劳务有限公司
- 德方建设有限公司
- 河南泰联建筑工程有限公司
- 河南新建投工程有限公司
- 河南省鲁班建工集团有限公司
- 林州方超建筑劳务有限公司
- 林州市采桑建筑劳务输出有限公司
- 河南省仁昱建筑工程有限公司
- 河南鸾林建设工程有限公司
- 宜民建设集团有限公司
- 林州聚兴建筑工程有限公司
- 河南省聚国建筑工程有限公司
- 林州市大东建筑劳务有限公司
- 河南欣东劳务有限公司
- 中建润德景观建筑工程有限公司
- 河南辰宇建设工程有限公司
- 号东建设工程有限公司
- 河南润北建筑工程有限公司
- 河南邦昊建设工程有限公司
- 林州市建设投资有限责任公司
- 林州市太行建设工程有限公司
- 河南峡安建筑工程有限公司
- 河南安疆建筑工程有限公司
- 河南淇河建设工程有限公司
- 河南晶品建设有限公司
- 河南翔固建筑工程有限公司
- 纵横九州路桥建设有限公司
- 河南青林建筑工程有限公司
- 合久建设有限公司
- 河南明昊建筑工程有限公司
- 河南滨河建设工程有限公司
- 河南群腾建筑工程有限公司
- 河南隆亨建筑工程有限公司
- 骏达建设有限公司
- 河南仁安建设工程有限公司
- 河南旻尚园林建筑工程有限公司
- 河南省匡正建设工程有限公司
- 河南金凡建筑工程有限公司
- 河南佰丰建筑工程有限公司
- 德普建设有限公司
- 国润新天地工程技术有限公司
- 中潮建设发展有限公司
- 河南捷正建筑工程有限公司
- 林州百万工匠建筑劳务有限公司
- 河南祥彬建筑工程有限公司
- 河南林祥工程建设有限公司
- 河南唐尧建筑劳务有限公司
- 河南汇祥建设有限公司
- 河南友信建设有限公司
- 林州市鼎昇建筑工程有限公司
- 林州市富兴建筑劳务有限公司
- 林州厚德建筑劳务有限公司
- 河南振亚工程建设有限公司
- 河南英茂建筑工程有限公司
- 河南丰岩建设工程有限公司
- 林州市昌都建筑工程有限公司
- 林州四建建筑工程有限公司
- 林州和兴建筑劳务有限公司
- 林州市鸿升建筑工程有限公司
- 河南润泰建设工程有限公司
- 河南鑫路通建筑劳务有限公司
- 河南信守建筑劳务有限公司
- 林州安达鸿昌建筑劳务有限公司
- 河南意达建设有限公司
- 河南金穗来建筑工程有限公司
- 河南东风建筑工程有限公司
- 河南筑胜建筑劳务有限公司
- 河南民润建筑工程有限公司
- 林州市中锦路桥建设工程有限公司
- 林州一建建筑工程有限公司
- 林州市宏瑞建筑劳务有限公司
- 林州鸿恩建筑劳务有限公司
- 河南晟元建筑工程有限公司
- 中国建筑第六工程局有限公司
- 河南筑泰建筑工程有限公司
- 河南省亚建建筑工程有限公司
- 河南辰弘建筑工程有限公司
- 河南先创建筑工程有限公司
- 林豫建工集团有限公司
- 河南省盛民建筑工程有限公司
- 河南泓发市政工程有限公司
- 河南帝恩建筑劳务有限公司
- 河南天泉建设工程有限公司
- 河南恒升工程建设有限公司
- 林州市浩远电力建筑工程有限公司
- 河南友瑞建筑工程有限公司
- 河南冠州路桥工程有限公司
- 三角鼎建设工程有限公司
- 河南富坤建筑工程有限公司
- 林州市恒源建筑工程有限公司
- 河南广汇建筑工程有限公司
- 河南隆豫建设有限公司
- 林州市九洲工程劳务有限公司
- 林州瑜辉建筑工程有限公司
- 河南福恩建筑工程有限公司
- 河南通盛路桥建设有限公司
- 河南央泰建设工程有限公司
- 林州市红旗渠公路养护工程有限公司
- 林州大兴建设工程有限公司
- 河南锐丰建设工程有限公司
- 林州市中泰建筑劳务有限公司
- 林州成业建筑工程有限公司
- 河南建创建筑工程有限公司
- 河南宏兴建设工程有限公司
- 河南隆鼎建筑工程有限公司
- 林州市天罡建筑劳务有限公司
- 汇聚建设发展有限公司
- 中铁中城工程有限公司
- 河南景天建筑劳务有限公司
- 林州蒙建建设工程有限公司
- 富华建设工程有限公司
- 河南殿轩建筑劳务有限公司
- 河南瑞通建设工程有限公司
- 林州金桥劳务工程有限公司
- 河南省景隆实业有限公司
- 河南升洲建筑工程有限公司
- 河南里程建筑劳务有限公司
- 林州市润景建设工程有限公司
- 河南巨坤建筑工程有限公司
- 河南九牛建设劳务有限公司
- 吉修建设工程有限公司
- 河南图润建筑工程有限公司
- 河南鼎鑫建筑劳务有限公司
- 河南港航建设工程有限公司
- 河南省盛飞建设工程有限公司
- 林州市兴义建筑劳务有限公司
- 河南秉程建筑工程有限公司
- 河南硕亚水电路桥工程有限公司
- 河南科才建筑劳务有限公司
- 河南荣泰建筑安装工程有限公司
- 河南省天丰建筑工程有限公司
- 河南方元建筑工程有限公司
- 恒上建设有限公司
- 河南省德信建筑工程有限公司
- 河南诚宸建设工程有限公司
- 河南置信建筑工程有限公司
- 河南省鑫河建设有限公司
- 河南成兴建设工程有限公司
- 林州中港建筑工程有限公司
- 河南富春建设工程有限公司
- 中科豫资建设发展有限公司
- 河南京都建筑安装有限公司
- 安阳市宇豪爆破工程有限公司
- 河南华特建筑工程有限公司
- 河南颍淮建工有限公司
- 林州市八建工程有限公司
- 河南展辉建筑工程有限公司
- 河南中博建筑有限公司
- 河南方圆建设有限公司
- 河南大鼎建筑工程有限公司
- 林州中天建设有限公司
- 河南久东建筑工程有限公司
- 河南九一建设工程有限公司
- 九州水文建设集团有限公司
- 河南省建安防水防腐工程有限公司
- 中建宏图建设发展有限公司
- 筑宇建设有限公司
- 林州市宏图建设工程有限公司
- 河南林润建设工程有限公司
- 嘉泰建设发展有限公司
- 河南丰茂建筑安装工程有限公司
- 河南万泰建设工程有限公司
- 林州市红旗渠市政工程有限公司
- 林州建总建筑工程有限公司
- 河南聚之祥建设有限公司
- 河南鼎之信建设工程有限公司
- 河南省华瑞建设工程有限公司
- 河南世光电力工程有限公司
- 河南地远建筑工程有限公司
- 河南鑫品建筑工程有限公司
- 河南省东旗建筑工程有限公司
- 润华建设有限公司
- 林州富民建筑劳务有限公司
- 林州市晨诚建筑劳务有限公司
- 河南万胜建设有限公司
- 河南龙磐建筑工程有限公司
- 河南顺昌建筑劳务有限公司
- 林州恒瑞建设工程有限公司
- 河南大成建设劳务有限公司
- 河南大一建筑劳务有限公司
- 河南盛威建筑工程有限公司
- 河南坤之宇建筑工程有限公司
- 众信电力工程有限公司
- 河南昱佛建筑工程有限公司
- 河南淇源建筑工程有限公司
- 林州凤宝建筑安装有限公司
- 河南中发岩土工程有限公司
- 河南中都建设工程有限公司
- 河南祥凯建筑工程有限公司
- 河南乐泰建筑工程有限公司
- 林州宏达建筑劳务有限公司
- 河南华盛建设集团有限公司
- 河南凯通建设工程有限公司
- 国腾路桥工程有限公司
- 中建方达建设工程有限公司
- 河南省天都建设工程有限公司
- 昌隆建设工程有限公司
- 河南洹上村园林绿化工程有限公司
- 河南双锦建设工程有限公司
- 河南子丰市政工程有限公司
- 林州首创建筑工程有限公司
- 河南众鑫建筑工程有限公司
- 河南宁崴建筑工程有限公司
- 林州市航安建筑劳务有限公司
- 林州益成建设工程有限公司
- 林州市昌弘建筑工程有限公司
- 河南正耀建设有限公司
- 河南鑫鹏建设工程有限公司
- 林州恒泰建筑工程有限公司
- 林竣建设有限公司
- 河南朝众建筑工程有限公司
- 林州科鸿建筑工程有限公司
- 东辰建设发展有限公司
- 河南创新新能源科技有限公司
- 河南省永业建筑工程有限公司
- 林州市煜凯建筑工程有限公司
- 宝鼎建设工程有限公司
- 林州市航安建筑工程有限公司
- 河南业展建设工程有限公司
- 河南联竣建筑工程有限公司
- 河南聚超建筑工程有限公司
- 林州远方电力工程有限公司
- 河南蒙寅建筑劳务有限公司
- 方元建筑劳务有限公司
- 龙兴建设工程有限公司
- 河南春谦建设工程有限公司
- 河南正博公路工程有限公司
- 林州市汇鑫安装工程有限公司
- 林州市祥隆劳务有限公司
- 河南胜杰建筑工程有限公司
- 河南恩普建筑工程有限公司
- 河南港津建筑工程有限公司
- 河南昌明建筑工程有限公司
- 中豫城控建设集团有限公司
- 林州晨宇建设工程有限公司
- 河南豫柯建筑工程有限公司
- 河南捷润建筑工程有限公司
- 中方通建设工程有限公司
- 河南多果建筑工程有限公司
- 河南尚伟建筑工程有限公司
- 林州新航程建筑工程有限公司
- 河南金华建筑工程有限公司
- 国云工程技术有限公司
- 河南路威路桥工程有限公司
- 林州中盛建设工程有限公司
- 林州市恒基建设有限公司
- 河南润恒建筑工程有限公司
- 河南华安水利工程有限公司
- 中城易通建设发展有限公司
- 河南浚洲建筑工程有限公司
- 林州市锦晟建筑劳务有限公司
- 河南省北安建筑工程有限公司
- 林州泰岳建设工程有限公司
- 河南联洋建筑工程有限公司
- 河南港大市政建筑工程有限公司
- 林州东盛建筑劳务有限公司
- 河南省天鉴建设工程有限公司
- 河南瑞凝建筑工程有限公司
- 林州市东瑞建筑劳务有限公司
- 河南众达建筑劳务有限公司
- 河南省帝增建筑工程有限公司
- 河南省升灿建筑工程有限公司
- 河南苑景建筑劳务分包有限公司
- 林州众立建设工程有限公司
- 红旺建筑工程有限公司
- 林州市圣兴建筑劳务有限公司
- 林州诚林建筑劳务有限公司
- 林州建工劳务有限公司
- 河南巨业建筑工程有限公司
- 中科华夏建设开发有限公司
- 君晟建筑工程有限公司
- 郑州新动力建筑劳务分包有限公司
- 河南省福德建筑工程有限公司
- 林州源大建筑工程有限公司
- 河南大瑞园林建设有限公司
- 河南秋禾建筑劳务有限公司
- 河南腾翔建筑工程有限公司
- 河南天之华建设工程有限公司
- 河南祥和建筑安装有限公司
- 河南省鼎文建设工程有限公司
- 河南周城建设发展有限公司
- 河南庆泰建筑工程有限公司
- 中科信合建设发展有限公司
- 林州恒隆建设工程有限公司
- 河南省力恒建筑工程有限公司
- 林州市四季青绿化有限责任公司
- 林州市景盛建筑工程有限公司
- 河南建基建设工程有限公司
- 河南宝凯建筑工程有限公司
- 林州市四合建筑劳务有限公司
- 河南和耀建筑工程有限公司
- 林州市凯达建筑劳务有限公司
- 林州市恒信建筑劳务有限公司
- 开翔建设工程有限公司
- 河南省新创达建设工程有限公司
- 林州鑫龙建筑工程有限公司
- 河南省昌博建筑工程有限公司
- 河南君利泰建筑工程有限公司
- 林州杏林建筑工程有限公司
- 河南千禧建设工程有限公司
- 中建诚正建筑工程有限公司
- 河南省聚千建筑工程有限公司
- 林州海之鸿建筑工程有限公司
- 河南振鼎建筑工程有限公司
- 林州方成建筑劳务有限公司
- 河南众众建设工程有限公司
- 林州市万润建筑劳务有限公司
- 启创建设工程有限公司
- 河南子明建筑工程有限公司
- 安阳市兴鼎路桥工程有限公司
- 河南智擎建筑劳务有限公司
- 河南鼎平市政工程有限公司
- 林州宏阳建筑工程有限公司
- 河南豫泰建筑工程有限公司
- 林州市鸿浩建筑劳务有限公司
- 林州市锦华建筑工程有限公司
- 河南瑞锋建设有限公司
- 河南欧信建筑劳务有限公司
- 林州市中兴建筑劳务有限公司
- 林州市大德建设工程有限公司
- 河南华文建设有限公司
- 河南凌焜建筑工程有限公司
- 河南安居建设有限公司
- 林州鲲鹏建筑工程有限公司
- 林州经纬建筑工程有限公司
- 林州祥川建筑工程有限公司
- 林州市鑫淼建筑劳务有限公司
- 河南祥泰路桥有限公司
- 景祥建设工程有限公司
- 河南省兴华建安工程有限公司
- 河南古森建筑劳务有限公司
- 平祥建设工程有限公司
- 河南大博建设工程有限公司
- 河南华普建设工程有限公司
- 河南东邦建设工程有限公司
- 卓冠建设工程有限公司
- 河南品瑞建筑工程有限公司
- 河南宝金建设工程有限公司
- 中城鑫邦建设有限公司
- 河南省鸿运建设工程有限公司
- 林州明奥建筑工程有限公司
- 河南金手指建设工程有限公司
- 林州市弘顺建筑劳务有限公司
- 林州市林海建筑劳务有限公司
- 河南艺兆市政工程有限公司
- 林州誉峰建筑工程有限公司
- 河南卓骏建筑工程有限公司
- 林州众成建筑工程有限公司
- 河南城通市政工程有限公司
- 林州市晋源建筑工程有限公司
- 河南飞越建筑工程有限公司
- 林州鑫泰建筑工程有限公司
- 林州市太行建筑劳务有限公司
- 河南筑丰建设发展有限公司
- 林州一帆建筑劳务有限公司
- 林州宏久建筑工程有限公司
- 林州市盛祥建筑劳务有限公司
- 河南黎润建设工程有限公司
- 林州市永安建筑劳务有限公司
- 河南省长江建设实业有限公司
- 河南腾润建设工程有限公司
- 河南国梁建设工程有限公司
- 河南诚聚建筑工程有限公司
- 河南德邦市政工程有限公司
- 河南安德建设工程有限公司
- 河南森川建筑工程有限公司
- 林州市顺通公路工程有限公司
- 河南领邦建筑工程有限公司
- 河南博兴建设工程有限公司
- 东泽消防工程有限公司
- '''
- list_enter = []
- for _p in re.split(r"\s+",str_enter):
- if _p.strip()=="":
- continue
- list_enter.append({"name":_p.strip()})
- def _handle(item,result_queue,pool_ots):
- ots_client = pool_ots.getConnector()
- try:
- bool_query = BoolQuery(must_queries=[
- NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item["name"]))
- ,RangeQuery("status",201,300,include_lower=True,include_upper=True)
- ,RangeQuery("page_time","2020-01-01")
- ])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- SearchQuery(bool_query, limit=1, get_total_count=True),
- ColumnsToGet(['docid'], ColumnReturnType.SPECIFIED))
- item["total_count"] = total_count
- # bool_query = BoolQuery(must_queries=[
- # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",item["name"]))
- # ,RangeQuery("status",201,300,include_lower=True,include_upper=True)
- # ,NestedQuery("sub_docs_json",RangeQuery("sub_docs_json.win_bid_price",0,1000000,include_upper=True))
- # ])
- #
- # rows, next_token, total_count, is_all_succeed = ots_client.search("document", "document_index",
- # SearchQuery(bool_query, limit=1, get_total_count=True),
- # ColumnsToGet(['docid'], ColumnReturnType.SPECIFIED))
- # item["l_total_count"] = total_count
- except Exception as e:
- pass
- pool_ots.putConnector(ots_client)
- pool_ots = ConnectorPool(init_num=30,max_num=40,method_init=getConnect_ots)
- task_queue = queue.Queue()
- for item in list_enter:
- task_queue.put(item)
- mt = MultiThreadHandler(task_queue,_handle,None,30,pool_ots=pool_ots)
- mt.run()
- df_data = {}
- for item in list_enter:
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s.xls"%getCurrent_date("%Y-%m-%d_%H%M%S"))
- from bs4 import BeautifulSoup
- def downloadAttach(_url,_path):
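- # Downloads _url to the local file _path, logging any failure.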
- try:
- result = requests.get(_url,stream=True,timeout=20)
- if result.status_code==200:
- with open(_path,"wb") as f:
- f.write(result.content)
- else:
- log("download failed with code %d of url:%s"%(result.status_code,_url))
- except Exception:
- log("download failed of url:%s"%(_url))
- def extract_pageAttachments(_html):
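- # Collects <a>/<img> nodes whose text or link carries a known attachment suffix
- # (skipping bidizhaobiao.com links) and returns them as fileLink/fileTitle dicts.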
- fileSuffix = [".zip", ".rar", ".tar", ".7z", ".wim", ".docx", ".doc", ".xlsx", ".xls", ".pdf", ".txt", ".hnzf", ".bmp", ".jpg", ".jpeg", ".png", ".tif", ".swf"]
- _soup = BeautifulSoup(_html,"lxml")
- list_a = _soup.find_all("a")
- list_img = _soup.find_all("img")
- page_attachments = []
- for _a in list_a:
- _text =_a.get_text()
- _url = _a.attrs.get("href","")
- if _url.find("http://www.bidizhaobiao.com")>=0:
- continue
- is_attach = False
- for suf in fileSuffix:
- if _text.find(suf)>=0 or _url.find(suf)>=0:
- is_attach = True
- if is_attach:
- page_attachments.append({"fileLink":_url,"fileTitle":_text})
- for _a in list_img:
- _text =_a.get_text()
- _url = _a.attrs.get("src","")
- if _url.find("http://www.bidizhaobiao.com")>=0:
- continue
- is_attach = False
- for suf in fileSuffix:
- if _text.find(suf)>=0 or _url.find(suf)>=0:
- is_attach = True
- if is_attach:
- page_attachments.append({"fileLink":_url,"fileTitle":_text})
- return page_attachments
- def exportDocument_attachment():
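- # Fetches the HTML of a single document (docid 165528701) and runs
- # extract_pageAttachments over it.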
- ots_client = getConnect_ots()
- bool_query = BoolQuery(must_queries=[TermQuery("docid",165528701)])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
- columns_to_get=ColumnsToGet(["dochtmlcon"],ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- for _data in list_data:
- extract_pageAttachments(_data["dochtmlcon"])
- def transUUid():
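- # Copies announcement ids (with page_time and source table) for
- # 2021-06-01..2021-08-31 from the Oracle tables into the MySQL fix_document
- # table, 10000 rows per batch.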
- conn_oracle = getConnection_oracle()
- cursor = conn_oracle.cursor()
- tables = ['T_ZHAO_BIAO_GONG_GAO','T_ZHONG_BIAO_XIN_XI']
- conn_mysql = getConnection_testmysql()
- cursor_mysql = conn_mysql.cursor()
- for _t in tables:
- sql = " select id,page_time,'%s' from bxkc.%s where page_time>='%s' and page_time<='%s' order by page_time "%(_t,_t,"2021-06-01","2021-08-31")
- print(sql)
- cursor.execute(sql)
- _count = 0
- while(True):
- insert_sql = "insert into fix_document(uuid,page_time,table_name) values"
- rows = cursor.fetchmany(10000)
- if not rows:
- break
- _count += len(rows)
- print(_count)
- for row in rows:
- _uuid = row[0]
- page_time = row[1]
- table_name = row[2]
- insert_sql += "('%s','%s','%s'),"%(_uuid,page_time,table_name)
- insert_sql = insert_sql[:-1]
- cursor_mysql.execute(insert_sql)
- conn_mysql.commit()
- def fix_document():
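- # Pushes fix_document_final rows from MySQL back into Oracle
- # BXKC.fix_document_final in 1000-row PL/SQL batches.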
- conn_oracle = getConnection_oracle()
- cursor_oracle = conn_oracle.cursor()
- conn_mysql = getConnection_testmysql()
- cursor_mysql = conn_mysql.cursor()
- sql = "select uuid,page_time,table_name from fix_document_final where page_time>='2021-06-24' "
- cursor_mysql.execute(sql)
- _count = 0
- while True:
- rows = cursor_mysql.fetchmany(1000)
- if not rows:
- break
- _count += len(rows)
- print(_count)
- insert_sql = ""
- for row in rows:
- _uuid = row[0]
- page_time = row[1]
- table_name = row[2]
- insert_sql += " insert into BXKC.fix_document_final(id,page_time,TABLENAME) values('%s','%s','%s');"%(_uuid,page_time,table_name)
- insert_sql = "begin %s end;"%(insert_sql)
- cursor_oracle.execute(insert_sql)
- conn_oracle.commit()
- def exportDocument_forRecommen():
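- # For the first 10000 company/recommendation pairs in the CSV, fetches the
- # recommended docids from document_index and exports one row per document,
- # tagged with the recommendation path.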
- filename = "../data/推荐 (1).csv"
- df = pd.read_csv(filename,encoding="GBK")
- ots_client = getConnect_ots()
- columns = ["province","city","page_time","doctitle","product"]
- current_date = getCurrent_date("%Y-%m-%d")
- adict_data = []
- _index = 0
- for company,json_docid in zip(df["company"][:10000],df["json_docid"][:10000]):
- _index += 1
- _province = ""
- # consumed, return_row, next_token = ots_client.get_row("enterprise",[("name",company)],columns_to_get=["province"])
- # dict_k = getRow_ots_primary(return_row)
- # _province = dict_k.get("province","")
- print("序号:%d,%s,%s"%(_index,company,_province))
- dict_recommen = json.loads(json_docid)
- for str_way,str_docid in dict_recommen.items():
- should_q = []
- for _docid in str_docid.split(","):
- should_q.append(TermQuery("docid",_docid))
- bool_query = BoolQuery(must_queries=[
- # TermQuery("province",_province)
- # ,RangeQuery("page_time",timeAdd(current_date,-7),current_date,True,True)
- # ,
- BoolQuery(should_queries=should_q)]
- )
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,get_total_count=True,limit=100),
- ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- adict_row = getRow_ots(rows)
- for dict_row in adict_row:
- dict_item = dict()
- set_dict_item(dict_item,"公司名称",company)
- set_dict_item(dict_item,"推荐路径",str_way)
- set_dict_item(dict_item,"公告id",dict_row.get("docid",""))
- set_dict_item(dict_item,"省份",dict_row.get("province",""))
- set_dict_item(dict_item,"城市",dict_row.get("city",""))
- set_dict_item(dict_item,"page_time",dict_row.get("page_time",""))
- set_dict_item(dict_item,"doctitle",dict_row.get("doctitle",""))
- set_dict_item(dict_item,"product",dict_row.get("product",""))
- adict_data.append(dict_item)
- dict_data = {}
- for dict_item in adict_data:
- for k in list_df_columns:
- if k not in dict_data:
- dict_data[k] = []
- dict_data[k].append(dict_item.get(k,""))
- df1 = pd.DataFrame(dict_data)
- df1.to_excel("../data/%s_推荐.xlsx"%getCurrent_date("%Y-%m-%d_%H%M%S"),columns=list_df_columns)
- def exportDocument_by_days(page_time):
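- # Exports every document of the given page_time to Excel, flattening
- # sub_docs_json into winner/candidate/price columns.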
- dict_channel = getDict_docchannel()
- ots_client = getConnect_ots()
- filename = "供货贷含[建筑]企业名单.xlsx"
- df = pd.read_excel(filename)
- bool_query = BoolQuery(must_queries=[TermQuery("page_time",page_time),
- # RangeQuery("status",201,301),
- ])
- # columns = ["doctitle","docchannel","product","province","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","original_docchannel","detail_link"]
- columns = ["doctitle","docchannel","product","bidway","moneysource","province","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","original_docchannel","detail_link","page_attachments","service_time"]
- dict_channel = getDict_docchannel()
- def hidePhone(phone):
- if phone is None or phone=="":
- return ""
- return "*"*(len(phone)-4)+phone[-4:]
- def getData(df_data,rows,set_line,list_keyword,set_columns,df_columns):
- list_data = getRow_ots(rows)
- for row in list_data:
- item = {}
- _dict = row
- set_dict_item_columns(set_columns,df_columns,item,"docid",_dict.get("docid",""))
- set_dict_item_columns(set_columns,df_columns,item,"公告标题",_dict.get("doctitle",""))
- # set_dict_item_columns(set_columns,df_columns,item,"公告内容",_dict.get("doctextcon",""))
- set_dict_item_columns(set_columns,df_columns,item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
- # set_dict_item_columns(set_columns,df_columns,item,"关键词",",".join(list(set(re.findall("|".join(list_keyword),_dict.get("doctextcon",""))))))
- set_dict_item_columns(set_columns,df_columns,item,"产品",_dict.get("product",""))
- set_dict_item_columns(set_columns,df_columns,item,"省份",_dict.get("province",""))
- # item["区域"] = "%s-%s-%s"%(_dict.get("province",""),_dict.get("city",""),_dict.get("district",""))
- set_dict_item_columns(set_columns,df_columns,item,"资金来源",_dict.get("moneysource",""))
- set_dict_item_columns(set_columns,df_columns,item,"招标方式",_dict.get("bidway",""))
- set_dict_item_columns(set_columns,df_columns,item,"服务期限",_dict.get("service_time",""))
- set_dict_item_columns(set_columns,df_columns,item,"城市",_dict.get("city",""))
- set_dict_item_columns(set_columns,df_columns,item,"区县",_dict.get("district",""))
- set_dict_item_columns(set_columns,df_columns,item,"发布时间",_dict.get("page_time",""))
- set_dict_item_columns(set_columns,df_columns,item,"创建时间",_dict.get("crtime",""))
- set_dict_item_columns(set_columns,df_columns,item,"行业一级分类",_dict.get("industry",""))
- set_dict_item_columns(set_columns,df_columns,item,"行业二级分类",_dict.get("info_type",""))
- # set_dict_item_columns(set_columns,df_columns,item,"uuid",_dict.get("uuid"))
- # set_dict_item_columns(set_columns,df_columns,item,"公告标题_refine",re.sub(r'工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _dict.get("doctitle","")))
- set_dict_item_columns(set_columns,df_columns,item,"公告类别",dict_channel.get(_dict.get("docchannel",""),""))
- set_dict_item_columns(set_columns,df_columns,item,"原网公告类别",dict_channel.get(_dict.get("original_docchannel",""),""))
- set_dict_item_columns(set_columns,df_columns,item,"status","正常" if _dict.get("status",201) <=300 else "去重")
- set_dict_item_columns(set_columns,df_columns,item,"detail_link",_dict.get("detail_link"))
- set_dict_item_columns(set_columns,df_columns,item,"web_source_no",_dict.get("web_source_no",""))
- set_dict_item_columns(set_columns,df_columns,item,"web_source_name",_dict.get("web_source_name",""))
- set_dict_item_columns(set_columns,df_columns,item,"项目名称",_dict.get("project_name",""))
- set_dict_item_columns(set_columns,df_columns,item,"项目编号",_dict.get("project_code",""))
- set_dict_item_columns(set_columns,df_columns,item,"招标单位",_dict.get("tenderee",""))
- set_dict_item_columns(set_columns,df_columns,item,"招标联系人",_dict.get("tenderee_contact",""))
- set_dict_item_columns(set_columns,df_columns,item,"招标联系人电话",_dict.get("tenderee_phone",""))
- set_dict_item_columns(set_columns,df_columns,item,"代理单位",_dict.get("agency",""))
- set_dict_item_columns(set_columns,df_columns,item,"代理联系人",_dict.get("agency_contact",""))
- set_dict_item_columns(set_columns,df_columns,item,"代理联系人电话",_dict.get("agency_phone",""))
- set_dict_item_columns(set_columns,df_columns,item,"url","http://www.bidizhaobiao.com/info-%d.html"%(_dict.get("docid","")))
- set_dict_item_columns(set_columns,df_columns,item,"比地招标公告地址","http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%_dict.get("docid")))))
- set_dict_item_columns(set_columns,df_columns,item,"截标时间",_dict.get("time_bidclose",""))
- set_dict_item_columns(set_columns,df_columns,item,"page_attachments",_dict.get("page_attachments","[]"))
- sub_docs_json = _dict.get("sub_docs_json")
- set_tenderer = set()
- if sub_docs_json is not None:
- docs = json.loads(sub_docs_json)
- docs.sort(key=lambda x:float(x.get("win_bid_price") or 0))  # win_bid_price may be "" or missing; coerce so mixed types cannot break the sort
- for _doc in docs:
- if "win_tenderer" in _doc:
- set_dict_item_columns(set_columns,df_columns,item,"中标单位",_doc["win_tenderer"])
- if "second_tenderer" in _doc:
- set_dict_item_columns(set_columns,df_columns,item,"第二候选单位",_doc["second_tenderer"])
- set_tenderer.add(_doc.get("second_tenderer"))
- if "third_tenderer" in _doc:
- set_dict_item_columns(set_columns,df_columns,item,"第三候选单位",_doc["third_tenderer"])
- set_tenderer.add(_doc.get("third_tenderer"))
- if "win_tenderee_manager" in _doc:
- set_dict_item_columns(set_columns,df_columns,item,"中标单位联系人",_doc["win_tenderee_manager"])
- if "win_tenderee_phone" in _doc:
- set_dict_item_columns(set_columns,df_columns,item,"中标单位联系电话",_doc["win_tenderee_phone"])
- if "win_bid_price" in _doc and float(0 if _doc["win_bid_price"]=="" else _doc["win_bid_price"])>0:
- set_dict_item_columns(set_columns,df_columns,item,"中标金额",_doc["win_bid_price"])
- if "bidding_budget" in _doc and float(0 if _doc["bidding_budget"]=="" else _doc["bidding_budget"])>0:
- set_dict_item_columns(set_columns,df_columns,item,"招标金额",_doc["bidding_budget"])
- set_dict_item_columns(set_columns,df_columns,item,"入围供应商",",".join(list(set_tenderer)))
- if "第二候选单位" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"第二候选单位","")
- if "第三候选单位" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"第三候选单位","")
- if "招标金额" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"招标金额","")
- if "中标金额" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"中标金额","")
- if "中标单位" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"中标单位","")
- if "中标单位联系人" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"中标单位联系人","")
- if "中标单位联系电话" not in item:
- set_dict_item_columns(set_columns,df_columns,item,"中标单位联系电话","")
- # if item["中标单位"] not in set_enter:
- # continue
- _line = "%s-%s-%s-%s-%s-%s"%(item["省份"],item["城市"],item["项目编号"],item["招标单位"],item["招标联系人"],str(item["招标金额"]))
- # if _line in set_line:
- # continue
- # if item["招标金额"]=="":
- # continue
- # set_line.add(_line)
- for k,v in item.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- df_data = {}
- set_columns = set()
- df_columns = []
- # for name in df["ent_name_real"]:
- # if isinstance(name,str) and name!="":
- # list_should_q = []
- # # list_should_q.append(MatchPhraseQuery("doctextcon",name))
- # # list_should_q.append(MatchPhraseQuery("attachmenttextcon",name))
- # NestedQuery("sub_docs_json","sub_docs_json.win_tenderer",name)
- # bool_query = BoolQuery(must_queries=[RangeQuery("page_time","2018-01-01"),
- # RangeQuery("status",201,301),
- # # BoolQuery(should_queries=list_should_q),
- # NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",name))
- # ])
- #
- # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- # SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True),
- # ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- #
- #
- # while True:
- # getData(df_data,rows,set(),"",set_columns,df_columns)
- # if not next_token:
- # break
- # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- # SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
- # ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- # if len(df_data.keys())>0:
- # print(len(df_data[list(df_data.keys())[0]]),total_count)
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True),
- ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- while True:
- getData(df_data,rows,set(),"",set_columns,df_columns)
- if not next_token:
- break
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
- ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- if len(df_data.keys())>0:
- print(len(df_data[list(df_data.keys())[0]]),total_count)
- # appendAttachmentPath(df_data,"page_attachments","附件链接")
- # df_columns.append("附件链接")
- # df_columns = ["docid","公告标题","公告类别","发布时间","公告内容","省份","城市","项目编号","招标单位","招标金额","资金来源","招标方式","代理单位","中标单位","中标金额","第二候选单位","第三候选单位","url","附件链接"]
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s_%s.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S"),page_time),columns=df_columns)
- def appendAttachmentPath(df_data,key,new_key):
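- # For each page_attachments JSON value under df_data[key], resolves the OSS
- # path by fileMd5 via the attachment table and stores signed download URLs in a
- # new column new_key.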
- list_data = []
- for _attach in df_data[key]:
- _dict = {key:_attach}
- list_data.append(_dict)
- task_queue = Queue()
- for _d in list_data:
- task_queue.put(_d)
- auth = oss2.Auth("LTAI5tFuoxHm8Uxrr5nT8wTZ", "Yp01bylJFx0al6teCaccY8hbtllBGg")
- bucket_url = "http://oss-cn-hangzhou.aliyuncs.com"
- attachment_bucket_name = "attachment-hub"
- bucket = oss2.Bucket(auth,bucket_url,attachment_bucket_name)
- ots_client = getConnect_ots()
- def search(ots_client,table_name,key_tuple,columns_to_get):
- try:
- # 调用get_row接口查询,最后一个参数值1表示只需要返回一个版本的值。
- consumed, return_row, next_token = ots_client.get_row(table_name, key_tuple, columns_to_get, None, 1)
- if return_row is not None:
- _dict = getRow_ots_primary(return_row)
- return _dict
- return None
- # 客户端异常,一般为参数错误或者网络异常。
- except OTSClientError as e:
- traceback.print_exc()
- log("get row failed, http_status:%d, error_message:%s" % (e.get_http_status(), e.get_error_message()))
- # 服务端异常,一般为参数错误或者流控错误。
- except OTSServiceError as e:
- traceback.print_exc()
- log("get row failed, http_status:%d, error_code:%s, error_message:%s, request_id:%s" % (e.get_http_status(), e.get_error_code(), e.get_error_message(), e.get_request_id()))
- def _handle(item,result_queue):
- page_attachments = json.loads(item.get(key,"[]"))
- list_url = []
- for _a in page_attachments:
- fileMd5 = _a.get("fileMd5")
- print("==",fileMd5)
- _s_dict = search(ots_client,"attachment",[("filemd5",fileMd5)],["path"])
- if _s_dict is not None:
- _path = _s_dict.get("path")
- if _path is not None:
- _url = bucket.sign_url("GET",_path,86500*5)
- list_url.append(_url)
- item[new_key] = json.dumps(list_url)
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- df_data[new_key] = []
- for _d in list_data:
- df_data[new_key].append(_d.get(new_key))
- def export_competition():
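- # For each top entity per province/industry, searches documents mentioning the
- # company and collects co-occurring win/second/third tenderers as its
- # competition set.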
- file = "select___from_province_indus_entity_top1.xlsx"
- df1 = pd.read_excel(file)
- ots_client = getConnect_ots()
- task_queue = queue.Queue()
- list_entity = []
- for province,industry,entitys in zip(df1["province"],df1["industry"],df1["entitys"]):
- l_e = json.loads(entitys)
- for l in l_e:
- list_entity.append({"province":province,
- "industry":industry,
- "win_tenderer":l.get("win_tenderee","")})
- for item in list_entity:
- task_queue.put(item)
- def _handle(item,result_queue):
- def getData(rows,_set):
- dict_rows = getRow_ots(rows)
- for _dict in dict_rows:
- sub_docs_json = _dict.get("sub_docs_json")
- if sub_docs_json is not None:
- for sub_docs in json.loads(sub_docs_json):
- if sub_docs.get("win_tenderer") is not None:
- _set.add(sub_docs.get("win_tenderer"))
- if sub_docs.get("second_tenderer") is not None:
- _set.add(sub_docs.get("second_tenderer"))
- if sub_docs.get("third_tenderer") is not None:
- _set.add(sub_docs.get("third_tenderer"))
- columns = ["sub_docs_json"]
- _company = item.get("win_tenderer")
- should_q = BoolQuery(should_queries=[NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",_company)),
- NestedQuery("sub_docs_json",TermQuery("sub_docs_json.second_tenderer",_company)),
- NestedQuery("sub_docs_json",TermQuery("sub_docs_json.third_tenderer",_company))])
- bool_query = BoolQuery(must_queries=[
- # should_q,
- MatchPhraseQuery("doctextcon",_company),
- RangeQuery("docchannel",101)])
- _set = set()
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True),
- ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- getData(rows,_set)
- _count = 0
- _page = 0
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),  # pass next_token, otherwise the same first page is fetched on every iteration
- ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- getData(rows,_set)
- _count += 1
- _page += 1
- if len(_set)>20 or _page>20:
- break
- if item["win_tenderer"] in _set:
- _set.remove(item["win_tenderer"])
- item["competition"] = ",".join(list(_set))
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- df_data = {}
- keys = ["province","industry","win_tenderer","competition"]
- for key in keys:
- if key not in df_data:
- df_data[key] = []
- for item in list_entity:
- for key in keys:
- df_data[key].append(item.get(key))
- df2 = pd.DataFrame(df_data)
- df2.to_excel("competition.xlsx",columns=keys)
- def document_dumplicate():
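- # Debugging stub: only prints the columns of an earlier export file.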
- df = pd.read_excel("../data/2022-01-19_214329_export11.xlsx")
- print(df.keys())
- def export_document_no_price():
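- # Joins the per-source no-price counts for 2020 and 2021 with source names from
- # Oracle and exports a year-over-year comparison sheet.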
- df = pd.read_csv("select___from_document_no_price_tmp.csv",encoding="gbk")
- conn_oracle = getConnection_oracle()
- cursor = conn_oracle.cursor()
- sql = " select source_encode,source_name from bxkc.T_WEBSOURCENUM_INFO"
- cursor.execute(sql)
- dict_source = {}
- while 1:
- rows = cursor.fetchmany(10000)
- if not rows:
- break
- for row in rows:
- dict_source[row[0]] = row[1]
- dict_source[row[0].split("(")[0]] = row[1]
- list_name = []
- set_web_source = set()
- for web_source_no in df["web_source_no"]:
- set_web_source.add(web_source_no)
- list_name.append(dict_source.get(web_source_no,""))
- dict_source_year = {}
- for web_source_no,year,counts_no_price,counts_all,rate in zip(df["web_source_no"],df["year"],df["counts_no_price"],df["counts_all"],df["rate"]):
- dict_source_year["%s&%s"%(web_source_no,year)] = {"counts_no_price":counts_no_price,"counts_all":counts_all,"rate":rate}
- new_data = {"web_source_no":[],
- "web_source_name":[],
- "counts_no_price":[],
- "counts_all":[],
- "rate":[],
- "counts_no_price1":[],
- "counts_all1":[],
- "rate1":[]}
- for web_source_no in list(set_web_source):
- new_data["web_source_no"].append(web_source_no)
- new_data["web_source_name"].append(dict_source.get(web_source_no,""))
- d_2020 = dict_source_year.get("%s&%s"%(web_source_no,"2020"),{})
- d_2021 = dict_source_year.get("%s&%s"%(web_source_no,"2021"),{})
- new_data["counts_no_price"].append(d_2020.get("counts_no_price"))
- new_data["counts_all"].append(d_2020.get("counts_all"))
- new_data["rate"].append(d_2020.get("rate"))
- new_data["counts_no_price1"].append(d_2021.get("counts_no_price"))
- new_data["counts_all1"].append( d_2021.get("counts_all"))
- new_data["rate1"].append(d_2021.get("rate"))
- # new_data = {"year":df["year"],
- # "web_source_no":df["web_source_no"],
- # "web_source_name":list_name,
- # "counts_no_price":df["counts_no_price"],
- # "counts_all":df["counts_all"],
- # "rate":df["rate"]}
- df2 = pd.DataFrame(new_data)
- df2.to_excel("websource_no_price1.xlsx",columns=["web_source_no","web_source_name","counts_no_price","counts_all","rate","counts_no_price1","counts_all1","rate1"])
- def exportDetailLink():
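- # Fills in detail_link per docid, falling back from the OTS document row to the
- # original Oracle table selected by original_docchannel.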
- df = pd.read_excel("招投标数据测试反馈表3.xlsx")
- list_item = []
- for docid in df["docid"]:
- list_item.append({"docid":docid})
- task_queue = queue.Queue()
- for item in list_item:
- task_queue.put(item)
- def _handle(item,result_queue,ots_client,pool_oracle):
- try:
- conn = pool_oracle.getConnector()
- docid = int(item["docid"])
- partitionkey = int(docid%500+1)
- consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",partitionkey),("docid",int(docid))],["original_docchannel","detail_link","uuid"])
- _dict = getRow_ots_primary(return_row)
- if _dict.get("detail_link") is not None and len(_dict.get("detail_link"))>0:
- item["detail_link"] = _dict.get("detail_link")
- else:
- original_docchannel = _dict.get("original_docchannel")
- _uuid = _dict.get("uuid")
- d_tablename = {"51":"T_GONG_GAO_BIAN_GENG",
- "52":"T_ZHAO_BIAO_GONG_GAO",
- "101":"T_ZHONG_BIAO_XIN_XI",
- "102":"T_ZHAO_BIAO_YU_GAO",
- "103":"T_ZHAO_BIAO_DA_YI",
- "104":"T_ZHAO_BIAO_WEN_JIAN",
- "114":"T_CAI_GOU_YI_XIANG"
- }
- _tablename = d_tablename.get(str(original_docchannel))
- if _tablename is not None:
- cursor = conn.cursor()
- sql = "select detail_link from bxkc.%s where id='%s'"%(_tablename,_uuid)
- print(sql)
- cursor.execute(sql)
- rows = cursor.fetchall()
- if len(rows)>0:
- item["detail_link"] = rows[0][0]
- cursor.close()
- except Exception as e:
- traceback.print_exc()
- finally:
- pool_oracle.putConnector(conn)
- ots_client = getConnect_ots()
- pool_oracle = ConnectorPool(10,30,getConnection_oracle)
- mt = MultiThreadHandler(task_queue,_handle,None,30,ots_client=ots_client,pool_oracle=pool_oracle)
- mt.run()
- df_data = {"docid":[],
- "detail_link":[]}
- for item in list_item:
- for k,v in df_data.items():
- v.append(item.get(k,""))
- df2 = pd.DataFrame(df_data)
- df2.to_excel("222.xlsx")
- def process_doc():
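- # Marks which docids of sheet 0 also appear in the process_docid column of
- # sheet 1 and saves the result with a new "check" column.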
- df = pd.read_excel("../data/2022-03-16_154617_数据导出.xlsx",1)
- list_check = []
- set_process_docid = set()
- for docid in df["process_docid"]:
- set_process_docid.add(docid)
- df = pd.read_excel("../data/2022-03-16_154617_数据导出.xlsx",0)
- for docid in df["docid"]:
- if docid in set_process_docid:
- list_check.append("1")
- else:
- list_check.append("0")
- df["check"] = list_check
- df.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')))
- def export_extract2():
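- # Pages through document_extract2 and exports the rows whose extract_json
- # contains "false".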
- ots_client = getConnect_ots()
- df_keys = ["docid","extract_json","status"]
- df_data = {}
- for _key in df_keys:
- df_data[_key] = []
- bool_query = BoolQuery(must_queries=[
- RangeQuery("status",1,1000,True,True)])
- rows, next_token, total_count, is_all_succeed = ots_client.search("document_extract2", "document_extract2_index",
- SearchQuery(bool_query ,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]), limit=100, get_total_count=True),
- ColumnsToGet(df_keys,return_type=ColumnReturnType.SPECIFIED))
- list_dict = getRow_ots(rows)
- for _dict in list_dict:
- if re.search("false",_dict.get("extract_json","")) is None:
- continue
- for k in df_keys:
- df_data[k].append(_dict.get(k))
- _count = len(list_dict)
- while next_token:
- rows, next_token, total_count, is_all_succeed = ots_client.search("document_extract2", "document_extract2_index",
- SearchQuery(bool_query ,next_token=next_token, limit=100, get_total_count=True),
- ColumnsToGet(df_keys,return_type=ColumnReturnType.SPECIFIED))
- list_dict = getRow_ots(rows)
- for _dict in list_dict:
- if re.search("false",_dict.get("extract_json","")) is None:
- continue
- for k in df_keys:
- df_data[k].append(_dict.get(k))
- _count += len(list_dict)
- print("%d/%d"%(_count,total_count))
- df = pd.DataFrame(df_data)
- df.to_excel("../data/%s_extract2.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")))
- def export_by_file():
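- # Filters a CSV export down to road/landscaping/transport products and tallies
- # the distinct winners and shortlisted suppliers.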
- df = pd.read_csv("../data/2022-04-01_121315_数据导出.csv",encoding="gbk")
- keys = df.keys()
- df_data = {}
- set_win = set()
- set_ruwei = set()
- for k in keys:
- df_data[k] = []
- for _i in range(len(df["产品"])):
- product = df["产品"][_i]
- if product is None or not isinstance(product,str):
- continue
- print(product)
- win_tenderer = df["中标单位"][_i]
- if win_tenderer is not None and isinstance(win_tenderer,str):
- set_win.add(win_tenderer)
- set_ruwei.add(win_tenderer)
- ruwei = df["入围供应商"][_i]
- if ruwei is not None and isinstance(ruwei,str):
- l_s = ruwei.split(",")
- for _s in l_s:
- set_ruwei.add(_s)
- if re.search("公路|道路|路基|路面|快速通道|高速|隧道|飞机跑道|桥梁|养护|路段|市政|照明工程|照明设施|亮灯|灯光改造|灯光工程|管道|架线|园林|景观|绿化|排水|河道整治|环境治理|交通|地铁|跌路|高铁|桥梁|大桥|桥段",product) is not None:
- for k in keys:
- df_data[k].append(df[k][_i])
- print("win count:%d ruwei:%d"%(len(set_win),len(set_ruwei)))
- # df1 = pd.DataFrame(df_data)
- # df1.to_excel("../data/%s_文件导出.xlsx"%(getCurrent_date("%Y-%m-%d_%H%M%S")),columns=keys)
- def export_dump():
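- # Deduplicates sheet 2 of the workbook against the phone numbers already
- # present in sheets 0 and 1.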
- import pandas as pd
- df = pd.read_excel("NotIn家具中标 去除注销企业 31410(3)(1)(1).xlsx",sheetname=0)
- _set_number = set()
- _set_number |= set(df["号码"])
- print(len(_set_number))
- df = pd.read_excel("NotIn家具中标 去除注销企业 31410(3)(1)(1).xlsx",sheetname=1)
- _set_number |= set(df["号码"])
- print(len(_set_number))
- df = pd.read_excel("NotIn家具中标 去除注销企业 31410(3)(1)(1).xlsx",sheetname=2)
- keys = df.keys()
- df_data = {}
- for k in keys:
- df_data[k] = []
- for _i in range(len(df[keys[0]])):
- if df["号码"][_i] not in _set_number:
- for k in keys:
- df_data[k].append(df[k][_i])
- _set_number.add(df["号码"][_i])
- df2 = pd.DataFrame(df_data)
- df2.to_excel("tmp222.xlsx")
- def check_data_synchronization():
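- # Parses uuids out of to_check.log and verifies that each one exists in
- # document_index, exporting a uuid/exists sheet.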
- filepath = "C:\\Users\\Administrator\\Desktop\\to_check.log"
- list_uuid = []
- _regrex = "ID='(?P<uuid>.+)'"
- with open(filepath,"r",encoding="utf8") as f:
- while 1:
- _line = f.readline()
- if not _line:
- break
- _match = re.search(_regrex,_line)
- if _match is not None:
- _uuid = _match.groupdict().get("uuid")
- if _uuid is not None:
- list_uuid.append(_uuid)
- print(len(list_uuid))
- task_queue = Queue()
- list_data = []
- for _uuid in list_uuid:
- _dict = {"uuid":_uuid}
- list_data.append(_dict)
- task_queue.put(_dict)
- ots_client = getConnect_ots()
- def _handle(_item,result_queue):
- bool_query = BoolQuery(must_queries=[TermQuery("uuid",_item.get("uuid"))])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,get_total_count=True),
- columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
- _item["exists"] = total_count
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- df_data = {"uuid":[],
- "exists":[]}
- for _data in list_data:
- for k,v in df_data.items():
- v.append(_data.get(k))
- import pandas as pd
- df2 = pd.DataFrame(df_data)
- df2.to_excel("check.xlsx")
- def group_xlsx():
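- # Keeps only docids whose keywords hit both the vendor list and the track list,
- # rewriting the keyword column as "vendor+track".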
- filename = "厂商&赛道列表.xlsx"
- df0 = pd.read_excel(filename,0)
- df1 = pd.read_excel(filename,1)
- df2 = pd.read_excel(filename,2)
- set_1 = set(df0["中国厂商"]) | set(df1["国际厂商"])
- set_2 = set(df2["a"]) | set(df2["b"]) | set(df2["c"])
- filename = "../data/2022-05-24_185801_数据导出.xlsx"
- df = pd.read_excel(filename)
- dict_docid = {}
- for docid,keyword in zip(df["docid"],df["关键词"]):
- if docid not in dict_docid:
- dict_docid[docid] = [[],[]]
- if keyword in set_1:
- dict_docid[docid][0].append(keyword)
- else:
- dict_docid[docid][1].append(keyword)
- set_docid = set()
- for k,v in dict_docid.items():
- if len(v[0])>=1 and len(v[1])>=1:
- set_docid.add(k)
- keys = df.keys()
- print(keys)
- df_data = {}
- for i in range(len(df["docid"])):
- print(i)
- docid = df["docid"][i]
- if docid in set_docid:
- for k in keys:
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(df[k][i])
- df_data["关键词"][-1] = str(dict_docid[docid][0][0])+"+"+str(dict_docid[docid][1][0])
- df1 = pd.DataFrame(df_data)
- df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=keys)
- def static_process_time():
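- # Measures crtime->opertime processing latency for documents of 2022-05-26,
- # overall and split by whether page_attachments is empty.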
- ots_client = getConnect_ots()
- bool_query = BoolQuery(must_queries=[
- RangeQuery("crtime","2022-05-26","2022-05-27"),
- TermQuery("page_time","2022-05-26")
- ])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),get_total_count=True,limit=100),
- ColumnsToGet(column_names=["crtime","opertime","publishtime","page_attachments"],return_type=ColumnReturnType.SPECIFIED))
- list_data = []
- _l = getRow_ots(rows)
- list_data.extend(_l)
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
- SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
- ColumnsToGet(column_names=["crtime","opertime","publishtime","page_attachments"],return_type=ColumnReturnType.SPECIFIED))
- _l = getRow_ots(rows)
- list_data.extend(_l)
- print("%d/%d"%(len(list_data),total_count))
- list_dis = []
- list_dis_a = []
- list_dis_n_a = []
- for _data in list_data:
- crtime = _data.get("crtime")
- opertime = _data.get("opertime")
- page_attachments = _data.get("page_attachments","[]")
- _d = time.mktime(time.strptime(opertime,"%Y-%m-%d %H:%M:%S"))-time.mktime(time.strptime(crtime,"%Y-%m-%d %H:%M:%S"))
- list_dis.append(_d)
- if page_attachments=="[]":
- list_dis_n_a.append(_d)
- else:
- list_dis_a.append(_d)
- print("avg_time:",sum(list_dis)/len(list_dis),max(list_dis),min(list_dis))
- print("avg_time:",sum(list_dis_a)/len(list_dis_a),max(list_dis_a),min(list_dis_a))
- print("avg_time:",sum(list_dis_n_a)/len(list_dis_n_a),max(list_dis_n_a),min(list_dis_n_a))
- def export_dump_by_id():
- filename = "遗漏待验证1.csv"
- df = pd.read_csv(filename)
- list_k = []
- ots_client = getConnect_ots()
- for _main_url,_other_url in zip(df["_c0"],df["_c1"]):
- _d = {}
- main_docid = re.split("[-.]",_main_url)[3]
- l_other = []
- for _l in _other_url.split(","):
- _docid = re.split("[-.]",_l)[3]
- l_other.append(_docid)
- _d["main_docid"] = main_docid
- _d["other_docid"] = l_other
- list_k.append(_d)
- task_queue = Queue()
- for _q in list_k:
- task_queue.put(_q)
- def _handle(item,result_queue):
- columns = ["doctitle","docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","web_source_no","web_source_name","service_time","page_attachments"]
- main_docid = item["main_docid"]
- other_docid = item["other_docid"]
- list_should_q = []
- list_should_q.append(TermQuery("docid",main_docid))
- for _d in other_docid:
- list_should_q.append(TermQuery("docid",_d))
- _query = BoolQuery(should_queries=list_should_q)
- l_rows = []
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- dict_row = getRow_ots(rows)
- l_rows.extend(dict_row)
- log("total count:%d"%total_count)
- _count = len(dict_row)
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(_query,next_token=next_token,limit=100,get_total_count=True),
- ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
- dict_row = getRow_ots(rows)
- l_rows.extend(dict_row)
- _count += len(dict_row)
- item["data"] = l_rows
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- df_data = {"main_docid":[]}
- set_line = set()
- dict_channel = getDict_docchannel()
- for _d in list_k:
- list_row = _d.get("data")
- if list_row is not None:
- main_docid = _d.get("main_docid")
- getRowData(df_data,list_row,set_line,[],dict_channel,True)
- for _ in list_row:
- df_data["main_docid"].append(main_docid)
- df1 = pd.DataFrame(df_data)
- list_df_columns1 = ["main_docid"].extend(list_df_columns)
- df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns1)
- def count_product():
- filename = "../data/2022-06-24_152201_数据导出.xlsx"
- df = pd.read_excel(filename)
- # _product = df["产品"]
- # dict_p_c = {}
- # for _p in _product:
- # if isinstance(_p,str) and _p!="":
- # l_p = _p.split(",")
- # for _p1 in l_p:
- # if _p1 not in dict_p_c:
- # dict_p_c[_p1] = 0
- # dict_p_c[_p1] += 1
- # df_data = {"产品":[],
- # "次数":[]}
- # for k,v in dict_p_c.items():
- # df_data["产品"].append(k)
- # df_data["次数"].append(v)
- # df1 = pd.DataFrame(df_data)
- # df1.to_excel("222.xlsx")
- keys = df.keys()
- df_data = {}
- for k in keys:
- df_data[k] = []
- product_pattern = "电脑|台式机|电脑|主机|网络|软件|开发|通信|系统|信息技术"
- df1 = pd.read_excel("222.xlsx")
- list_p = []
- for _p,_n in zip(df1["产品"],df1["need"]):
- if _n==1:
- list_p.append(_p)
- product_pattern = product_pattern+"|"+"|".join(list_p)
- _product = df["产品"]
- for _i in range(len(_product)):
- if re.search(product_pattern,str(_product[_i])) is not None:
- for k in keys:
- df_data[k].append(df[k][_i])
- df2 = pd.DataFrame(df_data)
- df2.to_excel("333.xlsx",columns=keys)
- from dataSource.source import getConnect_capacity
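- # exportHonors_item_info: pages through the honors_item_info index for records whose hjdw field
- # contains the target company and flattens the selected columns into honor_export.xlsx.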
- def exportHonors_item_info():
- ots_capacity = getConnect_capacity()
- bool_query = BoolQuery(should_queries=[
- # TermQuery("ryjx","海河杯"),
- WildcardQuery("hjdw","*合肥建工集团有限公司*")
- ])
- rows,next_token,total_count,is_all_succeed = ots_capacity.search("honors_item_info","honors_item_info_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("create_time")]),get_total_count=True,limit=100),
- columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
- list_data = getRow_ots(rows)
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_capacity.search("honors_item_info","honors_item_info_index",
- SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
- columns_to_get=ColumnsToGet(return_type=ColumnReturnType.ALL))
- list_data.extend(getRow_ots(rows))
- df_data = {}
- set_columns1 = set()
- list_df_columns1 = []
- for _data in list_data:
- _dict = {}
- for _col in ["record_id","bfdw","create_time","cs","detail_link","fbsj","hjdw","hjdwjs","hjxm","jxjb","jxlx","ryjx","sf","xmfzr","zgdw","zxj"]:
- set_dict_item_columns(set_columns1,list_df_columns1,_dict,_col,_data.get(_col))
- for k,v in _dict.items():
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(v)
- df = pd.DataFrame(df_data)
- df.to_excel("honor_export.xlsx",columns=list_df_columns1)
- def check_dump_data():
- ots_client = getConnect_ots()
- bool_query = BoolQuery(must_queries=[
- generateBoolShouldQuery(["docchannel"],[52,102,114],TermQuery),
- RangeQuery("crtime","2022-09-07 20:00:00","2022-09-08 06:00:00"),
- RangeQuery("page_time","2022-09-07","2022-09-08"),
- RangeQuery("status",201,301)
- ])
- list_data = getDocument([{"query":bool_query}],["docid"],table_name="document",table_index="document_index")
- bool_query1 = BoolQuery(must_queries=[
- generateBoolShouldQuery(["docchannel"],[52,102,114],TermQuery),
- RangeQuery("crtime","2022-09-07 20:00:00","2022-09-08 06:00:00"),
- RangeQuery("page_time","2022-09-07","2022-09-08"),
- RangeQuery("status",81,100),
- TermQuery("save",1)
- ])
- list_data1 = getDocument([{"query":bool_query1}],["docid"],table_name="document_tmp",table_index="document_tmp_index")
- set_docid = set()
- set_docid_tmp = set()
- for _data in list_data:
- set_docid.add(_data.get("docid"))
- for _data in list_data1:
- set_docid_tmp.add(_data.get("docid"))
- print("document - tmp",set_docid-set_docid_tmp)
- print("tmp - document",set_docid_tmp-set_docid)
- def search_title_count():
- filename = "数据样例.xlsx"
- df = pd.read_excel(filename)
- list_title_dict = []
- for _title in df["标题"]:
- _dict = {"标题":_title}
- list_title_dict.append(_dict)
- task_queue = Queue()
- for _d in list_title_dict:
- task_queue.put(_d)
- ots_client = getConnect_ots()
- def _handle(item,result_queue):
- columns = ["status","tenderee","agency","sub_docs_json"]
- _title = item.get("标题","")
- if _title!="":
- bool_query = BoolQuery(must_queries=[MatchPhraseQuery("doctitle",_title)])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,get_total_count=True,limit=10),
- columns_to_get=ColumnsToGet(columns,ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- item["比地数量"] = total_count
- if len(list_data)>0:
- _str_docid = ",".join([str(a.get("docid")) for a in list_data])
- item["比地_docid"] = _str_docid
- tenderee = list_data[0].get("tenderee")
- item["比地_招标人"] = tenderee
- agency = list_data[0].get("agency")
- item["比地_代理人"] = agency
- sub_docs_json = list_data[0].get("sub_docs_json")
- if sub_docs_json is not None:
- sub_docs = json.loads(sub_docs_json)
- win_tenderer = ""
- win_bid_price = ""
- for _doc in sub_docs:
- if _doc.get("win_tenderer","")!="":
- win_tenderer = _doc.get("win_tenderer")
- win_bid_price = _doc.get("win_bid_price")
- item["比地_中标人"] = win_tenderer
- item["比地_中标金额"] = win_bid_price
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- df_data = {}
- keys = ["标题","比地数量","比地_docid","比地_招标人","比地_代理人","比地_中标人","比地_中标金额"]
- for _d in list_title_dict:
- for k in keys:
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(_d.get(k,""))
- df1 = pd.DataFrame(df_data)
- df1.to_excel("比地对比数据.xlsx",columns=keys)
- def getDumplicate_docid():
- filename = "2022-11-02_154222_数据导出.xlsx"
- df = pd.read_excel(filename)
- list_docid = df["docid"]
- task_queue = Queue()
- list_d = []
- for _docid in list_docid:
- _dict = {"docid":_docid}
- list_d.append(_dict)
- task_queue.put(_dict)
- ots_client = getConnect_ots()
- def _handle(item,result_queue):
- _docid = item.get("docid")
- bool_query = BoolQuery(must_queries=[TermQuery("docid",int(_docid))])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
- SearchQuery(bool_query),
- columns_to_get=ColumnsToGet(["save"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- if len(list_data)>0:
- _save = list_data[0].get("save")
- item["save"] = _save
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- list_d_docid = []
- for _data in list_d:
- docid = _data.get("docid")
- save = _data.get("save")
- if save==0:
- list_d_docid.append(str(docid))
- print(",".join(list_d_docid))
- def getDocumentHtml():
- filename = "../data/2023-02-20_154118_数据导出.xlsx"
- df = pd.read_excel(filename)
- ots_client = getConnect_capacity()
- list_html_data = []
- _count = 0
- for docid in df["docid"][:10000]:
- partitionkey = int(docid)%500+1
- docid = int(docid)
- try:
- consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",partitionkey),("docid",docid)],["dochtmlcon"])
- _dict = getRow_ots_primary(return_row)
- list_html_data.append(_dict)
- _count += 1
- print("%d/%d"%(_count,len(df["docid"])))
- except Exception as e:
- pass
- save(list_html_data,"list_html_data.pk")
- def exportAgencyCount():
- filename = "广州招标协会.xlsx"
- df = pd.read_excel(filename)
- a = df["a"]
- df_data = {}
- set_c = set()
- for line in a:
- list_c = line.split(" ")
- for _i in range(len(list_c)):
- _key = "c_%s"%(str(_i).rjust(2,'0'))
- if _key not in df_data:
- df_data[_key] = []
- set_c.add(_key)
- df_data[_key].append(list_c[_i])
- list_data = []
- list_query = []
- ots_client = getConnect_ots()
- for _agency in df_data["c_00"]:
- query = BoolQuery(must_queries=[TermQuery("city","广州"),
- TermQuery("docchannel",52),
- RangeQuery("status",201,301),
- RangeQuery("page_time","2022-01-01","2023-01-01"),
- TermQuery("agency",_agency),
- BoolQuery(should_queries=[
- BoolQuery(should_queries=[MatchPhraseQuery("doctitle","工程施工"),
- MatchPhraseQuery("doctextcon","建造师"),
- MatchPhraseQuery("attachmenttextcon","建造师")]),
- BoolQuery(should_queries=[MatchPhraseQuery("doctitle","监理"),
- MatchPhraseQuery("doctextcon","监理工程师"),
- MatchPhraseQuery("attachmenttextcon","监理工程师")]),
- BoolQuery(should_queries=[MatchPhraseQuery("doctitle","造价咨询"),
- MatchPhraseQuery("doctitle","预算"),
- MatchPhraseQuery("doctitle","造价审核"),
- MatchPhraseQuery("doctitle","结算"),
- MatchPhraseQuery("doctitle","概算")]),
- ])
- ],
- must_not_queries=[generateBoolShouldQuery(["doctitle","doctextcon","attachmenttextcon"],["广州公共资源交易中心"],MatchPhraseQuery)])
- # list_row = getDocument([{"query":query}],["agency","page_time","sub_docs_json"],thread_count=1)
- list_query.append({"query":query})
- df_data = {}
- set_line = set()
- columns = ["doctitle","doctextcon","attachmenttextcon","docchannel","original_docchannel","product","province","bidway","city","district","page_time","industry","info_type","tenderee","project_code","project_name","sub_docs_json","tenderee_contact","tenderee_phone","agency","agency_contact","agency_phone","uuid","time_bidclose","time_bidopen","web_source_no","web_source_name","service_time","person_review","time_get_file_start","time_get_file_end","time_earnest_money_start","time_earnest_money_end"]
- list_row = getDocument(list_query,columns,thread_count=30)
- # list_row = filterRow(list_row,"product",list_not_key)
- log("get document %d rows"%len(list_row))
- # getRowDataWithKey(df_data,list_row,columns)
- dict_channel = getDict_docchannel()
- getRowData(df_data,list_row,set_line,[''],dict_channel,True)
- # getRowData_sp1(df_data,list_row,set_line,list_keyword,dict_sptype,True)
- # fixContactPerson(df_data,list_df_columns,get_legal_person=False)
- df1 = pd.DataFrame(df_data)
- df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=list_df_columns)
- return
- # _dict = {"一季度预算":0,"一季度总数":0,"一季度有金额占比":0,
- # "二季度预算":0,"二季度总数":0,"二季度有金额占比":0,
- # "三季度预算":0,"三季度总数":0,"三季度有金额占比":0,
- # "四季度预算":0,"四季度总数":0,"四季度有金额占比":0,
- # }
- # print(_agency,len(list_row))
- #
- #
- # for _row in list_row:
- # print(_row.get("docid"))
- # page_time = _row.get("page_time","")
- # sub_docs_json = _row.get("sub_docs_json","")
- # _t = None
- # if page_time!="":
- # if page_time<="2022-03-31":
- # _t = "一季度"
- # elif page_time<="2022-06-31":
- # _t = "二季度"
- # elif page_time<="2022-09-31":
- # _t = "三季度"
- # elif page_time<="2022-12-31":
- # _t = "四季度"
- # if sub_docs_json != "":
- # sub_docs = json.loads(sub_docs_json)
- # for _doc in sub_docs:
- # bidding_budget = _doc.get("bidding_budget",0)
- # _dict["%s预算"%_t] += bidding_budget
- # _dict["%s总数"%_t] += 1
- # if bidding_budget>0:
- # _dict["%s有金额占比"%_t] += 1
- # print(_dict)
- # _sum = 0
- # _sum_n = 0
- # _sum_hm = 0
- # for k in ["一季度","二季度","三季度","四季度"]:
- # km = "%s预算"%k
- # kn = "%s总数"%k
- # khm = "%s有金额占比"%k
- # _sum += _dict[km]
- # _sum_n += _dict[kn]
- # _sum_hm += _dict[khm]
- # _dict["全年预算"] = _sum
- # _dict["全年总数"] = _sum_n
- # _dict["全年有金额占比"] = _sum_hm
- # for k in ["一季度","二季度","三季度","四季度","全年"]:
- # km = "%s预算"%k
- # kn = "%s总数"%k
- # khm = "%s有金额占比"%k
- # _dict[khm] = _dict[khm]/_dict[kn] if _dict[kn]>0 else 0
- # for k,v in _dict.items():
- # if k not in df_data:
- # df_data[k] = []
- # df_data[k].append(v)
- #
- # list_c = list(set_c)
- # list_c.sort(key=lambda x:x)
- # for k,v in df_data.items():
- # print(k,len(v))
- # df1 = pd.DataFrame(df_data)
- # list_c.append("一季度预算")
- # list_c.append("一季度总数")
- # list_c.append("一季度有金额占比")
- # list_c.append("二季度预算")
- # list_c.append("二季度总数")
- # list_c.append("二季度有金额占比")
- # list_c.append("三季度预算")
- # list_c.append("三季度总数")
- # list_c.append("三季度有金额占比")
- # list_c.append("四季度预算")
- # list_c.append("四季度总数")
- # list_c.append("四季度有金额占比")
- # list_c.append("全年预算")
- # list_c.append("全年总数")
- # list_c.append("全年有金额占比")
- # df1.to_excel("%s_1.xlsx"%(filename),columns=list_c)
- def attachAttachment():
- filename = "北京电信ICT样例(2023一季度)v1.1(2).xlsx"
- df = pd.read_excel(filename,1)
- list_data = []
- task_queue = Queue()
- for _docid in df["docid"]:
- _d = {"docid":_docid}
- list_data.append(_d)
- task_queue.put(_d)
- print("len_docid",len(df["docid"]),len(list_data))
- capacity = getConnect_capacity()
- def _handle(item,result_queue):
- docid = item["docid"]
- consumed, return_row, next_token = capacity.get_row("document",[("partitionkey",int(docid)%500+1),("docid",int(docid))],["dochtmlcon"])
- _d = getRow_ots_primary(return_row)
- _dochtmlcon = _d["dochtmlcon"]
- _dochtmlcon = re.sub("<html>|</html>|<body>|</body>","",_dochtmlcon)
- _soup = BeautifulSoup(_dochtmlcon,"lxml")
- _div = _soup.find("div",attrs={"class":"richTextFetch"})
- if _div is None:
- _div = ""
- item["attachment"] = _div
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- list_attachment = []
- for _d in list_data:
- list_attachment.append(getLegal_str(_d.get("attachment","")))
- df_data = {}
- df_data["附件html"] = list_attachment
- df_1 = pd.DataFrame(df_data)
- df_1.to_excel("附加html_"+filename)
- def compareData():
- filename = "D:\\BaiduNetdiskDownload\\bidi_check.csv"
- list_data = []
- with open(filename,"r",encoding="utf8") as f:
- list_lines = f.readlines()
- for _line in list_lines:
- docid,docchannel,win_tenderer,tenderee,win_bid_price,bidding_budget = [None if a[:2]=='\\N' else a for a in _line.split("\t")]
- _d = {"docid":int(docid),
- "docchannel":docchannel,
- "win_tenderer":win_tenderer,
- "tenderee":tenderee,
- "win_bid_price":float(win_bid_price) if win_bid_price is not None else None,
- "bidding_budget":float(bidding_budget) if bidding_budget is not None else None}
- list_data.append(_d)
- del list_lines
- # for _i in range(len(list_data)):
- # print(list_lines[_i])
- # print(list_data[_i])
- ots_client = getConnect_ots()
- task_queue = Queue()
- for _d in list_data:
- task_queue.put(_d)
- def _handle(item,result_queue):
- docid = item.get("docid")
- win_tenderer = item.get("win_tenderer")
- win_bid_price = item.get("win_bid_price")
- tenderee = item.get("tenderee")
- bidding_budget = item.get("bidding_budget")
- must_q = [TermQuery("docid",int(docid))]
- if tenderee is not None:
- must_q.append(TermQuery("tenderee",tenderee))
- if win_tenderer is not None:
- must_q.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_tenderer",win_tenderer)))
- if win_bid_price is not None:
- must_q.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.win_bid_price",win_bid_price)))
- if bidding_budget is not None:
- must_q.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.bidding_budget",bidding_budget)))
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(BoolQuery(must_queries=must_q),get_total_count=True),
- columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
- item["total_count"] = total_count
- if total_count==0:
- print("docid %d total_count is 0",docid)
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(BoolQuery(must_queries=[TermQuery("docid",docid)]),get_total_count=True),
- columns_to_get=ColumnsToGet(["tenderee","sub_docs_json"],return_type=ColumnReturnType.SPECIFIED))
- l_d = getRow_ots(rows)
- item["return_row"] = l_d
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- list_new_data = []
- for data in list_data:
- if data.get("total_count")==0:
- new_d = {"docid":data.get("docid"),
- "docchannel":data.get("docchannel")}
- return_row = data.get("return_row")
- if len(return_row)>0:
- # print(return_row)
- _row = return_row[0]
- tenderee = _row.get("tenderee")
- sub_docs_json = _row.get("sub_docs_json")
- bidding_budget = None
- win_tenderer = None
- win_bid_price = None
- if sub_docs_json is not None:
- sub_docs = json.loads(sub_docs_json)
- for _doc in sub_docs:
- if _doc.get("bidding_budget") is not None:
- bidding_budget = _doc.get("bidding_budget")
- if _doc.get("win_tenderer") is not None:
- win_tenderer = _doc.get("win_tenderer")
- win_bid_price = _doc.get("win_bid_price")
- new_d["tenderee"] = tenderee
- new_d["bidding_budget"] = bidding_budget
- new_d["win_tenderer"] = win_tenderer
- new_d["win_bid_price"] = win_bid_price
- list_new_data.append(new_d)
- df_data_c = ["docid","docchannel","win_tenderer","tenderee","win_bid_price","bidding_budget"]
- df_data = {}
- for c in df_data_c:
- df_data[c] = []
- for _d in list_new_data:
- for c in df_data_c:
- df_data[c].append(_d.get(c))
- df = pd.DataFrame(df_data)
- df.to_csv("bid_check_result.csv",columns=df_data_c)
- def exportProducts():
- filename = "货物关键词.xlsx"
- dict_channel = getDict_docchannel()
- df = pd.read_excel(filename)
- list_products = df["货物关键词"]
- list_q = []
- list_result = []
- ots_client = getConnect_ots()
- columns = ["产品","总数","匹配模式"]
- _index = 0
- task_queue = Queue()
- for _product in list_products:
- _index += 1
- print(_product,"%d/%d"%(_index,len(list_products)))
- bool_query = BoolQuery(must_queries=[NestedQuery("products",TermQuery("products.product",_product)),
- # RangeQuery("page_time","2021-01-01"),
- RangeQuery("status",201,301),
- TermQuery("docchannel",101)])
- _q = {"query":bool_query,"product":_product,"匹配模式":"精准"}
- task_queue.put(_q)
- bool_query = BoolQuery(must_queries=[NestedQuery("products",WildcardQuery("products.product","*%s*"%_product)),
- # RangeQuery("page_time","2021-01-01"),
- RangeQuery("status",201,301),
- TermQuery("docchannel",101)])
- _q = {"query":bool_query,"product":_product,"匹配模式":"包括"}
- task_queue.put(_q)
- def _handle(item,result_queue):
- bool_query = item["query"]
- _product = item["product"]
- _type = item["匹配模式"]
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,get_total_count=True),
- columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
- list_result.append({"产品":_product,"总数":total_count,"匹配模式":_type})
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- print("done result length:%d"%(len(list_result)))
- df_data = {}
- for _d in list_result:
- for c in columns:
- if c not in df_data:
- df_data[c] = []
- df_data[c].append(_d.get(c))
- df1 = pd.DataFrame(df_data)
- df1.to_excel("../data/%s_数据导出.xlsx"%(getCurrent_date('%Y-%m-%d_%H%M%S')),columns=columns)
- def statics_attachment_counts():
- bool_query = BoolQuery(must_queries=[
- generateBoolShouldQuery(["industry"],["土木工程建筑业","建筑装饰和其他建筑业","房屋建筑业","专业施工","修缮工程","建筑安装业"],TermQuery),
- RangeQuery("page_time","2023-08-07","2023-08-14"),
- RangeQuery("status",201,301),
- NestedQuery("page_attachments",ExistsQuery("page_attachments.fileMd5"))
- ])
- ots_client = getConnect_ots()
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
- ColumnsToGet(["industry","docchannel","page_attachments"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
- ColumnsToGet(["industry","docchannel","page_attachments"],return_type=ColumnReturnType.SPECIFIED))
- list_data.extend(getRow_ots(rows))
- # if len(list_data)>1000:
- # break
- task_queue = Queue()
- for _data in list_data:
- task_queue.put(_data)
- def _handle(item,result_queue):
- page_attachments = item.get("page_attachments")
- _size = 0
- if page_attachments is not None and page_attachments!="":
- list_attach = json.loads(page_attachments)
- for _attach in list_attach:
- _md5 = _attach.get("fileMd5")
- if _md5 is not None:
- consumed, return_row, next_token = ots_client.get_row("attachment",[("filemd5",_md5)],["size"])
- _d = getRow_ots_primary(return_row)
- _size += _d.get("size",0)
- item["size"] = _size
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- dict_result = {}
- for data in list_data:
- industry = data.get("industry")
- docchannel = data.get("docchannel")
- _type = ""
- if docchannel==52:
- _type = "招标"
- elif docchannel in (101,118,119,120):
- _type = "中标"
- else:
- _type = "其他"
- _key = "%s-%s"%(industry,_type)
- if _key not in dict_result:
- dict_result[_key] = 0
- dict_result[_key] += data.get("size",0)
- print(dict_result)
- for k,v in dict_result.items():
- print(k,"%.2fM"%(v/7/1024/1024))
- def static_dump():
- import pandas as pd
- filename = "select___from_bxkc_bxkc_delete_document_.csv"
- df = pd.read_csv(filename)
- print(df.keys())
- list_docid = df["docid"]
- list_dup_docid = df["dup_docid"]
- list_operate_time = df["operate_time"]
- list_a = []
- for docid,dup_docid in zip(list_docid,list_dup_docid):
- docid = int(docid)
- _flag = False
- if isinstance(dup_docid,str) and dup_docid!="": # isinstance already rules out None
- _l = dup_docid.split(",")
- for _i in _l:
- if _i.strip()!="":
- docid1 = int(_i)
- if docid1>docid:
- _flag = True
- break
- if _flag:
- list_a.append("是")
- else:
- list_a.append("否")
- df_data = {"被去重docid":list_docid,
- "重复id":list_dup_docid,
- "是否展示后删除":list_a}
- df1 = pd.DataFrame(df_data)
- df1.to_csv("16号去重统计.csv")
- def append_title():
- import pandas as pd
- filename = "去重记录.xlsx"
- df = pd.read_excel(filename)
- list_docid = df["被去重id"]
- list_keep_id = df["保留id"]
- list_data = []
- task_queue = Queue()
- for _docid,keep_docid in zip(list_docid,list_keep_id):
- _d = {"dup_docid":int(_docid),
- "keep_docid":int(keep_docid)}
- list_data.append(_d)
- task_queue.put(_d)
- ots_client = getConnect_ots()
- def _handle(item,result_queue):
- dup_docid = item.get("dup_docid")
- keep_docid = item.get("keep_docid")
- dup_partitionkey = dup_docid%500+1
- keep_partitionkey = keep_docid%500+1
- consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",dup_partitionkey),("docid",dup_docid)],["status","doctitle","extract_count"])
- _d = getRow_ots_primary(return_row)
- if _d is not None:
- doctitle = _d.get("doctitle")
- item["dup_title"] = doctitle
- extract_count = _d.get("extract_count")
- item["dup_extract_count"] = extract_count
- consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",keep_partitionkey),("docid",keep_docid)],["status","doctitle","extract_count","extract_count"])
- _d = getRow_ots_primary(return_row)
- if _d is not None:
- doctitle = _d.get("doctitle")
- item["keep_title"] = doctitle
- status = _d.get("status")
- extract_count = _d.get("extract_count")
- item["keep_extract_count"] = extract_count
- if status>=201 and status<=300:
- item["保留id状态"] = "正常"
- elif status>=401:
- item["保留id状态"] = "去重"
- else:
- item["保留id状态"] = ""
- mt = MultiThreadHandler(task_queue,_handle,None,30) # 30 workers, matching the rest of the file
- mt.run()
- keys = ["dup_docid","keep_docid","dup_title","keep_title","保留id状态","dup_extract_count","keep_extract_count"]
- df_data = {}
- for data in list_data:
- for k in keys:
- if k not in df_data:
- df_data[k] = []
- df_data[k].append(data.get(k))
- df1 = pd.DataFrame(df_data)
- df1.to_excel("%s.xlsx"%(filename),columns=keys)
- def get_follows():
- _json = '''
- [
- ]
- '''
- ots_client = getConnect_ots()
- list_follows = json.loads(_json)
- new_list = []
- for follow in list_follows:
- docid = follow.get("docid")
- partitionkey = docid%500+1
- consumed, return_row, next_token = ots_client.get_row("document",[("partitionkey",partitionkey),("docid",docid)],["tenderee"])
- _d = getRow_ots_primary(return_row)
- print("docid",_d.get("tenderee"))
- if _d.get("tenderee")=="泗阳意杨产业科技园实业有限公司":
- new_list.append(follow)
- print(json.dumps(new_list,ensure_ascii=False))
- def validateTitle(title):
- rstr = r"[\/\\\:\*\?\"\<\>\|\r\n]" # '/ \ : * ? " < > |'
- new_title = re.sub(rstr, "_", title) # 替换为下划线
- return new_title
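- # exportParameters: for products with parameter_status==1 and a single bid file, copies the
- # downloaded attachment into save_dir alongside an html file holding the extracted parameter
- # text; stops after 2000 writes.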
- def exportParameters():
- from glob import glob
- attach_path = "F:/Workspace2016/BaseDataMaintenance/BaseDataMaintenance/maintenance/product/download"
- ots_client = getConnect_ots()
- bool_query = BoolQuery(must_queries=[TermQuery("parameter_status",1)])
- save_dir = "product"
- if not os.path.exists(save_dir):
- os.mkdir(save_dir)
- rows,next_token,total_count,is_all_succeed = ots_client.search("document_product2","document_product2_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("parameter_status")]),limit=100,get_total_count=True),
- ColumnsToGet(["parameter","bid_filemd5s","name","original_name"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- write_count = 0
- for _data in list_data:
- bid_filemd5s = _data["bid_filemd5s"]
- parameter = _data["parameter"]
- name = _data["name"]
- original_name = _data["original_name"]
- list_md5s = bid_filemd5s.split(",")
- if len(list_md5s)==1:
- list_path = glob(os.path.join(attach_path,bid_filemd5s)+"*")
- for _path in list_path:
- if not _path.endswith(".html"):
- filename = _path.split("\\")[-1]
- with open(os.path.join(save_dir,filename),"wb") as f:
- f.write(open(_path,"rb").read())
- pname = "%s_name%s_original_name%s.html"%(bid_filemd5s,name,original_name[:10])
- pname = validateTitle(pname)
- with open(os.path.join(save_dir,pname),"w",encoding="utf8") as f:
- f.write(parameter)
- write_count += 1
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_client.search("document_product2","document_product2_index",
- SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
- ColumnsToGet(["parameter","bid_filemd5s","name","original_name"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- for _data in list_data:
- bid_filemd5s = _data["bid_filemd5s"]
- parameter = _data["parameter"]
- name = _data["name"]
- original_name = _data["original_name"]
- list_md5s = bid_filemd5s.split(",")
- if len(list_md5s)==1:
- list_path = glob(os.path.join(attach_path,bid_filemd5s)+"*")
- for _path in list_path:
- if not _path.endswith(".html"):
- filename = _path.split("\\")[-1]
- with open(os.path.join(save_dir,filename),"wb") as f:
- f.write(open(_path,"rb").read())
- pname = "%s_name%s_original_name%s.html"%(bid_filemd5s,name,original_name[:10])
- pname = validateTitle(pname)
- with open(os.path.join(save_dir,pname),"w",encoding="utf8") as f:
- f.write(parameter)
- write_count += 1
- if write_count>=2000:
- return
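- # exportProjects: takes single-document projects with a win date and a 2023 page_time and searches
- # for other documents citing the same project codes, dumping merge candidates to toMerge.xlsx.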
- def exportProjects():
- bool_query = BoolQuery(must_queries=[
- TermQuery("docid_number",1),
- ExistsQuery("zhong_biao_page_time"),
- RangeQuery("page_time","2023-01-01","2023-10-10")
- ],
- must_not_queries=[
- MatchPhraseQuery("doctitles","网上超市")
- ])
- ots_client = getConnect_ots()
- rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time",SortOrder.DESC)]),limit=100,get_total_count=True),
- ColumnsToGet(["docids","doctitles","project_codes"],return_type=ColumnReturnType.SPECIFIED))
- list_data = getRow_ots(rows)
- while next_token:
- rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
- SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
- ColumnsToGet(["docids","doctitles","project_codes"],return_type=ColumnReturnType.SPECIFIED))
- list_data.extend(getRow_ots(rows))
- if len(list_data)>10000:
- break
- task_queue = Queue()
- for data in list_data:
- task_queue.put(data)
- def _handle(item,result_queue):
- docids = item["docids"]
- project_codes = item.get("project_codes","")
- if len(project_codes)>0:
- list_codes = project_codes.split(",")
- should_q = []
- for code in list_codes:
- should_q.append(MatchPhraseQuery("doctextcon",code))
- should_q.append(MatchPhraseQuery("attachmenttextcon",code))
- _query = BoolQuery(must_queries=[BoolQuery(should_queries=should_q),RangeQuery("status",201,301)],
- must_not_queries=[TermQuery("docid",int(docids))])
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- SearchQuery(_query,limit=100),
- ColumnsToGet(["doctitle"],return_type=ColumnReturnType.SPECIFIED))
- item["result"] = json.dumps(getRow_ots(rows),ensure_ascii=False)
- mt = MultiThreadHandler(task_queue,_handle,None,30)
- mt.run()
- columns = ["docids","doctitles","project_codes","result"]
- df_data = {}
- for data in list_data:
- for c in columns:
- if c not in df_data:
- df_data[c] = []
- df_data[c].append(data.get(c,""))
- df = pd.DataFrame(df_data)
- df.to_excel("toMerge.xlsx",columns=columns)
- if __name__=="__main__":
- # compareData()
- # attachAttachment()
- # exportDocument_By_time(time_from="2021-01-29",time_to="2021-01-29",columns=["docid","doctitle","project_name","dochtmlcon"])
- # processDocument()
- # export_extract_check()
- # exportArticle_by_websource()
- # export_keyword_count()
- # export_province_keyword_count()
- # exportDocument_dump()
- # exportDocument_dump_mysql()
- # export_attachment()
- # statics_attachment_counts()
- # get_follows()
- # append_title()
- # exportDocument_by_doctitle()
- # exportIndustryCount()
- exportDocument_by_pagetime()
- # exportProjects()
- # exportProducts()
- # exportParameters()
- # exportAgencyCount()
- # getDocumentHtml()
- # getDumplicate_docid()
- # exportHonors_item_info()
- # check_dump_data()
- # search_title_count()
- # count_product()
- # export_dump_by_id()
- # group_xlsx()
- # static_process_time()
- # check_data_synchronization()
- # process_doc()
- # export_competition()
- # for page_time in ["2022-08-01"]:
- # exportDocument_by_days(page_time)
- # exportDocument_forRecommen()
- # exportDocument_attachment()
- # exportWin_tenderer_count()
- # attachCompanyContact()
- # dumpWebSourceNo()
- # print("http://www.bidizhaobiao.com/excel_detail.do?code=%s"%(str(aesCipher.encrypt('{"docid":%d}'%103571618))))
- # exportNzj()
- # turn_status()
- # attachBidding_budget()
- # debug_documentMerge()
- # exportDocument_medicine("2021-05-24","2021-05-30")
- # signDocument()
- # transUUid()
- # fix_document()
- # export_document_no_price()
- # findProjects()
- # exportDetailLink()
- # export_extract_check()
- # export_extract2()
- # export_by_file()
- # export_dump()
|