dataflow.py 273 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623
  1. # sys.path.append("/data")
  2. from BaseDataMaintenance.dataSource.source import getConnect_activateMQ_ali
  3. from BaseDataMaintenance.common.multiThread import MultiThreadHandler
  4. from BaseDataMaintenance.common.multiProcess import MultiHandler
  5. from queue import Queue
  6. from multiprocessing import Queue as PQueue
  7. from multiprocessing import Process
  8. from BaseDataMaintenance.model.ots.document_tmp import *
  9. from BaseDataMaintenance.model.ots.attachment import *
  10. from BaseDataMaintenance.model.ots.document_html import *
  11. from BaseDataMaintenance.model.ots.document_extract2 import *
  12. from BaseDataMaintenance.model.ots.project import *
  13. from BaseDataMaintenance.model.ots.project2_tmp import *
  14. from BaseDataMaintenance.model.ots.document import *
  15. from BaseDataMaintenance.model.ots.project_process import *
  16. import base64
  17. from BaseDataMaintenance.dataSource.interface import getAttachDealInterface,sentMsgToDD
  18. from uuid import uuid4
  19. from BaseDataMaintenance.common.ossUtils import *
  20. from BaseDataMaintenance.dataSource.source import is_internal,getAuth
  21. from apscheduler.schedulers.blocking import BlockingScheduler
  22. from BaseDataMaintenance.maintenance.dataflow_settings import *
  23. from threading import Thread
  24. import oss2
  25. from BaseDataMaintenance.maxcompute.documentDumplicate import *
  26. from BaseDataMaintenance.maxcompute.documentMerge import *
  27. from BaseDataMaintenance.common.otsUtils import *
  28. from BaseDataMaintenance.common.activateMQUtils import *
  29. from BaseDataMaintenance.dataMonitor.data_monitor import BaseDataMonitor
  30. from BaseDataMaintenance.dataSource.pool import ConnectorPool
  31. def getSet(list_dict,key):
  32. _set = set()
  33. for item in list_dict:
  34. if key in item:
  35. if item[key]!='' and item[key] is not None:
  36. if re.search("^\d[\d\.]*$",item[key]) is not None:
  37. _set.add(str(float(item[key])))
  38. else:
  39. _set.add(str(item[key]))
  40. return _set
  41. def getSimilarityOfString(str1,str2):
  42. _set1 = set()
  43. _set2 = set()
  44. if str1 is not None:
  45. for i in range(1,len(str1)):
  46. _set1.add(str1[i-1:i+1])
  47. if str2 is not None:
  48. for i in range(1,len(str2)):
  49. _set2.add(str2[i-1:i+1])
  50. _len = max(1,min(len(_set1),len(_set2)))
  51. return len(_set1&_set2)/_len
  52. def getDiffIndex(list_dict,key,confidence=100):
  53. _set = set()
  54. for _i in range(len(list_dict)):
  55. item = list_dict[_i]
  56. if item["confidence"]>=confidence:
  57. continue
  58. if key in item:
  59. if item[key]!='' and item[key] is not None:
  60. if re.search("^\d+(\.\d+)?$",item[key]) is not None:
  61. _set.add(str(float(item[key])))
  62. else:
  63. _set.add(str(item[key]))
  64. if len(_set)>1:
  65. return _i
  66. return len(list_dict)
  67. def transformSWF(bucket,attachment_hub_url,objectPath,localpath,swf_dir):
  68. swf_urls = []
  69. try:
  70. list_files = os.listdir(swf_dir)
  71. list_files.sort(key=lambda x:x)
  72. headers = dict()
  73. headers["x-oss-object-acl"] = oss2.OBJECT_ACL_PUBLIC_READ
  74. for _file in list_files:
  75. swf_localpath = "%s/%s"%(swf_dir,_file)
  76. swf_objectPath = "%s/%s"%(objectPath.split(".")[0],_file)
  77. uploadFileByPath(bucket,swf_localpath,swf_objectPath,headers)
  78. _url = "%s/%s"%(attachment_hub_url,swf_objectPath)
  79. swf_urls.append(_url)
  80. os.remove(swf_localpath)
  81. except Exception as e:
  82. traceback.print_exc()
  83. return swf_urls
  84. class Dataflow():
    def __init__(self):
        """Wire up OTS clients, pipeline queues and the OSS attachment bucket."""
        # Tablestore (OTS) client shared by the pipeline stages.
        self.ots_client = getConnect_ots()
        # In-process (thread) queues, one per pipeline stage.
        self.queue_init = Queue()
        self.queue_attachment = Queue()
        self.queue_attachment_ocr = Queue()
        self.queue_attachment_not_ocr = Queue()
        self.list_attachment_ocr = []
        self.list_attachment_not_ocr = []
        self.queue_extract = Queue()
        self.list_extract = []
        # Deduplication uses multiprocessing queues (crosses process boundaries).
        self.queue_dumplicate = PQueue()
        self.queue_dumplicate_processed = PQueue()
        # Tracks docids currently queued for dedup — presumably to avoid
        # re-enqueueing; verify against the dedup producer.
        self.dumplicate_set = set()
        self.queue_merge = Queue()
        self.queue_syncho = Queue()
        self.queue_remove = Queue()
        self.queue_remove_project = Queue()
        self.attachment_rec_interface = ""
        # Separate OTS connection dedicated to the merge stage.
        self.ots_client_merge = getConnect_ots()
        # Pick the VPC-internal endpoints when running inside Alibaba Cloud.
        if is_internal:
            self.bucket_url = "http://oss-cn-hangzhou-internal.aliyuncs.com"
        else:
            self.bucket_url = "http://oss-cn-hangzhou.aliyuncs.com"
        if is_internal:
            self.extract_url = "http://1255640119316927.vpc.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/content_extract"
            self.industy_url = "http://1255640119316927.vpc.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/industry_extract"
            self.other_url = "http://1255640119316927.vpc.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/other_extract"
        else:
            self.extract_url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/content_extract"
            self.industy_url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/industry_extract"
            self.other_url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/other_extract"
        # NOTE(review): hard-coded Authorization token in source — consider
        # moving to configuration/secret storage.
        self.header = {'Content-Type': 'application/json',"Authorization":"NzZmOWZlMmU2MGY3YmQ4MDBjM2E5MDAyZjhjNjQ0MzZlMmE0NTMwZg=="}
        self.attachment_hub_url = "https://attachment-hub.oss-cn-hangzhou.aliyuncs.com/"
        self.auth = getAuth()
        # Tune the shared oss2 connection pool for the worker thread count.
        oss2.defaults.connection_pool_size = 100
        oss2.defaults.multiget_num_threads = 20
        log("bucket_url:%s"%(self.bucket_url))
        self.attachment_bucket_name = "attachment-hub"
        self.bucket = oss2.Bucket(self.auth,self.bucket_url,self.attachment_bucket_name)
        # Base dir for per-call scratch files (downloads, swf page images).
        self.current_path = os.path.dirname(__file__)
  125. def flow_init(self):
  126. def producer():
  127. bool_query = BoolQuery(must_queries=[RangeQuery("crtime",'2022-04-20')])
  128. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
  129. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
  130. ColumnsToGet(return_type=ColumnReturnType.ALL))
  131. log("flow_init producer total_count:%d"%total_count)
  132. list_dict = getRow_ots(rows)
  133. for _dict in list_dict:
  134. self.queue_init.put(_dict)
  135. _count = len(list_dict)
  136. while next_token:
  137. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
  138. SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  139. ColumnsToGet(return_type=ColumnReturnType.ALL))
  140. list_dict = getRow_ots(rows)
  141. for _dict in list_dict:
  142. self.queue_init.put(_dict)
  143. _count += len(list_dict)
  144. def comsumer():
  145. mt = MultiThreadHandler(self.queue_init,comsumer_handle,None,30,1,ots_client=self.ots_client)
  146. mt.run()
  147. def comsumer_handle(item,result_queue,ots_client):
  148. _dochtmlcon = item.get(document_tmp_dochtmlcon,"")
  149. if document_tmp_dochtmlcon in item:
  150. item.pop(document_tmp_dochtmlcon)
  151. if document_tmp_doctextcon in item:
  152. item.pop(document_tmp_doctextcon)
  153. if document_tmp_attachmenttextcon in item:
  154. item.pop(document_tmp_attachmenttextcon)
  155. _status = item.get(document_tmp_status)
  156. new_status = None
  157. if _status>=201 and _status<=300:
  158. item[document_tmp_save] = 1
  159. new_status = 81
  160. elif _status>=401 and _status<=450:
  161. item[document_tmp_save] = 0
  162. new_status = 81
  163. else:
  164. new_status = 1
  165. # new_status = 1
  166. item[document_tmp_status] = new_status
  167. dtmp = Document_tmp(item)
  168. dhtml = Document_html({document_tmp_partitionkey:item.get(document_tmp_partitionkey),
  169. document_tmp_docid:item.get(document_tmp_docid),
  170. document_tmp_dochtmlcon:_dochtmlcon})
  171. dtmp.update_row(ots_client)
  172. dhtml.update_row(ots_client)
  173. producer()
  174. comsumer()
  175. def getTitleFromHtml(self,filemd5,_html):
  176. _soup = BeautifulSoup(_html,"lxml")
  177. _find = _soup.find("a",attrs={"data":filemd5})
  178. _title = ""
  179. if _find is not None:
  180. _title = _find.get_text()
  181. return _title
  182. def getSourceLinkFromHtml(self,filemd5,_html):
  183. _soup = BeautifulSoup(_html,"lxml")
  184. _find = _soup.find("a",attrs={"filelink":filemd5})
  185. filelink = ""
  186. if _find is None:
  187. _find = _soup.find("img",attrs={"filelink":filemd5})
  188. if _find is not None:
  189. filelink = _find.attrs.get("src","")
  190. else:
  191. filelink = _find.attrs.get("href","")
  192. return filelink
    def request_attachment_interface(self, attach, _dochtmlcon):
        """Run one attachment through the recognition interface and persist the result.

        attach      -- attachment row object (project type), read via
                       getProperties() and written back via setValue()/update_row().
        _dochtmlcon -- html of the owning document; used to recover the
                       attachment's display title and original link.

        Returns True when the attachment needs no further work (processed,
        oversized, or the OSS object no longer exists), False on a recoverable
        failure (download or interface error), and None when an unexpected
        exception was printed and swallowed — NOTE(review): callers presumably
        treat any falsy result as "retry later"; confirm.
        """
        filemd5 = attach.getProperties().get(attachment_filemd5)
        _status = attach.getProperties().get(attachment_status)
        _filetype = attach.getProperties().get(attachment_filetype)
        _size = attach.getProperties().get(attachment_size)
        _path = attach.getProperties().get(attachment_path)
        _uuid = uuid4()
        objectPath = attach.getProperties().get(attachment_path)
        # Per-call scratch file; removed in the finally block below.
        localpath = os.path.join(self.current_path, "download", _uuid.hex)
        docids = attach.getProperties().get(attachment_docids)
        try:
            if _size > ATTACHMENT_LARGESIZE:
                # Oversized attachments are only marked, never recognized.
                attach.setValue(attachment_status, ATTACHMENT_TOOLARGE)
                log("attachment :%s of path:%s to large" % (filemd5, _path))
                attach.update_row(self.ots_client)
                return True
            else:
                d_start_time = time.time()
                if downloadFile(self.bucket, objectPath, localpath):
                    time_download = time.time() - d_start_time
                    # NOTE(review): file handle not closed explicitly; relies on GC.
                    _data_base64 = base64.b64encode(open(localpath, "rb").read())
                    # Call the attachment-recognition interface and collect the result.
                    start_time = time.time()
                    _success, _html, swf_images = getAttachDealInterface(_data_base64, _filetype, kwargs={"timeout": 600})
                    if _success:
                        log("process filemd5:%s of type:%s with size:%.3fM download:%ds recognize takes %ds,ret_size:%d" % (filemd5, _filetype, round(_size/1024/1024, 4), time_download, time.time()-start_time, len(_html)))
                    else:
                        log("attach interface failed of docid:%s filemd5:%s of type:%s size:%.3fM with result:%s" % (str(docids), filemd5, _filetype, round(_size/1024/1024, 4), str(_html)))
                        # sentMsgToDD("attach interface failed of docid:%s of filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
                        _html = ""  # dead store: the next statement returns
                        return False
                    # NOTE(review): eval() on the interface payload — assumes the
                    # recognition service is a trusted internal endpoint; confirm
                    # before pointing this at anything external.
                    swf_images = eval(swf_images)
                    if attach.getProperties().get(attachment_filetype) == "swf" and len(swf_images) > 0:
                        # swf pages come back as base64 pngs; upload them once and
                        # remember their public urls on the attachment row.
                        swf_urls = json.loads(attach.getProperties().get(attachment_swfUrls, "[]"))
                        if len(swf_urls) == 0:
                            objectPath = attach.getProperties().get(attachment_path, "")
                            localpath = os.path.join(self.current_path, "download/%s.swf" % (uuid4().hex))
                            swf_dir = os.path.join(self.current_path, "swf_images", uuid4().hex)
                            if not os.path.exists(swf_dir):
                                os.mkdir(swf_dir)
                            for _i in range(len(swf_images)):
                                _base = swf_images[_i]
                                _base = base64.b64decode(_base)
                                filename = "swf_page_%d.png" % (_i)
                                filepath = os.path.join(swf_dir, filename)
                                with open(filepath, "wb") as f:
                                    f.write(_base)
                            swf_urls = transformSWF(self.bucket, self.attachment_hub_url, objectPath, None, swf_dir)
                            if os.path.exists(swf_dir):
                                os.rmdir(swf_dir)
                            attach.setValue(attachment_swfUrls, json.dumps(swf_urls, ensure_ascii=False), True)
                    # Crude table detection on the recognized html.
                    if re.search("<td", _html) is not None:
                        attach.setValue(attachment_has_table, 1, True)
                    _file_title = self.getTitleFromHtml(filemd5, _dochtmlcon)
                    filelink = self.getSourceLinkFromHtml(filemd5, _dochtmlcon)
                    if _file_title != "":
                        attach.setValue(attachment_file_title, _file_title, True)
                    if filelink != "":
                        attach.setValue(attachment_file_link, filelink, True)
                    attach.setValue(attachment_attachmenthtml, _html, True)
                    attach.setValue(attachment_attachmentcon, BeautifulSoup(_html, "lxml").get_text(), True)
                    attach.setValue(attachment_status, ATTACHMENT_PROCESSED, True)
                    attach.setValue(attachment_recsize, len(_html), True)
                    attach.setValue(attachment_process_time, getCurrent_date(format="%Y-%m-%d %H:%M:%S"), True)
                    attach.update_row(self.ots_client)  # re-enable the row update in production
                    return True
                else:
                    # Download from OSS failed.
                    return False
        except oss2.exceptions.NotFound as e:
            # Object already gone from OSS — nothing left to process.
            return True
        except Exception as e:
            traceback.print_exc()
        finally:
            # Best-effort cleanup of the scratch download.
            try:
                os.remove(localpath)
            except:
                pass
  270. def rec_attachments_by_interface(self,list_attach,_dochtmlcon,save=True):
  271. list_html = []
  272. swf_urls = []
  273. for _attach in list_attach:
  274. #测试全跑
  275. if _attach.getProperties().get(attachment_status) in (ATTACHMENT_PROCESSED,ATTACHMENT_TOOLARGE):
  276. _html = _attach.getProperties().get(attachment_attachmenthtml,"")
  277. if _html is None:
  278. _html = ""
  279. list_html.append(_html)
  280. else:
  281. _succeed = self.request_attachment_interface(_attach,_dochtmlcon)
  282. if not _succeed:
  283. return False,"",[]
  284. _html = _attach.getProperties().get(attachment_attachmenthtml,"")
  285. if _html is None:
  286. _html = ""
  287. list_html.append(_html)
  288. if _attach.getProperties().get(attachment_filetype)=="swf":
  289. swf_urls.extend(json.loads(_attach.getProperties().get(attachment_swfUrls,"[]")))
  290. return True,list_html,swf_urls
  291. def generate_dumplicate_query(self,_dict,_dict_must_not,set_match=set(["project_code","project_codes","product"]),set_nested=set(["win_tenderer","bidding_budget","win_bid_price"]),
  292. set_term=set(["doctitle_refine","docchannel","tenderee","agency","web_source_no","fingerprint","save","docid"]),
  293. set_range=set(["page_time","status"]),set_phrase=set(["doctitle","project_name"])):
  294. list_must_queries = []
  295. list_must_no_queries = []
  296. for k,v in _dict.items():
  297. if k in set_match:
  298. if isinstance(v,str):
  299. l_s = []
  300. for s_v in v.split(","):
  301. l_s.append(MatchQuery(k,s_v))
  302. list_must_queries.append(BoolQuery(should_queries=l_s))
  303. elif k in set_nested:
  304. _v = v
  305. if k!="":
  306. if k=="bidding_budget" or k=="win_bid_price":
  307. _v = float(_v)
  308. list_must_queries.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.%s"%k,_v)))
  309. else:
  310. list_must_queries.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.%s"%k,_v)))
  311. elif k in set_term:
  312. list_must_queries.append(TermQuery(k,v))
  313. elif k in set_phrase:
  314. list_must_queries.append(MatchPhraseQuery(k,v))
  315. elif k in set_range:
  316. if len(v)==1:
  317. list_must_queries.append(RangeQuery(k,v[0]))
  318. elif len(v)==2:
  319. list_must_queries.append(RangeQuery(k,v[0],v[1],True,True))
  320. for k,v in _dict_must_not.items():
  321. if k in set_match:
  322. if isinstance(v,str):
  323. l_s = []
  324. for s_v in v.split(","):
  325. l_s.append(MatchQuery(k,s_v))
  326. list_must_no_queries.append(BoolQuery(should_queries=l_s))
  327. elif k in set_nested:
  328. _v = v
  329. if k!="":
  330. if k=="bidding_budget" or k=="win_bid_price":
  331. _v = float(_v)
  332. list_must_no_queries.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.%s"%k,_v)))
  333. else:
  334. list_must_no_queries.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.%s"%k,_v)))
  335. elif k in set_term:
  336. list_must_no_queries.append(TermQuery(k,v))
  337. elif k in set_range:
  338. if len(v)==1:
  339. list_must_no_queries.append(RangeQuery(k,v[0]))
  340. elif len(v)==2:
  341. list_must_no_queries.append(RangeQuery(k,v[0],v[1],True,True))
  342. return BoolQuery(must_queries=list_must_queries,must_not_queries=list_must_no_queries)
  343. def f_decode_sub_docs_json(self, project_code,project_name,tenderee,agency,sub_docs_json):
  344. columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
  345. extract_count = 0
  346. if project_code is not None and project_code!="":
  347. extract_count += 1
  348. if project_name is not None and project_name!="":
  349. extract_count += 1
  350. if tenderee is not None and tenderee!="":
  351. extract_count += 1
  352. if agency is not None and agency!="":
  353. extract_count += 1
  354. if sub_docs_json is not None:
  355. try:
  356. sub_docs = json.loads(sub_docs_json)
  357. except Exception as e:
  358. sub_docs = []
  359. sub_docs.sort(key=lambda x:float(x.get("bidding_budget",0)),reverse=True)
  360. sub_docs.sort(key=lambda x:float(x.get("win_bid_price",0)),reverse=True)
  361. # log("==%s"%(str(sub_docs)))
  362. for sub_docs in sub_docs:
  363. for _key_sub_docs in sub_docs.keys():
  364. extract_count += 1
  365. if _key_sub_docs in columns:
  366. if columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
  367. if _key_sub_docs in ["bidding_budget","win_bid_price"]:
  368. if float(sub_docs[_key_sub_docs])>0:
  369. columns[_key_sub_docs] = str(float(sub_docs[_key_sub_docs]))
  370. else:
  371. columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
  372. return columns["win_tenderer"],columns["bidding_budget"],columns["win_bid_price"],extract_count
  373. def post_extract(self,_dict):
  374. win_tenderer,bidding_budget,win_bid_price,extract_count = self.f_decode_sub_docs_json(_dict.get(document_tmp_project_code),_dict.get(document_tmp_project_name),_dict.get(document_tmp_tenderee),_dict.get(document_tmp_agency),_dict.get(document_tmp_sub_docs_json))
  375. _dict["win_tenderer"] = win_tenderer
  376. _dict["bidding_budget"] = bidding_budget
  377. _dict["win_bid_price"] = win_bid_price
  378. if "extract_count" not in _dict:
  379. _dict["extract_count"] = extract_count
  380. def get_dump_columns(self,_dict):
  381. docchannel = _dict.get(document_tmp_docchannel,0)
  382. project_code = _dict.get(document_tmp_project_code,"")
  383. project_name = _dict.get(document_tmp_project_name,"")
  384. tenderee = _dict.get(document_tmp_tenderee,"")
  385. agency = _dict.get(document_tmp_agency,"")
  386. doctitle = _dict.get(document_tmp_doctitle,"")
  387. doctitle_refine = _dict.get(document_tmp_doctitle_refine,"")
  388. win_tenderer = _dict.get("win_tenderer","")
  389. bidding_budget = _dict.get("bidding_budget","")
  390. if bidding_budget==0:
  391. bidding_budget = ""
  392. win_bid_price = _dict.get("win_bid_price","")
  393. if win_bid_price==0:
  394. win_bid_price = ""
  395. page_time = _dict.get(document_tmp_page_time,"")
  396. fingerprint = _dict.get(document_tmp_fingerprint,"")
  397. product = _dict.get(document_tmp_product,"")
  398. return docchannel,project_code,project_name,tenderee,agency,doctitle,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product
  399. def f_set_docid_limitNum_contain(self,item, _split,singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"]):
  400. flag = True
  401. for _key in singleNum_keys:
  402. if len(getSet(_split,_key))>1:
  403. flag = False
  404. break
  405. for _key in multiNum_keys:
  406. if len(getSet(_split,_key))<=1:
  407. flag = False
  408. break
  409. project_code = item.get("project_code","")
  410. for _key in notlike_keys:
  411. if not flag:
  412. break
  413. for _d in _split:
  414. _key_v = _d.get(_key,"")
  415. _sim = getSimilarityOfString(project_code,_key_v)
  416. if _sim>0.7 and _sim<1:
  417. flag = False
  418. break
  419. #判断组内每条公告是否包含
  420. if flag:
  421. if len(contain_keys)>0:
  422. for _key in contain_keys:
  423. MAX_CONTAIN_COLUMN = None
  424. for _d in _split:
  425. contain_column = _d.get(_key)
  426. if contain_column is not None and contain_column !="":
  427. if MAX_CONTAIN_COLUMN is None:
  428. MAX_CONTAIN_COLUMN = contain_column
  429. else:
  430. if len(MAX_CONTAIN_COLUMN)<len(contain_column):
  431. if contain_column.find(MAX_CONTAIN_COLUMN)==-1:
  432. flag = False
  433. break
  434. MAX_CONTAIN_COLUMN = contain_column
  435. else:
  436. if MAX_CONTAIN_COLUMN.find(contain_column)==-1:
  437. flag = False
  438. break
  439. if flag:
  440. return _split
  441. return []
  442. def search_data_by_query(self,item,_query,confidence,table_name="document_tmp",table_index="document_tmp_index",sort_column="docid",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count,document_tmp_doctitle]):
  443. list_data = []
  444. if isinstance(_query,list):
  445. bool_query = BoolQuery(should_queries=_query)
  446. else:
  447. bool_query = _query
  448. rows,next_token,total_count,is_all_succeed = self.ots_client.search(table_name,table_index,
  449. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(sort_column)]),limit=50,get_total_count=True),
  450. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  451. list_dict = getRow_ots(rows)
  452. for _dict in list_dict:
  453. self.post_extract(_dict)
  454. _dict["confidence"] = confidence
  455. list_data.append(_dict)
  456. # _count = len(list_dict)
  457. # while next_token:
  458. # rows,next_token,total_count,is_all_succeed = self.ots_client.search(table_name,table_index,
  459. # SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  460. # ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  461. # list_dict = getRow_ots(rows)
  462. # for _dict in list_dict:
  463. # self.post_extract(_dict)
  464. # _dict["confidence"] = confidence
  465. # list_data.append(_dict)
  466. list_dict = self.f_set_docid_limitNum_contain(item,list_dict,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,notlike_keys=notlike_keys)
  467. return list_dict
  468. def add_data_by_query(self,item,base_list,set_docid,_query,confidence,singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_save,document_tmp_status,document_tmp_product,document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count]):
  469. list_dict = self.search_data_by_query(item,_query,confidence,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,notlike_keys=notlike_keys,columns=columns)
  470. for _dict in list_dict:
  471. self.post_extract(_dict)
  472. _docid = _dict.get(document_tmp_docid)
  473. if _docid not in set_docid:
  474. base_list.append(_dict)
  475. set_docid.add(_docid)
  476. def translate_dumplicate_rules(self,status_from,item):
  477. docchannel,project_code,project_name,tenderee,agency,doctitle,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product = self.get_dump_columns(item)
  478. if page_time=='':
  479. page_time = getCurrent_date("%Y-%m-%d")
  480. base_dict = {
  481. "status":[status_from[0]],
  482. "page_time":[timeAdd(page_time,-2),timeAdd(page_time,2)]
  483. }
  484. must_not_dict = {"save":0}
  485. list_rules = []
  486. singleNum_keys = ["tenderee","win_tenderer"]
  487. if fingerprint!="":
  488. _dict = {}
  489. confidence = 100
  490. _dict[document_tmp_fingerprint] = fingerprint
  491. _dict.update(base_dict)
  492. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  493. _rule = {"confidence":confidence,
  494. "item":item,
  495. "query":_query,
  496. "singleNum_keys":[],
  497. "contain_keys":[],
  498. "multiNum_keys":[]}
  499. list_rules.append(_rule)
  500. if docchannel in (52,118):
  501. if bidding_budget!="" and tenderee!="" and project_code!="":
  502. confidence = 90
  503. _dict = {document_tmp_docchannel:docchannel,
  504. "bidding_budget":item.get("bidding_budget"),
  505. document_tmp_tenderee:item.get(document_tmp_tenderee,""),
  506. document_tmp_project_code:item.get(document_tmp_project_code,"")
  507. }
  508. _dict.update(base_dict)
  509. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  510. _rule = {"confidence":confidence,
  511. "query":_query,
  512. "singleNum_keys":singleNum_keys,
  513. "contain_keys":[],
  514. "multiNum_keys":[document_tmp_web_source_no]}
  515. list_rules.append(_rule)
  516. if doctitle_refine!="" and tenderee!="" and bidding_budget!="":
  517. confidence = 80
  518. _dict = {document_tmp_docchannel:docchannel,
  519. "doctitle_refine":doctitle_refine,
  520. "tenderee":tenderee,
  521. bidding_budget:"bidding_budget"
  522. }
  523. _dict.update(base_dict)
  524. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  525. _rule = {"confidence":confidence,
  526. "query":_query,
  527. "singleNum_keys":singleNum_keys,
  528. "contain_keys":[],
  529. "multiNum_keys":[document_tmp_web_source_no]}
  530. list_rules.append(_rule)
  531. if project_code!="" and doctitle_refine!="" and agency!="" and bidding_budget!="":
  532. confidence = 90
  533. _dict = {document_tmp_docchannel:docchannel,
  534. "project_code":project_code,
  535. "doctitle_refine":doctitle_refine,
  536. "agency":agency,
  537. "bidding_budget":bidding_budget
  538. }
  539. _dict.update(base_dict)
  540. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  541. _rule = {"confidence":confidence,
  542. "query":_query,
  543. "singleNum_keys":singleNum_keys,
  544. "contain_keys":[],
  545. "multiNum_keys":[document_tmp_web_source_no]}
  546. list_rules.append(_rule)
  547. if project_code!="" and tenderee!="" and bidding_budget!="":
  548. confidence = 91
  549. _dict = {document_tmp_docchannel:docchannel,
  550. "project_code":project_code,
  551. "tenderee":tenderee,
  552. "bidding_budget":bidding_budget
  553. }
  554. _dict.update(base_dict)
  555. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  556. _rule = {"confidence":confidence,
  557. "query":_query,
  558. "singleNum_keys":singleNum_keys,
  559. "contain_keys":[],
  560. "multiNum_keys":[document_tmp_web_source_no]}
  561. list_rules.append(_rule)
  562. if doctitle_refine!="" and agency!="" and bidding_budget!="":
  563. confidence = 71
  564. _dict = {document_tmp_docchannel:docchannel,
  565. "doctitle_refine":doctitle_refine,
  566. "agency":agency,
  567. "bidding_budget":bidding_budget
  568. }
  569. _dict.update(base_dict)
  570. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  571. _rule = {"confidence":confidence,
  572. "query":_query,
  573. "singleNum_keys":singleNum_keys,
  574. "contain_keys":[],
  575. "multiNum_keys":[document_tmp_web_source_no]}
  576. list_rules.append(_rule)
  577. if project_code!="" and project_name!="" and agency!="" and bidding_budget!="":
  578. confidence = 91
  579. _dict = {document_tmp_docchannel:docchannel,
  580. "project_code":project_code,
  581. "project_name":project_name,
  582. "agency":agency,
  583. "bidding_budget":bidding_budget
  584. }
  585. _dict.update(base_dict)
  586. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  587. n_singleKeys = [i for i in singleNum_keys]
  588. n_singleKeys.append(document_tmp_web_source_no)
  589. _rule = {"confidence":confidence,
  590. "query":_query,
  591. "singleNum_keys":n_singleKeys,
  592. "contain_keys":[],
  593. "multiNum_keys":[]}
  594. list_rules.append(_rule)
  595. ##-- 5. 招标公告 - 同项目编号- 同[项目名称、标题] - 同[招标人、代理公司] - 同预算(!=0) - 同信息源=1
  596. if project_code!="" and project_name!="" and tenderee!="" and bidding_budget!="":
  597. confidence = 91
  598. _dict = {document_tmp_docchannel:docchannel,
  599. "project_code":project_code,
  600. "project_name":project_name,
  601. "tenderee":tenderee,
  602. "bidding_budget":bidding_budget
  603. }
  604. _dict.update(base_dict)
  605. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  606. n_singleKeys = [i for i in singleNum_keys]
  607. n_singleKeys.append(document_tmp_web_source_no)
  608. _rule = {"confidence":confidence,
  609. "query":_query,
  610. "singleNum_keys":n_singleKeys,
  611. "contain_keys":[],
  612. "multiNum_keys":[]}
  613. list_rules.append(_rule)
  614. if project_code!="" and doctitle_refine!="" and tenderee!="" and bidding_budget!="":
  615. confidence = 71
  616. _dict = {document_tmp_docchannel:docchannel,
  617. "project_code":project_code,
  618. "doctitle_refine":doctitle_refine,
  619. "tenderee":tenderee,
  620. "bidding_budget":bidding_budget
  621. }
  622. _dict.update(base_dict)
  623. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  624. _rule = {"confidence":confidence,
  625. "query":_query,
  626. "singleNum_keys":singleNum_keys,
  627. "contain_keys":[],
  628. "multiNum_keys":[document_tmp_web_source_no]}
  629. list_rules.append(_rule)
  630. #-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1
  631. if project_name!="" and agency!="":
  632. tmp_bidding = 0
  633. if bidding_budget!="":
  634. tmp_bidding = bidding_budget
  635. confidence = 51
  636. _dict = {document_tmp_docchannel:docchannel,
  637. "project_name":project_name,
  638. "agency":agency,
  639. "bidding_budget":tmp_bidding
  640. }
  641. _dict.update(base_dict)
  642. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  643. _rule = {"confidence":confidence,
  644. "query":_query,
  645. "singleNum_keys":singleNum_keys,
  646. "contain_keys":[],
  647. "multiNum_keys":[document_tmp_web_source_no]}
  648. list_rules.append(_rule)
  649. #-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1
  650. if project_code!="" and agency!="":
  651. tmp_bidding = 0
  652. if bidding_budget!="":
  653. tmp_bidding = bidding_budget
  654. confidence = 51
  655. _dict = {document_tmp_docchannel:docchannel,
  656. "project_code":project_code,
  657. "agency":agency,
  658. "bidding_budget":tmp_bidding
  659. }
  660. _dict.update(base_dict)
  661. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  662. _rule = {"confidence":confidence,
  663. "query":_query,
  664. "singleNum_keys":singleNum_keys,
  665. "contain_keys":[],
  666. "multiNum_keys":[document_tmp_web_source_no]}
  667. list_rules.append(_rule)
  668. if docchannel not in (101,119,120):
  669. #-- 7. 非中标公告 - 同项目名称 - 同发布日期 - 同招标人 - 同预算 - 同类型 - 信息源>1 - 同项目编号
  670. if project_name!="" and tenderee!="" and project_code!="":
  671. tmp_bidding = 0
  672. if bidding_budget!="":
  673. tmp_bidding = bidding_budget
  674. confidence = 51
  675. _dict = {document_tmp_docchannel:docchannel,
  676. "project_name":project_name,
  677. "tenderee":tenderee,
  678. "project_code":project_code
  679. }
  680. _dict.update(base_dict)
  681. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  682. _rule = {"confidence":confidence,
  683. "query":_query,
  684. "singleNum_keys":singleNum_keys,
  685. "contain_keys":[],
  686. "multiNum_keys":[document_tmp_web_source_no]}
  687. list_rules.append(_rule)
  688. if docchannel in (101,119,120):
  689. #-- 3. 中标公告 - 同项目编号- 同[项目名称、标题] - 同中标人 - 同中标价(==0)
  690. if project_code!="" and project_name!="" and win_tenderer!="":
  691. tmp_win = 0
  692. if win_bid_price!="":
  693. tmp_win = win_bid_price
  694. confidence = 61
  695. _dict = {document_tmp_docchannel:docchannel,
  696. "project_code":project_code,
  697. "project_name":project_name,
  698. "win_tenderer":win_tenderer,
  699. "win_bid_price":tmp_win
  700. }
  701. _dict.update(base_dict)
  702. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  703. _rule = {"confidence":confidence,
  704. "query":_query,
  705. "singleNum_keys":singleNum_keys,
  706. "contain_keys":[],
  707. "multiNum_keys":[]}
  708. list_rules.append(_rule)
  709. if project_code!="" and project_name!="" and bidding_budget!="" and product!="":
  710. confidence = 72
  711. _dict = {document_tmp_docchannel:docchannel,
  712. "project_code":project_code,
  713. "project_name":project_name,
  714. "bidding_budget":bidding_budget,
  715. "product":product
  716. }
  717. _dict.update(base_dict)
  718. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  719. n_singleKeys = [i for i in singleNum_keys]
  720. n_singleKeys.append(document_tmp_web_source_no)
  721. _rule = {"confidence":confidence,
  722. "query":_query,
  723. "singleNum_keys":n_singleKeys,
  724. "contain_keys":[],
  725. "multiNum_keys":[]}
  726. list_rules.append(_rule)
  727. if project_code!='' and doctitle_refine!="" and win_tenderer!="" and win_bid_price!="":
  728. confidence = 91
  729. _dict = {document_tmp_docchannel:docchannel,
  730. "project_code":project_code,
  731. "doctitle_refine":doctitle_refine,
  732. "win_tenderer":win_tenderer,
  733. "win_bid_price":win_bid_price
  734. }
  735. _dict.update(base_dict)
  736. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  737. n_singleKeys = [i for i in singleNum_keys]
  738. n_singleKeys.append(document_tmp_web_source_no)
  739. _rule = {"confidence":confidence,
  740. "query":_query,
  741. "singleNum_keys":n_singleKeys,
  742. "contain_keys":[],
  743. "multiNum_keys":[]}
  744. list_rules.append(_rule)
  745. ##-- 2. 中标公告 - 同项目编号- 同[项目名称、标题] - 同中标人 - 同中标价(!=0) - 同信息源=1
  746. if project_code!="" and project_name!="" and win_tenderer!="" and win_bid_price!="":
  747. confidence = 91
  748. _dict = {document_tmp_docchannel:docchannel,
  749. "project_code":project_code,
  750. "project_name":project_name,
  751. "win_tenderer":win_tenderer,
  752. "win_bid_price":win_bid_price
  753. }
  754. _dict.update(base_dict)
  755. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  756. n_singleKeys = [i for i in singleNum_keys]
  757. n_singleKeys.append(document_tmp_web_source_no)
  758. _rule = {"confidence":confidence,
  759. "query":_query,
  760. "singleNum_keys":n_singleKeys,
  761. "contain_keys":[],
  762. "multiNum_keys":[]}
  763. list_rules.append(_rule)
  764. if project_name!="" and win_tenderer!="" and win_bid_price!="":
  765. confidence = 91
  766. _dict = {document_tmp_docchannel:docchannel,
  767. "project_name":project_name,
  768. "win_tenderer":win_tenderer,
  769. "win_bid_price":win_bid_price,
  770. }
  771. _dict.update(base_dict)
  772. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  773. _rule = {"confidence":confidence,
  774. "query":_query,
  775. "singleNum_keys":singleNum_keys,
  776. "contain_keys":[],
  777. "multiNum_keys":[document_tmp_web_source_no]}
  778. list_rules.append(_rule)
  779. if project_code!="" and win_tenderer!="" and win_bid_price!="":
  780. confidence = 91
  781. _dict = {document_tmp_docchannel:docchannel,
  782. "project_code":project_code,
  783. "win_tenderer":win_tenderer,
  784. "win_bid_price":win_bid_price,
  785. }
  786. _dict.update(base_dict)
  787. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  788. _rule = {"confidence":confidence,
  789. "query":_query,
  790. "singleNum_keys":singleNum_keys,
  791. "contain_keys":[],
  792. "multiNum_keys":[document_tmp_web_source_no]}
  793. list_rules.append(_rule)
  794. if project_code!="" and doctitle_refine!="" and win_tenderer!="" and win_bid_price!="":
  795. confidence = 91
  796. _dict = {document_tmp_docchannel:docchannel,
  797. "project_code":project_code,
  798. "doctitle_refine":doctitle_refine,
  799. "win_tenderer":win_tenderer,
  800. "win_bid_price":win_bid_price
  801. }
  802. _dict.update(base_dict)
  803. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  804. n_singleKeys = [i for i in singleNum_keys]
  805. n_singleKeys.append(document_tmp_web_source_no)
  806. _rule = {"confidence":confidence,
  807. "query":_query,
  808. "singleNum_keys":n_singleKeys,
  809. "contain_keys":[],
  810. "multiNum_keys":[]}
  811. list_rules.append(_rule)
  812. if doctitle_refine!="" and win_tenderer!="" and win_bid_price!="":
  813. confidence=90
  814. _dict = {document_tmp_docchannel:docchannel,
  815. "doctitle_refine":doctitle_refine,
  816. "win_tenderer":win_tenderer,
  817. "win_bid_price":win_bid_price
  818. }
  819. _dict.update(base_dict)
  820. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  821. _rule = {"confidence":confidence,
  822. "query":_query,
  823. "singleNum_keys":singleNum_keys,
  824. "contain_keys":[],
  825. "multiNum_keys":[document_tmp_web_source_no]}
  826. list_rules.append(_rule)
  827. if project_name!="" and win_tenderer!="" and win_bid_price!="" and project_code!="":
  828. confidence=95
  829. _dict = {document_tmp_docchannel:docchannel,
  830. "project_name":project_name,
  831. "win_tenderer":win_tenderer,
  832. "win_bid_price":win_bid_price,
  833. "project_code":project_code
  834. }
  835. _dict.update(base_dict)
  836. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  837. _rule = {"confidence":confidence,
  838. "query":_query,
  839. "singleNum_keys":singleNum_keys,
  840. "contain_keys":[],
  841. "multiNum_keys":[document_tmp_web_source_no]}
  842. list_rules.append(_rule)
  843. if docchannel in (51,103,115,116):
  844. #9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源
  845. if doctitle_refine!="" and tenderee!="":
  846. tmp_budget = 0
  847. if bidding_budget!="":
  848. tmp_budget = bidding_budget
  849. confidence=81
  850. _dict = {document_tmp_docchannel:docchannel,
  851. "doctitle_refine":doctitle_refine,
  852. "tenderee":tenderee,
  853. "bidding_budget":tmp_budget,
  854. }
  855. _dict.update(base_dict)
  856. _dict["page_time"] = [page_time,page_time]
  857. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  858. _rule = {"confidence":confidence,
  859. "query":_query,
  860. "singleNum_keys":singleNum_keys,
  861. "contain_keys":[],
  862. "multiNum_keys":[document_tmp_web_source_no]}
  863. list_rules.append(_rule)
  864. #-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源
  865. if project_code!="" and tenderee!="":
  866. confidence=81
  867. tmp_budget = 0
  868. if bidding_budget!="":
  869. tmp_budget = bidding_budget
  870. _dict = {document_tmp_docchannel:docchannel,
  871. "project_code":project_code,
  872. "tenderee":tenderee,
  873. "bidding_budget":tmp_budget,
  874. }
  875. _dict.update(base_dict)
  876. _dict["page_time"] = [page_time,page_time]
  877. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  878. _rule = {"confidence":confidence,
  879. "query":_query,
  880. "singleNum_keys":singleNum_keys,
  881. "contain_keys":[],
  882. "multiNum_keys":[document_tmp_web_source_no]}
  883. list_rules.append(_rule)
  884. if project_name!="" and tenderee!="":
  885. confidence=81
  886. tmp_budget = 0
  887. if bidding_budget!="":
  888. tmp_budget = bidding_budget
  889. _dict = {document_tmp_docchannel:docchannel,
  890. "project_name":project_name,
  891. "tenderee":tenderee,
  892. "bidding_budget":tmp_budget,
  893. }
  894. _dict.update(base_dict)
  895. _dict["page_time"] = [page_time,page_time]
  896. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  897. _rule = {"confidence":confidence,
  898. "query":_query,
  899. "singleNum_keys":singleNum_keys,
  900. "contain_keys":[],
  901. "multiNum_keys":[document_tmp_web_source_no]}
  902. list_rules.append(_rule)
  903. if agency!="" and tenderee!="":
  904. confidence=81
  905. tmp_budget = 0
  906. if bidding_budget!="":
  907. tmp_budget = bidding_budget
  908. _dict = {document_tmp_docchannel:docchannel,
  909. "agency":agency,
  910. "tenderee":tenderee,
  911. "bidding_budget":tmp_budget,
  912. "product":product
  913. }
  914. _dict.update(base_dict)
  915. _dict["page_time"] = [page_time,page_time]
  916. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  917. _rule = {"confidence":confidence,
  918. "query":_query,
  919. "singleNum_keys":singleNum_keys,
  920. "contain_keys":[],
  921. "multiNum_keys":[document_tmp_web_source_no]}
  922. list_rules.append(_rule)
  923. if agency!="" and project_code!="":
  924. confidence=81
  925. tmp_budget = 0
  926. if bidding_budget!="":
  927. tmp_budget = bidding_budget
  928. _dict = {document_tmp_docchannel:docchannel,
  929. "agency":agency,
  930. "project_code":project_code,
  931. "bidding_budget":tmp_budget,
  932. "product":product
  933. }
  934. _dict.update(base_dict)
  935. _dict["page_time"] = [page_time,page_time]
  936. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  937. _rule = {"confidence":confidence,
  938. "query":_query,
  939. "singleNum_keys":singleNum_keys,
  940. "contain_keys":[],
  941. "multiNum_keys":[document_tmp_web_source_no]}
  942. list_rules.append(_rule)
  943. if agency!="" and project_name!="":
  944. confidence=81
  945. tmp_budget = 0
  946. if bidding_budget!="":
  947. tmp_budget = bidding_budget
  948. _dict = {document_tmp_docchannel:docchannel,
  949. "agency":agency,
  950. "project_name":project_name,
  951. "bidding_budget":tmp_budget,
  952. "product":product
  953. }
  954. _dict.update(base_dict)
  955. _dict["page_time"] = [page_time,page_time]
  956. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  957. _rule = {"confidence":confidence,
  958. "query":_query,
  959. "singleNum_keys":singleNum_keys,
  960. "contain_keys":[],
  961. "multiNum_keys":[document_tmp_web_source_no]}
  962. list_rules.append(_rule)
  963. #五选二
  964. if tenderee!="" and bidding_budget!="" and product!="":
  965. confidence=80
  966. _dict = {document_tmp_docchannel:docchannel,
  967. "tenderee":tenderee,
  968. "bidding_budget":bidding_budget,
  969. "product":product,
  970. }
  971. _dict.update(base_dict)
  972. _dict["page_time"] = [page_time,page_time]
  973. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  974. _rule = {"confidence":confidence,
  975. "query":_query,
  976. "singleNum_keys":singleNum_keys,
  977. "contain_keys":[],
  978. "multiNum_keys":[]}
  979. list_rules.append(_rule)
  980. if tenderee!="" and win_tenderer!="" and product!="":
  981. confidence=80
  982. _dict = {document_tmp_docchannel:docchannel,
  983. "tenderee":tenderee,
  984. "win_tenderer":win_tenderer,
  985. "product":product,
  986. }
  987. _dict.update(base_dict)
  988. _dict["page_time"] = [page_time,page_time]
  989. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  990. _rule = {"confidence":confidence,
  991. "query":_query,
  992. "singleNum_keys":singleNum_keys,
  993. "contain_keys":[],
  994. "multiNum_keys":[]}
  995. list_rules.append(_rule)
  996. if tenderee!="" and win_bid_price!="":
  997. confidence=80
  998. _dict = {document_tmp_docchannel:docchannel,
  999. "tenderee":tenderee,
  1000. "win_bid_price":win_bid_price,
  1001. "product":product,
  1002. }
  1003. _dict.update(base_dict)
  1004. _dict["page_time"] = [page_time,page_time]
  1005. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1006. _rule = {"confidence":confidence,
  1007. "query":_query,
  1008. "singleNum_keys":singleNum_keys,
  1009. "contain_keys":[],
  1010. "multiNum_keys":[]}
  1011. list_rules.append(_rule)
  1012. if tenderee!="" and agency!="":
  1013. confidence=80
  1014. _dict = {document_tmp_docchannel:docchannel,
  1015. "tenderee":tenderee,
  1016. "agency":agency,
  1017. "product":product,
  1018. }
  1019. _dict.update(base_dict)
  1020. _dict["page_time"] = [page_time,page_time]
  1021. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1022. _rule = {"confidence":confidence,
  1023. "query":_query,
  1024. "singleNum_keys":singleNum_keys,
  1025. "contain_keys":[],
  1026. "multiNum_keys":[]}
  1027. list_rules.append(_rule)
  1028. if win_tenderer!="" and bidding_budget!="":
  1029. confidence=80
  1030. _dict = {document_tmp_docchannel:docchannel,
  1031. "win_tenderer":win_tenderer,
  1032. "bidding_budget":bidding_budget,
  1033. "product":product,
  1034. }
  1035. _dict.update(base_dict)
  1036. _dict["page_time"] = [page_time,page_time]
  1037. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1038. _rule = {"confidence":confidence,
  1039. "query":_query,
  1040. "singleNum_keys":singleNum_keys,
  1041. "contain_keys":[],
  1042. "multiNum_keys":[]}
  1043. list_rules.append(_rule)
  1044. if win_bid_price!="" and bidding_budget!="":
  1045. confidence=80
  1046. _dict = {document_tmp_docchannel:docchannel,
  1047. "win_bid_price":win_bid_price,
  1048. "bidding_budget":bidding_budget,
  1049. "product":product,
  1050. }
  1051. _dict.update(base_dict)
  1052. _dict["page_time"] = [page_time,page_time]
  1053. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1054. _rule = {"confidence":confidence,
  1055. "query":_query,
  1056. "singleNum_keys":singleNum_keys,
  1057. "contain_keys":[],
  1058. "multiNum_keys":[]}
  1059. list_rules.append(_rule)
  1060. if agency!="" and bidding_budget!="":
  1061. confidence=80
  1062. _dict = {document_tmp_docchannel:docchannel,
  1063. "agency":agency,
  1064. "bidding_budget":bidding_budget,
  1065. "product":product,
  1066. }
  1067. _dict.update(base_dict)
  1068. _dict["page_time"] = [page_time,page_time]
  1069. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1070. _rule = {"confidence":confidence,
  1071. "query":_query,
  1072. "singleNum_keys":singleNum_keys,
  1073. "contain_keys":[],
  1074. "multiNum_keys":[]}
  1075. list_rules.append(_rule)
  1076. if win_tenderer!="" and win_bid_price!="":
  1077. confidence=80
  1078. _dict = {document_tmp_docchannel:docchannel,
  1079. "win_tenderer":win_tenderer,
  1080. "win_bid_price":win_bid_price,
  1081. "product":product,
  1082. }
  1083. _dict.update(base_dict)
  1084. _dict["page_time"] = [page_time,page_time]
  1085. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1086. _rule = {"confidence":confidence,
  1087. "query":_query,
  1088. "singleNum_keys":singleNum_keys,
  1089. "contain_keys":[],
  1090. "multiNum_keys":[]}
  1091. list_rules.append(_rule)
  1092. if win_tenderer!="" and agency!="":
  1093. confidence=80
  1094. _dict = {document_tmp_docchannel:docchannel,
  1095. "win_tenderer":win_tenderer,
  1096. "agency":agency,
  1097. "product":product,
  1098. }
  1099. _dict.update(base_dict)
  1100. _dict["page_time"] = [page_time,page_time]
  1101. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1102. _rule = {"confidence":confidence,
  1103. "query":_query,
  1104. "singleNum_keys":singleNum_keys,
  1105. "contain_keys":[],
  1106. "multiNum_keys":[]}
  1107. list_rules.append(_rule)
  1108. if doctitle_refine!="" and product!="" and len(doctitle_refine)>7:
  1109. confidence=80
  1110. _dict = {document_tmp_docchannel:docchannel,
  1111. "doctitle_refine":doctitle_refine,
  1112. "product":product,
  1113. }
  1114. _dict.update(base_dict)
  1115. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1116. _rule = {"confidence":confidence,
  1117. "query":_query,
  1118. "singleNum_keys":singleNum_keys,
  1119. "contain_keys":[],
  1120. "multiNum_keys":[]}
  1121. list_rules.append(_rule)
  1122. return list_rules
  1123. def dumplicate_fianl_check(self,base_list):
  1124. the_group = base_list
  1125. the_group.sort(key=lambda x:x["confidence"],reverse=True)
  1126. if len(the_group)>10:
  1127. keys = ["tenderee","win_tenderer","win_bid_price","bidding_budget","doctitle_refine"]
  1128. else:
  1129. keys = ["tenderee","win_tenderer","win_bid_price","bidding_budget"]
  1130. #置信度
  1131. list_key_index = []
  1132. for _k in keys:
  1133. if _k=="doctitle":
  1134. list_key_index.append(getDiffIndex(the_group,_k,confidence=30))
  1135. else:
  1136. list_key_index.append(getDiffIndex(the_group,_k))
  1137. _index = min(list_key_index)
  1138. if _index>1:
  1139. return the_group[:_index]
  1140. return []
  1141. def get_best_docid(self,base_list):
  1142. to_reverse = False
  1143. dict_source_count = {}
  1144. for _item in base_list:
  1145. _web_source = _item.get(document_tmp_web_source_no)
  1146. _web_source_name = _item.get(document_tmp_web_source_name)
  1147. _fingerprint = _item.get(document_tmp_fingerprint)
  1148. _item['from_bidi'] = 1 if _web_source_name=="比地招标" else 0 # 是否为比地收录的公告
  1149. if _web_source is not None:
  1150. if _web_source not in dict_source_count:
  1151. dict_source_count[_web_source] = set()
  1152. dict_source_count[_web_source].add(_fingerprint)
  1153. if len(dict_source_count[_web_source])>=2:
  1154. to_reverse=True
  1155. # 专项债
  1156. if len(base_list)>0 and base_list[0].get("is_special_bonds")==1:
  1157. for _item in base_list:
  1158. detail_link = _item.get("detail_link")
  1159. detail_link = detail_link.strip() if detail_link else ""
  1160. if "bondId=" in detail_link:
  1161. bondId = detail_link.split("bondId=")[1]
  1162. bondId = bondId.split(",") if bondId else []
  1163. else:
  1164. bondId = []
  1165. _item['bondId_num'] = len(bondId)
  1166. # print([i.get("bondId_num") for i in base_list])
  1167. base_list.sort(key=lambda x:x["docid"], reverse=False)
  1168. base_list.sort(key=lambda x:x.get(document_attachment_extract_status,0),reverse=True)
  1169. base_list.sort(key=lambda x:x["extract_count"], reverse=True)
  1170. base_list.sort(key=lambda x:x["bondId_num"],reverse=True)
  1171. return base_list[0]["docid"]
  1172. if len(base_list)>0:
  1173. base_list.sort(key=lambda x:x["docid"],reverse=False)
  1174. base_list.sort(key=lambda x:x.get(document_attachment_extract_status,0),reverse=True)
  1175. base_list.sort(key=lambda x:x["extract_count"],reverse=True)
  1176. base_list.sort(key=lambda x:x["from_bidi"],reverse=False)
  1177. return base_list[0]["docid"]
  1178. def save_dumplicate(self,base_list,best_docid,status_from,status_to):
  1179. #best_docid need check while others can save directly
  1180. list_dict = []
  1181. for item in base_list:
  1182. docid = item["docid"]
  1183. _dict = {"partitionkey":item["partitionkey"],
  1184. "docid":item["docid"]}
  1185. if docid==best_docid:
  1186. if item.get("save",1)!=0:
  1187. _dict["save"] = 1
  1188. else:
  1189. _dict["save"] = 0
  1190. if item.get("status")>=status_from[0] and item.get("status")<=status_from[1]:
  1191. _dict["status"] = random.randint(status_to[0],status_to[1])
  1192. list_dict.append(_dict)
  1193. for _dict in list_dict:
  1194. dtmp = Document_tmp(_dict)
  1195. dtmp.update_row(self.ots_client)
  1196. def flow_test(self,status_to=[1,10]):
  1197. def producer():
  1198. bool_query = BoolQuery(must_queries=[
  1199. # ExistsQuery("docid"),
  1200. # RangeQuery("crtime",range_to='2022-04-10'),
  1201. # RangeQuery("status",61),
  1202. NestedQuery("page_attachments",WildcardQuery("page_attachments.fileMd5","*")),
  1203. ],
  1204. must_not_queries=[
  1205. # NestedQuery("page_attachments",WildcardQuery("page_attachments.fileMd5","*")),
  1206. TermQuery("attachment_extract_status",1),
  1207. RangeQuery("status",1,11)
  1208. ]
  1209. )
  1210. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  1211. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
  1212. ColumnsToGet(["docid"],return_type=ColumnReturnType.SPECIFIED))
  1213. log("flow_init producer total_count:%d"%total_count)
  1214. list_dict = getRow_ots(rows)
  1215. for _dict in list_dict:
  1216. self.queue_init.put(_dict)
  1217. _count = len(list_dict)
  1218. while next_token and _count<1000000:
  1219. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  1220. SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  1221. ColumnsToGet(["docid"],return_type=ColumnReturnType.SPECIFIED))
  1222. list_dict = getRow_ots(rows)
  1223. for _dict in list_dict:
  1224. self.queue_init.put(_dict)
  1225. _count += len(list_dict)
  1226. print("%d/%d"%(_count,total_count))
  1227. def comsumer():
  1228. mt = MultiThreadHandler(self.queue_init,comsumer_handle,None,30,1,ots_client=self.ots_client)
  1229. mt.run()
  1230. def comsumer_handle(item,result_queue,ots_client):
  1231. # print(item)
  1232. dtmp = Document_tmp(item)
  1233. dtmp.setValue(document_tmp_status,random.randint(*status_to),True)
  1234. dtmp.update_row(ots_client)
  1235. # dhtml = Document_html(item)
  1236. # dhtml.update_row(ots_client)
  1237. # dtmp.delete_row(ots_client)
  1238. # dhtml.delete_row(ots_client)
  1239. producer()
  1240. comsumer()
  1241. def flow_dumplicate(self,process_count=flow_process_count,status_from=flow_dumplicate_status_from):
  1242. def producer(columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_product,document_tmp_fingerprint,document_tmp_tenderee,document_tmp_agency,document_tmp_project_code,document_tmp_project_name,document_tmp_doctitle_refine,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_web_source_name]):
  1243. bool_query = BoolQuery(must_queries=[RangeQuery(document_tmp_status,*status_from,True,True)])
  1244. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  1245. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
  1246. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  1247. log("flow_dumplicate producer total_count:%d"%total_count)
  1248. list_dict = getRow_ots(rows)
  1249. for _dict in list_dict:
  1250. self.queue_dumplicate.put(_dict)
  1251. _count = len(list_dict)
  1252. while next_token and _count<flow_process_count:
  1253. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  1254. SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  1255. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  1256. list_dict = getRow_ots(rows)
  1257. for _dict in list_dict:
  1258. self.queue_dumplicate.put(_dict)
  1259. _count += len(list_dict)
  1260. def comsumer():
  1261. mt = MultiThreadHandler(self.queue_dumplicate,comsumer_handle,None,10,1,ots_client=self.ots_client)
  1262. mt.run()
  1263. def comsumer_handle(item,result_queue,ots_client):
  1264. self.post_extract(item)
  1265. base_list = []
  1266. set_docid = set()
  1267. list_rules = self.translate_dumplicate_rules(flow_dumplicate_status_from,item)
  1268. list_rules.sort(key=lambda x:x["confidence"],reverse=True)
  1269. # print(item,"len_rules",len(list_rules))
  1270. for _rule in list_rules:
  1271. _query = _rule["query"]
  1272. confidence = _rule["confidence"]
  1273. singleNum_keys = _rule["singleNum_keys"]
  1274. contain_keys = _rule["contain_keys"]
  1275. multiNum_keys = _rule["multiNum_keys"]
  1276. self.add_data_by_query(item,base_list,set_docid,_query,confidence,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys)
  1277. item["confidence"] = 999
  1278. if item.get(document_tmp_docid) not in set_docid:
  1279. base_list.append(item)
  1280. final_list = self.dumplicate_fianl_check(base_list)
  1281. best_docid = self.get_best_docid(final_list)
  1282. # log(str(final_list))
  1283. _d = {"partitionkey":item["partitionkey"],
  1284. "docid":item["docid"],
  1285. "status":random.randint(*flow_dumplicate_status_to),
  1286. document_tmp_opertime:getCurrent_date(format="%Y-%m-%d %H:%M:%S")
  1287. }
  1288. dtmp = Document_tmp(_d)
  1289. dup_docid = set()
  1290. for _dict in final_list:
  1291. dup_docid.add(_dict.get(document_tmp_docid))
  1292. if item.get(document_tmp_docid) in dup_docid:
  1293. dup_docid.remove(item.get(document_tmp_docid))
  1294. if len(final_list)==0 or best_docid==item.get(document_tmp_docid):
  1295. dtmp.setValue(document_tmp_save,1,True)
  1296. dtmp.setValue(document_tmp_merge_uuid,self.merge_document(item,flow_dumplicate_status_to),True)
  1297. dmp_docid = ",".join([str(a) for a in list(dup_docid)])
  1298. else:
  1299. dtmp.setValue(document_tmp_save,0,True)
  1300. if best_docid in dup_docid:
  1301. dup_docid.remove(best_docid)
  1302. dmp_docid = ",".join([str(a) for a in list(dup_docid)])
  1303. dmp_docid = "%d,%s"%(best_docid,dmp_docid)
  1304. else:
  1305. dmp_docid = ",".join([str(a) for a in list(dup_docid)])
  1306. dtmp.setValue(document_tmp_dup_docid,dmp_docid,True)
  1307. dtmp.update_row(self.ots_client)
  1308. #只保留当前公告
  1309. # self.save_dumplicate(final_list,best_docid,status_from,status_to)
  1310. #
  1311. # print("=base=",item)
  1312. # if len(final_list)>=1:
  1313. # print("==================")
  1314. # for _dict in final_list:
  1315. # print(_dict)
  1316. # print("========>>>>>>>>>>")
  1317. producer()
  1318. comsumer()
  1319. def merge_document(self,item,status_to=None):
  1320. self.post_extract(item)
  1321. docchannel,project_code,project_name,tenderee,agency,doctitle,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product = self.get_dump_columns(item)
  1322. _d = {"partitionkey":item["partitionkey"],
  1323. "docid":item["docid"],
  1324. }
  1325. dtmp = Document_tmp(_d)
  1326. if item.get(document_tmp_save,1)==1:
  1327. list_should_q = []
  1328. if project_code!="" and tenderee!="":
  1329. _q = BoolQuery(must_queries=[MatchQuery("project_code",project_code),
  1330. TermQuery("tenderee",tenderee)])
  1331. list_should_q.append(_q)
  1332. if project_name!="" and project_code!="":
  1333. _q = BoolQuery(must_queries=[MatchQuery("project_code",project_code),
  1334. TermQuery("project_name",project_name)])
  1335. list_should_q.append(_q)
  1336. if len(list_should_q)>0:
  1337. list_data = self.search_data_by_query(item,list_should_q,100,merge=True,table_name="project2",table_index="project2_index_formerge",sort_column="tenderee",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=["tenderee","win_tenderer"])
  1338. if len(list_data)==1:
  1339. dtmp.setValue("merge_uuid",list_data[0]["uuid"],True)
  1340. print(item["docid"],list_data[0]["uuid"])
  1341. else:
  1342. list_should_q = []
  1343. if bidding_budget!="" and project_code!="":
  1344. _q = BoolQuery(must_queries=[MatchQuery("project_code",project_code),
  1345. TermQuery("bidding_budget",float(bidding_budget))])
  1346. list_should_q.append(_q)
  1347. if tenderee!="" and bidding_budget!="" and project_name!="":
  1348. _q = BoolQuery(must_queries=[MatchQuery("tenderee",tenderee),
  1349. TermQuery("bidding_budget",float(bidding_budget)),
  1350. TermQuery("project_name",project_name)])
  1351. list_should_q.append(_q)
  1352. if tenderee!="" and win_bid_price!="" and project_name!="":
  1353. _q = BoolQuery(must_queries=[MatchQuery("tenderee",tenderee),
  1354. TermQuery("win_bid_price",float(win_bid_price)),
  1355. TermQuery("project_name",project_name)])
  1356. list_should_q.append(_q)
  1357. if len(list_should_q)>0:
  1358. list_data = self.search_data_by_query(item,list_should_q,100,table_name="project2",table_index="project2_index_formerge",sort_column="tenderee",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=["tenderee","win_tenderer"])
  1359. if len(list_data)==1:
  1360. dtmp.setValue("merge_uuid",list_data[0]["uuid"],True)
  1361. print(item["docid"],list_data[0]["uuid"])
  1362. return dtmp.getProperties().get("merge_uuid","")
  1363. # dtmp.update_row(self.ots_client)
  1364. def test_merge(self):
  1365. import pandas as pd
  1366. import queue
  1367. def producer(columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_product,document_tmp_fingerprint,document_tmp_tenderee,document_tmp_agency,document_tmp_project_code,document_tmp_project_name,document_tmp_doctitle_refine,document_tmp_doctitle,document_tmp_sub_docs_json]):
  1368. list_test_item = []
  1369. should_q = BoolQuery(should_queries=[
  1370. TermQuery("docchannel",101),
  1371. TermQuery("docchannel",119),
  1372. TermQuery("docchannel",120)
  1373. ])
  1374. bool_query = BoolQuery(must_queries=[
  1375. TermQuery("page_time","2022-04-22"),
  1376. should_q,
  1377. TermQuery("save",1)
  1378. ])
  1379. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  1380. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
  1381. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  1382. log("flow_dumplicate producer total_count:%d"%total_count)
  1383. list_dict = getRow_ots(rows)
  1384. for _dict in list_dict:
  1385. list_test_item.append(_dict)
  1386. _count = len(list_dict)
  1387. while next_token:
  1388. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  1389. SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  1390. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  1391. list_dict = getRow_ots(rows)
  1392. for _dict in list_dict:
  1393. list_test_item.append(_dict)
  1394. _count += len(list_dict)
  1395. print("%d/%d"%(_count,total_count))
  1396. return list_test_item
  1397. from BaseDataMaintenance.model.ots.project import Project
  1398. def comsumer_handle(item,result_queue,ots_client):
  1399. item["merge_uuid"] = self.merge_document(item)
  1400. if item["merge_uuid"]!="":
  1401. _dict = {"uuid":item["merge_uuid"]}
  1402. _p = Project(_dict)
  1403. _p.fix_columns(self.ots_client,["zhao_biao_page_time"],True)
  1404. if _p.getProperties().get("zhao_biao_page_time","")!="":
  1405. item["是否有招标"] = "是"
  1406. list_test_item = producer()
  1407. task_queue = queue.Queue()
  1408. for item in list_test_item:
  1409. task_queue.put(item)
  1410. mt = MultiThreadHandler(task_queue,comsumer_handle,None,30,1,ots_client=self.ots_client)
  1411. mt.run()
  1412. keys = [document_tmp_docid,document_tmp_docchannel,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_doctitle_refine,"win_tenderer","bidding_budget","win_bid_price","merge_uuid","是否有招标"]
  1413. df_data = {}
  1414. for k in keys:
  1415. df_data[k] = []
  1416. for item in list_test_item:
  1417. for k in keys:
  1418. df_data[k].append(item.get(k,""))
  1419. df = pd.DataFrame(df_data)
  1420. df.to_excel("test_merge.xlsx",columns=keys)
  1421. def flow_merge(self,process_count=10000,status_from=[71,80],status_to=[81,90]):
  1422. def producer(columns=[document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_product,document_tmp_fingerprint,document_tmp_tenderee,document_tmp_agency,document_tmp_project_code,document_tmp_project_name,document_tmp_doctitle_refine,document_tmp_doctitle,document_tmp_sub_docs_json]):
  1423. bool_query = BoolQuery(must_queries=[RangeQuery(document_tmp_status,*status_from,True,True)])
  1424. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  1425. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
  1426. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  1427. log("flow_merge producer total_count:%d"%total_count)
  1428. list_dict = getRow_ots(rows)
  1429. for _dict in list_dict:
  1430. self.queue_merge.put(_dict)
  1431. _count = len(list_dict)
  1432. while next_token and _count<process_count:
  1433. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  1434. SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  1435. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  1436. list_dict = getRow_ots(rows)
  1437. for _dict in list_dict:
  1438. self.queue_merge.put(_dict)
  1439. _count += len(list_dict)
  1440. def comsumer():
  1441. mt = MultiThreadHandler(self.queue_merge,comsumer_handle,None,10,1,ots_client=self.ots_client)
  1442. mt.run()
  1443. def comsumer_handle(item,result_queue,ots_client):
  1444. self.merge_document(item,status_to)
  1445. # producer()
  1446. # comsumer()
  1447. pass
  1448. def flow_syncho(self,status_from=[71,80],status_to=[81,90]):
  1449. pass
  1450. def flow_remove(self,process_count=flow_process_count,status_from=flow_remove_status_from):
  1451. def producer():
  1452. current_date = getCurrent_date("%Y-%m-%d")
  1453. tmp_date = timeAdd(current_date,-10)
  1454. bool_query = BoolQuery(must_queries=[RangeQuery(document_tmp_status,*status_from,True,True),
  1455. RangeQuery(document_tmp_crtime,range_to="%s 00:00:00"%(tmp_date))])
  1456. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  1457. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
  1458. ColumnsToGet(return_type=ColumnReturnType.NONE))
  1459. log("flow_remove producer total_count:%d"%total_count)
  1460. list_dict = getRow_ots(rows)
  1461. for _dict in list_dict:
  1462. self.queue_remove.put(_dict)
  1463. _count = len(list_dict)
  1464. while next_token:
  1465. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  1466. SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  1467. ColumnsToGet(return_type=ColumnReturnType.NONE))
  1468. list_dict = getRow_ots(rows)
  1469. for _dict in list_dict:
  1470. self.queue_remove.put(_dict)
  1471. _count += len(list_dict)
  1472. def comsumer():
  1473. mt = MultiThreadHandler(self.queue_remove,comsumer_handle,None,10,1,ots_client=self.ots_client)
  1474. mt.run()
  1475. def comsumer_handle(item,result_queue,ots_client):
  1476. dtmp = Document_tmp(item)
  1477. dtmp.delete_row(self.ots_client)
  1478. dhtml = Document_html(item)
  1479. dhtml.delete_row(self.ots_client)
  1480. producer()
  1481. comsumer()
  1482. def start_flow_dumplicate(self):
  1483. schedule = BlockingScheduler()
  1484. schedule.add_job(self.flow_remove,"cron",hour="20")
  1485. schedule.add_job(self.flow_remove_project_tmp,"cron",hour="20")
  1486. schedule.add_job(self.flow_dumplicate,"cron",second="*/10")
  1487. schedule.start()
  1488. def flow_remove_project_tmp(self,process_count=flow_process_count):
  1489. def producer():
  1490. current_date = getCurrent_date("%Y-%m-%d")
  1491. tmp_date = timeAdd(current_date,-6*31)
  1492. bool_query = BoolQuery(must_queries=[
  1493. RangeQuery(project_page_time,range_to="%s"%(tmp_date))])
  1494. rows,next_token,total_count,is_all_succeed = self.ots_client.search("project2_tmp","project2_tmp_index",
  1495. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time")]),limit=100,get_total_count=True),
  1496. ColumnsToGet(return_type=ColumnReturnType.NONE))
  1497. log("flow_remove project2_tmp producer total_count:%d"%total_count)
  1498. list_dict = getRow_ots(rows)
  1499. for _dict in list_dict:
  1500. self.queue_remove_project.put(_dict)
  1501. _count = len(list_dict)
  1502. while next_token:
  1503. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  1504. SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  1505. ColumnsToGet(return_type=ColumnReturnType.NONE))
  1506. list_dict = getRow_ots(rows)
  1507. for _dict in list_dict:
  1508. self.queue_remove_project.put(_dict)
  1509. _count += len(list_dict)
  1510. def comsumer():
  1511. mt = MultiThreadHandler(self.queue_remove_project,comsumer_handle,None,10,1,ots_client=self.ots_client)
  1512. mt.run()
  1513. def comsumer_handle(item,result_queue,ots_client):
  1514. ptmp = Project_tmp(item)
  1515. ptmp.delete_row(self.ots_client)
  1516. producer()
  1517. comsumer()
  1518. def start_flow_merge(self):
  1519. schedule = BlockingScheduler()
  1520. schedule.add_job(self.flow_merge,"cron",second="*/10")
  1521. schedule.start()
  1522. def download_attachment():
  1523. ots_client = getConnect_ots()
  1524. queue_attachment = Queue()
  1525. auth = getAuth()
  1526. oss2.defaults.connection_pool_size = 100
  1527. oss2.defaults.multiget_num_threads = 20
  1528. attachment_bucket_name = "attachment-hub"
  1529. if is_internal:
  1530. bucket_url = "http://oss-cn-hangzhou-internal.aliyuncs.com"
  1531. else:
  1532. bucket_url = "http://oss-cn-hangzhou.aliyuncs.com"
  1533. bucket = oss2.Bucket(auth,bucket_url,attachment_bucket_name)
  1534. current_path = os.path.dirname(__file__)
  1535. def producer():
  1536. columns = [document_tmp_attachment_path]
  1537. bool_query = BoolQuery(must_queries=[RangeQuery(document_tmp_crtime,"2022-03-29 15:00:00","2022-03-29 17:00:00",True,True)])
  1538. rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
  1539. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status",SortOrder.DESC)]),limit=100,get_total_count=True),
  1540. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  1541. log("flow_attachment producer total_count:%d"%total_count)
  1542. list_dict = getRow_ots(rows)
  1543. for _dict in list_dict:
  1544. queue_attachment.put(_dict)
  1545. _count = len(list_dict)
  1546. while next_token:
  1547. rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
  1548. SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  1549. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  1550. list_dict = getRow_ots(rows)
  1551. for _dict in list_dict:
  1552. queue_attachment.put(_dict)
  1553. _count += len(list_dict)
  1554. def comsumer():
  1555. mt = MultiThreadHandler(queue_attachment,comsumer_handle,None,10,1)
  1556. mt.run()
  1557. def getAttachments(list_filemd5,columns_to_get=[attachment_filemd5,attachment_path,attachment_size,attachment_attachmenthtml,attachment_filetype,attachment_docids,attachment_status,attachment_swfUrls]):
  1558. list_attachment = []
  1559. rows_to_get = []
  1560. for _md5 in list_filemd5[:50]:
  1561. if _md5 is None:
  1562. continue
  1563. primary_key = [(attachment_filemd5,_md5)]
  1564. rows_to_get.append(primary_key)
  1565. req = BatchGetRowRequest()
  1566. req.add(TableInBatchGetRowItem(attachment_table_name,rows_to_get,columns_to_get,None,1))
  1567. try:
  1568. result = ots_client.batch_get_row(req)
  1569. attach_result = result.get_result_by_table(attachment_table_name)
  1570. for item in attach_result:
  1571. if item.is_ok:
  1572. _dict = getRow_ots_primary(item.row)
  1573. if _dict is not None:
  1574. list_attachment.append(attachment(_dict))
  1575. except Exception as e:
  1576. log(str(list_filemd5))
  1577. log("attachProcess comsumer error %s"%str(e))
  1578. return list_attachment
  1579. def comsumer_handle(item,result_queue):
  1580. page_attachments = json.loads(item.get(document_tmp_attachment_path,"[]"))
  1581. if len(page_attachments)==0:
  1582. pass
  1583. else:
  1584. list_fileMd5 = []
  1585. for _atta in page_attachments:
  1586. list_fileMd5.append(_atta.get(document_tmp_attachment_path_filemd5))
  1587. list_attach = getAttachments(list_fileMd5)
  1588. for attach in list_attach:
  1589. filemd5 = attach.getProperties().get(attachment_filemd5)
  1590. _status = attach.getProperties().get(attachment_status)
  1591. _filetype = attach.getProperties().get(attachment_filetype)
  1592. _size = attach.getProperties().get(attachment_size)
  1593. _path = attach.getProperties().get(attachment_path)
  1594. _uuid = uuid4()
  1595. objectPath = attach.getProperties().get(attachment_path)
  1596. localpath = os.path.join(current_path,"download","%s.%s"%(filemd5,_filetype))
  1597. try:
  1598. if _size>ATTACHMENT_LARGESIZE:
  1599. pass
  1600. else:
  1601. downloadFile(bucket,objectPath,localpath)
  1602. except Exception as e:
  1603. traceback.print_exc()
  1604. producer()
  1605. comsumer()
  1606. def test_attachment_interface():
  1607. current_path = os.path.dirname(__file__)
  1608. task_queue = Queue()
  1609. def producer():
  1610. _count = 0
  1611. list_filename = os.listdir(os.path.join(current_path,"download"))
  1612. for _filename in list_filename:
  1613. _count += 1
  1614. _type = _filename.split(".")[1]
  1615. task_queue.put({"path":os.path.join(current_path,"download",_filename),"file_type":_type})
  1616. if _count>=500:
  1617. break
  1618. def comsumer():
  1619. mt = MultiThreadHandler(task_queue,comsumer_handle,None,10)
  1620. mt.run()
  1621. def comsumer_handle(item,result_queue):
  1622. _path = item.get("path")
  1623. _type = item.get("file_type")
  1624. _data_base64 = base64.b64encode(open(_path,"rb").read())
  1625. #调用接口处理结果
  1626. start_time = time.time()
  1627. _success,_html,swf_images = getAttachDealInterface(_data_base64,_type)
  1628. log("%s result:%s takes:%d"%(_path,str(_success),time.time()-start_time))
  1629. producer()
  1630. comsumer()
  1631. class Dataflow_attachment(Dataflow):
  1632. def __init__(self):
  1633. Dataflow.__init__(self)
  1634. self.process_list_thread = []
  1635. def flow_attachment_process(self):
  1636. self.process_comsumer()
  1637. def monitor_attachment_process(self):
  1638. alive_count = 0
  1639. for _t in self.process_list_thread:
  1640. if _t.is_alive():
  1641. alive_count += 1
  1642. log("attachment_process alive:%d total:%d"%(alive_count,len(self.process_list_thread)))
  1643. def process_comsumer(self):
  1644. if len(self.process_list_thread)==0:
  1645. thread_count = 60
  1646. for i in range(thread_count):
  1647. self.process_list_thread.append(Thread(target=self.process_comsumer_handle))
  1648. for t in self.process_list_thread:
  1649. t.start()
  1650. while 1:
  1651. failed_count = 0
  1652. for _i in range(len(self.process_list_thread)):
  1653. t = self.process_list_thread[_i]
  1654. if not t.is_alive():
  1655. failed_count += 1
  1656. self.prcess_list_thread[_i] = Thread(target=self.process_comsumer_handle)
  1657. self.prcess_list_thread[_i].start()
  1658. if failed_count>0:
  1659. log("attachment failed %d"%(failed_count))
  1660. time.sleep(5)
  1661. def process_comsumer_handle(self):
  1662. while 1:
  1663. _flag = False
  1664. log("attachment handle:%s"%str(threading.get_ident()))
  1665. try:
  1666. item = self.queue_attachment_ocr.get(True,timeout=0.2)
  1667. log("attachment get doc:%s"%(str(item.get("item",{}).get("docid"))))
  1668. self.attachment_recognize(item,None)
  1669. log("attachment get doc:%s succeed"%(str(item.get("item",{}).get("docid"))))
  1670. except Exception as e:
  1671. _flag = True
  1672. pass
  1673. try:
  1674. item = self.queue_attachment_not_ocr.get(True,timeout=0.2)
  1675. log("attachment get doc:%s"%(str(item.get("item",{}).get("docid"))))
  1676. self.attachment_recognize(item,None)
  1677. log("attachment get doc:%s succeed"%(str(item.get("item",{}).get("docid"))))
  1678. except Exception as e:
  1679. _flag = True and _flag
  1680. pass
  1681. if _flag:
  1682. time.sleep(2)
  1683. def attachment_recognize(self,_dict,result_queue):
  1684. item = _dict.get("item")
  1685. list_attach = _dict.get("list_attach")
  1686. dhtml = Document_html({"partitionkey":item.get("partitionkey"),
  1687. "docid":item.get("docid")})
  1688. dhtml.fix_columns(self.ots_client,["dochtmlcon"],True)
  1689. _dochtmlcon = dhtml.getProperties().get("dochtmlcon","")
  1690. _succeed,list_html,swf_urls = self.rec_attachments_by_interface(list_attach,_dochtmlcon,save=True)
  1691. log(str(swf_urls))
  1692. if not _succeed:
  1693. item[document_tmp_status] = random.randint(*flow_attachment_status_failed_to)
  1694. else:
  1695. dhtml.updateSWFImages(swf_urls)
  1696. dhtml.updateAttachment(list_html)
  1697. dhtml.update_row(self.ots_client)
  1698. item[document_tmp_status] = random.randint(*flow_attachment_status_succeed_to)
  1699. item[document_tmp_attachment_extract_status] = 1
  1700. log("document:%d get attachments with result:%s"%(item.get("docid"),str(_succeed)))
  1701. dtmp = Document_tmp(item)
  1702. dtmp.update_row(self.ots_client)
  1703. def flow_attachment(self):
  1704. self.flow_attachment_producer()
  1705. self.flow_attachment_producer_comsumer()
  1706. def getAttachments(self,list_filemd5,columns_to_get=[attachment_filemd5,attachment_path,attachment_size,attachment_attachmenthtml,attachment_filetype,attachment_docids,attachment_status,attachment_swfUrls]):
  1707. list_attachment = []
  1708. rows_to_get = []
  1709. for _md5 in list_filemd5[:50]:
  1710. if _md5 is None:
  1711. continue
  1712. primary_key = [(attachment_filemd5,_md5)]
  1713. rows_to_get.append(primary_key)
  1714. req = BatchGetRowRequest()
  1715. req.add(TableInBatchGetRowItem(attachment_table_name,rows_to_get,columns_to_get,None,1))
  1716. try:
  1717. result = self.ots_client.batch_get_row(req)
  1718. attach_result = result.get_result_by_table(attachment_table_name)
  1719. for item in attach_result:
  1720. if item.is_ok:
  1721. _dict = getRow_ots_primary(item.row)
  1722. if _dict is not None:
  1723. list_attachment.append(attachment(_dict))
  1724. except Exception as e:
  1725. log(str(list_filemd5))
  1726. log("attachProcess comsumer error %s"%str(e))
  1727. return list_attachment
  1728. def flow_attachment_producer(self,columns=[document_tmp_attachment_path,document_tmp_crtime]):
  1729. qsize_ocr = self.queue_attachment_ocr.qsize()
  1730. qsize_not_ocr = self.queue_attachment_not_ocr.qsize()
  1731. log("queue_attachment_ocr:%d,queue_attachment_not_ocr:%d"%(qsize_ocr,qsize_not_ocr))
  1732. #选择加入数据场景
  1733. if min(qsize_ocr,qsize_not_ocr)>200 or max(qsize_ocr,qsize_not_ocr)>1000:
  1734. return
  1735. #去重
  1736. set_docid = set()
  1737. set_docid = set_docid | set(self.list_attachment_ocr) | set(self.list_attachment_not_ocr)
  1738. if qsize_ocr>0:
  1739. self.list_attachment_ocr = self.list_attachment_ocr[-qsize_ocr:]
  1740. else:
  1741. self.list_attachment_ocr = []
  1742. if qsize_not_ocr>0:
  1743. self.list_attachment_not_ocr = self.list_attachment_not_ocr[-qsize_not_ocr:]
  1744. else:
  1745. self.list_attachment_not_ocr = []
  1746. try:
  1747. bool_query = BoolQuery(must_queries=[
  1748. RangeQuery(document_tmp_status,*flow_attachment_status_from,True,True),
  1749. # TermQuery(document_tmp_docid,234925191),
  1750. ])
  1751. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  1752. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status",SortOrder.DESC)]),limit=100,get_total_count=True),
  1753. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  1754. log("flow_attachment producer total_count:%d"%total_count)
  1755. list_dict = getRow_ots(rows)
  1756. _count = 0
  1757. for _dict in list_dict:
  1758. docid = _dict.get(document_tmp_docid)
  1759. if docid in set_docid:
  1760. continue
  1761. self.queue_attachment.put(_dict,True)
  1762. _count += 1
  1763. while next_token and _count<flow_process_count:
  1764. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  1765. SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  1766. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  1767. list_dict = getRow_ots(rows)
  1768. for _dict in list_dict:
  1769. docid = _dict.get(document_tmp_docid)
  1770. if docid in set_docid:
  1771. continue
  1772. self.queue_attachment.put(_dict,True)
  1773. _count += 1
  1774. log("add attachment count:%d"%(_count))
  1775. except Exception as e:
  1776. log("flow attachment producer error:%s"%(str(e)))
  1777. traceback.print_exc()
  1778. def flow_attachment_producer_comsumer(self):
  1779. log("start flow_attachment comsumer")
  1780. mt = MultiThreadHandler(self.queue_attachment,self.comsumer_handle,None,10,1)
  1781. mt.run()
  1782. def set_queue(self,_dict):
  1783. list_attach = _dict.get("list_attach")
  1784. to_ocr = False
  1785. for attach in list_attach:
  1786. if attach.getProperties().get(attachment_filetype) in ["bmp","jpeg","jpg","png","swf","pdf","tif"]:
  1787. to_ocr = True
  1788. break
  1789. if to_ocr:
  1790. self.queue_attachment_ocr.put(_dict,True)
  1791. # self.list_attachment_ocr.append(_dict.get("item").get(document_tmp_docid))
  1792. else:
  1793. self.queue_attachment_not_ocr.put(_dict,True)
  1794. # self.list_attachment_not_ocr.append(_dict.get("item").get(document_tmp_docid))
  1795. def comsumer_handle(self,item,result_queue):
  1796. try:
  1797. page_attachments = json.loads(item.get(document_tmp_attachment_path,"[]"))
  1798. if len(page_attachments)==0:
  1799. item[document_tmp_status] = random.randint(*flow_attachment_status_succeed_to)
  1800. dtmp = Document_tmp(item)
  1801. dtmp.update_row(self.ots_client)
  1802. else:
  1803. list_fileMd5 = []
  1804. for _atta in page_attachments:
  1805. list_fileMd5.append(_atta.get(document_tmp_attachment_path_filemd5))
  1806. list_attach = self.getAttachments(list_fileMd5)
  1807. #未上传成功的2小时内不处理
  1808. if len(page_attachments)!=len(list_attach) and time.mktime(time.localtime())-time.mktime(time.strptime(item.get(document_tmp_crtime),"%Y-%m-%d %H:%M:%S"))<7200:
  1809. item[document_tmp_status] = 1
  1810. dtmp = Document_tmp(item)
  1811. dtmp.update_row(self.ots_client)
  1812. return
  1813. self.set_queue({"item":item,"list_attach":list_attach})
  1814. except Exception as e:
  1815. traceback.print_exc()
  1816. def start_flow_attachment(self):
  1817. schedule = BlockingScheduler()
  1818. schedule.add_job(self.flow_attachment_process,"cron",second="*/20")
  1819. schedule.add_job(self.flow_attachment,"cron",second="*/10")
  1820. schedule.start()
class Dataflow_extract(Dataflow):
    """Dataflow stage that runs element extraction over document_tmp rows.

    A producer pages rows in the extract status range into queue_extract;
    consumers post each document to three HTTP extraction services and
    persist the responses, then advance the row's status.
    """

    def __init__(self):
        Dataflow.__init__(self)

    def flow_extract_producer(self,columns=[document_tmp_page_time,document_tmp_doctitle,document_tmp_docchannel,document_tmp_status,document_tmp_original_docchannel,document_tmp_web_source_no]):
        """Fill queue_extract from document_tmp, skipping docids already queued."""
        q_size = self.queue_extract.qsize()
        if q_size>100:
            return
        # docids currently believed to be in flight
        set_docid = set(self.list_extract)
        # trim bookkeeping list to what is still sitting in the queue
        if q_size>0:
            self.list_extract = self.list_extract[-q_size:]
        else:
            self.list_extract = []
        try:
            bool_query = BoolQuery(must_queries=[RangeQuery(document_tmp_status,*flow_extract_status_from,True,True)])
            rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status",SortOrder.ASC)]),limit=100,get_total_count=True),
                                                                                ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
            log("flow_extract producer total_count:%d"%total_count)
            list_dict = getRow_ots(rows)
            for _dict in list_dict:
                docid = _dict.get(document_tmp_docid)
                if docid in set_docid:
                    # already queued: keep it tracked at the front
                    self.list_extract.insert(0,docid)
                    continue
                else:
                    self.queue_extract.put(_dict)
                    self.list_extract.append(docid)
            _count = len(list_dict)
            # keep paging until enough rows have been seen this round
            while next_token and _count<flow_process_count:
                rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                                    SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                                    ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
                list_dict = getRow_ots(rows)
                for _dict in list_dict:
                    docid = _dict.get(document_tmp_docid)
                    if docid in set_docid:
                        self.list_extract.insert(0,docid)
                        continue
                    else:
                        self.queue_extract.put(_dict)
                        self.list_extract.append(docid)
                _count += len(list_dict)
        except Exception as e:
            log("flow extract producer error:%s"%(str(e)))
            traceback.print_exc()

    def flow_extract(self,):
        # scheduled entry point
        self.comsumer()

    def comsumer(self):
        # drain queue_extract with 35 worker threads
        mt = MultiThreadHandler(self.queue_extract,self.comsumer_handle,None,35,1,True)
        mt.run()

    def comsumer_handle(self,item,result_queue):
        """Extract one document via three HTTP services and persist results.

        all_done encodes the outcome: 1 = all succeeded, -1/-2/-3 = the
        other/extract/industry call failed (last failure wins).
        """
        dhtml = Document_html({"partitionkey":item.get("partitionkey"),
                               "docid":item.get("docid")})
        dhtml.fix_columns(self.ots_client,["dochtmlcon"],True)
        item[document_tmp_dochtmlcon] = dhtml.getProperties().get(document_tmp_dochtmlcon,"")
        _extract = Document_extract({})
        _extract.setValue(document_extract2_partitionkey,item.get(document_partitionkey))
        _extract.setValue(document_extract2_docid,item.get(document_docid))
        all_done = 1
        if all_done:
            data = item
            resp = requests.post(self.other_url,json=data,headers=self.header)
            if (resp.status_code >=200 and resp.status_code<=210):
                _extract.setValue(document_extract2_other_json,resp.content.decode("utf8"),True)
            else:
                all_done = -1
        # build the extraction payload from the row
        data = {}
        for k,v in item.items():
            data[k] = v
        data["timeout"] = 240
        data["doc_id"] = data.get(document_tmp_docid)
        data["content"] = data.get(document_tmp_dochtmlcon,"")
        if document_tmp_dochtmlcon in data:
            data.pop(document_tmp_dochtmlcon)
        data["title"] = data.get(document_tmp_doctitle,"")
        data["web_source_no"] = item.get(document_tmp_web_source_no,"")
        data["original_docchannel"] = item.get(document_tmp_original_docchannel,"")
        # NOTE(review): all_done may be -1 here and is still truthy, so the
        # remaining calls run even after a failure — confirm this is intended
        if all_done:
            resp = requests.post(self.extract_url,json=data,headers=self.header)
            if (resp.status_code >=200 and resp.status_code<=210):
                _extract.setValue(document_extract2_extract_json,resp.content.decode("utf8"),True)
            else:
                all_done = -2
        if all_done:
            resp = requests.post(self.industy_url,json=data,headers=self.header)
            if (resp.status_code >=200 and resp.status_code<=210):
                _extract.setValue(document_extract2_industry_json,resp.content.decode("utf8"),True)
            else:
                all_done = -3
        _dict = {document_partitionkey:item.get(document_tmp_partitionkey),
                 document_docid:item.get(document_tmp_docid),
                 }
        dtmp = Document_tmp(_dict)
        if all_done!=1:
            # alert and move the row to the failed status range
            sentMsgToDD("要素提取失败:docid:%d with result:%d"%(item.get(document_tmp_docid),all_done))
            dtmp.setValue(document_tmp_status,random.randint(*flow_extract_status_failed_to),True)
            dtmp.update_row(self.ots_client)
        else:
            dtmp.setValue(document_tmp_status,random.randint(*flow_extract_status_succeed_to),True)
            dtmp.update_row(self.ots_client)
        # write to the interface table (enabled on release)
        _extract.setValue(document_extract2_status,random.randint(1,50),True)
        _extract.update_row(self.ots_client)
        log("process docid:%d %s"%(data["doc_id"],str(all_done)))

    def start_flow_extract(self):
        """Run this stage on a blocking cron scheduler (never returns)."""
        schedule = BlockingScheduler()
        schedule.add_job(self.flow_extract_producer,"cron",second="*/10")
        schedule.add_job(self.flow_extract,"cron",second="*/10")
        schedule.start()
  1930. class Dataflow_dumplicate(Dataflow):
  1931. class DeleteListener():
  1932. def __init__(self,conn,_func,*args,**kwargs):
  1933. self.conn = conn
  1934. self._func = _func
  1935. def on_error(self, headers,*args,**kwargs):
  1936. log('received an error %s' % str(headers.body))
  1937. def on_message(self, headers,*args,**kwargs):
  1938. try:
  1939. message_id = headers.headers["message-id"]
  1940. body = headers.body
  1941. log("get message %s"%(message_id))
  1942. self._func(_dict={"frame":headers,"conn":self.conn},result_queue=None)
  1943. except Exception as e:
  1944. traceback.print_exc()
  1945. pass
  1946. def __del__(self):
  1947. self.conn.disconnect()
  1948. def __init__(self,start_delete_listener=True):
  1949. Dataflow.__init__(self,)
  1950. self.c_f_get_extractCount = f_get_extractCount()
  1951. self.c_f_get_package = f_get_package()
  1952. logging.basicConfig(level = logging.info,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1953. self.fix_doc_docid = None
  1954. self.bdm = BaseDataMonitor()
  1955. self.check_rule = 1
  1956. if start_delete_listener:
  1957. self.delete_comsumer_counts = 2
  1958. self.doc_delete_queue = "/queue/doc_delete_queue"
  1959. self.doc_delete_result = "/queue/doc_delete_result"
  1960. self.pool_mq_ali = ConnectorPool(1,10,getConnect_activateMQ_ali)
  1961. for _ in range(self.delete_comsumer_counts):
  1962. conn = getConnect_activateMQ_ali()
  1963. listener = self.DeleteListener(conn,self.delete_doc_handle)
  1964. createComsumer(listener,self.doc_delete_queue)
  1965. def get_dict_time(self,_extract,keys=["time_bidclose","time_bidopen","time_bidstart","time_commencement","time_completion","time_earnestMoneyEnd","time_earnestMoneyStart","time_getFileEnd","time_getFileStart","time_publicityEnd","time_publicityStart","time_registrationEnd","time_registrationStart"]):
  1966. dict_time = {}
  1967. for k in keys:
  1968. _time = _extract.get(k)
  1969. _time = _time[:10] if _time else ""
  1970. dict_time[k] = _time
  1971. return dict_time
  1972. def get_attrs_before_dump(self,docid,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type,"detail_link",'products','crtime']):
  1973. bool_query = BoolQuery(must_queries=[
  1974. TermQuery("docid",docid)
  1975. ])
  1976. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
  1977. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
  1978. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  1979. log("flow_dumplicate producer total_count:%d"%total_count)
  1980. if total_count==0:
  1981. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  1982. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
  1983. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  1984. list_dict = getRow_ots(rows)
  1985. if len(list_dict)>0:
  1986. return self.post_extract(list_dict[0])
    def post_extract(self,_dict):
        """Flatten a row's extract_json into the top-level fields used by
        the duplicate checks, mutating and returning _dict.

        Derives win/budget figures from sub_docs_json, plus fingerprint,
        project codes, refined title, money sets, enterprise lists,
        package, times, and channel-specific extras.
        """
        win_tenderer,bidding_budget,win_bid_price,_ = self.f_decode_sub_docs_json(_dict.get(document_tmp_project_code),_dict.get(document_tmp_project_name),_dict.get(document_tmp_tenderee),_dict.get(document_tmp_agency),_dict.get(document_tmp_sub_docs_json))
        _dict["win_tenderer"] = win_tenderer
        _dict["bidding_budget"] = bidding_budget
        _dict["win_bid_price"] = win_bid_price
        extract_json = _dict.get(document_tmp_extract_json,"{}")
        _extract = json.loads(extract_json)
        _dict["product"] = ",".join(_extract.get("product",[]))
        _dict["fingerprint"] = _extract.get("fingerprint","")
        _dict["project_codes"] = _extract.get("code",[])
        # first extracted code becomes the canonical project_code
        if len(_dict["project_codes"])>0:
            _dict["project_code"] = _dict["project_codes"][0]
        else:
            _dict["project_code"] = ""
        _dict["doctitle_refine"] = _extract.get("doctitle_refine","")
        # fall back to the raw title when no refined title was extracted
        if _dict["doctitle_refine"]=="":
            _dict["doctitle_refine"] = _dict.get("doctitle")
        _dict["moneys"] = set(_extract.get("moneys",[]))
        _dict["moneys_attachment"] = set(_extract.get("moneys_attachment",[]))
        _dict["nlp_enterprise"] = json.dumps({"indoctextcon":_extract.get("nlp_enterprise",[]),
                                              "notindoctextcon":_extract.get("nlp_enterprise_attachment",[])},ensure_ascii=False)
        _dict["extract_count"] = _extract.get("extract_count",0)
        _dict["package"] = self.c_f_get_package.evaluate(extract_json)
        _dict["project_name"] = _extract.get("name","")
        _dict["dict_time"] = self.get_dict_time(_extract)
        _dict["punish"] = _extract.get("punish",{})
        _dict["approval"] = _extract.get("approval",[])
        _dict["products_original"] = _extract.get("product_attrs_original", {}).get("data",[])
        # normalize products to a list (may arrive as a JSON string or None)
        _dict["products"] = _dict.get("products") if _dict.get("products") is not None else []
        _dict["products"] = _dict["products"] if isinstance(_dict["products"], list) else json.loads(_dict["products"])
        # change content (change/Q&A announcements)
        _dict["change_content"] = _extract.get("change_content","")
        _dict["change_time"] = _extract.get("change_time","")
        _dict["word_count"] = _extract.get("word_count", {})# word-count statistics for body and attachment text
        # special-bond fields
        issue_details = _extract.get("debt_dic",{}).get("issue_details",[])
        _dict["is_special_bonds"] = 1 if _dict.get(document_tmp_docchannel)==302 and _dict.get(document_tmp_web_source_name)=='专项债券信息网' and issue_details else 0
        # procurement-intention fields (channel 114 only)
        if _dict.get("docchannel")==114:
            _dict["demand_info"] = _extract.get("demand_info",{}).get("data",[])
        else:
            _dict["demand_info"] = []
        return _dict
  2030. def dumplicate_fianl_check(self,base_list,b_log=False):
  2031. the_group = base_list
  2032. # the_group.sort(key=lambda x:x["confidence"],reverse=True)
  2033. the_group.sort(key=lambda x:(x["confidence"],-x['docid']),reverse=True)
  2034. _index = 0
  2035. base_fingerprint = "None"
  2036. if len(base_list)>0:
  2037. base_fingerprint = base_list[0]["fingerprint"]
  2038. final_group = []
  2039. for _i in range(len(base_list)):
  2040. _dict1 = base_list[_i]
  2041. fingerprint_less = _dict1["fingerprint"]
  2042. _pass = True
  2043. if fingerprint_less==base_fingerprint:
  2044. _index = _i
  2045. final_group.append(_dict1)
  2046. continue
  2047. for _dict2 in final_group:
  2048. _prob,day_dis = self.dumplicate_check(_dict1,_dict2,_dict1.get("min_counts",10),b_log=b_log)
  2049. if _prob<=0.1:
  2050. _pass = False
  2051. # print('final check error',_dict1['docid'])
  2052. break
  2053. log("checking index:%d %s %.2f"%(_i,str(_pass),_prob))
  2054. _index = _i
  2055. if _pass:
  2056. final_group.append(_dict1)
  2057. # else:
  2058. # break
  2059. return final_group
  2060. def dumplicate_check(self,_dict1,_dict2,min_counts,b_log=False):
  2061. document_less = _dict1
  2062. docid_less = _dict1["docid"]
  2063. docchannel_less = document_less.get("docchannel",0)
  2064. page_time_less = document_less.get("page_time")
  2065. doctitle_refine_less = document_less["doctitle_refine"]
  2066. project_codes_less = document_less.get("project_codes")
  2067. nlp_enterprise_less = document_less["nlp_enterprise"]
  2068. tenderee_less = document_less.get("tenderee","")
  2069. agency_less = document_less.get("agency")
  2070. win_tenderer_less = document_less["win_tenderer"]
  2071. bidding_budget_less = document_less["bidding_budget"]
  2072. win_bid_price_less = document_less["win_bid_price"]
  2073. product_less = document_less.get("product")
  2074. package_less = document_less.get("package")
  2075. json_time_less = document_less.get("dict_time")
  2076. project_name_less = document_less.get("project_name")
  2077. fingerprint_less = document_less.get("fingerprint")
  2078. extract_count_less = document_less.get("extract_count",0)
  2079. web_source_no_less = document_less.get("web_source_no")
  2080. province_less = document_less.get("province")
  2081. city_less = document_less.get("city")
  2082. district_less = document_less.get("district")
  2083. moneys_less = document_less.get("moneys")
  2084. moneys_attachment_less = document_less.get("moneys_attachment")
  2085. page_attachments_less = document_less.get("page_attachments","[]")
  2086. punish_less = document_less.get("punish",{})
  2087. approval_less = document_less.get("approval",[])
  2088. source_type_less = document_less.get("source_type")
  2089. document_greater = _dict2
  2090. docid_greater = _dict2["docid"]
  2091. page_time_greater = document_greater["page_time"]
  2092. docchannel_greater = document_greater.get("docchannel",0)
  2093. doctitle_refine_greater = document_greater.get("doctitle_refine","")
  2094. project_codes_greater = document_greater["project_codes"]
  2095. nlp_enterprise_greater = document_greater["nlp_enterprise"]
  2096. tenderee_greater = document_greater.get("tenderee","")
  2097. agency_greater = document_greater.get("agency","")
  2098. win_tenderer_greater = document_greater["win_tenderer"]
  2099. bidding_budget_greater = document_greater["bidding_budget"]
  2100. win_bid_price_greater = document_greater["win_bid_price"]
  2101. product_greater = document_greater.get("product")
  2102. package_greater = document_greater.get("package")
  2103. json_time_greater = document_greater["dict_time"]
  2104. project_name_greater = document_greater.get("project_name")
  2105. fingerprint_greater = document_greater.get("fingerprint")
  2106. extract_count_greater = document_greater.get("extract_count",0)
  2107. web_source_no_greater = document_greater.get("web_source_no")
  2108. province_greater = document_greater.get("province")
  2109. city_greater = document_greater.get("city")
  2110. district_greater = document_greater.get("district")
  2111. moneys_greater = document_greater.get("moneys")
  2112. moneys_attachment_greater = document_greater.get("moneys_attachment")
  2113. page_attachments_greater = document_greater.get("page_attachments","[]")
  2114. punish_greater = document_greater.get("punish",{})
  2115. approval_greater = document_greater.get("approval",[])
  2116. source_type_greater = document_greater.get("source_type")
  2117. hard_level=1
  2118. if docchannel_less==docchannel_greater==302:
  2119. hard_level=2
  2120. if web_source_no_less==web_source_no_greater=="17397-3":
  2121. hard_level=2
  2122. if self.check_rule==1:
  2123. _prob = check_dumplicate_rule(document_less,document_greater,min_counts,b_log=b_log,hard_level=hard_level)
  2124. else:
  2125. _prob = check_dumplicate_rule_test(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=b_log,hard_level=hard_level,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater)
  2126. pagetime_stamp_less = getTimeStamp(page_time_less)
  2127. pagetime_stamp_greater = getTimeStamp(page_time_greater)
  2128. day_dis = abs(pagetime_stamp_greater-pagetime_stamp_less)//86400
  2129. if document_less.get("is_special_bonds",0)==document_greater.get("is_special_bonds",0)==1:
  2130. pass
  2131. else:
  2132. if day_dis>7:
  2133. _prob = 0
  2134. elif day_dis>3:
  2135. if _prob<0.4:
  2136. _prob = 0
  2137. return _prob,day_dis
    def dumplicate_check_bak(self,_dict1,_dict2,min_counts,b_log=False):
        """Legacy duplicate check kept as a backup (superseded by dumplicate_check).

        _dict1/_dict2: normalized document dicts (see post_extract).
        min_counts: candidate-group size; smaller groups earn a higher base
        probability.
        b_log: emit diagnostic logging for each failed sub-check.
        Returns a probability in [0,1]; identical fingerprints return 1.
        """
        document_less = _dict1
        docid_less = _dict1["docid"]
        docchannel_less = document_less["docchannel"]
        page_time_less = document_less["page_time"]
        doctitle_refine_less = document_less["doctitle_refine"]
        project_codes_less = document_less["project_codes"]
        nlp_enterprise_less = document_less["nlp_enterprise"]
        tenderee_less = document_less["tenderee"]
        agency_less = document_less["agency"]
        win_tenderer_less = document_less["win_tenderer"]
        bidding_budget_less = document_less["bidding_budget"]
        win_bid_price_less = document_less["win_bid_price"]
        product_less = document_less["product"]
        package_less = document_less["package"]
        json_time_less = document_less["dict_time"]
        project_name_less = document_less["project_name"]
        fingerprint_less = document_less["fingerprint"]
        extract_count_less = document_less["extract_count"]
        document_greater = _dict2
        docid_greater = _dict2["docid"]
        page_time_greater = document_greater["page_time"]
        doctitle_refine_greater = document_greater["doctitle_refine"]
        project_codes_greater = document_greater["project_codes"]
        nlp_enterprise_greater = document_greater["nlp_enterprise"]
        tenderee_greater = document_greater["tenderee"]
        agency_greater = document_greater["agency"]
        win_tenderer_greater = document_greater["win_tenderer"]
        bidding_budget_greater = document_greater["bidding_budget"]
        win_bid_price_greater = document_greater["win_bid_price"]
        product_greater = document_greater["product"]
        package_greater = document_greater["package"]
        json_time_greater = document_greater["dict_time"]
        project_name_greater = document_greater["project_name"]
        fingerprint_greater = document_greater["fingerprint"]
        extract_count_greater = document_greater["extract_count"]
        # identical content fingerprint: certain duplicate
        if fingerprint_less==fingerprint_greater:
            return 1
        # count how many of 8 key attributes agree
        same_count = 0
        all_count = 8
        if len(set(project_codes_less) & set(project_codes_greater))>0:
            same_count += 1
        if getLength(tenderee_less)>0 and tenderee_less==tenderee_greater:
            same_count += 1
        if getLength(agency_less)>0 and agency_less==agency_greater:
            same_count += 1
        if getLength(win_tenderer_less)>0 and win_tenderer_less==win_tenderer_greater:
            same_count += 1
        if getLength(bidding_budget_less)>0 and bidding_budget_less==bidding_budget_greater:
            same_count += 1
        if getLength(win_bid_price_less)>0 and win_bid_price_less==win_bid_price_greater:
            same_count += 1
        if getLength(project_name_less)>0 and project_name_less==project_name_greater:
            same_count += 1
        if getLength(doctitle_refine_less)>0 and (doctitle_refine_less==doctitle_refine_greater or doctitle_refine_less in doctitle_refine_greater or doctitle_refine_greater in doctitle_refine_less):
            same_count += 1
        # smaller candidate groups are trusted more
        base_prob = 0
        if min_counts<3:
            base_prob = 0.9
        elif min_counts<5:
            base_prob = 0.8
        elif min_counts<8:
            base_prob = 0.7
        else:
            base_prob = 0.6
        _prob = base_prob*same_count/all_count
        # poorly extracted documents get a floor just above the cut-off
        if _prob<0.1 and min(extract_count_less,extract_count_greater)<=3:
            _prob = 0.15
        if _prob<0.1:
            return _prob
        # per-check codes below: 0 = failed, 1 = neutral/missing data, 2 = strong match
        check_result = {"pass":1}
        if docchannel_less in (51,102,103,104,115,116,117):
            if doctitle_refine_less!=doctitle_refine_greater:
                if page_time_less!=page_time_greater:
                    check_result["docchannel"] = 0
                    check_result["pass"] = 0
                else:
                    check_result["docchannel"] = 2
        if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,project_codes_less,project_codes_greater,page_time_less,page_time_greater):
            check_result["doctitle"] = 0
            check_result["pass"] = 0
            if b_log:
                logging.info("%d-%d,check_doctitle_failed:%s==%s"%(docid_less,docid_greater,str(doctitle_refine_less),str(doctitle_refine_greater)))
        else:
            check_result["doctitle"] = 2
        #added check
        if not check_codes(project_codes_less,project_codes_greater):
            check_result["code"] = 0
            check_result["pass"] = 0
            if b_log:
                logging.info("%d-%d,check_code_failed:%s==%s"%(docid_less,docid_greater,str(project_codes_less),str(project_codes_greater)))
        else:
            if getLength(project_codes_less)>0 and getLength(project_codes_greater)>0 and len(set(project_codes_less) & set(project_codes_greater))>0:
                check_result["code"] = 2
            else:
                check_result["code"] = 1
        if not check_product(product_less,product_greater,doctitle_refine_less,doctitle_refine_greater):
            check_result["product"] = 0
            check_result["pass"] = 0
            if b_log:
                logging.info("%d-%d,check_product_failed:%s==%s"%(docid_less,docid_greater,str(product_less),str(product_greater)))
        else:
            if getLength(product_less)>0 and getLength(product_greater)>0:
                check_result["product"] = 2
            else:
                check_result["product"] = 1
        # NOTE(review): check_demand is called with no arguments — confirm
        # its signature; as written it cannot depend on these documents
        if not check_demand():
            check_result["pass"] = 0
        if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
                            tenderee_less,tenderee_greater,
                            agency_less,agency_greater,
                            win_tenderer_less,win_tenderer_greater):
            check_result["entity"] = 0
            check_result["pass"] = 0
            if b_log:
                logging.info("%d-%d,check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(docid_less,docid_greater,str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
        else:
            if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
                check_result["entity"] = 2
            elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
                check_result["entity"] = 2
            else:
                check_result["entity"] = 1
        if not check_money(bidding_budget_less,bidding_budget_greater,
                           win_bid_price_less,win_bid_price_greater):
            if b_log:
                logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
            check_result["money"] = 0
            check_result["pass"] = 0
        else:
            if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
                check_result["money"] = 2
            elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
                check_result["money"] = 2
            else:
                check_result["money"] = 1
        #added check
        if not check_package(package_less,package_greater):
            if b_log:
                logging.info("%d-%d,check_package_failed:%s==%s"%(docid_less,docid_greater,str(package_less),str(package_greater)))
            check_result["package"] = 0
            check_result["pass"] = 0
        else:
            if getLength(package_less)>0 and getLength(package_greater)>0:
                check_result["package"] = 2
            else:
                check_result["package"] = 1
        #added check
        if not check_time(json_time_less,json_time_greater):
            if b_log:
                logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
                # diagnostic only: dump which time key disagrees
                if isinstance(json_time_less,dict):
                    time_less = json_time_less
                else:
                    time_less = json.loads(json_time_less)
                if isinstance(json_time_greater,dict):
                    time_greater = json_time_greater
                else:
                    time_greater = json.loads(json_time_greater)
                for k,v in time_less.items():
                    if getLength(v)>0:
                        v1 = time_greater.get(k,"")
                        if getLength(v1)>0:
                            if v!=v1:
                                log("%d-%d,key:%s"%(docid_less,docid_greater,str(k)))
            check_result["time"] = 0
            check_result["pass"] = 0
        else:
            if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
                check_result["time"] = 2
            else:
                check_result["time"] = 1
        # final verdict: a failed pass can still be rescued when every
        # strong signal (entity/code/doctitle/product/money) matched
        if check_result.get("pass",0)==0:
            if b_log:
                logging.info(str(check_result))
            if check_result.get("money",1)==0:
                return 0
            if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2 and check_result.get("money",0)==2:
                return _prob
            else:
                return 0
        if check_result.get("time",1)==0:
            return 0
        return _prob
  2322. def search_data_by_query(self,item,_query,confidence,retry_times=3,merge=False,table_name="document_tmp",table_index="document_tmp_index",sort_column="docid",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count],b_log=False):
  2323. for _ in range(retry_times):
  2324. try:
  2325. _time = time.time()
  2326. check_time = 0
  2327. if isinstance(_query,list):
  2328. bool_query = BoolQuery(should_queries=_query)
  2329. else:
  2330. bool_query = _query
  2331. rows,next_token,total_count,is_all_succeed = self.ots_client.search(table_name,table_index,
  2332. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(sort_column)]),limit=100,get_total_count=True),
  2333. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  2334. list_dict = getRow_ots(rows)
  2335. list_data = []
  2336. set_docids = set([doc.get(document_tmp_docid) for doc in list_dict])
  2337. current_date = getCurrent_date("%Y-%m-%d")
  2338. page_time = item.get("page_time","")
  2339. crtime = item.get("crtime","")[:10]
  2340. if page_time == '':
  2341. page_time = current_date
  2342. if crtime == '':
  2343. crtime = current_date
  2344. # 新爬取的历史数据去重时,document表无数据,补充document_tmp表的数据
  2345. if table_name=='document' and page_time<timeAdd(current_date,-7) and crtime>=timeAdd(current_date,-7):
  2346. rows, next_token, total_count, is_all_succeed = self.ots_client.search("document_tmp", "document_tmp_index",
  2347. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(sort_column)]),limit=100,get_total_count=True),
  2348. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  2349. list_dict_add = getRow_ots(rows)
  2350. for doc in list_dict_add:
  2351. if doc.get(document_tmp_docid) not in set_docids:
  2352. list_dict.append(doc)
  2353. for _dict in list_dict:
  2354. self.post_extract(_dict)
  2355. _docid = _dict.get(document_tmp_docid)
  2356. if merge:
  2357. list_data.append(_dict)
  2358. else:
  2359. if _docid!=item.get(document_tmp_docid):
  2360. _time1 = time.time()
  2361. confidence,day_dis = self.dumplicate_check(item,_dict,total_count,b_log=b_log)
  2362. check_time+= time.time()-_time1
  2363. _dict["confidence"] = confidence
  2364. _dict["min_counts"] = total_count
  2365. list_data.append(_dict)
  2366. all_time = time.time()-_time
  2367. # log("check:%d rows takes%.4f,check%.4f"%(len(list_dict),all_time-check_time,check_time))
  2368. return list_data
  2369. except Exception as e:
  2370. traceback.print_exc()
  2371. return []
  2372. def add_data_by_query(self,item,base_list,set_docid,_query,confidence,table_name,table_index,singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_save,document_tmp_status,document_tmp_product,document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count],b_log=False):
  2373. list_dict = self.search_data_by_query(item,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,notlike_keys=notlike_keys,columns=columns,b_log=b_log)
  2374. for _dict in list_dict:
  2375. _docid = _dict.get(document_tmp_docid)
  2376. confidence = _dict["confidence"]
  2377. if b_log:
  2378. log("confidence %d %.3f total_count %d"%(_docid,confidence,_dict.get('min_counts',0)))
  2379. if confidence>0.1:
  2380. if _docid not in set_docid:
  2381. base_list.append(_dict)
  2382. set_docid.add(_docid)
  2383. set_docid.add(_docid)
  2384. def appendRule(self,list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=False):
  2385. for k,v in _dict.items():
  2386. if getLength(v)==0:
  2387. return
  2388. _dict.update(base_dict)
  2389. if b_log:
  2390. log("rule dict:"+str(_dict))
  2391. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  2392. _rule = {"confidence":confidence,
  2393. "item":item,
  2394. "query":_query,
  2395. "singleNum_keys":[],
  2396. "contain_keys":[],
  2397. "multiNum_keys":[],
  2398. "_dict":_dict}
  2399. list_rules.append(_rule)
  2400. def translate_dumplicate_rules(self,status_from,item,get_all=False,to_log=False,day_dis=7,table_name ="document_tmp",table_index="document_tmp_index"):
  2401. docchannel,project_code,project_name,tenderee,agency,doctitle,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product = self.get_dump_columns(item)
  2402. current_date = getCurrent_date("%Y-%m-%d")
  2403. if page_time=='':
  2404. page_time = current_date
  2405. two_day_dict = {"page_time":[timeAdd(page_time,-7),timeAdd(page_time,7)]}
  2406. if table_name in {"document_tmp","document"}:
  2407. # if page_time>=timeAdd(current_date,-7) and item.get("is_special_bonds")!=1:
  2408. if page_time>=timeAdd(current_date,-7) and item.get("is_special_bonds")!=1 and not get_all:
  2409. table_name = "document_tmp"
  2410. table_index = "document_tmp_index"
  2411. base_dict = {
  2412. "docchannel":item.get("docchannel",52),
  2413. "status":[status_from[0]],
  2414. "page_time":[timeAdd(page_time,-day_dis),timeAdd(page_time,day_dis)]
  2415. }
  2416. must_not_dict = {"save":0,"docid":item.get("docid")}
  2417. doctitle_refine_name = "doctitle_refine"
  2418. else:
  2419. table_name = "document"
  2420. table_index = "document_index"
  2421. if get_all:
  2422. _status = [0,450]
  2423. else:
  2424. _status = [0,300]
  2425. base_dict = {
  2426. "docchannel":item["docchannel"],
  2427. "status":_status,
  2428. "page_time":[timeAdd(page_time,-day_dis),timeAdd(page_time,day_dis)]
  2429. }
  2430. must_not_dict = {"docid":item.get("docid")}
  2431. doctitle_refine_name = "doctitle"
  2432. doctitle_refine = doctitle
  2433. else:
  2434. _status = [201,300]
  2435. base_dict = {
  2436. "docchannel":item["docchannel"],
  2437. "status":_status,
  2438. "page_time":[timeAdd(page_time,-day_dis),timeAdd(page_time,day_dis)]
  2439. }
  2440. must_not_dict = {"docid":item.get("docid")}
  2441. doctitle_refine_name = "doctitle"
  2442. doctitle_refine = doctitle
  2443. list_rules = []
  2444. singleNum_keys = ["tenderee","win_tenderer"]
  2445. confidence = 100
  2446. self.appendRule(list_rules,{document_tmp_fingerprint:fingerprint},base_dict,must_not_dict,confidence,item,b_log=to_log)
  2447. confidence = 90
  2448. _dict = {document_tmp_agency:agency,
  2449. "win_tenderer":win_tenderer,
  2450. "win_bid_price":win_bid_price}
  2451. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2452. _dict = {document_tmp_agency:agency,
  2453. "win_tenderer":win_tenderer,
  2454. "bidding_budget":bidding_budget}
  2455. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2456. _dict = {document_tmp_agency:agency,
  2457. "win_bid_price":win_bid_price,
  2458. "bidding_budget":bidding_budget}
  2459. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2460. _dict = {win_tenderer:win_tenderer,
  2461. "win_bid_price":win_bid_price,
  2462. "bidding_budget":bidding_budget}
  2463. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2464. _dict = {"tenderee":tenderee,
  2465. "win_tenderer":win_tenderer,
  2466. "win_bid_price":win_bid_price}
  2467. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2468. _dict = {"tenderee":tenderee,
  2469. "win_tenderer":win_tenderer,
  2470. "bidding_budget":bidding_budget}
  2471. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2472. _dict = {"tenderee":tenderee,
  2473. "win_bid_price":win_bid_price,
  2474. "bidding_budget":bidding_budget}
  2475. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2476. _dict = {"tenderee":tenderee,
  2477. "agency":agency,
  2478. "win_tenderer":win_tenderer}
  2479. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2480. _dict = {"tenderee":tenderee,
  2481. "agency":agency,
  2482. "win_bid_price":win_bid_price}
  2483. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2484. _dict = {"tenderee":tenderee,
  2485. "agency":agency,
  2486. "bidding_budget":bidding_budget}
  2487. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2488. _dict = {"tenderee":tenderee,
  2489. "project_codes":project_code
  2490. }
  2491. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2492. _dict = {"tenderee":tenderee,
  2493. "win_bid_price":win_bid_price
  2494. }
  2495. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2496. _dict = {"agency":agency,
  2497. "project_codes":project_code
  2498. }
  2499. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2500. _dict = {"win_tenderer":win_tenderer,
  2501. "bidding_budget":bidding_budget
  2502. }
  2503. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2504. _dict = {"project_codes":project_code,
  2505. "win_bid_price":win_bid_price
  2506. }
  2507. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2508. _dict = {"project_codes":project_code,
  2509. "bidding_budget":bidding_budget
  2510. }
  2511. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2512. _dict = {"project_codes":project_code,
  2513. doctitle_refine_name:doctitle_refine
  2514. }
  2515. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2516. _dict = {"tenderee":tenderee,
  2517. "bidding_budget":bidding_budget
  2518. }
  2519. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2520. _dict = {"project_codes":project_code,
  2521. "win_tenderer":win_tenderer
  2522. }
  2523. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2524. base_dict.update(two_day_dict)
  2525. confidence=85
  2526. _dict = {"tenderee":tenderee,
  2527. "agency":agency
  2528. }
  2529. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2530. _dict = {"tenderee":tenderee,
  2531. "project_name":project_name
  2532. }
  2533. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2534. if getLength(product)>0:
  2535. l_p = product.split(",")
  2536. _dict = {"tenderee":tenderee,
  2537. "product":l_p[0]
  2538. }
  2539. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2540. _dict = {"tenderee":tenderee,
  2541. "win_tenderer":win_tenderer
  2542. }
  2543. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2544. _dict = {"tenderee":tenderee,
  2545. doctitle_refine_name:doctitle_refine
  2546. }
  2547. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2548. _dict = {"agency":agency,
  2549. "project_name":project_name
  2550. }
  2551. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2552. _dict = {"project_codes":project_code,
  2553. "project_name":project_name
  2554. }
  2555. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2556. _dict = {"project_name":project_name,
  2557. "win_tenderer":win_tenderer
  2558. }
  2559. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2560. _dict = {"project_name":project_name,
  2561. "win_bid_price":win_bid_price
  2562. }
  2563. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2564. _dict = {"project_name":project_name,
  2565. "bidding_budget":bidding_budget
  2566. }
  2567. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2568. _dict = {"project_name":project_name,
  2569. doctitle_refine_name:doctitle_refine
  2570. }
  2571. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2572. _dict = {"win_tenderer":win_tenderer,
  2573. "win_bid_price":win_bid_price
  2574. }
  2575. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2576. _dict = {"win_tenderer":win_tenderer,
  2577. doctitle_refine_name:doctitle_refine
  2578. }
  2579. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2580. _dict = {"win_bid_price":win_bid_price,
  2581. "bidding_budget":bidding_budget
  2582. }
  2583. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2584. confidence=80
  2585. _dict = {"project_codes":project_code}
  2586. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2587. _dict = {"win_bid_price":win_bid_price,
  2588. doctitle_refine_name:doctitle_refine
  2589. }
  2590. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2591. _dict = {"bidding_budget":bidding_budget,
  2592. doctitle_refine_name:doctitle_refine
  2593. }
  2594. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2595. confidence=80
  2596. _dict = {doctitle_refine_name:doctitle_refine}
  2597. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2598. # 专项债
  2599. if item.get("is_special_bonds")==1:
  2600. confidence = 90
  2601. _dict = {doctitle_refine_name: doctitle_refine,
  2602. document_tmp_web_source_name:"专项债券信息网"}
  2603. tmp_base_dict = {
  2604. "docchannel": item["docchannel"],
  2605. "status": [0, 450],
  2606. # "page_time": [timeAdd(page_time, -365), timeAdd(page_time, 365)]
  2607. }
  2608. self.appendRule(list_rules, _dict, tmp_base_dict, must_not_dict, confidence, item, b_log=to_log)
  2609. confidence=70
  2610. _dict = {"project_name":project_name}
  2611. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
  2612. return list_rules,table_name,table_index
  2613. def producer_flow_dumplicate(self,process_count,status_from,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type,"detail_link",'products','crtime']):
  2614. q_size = self.queue_dumplicate.qsize()
  2615. log("dumplicate queue size %d"%(q_size))
  2616. while 1:
  2617. try:
  2618. docid = self.queue_dumplicate_processed.get(block=False)
  2619. if docid in self.dumplicate_set:
  2620. self.dumplicate_set.remove(docid)
  2621. except Exception as e:
  2622. break
  2623. if q_size>process_count//3:
  2624. return
  2625. bool_query = BoolQuery(must_queries=[
  2626. RangeQuery(document_tmp_status,*status_from,True,True),
  2627. # TermQuery("docid",271983871)
  2628. ])
  2629. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  2630. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(document_update_document,SortOrder.DESC),FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
  2631. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  2632. log("flow_dumplicate producer total_count:%d"%total_count)
  2633. list_dict = getRow_ots(rows)
  2634. for _dict in list_dict:
  2635. docid = _dict.get(document_tmp_docid)
  2636. if docid in self.dumplicate_set:
  2637. continue
  2638. self.dumplicate_set.add(docid)
  2639. self.queue_dumplicate.put(_dict)
  2640. _count = len(list_dict)
  2641. while next_token and _count<process_count:
  2642. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  2643. SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  2644. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  2645. list_dict = getRow_ots(rows)
  2646. for _dict in list_dict:
  2647. docid = _dict.get(document_tmp_docid)
  2648. if docid in self.dumplicate_set:
  2649. continue
  2650. self.dumplicate_set.add(docid)
  2651. self.queue_dumplicate.put(_dict)
  2652. _count += len(list_dict)
  2653. # _l = list(self.dumplicate_set)
  2654. # _l.sort(key=lambda x:x,reverse=True)
  2655. # self.dumplicate_set = set(_l[:flow_process_count*2])
  2656. def comsumer_flow_dumplicate(self):
  2657. mt = MultiThreadHandler(self.queue_dumplicate,self.dumplicate_comsumer_handle,None,60,1,ots_client=self.ots_client)
  2658. mt.run()
  2659. def flow_dumplicate(self,process_count=flow_process_count,status_from=flow_dumplicate_status_from):
  2660. self.producer_flow_dumplicate(process_count=process_count,status_from=status_from)
  2661. # self.comsumer_flow_dumplicate()
  2662. def flow_dumpcate_comsumer(self):
  2663. from multiprocessing import Process
  2664. process_count = 6
  2665. thread_count = 12
  2666. list_process = []
  2667. def start_thread():
  2668. mt = MultiThreadHandler(self.queue_dumplicate,self.dumplicate_comsumer_handle,None,thread_count,1,need_stop=False,restart=True,timeout=600,ots_client=self.ots_client)
  2669. mt.run()
  2670. for _ in range(process_count):
  2671. p = Process(target=start_thread)
  2672. list_process.append(p)
  2673. for p in list_process:
  2674. p.start()
  2675. while 1:
  2676. for _i in range(len(list_process)):
  2677. p = list_process[_i]
  2678. if not p.is_alive():
  2679. p = Process(target=start_thread)
  2680. list_process[_i] = p
  2681. p.start()
  2682. time.sleep(1)
  2683. # mt = MultiThreadHandler(self.queue_dumplicate,self.dumplicate_comsumer_handle,None,40,1,ots_client=self.ots_client)
  2684. # mt.run()
  2685. def search_docs(self,list_docids,columns_to_get = [document_doctitle,document_tmp_save,document_bidway,document_status,document_page_time,document_info_source,document_fingerprint,document_docchannel,document_life_docchannel,document_area,document_province,document_city,document_district,document_tmp_sub_docs_json,document_industry,document_info_type,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_project_codes,document_product,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count,document_nlp_enterprise,document_nlp_enterprise_attachment,document_tenderee_code,document_agency_code,document_candidates],document_name="document"):
  2686. '''
  2687. 根据docid查询公告内容,先查询document_tmp,再查询document
  2688. :param list_docids:
  2689. :return:
  2690. '''
  2691. list_docs = []
  2692. set_fingerprint = set()
  2693. for _docid in list_docids:
  2694. docid = int(_docid)
  2695. _dict = {document_partitionkey:getPartitionKey(docid),
  2696. document_docid:docid}
  2697. if document_name in {"document","document_tmp"}:
  2698. _doc = Document_tmp(_dict)
  2699. _exists = _doc.fix_columns(self.ots_client,columns_to_get,True)
  2700. if not _exists:
  2701. _doc = Document(_dict)
  2702. _exists = _doc.fix_columns(self.ots_client,columns_to_get,True)
  2703. else:
  2704. _doc = Document(_dict)
  2705. _doc.table_name = document_name
  2706. _exists = _doc.fix_columns(self.ots_client,columns_to_get,True)
  2707. if _exists:
  2708. _fingerprint = _doc.getProperties().get(document_fingerprint)
  2709. if _fingerprint in set_fingerprint:
  2710. continue
  2711. set_fingerprint.add(_fingerprint)
  2712. list_docs.append(_doc)
  2713. for _doc in list_docs:
  2714. try:
  2715. _sub_docs_json = _doc.getProperties().get(document_tmp_sub_docs_json)
  2716. if _sub_docs_json is not None:
  2717. _doc.setValue("sub_docs",json.loads(_sub_docs_json),False)
  2718. except Exception as e:
  2719. traceback.print_exc()
  2720. list_docs.sort(key=lambda x:x.getProperties().get(document_docid,""))
  2721. list_docs.sort(key=lambda x:x.getProperties().get(document_page_time,""))
  2722. return list_docs
  2723. def is_same_package(self,_dict1,_dict2):
  2724. sub_project_name1 = _dict1.get(project_sub_project_name,"")
  2725. if sub_project_name1=="Project":
  2726. sub_project_name1 = ""
  2727. win_tenderer1 = _dict1.get(project_win_tenderer,"")
  2728. win_bid_price1 = _dict1.get(project_win_bid_price,0)
  2729. bidding_budget1 = _dict1.get(project_bidding_budget,0)
  2730. sub_project_name2 = _dict2.get(project_sub_project_name,"")
  2731. if sub_project_name2=="Project":
  2732. sub_project_name2 = ""
  2733. win_tenderer2 = _dict2.get(project_win_tenderer,"")
  2734. win_bid_price2 = _dict2.get(project_win_bid_price,0)
  2735. bidding_budget2 = _dict2.get(project_bidding_budget,0)
  2736. _set = set([a for a in [sub_project_name1,sub_project_name2] if a!=""])
  2737. if len(_set)>1:
  2738. return False
  2739. _set = set([a for a in [win_tenderer1,win_tenderer2] if a!=""])
  2740. if len(_set)>1:
  2741. return False
  2742. _set = set([a for a in [win_bid_price1,win_bid_price2] if a!=0])
  2743. if len(_set)>1:
  2744. return False
  2745. _set = set([a for a in [bidding_budget1,bidding_budget2] if a!=0])
  2746. if len(_set)>1:
  2747. return False
  2748. return True
  2749. def getUpdate_dict(self,_dict):
  2750. update_dict = {}
  2751. for k,v in _dict.items():
  2752. if v is None:
  2753. continue
  2754. if isinstance(v,str):
  2755. if v=="":
  2756. continue
  2757. if isinstance(v,(float,int)):
  2758. if v==0:
  2759. continue
  2760. update_dict[k] = v
  2761. return update_dict
  2762. def update_projects_by_document(self,docid,save,projects,document_name="document"):
  2763. '''
  2764. 更新projects中对应的document的属性
  2765. :param docid:
  2766. :param projects: 项目集合
  2767. :param action:add/delete add时附加唯一属性,delete时删除唯一属性
  2768. :return:
  2769. '''
  2770. list_docs = self.search_docs([docid],document_name=document_name)
  2771. docs = [_doc.getProperties() for _doc in list_docs]
  2772. project_dict = generate_common_properties(docs)
  2773. list_package_properties = generate_packages_properties(docs)
  2774. _dict = {}
  2775. #更新公共属性
  2776. _replace_replace = False
  2777. v = project_dict.get(document_district,"")
  2778. if not (v is None or v=="" or v=="[]" or v=="未知"):
  2779. _replace_replace = True
  2780. for k,v in project_dict.items():
  2781. if not _replace_replace:
  2782. if k in [document_district,document_city,document_province,document_area]:
  2783. continue
  2784. if v is None or v=="" or v=="[]" or v=="未知":
  2785. continue
  2786. if k in (project_project_dynamics,project_product,project_project_codes,project_docids,project_candidates,project_zhong_biao_page_time,project_zhao_biao_page_time,project_page_time,project_docchannel):
  2787. continue
  2788. _dict[k] = v
  2789. for _proj in projects:
  2790. _proj.update(_dict)
  2791. for _proj in projects:
  2792. if _proj.get(project_page_time,"")<=project_dict.get(project_page_time,""):
  2793. _proj[project_page_time] = project_dict.get(project_page_time,"")
  2794. _proj[project_docchannel] = project_dict.get(project_docchannel,"")
  2795. else:
  2796. if project_docchannel in project_dict:
  2797. project_dict.pop(project_docchannel)
  2798. if not _proj.get(project_zhong_biao_page_time,""):
  2799. _proj[project_zhong_biao_page_time] = project_dict.get(project_zhong_biao_page_time, "")
  2800. elif _proj.get(project_zhong_biao_page_time,"")>project_dict.get(project_zhong_biao_page_time,""):
  2801. if project_dict.get(project_zhong_biao_page_time,""):
  2802. _proj[project_zhong_biao_page_time] = project_dict.get(project_zhong_biao_page_time,"")
  2803. if not _proj.get(project_zhao_biao_page_time,""):
  2804. _proj[project_zhao_biao_page_time] = project_dict.get(project_zhao_biao_page_time, "")
  2805. elif _proj.get(project_zhao_biao_page_time,"")>project_dict.get(project_zhao_biao_page_time,""):
  2806. if project_dict.get(project_zhao_biao_page_time,""):
  2807. _proj[project_zhao_biao_page_time] = project_dict.get(project_zhao_biao_page_time,"")
  2808. for _proj in projects:
  2809. #拼接属性
  2810. append_dict = {}
  2811. set_docid = set()
  2812. set_product = set()
  2813. set_code = set()
  2814. set_nlp_enterprise = set()
  2815. set_nlp_enterprise_attachment = set()
  2816. set_candidates = set()
  2817. _docids = _proj.get(project_docids,"")
  2818. _codes = _proj.get(project_project_codes,"")
  2819. _product = _proj.get(project_product,"")
  2820. set_docid = set(_docids.split(","))
  2821. if save==1:
  2822. set_docid.add(str(docid))
  2823. else:
  2824. if str(docid) in set_docid:
  2825. set_docid.remove(str(docid))
  2826. set_code = set_code | set(_codes.split(","))
  2827. set_product = set_product | set(_product.split(","))
  2828. try:
  2829. set_nlp_enterprise |= set(json.loads(_proj.get(project_nlp_enterprise,"[]")))
  2830. set_nlp_enterprise_attachment |= set(json.loads(_proj.get(project_nlp_enterprise_attachment,"[]")))
  2831. list_candidates = json.loads(project_dict.get(project_candidates,"[]"))
  2832. for item in list_candidates:
  2833. if item.get("name") is not None and item.get("name") not in set_candidates:
  2834. set_candidates.add(item.get("name"))
  2835. set_code = set_code | set(project_dict.get(project_project_codes,"").split(","))
  2836. set_product = set_product | set(project_dict.get(project_product,"").split(","))
  2837. set_nlp_enterprise |= set(json.loads(project_dict.get(project_nlp_enterprise,"[]")))
  2838. set_nlp_enterprise_attachment |= set(json.loads(project_dict.get(project_nlp_enterprise_attachment,"[]")))
  2839. for item in json.loads(_proj.get(project_candidates,"[]")):
  2840. if item.get("name") is not None and item.get("name") not in set_candidates:
  2841. set_candidates.add(item.get("name"))
  2842. list_candidates.append(item)
  2843. except Exception as e:
  2844. pass
  2845. append_dict[project_docids] = ",".join([a for a in list(set_docid) if a!=""])
  2846. append_dict[project_docid_number] = len(set_docid)
  2847. append_dict[project_project_codes] = ",".join([a for a in list(set_code) if a!=""])
  2848. append_dict[project_product] = ",".join([a for a in list(set_product) if a!=""])
  2849. append_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
  2850. append_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)
  2851. append_dict[project_candidates] = json.dumps(list_candidates,ensure_ascii=False)
  2852. dict_dynamic = {}
  2853. set_docid = set()
  2854. _dynamic = json.loads(_proj.get(project_project_dynamics,"[]"))
  2855. for _dy in _dynamic:
  2856. _docid = _dy.get("docid")
  2857. dict_dynamic[_docid] = _dy
  2858. _dynamic = json.loads(project_dict.get(project_project_dynamics,"[]"))
  2859. for _dy in _dynamic:
  2860. _docid = _dy.get("docid")
  2861. dict_dynamic[_docid] = _dy
  2862. list_dynamics = []
  2863. for k,v in dict_dynamic.items():
  2864. list_dynamics.append(v)
  2865. list_dynamics.sort(key=lambda x:x.get(document_page_time,""))
  2866. append_dict[project_project_dynamics] = json.dumps(list_dynamics[:100],ensure_ascii=False)
  2867. _proj.update(append_dict)
  2868. dict_package = {}
  2869. for _pp in projects:
  2870. _counts = 0
  2871. sub_project_name = _pp.get(project_sub_project_name,"")
  2872. if sub_project_name=="Project":
  2873. sub_project_name = ""
  2874. win_tenderer = _pp.get(project_win_tenderer,"")
  2875. win_bid_price = _pp.get(project_win_bid_price,0)
  2876. bidding_budget = _pp.get(project_bidding_budget,0)
  2877. if win_tenderer!="" and bidding_budget!=0:
  2878. _key = "%s-%s-%s"%(sub_project_name,str(win_tenderer),str(bidding_budget))
  2879. dict_package[_key] = _pp
  2880. _counts += 1
  2881. if win_tenderer!="" and win_bid_price!=0:
  2882. _key = "%s-%s-%s"%(sub_project_name,win_tenderer,str(win_bid_price))
  2883. dict_package[_key] = _pp
  2884. _counts +=1
  2885. if _counts==0:
  2886. if win_tenderer!="":
  2887. _key = "%s-%s"%(sub_project_name,win_tenderer)
  2888. dict_package[_key] = _pp
  2889. _counts += 1
  2890. if bidding_budget!=0:
  2891. _key = "%s-%s"%(sub_project_name,str(bidding_budget))
  2892. dict_package[_key] = _pp
  2893. _counts += 1
  2894. #更新私有属性
  2895. if len(projects)==1 and len(list_package_properties)==1:
  2896. _pp = list_package_properties[0]
  2897. pp = projects[0]
  2898. ud = self.getUpdate_dict(_pp)
  2899. self.set_project_uuid(ud,pp.get("uuid"))
  2900. pp.update(_pp)
  2901. else:
  2902. for _pp in list_package_properties:
  2903. flag_update = False
  2904. sub_project_name = _pp.get(project_sub_project_name,"")
  2905. if sub_project_name=="Project":
  2906. sub_project_name = ""
  2907. win_tenderer = _pp.get(project_win_tenderer,"")
  2908. win_bid_price = _pp.get(project_win_bid_price,0)
  2909. bidding_budget = _pp.get(project_bidding_budget,0)
  2910. if win_tenderer!="" and bidding_budget!=0:
  2911. _key = "%s-%s-%s"%(sub_project_name,str(win_tenderer),str(bidding_budget))
  2912. if _key in dict_package:
  2913. if self.is_same_package(_pp,dict_package[_key]):
  2914. ud = self.getUpdate_dict(_pp)
  2915. self.set_project_uuid(ud,dict_package[_key].get("uuid"))
  2916. dict_package[_key].update(ud)
  2917. flag_update = True
  2918. continue
  2919. if win_tenderer!="" and win_bid_price!=0:
  2920. _key = "%s-%s-%s"%(sub_project_name,win_tenderer,str(win_bid_price))
  2921. if _key in dict_package:
  2922. if self.is_same_package(_pp,dict_package[_key]):
  2923. ud = self.getUpdate_dict(_pp)
  2924. self.set_project_uuid(ud,dict_package[_key].get("uuid"))
  2925. dict_package[_key].update(ud)
  2926. flag_update = True
  2927. continue
  2928. if win_tenderer!="":
  2929. _key = "%s-%s"%(sub_project_name,win_tenderer)
  2930. if _key in dict_package:
  2931. if self.is_same_package(_pp,dict_package[_key]):
  2932. ud = self.getUpdate_dict(_pp)
  2933. self.set_project_uuid(ud,dict_package[_key].get("uuid"))
  2934. dict_package[_key].update(ud)
  2935. flag_update = True
  2936. continue
  2937. if bidding_budget!=0:
  2938. _key = "%s-%s"%(sub_project_name,str(bidding_budget))
  2939. if _key in dict_package:
  2940. if self.is_same_package(_pp,dict_package[_key]):
  2941. ud = self.getUpdate_dict(_pp)
  2942. self.set_project_uuid(ud,dict_package[_key].get("uuid"))
  2943. dict_package[_key].update(ud)
  2944. flag_update = True
  2945. continue
  2946. if not flag_update:
  2947. _pp.update(project_dict)
  2948. projects.append(_pp)
  2949. _counts = 0
  2950. if win_tenderer!="" and bidding_budget!=0:
  2951. _key = "%s-%s-%s"%(sub_project_name,str(win_tenderer),str(bidding_budget))
  2952. dict_package[_key] = _pp
  2953. _counts += 1
  2954. if win_tenderer!="" and win_bid_price!=0:
  2955. _key = "%s-%s-%s"%(sub_project_name,win_tenderer,str(win_bid_price))
  2956. dict_package[_key] = _pp
  2957. _counts +=1
  2958. if _counts==0:
  2959. if win_tenderer!="":
  2960. _key = "%s-%s"%(sub_project_name,win_tenderer)
  2961. dict_package[_key] = _pp
  2962. _counts += 1
  2963. if bidding_budget!=0:
  2964. _key = "%s-%s"%(sub_project_name,str(bidding_budget))
  2965. dict_package[_key] = _pp
  2966. _counts += 1
    def delete_projects_by_document(self,docid):
        '''
        Remove a document from its projects and rebuild them.

        Every project row that currently contains *docid* is marked for
        deletion, then fresh projects are regenerated from the remaining
        documents of those projects.

        :param docid: docid of the document being deleted
        :return: JSON string (to_project_json) holding the regenerated
                 projects plus delete markers for the old project rows
        '''
        set_docid = set()
        list_delete_projects = []
        # Projects currently referencing this docid; all will be rebuilt.
        list_projects = self.search_projects_with_document([docid],project_table="project2",project_table_index="project2_index_formerge")
        for _proj in list_projects:
            _p = {}
            _docids = _proj.get(project_docids,"")
            print(_proj.get(project_uuid))  # NOTE(review): debug print left in the production path
            # Mark the old project row for deletion by uuid.
            _p["delete_uuid"] = _proj.get(project_uuid)
            _p["to_delete"] = True
            list_delete_projects.append(_p)
            if _docids!="":
                # Collect every docid referenced by the affected projects.
                set_docid = set_docid | set(_docids.split(","))
        if str(docid) in set_docid:
            set_docid.remove(str(docid))
        list_docid = list(set_docid)
        list_projects = []
        if len(list_docid)>0:
            # Regenerate and de-duplicate projects from the surviving documents.
            list_docs = self.search_docs(list_docid)
            print("search_docs(list_docid)")
            list_projects = self.generate_projects_from_document(list_docs)
            print("generate_projects_from_document")
            list_projects = dumplicate_projects(list_projects,max_count=20)
            print("dumplicate_projects")
        list_projects.extend(list_delete_projects)
        project_json = to_project_json(list_projects)
        return project_json
  3001. def delete_doc_handle(self,_dict,result_queue):
  3002. try:
  3003. headers = _dict.get("frame")
  3004. conn = _dict.get("conn")
  3005. if headers is not None:
  3006. message_id = headers.headers["message-id"]
  3007. body = headers.body
  3008. item = json.loads(body)
  3009. docid = item.get("docid")
  3010. log("==========start delete docid:%s"%(str(docid)))
  3011. if docid is None:
  3012. ackMsg(conn,message_id)
  3013. delete_result = self.delete_projects_by_document(docid)
  3014. log("1")
  3015. _uuid = uuid4().hex
  3016. _d = {PROJECT_PROCESS_UUID:_uuid,
  3017. PROJECT_PROCESS_CRTIME:1,
  3018. PROJECT_PROCESS_PROJECTS:delete_result}
  3019. _pp = Project_process(_d)
  3020. log("2")
  3021. try:
  3022. if _pp.update_row(self.ots_client):
  3023. ackMsg(conn,message_id)
  3024. except Exception as e:
  3025. ackMsg(conn,message_id)
  3026. log("3")
  3027. #取消插入结果队列,改成插入project_process表
  3028. # if send_msg_toacmq(self.pool_mq_ali,delete_result,self.doc_delete_result):
  3029. # ackMsg(conn,message_id)
  3030. log("==========end delete docid:%s"%(str(docid)))
  3031. else:
  3032. log("has not headers")
  3033. except Exception as e:
  3034. traceback.print_exc()
  3035. ackMsg(conn,message_id)
  3036. log("==========end delete docid:%s"%(str(docid)))
    def generate_common_properties(self,list_docs):
        '''
        Build the common (project-level) properties from a group of documents.

        Scalar fields are chosen by majority vote across the documents; area
        fields are picked from the most specific level (district first) that
        has a known value.  Also assembles the project dynamics list, visible
        docids, first zhao/zhong-biao page times, project codes and products.

        :param list_docs: documents belonging to one project
        :return: dict of aggregated project properties
        '''
        # Vote counting: choose_dict[field][value] -> occurrence count.
        choose_dict = {}
        project_dict = {}
        for _key in [document_bidway,document_industry,document_info_type,document_info_source,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count]:
            for _doc in list_docs:
                _value = _doc.getProperties().get(_key,"")
                if _value!="":
                    if _key not in choose_dict:
                        choose_dict[_key] = {}
                    if _value not in choose_dict[_key]:
                        choose_dict[_key][_value] = 0
                    choose_dict[_key][_value] += 1
        _find = False
        # Area selection: walk district -> city -> province -> area and stop at
        # the first level where at least one document has a known location.
        for _key in [document_district,document_city,document_province,document_area]:
            area_dict = {}
            for _doc in list_docs:
                loc = _doc.getProperties().get(_key,"未知")
                if loc not in ('全国','未知',"0"):
                    if loc not in area_dict:
                        area_dict[loc] = 0
                    area_dict[loc] += 1
            list_loc = []
            for k,v in area_dict.items():
                list_loc.append([k,v])
            list_loc.sort(key=lambda x:x[1],reverse=True)
            if len(list_loc)>0:
                # NOTE(review): _doc here is whatever document the inner loop
                # ended on, not necessarily the one holding the top-voted
                # location in list_loc — confirm this is intended.
                project_dict[document_district] = _doc.getProperties().get(document_district)
                project_dict[document_city] = _doc.getProperties().get(document_city)
                project_dict[document_province] = _doc.getProperties().get(document_province)
                project_dict[document_area] = _doc.getProperties().get(document_area)
                _find = True
                break
        if not _find:
            # No known location anywhere: fall back to the first document.
            if len(list_docs)>0:
                project_dict[document_district] = list_docs[0].getProperties().get(document_district)
                project_dict[document_city] = list_docs[0].getProperties().get(document_city)
                project_dict[document_province] = list_docs[0].getProperties().get(document_province)
                project_dict[document_area] = list_docs[0].getProperties().get(document_area)
        # Majority vote per field; prefer an alternative over '全国'/'未知'
        # when one exists.
        for _key,_value in choose_dict.items():
            _l = []
            for k,v in _value.items():
                _l.append([k,v])
            _l.sort(key=lambda x:x[1],reverse=True)
            if len(_l)>0:
                _v = _l[0][0]
                if _v in ('全国','未知'):
                    if len(_l)>1:
                        _v = _l[1][0]
                project_dict[_key] = _v
        list_dynamics = []
        docid_number = 0
        visuable_docids = []
        zhao_biao_page_time = ""
        zhong_biao_page_time = ""
        list_codes = []
        list_product = []
        p_page_time = ""
        remove_docids = set()
        for _doc in list_docs:
            table_name = _doc.getProperties().get("table_name")
            status = _doc.getProperties().get(document_status,0)
            _save = _doc.getProperties().get(document_tmp_save,1)
            doctitle = _doc.getProperties().get(document_doctitle,"")
            docchannel = _doc.getProperties().get(document_docchannel)
            page_time = _doc.getProperties().get(document_page_time,"")
            _docid = _doc.getProperties().get(document_docid)
            _bidway = _doc.getProperties().get(document_bidway,"")
            _docchannel = _doc.getProperties().get(document_life_docchannel,0)
            project_codes = _doc.getProperties().get(document_project_codes)
            product = _doc.getProperties().get(document_product)
            sub_docs = _doc.getProperties().get("sub_docs",[])
            is_multipack = True if len(sub_docs)>1 else False
            extract_count = _doc.getProperties().get(document_tmp_extract_count,0)
            if product is not None:
                list_product.extend(product.split(","))
            if project_codes is not None:
                _c = project_codes.split(",")
                list_codes.extend(_c)
            # Project page time defaults to the first document's page time.
            if p_page_time=="":
                p_page_time = page_time
            # First tender/award page times, keyed off the life docchannel.
            if zhao_biao_page_time=="" and _docchannel in (51,52,102,103,114):
                zhao_biao_page_time = page_time
            if zhong_biao_page_time=="" and _docchannel in (101,118,119,120):
                zhong_biao_page_time = page_time
            is_visuable = 0
            # Visibility: "document" table rows count when status in [0,300];
            # other tables count when the save flag is 1.  Hidden docids are
            # subtracted from the project docid list below.
            if table_name=="document":
                if status>=0 and status<=300:
                    docid_number +=1
                    visuable_docids.append(str(_docid))
                    is_visuable = 1
                else:
                    remove_docids.add(str(_docid))
            else:
                if _save==1:
                    docid_number +=1
                    visuable_docids.append(str(_docid))
                    is_visuable = 1
                else:
                    remove_docids.add(str(_docid))
            # NOTE(review): the dynamic stores the life docchannel under the
            # document_docchannel key; the raw `docchannel` read above is
            # unused — confirm this is intended.
            list_dynamics.append({document_docid:_docid,
                                  document_doctitle:doctitle,
                                  document_docchannel:_docchannel,
                                  document_bidway:_bidway,
                                  document_page_time:page_time,
                                  document_status:201 if is_visuable==1 else 401,
                                  "is_multipack":is_multipack,
                                  document_tmp_extract_count:extract_count
                                  }
                                 )
        project_dict[project_project_dynamics] = json.dumps(list_dynamics,ensure_ascii=False)
        project_dict[project_docid_number] = docid_number
        project_dict[project_docids] = ",".join(list(set(visuable_docids)-remove_docids))
        if zhao_biao_page_time !="":
            project_dict[project_zhao_biao_page_time] = zhao_biao_page_time
        if zhong_biao_page_time !="":
            project_dict[project_zhong_biao_page_time] = zhong_biao_page_time
        project_dict[project_project_codes] = ",".join(list(set(list_codes)))
        project_dict[project_page_time] = p_page_time
        project_dict[project_product] = ",".join(list(set(list_product)))
        return project_dict
  3163. def generate_packages_properties(self,list_docs):
  3164. '''
  3165. 生成分包属性
  3166. :param list_docs:
  3167. :return:
  3168. '''
  3169. list_properties = []
  3170. set_key = set()
  3171. for _doc in list_docs:
  3172. _dict = {}
  3173. sub_docs = _doc.getProperties().get("sub_docs")
  3174. if sub_docs is not None:
  3175. for _d in sub_docs:
  3176. sub_project_code = _d.get(project_sub_project_code,"")
  3177. sub_project_name = _d.get(project_sub_project_name,"")
  3178. win_tenderer = _d.get(project_win_tenderer,"")
  3179. win_bid_price = _d.get(project_win_bid_price,"")
  3180. _key = "%s-%s-%s-%s"%(sub_project_code,sub_project_name,win_tenderer,win_bid_price)
  3181. if _key in set_key:
  3182. continue
  3183. set_key.add(_key)
  3184. list_properties.append(_d)
  3185. return list_properties
    def generate_projects_from_document(self,list_docs):
        '''
        Generate projects from documents.

        :param list_docs: list of document objects (note: objects, not docids,
                          despite the original note) — their getProperties()
                          dicts are handed to generate_projects()
        :return: list of project dicts produced by generate_projects()
        '''
        # The number of bid sections is determined inside generate_projects.
        list_projects = generate_projects([doc.getProperties() for doc in list_docs])
        return list_projects
  3195. def search_projects_with_document(self,list_docids,project_table,project_table_index):
  3196. '''
  3197. 通过docid集合查询对应的projects
  3198. :param list_docids:
  3199. :return:
  3200. '''
  3201. log("search_projects_with_document %s"%str(list_docids))
  3202. list_should_q = []
  3203. for _docid in list_docids:
  3204. list_should_q.append(TermQuery("docids",_docid))
  3205. bool_query = BoolQuery(should_queries=list_should_q)
  3206. _query = {"query":bool_query,"limit":20}
  3207. list_project_dict = getDocument(_query,self.ots_client,[
  3208. project_uuid,project_docids,project_zhao_biao_page_time,
  3209. project_zhong_biao_page_time,
  3210. project_page_time,
  3211. project_area,
  3212. project_province,
  3213. project_city,
  3214. project_district,
  3215. project_info_type,
  3216. project_industry,
  3217. project_qcodes,
  3218. project_project_name,
  3219. project_project_code,
  3220. project_project_codes,
  3221. project_project_addr,
  3222. project_tenderee,
  3223. project_tenderee_addr,
  3224. project_tenderee_phone,
  3225. project_tenderee_contact,
  3226. project_agency,
  3227. project_agency_phone,
  3228. project_agency_contact,
  3229. project_sub_project_name,
  3230. project_sub_project_code,
  3231. project_bidding_budget,
  3232. project_win_tenderer,
  3233. project_win_bid_price,
  3234. project_win_tenderer_manager,
  3235. project_win_tenderer_phone,
  3236. project_second_tenderer,
  3237. project_second_bid_price,
  3238. project_second_tenderer_manager,
  3239. project_second_tenderer_phone,
  3240. project_third_tenderer,
  3241. project_third_bid_price,
  3242. project_third_tenderer_manager,
  3243. project_third_tenderer_phone,
  3244. project_procurement_system,
  3245. project_bidway,
  3246. project_dup_data,
  3247. project_docid_number,
  3248. project_project_dynamics,
  3249. project_product,
  3250. project_moneysource,
  3251. project_service_time,
  3252. project_time_bidclose,
  3253. project_time_bidopen,
  3254. project_time_bidstart,
  3255. project_time_commencement,
  3256. project_time_completion,
  3257. project_time_earnest_money_start,
  3258. project_time_earnest_money_end,
  3259. project_time_get_file_end,
  3260. project_time_get_file_start,
  3261. project_time_publicity_end,
  3262. project_time_publicity_start,
  3263. project_time_registration_end,
  3264. project_time_registration_start,
  3265. project_time_release,
  3266. project_dup_docid,
  3267. project_info_source,
  3268. project_nlp_enterprise,
  3269. project_nlp_enterprise_attachment,
  3270. project_tenderee_code,
  3271. project_agency_code,
  3272. project_candidates,
  3273. project_docchannel
  3274. ],sort="page_time",table_name=project_table,table_index=project_table_index)
  3275. return list_project_dict
  3276. def set_project_uuid(self,_dict,_uuid):
  3277. if _uuid is not None and _uuid!="":
  3278. if "uuid" in _dict:
  3279. _dict["uuid"] = "%s,%s"%(_dict["uuid"],_uuid)
  3280. else:
  3281. _dict["uuid"] = _uuid
    def getMerge_rules(self,page_time,project_codes,project_name,tenderee,agency,product,sub_project_name,bidding_budget,win_tenderer,win_bid_price,province,city,district):
        '''
        Build the candidate-search rules used by merge_projects().

        Each rule is a pair [queries, weight]: *queries* is a list of OTS query
        objects that must all match, and *weight* later scales the per-rule
        search result limit in merge_projects().  Rule order matters — rules
        are consumed two at a time by the caller.

        :param page_time: project page time (unused here, kept for signature)
        :param project_codes: comma-separated project codes
        :param project_name: project name
        :param tenderee: tenderee name
        :param agency: agency name
        :param product: comma-separated products
        :param sub_project_name: bid-section name
        :param bidding_budget: bidding budget (numeric or numeric string)
        :param win_tenderer: winning tenderer name
        :param win_bid_price: winning bid price (numeric or numeric string)
        :param province/city/district: area fields for the optional area filter
        :return: list of [list_of_queries, weight] pairs
        '''
        whole_time_start = time.time()
        _time = time.time()
        list_query = []
        list_code = [a for a in project_codes.split(",") if a!='']
        # Match either the multi-code column or the single-code column;
        # both capped at 20 alternatives.
        should_q_code = BoolQuery(should_queries=[MatchQuery(project_project_codes,a) for a in list_code[:20]])
        should_q_cod = BoolQuery(should_queries=[MatchQuery(project_project_code,a) for a in list_code[:20]])
        list_product = [a for a in product.split(",") if a!='']
        should_q_product = BoolQuery(should_queries=[MatchQuery(project_product,a) for a in list_product[:20]])
        should_q_area = None
        if province!="" or city!="" or district!="":
            should_q = []
            # Skip placeholder area values ('全国' = nationwide, '未知' = unknown).
            if province not in ("","全国","未知") and province is not None:
                should_q.append(TermQuery(project_province,province))
            if city not in ("","全国","未知") and city is not None:
                should_q.append(TermQuery(project_city,city))
            if district not in ("","全国","未知") and district is not None:
                should_q.append(TermQuery(project_district,district))
            if len(should_q)>0:
                should_q_area = BoolQuery(should_queries=should_q)
        prepare_time = time.time()-_time
        _time = time.time()
        log_time = time.time()-_time
        _time = time.time()
        # --- rule table: pairs of attributes that identify the same project ---
        if tenderee!="" and len(list_code)>0:
            _query = [TermQuery(project_tenderee,tenderee),
                      should_q_code,
                      ]
            list_query.append([_query,2])
            _query = [TermQuery(project_tenderee,tenderee),
                      should_q_cod
                      ]
            list_query.append([_query,2])
        if tenderee!="" and len(list_product)>0:
            _query = [TermQuery(project_tenderee,tenderee),
                      should_q_product]
            list_query.append([_query,1])
        if tenderee!="" and project_name!="":
            _query = [TermQuery(project_tenderee,tenderee),
                      TermQuery(project_project_name,project_name)]
            list_query.append([_query,2])
        if tenderee!="" and agency!="":
            _query = [TermQuery(project_tenderee,tenderee),
                      TermQuery(project_agency,agency)]
            list_query.append([_query,0])
        if tenderee!="" and float(bidding_budget)>0:
            _query = [TermQuery(project_tenderee,tenderee),
                      TermQuery(project_bidding_budget,bidding_budget)]
            list_query.append([_query,2])
        if float(bidding_budget)>0 and float(win_bid_price)>0:
            _query = [TermQuery(project_bidding_budget,bidding_budget),
                      TermQuery(project_win_bid_price,win_bid_price)]
            list_query.append([_query,2])
        if tenderee!="" and win_tenderer!="":
            _query = [TermQuery(project_tenderee,tenderee),
                      TermQuery(project_win_tenderer,win_tenderer)]
            list_query.append([_query,2])
        if agency!="" and win_tenderer!="":
            _query = [TermQuery(project_agency,agency),
                      TermQuery(project_win_tenderer,win_tenderer)]
            list_query.append([_query,0])
        if agency!="" and len(list_product)>0:
            _query = [TermQuery(project_agency,agency),
                      should_q_product]
            list_query.append([_query,1])
        if win_tenderer!="" and len(list_code)>0:
            _query = [TermQuery(project_win_tenderer,win_tenderer),
                      should_q_code]
            list_query.append([_query,2])
            _query = [TermQuery(project_win_tenderer,win_tenderer),
                      should_q_cod]
            list_query.append([_query,2])
        if win_tenderer!="" and sub_project_name!="":
            _query = [TermQuery(project_win_tenderer,win_tenderer),
                      TermQuery(project_sub_project_name,sub_project_name)
                      ]
            list_query.append([_query,2])
        if win_tenderer!="" and float(win_bid_price)>0:
            _query = [TermQuery(project_win_tenderer,win_tenderer),
                      TermQuery(project_win_bid_price,win_bid_price)]
            list_query.append([_query,2])
        if win_tenderer!="" and float(bidding_budget)>0:
            _query = [TermQuery(project_win_tenderer,win_tenderer),
                      TermQuery(project_bidding_budget,bidding_budget)]
            list_query.append([_query,2])
        if len(list_code)>0 and len(list_product)>0:
            _query = [should_q_code,
                      should_q_product]
            list_query.append([_query,2])
        if len(list_code)>0:
            _query = [
                should_q_code]
            list_query.append([_query,2])
            _query = [
                should_q_cod]
            list_query.append([_query,1])
        if project_name!="" and project_name is not None:
            _query = [
                TermQuery(project_project_name,project_name)]
            list_query.append([_query,1])
            # Also try a phrase match of the project name against doc titles.
            _query_title = [MatchPhraseQuery(project_doctitles,project_name)]
            list_query.append([_query_title,1])
        if len(list_product)>0 and should_q_area is not None:
            _query = [should_q_area,
                      should_q_product]
            list_query.append([_query,0])
        generate_time = time.time()-_time
        whole_time = time.time()-whole_time_start
        # prepare_time / log_time / generate_time / whole_time were only used
        # by a (removed) timing log line; kept for byte-compatible behavior.
        return list_query
    def merge_projects(self,list_projects,b_log=False,check_columns=[project_uuid,project_zhao_biao_page_time,project_zhong_biao_page_time,project_page_time,project_project_name,project_project_code,project_project_codes,project_tenderee,project_agency,project_sub_project_name,project_sub_project_code,project_bidding_budget,project_win_tenderer,project_win_bid_price,project_project_dynamics,project_product,project_time_bidclose,project_time_bidopen,project_time_bidstart,project_time_commencement,project_time_completion,project_time_earnest_money_start,project_time_earnest_money_end,project_time_get_file_end,project_time_get_file_start,project_time_publicity_end,project_time_publicity_start,project_time_registration_end,project_time_registration_start,project_time_release,project_nlp_enterprise,project_nlp_enterprise_attachment,project_docids,project_area,project_province,project_city,project_district,project_info_type,project_industry,project_qcodes,project_project_addr,project_tenderee_addr,project_agency_phone,project_agency_contact,project_tenderee_phone,project_tenderee_contact,project_win_tenderer_manager,project_win_tenderer_phone,project_second_tenderer,project_second_bid_price,project_second_tenderer_manager,project_second_tenderer_phone,project_third_tenderer,project_third_bid_price,project_third_tenderer_manager,project_third_tenderer_phone,project_procurement_system,project_bidway,project_dup_data,project_docid_number,project_moneysource,project_service_time,project_dup_docid,project_info_source],project_table="project2",project_table_index="project2_index_formerge",project_uuids=[]):
        '''
        Merge the given projects with matching projects already stored in OTS.

        For each project (first 30 only), candidate matches are searched with
        the rules from getMerge_rules() inside a page_time window, checked
        with check_merge_rule(), and merged via update_projects_by_project().

        NOTE(review): check_columns and project_uuids are mutable default
        arguments — shared across calls; safe only as long as callers never
        mutate them.

        :param list_projects: project dicts to merge; mutated in place
        :param b_log: verbose logging flag
        :param check_columns: columns fetched for candidate projects
        :param project_table: OTS table to search
        :param project_table_index: OTS search index to use
        :param project_uuids: forwarded to check_merge_rule()
        :return: the (mutated) list_projects
        '''
        try:
            whole_time_start = time.time()
            # Exclude the projects' own uuids from candidate search results.
            set_uuid = set()
            for _proj in list_projects:
                _uuid = _proj.get("uuid")
                if _uuid is not None:
                    set_uuid = set_uuid | set(_uuid.split(","))
            projects_merge_count = 0
            projects_check_rule_time = 0
            projects_update_time = 0
            projects_query_time = 0
            projects_prepare_time = 0
            current_date = getCurrent_date("%Y-%m-%d")
            min_date = timeAdd(current_date,-35,format="%Y-%m-%d")
            search_table = "project2"
            search_table_index = "project2_index_formerge"
            project_cls = Project
            docids = ""
            for _proj in list_projects[:30]:
                must_not_q = []
                for _uuid in list(set_uuid):
                    must_not_q.append(TermQuery("uuid",_uuid))
                docids = _proj.get(project_docids,"")
                page_time = _proj.get(project_page_time,"")
                project_codes = _proj.get(project_project_codes,"")
                project_name = _proj.get(project_project_name,"")
                tenderee = _proj.get(project_tenderee,"")
                agency = _proj.get(project_agency,"")
                product = _proj.get(project_product,"")
                sub_project_name = _proj.get(project_sub_project_name,"")
                bidding_budget = _proj.get(project_bidding_budget,-1)
                win_tenderer = _proj.get(project_win_tenderer,"")
                win_bid_price = _proj.get(project_win_bid_price,-1)
                _dynamic = _proj.get(project_project_dynamics,"[]")
                # Acceptance (验收) projects get a much wider page_time window.
                is_yanshou = False
                list_dynamic = json.loads(_dynamic)
                for _d in list_dynamic:
                    _title = _d.get("doctitle","")
                    if re.search("验收公[示告]|验收结果",_title) is not None or _d.get("docchannel")==122:
                        is_yanshou = True
                        break
                province = _proj.get(project_province,"")
                city = _proj.get(project_city,"")
                district = _proj.get(project_district,"")
                if is_yanshou:
                    page_time_less = timeAdd(page_time,-850)
                    page_time_greater = timeAdd(page_time,820)
                else:
                    page_time_less = timeAdd(page_time,-450)
                    page_time_greater = timeAdd(page_time,420)
                # sub_project_name is not a mandatory condition (see below).
                sub_project_q = TermQuery(project_sub_project_name,sub_project_name) if sub_project_name.replace("Project","")!="" else None
                _time = time.time()
                list_must_query = self.getMerge_rules(page_time,project_codes,project_name,tenderee,agency,product,sub_project_name,bidding_budget,win_tenderer,win_bid_price,province,city,district)
                list_merge_data = []
                search_table = "project2"
                search_table_index = "project2_index_formerge"
                project_cls = Project
                search_table = project_table
                search_table_index = project_table_index
                # Rules are consumed in chunks of _step per search request.
                _step = 2
                _begin = 0
                must_queries = []
                if page_time_less is not None and page_time_greater is not None:
                    must_queries = [RangeQuery(project_page_time,page_time_less,page_time_greater,True,True),
                                    ]
                projects_prepare_time += time.time()-_time
                _time = time.time()
                sort_type = SortOrder.DESC
                while _begin<len(list_must_query):
                    # NOTE(review): these two ifs are sequential, not elif, so
                    # sort_type always ends up DESC — looks like an intended
                    # alternation that never happens; confirm before changing.
                    if sort_type==SortOrder.DESC:
                        sort_type=SortOrder.ASC
                    if sort_type==SortOrder.ASC:
                        sort_type=SortOrder.DESC
                    list_should_q = []
                    # Base limit 10, plus 5 per rule weight in this chunk.
                    _limit = 10
                    for must_q,_count in list_must_query[_begin:_begin+_step]:
                        must_q1 = list(must_q)
                        must_q1.extend(must_queries)
                        list_should_q.append(BoolQuery(must_queries=must_q1))
                        _limit += _count*5
                    _query = BoolQuery(
                        should_queries=list_should_q,
                        must_not_queries=must_not_q[:100]
                    )
                    rows,next_token,total_count,is_all_succeed = self.ots_client_merge.search(search_table,search_table_index,
                                                                                              SearchQuery(_query,sort=Sort(sorters=[FieldSort(project_page_time,sort_type)]),limit=_limit),
                                                                                              columns_to_get=ColumnsToGet(column_names=check_columns,return_type=ColumnReturnType.SPECIFIED))
                    list_data = getRow_ots(rows)
                    list_merge_data.extend(list_data)
                    # Never return the same candidate twice across chunks.
                    for _data in list_data:
                        must_not_q.append(TermQuery(project_uuid,_data.get(project_uuid)))
                    _begin += _step
                projects_query_time += time.time()-_time
                # Prefer candidates with a close bidding budget first.
                projects_merge_count = len(list_merge_data)
                list_merge_data.sort(key=lambda x:x.get(project_page_time,""))
                list_merge_data.sort(key=lambda x:x.get(project_bidding_budget,-1))
                if b_log:
                    log("list_merge_data count:%d"%(len(list_merge_data)))
                # First pass: collect candidates passing the merge rule with
                # their probability, then merge in descending-probability order
                # (re-checking, since earlier merges mutate _proj).
                list_check_data = []
                for _data in list_merge_data:
                    _time = time.time()
                    _check,_prob = check_merge_rule(_proj,_data,b_log=b_log,return_prob=True,project_uuids=project_uuids)
                    if b_log:
                        log("merge rule res: %s prob: %s"%(str(_check),str(_prob)))
                    projects_check_rule_time += time.time()-_time
                    if _check:
                        list_check_data.append([_data,_prob])
                list_check_data.sort(key=lambda x:x[1],reverse=True)
                for _data,_ in list_check_data:
                    _time = time.time()
                    _check,_prob = check_merge_rule(_proj,_data,b_log=b_log,return_prob=True,project_uuids=project_uuids)
                    projects_check_rule_time += time.time()-_time
                    _time = time.time()
                    if _check:
                        update_projects_by_project(_data,[_proj])
                        projects_update_time += time.time()-_time
            whole_time = time.time()-whole_time_start
            log("%s %s merge_project whole_time:%.3f projects_prepare_time:%.3f projects_query_time:%.3f projects_merge_count:%d rules%d projects_check_rule_time %.3f projects_update_time %.3f"%(search_table,docids,whole_time,projects_prepare_time,projects_query_time,projects_merge_count,len(list_must_query),projects_check_rule_time,projects_update_time))
            return list_projects
        except Exception as e:
            traceback.print_exc()
            # NOTE(review): deliberate hard failure via assert; stripped under
            # python -O — consider raising instead.
            assert 1==2
    def dumplicate_document_in_merge(self,list_projects,dup_docid,_docid,_docchannel,document_name="document",b_log=False):
        '''
        Deduplicate documents while merging projects.

        For each project containing _docid, scan the project's dynamics for other
        active documents of the same channel, score them against _docid with
        check_dumplicate_rule, and mark sufficiently similar ones (prob > 0.4) as
        duplicates: status 401 in the dynamics, removed from the docids field.

        :param list_projects: merged project dicts, updated in place
        :param dup_docid: docids already known to duplicate _docid
        :param _docid: the docid currently being processed
        :param _docchannel: channel of _docid; only same-channel docs compared
        :param document_name: OTS table used to fetch page_time/update_document
        :param b_log: verbose logging for check_dumplicate_rule
        :return: (best_docid, list_of_duplicate_docids); (None, None) when a
                 project would end up with no documents left
        '''
        dup_docid = set([str(a) for a in dup_docid])
        set_dup_total = set()
        docid_item = self.get_attrs_before_dump(_docid)
        best_docid = None
        for _proj in list_projects:
            try:
                docids = _proj.get(project_docids,"")
                set_docids = set([a for a in docids.split(",") if a!=""])
                _project_dynamics = _proj.get(project_project_dynamics,"[]")
                list_dynamics = json.loads(_project_dynamics)
                set_dup_docid = set()
                # seed with the current doc so it competes for best_docid
                list_dup_result = [(_docid,docid_item.get("extract_count"))]
                log("=========%s---%s"%(str(set_docids),str(_docid)))
                tmp_project_list = [[p['docid'],p['page_time'],p['docchannel']] for p in list_dynamics if p['docid']!=_docid]
                if str(_docid) in set_docids:
                    # collect candidates: active, inside this project, same
                    # channel, and not already a known duplicate
                    list_to_dup_docid = []
                    for _d in list_dynamics:
                        docid = _d.get(document_docid)
                        doctitle = _d.get(document_doctitle,"")
                        docchannel = _d.get(document_docchannel,0)
                        status = _d.get(document_status,0)
                        if status>=401:
                            continue
                        if str(docid) not in set_docids:
                            continue
                        if str(docid) in dup_docid:
                            continue
                        if docchannel!=_docchannel:
                            continue
                        if docid==_docid:
                            continue
                        list_to_dup_docid.append(_d)
                    for _d in list_to_dup_docid:
                        docid = _d.get(document_docid)
                        page_time = _d.get(document_page_time)
                        _item = self.get_attrs_before_dump(docid)
                        _prob = check_dumplicate_rule(docid_item,_item,5,b_log=b_log)
                        log("dumplicate_document_in_merge %s-%s prob %.2f"%(str(_docid),str(docid),_prob))
                        if _prob>0.4:
                            docid = int(docid)
                            _d = {"partitionkey":docid%500+1,
                                  "docid":docid,
                                  }
                            _doc = Document(_d)
                            _doc.table_name = document_name
                            if _doc.fix_columns(self.ots_client,[document_page_time,document_update_document],True):
                                is_dup = True
                                # do not dedupe when a cancellation notice
                                # (channel 118) was published between the two
                                # page_times of a channel-52 pair
                                if _docchannel==52 and page_time!=docid_item['page_time'] and 118 in [p[2] for p in tmp_project_list if p[1]>=min(page_time,docid_item['page_time']) and p[1]<=max(page_time,docid_item['page_time'])]:
                                    is_dup = False
                                # NOTE(review): original comment said "also skip
                                # dedupe when update_document is true", but this
                                # keeps is_dup only when it IS "true" -- confirm
                                if _doc.getProperties().get(document_update_document,"")!="true":
                                    is_dup = False
                                if is_dup:
                                    list_dup_result.append((docid,_item.get("extract_count")))
                    # stable double sort: highest extract_count first,
                    # ties broken by smallest docid
                    list_dup_result.sort(key=lambda x:x[0])
                    list_dup_result.sort(key=lambda x:x[1],reverse=True)
                    if len(list_dup_result)>0:
                        best_docid1 = list_dup_result[0][0]
                        if best_docid1 not in set_dup_total:
                            best_docid = best_docid1
                        for _d in list_dup_result[1:]:
                            set_dup_docid.add(str(_d[0]))
                    for _dynamic in list_dynamics:
                        if _dynamic.get(document_docid) in set_dup_docid:
                            _dynamic[document_status] = 401
                    set_docids = set_docids-set_dup_docid-dup_docid
                    set_dup_total |= set_dup_docid
                    if len(set_docids)==0:
                        print(set_dup_docid,dup_docid)
                        log("projects set_docids length is zero %s"%(docids))
                        return None,None
                    else:
                        _proj[project_docids] = ",".join(list(set_docids))
                        _proj[project_project_dynamics] = json.dumps(list_dynamics,ensure_ascii=False)
                        _proj[project_docid_number] = len(set_docids)
                        _proj[project_dup_docid] = ",".join(list(set_dup_docid))
            except Exception as e:
                traceback.print_exc()
        if best_docid in set_dup_total:
            best_docid = None
        return best_docid,list(set_dup_total)
  3646. def merge_document_real(self,item,dup_docid,best_docid,save,document_name="document",project_table="project2",project_table_index="project2_index_formerge",b_log=False,project_uuids=[]):
  3647. '''
  3648. 实时项目合并
  3649. :param item:
  3650. :param dup_docid:重复的公告集合
  3651. :param status_to:
  3652. :return:
  3653. '''
  3654. try:
  3655. list_docids = []
  3656. _docid = item.get(document_tmp_docid)
  3657. list_docids.append(_docid)
  3658. print("dup_docid",dup_docid)
  3659. if save==0:
  3660. dup_docid.insert(0,_docid)
  3661. if isinstance(dup_docid,list):
  3662. list_docids.extend(dup_docid)
  3663. if best_docid and int(best_docid) > 0:
  3664. if best_docid not in list_docids:
  3665. list_docids.append(best_docid)
  3666. list_docids = [a for a in list_docids if a is not None]
  3667. _time = time.time()
  3668. list_projects = self.search_projects_with_document(list_docids,project_table,project_table_index)
  3669. log("search %d projects takes:%.3f"%(len(list_projects),time.time()-_time))
  3670. # list_projects = []
  3671. if len(list_projects)==0:
  3672. # _time = time.time()
  3673. list_docs = self.search_docs(list_docids,document_name=document_name)
  3674. # list_docs = self.search_docs(list_docids+[373990715,372459879],document_name=document_name) # 手动查找合并
  3675. # log("search document takes:%.3f"%(time.time()-_time))
  3676. # _time = time.time()
  3677. list_projects = self.generate_projects_from_document(list_docs)
  3678. # log("generate projects takes:%.3f"%(time.time()-_time))
  3679. else:
  3680. _time = time.time()
  3681. self.update_projects_by_document(_docid,save,list_projects,document_name=document_name)
  3682. # log("update projects takes:%.3f"%(time.time()-_time))
  3683. _time = time.time()
  3684. list_projects = dumplicate_projects(list_projects,max_count=20)
  3685. # list_projects[1]['uuid'] = '55523886-3896-4985-9fdc-d17560a9123b'
  3686. # list_projects = [list_projects[1]]
  3687. # log("dumplicate projects takes:%.3f"%(time.time()-_time))
  3688. _time = time.time()
  3689. list_projects = self.merge_projects(list_projects,b_log,project_table=project_table,project_table_index=project_table_index,project_uuids=project_uuids)
  3690. # log("merge projects takes:%.3f"%(time.time()-_time))
  3691. _time = time.time()
  3692. best_docid,list_merge_dump = self.dumplicate_document_in_merge(list_projects,dup_docid,_docid,item.get(document_docchannel),document_name=document_name,b_log=b_log)
  3693. # log("dumplicate document %d takes:%.3f"%(len(list_projects),time.time()-_time))
  3694. if list_merge_dump is None:
  3695. list_projects = []
  3696. _time = time.time()
  3697. project_json = to_project_json(list_projects)
  3698. # log("json projects takes:%.3f"%(time.time()-_time))
  3699. if b_log:
  3700. log("project_json:%s"%project_json)
  3701. return project_json,best_docid,list_merge_dump
  3702. except Exception as e:
  3703. raise RuntimeError("error on dumplicate")
  3704. def is_exist_fingerprint(self,final_list,_docid,_fingerprint,is_tmp=False):
  3705. set_fingerprint = set()
  3706. for _i in range(1,len(final_list)):
  3707. _dict = final_list[_i]
  3708. b_docid = _dict[document_tmp_docid]
  3709. _save = _dict.get(document_tmp_save,0)
  3710. _status = _dict.get(document_tmp_status,0)
  3711. if not is_tmp:
  3712. if _status>=201 and _status<=300:
  3713. _save = 1
  3714. fingerprint_less = _dict.get(document_tmp_fingerprint,"")
  3715. if b_docid==_docid:
  3716. pass
  3717. else:
  3718. if _save==1:
  3719. set_fingerprint.add(fingerprint_less)
  3720. if _fingerprint in set_fingerprint:
  3721. return True
  3722. return False
  3723. def exists_normal_fingerprint(self,_fingerprint,docid,table_name="document",table_index="document_index"):
  3724. query = BoolQuery(must_queries=[
  3725. RangeQuery("status",201,301),
  3726. TermQuery("fingerprint",_fingerprint),
  3727. RangeQuery("docid",0,docid-400000),
  3728. ]
  3729. )
  3730. rows,next_token,total_count,is_all_succeed = self.ots_client.search(table_name,table_index,
  3731. SearchQuery(query,get_total_count=True,limit=1))
  3732. if total_count>0:
  3733. return True
  3734. return False
    def check_page_time(self,item,table_name="document",table_index="document_index"):
        # Validate the document's page_time against its extracted time fields.
        # Returns False when the doc looks like a late re-publication (some
        # extracted time lies more than 90 days before page_time, no time lies
        # at/after it, and another doc with the same title already exists), or
        # when a trusted source published after its own bid-close time.
        page_time = item.get(document_page_time,"")
        has_before = False
        has_after = False
        bidclose_time = page_time
        web_source_name = item.get(document_tmp_web_source_name,"")
        docchannel = item.get(document_tmp_docchannel,"0")
        try:
            docchannel = int(docchannel)
        except:
            docchannel = 0
        if docchannel<200:
            if len(page_time)>0:
                # lower bound: 90 days before publication
                l_page_time = timeAdd(page_time,days=-90)
                dict_time = item.get("dict_time",{})
                for k,v in dict_time.items():
                    if v is not None and len(v)>0:
                        if l_page_time>v:
                            has_before = True
                        if v>=page_time:
                            has_after = True
                        if k==document_tmp_time_bidclose:
                            bidclose_time = v
                # these sources are expected to publish before bid close
                set_web_source = {"中国招标投标公共服务平台","比地招标"}
                if web_source_name in set_web_source and bidclose_time<page_time:
                    return False
            log("%s check page_time has_before %s has_after %s"%(str(item.get(document_docid)),str(has_before),str(has_after)))
            if has_before:
                _query = BoolQuery(must_queries=[MatchPhraseQuery(document_doctitle,item.get(document_doctitle,""))],
                                   must_not_queries=[TermQuery(document_docid,item.get(document_docid,0))])
                if not has_after:
                    rows,next_token,total_count,is_all_succeed = self.ots_client.search(table_name,table_index,
                                                                                        SearchQuery(_query,get_total_count=True,limit=1))
                    if total_count>0:
                        # NOTE(review): k and v are leftovers from the loop above
                        # (last item iterated), not necessarily the offending field
                        log("%s check page_time false %s==%s-%s"%(str(item.get(document_docid)),l_page_time,k,v))
                        return False
                if item.get(document_web_source_name,"")=="中国政府采购网":
                    rows,next_token,total_count,is_all_succeed = self.ots_client.search(table_name,table_index,
                                                                                        SearchQuery(_query,get_total_count=True,limit=1))
                    if total_count>0:
                        log("%s check 中国政府采购网 false "%(str(item.get(document_docid))))
                        return False
        return True
  3778. def dumplicate_comsumer_handle_interface(self,docid,document_table,document_table_index,project_table,project_table_index,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle,document_tmp_attachment_path,document_tmp_source_stage,document_tmp_source_type,document_update_document],b_log=False,upgrade=False):
  3779. result_dict = {"success":True}
  3780. try:
  3781. bool_query = BoolQuery(must_queries=[
  3782. TermQuery("docid",docid)
  3783. ])
  3784. rows,next_token,total_count,is_all_succeed = self.ots_client.search(document_table,document_table_index,
  3785. SearchQuery(bool_query,limit=1,get_total_count=True),
  3786. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  3787. list_dict = getRow_ots(rows)
  3788. if len(list_dict)==0:
  3789. raise RuntimeError("未查找到docid为%s的数据"%(str(docid)))
  3790. item = list_dict[0]
  3791. self.post_extract(item)
  3792. log("dumplicate start on:%s"%(str(item.get(document_tmp_docid))))
  3793. base_list = []
  3794. set_docid = set()
  3795. list_rules,table_name,table_index = self.translate_dumplicate_rules(flow_dumplicate_status_from,item,get_all=False,to_log=False,table_name=document_table,table_index=document_table_index)
  3796. # print("len_rules",len(list_rules),table_name,table_index)
  3797. list_rules.sort(key=lambda x:x["confidence"],reverse=True)
  3798. log("dumplicate %s rules:%d"%(str(item.get(document_tmp_docid)),len(list_rules)))
  3799. list_rules = list_rules[:30]
  3800. _i = 0
  3801. step = 2
  3802. item["confidence"] = 999
  3803. if item.get(document_tmp_docid) not in set_docid:
  3804. base_list.append(item)
  3805. set_docid.add(item.get(document_tmp_docid))
  3806. while _i<len(list_rules):
  3807. must_not_q = []
  3808. if len(base_list)>0:
  3809. must_not_q = [TermQuery("docid",a) for a in list(set_docid)[-100:]]
  3810. _query = BoolQuery(should_queries=[_rule["query"] for _rule in list_rules[_i:_i+step]],
  3811. must_not_queries=must_not_q)
  3812. _rule = list_rules[_i]
  3813. confidence = _rule["confidence"]
  3814. singleNum_keys = _rule["singleNum_keys"]
  3815. contain_keys = _rule["contain_keys"]
  3816. multiNum_keys = _rule["multiNum_keys"]
  3817. self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle,document_tmp_attachment_path,document_tmp_source_stage,document_tmp_source_type,document_update_document],b_log=b_log)
  3818. _i += step
  3819. _time = time.time()
  3820. # log("%d start final check with length:%d"%(item["docid"],len(base_list)))
  3821. final_list = self.dumplicate_fianl_check(base_list,b_log)
  3822. exist_finterprint = self.is_exist_fingerprint(final_list,item.get(document_tmp_docid),item.get(document_tmp_fingerprint),is_tmp=table_name=="document_tmp")
  3823. exist_normal_fingerprint = self.exists_normal_fingerprint(item.get(document_tmp_fingerprint),item.get(document_tmp_docid),table_name=table_name,table_index=table_index)
  3824. # print("exist_normal_fingerprint",exist_normal_fingerprint)
  3825. # log("%d final_check takes:%.2f"%(item["docid"],time.time()-_time))
  3826. best_docid = self.get_best_docid(final_list)
  3827. final_list_docid = [a["docid"] for a in final_list]
  3828. # log("%d:final_list_docid:%s"%(item["docid"],str(final_list_docid)))
  3829. _d = {"partitionkey":item["partitionkey"],
  3830. "docid":item["docid"],
  3831. "status":random.randint(*flow_dumplicate_status_to),
  3832. document_tmp_opertime:getCurrent_date(format="%Y-%m-%d %H:%M:%S")
  3833. }
  3834. dtmp = Document_tmp(_d)
  3835. dup_docid = set()
  3836. for _dict in final_list:
  3837. if _dict.get("update_document","")!="true":
  3838. dup_docid.add(_dict.get(document_tmp_docid))
  3839. if item.get(document_tmp_docid) in dup_docid:
  3840. dup_docid.remove(item.get(document_tmp_docid))
  3841. remove_list = []
  3842. _unnormal = False
  3843. dmp_docid = ""
  3844. _check_time = self.check_page_time(item,table_name=table_name,table_index=table_index)
  3845. if (_check_time and not exist_normal_fingerprint and (len(final_list)==0 or best_docid==item.get(document_tmp_docid))) or item.get("update_document","")=="true":
  3846. dtmp.setValue(document_tmp_save,1,True)
  3847. # dtmp.setValue(document_tmp_merge_uuid,self.merge_document(item,flow_dumplicate_status_to),True)
  3848. dmp_docid = ",".join([str(a) for a in list(dup_docid)])
  3849. for _dict in final_list:
  3850. if _dict.get(document_tmp_docid) in dup_docid:
  3851. remove_list.append(_dict)
  3852. else:
  3853. if exist_normal_fingerprint:
  3854. log("%s has exist_normal_fingerprint"%(str(item.get(document_docid))))
  3855. best_docid = -1
  3856. dmp_docid = ""
  3857. _unnormal = True
  3858. if not _check_time:
  3859. best_docid = -2
  3860. dmp_docid = ""
  3861. _unnormal = True
  3862. dtmp.setValue(document_tmp_save,0,True)
  3863. if best_docid in dup_docid:
  3864. dup_docid.remove(best_docid)
  3865. for _dict in final_list:
  3866. if _dict.get(document_tmp_docid) in dup_docid:
  3867. remove_list.append(_dict)
  3868. dmp_docid = ",".join([str(a) for a in list(dup_docid)])
  3869. else:
  3870. dmp_docid = ",".join([str(a) for a in list(dup_docid)])
  3871. for _dict in final_list:
  3872. if _dict.get(document_tmp_docid) in dup_docid:
  3873. remove_list.append(_dict)
  3874. list_docids = list(dup_docid)
  3875. # if item.get(document_update_document)=="true":
  3876. # dtmp.setValue(document_tmp_save,1,True)
  3877. list_merge_dump = []
  3878. if (exist_finterprint and dtmp.getProperties().get(document_tmp_save)==0) or item.get(document_docchannel,0) in (301,302):
  3879. if exist_finterprint:
  3880. log("exist_finterprint %s"%(str(item.get(document_tmp_docid))))
  3881. dtmp.setValue(document_tmp_projects,"[]",True)
  3882. else:
  3883. project_json,merge_best_docid,list_merge_dump = self.merge_document_real(item,list_docids,dtmp.getProperties().get(document_tmp_save),document_name=document_table,project_table=project_table,project_table_index=project_table_index,b_log=b_log)
  3884. if merge_best_docid is not None and (best_docid is None or best_docid==item.get(document_tmp_docid) or best_docid<0):
  3885. best_docid = merge_best_docid
  3886. if list_merge_dump is not None and len(list_merge_dump)>0 and str(item.get(document_tmp_docid)) in list_merge_dump and item.get("update_document","")!="true":
  3887. dtmp.setValue(document_tmp_save,0,True)
  3888. if list_merge_dump is not None:
  3889. dmp_docid = "%s,%s"%(dmp_docid,",".join([str(a) for a in list_merge_dump]))
  3890. dtmp.setValue(document_tmp_projects,project_json,True)
  3891. result_dict["projects"] = project_json
  3892. log("upgrate %s save:%s:docid:%d,final_list:%d,rules:%d,best_docid:%s,dmp_docid:%s"%(str(upgrade),dtmp.getProperties().get(document_tmp_save),item.get(document_tmp_docid),len(final_list),len(list_rules),str(best_docid),dmp_docid))
  3893. dmp_docid = set([a for a in dmp_docid.split(",") if a!=""])
  3894. if str(best_docid) in dmp_docid:
  3895. dmp_docid.remove(str(best_docid))
  3896. dmp_docid = ",".join([str(a) for a in list(dmp_docid)])
  3897. result_dict["best_docid"] = str(best_docid) if best_docid is not None else ""
  3898. result_dict["save"] = dtmp.getProperties().get("save")
  3899. result_dict["dmp_docid"] = dmp_docid
  3900. except Exception as e:
  3901. result_dict["success"] = False
  3902. result_dict["errmsg"] = str(e)
  3903. return result_dict
    def dumplicate_comsumer_handle(self,item,result_queue,ots_client,get_all=False,upgrade=True,project_uuids=[]):
        # Queue-driven dedup + merge pipeline for a single document dict.
        # When upgrade is True the result is written back to document_tmp (and,
        # for manual test merges, the full row/html is copied from document);
        # when upgrade is False the list of duplicate docids is returned.
        try:
            start_time = time.time()
            b_log = False if upgrade else True
            self.post_extract(item)
            log("dumplicate start on:%s"%(str(item.get(document_tmp_docid))))
            base_list = []
            set_docid = set()
            list_rules,table_name,table_index = self.translate_dumplicate_rules(flow_dumplicate_status_from,item,get_all=get_all,to_log=b_log)
            # strongest rules first, capped at 30
            list_rules.sort(key=lambda x:x["confidence"],reverse=True)
            log("dumplicate %s rules:%d"%(str(item.get(document_tmp_docid)),len(list_rules)))
            list_rules = list_rules[:30]
            _i = 0
            step = 2
            # the current doc always wins ties against fetched candidates
            item["confidence"] = 999
            if item.get(document_tmp_docid) not in set_docid:
                base_list.append(item)
                set_docid.add(item.get(document_tmp_docid))
            while _i<len(list_rules):
                must_not_q = []
                if len(base_list)>0:
                    # avoid re-fetching already collected docs (max 100 terms)
                    must_not_q = [TermQuery("docid",a) for a in list(set_docid)[-100:]]
                _query = BoolQuery(should_queries=[_rule["query"] for _rule in list_rules[_i:_i+step]],
                                   must_not_queries=must_not_q)
                _rule = list_rules[_i]
                confidence = _rule["confidence"]
                singleNum_keys = _rule["singleNum_keys"]
                contain_keys = _rule["contain_keys"]
                multiNum_keys = _rule["multiNum_keys"]
                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle,document_tmp_attachment_path,document_tmp_source_stage,document_tmp_source_type,document_update_document,document_tmp_web_source_name,'detail_link','products','crtime'],b_log=b_log)
                _i += step
            _time = time.time()
            final_list = self.dumplicate_fianl_check(base_list,b_log)
            exist_finterprint = self.is_exist_fingerprint(final_list,item.get(document_tmp_docid),item.get(document_tmp_fingerprint),is_tmp=table_name=="document_tmp")
            exist_normal_fingerprint = self.exists_normal_fingerprint(item.get(document_tmp_fingerprint),item.get(document_tmp_docid))
            best_docid = self.get_best_docid(final_list)
            final_list_docid = [a["docid"] for a in final_list]
            _d = {"partitionkey":item["partitionkey"],
                  "docid":item["docid"],
                  "status":random.randint(*flow_dumplicate_status_to),
                  document_tmp_opertime:getCurrent_date(format="%Y-%m-%d %H:%M:%S")
                  }
            dtmp = Document_tmp(_d)
            dup_docid = set()
            for _dict in final_list:
                # docs flagged update_document=="true" are never dup targets
                if _dict.get("update_document","")!="true":
                    dup_docid.add(_dict.get(document_tmp_docid))
            if item.get(document_tmp_docid) in dup_docid:
                dup_docid.remove(item.get(document_tmp_docid))
            remove_list = []
            _unnormal = False
            dmp_docid = ""
            _check_time = self.check_page_time(item)
            if (_check_time and not exist_normal_fingerprint and (len(final_list)==0 or best_docid==item.get(document_tmp_docid))) or item.get("update_document","")=="true":
                # this doc is the best copy: keep it, drop the others
                dtmp.setValue(document_tmp_save,1,True)
                dmp_docid = ",".join([str(a) for a in list(dup_docid)])
                for _dict in final_list:
                    if _dict.get(document_tmp_docid) in dup_docid:
                        remove_list.append(_dict)
            else:
                if exist_normal_fingerprint:
                    # identical fingerprint already published long ago
                    log("%s has exist_normal_fingerprint"%(str(item.get(document_docid))))
                    best_docid = -1
                    dmp_docid = ""
                    _unnormal = True
                if not _check_time:
                    # page_time looks like a late re-publication
                    best_docid = -2
                    dmp_docid = ""
                    _unnormal = True
                dtmp.setValue(document_tmp_save,0,True)
                if best_docid in dup_docid:
                    dup_docid.remove(best_docid)
                    for _dict in final_list:
                        if _dict.get(document_tmp_docid) in dup_docid:
                            remove_list.append(_dict)
                    dmp_docid = ",".join([str(a) for a in list(dup_docid)])
                else:
                    dmp_docid = ",".join([str(a) for a in list(dup_docid)])
                    for _dict in final_list:
                        if _dict.get(document_tmp_docid) in dup_docid:
                            remove_list.append(_dict)
            list_docids = list(dup_docid)
            list_merge_dump = []
            if (exist_finterprint and dtmp.getProperties().get(document_tmp_save)==0) or item.get(document_docchannel,0) in (301,302):
                if exist_finterprint:
                    log("exist_finterprint %s"%(str(item.get(document_tmp_docid))))
                dtmp.setValue(document_tmp_projects,"[]",True)
            else:
                project_json,merge_best_docid,list_merge_dump = self.merge_document_real(item,list_docids,best_docid,dtmp.getProperties().get(document_tmp_save),b_log=b_log,project_uuids=project_uuids)
                if merge_best_docid is not None and (best_docid is None or best_docid==item.get(document_tmp_docid) or best_docid<0):
                    best_docid = merge_best_docid
                if list_merge_dump is not None and len(list_merge_dump)>0 and str(item.get(document_tmp_docid)) in list_merge_dump and item.get("update_document","")!="true":
                    # the merge step decided this doc itself is a duplicate
                    dtmp.setValue(document_tmp_save,0,True)
                if list_merge_dump is not None:
                    dmp_docid = "%s,%s"%(dmp_docid,",".join([str(a) for a in list_merge_dump]))
                dtmp.setValue(document_tmp_projects,project_json,True)
            log("upgrate %s save:%s:docid:%d,final_list:%d,rules:%d,best_docid:%s,dmp_docid:%s"%(str(upgrade),dtmp.getProperties().get(document_tmp_save),item.get(document_tmp_docid),len(final_list),len(list_rules),str(best_docid),dmp_docid))
            dmp_docid = set([a for a in dmp_docid.split(",") if a!=""])
            if str(best_docid) in dmp_docid:
                dmp_docid.remove(str(best_docid))
            dmp_docid = ",".join([str(a) for a in list(dmp_docid)])
            if _unnormal:
                dmp_docid = ""
            if upgrade:
                dmp_docid = dmp_docid.replace(",,",",")
                dtmp.setValue(document_tmp_dup_docid,dmp_docid,True)
                dtmp.setValue(document_tmp_best_docid,best_docid,True)
                if item.get("test_merge"):  # copy data when merging manually
                    # copy the document row into the tmp table
                    rows, next_token, total_count, is_all_succeed = self.ots_client.search("document", "document_index",
                                                                                           SearchQuery(TermQuery('docid',item.get("docid")),
                                                                                                       sort=Sort(sorters=[FieldSort("docid")]),
                                                                                                       limit=1,get_total_count=True),
                                                                                           ColumnsToGet(return_type=ColumnReturnType.ALL))
                    search_item = getRow_ots(rows)[0]
                    search_item.pop(document_tmp_doctextcon)  # drop long fields
                    search_item.pop(document_tmp_attachmenttextcon)
                    for k,v in search_item.items():
                        if v:
                            if dtmp.getProperties().get(k)==None:
                                dtmp.setValue(k, v, True)
                    dtmp.setValue(document_tmp_doctitle_refine, item.get(document_tmp_doctitle_refine), True)
                    # copy the html into doc_html_tmp
                    dhtml = Document_html({"partitionkey": item.get("partitionkey"),
                                           "docid": item.get("docid")})
                    from BaseDataMaintenance.dataSource.source import getConnect_ots_capacity
                    tmp_ots_client = getConnect_ots_capacity()
                    rows,next_token,total_count,is_all_succeed = tmp_ots_client.search("document","document_index",
                                                                                       SearchQuery(TermQuery('docid',item.get("docid")),sort=Sort(sorters=[FieldSort("docid")]),limit=1,get_total_count=True),
                                                                                       ColumnsToGet([document_dochtmlcon],return_type=ColumnReturnType.SPECIFIED))
                    _html = getRow_ots(rows)[0][document_dochtmlcon]
                    dhtml.setValue(document_dochtmlcon,_html,True)
                    dhtml.update_row(self.ots_client)
                _flag = dtmp.update_row(self.ots_client)
                if not _flag:
                    # row likely too large: repeatedly halve the projects json
                    # and retry the write (up to 10 times)
                    for i in range(10):
                        list_proj_json = dtmp.getProperties().get(document_tmp_projects)
                        if list_proj_json is not None:
                            list_proj = json.loads(list_proj_json)
                            dtmp.setValue(document_tmp_projects,json.dumps(list_proj[:len(list_proj)//2]),True)
                        if dtmp.update_row(self.ots_client):
                            break
                self.changeSaveStatus(remove_list)
                self.changeSaveStatus(list_merge_dump)
            else:
                return list_docids
        except Exception as e:
            traceback.print_exc()
            log("dumplicate error on:%s"%(str(item.get(document_tmp_docid))))
        finally:
            log("dumplicate end on:%s"%(str(item.get(document_tmp_docid))))
            self.queue_dumplicate_processed.put(item.get(document_tmp_docid))
    def fix_doc_which_not_in_project(self):
        '''
        Find saved documents that never landed in project2 and reset their
        document_tmp status so they run through dedup and merge again.
        :return:
        '''
        def fix_doc_handle(item,result_queue):
            # re-queue a single doc if no project2 row references its docid
            _docid = item.get(document_tmp_docid)
            b_q = BoolQuery(must_queries=[TermQuery(project_docids,str(_docid))])
            rows,next_token,total_count,is_all_succeed = self.ots_client.search("project2","project2_index",
                                                                               SearchQuery(b_q,get_total_count=True),
                                                                               ColumnsToGet(return_type=ColumnReturnType.NONE))
            if total_count==0:
                log("fix_doc:%s not in project2"%(str(_docid)))
                d_tmp = Document_tmp(item)
                d_tmp.setValue(document_tmp_status,flow_dumplicate_status_from[0],True)
                d_tmp.update_row(self.ots_client)
        current_date = getCurrent_date(format="%Y-%m-%d %H:%M:%S")
        # only inspect docs processed between 20 and 5 minutes ago
        before_date = timeAdd(current_date,0,format="%Y-%m-%d %H:%M:%S",minutes=-20)
        after_date = timeAdd(current_date,0,format="%Y-%m-%d %H:%M:%S",minutes=-5)
        if self.fix_doc_docid is None:
            bool_query = BoolQuery(must_queries=[
                TermQuery(document_tmp_save,1),
                RangeQuery(document_tmp_status,flow_dumplicate_status_to[0]),
                RangeQuery(document_tmp_docchannel,0,300),
                RangeQuery(document_tmp_opertime,before_date,after_date)
            ])
        else:
            # resume from the docid reached by the previous invocation
            bool_query = BoolQuery(must_queries=[
                TermQuery(document_tmp_save,1),
                RangeQuery(document_tmp_status,flow_dumplicate_status_to[0]),
                RangeQuery(document_tmp_docchannel,0,300),
                RangeQuery(document_tmp_docid,self.fix_doc_docid),
                RangeQuery(document_tmp_opertime,before_date,after_date)
            ])
        list_data = []
        rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                            SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),get_total_count=True,limit=100),
                                                                            ColumnsToGet(return_type=ColumnReturnType.NONE))
        list_d = getRow_ots(rows)
        list_data.extend(list_d)
        # page through the remaining results
        while next_token:
            rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                                SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
                                                                                ColumnsToGet(return_type=ColumnReturnType.NONE))
            list_d = getRow_ots(rows)
            list_data.extend(list_d)
        print("%d/%d"%(len(list_data),total_count))
        if len(list_data)>0:
            # remember progress so the next run continues after this batch
            self.fix_doc_docid = list_data[-1].get(document_tmp_docid)
            log("current fix_doc_docid:%s"%(str(self.fix_doc_docid)))
        task_queue = Queue()
        for _data in list_data:
            task_queue.put(_data)
        mt = MultiThreadHandler(task_queue,fix_doc_handle,None,30)
        mt.run()
  4128. def send_daily_check_data(self):
  4129. import datetime
  4130. def get_download_url(bucket, ObjectName, timeout):
  4131. url = ""
  4132. exist = bucket.object_exists(ObjectName)
  4133. if exist:
  4134. get_url = False
  4135. for i in range(3):
  4136. try:
  4137. url = bucket.sign_url('GET', ObjectName, timeout)
  4138. url = url.replace("-internal", "") # 替换地址里的内网标识
  4139. get_url = True
  4140. except:
  4141. pass
  4142. if get_url:
  4143. break
  4144. return url
  4145. file_timeout = 60 * 60 * 24 * 5 # 文件下载链接保存 5 天
  4146. # 获取昨天的日期
  4147. date = str(datetime.date.today() - datetime.timedelta(days=1))
  4148. oss_path = 'tmp_document_quality_data/'
  4149. object_path = oss_path + date + '/'
  4150. msg = "每日数据质量检查结果(报警):"
  4151. csv_name = "数据质量监控检查结果.xlsx"
  4152. ObjectName = object_path + csv_name
  4153. url = get_download_url(self.bucket,ObjectName,file_timeout)
  4154. if url:
  4155. msg += "\n文件名:\"%s\",链接:%s" % (csv_name, url)
  4156. csv_name = "公告重复量大的编号.xlsx"
  4157. ObjectName = object_path + csv_name
  4158. url = get_download_url(self.bucket, ObjectName, file_timeout)
  4159. if url:
  4160. msg += "\n文件名:\"%s\",链接:%s" % (csv_name, url)
  4161. csv_name = "公告附件重复量大的编号.xlsx"
  4162. ObjectName = object_path + csv_name
  4163. url = get_download_url(self.bucket, ObjectName, file_timeout)
  4164. if url:
  4165. msg += "\n文件名:\"%s\",链接:%s" % (csv_name, url)
  4166. csv_name = "附件识别异常的站源.xlsx"
  4167. ObjectName = object_path + csv_name
  4168. url = get_download_url(self.bucket, ObjectName, file_timeout)
  4169. if url:
  4170. msg += "\n文件名:\"%s\",链接:%s" % (csv_name, url)
  4171. csv_name = "报名时间,截止时间在发布时间之前的公告.xlsx"
  4172. ObjectName = object_path + csv_name
  4173. url = get_download_url(self.bucket, ObjectName, file_timeout)
  4174. if url:
  4175. msg += "\n文件名:\"%s\",链接:%s" % (csv_name, url)
  4176. atMobiles = ['18813973429'] # 维阵
  4177. ACCESS_TOKEN_DATAWORKS = "https://oapi.dingtalk.com/robot/send?access_token=9489f01c4ab9f0c3f87e2ff5c3e35eb9fb0d17afb6244de4683596df1111daea"
  4178. sentMsgToDD(msg,ACCESS_TOKEN_DATAWORKS,atMobiles=atMobiles)
def send_daily_check_data2(self):
    """Push yesterday's data-quality check details to the data-monitor API.

    Downloads up to five daily report spreadsheets from OSS (skipping any
    that are missing), converts each row into a record of the form
    {"WEB_SOURCE_NO", "WEBTYPE", "TYPE", "ITEMS", ...}, accumulates them in
    ``res_json`` and POSTs the payload to the saveQualityListData endpoint.
    """
    import datetime
    import pandas as pd
    from itertools import groupby
    # channel display name -> numeric docchannel code
    dict_channel = {"公告变更": 51,
                    "招标公告": 52,
                    "中标信息": 101,
                    "招标预告": 102,
                    "招标答疑": 103,
                    "资审结果": 105,
                    "法律法规": 106,
                    "新闻资讯": 107,
                    "采购意向": 114,
                    "拍卖出让": 115,
                    "土地矿产": 116,
                    "产权交易": 117,
                    "废标公告": 118,
                    "候选人公示": 119,
                    "合同公告": 120}
    # numeric docchannel code -> display name
    label2channel = {v:k for k,v in dict_channel.items()}
    def post_data(url,json_data):
        # POST json_data to url with up to 3 attempts; True on HTTP 200.
        post_sucess = False
        for i in range(3):
            if not post_sucess:
                try:
                    # send the POST request carrying the JSON payload
                    response = requests.post(url, json=json_data)
                    # check the response status code
                    if response.status_code == 200:
                        post_sucess = True
                except requests.exceptions.RequestException as e:
                    log("send_daily_check_data2,post error reason: %s"%(str(e)))
                    pass
        return post_sucess
    # accumulated payload sent to the monitor service
    res_json = {
        "data": [],
        "count": 0
    }
    # yesterday's date
    date = str(datetime.date.today() - datetime.timedelta(days=1))
    oss_path = 'tmp_document_quality_data/'
    object_path = oss_path + date + '/'
    # report 1: per-source rule-check results; one record per error type
    csv_name = "数据质量监控检查结果.xlsx"
    ObjectName = object_path + csv_name
    # NOTE(review): LocalPath is reused as the download target for every
    # report below even though it is named after the first file — confirm
    # this is intentional.
    LocalPath = os.path.join(self.current_path,"download",csv_name)
    down_res = downloadFile(self.bucket,ObjectName,LocalPath,retry=3)
    if down_res:
        df = pd.read_excel(LocalPath)
        for web_source_no,original_docchannel,error_rule in zip(df['web_source_no'],df['original_docchannel'],df['error_rule']):
            error_rule = json.loads(error_rule)
            for error_type,error_sample in error_rule.items():
                tmp_data = {
                    "WEB_SOURCE_NO": web_source_no,
                    "WEBTYPE": label2channel.get(original_docchannel, ""),
                    "TYPE": error_type,
                    "ITEMS": error_sample
                }
                res_json['data'].append(tmp_data)
                res_json['count'] += 1
        os.remove(LocalPath)
    # report 2: sources with many duplicated announcements; keep at most
    # 5 records per source
    csv_name = "公告重复量大的编号.xlsx"
    ObjectName = object_path + csv_name
    down_res = downloadFile(self.bucket, ObjectName, LocalPath, retry=3)
    if down_res:
        df = pd.read_excel(LocalPath)
        tmp_list = []
        for web_source_no,fingerprint,original_docchannel,cnt,res in zip(df['web_source_no'], df['fingerprint'],
                                                                         df['original_docchannel'],df['cnt'],df['res']):
            tmp_data = {
                "WEB_SOURCE_NO": web_source_no,
                "WEBTYPE": label2channel.get(original_docchannel, ""),
                "TYPE": "编号公告重复",
                "FINGERPRINT": fingerprint,
                "ITEMS": json.loads(res)
            }
            tmp_list.append(tmp_data)
        # groupby needs the list sorted by the same key
        tmp_list.sort(key=lambda x: x['WEB_SOURCE_NO'])
        for key, group in groupby(tmp_list, lambda x: (x['WEB_SOURCE_NO'])):
            group = list(group)[:5]
            res_json['data'].extend(group)
            res_json['count'] += len(group)
        os.remove(LocalPath)
    # report 3: sources with many duplicated attachments; at most 5 per source
    csv_name = "公告附件重复量大的编号.xlsx"
    ObjectName = object_path + csv_name
    down_res = downloadFile(self.bucket, ObjectName, LocalPath, retry=3)
    if down_res:
        df = pd.read_excel(LocalPath)
        tmp_list = []
        for web_source_no,filemd5,original_docchannel,cnt,res in zip(df['web_source_no'],df['filemd5'],
                                                                     df['original_docchannel'],df['cnt'],df['res']):
            tmp_data = {
                "WEB_SOURCE_NO": web_source_no,
                "WEBTYPE": label2channel.get(original_docchannel, ""),
                "TYPE": "编号附件重复",
                "FILEMD5": filemd5,
                "ITEMS": json.loads(res)
            }
            tmp_list.append(tmp_data)
        tmp_list.sort(key=lambda x: x['WEB_SOURCE_NO'])
        for key, group in groupby(tmp_list, lambda x: (x['WEB_SOURCE_NO'])):
            group = list(group)[:5]
            res_json['data'].extend(group)
            res_json['count'] += len(group)
        os.remove(LocalPath)
    # report 4: sources whose attachment recognition looks abnormal
    csv_name = "附件识别异常的站源.xlsx"
    ObjectName = object_path + csv_name
    down_res = downloadFile(self.bucket, ObjectName, LocalPath, retry=3)
    if down_res:
        df = pd.read_excel(LocalPath)
        for web_source_no,original_docchannel,error_ratio,error_sample,res in zip(df['web_source_no'], df['original_docchannel'],
                                                                                  df['error_ratio'],df['error_sample'],df['res']):
            tmp_data = {
                "WEB_SOURCE_NO": web_source_no,
                "WEBTYPE": label2channel.get(original_docchannel, ""),
                "TYPE": "附件识别异常",
                "ITEMS": json.loads(res)
            }
            res_json['data'].append(tmp_data)
            res_json['count'] += 1
        os.remove(LocalPath)
    # report 5: announcements whose signup/deadline precedes the publish date
    csv_name = "报名时间,截止时间在发布时间之前的公告.xlsx"
    ObjectName = object_path + csv_name
    down_res = downloadFile(self.bucket, ObjectName, LocalPath, retry=3)
    if down_res:
        df = pd.read_excel(LocalPath)
        tmp_list = []
        for web_source_no,original_docchannel,res in zip(df['web_source_no'],df['original_docchannel'],df['res']):
            tmp_data = {
                "WEB_SOURCE_NO": web_source_no,
                "WEBTYPE": label2channel.get(original_docchannel, ""),
                "TYPE": "截止日期在发布日期之前",
                "ITEMS": json.loads(res)
            }
            tmp_list.append(tmp_data)
        res_json['data'].extend(tmp_list)
        res_json['count'] += len(tmp_list)
        os.remove(LocalPath)
    # url = "http://120.132.118.205:17090/saveQualityListData"
    url = "http://data-monitor.bidizhaobiao.com/oldApi/saveQualityListData"
    res = post_data(url,res_json)
    if res:
        log("send_daily_check_data2,sent data len: %d"%(res_json['count']))
# Fix document records based on their project (project2) grouping
def fix_doc_by_project2(self):
    """Repair documents using the consensus of the project they belong to.

    Pipeline:
      1. collect project2 rows updated yesterday with 4..30 documents;
      2. fetch each member document and parse its extract_json;
      3. per project: when >70% of the documents are tender-type, fix
         docchannel / province / contact phones that disagree with the
         project majority;
      4. POST every change to the updateAreaAndContact endpoint.
    """
    import datetime
    from itertools import groupby
    from collections import Counter
    # channel display name -> numeric docchannel code
    label2key = {
        '公告变更': 51,
        '招标公告': 52,
        '中标信息': 101,
        '招标预告': 102,
        '招标答疑': 103,
        '招标文件': 104,
        '资审结果': 105,
        '法律法规': 106,
        '新闻资讯': 107,
        '采购意向': 114,
        '拍卖出让': 115,
        '土地矿产': 116,
        '产权交易': 117,
        '废标公告': 118,
        '候选人公示': 119,
        '合同公告': 120,
        '开标记录': 121,
        '验收合同': 122,
        # the following are excluded (non-tender data)
        '拟在建数据': 301,
        '审批项目数据': 302,
        '投诉处罚': 303
    }
    # numeric code -> display name
    key2label = dict((i[1], i[0]) for i in label2key.items())
    today = str(datetime.date.today())
    yesterday = str(datetime.date.today() - datetime.timedelta(days=1))
    front_year = str(datetime.date.today() - datetime.timedelta(days=365))
    # projects updated yesterday, within the last year, with 4..30 docs
    bool_query = BoolQuery(must_queries=[RangeQuery("update_time", yesterday + " 00:00:00", today + " 00:00:00"),
                                         RangeQuery("page_time", front_year, today),
                                         RangeQuery("status", 201, 301),
                                         RangeQuery("docid_number", 4, 30)]
                           )
    all_rows = []
    rows, next_token, total_count, is_all_succeed = self.ots_client.search("project2", "project2_index",
                                                                           SearchQuery(bool_query, sort=Sort(sorters=[
                                                                               FieldSort("update_time", SortOrder.ASC)]),
                                                                                       limit=100, get_total_count=True),
                                                                           ColumnsToGet(['uuid', 'docids', 'update_time','docid_number'],
                                                                                        return_type=ColumnReturnType.SPECIFIED))
    all_rows.extend(rows)
    # page through all matching projects
    while next_token:
        rows, next_token, total_count, is_all_succeed = self.ots_client.search("project2", "project2_index",
                                                                               SearchQuery(bool_query,
                                                                                           next_token=next_token,
                                                                                           sort=Sort(sorters=[
                                                                                               FieldSort("update_time",SortOrder.ASC)]),
                                                                                           limit=100,get_total_count=True),
                                                                               ColumnsToGet(['uuid', 'docids', 'update_time','docid_number'],
                                                                                            return_type=ColumnReturnType.SPECIFIED))
        all_rows.extend(rows)
    list_dict = getRow_ots(all_rows)
    # flatten to [project_uuid, docid] pairs
    docids_list = []
    for _dict in list_dict:
        _uuid = _dict.get("uuid", "")
        _docids = _dict.get("docids", "")
        _docids = _docids.split(",")
        for docid in _docids:
            docids_list.append([_uuid, int(docid)])
    # print('docids_list len:', len(docids_list))
    ots_query_res = []
    doc_columns_list = ['page_time', 'tenderee', 'tenderee_phone', 'agency', 'agency_phone', 'extract_count',
                        "sub_docs_json",'extract_json', 'extract_json1', 'extract_json2', 'extract_json3']
    def extract_json_process(res_json):
        # Parse the document's (possibly chunked) extract_json in place:
        # concatenates extract_json1..3, json-decodes, and lifts docchannel,
        # district and prem fields to the top level. Returns None on bad JSON.
        extract_json = res_json.pop("extract_json")
        extract_json = extract_json if extract_json else "{}"
        if 'extract_json1' in res_json:
            extract_json1 = res_json.pop("extract_json1")
            extract_json1 = extract_json1 if extract_json1 else ""
            extract_json = extract_json + extract_json1
        if 'extract_json2' in res_json:
            extract_json2 = res_json.pop("extract_json2")
            extract_json2 = extract_json2 if extract_json2 else ""
            extract_json = extract_json + extract_json2
        if 'extract_json3' in res_json:
            extract_json3 = res_json.pop("extract_json3")
            extract_json3 = extract_json3 if extract_json3 else ""
            extract_json = extract_json + extract_json3
        try:
            extract_json = json.loads(extract_json)
        except:
            return None
        docchannel_dict = extract_json.get('docchannel', {})
        res_json['docchannel'] = docchannel_dict.get('docchannel', "")
        res_json['life_docchannel'] = docchannel_dict.get('life_docchannel', "")
        district_dict = extract_json.get('district', {})
        res_json['province'] = district_dict.get('province', "")
        res_json['city'] = district_dict.get('city', "")
        res_json['district'] = district_dict.get('district', "")
        res_json['area'] = district_dict.get('area', "")
        prem = extract_json.get('prem', {})
        res_json['prem'] = prem
        return res_json
    def _handle(item, _):
        # Worker: fetch and parse one member document (3 attempts).
        _uuid = item[0]  # project uuid
        _docid = item[1]
        for i in range(3):
            try:
                bool_query = BoolQuery(must_queries=[TermQuery('docid', _docid)]
                                       )
                rows, next_token, total_count, is_all_succeed = self.ots_client.search("document", "document_index",
                                                                                       SearchQuery(bool_query,
                                                                                                   sort=Sort(sorters=[FieldSort("page_time",SortOrder.ASC)]),
                                                                                                   limit=None,get_total_count=True),
                                                                                       ColumnsToGet(doc_columns_list,
                                                                                                    return_type=ColumnReturnType.SPECIFIED))
                res = getRow_ots(rows)
                if res:
                    # drop weakly-related documents via extract_count
                    if res[0].get('extract_count', 0) > 5:
                        ots_query_res.append([_uuid, _docid, extract_json_process(res[0])])
                break
            except Exception as e:
                # print('error:',e)
                pass
    # drain the work in chunks of 10000 to bound queue size
    task_queue = Queue()
    for item in docids_list:
        task_queue.put(item)
        if task_queue.qsize() >= 10000:
            _mt = MultiThreadHandler(task_queue, _handle, None, 20)
            _mt.run()
    if task_queue.qsize() >= 0:
        _mt = MultiThreadHandler(task_queue, _handle, None, 20)
        _mt.run()
    # print('ots_query_res len:', len(ots_query_res))
    # process the collected documents and build the fix payloads
    ots_query_res.sort(key=lambda x: x[0])
    # tender-type channels
    zb_type = [51, 52, 101, 102, 103, 104, 105, 114, 118, 119, 120, 121, 122]
    zb_type = [key2label[i] for i in zb_type]
    change_res = []
    for key, group in groupby(ots_query_res, lambda x: (x[0])):
        uuid = key
        project_data = list(group)
        all_len = len(project_data)
        if all_len < 4:
            continue
        zb_len = sum([1 if i[2].get('docchannel') in zb_type else 0 for i in project_data])
        # share of tender-type announcements in the project
        # if zb_len / all_len <= 0.5:
        if zb_len / all_len <= 0.7:
            # project is not tender-related; skip it
            continue
        # most common province within the project
        province_list = [i[2].get('province', '') for i in project_data]
        province_sort = Counter(province_list).most_common()
        change_province = ""
        change_city = ""
        change_district = ""
        change_area = ""
        # if province_sort[0][1]/all_len > 0.5:
        if province_sort[0][1] / all_len > 0.7:
            if province_sort[0][0] and province_sort[0][0] not in ["全国", "未知"]:
                change_province = province_sort[0][0]
        if change_province:
            # only fix down to city; district is set to "未知" (unknown)
            change_province_data = [(i[2].get('province', ''), i[2].get('city', ''), i[2].get('area', '')) for i in
                                    project_data if i[2].get('province', '') == change_province]
            change_province_data_sort = Counter(change_province_data).most_common()
            change_city = change_province_data_sort[0][0][1]
            change_area = change_province_data_sort[0][0][2]
            change_district = "未知"
        # tally contact phones: phone -> {entity name -> occurrences}
        phone_dict = {}
        for d in project_data:
            tenderee = d[2].get("tenderee", "")
            agency = d[2].get("agency", "")
            prem = d[2].get("prem", {})
            if len(prem) > 0:
                for name, project in prem.items():
                    roleList = project.get("roleList", [])
                    for role in roleList:
                        role_name = role.get("role_name", "")
                        role_text = role.get("role_text", "")
                        if role_name in ['tenderee', 'agency', 'win_tenderer']:
                            linklist = role.get("linklist", [])
                            for _contact in linklist:
                                if _contact[1] not in phone_dict:
                                    phone_dict[_contact[1]] = {}
                                if role_text not in phone_dict[_contact[1]]:
                                    phone_dict[_contact[1]][role_text] = 0
                                phone_dict[_contact[1]][role_text] += 1
        # reduce to phone -> entity names with the maximal count
        new_phone_dict = dict((phone, []) for phone in phone_dict)
        for phone, value in phone_dict.items():
            phone_name = [(name, count) for name, count in value.items()]
            phone_name.sort(key=lambda x: x[1], reverse=True)
            max_count = phone_name[0][1]
            max_name = [name for name, count in value.items() if count == max_count and max_count > 0]
            new_phone_dict[phone] = max_name
        for item in project_data:
            change_json = {"partitionkey": item[2].get("partitionkey"),
                           'docid': item[1],
                           'contactsByDelete': []}
            tenderee = item[2].get("tenderee", "")
            agency = item[2].get("agency", "")
            # docchannel fix: use life_docchannel when the current channel
            # is not tender-type but the life channel is
            docchannel = item[2].get('docchannel', "")
            life_docchannel = item[2].get('life_docchannel', "")
            if docchannel and docchannel not in zb_type:
                if life_docchannel in zb_type and docchannel != '采招数据':
                    change_json['docchannel'] = label2key.get(life_docchannel)
            # province fix
            province = item[2].get('province', "")
            if change_province:
                if province != change_province and province in ["全国", "未知", '']:  # only fix unrecognized provinces
                    change_json['province'] = change_province
                    change_json['city'] = change_city
                    change_json['district'] = change_district
                    change_json['area'] = change_area
            # contact fix: drop phones whose majority entity differs
            tenderee_phone = item[2].get("tenderee_phone", "")
            agency_phone = item[2].get("agency_phone", "")
            prem = item[2].get("prem", {})
            sub_docs_json = item[2].get("sub_docs_json", "[]")
            try:
                sub_docs_json = json.loads(sub_docs_json)
            except:
                sub_docs_json = []
            for name, project in prem.items():
                roleList = project.get("roleList", [])
                for role in roleList:
                    role_name = role.get("role_name", "")
                    role_text = role.get("role_text", "")
                    if role_name == 'tenderee' and role_text == tenderee:
                        linklist = role.get("linklist", [])
                        need_change = False
                        right_contact = []
                        for _contact in linklist:
                            if _contact[1] and new_phone_dict.get(_contact[1]) and role_text not in new_phone_dict[_contact[1]]:
                                change_json['contactsByDelete'].append({"enterpriseName": role_text, "phoneNo": _contact[1]})
                                if _contact[1] == tenderee_phone:
                                    need_change = True
                            else:
                                right_contact.append([_contact[0], _contact[1]])
                        if need_change:
                            if right_contact:
                                right_contact.sort(reverse=True)
                                change_json['tendereeContact'] = right_contact[0][0]
                                change_json['tendereePhone'] = right_contact[0][1]
                    elif role_name == 'agency' and role_text == agency:
                        linklist = role.get("linklist", [])
                        need_change = False
                        right_contact = []
                        for _contact in linklist:
                            if _contact[1] and new_phone_dict.get(_contact[1]) and role_text not in new_phone_dict[_contact[1]]:
                                change_json['contactsByDelete'].append({"enterpriseName": role_text, "phoneNo": _contact[1]})
                                if _contact[1] == agency_phone:
                                    need_change = True
                            else:
                                right_contact.append([_contact[0], _contact[1]])
                        if need_change:
                            if right_contact:
                                right_contact.sort(reverse=True)
                                change_json['agencyContact'] = right_contact[0][0]
                                change_json['agencyPhone'] = right_contact[0][1]
                    elif role_name == 'win_tenderer':
                        linklist = role.get("linklist", [])
                        for _contact in linklist:
                            if _contact[1] and new_phone_dict.get(_contact[1]) and role_text not in new_phone_dict[_contact[1]]:
                                change_json['contactsByDelete'].append({"enterpriseName": role_text, "phoneNo": _contact[1]})
            # also clear bad win_tenderer phones inside sub_docs_json
            sub_docs_json_change = False
            if sub_docs_json:
                for _project in sub_docs_json:
                    win_tenderer = _project.get("win_tenderer", "")
                    win_tenderer_phone = _project.get("win_tenderer_phone", "")
                    if win_tenderer_phone and new_phone_dict.get(win_tenderer_phone) and win_tenderer not in new_phone_dict[win_tenderer_phone]:
                        _project["win_tenderer_phone"] = ""
                        _project["win_tenderer_manager"] = ""
                        sub_docs_json_change = True
            if sub_docs_json_change:
                change_json['subDocsJson'] = sub_docs_json
            # de-duplicate the delete list
            new_contact_json = []
            for _contact in change_json['contactsByDelete']:
                if _contact not in new_contact_json:
                    new_contact_json.append(_contact)
            change_json['contactsByDelete'] = new_contact_json
            # >3 keys means something beyond the base fields was changed
            if len(change_json) > 3 or len(change_json['contactsByDelete']) > 0:
                # when the region was not changed, pass through the
                # originally extracted region values
                if not change_json.get("province"):
                    change_json['area'] = item[2].get("area", "")
                    change_json['province'] = item[2].get("province", "")
                    change_json['city'] = item[2].get("city", "")
                    change_json['district'] = item[2].get("district", "")
                change_res.append({"document": change_json})
    # post result
    headers = {'Content-Type': 'application/json',
               "Authorization": "Bearer eyJhbGciOiJIUzUxMiJ9.eyJ1c2VySWQiOjEsInVzZXJuYW1lIjoiYWRtaW4iLCJ1dWlkIjoiNGQwYzA0ODYtMzVmZi00MDJhLTk4OWQtNWEwNTE3YTljMDNiIiwic3ViIjoiMSIsImlhdCI6MTY3OTk5MTcxNywiZXhwIjo0ODMzNTkxNzE3fQ.ESDDnEDYP5ioK4ouHOYXsZbLayGRNVI9ugpbxDx_3fPIceD1KIjlDeopBmeATLoz8VYQihd8qO-UzP5pDsaUmQ"}
    # url = "http://192.168.2.26:8002/document/updateAreaAndContact"
    url = "http://data-api.bidizhaobiao.com/document/updateAreaAndContact"
    for _data in change_res:
        post_sucess = False
        for i in range(3):
            if not post_sucess:
                try:
                    # send the POST request carrying the JSON payload
                    response = requests.post(url, json=_data,headers=headers)
                    # print(response.status_code,response.json())
                    # check the response status code
                    if response.status_code == 200:
                        post_sucess = True
                except requests.exceptions.RequestException as e:
                    # log("fix doc by project2,post error reason: %s"%(str(e)))
                    pass
    log("fix doc by project2, change doc nums:%d"%len(change_res))
  4633. def start_flow_dumplicate(self):
  4634. schedule = BlockingScheduler()
  4635. schedule.add_job(self.flow_dumplicate,"cron",second="*/5")
  4636. schedule.add_job(self.flow_dumpcate_comsumer,"cron",second="*/30")
  4637. schedule.add_job(self.bdm.monitor_dumplicate,"cron",minute="*/10")
  4638. schedule.add_job(self.flow_remove,"cron",hour="20")
  4639. schedule.add_job(self.send_daily_check_data,"cron",hour='9', minute='10')
  4640. schedule.add_job(self.send_daily_check_data2,"cron",hour='9', minute='10')
  4641. schedule.add_job(self.fix_doc_by_project2,"cron",hour='8', minute='10')
  4642. schedule.add_job(self.flow_remove_project_tmp,"cron",hour="20")
  4643. schedule.add_job(self.fix_doc_which_not_in_project,"cron",minute="*/10")
  4644. schedule.start()
  4645. def changeSaveStatus(self,list_dict):
  4646. if list_dict is not None:
  4647. for _dict in list_dict:
  4648. if isinstance(_dict,dict):
  4649. if _dict.get(document_tmp_save,1)==1:
  4650. _d = {"partitionkey":_dict["partitionkey"],
  4651. "docid":_dict["docid"],
  4652. document_tmp_save:0
  4653. }
  4654. _d_tmp = Document_tmp(_d)
  4655. if _d_tmp.exists_row(self.ots_client):
  4656. _d_tmp.update_row(self.ots_client)
  4657. elif isinstance(_dict,int):
  4658. _d = {"partitionkey":_dict%500+1,
  4659. "docid":_dict,
  4660. document_tmp_save:0
  4661. }
  4662. _d_tmp = Document_tmp(_d)
  4663. if _d_tmp.fix_columns(self.ots_client,["status",document_update_document],True):
  4664. if _d_tmp.getProperties().get("status")==1:
  4665. if _d_tmp.getProperties().get(document_update_document,"")!="true":
  4666. _d_tmp.setValue("status",0,True)
  4667. _d_tmp.update_row(self.ots_client)
  4668. # project_uuids为目标项目uuid,手动把docid合并到project_uuids对应的项目中
  4669. def test_dumplicate(self,docid,project_uuids=[]):
  4670. # columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status]
  4671. columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type,'detail_link','products','crtime']
  4672. # print('columns',columns)
  4673. item = self.get_attrs_before_dump(docid,columns)
  4674. # 是否需要把属性复制到doc_tmp表
  4675. item['test_merge'] = False
  4676. if item:
  4677. log("start dumplicate_comsumer_handle")
  4678. # self.dumplicate_comsumer_handle(item,None,self.ots_client,get_all=False,upgrade=False,project_uuids=project_uuids)
  4679. self.dumplicate_comsumer_handle(item,None,self.ots_client,get_all=True,upgrade=False,project_uuids=project_uuids)
  4680. return
  4681. def test_merge(self,list_docid_less,list_docid_greater):
  4682. list_docs_less = self.search_docs(list_docid_less)
  4683. list_projects_less = self.generate_projects_from_document(list_docs_less)
  4684. list_docs_greater = self.search_docs(list_docid_greater)
  4685. list_projects_greater = self.generate_projects_from_document(list_docs_greater)
  4686. list_projects_less.extend(list_projects_greater)
  4687. list_projects = dumplicate_projects(list_projects_less,b_log=True)
  4688. project_json = to_project_json(list_projects)
  4689. log("project_json:%s"%project_json)
  4690. return project_json
def getRemainDoc(self,docid):
    """Run the dedup rule pipeline for one document and return the docid
    that should remain (the best document of the duplicate group).

    :param docid: document id to check
    :return: best docid among the duplicates, or None when the document is
        not found in the document table
    """
    columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json]
    bool_query = BoolQuery(must_queries=[
        TermQuery("docid",docid)
    ])
    rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
                                                                        SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
                                                                        ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
    list_dict = getRow_ots(rows)
    if len(list_dict)>0:
        item = list_dict[0]
        start_time = time.time()
        # enrich item with the parsed extract fields
        self.post_extract(item)
        base_list = []
        set_docid = set()
        # build the dedup query rules for this document, strongest first
        list_rules,table_name,table_index = self.translate_dumplicate_rules(flow_dumplicate_status_from,item,to_log=True)
        list_rules.sort(key=lambda x:x["confidence"],reverse=True)
        _i = 0
        step = 5
        # the document itself always participates with top confidence
        item["confidence"] = 999
        if item.get(document_tmp_docid) not in set_docid:
            base_list.append(item)
            set_docid.add(item.get(document_tmp_docid))
        # run the rules in batches of `step`, excluding recently collected docids
        while _i<len(list_rules):
            must_not_q = []
            if len(base_list)>0:
                must_not_q = [TermQuery("docid",a) for a in list(set_docid)[-100:]]
            _query = BoolQuery(should_queries=[_rule["query"] for _rule in list_rules[_i:_i+step]],
                               must_not_queries=must_not_q)
            _rule = list_rules[_i]
            confidence = _rule["confidence"]
            singleNum_keys = _rule["singleNum_keys"]
            contain_keys = _rule["contain_keys"]
            multiNum_keys = _rule["multiNum_keys"]
            self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json])
            _i += step
        _time = time.time()
        log("%d start final check with length:%d"%(item["docid"],len(base_list)))
        # final pairwise check over the collected candidates
        final_list = self.dumplicate_fianl_check(base_list)
        log("%d final_check takes:%.2f"%(item["docid"],time.time()-_time))
        best_docid = self.get_best_docid(final_list)
        return best_docid
    return None
def compare_dumplicate_check():
    """Offline A/B harness: run the dedup consumer on a fixed docid range
    under check_rule defaults vs check_rule=2 and write the per-doc diff of
    returned docid lists to compare_dump.xlsx.
    """
    import pandas as pd
    df_dump = Dataflow_dumplicate(start_delete_listener=False)
    test_count = 1000  # stop paging once this many sample docs are loaded
    # columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status]
    columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district]
    bool_query = BoolQuery(must_queries=[
        RangeQuery("docid",400453395,400463395)
    ])
    rows,next_token,total_count,is_all_succeed = df_dump.ots_client.search("document","document_index",
                                                                           SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=10,get_total_count=True),
                                                                           ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
    log("flow_dumplicate producer total_count:%d"%total_count)
    list_dict = getRow_ots(rows)
    while 1:
        if not next_token or len(list_dict)>=test_count:
            break
        # NOTE(review): this paging call does not pass next_token=next_token,
        # so it appears to re-fetch the first page until test_count is
        # reached — confirm whether that is intended.
        rows,next_token,total_count,is_all_succeed = df_dump.ots_client.search("document","document_index",
                                                                               SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
                                                                               ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
        list_dict.extend(getRow_ots(rows))
    def _handle1(_item,result_queue):
        # first pass: record the baseline dedup result under "before"
        try:
            list_docid = df_dump.dumplicate_comsumer_handle(_item,None,df_dump.ots_client,get_all=True,upgrade=False)
            _item["before"] = list_docid
        except Exception as e:
            pass
    dump_result = {}
    for item in list_dict:
        dump_result[item["docid"]] = {}
    task_queue = Queue()
    list_item = []
    for item in list_dict:
        # work on copies so each pass starts from pristine rows
        _item = {}
        _item.update(item)
        list_item.append(_item)
        task_queue.put(_item)
    mt = MultiThreadHandler(task_queue,_handle1,None,30)
    mt.run()
    for item in list_item:
        dump_result[item["docid"]]["before"] = item.get("before")
    # switch rule set, then repeat the run as the "after" pass
    df_dump.check_rule = 2
    def _handle2(_item,result_queue):
        try:
            list_docid1 = df_dump.dumplicate_comsumer_handle(_item,None,df_dump.ots_client,get_all=True,upgrade=False)
            _item["after"] = list_docid1
        except Exception as e:
            pass
    task_queue = Queue()
    list_item = []
    for item in list_dict:
        _item = {}
        _item.update(item)
        list_item.append(_item)
        task_queue.put(_item)
    mt = MultiThreadHandler(task_queue,_handle2,None,30)
    mt.run()
    for item in list_item:
        dump_result[item["docid"]]["after"] = item.get("after")
    # build the comparison sheet: per docid, both lists and their set diffs
    df_data = {"docid":[],
               "before":[],
               "after":[],
               "before-after":[],
               "after-before":[]}
    for docid,_d in dump_result.items():
        df_data["docid"].append(docid)
        before = _d.get("before",[])
        after = _d.get("after",[])
        df_data["before"].append(str(before))
        df_data["after"].append(str(after))
        df_data["before-after"].append(str(set(before)-set(after)))
        df_data["after-before"].append(str(set(after)-set(before)))
    df = pd.DataFrame(df_data,columns=["docid","before","after","before-after","after-before"])
    df.to_excel("compare_dump.xlsx")
  4808. def fix_merge_docid(docid):
  4809. def get_uuid_docids(docid):
  4810. ots_client = getConnect_ots()
  4811. bool_query = BoolQuery(must_queries=[
  4812. TermQuery("docids",docid)
  4813. ])
  4814. rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
  4815. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time")]),limit=100,get_total_count=True),
  4816. ColumnsToGet(["docids"],return_type=ColumnReturnType.SPECIFIED))
  4817. list_row = getRow_ots(rows)
  4818. while next_token:
  4819. rows,next_token,total_count,is_all_succeed = ots_client.search("project2","project2_index",
  4820. SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  4821. ColumnsToGet(["docids"],return_type=ColumnReturnType.SPECIFIED))
  4822. list_row.extend(getRow_ots(rows))
  4823. return list_row
  4824. def get_new_docid(list_docid1,list_docid2):
  4825. return list(set(list_docid1)-set(list_docid2))
  4826. def get_list_docid(list_row):
  4827. list_docid = []
  4828. for row in list_row:
  4829. docids = row.get("docids",'')
  4830. if docids:
  4831. list_docid.extend([int(a) for a in docids.split(",")])
  4832. return list(set(list_docid))
  4833. def get_list_uuid(list_row):
  4834. list_uuid = []
  4835. for row in list_row:
  4836. uuid = row.get("uuid",'')
  4837. if uuid:
  4838. list_uuid.append(uuid)
  4839. return list(set(list_uuid))
  4840. list_row = get_uuid_docids(docid)
  4841. print(list_row)
  4842. list_docid1 = get_list_docid(list_row)
  4843. list_new_docid = get_new_docid(list_docid1,[docid])
  4844. while 1:
  4845. if len(list_new_docid)==0:
  4846. break
  4847. list_row2 = []
  4848. for _docid in list_new_docid:
  4849. list_row2.extend(get_uuid_docids(_docid))
  4850. list_docid1 = get_list_docid(list_row)
  4851. list_docid2 = get_list_docid(list_row2)
  4852. list_new_docid = get_new_docid(list_docid1,list_docid2)
  4853. list_row.extend(list_row2)
  4854. list_uuid = get_list_uuid(list_row)
  4855. list_docid = get_list_docid(list_row)
  4856. print(list_uuid)
  4857. print(list_docid)
  4858. for _docid in list_docid:
  4859. _d = Document({document_partitionkey:_docid%500+1,
  4860. document_docid:_docid,
  4861. document_status:1})
  4862. if _d.exists_row(ots_client):
  4863. _d.update_row(ots_client)
  4864. for _uuid in list_uuid:
  4865. _p = Project({project_uuid:_uuid,})
  4866. _p.delete_row(ots_client)
if __name__ == '__main__':
    # Manual/debug entry point: the commented calls are kept as a menu of
    # one-off maintenance actions; currently runs a single-doc dedup test.
    a = time.time()
    # df = Dataflow()
    # df.flow_init()
    # df.flow_test()
    # df.test_merge()
    # df.start_flow_attachment()
    # df.start_flow_extract()
    # df.start_flow_dumplicate()
    # # df.start_flow_merge()
    # df.start_flow_remove()
    # download_attachment()
    # test_attachment_interface()
    df_dump = Dataflow_dumplicate(start_delete_listener=False)
    # df_dump.start_flow_dumplicate()
    df_dump.test_dumplicate(616657130,
                            # project_uuids=["904cab6d-169b-4403-bbeb-2885a8546aa1"]
                            )
    # df_dump.dumplicate_comsumer_handle_interface(603504420,document_table="document_0000",document_table_index="document_0000_index",project_table="project_0000",project_table_index="project_0000_index_formerge")
    # compare_dumplicate_check()
    # df_dump.test_merge([391898061
    #                     ],[371551361,])
    # df_dump.flow_remove_project_tmp()
    # fix_merge_docid(595271944)
    print("takes",time.time()-a)
    # df_dump.fix_doc_which_not_in_project()
    # df_dump.delete_projects_by_document(16288036)
    # log("=======")
    # for i in range(3):
    #     time.sleep(20)
    #
    # a = {"docid":74295123}
    # send_msg_toacmq(df_dump.pool_mq_ali,json.dumps(a),df_dump.doc_delete_queue)