dataflow.py 244 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
7477847794780478147824783478447854786478747884789479047914792479347944795479647974798479948004801480248034804480548064807480848094810481148124813481448154816481748184819482048214822482348244825482648274828482948304831483248334834483548364837483848394840484148424843484448454846484748484849485048514852485348544855485648574858485948604861486248634864486548664867486848694870487148724873487448754876487748784879488048814882488348844885488648874888488948904891489248934894489548964897489848994900490149024903490449054906490749084909491049114912491349144915491649174918491949204921492249234924492549264927492849294930493149324933493449354936493749384939494049414942494349444945494649474948494949504951495249534954495549564957495849594960496149624963496449654966496749684969497049714972497349744975497649774978497949804981498249834984498549864987498849894990499149924993499449954996499749984999500050015002500350045005500650075008500950105011501250135014501550165017501850195020502150225023502450255026502750285029503050315032503350345035503650375038503950405041504250435044504550465047504850495050505150525053505450555056505750585059506050615062506350645065506650675068506950705071507250735074507550765077507850795080508150825083508450855086508750885089509050915092509350945095509650975098509951005101510251035104510551065107510851095110511151125113511451155116511751185119512051215122512351245125512651275128512951305131513251335134513551365137
  1. # sys.path.append("/data")
  2. from BaseDataMaintenance.dataSource.source import getConnect_activateMQ_ali
  3. from BaseDataMaintenance.common.multiThread import MultiThreadHandler
  4. from BaseDataMaintenance.common.multiProcess import MultiHandler
  5. from queue import Queue
  6. from multiprocessing import Queue as PQueue
  7. from BaseDataMaintenance.model.ots.document_tmp import *
  8. from BaseDataMaintenance.model.ots.attachment import *
  9. from BaseDataMaintenance.model.ots.document_html import *
  10. from BaseDataMaintenance.model.ots.document_extract2 import *
  11. from BaseDataMaintenance.model.ots.project import *
  12. from BaseDataMaintenance.model.ots.project2_tmp import *
  13. from BaseDataMaintenance.model.ots.document import *
  14. from BaseDataMaintenance.model.ots.project_process import *
  15. import base64
  16. from BaseDataMaintenance.dataSource.interface import getAttachDealInterface,sentMsgToDD
  17. from uuid import uuid4
  18. from BaseDataMaintenance.common.ossUtils import *
  19. from BaseDataMaintenance.dataSource.source import is_internal,getAuth
  20. from apscheduler.schedulers.blocking import BlockingScheduler
  21. from BaseDataMaintenance.maintenance.dataflow_settings import *
  22. from threading import Thread
  23. import oss2
  24. from BaseDataMaintenance.maxcompute.documentDumplicate import *
  25. from BaseDataMaintenance.maxcompute.documentMerge import *
  26. from BaseDataMaintenance.common.otsUtils import *
  27. from BaseDataMaintenance.common.activateMQUtils import *
  28. from BaseDataMaintenance.dataMonitor.data_monitor import BaseDataMonitor
  29. from BaseDataMaintenance.dataSource.pool import ConnectorPool
  30. def getSet(list_dict,key):
  31. _set = set()
  32. for item in list_dict:
  33. if key in item:
  34. if item[key]!='' and item[key] is not None:
  35. if re.search("^\d[\d\.]*$",item[key]) is not None:
  36. _set.add(str(float(item[key])))
  37. else:
  38. _set.add(str(item[key]))
  39. return _set
  40. def getSimilarityOfString(str1,str2):
  41. _set1 = set()
  42. _set2 = set()
  43. if str1 is not None:
  44. for i in range(1,len(str1)):
  45. _set1.add(str1[i-1:i+1])
  46. if str2 is not None:
  47. for i in range(1,len(str2)):
  48. _set2.add(str2[i-1:i+1])
  49. _len = max(1,min(len(_set1),len(_set2)))
  50. return len(_set1&_set2)/_len
  51. def getDiffIndex(list_dict,key,confidence=100):
  52. _set = set()
  53. for _i in range(len(list_dict)):
  54. item = list_dict[_i]
  55. if item["confidence"]>=confidence:
  56. continue
  57. if key in item:
  58. if item[key]!='' and item[key] is not None:
  59. if re.search("^\d+(\.\d+)?$",item[key]) is not None:
  60. _set.add(str(float(item[key])))
  61. else:
  62. _set.add(str(item[key]))
  63. if len(_set)>1:
  64. return _i
  65. return len(list_dict)
  66. def transformSWF(bucket,attachment_hub_url,objectPath,localpath,swf_dir):
  67. swf_urls = []
  68. try:
  69. list_files = os.listdir(swf_dir)
  70. list_files.sort(key=lambda x:x)
  71. headers = dict()
  72. headers["x-oss-object-acl"] = oss2.OBJECT_ACL_PUBLIC_READ
  73. for _file in list_files:
  74. swf_localpath = "%s/%s"%(swf_dir,_file)
  75. swf_objectPath = "%s/%s"%(objectPath.split(".")[0],_file)
  76. uploadFileByPath(bucket,swf_localpath,swf_objectPath,headers)
  77. _url = "%s/%s"%(attachment_hub_url,swf_objectPath)
  78. swf_urls.append(_url)
  79. os.remove(swf_localpath)
  80. except Exception as e:
  81. traceback.print_exc()
  82. return swf_urls
  83. class Dataflow():
  84. def __init__(self):
  85. self.ots_client = getConnect_ots()
  86. self.queue_init = Queue()
  87. self.queue_attachment = Queue()
  88. self.queue_attachment_ocr = Queue()
  89. self.queue_attachment_not_ocr = Queue()
  90. self.list_attachment_ocr = []
  91. self.list_attachment_not_ocr = []
  92. self.queue_extract = Queue()
  93. self.list_extract = []
  94. self.queue_dumplicate = PQueue()
  95. self.queue_dumplicate_processed = PQueue()
  96. self.dumplicate_set = set()
  97. self.queue_merge = Queue()
  98. self.queue_syncho = Queue()
  99. self.queue_remove = Queue()
  100. self.queue_remove_project = Queue()
  101. self.attachment_rec_interface = ""
  102. self.ots_client_merge = getConnect_ots()
  103. if is_internal:
  104. self.bucket_url = "http://oss-cn-hangzhou-internal.aliyuncs.com"
  105. else:
  106. self.bucket_url = "http://oss-cn-hangzhou.aliyuncs.com"
  107. if is_internal:
  108. self.extract_url = "http://1255640119316927.vpc.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/content_extract"
  109. self.industy_url = "http://1255640119316927.vpc.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/industry_extract"
  110. self.other_url = "http://1255640119316927.vpc.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/other_extract"
  111. else:
  112. self.extract_url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/content_extract"
  113. self.industy_url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/industry_extract"
  114. self.other_url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/other_extract"
  115. self.header = {'Content-Type': 'application/json',"Authorization":"NzZmOWZlMmU2MGY3YmQ4MDBjM2E5MDAyZjhjNjQ0MzZlMmE0NTMwZg=="}
  116. self.attachment_hub_url = "https://attachment-hub.oss-cn-hangzhou.aliyuncs.com/"
  117. self.auth = getAuth()
  118. oss2.defaults.connection_pool_size = 100
  119. oss2.defaults.multiget_num_threads = 20
  120. log("bucket_url:%s"%(self.bucket_url))
  121. self.attachment_bucket_name = "attachment-hub"
  122. self.bucket = oss2.Bucket(self.auth,self.bucket_url,self.attachment_bucket_name)
  123. self.current_path = os.path.dirname(__file__)
  124. def flow_init(self):
  125. def producer():
  126. bool_query = BoolQuery(must_queries=[RangeQuery("crtime",'2022-04-20')])
  127. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
  128. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
  129. ColumnsToGet(return_type=ColumnReturnType.ALL))
  130. log("flow_init producer total_count:%d"%total_count)
  131. list_dict = getRow_ots(rows)
  132. for _dict in list_dict:
  133. self.queue_init.put(_dict)
  134. _count = len(list_dict)
  135. while next_token:
  136. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
  137. SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  138. ColumnsToGet(return_type=ColumnReturnType.ALL))
  139. list_dict = getRow_ots(rows)
  140. for _dict in list_dict:
  141. self.queue_init.put(_dict)
  142. _count += len(list_dict)
  143. def comsumer():
  144. mt = MultiThreadHandler(self.queue_init,comsumer_handle,None,30,1,ots_client=self.ots_client)
  145. mt.run()
  146. def comsumer_handle(item,result_queue,ots_client):
  147. _dochtmlcon = item.get(document_tmp_dochtmlcon,"")
  148. if document_tmp_dochtmlcon in item:
  149. item.pop(document_tmp_dochtmlcon)
  150. if document_tmp_doctextcon in item:
  151. item.pop(document_tmp_doctextcon)
  152. if document_tmp_attachmenttextcon in item:
  153. item.pop(document_tmp_attachmenttextcon)
  154. _status = item.get(document_tmp_status)
  155. new_status = None
  156. if _status>=201 and _status<=300:
  157. item[document_tmp_save] = 1
  158. new_status = 81
  159. elif _status>=401 and _status<=450:
  160. item[document_tmp_save] = 0
  161. new_status = 81
  162. else:
  163. new_status = 1
  164. # new_status = 1
  165. item[document_tmp_status] = new_status
  166. dtmp = Document_tmp(item)
  167. dhtml = Document_html({document_tmp_partitionkey:item.get(document_tmp_partitionkey),
  168. document_tmp_docid:item.get(document_tmp_docid),
  169. document_tmp_dochtmlcon:_dochtmlcon})
  170. dtmp.update_row(ots_client)
  171. dhtml.update_row(ots_client)
  172. producer()
  173. comsumer()
  174. def getTitleFromHtml(self,filemd5,_html):
  175. _soup = BeautifulSoup(_html,"lxml")
  176. _find = _soup.find("a",attrs={"data":filemd5})
  177. _title = ""
  178. if _find is not None:
  179. _title = _find.get_text()
  180. return _title
  181. def getSourceLinkFromHtml(self,filemd5,_html):
  182. _soup = BeautifulSoup(_html,"lxml")
  183. _find = _soup.find("a",attrs={"filelink":filemd5})
  184. filelink = ""
  185. if _find is None:
  186. _find = _soup.find("img",attrs={"filelink":filemd5})
  187. if _find is not None:
  188. filelink = _find.attrs.get("src","")
  189. else:
  190. filelink = _find.attrs.get("href","")
  191. return filelink
  192. def request_attachment_interface(self,attach,_dochtmlcon):
  193. filemd5 = attach.getProperties().get(attachment_filemd5)
  194. _status = attach.getProperties().get(attachment_status)
  195. _filetype = attach.getProperties().get(attachment_filetype)
  196. _size = attach.getProperties().get(attachment_size)
  197. _path = attach.getProperties().get(attachment_path)
  198. _uuid = uuid4()
  199. objectPath = attach.getProperties().get(attachment_path)
  200. localpath = os.path.join(self.current_path,"download",_uuid.hex)
  201. docids = attach.getProperties().get(attachment_docids)
  202. try:
  203. if _size>ATTACHMENT_LARGESIZE:
  204. attach.setValue(attachment_status, ATTACHMENT_TOOLARGE)
  205. log("attachment :%s of path:%s to large"%(filemd5,_path))
  206. attach.update_row(self.ots_client)
  207. return True
  208. else:
  209. d_start_time = time.time()
  210. if downloadFile(self.bucket,objectPath,localpath):
  211. time_download = time.time()-d_start_time
  212. _data_base64 = base64.b64encode(open(localpath,"rb").read())
  213. #调用接口处理结果
  214. start_time = time.time()
  215. _success,_html,swf_images = getAttachDealInterface(_data_base64,_filetype,kwargs={"timeout":600})
  216. if _success:
  217. log("process filemd5:%s of type:%s with size:%.3fM download:%ds recognize takes %ds,ret_size:%d"%(filemd5,_filetype,round(_size/1024/1024,4),time_download,time.time()-start_time,len(_html)))
  218. else:
  219. log("attach interface failed of docid:%s filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
  220. # sentMsgToDD("attach interface failed of docid:%s of filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
  221. _html = ""
  222. return False
  223. swf_images = eval(swf_images)
  224. if attach.getProperties().get(attachment_filetype)=="swf" and len(swf_images)>0:
  225. swf_urls = json.loads(attach.getProperties().get(attachment_swfUrls,"[]"))
  226. if len(swf_urls)==0:
  227. objectPath = attach.getProperties().get(attachment_path,"")
  228. localpath = os.path.join(self.current_path,"download/%s.swf"%(uuid4().hex))
  229. swf_dir = os.path.join(self.current_path,"swf_images",uuid4().hex)
  230. if not os.path.exists(swf_dir):
  231. os.mkdir(swf_dir)
  232. for _i in range(len(swf_images)):
  233. _base = swf_images[_i]
  234. _base = base64.b64decode(_base)
  235. filename = "swf_page_%d.png"%(_i)
  236. filepath = os.path.join(swf_dir,filename)
  237. with open(filepath,"wb") as f:
  238. f.write(_base)
  239. swf_urls = transformSWF(self.bucket,self.attachment_hub_url,objectPath,None,swf_dir)
  240. if os.path.exists(swf_dir):
  241. os.rmdir(swf_dir)
  242. attach.setValue(attachment_swfUrls,json.dumps(swf_urls,ensure_ascii=False),True)
  243. if re.search("<td",_html) is not None:
  244. attach.setValue(attachment_has_table,1,True)
  245. _file_title = self.getTitleFromHtml(filemd5,_dochtmlcon)
  246. filelink = self.getSourceLinkFromHtml(filemd5,_dochtmlcon)
  247. if _file_title!="":
  248. attach.setValue(attachment_file_title,_file_title,True)
  249. if filelink!="":
  250. attach.setValue(attachment_file_link,filelink,True)
  251. attach.setValue(attachment_attachmenthtml,_html,True)
  252. attach.setValue(attachment_attachmentcon,BeautifulSoup(_html,"lxml").get_text(),True)
  253. attach.setValue(attachment_status,ATTACHMENT_PROCESSED,True)
  254. attach.setValue(attachment_recsize,len(_html),True)
  255. attach.setValue(attachment_process_time,getCurrent_date(format="%Y-%m-%d %H:%M:%S"),True)
  256. attach.update_row(self.ots_client) #线上再开放更新
  257. return True
  258. else:
  259. return False
  260. except oss2.exceptions.NotFound as e:
  261. return True
  262. except Exception as e:
  263. traceback.print_exc()
  264. finally:
  265. try:
  266. os.remove(localpath)
  267. except:
  268. pass
  def rec_attachments_by_interface(self,list_attach,_dochtmlcon,save=True):
      """Gather the recognised HTML (and SWF page URLs) of a list of attachments.

      Attachments whose status is already ATTACHMENT_PROCESSED or
      ATTACHMENT_TOOLARGE are read as-is; every other attachment is first sent
      through ``self.request_attachment_interface``.

      :param list_attach: iterable of attachment objects exposing ``getProperties()``.
      :param _dochtmlcon: document HTML, forwarded unchanged to
          ``request_attachment_interface``.
      :param save: not used by this method; kept for interface compatibility.
      :return: ``(True, list_html, swf_urls)`` on success, or
          ``(False, "", [])`` as soon as one attachment fails to be processed.
      """
      list_html = []
      swf_urls = []
      for _attach in list_attach:
          # Originally annotated "test: run everything".
          _status = _attach.getProperties().get(attachment_status)
          if _status not in (ATTACHMENT_PROCESSED,ATTACHMENT_TOOLARGE):
              if not self.request_attachment_interface(_attach,_dochtmlcon):
                  return False,"",[]

          # Read the properties after the (possible) interface call so that
          # freshly written values are picked up.
          _props = _attach.getProperties()
          _html = _props.get(attachment_attachmenthtml,"")
          if _html is None:
              _html = ""
          list_html.append(_html)

          if _props.get(attachment_filetype)=="swf":
              _raw_urls = _props.get(attachment_swfUrls,"[]")
              # The property may be stored as None/empty; json.loads would
              # raise on either, so treat both as "no urls".
              if _raw_urls is None or _raw_urls=="":
                  _raw_urls = "[]"
              swf_urls.extend(json.loads(_raw_urls))
      return True,list_html,swf_urls
  def generate_dumplicate_query(self,_dict,_dict_must_not,set_match=set(["project_code","project_codes","product"]),set_nested=set(["win_tenderer","bidding_budget","win_bid_price"]),
                                set_term=set(["doctitle_refine","docchannel","tenderee","agency","web_source_no","fingerprint","save","docid"]),
                                set_range=set(["page_time","status"]),set_phrase=set(["doctitle","project_name"])):
      """Translate two field->value mappings into one BoolQuery.

      ``_dict`` produces the ``must_queries`` and ``_dict_must_not`` produces
      the ``must_not_queries``. The kind of clause built for a field depends
      on which of the ``set_*`` arguments contains the field name (checked in
      the order match, nested, term, phrase, range):

      * match  - string values are split on "," and OR-ed as MatchQuery;
                 non-string values are ignored.
      * nested - TermQuery on ``sub_docs_json.<field>`` wrapped in a
                 NestedQuery; bidding_budget / win_bid_price are cast to float.
      * term   - TermQuery.
      * phrase - MatchPhraseQuery (only for ``_dict``; phrase fields in
                 ``_dict_must_not`` are not translated).
      * range  - a 1-element value gives ``RangeQuery(k, v[0])``, a 2-element
                 value gives ``RangeQuery(k, v[0], v[1], True, True)``;
                 other lengths are ignored.

      Fields that belong to none of the sets are skipped. The default sets
      are shared between calls and are only read here, never modified.
      """
      def _translate(conditions,with_phrase):
          # Build the clause list for one side (must / must-not) of the query.
          clauses = []
          for field,value in conditions.items():
              if field in set_match:
                  if isinstance(value,str):
                      alternatives = [MatchQuery(field,part) for part in value.split(",")]
                      clauses.append(BoolQuery(should_queries=alternatives))
              elif field in set_nested:
                  nested_value = value
                  if field in ("bidding_budget","win_bid_price"):
                      nested_value = float(nested_value)
                  clauses.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.%s"%field,nested_value)))
              elif field in set_term:
                  clauses.append(TermQuery(field,value))
              elif with_phrase and field in set_phrase:
                  clauses.append(MatchPhraseQuery(field,value))
              elif field in set_range:
                  bounds = len(value)
                  if bounds==1:
                      clauses.append(RangeQuery(field,value[0]))
                  elif bounds==2:
                      clauses.append(RangeQuery(field,value[0],value[1],True,True))
          return clauses

      list_must_queries = _translate(_dict,True)
      list_must_no_queries = _translate(_dict_must_not,False)
      return BoolQuery(must_queries=list_must_queries,must_not_queries=list_must_no_queries)
  def f_decode_sub_docs_json(self, project_code,project_name,tenderee,agency,sub_docs_json):
      """Extract winner / budget / price from ``sub_docs_json`` and count extracted fields.

      ``extract_count`` is incremented once for each of ``project_code``,
      ``project_name``, ``tenderee`` and ``agency`` that is neither None nor
      "", and once for every key of every sub-document.

      The sub-documents are ordered by ``win_bid_price`` (descending), ties
      by ``bidding_budget`` (descending); for each of the three columns the
      first usable value in that order wins. A value is usable when it is not
      None, its string form is not "" or "0", and - for the two amount
      columns - it parses to a float greater than 0.

      Malformed input never raises: undecodable JSON, JSON that is not a
      list, list entries that are not objects, and non-numeric amounts are
      all treated as "nothing extracted".

      :param sub_docs_json: JSON text holding a list of objects, or None.
      :return: ``(win_tenderer, bidding_budget, win_bid_price, extract_count)``
          where the first three are strings ("" when absent) and the amounts
          are formatted as ``str(float(value))``.
      """
      def _to_float(raw):
          # Amounts may arrive as "", None or arbitrary text; count those as 0.
          try:
              return float(raw)
          except (TypeError,ValueError):
              return 0.0

      columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
      extract_count = 0
      for _field in (project_code,project_name,tenderee,agency):
          if _field is not None and _field!="":
              extract_count += 1

      list_sub_docs = []
      if sub_docs_json is not None:
          try:
              _decoded = json.loads(sub_docs_json)
          except Exception:
              _decoded = []
          if isinstance(_decoded,list):
              list_sub_docs = [_doc for _doc in _decoded if isinstance(_doc,dict)]

      # Two stable sorts: the second key (win_bid_price) ends up primary.
      list_sub_docs.sort(key=lambda x:_to_float(x.get("bidding_budget",0)),reverse=True)
      list_sub_docs.sort(key=lambda x:_to_float(x.get("win_bid_price",0)),reverse=True)

      for _sub_doc in list_sub_docs:
          for _key,_value in _sub_doc.items():
              extract_count += 1
              if _key not in columns or columns[_key]!="":
                  continue
              if _value is None or str(_value) in ("","0"):
                  continue
              if _key in ("bidding_budget","win_bid_price"):
                  _amount = _to_float(_value)
                  if _amount>0:
                      columns[_key] = str(_amount)
              else:
                  columns[_key] = str(_value)
      return columns["win_tenderer"],columns["bidding_budget"],columns["win_bid_price"],extract_count
  def post_extract(self,_dict):
      """Add the flattened sub-document columns to ``_dict`` in place.

      Writes ``win_tenderer``, ``bidding_budget`` and ``win_bid_price`` (always
      overwritten) from ``f_decode_sub_docs_json``; ``extract_count`` is only
      written when the key is not already present. Returns None.
      """
      decoded = self.f_decode_sub_docs_json(
          _dict.get(document_tmp_project_code),
          _dict.get(document_tmp_project_name),
          _dict.get(document_tmp_tenderee),
          _dict.get(document_tmp_agency),
          _dict.get(document_tmp_sub_docs_json),
      )
      winner,budget,price,counted = decoded
      _dict.update({"win_tenderer":winner,"bidding_budget":budget,"win_bid_price":price})
      # Keep a count that is already on the row.
      _dict.setdefault("extract_count",counted)
  def get_dump_columns(self,_dict):
      """Read the columns used for de-duplication out of ``_dict``.

      Missing keys default to "" (``docchannel`` defaults to 0). A
      ``bidding_budget`` or ``win_bid_price`` equal to 0 is returned as "".

      :return: tuple ``(docchannel, project_code, project_name, tenderee,
          agency, doctitle_refine, win_tenderer, bidding_budget,
          win_bid_price, page_time, fingerprint, product)``.
      """
      def _blank_if_zero(amount):
          # A zero amount is handled like a missing one.
          return "" if amount==0 else amount

      return (
          _dict.get(document_tmp_docchannel,0),
          _dict.get(document_tmp_project_code,""),
          _dict.get(document_tmp_project_name,""),
          _dict.get(document_tmp_tenderee,""),
          _dict.get(document_tmp_agency,""),
          _dict.get(document_tmp_doctitle_refine,""),
          _dict.get("win_tenderer",""),
          _blank_if_zero(_dict.get("bidding_budget","")),
          _blank_if_zero(_dict.get("win_bid_price","")),
          _dict.get(document_tmp_page_time,""),
          _dict.get(document_tmp_fingerprint,""),
          _dict.get(document_tmp_product,""),
      )
  def f_set_docid_limitNum_contain(self,item, _split,singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"]):
      """Return ``_split`` when the whole group passes every check, else ``[]``.

      Checks, all of which must hold:

      * every key in ``singleNum_keys`` has at most one distinct value in the
        group (``len(getSet(_split, key)) <= 1``);
      * every key in ``multiNum_keys`` has more than one distinct value;
      * for every key in ``notlike_keys``, no row's value has a similarity to
        ``item["project_code"]`` strictly between 0.7 and 1
        (per ``getSimilarityOfString``);
      * for every key in ``contain_keys``, the non-empty values of the rows,
        taken in order, are each contained in (or contain) the longest value
        seen so far.

      The default key lists are only read here, never modified.
      """
      def _values_nest(_key):
          # True when each non-empty value is a substring/superstring of the
          # longest value encountered before it.
          longest = None
          for _d in _split:
              candidate = _d.get(_key)
              if candidate is None or candidate=="":
                  continue
              if longest is None:
                  longest = candidate
              elif len(longest)<len(candidate):
                  if candidate.find(longest)==-1:
                      return False
                  longest = candidate
              elif longest.find(candidate)==-1:
                  return False
          return True

      single_ok = not any(len(getSet(_split,_key))>1 for _key in singleNum_keys)
      multi_ok = not any(len(getSet(_split,_key))<=1 for _key in multiNum_keys)
      flag = single_ok and multi_ok

      project_code = item.get("project_code","")
      if flag:
          too_similar = any(
              0.7<getSimilarityOfString(project_code,_d.get(_key,""))<1
              for _key in notlike_keys
              for _d in _split
          )
          flag = not too_similar

      # Check whether every announcement in the group is contained in the others.
      if flag:
          for _key in contain_keys:
              if not _values_nest(_key):
                  flag = False

      return _split if flag else []
  def search_data_by_query(self,item,_query,confidence,table_name="document_tmp",table_index="document_tmp_index",sort_column="docid",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count,document_tmp_doctitle]):
      """Run one search and return the hits if the group passes the group checks.

      ``_query`` may be a ready query object or a list of queries, which is
      OR-ed together through ``BoolQuery(should_queries=...)``. Results are
      sorted by ``sort_column`` and only the first page (``limit=50``) is
      read; the returned ``next_token`` is not followed.

      Every hit is passed through ``post_extract`` and tagged with
      ``confidence`` before the whole list is handed to
      ``f_set_docid_limitNum_contain``.

      :return: the list of hit dicts, or ``[]`` when the group is rejected.
      """
      bool_query = BoolQuery(should_queries=_query) if isinstance(_query,list) else _query

      search_query = SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(sort_column)]),limit=50,get_total_count=True)
      columns_to_get = ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)
      rows,next_token,total_count,is_all_succeed = self.ots_client.search(table_name,table_index,search_query,columns_to_get)

      list_dict = getRow_ots(rows)
      for _row in list_dict:
          self.post_extract(_row)
          _row["confidence"] = confidence

      # NOTE: paging through next_token is intentionally not done here; only
      # the first page of results takes part in the group checks.
      return self.f_set_docid_limitNum_contain(item,list_dict,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,notlike_keys=notlike_keys)
  def add_data_by_query(self,item,base_list,set_docid,_query,confidence,singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_save,document_tmp_status,document_tmp_product,document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count]):
      """Search with ``_query`` and merge the not-yet-seen hits into ``base_list``.

      Hits come from ``search_data_by_query``. A hit is appended to
      ``base_list`` only when its docid is not yet in ``set_docid``; the
      docid is then recorded there. Both containers are modified in place
      and nothing is returned.
      """
      hits = self.search_data_by_query(item,_query,confidence,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,notlike_keys=notlike_keys,columns=columns)
      for _row in hits:
          self.post_extract(_row)
          _docid = _row.get(document_tmp_docid)
          if _docid in set_docid:
              continue
          base_list.append(_row)
          set_docid.add(_docid)
  474. def translate_dumplicate_rules(self,status_from,item):
  475. docchannel,project_code,project_name,tenderee,agency,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product = self.get_dump_columns(item)
  476. if page_time=='':
  477. page_time = getCurrent_date("%Y-%m-%d")
  478. base_dict = {
  479. "status":[status_from[0]],
  480. "page_time":[timeAdd(page_time,-2),timeAdd(page_time,2)]
  481. }
  482. must_not_dict = {"save":0}
  483. list_rules = []
  484. singleNum_keys = ["tenderee","win_tenderer"]
  485. if fingerprint!="":
  486. _dict = {}
  487. confidence = 100
  488. _dict[document_tmp_fingerprint] = fingerprint
  489. _dict.update(base_dict)
  490. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  491. _rule = {"confidence":confidence,
  492. "item":item,
  493. "query":_query,
  494. "singleNum_keys":[],
  495. "contain_keys":[],
  496. "multiNum_keys":[]}
  497. list_rules.append(_rule)
  498. if docchannel in (52,118):
  499. if bidding_budget!="" and tenderee!="" and project_code!="":
  500. confidence = 90
  501. _dict = {document_tmp_docchannel:docchannel,
  502. "bidding_budget":item.get("bidding_budget"),
  503. document_tmp_tenderee:item.get(document_tmp_tenderee,""),
  504. document_tmp_project_code:item.get(document_tmp_project_code,"")
  505. }
  506. _dict.update(base_dict)
  507. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  508. _rule = {"confidence":confidence,
  509. "query":_query,
  510. "singleNum_keys":singleNum_keys,
  511. "contain_keys":[],
  512. "multiNum_keys":[document_tmp_web_source_no]}
  513. list_rules.append(_rule)
  514. if doctitle_refine!="" and tenderee!="" and bidding_budget!="":
  515. confidence = 80
  516. _dict = {document_tmp_docchannel:docchannel,
  517. "doctitle_refine":doctitle_refine,
  518. "tenderee":tenderee,
  519. bidding_budget:"bidding_budget"
  520. }
  521. _dict.update(base_dict)
  522. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  523. _rule = {"confidence":confidence,
  524. "query":_query,
  525. "singleNum_keys":singleNum_keys,
  526. "contain_keys":[],
  527. "multiNum_keys":[document_tmp_web_source_no]}
  528. list_rules.append(_rule)
  529. if project_code!="" and doctitle_refine!="" and agency!="" and bidding_budget!="":
  530. confidence = 90
  531. _dict = {document_tmp_docchannel:docchannel,
  532. "project_code":project_code,
  533. "doctitle_refine":doctitle_refine,
  534. "agency":agency,
  535. "bidding_budget":bidding_budget
  536. }
  537. _dict.update(base_dict)
  538. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  539. _rule = {"confidence":confidence,
  540. "query":_query,
  541. "singleNum_keys":singleNum_keys,
  542. "contain_keys":[],
  543. "multiNum_keys":[document_tmp_web_source_no]}
  544. list_rules.append(_rule)
  545. if project_code!="" and tenderee!="" and bidding_budget!="":
  546. confidence = 91
  547. _dict = {document_tmp_docchannel:docchannel,
  548. "project_code":project_code,
  549. "tenderee":tenderee,
  550. "bidding_budget":bidding_budget
  551. }
  552. _dict.update(base_dict)
  553. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  554. _rule = {"confidence":confidence,
  555. "query":_query,
  556. "singleNum_keys":singleNum_keys,
  557. "contain_keys":[],
  558. "multiNum_keys":[document_tmp_web_source_no]}
  559. list_rules.append(_rule)
  560. if doctitle_refine!="" and agency!="" and bidding_budget!="":
  561. confidence = 71
  562. _dict = {document_tmp_docchannel:docchannel,
  563. "doctitle_refine":doctitle_refine,
  564. "agency":agency,
  565. "bidding_budget":bidding_budget
  566. }
  567. _dict.update(base_dict)
  568. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  569. _rule = {"confidence":confidence,
  570. "query":_query,
  571. "singleNum_keys":singleNum_keys,
  572. "contain_keys":[],
  573. "multiNum_keys":[document_tmp_web_source_no]}
  574. list_rules.append(_rule)
  575. if project_code!="" and project_name!="" and agency!="" and bidding_budget!="":
  576. confidence = 91
  577. _dict = {document_tmp_docchannel:docchannel,
  578. "project_code":project_code,
  579. "project_name":project_name,
  580. "agency":agency,
  581. "bidding_budget":bidding_budget
  582. }
  583. _dict.update(base_dict)
  584. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  585. n_singleKeys = [i for i in singleNum_keys]
  586. n_singleKeys.append(document_tmp_web_source_no)
  587. _rule = {"confidence":confidence,
  588. "query":_query,
  589. "singleNum_keys":n_singleKeys,
  590. "contain_keys":[],
  591. "multiNum_keys":[]}
  592. list_rules.append(_rule)
  593. ##-- 5. 招标公告 - 同项目编号- 同[项目名称、标题] - 同[招标人、代理公司] - 同预算(!=0) - 同信息源=1
  594. if project_code!="" and project_name!="" and tenderee!="" and bidding_budget!="":
  595. confidence = 91
  596. _dict = {document_tmp_docchannel:docchannel,
  597. "project_code":project_code,
  598. "project_name":project_name,
  599. "tenderee":tenderee,
  600. "bidding_budget":bidding_budget
  601. }
  602. _dict.update(base_dict)
  603. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  604. n_singleKeys = [i for i in singleNum_keys]
  605. n_singleKeys.append(document_tmp_web_source_no)
  606. _rule = {"confidence":confidence,
  607. "query":_query,
  608. "singleNum_keys":n_singleKeys,
  609. "contain_keys":[],
  610. "multiNum_keys":[]}
  611. list_rules.append(_rule)
  612. if project_code!="" and doctitle_refine!="" and tenderee!="" and bidding_budget!="":
  613. confidence = 71
  614. _dict = {document_tmp_docchannel:docchannel,
  615. "project_code":project_code,
  616. "doctitle_refine":doctitle_refine,
  617. "tenderee":tenderee,
  618. "bidding_budget":bidding_budget
  619. }
  620. _dict.update(base_dict)
  621. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  622. _rule = {"confidence":confidence,
  623. "query":_query,
  624. "singleNum_keys":singleNum_keys,
  625. "contain_keys":[],
  626. "multiNum_keys":[document_tmp_web_source_no]}
  627. list_rules.append(_rule)
  628. #-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1
  629. if project_name!="" and agency!="":
  630. tmp_bidding = 0
  631. if bidding_budget!="":
  632. tmp_bidding = bidding_budget
  633. confidence = 51
  634. _dict = {document_tmp_docchannel:docchannel,
  635. "project_name":project_name,
  636. "agency":agency,
  637. "bidding_budget":tmp_bidding
  638. }
  639. _dict.update(base_dict)
  640. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  641. _rule = {"confidence":confidence,
  642. "query":_query,
  643. "singleNum_keys":singleNum_keys,
  644. "contain_keys":[],
  645. "multiNum_keys":[document_tmp_web_source_no]}
  646. list_rules.append(_rule)
  647. #-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1
  648. if project_code!="" and agency!="":
  649. tmp_bidding = 0
  650. if bidding_budget!="":
  651. tmp_bidding = bidding_budget
  652. confidence = 51
  653. _dict = {document_tmp_docchannel:docchannel,
  654. "project_code":project_code,
  655. "agency":agency,
  656. "bidding_budget":tmp_bidding
  657. }
  658. _dict.update(base_dict)
  659. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  660. _rule = {"confidence":confidence,
  661. "query":_query,
  662. "singleNum_keys":singleNum_keys,
  663. "contain_keys":[],
  664. "multiNum_keys":[document_tmp_web_source_no]}
  665. list_rules.append(_rule)
  666. if docchannel not in (101,119,120):
  667. #-- 7. 非中标公告 - 同项目名称 - 同发布日期 - 同招标人 - 同预算 - 同类型 - 信息源>1 - 同项目编号
  668. if project_name!="" and tenderee!="" and project_code!="":
  669. tmp_bidding = 0
  670. if bidding_budget!="":
  671. tmp_bidding = bidding_budget
  672. confidence = 51
  673. _dict = {document_tmp_docchannel:docchannel,
  674. "project_name":project_name,
  675. "tenderee":tenderee,
  676. "project_code":project_code
  677. }
  678. _dict.update(base_dict)
  679. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  680. _rule = {"confidence":confidence,
  681. "query":_query,
  682. "singleNum_keys":singleNum_keys,
  683. "contain_keys":[],
  684. "multiNum_keys":[document_tmp_web_source_no]}
  685. list_rules.append(_rule)
  686. if docchannel in (101,119,120):
  687. #-- 3. 中标公告 - 同项目编号- 同[项目名称、标题] - 同中标人 - 同中标价(==0)
  688. if project_code!="" and project_name!="" and win_tenderer!="":
  689. tmp_win = 0
  690. if win_bid_price!="":
  691. tmp_win = win_bid_price
  692. confidence = 61
  693. _dict = {document_tmp_docchannel:docchannel,
  694. "project_code":project_code,
  695. "project_name":project_name,
  696. "win_tenderer":win_tenderer,
  697. "win_bid_price":tmp_win
  698. }
  699. _dict.update(base_dict)
  700. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  701. _rule = {"confidence":confidence,
  702. "query":_query,
  703. "singleNum_keys":singleNum_keys,
  704. "contain_keys":[],
  705. "multiNum_keys":[]}
  706. list_rules.append(_rule)
  707. if project_code!="" and project_name!="" and bidding_budget!="" and product!="":
  708. confidence = 72
  709. _dict = {document_tmp_docchannel:docchannel,
  710. "project_code":project_code,
  711. "project_name":project_name,
  712. "bidding_budget":bidding_budget,
  713. "product":product
  714. }
  715. _dict.update(base_dict)
  716. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  717. n_singleKeys = [i for i in singleNum_keys]
  718. n_singleKeys.append(document_tmp_web_source_no)
  719. _rule = {"confidence":confidence,
  720. "query":_query,
  721. "singleNum_keys":n_singleKeys,
  722. "contain_keys":[],
  723. "multiNum_keys":[]}
  724. list_rules.append(_rule)
  725. if project_code!='' and doctitle_refine!="" and win_tenderer!="" and win_bid_price!="":
  726. confidence = 91
  727. _dict = {document_tmp_docchannel:docchannel,
  728. "project_code":project_code,
  729. "doctitle_refine":doctitle_refine,
  730. "win_tenderer":win_tenderer,
  731. "win_bid_price":win_bid_price
  732. }
  733. _dict.update(base_dict)
  734. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  735. n_singleKeys = [i for i in singleNum_keys]
  736. n_singleKeys.append(document_tmp_web_source_no)
  737. _rule = {"confidence":confidence,
  738. "query":_query,
  739. "singleNum_keys":n_singleKeys,
  740. "contain_keys":[],
  741. "multiNum_keys":[]}
  742. list_rules.append(_rule)
  743. ##-- 2. 中标公告 - 同项目编号- 同[项目名称、标题] - 同中标人 - 同中标价(!=0) - 同信息源=1
  744. if project_code!="" and project_name!="" and win_tenderer!="" and win_bid_price!="":
  745. confidence = 91
  746. _dict = {document_tmp_docchannel:docchannel,
  747. "project_code":project_code,
  748. "project_name":project_name,
  749. "win_tenderer":win_tenderer,
  750. "win_bid_price":win_bid_price
  751. }
  752. _dict.update(base_dict)
  753. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  754. n_singleKeys = [i for i in singleNum_keys]
  755. n_singleKeys.append(document_tmp_web_source_no)
  756. _rule = {"confidence":confidence,
  757. "query":_query,
  758. "singleNum_keys":n_singleKeys,
  759. "contain_keys":[],
  760. "multiNum_keys":[]}
  761. list_rules.append(_rule)
  762. if project_name!="" and win_tenderer!="" and win_bid_price!="":
  763. confidence = 91
  764. _dict = {document_tmp_docchannel:docchannel,
  765. "project_name":project_name,
  766. "win_tenderer":win_tenderer,
  767. "win_bid_price":win_bid_price,
  768. }
  769. _dict.update(base_dict)
  770. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  771. _rule = {"confidence":confidence,
  772. "query":_query,
  773. "singleNum_keys":singleNum_keys,
  774. "contain_keys":[],
  775. "multiNum_keys":[document_tmp_web_source_no]}
  776. list_rules.append(_rule)
  777. if project_code!="" and win_tenderer!="" and win_bid_price!="":
  778. confidence = 91
  779. _dict = {document_tmp_docchannel:docchannel,
  780. "project_code":project_code,
  781. "win_tenderer":win_tenderer,
  782. "win_bid_price":win_bid_price,
  783. }
  784. _dict.update(base_dict)
  785. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  786. _rule = {"confidence":confidence,
  787. "query":_query,
  788. "singleNum_keys":singleNum_keys,
  789. "contain_keys":[],
  790. "multiNum_keys":[document_tmp_web_source_no]}
  791. list_rules.append(_rule)
  792. if project_code!="" and doctitle_refine!="" and win_tenderer!="" and win_bid_price!="":
  793. confidence = 91
  794. _dict = {document_tmp_docchannel:docchannel,
  795. "project_code":project_code,
  796. "doctitle_refine":doctitle_refine,
  797. "win_tenderer":win_tenderer,
  798. "win_bid_price":win_bid_price
  799. }
  800. _dict.update(base_dict)
  801. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  802. n_singleKeys = [i for i in singleNum_keys]
  803. n_singleKeys.append(document_tmp_web_source_no)
  804. _rule = {"confidence":confidence,
  805. "query":_query,
  806. "singleNum_keys":n_singleKeys,
  807. "contain_keys":[],
  808. "multiNum_keys":[]}
  809. list_rules.append(_rule)
  810. if doctitle_refine!="" and win_tenderer!="" and win_bid_price!="":
  811. confidence=90
  812. _dict = {document_tmp_docchannel:docchannel,
  813. "doctitle_refine":doctitle_refine,
  814. "win_tenderer":win_tenderer,
  815. "win_bid_price":win_bid_price
  816. }
  817. _dict.update(base_dict)
  818. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  819. _rule = {"confidence":confidence,
  820. "query":_query,
  821. "singleNum_keys":singleNum_keys,
  822. "contain_keys":[],
  823. "multiNum_keys":[document_tmp_web_source_no]}
  824. list_rules.append(_rule)
  825. if project_name!="" and win_tenderer!="" and win_bid_price!="" and project_code!="":
  826. confidence=95
  827. _dict = {document_tmp_docchannel:docchannel,
  828. "project_name":project_name,
  829. "win_tenderer":win_tenderer,
  830. "win_bid_price":win_bid_price,
  831. "project_code":project_code
  832. }
  833. _dict.update(base_dict)
  834. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  835. _rule = {"confidence":confidence,
  836. "query":_query,
  837. "singleNum_keys":singleNum_keys,
  838. "contain_keys":[],
  839. "multiNum_keys":[document_tmp_web_source_no]}
  840. list_rules.append(_rule)
  841. if docchannel in (51,103,115,116):
  842. #9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源
  843. if doctitle_refine!="" and tenderee!="":
  844. tmp_budget = 0
  845. if bidding_budget!="":
  846. tmp_budget = bidding_budget
  847. confidence=81
  848. _dict = {document_tmp_docchannel:docchannel,
  849. "doctitle_refine":doctitle_refine,
  850. "tenderee":tenderee,
  851. "bidding_budget":tmp_budget,
  852. }
  853. _dict.update(base_dict)
  854. _dict["page_time"] = [page_time,page_time]
  855. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  856. _rule = {"confidence":confidence,
  857. "query":_query,
  858. "singleNum_keys":singleNum_keys,
  859. "contain_keys":[],
  860. "multiNum_keys":[document_tmp_web_source_no]}
  861. list_rules.append(_rule)
  862. #-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源
  863. if project_code!="" and tenderee!="":
  864. confidence=81
  865. tmp_budget = 0
  866. if bidding_budget!="":
  867. tmp_budget = bidding_budget
  868. _dict = {document_tmp_docchannel:docchannel,
  869. "project_code":project_code,
  870. "tenderee":tenderee,
  871. "bidding_budget":tmp_budget,
  872. }
  873. _dict.update(base_dict)
  874. _dict["page_time"] = [page_time,page_time]
  875. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  876. _rule = {"confidence":confidence,
  877. "query":_query,
  878. "singleNum_keys":singleNum_keys,
  879. "contain_keys":[],
  880. "multiNum_keys":[document_tmp_web_source_no]}
  881. list_rules.append(_rule)
  882. if project_name!="" and tenderee!="":
  883. confidence=81
  884. tmp_budget = 0
  885. if bidding_budget!="":
  886. tmp_budget = bidding_budget
  887. _dict = {document_tmp_docchannel:docchannel,
  888. "project_name":project_name,
  889. "tenderee":tenderee,
  890. "bidding_budget":tmp_budget,
  891. }
  892. _dict.update(base_dict)
  893. _dict["page_time"] = [page_time,page_time]
  894. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  895. _rule = {"confidence":confidence,
  896. "query":_query,
  897. "singleNum_keys":singleNum_keys,
  898. "contain_keys":[],
  899. "multiNum_keys":[document_tmp_web_source_no]}
  900. list_rules.append(_rule)
  901. if agency!="" and tenderee!="":
  902. confidence=81
  903. tmp_budget = 0
  904. if bidding_budget!="":
  905. tmp_budget = bidding_budget
  906. _dict = {document_tmp_docchannel:docchannel,
  907. "agency":agency,
  908. "tenderee":tenderee,
  909. "bidding_budget":tmp_budget,
  910. "product":product
  911. }
  912. _dict.update(base_dict)
  913. _dict["page_time"] = [page_time,page_time]
  914. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  915. _rule = {"confidence":confidence,
  916. "query":_query,
  917. "singleNum_keys":singleNum_keys,
  918. "contain_keys":[],
  919. "multiNum_keys":[document_tmp_web_source_no]}
  920. list_rules.append(_rule)
  921. if agency!="" and project_code!="":
  922. confidence=81
  923. tmp_budget = 0
  924. if bidding_budget!="":
  925. tmp_budget = bidding_budget
  926. _dict = {document_tmp_docchannel:docchannel,
  927. "agency":agency,
  928. "project_code":project_code,
  929. "bidding_budget":tmp_budget,
  930. "product":product
  931. }
  932. _dict.update(base_dict)
  933. _dict["page_time"] = [page_time,page_time]
  934. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  935. _rule = {"confidence":confidence,
  936. "query":_query,
  937. "singleNum_keys":singleNum_keys,
  938. "contain_keys":[],
  939. "multiNum_keys":[document_tmp_web_source_no]}
  940. list_rules.append(_rule)
  941. if agency!="" and project_name!="":
  942. confidence=81
  943. tmp_budget = 0
  944. if bidding_budget!="":
  945. tmp_budget = bidding_budget
  946. _dict = {document_tmp_docchannel:docchannel,
  947. "agency":agency,
  948. "project_name":project_name,
  949. "bidding_budget":tmp_budget,
  950. "product":product
  951. }
  952. _dict.update(base_dict)
  953. _dict["page_time"] = [page_time,page_time]
  954. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  955. _rule = {"confidence":confidence,
  956. "query":_query,
  957. "singleNum_keys":singleNum_keys,
  958. "contain_keys":[],
  959. "multiNum_keys":[document_tmp_web_source_no]}
  960. list_rules.append(_rule)
  961. #五选二
  962. if tenderee!="" and bidding_budget!="" and product!="":
  963. confidence=80
  964. _dict = {document_tmp_docchannel:docchannel,
  965. "tenderee":tenderee,
  966. "bidding_budget":bidding_budget,
  967. "product":product,
  968. }
  969. _dict.update(base_dict)
  970. _dict["page_time"] = [page_time,page_time]
  971. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  972. _rule = {"confidence":confidence,
  973. "query":_query,
  974. "singleNum_keys":singleNum_keys,
  975. "contain_keys":[],
  976. "multiNum_keys":[]}
  977. list_rules.append(_rule)
  978. if tenderee!="" and win_tenderer!="" and product!="":
  979. confidence=80
  980. _dict = {document_tmp_docchannel:docchannel,
  981. "tenderee":tenderee,
  982. "win_tenderer":win_tenderer,
  983. "product":product,
  984. }
  985. _dict.update(base_dict)
  986. _dict["page_time"] = [page_time,page_time]
  987. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  988. _rule = {"confidence":confidence,
  989. "query":_query,
  990. "singleNum_keys":singleNum_keys,
  991. "contain_keys":[],
  992. "multiNum_keys":[]}
  993. list_rules.append(_rule)
  994. if tenderee!="" and win_bid_price!="":
  995. confidence=80
  996. _dict = {document_tmp_docchannel:docchannel,
  997. "tenderee":tenderee,
  998. "win_bid_price":win_bid_price,
  999. "product":product,
  1000. }
  1001. _dict.update(base_dict)
  1002. _dict["page_time"] = [page_time,page_time]
  1003. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1004. _rule = {"confidence":confidence,
  1005. "query":_query,
  1006. "singleNum_keys":singleNum_keys,
  1007. "contain_keys":[],
  1008. "multiNum_keys":[]}
  1009. list_rules.append(_rule)
  1010. if tenderee!="" and agency!="":
  1011. confidence=80
  1012. _dict = {document_tmp_docchannel:docchannel,
  1013. "tenderee":tenderee,
  1014. "agency":agency,
  1015. "product":product,
  1016. }
  1017. _dict.update(base_dict)
  1018. _dict["page_time"] = [page_time,page_time]
  1019. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1020. _rule = {"confidence":confidence,
  1021. "query":_query,
  1022. "singleNum_keys":singleNum_keys,
  1023. "contain_keys":[],
  1024. "multiNum_keys":[]}
  1025. list_rules.append(_rule)
  1026. if win_tenderer!="" and bidding_budget!="":
  1027. confidence=80
  1028. _dict = {document_tmp_docchannel:docchannel,
  1029. "win_tenderer":win_tenderer,
  1030. "bidding_budget":bidding_budget,
  1031. "product":product,
  1032. }
  1033. _dict.update(base_dict)
  1034. _dict["page_time"] = [page_time,page_time]
  1035. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1036. _rule = {"confidence":confidence,
  1037. "query":_query,
  1038. "singleNum_keys":singleNum_keys,
  1039. "contain_keys":[],
  1040. "multiNum_keys":[]}
  1041. list_rules.append(_rule)
  1042. if win_bid_price!="" and bidding_budget!="":
  1043. confidence=80
  1044. _dict = {document_tmp_docchannel:docchannel,
  1045. "win_bid_price":win_bid_price,
  1046. "bidding_budget":bidding_budget,
  1047. "product":product,
  1048. }
  1049. _dict.update(base_dict)
  1050. _dict["page_time"] = [page_time,page_time]
  1051. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1052. _rule = {"confidence":confidence,
  1053. "query":_query,
  1054. "singleNum_keys":singleNum_keys,
  1055. "contain_keys":[],
  1056. "multiNum_keys":[]}
  1057. list_rules.append(_rule)
  1058. if agency!="" and bidding_budget!="":
  1059. confidence=80
  1060. _dict = {document_tmp_docchannel:docchannel,
  1061. "agency":agency,
  1062. "bidding_budget":bidding_budget,
  1063. "product":product,
  1064. }
  1065. _dict.update(base_dict)
  1066. _dict["page_time"] = [page_time,page_time]
  1067. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1068. _rule = {"confidence":confidence,
  1069. "query":_query,
  1070. "singleNum_keys":singleNum_keys,
  1071. "contain_keys":[],
  1072. "multiNum_keys":[]}
  1073. list_rules.append(_rule)
  1074. if win_tenderer!="" and win_bid_price!="":
  1075. confidence=80
  1076. _dict = {document_tmp_docchannel:docchannel,
  1077. "win_tenderer":win_tenderer,
  1078. "win_bid_price":win_bid_price,
  1079. "product":product,
  1080. }
  1081. _dict.update(base_dict)
  1082. _dict["page_time"] = [page_time,page_time]
  1083. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1084. _rule = {"confidence":confidence,
  1085. "query":_query,
  1086. "singleNum_keys":singleNum_keys,
  1087. "contain_keys":[],
  1088. "multiNum_keys":[]}
  1089. list_rules.append(_rule)
  1090. if win_tenderer!="" and agency!="":
  1091. confidence=80
  1092. _dict = {document_tmp_docchannel:docchannel,
  1093. "win_tenderer":win_tenderer,
  1094. "agency":agency,
  1095. "product":product,
  1096. }
  1097. _dict.update(base_dict)
  1098. _dict["page_time"] = [page_time,page_time]
  1099. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1100. _rule = {"confidence":confidence,
  1101. "query":_query,
  1102. "singleNum_keys":singleNum_keys,
  1103. "contain_keys":[],
  1104. "multiNum_keys":[]}
  1105. list_rules.append(_rule)
  1106. if doctitle_refine!="" and product!="" and len(doctitle_refine)>7:
  1107. confidence=80
  1108. _dict = {document_tmp_docchannel:docchannel,
  1109. "doctitle_refine":doctitle_refine,
  1110. "product":product,
  1111. }
  1112. _dict.update(base_dict)
  1113. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1114. _rule = {"confidence":confidence,
  1115. "query":_query,
  1116. "singleNum_keys":singleNum_keys,
  1117. "contain_keys":[],
  1118. "multiNum_keys":[]}
  1119. list_rules.append(_rule)
  1120. return list_rules
  def dumplicate_fianl_check(self,base_list):
      """Cut a candidate duplicate group down to its leading consistent slice.

      The group is sorted in place by descending ``confidence``. For every
      compared key ``getDiffIndex`` is called on the sorted group and the
      smallest value it returns is used as the cut-off index.

      :param base_list: list of document dicts, each carrying ``confidence``;
          reordered in place.
      :return: ``base_list[:cut]`` when the cut-off is greater than 1,
          otherwise an empty list.
      """
      the_group = base_list
      the_group.sort(key=lambda x:x["confidence"],reverse=True)

      keys = ["tenderee","win_tenderer","win_bid_price","bidding_budget"]
      # Groups with more than 10 candidates are additionally compared on the refined title.
      if len(the_group)>10:
          keys.append("doctitle_refine")

      # NOTE(review): the previous code special-cased a key named "doctitle"
      # (calling getDiffIndex with confidence=30), but "doctitle" is never in
      # `keys`, so that branch could not run and has been removed. Confirm
      # whether "doctitle_refine" was meant to get that threshold.
      _index = min(getDiffIndex(the_group,_k) for _k in keys)
      if _index>1:
          return the_group[:_index]
      return []
  def get_best_docid(self,base_list):
      """Return the docid of the preferred document in ``base_list``.

      ``base_list`` is sorted in place with three successive stable sorts, so
      the effective priority is: highest ``extract_count``, then highest
      attachment extract status (missing counts as 0), then ``docid``.
      The ``docid`` order is descending only when some web source contributes
      at least two distinct fingerprints; otherwise it is ascending.

      :param base_list: list of document dicts; reordered in place.
      :return: the ``docid`` of the first document after sorting, or ``None``
          when ``base_list`` is empty.
      """
      fingerprints_by_source = {}
      descending_docid = False
      for doc in base_list:
          source = doc.get(document_tmp_web_source_no)
          if source is None:
              continue
          seen = fingerprints_by_source.setdefault(source,set())
          seen.add(doc.get(document_tmp_fingerprint))
          if len(seen)>=2:
              descending_docid = True

      if not base_list:
          return None

      # Least significant key first: each later stable sort keeps the earlier order for ties.
      base_list.sort(key=lambda d:d["docid"],reverse=descending_docid)
      base_list.sort(key=lambda d:d.get(document_attachment_extract_status,0),reverse=True)
      base_list.sort(key=lambda d:d["extract_count"],reverse=True)
      return base_list[0]["docid"]
  def save_dumplicate(self,base_list,best_docid,status_from,status_to):
      """Write the save flag (and optionally a new status) for every document of a group.

      The best document gets ``save=1`` unless it already carries ``save==0``;
      every other document gets ``save=0``. A document whose current
      ``status`` lies inside ``status_from`` (inclusive) is moved to a random
      status inside ``status_to``.

      :param base_list: list of document dicts with ``partitionkey``,
          ``docid`` and optionally ``save`` / ``status``.
      :param best_docid: docid of the document to keep.
      :param status_from: ``[low, high]`` status range that is eligible for a status change.
      :param status_to: ``[low, high]`` range the new status is drawn from.
      """
      # NOTE(review): the source had lost its indentation; the ``else`` below
      # is attached to the ``docid==best_docid`` test, matching the original
      # remark "best_docid need check while others can save directly".
      list_dict = []
      for item in base_list:
          docid = item["docid"]
          _dict = {"partitionkey":item["partitionkey"],
                   "docid":docid}
          if docid==best_docid:
              if item.get("save",1)!=0:
                  _dict["save"] = 1
          else:
              _dict["save"] = 0
          status = item.get("status")
          # A missing status cannot be range-compared (None vs int raises
          # TypeError on Python 3); such rows keep their status.
          if status is not None and status_from[0]<=status<=status_from[1]:
              _dict["status"] = random.randint(status_to[0],status_to[1])
          list_dict.append(_dict)
      for _dict in list_dict:
          dtmp = Document_tmp(_dict)
          dtmp.update_row(self.ots_client)
  def flow_test(self,status_to=[1,10]):
      """Reset the status of selected ``document_tmp`` rows to a random value in ``status_to``.

      Rows are selected with a fixed query: they must have a
      ``page_attachments.fileMd5`` value, and must match neither
      ``attachment_extract_status == 1`` nor ``RangeQuery("status",1,11)``.
      Their keys are pushed onto ``self.queue_init`` and then rewritten by a
      ``MultiThreadHandler``.

      :param status_to: ``[low, high]`` bounds passed to ``random.randint``.
      """
      def enqueue_candidates():
          # Page through the index, 100 rows per request, up to 1,000,000 rows.
          selection = BoolQuery(
              must_queries=[
                  NestedQuery("page_attachments",WildcardQuery("page_attachments.fileMd5","*")),
              ],
              must_not_queries=[
                  TermQuery("attachment_extract_status",1),
                  RangeQuery("status",1,11),
              ],
          )
          rows,next_token,total_count,is_all_succeed = self.ots_client.search(
              "document_tmp","document_tmp_index",
              SearchQuery(selection,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
              ColumnsToGet(["docid"],return_type=ColumnReturnType.SPECIFIED))
          log("flow_init producer total_count:%d"%total_count)
          page = getRow_ots(rows)
          for row in page:
              self.queue_init.put(row)
          queued = len(page)
          while next_token and queued<1000000:
              rows,next_token,total_count,is_all_succeed = self.ots_client.search(
                  "document_tmp","document_tmp_index",
                  SearchQuery(selection,next_token=next_token,limit=100,get_total_count=True),
                  ColumnsToGet(["docid"],return_type=ColumnReturnType.SPECIFIED))
              page = getRow_ots(rows)
              for row in page:
                  self.queue_init.put(row)
              queued += len(page)
              print("%d/%d"%(queued,total_count))

      def reset_status(item,result_queue,ots_client):
          # Overwrite only the status column of the queued row.
          row = Document_tmp(item)
          row.setValue(document_tmp_status,random.randint(*status_to),True)
          row.update_row(ots_client)

      def run_workers():
          handler = MultiThreadHandler(self.queue_init,reset_status,None,30,1,ots_client=self.ots_client)
          handler.run()

      enqueue_candidates()
      run_workers()
  def flow_dumplicate(self,process_count=flow_process_count,status_from=flow_dumplicate_status_from):
      """Run one de-duplication pass over ``document_tmp`` rows whose status is in ``status_from``.

      The producer pages through the index and fills ``self.queue_dumplicate``;
      the consumer threads then, for every queued document, collect duplicate
      candidates via the translated rules, pick the best document of the
      group and write back ``save``, ``dup_docid``, a new random status from
      ``flow_dumplicate_status_to`` and the operation time.

      :param process_count: stop fetching further pages once this many rows
          have been queued.
      :param status_from: inclusive ``[low, high]`` status range to process.
      """
      def producer(columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_product,document_tmp_fingerprint,document_tmp_tenderee,document_tmp_agency,document_tmp_project_code,document_tmp_project_name,document_tmp_doctitle_refine,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_web_source_name]):
          bool_query = BoolQuery(must_queries=[RangeQuery(document_tmp_status,*status_from,True,True)])
          rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                         SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
                                                                         ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
          log("flow_dumplicate producer total_count:%d"%total_count)
          list_dict = getRow_ots(rows)
          for _dict in list_dict:
              self.queue_dumplicate.put(_dict)
          _count = len(list_dict)
          # Honour the caller-supplied limit; the loop used to read the module
          # level flow_process_count and ignored the process_count argument.
          while next_token and _count<process_count:
              rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                             SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                             ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
              list_dict = getRow_ots(rows)
              for _dict in list_dict:
                  self.queue_dumplicate.put(_dict)
              _count += len(list_dict)

      def comsumer():
          mt = MultiThreadHandler(self.queue_dumplicate,comsumer_handle,None,10,1,ots_client=self.ots_client)
          mt.run()

      def comsumer_handle(item,result_queue,ots_client):
          self.post_extract(item)
          base_list = []
          set_docid = set()
          # NOTE(review): rules, merge and the new status all use the module
          # level flow_dumplicate_status_* values rather than the status_from
          # argument; left unchanged, confirm this is intended.
          list_rules = self.translate_dumplicate_rules(flow_dumplicate_status_from,item)
          list_rules.sort(key=lambda x:x["confidence"],reverse=True)
          for _rule in list_rules:
              self.add_data_by_query(item,base_list,set_docid,_rule["query"],_rule["confidence"],
                                     singleNum_keys=_rule["singleNum_keys"],
                                     contain_keys=_rule["contain_keys"],
                                     multiNum_keys=_rule["multiNum_keys"])

          # The document itself always takes part in the final check.
          item["confidence"] = 999
          current_docid = item.get(document_tmp_docid)
          if current_docid not in set_docid:
              base_list.append(item)
          final_list = self.dumplicate_fianl_check(base_list)
          best_docid = self.get_best_docid(final_list)

          _d = {"partitionkey":item["partitionkey"],
                "docid":item["docid"],
                "status":random.randint(*flow_dumplicate_status_to),
                document_tmp_opertime:getCurrent_date(format="%Y-%m-%d %H:%M:%S")
                }
          dtmp = Document_tmp(_d)

          # Every docid of the group except the current document.
          dup_docid = {_dict.get(document_tmp_docid) for _dict in final_list}
          dup_docid.discard(current_docid)

          if len(final_list)==0 or best_docid==current_docid:
              # No duplicates, or this document is the one to keep.
              dtmp.setValue(document_tmp_save,1,True)
              dtmp.setValue(document_tmp_merge_uuid,self.merge_document(item,flow_dumplicate_status_to),True)
              ordered_docids = [str(a) for a in dup_docid]
          else:
              dtmp.setValue(document_tmp_save,0,True)
              ordered_docids = []
              if best_docid in dup_docid:
                  # The kept document is listed first. Joining once avoids the
                  # trailing comma the old "%d,%s" format produced when the
                  # best document was the only duplicate.
                  dup_docid.remove(best_docid)
                  ordered_docids.append(str(best_docid))
              ordered_docids.extend(str(a) for a in dup_docid)
          dtmp.setValue(document_tmp_dup_docid,",".join(ordered_docids),True)
          dtmp.update_row(self.ots_client)
          # Only the current document is written back; the group-wide
          # save_dumplicate(final_list,best_docid,status_from,status_to) call
          # is intentionally disabled.

      producer()
      comsumer()
  1297. def merge_document(self,item,status_to=None):
  # Look up an existing "project2" record for this document and return its uuid.
  # Returns "" when no search yields exactly one hit, because "merge_uuid" is
  # only set on dtmp when a search returns a single row.
  # `status_to` is accepted but not used anywhere in this body.
  # NOTE(review): this listing has lost its indentation, so the nesting of the
  # `if`/`else` statements below cannot be read from the text - in particular
  # which `if` the `else:` before the second group of queries belongs to.
  # Confirm against the original file before restructuring.
  1298. self.post_extract(item)
  1299. docchannel,project_code,project_name,tenderee,agency,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product = self.get_dump_columns(item)
  1300. _d = {"partitionkey":item["partitionkey"],
  1301. "docid":item["docid"],
  1302. }
  1303. dtmp = Document_tmp(_d)
  # A missing save flag is treated as 1.
  1304. if item.get(document_tmp_save,1)==1:
  # First group of candidate queries: project_code combined with tenderee, and
  # project_code combined with project_name.
  1305. list_should_q = []
  1306. if project_code!="" and tenderee!="":
  1307. _q = BoolQuery(must_queries=[MatchQuery("project_code",project_code),
  1308. TermQuery("tenderee",tenderee)])
  1309. list_should_q.append(_q)
  1310. if project_name!="" and project_code!="":
  1311. _q = BoolQuery(must_queries=[MatchQuery("project_code",project_code),
  1312. TermQuery("project_name",project_name)])
  1313. list_should_q.append(_q)
  1314. if len(list_should_q)>0:
  # This first search passes merge=True; the second search further down does not.
  1315. list_data = self.search_data_by_query(item,list_should_q,100,merge=True,table_name="project2",table_index="project2_index_formerge",sort_column="tenderee",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=["tenderee","win_tenderer"])
  # Only an unambiguous (single) hit is accepted as the merge target.
  1316. if len(list_data)==1:
  1317. dtmp.setValue("merge_uuid",list_data[0]["uuid"],True)
  1318. print(item["docid"],list_data[0]["uuid"])
  1319. else:
  # Second group of candidate queries, keyed on the money fields.
  # float() raises ValueError if bidding_budget / win_bid_price is a non-empty,
  # non-numeric string - presumably get_dump_columns only yields numeric text
  # here; TODO confirm.
  1320. list_should_q = []
  1321. if bidding_budget!="" and project_code!="":
  1322. _q = BoolQuery(must_queries=[MatchQuery("project_code",project_code),
  1323. TermQuery("bidding_budget",float(bidding_budget))])
  1324. list_should_q.append(_q)
  1325. if tenderee!="" and bidding_budget!="" and project_name!="":
  1326. _q = BoolQuery(must_queries=[MatchQuery("tenderee",tenderee),
  1327. TermQuery("bidding_budget",float(bidding_budget)),
  1328. TermQuery("project_name",project_name)])
  1329. list_should_q.append(_q)
  1330. if tenderee!="" and win_bid_price!="" and project_name!="":
  1331. _q = BoolQuery(must_queries=[MatchQuery("tenderee",tenderee),
  1332. TermQuery("win_bid_price",float(win_bid_price)),
  1333. TermQuery("project_name",project_name)])
  1334. list_should_q.append(_q)
  1335. if len(list_should_q)>0:
  1336. list_data = self.search_data_by_query(item,list_should_q,100,table_name="project2",table_index="project2_index_formerge",sort_column="tenderee",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=["tenderee","win_tenderer"])
  1337. if len(list_data)==1:
  1338. dtmp.setValue("merge_uuid",list_data[0]["uuid"],True)
  1339. print(item["docid"],list_data[0]["uuid"])
  # The row itself is not written here (the update_row call below is disabled);
  # only the uuid is returned.
  1340. return dtmp.getProperties().get("merge_uuid","")
  1341. # dtmp.update_row(self.ots_client)
  def test_merge(self):
      """Run ``merge_document`` over a fixed sample and export the result to ``test_merge.xlsx``.

      The sample is every ``document_tmp`` row with page_time "2022-04-22",
      docchannel 101, 119 or 120 and ``save == 1``. For each row the merge
      uuid is computed; when the matched project has a non-empty
      ``zhao_biao_page_time`` the row is flagged in the "是否有招标" column.
      """
      import pandas as pd
      import queue
      from BaseDataMaintenance.model.ots.project import Project

      def load_sample(columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_product,document_tmp_fingerprint,document_tmp_tenderee,document_tmp_agency,document_tmp_project_code,document_tmp_project_name,document_tmp_doctitle_refine,document_tmp_doctitle,document_tmp_sub_docs_json]):
          # Fetch all matching rows, 100 per request, following next_token to the end.
          sample = []
          channel_filter = BoolQuery(should_queries=[
              TermQuery("docchannel",101),
              TermQuery("docchannel",119),
              TermQuery("docchannel",120)
          ])
          sample_query = BoolQuery(must_queries=[
              TermQuery("page_time","2022-04-22"),
              channel_filter,
              TermQuery("save",1)
          ])
          rows,next_token,total_count,is_all_succeed = self.ots_client.search(
              "document_tmp","document_tmp_index",
              SearchQuery(sample_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
              ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
          log("flow_dumplicate producer total_count:%d"%total_count)
          page = getRow_ots(rows)
          sample.extend(page)
          fetched = len(page)
          while next_token:
              rows,next_token,total_count,is_all_succeed = self.ots_client.search(
                  "document_tmp","document_tmp_index",
                  SearchQuery(sample_query,next_token=next_token,limit=100,get_total_count=True),
                  ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
              page = getRow_ots(rows)
              sample.extend(page)
              fetched += len(page)
              print("%d/%d"%(fetched,total_count))
          return sample

      def merge_one(item,result_queue,ots_client):
          # Results are stored on the item itself so they can be exported afterwards.
          item["merge_uuid"] = self.merge_document(item)
          if item["merge_uuid"]=="":
              return
          project = Project({"uuid":item["merge_uuid"]})
          project.fix_columns(self.ots_client,["zhao_biao_page_time"],True)
          if project.getProperties().get("zhao_biao_page_time","")!="":
              item["是否有招标"] = "是"

      list_test_item = load_sample()
      task_queue = queue.Queue()
      for item in list_test_item:
          task_queue.put(item)
      handler = MultiThreadHandler(task_queue,merge_one,None,30,1,ots_client=self.ots_client)
      handler.run()

      keys = [document_tmp_docid,document_tmp_docchannel,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_doctitle_refine,"win_tenderer","bidding_budget","win_bid_price","merge_uuid","是否有招标"]
      # One column per key; missing values become empty strings.
      df_data = {k:[item.get(k,"") for item in list_test_item] for k in keys}
      pd.DataFrame(df_data).to_excel("test_merge.xlsx",columns=keys)
  def flow_merge(self,process_count=10000,status_from=[71,80],status_to=[81,90]):
      """Merge stage of the flow; currently a no-op.

      The producer, consumer and handler below are defined but the calls that
      would run them are disabled, so invoking this method does no work.

      :param process_count: row limit the producer would apply when paging.
      :param status_from: inclusive status range the producer would select.
      :param status_to: value forwarded to ``merge_document`` by the handler.
      """
      def producer(columns=[document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_product,document_tmp_fingerprint,document_tmp_tenderee,document_tmp_agency,document_tmp_project_code,document_tmp_project_name,document_tmp_doctitle_refine,document_tmp_doctitle,document_tmp_sub_docs_json]):
          status_query = BoolQuery(must_queries=[RangeQuery(document_tmp_status,*status_from,True,True)])
          rows,next_token,total_count,is_all_succeed = self.ots_client.search(
              "document_tmp","document_tmp_index",
              SearchQuery(status_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
              ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
          log("flow_merge producer total_count:%d"%total_count)
          page = getRow_ots(rows)
          for row in page:
              self.queue_merge.put(row)
          queued = len(page)
          while next_token and queued<process_count:
              rows,next_token,total_count,is_all_succeed = self.ots_client.search(
                  "document_tmp","document_tmp_index",
                  SearchQuery(status_query,next_token=next_token,limit=100,get_total_count=True),
                  ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
              page = getRow_ots(rows)
              for row in page:
                  self.queue_merge.put(row)
              queued += len(page)

      def comsumer_handle(item,result_queue,ots_client):
          self.merge_document(item,status_to)

      def comsumer():
          handler = MultiThreadHandler(self.queue_merge,comsumer_handle,None,10,1,ots_client=self.ots_client)
          handler.run()

      # Disabled on purpose:
      # producer()
      # comsumer()
      return None
  def flow_syncho(self,status_from=[71,80],status_to=[81,90]):
      """Placeholder for the synchronisation stage; does nothing and returns ``None``."""
      return None
  def flow_remove(self,process_count=flow_process_count,status_from=flow_remove_status_from):
      """Delete old ``document_tmp`` rows together with their ``Document_html`` rows.

      A row is selected when its status lies in ``status_from`` (inclusive)
      and its ``crtime`` is below ``"<date> 00:00:00"``, where ``<date>`` is
      ``timeAdd(today, -10)``. All matching pages are queued; there is no
      upper limit.

      :param process_count: accepted for interface symmetry; not used here.
      :param status_from: inclusive ``[low, high]`` status range to delete.
      """
      def collect_expired():
          today = getCurrent_date("%Y-%m-%d")
          cutoff = timeAdd(today,-10)
          expired_query = BoolQuery(must_queries=[
              RangeQuery(document_tmp_status,*status_from,True,True),
              RangeQuery(document_tmp_crtime,range_to="%s 00:00:00"%(cutoff)),
          ])
          rows,next_token,total_count,is_all_succeed = self.ots_client.search(
              "document_tmp","document_tmp_index",
              SearchQuery(expired_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
              ColumnsToGet(return_type=ColumnReturnType.NONE))
          log("flow_remove producer total_count:%d"%total_count)
          for row in getRow_ots(rows):
              self.queue_remove.put(row)
          while next_token:
              rows,next_token,total_count,is_all_succeed = self.ots_client.search(
                  "document_tmp","document_tmp_index",
                  SearchQuery(expired_query,next_token=next_token,limit=100,get_total_count=True),
                  ColumnsToGet(return_type=ColumnReturnType.NONE))
              for row in getRow_ots(rows):
                  self.queue_remove.put(row)

      def delete_document(item,result_queue,ots_client):
          # The tmp row goes first, then the html row with the same key.
          Document_tmp(item).delete_row(self.ots_client)
          Document_html(item).delete_row(self.ots_client)

      def run_workers():
          handler = MultiThreadHandler(self.queue_remove,delete_document,None,10,1,ots_client=self.ots_client)
          handler.run()

      collect_expired()
      run_workers()
  def start_flow_dumplicate(self):
      """Schedule the de-duplication and clean-up jobs and block running them.

      ``flow_remove`` and ``flow_remove_project_tmp`` use the cron trigger
      ``hour="20"``; ``flow_dumplicate`` uses ``second="*/10"``.
      """
      scheduler = BlockingScheduler()
      scheduler.add_job(self.flow_remove,"cron",hour="20")
      scheduler.add_job(self.flow_remove_project_tmp,"cron",hour="20")
      scheduler.add_job(self.flow_dumplicate,"cron",second="*/10")
      # start() blocks the calling thread while the scheduler runs.
      scheduler.start()
  def flow_remove_project_tmp(self,process_count=flow_process_count):
      """Delete old rows from the ``project2_tmp`` table.

      A row is selected when its page time is below
      ``timeAdd(today, -6*31)``. All matching pages are queued and deleted by
      a ``MultiThreadHandler``.

      :param process_count: accepted for interface symmetry; not used here.
      """
      table_name = "project2_tmp"
      table_index = "project2_tmp_index"

      def producer():
          current_date = getCurrent_date("%Y-%m-%d")
          tmp_date = timeAdd(current_date,-6*31)
          bool_query = BoolQuery(must_queries=[
              RangeQuery(project_page_time,range_to="%s"%(tmp_date))])
          rows,next_token,total_count,is_all_succeed = self.ots_client.search(table_name,table_index,
                                                                         SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("page_time")]),limit=100,get_total_count=True),
                                                                         ColumnsToGet(return_type=ColumnReturnType.NONE))
          log("flow_remove project2_tmp producer total_count:%d"%total_count)
          list_dict = getRow_ots(rows)
          for _dict in list_dict:
              self.queue_remove_project.put(_dict)
          _count = len(list_dict)
          while next_token:
              # Continue on the SAME table and index that issued next_token.
              # The follow-up pages used to be requested from
              # document_tmp/document_tmp_index, a copy-paste slip that would
              # feed rows of the wrong table into the delete queue.
              rows,next_token,total_count,is_all_succeed = self.ots_client.search(table_name,table_index,
                                                                             SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                             ColumnsToGet(return_type=ColumnReturnType.NONE))
              list_dict = getRow_ots(rows)
              for _dict in list_dict:
                  self.queue_remove_project.put(_dict)
              _count += len(list_dict)

      def comsumer():
          mt = MultiThreadHandler(self.queue_remove_project,comsumer_handle,None,10,1,ots_client=self.ots_client)
          mt.run()

      def comsumer_handle(item,result_queue,ots_client):
          ptmp = Project_tmp(item)
          ptmp.delete_row(self.ots_client)

      producer()
      comsumer()
  def start_flow_merge(self):
      """Schedule ``flow_merge`` with the cron trigger ``second="*/10"`` and block running it."""
      scheduler = BlockingScheduler()
      scheduler.add_job(self.flow_merge,"cron",second="*/10")
      # start() blocks the calling thread while the scheduler runs.
      scheduler.start()
  def download_attachment():
      """Download the attachments of a fixed batch of documents into ``<module dir>/download``.

      Documents are the ``document_tmp`` rows created between
      "2022-03-29 15:00:00" and "2022-03-29 17:00:00" (both inclusive). For
      each document the attachment records are read from the attachment table
      and every file that is not larger than ``ATTACHMENT_LARGESIZE`` is
      fetched from the "attachment-hub" OSS bucket and stored as
      ``<filemd5>.<filetype>``.
      """
      ots_client = getConnect_ots()
      queue_attachment = Queue()
      auth = getAuth()
      oss2.defaults.connection_pool_size = 100
      oss2.defaults.multiget_num_threads = 20
      attachment_bucket_name = "attachment-hub"
      if is_internal:
          bucket_url = "http://oss-cn-hangzhou-internal.aliyuncs.com"
      else:
          bucket_url = "http://oss-cn-hangzhou.aliyuncs.com"
      bucket = oss2.Bucket(auth,bucket_url,attachment_bucket_name)
      current_path = os.path.dirname(__file__)

      def producer():
          # Queue every document of the fixed time window, 100 rows per request.
          columns = [document_tmp_attachment_path]
          bool_query = BoolQuery(must_queries=[RangeQuery(document_tmp_crtime,"2022-03-29 15:00:00","2022-03-29 17:00:00",True,True)])
          rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
                                                                    SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status",SortOrder.DESC)]),limit=100,get_total_count=True),
                                                                    ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
          log("flow_attachment producer total_count:%d"%total_count)
          list_dict = getRow_ots(rows)
          for _dict in list_dict:
              queue_attachment.put(_dict)
          _count = len(list_dict)
          while next_token:
              rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
                                                                        SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                        ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
              list_dict = getRow_ots(rows)
              for _dict in list_dict:
                  queue_attachment.put(_dict)
              _count += len(list_dict)

      def comsumer():
          mt = MultiThreadHandler(queue_attachment,comsumer_handle,None,10,1)
          mt.run()

      def getAttachments(list_filemd5,columns_to_get=[attachment_filemd5,attachment_path,attachment_size,attachment_attachmenthtml,attachment_filetype,attachment_docids,attachment_status,attachment_swfUrls]):
          # Batch-read the attachment rows for at most the first 50 md5 values;
          # None entries are skipped. Any failure is logged and yields the rows
          # collected so far.
          list_attachment = []
          rows_to_get = []
          for _md5 in list_filemd5[:50]:
              if _md5 is None:
                  continue
              primary_key = [(attachment_filemd5,_md5)]
              rows_to_get.append(primary_key)
          req = BatchGetRowRequest()
          req.add(TableInBatchGetRowItem(attachment_table_name,rows_to_get,columns_to_get,None,1))
          try:
              result = ots_client.batch_get_row(req)
              attach_result = result.get_result_by_table(attachment_table_name)
              for item in attach_result:
                  if item.is_ok:
                      _dict = getRow_ots_primary(item.row)
                      if _dict is not None:
                          list_attachment.append(attachment(_dict))
          except Exception as e:
              log(str(list_filemd5))
              log("attachProcess comsumer error %s"%str(e))
          return list_attachment

      def comsumer_handle(item,result_queue):
          page_attachments = json.loads(item.get(document_tmp_attachment_path,"[]"))
          if len(page_attachments)==0:
              return
          list_fileMd5 = [_atta.get(document_tmp_attachment_path_filemd5) for _atta in page_attachments]
          for attach in getAttachments(list_fileMd5):
              # Only the fields that are actually needed are read; the unused
              # status/path/uuid locals of the earlier version were dropped.
              props = attach.getProperties()
              filemd5 = props.get(attachment_filemd5)
              _filetype = props.get(attachment_filetype)
              _size = props.get(attachment_size)
              objectPath = props.get(attachment_path)
              localpath = os.path.join(current_path,"download","%s.%s"%(filemd5,_filetype))
              try:
                  # Oversized files are skipped. The comparison stays inside
                  # the try so a missing size is reported, not fatal.
                  if not _size>ATTACHMENT_LARGESIZE:
                      downloadFile(bucket,objectPath,localpath)
              except Exception as e:
                  traceback.print_exc()

      producer()
      comsumer()
  def test_attachment_interface():
      """Send up to 500 files from ``<module dir>/download`` to the attachment interface and log timings.

      The file type handed to the interface is the second dot-separated part
      of the file name (``<name>.<type>``).
      """
      current_path = os.path.dirname(__file__)
      task_queue = Queue()

      def producer():
          _count = 0
          download_dir = os.path.join(current_path,"download")
          for _filename in os.listdir(download_dir):
              name_parts = _filename.split(".")
              if len(name_parts)<2:
                  # No extension means no file type to pass on; this used to
                  # abort the whole listing with an IndexError.
                  continue
              _count += 1
              task_queue.put({"path":os.path.join(download_dir,_filename),"file_type":name_parts[1]})
              if _count>=500:
                  break

      def comsumer():
          mt = MultiThreadHandler(task_queue,comsumer_handle,None,10)
          mt.run()

      def comsumer_handle(item,result_queue):
          _path = item.get("path")
          _type = item.get("file_type")
          # Close the file deterministically instead of leaking the handle.
          with open(_path,"rb") as f:
              _data_base64 = base64.b64encode(f.read())
          # Call the interface and time the round trip.
          start_time = time.time()
          _success,_html,swf_images = getAttachDealInterface(_data_base64,_type)
          log("%s result:%s takes:%d"%(_path,str(_success),time.time()-start_time))

      producer()
      comsumer()
  1609. class Dataflow_attachment(Dataflow):
  def __init__(self):
      """Initialise the base ``Dataflow`` state and start with no worker threads."""
      super().__init__()
      # Filled by process_comsumer with the attachment worker threads.
      self.process_list_thread = list()
  def flow_attachment_process(self):
      """Entry point of the attachment processing stage; delegates to ``process_comsumer``."""
      run_supervisor = self.process_comsumer
      run_supervisor()
  def monitor_attachment_process(self):
      """Log how many of the threads in ``process_list_thread`` are alive, and the total."""
      workers = self.process_list_thread
      alive_count = sum(1 for worker in workers if worker.is_alive())
      log("attachment_process alive:%d total:%d"%(alive_count,len(workers)))
  def process_comsumer(self):
      """Start the attachment worker threads and keep them running; never returns.

      When no workers exist yet, 60 threads running
      ``process_comsumer_handle`` are created and started. Afterwards the
      list is checked every 5 seconds and every thread that is no longer
      alive is replaced by a fresh one.
      """
      if len(self.process_list_thread)==0:
          thread_count = 60
          for _ in range(thread_count):
              self.process_list_thread.append(Thread(target=self.process_comsumer_handle))
          for t in self.process_list_thread:
              t.start()
      while 1:
          failed_count = 0
          for _i,t in enumerate(self.process_list_thread):
              if t.is_alive():
                  continue
              failed_count += 1
              # Replace the dead worker in place. The attribute was misspelled
              # "prcess_list_thread" here, which raised AttributeError on the
              # first restart and ended supervision.
              replacement = Thread(target=self.process_comsumer_handle)
              self.process_list_thread[_i] = replacement
              replacement.start()
          if failed_count>0:
              log("attachment failed %d"%(failed_count))
          time.sleep(5)
  def process_comsumer_handle(self):
      """Worker loop: alternately take one task from the OCR and the non-OCR queue; never returns.

      Each round tries ``queue_attachment_ocr`` and then
      ``queue_attachment_not_ocr`` (0.2 s timeout each) and runs
      ``attachment_recognize`` on whatever was obtained. When neither attempt
      completed a task the worker sleeps for 2 seconds.
      """
      from queue import Empty

      def _handle_one(task_queue):
          """Process at most one task; return True when no task was completed."""
          try:
              item = task_queue.get(True,timeout=0.2)
              log("attachment get doc:%s"%(str(item.get("item",{}).get("docid"))))
              self.attachment_recognize(item,None)
              log("attachment get doc:%s succeed"%(str(item.get("item",{}).get("docid"))))
          except Empty:
              # Nothing queued within the timeout - the normal idle case.
              return True
          except Exception:
              # Keep the worker alive as before, but leave a trace: failures
              # used to be swallowed silently together with the timeout.
              log("attachment handle error:%s"%traceback.format_exc())
              return True
          return False

      while 1:
          log("attachment handle:%s"%str(threading.get_ident()))
          ocr_idle = _handle_one(self.queue_attachment_ocr)
          not_ocr_idle = _handle_one(self.queue_attachment_not_ocr)
          # Back off only when both queues yielded nothing this round.
          if ocr_idle and not_ocr_idle:
              time.sleep(2)
  1661. def attachment_recognize(self,_dict,result_queue):
  # Recognise the attachments of one document and write the outcome back.
  # `_dict` is read for the keys "item" (the document row) and "list_attach"
  # (its attachments). `result_queue` is not used in this body.
  # NOTE(review): this listing has lost its indentation, so it cannot be told
  # from the text whether the attachment_extract_status assignment below
  # belongs to the `else` branch or runs for both outcomes - confirm against
  # the original file before restructuring.
  1662. item = _dict.get("item")
  1663. list_attach = _dict.get("list_attach")
  1664. dhtml = Document_html({"partitionkey":item.get("partitionkey"),
  1665. "docid":item.get("docid")})
  # Load the current html content of the document; a missing value becomes "".
  1666. dhtml.fix_columns(self.ots_client,["dochtmlcon"],True)
  1667. _dochtmlcon = dhtml.getProperties().get("dochtmlcon","")
  1668. _succeed,list_html,swf_urls = self.rec_attachments_by_interface(list_attach,_dochtmlcon,save=True)
  1669. log(str(swf_urls))
  # On failure only the status changes (random value from the failed range).
  1670. if not _succeed:
  1671. item[document_tmp_status] = random.randint(*flow_attachment_status_failed_to)
  1672. else:
  # On success the html row is updated with the swf images and attachment html
  # before the status moves to the succeeded range.
  1673. dhtml.updateSWFImages(swf_urls)
  1674. dhtml.updateAttachment(list_html)
  1675. dhtml.update_row(self.ots_client)
  1676. item[document_tmp_status] = random.randint(*flow_attachment_status_succeed_to)
  1677. item[document_tmp_attachment_extract_status] = 1
  # "%d" requires item["docid"] to be a number.
  1678. log("document:%d get attachments with result:%s"%(item.get("docid"),str(_succeed)))
  # Persist the updated fields of the document row.
  1679. dtmp = Document_tmp(item)
  1680. dtmp.update_row(self.ots_client)
  def flow_attachment(self):
      """Run one attachment round: ``flow_attachment_producer`` first, then ``flow_attachment_producer_comsumer``."""
      produce = self.flow_attachment_producer
      produce()
      dispatch = self.flow_attachment_producer_comsumer
      dispatch()
  def getAttachments(self,list_filemd5,columns_to_get=[attachment_filemd5,attachment_path,attachment_size,attachment_attachmenthtml,attachment_filetype,attachment_docids,attachment_status,attachment_swfUrls]):
      """Batch-read attachment rows by file md5.

      Only the first 50 entries of ``list_filemd5`` are considered and
      ``None`` entries are skipped. Failures of the batch read are logged and
      the attachments collected so far are returned.

      :param list_filemd5: list of md5 strings (may contain ``None``).
      :param columns_to_get: attachment columns to fetch.
      :return: list of ``attachment`` objects for the rows that were read
          successfully; empty when there is nothing to look up.
      """
      list_attachment = []
      rows_to_get = [[(attachment_filemd5,_md5)] for _md5 in list_filemd5[:50] if _md5 is not None]
      if not rows_to_get:
          # Nothing to look up: skip the remote call instead of sending an
          # empty batch request.
          return list_attachment
      req = BatchGetRowRequest()
      req.add(TableInBatchGetRowItem(attachment_table_name,rows_to_get,columns_to_get,None,1))
      try:
          result = self.ots_client.batch_get_row(req)
          attach_result = result.get_result_by_table(attachment_table_name)
          for item in attach_result:
              if item.is_ok:
                  _dict = getRow_ots_primary(item.row)
                  if _dict is not None:
                      list_attachment.append(attachment(_dict))
      except Exception as e:
          log(str(list_filemd5))
          log("attachProcess comsumer error %s"%str(e))
      return list_attachment
  def flow_attachment_producer(self,columns=[document_tmp_attachment_path,document_tmp_crtime]):
      """Refill ``queue_attachment`` with documents waiting for attachment processing.

      Does nothing while the OCR / non-OCR queues are still well filled.
      Otherwise searches ``document_tmp`` for rows whose status lies in
      ``flow_attachment_status_from`` and queues those whose docid is not
      already tracked, page by page, until ``flow_process_count`` rows were
      added or the result set is exhausted.

      :param columns: columns fetched for every queued row.
      """
      ocr_backlog = self.queue_attachment_ocr.qsize()
      plain_backlog = self.queue_attachment_not_ocr.qsize()
      log("queue_attachment_ocr:%d,queue_attachment_not_ocr:%d"%(ocr_backlog,plain_backlog))
      # Decide whether new data should be added at all.
      if min(ocr_backlog,plain_backlog)>200 or max(ocr_backlog,plain_backlog)>1000:
          return
      # De-duplication: docids currently tracked by either list are not queued again.
      known_docids = set() | set(self.list_attachment_ocr) | set(self.list_attachment_not_ocr)
      self.list_attachment_ocr = self.list_attachment_ocr[-ocr_backlog:] if ocr_backlog>0 else []
      self.list_attachment_not_ocr = self.list_attachment_not_ocr[-plain_backlog:] if plain_backlog>0 else []

      def enqueue(rows):
          # Queue every row of one result page that is not tracked yet; return how many were added.
          added = 0
          for row in getRow_ots(rows):
              if row.get(document_tmp_docid) in known_docids:
                  continue
              self.queue_attachment.put(row,True)
              added += 1
          return added

      try:
          bool_query = BoolQuery(must_queries=[
              RangeQuery(document_tmp_status,*flow_attachment_status_from,True,True),
          ])
          wanted_columns = ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED)
          rows,next_token,total_count,is_all_succeed = self.ots_client.search(
              "document_tmp","document_tmp_index",
              SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status",SortOrder.DESC)]),limit=100,get_total_count=True),
              wanted_columns)
          log("flow_attachment producer total_count:%d"%total_count)
          _count = enqueue(rows)
          while next_token and _count<flow_process_count:
              rows,next_token,total_count,is_all_succeed = self.ots_client.search(
                  "document_tmp","document_tmp_index",
                  SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                  ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
              _count += enqueue(rows)
          log("add attachment count:%d"%(_count))
      except Exception as e:
          log("flow attachment producer error:%s"%(str(e)))
          traceback.print_exc()
  def flow_attachment_producer_comsumer(self):
      """Run a ``MultiThreadHandler`` over ``queue_attachment`` with ``comsumer_handle`` as the task."""
      log("start flow_attachment comsumer")
      # NOTE(review): the trailing positional arguments (None, 10, 1) are passed
      # through as-is; presumably result queue / thread count / retry setting — confirm.
      handler = MultiThreadHandler(self.queue_attachment,self.comsumer_handle,None,10,1)
      handler.run()
  def set_queue(self,_dict):
      """Route a prepared attachment job to the OCR queue or the non-OCR queue.

      :param _dict: job dict; its ``"list_attach"`` entry holds the attachment
          objects. The job goes to ``queue_attachment_ocr`` as soon as one
          attachment has an image-like file type, otherwise to
          ``queue_attachment_not_ocr``.
      """
      ocr_filetypes = ["bmp","jpeg","jpg","png","swf","pdf","tif"]
      needs_ocr = any(
          one_attach.getProperties().get(attachment_filetype) in ocr_filetypes
          for one_attach in _dict.get("list_attach")
      )
      target_queue = self.queue_attachment_ocr if needs_ocr else self.queue_attachment_not_ocr
      target_queue.put(_dict,True)
  def comsumer_handle(self,item,result_queue):
      """Prepare one ``document_tmp`` row for attachment processing.

      * No attachments: the row gets a status from
        ``flow_attachment_status_succeed_to`` and is written back.
      * Fewer attachment rows found than the page lists, and the document is
        younger than two hours: status is set to 1 and written back.
      * Otherwise the row is handed to ``set_queue`` together with its attachments.

      Any exception is printed and swallowed.

      :param item: row dict of the document.
      :param result_queue: not used here.
      """
      try:
          page_attachments = json.loads(item.get(document_tmp_attachment_path,"[]"))
          if len(page_attachments)==0:
              item[document_tmp_status] = random.randint(*flow_attachment_status_succeed_to)
              Document_tmp(item).update_row(self.ots_client)
              return
          list_fileMd5 = [_atta.get(document_tmp_attachment_path_filemd5) for _atta in page_attachments]
          list_attach = self.getAttachments(list_fileMd5)
          # Attachments that have not been uploaded successfully are not processed within 2 hours.
          if len(page_attachments)!=len(list_attach):
              created = time.mktime(time.strptime(item.get(document_tmp_crtime),"%Y-%m-%d %H:%M:%S"))
              if time.mktime(time.localtime())-created<7200:
                  item[document_tmp_status] = 1
                  Document_tmp(item).update_row(self.ots_client)
                  return
          self.set_queue({"item":item,"list_attach":list_attach})
      except Exception as e:
          traceback.print_exc()
  def start_flow_attachment(self):
      """Start a blocking scheduler that drives the attachment flow.

      ``flow_attachment_process`` is registered with the cron expression
      ``second="*/20"`` and ``flow_attachment`` with ``second="*/10"``.
      ``start()`` is called on a ``BlockingScheduler``.
      """
      scheduler = BlockingScheduler()
      scheduler.add_job(self.flow_attachment_process,"cron",second="*/20")
      scheduler.add_job(self.flow_attachment,"cron",second="*/10")
      scheduler.start()
  class Dataflow_extract(Dataflow):
      """Extraction stage of the data flow.

      Queues ``document_tmp`` rows whose status lies in
      ``flow_extract_status_from``, posts each one to the configured services
      and writes the resulting status back.
      """

      def __init__(self):
          Dataflow.__init__(self)

      def flow_extract_producer(self,columns=[document_tmp_page_time,document_tmp_doctitle,document_tmp_docchannel,document_tmp_status,document_tmp_original_docchannel,document_tmp_web_source_no]):
          """Top up ``queue_extract`` with documents waiting for extraction.

          Nothing is added while more than 100 items are still queued.
          ``list_extract`` remembers recently queued docids so that the same
          document is not queued twice.

          :param columns: columns fetched for every queued row.
          """
          q_size = self.queue_extract.qsize()
          if q_size>100:
              return
          set_docid = set(self.list_extract)
          # Keep only as many remembered docids as there are items still queued.
          if q_size>0:
              self.list_extract = self.list_extract[-q_size:]
          else:
              self.list_extract = []

          def enqueue(list_dict):
              # Queue the rows of one page; rows already known are only re-registered.
              for _dict in list_dict:
                  docid = _dict.get(document_tmp_docid)
                  if docid in set_docid:
                      self.list_extract.insert(0,docid)
                  else:
                      self.queue_extract.put(_dict)
                      self.list_extract.append(docid)
              return len(list_dict)

          try:
              bool_query = BoolQuery(must_queries=[RangeQuery(document_tmp_status,*flow_extract_status_from,True,True)])
              rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                  SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status",SortOrder.ASC)]),limit=100,get_total_count=True),
                  ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
              log("flow_extract producer total_count:%d"%total_count)
              _count = enqueue(getRow_ots(rows))
              while next_token and _count<flow_process_count:
                  rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                      SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                      ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
                  _count += enqueue(getRow_ots(rows))
          except Exception as e:
              log("flow extract producer error:%s"%(str(e)))
              traceback.print_exc()

      def flow_extract(self,):
          """Run one consumer pass over ``queue_extract``."""
          self.comsumer()

      def comsumer(self):
          """Run a ``MultiThreadHandler`` over ``queue_extract`` with ``comsumer_handle`` as the task."""
          mt = MultiThreadHandler(self.queue_extract,self.comsumer_handle,None,35,1,True)
          mt.run()

      def comsumer_handle(self,item,result_queue):
          """Send one document through the three services and store the outcome.

          ``all_done`` stays 1 while every step succeeds; -1 / -2 / -3 record a
          failure of the ``other`` / ``extract`` / ``industry`` service. Once a
          step has failed the remaining services are no longer called (the
          original tested ``if all_done:``, which is also true for the negative
          failure codes).

          :param item: row dict of the document; its html content is loaded here.
          :param result_queue: not used here.
          """
          dhtml = Document_html({"partitionkey":item.get("partitionkey"),
                                 "docid":item.get("docid")})
          dhtml.fix_columns(self.ots_client,["dochtmlcon"],True)
          item[document_tmp_dochtmlcon] = dhtml.getProperties().get(document_tmp_dochtmlcon,"")

          _extract = Document_extract({})
          _extract.setValue(document_extract2_partitionkey,item.get(document_partitionkey))
          _extract.setValue(document_extract2_docid,item.get(document_docid))

          all_done = 1
          if all_done==1:
              data = item
              resp = requests.post(self.other_url,json=data,headers=self.header)
              if (resp.status_code >=200 and resp.status_code<=210):
                  _extract.setValue(document_extract2_other_json,resp.content.decode("utf8"),True)
              else:
                  all_done = -1

          # Payload for the extract / industry services: a copy of the row with
          # the html moved to "content" plus a few renamed fields.
          data = dict(item.items())
          data["timeout"] = 240
          data["doc_id"] = data.get(document_tmp_docid)
          data["content"] = data.get(document_tmp_dochtmlcon,"")
          if document_tmp_dochtmlcon in data:
              data.pop(document_tmp_dochtmlcon)
          data["title"] = data.get(document_tmp_doctitle,"")
          data["web_source_no"] = item.get(document_tmp_web_source_no,"")
          data["original_docchannel"] = item.get(document_tmp_original_docchannel,"")

          if all_done==1:
              resp = requests.post(self.extract_url,json=data,headers=self.header)
              if (resp.status_code >=200 and resp.status_code<=210):
                  _extract.setValue(document_extract2_extract_json,resp.content.decode("utf8"),True)
              else:
                  all_done = -2
          if all_done==1:
              resp = requests.post(self.industy_url,json=data,headers=self.header)
              if (resp.status_code >=200 and resp.status_code<=210):
                  _extract.setValue(document_extract2_industry_json,resp.content.decode("utf8"),True)
              else:
                  all_done = -3

          _dict = {document_partitionkey:item.get(document_tmp_partitionkey),
                   document_docid:item.get(document_tmp_docid),
                   }
          dtmp = Document_tmp(_dict)
          if all_done!=1:
              sentMsgToDD("要素提取失败:docid:%d with result:%d"%(item.get(document_tmp_docid),all_done))
              dtmp.setValue(document_tmp_status,random.randint(*flow_extract_status_failed_to),True)
              dtmp.update_row(self.ots_client)
          else:
              dtmp.setValue(document_tmp_status,random.randint(*flow_extract_status_succeed_to),True)
              dtmp.update_row(self.ots_client)
              # Insert into the interface table; to be enabled when going live.
              # NOTE(review): assumed to belong to the success branch — confirm against the original indentation.
              _extract.setValue(document_extract2_status,random.randint(1,50),True)
              _extract.update_row(self.ots_client)
          log("process docid:%d %s"%(data["doc_id"],str(all_done)))

      def start_flow_extract(self):
          """Start a blocking scheduler running producer and consumer on ``second="*/10"``."""
          schedule = BlockingScheduler()
          schedule.add_job(self.flow_extract_producer,"cron",second="*/10")
          schedule.add_job(self.flow_extract,"cron",second="*/10")
          schedule.start()
  1908. class Dataflow_dumplicate(Dataflow):
  class DeleteListener():
      """Queue listener that hands every received frame to a handler function."""

      def __init__(self,conn,_func,*args,**kwargs):
          # Extra positional / keyword arguments are accepted but not used.
          self.conn = conn
          self._func = _func

      def on_error(self, headers,*args,**kwargs):
          """Log the body of an error frame."""
          log('received an error %s' % str(headers.body))

      def on_message(self, headers,*args,**kwargs):
          """Forward a received frame, together with the connection, to the handler.

          Exceptions raised while reading the frame or inside the handler are
          printed and swallowed.
          """
          try:
              msg_id = headers.headers["message-id"]
              _ = headers.body
              log("get message %s"%(msg_id))
              payload = {"frame":headers,"conn":self.conn}
              self._func(_dict=payload,result_queue=None)
          except Exception as e:
              traceback.print_exc()

      def __del__(self):
          # Disconnect the connection when the listener object is finalized.
          self.conn.disconnect()
  def __init__(self,start_delete_listener=True):
      """Set up the de-duplication flow.

      :param start_delete_listener: when true, create the MQ connection pool
          and register ``delete_comsumer_counts`` listeners on the document
          delete queue, each with its own connection and
          ``delete_doc_handle`` as handler.
      """
      Dataflow.__init__(self,)
      self.c_f_get_extractCount = f_get_extractCount()
      self.c_f_get_package = f_get_package()
      # ``logging.INFO`` is the level constant. The original passed the
      # ``logging.info`` function, which basicConfig rejects with a TypeError
      # whenever it actually has to configure the root logger.
      logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
      self.fix_doc_docid = None
      self.bdm = BaseDataMonitor()
      # 1 selects check_dumplicate_rule in dumplicate_check; any other value the test rule.
      self.check_rule = 1
      if start_delete_listener:
          self.delete_comsumer_counts = 2
          self.doc_delete_queue = "/queue/doc_delete_queue"
          self.doc_delete_result = "/queue/doc_delete_result"
          self.pool_mq_ali = ConnectorPool(1,10,getConnect_activateMQ_ali)
          for _ in range(self.delete_comsumer_counts):
              conn = getConnect_activateMQ_ali()
              listener = self.DeleteListener(conn,self.delete_doc_handle)
              createComsumer(listener,self.doc_delete_queue)
  def get_dict_time(self,_extract,keys=["time_bidclose","time_bidopen","time_bidstart","time_commencement","time_completion","time_earnestMoneyEnd","time_earnestMoneyStart","time_getFileEnd","time_getFileStart","time_publicityEnd","time_publicityStart","time_registrationEnd","time_registrationStart"]):
      """Collect the time fields of an extraction result.

      :param _extract: mapping holding the extraction result.
      :param keys: names of the fields to copy.
      :return: dict with one entry per key; fields missing from ``_extract`` map to ``None``.
      """
      return {field: _extract.get(field) for field in keys}
  def get_attrs_before_dump(self,docid,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type]):
      """Load one document by docid and derive its comparison attributes.

      The ``document`` table is searched first; ``document_tmp`` is only
      consulted when that search reports no hit.

      :param docid: id of the document to load.
      :param columns: columns to fetch.
      :return: the first matching row after ``post_extract``, or ``None`` when
          neither table returns a row.
      """
      docid_query = BoolQuery(must_queries=[
          TermQuery("docid",docid)
      ])

      def lookup(table_name,index_name):
          # Same docid search, run against the given table / index.
          return self.ots_client.search(table_name,index_name,
              SearchQuery(docid_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
              ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))

      rows,next_token,total_count,is_all_succeed = lookup("document","document_index")
      log("flow_dumplicate producer total_count:%d"%total_count)
      if total_count==0:
          rows,next_token,total_count,is_all_succeed = lookup("document_tmp","document_tmp_index")
      found = getRow_ots(rows)
      if len(found)>0:
          return self.post_extract(found[0])
      return None
  def post_extract(self,_dict):
      """Enrich a document row in place with the fields used for duplicate checks.

      Values come from the decoded ``sub_docs_json`` (winner, budget, bid
      price) and from the row's extraction json (product, fingerprint, codes,
      refined title, moneys, enterprises, package, times, ...).

      :param _dict: document row; it is modified and also returned.
      :return: the same ``_dict`` object.
      """
      decoded = self.f_decode_sub_docs_json(_dict.get(document_tmp_project_code),_dict.get(document_tmp_project_name),_dict.get(document_tmp_tenderee),_dict.get(document_tmp_agency),_dict.get(document_tmp_sub_docs_json))
      win_tenderer,bidding_budget,win_bid_price,_ = decoded
      _dict["win_tenderer"] = win_tenderer
      _dict["bidding_budget"] = bidding_budget
      _dict["win_bid_price"] = win_bid_price

      extract_json = _dict.get(document_tmp_extract_json,"{}")
      extracted = json.loads(extract_json)
      _dict["product"] = ",".join(extracted.get("product",[]))
      _dict["fingerprint"] = extracted.get("fingerprint","")
      codes = extracted.get("code",[])
      _dict["project_codes"] = codes
      # The first extracted code doubles as the single project code.
      _dict["project_code"] = codes[0] if len(codes)>0 else ""
      refined_title = extracted.get("doctitle_refine","")
      # Fall back to the raw title when no refined title was extracted.
      _dict["doctitle_refine"] = _dict.get("doctitle") if refined_title=="" else refined_title
      _dict["moneys"] = set(extracted.get("moneys",[]))
      _dict["moneys_attachment"] = set(extracted.get("moneys_attachment",[]))
      enterprises = {"indoctextcon":extracted.get("nlp_enterprise",[]),
                     "notindoctextcon":extracted.get("nlp_enterprise_attachment",[])}
      _dict["nlp_enterprise"] = json.dumps(enterprises,ensure_ascii=False)
      _dict["extract_count"] = extracted.get("extract_count",0)
      _dict["package"] = self.c_f_get_package.evaluate(extract_json)
      _dict["project_name"] = extracted.get("name","")
      _dict["dict_time"] = self.get_dict_time(extracted)
      _dict["punish"] = extracted.get("punish",{})
      _dict["approval"] = extracted.get("approval",[])
      return _dict
  def dumplicate_fianl_check(self,base_list,b_log=False):
      """Filter duplicate candidates down to a mutually consistent group.

      ``base_list`` is sorted in place by descending ``"confidence"``.
      Candidates sharing the fingerprint of the top entry are always kept.
      Every other candidate is checked with ``dumplicate_check`` against all
      entries kept so far; the first candidate scoring ``<= 0.1`` against any
      of them ends the scan.

      :param base_list: candidate dicts with ``"confidence"`` and ``"fingerprint"``.
      :param b_log: passed through to ``dumplicate_check``.
      :return: list of the candidates that were kept.
      """
      base_list.sort(key=lambda x:x["confidence"],reverse=True)
      base_fingerprint = base_list[0]["fingerprint"] if len(base_list)>0 else "None"
      final_group = []
      for _i,candidate in enumerate(base_list):
          if candidate["fingerprint"]==base_fingerprint:
              final_group.append(candidate)
              continue
          _pass = True
          for kept in final_group:
              _prob,day_dis = self.dumplicate_check(candidate,kept,candidate.get("min_counts",10),b_log=b_log)
              if _prob<=0.1:
                  _pass = False
                  break
          log("checking index:%d %s %.2f"%(_i,str(_pass),_prob))
          if not _pass:
              break
          final_group.append(candidate)
      return final_group
  def dumplicate_check(self,_dict1,_dict2,min_counts,b_log=False):
      """Score how likely two documents are duplicates of each other.

      The score comes from ``check_dumplicate_rule`` (or from
      ``check_dumplicate_rule_test`` when ``self.check_rule`` is not 1) and is
      then cut to 0 when the page times are too far apart.

      :param _dict1: first document (as returned by ``post_extract``).
      :param _dict2: second document.
      :param min_counts: passed through to the rule function.
      :param b_log: passed through to the rule function.
      :return: tuple ``(probability, day distance between the page times)``.
      :raises KeyError: when one of the mandatory fields read below is missing.
      """
      document_less,document_greater = _dict1,_dict2

      # Mandatory fields: subscripted so that an incomplete record raises KeyError.
      docid_less = document_less["docid"]
      doctitle_refine_less = document_less["doctitle_refine"]
      nlp_enterprise_less = document_less["nlp_enterprise"]
      win_tenderer_less = document_less["win_tenderer"]
      bidding_budget_less = document_less["bidding_budget"]
      win_bid_price_less = document_less["win_bid_price"]
      docid_greater = document_greater["docid"]
      page_time_greater = document_greater["page_time"]
      project_codes_greater = document_greater["project_codes"]
      nlp_enterprise_greater = document_greater["nlp_enterprise"]
      win_tenderer_greater = document_greater["win_tenderer"]
      bidding_budget_greater = document_greater["bidding_budget"]
      win_bid_price_greater = document_greater["win_bid_price"]
      json_time_greater = document_greater["dict_time"]

      page_time_less = document_less.get("page_time")
      docchannel_less = document_less.get("docchannel",0)
      docchannel_greater = document_greater.get("docchannel",0)
      web_source_no_less = document_less.get("web_source_no")
      web_source_no_greater = document_greater.get("web_source_no")

      # hard_level 2 applies when both documents are of docchannel 302 or both
      # come from web source "17397-3".
      both_channel_302 = docchannel_less==docchannel_greater==302
      both_source_17397_3 = web_source_no_less==web_source_no_greater=="17397-3"
      hard_level = 2 if (both_channel_302 or both_source_17397_3) else 1

      if self.check_rule==1:
          _prob = check_dumplicate_rule(document_less,document_greater,min_counts,b_log=b_log,hard_level=hard_level)
      else:
          # The test rule takes every attribute individually.
          fingerprint_less = document_less.get("fingerprint")
          fingerprint_greater = document_greater.get("fingerprint")
          project_codes_less = document_less.get("project_codes")
          tenderee_less = document_less.get("tenderee","")
          tenderee_greater = document_greater.get("tenderee","")
          agency_less = document_less.get("agency")
          agency_greater = document_greater.get("agency","")
          project_name_less = document_less.get("project_name")
          project_name_greater = document_greater.get("project_name")
          doctitle_refine_greater = document_greater.get("doctitle_refine","")
          extract_count_less = document_less.get("extract_count",0)
          extract_count_greater = document_greater.get("extract_count",0)
          product_less = document_less.get("product")
          product_greater = document_greater.get("product")
          package_less = document_less.get("package")
          package_greater = document_greater.get("package")
          json_time_less = document_less.get("dict_time")
          province_less = document_less.get("province")
          province_greater = document_greater.get("province")
          city_less = document_less.get("city")
          city_greater = document_greater.get("city")
          district_less = document_less.get("district")
          district_greater = document_greater.get("district")
          _prob = check_dumplicate_rule_test(
              docid_less,docid_greater,
              fingerprint_less,fingerprint_greater,
              project_codes_less,project_codes_greater,
              tenderee_less,tenderee_greater,
              agency_less,agency_greater,
              win_tenderer_less,win_tenderer_greater,
              bidding_budget_less,bidding_budget_greater,
              win_bid_price_less,win_bid_price_greater,
              project_name_less,project_name_greater,
              doctitle_refine_less,doctitle_refine_greater,
              extract_count_less,extract_count_greater,
              docchannel_less,docchannel_greater,
              page_time_less,page_time_greater,
              product_less,product_greater,
              nlp_enterprise_less,nlp_enterprise_greater,
              package_less,package_greater,
              json_time_less,json_time_greater,
              province_less,province_greater,
              city_less,city_greater,
              district_less,district_greater,
              min_counts,b_log=b_log,hard_level=hard_level,
              web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater)

      pagetime_stamp_less = getTimeStamp(page_time_less)
      pagetime_stamp_greater = getTimeStamp(page_time_greater)
      day_dis = abs(pagetime_stamp_greater-pagetime_stamp_less)//86400
      # More than 7 days apart never counts as a duplicate; between 4 and 7
      # days only a score of at least 0.4 survives.
      if day_dis>7 or (day_dis>3 and _prob<0.4):
          _prob = 0
      return _prob,day_dis
  2094. def dumplicate_check_bak(self,_dict1,_dict2,min_counts,b_log=False):
  # Earlier scoring routine for a document pair, kept next to dumplicate_check.
  # Returns a single number: 1 for identical fingerprints, otherwise a score
  # derived from the share of matching fields, or 0 when a consistency check vetoes the pair.
  # Every field below is read by subscript, so a missing key raises KeyError.
  # NOTE(review): indentation is lost in this listing; the nesting of several
  # branches below cannot be confirmed from here.
  2095. document_less = _dict1
  2096. docid_less = _dict1["docid"]
  2097. docchannel_less = document_less["docchannel"]
  2098. page_time_less = document_less["page_time"]
  2099. doctitle_refine_less = document_less["doctitle_refine"]
  2100. project_codes_less = document_less["project_codes"]
  2101. nlp_enterprise_less = document_less["nlp_enterprise"]
  2102. tenderee_less = document_less["tenderee"]
  2103. agency_less = document_less["agency"]
  2104. win_tenderer_less = document_less["win_tenderer"]
  2105. bidding_budget_less = document_less["bidding_budget"]
  2106. win_bid_price_less = document_less["win_bid_price"]
  2107. product_less = document_less["product"]
  2108. package_less = document_less["package"]
  2109. json_time_less = document_less["dict_time"]
  2110. project_name_less = document_less["project_name"]
  2111. fingerprint_less = document_less["fingerprint"]
  2112. extract_count_less = document_less["extract_count"]
  2113. document_greater = _dict2
  2114. docid_greater = _dict2["docid"]
  2115. page_time_greater = document_greater["page_time"]
  2116. doctitle_refine_greater = document_greater["doctitle_refine"]
  2117. project_codes_greater = document_greater["project_codes"]
  2118. nlp_enterprise_greater = document_greater["nlp_enterprise"]
  2119. tenderee_greater = document_greater["tenderee"]
  2120. agency_greater = document_greater["agency"]
  2121. win_tenderer_greater = document_greater["win_tenderer"]
  2122. bidding_budget_greater = document_greater["bidding_budget"]
  2123. win_bid_price_greater = document_greater["win_bid_price"]
  2124. product_greater = document_greater["product"]
  2125. package_greater = document_greater["package"]
  2126. json_time_greater = document_greater["dict_time"]
  2127. project_name_greater = document_greater["project_name"]
  2128. fingerprint_greater = document_greater["fingerprint"]
  2129. extract_count_greater = document_greater["extract_count"]
  # Identical fingerprints short-circuit to a score of 1.
  2130. if fingerprint_less==fingerprint_greater:
  2131. return 1
  # Count how many of the 8 compared fields match (codes, tenderee, agency,
  # winner, budget, bid price, project name, title).
  2132. same_count = 0
  2133. all_count = 8
  2134. if len(set(project_codes_less) & set(project_codes_greater))>0:
  2135. same_count += 1
  2136. if getLength(tenderee_less)>0 and tenderee_less==tenderee_greater:
  2137. same_count += 1
  2138. if getLength(agency_less)>0 and agency_less==agency_greater:
  2139. same_count += 1
  2140. if getLength(win_tenderer_less)>0 and win_tenderer_less==win_tenderer_greater:
  2141. same_count += 1
  2142. if getLength(bidding_budget_less)>0 and bidding_budget_less==bidding_budget_greater:
  2143. same_count += 1
  2144. if getLength(win_bid_price_less)>0 and win_bid_price_less==win_bid_price_greater:
  2145. same_count += 1
  2146. if getLength(project_name_less)>0 and project_name_less==project_name_greater:
  2147. same_count += 1
  # Titles also count as matching when one contains the other.
  2148. if getLength(doctitle_refine_less)>0 and (doctitle_refine_less==doctitle_refine_greater or doctitle_refine_less in doctitle_refine_greater or doctitle_refine_greater in doctitle_refine_less):
  2149. same_count += 1
  # The base probability shrinks as min_counts grows: 0.9 / 0.8 / 0.7 / 0.6.
  2150. base_prob = 0
  2151. if min_counts<3:
  2152. base_prob = 0.9
  2153. elif min_counts<5:
  2154. base_prob = 0.8
  2155. elif min_counts<8:
  2156. base_prob = 0.7
  2157. else:
  2158. base_prob = 0.6
  2159. _prob = base_prob*same_count/all_count
  # A low score is lifted to 0.15 when either document has an extract_count of at most 3.
  2160. if _prob<0.1 and min(extract_count_less,extract_count_greater)<=3:
  2161. _prob = 0.15
  2162. if _prob<0.1:
  2163. return _prob
  # Consistency checks. Each one stores 0 (failed), 1 or 2 under its own key;
  # any failure also sets "pass" to 0.
  2164. check_result = {"pass":1}
  2165. if docchannel_less in (51,102,103,104,115,116,117):
  2166. if doctitle_refine_less!=doctitle_refine_greater:
  2167. if page_time_less!=page_time_greater:
  2168. check_result["docchannel"] = 0
  2169. check_result["pass"] = 0
  2170. else:
  2171. check_result["docchannel"] = 2
  2172. if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,project_codes_less,project_codes_greater):
  2173. check_result["doctitle"] = 0
  2174. check_result["pass"] = 0
  2175. if b_log:
  2176. logging.info("%d-%d,check_doctitle_failed:%s==%s"%(docid_less,docid_greater,str(doctitle_refine_less),str(doctitle_refine_greater)))
  2177. else:
  2178. check_result["doctitle"] = 2
  2179. #added check
  2180. if not check_codes(project_codes_less,project_codes_greater):
  2181. check_result["code"] = 0
  2182. check_result["pass"] = 0
  2183. if b_log:
  2184. logging.info("%d-%d,check_code_failed:%s==%s"%(docid_less,docid_greater,str(project_codes_less),str(project_codes_greater)))
  2185. else:
  2186. if getLength(project_codes_less)>0 and getLength(project_codes_greater)>0 and len(set(project_codes_less) & set(project_codes_greater))>0:
  2187. check_result["code"] = 2
  2188. else:
  2189. check_result["code"] = 1
  2190. if not check_product(product_less,product_greater,doctitle_refine_less,doctitle_refine_greater):
  2191. check_result["product"] = 0
  2192. check_result["pass"] = 0
  2193. if b_log:
  2194. logging.info("%d-%d,check_product_failed:%s==%s"%(docid_less,docid_greater,str(product_less),str(product_greater)))
  2195. else:
  2196. if getLength(product_less)>0 and getLength(product_greater)>0:
  2197. check_result["product"] = 2
  2198. else:
  2199. check_result["product"] = 1
  # NOTE(review): check_demand is called without arguments here — confirm that is intended.
  2200. if not check_demand():
  2201. check_result["pass"] = 0
  2202. if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
  2203. tenderee_less,tenderee_greater,
  2204. agency_less,agency_greater,
  2205. win_tenderer_less,win_tenderer_greater):
  2206. check_result["entity"] = 0
  2207. check_result["pass"] = 0
  2208. if b_log:
  2209. logging.info("%d-%d,check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(docid_less,docid_greater,str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
  2210. else:
  2211. if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
  2212. check_result["entity"] = 2
  2213. elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
  2214. check_result["entity"] = 2
  2215. else:
  2216. check_result["entity"] = 1
  2217. if not check_money(bidding_budget_less,bidding_budget_greater,
  2218. win_bid_price_less,win_bid_price_greater):
  2219. if b_log:
  2220. logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
  2221. check_result["money"] = 0
  2222. check_result["pass"] = 0
  2223. else:
  2224. if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
  2225. check_result["money"] = 2
  2226. elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
  2227. check_result["money"] = 2
  2228. else:
  2229. check_result["money"] = 1
  2230. #added check
  2231. if not check_package(package_less,package_greater):
  2232. if b_log:
  2233. logging.info("%d-%d,check_package_failed:%s==%s"%(docid_less,docid_greater,str(package_less),str(package_greater)))
  2234. check_result["package"] = 0
  2235. check_result["pass"] = 0
  2236. else:
  2237. if getLength(package_less)>0 and getLength(package_greater)>0:
  2238. check_result["package"] = 2
  2239. else:
  2240. check_result["package"] = 1
  2241. #added check
  # On a failed time check the two time dicts are compared key by key and each differing key is logged.
  2242. if not check_time(json_time_less,json_time_greater):
  2243. if b_log:
  2244. logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
  2245. if isinstance(json_time_less,dict):
  2246. time_less = json_time_less
  2247. else:
  2248. time_less = json.loads(json_time_less)
  2249. if isinstance(json_time_greater,dict):
  2250. time_greater = json_time_greater
  2251. else:
  2252. time_greater = json.loads(json_time_greater)
  2253. for k,v in time_less.items():
  2254. if getLength(v)>0:
  2255. v1 = time_greater.get(k,"")
  2256. if getLength(v1)>0:
  2257. if v!=v1:
  2258. log("%d-%d,key:%s"%(docid_less,docid_greater,str(k)))
  2259. check_result["time"] = 0
  2260. check_result["pass"] = 0
  2261. else:
  2262. if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
  2263. check_result["time"] = 2
  2264. else:
  2265. check_result["time"] = 1
  # Final verdict: with a failed check the score only survives when money did
  # not fail and entity, code, doctitle, product and money are all rated 2.
  2266. if check_result.get("pass",0)==0:
  2267. if b_log:
  2268. logging.info(str(check_result))
  2269. if check_result.get("money",1)==0:
  2270. return 0
  2271. if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2 and check_result.get("money",0)==2:
  2272. return _prob
  2273. else:
  2274. return 0
  2275. if check_result.get("time",1)==0:
  2276. return 0
  2277. return _prob
  def search_data_by_query(self,item,_query,confidence,retry_times=3,merge=False,table_name="document_tmp",table_index="document_tmp_index",sort_column="docid",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count],b_log=False):
      """Run one duplicate-candidate query and score the rows it returns.

      Only the first result page (at most 100 rows) is read. Every row goes
      through ``post_extract``. With ``merge`` all rows are returned as they
      are; otherwise rows other than ``item`` itself are scored with
      ``dumplicate_check`` and get ``"confidence"`` and ``"min_counts"``
      (the query's total hit count) attached.

      :param item: the document the candidates are compared with.
      :param _query: a query object, or a list of queries combined as ``should``.
      :param confidence: not read; the name is reused for the computed score.
      :param retry_times: number of attempts when the search or scoring raises.
      :return: list of row dicts; ``[]`` when every attempt failed.
      """
      for _ in range(retry_times):
          try:
              bool_query = BoolQuery(should_queries=_query) if isinstance(_query,list) else _query
              rows,next_token,total_count,is_all_succeed = self.ots_client.search(table_name,table_index,
                  SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(sort_column)]),limit=100,get_total_count=True),
                  ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
              list_data = []
              for row in getRow_ots(rows):
                  self.post_extract(row)
                  _docid = row.get(document_tmp_docid)
                  if merge:
                      list_data.append(row)
                      continue
                  if _docid==item.get(document_tmp_docid):
                      # The document itself is never its own candidate.
                      continue
                  confidence,day_dis = self.dumplicate_check(item,row,total_count,b_log=b_log)
                  row["confidence"] = confidence
                  row["min_counts"] = total_count
                  list_data.append(row)
              return list_data
          except Exception as e:
              traceback.print_exc()
      return []
  2311. def add_data_by_query(self,item,base_list,set_docid,_query,confidence,table_name,table_index,singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_save,document_tmp_status,document_tmp_product,document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count],b_log=False):
  # Runs search_data_by_query for one rule and appends to base_list the rows
  # whose confidence is above 0.1 and whose docid is not yet in set_docid.
  # base_list and set_docid are modified in place; nothing is returned.
  2312. list_dict = self.search_data_by_query(item,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,notlike_keys=notlike_keys,columns=columns,b_log=b_log)
  2313. for _dict in list_dict:
  2314. _docid = _dict.get(document_tmp_docid)
  # The confidence parameter is replaced by the score stored on each row.
  2315. confidence = _dict["confidence"]
  2316. if b_log:
  2317. log("confidence %d %.3f total_count %d"%(_docid,confidence,_dict.get('min_counts',0)))
  2318. if confidence>0.1:
  2319. if _docid not in set_docid:
  2320. base_list.append(_dict)
  2321. set_docid.add(_docid)
  # NOTE(review): indentation is lost in this listing. If this second add sits
  # at loop level, low-confidence docids are remembered too and can no longer
  # be added by a later rule — confirm against the original file.
  2322. set_docid.add(_docid)
  def appendRule(self,list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=False):
      '''
      Append one duplicate-detection rule built from ``_dict`` to ``list_rules``.

      Nothing is appended when any value of ``_dict`` has a ``getLength`` of 0.
      Otherwise ``_dict`` is extended in place with ``base_dict``, a query is
      generated from it with ``generate_dumplicate_query`` and the rule record
      is appended.

      :param list_rules: output list, mutated in place
      :param _dict: field name -> value combination of this rule (mutated)
      :param base_dict: shared filter merged into every rule
      :param must_not_dict: exclusions forwarded to the query generator
      :param confidence: confidence stored on the rule
      :param item: source document stored on the rule
      :param b_log: when true, log the merged rule dict
      '''
      if any(getLength(field_value)==0 for field_value in _dict.values()):
          return
      _dict.update(base_dict)
      if b_log:
          log("rule dict:"+str(_dict))
      list_rules.append({
          "confidence":confidence,
          "item":item,
          "query":self.generate_dumplicate_query(_dict,must_not_dict),
          "singleNum_keys":[],
          "contain_keys":[],
          "multiNum_keys":[],
          "_dict":_dict,
      })
  def translate_dumplicate_rules(self,status_from,item,get_all=False,to_log=False,day_dis=7,table_name ="document_tmp",table_index="document_tmp_index"):
      '''
      Build the ordered list of duplicate-detection rules for ``item``.

      Every rule is a combination of extracted fields registered through
      ``appendRule`` with a confidence of 100, 90, 85, 80 or 70, in exactly
      that order; ``appendRule`` skips a combination when one of its values
      is empty.

      Table selection: when ``table_name`` is "document_tmp" or "document",
      an item whose page time is not older than ``timeAdd(current_date,-7)``
      is searched in "document_tmp", an older one in "document". Any other
      ``table_name`` is kept as passed in.

      :param status_from: sequence; its first element is the status matched in "document_tmp"
      :param item: document dict being de-duplicated
      :param get_all: for the "document" table, use status range [201,450] instead of [201,300]
      :param to_log: forwarded to ``appendRule`` as ``b_log``
      :param day_dis: offset passed to ``timeAdd`` on both sides of the page time for the rules with confidence 100 and 90
      :param table_name: requested table name
      :param table_index: requested index name
      :return: tuple ``(list_rules, table_name, table_index)``
      '''
      docchannel,project_code,project_name,tenderee,agency,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product = self.get_dump_columns(item)
      current_date = getCurrent_date("%Y-%m-%d")
      if page_time=='':
          page_time = current_date
      # fixed +-7 window that replaces the day_dis window for rules below confidence 90
      fixed_window = {"page_time":[timeAdd(page_time,-7),timeAdd(page_time,7)]}

      must_not_dict = {"docid":item.get("docid")}
      doctitle_refine_name = "doctitle"
      if table_name in {"document_tmp","document"}:
          if page_time>=timeAdd(current_date,-7):
              table_name,table_index = "document_tmp","document_tmp_index"
              base_docchannel = item.get("docchannel",52)
              base_status = [status_from[0]]
              must_not_dict = {"save":0,"docid":item.get("docid")}
              doctitle_refine_name = "doctitle_refine"
          else:
              table_name,table_index = "document","document_index"
              base_docchannel = item["docchannel"]
              base_status = [201,450] if get_all else [201,300]
      else:
          base_docchannel = item["docchannel"]
          base_status = [201,300]
      base_dict = {
          "docchannel":base_docchannel,
          "status":base_status,
          "page_time":[timeAdd(page_time,-day_dis),timeAdd(page_time,day_dis)]
      }

      list_rules = []

      def add_rule(confidence,rule_dict):
          # all rules share the base filter, the exclusions and the log flag
          self.appendRule(list_rules,rule_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)

      add_rule(100,{document_tmp_fingerprint:fingerprint})

      for rule_dict in (
          {document_tmp_agency:agency,"win_tenderer":win_tenderer,"win_bid_price":win_bid_price},
          {document_tmp_agency:agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget},
          {document_tmp_agency:agency,"win_bid_price":win_bid_price,"bidding_budget":bidding_budget},
          # the key used to be the *value* of win_tenderer, so this rule
          # queried a field named after the bidder; it must be the field name
          {"win_tenderer":win_tenderer,"win_bid_price":win_bid_price,"bidding_budget":bidding_budget},
          {"tenderee":tenderee,"win_tenderer":win_tenderer,"win_bid_price":win_bid_price},
          {"tenderee":tenderee,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget},
          {"tenderee":tenderee,"win_bid_price":win_bid_price,"bidding_budget":bidding_budget},
          {"tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer},
          {"tenderee":tenderee,"agency":agency,"win_bid_price":win_bid_price},
          {"tenderee":tenderee,"agency":agency,"bidding_budget":bidding_budget},
          {"tenderee":tenderee,"project_codes":project_code},
          {"tenderee":tenderee,"win_bid_price":win_bid_price},
          {"agency":agency,"project_codes":project_code},
          {"win_tenderer":win_tenderer,"bidding_budget":bidding_budget},
          {"project_codes":project_code,"win_bid_price":win_bid_price},
          {"project_codes":project_code,"bidding_budget":bidding_budget},
          {"project_codes":project_code,doctitle_refine_name:doctitle_refine},
          {"tenderee":tenderee,"bidding_budget":bidding_budget},
          {"project_codes":project_code,"win_tenderer":win_tenderer},
      ):
          add_rule(90,rule_dict)

      # appendRule copies base_dict at call time, so only the rules added
      # from here on see the fixed window
      base_dict.update(fixed_window)

      add_rule(85,{"tenderee":tenderee,"agency":agency})
      add_rule(85,{"tenderee":tenderee,"project_name":project_name})
      if getLength(product)>0:
          # only the first comma-separated product takes part in the rule
          add_rule(85,{"tenderee":tenderee,"product":product.split(",")[0]})
      for rule_dict in (
          {"tenderee":tenderee,"win_tenderer":win_tenderer},
          {"tenderee":tenderee,doctitle_refine_name:doctitle_refine},
          {"agency":agency,"project_name":project_name},
          {"project_codes":project_code,"project_name":project_name},
          {"project_name":project_name,"win_tenderer":win_tenderer},
          {"project_name":project_name,"win_bid_price":win_bid_price},
          {"project_name":project_name,"bidding_budget":bidding_budget},
          {"project_name":project_name,doctitle_refine_name:doctitle_refine},
          {"win_tenderer":win_tenderer,"win_bid_price":win_bid_price},
          {"win_tenderer":win_tenderer,doctitle_refine_name:doctitle_refine},
          {"win_bid_price":win_bid_price,"bidding_budget":bidding_budget},
      ):
          add_rule(85,rule_dict)

      for rule_dict in (
          {"project_codes":project_code},
          {"win_bid_price":win_bid_price,doctitle_refine_name:doctitle_refine},
          {"bidding_budget":bidding_budget,doctitle_refine_name:doctitle_refine},
          {doctitle_refine_name:doctitle_refine},
      ):
          add_rule(80,rule_dict)

      add_rule(70,{"project_name":project_name})
      return list_rules,table_name,table_index
  def producer_flow_dumplicate(self,process_count,status_from,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type]):
      '''
      Refill ``queue_dumplicate`` from the "document_tmp" search index.

      First the docids reported on ``queue_dumplicate_processed`` are dropped
      from ``dumplicate_set``. If the work queue already held more than
      ``process_count//3`` items the method stops there. Otherwise rows whose
      status lies in the ``status_from`` range are fetched 100 at a time until
      there is no next page or ``process_count`` rows were read, and every row
      whose docid is not tracked yet is queued.

      :param process_count: upper bound on the number of rows read in one pass
      :param status_from: bounds unpacked into the status ``RangeQuery``
      :param columns: columns requested for each row
      '''
      pending = self.queue_dumplicate.qsize()
      log("dumplicate queue size %d"%(pending))
      # Drain the "processed" queue; any exception raised by the non-blocking
      # get (presumably the queue being empty) ends the drain.
      while True:
          try:
              done_docid = self.queue_dumplicate_processed.get(block=False)
              if done_docid in self.dumplicate_set:
                  self.dumplicate_set.remove(done_docid)
          except Exception as e:
              break
      if pending>process_count//3:
          return

      def enqueue(rows):
          # queue the rows not tracked yet and report how many rows the page had
          page = getRow_ots(rows)
          for row in page:
              row_docid = row.get(document_tmp_docid)
              if row_docid in self.dumplicate_set:
                  continue
              self.dumplicate_set.add(row_docid)
              self.queue_dumplicate.put(row)
          return len(page)

      bool_query = BoolQuery(must_queries=[
          RangeQuery(document_tmp_status,*status_from,True,True),
      ])
      rows,next_token,total_count,is_all_succeed = self.ots_client.search(
          "document_tmp","document_tmp_index",
          SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(document_update_document,SortOrder.DESC),FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
      log("flow_dumplicate producer total_count:%d"%total_count)
      fetched = enqueue(rows)
      while next_token and fetched<process_count:
          rows,next_token,total_count,is_all_succeed = self.ots_client.search(
              "document_tmp","document_tmp_index",
              SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
              ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
          fetched += enqueue(rows)
      # NOTE: an earlier, disabled revision trimmed dumplicate_set here to its
      # flow_process_count*2 largest docids.
  def comsumer_flow_dumplicate(self):
      '''
      Process ``queue_dumplicate`` with ``dumplicate_comsumer_handle`` through
      a ``MultiThreadHandler`` and run it.

      NOTE(review): the positional arguments ``None, 60, 1`` are presumably
      the result queue, the thread count and a per-thread setting - confirm
      against ``MultiThreadHandler``.
      '''
      handler = MultiThreadHandler(self.queue_dumplicate,self.dumplicate_comsumer_handle,None,60,1,ots_client=self.ots_client)
      handler.run()
  def flow_dumplicate(self,process_count=flow_process_count,status_from=flow_dumplicate_status_from):
      '''
      Run one producer pass of the duplicate flow.

      Only ``producer_flow_dumplicate`` is called; the consumer is not
      started from here.

      :param process_count: forwarded to the producer
      :param status_from: forwarded to the producer
      '''
      self.producer_flow_dumplicate(process_count,status_from)
  def flow_dumpcate_comsumer(self):
      '''
      Run the duplicate consumers in 6 worker processes and keep them alive.

      Each process runs a ``MultiThreadHandler`` over ``queue_dumplicate``
      with 12 as its thread-count argument. The method then loops forever,
      checking once per second and replacing every process that is no longer
      alive; it never returns.
      '''
      from multiprocessing import Process
      process_count = 6
      thread_count = 12

      def start_thread():
          # entry point of each worker process
          handler = MultiThreadHandler(self.queue_dumplicate,self.dumplicate_comsumer_handle,None,thread_count,1,need_stop=False,restart=True,timeout=600,ots_client=self.ots_client)
          handler.run()

      workers = [Process(target=start_thread) for _ in range(process_count)]
      for worker in workers:
          worker.start()
      while True:
          for slot,worker in enumerate(workers):
              if worker.is_alive():
                  continue
              replacement = Process(target=start_thread)
              workers[slot] = replacement
              replacement.start()
          time.sleep(1)
  def search_docs(self,list_docids,columns_to_get = [document_doctitle,document_tmp_save,document_bidway,document_status,document_page_time,document_info_source,document_fingerprint,document_docchannel,document_life_docchannel,document_area,document_province,document_city,document_district,document_tmp_sub_docs_json,document_industry,document_info_type,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_project_codes,document_product,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count,document_nlp_enterprise,document_nlp_enterprise_attachment,document_tenderee_code,document_agency_code,document_candidates],document_name="document"):
      '''
      Load documents by docid, one per fingerprint, sorted by page time.

      When ``document_name`` is "document" or "document_tmp" each docid is
      looked up as a ``Document_tmp`` first and as a ``Document`` if it is not
      found there; any other ``document_name`` is set as the ``table_name`` of
      a ``Document``. A found document is dropped when an earlier one had the
      same fingerprint (documents without a fingerprint all share ``None``).
      When the sub-docs JSON column is present it is decoded into
      ``sub_docs``; a decoding error is printed and otherwise ignored.

      :param list_docids: docids, each converted with ``int()``
      :param columns_to_get: columns to fetch for every document
      :param document_name: table selector, see above
      :return: list of found document objects, ascending by page time
      '''
      found_docs = []
      seen_fingerprints = set()
      use_default_tables = document_name in {"document","document_tmp"}
      for raw_docid in list_docids:
          docid = int(raw_docid)
          primary_key = {document_partitionkey:getPartitionKey(docid),
                         document_docid:docid}
          if use_default_tables:
              doc = Document_tmp(primary_key)
              exists = doc.fix_columns(self.ots_client,columns_to_get,True)
              if not exists:
                  doc = Document(primary_key)
                  exists = doc.fix_columns(self.ots_client,columns_to_get,True)
          else:
              doc = Document(primary_key)
              doc.table_name = document_name
              exists = doc.fix_columns(self.ots_client,columns_to_get,True)
          if not exists:
              continue
          fingerprint = doc.getProperties().get(document_fingerprint)
          if fingerprint in seen_fingerprints:
              continue
          seen_fingerprints.add(fingerprint)
          found_docs.append(doc)

      for doc in found_docs:
          try:
              sub_docs_json = doc.getProperties().get(document_tmp_sub_docs_json)
              if sub_docs_json is not None:
                  doc.setValue("sub_docs",json.loads(sub_docs_json),False)
          except Exception as e:
              traceback.print_exc()

      found_docs.sort(key=lambda x:x.getProperties().get(document_page_time,""))
      return found_docs
  def is_same_package(self,_dict1,_dict2):
      '''
      Tell whether two package dicts are compatible with being one package.

      Sub-project name, winning bidder, winning price and budget are compared
      in that order. A field only conflicts when both dicts carry a non-blank
      value and the values differ; blank is "" for the two text fields (a
      sub-project name of "Project" counts as blank) and 0 for the amounts.

      :param _dict1: first package dict
      :param _dict2: second package dict
      :return: False if any field conflicts, True otherwise
      '''
      def package_name(package):
          name = package.get(project_sub_project_name,"")
          return "" if name=="Project" else name

      def conflicting(first,second,blank):
          # more than one distinct non-blank value means the two sides disagree
          return len({value for value in (first,second) if value!=blank})>1

      comparisons = (
          (package_name(_dict1),package_name(_dict2),""),
          (_dict1.get(project_win_tenderer,""),_dict2.get(project_win_tenderer,""),""),
          (_dict1.get(project_win_bid_price,0),_dict2.get(project_win_bid_price,0),0),
          (_dict1.get(project_bidding_budget,0),_dict2.get(project_bidding_budget,0),0),
      )
      for first,second,blank in comparisons:
          if conflicting(first,second,blank):
              return False
      return True
  def getUpdate_dict(self,_dict):
      '''
      Return a copy of ``_dict`` without its empty entries.

      An entry is dropped when its value is ``None``, the empty string, or an
      int/float equal to 0 (which includes ``False``). All other values, for
      example empty lists, are kept.

      :param _dict: source mapping, left untouched
      :return: new dict holding the remaining entries in their original order
      '''
      def is_empty(value):
          if value is None:
              return True
          if isinstance(value,str) and value=="":
              return True
          return isinstance(value,(float,int)) and value==0

      return {key:value for key,value in _dict.items() if not is_empty(value)}
  def update_projects_by_document(self,docid,save,projects,document_name="document"):
      '''
      Merge the properties of document ``docid`` into ``projects`` in place.

      Steps, in order:
      1. common properties of the document (from ``generate_common_properties``)
         overwrite those of every project, skipping blank values and the
         accumulated/time columns; location columns are only copied when the
         document has a usable district;
      2. page time and docchannel of a project are replaced when the
         document's page time is not earlier, and the project's
         zhong_biao/zhao_biao page times are replaced by the document's when
         the document's compare smaller;
      3. docids, project codes, products, enterprises, candidates and
         dynamics are merged per project;
      4. package properties (from ``generate_packages_properties``) update the
         matching project, or are appended as a new project when none matches.

      :param docid: id of the document to merge
      :param save: 1 adds ``docid`` to each project's docids, anything else removes it
      :param projects: list of project dicts, mutated and possibly extended
      :param document_name: forwarded to ``search_docs``
      :return: None
      '''
      list_docs = self.search_docs([docid],document_name=document_name)
      docs = [_doc.getProperties() for _doc in list_docs]
      project_dict = generate_common_properties(docs)
      list_package_properties = generate_packages_properties(docs)

      def is_blank(value):
          return value is None or value=="" or value=="[]" or value=="未知"

      def package_fields(package):
          # (sub project name, winner, win price, budget) with "Project" normalised to ""
          sub_project_name = package.get(project_sub_project_name,"")
          if sub_project_name=="Project":
              sub_project_name = ""
          return (sub_project_name,
                  package.get(project_win_tenderer,""),
                  package.get(project_win_bid_price,0),
                  package.get(project_bidding_budget,0))

      dict_package = {}

      def index_package(package,fields):
          # register the package under its most specific lookup keys
          sub_project_name,win_tenderer,win_bid_price,bidding_budget = fields
          keys = []
          if win_tenderer!="" and bidding_budget!=0:
              keys.append("%s-%s-%s"%(sub_project_name,win_tenderer,bidding_budget))
          if win_tenderer!="" and win_bid_price!=0:
              keys.append("%s-%s-%s"%(sub_project_name,win_tenderer,win_bid_price))
          if not keys:
              # fall back to the two-part keys only when no three-part key exists
              if win_tenderer!="":
                  keys.append("%s-%s"%(sub_project_name,win_tenderer))
              if bidding_budget!=0:
                  keys.append("%s-%s"%(sub_project_name,bidding_budget))
          for key in keys:
              dict_package[key] = package

      # ---- 1. common properties
      replace_location = not is_blank(project_dict.get(document_district,""))
      location_keys = [document_district,document_city,document_province,document_area]
      accumulated_keys = (project_project_dynamics,project_product,project_project_codes,project_docids,project_candidates,project_zhong_biao_page_time,project_zhao_biao_page_time,project_page_time,project_docchannel)
      common_update = {}
      for k,v in project_dict.items():
          if not replace_location and k in location_keys:
              continue
          if is_blank(v):
              continue
          if k in accumulated_keys:
              continue
          common_update[k] = v
      for _proj in projects:
          _proj.update(common_update)

      # ---- 2. page times
      for _proj in projects:
          if _proj.get(project_page_time,"")<=project_dict.get(project_page_time,""):
              _proj[project_page_time] = project_dict.get(project_page_time,"")
              _proj[project_docchannel] = project_dict.get(project_docchannel,"")
          else:
              # the project is newer: the document's docchannel must not be
              # propagated any further (project_dict is reused in step 4)
              if project_docchannel in project_dict:
                  project_dict.pop(project_docchannel)
          if _proj.get(project_zhong_biao_page_time,"")>project_dict.get(project_zhong_biao_page_time,""):
              _proj[project_zhong_biao_page_time] = project_dict.get(project_zhong_biao_page_time,"")
          if _proj.get(project_zhao_biao_page_time,"")>project_dict.get(project_zhao_biao_page_time,""):
              _proj[project_zhao_biao_page_time] = project_dict.get(project_zhao_biao_page_time,"")

      # ---- 3. accumulated properties
      for _proj in projects:
          set_docid = set(_proj.get(project_docids,"").split(","))
          if save==1:
              set_docid.add(str(docid))
          else:
              set_docid.discard(str(docid))
          set_code = set(_proj.get(project_project_codes,"").split(","))
          set_product = set(_proj.get(project_product,"").split(","))
          set_nlp_enterprise = set()
          set_nlp_enterprise_attachment = set()
          set_candidates = set()
          # bound before the try so a decoding failure can neither leave the
          # name undefined nor reuse the previous project's candidates
          list_candidates = []
          try:
              set_nlp_enterprise |= set(json.loads(_proj.get(project_nlp_enterprise,"[]")))
              set_nlp_enterprise_attachment |= set(json.loads(_proj.get(project_nlp_enterprise_attachment,"[]")))
              list_candidates = json.loads(project_dict.get(project_candidates,"[]"))
              for item in list_candidates:
                  if item.get("name") is not None and item.get("name") not in set_candidates:
                      set_candidates.add(item.get("name"))
              set_code = set_code | set(project_dict.get(project_project_codes,"").split(","))
              set_product = set_product | set(project_dict.get(project_product,"").split(","))
              set_nlp_enterprise |= set(json.loads(project_dict.get(project_nlp_enterprise,"[]")))
              set_nlp_enterprise_attachment |= set(json.loads(project_dict.get(project_nlp_enterprise_attachment,"[]")))
              for item in json.loads(_proj.get(project_candidates,"[]")):
                  if item.get("name") is not None and item.get("name") not in set_candidates:
                      set_candidates.add(item.get("name"))
                      list_candidates.append(item)
          except Exception as e:
              # best effort: keep whatever was merged so far, but do not hide the error
              traceback.print_exc()

          # "".split(",") yields [""], so blanks are filtered before joining
          # and before counting
          docids = [a for a in set_docid if a!=""]
          append_dict = {}
          append_dict[project_docids] = ",".join(docids)
          append_dict[project_docid_number] = len(docids)
          append_dict[project_project_codes] = ",".join([a for a in set_code if a!=""])
          append_dict[project_product] = ",".join([a for a in set_product if a!=""])
          append_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
          append_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)
          append_dict[project_candidates] = json.dumps(list_candidates,ensure_ascii=False)

          # dynamics are keyed by docid; the document's entry wins over the project's
          dict_dynamic = {}
          for source in (_proj,project_dict):
              for _dy in json.loads(source.get(project_project_dynamics,"[]")):
                  dict_dynamic[_dy.get("docid")] = _dy
          list_dynamics = sorted(dict_dynamic.values(),key=lambda x:x.get(document_page_time,""))
          append_dict[project_project_dynamics] = json.dumps(list_dynamics[:100],ensure_ascii=False)
          _proj.update(append_dict)

      # ---- 4. package (private) properties
      for _pp in projects:
          index_package(_pp,package_fields(_pp))

      if len(projects)==1 and len(list_package_properties)==1:
          _pp = list_package_properties[0]
          pp = projects[0]
          ud = self.getUpdate_dict(_pp)
          self.set_project_uuid(ud,pp.get("uuid"))
          # NOTE(review): unlike the multi-package path below, the raw package
          # (blank values included) is applied here rather than ``ud`` - kept
          # as found, confirm that this is intended
          pp.update(_pp)
      else:
          for _pp in list_package_properties:
              fields = package_fields(_pp)
              sub_project_name,win_tenderer,win_bid_price,bidding_budget = fields
              candidate_keys = []
              if win_tenderer!="" and bidding_budget!=0:
                  candidate_keys.append("%s-%s-%s"%(sub_project_name,win_tenderer,bidding_budget))
              if win_tenderer!="" and win_bid_price!=0:
                  candidate_keys.append("%s-%s-%s"%(sub_project_name,win_tenderer,win_bid_price))
              if win_tenderer!="":
                  candidate_keys.append("%s-%s"%(sub_project_name,win_tenderer))
              if bidding_budget!=0:
                  candidate_keys.append("%s-%s"%(sub_project_name,bidding_budget))

              matched = None
              for key in candidate_keys:
                  if key in dict_package and self.is_same_package(_pp,dict_package[key]):
                      matched = dict_package[key]
                      break
              if matched is not None:
                  ud = self.getUpdate_dict(_pp)
                  self.set_project_uuid(ud,matched.get("uuid"))
                  matched.update(ud)
                  continue

              # no existing project describes this package: add it as a new one
              _pp.update(project_dict)
              projects.append(_pp)
              index_package(_pp,fields)
  def delete_projects_by_document(self,docid):
      '''
      Build the project JSON that results from removing document ``docid``.

      Every project found by ``search_projects_with_document`` is marked for
      deletion. The docids of those projects, minus ``docid`` itself, are
      loaded again, turned back into projects and de-duplicated; the deletion
      markers are appended after the rebuilt projects.

      :param docid: id of the document being removed
      :return: result of ``to_project_json`` for rebuilt projects plus deletion markers
      '''
      remaining_docids = set()
      delete_markers = []
      for project in self.search_projects_with_document([docid]):
          docids_text = project.get(project_docids,"")
          print(project.get(project_uuid))
          delete_markers.append({"delete_uuid":project.get(project_uuid),
                                 "to_delete":True})
          if docids_text!="":
              remaining_docids |= set(docids_text.split(","))
      remaining_docids.discard(str(docid))

      rebuilt_projects = []
      if remaining_docids:
          list_docs = self.search_docs(list(remaining_docids))
          print("search_docs(list_docid)")
          rebuilt_projects = self.generate_projects_from_document(list_docs)
          print("generate_projects_from_document")
          rebuilt_projects = dumplicate_projects(rebuilt_projects,max_count=20)
          print("dumplicate_projects")
      rebuilt_projects.extend(delete_markers)
      return to_project_json(rebuilt_projects)
  def delete_doc_handle(self,_dict,result_queue):
      '''
      Handle one "delete document" message.

      The message body is JSON carrying a ``docid``. The projects affected by
      removing that document are computed with ``delete_projects_by_document``
      and stored as a ``Project_process`` row; the message is acknowledged
      when the row was written or writing raised. A message without a docid
      is acknowledged and skipped. Any other failure is printed, the message
      is acknowledged if its id is already known, and nothing is re-raised.

      :param _dict: dict with "frame" (object exposing ``headers`` and ``body``) and "conn"
      :param result_queue: unused here
      '''
      # bound up front so the outer handler can use them after an early failure
      conn = None
      message_id = None
      docid = None
      try:
          headers = _dict.get("frame")
          conn = _dict.get("conn")
          if headers is None:
              log("has not headers")
              return
          message_id = headers.headers["message-id"]
          item = json.loads(headers.body)
          docid = item.get("docid")
          log("==========start delete docid:%s"%(str(docid)))
          if docid is None:
              # nothing to delete: acknowledge once and stop instead of
              # running the whole pipeline with docid=None
              ackMsg(conn,message_id)
              return
          delete_result = self.delete_projects_by_document(docid)
          log("1")
          _uuid = uuid4().hex
          _d = {PROJECT_PROCESS_UUID:_uuid,
                PROJECT_PROCESS_CRTIME:1,
                PROJECT_PROCESS_PROJECTS:delete_result}
          _pp = Project_process(_d)
          log("2")
          try:
              if _pp.update_row(self.ots_client):
                  ackMsg(conn,message_id)
          except Exception as e:
              # the message is acknowledged even when the write raised
              traceback.print_exc()
              ackMsg(conn,message_id)
          log("3")
          # The result is written to the project_process table; it is no
          # longer sent to a result queue.
          log("==========end delete docid:%s"%(str(docid)))
      except Exception as e:
          traceback.print_exc()
          if message_id is not None:
              ackMsg(conn,message_id)
          log("==========end delete docid:%s"%(str(docid)))
  def generate_common_properties(self,list_docs):
      '''
      Build the project-level properties shared by a group of documents.

      Scalar attributes are chosen by majority vote over ``list_docs``, the
      four location fields are copied from a single document, and the
      per-document information is aggregated into the dynamics list, the
      visible docid list and the merged code/product lists.

      :param list_docs: document objects exposing ``getProperties()``
      :return: dict of project properties
      '''
      voted_keys = [document_bidway,document_industry,document_info_type,document_info_source,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count]
      location_keys = [document_district,document_city,document_province,document_area]

      # Selection by counting: tally every non-empty value per voted key.
      choose_dict = {}
      for _key in voted_keys:
          for _doc in list_docs:
              _value = _doc.getProperties().get(_key,"")
              if _value!="":
                  _counts = choose_dict.setdefault(_key,{})
                  _counts[_value] = _counts.get(_value,0)+1

      project_dict = {}

      # Location: walk from the finest level (district) to the coarsest (area)
      # and stop at the first level where at least one document has a usable
      # value.  The location fields are then copied from the first document
      # that carries the most frequent value at that level, so the four
      # fields always come from one consistent document.
      _source_props = None
      for _key in location_keys:
          area_dict = {}
          for _doc in list_docs:
              loc = _doc.getProperties().get(_key,"未知")
              if loc not in ('全国','未知',"0"):
                  area_dict[loc] = area_dict.get(loc,0)+1
          if area_dict:
              # max() returns the first maximal key, i.e. first-seen wins ties.
              best_loc = max(area_dict,key=area_dict.get)
              for _doc in list_docs:
                  _props = _doc.getProperties()
                  if _props.get(_key,"未知")==best_loc:
                      _source_props = _props
                      break
              break
      if _source_props is None and len(list_docs)>0:
          # No level had a usable value: fall back to the first document.
          _source_props = list_docs[0].getProperties()
      if _source_props is not None:
          for _key in location_keys:
              project_dict[_key] = _source_props.get(_key)

      # Majority vote; a placeholder winner gives way to the runner-up.
      for _key,_counts in choose_dict.items():
          ranked = sorted(_counts.items(),key=lambda kv:kv[1],reverse=True)
          _v = ranked[0][0]
          if _v in ('全国','未知') and len(ranked)>1:
              _v = ranked[1][0]
          project_dict[_key] = _v

      list_dynamics = []
      docid_number = 0
      visuable_docids = []
      zhao_biao_page_time = ""
      zhong_biao_page_time = ""
      list_codes = []
      list_product = []
      p_page_time = ""
      remove_docids = set()
      for _doc in list_docs:
          _props = _doc.getProperties()
          table_name = _props.get("table_name")
          status = _props.get(document_status,0)
          _save = _props.get(document_tmp_save,1)
          doctitle = _props.get(document_doctitle,"")
          page_time = _props.get(document_page_time,"")
          _docid = _props.get(document_docid)
          _bidway = _props.get(document_bidway,"")
          _docchannel = _props.get(document_life_docchannel,0)
          project_codes = _props.get(document_project_codes)
          product = _props.get(document_product)
          sub_docs = _props.get("sub_docs",[])
          is_multipack = len(sub_docs)>1
          extract_count = _props.get(document_tmp_extract_count,0)

          if product is not None:
              list_product.extend(product.split(","))
          if project_codes is not None:
              list_codes.extend(project_codes.split(","))

          # The first document seen supplies each of the page times.
          if p_page_time=="":
              p_page_time = page_time
          if zhao_biao_page_time=="" and _docchannel in (51,52,102,103,114):
              zhao_biao_page_time = page_time
          if zhong_biao_page_time=="" and _docchannel in (101,118,119,120):
              zhong_biao_page_time = page_time

          # Rows of the "document" table are visible when status is in
          # [201,300]; rows of any other table are visible when save==1.
          if table_name=="document":
              is_visuable = status>=201 and status<=300
          else:
              is_visuable = _save==1
          if is_visuable:
              docid_number += 1
              visuable_docids.append(str(_docid))
          else:
              remove_docids.add(str(_docid))

          list_dynamics.append({document_docid:_docid,
                                document_doctitle:doctitle,
                                document_docchannel:_docchannel,
                                document_bidway:_bidway,
                                document_page_time:page_time,
                                document_status:201 if is_visuable else 401,
                                "is_multipack":is_multipack,
                                document_tmp_extract_count:extract_count
                                })

      project_dict[project_project_dynamics] = json.dumps(list_dynamics,ensure_ascii=False)
      project_dict[project_docid_number] = docid_number
      # A docid that was rejected at least once is never reported as visible.
      project_dict[project_docids] = ",".join(set(visuable_docids)-remove_docids)
      if zhao_biao_page_time!="":
          project_dict[project_zhao_biao_page_time] = zhao_biao_page_time
      if zhong_biao_page_time!="":
          project_dict[project_zhong_biao_page_time] = zhong_biao_page_time
      project_dict[project_project_codes] = ",".join(set(list_codes))
      project_dict[project_page_time] = p_page_time
      project_dict[project_product] = ",".join(set(list_product))
      return project_dict
  def generate_packages_properties(self,list_docs):
      '''
      Collect the distinct package (sub-project) records of the documents.

      Two records are considered the same package when their sub-project
      code, sub-project name, winning tenderer and winning bid price all
      render to the same text; only the first occurrence is kept.

      :param list_docs: document objects exposing ``getProperties()``
      :return: list of the kept ``sub_docs`` records, in first-seen order
      '''
      packages = []
      seen_signatures = set()
      for _doc in list_docs:
          sub_docs = _doc.getProperties().get("sub_docs")
          if sub_docs is None:
              continue
          for package in sub_docs:
              signature = "%s-%s-%s-%s"%(
                  package.get(project_sub_project_code,""),
                  package.get(project_sub_project_name,""),
                  package.get(project_win_tenderer,""),
                  package.get(project_win_bid_price,""),
              )
              if signature not in seen_signatures:
                  seen_signatures.add(signature)
                  packages.append(package)
      return packages
  def generate_projects_from_document(self,list_docs):
      '''
      Generate projects from announcement documents.

      :param list_docs: document objects exposing ``getProperties()``
      :return: whatever ``generate_projects`` returns for the documents' property dicts
      '''
      doc_properties = [_doc.getProperties() for _doc in list_docs]
      return generate_projects(doc_properties)
  def search_projects_with_document(self,list_docids,project_table,project_table_index):
      '''
      Look up the projects that reference any of the given docids.

      :param list_docids: docids to search for in the ``docids`` column
      :param project_table: name of the project table to query
      :param project_table_index: name of the search index on that table
      :return: rows returned by ``getDocument`` (the query asks for at most 20, sorted by page_time)
      '''
      log("search_projects_with_document %s"%str(list_docids))
      # Any one matching docid is enough, hence a pure "should" query.
      docid_terms = [TermQuery("docids",_docid) for _docid in list_docids]
      search_params = {"query":BoolQuery(should_queries=docid_terms),"limit":20}
      wanted_columns = [
          project_uuid,
          project_docids,
          project_zhao_biao_page_time,
          project_zhong_biao_page_time,
          project_page_time,
          project_area,
          project_province,
          project_city,
          project_district,
          project_info_type,
          project_industry,
          project_qcodes,
          project_project_name,
          project_project_code,
          project_project_codes,
          project_project_addr,
          project_tenderee,
          project_tenderee_addr,
          project_tenderee_phone,
          project_tenderee_contact,
          project_agency,
          project_agency_phone,
          project_agency_contact,
          project_sub_project_name,
          project_sub_project_code,
          project_bidding_budget,
          project_win_tenderer,
          project_win_bid_price,
          project_win_tenderer_manager,
          project_win_tenderer_phone,
          project_second_tenderer,
          project_second_bid_price,
          project_second_tenderer_manager,
          project_second_tenderer_phone,
          project_third_tenderer,
          project_third_bid_price,
          project_third_tenderer_manager,
          project_third_tenderer_phone,
          project_procurement_system,
          project_bidway,
          project_dup_data,
          project_docid_number,
          project_project_dynamics,
          project_product,
          project_moneysource,
          project_service_time,
          project_time_bidclose,
          project_time_bidopen,
          project_time_bidstart,
          project_time_commencement,
          project_time_completion,
          project_time_earnest_money_start,
          project_time_earnest_money_end,
          project_time_get_file_end,
          project_time_get_file_start,
          project_time_publicity_end,
          project_time_publicity_start,
          project_time_registration_end,
          project_time_registration_start,
          project_time_release,
          project_dup_docid,
          project_info_source,
          project_nlp_enterprise,
          project_nlp_enterprise_attachment,
          project_tenderee_code,
          project_agency_code,
          project_candidates,
          project_docchannel,
      ]
      return getDocument(search_params,self.ots_client,wanted_columns,
                         sort="page_time",table_name=project_table,table_index=project_table_index)
  def set_project_uuid(self,_dict,_uuid):
      '''
      Record ``_uuid`` under the ``"uuid"`` key of ``_dict`` (in place).

      A ``None`` or empty uuid is ignored.  When the dict already has a
      uuid, the new one is appended after a comma.

      :param _dict: project dict to update
      :param _uuid: uuid to add
      :return: None
      '''
      if _uuid is None or _uuid=="":
          return
      _dict["uuid"] = "%s,%s"%(_dict["uuid"],_uuid) if "uuid" in _dict else _uuid
  def getMerge_rules(self,page_time,project_codes,project_name,tenderee,agency,product,sub_project_name,bidding_budget,win_tenderer,win_bid_price,province,city,district):
      '''
      Build the query rules used to search for projects that may be merged.

      Every rule is a pair ``[must_queries, weight]``: ``must_queries`` is a
      list of query objects that must all match, ``weight`` is an int in
      0..2.  Rules are returned in a fixed order; only the first 20 project
      codes and the first 20 products take part in the code/product queries.

      :param page_time: not used by this method
      :param project_codes: comma separated project codes
      :param product: comma separated products
      :param bidding_budget: value accepted by ``float()``
      :param win_bid_price: value accepted by ``float()``
      :return: list of ``[must_queries, weight]`` pairs
      '''
      rules = []

      def _add(must_queries,weight):
          rules.append([must_queries,weight])

      codes = [_c for _c in project_codes.split(",") if _c!='']
      products = [_p for _p in product.split(",") if _p!='']
      # The three "any of" queries are shared by every rule that uses them.
      q_codes = BoolQuery(should_queries=[MatchQuery(project_project_codes,_c) for _c in codes[:20]])
      q_code = BoolQuery(should_queries=[MatchQuery(project_project_code,_c) for _c in codes[:20]])
      q_product = BoolQuery(should_queries=[MatchQuery(project_product,_p) for _p in products[:20]])

      # Area query: any known province / city / district may match.
      area_terms = [TermQuery(_column,_place)
                    for _column,_place in ((project_province,province),
                                           (project_city,city),
                                           (project_district,district))
                    if _place not in ("","全国","未知") and _place is not None]
      q_area = BoolQuery(should_queries=area_terms) if area_terms else None

      has_tenderee = tenderee!=""
      has_agency = agency!=""
      has_winner = win_tenderer!=""
      has_codes = bool(codes)
      has_products = bool(products)

      # --- tenderee based rules ---
      if has_tenderee and has_codes:
          _add([TermQuery(project_tenderee,tenderee),q_codes],2)
          _add([TermQuery(project_tenderee,tenderee),q_code],2)
      if has_tenderee and has_products:
          _add([TermQuery(project_tenderee,tenderee),q_product],1)
      if has_tenderee and project_name!="":
          _add([TermQuery(project_tenderee,tenderee),TermQuery(project_project_name,project_name)],2)
      if has_tenderee and has_agency:
          _add([TermQuery(project_tenderee,tenderee),TermQuery(project_agency,agency)],0)
      if has_tenderee and float(bidding_budget)>0:
          _add([TermQuery(project_tenderee,tenderee),TermQuery(project_bidding_budget,bidding_budget)],2)
      # --- money only ---
      if float(bidding_budget)>0 and float(win_bid_price)>0:
          _add([TermQuery(project_bidding_budget,bidding_budget),TermQuery(project_win_bid_price,win_bid_price)],2)
      if has_tenderee and has_winner:
          _add([TermQuery(project_tenderee,tenderee),TermQuery(project_win_tenderer,win_tenderer)],2)
      # --- agency based rules ---
      if has_agency and has_winner:
          _add([TermQuery(project_agency,agency),TermQuery(project_win_tenderer,win_tenderer)],0)
      if has_agency and has_products:
          _add([TermQuery(project_agency,agency),q_product],1)
      # --- winning tenderer based rules ---
      if has_winner and has_codes:
          _add([TermQuery(project_win_tenderer,win_tenderer),q_codes],2)
          _add([TermQuery(project_win_tenderer,win_tenderer),q_code],2)
      if has_winner and sub_project_name!="":
          _add([TermQuery(project_win_tenderer,win_tenderer),TermQuery(project_sub_project_name,sub_project_name)],2)
      if has_winner and float(win_bid_price)>0:
          _add([TermQuery(project_win_tenderer,win_tenderer),TermQuery(project_win_bid_price,win_bid_price)],2)
      if has_winner and float(bidding_budget)>0:
          _add([TermQuery(project_win_tenderer,win_tenderer),TermQuery(project_bidding_budget,bidding_budget)],2)
      # --- code / name / product only ---
      if has_codes and has_products:
          _add([q_codes,q_product],2)
      if has_codes:
          _add([q_codes],2)
          _add([q_code],1)
      if project_name!="" and project_name is not None:
          _add([TermQuery(project_project_name,project_name)],1)
          _add([MatchPhraseQuery(project_doctitles,project_name)],1)
      if has_products and q_area is not None:
          _add([q_area,q_product],0)
      return rules
  def merge_projects(self,list_projects,b_log=False,check_columns=[project_uuid,project_zhao_biao_page_time,project_zhong_biao_page_time,project_page_time,project_project_name,project_project_code,project_project_codes,project_tenderee,project_agency,project_sub_project_name,project_sub_project_code,project_bidding_budget,project_win_tenderer,project_win_bid_price,project_project_dynamics,project_product,project_time_bidclose,project_time_bidopen,project_time_bidstart,project_time_commencement,project_time_completion,project_time_earnest_money_start,project_time_earnest_money_end,project_time_get_file_end,project_time_get_file_start,project_time_publicity_end,project_time_publicity_start,project_time_registration_end,project_time_registration_start,project_time_release,project_nlp_enterprise,project_nlp_enterprise_attachment,project_docids,project_area,project_province,project_city,project_district,project_info_type,project_industry,project_qcodes,project_project_addr,project_tenderee_addr,project_agency_phone,project_agency_contact,project_tenderee_phone,project_tenderee_contact,project_win_tenderer_manager,project_win_tenderer_phone,project_second_tenderer,project_second_bid_price,project_second_tenderer_manager,project_second_tenderer_phone,project_third_tenderer,project_third_bid_price,project_third_tenderer_manager,project_third_tenderer_phone,project_procurement_system,project_bidway,project_dup_data,project_docid_number,project_moneysource,project_service_time,project_dup_docid,project_info_source],project_table="project2",project_table_index="project2_index_formerge"):
      '''
      Merge the given projects with matching projects stored in the project table.

      For each of the first 30 projects the rules from ``getMerge_rules`` are
      sent to the search index two at a time, restricted to a page_time
      window around the project.  Every candidate that passes
      ``check_merge_rule`` is merged into the project through
      ``update_projects_by_project``; the projects are modified in place.

      :param list_projects: list of project dicts
      :param b_log: emit verbose logs when True
      :param check_columns: columns fetched for every candidate (read only, never mutated here)
      :param project_table: table searched for candidates
      :param project_table_index: search index of that table
      :return: ``list_projects``
      :raises AssertionError: on any failure; the original error is printed and chained as the cause
      '''
      try:
          whole_time_start = time.time()
          # uuids already carried by the incoming projects are excluded from the search.
          set_uuid = set()
          for _proj in list_projects:
              _uuid = _proj.get("uuid")
              if _uuid is not None:
                  set_uuid |= set(_uuid.split(","))
          projects_merge_count = 0
          projects_check_rule_time = 0
          projects_update_time = 0
          projects_query_time = 0
          projects_prepare_time = 0
          search_table = project_table
          search_table_index = project_table_index
          docids = ""
          # Bound up front: the summary log below reads it even when there is no project to process.
          list_must_query = []
          # Number of rules combined into one search request.
          _step = 2
          for _proj in list_projects[:30]:
              must_not_q = [TermQuery("uuid",_uuid) for _uuid in set_uuid]
              docids = _proj.get(project_docids,"")
              page_time = _proj.get(project_page_time,"")
              project_codes = _proj.get(project_project_codes,"")
              project_name = _proj.get(project_project_name,"")
              tenderee = _proj.get(project_tenderee,"")
              agency = _proj.get(project_agency,"")
              product = _proj.get(project_product,"")
              sub_project_name = _proj.get(project_sub_project_name,"")
              bidding_budget = _proj.get(project_bidding_budget,-1)
              win_tenderer = _proj.get(project_win_tenderer,"")
              win_bid_price = _proj.get(project_win_bid_price,-1)
              list_dynamic = json.loads(_proj.get(project_project_dynamics,"[]"))
              # Acceptance ("验收") announcements get a wider time window.
              is_yanshou = any(
                  re.search("验收公[示告]|验收结果",_d.get("doctitle","")) is not None or _d.get("docchannel")==122
                  for _d in list_dynamic
              )
              province = _proj.get(project_province,"")
              city = _proj.get(project_city,"")
              district = _proj.get(project_district,"")
              if is_yanshou:
                  page_time_less = timeAdd(page_time,-850)
                  page_time_greater = timeAdd(page_time,820)
              else:
                  page_time_less = timeAdd(page_time,-450)
                  page_time_greater = timeAdd(page_time,420)

              _time = time.time()
              list_must_query = self.getMerge_rules(page_time,project_codes,project_name,tenderee,agency,product,sub_project_name,bidding_budget,win_tenderer,win_bid_price,province,city,district)
              list_merge_data = []
              # sub_project_name is deliberately not a mandatory condition;
              # only the page_time window is applied to every rule.
              must_queries = []
              if page_time_less is not None and page_time_greater is not None:
                  must_queries = [RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
              projects_prepare_time += time.time()-_time

              _time = time.time()
              # NOTE(review): the original code tried to alternate ASC/DESC per
              # batch, but its two consecutive `if`s always ended on DESC.  The
              # effective behaviour (always DESC) is kept explicitly here.
              sort_type = SortOrder.DESC
              _begin = 0
              while _begin<len(list_must_query):
                  list_should_q = []
                  _limit = 10
                  for must_q,_count in list_must_query[_begin:_begin+_step]:
                      must_q1 = list(must_q)
                      must_q1.extend(must_queries)
                      list_should_q.append(BoolQuery(must_queries=must_q1))
                      # Rules with a higher weight may return more candidates.
                      _limit += _count*5
                  _query = BoolQuery(
                      should_queries=list_should_q,
                      must_not_queries=must_not_q[:100]
                  )
                  rows,next_token,total_count,is_all_succeed = self.ots_client_merge.search(search_table,search_table_index,
                                                                                            SearchQuery(_query,sort=Sort(sorters=[FieldSort(project_page_time,sort_type)]),limit=_limit),
                                                                                            columns_to_get=ColumnsToGet(column_names=check_columns,return_type=ColumnReturnType.SPECIFIED))
                  list_data = getRow_ots(rows)
                  list_merge_data.extend(list_data)
                  # Do not fetch the same candidate again in a later batch.
                  for _data in list_data:
                      must_not_q.append(TermQuery(project_uuid,_data.get(project_uuid)))
                  _begin += _step
              projects_query_time += time.time()-_time

              projects_merge_count = len(list_merge_data)
              # Two stable sorts: ordered by bidding budget, ties by page_time.
              list_merge_data.sort(key=lambda x:x.get(project_page_time,""))
              list_merge_data.sort(key=lambda x:x.get(project_bidding_budget,-1))
              if b_log:
                  log("list_merge_data count:%d"%(len(list_merge_data)))

              list_check_data = []
              for _data in list_merge_data:
                  _time = time.time()
                  _check,_prob = check_merge_rule(_proj,_data,b_log=b_log,return_prob=True)
                  if b_log:
                      log(str(_check))
                  projects_check_rule_time += time.time()-_time
                  if _check:
                      list_check_data.append([_data,_prob])
              # Most probable candidates are merged first.
              list_check_data.sort(key=lambda x:x[1],reverse=True)
              for _data,_ in list_check_data:
                  # Re-check: _proj may have changed through an earlier merge in this loop.
                  _time = time.time()
                  _check,_prob = check_merge_rule(_proj,_data,b_log=b_log,return_prob=True)
                  projects_check_rule_time += time.time()-_time
                  _time = time.time()
                  if _check:
                      update_projects_by_project(_data,[_proj])
                      projects_update_time += time.time()-_time
          whole_time = time.time()-whole_time_start
          log("%s %s merge_project whole_time:%.3f projects_prepare_time:%.3f projects_query_time:%.3f projects_merge_count:%d rules%d projects_check_rule_time %.3f projects_update_time %.3f"%(search_table,docids,whole_time,projects_prepare_time,projects_query_time,projects_merge_count,len(list_must_query),projects_check_rule_time,projects_update_time))
          return list_projects
      except Exception as e:
          traceback.print_exc()
          # Raised explicitly (an `assert` statement is removed under `python -O`);
          # the exception type seen by callers stays AssertionError.
          raise AssertionError("merge_projects failed") from e
  def dumplicate_document_in_merge(self,list_projects,dup_docid,_docid,_docchannel,document_name="document",b_log=False):
      '''
      De-duplicate documents inside the projects while merging.

      For every project that contains ``_docid``, the other visible
      documents of the same channel are compared with it through
      ``check_dumplicate_rule``.  Of each duplicate group the document with
      the highest ``extract_count`` (lowest docid on ties) is kept; the
      others are marked with status 401 in the project dynamics and removed
      from the project's docids.  The projects are modified in place.

      :param list_projects: list of project dicts
      :param dup_docid: docids already known to be duplicates
      :param _docid: docid of the document being processed
      :param _docchannel: channel of that document; only same-channel documents are compared
      :param document_name: table the candidate documents are read from
      :param b_log: passed to ``check_dumplicate_rule``
      :return: ``(best_docid, list_of_duplicate_docids)``, or ``(None, None)``
               as soon as a project is left without any docid
      '''
      # All docid sets in this method hold strings; every membership test
      # below therefore converts with str() first.
      dup_docid = {str(a) for a in dup_docid}
      set_dup_total = set()
      docid_item = self.get_attrs_before_dump(_docid)
      best_docid = None
      for _proj in list_projects:
          try:
              docids = _proj.get(project_docids,"")
              set_docids = {a for a in docids.split(",") if a!=""}
              list_dynamics = json.loads(_proj.get(project_project_dynamics,"[]"))
              set_dup_docid = set()
              list_dup_result = [(_docid,docid_item.get("extract_count"))]
              log("=========%s---%s"%(str(set_docids),str(_docid)))
              if str(_docid) in set_docids:
                  # Candidates: visible, still in the project, not already a
                  # known duplicate, same channel, and not the document itself.
                  list_to_dup_docid = []
                  for _d in list_dynamics:
                      docid = _d.get(document_docid)
                      if _d.get(document_status,0)>=401:
                          continue
                      if str(docid) not in set_docids:
                          continue
                      if str(docid) in dup_docid:
                          continue
                      if _d.get(document_docchannel,0)!=_docchannel:
                          continue
                      if str(docid)==str(_docid):
                          continue
                      list_to_dup_docid.append(_d)
                  for _d in list_to_dup_docid:
                      docid = _d.get(document_docid)
                      _item = self.get_attrs_before_dump(docid)
                      _prob = check_dumplicate_rule(docid_item,_item,5,b_log=b_log)
                      log("dumplicate_document_in_merge %s-%s prob %.2f"%(str(_docid),str(docid),_prob))
                      if _prob>0.4:
                          docid = int(docid)
                          _doc = Document({"partitionkey":docid%500+1,
                                           "docid":docid,
                                           })
                          _doc.table_name = document_name
                          if _doc.fix_columns(self.ots_client,[document_page_time,document_update_document],True):
                              # Documents flagged update_document=="true" are not treated as duplicates.
                              if _doc.getProperties().get(document_update_document,"")!="true":
                                  list_dup_result.append((docid,_item.get("extract_count")))
                  # Two stable sorts: highest extract_count first, lowest docid on ties.
                  list_dup_result.sort(key=lambda x:x[0])
                  list_dup_result.sort(key=lambda x:x[1],reverse=True)
                  best_docid1 = list_dup_result[0][0]
                  if str(best_docid1) not in set_dup_total:
                      best_docid = best_docid1
                  for _d in list_dup_result[1:]:
                      set_dup_docid.add(str(_d[0]))
                  for _dynamic in list_dynamics:
                      if str(_dynamic.get(document_docid)) in set_dup_docid:
                          _dynamic[document_status] = 401
              set_docids = set_docids-set_dup_docid-dup_docid
              set_dup_total |= set_dup_docid
              if len(set_docids)==0:
                  print(set_dup_docid,dup_docid)
                  log("projects set_docids length is zero %s"%(docids))
                  return None,None
              else:
                  _proj[project_docids] = ",".join(list(set_docids))
                  _proj[project_project_dynamics] = json.dumps(list_dynamics,ensure_ascii=False)
                  _proj[project_docid_number] = len(set_docids)
                  _proj[project_dup_docid] = ",".join(list(set_dup_docid))
          except Exception as e:
              # Best effort: a failure on one project must not stop the others.
              traceback.print_exc()
      # A document that ended up as a duplicate in any project cannot be the best one.
      if best_docid is not None and str(best_docid) in set_dup_total:
          best_docid = None
      return best_docid,list(set_dup_total)
  def merge_document_real(self,item,dup_docid,save,document_name="document",project_table="project2",project_table_index="project2_index_formerge",b_log=False):
      '''
      Merge one document into the projects in real time.

      Projects already referencing the document (or its duplicates) are
      looked up; when none exists, new projects are generated from the
      documents, otherwise the found projects are updated.  The projects are
      then de-duplicated, merged with stored projects and finally cleaned of
      duplicate documents.

      :param item: dict of the document being processed
      :param dup_docid: docids of the duplicate documents; when ``save`` is 0
                        the document's own docid is inserted at its front (in place)
      :param save: 0 when the document itself is a duplicate
      :param document_name: document table name
      :param project_table: project table name
      :param project_table_index: search index of the project table
      :param b_log: emit verbose logs when True
      :return: ``(project_json, best_docid, list_merge_dump)``
      :raises RuntimeError: on any failure
      '''
      try:
          _docid = item.get(document_tmp_docid)
          related_docids = [_docid]
          print("dup_docid",dup_docid)
          if save==0:
              dup_docid.insert(0,_docid)
          if isinstance(dup_docid,list):
              related_docids.extend(dup_docid)
          related_docids = [_d for _d in related_docids if _d is not None]

          search_start = time.time()
          list_projects = self.search_projects_with_document(related_docids,project_table,project_table_index)
          log("search %d projects takes:%.3f"%(len(list_projects),time.time()-search_start))

          if len(list_projects)==0:
              # Nothing stored yet: build the projects from the documents.
              list_docs = self.search_docs(related_docids,document_name=document_name)
              list_projects = self.generate_projects_from_document(list_docs)
          else:
              self.update_projects_by_document(_docid,save,list_projects,document_name=document_name)

          list_projects = dumplicate_projects(list_projects)
          list_projects = self.merge_projects(list_projects,b_log,project_table=project_table,project_table_index=project_table_index)
          best_docid,list_merge_dump = self.dumplicate_document_in_merge(list_projects,dup_docid,_docid,item.get(document_docchannel),document_name=document_name,b_log=b_log)
          if list_merge_dump is None:
              # A project was left without documents: emit no project at all.
              list_projects = []
          project_json = to_project_json(list_projects)
          if b_log:
              log("project_json:%s"%project_json)
          return project_json,best_docid,list_merge_dump
      except Exception:
          raise RuntimeError("error on dumplicate")
  def is_exist_fingerprint(self,final_list,_docid,_fingerprint,is_tmp=False):
      '''
      Tell whether another saved document in ``final_list`` has the same fingerprint.

      The first entry of ``final_list`` is never inspected.  An entry counts
      as saved when its save flag is 1 or, unless ``is_tmp`` is set, when its
      status lies in [201,300].

      :param final_list: list of document dicts
      :param _docid: docid of the document itself; entries with this docid are ignored
      :param _fingerprint: fingerprint to look for
      :param is_tmp: when True only the save flag decides whether an entry is saved
      :return: True when a saved entry with ``_fingerprint`` exists, else False
      '''
      saved_fingerprints = set()
      for entry in final_list[1:]:
          other_docid = entry[document_tmp_docid]
          saved = entry.get(document_tmp_save,0)
          status = entry.get(document_tmp_status,0)
          if not is_tmp and 201<=status<=300:
              saved = 1
          fingerprint = entry.get(document_tmp_fingerprint,"")
          if other_docid!=_docid and saved==1:
              saved_fingerprints.add(fingerprint)
      return _fingerprint in saved_fingerprints
  def exists_normal_fingerprint(self,_fingerprint,docid,table_name="document",table_index="document_index"):
      '''
      Tell whether the index holds an older document with the same fingerprint.

      The search requires the fingerprint to match, ``status`` to fall in
      the range queried with bounds 201 and 301, and ``docid`` to fall in
      the range queried with bounds 0 and ``docid-400000``.

      :param _fingerprint: fingerprint to look for
      :param docid: docid of the current document (numeric)
      :param table_name: table to search
      :param table_index: search index of that table
      :return: True when at least one row matches, else False
      '''
      conditions = [
          RangeQuery("status",201,301),
          TermQuery("fingerprint",_fingerprint),
          RangeQuery("docid",0,docid-400000),
      ]
      # Only the total count matters, so a single row is requested.
      search_query = SearchQuery(BoolQuery(must_queries=conditions),get_total_count=True,limit=1)
      _rows,_next_token,total_count,_is_all_succeed = self.ots_client.search(table_name,table_index,search_query)
      return total_count>0
  def check_page_time(self,item,table_name="document",table_index="document_index"):
      '''
      Check whether the page_time of a document is plausible.

      Only documents whose channel is below 200 and that have a page_time
      are checked.  The document is rejected when

      * it comes from one of two listed web sources and its bid-close time
        is earlier than its page_time, or
      * one of its extracted times is more than 90 days older than the
        page_time, another document with the same title phrase exists, and
        either no extracted time is later than the page_time or the
        document comes from "中国政府采购网".

      :param item: dict of the document; ``dict_time`` maps time field names to time strings
      :param table_name: table searched for documents with the same title
      :param table_index: search index of that table
      :return: False when the page_time is rejected, True otherwise
      '''
      page_time = item.get(document_page_time,"")
      has_before = False
      has_after = False
      bidclose_time = page_time
      web_source_name = item.get(document_tmp_web_source_name,"")
      try:
          docchannel = int(item.get(document_tmp_docchannel,"0"))
      except (TypeError,ValueError,OverflowError):
          # Unparseable channel: treat as channel 0, which is still checked.
          docchannel = 0
      if docchannel>=200 or len(page_time)==0:
          return True

      l_page_time = timeAdd(page_time,days=-90)
      # The time field that last triggered has_before, kept for the log below.
      before_key,before_value = None,None
      for k,v in item.get("dict_time",{}).items():
          if v is None or len(v)==0:
              continue
          if l_page_time>v:
              has_before = True
              before_key,before_value = k,v
          if v>page_time:
              has_after = True
          if k==document_tmp_time_bidclose:
              bidclose_time = v

      set_web_source = {"中国招标投标公共服务平台","比地招标"}
      if web_source_name in set_web_source and bidclose_time<page_time:
          return False
      log("%s check page_time has_before %s has_after %s"%(str(item.get(document_docid)),str(has_before),str(has_after)))
      if not has_before:
          return True

      is_ccgp = item.get(document_web_source_name,"")=="中国政府采购网"
      if has_after and not is_ccgp:
          return True

      # Both rejection conditions use the same query, so it is sent only once.
      _query = BoolQuery(must_queries=[MatchPhraseQuery(document_doctitle,item.get(document_doctitle,""))],
                         must_not_queries=[TermQuery(document_docid,item.get(document_docid,0))])
      rows,next_token,total_count,is_all_succeed = self.ots_client.search(table_name,table_index,
                                                                          SearchQuery(_query,get_total_count=True,limit=1))
      if total_count>0:
          if not has_after:
              log("%s check page_time false %s==%s-%s"%(str(item.get(document_docid)),l_page_time,before_key,before_value))
          else:
              log("%s check 中国政府采购网 false "%(str(item.get(document_docid))))
          return False
      return True
  def dumplicate_comsumer_handle_interface(self,docid,document_table,document_table_index,project_table,project_table_index,columns=None,b_log=False,upgrade=False):
      """Run the de-duplication flow for one docid and report the outcome as a dict.

      The document is looked up in ``document_table``; candidate duplicates are
      collected rule by rule, filtered by ``dumplicate_fianl_check`` and then
      merged into projects through ``merge_document_real``. This method itself
      does not call ``update_row`` or ``changeSaveStatus``.

      :param docid: docid of the document to process
      :param document_table: table holding the document
      :param document_table_index: search index of ``document_table``
      :param project_table: project table handed to ``merge_document_real``
      :param project_table_index: search index of ``project_table``
      :param columns: columns fetched for the document itself; None selects the
          same list that is used for candidate documents
      :param b_log: forwarded to the helper calls
      :param upgrade: only echoed in the summary log line
      :return: dict with "success"; on success also "best_docid", "save",
          "dmp_docid" and, when a merge ran, "projects"; on failure "errmsg"
      """
      # Columns fetched for every candidate pulled in by the rules.
      candidate_columns = [document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,
                           document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,
                           document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,
                           document_tmp_fingerprint,document_attachment_extract_status,document_province,
                           document_city,document_district,document_doctitle,document_tmp_attachment_path,
                           document_tmp_source_stage,document_tmp_source_type,document_update_document]
      if columns is None:
          # Built per call so no list object is shared between invocations.
          columns = list(candidate_columns)

      result_dict = {"success":True}
      try:
          bool_query = BoolQuery(must_queries=[
              TermQuery("docid",docid)
          ])
          rows,next_token,total_count,is_all_succeed = self.ots_client.search(document_table,document_table_index,
                                                                              SearchQuery(bool_query,limit=1,get_total_count=True),
                                                                              ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
          list_dict = getRow_ots(rows)
          if len(list_dict)==0:
              raise RuntimeError("未查找到docid为%s的数据"%(str(docid)))
          item = list_dict[0]
          self.post_extract(item)
          log("dumplicate start on:%s"%(str(item.get(document_tmp_docid))))

          list_rules,table_name,table_index = self.translate_dumplicate_rules(flow_dumplicate_status_from,item,get_all=False,to_log=False,table_name=document_table,table_index=document_table_index)
          list_rules.sort(key=lambda x:x["confidence"],reverse=True)
          log("dumplicate %s rules:%d"%(str(item.get(document_tmp_docid)),len(list_rules)))
          list_rules = list_rules[:30]

          # The document itself seeds the candidate list.
          item["confidence"] = 999
          base_list = [item]
          set_docid = {item.get(document_tmp_docid)}

          step = 2
          for _i in range(0,len(list_rules),step):
              must_not_q = []
              if len(base_list)>0:
                  must_not_q = [TermQuery("docid",a) for a in list(set_docid)[-100:]]
              # Rules are queried `step` at a time; the scoring parameters come
              # from the first rule of each batch.
              _query = BoolQuery(should_queries=[_rule["query"] for _rule in list_rules[_i:_i+step]],
                                 must_not_queries=must_not_q)
              _rule = list_rules[_i]
              self.add_data_by_query(item,base_list,set_docid,_query,_rule["confidence"],
                                     table_name=table_name,table_index=table_index,
                                     singleNum_keys=_rule["singleNum_keys"],
                                     contain_keys=_rule["contain_keys"],
                                     multiNum_keys=_rule["multiNum_keys"],
                                     columns=list(candidate_columns),b_log=b_log)

          final_list = self.dumplicate_fianl_check(base_list,b_log)
          exist_finterprint = self.is_exist_fingerprint(final_list,item.get(document_tmp_docid),item.get(document_tmp_fingerprint),is_tmp=table_name=="document_tmp")
          exist_normal_fingerprint = self.exists_normal_fingerprint(item.get(document_tmp_fingerprint),item.get(document_tmp_docid),table_name=table_name,table_index=table_index)
          best_docid = self.get_best_docid(final_list)

          _d = {"partitionkey":item["partitionkey"],
                "docid":item["docid"],
                "status":random.randint(*flow_dumplicate_status_to),
                document_tmp_opertime:getCurrent_date(format="%Y-%m-%d %H:%M:%S")
                }
          dtmp = Document_tmp(_d)

          # Duplicates: every final candidate not flagged update_document,
          # excluding the document itself.
          dup_docid = set()
          for _dict in final_list:
              if _dict.get("update_document","")!="true":
                  dup_docid.add(_dict.get(document_tmp_docid))
          dup_docid.discard(item.get(document_tmp_docid))

          _check_time = self.check_page_time(item,table_name=table_name,table_index=table_index)
          if (_check_time and not exist_normal_fingerprint and (len(final_list)==0 or best_docid==item.get(document_tmp_docid))) or item.get("update_document","")=="true":
              dtmp.setValue(document_tmp_save,1,True)
          else:
              if exist_normal_fingerprint:
                  log("%s has exist_normal_fingerprint"%(str(item.get(document_docid))))
                  best_docid = -1
              if not _check_time:
                  best_docid = -2
              dtmp.setValue(document_tmp_save,0,True)
              # The surviving best document is not reported as a duplicate.
              dup_docid.discard(best_docid)
          # NOTE(review): dumplicate_comsumer_handle blanks dmp_docid when the
          # document was rejected by fingerprint/page_time; this method never
          # did. Confirm whether that difference is intended.
          dmp_docid = ",".join([str(a) for a in list(dup_docid)])
          list_docids = list(dup_docid)

          if (exist_finterprint and dtmp.getProperties().get(document_tmp_save)==0) or item.get(document_docchannel,0) in (301,302):
              if exist_finterprint:
                  log("exist_finterprint %s"%(str(item.get(document_tmp_docid))))
              dtmp.setValue(document_tmp_projects,"[]",True)
          else:
              project_json,merge_best_docid,list_merge_dump = self.merge_document_real(item,list_docids,dtmp.getProperties().get(document_tmp_save),document_name=document_table,project_table=project_table,project_table_index=project_table_index,b_log=b_log)
              if merge_best_docid is not None and (best_docid is None or best_docid==item.get(document_tmp_docid) or best_docid<0):
                  best_docid = merge_best_docid
              if list_merge_dump is not None and len(list_merge_dump)>0 and str(item.get(document_tmp_docid)) in list_merge_dump and item.get("update_document","")!="true":
                  dtmp.setValue(document_tmp_save,0,True)
              if list_merge_dump is not None:
                  dmp_docid = "%s,%s"%(dmp_docid,",".join([str(a) for a in list_merge_dump]))
              dtmp.setValue(document_tmp_projects,project_json,True)
              result_dict["projects"] = project_json

          log("upgrate %s save:%s:docid:%d,final_list:%d,rules:%d,best_docid:%s,dmp_docid:%s"%(str(upgrade),dtmp.getProperties().get(document_tmp_save),item.get(document_tmp_docid),len(final_list),len(list_rules),str(best_docid),dmp_docid))

          # Normalise the id list: drop empty entries, duplicates and best_docid.
          dmp_docid = set([a for a in dmp_docid.split(",") if a!=""])
          dmp_docid.discard(str(best_docid))
          dmp_docid = ",".join([str(a) for a in list(dmp_docid)])

          result_dict["best_docid"] = str(best_docid) if best_docid is not None else ""
          result_dict["save"] = dtmp.getProperties().get("save")
          result_dict["dmp_docid"] = dmp_docid
      except Exception as e:
          # Failures are reported to the caller through the result dict.
          result_dict["success"] = False
          result_dict["errmsg"] = str(e)
      return result_dict
  def dumplicate_comsumer_handle(self,item,result_queue,ots_client,get_all=False,upgrade=True):
      """De-duplicate one document and, when ``upgrade`` is set, persist the result.

      Candidate duplicates are collected rule by rule, filtered by
      ``dumplicate_fianl_check`` and merged into projects through
      ``merge_document_real``. With ``upgrade`` the outcome is written through
      ``Document_tmp.update_row`` and the duplicates are passed to
      ``changeSaveStatus``; without it nothing is written here and the
      duplicate docids are returned.

      The docid is always put on ``self.queue_dumplicate_processed`` when the
      method finishes, also after an error.

      :param item: document dict to process (modified in place)
      :param result_queue: not used by this method
      :param ots_client: not used by this method; ``self.ots_client`` is used
      :param get_all: forwarded to ``translate_dumplicate_rules``
      :param upgrade: write results when True; helper logging is enabled when False
      :return: list of duplicate docids when ``upgrade`` is False, otherwise None
          (also None when an exception was caught)
      """
      try:
          b_log = not upgrade
          self.post_extract(item)
          log("dumplicate start on:%s"%(str(item.get(document_tmp_docid))))

          list_rules,table_name,table_index = self.translate_dumplicate_rules(flow_dumplicate_status_from,item,get_all=get_all,to_log=b_log)
          list_rules.sort(key=lambda x:x["confidence"],reverse=True)
          log("dumplicate %s rules:%d"%(str(item.get(document_tmp_docid)),len(list_rules)))
          list_rules = list_rules[:30]

          # The document itself seeds the candidate list.
          item["confidence"] = 999
          base_list = [item]
          set_docid = {item.get(document_tmp_docid)}

          # Columns fetched for every candidate pulled in by the rules.
          candidate_columns = [document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,
                               document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,
                               document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,
                               document_tmp_fingerprint,document_attachment_extract_status,document_province,
                               document_city,document_district,document_doctitle,document_tmp_attachment_path,
                               document_tmp_source_stage,document_tmp_source_type,document_update_document]
          step = 2
          for _i in range(0,len(list_rules),step):
              must_not_q = []
              if len(base_list)>0:
                  must_not_q = [TermQuery("docid",a) for a in list(set_docid)[-100:]]
              # Rules are queried `step` at a time; the scoring parameters come
              # from the first rule of each batch.
              _query = BoolQuery(should_queries=[_rule["query"] for _rule in list_rules[_i:_i+step]],
                                 must_not_queries=must_not_q)
              _rule = list_rules[_i]
              self.add_data_by_query(item,base_list,set_docid,_query,_rule["confidence"],
                                     table_name=table_name,table_index=table_index,
                                     singleNum_keys=_rule["singleNum_keys"],
                                     contain_keys=_rule["contain_keys"],
                                     multiNum_keys=_rule["multiNum_keys"],
                                     columns=candidate_columns,b_log=b_log)

          final_list = self.dumplicate_fianl_check(base_list,b_log)
          exist_finterprint = self.is_exist_fingerprint(final_list,item.get(document_tmp_docid),item.get(document_tmp_fingerprint),is_tmp=table_name=="document_tmp")
          exist_normal_fingerprint = self.exists_normal_fingerprint(item.get(document_tmp_fingerprint),item.get(document_tmp_docid))
          best_docid = self.get_best_docid(final_list)

          _d = {"partitionkey":item["partitionkey"],
                "docid":item["docid"],
                "status":random.randint(*flow_dumplicate_status_to),
                document_tmp_opertime:getCurrent_date(format="%Y-%m-%d %H:%M:%S")
                }
          dtmp = Document_tmp(_d)

          # Duplicates: every final candidate not flagged update_document,
          # excluding the document itself.
          dup_docid = set()
          for _dict in final_list:
              if _dict.get("update_document","")!="true":
                  dup_docid.add(_dict.get(document_tmp_docid))
          dup_docid.discard(item.get(document_tmp_docid))

          _unnormal = False
          _check_time = self.check_page_time(item)
          if (_check_time and not exist_normal_fingerprint and (len(final_list)==0 or best_docid==item.get(document_tmp_docid))) or item.get("update_document","")=="true":
              dtmp.setValue(document_tmp_save,1,True)
          else:
              if exist_normal_fingerprint:
                  log("%s has exist_normal_fingerprint"%(str(item.get(document_docid))))
                  best_docid = -1
                  _unnormal = True
              if not _check_time:
                  best_docid = -2
                  _unnormal = True
              dtmp.setValue(document_tmp_save,0,True)
              # The surviving best document is not reported as a duplicate.
              dup_docid.discard(best_docid)
          dmp_docid = ",".join([str(a) for a in list(dup_docid)])
          # Candidates whose save flag is later cleared through changeSaveStatus.
          remove_list = [_dict for _dict in final_list if _dict.get(document_tmp_docid) in dup_docid]
          list_docids = list(dup_docid)

          list_merge_dump = []
          if (exist_finterprint and dtmp.getProperties().get(document_tmp_save)==0) or item.get(document_docchannel,0) in (301,302):
              if exist_finterprint:
                  log("exist_finterprint %s"%(str(item.get(document_tmp_docid))))
              dtmp.setValue(document_tmp_projects,"[]",True)
          else:
              project_json,merge_best_docid,list_merge_dump = self.merge_document_real(item,list_docids,dtmp.getProperties().get(document_tmp_save),b_log=b_log)
              if merge_best_docid is not None and (best_docid is None or best_docid==item.get(document_tmp_docid) or best_docid<0):
                  best_docid = merge_best_docid
              if list_merge_dump is not None and len(list_merge_dump)>0 and str(item.get(document_tmp_docid)) in list_merge_dump and item.get("update_document","")!="true":
                  dtmp.setValue(document_tmp_save,0,True)
              if list_merge_dump is not None:
                  dmp_docid = "%s,%s"%(dmp_docid,",".join([str(a) for a in list_merge_dump]))
              dtmp.setValue(document_tmp_projects,project_json,True)

          log("upgrate %s save:%s:docid:%d,final_list:%d,rules:%d,best_docid:%s,dmp_docid:%s"%(str(upgrade),dtmp.getProperties().get(document_tmp_save),item.get(document_tmp_docid),len(final_list),len(list_rules),str(best_docid),dmp_docid))

          # Normalise the id list: drop empty entries, duplicates and best_docid.
          dmp_docid = set([a for a in dmp_docid.split(",") if a!=""])
          dmp_docid.discard(str(best_docid))
          dmp_docid = ",".join([str(a) for a in list(dmp_docid)])
          if _unnormal:
              # Rejected by fingerprint/page_time: no duplicate list is stored.
              dmp_docid = ""

          if not upgrade:
              return list_docids

          dmp_docid = dmp_docid.replace(",,",",")
          dtmp.setValue(document_tmp_dup_docid,dmp_docid,True)
          dtmp.setValue(document_tmp_best_docid,best_docid,True)
          if not dtmp.update_row(self.ots_client):
              # The write failed: retry up to 10 times, halving the projects
              # payload before each attempt.
              for _ in range(10):
                  list_proj_json = dtmp.getProperties().get(document_tmp_projects)
                  if list_proj_json is None:
                      # Nothing to shrink; the original loop did nothing here.
                      break
                  list_proj = json.loads(list_proj_json)
                  dtmp.setValue(document_tmp_projects,json.dumps(list_proj[:len(list_proj)//2]),True)
                  if dtmp.update_row(self.ots_client):
                      break
          self.changeSaveStatus(remove_list)
          self.changeSaveStatus(list_merge_dump)
      except Exception as e:
          traceback.print_exc()
          log("dumplicate error on:%s"%(str(item.get(document_tmp_docid))))
      finally:
          log("dumplicate end on:%s"%(str(item.get(document_tmp_docid))))
          self.queue_dumplicate_processed.put(item.get(document_tmp_docid))
  def fix_doc_which_not_in_project(self):
      '''
      Pick out finished documents that are not present in project2 and put
      them back into document_tmp so they are de-duplicated and merged again.

      The candidates are saved rows (save == 1) of document_tmp, filtered by
      status, docchannel and an opertime window derived from the current time
      (timeAdd with minutes=-20 and minutes=-5), and additionally by docid once
      ``self.fix_doc_docid`` has been set. The last fetched docid is stored in
      ``self.fix_doc_docid`` afterwards.
      :return: None
      '''
      def fix_doc_handle(item,result_queue):
          # Reset the status of a document that no project2 row references.
          _docid = item.get(document_tmp_docid)
          project_query = BoolQuery(must_queries=[TermQuery(project_docids,str(_docid))])
          rows,next_token,total_count,is_all_succeed = self.ots_client.search("project2","project2_index",
                                                                              SearchQuery(project_query,get_total_count=True),
                                                                              ColumnsToGet(return_type=ColumnReturnType.NONE))
          if total_count!=0:
              return
          log("fix_doc:%s not in project2"%(str(_docid)))
          d_tmp = Document_tmp(item)
          d_tmp.setValue(document_tmp_status,flow_dumplicate_status_from[0],True)
          d_tmp.update_row(self.ots_client)

      time_format = "%Y-%m-%d %H:%M:%S"
      current_date = getCurrent_date(format=time_format)
      before_date = timeAdd(current_date,0,format=time_format,minutes=-20)
      after_date = timeAdd(current_date,0,format=time_format,minutes=-5)

      # The docid condition is only added once a previous run stored a docid.
      must_queries = [
          TermQuery(document_tmp_save,1),
          RangeQuery(document_tmp_status,flow_dumplicate_status_to[0]),
          RangeQuery(document_tmp_docchannel,0,300),
      ]
      if self.fix_doc_docid is not None:
          must_queries.append(RangeQuery(document_tmp_docid,self.fix_doc_docid))
      must_queries.append(RangeQuery(document_tmp_opertime,before_date,after_date))
      bool_query = BoolQuery(must_queries=must_queries)

      # First page is sorted by docid; later pages follow next_token.
      rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                          SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),get_total_count=True,limit=100),
                                                                          ColumnsToGet(return_type=ColumnReturnType.NONE))
      list_data = list(getRow_ots(rows))
      while next_token:
          rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                              SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
                                                                              ColumnsToGet(return_type=ColumnReturnType.NONE))
          list_data.extend(getRow_ots(rows))
      print("%d/%d"%(len(list_data),total_count))

      if len(list_data)==0:
          return
      self.fix_doc_docid = list_data[-1].get(document_tmp_docid)
      log("current fix_doc_docid:%s"%(str(self.fix_doc_docid)))
      task_queue = Queue()
      for _data in list_data:
          task_queue.put(_data)
      MultiThreadHandler(task_queue,fix_doc_handle,None,30).run()
  def send_daily_check_data(self):
      """Send yesterday's data-quality report links to the DingTalk robot.

      For each known report file under ``tmp_document_quality_data/<date>/`` a
      signed download URL is requested from ``self.bucket``; every file that
      yields a URL is appended to the message, which is then sent with
      ``sentMsgToDD``. The message is sent even when no file was found.
      """
      import datetime

      def get_download_url(bucket, ObjectName, timeout):
          """Return a signed GET url for ObjectName, or "" when it is missing or signing fails."""
          if not bucket.object_exists(ObjectName):
              return ""
          for _ in range(3):
              try:
                  signed = bucket.sign_url('GET', ObjectName, timeout)
                  # Replace the intranet marker in the address.
                  return signed.replace("-internal", "")
              except Exception:
                  # Best effort: try again, give up after three attempts.
                  continue
          return ""

      file_timeout = 60 * 60 * 24 * 5  # download links stay valid for 5 days
      # Yesterday's date.
      date = str(datetime.date.today() - datetime.timedelta(days=1))
      oss_path = 'tmp_document_quality_data/'
      object_path = oss_path + date + '/'
      msg = "每日数据质量检查结果(报警):"

      report_names = (
          "数据质量监控检查结果.xlsx",
          "公告重复量大的编号.xlsx",
          "公告附件重复量大的编号.xlsx",
          "附件识别异常的站源.xlsx",
          "报名时间,截止时间在发布时间之前的公告.xlsx",
      )
      for csv_name in report_names:
          url = get_download_url(self.bucket, object_path + csv_name, file_timeout)
          if url:
              msg += "\n文件名:\"%s\",链接:%s" % (csv_name, url)

      atMobiles = ['18813973429']  # Weizhen
      # NOTE(review): the robot access token is hard-coded here; consider
      # loading it from configuration instead of source control.
      ACCESS_TOKEN_DATAWORKS = "https://oapi.dingtalk.com/robot/send?access_token=9489f01c4ab9f0c3f87e2ff5c3e35eb9fb0d17afb6244de4683596df1111daea"
      sentMsgToDD(msg,ACCESS_TOKEN_DATAWORKS,atMobiles=atMobiles)
  def send_daily_check_data2(self):
      """Post yesterday's data-quality reports as JSON to the data-monitor service.

      Each report is downloaded from ``self.bucket`` to one local temp file,
      read with pandas and converted into records under ``res_json["data"]``;
      ``res_json["count"]`` holds the number of records. Reports that cannot be
      downloaded are skipped. The payload is posted with up to three attempts.
      """
      import datetime
      import pandas as pd
      from itertools import groupby

      dict_channel = {"公告变更": 51,
                      "招标公告": 52,
                      "中标信息": 101,
                      "招标预告": 102,
                      "招标答疑": 103,
                      "资审结果": 105,
                      "法律法规": 106,
                      "新闻资讯": 107,
                      "采购意向": 114,
                      "拍卖出让": 115,
                      "土地矿产": 116,
                      "产权交易": 117,
                      "废标公告": 118,
                      "候选人公示": 119,
                      "合同公告": 120}
      label2channel = {v:k for k,v in dict_channel.items()}

      def post_data(url,json_data):
          """POST json_data to url; True as soon as one of three attempts returns 200."""
          for _ in range(3):
              try:
                  # Send the JSON payload with a POST request.
                  response = requests.post(url, json=json_data)
                  # Only status 200 counts as success.
                  if response.status_code == 200:
                      return True
              except requests.exceptions.RequestException as e:
                  log("send_daily_check_data2,post error reason: %s"%(str(e)))
          return False

      res_json = {
          "data": [],
          "count": 0
      }
      # Yesterday's date.
      date = str(datetime.date.today() - datetime.timedelta(days=1))
      oss_path = 'tmp_document_quality_data/'
      object_path = oss_path + date + '/'
      # Every report is downloaded to this one path and removed after reading.
      LocalPath = os.path.join(self.current_path,"download","数据质量监控检查结果.xlsx")

      def load_report(csv_name):
          """Download one report and return it as a DataFrame, or None when the download failed."""
          if not downloadFile(self.bucket, object_path + csv_name, LocalPath, retry=3):
              return None
          try:
              return pd.read_excel(LocalPath)
          finally:
              # Remove the temp file even when reading fails.
              os.remove(LocalPath)

      def add_top5_per_source(records):
          """Append at most five records per WEB_SOURCE_NO to the payload."""
          records.sort(key=lambda x: x['WEB_SOURCE_NO'])
          for key, group in groupby(records, lambda x: (x['WEB_SOURCE_NO'])):
              group = list(group)[:5]
              res_json['data'].extend(group)
              res_json['count'] += len(group)

      df = load_report("数据质量监控检查结果.xlsx")
      if df is not None:
          for web_source_no,original_docchannel,error_rule in zip(df['web_source_no'],df['original_docchannel'],df['error_rule']):
              # error_rule is a JSON object: one record per error type.
              error_rule = json.loads(error_rule)
              for error_type,error_sample in error_rule.items():
                  tmp_data = {
                      "WEB_SOURCE_NO": web_source_no,
                      "WEBTYPE": label2channel.get(original_docchannel, ""),
                      "TYPE": error_type,
                      "ITEMS": error_sample
                  }
                  res_json['data'].append(tmp_data)
                  res_json['count'] += 1

      df = load_report("公告重复量大的编号.xlsx")
      if df is not None:
          tmp_list = []
          for web_source_no,fingerprint,original_docchannel,cnt,res in zip(df['web_source_no'], df['fingerprint'],
                                                                           df['original_docchannel'],df['cnt'],df['res']):
              tmp_data = {
                  "WEB_SOURCE_NO": web_source_no,
                  "WEBTYPE": label2channel.get(original_docchannel, ""),
                  "TYPE": "编号公告重复",
                  "FINGERPRINT": fingerprint,
                  "ITEMS": json.loads(res)
              }
              tmp_list.append(tmp_data)
          add_top5_per_source(tmp_list)

      df = load_report("公告附件重复量大的编号.xlsx")
      if df is not None:
          tmp_list = []
          for web_source_no,filemd5,original_docchannel,cnt,res in zip(df['web_source_no'],df['filemd5'],
                                                                       df['original_docchannel'],df['cnt'],df['res']):
              tmp_data = {
                  "WEB_SOURCE_NO": web_source_no,
                  "WEBTYPE": label2channel.get(original_docchannel, ""),
                  "TYPE": "编号附件重复",
                  "FILEMD5": filemd5,
                  "ITEMS": json.loads(res)
              }
              tmp_list.append(tmp_data)
          add_top5_per_source(tmp_list)

      df = load_report("附件识别异常的站源.xlsx")
      if df is not None:
          for web_source_no,original_docchannel,error_ratio,error_sample,res in zip(df['web_source_no'], df['original_docchannel'],
                                                                                    df['error_ratio'],df['error_sample'],df['res']):
              tmp_data = {
                  "WEB_SOURCE_NO": web_source_no,
                  "WEBTYPE": label2channel.get(original_docchannel, ""),
                  "TYPE": "附件识别异常",
                  "ITEMS": json.loads(res)
              }
              res_json['data'].append(tmp_data)
              res_json['count'] += 1

      df = load_report("报名时间,截止时间在发布时间之前的公告.xlsx")
      if df is not None:
          tmp_list = []
          for web_source_no,original_docchannel,res in zip(df['web_source_no'],df['original_docchannel'],df['res']):
              tmp_data = {
                  "WEB_SOURCE_NO": web_source_no,
                  "WEBTYPE": label2channel.get(original_docchannel, ""),
                  "TYPE": "截止日期在发布日期之前",
                  "ITEMS": json.loads(res)
              }
              tmp_list.append(tmp_data)
          res_json['data'].extend(tmp_list)
          res_json['count'] += len(tmp_list)

      url = "http://data-monitor.bidizhaobiao.com/oldApi/saveQualityListData"
      res = post_data(url,res_json)
      if res:
          log("send_daily_check_data2,sent data len: %d"%(res_json['count']))
  def start_flow_dumplicate(self):
      """Register the cron jobs of the de-duplication flow and start the scheduler.

      Jobs are registered on a ``BlockingScheduler`` in the order listed below,
      each with the "cron" trigger and the given fields.
      """
      cron_jobs = [
          (self.flow_dumplicate, {"second": "*/5"}),
          (self.flow_dumpcate_comsumer, {"second": "*/30"}),
          (self.bdm.monitor_dumplicate, {"minute": "*/10"}),
          (self.flow_remove, {"hour": "20"}),
          (self.send_daily_check_data, {"hour": '9', "minute": '10'}),
          (self.send_daily_check_data2, {"hour": '9', "minute": '10'}),
          (self.flow_remove_project_tmp, {"hour": "20"}),
          (self.fix_doc_which_not_in_project, {"minute": "*/10"}),
      ]
      scheduler = BlockingScheduler()
      for job, cron_fields in cron_jobs:
          scheduler.add_job(job, "cron", **cron_fields)
      scheduler.start()
  def changeSaveStatus(self,list_dict):
      """Clear the saved state of the given duplicate documents.

      Entries may be document dicts or plain int docids:

      * dict: when its save flag is 1 (or absent) and the row exists, the row
        is updated with the save flag set to 0;
      * int: the partition key is derived as ``docid%500+1``; when the row's
        "status" is 1 and it is not flagged update_document == "true", its
        "status" is set to 0 and the row is updated.

      Entries of any other type are skipped.
      NOTE(review): callers test ``str(docid) in list_merge_dump``, which
      suggests string ids may be passed; those are ignored here - confirm.

      :param list_dict: iterable of dicts/ints, or None (nothing is done)
      """
      if list_dict is None:
          return
      for entry in list_dict:
          if isinstance(entry,dict):
              if entry.get(document_tmp_save,1)!=1:
                  continue
              row = Document_tmp({"partitionkey":entry["partitionkey"],
                                  "docid":entry["docid"],
                                  document_tmp_save:0
                                  })
              if row.exists_row(self.ots_client):
                  row.update_row(self.ots_client)
          elif isinstance(entry,int):
              row = Document_tmp({"partitionkey":entry%500+1,
                                  "docid":entry,
                                  document_tmp_save:0
                                  })
              if not row.fix_columns(self.ots_client,["status",document_update_document],True):
                  continue
              if row.getProperties().get("status")==1 and row.getProperties().get(document_update_document,"")!="true":
                  row.setValue("status",0,True)
                  row.update_row(self.ots_client)
  def test_dumplicate(self,docid):
      """Run the full de-duplication handler on a single docid.

      The document attributes are loaded with ``get_attrs_before_dump``; when
      that yields a truthy item it is processed by
      ``dumplicate_comsumer_handle`` with ``upgrade=True``.

      :param docid: docid of the document to process
      :return: None
      """
      columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,
               document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,
               document_tmp_extract_json,document_attachment_extract_status,document_update_document,
               document_province,document_city,document_district,document_tmp_attachment_path,
               document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,
               document_tmp_source_type]
      item = self.get_attrs_before_dump(docid,columns)
      if not item:
          return
      log("start dumplicate_comsumer_handle")
      self.dumplicate_comsumer_handle(item,None,self.ots_client,get_all=False,upgrade=True)
  def test_merge(self,list_docid_less,list_docid_greater):
      """Merge the projects generated from two groups of docids and return them as JSON.

      Both groups are loaded with ``search_docs`` and turned into projects with
      ``generate_projects_from_document``; the combined list goes through
      ``dumplicate_projects`` and is serialised with ``to_project_json``.

      :param list_docid_less: first group of docids
      :param list_docid_greater: second group of docids
      :return: the project JSON produced by ``to_project_json``
      """
      merged_projects = self.generate_projects_from_document(self.search_docs(list_docid_less))
      greater_projects = self.generate_projects_from_document(self.search_docs(list_docid_greater))
      # Extend in place, first group first.
      merged_projects.extend(greater_projects)
      project_json = to_project_json(dumplicate_projects(merged_projects,b_log=True))
      log("project_json:%s"%project_json)
      return project_json
  def getRemainDoc(self,docid):
      """Return the best docid among the duplicates of ``docid``.

      The document is looked up in the "document" table; candidates are then
      collected rule by rule (five rules per query) and filtered by
      ``dumplicate_fianl_check``.

      :param docid: docid of the document to look up
      :return: result of ``get_best_docid`` for the final candidates, or None
          when the document is not found
      """
      columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,
               document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json]
      lookup = BoolQuery(must_queries=[
          TermQuery("docid",docid)
      ])
      rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
                                                                          SearchQuery(lookup,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
                                                                          ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
      found = getRow_ots(rows)
      if len(found)==0:
          return None

      item = found[0]
      self.post_extract(item)
      list_rules,table_name,table_index = self.translate_dumplicate_rules(flow_dumplicate_status_from,item,to_log=True)
      list_rules.sort(key=lambda x:x["confidence"],reverse=True)

      # The document itself seeds the candidate list.
      item["confidence"] = 999
      base_list = [item]
      set_docid = {item.get(document_tmp_docid)}

      candidate_columns = [document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,
                           document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,
                           document_tmp_sub_docs_json,document_tmp_extract_json]
      step = 5
      for _i in range(0,len(list_rules),step):
          must_not_q = []
          if len(base_list)>0:
              must_not_q = [TermQuery("docid",a) for a in list(set_docid)[-100:]]
          # Scoring parameters come from the first rule of each batch.
          _query = BoolQuery(should_queries=[_rule["query"] for _rule in list_rules[_i:_i+step]],
                             must_not_queries=must_not_q)
          _rule = list_rules[_i]
          self.add_data_by_query(item,base_list,set_docid,_query,_rule["confidence"],
                                 table_name=table_name,table_index=table_index,
                                 singleNum_keys=_rule["singleNum_keys"],
                                 contain_keys=_rule["contain_keys"],
                                 multiNum_keys=_rule["multiNum_keys"],
                                 columns=candidate_columns)

      _time = time.time()
      log("%d start final check with length:%d"%(item["docid"],len(base_list)))
      final_list = self.dumplicate_fianl_check(base_list)
      log("%d final_check takes:%.2f"%(item["docid"],time.time()-_time))
      return self.get_best_docid(final_list)
  def compare_dumplicate_check(docid_from=400453395, docid_to=400463395, test_count=1000,
                               output_path="compare_dump.xlsx"):
      """Compare duplicate detection before and after switching ``check_rule`` to 2.

      A sample of documents whose docid lies in the given range is read from
      the "document" table. ``Dataflow_dumplicate.dumplicate_comsumer_handle``
      is run on a copy of every sampled document with the instance's initial
      ``check_rule`` ("before") and again after setting ``check_rule = 2``
      ("after"). Both results and their set differences are written to an
      Excel file.

      :param docid_from: first bound passed to ``RangeQuery`` on "docid".
      :param docid_to: second bound passed to ``RangeQuery`` on "docid".
      :param test_count: paging stops once at least this many documents are loaded.
      :param output_path: path of the Excel report.
      """
      import traceback

      import pandas as pd

      df_dump = Dataflow_dumplicate(start_delete_listener=False)
      columns = [document_tmp_status, document_tmp_save, document_tmp_page_time,
                 document_tmp_docchannel, document_tmp_tenderee, document_tmp_agency,
                 document_tmp_doctitle, document_tmp_sub_docs_json, document_tmp_extract_json,
                 document_attachment_extract_status, document_update_document,
                 document_province, document_city, document_district]
      bool_query = BoolQuery(must_queries=[RangeQuery("docid", docid_from, docid_to)])

      rows, next_token, total_count, is_all_succeed = df_dump.ots_client.search(
          "document", "document_index",
          SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("docid")]), limit=10, get_total_count=True),
          ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
      log("flow_dumplicate producer total_count:%d" % total_count)
      list_dict = getRow_ots(rows)
      while next_token and len(list_dict) < test_count:
          # Continue from next_token; without it every request would return
          # the first page again and the sample would consist of duplicates.
          rows, next_token, total_count, is_all_succeed = df_dump.ots_client.search(
              "document", "document_index",
              SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
              ColumnsToGet(columns, return_type=ColumnReturnType.SPECIFIED))
          list_dict.extend(getRow_ots(rows))

      dump_result = {item["docid"]: {} for item in list_dict}

      def _run_pass(result_key):
          """Run the duplicate handler on a copy of every document and store the result under ``result_key``."""
          def _handle(_item, result_queue):
              try:
                  _item[result_key] = df_dump.dumplicate_comsumer_handle(
                      _item, None, df_dump.ots_client, get_all=True, upgrade=False)
              except Exception:
                  # Best effort: one failing document must not abort the
                  # comparison, but the failure should be visible in the log.
                  log("compare_dumplicate_check %s failed for docid %s: %s"
                      % (result_key, _item.get("docid"), traceback.format_exc()))

          task_queue = Queue()
          list_item = []
          for item in list_dict:
              # Work on a copy so the two passes cannot influence each other
              # through fields the handler writes into the item.
              _item = dict(item)
              list_item.append(_item)
              task_queue.put(_item)
          MultiThreadHandler(task_queue, _handle, None, 30).run()
          for item in list_item:
              dump_result[item["docid"]][result_key] = item.get(result_key)

      _run_pass("before")
      df_dump.check_rule = 2
      _run_pass("after")

      df_data = {"docid": [],
                 "before": [],
                 "after": [],
                 "before-after": [],
                 "after-before": []}
      for docid, _d in dump_result.items():
          # A failed pass leaves None behind; treat it as "no duplicates" so
          # the set arithmetic below does not raise.
          before = _d.get("before") or []
          after = _d.get("after") or []
          df_data["docid"].append(docid)
          df_data["before"].append(str(before))
          df_data["after"].append(str(after))
          df_data["before-after"].append(str(set(before) - set(after)))
          df_data["after-before"].append(str(set(after) - set(before)))
      df = pd.DataFrame(df_data, columns=["docid", "before", "after", "before-after", "after-before"])
      df.to_excel(output_path)
  def fix_merge_docid(docid):
      """Reset every document and delete every project connected to ``docid``.

      Starting from ``docid``, all rows of "project2" whose ``docids`` field
      matches the docid are collected. Every docid listed in those rows is
      looked up in turn, and this repeats until no unseen docid is left, so
      the whole connected group of projects is covered. Afterwards each
      collected document that exists gets ``status`` set to 1 and each
      collected project row is deleted.

      :param docid: docid from which the expansion starts.
      """
      # One shared connection for the lookups and for the updates/deletes below.
      ots_client = getConnect_ots()

      def get_uuid_docids(docid):
          """Return all "project2" rows whose ``docids`` field matches ``docid``."""
          bool_query = BoolQuery(must_queries=[TermQuery("docids", docid)])
          rows, next_token, total_count, is_all_succeed = ots_client.search(
              "project2", "project2_index",
              SearchQuery(bool_query, sort=Sort(sorters=[FieldSort("page_time")]), limit=100, get_total_count=True),
              ColumnsToGet(["docids"], return_type=ColumnReturnType.SPECIFIED))
          list_row = getRow_ots(rows)
          while next_token:
              rows, next_token, total_count, is_all_succeed = ots_client.search(
                  "project2", "project2_index",
                  SearchQuery(bool_query, next_token=next_token, limit=100, get_total_count=True),
                  ColumnsToGet(["docids"], return_type=ColumnReturnType.SPECIFIED))
              list_row.extend(getRow_ots(rows))
          return list_row

      def get_list_docid(list_row):
          """Return the distinct integer docids listed in the rows' comma-separated ``docids``."""
          set_docid = set()
          for row in list_row:
              docids = row.get("docids", '')
              if docids:
                  # Skip empty tokens (e.g. from a trailing comma) instead of
                  # failing in int().
                  set_docid.update(int(a) for a in docids.split(",") if a.strip())
          return list(set_docid)

      def get_list_uuid(list_row):
          """Return the distinct non-empty ``uuid`` values of the rows."""
          # NOTE(review): only "docids" is requested as a column; this assumes
          # getRow_ots also exposes the row's "uuid" - confirm.
          return list({row.get("uuid", '') for row in list_row if row.get("uuid", '')})

      list_row = get_uuid_docids(docid)
      print(list_row)

      # Breadth-first expansion: follow only docids that have not been looked
      # up yet, until a round discovers nothing new.
      seen_docids = {docid}
      pending_docids = set(get_list_docid(list_row)) - seen_docids
      while pending_docids:
          seen_docids |= pending_docids
          list_row2 = []
          for _docid in pending_docids:
              list_row2.extend(get_uuid_docids(_docid))
          list_row.extend(list_row2)
          pending_docids = set(get_list_docid(list_row2)) - seen_docids

      list_uuid = get_list_uuid(list_row)
      list_docid = get_list_docid(list_row)
      print(list_uuid)
      print(list_docid)

      for _docid in list_docid:
          _d = Document({document_partitionkey: _docid % 500 + 1,
                         document_docid: _docid,
                         document_status: 1})
          if _d.exists_row(ots_client):
              _d.update_row(ots_client)
      for _uuid in list_uuid:
          _p = Project({project_uuid: _uuid, })
          _p.delete_row(ots_client)
  if __name__ == '__main__':
      # Manual entry point: runs the duplicate test for a single docid and
      # prints the elapsed wall-clock time.
      started_at = time.time()
      df_dump = Dataflow_dumplicate(start_delete_listener=False)
      df_dump.test_dumplicate(613075691)
      print("takes", time.time() - started_at)

      # Other manual entry points, kept for reference (uncomment as needed):
      #
      # df = Dataflow()
      # df.flow_init()
      # df.flow_test()
      # df.test_merge()
      # df.start_flow_attachment()
      # df.start_flow_extract()
      # df.start_flow_dumplicate()
      # # df.start_flow_merge()
      # df.start_flow_remove()
      # download_attachment()
      # test_attachment_interface()
      #
      # df_dump.start_flow_dumplicate()
      # df_dump.dumplicate_comsumer_handle_interface(603504420,document_table="document_0000",document_table_index="document_0000_index",project_table="project_0000",project_table_index="project_0000_index_formerge")
      # compare_dumplicate_check()
      # df_dump.test_merge([391898061
      #                     ],[371551361,])
      # df_dump.flow_remove_project_tmp()
      # fix_merge_docid(595271944)
      # df_dump.fix_doc_which_not_in_project()
      # df_dump.delete_projects_by_document(16288036)
      #
      # log("=======")
      # for i in range(3):
      #     time.sleep(20)
      #
      #     a = {"docid":74295123}
      #     send_msg_toacmq(df_dump.pool_mq_ali,json.dumps(a),df_dump.doc_delete_queue)