# dataflow.py
  1. import sys,os
  2. # sys.path.append("/data")
  3. from BaseDataMaintenance.dataSource.source import getConnect_ots,getConnect_ots_capacity,getConnect_activateMQ_ali
  4. from tablestore import *
  5. from BaseDataMaintenance.common.Utils import *
  6. from BaseDataMaintenance.common.multiThread import MultiThreadHandler
  7. from BaseDataMaintenance.common.multiProcess import MultiProcessHandler
  8. from queue import Queue
  9. from BaseDataMaintenance.model.ots.document_tmp import *
  10. from BaseDataMaintenance.model.ots.attachment import *
  11. from BaseDataMaintenance.model.ots.document_html import *
  12. from BaseDataMaintenance.model.ots.document_extract2 import *
  13. from BaseDataMaintenance.model.ots.project import *
  14. from BaseDataMaintenance.model.ots.document import *
  15. import base64
  16. from BaseDataMaintenance.dataSource.interface import getAttachDealInterface,sentMsgToDD
  17. from uuid import uuid4
  18. from BaseDataMaintenance.common.ossUtils import *
  19. from BaseDataMaintenance.dataSource.source import is_internal,getAuth
  20. from apscheduler.schedulers.blocking import BlockingScheduler
  21. from BaseDataMaintenance.maintenance.dataflow_settings import *
  22. from threading import Thread
  23. import oss2
  24. from BaseDataMaintenance.maintenance.documentDumplicate import *
  25. from BaseDataMaintenance.common.otsUtils import *
  26. from BaseDataMaintenance.common.activateMQUtils import *
  27. from BaseDataMaintenance.dataSource.pool import ConnectorPool
  28. def getSet(list_dict,key):
  29. _set = set()
  30. for item in list_dict:
  31. if key in item:
  32. if item[key]!='' and item[key] is not None:
  33. if re.search("^\d[\d\.]*$",item[key]) is not None:
  34. _set.add(str(float(item[key])))
  35. else:
  36. _set.add(str(item[key]))
  37. return _set
  38. def getSimilarityOfString(str1,str2):
  39. _set1 = set()
  40. _set2 = set()
  41. if str1 is not None:
  42. for i in range(1,len(str1)):
  43. _set1.add(str1[i-1:i+1])
  44. if str2 is not None:
  45. for i in range(1,len(str2)):
  46. _set2.add(str2[i-1:i+1])
  47. _len = max(1,min(len(_set1),len(_set2)))
  48. return len(_set1&_set2)/_len
  49. def getDiffIndex(list_dict,key,confidence=100):
  50. _set = set()
  51. for _i in range(len(list_dict)):
  52. item = list_dict[_i]
  53. if item["confidence"]>=confidence:
  54. continue
  55. if key in item:
  56. if item[key]!='' and item[key] is not None:
  57. if re.search("^\d+(\.\d+)?$",item[key]) is not None:
  58. _set.add(str(float(item[key])))
  59. else:
  60. _set.add(str(item[key]))
  61. if len(_set)>1:
  62. return _i
  63. return len(list_dict)
  64. def transformSWF(bucket,attachment_hub_url,objectPath,localpath,swf_dir):
  65. swf_urls = []
  66. try:
  67. list_files = os.listdir(swf_dir)
  68. list_files.sort(key=lambda x:x)
  69. headers = dict()
  70. headers["x-oss-object-acl"] = oss2.OBJECT_ACL_PUBLIC_READ
  71. for _file in list_files:
  72. swf_localpath = "%s/%s"%(swf_dir,_file)
  73. swf_objectPath = "%s/%s"%(objectPath.split(".")[0],_file)
  74. uploadFileByPath(bucket,swf_localpath,swf_objectPath,headers)
  75. _url = "%s/%s"%(attachment_hub_url,swf_objectPath)
  76. swf_urls.append(_url)
  77. os.remove(swf_localpath)
  78. except Exception as e:
  79. traceback.print_exc()
  80. return swf_urls
  81. class Dataflow():
  82. def __init__(self):
  83. self.ots_client = getConnect_ots()
  84. self.queue_init = Queue()
  85. self.queue_attachment = Queue()
  86. self.queue_attachment_ocr = Queue()
  87. self.queue_attachment_not_ocr = Queue()
  88. self.list_attachment_ocr = []
  89. self.list_attachment_not_ocr = []
  90. self.queue_extract = Queue()
  91. self.list_extract = []
  92. self.queue_dumplicate = Queue()
  93. self.queue_merge = Queue()
  94. self.queue_syncho = Queue()
  95. self.queue_remove = Queue()
  96. self.attachment_rec_interface = ""
  97. self.ots_client = getConnect_ots()
  98. if is_internal:
  99. self.bucket_url = "http://oss-cn-hangzhou-internal.aliyuncs.com"
  100. else:
  101. self.bucket_url = "http://oss-cn-hangzhou.aliyuncs.com"
  102. if is_internal:
  103. self.extract_url = "http://1255640119316927.vpc.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/content_extract"
  104. self.industy_url = "http://1255640119316927.vpc.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/industry_extract"
  105. self.other_url = "http://1255640119316927.vpc.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/other_extract"
  106. else:
  107. self.extract_url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/content_extract"
  108. self.industy_url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/industry_extract"
  109. self.other_url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/other_extract"
  110. self.header = {'Content-Type': 'application/json',"Authorization":"NzZmOWZlMmU2MGY3YmQ4MDBjM2E5MDAyZjhjNjQ0MzZlMmE0NTMwZg=="}
  111. self.attachment_hub_url = "https://attachment-hub.oss-cn-hangzhou.aliyuncs.com/"
  112. self.auth = getAuth()
  113. oss2.defaults.connection_pool_size = 100
  114. oss2.defaults.multiget_num_threads = 20
  115. log("bucket_url:%s"%(self.bucket_url))
  116. self.attachment_bucket_name = "attachment-hub"
  117. self.bucket = oss2.Bucket(self.auth,self.bucket_url,self.attachment_bucket_name)
  118. self.current_path = os.path.dirname(__file__)
  119. def flow_init(self):
  120. def producer():
  121. bool_query = BoolQuery(must_queries=[RangeQuery("crtime",'2022-04-20')])
  122. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
  123. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
  124. ColumnsToGet(return_type=ColumnReturnType.ALL))
  125. log("flow_init producer total_count:%d"%total_count)
  126. list_dict = getRow_ots(rows)
  127. for _dict in list_dict:
  128. self.queue_init.put(_dict)
  129. _count = len(list_dict)
  130. while next_token:
  131. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
  132. SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  133. ColumnsToGet(return_type=ColumnReturnType.ALL))
  134. list_dict = getRow_ots(rows)
  135. for _dict in list_dict:
  136. self.queue_init.put(_dict)
  137. _count += len(list_dict)
  138. def comsumer():
  139. mt = MultiThreadHandler(self.queue_init,comsumer_handle,None,30,1,ots_client=self.ots_client)
  140. mt.run()
  141. def comsumer_handle(item,result_queue,ots_client):
  142. _dochtmlcon = item.get(document_tmp_dochtmlcon,"")
  143. if document_tmp_dochtmlcon in item:
  144. item.pop(document_tmp_dochtmlcon)
  145. if document_tmp_doctextcon in item:
  146. item.pop(document_tmp_doctextcon)
  147. if document_tmp_attachmenttextcon in item:
  148. item.pop(document_tmp_attachmenttextcon)
  149. _status = item.get(document_tmp_status)
  150. new_status = None
  151. if _status>=201 and _status<=300:
  152. item[document_tmp_save] = 1
  153. new_status = 81
  154. elif _status>=401 and _status<=450:
  155. item[document_tmp_save] = 0
  156. new_status = 81
  157. else:
  158. new_status = 1
  159. # new_status = 1
  160. item[document_tmp_status] = new_status
  161. dtmp = Document_tmp(item)
  162. dhtml = Document_html({document_tmp_partitionkey:item.get(document_tmp_partitionkey),
  163. document_tmp_docid:item.get(document_tmp_docid),
  164. document_tmp_dochtmlcon:_dochtmlcon})
  165. dtmp.update_row(ots_client)
  166. dhtml.update_row(ots_client)
  167. producer()
  168. comsumer()
  169. def getTitleFromHtml(self,filemd5,_html):
  170. _soup = BeautifulSoup(_html,"lxml")
  171. _find = _soup.find("a",attrs={"data":filemd5})
  172. _title = ""
  173. if _find is not None:
  174. _title = _find.get_text()
  175. return _title
  176. def getSourceLinkFromHtml(self,filemd5,_html):
  177. _soup = BeautifulSoup(_html,"lxml")
  178. _find = _soup.find("a",attrs={"filelink":filemd5})
  179. filelink = ""
  180. if _find is None:
  181. _find = _soup.find("img",attrs={"filelink":filemd5})
  182. if _find is not None:
  183. filelink = _find.attrs.get("src","")
  184. else:
  185. filelink = _find.attrs.get("href","")
  186. return filelink
    def request_attachment_interface(self,attach,_dochtmlcon):
        """Run one attachment through the recognition interface and persist the result.

        Downloads the attachment from OSS, sends its base64 content to the
        attach-deal interface, and on success writes the recognized html,
        plain text, swf page urls, title/link (scraped from ``_dochtmlcon``)
        and processing metadata back onto the attachment row.

        :param attach: attachment model object (read via getProperties, written via setValue/update_row)
        :param _dochtmlcon: html of the owning document, used to look up title/source link
        :return: True when processed, already-too-large, or the OSS object is
            missing; False when the interface call or the download fails;
            None (falls through) on any other exception after printing it.
        """
        filemd5 = attach.getProperties().get(attachment_filemd5)
        _status = attach.getProperties().get(attachment_status)
        _filetype = attach.getProperties().get(attachment_filetype)
        _size = attach.getProperties().get(attachment_size)
        _path = attach.getProperties().get(attachment_path)
        _uuid = uuid4()
        objectPath = attach.getProperties().get(attachment_path)
        # temp download target; removed again in the finally block
        localpath = os.path.join(self.current_path,"download",_uuid.hex)
        docids = attach.getProperties().get(attachment_docids)
        try:
            if _size>ATTACHMENT_LARGESIZE:
                # oversized attachments are only flagged, never recognized
                attach.setValue(attachment_status, ATTACHMENT_TOOLARGE)
                log("attachment :%s of path:%s to large"%(filemd5,_path))
                attach.update_row(self.ots_client)
                return True
            else:
                d_start_time = time.time()
                if downloadFile(self.bucket,objectPath,localpath):
                    time_download = time.time()-d_start_time
                    _data_base64 = base64.b64encode(open(localpath,"rb").read())
                    # call the interface to process the file content
                    start_time = time.time()
                    _success,_html,swf_images = getAttachDealInterface(_data_base64,_filetype)
                    if _success:
                        log("process filemd5:%s of type:%s with size:%.3fM download:%ds recognize takes %ds,ret_size:%d"%(filemd5,_filetype,round(_size/1024/1024,4),time_download,time.time()-start_time,len(_html)))
                    else:
                        # alert on failure via log and DingDing message
                        log("attach interface failed of docid:%s filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
                        sentMsgToDD("attach interface failed of docid:%s of filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
                        _html = ""
                        return False
                    # NOTE(review): eval on interface output — assumes the
                    # attach-deal service is trusted; confirm before exposing
                    # to untrusted data.
                    swf_images = eval(swf_images)
                    if attach.getProperties().get(attachment_filetype)=="swf" and len(swf_images)>0:
                        # render swf page images to disk, upload them, then
                        # store the resulting urls on the row (only when no
                        # urls were stored before)
                        swf_urls = json.loads(attach.getProperties().get(attachment_swfUrls,"[]"))
                        if len(swf_urls)==0:
                            objectPath = attach.getProperties().get(attachment_path,"")
                            localpath = os.path.join(self.current_path,"download/%s.swf"%(uuid4().hex))
                            swf_dir = os.path.join(self.current_path,"swf_images",uuid4().hex)
                            if not os.path.exists(swf_dir):
                                os.mkdir(swf_dir)
                            for _i in range(len(swf_images)):
                                _base = swf_images[_i]
                                _base = base64.b64decode(_base)
                                filename = "swf_page_%d.png"%(_i)
                                filepath = os.path.join(swf_dir,filename)
                                with open(filepath,"wb") as f:
                                    f.write(_base)
                            swf_urls = transformSWF(self.bucket,self.attachment_hub_url,objectPath,None,swf_dir)
                            if os.path.exists(swf_dir):
                                os.rmdir(swf_dir)
                        attach.setValue(attachment_swfUrls,json.dumps(swf_urls,ensure_ascii=False),True)
                    # crude table detection on the recognized html
                    if re.search("<td",_html) is not None:
                        attach.setValue(attachment_has_table,1,True)
                    _file_title = self.getTitleFromHtml(filemd5,_dochtmlcon)
                    filelink = self.getSourceLinkFromHtml(filemd5,_dochtmlcon)
                    if _file_title!="":
                        attach.setValue(attachment_file_title,_file_title,True)
                    if filelink!="":
                        attach.setValue(attachment_file_link,filelink,True)
                    attach.setValue(attachment_attachmenthtml,_html,True)
                    attach.setValue(attachment_attachmentcon,BeautifulSoup(_html,"lxml").get_text(),True)
                    attach.setValue(attachment_status,ATTACHMENT_PROCESSED,True)
                    attach.setValue(attachment_recsize,len(_html),True)
                    attach.setValue(attachment_process_time,getCurrent_date(format="%Y-%m-%d %H:%M:%S"),True)
                    attach.update_row(self.ots_client) # re-enable the row update in production
                    return True
                else:
                    return False
        except oss2.exceptions.NotFound as e:
            # object vanished from the bucket: treat as handled
            return True
        except Exception as e:
            traceback.print_exc()
        finally:
            # best-effort cleanup of the downloaded temp file
            try:
                os.remove(localpath)
            except:
                pass
  264. def rec_attachments_by_interface(self,list_attach,_dochtmlcon,save=True):
  265. list_html = []
  266. swf_urls = []
  267. for _attach in list_attach:
  268. #测试全跑
  269. if _attach.getProperties().get(attachment_status) in (ATTACHMENT_PROCESSED,ATTACHMENT_TOOLARGE):
  270. _html = _attach.getProperties().get(attachment_attachmenthtml,"")
  271. if _html is None:
  272. _html = ""
  273. list_html.append(_html)
  274. else:
  275. _succeed = self.request_attachment_interface(_attach,_dochtmlcon)
  276. if not _succeed:
  277. return False,"",[]
  278. _html = _attach.getProperties().get(attachment_attachmenthtml,"")
  279. if _html is None:
  280. _html = ""
  281. list_html.append(_html)
  282. if _attach.getProperties().get(attachment_filetype)=="swf":
  283. swf_urls.extend(json.loads(_attach.getProperties().get(attachment_swfUrls,"[]")))
  284. return True,list_html,swf_urls
  285. def generate_dumplicate_query(self,_dict,_dict_must_not,set_match=set(["project_code","project_codes","product"]),set_nested=set(["win_tenderer","bidding_budget","win_bid_price"]),
  286. set_term=set(["project_name","doctitle_refine","docchannel","tenderee","agency","web_source_no","fingerprint","save","docid"]),
  287. set_range=set(["page_time","status"]),set_phrase=set(["doctitle"])):
  288. list_must_queries = []
  289. list_must_no_queries = []
  290. for k,v in _dict.items():
  291. if k in set_match:
  292. if isinstance(v,str):
  293. l_s = []
  294. for s_v in v.split(","):
  295. l_s.append(MatchQuery(k,s_v))
  296. list_must_queries.append(BoolQuery(should_queries=l_s))
  297. elif k in set_nested:
  298. _v = v
  299. if k!="" and k=="bidding_budget" or k=="win_bid_price":
  300. _v = float(_v)
  301. list_must_queries.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.%s"%k,_v)))
  302. elif k in set_term:
  303. list_must_queries.append(TermQuery(k,v))
  304. elif k in set_phrase:
  305. list_must_queries.append(MatchPhraseQuery(k,v))
  306. elif k in set_range:
  307. if len(v)==1:
  308. list_must_queries.append(RangeQuery(k,v[0]))
  309. elif len(v)==2:
  310. list_must_queries.append(RangeQuery(k,v[0],v[1],True,True))
  311. for k,v in _dict_must_not.items():
  312. if k in set_match:
  313. if isinstance(v,str):
  314. l_s = []
  315. for s_v in v.split(","):
  316. l_s.append(MatchQuery(k,s_v))
  317. list_must_no_queries.append(BoolQuery(should_queries=l_s))
  318. elif k in set_nested:
  319. _v = v
  320. if k!="" and k=="bidding_budget" or k=="win_bid_price":
  321. _v = float(_v)
  322. list_must_no_queries.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.%s"%k,_v)))
  323. elif k in set_term:
  324. list_must_no_queries.append(TermQuery(k,v))
  325. elif k in set_range:
  326. if len(v)==1:
  327. list_must_no_queries.append(RangeQuery(k,v[0]))
  328. elif len(v)==2:
  329. list_must_no_queries.append(RangeQuery(k,v[0],v[1],True,True))
  330. return BoolQuery(must_queries=list_must_queries,must_not_queries=list_must_no_queries)
  331. def f_decode_sub_docs_json(self, project_code,project_name,tenderee,agency,sub_docs_json):
  332. columns = {"win_tenderer":"","bidding_budget":"","win_bid_price":""}
  333. extract_count = 0
  334. if project_code is not None and project_code!="":
  335. extract_count += 1
  336. if project_name is not None and project_name!="":
  337. extract_count += 1
  338. if tenderee is not None and tenderee!="":
  339. extract_count += 1
  340. if agency is not None and agency!="":
  341. extract_count += 1
  342. if sub_docs_json is not None:
  343. sub_docs = json.loads(sub_docs_json)
  344. sub_docs.sort(key=lambda x:x.get("bidding_budget",0),reverse=True)
  345. sub_docs.sort(key=lambda x:x.get("win_bid_price",0),reverse=True)
  346. # log("==%s"%(str(sub_docs)))
  347. for sub_docs in sub_docs:
  348. for _key_sub_docs in sub_docs.keys():
  349. extract_count += 1
  350. if _key_sub_docs in columns:
  351. if columns[_key_sub_docs]=="" and str(sub_docs[_key_sub_docs]) not in ["","0"]:
  352. if _key_sub_docs in ["bidding_budget","win_bid_price"]:
  353. if float(sub_docs[_key_sub_docs])>0:
  354. columns[_key_sub_docs] = str(float(sub_docs[_key_sub_docs]))
  355. else:
  356. columns[_key_sub_docs] = str(sub_docs[_key_sub_docs])
  357. return columns["win_tenderer"],columns["bidding_budget"],columns["win_bid_price"],extract_count
  358. def post_extract(self,_dict):
  359. win_tenderer,bidding_budget,win_bid_price,extract_count = self.f_decode_sub_docs_json(_dict.get(document_tmp_project_code),_dict.get(document_tmp_project_name),_dict.get(document_tmp_tenderee),_dict.get(document_tmp_agency),_dict.get(document_tmp_sub_docs_json))
  360. _dict["win_tenderer"] = win_tenderer
  361. _dict["bidding_budget"] = bidding_budget
  362. _dict["win_bid_price"] = win_bid_price
  363. if "extract_count" not in _dict:
  364. _dict["extract_count"] = extract_count
  365. def get_dump_columns(self,_dict):
  366. docchannel = _dict.get(document_tmp_docchannel,0)
  367. project_code = _dict.get(document_tmp_project_code,"")
  368. project_name = _dict.get(document_tmp_project_name,"")
  369. tenderee = _dict.get(document_tmp_tenderee,"")
  370. agency = _dict.get(document_tmp_agency,"")
  371. doctitle_refine = _dict.get(document_tmp_doctitle_refine,"")
  372. win_tenderer = _dict.get("win_tenderer","")
  373. bidding_budget = _dict.get("bidding_budget","")
  374. if bidding_budget==0:
  375. bidding_budget = ""
  376. win_bid_price = _dict.get("win_bid_price","")
  377. if win_bid_price==0:
  378. win_bid_price = ""
  379. page_time = _dict.get(document_tmp_page_time,"")
  380. fingerprint = _dict.get(document_tmp_fingerprint,"")
  381. product = _dict.get(document_tmp_product,"")
  382. return docchannel,project_code,project_name,tenderee,agency,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product
    def f_set_docid_limitNum_contain(self,item, _split,singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"]):
        """Validate a candidate duplicate group (*_split*) against column constraints.

        Returns *_split* unchanged when every constraint holds, otherwise [].
          - singleNum_keys: each key may have at most one distinct value in the group
          - multiNum_keys: each key must have more than one distinct value
          - notlike_keys: no member may carry a value that is similar-but-different
            (0.7 < similarity < 1) to item's project_code
          - contain_keys: all non-empty values of a key must form a containment
            chain (each value is a substring of the longest one)

        NOTE(review): the mutable default arguments are never mutated here, so
        sharing them across calls is harmless — but worth confirming upstream.
        """
        flag = True
        # at most one distinct value allowed per single-valued key
        for _key in singleNum_keys:
            if len(getSet(_split,_key))>1:
                flag = False
                break
        # multi-valued keys must actually show multiple distinct values
        for _key in multiNum_keys:
            if len(getSet(_split,_key))<=1:
                flag = False
                break
        project_code = item.get("project_code","")
        # reject near-miss codes: similar (>0.7) yet not identical (<1)
        for _key in notlike_keys:
            if not flag:
                break
            for _d in _split:
                _key_v = _d.get(_key,"")
                _sim = getSimilarityOfString(project_code,_key_v)
                if _sim>0.7 and _sim<1:
                    flag = False
                    break
        # check whether every notice in the group is contained by the longest value
        if flag:
            if len(contain_keys)>0:
                for _key in contain_keys:
                    MAX_CONTAIN_COLUMN = None
                    for _d in _split:
                        contain_column = _d.get(_key)
                        if contain_column is not None and contain_column !="":
                            if MAX_CONTAIN_COLUMN is None:
                                MAX_CONTAIN_COLUMN = contain_column
                            else:
                                if len(MAX_CONTAIN_COLUMN)<len(contain_column):
                                    # a longer value must contain the current maximum
                                    if contain_column.find(MAX_CONTAIN_COLUMN)==-1:
                                        flag = False
                                        break
                                    MAX_CONTAIN_COLUMN = contain_column
                                else:
                                    # a shorter value must be inside the current maximum
                                    if MAX_CONTAIN_COLUMN.find(contain_column)==-1:
                                        flag = False
                                        break
        if flag:
            return _split
        return []
  426. def search_data_by_query(self,item,_query,confidence,table_name="document_tmp",table_index="document_tmp_index",sort_column="docid",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count]):
  427. list_data = []
  428. if isinstance(_query,list):
  429. bool_query = BoolQuery(should_queries=_query)
  430. else:
  431. bool_query = _query
  432. rows,next_token,total_count,is_all_succeed = self.ots_client.search(table_name,table_index,
  433. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(sort_column)]),limit=50,get_total_count=True),
  434. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  435. list_dict = getRow_ots(rows)
  436. for _dict in list_dict:
  437. self.post_extract(_dict)
  438. _dict["confidence"] = confidence
  439. list_data.append(_dict)
  440. # _count = len(list_dict)
  441. # while next_token:
  442. # rows,next_token,total_count,is_all_succeed = self.ots_client.search(table_name,table_index,
  443. # SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  444. # ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  445. # list_dict = getRow_ots(rows)
  446. # for _dict in list_dict:
  447. # self.post_extract(_dict)
  448. # _dict["confidence"] = confidence
  449. # list_data.append(_dict)
  450. list_dict = self.f_set_docid_limitNum_contain(item,list_dict,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,notlike_keys=notlike_keys)
  451. return list_dict
  452. def add_data_by_query(self,item,base_list,set_docid,_query,confidence,singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_save,document_tmp_status,document_tmp_product,document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count]):
  453. list_dict = self.search_data_by_query(item,_query,confidence,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,notlike_keys=notlike_keys,columns=columns)
  454. for _dict in list_dict:
  455. self.post_extract(_dict)
  456. _docid = _dict.get(document_tmp_docid)
  457. if _docid not in set_docid:
  458. base_list.append(_dict)
  459. set_docid.add(_docid)
  460. def translate_dumplicate_rules(self,status_from,item):
  461. docchannel,project_code,project_name,tenderee,agency,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product = self.get_dump_columns(item)
  462. if page_time=='':
  463. page_time = getCurrent_date("%Y-%m-%d")
  464. base_dict = {
  465. "status":[status_from[0]],
  466. "page_time":[timeAdd(page_time,-2),timeAdd(page_time,2)]
  467. }
  468. must_not_dict = {"save":0}
  469. list_rules = []
  470. singleNum_keys = ["tenderee","win_tenderer"]
  471. if fingerprint!="":
  472. _dict = {}
  473. confidence = 100
  474. _dict[document_tmp_fingerprint] = fingerprint
  475. _dict.update(base_dict)
  476. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  477. _rule = {"confidence":confidence,
  478. "item":item,
  479. "query":_query,
  480. "singleNum_keys":[],
  481. "contain_keys":[],
  482. "multiNum_keys":[]}
  483. list_rules.append(_rule)
  484. if docchannel in (52,118):
  485. if bidding_budget!="" and tenderee!="" and project_code!="":
  486. confidence = 90
  487. _dict = {document_tmp_docchannel:docchannel,
  488. "bidding_budget":item.get("bidding_budget"),
  489. document_tmp_tenderee:item.get(document_tmp_tenderee,""),
  490. document_tmp_project_code:item.get(document_tmp_project_code,"")
  491. }
  492. _dict.update(base_dict)
  493. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  494. _rule = {"confidence":confidence,
  495. "query":_query,
  496. "singleNum_keys":singleNum_keys,
  497. "contain_keys":[],
  498. "multiNum_keys":[document_tmp_web_source_no]}
  499. list_rules.append(_rule)
  500. if doctitle_refine!="" and tenderee!="" and bidding_budget!="":
  501. confidence = 80
  502. _dict = {document_tmp_docchannel:docchannel,
  503. "doctitle_refine":doctitle_refine,
  504. "tenderee":tenderee,
  505. bidding_budget:"bidding_budget"
  506. }
  507. _dict.update(base_dict)
  508. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  509. _rule = {"confidence":confidence,
  510. "query":_query,
  511. "singleNum_keys":singleNum_keys,
  512. "contain_keys":[],
  513. "multiNum_keys":[document_tmp_web_source_no]}
  514. list_rules.append(_rule)
  515. if project_code!="" and doctitle_refine!="" and agency!="" and bidding_budget!="":
  516. confidence = 90
  517. _dict = {document_tmp_docchannel:docchannel,
  518. "project_code":project_code,
  519. "doctitle_refine":doctitle_refine,
  520. "agency":agency,
  521. "bidding_budget":bidding_budget
  522. }
  523. _dict.update(base_dict)
  524. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  525. _rule = {"confidence":confidence,
  526. "query":_query,
  527. "singleNum_keys":singleNum_keys,
  528. "contain_keys":[],
  529. "multiNum_keys":[document_tmp_web_source_no]}
  530. list_rules.append(_rule)
  531. if project_code!="" and tenderee!="" and bidding_budget!="":
  532. confidence = 91
  533. _dict = {document_tmp_docchannel:docchannel,
  534. "project_code":project_code,
  535. "tenderee":tenderee,
  536. "bidding_budget":bidding_budget
  537. }
  538. _dict.update(base_dict)
  539. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  540. _rule = {"confidence":confidence,
  541. "query":_query,
  542. "singleNum_keys":singleNum_keys,
  543. "contain_keys":[],
  544. "multiNum_keys":[document_tmp_web_source_no]}
  545. list_rules.append(_rule)
  546. if doctitle_refine!="" and agency!="" and bidding_budget!="":
  547. confidence = 71
  548. _dict = {document_tmp_docchannel:docchannel,
  549. "doctitle_refine":doctitle_refine,
  550. "agency":agency,
  551. "bidding_budget":bidding_budget
  552. }
  553. _dict.update(base_dict)
  554. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  555. _rule = {"confidence":confidence,
  556. "query":_query,
  557. "singleNum_keys":singleNum_keys,
  558. "contain_keys":[],
  559. "multiNum_keys":[document_tmp_web_source_no]}
  560. list_rules.append(_rule)
  561. if project_code!="" and project_name!="" and agency!="" and bidding_budget!="":
  562. confidence = 91
  563. _dict = {document_tmp_docchannel:docchannel,
  564. "project_code":project_code,
  565. "project_name":project_name,
  566. "agency":agency,
  567. "bidding_budget":bidding_budget
  568. }
  569. _dict.update(base_dict)
  570. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  571. n_singleKeys = [i for i in singleNum_keys]
  572. n_singleKeys.append(document_tmp_web_source_no)
  573. _rule = {"confidence":confidence,
  574. "query":_query,
  575. "singleNum_keys":n_singleKeys,
  576. "contain_keys":[],
  577. "multiNum_keys":[]}
  578. list_rules.append(_rule)
  579. ##-- 5. 招标公告 - 同项目编号- 同[项目名称、标题] - 同[招标人、代理公司] - 同预算(!=0) - 同信息源=1
  580. if project_code!="" and project_name!="" and tenderee!="" and bidding_budget!="":
  581. confidence = 91
  582. _dict = {document_tmp_docchannel:docchannel,
  583. "project_code":project_code,
  584. "project_name":project_name,
  585. "tenderee":tenderee,
  586. "bidding_budget":bidding_budget
  587. }
  588. _dict.update(base_dict)
  589. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  590. n_singleKeys = [i for i in singleNum_keys]
  591. n_singleKeys.append(document_tmp_web_source_no)
  592. _rule = {"confidence":confidence,
  593. "query":_query,
  594. "singleNum_keys":n_singleKeys,
  595. "contain_keys":[],
  596. "multiNum_keys":[]}
  597. list_rules.append(_rule)
  598. if project_code!="" and doctitle_refine!="" and tenderee!="" and bidding_budget!="":
  599. confidence = 71
  600. _dict = {document_tmp_docchannel:docchannel,
  601. "project_code":project_code,
  602. "doctitle_refine":doctitle_refine,
  603. "tenderee":tenderee,
  604. "bidding_budget":bidding_budget
  605. }
  606. _dict.update(base_dict)
  607. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  608. _rule = {"confidence":confidence,
  609. "query":_query,
  610. "singleNum_keys":singleNum_keys,
  611. "contain_keys":[],
  612. "multiNum_keys":[document_tmp_web_source_no]}
  613. list_rules.append(_rule)
  614. #-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1
  615. if project_name!="" and agency!="":
  616. tmp_bidding = 0
  617. if bidding_budget!="":
  618. tmp_bidding = bidding_budget
  619. confidence = 51
  620. _dict = {document_tmp_docchannel:docchannel,
  621. "project_name":project_name,
  622. "agency":agency,
  623. "bidding_budget":tmp_bidding
  624. }
  625. _dict.update(base_dict)
  626. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  627. _rule = {"confidence":confidence,
  628. "query":_query,
  629. "singleNum_keys":singleNum_keys,
  630. "contain_keys":[],
  631. "multiNum_keys":[document_tmp_web_source_no]}
  632. list_rules.append(_rule)
  633. #-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1
  634. if project_code!="" and agency!="":
  635. tmp_bidding = 0
  636. if bidding_budget!="":
  637. tmp_bidding = bidding_budget
  638. confidence = 51
  639. _dict = {document_tmp_docchannel:docchannel,
  640. "project_code":project_code,
  641. "agency":agency,
  642. "bidding_budget":tmp_bidding
  643. }
  644. _dict.update(base_dict)
  645. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  646. _rule = {"confidence":confidence,
  647. "query":_query,
  648. "singleNum_keys":singleNum_keys,
  649. "contain_keys":[],
  650. "multiNum_keys":[document_tmp_web_source_no]}
  651. list_rules.append(_rule)
  652. if docchannel not in (101,119,120):
  653. #-- 7. 非中标公告 - 同项目名称 - 同发布日期 - 同招标人 - 同预算 - 同类型 - 信息源>1 - 同项目编号
  654. if project_name!="" and tenderee!="" and project_code!="":
  655. tmp_bidding = 0
  656. if bidding_budget!="":
  657. tmp_bidding = bidding_budget
  658. confidence = 51
  659. _dict = {document_tmp_docchannel:docchannel,
  660. "project_name":project_name,
  661. "tenderee":tenderee,
  662. "project_code":project_code
  663. }
  664. _dict.update(base_dict)
  665. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  666. _rule = {"confidence":confidence,
  667. "query":_query,
  668. "singleNum_keys":singleNum_keys,
  669. "contain_keys":[],
  670. "multiNum_keys":[document_tmp_web_source_no]}
  671. list_rules.append(_rule)
  672. if docchannel in (101,119,120):
  673. #-- 3. 中标公告 - 同项目编号- 同[项目名称、标题] - 同中标人 - 同中标价(==0)
  674. if project_code!="" and project_name!="" and win_tenderer!="":
  675. tmp_win = 0
  676. if win_bid_price!="":
  677. tmp_win = win_bid_price
  678. confidence = 61
  679. _dict = {document_tmp_docchannel:docchannel,
  680. "project_code":project_code,
  681. "project_name":project_name,
  682. "win_tenderer":win_tenderer,
  683. "win_bid_price":tmp_win
  684. }
  685. _dict.update(base_dict)
  686. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  687. _rule = {"confidence":confidence,
  688. "query":_query,
  689. "singleNum_keys":singleNum_keys,
  690. "contain_keys":[],
  691. "multiNum_keys":[]}
  692. list_rules.append(_rule)
  693. if project_code!="" and project_name!="" and bidding_budget!="" and product!="":
  694. confidence = 72
  695. _dict = {document_tmp_docchannel:docchannel,
  696. "project_code":project_code,
  697. "project_name":project_name,
  698. "bidding_budget":bidding_budget,
  699. "product":product
  700. }
  701. _dict.update(base_dict)
  702. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  703. n_singleKeys = [i for i in singleNum_keys]
  704. n_singleKeys.append(document_tmp_web_source_no)
  705. _rule = {"confidence":confidence,
  706. "query":_query,
  707. "singleNum_keys":n_singleKeys,
  708. "contain_keys":[],
  709. "multiNum_keys":[]}
  710. list_rules.append(_rule)
  711. if project_code!='' and doctitle_refine!="" and win_tenderer!="" and win_bid_price!="":
  712. confidence = 91
  713. _dict = {document_tmp_docchannel:docchannel,
  714. "project_code":project_code,
  715. "doctitle_refine":doctitle_refine,
  716. "win_tenderer":win_tenderer,
  717. "win_bid_price":win_bid_price
  718. }
  719. _dict.update(base_dict)
  720. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  721. n_singleKeys = [i for i in singleNum_keys]
  722. n_singleKeys.append(document_tmp_web_source_no)
  723. _rule = {"confidence":confidence,
  724. "query":_query,
  725. "singleNum_keys":n_singleKeys,
  726. "contain_keys":[],
  727. "multiNum_keys":[]}
  728. list_rules.append(_rule)
  729. ##-- 2. 中标公告 - 同项目编号- 同[项目名称、标题] - 同中标人 - 同中标价(!=0) - 同信息源=1
  730. if project_code!="" and project_name!="" and win_tenderer!="" and win_bid_price!="":
  731. confidence = 91
  732. _dict = {document_tmp_docchannel:docchannel,
  733. "project_code":project_code,
  734. "project_name":project_name,
  735. "win_tenderer":win_tenderer,
  736. "win_bid_price":win_bid_price
  737. }
  738. _dict.update(base_dict)
  739. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  740. n_singleKeys = [i for i in singleNum_keys]
  741. n_singleKeys.append(document_tmp_web_source_no)
  742. _rule = {"confidence":confidence,
  743. "query":_query,
  744. "singleNum_keys":n_singleKeys,
  745. "contain_keys":[],
  746. "multiNum_keys":[]}
  747. list_rules.append(_rule)
  748. if project_name!="" and win_tenderer!="" and win_bid_price!="":
  749. confidence = 91
  750. _dict = {document_tmp_docchannel:docchannel,
  751. "project_name":project_name,
  752. "win_tenderer":win_tenderer,
  753. "win_bid_price":win_bid_price,
  754. }
  755. _dict.update(base_dict)
  756. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  757. _rule = {"confidence":confidence,
  758. "query":_query,
  759. "singleNum_keys":singleNum_keys,
  760. "contain_keys":[],
  761. "multiNum_keys":[document_tmp_web_source_no]}
  762. list_rules.append(_rule)
  763. if project_code!="" and win_tenderer!="" and win_bid_price!="":
  764. confidence = 91
  765. _dict = {document_tmp_docchannel:docchannel,
  766. "project_code":project_code,
  767. "win_tenderer":win_tenderer,
  768. "win_bid_price":win_bid_price,
  769. }
  770. _dict.update(base_dict)
  771. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  772. _rule = {"confidence":confidence,
  773. "query":_query,
  774. "singleNum_keys":singleNum_keys,
  775. "contain_keys":[],
  776. "multiNum_keys":[document_tmp_web_source_no]}
  777. list_rules.append(_rule)
  778. if project_code!="" and doctitle_refine!="" and win_tenderer!="" and win_bid_price!="":
  779. confidence = 91
  780. _dict = {document_tmp_docchannel:docchannel,
  781. "project_code":project_code,
  782. "doctitle_refine":doctitle_refine,
  783. "win_tenderer":win_tenderer,
  784. "win_bid_price":win_bid_price
  785. }
  786. _dict.update(base_dict)
  787. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  788. n_singleKeys = [i for i in singleNum_keys]
  789. n_singleKeys.append(document_tmp_web_source_no)
  790. _rule = {"confidence":confidence,
  791. "query":_query,
  792. "singleNum_keys":n_singleKeys,
  793. "contain_keys":[],
  794. "multiNum_keys":[]}
  795. list_rules.append(_rule)
  796. if doctitle_refine!="" and win_tenderer!="" and win_bid_price!="":
  797. confidence=90
  798. _dict = {document_tmp_docchannel:docchannel,
  799. "doctitle_refine":doctitle_refine,
  800. "win_tenderer":win_tenderer,
  801. "win_bid_price":win_bid_price
  802. }
  803. _dict.update(base_dict)
  804. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  805. _rule = {"confidence":confidence,
  806. "query":_query,
  807. "singleNum_keys":singleNum_keys,
  808. "contain_keys":[],
  809. "multiNum_keys":[document_tmp_web_source_no]}
  810. list_rules.append(_rule)
  811. if project_name!="" and win_tenderer!="" and win_bid_price!="" and project_code!="":
  812. confidence=95
  813. _dict = {document_tmp_docchannel:docchannel,
  814. "project_name":project_name,
  815. "win_tenderer":win_tenderer,
  816. "win_bid_price":win_bid_price,
  817. "project_code":project_code
  818. }
  819. _dict.update(base_dict)
  820. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  821. _rule = {"confidence":confidence,
  822. "query":_query,
  823. "singleNum_keys":singleNum_keys,
  824. "contain_keys":[],
  825. "multiNum_keys":[document_tmp_web_source_no]}
  826. list_rules.append(_rule)
  827. if docchannel in (51,103,115,116):
  828. #9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源
  829. if doctitle_refine!="" and tenderee!="":
  830. tmp_budget = 0
  831. if bidding_budget!="":
  832. tmp_budget = bidding_budget
  833. confidence=81
  834. _dict = {document_tmp_docchannel:docchannel,
  835. "doctitle_refine":doctitle_refine,
  836. "tenderee":tenderee,
  837. "bidding_budget":tmp_budget,
  838. }
  839. _dict.update(base_dict)
  840. _dict["page_time"] = [page_time,page_time]
  841. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  842. _rule = {"confidence":confidence,
  843. "query":_query,
  844. "singleNum_keys":singleNum_keys,
  845. "contain_keys":[],
  846. "multiNum_keys":[document_tmp_web_source_no]}
  847. list_rules.append(_rule)
  848. #-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源
  849. if project_code!="" and tenderee!="":
  850. confidence=81
  851. tmp_budget = 0
  852. if bidding_budget!="":
  853. tmp_budget = bidding_budget
  854. _dict = {document_tmp_docchannel:docchannel,
  855. "project_code":project_code,
  856. "tenderee":tenderee,
  857. "bidding_budget":tmp_budget,
  858. }
  859. _dict.update(base_dict)
  860. _dict["page_time"] = [page_time,page_time]
  861. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  862. _rule = {"confidence":confidence,
  863. "query":_query,
  864. "singleNum_keys":singleNum_keys,
  865. "contain_keys":[],
  866. "multiNum_keys":[document_tmp_web_source_no]}
  867. list_rules.append(_rule)
  868. if project_name!="" and tenderee!="":
  869. confidence=81
  870. tmp_budget = 0
  871. if bidding_budget!="":
  872. tmp_budget = bidding_budget
  873. _dict = {document_tmp_docchannel:docchannel,
  874. "project_name":project_name,
  875. "tenderee":tenderee,
  876. "bidding_budget":tmp_budget,
  877. }
  878. _dict.update(base_dict)
  879. _dict["page_time"] = [page_time,page_time]
  880. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  881. _rule = {"confidence":confidence,
  882. "query":_query,
  883. "singleNum_keys":singleNum_keys,
  884. "contain_keys":[],
  885. "multiNum_keys":[document_tmp_web_source_no]}
  886. list_rules.append(_rule)
  887. if agency!="" and tenderee!="":
  888. confidence=81
  889. tmp_budget = 0
  890. if bidding_budget!="":
  891. tmp_budget = bidding_budget
  892. _dict = {document_tmp_docchannel:docchannel,
  893. "agency":agency,
  894. "tenderee":tenderee,
  895. "bidding_budget":tmp_budget,
  896. "product":product
  897. }
  898. _dict.update(base_dict)
  899. _dict["page_time"] = [page_time,page_time]
  900. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  901. _rule = {"confidence":confidence,
  902. "query":_query,
  903. "singleNum_keys":singleNum_keys,
  904. "contain_keys":[],
  905. "multiNum_keys":[document_tmp_web_source_no]}
  906. list_rules.append(_rule)
  907. if agency!="" and project_code!="":
  908. confidence=81
  909. tmp_budget = 0
  910. if bidding_budget!="":
  911. tmp_budget = bidding_budget
  912. _dict = {document_tmp_docchannel:docchannel,
  913. "agency":agency,
  914. "project_code":project_code,
  915. "bidding_budget":tmp_budget,
  916. "product":product
  917. }
  918. _dict.update(base_dict)
  919. _dict["page_time"] = [page_time,page_time]
  920. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  921. _rule = {"confidence":confidence,
  922. "query":_query,
  923. "singleNum_keys":singleNum_keys,
  924. "contain_keys":[],
  925. "multiNum_keys":[document_tmp_web_source_no]}
  926. list_rules.append(_rule)
  927. if agency!="" and project_name!="":
  928. confidence=81
  929. tmp_budget = 0
  930. if bidding_budget!="":
  931. tmp_budget = bidding_budget
  932. _dict = {document_tmp_docchannel:docchannel,
  933. "agency":agency,
  934. "project_name":project_name,
  935. "bidding_budget":tmp_budget,
  936. "product":product
  937. }
  938. _dict.update(base_dict)
  939. _dict["page_time"] = [page_time,page_time]
  940. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  941. _rule = {"confidence":confidence,
  942. "query":_query,
  943. "singleNum_keys":singleNum_keys,
  944. "contain_keys":[],
  945. "multiNum_keys":[document_tmp_web_source_no]}
  946. list_rules.append(_rule)
  947. #五选二
  948. if tenderee!="" and bidding_budget!="" and product!="":
  949. confidence=80
  950. _dict = {document_tmp_docchannel:docchannel,
  951. "tenderee":tenderee,
  952. "bidding_budget":bidding_budget,
  953. "product":product,
  954. }
  955. _dict.update(base_dict)
  956. _dict["page_time"] = [page_time,page_time]
  957. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  958. _rule = {"confidence":confidence,
  959. "query":_query,
  960. "singleNum_keys":singleNum_keys,
  961. "contain_keys":[],
  962. "multiNum_keys":[]}
  963. list_rules.append(_rule)
  964. if tenderee!="" and win_tenderer!="" and product!="":
  965. confidence=80
  966. _dict = {document_tmp_docchannel:docchannel,
  967. "tenderee":tenderee,
  968. "win_tenderer":win_tenderer,
  969. "product":product,
  970. }
  971. _dict.update(base_dict)
  972. _dict["page_time"] = [page_time,page_time]
  973. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  974. _rule = {"confidence":confidence,
  975. "query":_query,
  976. "singleNum_keys":singleNum_keys,
  977. "contain_keys":[],
  978. "multiNum_keys":[]}
  979. list_rules.append(_rule)
  980. if tenderee!="" and win_bid_price!="":
  981. confidence=80
  982. _dict = {document_tmp_docchannel:docchannel,
  983. "tenderee":tenderee,
  984. "win_bid_price":win_bid_price,
  985. "product":product,
  986. }
  987. _dict.update(base_dict)
  988. _dict["page_time"] = [page_time,page_time]
  989. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  990. _rule = {"confidence":confidence,
  991. "query":_query,
  992. "singleNum_keys":singleNum_keys,
  993. "contain_keys":[],
  994. "multiNum_keys":[]}
  995. list_rules.append(_rule)
  996. if tenderee!="" and agency!="":
  997. confidence=80
  998. _dict = {document_tmp_docchannel:docchannel,
  999. "tenderee":tenderee,
  1000. "agency":agency,
  1001. "product":product,
  1002. }
  1003. _dict.update(base_dict)
  1004. _dict["page_time"] = [page_time,page_time]
  1005. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1006. _rule = {"confidence":confidence,
  1007. "query":_query,
  1008. "singleNum_keys":singleNum_keys,
  1009. "contain_keys":[],
  1010. "multiNum_keys":[]}
  1011. list_rules.append(_rule)
  1012. if win_tenderer!="" and bidding_budget!="":
  1013. confidence=80
  1014. _dict = {document_tmp_docchannel:docchannel,
  1015. "win_tenderer":win_tenderer,
  1016. "bidding_budget":bidding_budget,
  1017. "product":product,
  1018. }
  1019. _dict.update(base_dict)
  1020. _dict["page_time"] = [page_time,page_time]
  1021. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1022. _rule = {"confidence":confidence,
  1023. "query":_query,
  1024. "singleNum_keys":singleNum_keys,
  1025. "contain_keys":[],
  1026. "multiNum_keys":[]}
  1027. list_rules.append(_rule)
  1028. if win_bid_price!="" and bidding_budget!="":
  1029. confidence=80
  1030. _dict = {document_tmp_docchannel:docchannel,
  1031. "win_bid_price":win_bid_price,
  1032. "bidding_budget":bidding_budget,
  1033. "product":product,
  1034. }
  1035. _dict.update(base_dict)
  1036. _dict["page_time"] = [page_time,page_time]
  1037. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1038. _rule = {"confidence":confidence,
  1039. "query":_query,
  1040. "singleNum_keys":singleNum_keys,
  1041. "contain_keys":[],
  1042. "multiNum_keys":[]}
  1043. list_rules.append(_rule)
  1044. if agency!="" and bidding_budget!="":
  1045. confidence=80
  1046. _dict = {document_tmp_docchannel:docchannel,
  1047. "agency":agency,
  1048. "bidding_budget":bidding_budget,
  1049. "product":product,
  1050. }
  1051. _dict.update(base_dict)
  1052. _dict["page_time"] = [page_time,page_time]
  1053. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1054. _rule = {"confidence":confidence,
  1055. "query":_query,
  1056. "singleNum_keys":singleNum_keys,
  1057. "contain_keys":[],
  1058. "multiNum_keys":[]}
  1059. list_rules.append(_rule)
  1060. if win_tenderer!="" and win_bid_price!="":
  1061. confidence=80
  1062. _dict = {document_tmp_docchannel:docchannel,
  1063. "win_tenderer":win_tenderer,
  1064. "win_bid_price":win_bid_price,
  1065. "product":product,
  1066. }
  1067. _dict.update(base_dict)
  1068. _dict["page_time"] = [page_time,page_time]
  1069. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1070. _rule = {"confidence":confidence,
  1071. "query":_query,
  1072. "singleNum_keys":singleNum_keys,
  1073. "contain_keys":[],
  1074. "multiNum_keys":[]}
  1075. list_rules.append(_rule)
  1076. if win_tenderer!="" and agency!="":
  1077. confidence=80
  1078. _dict = {document_tmp_docchannel:docchannel,
  1079. "win_tenderer":win_tenderer,
  1080. "agency":agency,
  1081. "product":product,
  1082. }
  1083. _dict.update(base_dict)
  1084. _dict["page_time"] = [page_time,page_time]
  1085. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1086. _rule = {"confidence":confidence,
  1087. "query":_query,
  1088. "singleNum_keys":singleNum_keys,
  1089. "contain_keys":[],
  1090. "multiNum_keys":[]}
  1091. list_rules.append(_rule)
  1092. if doctitle_refine!="" and product!="" and len(doctitle_refine)>7:
  1093. confidence=80
  1094. _dict = {document_tmp_docchannel:docchannel,
  1095. "doctitle_refine":doctitle_refine,
  1096. "product":product,
  1097. }
  1098. _dict.update(base_dict)
  1099. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  1100. _rule = {"confidence":confidence,
  1101. "query":_query,
  1102. "singleNum_keys":singleNum_keys,
  1103. "contain_keys":[],
  1104. "multiNum_keys":[]}
  1105. list_rules.append(_rule)
  1106. return list_rules
    def dumplicate_fianl_check(self,base_list):
        """Final consistency check over the merged candidate group.

        Sorts the group by confidence (descending, in place) and, per key,
        finds the first index where that key's value diverges (getDiffIndex).
        The smallest such index is the length of the consistent prefix; it is
        returned as the accepted duplicate set, or [] when the prefix is
        trivial (<=1 row).

        NOTE(review): the _k=="doctitle" branch below can never fire — the key
        lists only ever contain "doctitle_refine"; possibly it was meant to be
        "doctitle_refine" with the confidence=30 relaxation.  Left untouched.
        """
        the_group = base_list
        the_group.sort(key=lambda x:x["confidence"],reverse=True)
        # larger groups additionally require a consistent refined title
        if len(the_group)>10:
            keys = ["tenderee","win_tenderer","win_bid_price","bidding_budget","doctitle_refine"]
        else:
            keys = ["tenderee","win_tenderer","win_bid_price","bidding_budget"]
        # confidence-based divergence index per key
        list_key_index = []
        for _k in keys:
            if _k=="doctitle":
                list_key_index.append(getDiffIndex(the_group,_k,confidence=30))
            else:
                list_key_index.append(getDiffIndex(the_group,_k))
        _index = min(list_key_index)
        if _index>1:
            return the_group[:_index]
        return []
  1125. def get_best_docid(self,base_list):
  1126. if len(base_list)>0:
  1127. base_list.sort(key=lambda x:x["docid"])
  1128. base_list.sort(key=lambda x:x["extract_count"],reverse=True)
  1129. return base_list[0]["docid"]
  1130. def save_dumplicate(self,base_list,best_docid,status_from,status_to):
  1131. #best_docid need check while others can save directly
  1132. list_dict = []
  1133. for item in base_list:
  1134. docid = item["docid"]
  1135. _dict = {"partitionkey":item["partitionkey"],
  1136. "docid":item["docid"]}
  1137. if docid==best_docid:
  1138. if item.get("save",1)!=0:
  1139. _dict["save"] = 1
  1140. else:
  1141. _dict["save"] = 0
  1142. if item.get("status")>=status_from[0] and item.get("status")<=status_from[1]:
  1143. _dict["status"] = random.randint(status_to[0],status_to[1])
  1144. list_dict.append(_dict)
  1145. for _dict in list_dict:
  1146. dtmp = Document_tmp(_dict)
  1147. dtmp.update_row(self.ots_client)
    def flow_test(self,status_to=[1,10]):
        """Maintenance flow: re-queue document_tmp rows that carry at least one
        attachment (page_attachments.fileMd5 present) but whose attachment
        extraction has not succeeded, resetting each row's status to a random
        value inside *status_to* so the pipeline picks them up again.

        NOTE(review): mutable default status_to=[1,10] is never mutated here.
        """
        def producer():
            # rows with an attachment md5, excluding already-extracted rows and
            # rows whose status is already inside [1,11)
            bool_query = BoolQuery(must_queries=[
                # ExistsQuery("docid"),
                # RangeQuery("crtime",range_to='2022-04-10'),
                # RangeQuery("status",61),
                NestedQuery("page_attachments",WildcardQuery("page_attachments.fileMd5","*")),
            ],
            must_not_queries=[
                # NestedQuery("page_attachments",WildcardQuery("page_attachments.fileMd5","*")),
                TermQuery("attachment_extract_status",1),
                RangeQuery("status",1,11)
            ]
            )
            rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
                                                                                ColumnsToGet(["docid"],return_type=ColumnReturnType.SPECIFIED))
            log("flow_init producer total_count:%d"%total_count)
            list_dict = getRow_ots(rows)
            for _dict in list_dict:
                self.queue_init.put(_dict)
            _count = len(list_dict)
            # keep paging until exhausted, hard-capped at one million rows
            while next_token and _count<1000000:
                rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                                    SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                                    ColumnsToGet(["docid"],return_type=ColumnReturnType.SPECIFIED))
                list_dict = getRow_ots(rows)
                for _dict in list_dict:
                    self.queue_init.put(_dict)
                _count += len(list_dict)
                print("%d/%d"%(_count,total_count))
        def comsumer():
            # 30 worker threads drain queue_init through comsumer_handle
            mt = MultiThreadHandler(self.queue_init,comsumer_handle,None,30,1,ots_client=self.ots_client)
            mt.run()
        def comsumer_handle(item,result_queue,ots_client):
            # print(item)
            # reset the row's status to a random value inside status_to
            dtmp = Document_tmp(item)
            dtmp.setValue(document_tmp_status,random.randint(*status_to),True)
            dtmp.update_row(ots_client)
            # dhtml = Document_html(item)
            # dhtml.update_row(ots_client)
            # dtmp.delete_row(ots_client)
            # dhtml.delete_row(ots_client)
        producer()
        comsumer()
  1193. def flow_dumplicate(self,process_count=flow_process_count,status_from=flow_dumplicate_status_from):
  1194. def producer(columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_product,document_tmp_fingerprint,document_tmp_tenderee,document_tmp_agency,document_tmp_project_code,document_tmp_project_name,document_tmp_doctitle_refine,document_tmp_doctitle,document_tmp_sub_docs_json]):
  1195. bool_query = BoolQuery(must_queries=[RangeQuery(document_tmp_status,*status_from,True,True)])
  1196. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  1197. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
  1198. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  1199. log("flow_dumplicate producer total_count:%d"%total_count)
  1200. list_dict = getRow_ots(rows)
  1201. for _dict in list_dict:
  1202. self.queue_dumplicate.put(_dict)
  1203. _count = len(list_dict)
  1204. while next_token and _count<flow_process_count:
  1205. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  1206. SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  1207. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  1208. list_dict = getRow_ots(rows)
  1209. for _dict in list_dict:
  1210. self.queue_dumplicate.put(_dict)
  1211. _count += len(list_dict)
  1212. def comsumer():
  1213. mt = MultiThreadHandler(self.queue_dumplicate,comsumer_handle,None,10,1,ots_client=self.ots_client)
  1214. mt.run()
  1215. def comsumer_handle(item,result_queue,ots_client):
  1216. self.post_extract(item)
  1217. base_list = []
  1218. set_docid = set()
  1219. list_rules = self.translate_dumplicate_rules(flow_dumplicate_status_from,item)
  1220. list_rules.sort(key=lambda x:x["confidence"],reverse=True)
  1221. # print(item,"len_rules",len(list_rules))
  1222. for _rule in list_rules:
  1223. _query = _rule["query"]
  1224. confidence = _rule["confidence"]
  1225. singleNum_keys = _rule["singleNum_keys"]
  1226. contain_keys = _rule["contain_keys"]
  1227. multiNum_keys = _rule["multiNum_keys"]
  1228. self.add_data_by_query(item,base_list,set_docid,_query,confidence,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys)
  1229. item["confidence"] = 999
  1230. if item.get(document_tmp_docid) not in set_docid:
  1231. base_list.append(item)
  1232. final_list = self.dumplicate_fianl_check(base_list)
  1233. best_docid = self.get_best_docid(final_list)
  1234. # log(str(final_list))
  1235. _d = {"partitionkey":item["partitionkey"],
  1236. "docid":item["docid"],
  1237. "status":random.randint(*flow_dumplicate_status_to),
  1238. document_tmp_opertime:getCurrent_date(format="%Y-%m-%d %H:%M:%S")
  1239. }
  1240. dtmp = Document_tmp(_d)
  1241. dup_docid = set()
  1242. for _dict in final_list:
  1243. dup_docid.add(_dict.get(document_tmp_docid))
  1244. if item.get(document_tmp_docid) in dup_docid:
  1245. dup_docid.remove(item.get(document_tmp_docid))
  1246. if len(final_list)==0 or best_docid==item.get(document_tmp_docid):
  1247. dtmp.setValue(document_tmp_save,1,True)
  1248. dtmp.setValue(document_tmp_merge_uuid,self.merge_document(item,flow_dumplicate_status_to),True)
  1249. dmp_docid = ",".join([str(a) for a in list(dup_docid)])
  1250. else:
  1251. dtmp.setValue(document_tmp_save,0,True)
  1252. if best_docid in dup_docid:
  1253. dup_docid.remove(best_docid)
  1254. dmp_docid = ",".join([str(a) for a in list(dup_docid)])
  1255. dmp_docid = "%d,%s"%(best_docid,dmp_docid)
  1256. else:
  1257. dmp_docid = ",".join([str(a) for a in list(dup_docid)])
  1258. dtmp.setValue(document_tmp_dup_docid,dmp_docid,True)
  1259. dtmp.update_row(self.ots_client)
  1260. #只保留当前公告
  1261. # self.save_dumplicate(final_list,best_docid,status_from,status_to)
  1262. #
  1263. # print("=base=",item)
  1264. # if len(final_list)>=1:
  1265. # print("==================")
  1266. # for _dict in final_list:
  1267. # print(_dict)
  1268. # print("========>>>>>>>>>>")
  1269. producer()
  1270. comsumer()
def merge_document(self,item,status_to=None):
    """Try to attach *item* to an existing project row in ``project2``.

    Builds query combinations from the document's extracted fields (project
    code/name, tenderee, budgets) and, when exactly one candidate project
    matches, records its uuid as ``merge_uuid``.

    :param item: document dict; enriched in place via ``post_extract``
    :param status_to: accepted for signature compatibility with the flow
                      callers; not used inside this method
    :return: the matched project uuid, or "" when no unique match was found
    """
    self.post_extract(item)
    docchannel,project_code,project_name,tenderee,agency,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product = self.get_dump_columns(item)
    _d = {"partitionkey":item["partitionkey"],
          "docid":item["docid"],
          }
    dtmp = Document_tmp(_d)
    if item.get(document_tmp_save,1)==1:
        # kept documents: match on (code, tenderee) or (code, name)
        list_should_q = []
        if project_code!="" and tenderee!="":
            _q = BoolQuery(must_queries=[MatchQuery("project_code",project_code),
                                         TermQuery("tenderee",tenderee)])
            list_should_q.append(_q)
        if project_name!="" and project_code!="":
            _q = BoolQuery(must_queries=[MatchQuery("project_code",project_code),
                                         TermQuery("project_name",project_name)])
            list_should_q.append(_q)
        if len(list_should_q)>0:
            # merge=True is only passed on this branch
            list_data = self.search_data_by_query(item,list_should_q,100,merge=True,table_name="project2",table_index="project2_index_formerge",sort_column="tenderee",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=["tenderee","win_tenderer"])
            # only merge when the match is unambiguous (exactly one project)
            if len(list_data)==1:
                dtmp.setValue("merge_uuid",list_data[0]["uuid"],True)
                print(item["docid"],list_data[0]["uuid"])
    else:
        # discarded documents: match on budget/price combinations instead
        list_should_q = []
        if bidding_budget!="" and project_code!="":
            _q = BoolQuery(must_queries=[MatchQuery("project_code",project_code),
                                         TermQuery("bidding_budget",float(bidding_budget))])
            list_should_q.append(_q)
        if tenderee!="" and bidding_budget!="" and project_name!="":
            _q = BoolQuery(must_queries=[MatchQuery("tenderee",tenderee),
                                         TermQuery("bidding_budget",float(bidding_budget)),
                                         TermQuery("project_name",project_name)])
            list_should_q.append(_q)
        if tenderee!="" and win_bid_price!="" and project_name!="":
            _q = BoolQuery(must_queries=[MatchQuery("tenderee",tenderee),
                                         TermQuery("win_bid_price",float(win_bid_price)),
                                         TermQuery("project_name",project_name)])
            list_should_q.append(_q)
        if len(list_should_q)>0:
            list_data = self.search_data_by_query(item,list_should_q,100,table_name="project2",table_index="project2_index_formerge",sort_column="tenderee",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=["tenderee","win_tenderer"])
            if len(list_data)==1:
                dtmp.setValue("merge_uuid",list_data[0]["uuid"],True)
                print(item["docid"],list_data[0]["uuid"])
    return dtmp.getProperties().get("merge_uuid","")
    # dtmp.update_row(self.ots_client)
def test_merge(self):
    """Manual evaluation harness for :meth:`merge_document`.

    Pulls one day's worth of saved result/win-notice documents, runs the
    merge against ``project2`` for each, marks whether the merged project has
    a tender-invitation page_time, and dumps everything to
    ``test_merge.xlsx`` for human review.
    """
    import pandas as pd
    import queue
    def producer(columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_product,document_tmp_fingerprint,document_tmp_tenderee,document_tmp_agency,document_tmp_project_code,document_tmp_project_name,document_tmp_doctitle_refine,document_tmp_doctitle,document_tmp_sub_docs_json]):
        # channels 101/119/120 (result-type announcements), saved, one fixed day
        list_test_item = []
        should_q = BoolQuery(should_queries=[
            TermQuery("docchannel",101),
            TermQuery("docchannel",119),
            TermQuery("docchannel",120)
        ])
        bool_query = BoolQuery(must_queries=[
            TermQuery("page_time","2022-04-22"),
            should_q,
            TermQuery("save",1)
        ])
        rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                            SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
                                                                            ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
        log("flow_dumplicate producer total_count:%d"%total_count)
        list_dict = getRow_ots(rows)
        for _dict in list_dict:
            list_test_item.append(_dict)
        _count = len(list_dict)
        # unbounded pagination: collect the whole day's documents
        while next_token:
            rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                                SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                                ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
            list_dict = getRow_ots(rows)
            for _dict in list_dict:
                list_test_item.append(_dict)
            _count += len(list_dict)
            print("%d/%d"%(_count,total_count))
        return list_test_item
    from BaseDataMaintenance.model.ots.project import Project
    def comsumer_handle(item,result_queue,ots_client):
        # run the merge; on success check whether the project already has a
        # tender-invitation page_time and mark the item for the report
        item["merge_uuid"] = self.merge_document(item)
        if item["merge_uuid"]!="":
            _dict = {"uuid":item["merge_uuid"]}
            _p = Project(_dict)
            _p.fix_columns(self.ots_client,["zhao_biao_page_time"],True)
            if _p.getProperties().get("zhao_biao_page_time","")!="":
                item["是否有招标"] = "是"
    list_test_item = producer()
    task_queue = queue.Queue()
    for item in list_test_item:
        task_queue.put(item)
    mt = MultiThreadHandler(task_queue,comsumer_handle,None,30,1,ots_client=self.ots_client)
    mt.run()
    # flatten the per-document dicts into columns and export for review
    keys = [document_tmp_docid,document_tmp_docchannel,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_doctitle_refine,"win_tenderer","bidding_budget","win_bid_price","merge_uuid","是否有招标"]
    df_data = {}
    for k in keys:
        df_data[k] = []
    for item in list_test_item:
        for k in keys:
            df_data[k].append(item.get(k,""))
    df = pd.DataFrame(df_data)
    df.to_excel("test_merge.xlsx",columns=keys)
def flow_merge(self,process_count=10000,status_from=[71,80],status_to=[81,90]):
    """Merge flow: scan documents in *status_from* and merge each into a
    project via :meth:`merge_document`.

    NOTE(review): the ``producer()``/``comsumer()`` calls at the bottom are
    commented out, so this method currently defines its closures and does
    nothing — confirm whether the flow is intentionally disabled.

    :param process_count: maximum number of documents to enqueue per run
    :param status_from: inclusive ``[low, high]`` status range to scan
    :param status_to: status range forwarded to merge_document
    """
    def producer(columns=[document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_product,document_tmp_fingerprint,document_tmp_tenderee,document_tmp_agency,document_tmp_project_code,document_tmp_project_name,document_tmp_doctitle_refine,document_tmp_doctitle,document_tmp_sub_docs_json]):
        bool_query = BoolQuery(must_queries=[RangeQuery(document_tmp_status,*status_from,True,True)])
        rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                            SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
                                                                            ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
        log("flow_merge producer total_count:%d"%total_count)
        list_dict = getRow_ots(rows)
        for _dict in list_dict:
            self.queue_merge.put(_dict)
        _count = len(list_dict)
        # paginate until the per-run cap is reached
        while next_token and _count<process_count:
            rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                                SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                                ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
            list_dict = getRow_ots(rows)
            for _dict in list_dict:
                self.queue_merge.put(_dict)
            _count += len(list_dict)
    def comsumer():
        mt = MultiThreadHandler(self.queue_merge,comsumer_handle,None,10,1,ots_client=self.ots_client)
        mt.run()
    def comsumer_handle(item,result_queue,ots_client):
        self.merge_document(item,status_to)
    # producer()
    # comsumer()
    pass
def flow_syncho(self,status_from=[71,80],status_to=[81,90]):
    """Placeholder for the synchronisation flow; not implemented yet.

    :param status_from: inclusive status range to pick documents from (unused)
    :param status_to: inclusive status range to move documents to (unused)
    """
    pass
def flow_remove(self,process_count=flow_process_count,status_from=flow_remove_status_from):
    """Removal flow: delete finished documents (status in *status_from*) that
    were created more than 4 days ago, from both ``document_tmp`` and the
    corresponding ``document_html`` rows.

    NOTE(review): *process_count* is accepted but never used — the pagination
    loop runs until exhaustion.  Confirm whether the removal scan should be
    capped per run like the other flows.

    :param process_count: nominal per-run cap (currently unused)
    :param status_from: inclusive ``[low, high]`` status range to remove
    """
    def producer():
        # anything older than (today - 4 days) in the finished-status band
        current_date = getCurrent_date("%Y-%m-%d")
        tmp_date = timeAdd(current_date,-4)
        bool_query = BoolQuery(must_queries=[RangeQuery(document_tmp_status,*status_from,True,True),
                                             RangeQuery(document_tmp_crtime,range_to="%s 00:00:00"%(tmp_date))])
        rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                            SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
                                                                            ColumnsToGet(return_type=ColumnReturnType.NONE))
        log("flow_remove producer total_count:%d"%total_count)
        list_dict = getRow_ots(rows)
        for _dict in list_dict:
            self.queue_remove.put(_dict)
        _count = len(list_dict)
        while next_token:
            rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                                SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                                ColumnsToGet(return_type=ColumnReturnType.NONE))
            list_dict = getRow_ots(rows)
            for _dict in list_dict:
                self.queue_remove.put(_dict)
            _count += len(list_dict)
    def comsumer():
        mt = MultiThreadHandler(self.queue_remove,comsumer_handle,None,10,1,ots_client=self.ots_client)
        mt.run()
    def comsumer_handle(item,result_queue,ots_client):
        # delete the tmp row and its html companion
        dtmp = Document_tmp(item)
        dtmp.delete_row(self.ots_client)
        dhtml = Document_html(item)
        dhtml.delete_row(self.ots_client)
    producer()
    comsumer()
  1434. def start_flow_dumplicate(self):
  1435. schedule = BlockingScheduler()
  1436. schedule.add_job(self.flow_dumplicate,"cron",second="*/10")
  1437. schedule.start()
  1438. def start_flow_merge(self):
  1439. schedule = BlockingScheduler()
  1440. schedule.add_job(self.flow_merge,"cron",second="*/10")
  1441. schedule.start()
  1442. def start_flow_remove(self):
  1443. schedule = BlockingScheduler()
  1444. schedule.add_job(self.flow_remove,"cron",hour="20")
  1445. schedule.start()
def download_attachment():
    """One-off helper: download every attachment referenced by documents
    created in a fixed two-hour window to the local ``download`` directory,
    skipping oversized files.
    """
    ots_client = getConnect_ots()
    queue_attachment = Queue()
    auth = getAuth()
    # widen the OSS client pools for parallel downloads
    oss2.defaults.connection_pool_size = 100
    oss2.defaults.multiget_num_threads = 20
    attachment_bucket_name = "attachment-hub"
    # use the internal endpoint when running inside the VPC
    if is_internal:
        bucket_url = "http://oss-cn-hangzhou-internal.aliyuncs.com"
    else:
        bucket_url = "http://oss-cn-hangzhou.aliyuncs.com"
    bucket = oss2.Bucket(auth,bucket_url,attachment_bucket_name)
    current_path = os.path.dirname(__file__)
    def producer():
        # documents created inside the hard-coded window
        columns = [document_tmp_attachment_path]
        bool_query = BoolQuery(must_queries=[RangeQuery(document_tmp_crtime,"2022-03-29 15:00:00","2022-03-29 17:00:00",True,True)])
        rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
                                                                       SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status",SortOrder.DESC)]),limit=100,get_total_count=True),
                                                                       ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
        log("flow_attachment producer total_count:%d"%total_count)
        list_dict = getRow_ots(rows)
        for _dict in list_dict:
            queue_attachment.put(_dict)
        _count = len(list_dict)
        while next_token:
            rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
                                                                           SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                           ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
            list_dict = getRow_ots(rows)
            for _dict in list_dict:
                queue_attachment.put(_dict)
            _count += len(list_dict)
    def comsumer():
        mt = MultiThreadHandler(queue_attachment,comsumer_handle,None,10,1)
        mt.run()
    def getAttachments(list_filemd5,columns_to_get=[attachment_filemd5,attachment_path,attachment_size,attachment_attachmenthtml,attachment_filetype,attachment_docids,attachment_status,attachment_swfUrls]):
        # batch-fetch attachment rows by md5; capped at 50 keys per request
        list_attachment = []
        rows_to_get = []
        for _md5 in list_filemd5[:50]:
            if _md5 is None:
                continue
            primary_key = [(attachment_filemd5,_md5)]
            rows_to_get.append(primary_key)
        req = BatchGetRowRequest()
        req.add(TableInBatchGetRowItem(attachment_table_name,rows_to_get,columns_to_get,None,1))
        try:
            result = ots_client.batch_get_row(req)
            attach_result = result.get_result_by_table(attachment_table_name)
            for item in attach_result:
                if item.is_ok:
                    _dict = getRow_ots_primary(item.row)
                    if _dict is not None:
                        list_attachment.append(attachment(_dict))
        except Exception as e:
            log(str(list_filemd5))
            log("attachProcess comsumer error %s"%str(e))
        return list_attachment
    def comsumer_handle(item,result_queue):
        # download every attachment of one document, unless it is oversized
        page_attachments = json.loads(item.get(document_tmp_attachment_path,"[]"))
        if len(page_attachments)==0:
            pass
        else:
            list_fileMd5 = []
            for _atta in page_attachments:
                list_fileMd5.append(_atta.get(document_tmp_attachment_path_filemd5))
            list_attach = getAttachments(list_fileMd5)
            for attach in list_attach:
                filemd5 = attach.getProperties().get(attachment_filemd5)
                _status = attach.getProperties().get(attachment_status)
                _filetype = attach.getProperties().get(attachment_filetype)
                _size = attach.getProperties().get(attachment_size)
                _path = attach.getProperties().get(attachment_path)
                # NOTE(review): _uuid is computed but never used — confirm if removable
                _uuid = uuid4()
                objectPath = attach.getProperties().get(attachment_path)
                localpath = os.path.join(current_path,"download","%s.%s"%(filemd5,_filetype))
                try:
                    if _size>ATTACHMENT_LARGESIZE:
                        pass
                    else:
                        downloadFile(bucket,objectPath,localpath)
                except Exception as e:
                    traceback.print_exc()
    producer()
    comsumer()
  1530. def test_attachment_interface():
  1531. current_path = os.path.dirname(__file__)
  1532. task_queue = Queue()
  1533. def producer():
  1534. _count = 0
  1535. list_filename = os.listdir(os.path.join(current_path,"download"))
  1536. for _filename in list_filename:
  1537. _count += 1
  1538. _type = _filename.split(".")[1]
  1539. task_queue.put({"path":os.path.join(current_path,"download",_filename),"file_type":_type})
  1540. if _count>=500:
  1541. break
  1542. def comsumer():
  1543. mt = MultiThreadHandler(task_queue,comsumer_handle,None,10)
  1544. mt.run()
  1545. def comsumer_handle(item,result_queue):
  1546. _path = item.get("path")
  1547. _type = item.get("file_type")
  1548. _data_base64 = base64.b64encode(open(_path,"rb").read())
  1549. #调用接口处理结果
  1550. start_time = time.time()
  1551. _success,_html,swf_images = getAttachDealInterface(_data_base64,_type)
  1552. log("%s result:%s takes:%d"%(_path,str(_success),time.time()-start_time))
  1553. producer()
  1554. comsumer()
class Dataflow_attachment(Dataflow):
    """Dataflow specialisation for the attachment stage: pulls documents that
    carry attachments, fetches the attachment rows, routes them to the OCR or
    non-OCR queue by file type, runs recognition through the external
    interface and writes the results back to ``document_tmp``/``document_html``.
    """
    def __init__(self):
        Dataflow.__init__(self)
    def flow_attachment_process(self):
        # entry point used by the scheduler: drain both attachment queues
        self.process_comsumer()
    def process_comsumer(self):
        """Spawn 60 worker threads that each loop over both attachment queues."""
        list_thread = []
        thread_count = 60
        for i in range(thread_count):
            list_thread.append(Thread(target=self.process_comsumer_handle))
        for t in list_thread:
            t.start()
        for t in list_thread:
            t.join()
    def process_comsumer_handle(self):
        """Worker loop: take one item from the OCR queue and one from the
        non-OCR queue per iteration; back off 2s when nothing was available.
        """
        while 1:
            _flag = False
            try:
                item = self.queue_attachment_ocr.get(True,timeout=0.2)
                self.attachment_recognize(item,None)
            except Exception as e:
                # queue empty (or recognition raised): remember it for back-off
                _flag = True
                pass
            try:
                item = self.queue_attachment_not_ocr.get(True,timeout=0.2)
                self.attachment_recognize(item,None)
            except Exception as e:
                # NOTE(review): `True and _flag` just keeps the previous value,
                # so a success on this second queue does NOT clear the back-off
                # flag set by the first — confirm whether the sleep is intended
                # whenever the OCR queue alone was empty
                _flag = True and _flag
                pass
            if _flag:
                time.sleep(2)
    def attachment_recognize(self,_dict,result_queue):
        """Run attachment recognition for one document and persist the result.

        :param _dict: {"item": document dict, "list_attach": attachment objects}
        :param result_queue: unused; kept for handler-signature compatibility
        """
        item = _dict.get("item")
        list_attach = _dict.get("list_attach")
        dhtml = Document_html({"partitionkey":item.get("partitionkey"),
                               "docid":item.get("docid")})
        dhtml.fix_columns(self.ots_client,["dochtmlcon"],True)
        _dochtmlcon = dhtml.getProperties().get("dochtmlcon","")
        _succeed,list_html,swf_urls = self.rec_attachments_by_interface(list_attach,_dochtmlcon,save=True)
        log(str(swf_urls))
        if not _succeed:
            # recognition failed: move the document to the failed status band
            item[document_tmp_status] = random.randint(*flow_attachment_status_failed_to)
        else:
            # merge the recognized html/swf images back into the document html
            dhtml.updateSWFImages(swf_urls)
            dhtml.updateAttachment(list_html)
            dhtml.update_row(self.ots_client)
            item[document_tmp_status] = random.randint(*flow_attachment_status_succeed_to)
            item[document_tmp_attachment_extract_status] = 1
        log("document:%d get attachments with result:%s"%(item.get("docid"),str(_succeed)))
        dtmp = Document_tmp(item)
        dtmp.update_row(self.ots_client)
    def flow_attachment(self):
        # produce new work, then consume the routing queue
        self.flow_attachment_producer()
        self.flow_attachment_producer_comsumer()
    def getAttachments(self,list_filemd5,columns_to_get=[attachment_filemd5,attachment_path,attachment_size,attachment_attachmenthtml,attachment_filetype,attachment_docids,attachment_status,attachment_swfUrls]):
        """Batch-fetch attachment rows by md5 (capped at 50 keys per request).

        :return: list of ``attachment`` model objects for the rows found
        """
        list_attachment = []
        rows_to_get = []
        for _md5 in list_filemd5[:50]:
            if _md5 is None:
                continue
            primary_key = [(attachment_filemd5,_md5)]
            rows_to_get.append(primary_key)
        req = BatchGetRowRequest()
        req.add(TableInBatchGetRowItem(attachment_table_name,rows_to_get,columns_to_get,None,1))
        try:
            result = self.ots_client.batch_get_row(req)
            attach_result = result.get_result_by_table(attachment_table_name)
            for item in attach_result:
                if item.is_ok:
                    _dict = getRow_ots_primary(item.row)
                    if _dict is not None:
                        list_attachment.append(attachment(_dict))
        except Exception as e:
            log(str(list_filemd5))
            log("attachProcess comsumer error %s"%str(e))
        return list_attachment
    def flow_attachment_producer(self,columns=[document_tmp_attachment_path,document_tmp_crtime]):
        """Refill ``queue_attachment`` with documents awaiting attachment
        processing, skipping docids already queued for OCR / non-OCR work.
        """
        qsize_ocr = self.queue_attachment_ocr.qsize()
        qsize_not_ocr = self.queue_attachment_not_ocr.qsize()
        log("queue_attachment_ocr:%d,queue_attachment_not_ocr:%d"%(qsize_ocr,qsize_not_ocr))
        # decide whether to enqueue more data: skip when queues are backed up
        if min(qsize_ocr,qsize_not_ocr)>200 or max(qsize_ocr,qsize_not_ocr)>1000:
            return
        # de-duplicate: remember docids still pending in either queue
        set_docid = set()
        set_docid = set_docid | set(self.list_attachment_ocr) | set(self.list_attachment_not_ocr)
        # trim the tracking lists to the items still sitting in the queues
        if qsize_ocr>0:
            self.list_attachment_ocr = self.list_attachment_ocr[-qsize_ocr:]
        else:
            self.list_attachment_ocr = []
        if qsize_not_ocr>0:
            self.list_attachment_not_ocr = self.list_attachment_not_ocr[-qsize_not_ocr:]
        else:
            self.list_attachment_not_ocr = []
        try:
            bool_query = BoolQuery(must_queries=[
                RangeQuery(document_tmp_status,*flow_attachment_status_from,True,True),
                # TermQuery(document_tmp_docid,234925191),
            ])
            rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status",SortOrder.DESC)]),limit=100,get_total_count=True),
                                                                                ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
            log("flow_attachment producer total_count:%d"%total_count)
            list_dict = getRow_ots(rows)
            _count = 0
            for _dict in list_dict:
                docid = _dict.get(document_tmp_docid)
                if docid in set_docid:
                    continue
                self.queue_attachment.put(_dict,True)
                _count += 1
            while next_token and _count<flow_process_count:
                rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                                    SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                                    ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
                list_dict = getRow_ots(rows)
                for _dict in list_dict:
                    docid = _dict.get(document_tmp_docid)
                    if docid in set_docid:
                        continue
                    self.queue_attachment.put(_dict,True)
                    _count += 1
            log("add attachment count:%d"%(_count))
        except Exception as e:
            log("flow attachment producer error:%s"%(str(e)))
            traceback.print_exc()
    def flow_attachment_producer_comsumer(self):
        # route queued documents into the OCR / non-OCR queues
        log("start flow_attachment comsumer")
        mt = MultiThreadHandler(self.queue_attachment,self.comsumer_handle,None,10,1)
        mt.run()
    def set_queue(self,_dict):
        """Route one document to the OCR queue when any of its attachments is
        an image/swf/pdf/tif, otherwise to the non-OCR queue.
        """
        list_attach = _dict.get("list_attach")
        to_ocr = False
        for attach in list_attach:
            if attach.getProperties().get(attachment_filetype) in ["bmp","jpeg","jpg","png","swf","pdf","tif"]:
                to_ocr = True
                break
        if to_ocr:
            self.queue_attachment_ocr.put(_dict,True)
            # self.list_attachment_ocr.append(_dict.get("item").get(document_tmp_docid))
        else:
            self.queue_attachment_not_ocr.put(_dict,True)
            # self.list_attachment_not_ocr.append(_dict.get("item").get(document_tmp_docid))
    def comsumer_handle(self,item,result_queue):
        """Resolve one document's attachments and hand it to set_queue; a
        document with no attachments goes straight to the succeeded band.
        """
        try:
            page_attachments = json.loads(item.get(document_tmp_attachment_path,"[]"))
            if len(page_attachments)==0:
                item[document_tmp_status] = random.randint(*flow_attachment_status_succeed_to)
                dtmp = Document_tmp(item)
                dtmp.update_row(self.ots_client)
            else:
                list_fileMd5 = []
                for _atta in page_attachments:
                    list_fileMd5.append(_atta.get(document_tmp_attachment_path_filemd5))
                list_attach = self.getAttachments(list_fileMd5)
                # attachments not fully uploaded yet are skipped for 2 hours:
                # reset status to 1 so the document gets picked up again later
                if len(page_attachments)!=len(list_attach) and time.mktime(time.localtime())-time.mktime(time.strptime(item.get(document_tmp_crtime),"%Y-%m-%d %H:%M:%S"))<7200:
                    item[document_tmp_status] = 1
                    dtmp = Document_tmp(item)
                    dtmp.update_row(self.ots_client)
                    return
                self.set_queue({"item":item,"list_attach":list_attach})
        except Exception as e:
            traceback.print_exc()
    def start_flow_attachment(self):
        """Schedule the attachment producer (every 10s) and processor (every 20s)."""
        schedule = BlockingScheduler()
        schedule.add_job(self.flow_attachment_process,"cron",second="*/20")
        schedule.add_job(self.flow_attachment,"cron",second="*/10")
        schedule.start()
class Dataflow_extract(Dataflow):
    """Dataflow specialisation for the extraction stage: pulls documents in
    the extract-status band, posts their html to the other/extract/industry
    HTTP interfaces and stores the returned json in ``document_extract2``.
    """
    def __init__(self):
        Dataflow.__init__(self)
    def flow_extract_producer(self,columns=[document_tmp_page_time,document_tmp_doctitle,document_tmp_docchannel,document_tmp_status,document_tmp_original_docchannel,document_tmp_web_source_no]):
        """Refill ``queue_extract``, skipping docids that are still queued."""
        q_size = self.queue_extract.qsize()
        # back off while the consumer still has plenty of work
        if q_size>100:
            return
        set_docid = set(self.list_extract)
        # trim the tracking list to the items still sitting in the queue
        if q_size>0:
            self.list_extract = self.list_extract[-q_size:]
        else:
            self.list_extract = []
        try:
            bool_query = BoolQuery(must_queries=[RangeQuery(document_tmp_status,*flow_extract_status_from,True,True)])
            rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("status",SortOrder.ASC)]),limit=100,get_total_count=True),
                                                                                ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
            log("flow_extract producer total_count:%d"%total_count)
            list_dict = getRow_ots(rows)
            for _dict in list_dict:
                docid = _dict.get(document_tmp_docid)
                if docid in set_docid:
                    # already queued: keep tracking it at the front
                    self.list_extract.insert(0,docid)
                    continue
                else:
                    self.queue_extract.put(_dict)
                    self.list_extract.append(docid)
            _count = len(list_dict)
            while next_token and _count<flow_process_count:
                rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
                                                                                    SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                                    ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
                list_dict = getRow_ots(rows)
                for _dict in list_dict:
                    docid = _dict.get(document_tmp_docid)
                    if docid in set_docid:
                        self.list_extract.insert(0,docid)
                        continue
                    else:
                        self.queue_extract.put(_dict)
                        self.list_extract.append(docid)
                _count += len(list_dict)
        except Exception as e:
            log("flow extract producer error:%s"%(str(e)))
            traceback.print_exc()
    def flow_extract(self,):
        self.comsumer()
    def comsumer(self):
        # drain the extract queue with 35 worker threads
        mt = MultiThreadHandler(self.queue_extract,self.comsumer_handle,None,35,1,True)
        mt.run()
    def comsumer_handle(self,item,result_queue):
        """Run the three extraction interfaces for one document.

        ``all_done`` is 1 on success, or -1/-2/-3 identifying which interface
        failed.  On success the document moves to the succeeded status band
        and the extract row is written; on failure a DingDing alert is sent.

        NOTE(review): ``if all_done:`` is truthy for the negative error codes
        too, so the extract/industry interfaces are still invoked after an
        earlier failure (and may overwrite the error code) — confirm whether
        ``all_done==1`` was intended.
        """
        dhtml = Document_html({"partitionkey":item.get("partitionkey"),
                               "docid":item.get("docid")})
        dhtml.fix_columns(self.ots_client,["dochtmlcon"],True)
        item[document_tmp_dochtmlcon] = dhtml.getProperties().get(document_tmp_dochtmlcon,"")
        _extract = Document_extract({})
        _extract.setValue(document_extract2_partitionkey,item.get(document_partitionkey))
        _extract.setValue(document_extract2_docid,item.get(document_docid))
        all_done = 1
        if all_done:
            data = item
            resp = requests.post(self.other_url,json=data,headers=self.header)
            if (resp.status_code >=200 and resp.status_code<=210):
                _extract.setValue(document_extract2_other_json,resp.content.decode("utf8"),True)
            else:
                all_done = -1
        # build the payload for the extract/industry interfaces
        data = {}
        for k,v in item.items():
            data[k] = v
        data["timeout"] = 240
        data["doc_id"] = data.get(document_tmp_docid)
        # the html body is sent as "content", not under its column name
        data["content"] = data.get(document_tmp_dochtmlcon,"")
        if document_tmp_dochtmlcon in data:
            data.pop(document_tmp_dochtmlcon)
        data["title"] = data.get(document_tmp_doctitle,"")
        data["web_source_no"] = item.get(document_tmp_web_source_no,"")
        data["original_docchannel"] = item.get(document_tmp_original_docchannel,"")
        if all_done:
            resp = requests.post(self.extract_url,json=data,headers=self.header)
            if (resp.status_code >=200 and resp.status_code<=210):
                _extract.setValue(document_extract2_extract_json,resp.content.decode("utf8"),True)
            else:
                all_done = -2
        if all_done:
            resp = requests.post(self.industy_url,json=data,headers=self.header)
            if (resp.status_code >=200 and resp.status_code<=210):
                _extract.setValue(document_extract2_industry_json,resp.content.decode("utf8"),True)
            else:
                all_done = -3
        _dict = {document_partitionkey:item.get(document_tmp_partitionkey),
                 document_docid:item.get(document_tmp_docid),
                 }
        dtmp = Document_tmp(_dict)
        if all_done!=1:
            sentMsgToDD("要素提取失败:docid:%d with result:%d"%(item.get(document_tmp_docid),all_done))
            dtmp.setValue(document_tmp_status,random.randint(*flow_extract_status_failed_to),True)
            dtmp.update_row(self.ots_client)
        else:
            dtmp.setValue(document_tmp_status,random.randint(*flow_extract_status_succeed_to),True)
            dtmp.update_row(self.ots_client)
            # write into the interface table; enable on release
            _extract.setValue(document_extract2_status,random.randint(1,50),True)
            _extract.update_row(self.ots_client)
        log("process docid:%d %s"%(data["doc_id"],str(all_done)))
    def start_flow_extract(self):
        """Schedule the extract producer and consumer every 10 seconds."""
        schedule = BlockingScheduler()
        schedule.add_job(self.flow_extract_producer,"cron",second="*/10")
        schedule.add_job(self.flow_extract,"cron",second="*/10")
        schedule.start()
  1833. class Dataflow_dumplicate(Dataflow):
  1834. class DeleteListener():
  1835. def __init__(self,conn,_func,*args,**kwargs):
  1836. self.conn = conn
  1837. self._func = _func
  1838. def on_error(self, headers):
  1839. log('received an error %s' % str(headers.body))
  1840. def on_message(self, headers):
  1841. try:
  1842. message_id = headers.headers["message-id"]
  1843. body = headers.body
  1844. log("get message %s"%(message_id))
  1845. self._func(_dict={"frame":headers,"conn":self.conn},result_queue=None)
  1846. except Exception as e:
  1847. traceback.print_exc()
  1848. pass
  1849. def __del__(self):
  1850. self.conn.disconnect()
  1851. def __init__(self):
  1852. Dataflow.__init__(self)
  1853. self.c_f_get_extractCount = f_get_extractCount()
  1854. self.c_f_get_package = f_get_package()
  1855. logging.basicConfig(level = logging.info,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  1856. self.delete_comsumer_counts = 2
  1857. self.doc_delete_queue = "/queue/doc_delete_queue"
  1858. self.doc_delete_result = "/queue/doc_delete_result"
  1859. self.pool_mq_ali = ConnectorPool(1,10,getConnect_activateMQ_ali)
  1860. for _ in range(self.delete_comsumer_counts):
  1861. conn = getConnect_activateMQ_ali()
  1862. listener = self.DeleteListener(conn,self.delete_doc_handle)
  1863. createComsumer(listener,self.doc_delete_queue)
  1864. def get_dict_time(self,_extract,keys=["time_bidclose","time_bidopen","time_bidstart","time_commencement","time_completion","time_earnestMoneyEnd","time_earnestMoneyStart","time_getFileEnd","time_getFileStart","time_publicityEnd","time_publicityStart","time_registrationEnd","time_registrationStart","time_release"]):
  1865. dict_time = {}
  1866. for k in keys:
  1867. dict_time[k] = _extract.get(k)
  1868. return dict_time
  1869. def post_extract(self,_dict):
  1870. win_tenderer,bidding_budget,win_bid_price,_ = self.f_decode_sub_docs_json(_dict.get(document_tmp_project_code),_dict.get(document_tmp_project_name),_dict.get(document_tmp_tenderee),_dict.get(document_tmp_agency),_dict.get(document_tmp_sub_docs_json))
  1871. _dict["win_tenderer"] = win_tenderer
  1872. _dict["bidding_budget"] = bidding_budget
  1873. _dict["win_bid_price"] = win_bid_price
  1874. extract_json = _dict.get(document_tmp_extract_json,"{}")
  1875. _extract = json.loads(extract_json)
  1876. _dict["product"] = ",".join(_extract.get("product",[]))
  1877. _dict["fingerprint"] = _extract.get("fingerprint","")
  1878. _dict["project_codes"] = _extract.get("code",[])
  1879. if len(_dict["project_codes"])>0:
  1880. _dict["project_code"] = _dict["project_codes"][0]
  1881. else:
  1882. _dict["project_code"] = ""
  1883. _dict["doctitle_refine"] = _extract.get("doctitle_refine","")
  1884. _dict["nlp_enterprise"] = str({"indoctextcon":_extract.get("nlp_enterprise",[]),
  1885. "notindoctextcon":_extract.get("nlp_enterprise_attachment",[])})
  1886. _dict["extract_count"] = self.c_f_get_extractCount.evaluate(extract_json)
  1887. _dict["package"] = self.c_f_get_package.evaluate(extract_json)
  1888. _dict["project_name"] = _extract.get("name","")
  1889. _dict["dict_time"] = self.get_dict_time(_extract)
def dumplicate_fianl_check(self,base_list):
    """Final pass over a dedup candidate group.

    Sorts candidates by confidence (descending) and walks them in order,
    keeping a growing prefix as long as each new candidate still looks like a
    duplicate of the earlier ones; returns that prefix (incl. the base doc),
    or [] when no candidate beyond the base survives.
    """
    the_group = base_list
    the_group.sort(key=lambda x:x["confidence"],reverse=True)
    _index = 0
    base_fingerprint = "None"
    if len(base_list)>0:
        base_fingerprint = base_list[0]["fingerprint"]
    for _i in range(1,len(base_list)):
        _dict1 = base_list[_i]
        fingerprint_less = _dict1["fingerprint"]
        _pass = True
        # identical fingerprint to the base doc is accepted without re-checking
        if fingerprint_less==base_fingerprint:
            _index = _i
            continue
        # re-check against at most the first 5 already-accepted candidates
        for _j in range(min(_i,5)):
            _dict2 = base_list[_j]
            _prob = self.dumplicate_check(_dict1,_dict2,_dict2.get("min_counts",10),b_log=False)
            # print("_prob:",_prob)
            if _prob<=0.1:
                _pass = False
                break
        log("checking index:%d"%(_i))
        _index = _i
        if not _pass:
            # current candidate failed: keep only the prefix before it and stop
            _index -= 1
            break
    if _index>=1:
        return the_group[:_index+1]
    return []
def dumplicate_check(self,_dict1,_dict2,min_counts,b_log=False):
    """Score how likely two documents are duplicates.

    Returns 1 for identical fingerprints, otherwise a base probability scaled
    by how many of 8 key fields match, then gated by a cascade of pairwise
    checks (title, codes, product, entity, money, package, time). A failed
    gate usually forces 0.

    :param _dict1: the "less" document (must carry the fields set by post_extract)
    :param _dict2: the "greater" document
    :param min_counts: size of the candidate pool; smaller pools get a higher base probability
    :param b_log: when True, log the reason of each failed check
    :return: float in [0,1]
    """
    document_less = _dict1
    docid_less = _dict1["docid"]
    docchannel_less = document_less["docchannel"]
    page_time_less = document_less["page_time"]
    doctitle_refine_less = document_less["doctitle_refine"]
    project_codes_less = document_less["project_codes"]
    nlp_enterprise_less = document_less["nlp_enterprise"]
    tenderee_less = document_less["tenderee"]
    agency_less = document_less["agency"]
    win_tenderer_less = document_less["win_tenderer"]
    bidding_budget_less = document_less["bidding_budget"]
    win_bid_price_less = document_less["win_bid_price"]
    product_less = document_less["product"]
    package_less = document_less["package"]
    json_time_less = document_less["dict_time"]
    project_name_less = document_less["project_name"]
    fingerprint_less = document_less["fingerprint"]
    extract_count_less = document_less["extract_count"]
    document_greater = _dict2
    docid_greater = _dict2["docid"]
    page_time_greater = document_greater["page_time"]
    doctitle_refine_greater = document_greater["doctitle_refine"]
    project_codes_greater = document_greater["project_codes"]
    nlp_enterprise_greater = document_greater["nlp_enterprise"]
    tenderee_greater = document_greater["tenderee"]
    agency_greater = document_greater["agency"]
    win_tenderer_greater = document_greater["win_tenderer"]
    bidding_budget_greater = document_greater["bidding_budget"]
    win_bid_price_greater = document_greater["win_bid_price"]
    product_greater = document_greater["product"]
    package_greater = document_greater["package"]
    json_time_greater = document_greater["dict_time"]
    project_name_greater = document_greater["project_name"]
    fingerprint_greater = document_greater["fingerprint"]
    extract_count_greater = document_greater["extract_count"]
    # identical content fingerprint: certain duplicate
    if fingerprint_less==fingerprint_greater:
        return 1
    # count how many of the 8 key fields agree (non-empty on the "less" side)
    same_count = 0
    all_count = 8
    if len(set(project_codes_less) & set(project_codes_greater))>0:
        same_count += 1
    if getLength(tenderee_less)>0 and tenderee_less==tenderee_greater:
        same_count += 1
    if getLength(agency_less)>0 and agency_less==agency_greater:
        same_count += 1
    if getLength(win_tenderer_less)>0 and win_tenderer_less==win_tenderer_greater:
        same_count += 1
    if getLength(bidding_budget_less)>0 and bidding_budget_less==bidding_budget_greater:
        same_count += 1
    if getLength(win_bid_price_less)>0 and win_bid_price_less==win_bid_price_greater:
        same_count += 1
    if getLength(project_name_less)>0 and project_name_less==project_name_greater:
        same_count += 1
    if getLength(doctitle_refine_less)>0 and doctitle_refine_less==doctitle_refine_greater:
        same_count += 1
    # the smaller the candidate pool, the more confident the base probability
    base_prob = 0
    if min_counts<3:
        base_prob = 0.9
    elif min_counts<5:
        base_prob = 0.8
    elif min_counts<8:
        base_prob = 0.7
    else:
        base_prob = 0.6
    _prob = base_prob*same_count/all_count
    # sparse extractions (<=3 fields) cannot match much; give them a floor
    if _prob<0.1 and min(extract_count_less,extract_count_greater)<=3:
        _prob = 0.15
    if _prob<0.1:
        return _prob
    # gate cascade: each check records 0=fail / 1=neutral / 2=strong-match
    check_result = {"pass":1}
    # NOTE(review): these docchannel ids are site-specific channel codes — their
    # exact semantics are not visible in this file
    if docchannel_less in (51,102,103,104,115,116,117):
        if doctitle_refine_less!=doctitle_refine_greater:
            if page_time_less!=page_time_greater:
                check_result["docchannel"] = 0
                check_result["pass"] = 0
        else:
            check_result["docchannel"] = 2
    if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,project_codes_less,project_codes_greater):
        check_result["doctitle"] = 0
        check_result["pass"] = 0
        if b_log:
            logging.info("%d-%d,check_doctitle_failed:%s==%s"%(docid_less,docid_greater,str(doctitle_refine_less),str(doctitle_refine_greater)))
    else:
        check_result["doctitle"] = 2
    #added check
    if not check_codes(project_codes_less,project_codes_greater):
        check_result["code"] = 0
        check_result["pass"] = 0
        if b_log:
            logging.info("%d-%d,check_code_failed:%s==%s"%(docid_less,docid_greater,str(project_codes_less),str(project_codes_greater)))
    else:
        # strong only when both sides have codes and they intersect
        if getLength(project_codes_less)>0 and getLength(project_codes_greater)>0 and len(set(project_codes_less) & set(project_codes_greater))>0:
            check_result["code"] = 2
        else:
            check_result["code"] = 1
    if not check_product(product_less,product_greater):
        check_result["product"] = 0
        check_result["pass"] = 0
        if b_log:
            logging.info("%d-%d,check_product_failed:%s==%s"%(docid_less,docid_greater,str(product_less),str(product_greater)))
    else:
        if getLength(product_less)>0 and getLength(product_greater)>0:
            check_result["product"] = 2
        else:
            check_result["product"] = 1
    # NOTE(review): check_demand takes no arguments here — presumably a stub; confirm
    if not check_demand():
        check_result["pass"] = 0
    if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
                        tenderee_less,tenderee_greater,
                        agency_less,agency_greater,
                        win_tenderer_less,win_tenderer_greater):
        check_result["entity"] = 0
        check_result["pass"] = 0
        if b_log:
            logging.info("%d-%d,check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(docid_less,docid_greater,str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
    else:
        # which entity matters depends on the channel group (tender vs result)
        if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
            check_result["entity"] = 2
        elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
            check_result["entity"] = 2
        else:
            check_result["entity"] = 1
    if not check_money(bidding_budget_less,bidding_budget_greater,
                       win_bid_price_less,win_bid_price_greater):
        if b_log:
            logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
        check_result["money"] = 0
        check_result["pass"] = 0
    else:
        if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
            check_result["money"] = 2
        elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
            check_result["money"] = 2
        else:
            check_result["money"] = 1
    #added check
    if not check_package(package_less,package_greater):
        if b_log:
            logging.info("%d-%d,check_package_failed:%s==%s"%(docid_less,docid_greater,str(package_less),str(package_greater)))
        check_result["package"] = 0
        check_result["pass"] = 0
    else:
        if getLength(package_less)>0 and getLength(package_greater)>0:
            check_result["package"] = 2
        else:
            check_result["package"] = 1
    #added check
    if not check_time(json_time_less,json_time_greater):
        if b_log:
            logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
        # the time dicts may arrive as dicts or as JSON strings
        if isinstance(json_time_less,dict):
            time_less = json_time_less
        else:
            time_less = json.loads(json_time_less)
        if isinstance(json_time_greater,dict):
            time_greater = json_time_greater
        else:
            time_greater = json.loads(json_time_greater)
        # fail only when both sides carry a value for the same key and it differs
        for k,v in time_less.items():
            if getLength(v)>0:
                v1 = time_greater.get(k,"")
                if getLength(v1)>0:
                    if v!=v1:
                        log("%d-%d,key:%s"%(docid_less,docid_greater,str(k)))
                        check_result["time"] = 0
                        check_result["pass"] = 0
    else:
        if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
            check_result["time"] = 2
        else:
            check_result["time"] = 1
    if check_result.get("pass",0)==0:
        if b_log:
            logging.info(str(check_result))
        # a money mismatch is an unconditional veto
        if check_result.get("money",1)==0:
            return 0
        # only a full set of strong matches can override a failed gate
        if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2 and check_result.get("money",0)==2:
            return _prob
        else:
            return 0
        # NOTE(review): unreachable — both branches above return before this line
        if check_result.get("time",1)==0:
            return 0
    return _prob
def search_data_by_query(self,item,_query,confidence,retry_times=3,merge=False,table_name="document_tmp",table_index="document_tmp_index",sort_column="docid",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count]):
    """Run one dedup candidate search against OTS and score the hits.

    Each returned row is enriched via post_extract; unless *merge* is set,
    rows other than *item* itself get a fresh ``confidence`` from
    dumplicate_check plus ``min_counts`` (the total hit count).
    Retries up to *retry_times* on any exception; returns [] when all fail.

    NOTE(review): singleNum_keys/contain_keys/multiNum_keys/notlike_keys are
    accepted but unused in this method — presumably kept for signature
    compatibility with callers; confirm.
    """
    for _ in range(retry_times):
        try:
            _time = time.time()
            check_time = 0
            # a list of queries is treated as OR-ed alternatives
            if isinstance(_query,list):
                bool_query = BoolQuery(should_queries=_query)
            else:
                bool_query = _query
            rows,next_token,total_count,is_all_succeed = self.ots_client.search(table_name,table_index,
                                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(sort_column)]),limit=30,get_total_count=True),
                                                                                ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
            list_dict = getRow_ots(rows)
            list_data = []
            for _dict in list_dict:
                self.post_extract(_dict)
                _docid = _dict.get(document_tmp_docid)
                if merge:
                    list_data.append(_dict)
                else:
                    # skip the document itself; score all other candidates
                    if _docid!=item.get(document_tmp_docid):
                        _time1 = time.time()
                        confidence = self.dumplicate_check(item,_dict,total_count,b_log=False)
                        check_time+= time.time()-_time1
                        _dict["confidence"] = confidence
                        _dict["min_counts"] = total_count
                        list_data.append(_dict)
            all_time = time.time()-_time
            # log("check:%d rows takes%.4f,check%.4f"%(len(list_dict),all_time-check_time,check_time))
            return list_data
        except Exception as e:
            traceback.print_exc()
    return []
def add_data_by_query(self,item,base_list,set_docid,_query,confidence,table_name,table_index,singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_save,document_tmp_status,document_tmp_product,document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count]):
    """Search duplicate candidates and accumulate them into *base_list*.

    Candidates with confidence > 0.1 that were not seen before are appended;
    every candidate's docid ends up in *set_docid* so it is not re-visited.
    """
    list_dict = self.search_data_by_query(item,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,notlike_keys=notlike_keys,columns=columns)
    for _dict in list_dict:
        _docid = _dict.get(document_tmp_docid)
        confidence = _dict["confidence"]
        if confidence>0.1:
            if _docid not in set_docid:
                base_list.append(_dict)
                set_docid.add(_docid)
        # every candidate is marked as seen, even low-confidence ones
        # (the add above makes this redundant for the accepted branch)
        set_docid.add(_docid)
  2146. def appendRule(self,list_rules,_dict,base_dict,must_not_dict,confidence,item,to_log=True):
  2147. for k,v in _dict.items():
  2148. if getLength(v)==0:
  2149. return
  2150. _dict.update(base_dict)
  2151. if to_log:
  2152. log(str(_dict))
  2153. _query = self.generate_dumplicate_query(_dict,must_not_dict)
  2154. _rule = {"confidence":confidence,
  2155. "item":item,
  2156. "query":_query,
  2157. "singleNum_keys":[],
  2158. "contain_keys":[],
  2159. "multiNum_keys":[]}
  2160. list_rules.append(_rule)
  2161. def translate_dumplicate_rules(self,status_from,item,get_all=False,to_log=False):
  2162. docchannel,project_code,project_name,tenderee,agency,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product = self.get_dump_columns(item)
  2163. current_date = getCurrent_date("%Y-%m-%d")
  2164. if page_time=='':
  2165. page_time = current_date
  2166. if page_time>=timeAdd(current_date,-2):
  2167. table_name = "document_tmp"
  2168. table_index = "document_tmp_index"
  2169. base_dict = {
  2170. "docchannel":item["docchannel"],
  2171. "status":[status_from[0]],
  2172. "page_time":[timeAdd(page_time,-2),timeAdd(page_time,2)]
  2173. }
  2174. must_not_dict = {"save":0,"docid":item.get("docid")}
  2175. doctitle_refine_name = "doctitle_refine"
  2176. else:
  2177. table_name = "document"
  2178. table_index = "document_index"
  2179. if get_all:
  2180. _status = [201,450]
  2181. else:
  2182. _status = [201,300]
  2183. base_dict = {
  2184. "docchannel":item["docchannel"],
  2185. "status":_status,
  2186. "page_time":[timeAdd(page_time,-2),timeAdd(page_time,2)]
  2187. }
  2188. must_not_dict = {"docid":item.get("docid")}
  2189. doctitle_refine_name = "doctitle"
  2190. list_rules = []
  2191. singleNum_keys = ["tenderee","win_tenderer"]
  2192. confidence = 100
  2193. self.appendRule(list_rules,{document_tmp_fingerprint:fingerprint},base_dict,must_not_dict,confidence,item)
  2194. confidence = 90
  2195. _dict = {document_tmp_agency:agency,
  2196. "win_tenderer":win_tenderer,
  2197. "win_bid_price":win_bid_price}
  2198. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2199. _dict = {document_tmp_agency:agency,
  2200. "win_tenderer":win_tenderer,
  2201. "bidding_budget":bidding_budget}
  2202. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2203. _dict = {document_tmp_agency:agency,
  2204. "win_bid_price":win_bid_price,
  2205. "bidding_budget":bidding_budget}
  2206. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2207. _dict = {win_tenderer:win_tenderer,
  2208. "win_bid_price":win_bid_price,
  2209. "bidding_budget":bidding_budget}
  2210. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2211. _dict = {"tenderee":tenderee,
  2212. "win_tenderer":win_tenderer,
  2213. "win_bid_price":win_bid_price}
  2214. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2215. _dict = {"tenderee":tenderee,
  2216. "win_tenderer":win_tenderer,
  2217. "bidding_budget":bidding_budget}
  2218. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2219. _dict = {"tenderee":tenderee,
  2220. "win_bid_price":win_bid_price,
  2221. "bidding_budget":bidding_budget}
  2222. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2223. _dict = {"tenderee":tenderee,
  2224. "agency":agency,
  2225. "win_tenderer":win_tenderer}
  2226. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2227. _dict = {"tenderee":tenderee,
  2228. "agency":agency,
  2229. "win_bid_price":win_bid_price}
  2230. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2231. _dict = {"tenderee":tenderee,
  2232. "agency":agency,
  2233. "bidding_budget":bidding_budget}
  2234. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2235. confidence=85
  2236. _dict = {"tenderee":tenderee,
  2237. "agency":agency
  2238. }
  2239. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2240. _dict = {"tenderee":tenderee,
  2241. "project_codes":project_code
  2242. }
  2243. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2244. _dict = {"tenderee":tenderee,
  2245. "project_name":project_name
  2246. }
  2247. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2248. if getLength(product)>0:
  2249. l_p = product.split(",")
  2250. _dict = {"tenderee":tenderee,
  2251. "product":l_p[0]
  2252. }
  2253. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2254. _dict = {"tenderee":tenderee,
  2255. "win_tenderer":win_tenderer
  2256. }
  2257. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2258. _dict = {"tenderee":tenderee,
  2259. "win_bid_price":win_bid_price
  2260. }
  2261. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2262. _dict = {"tenderee":tenderee,
  2263. "bidding_budget":bidding_budget
  2264. }
  2265. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2266. _dict = {"tenderee":tenderee,
  2267. doctitle_refine_name:doctitle_refine
  2268. }
  2269. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2270. _dict = {"agency":agency,
  2271. "project_codes":project_code
  2272. }
  2273. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2274. _dict = {"agency":agency,
  2275. "project_name":project_name
  2276. }
  2277. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2278. _dict = {"project_codes":project_code,
  2279. "project_name":project_name
  2280. }
  2281. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2282. _dict = {"project_codes":project_code,
  2283. "win_tenderer":win_tenderer
  2284. }
  2285. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2286. _dict = {"project_codes":project_code,
  2287. "win_bid_price":win_bid_price
  2288. }
  2289. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2290. _dict = {"project_codes":project_code,
  2291. "bidding_budget":bidding_budget
  2292. }
  2293. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2294. _dict = {"project_codes":project_code,
  2295. doctitle_refine_name:doctitle_refine
  2296. }
  2297. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2298. _dict = {"project_name":project_name,
  2299. "win_tenderer":win_tenderer
  2300. }
  2301. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2302. _dict = {"project_name":project_name,
  2303. "win_bid_price":win_bid_price
  2304. }
  2305. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2306. _dict = {"project_name":project_name,
  2307. "bidding_budget":bidding_budget
  2308. }
  2309. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2310. _dict = {"project_name":project_name,
  2311. doctitle_refine_name:doctitle_refine
  2312. }
  2313. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2314. _dict = {"win_tenderer":win_tenderer,
  2315. "win_bid_price":win_bid_price
  2316. }
  2317. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2318. _dict = {"win_tenderer":win_tenderer,
  2319. "bidding_budget":bidding_budget
  2320. }
  2321. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2322. _dict = {"win_tenderer":win_tenderer,
  2323. doctitle_refine_name:doctitle_refine
  2324. }
  2325. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2326. _dict = {"win_bid_price":win_bid_price,
  2327. "bidding_budget":bidding_budget
  2328. }
  2329. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2330. _dict = {"win_bid_price":win_bid_price,
  2331. doctitle_refine_name:doctitle_refine
  2332. }
  2333. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2334. _dict = {"bidding_budget":bidding_budget,
  2335. doctitle_refine_name:doctitle_refine
  2336. }
  2337. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2338. confidence=80
  2339. _dict = {doctitle_refine_name:doctitle_refine}
  2340. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2341. _dict = {"project_codes":project_code}
  2342. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2343. confidence=70
  2344. _dict = {"project_name":project_name}
  2345. self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
  2346. return list_rules,table_name,table_index
  2347. def flow_dumplicate(self,process_count=flow_process_count,status_from=flow_dumplicate_status_from):
  2348. def producer(columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json]):
  2349. bool_query = BoolQuery(must_queries=[
  2350. RangeQuery(document_tmp_status,*status_from,True,True),
  2351. # TermQuery("docid",271983871)
  2352. ])
  2353. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  2354. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
  2355. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  2356. log("flow_dumplicate producer total_count:%d"%total_count)
  2357. list_dict = getRow_ots(rows)
  2358. for _dict in list_dict:
  2359. self.queue_dumplicate.put(_dict)
  2360. _count = len(list_dict)
  2361. while next_token and _count<flow_process_count:
  2362. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  2363. SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
  2364. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  2365. list_dict = getRow_ots(rows)
  2366. for _dict in list_dict:
  2367. self.queue_dumplicate.put(_dict)
  2368. _count += len(list_dict)
  2369. def comsumer():
  2370. mt = MultiThreadHandler(self.queue_dumplicate,self.dumplicate_comsumer_handle,None,60,1,ots_client=self.ots_client)
  2371. mt.run()
  2372. producer()
  2373. comsumer()
  2374. def search_docs(self,list_docids,columns_to_get = [document_doctitle,document_tmp_save,document_bidway,document_status,document_page_time,document_info_source,document_fingerprint,document_docchannel,document_life_docchannel,document_area,document_province,document_city,document_district,document_tmp_sub_docs_json,document_industry,document_info_type,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_project_codes,document_product,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count]):
  2375. '''
  2376. 根据docid查询公告内容,先查询document_tmp,再查询document
  2377. :param list_docids:
  2378. :return:
  2379. '''
  2380. list_docs = []
  2381. for _docid in list_docids:
  2382. docid = int(_docid)
  2383. _dict = {document_partitionkey:getPartitionKey(docid),
  2384. document_docid:docid}
  2385. _doc = Document_tmp(_dict)
  2386. _exists = _doc.fix_columns(self.ots_client,columns_to_get,True)
  2387. if not _exists:
  2388. _doc = Document(_dict)
  2389. _exists = _doc.fix_columns(self.ots_client,columns_to_get,True)
  2390. if _exists:
  2391. list_docs.append(_doc)
  2392. for _doc in list_docs:
  2393. try:
  2394. _sub_docs_json = _doc.getProperties().get(document_tmp_sub_docs_json)
  2395. if _sub_docs_json is not None:
  2396. _doc.setValue("sub_docs",json.loads(_sub_docs_json),False)
  2397. except Exception as e:
  2398. traceback.print_exc()
  2399. list_docs.sort(key=lambda x:x.getProperties().get(document_page_time,""))
  2400. return list_docs
  2401. def is_same_package(self,_dict1,_dict2):
  2402. sub_project_name1 = _dict1.get(project_sub_project_name,"")
  2403. if sub_project_name1=="Project":
  2404. sub_project_name1 = ""
  2405. win_tenderer1 = _dict1.get(project_win_tenderer,"")
  2406. win_bid_price1 = _dict1.get(project_win_bid_price,0)
  2407. bidding_budget1 = _dict1.get(project_bidding_budget,0)
  2408. sub_project_name2 = _dict2.get(project_sub_project_name,"")
  2409. if sub_project_name2=="Project":
  2410. sub_project_name2 = ""
  2411. win_tenderer2 = _dict2.get(project_win_tenderer,"")
  2412. win_bid_price2 = _dict2.get(project_win_bid_price,0)
  2413. bidding_budget2 = _dict2.get(project_bidding_budget,0)
  2414. _set = set([a for a in [sub_project_name1,sub_project_name2] if a!=""])
  2415. if len(_set)>1:
  2416. return False
  2417. _set = set([a for a in [win_tenderer1,win_tenderer2] if a!=""])
  2418. if len(_set)>1:
  2419. return False
  2420. _set = set([a for a in [win_bid_price1,win_bid_price2] if a!=0])
  2421. if len(_set)>1:
  2422. return False
  2423. _set = set([a for a in [bidding_budget1,bidding_budget2] if a!=0])
  2424. if len(_set)>1:
  2425. return False
  2426. return True
  2427. def getUpdate_dict(self,_dict):
  2428. update_dict = {}
  2429. for k,v in _dict.items():
  2430. if v is None:
  2431. continue
  2432. if isinstance(v,str):
  2433. if v=="":
  2434. continue
  2435. if isinstance(v,(float,int)):
  2436. if v==0:
  2437. continue
  2438. update_dict[k] = v
  2439. return update_dict
def update_projects_by_project(self,project_dict,projects):
    """Merge the properties of *project_dict* into every project in *projects*.

    Scalar fields overwrite targets that are missing or hold placeholder
    values; the concatenated fields (docids/codes/product/uuid/dynamics) are
    unioned across all projects plus *project_dict*.
    """
    _dict = {}
    # update the common (scalar) attributes
    for k,v in project_dict.items():
        # concatenated fields are merged separately below
        if k in (project_dynamics,project_product,project_project_codes,project_docids,project_uuid):
            continue
        for _proj in projects:
            if k not in _proj:
                _dict[k] = v
            elif _proj.get(k,"未知") in ('全国',"未知"):
                # '全国' (nationwide) / '未知' (unknown) act as placeholders
                # that a concrete value may replace
                _dict[k] = v
    for _proj in projects:
        _proj.update(_dict)
    # merge the concatenated attributes
    append_dict = {}
    set_docid = set()
    set_product = set()
    set_code = set()
    set_uuid = set()
    for _proj in projects:
        _docids = _proj.get(project_docids,"")
        _codes = _proj.get(project_project_codes,"")
        _product = _proj.get(project_product,"")
        _uuid = _proj.get(project_uuid,"")
        set_docid = set_docid | set(_docids.split(","))
        set_code = set_code | set(_codes.split(","))
        set_product = set_product | set(_product.split(","))
        set_uuid = set_uuid | set(_uuid.split(","))
    set_docid = set_docid | set(project_dict.get(project_docids,"").split(","))
    set_code = set_code | set(project_dict.get(project_project_codes,"").split(","))
    set_product = set_product | set(project_dict.get(project_product,"").split(","))
    set_uuid = set_uuid | set(project_dict.get(project_uuid,"").split(","))
    append_dict[project_docids] = ",".join([a for a in list(set_docid) if a!=""])
    # NOTE(review): this counts the raw set, which contains the "" produced by
    # splitting an empty string — possibly off by one vs the joined list; confirm
    append_dict[project_docid_number] = len(set_docid)
    append_dict[project_project_codes] = ",".join([a for a in list(set_code) if a!=""])
    append_dict[project_product] = ",".join([a for a in list(set_product) if a!=""])
    append_dict[project_uuid] = ",".join([a for a in list(set_uuid) if a!=""])
    # merge the dynamics lists, deduplicated by docid (later entries win)
    dict_dynamic = {}
    set_docid = set()  # name reused from above; not read again below
    for _proj in projects:
        _dynamic = json.loads(_proj.get(project_dynamics,"[]"))
        for _dy in _dynamic:
            _docid = _dy.get("docid")
            dict_dynamic[_docid] = _dy
    _dynamic = json.loads(project_dict.get(project_dynamics,"[]"))
    for _dy in _dynamic:
        _docid = _dy.get("docid")
        dict_dynamic[_docid] = _dy
    list_dynamics = []
    for k,v in dict_dynamic.items():
        list_dynamics.append(v)
    list_dynamics.sort(key=lambda x:x.get(document_page_time,""))
    append_dict[project_dynamics] = json.dumps(list_dynamics,ensure_ascii=False)
    for _proj in projects:
        _proj.update(append_dict)
def update_projects_by_document(self,docid,projects):
    '''
    Merge the document identified by *docid* into the given projects.

    Common properties fill in targets that lack a concrete value,
    list-like properties (docids/codes/products/dynamics) are unioned,
    and each package (sub_doc) extracted from the document is either
    merged into a matching existing package or appended as a new project.
    :param docid: id of the document to merge in
    :param projects: project dicts, updated (and possibly extended) in place
    :return: None
    '''
    list_docs = self.search_docs([docid])
    project_dict = self.generate_common_properties(list_docs)
    list_package_properties = self.generate_packages_properties(list_docs)
    _dict = {}
    # Common scalar attributes: only overwrite targets that lack the key or
    # hold the "未知" (unknown) placeholder.
    for k,v in project_dict.items():
        if v is None or v=="":
            continue
        if k in (project_dynamics,project_product,project_project_codes,project_docids):
            continue
        for _proj in projects:
            if k not in _proj:
                _dict[k] = v
            elif _proj.get(k,"未知")=="未知":
                _dict[k] = v
    for _proj in projects:
        _proj.update(_dict)
    # Concatenated attributes: union of comma-separated values across all
    # projects plus the incoming document.
    append_dict = {}
    set_docid = set()
    set_product = set()
    set_code = set()
    for _proj in projects:
        _docids = _proj.get(project_docids,"")
        _codes = _proj.get(project_project_codes,"")
        _product = _proj.get(project_product,"")
        set_docid = set_docid | set(_docids.split(","))
        set_code = set_code | set(_codes.split(","))
        set_product = set_product | set(_product.split(","))
    set_docid = set_docid | set(project_dict.get(project_docids,"").split(","))
    set_code = set_code | set(project_dict.get(project_project_codes,"").split(","))
    set_product = set_product | set(project_dict.get(project_product,"").split(","))
    append_dict[project_docids] = ",".join([a for a in list(set_docid) if a!=""])
    append_dict[project_docid_number] = len(set_docid)
    append_dict[project_project_codes] = ",".join([a for a in list(set_code) if a!=""])
    append_dict[project_product] = ",".join([a for a in list(set_product) if a!=""])
    # Dynamics timeline: deduplicate by docid (later entries win), sort by
    # page_time.
    dict_dynamic = {}
    set_docid = set()
    for _proj in projects:
        _dynamic = json.loads(_proj.get(project_dynamics,"[]"))
        for _dy in _dynamic:
            _docid = _dy.get("docid")
            dict_dynamic[_docid] = _dy
    _dynamic = json.loads(project_dict.get(project_dynamics,"[]"))
    for _dy in _dynamic:
        _docid = _dy.get("docid")
        dict_dynamic[_docid] = _dy
    list_dynamics = []
    for k,v in dict_dynamic.items():
        list_dynamics.append(v)
    list_dynamics.sort(key=lambda x:x.get(document_page_time,""))
    append_dict[project_dynamics] = json.dumps(list_dynamics,ensure_ascii=False)
    for _proj in projects:
        _proj.update(append_dict)
    # Index existing projects by package-identity keys so incoming packages
    # can find a project to merge into.  Keys combine the sub-project name
    # with the winner and/or the money figures; weaker single-attribute keys
    # are only used when no strong key exists.
    dict_package = {}
    for _pp in projects:
        _counts = 0
        sub_project_name = _pp.get(project_sub_project_name,"")
        if sub_project_name=="Project":
            # "Project" is the default (whole-project) package name.
            sub_project_name = ""
        win_tenderer = _pp.get(project_win_tenderer,"")
        win_bid_price = _pp.get(project_win_bid_price,0)
        bidding_budget = _pp.get(project_bidding_budget,0)
        if win_tenderer!="" and bidding_budget!=0:
            _key = "%s-%s-%s"%(sub_project_name,str(win_tenderer),str(bidding_budget))
            dict_package[_key] = _pp
            _counts += 1
        if win_tenderer!="" and win_bid_price!=0:
            _key = "%s-%s-%s"%(sub_project_name,win_tenderer,str(win_bid_price))
            dict_package[_key] = _pp
            _counts +=1
        if _counts==0:
            if win_tenderer!="":
                _key = "%s-%s"%(sub_project_name,win_tenderer)
                dict_package[_key] = _pp
                _counts += 1
            if bidding_budget!=0:
                _key = "%s-%s"%(sub_project_name,str(bidding_budget))
                dict_package[_key] = _pp
                _counts += 1
    # Merge the document's package-level (private) properties: try the
    # strongest key first, fall back to weaker keys, and append as a new
    # project when nothing matches.
    for _pp in list_package_properties:
        flag_update = False
        sub_project_name = _pp.get(project_sub_project_name,"")
        if sub_project_name=="Project":
            sub_project_name = ""
        win_tenderer = _pp.get(project_win_tenderer,"")
        win_bid_price = _pp.get(project_win_bid_price,0)
        bidding_budget = _pp.get(project_bidding_budget,0)
        if win_tenderer!="" and bidding_budget!=0:
            _key = "%s-%s-%s"%(sub_project_name,str(win_tenderer),str(bidding_budget))
            if _key in dict_package:
                if self.is_same_package(_pp,dict_package[_key]):
                    ud = self.getUpdate_dict(_pp)
                    self.set_project_uuid(ud,dict_package[_key].get("uuid"))
                    dict_package[_key].update(ud)
                    flag_update = True
                    continue
        if win_tenderer!="" and win_bid_price!=0:
            _key = "%s-%s-%s"%(sub_project_name,win_tenderer,str(win_bid_price))
            if _key in dict_package:
                if self.is_same_package(_pp,dict_package[_key]):
                    ud = self.getUpdate_dict(_pp)
                    self.set_project_uuid(ud,dict_package[_key].get("uuid"))
                    dict_package[_key].update(ud)
                    flag_update = True
                    continue
        if win_tenderer!="":
            _key = "%s-%s"%(sub_project_name,win_tenderer)
            if _key in dict_package:
                if self.is_same_package(_pp,dict_package[_key]):
                    ud = self.getUpdate_dict(_pp)
                    self.set_project_uuid(ud,dict_package[_key].get("uuid"))
                    dict_package[_key].update(ud)
                    flag_update = True
                    continue
        if bidding_budget!=0:
            _key = "%s-%s"%(sub_project_name,str(bidding_budget))
            if _key in dict_package:
                if self.is_same_package(_pp,dict_package[_key]):
                    ud = self.getUpdate_dict(_pp)
                    self.set_project_uuid(ud,dict_package[_key].get("uuid"))
                    dict_package[_key].update(ud)
                    flag_update = True
                    continue
        if not flag_update:
            # No existing package matched: promote the package to a new
            # project and register its identity keys.
            _pp.update(project_dict)
            projects.append(_pp)
            _counts = 0
            if win_tenderer!="" and bidding_budget!=0:
                _key = "%s-%s-%s"%(sub_project_name,str(win_tenderer),str(bidding_budget))
                dict_package[_key] = _pp
                _counts += 1
            if win_tenderer!="" and win_bid_price!=0:
                _key = "%s-%s-%s"%(sub_project_name,win_tenderer,str(win_bid_price))
                dict_package[_key] = _pp
                _counts +=1
            if _counts==0:
                if win_tenderer!="":
                    _key = "%s-%s"%(sub_project_name,win_tenderer)
                    dict_package[_key] = _pp
                    _counts += 1
                if bidding_budget!=0:
                    _key = "%s-%s"%(sub_project_name,str(bidding_budget))
                    dict_package[_key] = _pp
                    _counts += 1
  2649. def delete_projects_by_document(self,docid):
  2650. '''
  2651. 更新projects中对应的document的属性
  2652. :param docid:
  2653. :param projects: 项目集合
  2654. :param action:add/delete add时附加唯一属性,delete时删除唯一属性
  2655. :return:
  2656. '''
  2657. set_docid = set()
  2658. list_delete_projects = []
  2659. list_projects = self.search_projects_with_document([docid])
  2660. for _proj in list_projects:
  2661. _p = {}
  2662. _docids = _proj.get(project_docids,"")
  2663. print(_proj.get(project_uuid))
  2664. _p["delete_uuid"] = _proj.get(project_uuid)
  2665. _p["to_delete"] = True
  2666. list_delete_projects.append(_p)
  2667. if _docids!="":
  2668. set_docid = set_docid | set(_docids.split(","))
  2669. if str(docid) in set_docid:
  2670. set_docid.remove(str(docid))
  2671. list_docid = list(set_docid)
  2672. list_projects = []
  2673. if len(list_docid)>0:
  2674. list_docs = self.search_docs(list_docid)
  2675. list_projects = self.generate_projects_from_document(list_docs)
  2676. list_projects = self.dumplicate_projects(list_projects)
  2677. list_projects.extend(list_delete_projects)
  2678. project_json = self.to_project_json(list_projects)
  2679. print("delete_json",project_json)
  2680. return project_json
  2681. def delete_doc_handle(self,_dict,result_queue):
  2682. headers = _dict.get("frame")
  2683. conn = _dict.get("conn")
  2684. log("==========delete")
  2685. if headers is not None:
  2686. message_id = headers.headers["message-id"]
  2687. body = headers.body
  2688. item = json.loads(body)
  2689. docid = item.get("docid")
  2690. if docid is None:
  2691. return
  2692. delete_result = self.delete_projects_by_document(docid)
  2693. if send_msg_toacmq(self.pool_mq_ali,delete_result,self.doc_delete_result):
  2694. ackMsg(conn,message_id)
def generate_common_properties(self,list_docs):
    '''
    Generate the project-level common properties shared by a set of documents.

    Scalar fields are chosen by majority vote over the documents, area
    fields are taken from the first of district/city/province/area that
    has a non-placeholder value, and docids/codes/products plus the
    dynamics timeline are accumulated over all documents.
    :param list_docs: documents belonging to one project
    :return: dict of project properties
    '''
    # Vote counting: for every candidate field, count each non-empty value
    # over all documents.
    choose_dict = {}
    project_dict = {}
    for _key in [document_bidway,document_industry,document_info_type,document_info_source,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count]:
        for _doc in list_docs:
            _value = _doc.getProperties().get(_key,"")
            if _value!="":
                if _key not in choose_dict:
                    choose_dict[_key] = {}
                if _value not in choose_dict[_key]:
                    choose_dict[_key][_value] = 0
                choose_dict[_key][_value] += 1
    # Area fields: walk from most to least specific and stop at the first
    # level where some document has a concrete (non-placeholder) location.
    _find = False
    for _key in [document_district,document_city,document_province,document_area]:
        area_dict = {}
        for _doc in list_docs:
            loc = _doc.getProperties().get(_key,"未知")
            if loc not in ('全国','未知',"0"):
                if loc not in area_dict:
                    area_dict[loc] = 0
                area_dict[loc] += 1
        list_loc = []
        for k,v in area_dict.items():
            list_loc.append([k,v])
        list_loc.sort(key=lambda x:x[1],reverse=True)
        if len(list_loc)>0:
            # NOTE(review): _doc here is the LAST document of the loop above,
            # not necessarily the one holding the top-voted location in
            # list_loc — confirm this is intended.
            project_dict[document_district] = _doc.getProperties().get(document_district)
            project_dict[document_city] = _doc.getProperties().get(document_city)
            project_dict[document_province] = _doc.getProperties().get(document_province)
            project_dict[document_area] = _doc.getProperties().get(document_area)
            _find = True
            break
    if not _find:
        # No concrete location anywhere: fall back to the first document.
        if len(list_docs)>0:
            project_dict[document_district] = list_docs[0].getProperties().get(document_district)
            project_dict[document_city] = list_docs[0].getProperties().get(document_city)
            project_dict[document_province] = list_docs[0].getProperties().get(document_province)
            project_dict[document_area] = list_docs[0].getProperties().get(document_area)
    print("choose_dict",choose_dict)
    # Resolve the vote: take the most frequent value, skipping the
    # placeholders '全国'/'未知' when an alternative exists.
    for _key,_value in choose_dict.items():
        _l = []
        for k,v in _value.items():
            _l.append([k,v])
        _l.sort(key=lambda x:x[1],reverse=True)
        if len(_l)>0:
            _v = _l[0][0]
            if _v in ('全国','未知'):
                if len(_l)>1:
                    _v = _l[1][0]
            project_dict[_key] = _v
    # Accumulate per-document data: visible docids, earliest tender (招标)
    # and award (中标) page times, codes, products and the dynamics timeline.
    list_dynamics = []
    docid_number = 0
    visuable_docids = []
    zhao_biao_page_time = ""
    zhong_biao_page_time = ""
    list_codes = []
    list_product = []
    p_page_time = ""
    remove_docids = set()
    for _doc in list_docs:
        table_name = _doc.getProperties().get("table_name")
        status = _doc.getProperties().get(document_status,0)
        _save = _doc.getProperties().get(document_tmp_save,1)
        doctitle = _doc.getProperties().get(document_doctitle,"")
        docchannel = _doc.getProperties().get(document_docchannel)
        page_time = _doc.getProperties().get(document_page_time,"")
        _docid = _doc.getProperties().get(document_docid)
        _bidway = _doc.getProperties().get(document_bidway,"")
        _docchannel = _doc.getProperties().get(document_life_docchannel,0)
        project_codes = _doc.getProperties().get(document_project_codes)
        product = _doc.getProperties().get(document_product)
        sub_docs = _doc.getProperties().get("sub_docs",[])
        # More than one sub_doc means the document describes multiple packages.
        is_multipack = True if len(sub_docs)>1 else False
        extract_count = _doc.getProperties().get(document_tmp_extract_count,0)
        if product is not None:
            list_product.extend(product.split(","))
        if project_codes is not None:
            _c = project_codes.split(",")
            list_codes.extend(_c)
        if p_page_time=="":
            p_page_time = page_time
        print("docid %s page_time:%s docchannel %s"%(str(_docid),str(page_time),str(_docchannel)))
        # Channel ids below presumably distinguish tender vs award
        # announcements — confirm against the channel id table.
        if zhao_biao_page_time=="" and _docchannel in (51,52,102,103,114):
            zhao_biao_page_time = page_time
        if zhong_biao_page_time=="" and _docchannel in (101,118,119,120):
            zhong_biao_page_time = page_time
        # Visibility: status 201-300 for the document table, _save==1 for
        # the temporary table; invisible docids are removed at the end.
        is_visuable = 0
        if table_name=="document":
            if status>=201 and status<=300:
                docid_number +=1
                visuable_docids.append(str(_docid))
                is_visuable = 1
            else:
                remove_docids.add(str(_docid))
        else:
            if _save==1:
                docid_number +=1
                visuable_docids.append(str(_docid))
                is_visuable = 1
            else:
                remove_docids.add(str(_docid))
        list_dynamics.append({document_docid:_docid,
                              document_doctitle:doctitle,
                              document_docchannel:_docchannel,
                              document_bidway:_bidway,
                              document_page_time:page_time,
                              document_status:201 if is_visuable==1 else 401,
                              "is_multipack":is_multipack,
                              document_tmp_extract_count:extract_count
                              }
                             )
    project_dict[project_dynamics] = json.dumps(list_dynamics,ensure_ascii=False)
    project_dict[project_docid_number] = docid_number
    project_dict[project_docids] = ",".join(list(set(visuable_docids)-remove_docids))
    if zhao_biao_page_time !="":
        project_dict[project_zhao_biao_page_time] = zhao_biao_page_time
    if zhong_biao_page_time !="":
        project_dict[project_zhong_biao_page_time] = zhong_biao_page_time
    project_dict[project_project_codes] = ",".join(list(set(list_codes)))
    project_dict[project_page_time] = p_page_time
    project_dict[project_product] = ",".join(list(set(list_product)))
    return project_dict
  2823. def generate_packages_properties(self,list_docs):
  2824. '''
  2825. 生成分包属性
  2826. :param list_docs:
  2827. :return:
  2828. '''
  2829. list_properties = []
  2830. set_key = set()
  2831. for _doc in list_docs:
  2832. _dict = {}
  2833. sub_docs = _doc.getProperties().get("sub_docs")
  2834. if sub_docs is not None:
  2835. for _d in sub_docs:
  2836. sub_project_code = _d.get(project_sub_project_code,"")
  2837. sub_project_name = _d.get(project_sub_project_name,"")
  2838. win_tenderer = _d.get(project_win_tenderer,"")
  2839. win_bid_price = _d.get(project_win_bid_price,"")
  2840. _key = "%s-%s-%s-%s"%(sub_project_code,sub_project_name,win_tenderer,win_bid_price)
  2841. if _key in set_key:
  2842. continue
  2843. set_key.add(_key)
  2844. list_properties.append(_d)
  2845. return list_properties
  2846. def generate_projects_from_document(self,list_docs):
  2847. '''
  2848. #通过公告生成projects
  2849. :param list_docids:
  2850. :return:
  2851. '''
  2852. #判断标段数
  2853. list_projects = []
  2854. project_dict = self.generate_common_properties(list_docs)
  2855. list_package_properties = self.generate_packages_properties(list_docs)
  2856. #生成包数据
  2857. for _pp in list_package_properties:
  2858. _pp.update(project_dict)
  2859. list_projects.append(_pp)
  2860. return list_projects
def search_projects_with_document(self,list_docids):
    '''
    Query the project2 table for all projects containing any of the docids.

    :param list_docids: document ids to look up
    :return: list of project dicts carrying the full project column set
    '''
    print("==",list_docids)
    # One term query per docid, OR-ed together.
    list_should_q = []
    for _docid in list_docids:
        list_should_q.append(TermQuery("docids",_docid))
    bool_query = BoolQuery(should_queries=list_should_q)
    # NOTE(review): at most 20 projects are returned — confirm that is
    # enough when a document is shared by many projects.
    _query = {"query":bool_query,"limit":20}
    list_project_dict = getDocument(_query,self.ots_client,[
        project_uuid,project_docids,project_zhao_biao_page_time,
        project_zhong_biao_page_time,
        project_page_time,
        project_area,
        project_province,
        project_city,
        project_district,
        project_info_type,
        project_industry,
        project_qcodes,
        project_project_name,
        project_project_code,
        project_project_codes,
        project_project_addr,
        project_tenderee,
        project_tenderee_addr,
        project_tenderee_phone,
        project_tenderee_contact,
        project_agency,
        project_agency_phone,
        project_agency_contact,
        project_sub_project_name,
        project_sub_project_code,
        project_bidding_budget,
        project_win_tenderer,
        project_win_bid_price,
        project_win_tenderer_manager,
        project_win_tenderer_phone,
        project_second_tenderer,
        project_second_bid_price,
        project_second_tenderer_manager,
        project_second_tenderer_phone,
        project_third_tenderer,
        project_third_bid_price,
        project_third_tenderer_manager,
        project_third_tenderer_phone,
        project_procurement_system,
        project_bidway,
        project_dup_data,
        project_docid_number,
        project_dynamics
    ],sort="page_time",table_name="project2",table_index="project2_index")
    return list_project_dict
  2917. def set_project_uuid(self,_dict,_uuid):
  2918. if _uuid is not None and _uuid!="":
  2919. if "uuid" in _dict:
  2920. _dict["uuid"] = "%s,%s"%(_dict["uuid"],_uuid)
  2921. else:
  2922. _dict["uuid"] = _uuid
  2923. def dumplicate_projects(self,list_projects):
  2924. '''
  2925. 对多标段项目进行去重
  2926. :return:
  2927. '''
  2928. cluster_projects = list_projects
  2929. while 1:
  2930. _update = False
  2931. list_p = []
  2932. for _pp in cluster_projects:
  2933. _find = False
  2934. for _p in list_p:
  2935. if self.check_merge_rule(_p,_pp):
  2936. self.update_projects_by_project(_pp,[_p])
  2937. _find = True
  2938. _update = True
  2939. if not _find:
  2940. list_p.append(_pp)
  2941. if len(cluster_projects)==len(list_p):
  2942. break
  2943. cluster_projects = list_p
  2944. return cluster_projects
def getMerge_rules(self,page_time,project_codes,project_name,tenderee,agency,product,sub_project_name,bidding_budget,win_tenderer,win_bid_price):
    '''
    Build the candidate-search queries for finding mergeable projects.

    Each rule combines an identity attribute (codes, product, name,
    budget or winner) with the sub-project name and a page_time window
    of -150/+120 days.
    :return: list of [must_queries, weight] pairs; the weight is used by
             the caller to scale the per-rule search limit.
    '''
    list_query = []
    # Search window around the project's page_time.
    page_time_less = timeAdd(page_time,-150)
    page_time_greater = timeAdd(page_time,120)
    list_code = [a for a in project_codes.split(",") if a!='']
    # Queries over the multi-code field and the single-code field; at most
    # the first 20 codes/products are used.
    should_q_code = BoolQuery(should_queries=[MatchQuery(project_project_codes,a) for a in list_code[:20]])
    should_q_cod = BoolQuery(should_queries=[MatchQuery(project_project_code,a) for a in list_code[:20]])
    list_product = [a for a in product.split(",") if a!='']
    should_q_product = BoolQuery(should_queries=[MatchQuery(project_product,a) for a in list_product[:20]])
    # Restrict by sub-project name when it is a real package name; the
    # default "Project" name matches everything (ExistsQuery stand-in).
    sub_project_q = TermQuery(project_sub_project_name,sub_project_name) if sub_project_name.replace("Project","")!="" else ExistsQuery(project_uuid)
    log("page_time_less %s"%(page_time_less))
    log("page_time_greater %s"%(page_time_greater))
    log("list_code %s"%(str(list_code)))
    log("list_product %s"%(str(list_product)))
    log("tenderee %s"%(tenderee))
    log("bidding_budget %s"%(bidding_budget))
    log("win_tenderer %s"%(win_tenderer))
    log("win_bid_price %s"%(win_bid_price))
    log("project_name %s"%(project_name))
    # Strong rules (weight 2): tenderee combined with another identity.
    if tenderee!="" and len(list_code)>0:
        _query = [TermQuery(project_tenderee,tenderee),
                  sub_project_q,
                  should_q_code,
                  RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
        list_query.append([_query,2])
        _query = [TermQuery(project_tenderee,tenderee),
                  sub_project_q,
                  should_q_cod,
                  RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
        list_query.append([_query,2])
    if tenderee!="" and len(list_product)>0:
        _query = [TermQuery(project_tenderee,tenderee),
                  sub_project_q,
                  should_q_product,
                  RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
        list_query.append([_query,2])
    if tenderee!="" and project_name!="":
        _query = [TermQuery(project_tenderee,tenderee),
                  sub_project_q,
                  TermQuery(project_project_name,project_name),
                  RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
        list_query.append([_query,2])
    if tenderee!="" and bidding_budget>0:
        _query = [TermQuery(project_tenderee,tenderee),
                  sub_project_q,
                  TermQuery(project_bidding_budget,bidding_budget),
                  RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
        list_query.append([_query,2])
    if tenderee!="" and win_tenderer!="":
        _query = [TermQuery(project_tenderee,tenderee),
                  sub_project_q,
                  TermQuery(project_win_tenderer,win_tenderer),
                  RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
        list_query.append([_query,2])
    if win_tenderer!="" and len(list_code)>0:
        _query = [TermQuery(project_win_tenderer,win_tenderer),
                  sub_project_q,
                  should_q_code,
                  RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
        list_query.append([_query,2])
        _query = [TermQuery(project_win_tenderer,win_tenderer),
                  sub_project_q,
                  should_q_cod,
                  RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
        list_query.append([_query,2])
    if win_tenderer!="" and win_bid_price>0:
        _query = [TermQuery(project_win_tenderer,win_tenderer),
                  sub_project_q,
                  TermQuery(project_win_bid_price,win_bid_price),
                  RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
        list_query.append([_query,2])
    # Weak rules (weight 1): a single identity attribute only.
    if len(list_code)>0:
        _query = [
            sub_project_q,
            should_q_code,
            RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
        list_query.append([_query,1])
        _query = [
            sub_project_q,
            should_q_cod,
            RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
        list_query.append([_query,1])
    if project_name!="":
        _query = [
            sub_project_q,
            TermQuery(project_project_name,project_name),
            RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
        list_query.append([_query,1])
    return list_query
  3034. def check_merge_rule(self,_proj,_dict,b_log=False):
  3035. page_time = _proj.get(project_page_time,"")
  3036. project_codes = _proj.get(project_project_codes,"")
  3037. project_name = _proj.get(project_project_name,"")
  3038. tenderee = _proj.get(project_tenderee,"")
  3039. agency = _proj.get(project_agency,"")
  3040. product = _proj.get(project_product,"")
  3041. sub_project_name = _proj.get(project_sub_project_name,"")
  3042. bidding_budget = _proj.get(project_bidding_budget,-1)
  3043. win_tenderer = _proj.get(project_win_tenderer,"")
  3044. win_bid_price = _proj.get(project_win_bid_price,-1)
  3045. project_code = _proj.get(project_project_code,"")
  3046. list_code = [a for a in project_codes.split(",") if a!='']
  3047. if project_code!="":
  3048. list_code.append(project_code)
  3049. page_time_to_merge = _dict.get(project_page_time,"")
  3050. project_codes_to_merge = _dict.get(project_project_codes,"")
  3051. project_name_to_merge = _dict.get(project_project_name,"")
  3052. tenderee_to_merge = _dict.get(project_tenderee,"")
  3053. agency_to_merge = _dict.get(project_agency,"")
  3054. product_to_merge = _dict.get(project_product,"")
  3055. sub_project_name_to_merge = _dict.get(project_sub_project_name,"")
  3056. bidding_budget_to_merge = _dict.get(project_bidding_budget,-1)
  3057. win_tenderer_to_merge = _dict.get(project_win_tenderer,"")
  3058. win_bid_price_to_merge = _dict.get(project_win_bid_price,-1)
  3059. project_code_to_merge = _dict.get(project_project_code,"")
  3060. list_code_to_merge = [a for a in project_codes_to_merge.split(",") if a!='']
  3061. if project_code_to_merge!="":
  3062. list_code.append(project_code_to_merge)
  3063. #check sub_project_name
  3064. _set = set([a for a in [sub_project_name.replace("Project",""),sub_project_name_to_merge.replace("Project","")] if a!=""])
  3065. if len(_set)>1:
  3066. if b_log:
  3067. log("check sub_project_name failed %s===%s"%(str(_proj),str(_dict)))
  3068. return False
  3069. _set = set([a for a in [tenderee,tenderee_to_merge] if a!=""])
  3070. if len(_set)>1:
  3071. if b_log:
  3072. log("check tenderee failed %s===%s"%(str(_proj),str(_dict)))
  3073. return False
  3074. _set = set([a for a in [agency,agency_to_merge] if a!=""])
  3075. if len(_set)>1:
  3076. if b_log:
  3077. log("check agency failed %s===%s"%(str(_proj),str(_dict)))
  3078. return False
  3079. _set = set([a for a in [win_tenderer,win_tenderer_to_merge] if a!=""])
  3080. if len(_set)>1:
  3081. if b_log:
  3082. log("check win_tenderer failed %s===%s"%(str(_proj),str(_dict)))
  3083. return False
  3084. _set = set([a for a in [bidding_budget,bidding_budget_to_merge] if a>0])
  3085. if len(_set)>1:
  3086. if b_log:
  3087. log("check bidding_budget failed %s===%s"%(str(_proj),str(_dict)))
  3088. return False
  3089. _set = set([a for a in [win_bid_price,win_bid_price_to_merge] if a>0])
  3090. if len(_set)>1:
  3091. if b_log:
  3092. log("check win_bid_price failed %s===%s"%(str(_proj),str(_dict)))
  3093. return False
  3094. #check project_codes
  3095. has_same = False
  3096. has_similar = False
  3097. for _c in list_code:
  3098. for _c1 in list_code_to_merge:
  3099. _simi = getSimilarityOfString(_c,_c1)
  3100. if _simi==1:
  3101. has_same = True
  3102. elif _simi>0.7:
  3103. has_similar = True
  3104. if not has_same and has_similar:
  3105. if b_log:
  3106. log("check code failed %s===%s"%(str(_proj),str(_dict)))
  3107. return False
  3108. #check product
  3109. set_product = set([a for a in product.split(",") if a!=""])
  3110. set_product_to_merge = set([a for a in product_to_merge.split(",") if a!=""])
  3111. if len(set_product)>0 and len(set_product_to_merge)>0 and len(set_product&set_product_to_merge)==0:
  3112. if b_log:
  3113. log("check product failed %s===%s"%(str(_proj),str(_dict)))
  3114. return False
  3115. #check money
  3116. _set = set([a for a in [bidding_budget,bidding_budget_to_merge] if a>0])
  3117. _set1 = set([a for a in [win_bid_price,win_bid_price_to_merge] if a>0])
  3118. if len(_set)==1 and len(_set1)==1:
  3119. if max(_set1)>max(_set):
  3120. if b_log:
  3121. log("check money failed %s===%s"%(str(_proj),str(_dict)))
  3122. return False
  3123. return True
def merge_projects(self,list_projects,columns=[project_tenderee,project_agency,project_bidding_budget,project_win_tenderer,project_win_bid_price,project_sub_project_name,project_product,project_zhao_biao_page_time,project_zhong_biao_page_time,project_project_code,project_project_codes,project_docids]):
    '''
    Merge the given projects with matching projects already stored in the
    project2 table.

    :param list_projects: projects to merge, updated in place
    :param columns: columns fetched for each stored candidate project.
        NOTE: mutable default, but it is never mutated here.
    :return: list_projects with merged-in data
    '''
    # Exclude the projects' own uuids from the search so a project does
    # not merge with itself.
    set_uuid = set()
    for _proj in list_projects:
        _uuid = _proj.get("uuid")
        if _uuid is not None:
            set_uuid = set_uuid | set(_uuid.split(","))
    must_not_q = []
    for _uuid in list(set_uuid):
        must_not_q.append(TermQuery("uuid",_uuid))
    for _proj in list_projects:
        page_time = _proj.get(project_page_time,"")
        project_codes = _proj.get(project_project_codes,"")
        project_name = _proj.get(project_project_name,"")
        tenderee = _proj.get(project_tenderee,"")
        agency = _proj.get(project_agency,"")
        product = _proj.get(project_product,"")
        sub_project_name = _proj.get(project_sub_project_name,"")
        bidding_budget = _proj.get(project_bidding_budget,-1)
        win_tenderer = _proj.get(project_win_tenderer,"")
        win_bid_price = _proj.get(project_win_bid_price,-1)
        list_must_query = self.getMerge_rules(page_time,project_codes,project_name,tenderee,agency,product,sub_project_name,bidding_budget,win_tenderer,win_bid_price)
        print("rules count:%d"%(len(list_must_query)))
        list_merge_data = []
        # Run every rule; the rule weight scales the search limit, and each
        # hit is excluded from subsequent rules (must_not capped at 100).
        for must_q,_count in list_must_query:
            _query = BoolQuery(must_queries=must_q,
                               must_not_queries=must_not_q[:100])
            _limit = _count*10
            rows,next_token,total_count,is_all_succeed = self.ots_client.search("project2","project2_index_formerge",
                                                                               SearchQuery(_query,limit=_limit),
                                                                               columns_to_get=ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
            list_data = getRow_ots(rows)
            list_merge_data.extend(list_data)
            # print(list_data)
            for _data in list_data:
                must_not_q.append(TermQuery(project_uuid,_data.get(project_uuid)))
        # Prefer candidates with a similar bidding budget first.
        list_merge_data.sort(key=lambda x:x.get(project_bidding_budget,-1))
        for _data in list_merge_data:
            if self.check_merge_rule(_proj,_data,b_log=False):
                print("pass",_data)
                self.update_projects_by_project(_data,[_proj])
    return list_projects
  3170. def to_project_json(self,projects):
  3171. list_proj = []
  3172. for _proj in projects:
  3173. _uuid = _proj.get(project_uuid,"")
  3174. list_uuid = [a for a in _uuid.split(",") if a!=""]
  3175. if len(list_uuid)>0:
  3176. _proj["keep_uuid"] = list_uuid[0]
  3177. _proj["delete_uuid"] = ",".join(list_uuid[1:])
  3178. else:
  3179. _proj["keep_uuid"] = _proj.get("keep_uuid","")
  3180. _proj["delete_uuid"] = _proj.get("delete_uuid","")
  3181. list_proj.append(_proj)
  3182. if project_uuid in _proj:
  3183. _proj.pop(project_uuid)
  3184. return json.dumps(list_proj,ensure_ascii=False)
def dumplicate_document_in_merge(self,list_projects):
    '''
    De-duplicate documents inside each merged project.

    Keeps at most one visible document per docchannel, preferring
    multi-package documents, then higher extract_count, then the smaller
    docid; the losers are recorded in project_dup_docid and removed from
    project_docids.
    :param list_projects: merged projects, updated in place
    :return: None
    '''
    for _proj in list_projects:
        # Best document seen so far per docchannel.
        dict_channel_proj = {}
        list_dynamics = json.loads(_proj.get(project_dynamics,"[]"))
        set_dup_docid = set()
        for _d in list_dynamics:
            docid = _d.get(document_docid)
            _status = _d.get(document_status,201)
            is_multipack = _d.get("is_multipack",True)
            extract_count = _d.get(document_tmp_extract_count,0)
            docchannel = _d.get(document_docchannel,0)
            # Only visible documents (status 201-300) with a known channel
            # take part in the de-duplication.
            if _status>=201 and _status<=300 and docchannel>0:
                if docchannel in dict_channel_proj:
                    n_d = dict_channel_proj[docchannel]
                    n_docid = n_d.get(document_docid)
                    n_is_multipack = n_d.get("is_multipack",True)
                    n_extract_count = n_d.get(document_tmp_extract_count,0)
                    if not n_is_multipack:
                        if is_multipack:
                            # Multi-package beats single-package.
                            set_dup_docid.add(str(n_docid))
                            dict_channel_proj[docchannel] = _d
                        else:
                            # Both single-package: higher extract_count wins,
                            # ties broken by the smaller docid.
                            if extract_count>n_extract_count:
                                set_dup_docid.add(str(n_docid))
                                dict_channel_proj[docchannel] = _d
                            elif extract_count==n_extract_count:
                                if n_docid>docid:
                                    set_dup_docid.add(str(n_docid))
                                    dict_channel_proj[docchannel] = _d
                                else:
                                    set_dup_docid.add(str(docid))
                            else:
                                set_dup_docid.add(str(docid))
                    else:
                        if not is_multipack:
                            # Existing multi-package beats the newcomer.
                            set_dup_docid.add(str(docid))
                        else:
                            # NOTE(review): when both are multi-package the
                            # newcomer replaces the champion without marking
                            # the old one as duplicate — confirm intent.
                            dict_channel_proj[docchannel] = _d
        docids = _proj.get(project_docids,"")
        set_docids = set([a for a in docids.split(",") if a!=""])
        set_docids = set_docids-set_dup_docid
        _proj[project_docids] = ",".join(list(set_docids))
        _proj[project_docid_number] = len(set_docids)
        _proj[project_dup_docid] = ",".join(list(set_dup_docid))
  3234. def merge_document_real(self,item,dup_docid,table_name,status_to=None):
  3235. '''
  3236. 实时项目合并
  3237. :param item:
  3238. :param dup_docid:重复的公告集合
  3239. :param status_to:
  3240. :return:
  3241. '''
  3242. list_docids = []
  3243. _docid = item.get(document_tmp_docid)
  3244. list_docids.append(_docid)
  3245. if isinstance(dup_docid,list):
  3246. list_docids.extend(dup_docid)
  3247. list_docids = [a for a in list_docids if a is not None]
  3248. list_projects = self.search_projects_with_document(list_docids)
  3249. if len(list_projects)==0:
  3250. list_docs = self.search_docs(list_docids)
  3251. list_projects = self.generate_projects_from_document(list_docs)
  3252. else:
  3253. self.update_projects_by_document(_docid,list_projects)
  3254. list_projects = self.dumplicate_projects(list_projects)
  3255. list_projects = self.merge_projects(list_projects)
  3256. self.dumplicate_document_in_merge(list_projects)
  3257. project_json = self.to_project_json(list_projects)
  3258. print("project_json",project_json)
  3259. return project_json
    def dumplicate_comsumer_handle(self,item,result_queue,ots_client,get_all=False,upgrade=True):
        '''
        Consumer side of the dedup flow: gather duplicate candidates for one
        document via rule-based OTS queries, pick the best docid, merge the
        related projects, and (optionally) persist the result.

        :param item: document row (dict) to deduplicate; mutated (confidence set)
        :param result_queue: unused in this body; kept for the consumer signature
        :param ots_client: unused in this body; self.ots_client is used instead
        :param get_all: passed through to translate_dumplicate_rules
        :param upgrade: when True, write save/dup_docid back to OTS
        :return: None
        '''
        try:
            start_time = time.time()
            self.post_extract(item)
            base_list = []
            set_docid = set()
            # build candidate-search rules, most confident first
            list_rules,table_name,table_index = self.translate_dumplicate_rules(flow_dumplicate_status_from,item,get_all=get_all,to_log=True)
            list_rules.sort(key=lambda x:x["confidence"],reverse=True)
            _i = 0
            step = 5
            item["confidence"] = 999
            if item.get(document_tmp_docid) not in set_docid:
                base_list.append(item)
                set_docid.add(item.get(document_tmp_docid))
            # run the rules in batches of `step`, excluding docids already collected
            while _i<len(list_rules):
                must_not_q = []
                if len(base_list)>0:
                    # only the most recent 100 docids are excluded from the query
                    must_not_q = [TermQuery("docid",a) for a in list(set_docid)[-100:]]
                _query = BoolQuery(should_queries=[_rule["query"] for _rule in list_rules[_i:_i+step]],
                                   must_not_queries=must_not_q)
                # NOTE(review): confidence and the *_keys come from the first rule of
                # the batch even though the query unions all `step` rules — confirm intended.
                _rule = list_rules[_i]
                confidence = _rule["confidence"]
                singleNum_keys = _rule["singleNum_keys"]
                contain_keys = _rule["contain_keys"]
                multiNum_keys = _rule["multiNum_keys"]
                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json])
                _i += step
            _time = time.time()
            log("%d start final check with length:%d"%(item["docid"],len(base_list)))
            # confirm which candidates really are duplicates
            final_list = self.dumplicate_fianl_check(base_list)
            log("%d final_check takes:%.2f"%(item["docid"],time.time()-_time))
            best_docid = self.get_best_docid(final_list)
            final_list_docid = [a["docid"] for a in final_list]
            log("%d:final_list_docid:%s"%(item["docid"],str(final_list_docid)))
            _d = {"partitionkey":item["partitionkey"],
                  "docid":item["docid"],
                  "status":random.randint(*flow_dumplicate_status_to),
                  document_tmp_opertime:getCurrent_date(format="%Y-%m-%d %H:%M:%S")
                  }
            dtmp = Document_tmp(_d)
            # dup_docid = all confirmed duplicates, minus the current document itself
            dup_docid = set()
            for _dict in final_list:
                dup_docid.add(_dict.get(document_tmp_docid))
            if item.get(document_tmp_docid) in dup_docid:
                dup_docid.remove(item.get(document_tmp_docid))
            remove_list = []
            if len(final_list)==0 or best_docid==item.get(document_tmp_docid):
                # the current document wins: keep it (save=1); all others are dups
                dtmp.setValue(document_tmp_save,1,True)
                # dtmp.setValue(document_tmp_merge_uuid,self.merge_document(item,flow_dumplicate_status_to),True)
                dmp_docid = ",".join([str(a) for a in list(dup_docid)])
                for _dict in final_list:
                    if _dict.get(document_tmp_docid) in dup_docid:
                        remove_list.append(_dict)
            else:
                # another document wins: current one is dropped (save=0)
                dtmp.setValue(document_tmp_save,0,True)
                if best_docid in dup_docid:
                    dup_docid.remove(best_docid)
                    for _dict in final_list:
                        if _dict.get(document_tmp_docid) in dup_docid:
                            remove_list.append(_dict)
                    dmp_docid = ",".join([str(a) for a in list(dup_docid)])
                    # winner is listed first in the dup chain
                    dmp_docid = "%d,%s"%(best_docid,dmp_docid)
                else:
                    dmp_docid = ",".join([str(a) for a in list(dup_docid)])
                    for _dict in final_list:
                        if _dict.get(document_tmp_docid) in dup_docid:
                            remove_list.append(_dict)
            list_docids = list(dup_docid)
            list_docids.append(best_docid)
            # merge the duplicate group into projects and store the serialized result
            dtmp.setValue(document_tmp_projects,self.merge_document_real(item,list_docids,table_name,flow_dumplicate_status_to),True)
            log("upgrate %s save:%s:docid:%d,final_list:%d,rules:%d,best_docid:%s,dmp_docid:%s"%(str(upgrade),dtmp.getProperties().get(document_tmp_save),item.get(document_tmp_docid),len(final_list),len(list_rules),str(best_docid),dmp_docid))
            if upgrade:
                if table_name=="document_tmp":
                    self.changeSaveStatus(remove_list)
                # print(dtmp.getProperties())
                # NOTE(review): this recomputation discards the "%d,%s" best_docid prefix
                # built in the save=0 branch above — confirm the overwrite is intended.
                dmp_docid = ",".join([str(a) for a in list(dup_docid)])
                dtmp.setValue(document_tmp_dup_docid,dmp_docid,True)
                dtmp.update_row(self.ots_client)
            # log("dump takes %.2f"%(time.time()-start_time))
        except Exception as e:
            traceback.print_exc()
            log("error on dumplicate of %s"%(str(item.get(document_tmp_docid))))
  3342. def start_flow_dumplicate(self):
  3343. schedule = BlockingScheduler()
  3344. schedule.add_job(self.flow_dumplicate,"cron",second="*/10")
  3345. schedule.start()
  3346. def changeSaveStatus(self,list_dict):
  3347. for _dict in list_dict:
  3348. if _dict.get(document_tmp_save,1)==1:
  3349. _d = {"partitionkey":_dict["partitionkey"],
  3350. "docid":_dict["docid"],
  3351. document_tmp_save:0
  3352. }
  3353. _d_tmp = Document_tmp(_d)
  3354. _d_tmp.update_row(self.ots_client)
  3355. def test_dumplicate(self,docid):
  3356. columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json]
  3357. bool_query = BoolQuery(must_queries=[
  3358. TermQuery("docid",docid)
  3359. ])
  3360. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
  3361. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
  3362. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  3363. log("flow_dumplicate producer total_count:%d"%total_count)
  3364. if total_count==0:
  3365. rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
  3366. SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
  3367. ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
  3368. list_dict = getRow_ots(rows)
  3369. for item in list_dict:
  3370. self.dumplicate_comsumer_handle(item,None,self.ots_client,get_all=True,upgrade=False)
  3371. return
    def getRemainDoc(self,docid):
        '''
        Re-run the dedup pipeline for a single docid and return the docid that
        should be kept (the "best" member of the duplicate group).
        NOTE(review): the candidate-collection loop duplicates
        dumplicate_comsumer_handle almost verbatim — candidate for extraction.

        :param docid: document id to look up in the `document` table
        :return: best docid of the duplicate group, or None when docid is not found
        '''
        columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json]
        bool_query = BoolQuery(must_queries=[
            TermQuery("docid",docid)
        ])
        rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
                                                                            SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
                                                                            ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
        list_dict = getRow_ots(rows)
        if len(list_dict)>0:
            item = list_dict[0]
            start_time = time.time()
            self.post_extract(item)
            base_list = []
            set_docid = set()
            # build candidate-search rules, most confident first
            list_rules,table_name,table_index = self.translate_dumplicate_rules(flow_dumplicate_status_from,item,to_log=True)
            list_rules.sort(key=lambda x:x["confidence"],reverse=True)
            _i = 0
            step = 5
            item["confidence"] = 999
            if item.get(document_tmp_docid) not in set_docid:
                base_list.append(item)
                set_docid.add(item.get(document_tmp_docid))
            # run the rules in batches of `step`, excluding docids already collected
            while _i<len(list_rules):
                must_not_q = []
                if len(base_list)>0:
                    # only the most recent 100 docids are excluded from the query
                    must_not_q = [TermQuery("docid",a) for a in list(set_docid)[-100:]]
                _query = BoolQuery(should_queries=[_rule["query"] for _rule in list_rules[_i:_i+step]],
                                   must_not_queries=must_not_q)
                _rule = list_rules[_i]
                confidence = _rule["confidence"]
                singleNum_keys = _rule["singleNum_keys"]
                contain_keys = _rule["contain_keys"]
                multiNum_keys = _rule["multiNum_keys"]
                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json])
                _i += step
            _time = time.time()
            log("%d start final check with length:%d"%(item["docid"],len(base_list)))
            # confirm which candidates really are duplicates, then pick the winner
            final_list = self.dumplicate_fianl_check(base_list)
            log("%d final_check takes:%.2f"%(item["docid"],time.time()-_time))
            best_docid = self.get_best_docid(final_list)
            return best_docid
        return None
  3415. if __name__ == '__main__':
  3416. # df = Dataflow()
  3417. # df.flow_init()
  3418. # df.flow_test()
  3419. # df.test_merge()
  3420. # df.start_flow_attachment()
  3421. # df.start_flow_extract()
  3422. # df.start_flow_dumplicate()
  3423. # # df.start_flow_merge()
  3424. # df.start_flow_remove()
  3425. # download_attachment()
  3426. # test_attachment_interface()
  3427. df_dump = Dataflow_dumplicate()
  3428. # df_dump.start_flow_dumplicate()
  3429. a = time.time()
  3430. df_dump.test_dumplicate(272934158)
  3431. print("takes",time.time()-a)
  3432. # df_dump.delete_projects_by_document(16288036)
  3433. # log("=======")
  3434. # for i in range(3):
  3435. # time.sleep(20)
  3436. #
  3437. # a = {"docid":16288036}
  3438. # send_msg_toacmq(df_dump.pool_mq_ali,json.dumps(a),df_dump.doc_delete_queue)