# luojiehua / BaseDataMaintenance
							from BaseDataMaintenance.model.ots.BaseModel import BaseModel
from tablestore import *
from BaseDataMaintenance.common.Utils import *
from bs4 import BeautifulSoup

# String names used as keys when reading or writing Document_tmp properties.
# The first two are the primary-key names returned by
# Document_tmp.getPrimary_keys(); the rest are presumably column names of the
# document_tmp table (confirm against the table schema).
document_tmp_partitionkey = "partitionkey"
document_tmp_docid = "docid"
document_tmp_dochtmlcon = "dochtmlcon"
document_tmp_doctextcon = "doctextcon"
document_tmp_doctitle = "doctitle"
document_tmp_attachmenttextcon = "attachmenttextcon"
document_tmp_attachment_path = "page_attachments"
# Presumably keys of each entry inside the JSON stored under
# "page_attachments" (note the camelCase spelling) -- confirm against callers.
document_tmp_attachment_path_filemd5 = "fileMd5"
document_tmp_attachment_path_fileTitle = "fileTitle"
document_tmp_attachment_path_fileLink = "fileLink"
document_tmp_uuid = "uuid"
document_tmp_crtime = "crtime"
document_tmp_status = "status"
document_tmp_tenderee = "tenderee"
document_tmp_agency = "agency"
document_tmp_project_code = "project_code"
document_tmp_product = "product"
document_tmp_project_name = "project_name"
document_tmp_doctitle_refine = "doctitle_refine"
document_tmp_extract_count = "extract_count"
document_tmp_sub_docs_json = "sub_docs_json"
document_tmp_save = "save"
document_tmp_dup_docid = "dup_docid"
document_tmp_best_docid = "best_docid"
document_tmp_merge_uuid = "merge_uuid"
document_tmp_projects = "projects"
document_tmp_page_time = "page_time"
document_tmp_attachment_extract_status = "attachment_extract_status"
document_tmp_web_source_no = "web_source_no"
document_tmp_fingerprint = "fingerprint"
document_tmp_opertime = "opertime"
document_tmp_docchannel = "docchannel"
document_tmp_original_docchannel = "original_docchannel"

# JSON-valued fields (judging by the "_json" suffix of the stored names).
document_tmp_extract_json = "extract_json"
document_tmp_industry_json = "industry_json"
document_tmp_other_json = "other_json"

# Time-related field names (all stored names start with "time_").
document_tmp_time_bidclose = "time_bidclose"
document_tmp_time_bidopen = "time_bidopen"
document_tmp_time_completion = "time_completion"
document_tmp_time_earnest_money_end = "time_earnest_money_end"
document_tmp_time_earnest_money_start = "time_earnest_money_start"
document_tmp_time_get_file_end = "time_get_file_end"
document_tmp_time_get_file_start = "time_get_file_start"
document_tmp_time_publicity_end = "time_publicity_end"
document_tmp_time_publicity_start = "time_publicity_start"
document_tmp_time_registration_end = "time_registration_end"
document_tmp_time_registration_start = "time_registration_start"
document_tmp_time_release = "time_release"


class Document_tmp(BaseModel):
    """Model for one row of the ``document_tmp`` table.

    The instance is filled from a plain dict of values and offers helpers
    that edit the HTML stored under ``dochtmlcon`` or extract attachment
    metadata from an HTML string.
    """

    def __init__(self,_dict):
        """Copy every ``key -> value`` pair of ``_dict`` into the model."""
        BaseModel.__init__(self)
        for k,v in _dict.items():
            self.setValue(k,v,True)
        self.table_name = "document_tmp"
        # Host fragments that isLegalUrl() looks for inside a URL.
        self.prefixs = ["www.bidizhaobiao.com","bxkc.oss-cn-shanghai.aliyuncs.com"]

    def getPrimary_keys(self):
        """Return the primary-key names, in order: partition key, then docid."""
        return [document_tmp_partitionkey,document_tmp_docid]


    def isLegalUrl(self,_url,_type):
        """Check ``_url`` against the known host fragments in ``self.prefixs``.

        :param _url: URL string to test.
        :param _type: ``0`` means "legal when the URL contains one of the
            fragments"; any other value means "legal when it contains none".
        :return: ``True`` when the URL is legal for the given ``_type``.
        """
        _flag = any(_prefix in _url for _prefix in self.prefixs)
        return _flag if _type==0 else not _flag


    def updateSWFImages(self,swf_urls):
        """Append ``<img>`` tags for ``swf_urls`` to the stored ``dochtmlcon``.

        Nothing happens when ``swf_urls`` is empty, or when the current HTML
        already contains an ``<img>`` whose ``src`` equals the first URL
        (taken as a sign that the images were appended before).

        :param swf_urls: sequence of image URLs.
        """
        if not swf_urls:
            return
        # A missing or None column is treated as empty HTML; without the
        # fallback both BeautifulSoup() and the concatenation below fail.
        _dochtmlcon = self.getProperties().get(document_tmp_dochtmlcon) or ""
        _soup = BeautifulSoup(_dochtmlcon,"lxml")
        if _soup.find("img",{"src":swf_urls[0]}) is not None:
            return
        _imgs = "".join('<p><img src="%s"/></p>'%(_url) for _url in swf_urls)
        self.setValue(document_tmp_dochtmlcon,_dochtmlcon+"<div>"+_imgs+"</div>",True)

    def getRichTextFetch(self,list_html):
        """Wrap every entry of ``list_html`` in its own ``<div>`` and concatenate.

        :param list_html: iterable whose entries are either an HTML string,
            or a dict with optional ``"filemd5"`` and ``"html"`` keys (the md5
            is emitted as a ``filemd5`` attribute of the div). Entries of any
            other type are skipped.
        :return: the concatenated HTML string (``""`` for no usable entries).
        """
        _parts = []
        for _ht in list_html:
            if isinstance(_ht,str):
                _parts.append("<div>%s</div>"%(_ht))
            elif isinstance(_ht,dict):
                _parts.append('<div filemd5="%s">%s</div>'%(_ht.get("filemd5",""),_ht.get("html","")))
        return "".join(_parts)

    def updateAttachment(self,list_html):
        """Replace the hidden ``richTextFetch`` div in ``dochtmlcon``.

        The literal ``<html>``/``<body>`` tags are stripped from the stored
        HTML, the first existing ``div.richTextFetch`` (if any) is removed,
        and a new hidden div built by :meth:`getRichTextFetch` is appended.
        Nothing happens when ``list_html`` is empty.

        :param list_html: attachment HTML entries, see :meth:`getRichTextFetch`.
        """
        if not list_html:
            return
        # A missing or None column is treated as empty HTML.
        _dochtmlcon = self.getProperties().get(document_tmp_dochtmlcon) or ""
        _dochtmlcon = re.sub("<html>|</html>|<body>|</body>","",_dochtmlcon)

        # NOTE(review): no size cap is applied to the resulting HTML here --
        # confirm that column size limits are enforced elsewhere.
        _text = '\n<div style="display:none;" class="richTextFetch">%s</div>'%(self.getRichTextFetch(list_html))

        _soup = BeautifulSoup(_dochtmlcon,"lxml")
        _node = _soup.find("div",attrs={"class":"richTextFetch"})
        if _node is not None:
            # Drop the previously appended attachment text before re-adding.
            _node.decompose()
        self.setValue(document_tmp_dochtmlcon,str(_soup)+_text,True)


    def getTitleFromHtml(self,filemd5,_html):
        """Return the text of the first ``<a>`` whose ``data`` attribute equals ``filemd5``.

        :param filemd5: value to match against the ``data`` attribute.
        :param _html: HTML string to search.
        :return: the link text, or ``""`` when no such link exists.
        """
        _soup = BeautifulSoup(_html,"lxml")
        _find = _soup.find("a",attrs={"data":filemd5})
        if _find is None:
            return ""
        return _find.get_text()

    def getSourceLinkFromHtml(self,filemd5,_html):
        """Return the source link of the element whose ``filelink`` attribute equals ``filemd5``.

        An ``<a>`` is looked up first and its ``href`` returned; otherwise an
        ``<img>`` is looked up and its ``src`` returned.

        :param filemd5: value to match against the ``filelink`` attribute.
        :param _html: HTML string to search.
        :return: the link, or ``""`` when no element matches or the matching
            element lacks the attribute.
        """
        _soup = BeautifulSoup(_html,"lxml")
        _find = _soup.find("a",attrs={"filelink":filemd5})
        if _find is not None:
            return _find.attrs.get("href","")
        _find = _soup.find("img",attrs={"filelink":filemd5})
        if _find is not None:
            return _find.attrs.get("src","")
        return ""

import random
def turn_extract_status():
    """Rewrite the status of fingerprint duplicates among rows with status 61..70.

    Steps:

    1. Page through search index ``document_tmp_index`` (table name
       ``"document"``) for rows matching
       ``RangeQuery(status, 61, 70, True, True)``, fetching only the
       ``fingerprint`` column, 100 rows per page.
    2. Group the rows by fingerprint (rows without one are ignored).
    3. In each group, sort by ``docid`` and keep the first row untouched;
       every other row gets a random status in 401..450 and is queued.
    4. Run ``_handle`` over the queue through ``MultiThreadHandler``;
       ``_handle`` sets a random status in 1..50 and calls ``update_row``.

    NOTE(review): step 4 overwrites the 401..450 value assigned in step 3, so
    the value actually written is in 1..50. Confirm which one is intended.
    """
    from BaseDataMaintenance.dataSource.source import getConnect_ots
    from BaseDataMaintenance.common.multiThread import MultiThreadHandler
    import queue
    from threading import Thread
    task_queue = queue.Queue()
    ots_client = getConnect_ots()

    def producer(task_queue,ots_client):
        """Enqueue a Document_tmp for every row returned by the status search."""

        def _enqueue(rows):
            # Convert one result page to models, queue them, return the count.
            list_data = getRow_ots(rows)
            for _data in list_data:
                task_queue.put(Document_tmp(_data))
            return len(list_data)

        bool_query = BoolQuery(must_queries=[
            RangeQuery(document_tmp_status,61,70,True,True),
        ])
        columns = ColumnsToGet([document_tmp_fingerprint],return_type=ColumnReturnType.SPECIFIED)

        # NOTE(review): the table name here is "document" while the models are
        # Document_tmp and the index is "document_tmp_index" -- confirm.
        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_tmp_index",
                                                                       SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
                                                                       columns_to_get=columns)
        print(total_count)
        _count = _enqueue(rows)
        # Follow-up pages are requested with the token of the previous page.
        while next_token:
            rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_tmp_index",
                                                                           SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
                                                                           columns_to_get=columns)
            _count += _enqueue(rows)
            print("%d/%d"%(_count,total_count))

    def _handle(item,result_queue,ots_client):
        """Assign a random status in 1..50 to ``item`` and persist it."""
        item.setValue(document_tmp_status,random.randint(1,50),True)
        item.update_row(ots_client)

    t_producer = Thread(target=producer,kwargs={"task_queue":task_queue,"ots_client":ots_client})
    t_producer.start()
    t_producer.join()

    # The producer has finished, so the queue is complete: drain it without
    # blocking and treat only queue.Empty as the end marker, so that real
    # errors are not mistaken for "queue exhausted".
    dict_fingerprint = {}
    while True:
        try:
            item = task_queue.get_nowait()
        except queue.Empty:
            break
        fingerprint = item.getProperties().get(document_tmp_fingerprint)
        if fingerprint is not None:
            dict_fingerprint.setdefault(fingerprint,[]).append(item)
    print(len(dict_fingerprint))

    status_queue = queue.Queue()
    for k,v in dict_fingerprint.items():
        print("key",k,len(v))
        v.sort(key=lambda x:x.docid)
        # Everything after the smallest docid in the group is queued.
        for _d in v[1:]:
            _d.setValue(document_tmp_status,random.randint(401,450),True)
            status_queue.put(_d)

    mt = MultiThreadHandler(status_queue,_handle,None,30,ots_client=ots_client)
    mt.run()


def turn_document_tmp_status():
    """Set ``status`` to 0 on every ``document_tmp`` row matched by the search.

    A producer thread pages through index ``document_tmp_index`` with
    ``RangeQuery("status",1,51)`` and queues one ``Document_tmp`` per hit;
    afterwards ``MultiThreadHandler`` runs ``_handle`` over the queue, which
    writes status 0 back with ``update_row``.
    """
    import queue
    from threading import Thread
    import json
    from BaseDataMaintenance.dataSource.source import getConnect_ots
    from BaseDataMaintenance.common.multiThread import MultiThreadHandler
    from BaseDataMaintenance.model.ots.attachment import attachment_filemd5,attachment_file_title,attachment_file_link

    task_queue = queue.Queue()
    ots_client = getConnect_ots()

    def producer1(task_queue,ots_client):
        """Alternative producer: queue rows for docids pasted one per line below."""
        raw_docids = ''
        for line in raw_docids.split("\n"):
            line = line.strip()
            if line == "":
                continue
            task_queue.put(Document_tmp({document_tmp_partitionkey:int(line)%500+1,
                                         document_tmp_docid:int(line),
                                         document_tmp_status:66}))

    def producer(task_queue,ots_client):
        """Queue a Document_tmp for every row returned by the status search."""
        # Further filters (term / phrase / range / exists queries) can be
        # added to must_queries or must_not_queries as needed.
        status_filter = BoolQuery(
            must_queries=[RangeQuery("status",1,51)],
            must_not_queries=[],
        )

        # First page: sorted by docid descending, primary keys only.
        first_page = SearchQuery(status_filter,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True)
        rows,next_token,total_count,is_all_succeed = ots_client.search(
            "document_tmp","document_tmp_index",first_page,
            columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
        page = getRow_ots(rows)
        print(total_count)
        fetched = len(page)
        for record in page:
            task_queue.put(Document_tmp(record))
        print(page)

        # Remaining pages are requested with the token of the previous page.
        while next_token:
            next_page = SearchQuery(status_filter,next_token=next_token,limit=100,get_total_count=True)
            rows,next_token,total_count,is_all_succeed = ots_client.search(
                "document_tmp","document_tmp_index",next_page,
                columns_to_get=ColumnsToGet(return_type=ColumnReturnType.NONE))
            page = getRow_ots(rows)
            fetched += len(page)
            print("%d/%d"%(fetched,total_count))
            for record in page:
                task_queue.put(Document_tmp(record))

        log("task_queue size:%d"%(task_queue.qsize()))

    def _handle(item,result_queue,ots_client):
        """Write status 0 for ``item``; ``result_queue`` is not used."""
        item.setValue(document_tmp_status,0,True)
        item.update_row(ots_client)

    # The producer runs to completion before any worker starts.
    producer_thread = Thread(target=producer,kwargs={"task_queue":task_queue,"ots_client":ots_client})
    producer_thread.start()
    producer_thread.join()

    MultiThreadHandler(task_queue,_handle,None,30,ots_client=ots_client).run()


# Script entry point: only the document_tmp status reset runs; the
# fingerprint de-duplication job is left commented out.
if __name__=="__main__":
    # turn_extract_status()
    turn_document_tmp_status()