|
@@ -0,0 +1,137 @@
|
|
|
+from BaseDataMaintenance.model.ots.BaseModel import BaseModel
|
|
|
+from tablestore import *
|
|
|
+from BaseDataMaintenance.common.Utils import *
|
|
|
+from bs4 import BeautifulSoup
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
class Credit_item_info(BaseModel):
    """Row model bound to the ``credit_item_info`` table.

    The instance is populated from a plain mapping of column name to value;
    persistence helpers are inherited from ``BaseModel``.
    """

    def __init__(self, _dict):
        """Load every key/value pair of ``_dict`` into the model.

        :param _dict: mapping of column name -> value for one row.
        """
        BaseModel.__init__(self)
        # Values go through the inherited setter rather than plain attributes.
        # NOTE(review): the meaning of the third argument (True) is defined by
        # BaseModel.setValue and is not visible here - confirm before changing.
        for column_name, column_value in _dict.items():
            self.setValue(column_name, column_value, True)
        # Assigned after the loop so these always win over same-named keys
        # that might be present in ``_dict``.
        self.table_name = "credit_item_info"
        self.prefixs = [
            "www.bidizhaobiao.com",
            "bxkc.oss-cn-shanghai.aliyuncs.com",
        ]

    def getPrimary_keys(self):
        """Return the list of primary-key column names of the table."""
        primary_keys = ["record_id"]
        return primary_keys

    def getAll_columns(self):
        """Return the list of column names this model declares."""
        all_columns = ["record_id", "qylb"]
        return all_columns
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def drop_credit_item():
    """Strip the "建筑行业-施工企业-" marker from ``qylb`` on matching rows.

    Despite its name, this function does not delete anything: it searches
    ``credit_item_info`` (via the ``credit_item_info_index`` search index) for
    rows whose ``web_source`` is "青岛市建设市场监管与信用信息综合平台" and whose
    ``qylb`` matches the wildcard "建筑行业-施工企业*", then rewrites ``qylb``
    with every occurrence of "建筑行业-施工企业-" removed and calls
    ``update_row`` for each row.

    The work is done in two phases:

    1. ``producer`` pages through the search results and puts every row
       (as returned by ``getRow_ots``) on an in-memory queue.
    2. ``MultiThreadHandler`` drains the queue, calling ``_handle`` per row.

    The producer is called directly (not in a thread that is immediately
    joined), so an exception raised while searching propagates to the caller
    before any row is updated.
    """
    import queue

    from BaseDataMaintenance.common.multiThread import MultiThreadHandler
    from BaseDataMaintenance.dataSource.source import getConnect_ots_capacity

    table_name = "credit_item_info"
    index_name = "credit_item_info_index"
    page_size = 100
    qylb_marker = "建筑行业-施工企业-"

    task_queue = queue.Queue()
    ots_capacity = getConnect_ots_capacity()

    def producer(task_queue, ots_client):
        """Fill ``task_queue`` with every row matched by the search query."""
        bool_query = BoolQuery(must_queries=[
            TermQuery("web_source", "青岛市建设市场监管与信用信息综合平台"),
            BoolQuery(must_queries=[
                WildcardQuery("qylb", "建筑行业-施工企业*"),
            ]),
        ])
        columns_to_get = ColumnsToGet(["qylb"], return_type=ColumnReturnType.SPECIFIED)

        def fetch_page(search_query):
            """Run one search request; return (row dicts, next_token, total_count)."""
            rows, next_token, total_count, is_all_succeed = ots_client.search(
                table_name, index_name, search_query,
                columns_to_get=columns_to_get)
            return getRow_ots(rows), next_token, total_count

        # First page: the sort order is only given here; follow-up pages are
        # requested with the returned next_token and no explicit sort.
        first_query = SearchQuery(
            bool_query,
            sort=Sort(sorters=[FieldSort("create_time", SortOrder.DESC)]),
            limit=page_size,
            get_total_count=True)
        list_data, next_token, total_count = fetch_page(first_query)
        print(total_count)
        _count = len(list_data)
        for _data in list_data:
            task_queue.put(_data)

        while next_token:
            next_query = SearchQuery(
                bool_query,
                next_token=next_token,
                limit=page_size,
                get_total_count=True)
            list_data, next_token, total_count = fetch_page(next_query)
            _count += len(list_data)
            print("%d/%d" % (_count, total_count))
            for _data in list_data:
                task_queue.put(_data)

        log("task_queue size:%d" % (task_queue.qsize()))

    def _handle(item, result_queue, ots_client):
        """Rewrite ``qylb`` for one queued row and persist it.

        :param item: row dict taken from the task queue.
        :param result_queue: unused; part of the handler signature expected
            by ``MultiThreadHandler``.
        :param ots_client: client used for the update (the handler is given
            the same client that was used for the search).
        """
        _credit = Credit_item_info(item)
        current_qylb = _credit.getProperties().get("qylb", "")
        # The marker contains no regex metacharacters, so a plain replace
        # removes exactly the same occurrences a regex substitution would.
        _credit.setValue("qylb", current_qylb.replace(qylb_marker, ""), True)
        print(_credit.getAttribute_turple())
        _credit.update_row(ots_client)

    # Phase 1: collect all matching rows before any update starts.
    producer(task_queue=task_queue, ots_client=ots_capacity)

    # Phase 2: fan the updates out.
    # NOTE(review): 30 is presumably the worker-thread count - confirm against
    # MultiThreadHandler's signature.
    mt = MultiThreadHandler(task_queue, _handle, None, 30, ots_client=ots_capacity)
    mt.run()
|
|
|
+
|
|
|
+
|
|
|
# Script entry point: run the one-off qylb clean-up only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    drop_credit_item()
|