@@ -10,6 +10,7 @@ from BaseDataMaintenance.model.ots.document_html import *
 from BaseDataMaintenance.model.ots.document_extract2 import *
 from BaseDataMaintenance.model.ots.project import *
 from BaseDataMaintenance.model.ots.document import *
+from BaseDataMaintenance.model.ots.project_process import *

 import base64
 from BaseDataMaintenance.dataSource.interface import getAttachDealInterface,sentMsgToDD
@@ -1245,8 +1246,19 @@ class Dataflow():
         return []

     def get_best_docid(self,base_list):
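+        # detect re-published duplicates: if any single web source contributed two or
+        # more distinct fingerprints, sort docids descending so that, among documents
+        # with equal extract_count, the newest docid is preferred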
+        to_reverse = False
+        dict_source_count = {}
+        for _item in base_list:
+            _web_source = _item.get(document_tmp_web_source_no)
+            _fingerprint = _item.get(document_tmp_fingerprint)
+            if _web_source is not None:
+                if _web_source not in dict_source_count:
+                    dict_source_count[_web_source] = set()
+                dict_source_count[_web_source].add(_fingerprint)
+                if len(dict_source_count[_web_source])>=2:
+                    to_reverse=True
         if len(base_list)>0:
-            base_list.sort(key=lambda x:x["docid"])
+            base_list.sort(key=lambda x:x["docid"],reverse=to_reverse)
             base_list.sort(key=lambda x:x["extract_count"],reverse=True)
             return base_list[0]["docid"]
@@ -2081,10 +2093,10 @@ class Dataflow_dumplicate(Dataflow):
                self.conn = conn
                self._func = _func

-        def on_error(self, headers):
+        def on_error(self, headers,*args,**kwargs):
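+            # *args/**kwargs absorb the extra arguments newer stomp.py releases pass
+            # to listener callbacks (assumed motivation for widening the signature)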
            log('received an error %s' % str(headers.body))

-        def on_message(self, headers):
+        def on_message(self, headers,*args,**kwargs):
            try:
                message_id = headers.headers["message-id"]
                body = headers.body
@@ -2103,6 +2115,7 @@ class Dataflow_dumplicate(Dataflow):
         self.c_f_get_package = f_get_package()
         logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

+        self.fix_doc_docid = None

         if start_delete_listener:
             self.delete_comsumer_counts = 2
@@ -2153,6 +2166,7 @@ class Dataflow_dumplicate(Dataflow):

         _index = 0
         base_fingerprint = "None"
+
         if len(base_list)>0:
             base_fingerprint = base_list[0]["fingerprint"]
         for _i in range(1,len(base_list)):
@@ -2176,6 +2190,17 @@ class Dataflow_dumplicate(Dataflow):
                 break

         if _index>=1:
+            # # deduplicate documents that were saved into the group more than once
+            # _l = the_group[:_index+1]
+            # set_fingerprint = set()
+            # final_l = []
+            # for _dict in _l:
+            #     fingerprint_less = _dict["fingerprint"]
+            #     if fingerprint_less in set_fingerprint:
+            #         continue
+            #     else:
+            #         final_l.append(_dict)
+            #         set_fingerprint.add(fingerprint_less)
             return the_group[:_index+1]
         return []
@@ -2701,6 +2726,7 @@ class Dataflow_dumplicate(Dataflow):
         :return:
         '''
         list_docs = []
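+        # track fingerprints so that identical documents are loaded only once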
+        set_fingerprint = set()
         for _docid in list_docids:
             docid = int(_docid)
             _dict = {document_partitionkey:getPartitionKey(docid),
@@ -2711,6 +2737,10 @@ class Dataflow_dumplicate(Dataflow):
             _doc = Document(_dict)
             _exists = _doc.fix_columns(self.ots_client,columns_to_get,True)
             if _exists:
+                _fingerprint = _doc.getProperties().get(document_fingerprint)
+                if _fingerprint in set_fingerprint:
+                    continue
+                set_fingerprint.add(_fingerprint)
                 list_docs.append(_doc)
         for _doc in list_docs:
             try:
@@ -2783,7 +2813,6 @@ class Dataflow_dumplicate(Dataflow):
         docs = [_doc.getProperties() for _doc in list_docs]

         project_dict = generate_common_properties(docs)
-        print("list_docs",project_dict)

         list_package_properties = generate_packages_properties(docs)
@@ -2821,44 +2850,42 @@ class Dataflow_dumplicate(Dataflow):
                 set_nlp_enterprise_attachment |= set(json.loads(_proj.get(project_nlp_enterprise_attachment,"[]")))
             except Exception as e:
                 pass
-        set_docid = set_docid | set(project_dict.get(project_docids,"").split(","))
-        set_code = set_code | set(project_dict.get(project_project_codes,"").split(","))
-        set_product = set_product | set(project_dict.get(project_product,"").split(","))
+            set_docid = set_docid | set(project_dict.get(project_docids,"").split(","))
+            set_code = set_code | set(project_dict.get(project_project_codes,"").split(","))
+            set_product = set_product | set(project_dict.get(project_product,"").split(","))

-        try:
-            set_nlp_enterprise |= set(json.loads(project_dict.get(project_nlp_enterprise,"[]")))
-            set_nlp_enterprise_attachment |= set(json.loads(project_dict.get(project_nlp_enterprise_attachment,"[]")))
-        except Exception as e:
-            pass
+            try:
+                set_nlp_enterprise |= set(json.loads(project_dict.get(project_nlp_enterprise,"[]")))
+                set_nlp_enterprise_attachment |= set(json.loads(project_dict.get(project_nlp_enterprise_attachment,"[]")))
+            except Exception as e:
+                pass


-        append_dict[project_docids] = ",".join([a for a in list(set_docid) if a!=""])
-        append_dict[project_docid_number] = len(set_docid)
-        append_dict[project_project_codes] = ",".join([a for a in list(set_code) if a!=""])
-        append_dict[project_product] = ",".join([a for a in list(set_product) if a!=""])
+            append_dict[project_docids] = ",".join([a for a in list(set_docid) if a!=""])
+            append_dict[project_docid_number] = len(set_docid)
+            append_dict[project_project_codes] = ",".join([a for a in list(set_code) if a!=""])
+            append_dict[project_product] = ",".join([a for a in list(set_product) if a!=""])

-        append_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
-        append_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)
+            append_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
+            append_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)

-        dict_dynamic = {}
-        set_docid = set()
-        for _proj in projects:
+
+            dict_dynamic = {}
+            set_docid = set()
             _dynamic = json.loads(_proj.get(project_project_dynamics,"[]"))
             for _dy in _dynamic:
                 _docid = _dy.get("docid")
                 dict_dynamic[_docid] = _dy
-        _dynamic = json.loads(project_dict.get(project_project_dynamics,"[]"))
-        for _dy in _dynamic:
-            _docid = _dy.get("docid")
-            dict_dynamic[_docid] = _dy
-        list_dynamics = []
-        for k,v in dict_dynamic.items():
-            list_dynamics.append(v)
-        list_dynamics.sort(key=lambda x:x.get(document_page_time,""))
-
-        append_dict[project_project_dynamics] = json.dumps(list_dynamics[:100],ensure_ascii=False)
+            _dynamic = json.loads(project_dict.get(project_project_dynamics,"[]"))
+            for _dy in _dynamic:
+                _docid = _dy.get("docid")
+                dict_dynamic[_docid] = _dy
+            list_dynamics = []
+            for k,v in dict_dynamic.items():
+                list_dynamics.append(v)
+            list_dynamics.sort(key=lambda x:x.get(document_page_time,""))

-        for _proj in projects:
+            append_dict[project_project_dynamics] = json.dumps(list_dynamics[:100],ensure_ascii=False)
             _proj.update(append_dict)

@@ -3011,8 +3038,17 @@ class Dataflow_dumplicate(Dataflow):
             if docid is None:
                 return
             delete_result = self.delete_projects_by_document(docid)
-            if send_msg_toacmq(self.pool_mq_ali,delete_result,self.doc_delete_result):
+
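+            # persist the delete result to the project_process table, keyed by a fresh uuid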
+            _uuid = uuid4().hex
+            _d = {PROJECT_PROCESS_UUID:_uuid,
+                  PROJECT_PROCESS_CRTIME:1,
+                  PROJECT_PROCESS_PROJECTS:delete_result}
+            _pp = Project_process(_d)
+            if _pp.update_row(self.ots_client):
                 ackMsg(conn,message_id)
+            # no longer push the result to the result queue; write it to the project_process table instead
+            # if send_msg_toacmq(self.pool_mq_ali,delete_result,self.doc_delete_result):
+            #     ackMsg(conn,message_id)

    def generate_common_properties(self,list_docs):
        '''
@@ -3283,7 +3319,7 @@ class Dataflow_dumplicate(Dataflow):



-    def getMerge_rules(self,page_time,project_codes,project_name,tenderee,agency,product,sub_project_name,bidding_budget,win_tenderer,win_bid_price):
+    def getMerge_rules(self,page_time,project_codes,project_name,tenderee,agency,product,sub_project_name,bidding_budget,win_tenderer,win_bid_price,province,city,district):

        whole_time_start = time.time()
        _time = time.time()
@@ -3291,10 +3327,25 @@ class Dataflow_dumplicate(Dataflow):

        list_code = [a for a in project_codes.split(",") if a!='']
        should_q_code = BoolQuery(should_queries=[MatchQuery(project_project_codes,a) for a in list_code[:20]])
+
+        # print("should_q_code",[a for a in list_code[:20]])
        should_q_cod = BoolQuery(should_queries=[MatchQuery(project_project_code,a) for a in list_code[:20]])
        list_product = [a for a in product.split(",") if a!='']
        should_q_product = BoolQuery(should_queries=[MatchQuery(project_product,a) for a in list_product[:20]])

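+        # optional region filter; blank or placeholder regions ("", 全国/nationwide, 未知/unknown) are skipped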
+        should_q_area = None
+        if province!="" or city!="" or district!="":
+            should_q = []
+            if province not in ("","全国","未知") and province is not None:
+                should_q.append(TermQuery(project_province,province))
+            if city not in ("","全国","未知") and city is not None:
+                should_q.append(TermQuery(project_city,city))
+            if district not in ("","全国","未知") and district is not None:
+                should_q.append(TermQuery(project_district,district))
+            if len(should_q)>0:
+                should_q_area = BoolQuery(should_queries=should_q)
+
+
        prepare_time = time.time()-_time

        _time = time.time()
@@ -3330,16 +3381,37 @@ class Dataflow_dumplicate(Dataflow):
                      TermQuery(project_project_name,project_name)]
            list_query.append([_query,2])

+        if tenderee!="" and agency!="":
+            _query = [TermQuery(project_tenderee,tenderee),
+                      TermQuery(project_agency,agency)]
+            list_query.append([_query,1])
+
        if tenderee!="" and bidding_budget>0:
            _query = [TermQuery(project_tenderee,tenderee),
                      TermQuery(project_bidding_budget,bidding_budget)]
            list_query.append([_query,2])

+        if bidding_budget>0 and win_bid_price>0:
+            _query = [TermQuery(project_bidding_budget,bidding_budget),
+                      TermQuery(project_win_bid_price,win_bid_price)]
+            list_query.append([_query,2])
+
+
        if tenderee!="" and win_tenderer!="":
            _query = [TermQuery(project_tenderee,tenderee),
                      TermQuery(project_win_tenderer,win_tenderer)]
            list_query.append([_query,2])

+        if agency!="" and win_tenderer!="":
+            _query = [TermQuery(project_agency,agency),
+                      TermQuery(project_win_tenderer,win_tenderer)]
+            list_query.append([_query,2])
+
+        if agency!="" and len(list_product)>0:
+            _query = [TermQuery(project_agency,agency),
+                      should_q_product]
+            list_query.append([_query,2])
+
        if win_tenderer!="" and len(list_code)>0:
            _query = [TermQuery(project_win_tenderer,win_tenderer),
                      should_q_code]
@@ -3354,6 +3426,16 @@ class Dataflow_dumplicate(Dataflow):
                      TermQuery(project_win_bid_price,win_bid_price)]
            list_query.append([_query,2])

+        if win_tenderer!="" and bidding_budget>0:
+            _query = [TermQuery(project_win_tenderer,win_tenderer),
+                      TermQuery(project_bidding_budget,bidding_budget)]
+            list_query.append([_query,2])
+
+        if len(list_code)>0 and len(list_product)>0:
+            _query = [should_q_code,
+                      should_q_product]
+            list_query.append([_query,2])
+
        if len(list_code)>0:
            _query = [
                should_q_code]
@@ -3363,10 +3445,15 @@ class Dataflow_dumplicate(Dataflow):
                should_q_cod]
            list_query.append([_query,1])

-        if project_name!="":
+        if project_name!="" and project_name is not None:
            _query = [
                TermQuery(project_project_name,project_name)]
            list_query.append([_query,1])
+        if len(list_product)>0 and should_q_area is not None:
+            _query = [should_q_area,
+                      should_q_product]
+            list_query.append([_query,1])
+
        generate_time = time.time()-_time
        whole_time = time.time()-whole_time_start
        log("projects merge rules whole_time:%.3f prepare_time:%.3f log_time:%.3f generate_time:%.3f"%(whole_time,prepare_time,log_time,generate_time))
@@ -3391,6 +3478,7 @@ class Dataflow_dumplicate(Dataflow):
        for _uuid in list(set_uuid):
            must_not_q.append(TermQuery("uuid",_uuid))

+
        projects_merge_count = 0
        projects_check_rule_time = 0
        projects_update_time = 0
@@ -3409,21 +3497,28 @@ class Dataflow_dumplicate(Dataflow):
            win_tenderer = _proj.get(project_win_tenderer,"")
            win_bid_price = _proj.get(project_win_bid_price,-1)

+            province = _proj.get(project_province,"")
+            city = _proj.get(project_city,"")
+            district = _proj.get(project_district,"")
+
            page_time_less = timeAdd(page_time,-150)
            page_time_greater = timeAdd(page_time,120)
            sub_project_q = TermQuery(project_sub_project_name,sub_project_name) if sub_project_name.replace("Project","")!="" else None
            _time = time.time()
-            list_must_query = self.getMerge_rules(page_time,project_codes,project_name,tenderee,agency,product,sub_project_name,bidding_budget,win_tenderer,win_bid_price)
+            list_must_query = self.getMerge_rules(page_time,project_codes,project_name,tenderee,agency,product,sub_project_name,bidding_budget,win_tenderer,win_bid_price,province,city,district)


            list_merge_data = []

-            _step = 5
+            _step = 3
            _begin = 0
            must_queries = [RangeQuery(project_page_time,page_time_less,page_time_greater,True,True),
                            ]
-            if sub_project_q is not None:
-                must_queries.append(sub_project_q)
+
+            # sub_project_name is not a required condition
+            # if sub_project_q is not None:
+            #     must_queries.append(sub_project_q)
+
            projects_prepare_time += time.time()-_time
            _time = time.time()
            while _begin<len(list_must_query):
@@ -3440,7 +3535,7 @@ class Dataflow_dumplicate(Dataflow):
                                   must_not_queries=must_not_q[:100])
                rows,next_token,total_count,is_all_succeed = self.ots_client_merge.search("project2","project2_index_formerge",
                                                                                        SearchQuery(_query,limit=_limit),
-                                                                                        columns_to_get=ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+                                                                                        columns_to_get=ColumnsToGet(column_names=[project_uuid,project_docids,project_zhao_biao_page_time,project_zhong_biao_page_time,project_page_time,project_area,project_province,project_city,project_district,project_info_type,project_industry,project_qcodes,project_project_name,project_project_code,project_project_codes,project_project_addr,project_tenderee,project_tenderee_addr,project_tenderee_phone,project_tenderee_contact,project_agency,project_agency_phone,project_agency_contact,project_sub_project_name,project_sub_project_code,project_bidding_budget,project_win_tenderer,project_win_bid_price,project_win_tenderer_manager,project_win_tenderer_phone,project_second_tenderer,project_second_bid_price,project_second_tenderer_manager,project_second_tenderer_phone,project_third_tenderer,project_third_bid_price,project_third_tenderer_manager,project_third_tenderer_phone,project_procurement_system,project_bidway,project_dup_data,project_docid_number,project_project_dynamics,project_product,project_moneysource,project_service_time,project_time_bidclose,project_time_bidopen,project_time_bidstart,project_time_commencement,project_time_completion,project_time_earnest_money_start,project_time_earnest_money_end,project_time_get_file_end,project_time_get_file_start,project_time_publicity_end,project_time_publicity_start,project_time_registration_end,project_time_registration_start,project_time_release,project_dup_docid,project_info_source,project_nlp_enterprise,project_nlp_enterprise_attachment],return_type=ColumnReturnType.SPECIFIED))
                list_data = getRow_ots(rows)

                list_merge_data.extend(list_data)
@@ -3453,10 +3548,15 @@ class Dataflow_dumplicate(Dataflow):
            projects_query_time += time.time()-_time
            # prefer candidates with a similar bidding budget
            projects_merge_count = len(list_merge_data)
+            list_merge_data.sort(key=lambda x:x.get(project_page_time,""))
            list_merge_data.sort(key=lambda x:x.get(project_bidding_budget,-1))
+            # log(page_time_less+"=="+page_time_greater)
+            # log("list_merge_data:%s"%(str(list_merge_data)))
            for _data in list_merge_data:
                _time = time.time()
                _check = check_merge_rule(_proj,_data,b_log=b_log)
+                if b_log:
+                    log(str(_check))
                projects_check_rule_time += time.time()-_time
                if _check:
                    _time = time.time()
@@ -3474,6 +3574,7 @@ class Dataflow_dumplicate(Dataflow):



+
    def merge_document_real(self,item,dup_docid,table_name,status_to=None,b_log=False):
        '''
        Real-time project merge
@@ -3551,7 +3652,7 @@ class Dataflow_dumplicate(Dataflow):
                singleNum_keys = _rule["singleNum_keys"]
                contain_keys = _rule["contain_keys"]
                multiNum_keys = _rule["multiNum_keys"]
-                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json])
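+                # also fetch web_source_no and fingerprint so get_best_docid can spot
+                # documents re-published by the same source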
+                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint])
            _i += step

@@ -3616,6 +3717,7 @@ class Dataflow_dumplicate(Dataflow):
            # print(dtmp.getProperties())
            dmp_docid = ",".join([str(a) for a in list(dup_docid)])
            dtmp.setValue(document_tmp_dup_docid,dmp_docid,True)
+            dtmp.setValue(document_tmp_best_docid,best_docid,True)
            dtmp.update_row(self.ots_client)

            # log("dump takes %.2f"%(time.time()-start_time))
@@ -3623,9 +3725,71 @@ class Dataflow_dumplicate(Dataflow):
            traceback.print_exc()
            log("error on dumplicate of %s"%(str(item.get(document_tmp_docid))))

+
+    def fix_doc_which_not_in_project(self):
+        '''
+        Pull published announcements that do not exist in project2 and put them back
+        into document_tmp so they are deduplicated and merged again
+        :return:
+        '''
+        def fix_doc_handle(item,result_queue):
+            _docid = item.get(document_tmp_docid)
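+            # a document counts as orphaned when no project2 row lists its docid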
+            b_q = BoolQuery(must_queries=[TermQuery(project_docids,str(_docid))])
+
+            rows,next_token,total_count,is_all_succeed = self.ots_client.search("project2","project2_index",
+                                                                                SearchQuery(b_q,get_total_count=True),
+                                                                                ColumnsToGet(return_type=ColumnReturnType.NONE))
+            if total_count==0:
+                log("fix_doc:%s not in project2"%(str(_docid)))
+                d_tmp = Document_tmp(item)
+                d_tmp.setValue(document_tmp_status,flow_dumplicate_status_from[0],True)
+                d_tmp.update_row(self.ots_client)
+
+
+
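+        # the first run scans saved documents whose opertime is at least 5 minutes old;
+        # later runs resume from the last docid handled (self.fix_doc_docid)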
+        if self.fix_doc_docid is None:
+            current_date = getCurrent_date(format="%Y-%m-%d %H:%M:%S")
+            before_date = timeAdd(current_date,0,format="%Y-%m-%d %H:%M:%S",minutes=-5)
+            bool_query = BoolQuery(must_queries=[
+                TermQuery(document_tmp_save,1),
+                RangeQuery(document_tmp_status,flow_dumplicate_status_to[0]),
+                RangeQuery(document_tmp_opertime,before_date)
+            ])
+        else:
+            bool_query = BoolQuery(must_queries=[
+                TermQuery(document_tmp_save,1),
+                RangeQuery(document_tmp_status,flow_dumplicate_status_to[0]),
+                RangeQuery(document_tmp_docid,self.fix_doc_docid)
+            ])
+
+        list_data = []
+        rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
+                                                                            SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),get_total_count=True,limit=100),
+                                                                            ColumnsToGet(return_type=ColumnReturnType.NONE))
+        list_d = getRow_ots(rows)
+        list_data.extend(list_d)
+        while next_token:
+            rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
+                                                                                SearchQuery(bool_query,next_token=next_token,get_total_count=True,limit=100),
+                                                                                ColumnsToGet(return_type=ColumnReturnType.NONE))
+            list_d = getRow_ots(rows)
+            list_data.extend(list_d)
+        print("%d/%d"%(len(list_data),total_count))
+        if len(list_data)>0:
+            self.fix_doc_docid = list_data[-1].get(document_tmp_docid)
+            log("current fix_doc_docid:%s"%(str(self.fix_doc_docid)))
+        task_queue = Queue()
+        for _data in list_data:
+            task_queue.put(_data)
+
+        mt = MultiThreadHandler(task_queue,fix_doc_handle,None,30)
+        mt.run()
+
+
+
    def start_flow_dumplicate(self):
        schedule = BlockingScheduler()
        schedule.add_job(self.flow_dumplicate,"cron",second="*/5")
+        schedule.add_job(self.fix_doc_which_not_in_project,"cron",minute="55")
        schedule.start()

    def changeSaveStatus(self,list_dict):
@@ -3641,7 +3805,7 @@ class Dataflow_dumplicate(Dataflow):


    def test_dumplicate(self,docid):
-        columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json]
+        columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint]
        bool_query = BoolQuery(must_queries=[
            TermQuery("docid",docid)
        ])
@@ -3728,12 +3892,13 @@ if __name__ == '__main__':
    df_dump = Dataflow_dumplicate(start_delete_listener=False)
    # df_dump.start_flow_dumplicate()
    a = time.time()
-    df_dump.test_dumplicate(275459183)
+    df_dump.test_dumplicate(183573001)
    print("takes",time.time()-a)
+    # df_dump.fix_doc_which_not_in_project()
    # df_dump.delete_projects_by_document(16288036)
    # log("=======")
    # for i in range(3):
    #     time.sleep(20)
    #
-    # a = {"docid":16288036}
-    # send_msg_toacmq(df_dump.pool_mq_ali,json.dumps(a),df_dump.doc_delete_queue)
+    # a = {"docid":74295123}
+    # send_msg_toacmq(df_dump.pool_mq_ali,json.dumps(a),df_dump.doc_delete_queue)