@@ -260,7 +260,7 @@ class Dataflow():
log("process filemd5:%s of type:%s with size:%.3fM download:%ds recognize takes %ds,ret_size:%d"%(filemd5,_filetype,round(_size/1024/1024,4),time_download,time.time()-start_time,len(_html)))
else:
log("attach interface failed of docid:%s filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
- sentMsgToDD("attach interface failed of docid:%s of filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
+ # sentMsgToDD("attach interface failed of docid:%s of filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
_html = ""
return False
@@ -415,7 +415,10 @@ class Dataflow():
if agency is not None and agency!="":
extract_count += 1
if sub_docs_json is not None:
- sub_docs = json.loads(sub_docs_json)
+ try:
+ sub_docs = json.loads(sub_docs_json)
+ except Exception as e:
+ sub_docs = []
sub_docs.sort(key=lambda x:float(x.get("bidding_budget",0)),reverse=True)
sub_docs.sort(key=lambda x:float(x.get("win_bid_price",0)),reverse=True)
# log("==%s"%(str(sub_docs)))
@@ -3780,8 +3783,8 @@ class Dataflow_dumplicate(Dataflow):
district = _proj.get(project_district,"")

if is_yanshou:
- page_time_less = timeAdd(page_time,-750)
- page_time_greater = timeAdd(page_time,720)
+ page_time_less = timeAdd(page_time,-850)
+ page_time_greater = timeAdd(page_time,820)
else:
page_time_less = timeAdd(page_time,-450)
page_time_greater = timeAdd(page_time,420)
@@ -3885,8 +3888,9 @@ class Dataflow_dumplicate(Dataflow):
update_projects_by_project(_data,[_proj])
projects_update_time += time.time()-_time

- whole_time = time.time()-whole_time_start
- log("%s %s merge_project whole_time:%.3f projects_prepare_time:%.3f projects_query_time:%.3f projects_merge_count:%d rules%d projects_check_rule_time %.3f projects_update_time %.3f"%(search_table,docids,whole_time,projects_prepare_time,projects_query_time,projects_merge_count,len(list_must_query),projects_check_rule_time,projects_update_time))
+ whole_time = time.time()-whole_time_start
+ log("%s %s merge_project whole_time:%.3f projects_prepare_time:%.3f projects_query_time:%.3f projects_merge_count:%d rules%d projects_check_rule_time %.3f projects_update_time %.3f"%(search_table,docids,whole_time,projects_prepare_time,projects_query_time,projects_merge_count,len(list_must_query),projects_check_rule_time,projects_update_time))
+

return list_projects
except Exception as e:
@@ -4050,7 +4054,7 @@ class Dataflow_dumplicate(Dataflow):
singleNum_keys = _rule["singleNum_keys"]
contain_keys = _rule["contain_keys"]
multiNum_keys = _rule["multiNum_keys"]
- self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle,document_tmp_attachment_path,document_tmp_source_stage,document_tmp_source_type],b_log=b_log)
+ self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle,document_tmp_attachment_path,document_tmp_source_stage,document_tmp_source_type,document_update_document],b_log=b_log)
_i += step
@@ -4075,7 +4079,8 @@ class Dataflow_dumplicate(Dataflow):
dup_docid = set()
for _dict in final_list:
- dup_docid.add(_dict.get(document_tmp_docid))
+ if _dict.get("update_document","")!="true":
+ dup_docid.add(_dict.get(document_tmp_docid))
if item.get(document_tmp_docid) in dup_docid:
dup_docid.remove(item.get(document_tmp_docid))
@@ -4097,6 +4102,7 @@ class Dataflow_dumplicate(Dataflow):
for _dict in final_list:
if _dict.get(document_tmp_docid) in dup_docid:
remove_list.append(_dict)
+
dmp_docid = ",".join([str(a) for a in list(dup_docid)])
dmp_docid = "%d,%s"%(best_docid,dmp_docid)
else:
@@ -4108,8 +4114,8 @@ class Dataflow_dumplicate(Dataflow):
list_docids = list(dup_docid)
list_docids.append(best_docid)

- if item.get(document_update_document)=="true":
- dtmp.setValue(document_tmp_save,1,True)
+ # if item.get(document_update_document)=="true":
+ # dtmp.setValue(document_tmp_save,1,True)

list_merge_dump = []
if (exist_finterprint and dtmp.getProperties().get(document_tmp_save)==0) or item.get(document_docchannel,0) in (301,302):
@@ -4172,19 +4178,23 @@ class Dataflow_dumplicate(Dataflow):
+ current_date = getCurrent_date(format="%Y-%m-%d %H:%M:%S")
+ before_date = timeAdd(current_date,0,format="%Y-%m-%d %H:%M:%S",minutes=-20)
+ after_date = timeAdd(current_date,0,format="%Y-%m-%d %H:%M:%S",minutes=-5)
if self.fix_doc_docid is None:
- current_date = getCurrent_date(format="%Y-%m-%d %H:%M:%S")
- before_date = timeAdd(current_date,0,format="%Y-%m-%d %H:%M:%S",minutes=-5)
bool_query = BoolQuery(must_queries=[
TermQuery(document_tmp_save,1),
RangeQuery(document_tmp_status,flow_dumplicate_status_to[0]),
- RangeQuery(document_tmp_opertime,before_date)
+ RangeQuery(document_tmp_docchannel,0,300),
+ RangeQuery(document_tmp_opertime,before_date,after_date)
])
else:
bool_query = BoolQuery(must_queries=[
TermQuery(document_tmp_save,1),
RangeQuery(document_tmp_status,flow_dumplicate_status_to[0]),
- RangeQuery(document_tmp_docid,self.fix_doc_docid)
+ RangeQuery(document_tmp_docchannel,0,300),
+ RangeQuery(document_tmp_docid,self.fix_doc_docid),
+ RangeQuery(document_tmp_opertime,before_date,after_date)
])

list_data = []
@@ -4219,7 +4229,7 @@ class Dataflow_dumplicate(Dataflow):
schedule.add_job(self.bdm.monitor_dumplicate,"cron",minute="*/10")
schedule.add_job(self.flow_remove,"cron",hour="20")
schedule.add_job(self.flow_remove_project_tmp,"cron",hour="20")
- # schedule.add_job(self.fix_doc_which_not_in_project,"cron",minute="55")
+ schedule.add_job(self.fix_doc_which_not_in_project,"cron",minute="*/10")
schedule.start()

def changeSaveStatus(self,list_dict):
@@ -4440,7 +4450,7 @@ if __name__ == '__main__':
# test_attachment_interface()
df_dump = Dataflow_dumplicate(start_delete_listener=False)
# df_dump.start_flow_dumplicate()
- df_dump.test_dumplicate(505243916
+ df_dump.test_dumplicate(519262974
)
# compare_dumplicate_check()
# df_dump.test_merge([391898061