|
@@ -1351,7 +1351,7 @@ class Dataflow():
|
|
|
|
|
|
|
|
|
def flow_dumplicate(self,process_count=flow_process_count,status_from=flow_dumplicate_status_from):
|
|
|
- def producer(columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_product,document_tmp_fingerprint,document_tmp_tenderee,document_tmp_agency,document_tmp_project_code,document_tmp_project_name,document_tmp_doctitle_refine,document_tmp_doctitle,document_tmp_sub_docs_json]):
|
|
|
+ def producer(columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_product,document_tmp_fingerprint,document_tmp_tenderee,document_tmp_agency,document_tmp_project_code,document_tmp_project_name,document_tmp_doctitle_refine,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_web_source_name]):
|
|
|
bool_query = BoolQuery(must_queries=[RangeQuery(document_tmp_status,*status_from,True,True)])
|
|
|
rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
|
|
|
SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
|
|
@@ -2854,7 +2854,7 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
|
|
|
return list_rules,table_name,table_index
|
|
|
|
|
|
- def producer_flow_dumplicate(self,process_count,status_from,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path]):
|
|
|
+ def producer_flow_dumplicate(self,process_count,status_from,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_name]):
|
|
|
q_size = self.queue_dumplicate.qsize()
|
|
|
log("dumplicate queue size %d"%(q_size))
|
|
|
|
|
@@ -2913,7 +2913,7 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
|
|
|
def flow_dumpcate_comsumer(self):
|
|
|
from multiprocessing import Process
|
|
|
- process_count = 3
|
|
|
+ process_count = 6
|
|
|
thread_count = 12
|
|
|
list_process = []
|
|
|
def start_thread():
|
|
@@ -3956,6 +3956,7 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
page_time = item.get(document_page_time,"")
|
|
|
has_before = False
|
|
|
has_after = False
|
|
|
+
|
|
|
if len(page_time)>0:
|
|
|
l_page_time = timeAdd(page_time,days=-90)
|
|
|
dict_time = item.get("dict_time",{})
|
|
@@ -3965,9 +3966,22 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
has_before = True
|
|
|
if v>page_time:
|
|
|
has_after = True
|
|
|
- if not has_after and has_before:
|
|
|
- log("check page_time false %s==%s-%s"%(l_page_time,k,v))
|
|
|
- return False
|
|
|
+ if has_before:
|
|
|
+ _query = BoolQuery(must_queries=[MatchPhraseQuery(document_doctitle,item.get(document_doctitle,""))],
|
|
|
+ must_not_queries=[TermQuery(document_docid,item.get(document_docid,0))])
|
|
|
+ if not has_after:
|
|
|
+ log("check page_time false %s==%s-%s"%(l_page_time,k,v))
|
|
|
+
|
|
|
+ rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
|
|
|
+ SearchQuery(_query,get_total_count=True,limit=1))
|
|
|
+ if total_count>0:
|
|
|
+ return False
|
|
|
+ if item.get(document_web_source_name,"")=="中国政府采购网":
|
|
|
+ rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
|
|
|
+ SearchQuery(_query,get_total_count=True,limit=1))
|
|
|
+ if total_count>0:
|
|
|
+ return False
|
|
|
+
|
|
|
return True
|
|
|
|
|
|
|
|
@@ -4071,11 +4085,13 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
dtmp.setValue(document_tmp_save,1,True)
|
|
|
|
|
|
list_merge_dump = []
|
|
|
- if exist_finterprint and dtmp.getProperties().get(document_tmp_save)==0:
|
|
|
+ if (exist_finterprint and dtmp.getProperties().get(document_tmp_save)==0) or item.get(document_docchannel,0) in (301,302):
|
|
|
log("exist_finterprint %s"%(str(item.get(document_tmp_docid))))
|
|
|
dtmp.setValue(document_tmp_projects,"[]",True)
|
|
|
else:
|
|
|
project_json,list_merge_dump = self.merge_document_real(item,list_docids,table_name,dtmp.getProperties().get(document_tmp_save),flow_dumplicate_status_to,b_log)
|
|
|
+ if list_merge_dump is not None and str(item.get(document_tmp_docid)) in list_merge_dump:
|
|
|
+ dtmp.setValue(document_tmp_save,0,True)
|
|
|
dtmp.setValue(document_tmp_projects,project_json,True)
|
|
|
log("upgrate %s save:%s:docid:%d,final_list:%d,rules:%d,best_docid:%s,dmp_docid:%s"%(str(upgrade),dtmp.getProperties().get(document_tmp_save),item.get(document_tmp_docid),len(final_list),len(list_rules),str(best_docid),dmp_docid))
|
|
|
|
|
@@ -4205,7 +4221,7 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
|
|
|
def test_dumplicate(self,docid):
|
|
|
# columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status]
|
|
|
- columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path]
|
|
|
+ columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_name]
|
|
|
bool_query = BoolQuery(must_queries=[
|
|
|
TermQuery("docid",docid)
|
|
|
])
|
|
@@ -4396,7 +4412,7 @@ if __name__ == '__main__':
|
|
|
# test_attachment_interface()
|
|
|
df_dump = Dataflow_dumplicate(start_delete_listener=False)
|
|
|
# df_dump.start_flow_dumplicate()
|
|
|
- df_dump.test_dumplicate(378760606
|
|
|
+ df_dump.test_dumplicate(332439629
|
|
|
)
|
|
|
# compare_dumplicate_check()
|
|
|
# df_dump.test_merge([391898061
|