|
@@ -4038,42 +4038,48 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
|
|
|
bidclose_time = page_time
|
|
|
web_source_name = item.get(document_tmp_web_source_name,"")
|
|
|
+ docchannel = item.get(document_tmp_docchannel,"0")
|
|
|
|
|
|
-
|
|
|
-
|
|
|
- if len(page_time)>0:
|
|
|
- l_page_time = timeAdd(page_time,days=-90)
|
|
|
- dict_time = item.get("dict_time",{})
|
|
|
- for k,v in dict_time.items():
|
|
|
- if v is not None and len(v)>0:
|
|
|
- if l_page_time>v:
|
|
|
- has_before = True
|
|
|
- if v>page_time:
|
|
|
- has_after = True
|
|
|
- if k==document_tmp_time_bidclose:
|
|
|
- bidclose_time = v
|
|
|
-
|
|
|
- set_web_source = {"中国招标投标公共服务平台","比地招标"}
|
|
|
-
|
|
|
- if web_source_name in set_web_source and bidclose_time<page_time:
|
|
|
- return False
|
|
|
-
|
|
|
- log("check page_time has_before %s has_after %s"%(str(has_before),str(has_after)))
|
|
|
- if has_before:
|
|
|
- _query = BoolQuery(must_queries=[MatchPhraseQuery(document_doctitle,item.get(document_doctitle,""))],
|
|
|
- must_not_queries=[TermQuery(document_docid,item.get(document_docid,0))])
|
|
|
- if not has_after:
|
|
|
- log("check page_time false %s==%s-%s"%(l_page_time,k,v))
|
|
|
-
|
|
|
- rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
|
|
|
- SearchQuery(_query,get_total_count=True,limit=1))
|
|
|
- if total_count>0:
|
|
|
- return False
|
|
|
- if item.get(document_web_source_name,"")=="中国政府采购网":
|
|
|
- rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
|
|
|
- SearchQuery(_query,get_total_count=True,limit=1))
|
|
|
- if total_count>0:
|
|
|
- return False
|
|
|
+ try:
|
|
|
+ docchannel = int(docchannel)
|
|
|
+ except:
|
|
|
+ docchannel = 0
|
|
|
+
|
|
|
+ if docchannel<200:
|
|
|
+
|
|
|
+ if len(page_time)>0:
|
|
|
+ l_page_time = timeAdd(page_time,days=-90)
|
|
|
+ dict_time = item.get("dict_time",{})
|
|
|
+ for k,v in dict_time.items():
|
|
|
+ if v is not None and len(v)>0:
|
|
|
+ if l_page_time>v:
|
|
|
+ has_before = True
|
|
|
+ if v>page_time:
|
|
|
+ has_after = True
|
|
|
+ if k==document_tmp_time_bidclose:
|
|
|
+ bidclose_time = v
|
|
|
+
|
|
|
+ set_web_source = {"中国招标投标公共服务平台","比地招标"}
|
|
|
+
|
|
|
+ if web_source_name in set_web_source and bidclose_time<page_time:
|
|
|
+ return False
|
|
|
+
|
|
|
+ log("check page_time has_before %s has_after %s"%(str(has_before),str(has_after)))
|
|
|
+ if has_before:
|
|
|
+ _query = BoolQuery(must_queries=[MatchPhraseQuery(document_doctitle,item.get(document_doctitle,""))],
|
|
|
+ must_not_queries=[TermQuery(document_docid,item.get(document_docid,0))])
|
|
|
+ if not has_after:
|
|
|
+ log("check page_time false %s==%s-%s"%(l_page_time,k,v))
|
|
|
+
|
|
|
+ rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
|
|
|
+ SearchQuery(_query,get_total_count=True,limit=1))
|
|
|
+ if total_count>0:
|
|
|
+ return False
|
|
|
+ if item.get(document_web_source_name,"")=="中国政府采购网":
|
|
|
+ rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
|
|
|
+ SearchQuery(_query,get_total_count=True,limit=1))
|
|
|
+ if total_count>0:
|
|
|
+ return False
|
|
|
|
|
|
return True
|
|
|
|
|
@@ -4515,7 +4521,7 @@ if __name__ == '__main__':
|
|
|
# test_attachment_interface()
|
|
|
df_dump = Dataflow_dumplicate(start_delete_listener=False)
|
|
|
# df_dump.start_flow_dumplicate()
|
|
|
- df_dump.test_dumplicate(576859812
|
|
|
+ df_dump.test_dumplicate(578681000
|
|
|
)
|
|
|
# compare_dumplicate_check()
|
|
|
# df_dump.test_merge([391898061
|