@@ -260,7 +260,7 @@ class Dataflow():
log("process filemd5:%s of type:%s with size:%.3fM download:%ds recognize takes %ds,ret_size:%d"%(filemd5,_filetype,round(_size/1024/1024,4),time_download,time.time()-start_time,len(_html)))
else:
log("attach interface failed of docid:%s filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
- sentMsgToDD("attach interface failed of docid:%s of filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
+ # sentMsgToDD("attach interface failed of docid:%s of filemd5:%s of type:%s size:%.3fM with result:%s"%(str(docids),filemd5,_filetype,round(_size/1024/1024,4),str(_html)))
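+ # assumption: the DingTalk alert for attach failures is disabled as too noisy; the log line above still records each failure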
_html = ""
return False

@@ -350,8 +350,8 @@ class Dataflow():
def generate_dumplicate_query(self,_dict,_dict_must_not,set_match=set(["project_code","project_codes","product"]),set_nested=set(["win_tenderer","bidding_budget","win_bid_price"]),
- set_term=set(["project_name","doctitle_refine","docchannel","tenderee","agency","web_source_no","fingerprint","save","docid"]),
- set_range=set(["page_time","status"]),set_phrase=set(["doctitle"])):
+ set_term=set(["doctitle_refine","docchannel","tenderee","agency","web_source_no","fingerprint","save","docid"]),
+ set_range=set(["page_time","status"]),set_phrase=set(["doctitle","project_name"])):
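+ # project_name moves from exact term matching into set_phrase, so tokenized/partial names still match; doctitle stays phrase-matched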
list_must_queries = []
list_must_no_queries = []
for k,v in _dict.items():

@@ -415,7 +415,10 @@ class Dataflow():
if agency is not None and agency!="":
extract_count += 1
if sub_docs_json is not None:
- sub_docs = json.loads(sub_docs_json)
+ try:
+ sub_docs = json.loads(sub_docs_json)
+ except Exception as e:
+ sub_docs = []
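+ # malformed sub_docs_json now falls back to an empty list instead of raising, so the sorts below become no-ops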
sub_docs.sort(key=lambda x:float(x.get("bidding_budget",0)),reverse=True)
sub_docs.sort(key=lambda x:float(x.get("win_bid_price",0)),reverse=True)
# log("==%s"%(str(sub_docs)))

@@ -2203,7 +2206,7 @@ class Dataflow_dumplicate(Dataflow):
createComsumer(listener,self.doc_delete_queue)

- def get_dict_time(self,_extract,keys=["time_bidclose","time_bidopen","time_bidstart","time_commencement","time_completion","time_earnestMoneyEnd","time_earnestMoneyStart","time_getFileEnd","time_getFileStart","time_publicityEnd","time_publicityStart","time_registrationEnd","time_registrationStart","time_release"]):
+ def get_dict_time(self,_extract,keys=["time_bidclose","time_bidopen","time_bidstart","time_commencement","time_completion","time_earnestMoneyEnd","time_earnestMoneyStart","time_getFileEnd","time_getFileStart","time_publicityEnd","time_publicityStart","time_registrationEnd","time_registrationStart"]):
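+ # "time_release" is dropped from the compared keys; assumption: release time often differs across sources for the same notice and caused false mismatches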
dict_time = {}
for k in keys:
dict_time[k] = _extract.get(k)

@@ -2231,10 +2234,12 @@ class Dataflow_dumplicate(Dataflow):
_dict["moneys_attachment"] = set(_extract.get("moneys_attachment",[]))
_dict["nlp_enterprise"] = json.dumps({"indoctextcon":_extract.get("nlp_enterprise",[]),
"notindoctextcon":_extract.get("nlp_enterprise_attachment",[])},ensure_ascii=False)
- _dict["extract_count"] = self.c_f_get_extractCount.evaluate(extract_json)
+ _dict["extract_count"] = _extract.get("extract_count",0)
_dict["package"] = self.c_f_get_package.evaluate(extract_json)
_dict["project_name"] = _extract.get("name","")
_dict["dict_time"] = self.get_dict_time(_extract)
+ _dict["punish"] = _extract.get("punish",{})
+ _dict["approval"] = _extract.get("approval",[])

def dumplicate_fianl_check(self,base_list,b_log=False):
the_group = base_list

@@ -2272,22 +2277,22 @@ class Dataflow_dumplicate(Dataflow):
def dumplicate_check(self,_dict1,_dict2,min_counts,b_log=False):
document_less = _dict1
docid_less = _dict1["docid"]
- docchannel_less = document_less["docchannel"]
- page_time_less = document_less["page_time"]
+ docchannel_less = document_less.get("docchannel",0)
+ page_time_less = document_less.get("page_time")
doctitle_refine_less = document_less["doctitle_refine"]
- project_codes_less = document_less["project_codes"]
+ project_codes_less = document_less.get("project_codes")
nlp_enterprise_less = document_less["nlp_enterprise"]
- tenderee_less = document_less["tenderee"]
- agency_less = document_less["agency"]
+ tenderee_less = document_less.get("tenderee","")
+ agency_less = document_less.get("agency")
win_tenderer_less = document_less["win_tenderer"]
bidding_budget_less = document_less["bidding_budget"]
win_bid_price_less = document_less["win_bid_price"]
- product_less = document_less["product"]
- package_less = document_less["package"]
- json_time_less = document_less["dict_time"]
- project_name_less = document_less["project_name"]
- fingerprint_less = document_less["fingerprint"]
- extract_count_less = document_less["extract_count"]
+ product_less = document_less.get("product")
+ package_less = document_less.get("package")
+ json_time_less = document_less.get("dict_time")
+ project_name_less = document_less.get("project_name")
+ fingerprint_less = document_less.get("fingerprint")
+ extract_count_less = document_less.get("extract_count",0)
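+ # the .get(...) lookups above make the check tolerant of partially populated rows (fields can be absent when fetched with fewer columns)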
web_source_no_less = document_less.get("web_source_no")
province_less = document_less.get("province")
city_less = document_less.get("city")

@@ -2295,26 +2300,29 @@ class Dataflow_dumplicate(Dataflow):
moneys_less = document_less.get("moneys")
moneys_attachment_less = document_less.get("moneys_attachment")
page_attachments_less = document_less.get(document_tmp_attachment_path,"[]")
+ punish_less = document_less.get("punish",{})
+ approval_less = document_less.get("approval",[])
+ source_type_less = document_less.get("source_type")

document_greater = _dict2
docid_greater = _dict2["docid"]
page_time_greater = document_greater["page_time"]
- docchannel_greater = document_greater["docchannel"]
- doctitle_refine_greater = document_greater["doctitle_refine"]
+ docchannel_greater = document_greater.get("docchannel",0)
+ doctitle_refine_greater = document_greater.get("doctitle_refine","")
project_codes_greater = document_greater["project_codes"]
nlp_enterprise_greater = document_greater["nlp_enterprise"]
- tenderee_greater = document_greater["tenderee"]
- agency_greater = document_greater["agency"]
+ tenderee_greater = document_greater.get("tenderee","")
+ agency_greater = document_greater.get("agency","")
win_tenderer_greater = document_greater["win_tenderer"]
bidding_budget_greater = document_greater["bidding_budget"]
win_bid_price_greater = document_greater["win_bid_price"]
- product_greater = document_greater["product"]
- package_greater = document_greater["package"]
+ product_greater = document_greater.get("product")
+ package_greater = document_greater.get("package")
json_time_greater = document_greater["dict_time"]
- project_name_greater = document_greater["project_name"]
- fingerprint_greater = document_greater["fingerprint"]
- extract_count_greater = document_greater["extract_count"]
+ project_name_greater = document_greater.get("project_name")
+ fingerprint_greater = document_greater.get("fingerprint")
+ extract_count_greater = document_greater.get("extract_count",0)
web_source_no_greater = document_greater.get("web_source_no")
province_greater = document_greater.get("province")
city_greater = document_greater.get("city")

@@ -2324,12 +2332,16 @@ class Dataflow_dumplicate(Dataflow):
moneys_attachment_greater = document_greater.get("moneys_attachment")
page_attachments_greater = document_greater.get(document_tmp_attachment_path,"[]")

+ punish_greater = document_greater.get("punish",{})
+ approval_greater = document_greater.get("approval",[])
+ source_type_greater = document_greater.get("source_type")
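+ # punish/approval/source_type are new dedup signals, forwarded as keyword args to check_dumplicate_rule below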
+
hard_level=1
if web_source_no_less==web_source_no_greater=="17397-3":
hard_level=2

if self.check_rule==1:
- _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=b_log,hard_level=hard_level,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
+ _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=b_log,hard_level=hard_level,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater,punish_less=punish_less,punish_greater=punish_greater,approval_less=approval_less,approval_greater=approval_greater,source_type_less=source_type_less,source_type_greater=source_type_greater)
else:
_prob = check_dumplicate_rule_test(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=b_log,hard_level=hard_level,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater)

@@ -2559,7 +2571,7 @@ class Dataflow_dumplicate(Dataflow):
else:
bool_query = _query
rows,next_token,total_count,is_all_succeed = self.ots_client.search(table_name,table_index,
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(sort_column)]),limit=30,get_total_count=True),
+ SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(sort_column)]),limit=100,get_total_count=True),
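+ # page size raised from 30 to 100 so each query scans more potential duplicates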
ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
list_dict = getRow_ots(rows)
list_data = []

@@ -2854,7 +2866,7 @@ class Dataflow_dumplicate(Dataflow):
return list_rules,table_name,table_index

- def producer_flow_dumplicate(self,process_count,status_from,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_name]):
+ def producer_flow_dumplicate(self,process_count,status_from,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type]):
q_size = self.queue_dumplicate.qsize()
log("dumplicate queue size %d"%(q_size))

@@ -2939,7 +2951,7 @@ class Dataflow_dumplicate(Dataflow):
# mt.run()

- def search_docs(self,list_docids,columns_to_get = [document_doctitle,document_tmp_save,document_bidway,document_status,document_page_time,document_info_source,document_fingerprint,document_docchannel,document_life_docchannel,document_area,document_province,document_city,document_district,document_tmp_sub_docs_json,document_industry,document_info_type,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_project_codes,document_product,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count,document_nlp_enterprise,document_nlp_enterprise_attachment]):
+ def search_docs(self,list_docids,columns_to_get = [document_doctitle,document_tmp_save,document_bidway,document_status,document_page_time,document_info_source,document_fingerprint,document_docchannel,document_life_docchannel,document_area,document_province,document_city,document_district,document_tmp_sub_docs_json,document_industry,document_info_type,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_project_codes,document_product,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count,document_nlp_enterprise,document_nlp_enterprise_attachment,document_tenderee_code,document_agency_code,document_candidates]):
'''
Query the notice content by docid: look up document_tmp first, then fall back to document
:param list_docids:

@@ -3049,7 +3061,7 @@ class Dataflow_dumplicate(Dataflow):
continue
if v is None or v=="" or v=="[]" or v=="未知":
continue
- if k in (project_project_dynamics,project_product,project_project_codes,project_docids):
+ if k in (project_project_dynamics,project_product,project_project_codes,project_docids,project_candidates):
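+ # project_candidates joins the fields that are merged field-by-field below instead of being copied wholesale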
continue
_dict[k] = v
for _proj in projects:

@@ -3058,14 +3070,19 @@ class Dataflow_dumplicate(Dataflow):
if _proj.get(project_page_time,"")<project_dict.get(project_page_time,""):
_proj[project_page_time] = project_dict.get(project_page_time,"")

- # merge attributes
- append_dict = {}
- set_docid = set()
- set_product = set()
- set_code = set()
- set_nlp_enterprise = set()
- set_nlp_enterprise_attachment = set()
+
for _proj in projects:
+ # merge attributes
+ append_dict = {}
+ set_docid = set()
+ set_product = set()
+ set_code = set()
+ set_nlp_enterprise = set()
+ set_nlp_enterprise_attachment = set()
+ set_candidates = set()
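+ # note: these accumulators are now re-initialized per project; previously they were shared across the loop and leaked values between projects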
+
+
+
_docids = _proj.get(project_docids,"")
_codes = _proj.get(project_project_codes,"")
_product = _proj.get(project_product,"")

@@ -3081,15 +3098,22 @@ class Dataflow_dumplicate(Dataflow):
try:
set_nlp_enterprise |= set(json.loads(_proj.get(project_nlp_enterprise,"[]")))
set_nlp_enterprise_attachment |= set(json.loads(_proj.get(project_nlp_enterprise_attachment,"[]")))
- except Exception as e:
- pass
+ list_candidates = json.loads(project_dict.get(project_candidates,"[]"))
+ for item in list_candidates:
+ if item.get("name") is not None and item.get("name") not in set_candidates:
+ set_candidates.add(item.get("name"))
- set_code = set_code | set(project_dict.get(project_project_codes,"").split(","))
- set_product = set_product | set(project_dict.get(project_product,"").split(","))
- try:
+ set_code = set_code | set(project_dict.get(project_project_codes,"").split(","))
+ set_product = set_product | set(project_dict.get(project_product,"").split(","))
+
set_nlp_enterprise |= set(json.loads(project_dict.get(project_nlp_enterprise,"[]")))
set_nlp_enterprise_attachment |= set(json.loads(project_dict.get(project_nlp_enterprise_attachment,"[]")))
+
+ for item in json.loads(_proj.get(project_candidates,"[]")):
+ if item.get("name") is not None and item.get("name") not in set_candidates:
+ set_candidates.add(item.get("name"))
+ list_candidates.append(item)
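+ # candidate names from both the incoming project_dict and the existing project are merged, de-duplicated by "name"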
except Exception as e:
pass

@@ -3101,6 +3125,7 @@ class Dataflow_dumplicate(Dataflow):
append_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
append_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)
+ append_dict[project_candidates] = json.dumps(list_candidates,ensure_ascii=False)

dict_dynamic = {}

@@ -3119,6 +3144,7 @@ class Dataflow_dumplicate(Dataflow):
list_dynamics.sort(key=lambda x:x.get(document_page_time,""))
append_dict[project_project_dynamics] = json.dumps(list_dynamics[:100],ensure_ascii=False)
+
_proj.update(append_dict)

@@ -3151,74 +3177,84 @@ class Dataflow_dumplicate(Dataflow):

# update private (per-package) attributes
- for _pp in list_package_properties:
-
- flag_update = False
- sub_project_name = _pp.get(project_sub_project_name,"")
- if sub_project_name=="Project":
- sub_project_name = ""
- win_tenderer = _pp.get(project_win_tenderer,"")
- win_bid_price = _pp.get(project_win_bid_price,0)
- bidding_budget = _pp.get(project_bidding_budget,0)
- if win_tenderer!="" and bidding_budget!=0:
- _key = "%s-%s-%s"%(sub_project_name,str(win_tenderer),str(bidding_budget))
- if _key in dict_package:
- if self.is_same_package(_pp,dict_package[_key]):
- ud = self.getUpdate_dict(_pp)
- self.set_project_uuid(ud,dict_package[_key].get("uuid"))
- dict_package[_key].update(ud)
- flag_update = True
- continue
- if win_tenderer!="" and win_bid_price!=0:
- _key = "%s-%s-%s"%(sub_project_name,win_tenderer,str(win_bid_price))
- if _key in dict_package:
- if self.is_same_package(_pp,dict_package[_key]):
- ud = self.getUpdate_dict(_pp)
- self.set_project_uuid(ud,dict_package[_key].get("uuid"))
- dict_package[_key].update(ud)
- flag_update = True
- continue
- if win_tenderer!="":
- _key = "%s-%s"%(sub_project_name,win_tenderer)
- if _key in dict_package:
- if self.is_same_package(_pp,dict_package[_key]):
- ud = self.getUpdate_dict(_pp)
- self.set_project_uuid(ud,dict_package[_key].get("uuid"))
- dict_package[_key].update(ud)
- flag_update = True
- continue
- if bidding_budget!=0:
- _key = "%s-%s"%(sub_project_name,str(bidding_budget))
- if _key in dict_package:
- if self.is_same_package(_pp,dict_package[_key]):
- ud = self.getUpdate_dict(_pp)
- self.set_project_uuid(ud,dict_package[_key].get("uuid"))
- dict_package[_key].update(ud)
- flag_update = True
- continue
- if not flag_update:
- _pp.update(project_dict)
- projects.append(_pp)
+ if len(projects)==1 and len(list_package_properties)==1:
+ _pp = list_package_properties[0]
+ pp = projects[0]
+ ud = self.getUpdate_dict(_pp)
+ self.set_project_uuid(ud,pp.get("uuid"))
+ pp.update(_pp)
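+ # fast path: with a single project and a single package there is nothing to key-match, so update in place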
+ else:
+ for _pp in list_package_properties:
- _counts = 0
+ flag_update = False
+ sub_project_name = _pp.get(project_sub_project_name,"")
+ if sub_project_name=="Project":
+ sub_project_name = ""
+ win_tenderer = _pp.get(project_win_tenderer,"")
+ win_bid_price = _pp.get(project_win_bid_price,0)
+ bidding_budget = _pp.get(project_bidding_budget,0)
if win_tenderer!="" and bidding_budget!=0:
_key = "%s-%s-%s"%(sub_project_name,str(win_tenderer),str(bidding_budget))
- dict_package[_key] = _pp
- _counts += 1
+ if _key in dict_package:
+ if self.is_same_package(_pp,dict_package[_key]):
+ ud = self.getUpdate_dict(_pp)
+ self.set_project_uuid(ud,dict_package[_key].get("uuid"))
+ dict_package[_key].update(ud)
+ flag_update = True
+ continue
if win_tenderer!="" and win_bid_price!=0:
_key = "%s-%s-%s"%(sub_project_name,win_tenderer,str(win_bid_price))
- dict_package[_key] = _pp
- _counts +=1
- if _counts==0:
- if win_tenderer!="":
- _key = "%s-%s"%(sub_project_name,win_tenderer)
+ if _key in dict_package:
+ if self.is_same_package(_pp,dict_package[_key]):
+ ud = self.getUpdate_dict(_pp)
+ self.set_project_uuid(ud,dict_package[_key].get("uuid"))
+ dict_package[_key].update(ud)
+ flag_update = True
+ continue
+ if win_tenderer!="":
+ _key = "%s-%s"%(sub_project_name,win_tenderer)
+ if _key in dict_package:
+ if self.is_same_package(_pp,dict_package[_key]):
+ ud = self.getUpdate_dict(_pp)
+ self.set_project_uuid(ud,dict_package[_key].get("uuid"))
+ dict_package[_key].update(ud)
+ flag_update = True
+ continue
+ if bidding_budget!=0:
+ _key = "%s-%s"%(sub_project_name,str(bidding_budget))
+ if _key in dict_package:
+ if self.is_same_package(_pp,dict_package[_key]):
+ ud = self.getUpdate_dict(_pp)
+ self.set_project_uuid(ud,dict_package[_key].get("uuid"))
+ dict_package[_key].update(ud)
+ flag_update = True
+ continue
+ if not flag_update:
+ _pp.update(project_dict)
+ projects.append(_pp)
+
+
+ _counts = 0
+ if win_tenderer!="" and bidding_budget!=0:
+ _key = "%s-%s-%s"%(sub_project_name,str(win_tenderer),str(bidding_budget))
dict_package[_key] = _pp
- _counts += 1
- if bidding_budget!=0:
- _key = "%s-%s"%(sub_project_name,str(bidding_budget))
+ _counts += 1
+ if win_tenderer!="" and win_bid_price!=0:
+ _key = "%s-%s-%s"%(sub_project_name,win_tenderer,str(win_bid_price))
dict_package[_key] = _pp
- _counts += 1
+ _counts +=1
+ if _counts==0:
+ if win_tenderer!="":
+ _key = "%s-%s"%(sub_project_name,win_tenderer)
+ dict_package[_key] = _pp
+ _counts += 1
+ if bidding_budget!=0:
+ _key = "%s-%s"%(sub_project_name,str(bidding_budget))
+ dict_package[_key] = _pp
+ _counts += 1
+
+

@@ -3255,33 +3291,42 @@ class Dataflow_dumplicate(Dataflow):
list_projects = dumplicate_projects(list_projects)
list_projects.extend(list_delete_projects)
project_json = to_project_json(list_projects)
- print("delete_json",project_json)
return project_json

def delete_doc_handle(self,_dict,result_queue):
headers = _dict.get("frame")
conn = _dict.get("conn")
- log("==========delete")
+
if headers is not None:
message_id = headers.headers["message-id"]
body = headers.body
item = json.loads(body)
docid = item.get("docid")
+ log("==========start delete docid:%s"%(str(docid)))
if docid is None:
+ ackMsg(conn,message_id)
+ return # ack then stop: without a docid there is nothing to delete (restores the early return that the old code had)
delete_result = self.delete_projects_by_document(docid)
+ log("delete docid:%s projects removed"%(str(docid)))
_uuid = uuid4().hex
_d = {PROJECT_PROCESS_UUID:_uuid,
PROJECT_PROCESS_CRTIME:1,
PROJECT_PROCESS_PROJECTS:delete_result}
_pp = Project_process(_d)
- if _pp.update_row(self.ots_client):
+ log("delete docid:%s persisting result"%(str(docid)))
+ try:
+ if _pp.update_row(self.ots_client):
+ ackMsg(conn,message_id)
+ except Exception as e:
ackMsg(conn,message_id)
+ log("delete docid:%s handled"%(str(docid)))
# no longer push into the result queue; write to the project_process table instead
# if send_msg_toacmq(self.pool_mq_ali,delete_result,self.doc_delete_result):
# ackMsg(conn,message_id)
+ log("==========end delete docid:%s"%(str(docid)))
+ else:
+ log("has not headers")

def generate_common_properties(self,list_docs):
'''

@@ -3539,6 +3584,9 @@ class Dataflow_dumplicate(Dataflow):
project_info_source,
project_nlp_enterprise,
project_nlp_enterprise_attachment,
+ project_tenderee_code,
+ project_agency_code,
+ project_candidates
],sort="page_time",table_name="project2",table_index="project2_index")

return list_project_dict

@@ -3654,6 +3702,14 @@ class Dataflow_dumplicate(Dataflow):
should_q_cod]
list_query.append([_query,2])

+ if win_tenderer!="" and sub_project_name!="":
+ _query = [TermQuery(project_win_tenderer,win_tenderer),
+ TermQuery(project_sub_project_name,sub_project_name)
+ ]
+ list_query.append([_query,2])
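+ # new merge rule: same win_tenderer plus same sub_project_name, weighted like the neighboring rules (assumed: 2 is the confidence used by comparable rules above)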
+
+
+
if win_tenderer!="" and float(win_bid_price)>0:
_query = [TermQuery(project_win_tenderer,win_tenderer),
TermQuery(project_win_bid_price,win_bid_price)]

@@ -3710,10 +3766,7 @@ class Dataflow_dumplicate(Dataflow):
_uuid = _proj.get("uuid")
if _uuid is not None:
set_uuid = set_uuid | set(_uuid.split(","))
- must_not_q = []
- for _uuid in list(set_uuid):
- must_not_q.append(TermQuery("uuid",_uuid))
- print("must_not_q uuid:%s"%(_uuid))
+

projects_merge_count = 0

@@ -3729,6 +3782,10 @@ class Dataflow_dumplicate(Dataflow):
docids = ""
for _proj in list_projects[:30]:

+ must_not_q = []
+ for _uuid in list(set_uuid):
+ must_not_q.append(TermQuery("uuid",_uuid))
+
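+ # must_not_q is now rebuilt for each project inside the loop (the old one-off construction and its per-uuid print are removed above)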
docids = _proj.get(project_docids,"")
page_time = _proj.get(project_page_time,"")
project_codes = _proj.get(project_project_codes,"")

@@ -3754,8 +3811,8 @@ class Dataflow_dumplicate(Dataflow):
district = _proj.get(project_district,"")

if is_yanshou:
- page_time_less = timeAdd(page_time,-750)
- page_time_greater = timeAdd(page_time,720)
+ page_time_less = timeAdd(page_time,-850)
+ page_time_greater = timeAdd(page_time,820)
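+ # acceptance-phase (yanshou) merge window widened from -750/+720 to -850/+820 days around page_time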
else:
page_time_less = timeAdd(page_time,-450)
page_time_greater = timeAdd(page_time,420)

@@ -3784,6 +3841,7 @@ class Dataflow_dumplicate(Dataflow):
if page_time_less is not None and page_time_greater is not None:
must_queries = [RangeQuery(project_page_time,page_time_less,page_time_greater,True,True),
+ # RangeQuery("status",201,301)
]

# sub_project_name is optional, not a required condition

@@ -3832,7 +3890,8 @@ class Dataflow_dumplicate(Dataflow):
list_merge_data.sort(key=lambda x:x.get(project_page_time,""))
list_merge_data.sort(key=lambda x:x.get(project_bidding_budget,-1))
# log(page_time_less+"=="+page_time_greater)
- # log("list_merge_data:%s"%(str(list_merge_data)))
+ if b_log:
+ log("list_merge_data count:%d"%(len(list_merge_data)))
list_check_data = []
for _data in list_merge_data:
_time = time.time()

@@ -3858,8 +3917,9 @@ class Dataflow_dumplicate(Dataflow):
update_projects_by_project(_data,[_proj])
projects_update_time += time.time()-_time

- whole_time = time.time()-whole_time_start
- log("%s %s merge_project whole_time:%.3f projects_prepare_time:%.3f projects_query_time:%.3f projects_merge_count:%d rules%d projects_check_rule_time %.3f projects_update_time %.3f"%(search_table,docids,whole_time,projects_prepare_time,projects_query_time,projects_merge_count,len(list_must_query),projects_check_rule_time,projects_update_time))
+ whole_time = time.time()-whole_time_start
+ log("%s %s merge_project whole_time:%.3f projects_prepare_time:%.3f projects_query_time:%.3f projects_merge_count:%d rules%d projects_check_rule_time %.3f projects_update_time %.3f"%(search_table,docids,whole_time,projects_prepare_time,projects_query_time,projects_merge_count,len(list_must_query),projects_check_rule_time,projects_update_time))
+

return list_projects
except Exception as e:

@@ -3892,10 +3952,9 @@ class Dataflow_dumplicate(Dataflow):
list_docids = [a for a in list_docids if a is not None]

-
_time = time.time()
list_projects = self.search_projects_with_document(list_docids)
- # log("search projects takes:%.3f"%(time.time()-_time))
+ log("search %d projects takes:%.3f"%(len(list_projects),time.time()-_time))
if len(list_projects)==0:
# _time = time.time()
list_docs = self.search_docs(list_docids)

@@ -3914,7 +3973,6 @@ class Dataflow_dumplicate(Dataflow):
list_projects = self.merge_projects(list_projects,b_log)
# log("merge projects takes:%.3f"%(time.time()-_time))

-
_time = time.time()
list_merge_dump = dumplicate_document_in_merge(list_projects,dup_docid[:-1])
# log("dumplicate document %d takes:%.3f"%(len(list_projects),time.time()-_time))

@@ -3923,6 +3981,27 @@ class Dataflow_dumplicate(Dataflow):
list_projects = []

_time = time.time()
+
+ projects = list_projects
+ for _proj in projects:
+ dup_docid = _proj.get(project_dup_docid,"")
+ list_dup_docid = dup_docid.split(",")
+ new_dup_docid = []
+ for _docid in list_dup_docid:
+ if _docid=="":
+ continue
+ docid = int(_docid)
+ _d = {"partitionkey":docid%500+1,
+ "docid":docid,
+ }
+ _doc = Document(_d)
+
+ if _doc.fix_columns(self.ots_client,[document_update_document],True):
+ if _doc.getProperties().get(document_update_document,"")!="true":
+ new_dup_docid.append(str(docid))
+ _proj[project_dup_docid] = ",".join(new_dup_docid)
+ list_projects = projects
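+ # docids whose document carries update_document=="true" are filtered out of dup_docid: in-place updates must stay visible (assumed intent)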
+
project_json = to_project_json(list_projects)
# log("json projects takes:%.3f"%(time.time()-_time))
if b_log:

@@ -3957,6 +4036,11 @@ class Dataflow_dumplicate(Dataflow):
has_before = False
has_after = False

+ bidclose_time = page_time
+ web_source_name = item.get(document_tmp_web_source_name,"")
+
+
+
if len(page_time)>0:
l_page_time = timeAdd(page_time,days=-90)
dict_time = item.get("dict_time",{})

@@ -3966,6 +4050,14 @@ class Dataflow_dumplicate(Dataflow):
has_before = True
if v>page_time:
has_after = True
+ if k==document_tmp_time_bidclose:
+ bidclose_time = v
+
+ set_web_source = {"中国招标投标公共服务平台","比地招标"}
+
+ if web_source_name in set_web_source and bidclose_time<page_time:
+ return False
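+ # assumption: for these two aggregator sources a bid-close time earlier than page_time marks a stale re-post, so the page-time check fails fast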
+
log("check page_time has_before %s has_after %s"%(str(has_before),str(has_after)))
if has_before:
_query = BoolQuery(must_queries=[MatchPhraseQuery(document_doctitle,item.get(document_doctitle,""))],

@@ -4024,7 +4116,7 @@ class Dataflow_dumplicate(Dataflow):
singleNum_keys = _rule["singleNum_keys"]
contain_keys = _rule["contain_keys"]
multiNum_keys = _rule["multiNum_keys"]
- self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle,document_tmp_attachment_path],b_log=b_log)
+ self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle,document_tmp_attachment_path,document_tmp_source_stage,document_tmp_source_type,document_update_document],b_log=b_log)
_i += step

@@ -4049,7 +4141,8 @@ class Dataflow_dumplicate(Dataflow):

dup_docid = set()
for _dict in final_list:
- dup_docid.add(_dict.get(document_tmp_docid))
+ if _dict.get("update_document","")!="true":
+ dup_docid.add(_dict.get(document_tmp_docid))
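+ # in-place updates (update_document=="true") are never counted as duplicates here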
if item.get(document_tmp_docid) in dup_docid:
dup_docid.remove(item.get(document_tmp_docid))

@@ -4057,7 +4150,7 @@ class Dataflow_dumplicate(Dataflow):
remove_list = []

- if self.check_page_time(item) and (len(final_list)==0 or best_docid==item.get(document_tmp_docid)):
+ if (self.check_page_time(item) and (len(final_list)==0 or best_docid==item.get(document_tmp_docid))) or item.get("update_document","")=="true":
dtmp.setValue(document_tmp_save,1,True)
# dtmp.setValue(document_tmp_merge_uuid,self.merge_document(item,flow_dumplicate_status_to),True)
dmp_docid = ",".join([str(a) for a in list(dup_docid)])

@@ -4071,6 +4164,7 @@ class Dataflow_dumplicate(Dataflow):
for _dict in final_list:
if _dict.get(document_tmp_docid) in dup_docid:
remove_list.append(_dict)
+
dmp_docid = ",".join([str(a) for a in list(dup_docid)])
dmp_docid = "%d,%s"%(best_docid,dmp_docid)
else:

@@ -4082,16 +4176,19 @@ class Dataflow_dumplicate(Dataflow):
list_docids = list(dup_docid)
list_docids.append(best_docid)

- if item.get(document_update_document)=="true":
- dtmp.setValue(document_tmp_save,1,True)
+ # if item.get(document_update_document)=="true":
+ # dtmp.setValue(document_tmp_save,1,True)

list_merge_dump = []
if (exist_finterprint and dtmp.getProperties().get(document_tmp_save)==0) or item.get(document_docchannel,0) in (301,302):
- log("exist_finterprint %s"%(str(item.get(document_tmp_docid))))
+ if exist_finterprint:
+ log("exist_finterprint %s"%(str(item.get(document_tmp_docid))))
dtmp.setValue(document_tmp_projects,"[]",True)
else:
project_json,list_merge_dump = self.merge_document_real(item,list_docids,table_name,dtmp.getProperties().get(document_tmp_save),flow_dumplicate_status_to,b_log)
- if list_merge_dump is not None and str(item.get(document_tmp_docid)) in list_merge_dump:
+
+
+ if list_merge_dump is not None and str(item.get(document_tmp_docid)) in list_merge_dump and item.get("update_document","")!="true":
dtmp.setValue(document_tmp_save,0,True)
dtmp.setValue(document_tmp_projects,project_json,True)
log("upgrate %s save:%s:docid:%d,final_list:%d,rules:%d,best_docid:%s,dmp_docid:%s"%(str(upgrade),dtmp.getProperties().get(document_tmp_save),item.get(document_tmp_docid),len(final_list),len(list_rules),str(best_docid),dmp_docid))

@@ -4145,19 +4242,23 @@ class Dataflow_dumplicate(Dataflow):

+ current_date = getCurrent_date(format="%Y-%m-%d %H:%M:%S")
+ before_date = timeAdd(current_date,0,format="%Y-%m-%d %H:%M:%S",minutes=-20)
+ after_date = timeAdd(current_date,0,format="%Y-%m-%d %H:%M:%S",minutes=-5)
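+ # fix window: only documents whose opertime is between 20 and 5 minutes old are re-checked, giving upstream writes time to land (assumed intent)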
if self.fix_doc_docid is None:
- current_date = getCurrent_date(format="%Y-%m-%d %H:%M:%S")
- before_date = timeAdd(current_date,0,format="%Y-%m-%d %H:%M:%S",minutes=-5)
bool_query = BoolQuery(must_queries=[
TermQuery(document_tmp_save,1),
RangeQuery(document_tmp_status,flow_dumplicate_status_to[0]),
- RangeQuery(document_tmp_opertime,before_date)
+ RangeQuery(document_tmp_docchannel,0,300),
+ RangeQuery(document_tmp_opertime,before_date,after_date)
])
else:
bool_query = BoolQuery(must_queries=[
TermQuery(document_tmp_save,1),
RangeQuery(document_tmp_status,flow_dumplicate_status_to[0]),
- RangeQuery(document_tmp_docid,self.fix_doc_docid)
+ RangeQuery(document_tmp_docchannel,0,300),
+ RangeQuery(document_tmp_docid,self.fix_doc_docid),
+ RangeQuery(document_tmp_opertime,before_date,after_date)
])

list_data = []

@@ -4192,7 +4293,7 @@ class Dataflow_dumplicate(Dataflow):
schedule.add_job(self.bdm.monitor_dumplicate,"cron",minute="*/10")
schedule.add_job(self.flow_remove,"cron",hour="20")
schedule.add_job(self.flow_remove_project_tmp,"cron",hour="20")
- # schedule.add_job(self.fix_doc_which_not_in_project,"cron",minute="55")
+ schedule.add_job(self.fix_doc_which_not_in_project,"cron",minute="*/10")
schedule.start()
def changeSaveStatus(self,list_dict):

@@ -4213,16 +4314,17 @@ class Dataflow_dumplicate(Dataflow):
document_tmp_save:0
}
_d_tmp = Document_tmp(_d)
- if _d_tmp.fix_columns(self.ots_client,["status"],True):
+ if _d_tmp.fix_columns(self.ots_client,["status",document_update_document],True):
if _d_tmp.getProperties().get("status")==1:
- _d_tmp.setValue("status",0,True)
- _d_tmp.update_row(self.ots_client)
+ if _d_tmp.getProperties().get(document_update_document,"")!="true":
+ _d_tmp.setValue("status",0,True)
+ _d_tmp.update_row(self.ots_client)

def test_dumplicate(self,docid):
# columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status]
- columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_name]
+ columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type]
bool_query = BoolQuery(must_queries=[
TermQuery("docid",docid)
])

@@ -4413,7 +4515,7 @@ if __name__ == '__main__':
# test_attachment_interface()
df_dump = Dataflow_dumplicate(start_delete_listener=False)
# df_dump.start_flow_dumplicate()
- df_dump.test_dumplicate(455485514
+ df_dump.test_dumplicate(576859812
)
# compare_dumplicate_check()
# df_dump.test_merge([391898061