|
@@ -363,9 +363,12 @@ class Dataflow():
|
|
list_must_queries.append(BoolQuery(should_queries=l_s))
|
|
list_must_queries.append(BoolQuery(should_queries=l_s))
|
|
elif k in set_nested:
|
|
elif k in set_nested:
|
|
_v = v
|
|
_v = v
|
|
- if k!="" and k=="bidding_budget" or k=="win_bid_price":
|
|
|
|
- _v = float(_v)
|
|
|
|
- list_must_queries.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.%s"%k,_v)))
|
|
|
|
|
|
+ if k!="":
|
|
|
|
+ if k=="bidding_budget" or k=="win_bid_price":
|
|
|
|
+ _v = float(_v)
|
|
|
|
+ list_must_queries.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.%s"%k,_v)))
|
|
|
|
+ else:
|
|
|
|
+ list_must_queries.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.%s"%k,_v)))
|
|
elif k in set_term:
|
|
elif k in set_term:
|
|
list_must_queries.append(TermQuery(k,v))
|
|
list_must_queries.append(TermQuery(k,v))
|
|
elif k in set_phrase:
|
|
elif k in set_phrase:
|
|
@@ -384,9 +387,12 @@ class Dataflow():
|
|
list_must_no_queries.append(BoolQuery(should_queries=l_s))
|
|
list_must_no_queries.append(BoolQuery(should_queries=l_s))
|
|
elif k in set_nested:
|
|
elif k in set_nested:
|
|
_v = v
|
|
_v = v
|
|
- if k!="" and k=="bidding_budget" or k=="win_bid_price":
|
|
|
|
- _v = float(_v)
|
|
|
|
- list_must_no_queries.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.%s"%k,_v)))
|
|
|
|
|
|
+ if k!="":
|
|
|
|
+ if k=="bidding_budget" or k=="win_bid_price":
|
|
|
|
+ _v = float(_v)
|
|
|
|
+ list_must_no_queries.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.%s"%k,_v)))
|
|
|
|
+ else:
|
|
|
|
+ list_must_no_queries.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.%s"%k,_v)))
|
|
elif k in set_term:
|
|
elif k in set_term:
|
|
list_must_no_queries.append(TermQuery(k,v))
|
|
list_must_no_queries.append(TermQuery(k,v))
|
|
elif k in set_range:
|
|
elif k in set_range:
|
|
@@ -1593,7 +1599,7 @@ class Dataflow():
|
|
|
|
|
|
def producer():
|
|
def producer():
|
|
current_date = getCurrent_date("%Y-%m-%d")
|
|
current_date = getCurrent_date("%Y-%m-%d")
|
|
- tmp_date = timeAdd(current_date,-4)
|
|
|
|
|
|
+ tmp_date = timeAdd(current_date,-10)
|
|
bool_query = BoolQuery(must_queries=[RangeQuery(document_tmp_status,*status_from,True,True),
|
|
bool_query = BoolQuery(must_queries=[RangeQuery(document_tmp_status,*status_from,True,True),
|
|
RangeQuery(document_tmp_crtime,range_to="%s 00:00:00"%(tmp_date))])
|
|
RangeQuery(document_tmp_crtime,range_to="%s 00:00:00"%(tmp_date))])
|
|
rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
|
|
rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
|
|
@@ -2244,8 +2250,7 @@ class Dataflow_dumplicate(Dataflow):
|
|
continue
|
|
continue
|
|
for _j in range(min(_i,10)):
|
|
for _j in range(min(_i,10)):
|
|
_dict2 = base_list[_j]
|
|
_dict2 = base_list[_j]
|
|
- _prob = self.dumplicate_check(_dict1,_dict2,_dict1.get("min_counts",10),b_log=b_log)
|
|
|
|
- print("_prob:",_prob)
|
|
|
|
|
|
+ _prob,day_dis = self.dumplicate_check(_dict1,_dict2,_dict1.get("min_counts",10),b_log=b_log)
|
|
if _prob<=0.1:
|
|
if _prob<=0.1:
|
|
_pass = False
|
|
_pass = False
|
|
break
|
|
break
|
|
@@ -2320,7 +2325,19 @@ class Dataflow_dumplicate(Dataflow):
|
|
if web_source_no_less==web_source_no_greater=="17397-3":
|
|
if web_source_no_less==web_source_no_greater=="17397-3":
|
|
hard_level=2
|
|
hard_level=2
|
|
|
|
|
|
- return check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=b_log,hard_level=hard_level)
|
|
|
|
|
|
+ _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=b_log,hard_level=hard_level)
|
|
|
|
+
|
|
|
|
+ pagetime_stamp_less = getTimeStamp(page_time_less)
|
|
|
|
+ pagetime_stamp_greater = getTimeStamp(page_time_greater)
|
|
|
|
+
|
|
|
|
+ day_dis = abs(pagetime_stamp_greater-pagetime_stamp_less)//86400
|
|
|
|
+ if day_dis>7:
|
|
|
|
+ _prob = 0
|
|
|
|
+ elif day_dis>3:
|
|
|
|
+ if _prob<0.4:
|
|
|
|
+ _prob = 0
|
|
|
|
+
|
|
|
|
+ return _prob,day_dis
|
|
|
|
|
|
|
|
|
|
def dumplicate_check_bak(self,_dict1,_dict2,min_counts,b_log=False):
|
|
def dumplicate_check_bak(self,_dict1,_dict2,min_counts,b_log=False):
|
|
@@ -2525,7 +2542,7 @@ class Dataflow_dumplicate(Dataflow):
|
|
return 0
|
|
return 0
|
|
return _prob
|
|
return _prob
|
|
|
|
|
|
- def search_data_by_query(self,item,_query,confidence,retry_times=3,merge=False,table_name="document_tmp",table_index="document_tmp_index",sort_column="docid",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count]):
|
|
|
|
|
|
+ def search_data_by_query(self,item,_query,confidence,retry_times=3,merge=False,table_name="document_tmp",table_index="document_tmp_index",sort_column="docid",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count],b_log=False):
|
|
|
|
|
|
for _ in range(retry_times):
|
|
for _ in range(retry_times):
|
|
try:
|
|
try:
|
|
@@ -2548,14 +2565,13 @@ class Dataflow_dumplicate(Dataflow):
|
|
else:
|
|
else:
|
|
if _docid!=item.get(document_tmp_docid):
|
|
if _docid!=item.get(document_tmp_docid):
|
|
_time1 = time.time()
|
|
_time1 = time.time()
|
|
- confidence = self.dumplicate_check(item,_dict,total_count,b_log=False)
|
|
|
|
|
|
+ confidence,day_dis = self.dumplicate_check(item,_dict,total_count,b_log=b_log)
|
|
check_time+= time.time()-_time1
|
|
check_time+= time.time()-_time1
|
|
|
|
|
|
_dict["confidence"] = confidence
|
|
_dict["confidence"] = confidence
|
|
_dict["min_counts"] = total_count
|
|
_dict["min_counts"] = total_count
|
|
|
|
|
|
- if not confidence<0.1:
|
|
|
|
- list_data.append(_dict)
|
|
|
|
|
|
+ list_data.append(_dict)
|
|
all_time = time.time()-_time
|
|
all_time = time.time()-_time
|
|
# log("check:%d rows takes%.4f,check%.4f"%(len(list_dict),all_time-check_time,check_time))
|
|
# log("check:%d rows takes%.4f,check%.4f"%(len(list_dict),all_time-check_time,check_time))
|
|
return list_data
|
|
return list_data
|
|
@@ -2563,12 +2579,15 @@ class Dataflow_dumplicate(Dataflow):
|
|
traceback.print_exc()
|
|
traceback.print_exc()
|
|
return []
|
|
return []
|
|
|
|
|
|
- def add_data_by_query(self,item,base_list,set_docid,_query,confidence,table_name,table_index,singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_save,document_tmp_status,document_tmp_product,document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count]):
|
|
|
|
- list_dict = self.search_data_by_query(item,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,notlike_keys=notlike_keys,columns=columns)
|
|
|
|
|
|
+ def add_data_by_query(self,item,base_list,set_docid,_query,confidence,table_name,table_index,singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_save,document_tmp_status,document_tmp_product,document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count],b_log=False):
|
|
|
|
+ list_dict = self.search_data_by_query(item,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,notlike_keys=notlike_keys,columns=columns,b_log=b_log)
|
|
for _dict in list_dict:
|
|
for _dict in list_dict:
|
|
_docid = _dict.get(document_tmp_docid)
|
|
_docid = _dict.get(document_tmp_docid)
|
|
confidence = _dict["confidence"]
|
|
confidence = _dict["confidence"]
|
|
- print("confidence",_docid,confidence)
|
|
|
|
|
|
+
|
|
|
|
+ if b_log:
|
|
|
|
+ log("confidence %d %.3f"%(_docid,confidence))
|
|
|
|
+
|
|
if confidence>0.1:
|
|
if confidence>0.1:
|
|
if _docid not in set_docid:
|
|
if _docid not in set_docid:
|
|
base_list.append(_dict)
|
|
base_list.append(_dict)
|
|
@@ -2581,32 +2600,36 @@ class Dataflow_dumplicate(Dataflow):
|
|
return
|
|
return
|
|
_dict.update(base_dict)
|
|
_dict.update(base_dict)
|
|
if b_log:
|
|
if b_log:
|
|
- log(str(_dict))
|
|
|
|
|
|
+ log("rule dict:"+str(_dict))
|
|
_query = self.generate_dumplicate_query(_dict,must_not_dict)
|
|
_query = self.generate_dumplicate_query(_dict,must_not_dict)
|
|
_rule = {"confidence":confidence,
|
|
_rule = {"confidence":confidence,
|
|
"item":item,
|
|
"item":item,
|
|
"query":_query,
|
|
"query":_query,
|
|
"singleNum_keys":[],
|
|
"singleNum_keys":[],
|
|
"contain_keys":[],
|
|
"contain_keys":[],
|
|
- "multiNum_keys":[]}
|
|
|
|
|
|
+ "multiNum_keys":[],
|
|
|
|
+ "_dict":_dict}
|
|
list_rules.append(_rule)
|
|
list_rules.append(_rule)
|
|
|
|
|
|
- def translate_dumplicate_rules(self,status_from,item,get_all=False,to_log=False):
|
|
|
|
|
|
+ def translate_dumplicate_rules(self,status_from,item,get_all=False,to_log=False,day_dis=7):
|
|
docchannel,project_code,project_name,tenderee,agency,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product = self.get_dump_columns(item)
|
|
docchannel,project_code,project_name,tenderee,agency,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product = self.get_dump_columns(item)
|
|
current_date = getCurrent_date("%Y-%m-%d")
|
|
current_date = getCurrent_date("%Y-%m-%d")
|
|
if page_time=='':
|
|
if page_time=='':
|
|
page_time = current_date
|
|
page_time = current_date
|
|
|
|
|
|
|
|
+ two_day_dict = {"page_time":[timeAdd(page_time,-2),timeAdd(page_time,2)]}
|
|
|
|
+
|
|
if page_time>=timeAdd(current_date,-2):
|
|
if page_time>=timeAdd(current_date,-2):
|
|
table_name = "document_tmp"
|
|
table_name = "document_tmp"
|
|
table_index = "document_tmp_index"
|
|
table_index = "document_tmp_index"
|
|
base_dict = {
|
|
base_dict = {
|
|
"docchannel":item.get("docchannel",52),
|
|
"docchannel":item.get("docchannel",52),
|
|
"status":[status_from[0]],
|
|
"status":[status_from[0]],
|
|
- "page_time":[timeAdd(page_time,-2),timeAdd(page_time,2)]
|
|
|
|
|
|
+ "page_time":[timeAdd(page_time,-day_dis),timeAdd(page_time,day_dis)]
|
|
}
|
|
}
|
|
must_not_dict = {"save":0,"docid":item.get("docid")}
|
|
must_not_dict = {"save":0,"docid":item.get("docid")}
|
|
doctitle_refine_name = "doctitle_refine"
|
|
doctitle_refine_name = "doctitle_refine"
|
|
|
|
+
|
|
else:
|
|
else:
|
|
table_name = "document"
|
|
table_name = "document"
|
|
table_index = "document_index"
|
|
table_index = "document_index"
|
|
@@ -2617,7 +2640,7 @@ class Dataflow_dumplicate(Dataflow):
|
|
base_dict = {
|
|
base_dict = {
|
|
"docchannel":item["docchannel"],
|
|
"docchannel":item["docchannel"],
|
|
"status":_status,
|
|
"status":_status,
|
|
- "page_time":[timeAdd(page_time,-2),timeAdd(page_time,2)]
|
|
|
|
|
|
+ "page_time":[timeAdd(page_time,-day_dis),timeAdd(page_time,day_dis)]
|
|
}
|
|
}
|
|
must_not_dict = {"docid":item.get("docid")}
|
|
must_not_dict = {"docid":item.get("docid")}
|
|
doctitle_refine_name = "doctitle"
|
|
doctitle_refine_name = "doctitle"
|
|
@@ -2673,82 +2696,95 @@ class Dataflow_dumplicate(Dataflow):
|
|
"bidding_budget":bidding_budget}
|
|
"bidding_budget":bidding_budget}
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
- confidence=85
|
|
|
|
- _dict = {"tenderee":tenderee,
|
|
|
|
- "agency":agency
|
|
|
|
- }
|
|
|
|
- self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
_dict = {"tenderee":tenderee,
|
|
_dict = {"tenderee":tenderee,
|
|
"project_codes":project_code
|
|
"project_codes":project_code
|
|
}
|
|
}
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
+
|
|
_dict = {"tenderee":tenderee,
|
|
_dict = {"tenderee":tenderee,
|
|
- "project_name":project_name
|
|
|
|
|
|
+ "win_bid_price":win_bid_price
|
|
}
|
|
}
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
- if getLength(product)>0:
|
|
|
|
- l_p = product.split(",")
|
|
|
|
- _dict = {"tenderee":tenderee,
|
|
|
|
- "product":l_p[0]
|
|
|
|
- }
|
|
|
|
- self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
+ _dict = {"agency":agency,
|
|
|
|
+ "project_codes":project_code
|
|
|
|
+ }
|
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
- _dict = {"tenderee":tenderee,
|
|
|
|
- "win_tenderer":win_tenderer
|
|
|
|
|
|
+ _dict = {"win_tenderer":win_tenderer,
|
|
|
|
+ "bidding_budget":bidding_budget
|
|
}
|
|
}
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
- _dict = {"tenderee":tenderee,
|
|
|
|
|
|
+ _dict = {"project_codes":project_code,
|
|
"win_bid_price":win_bid_price
|
|
"win_bid_price":win_bid_price
|
|
}
|
|
}
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
- _dict = {"tenderee":tenderee,
|
|
|
|
|
|
+ _dict = {"project_codes":project_code,
|
|
"bidding_budget":bidding_budget
|
|
"bidding_budget":bidding_budget
|
|
}
|
|
}
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
- _dict = {"tenderee":tenderee,
|
|
|
|
|
|
+ _dict = {"project_codes":project_code,
|
|
doctitle_refine_name:doctitle_refine
|
|
doctitle_refine_name:doctitle_refine
|
|
}
|
|
}
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
- _dict = {"agency":agency,
|
|
|
|
- "project_codes":project_code
|
|
|
|
|
|
+ _dict = {"tenderee":tenderee,
|
|
|
|
+ "bidding_budget":bidding_budget
|
|
}
|
|
}
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
- _dict = {"agency":agency,
|
|
|
|
- "project_name":project_name
|
|
|
|
|
|
+ _dict = {"project_codes":project_code,
|
|
|
|
+ "win_tenderer":win_tenderer
|
|
}
|
|
}
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
- _dict = {"project_codes":project_code,
|
|
|
|
|
|
+ base_dict.update(two_day_dict)
|
|
|
|
+
|
|
|
|
+ confidence=85
|
|
|
|
+ _dict = {"tenderee":tenderee,
|
|
|
|
+ "agency":agency
|
|
|
|
+ }
|
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
+
|
|
|
|
+ _dict = {"tenderee":tenderee,
|
|
"project_name":project_name
|
|
"project_name":project_name
|
|
}
|
|
}
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
- _dict = {"project_codes":project_code,
|
|
|
|
|
|
+ if getLength(product)>0:
|
|
|
|
+ l_p = product.split(",")
|
|
|
|
+ _dict = {"tenderee":tenderee,
|
|
|
|
+ "product":l_p[0]
|
|
|
|
+ }
|
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
+
|
|
|
|
+ _dict = {"tenderee":tenderee,
|
|
"win_tenderer":win_tenderer
|
|
"win_tenderer":win_tenderer
|
|
}
|
|
}
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
- _dict = {"project_codes":project_code,
|
|
|
|
- "win_bid_price":win_bid_price
|
|
|
|
|
|
+
|
|
|
|
+ _dict = {"tenderee":tenderee,
|
|
|
|
+ doctitle_refine_name:doctitle_refine
|
|
}
|
|
}
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
- _dict = {"project_codes":project_code,
|
|
|
|
- "bidding_budget":bidding_budget
|
|
|
|
|
|
+
|
|
|
|
+ _dict = {"agency":agency,
|
|
|
|
+ "project_name":project_name
|
|
}
|
|
}
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
_dict = {"project_codes":project_code,
|
|
_dict = {"project_codes":project_code,
|
|
- doctitle_refine_name:doctitle_refine
|
|
|
|
|
|
+ "project_name":project_name
|
|
}
|
|
}
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
|
|
+
|
|
|
|
+
|
|
_dict = {"project_name":project_name,
|
|
_dict = {"project_name":project_name,
|
|
"win_tenderer":win_tenderer
|
|
"win_tenderer":win_tenderer
|
|
}
|
|
}
|
|
@@ -2774,10 +2810,6 @@ class Dataflow_dumplicate(Dataflow):
|
|
}
|
|
}
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
- _dict = {"win_tenderer":win_tenderer,
|
|
|
|
- "bidding_budget":bidding_budget
|
|
|
|
- }
|
|
|
|
- self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
|
|
_dict = {"win_tenderer":win_tenderer,
|
|
_dict = {"win_tenderer":win_tenderer,
|
|
doctitle_refine_name:doctitle_refine
|
|
doctitle_refine_name:doctitle_refine
|
|
@@ -2789,6 +2821,11 @@ class Dataflow_dumplicate(Dataflow):
|
|
}
|
|
}
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
|
|
+ confidence=80
|
|
|
|
+ _dict = {"project_codes":project_code}
|
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
+
|
|
|
|
+
|
|
_dict = {"win_bid_price":win_bid_price,
|
|
_dict = {"win_bid_price":win_bid_price,
|
|
doctitle_refine_name:doctitle_refine
|
|
doctitle_refine_name:doctitle_refine
|
|
}
|
|
}
|
|
@@ -2802,13 +2839,13 @@ class Dataflow_dumplicate(Dataflow):
|
|
confidence=80
|
|
confidence=80
|
|
_dict = {doctitle_refine_name:doctitle_refine}
|
|
_dict = {doctitle_refine_name:doctitle_refine}
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
- _dict = {"project_codes":project_code}
|
|
|
|
- self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
+
|
|
|
|
|
|
confidence=70
|
|
confidence=70
|
|
_dict = {"project_name":project_name}
|
|
_dict = {"project_name":project_name}
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
|
|
|
|
|
|
|
|
+
|
|
return list_rules,table_name,table_index
|
|
return list_rules,table_name,table_index
|
|
|
|
|
|
def producer_flow_dumplicate(self,process_count,status_from,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district]):
|
|
def producer_flow_dumplicate(self,process_count,status_from,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district]):
|
|
@@ -3427,7 +3464,7 @@ class Dataflow_dumplicate(Dataflow):
|
|
:param list_docids:
|
|
:param list_docids:
|
|
:return:
|
|
:return:
|
|
'''
|
|
'''
|
|
- print("==",list_docids)
|
|
|
|
|
|
+ log("search_projects_with_document %s"%str(list_docids))
|
|
list_should_q = []
|
|
list_should_q = []
|
|
for _docid in list_docids:
|
|
for _docid in list_docids:
|
|
list_should_q.append(TermQuery("docids",_docid))
|
|
list_should_q.append(TermQuery("docids",_docid))
|
|
@@ -3842,10 +3879,14 @@ class Dataflow_dumplicate(Dataflow):
|
|
list_docids = []
|
|
list_docids = []
|
|
_docid = item.get(document_tmp_docid)
|
|
_docid = item.get(document_tmp_docid)
|
|
list_docids.append(_docid)
|
|
list_docids.append(_docid)
|
|
|
|
+ if save==0:
|
|
|
|
+ dup_docid.insert(0,_docid)
|
|
if isinstance(dup_docid,list):
|
|
if isinstance(dup_docid,list):
|
|
list_docids.extend(dup_docid)
|
|
list_docids.extend(dup_docid)
|
|
list_docids = [a for a in list_docids if a is not None]
|
|
list_docids = [a for a in list_docids if a is not None]
|
|
|
|
|
|
|
|
+
|
|
|
|
+
|
|
_time = time.time()
|
|
_time = time.time()
|
|
list_projects = self.search_projects_with_document(list_docids)
|
|
list_projects = self.search_projects_with_document(list_docids)
|
|
# log("search projects takes:%.3f"%(time.time()-_time))
|
|
# log("search projects takes:%.3f"%(time.time()-_time))
|
|
@@ -3872,6 +3913,9 @@ class Dataflow_dumplicate(Dataflow):
|
|
list_merge_dump = dumplicate_document_in_merge(list_projects,dup_docid[:-1])
|
|
list_merge_dump = dumplicate_document_in_merge(list_projects,dup_docid[:-1])
|
|
# log("dumplicate document %d takes:%.3f"%(len(list_projects),time.time()-_time))
|
|
# log("dumplicate document %d takes:%.3f"%(len(list_projects),time.time()-_time))
|
|
|
|
|
|
|
|
+ if list_merge_dump is None:
|
|
|
|
+ list_projects = []
|
|
|
|
+
|
|
_time = time.time()
|
|
_time = time.time()
|
|
project_json = to_project_json(list_projects)
|
|
project_json = to_project_json(list_projects)
|
|
# log("json projects takes:%.3f"%(time.time()-_time))
|
|
# log("json projects takes:%.3f"%(time.time()-_time))
|
|
@@ -3924,6 +3968,7 @@ class Dataflow_dumplicate(Dataflow):
|
|
def dumplicate_comsumer_handle(self,item,result_queue,ots_client,get_all=False,upgrade=True):
|
|
def dumplicate_comsumer_handle(self,item,result_queue,ots_client,get_all=False,upgrade=True):
|
|
try:
|
|
try:
|
|
start_time = time.time()
|
|
start_time = time.time()
|
|
|
|
+ b_log = False if upgrade else True
|
|
self.post_extract(item)
|
|
self.post_extract(item)
|
|
|
|
|
|
|
|
|
|
@@ -3931,14 +3976,15 @@ class Dataflow_dumplicate(Dataflow):
|
|
base_list = []
|
|
base_list = []
|
|
set_docid = set()
|
|
set_docid = set()
|
|
|
|
|
|
- list_rules,table_name,table_index = self.translate_dumplicate_rules(flow_dumplicate_status_from,item,get_all=get_all,to_log=False)
|
|
|
|
|
|
+ list_rules,table_name,table_index = self.translate_dumplicate_rules(flow_dumplicate_status_from,item,get_all=get_all,to_log=b_log)
|
|
# print("len_rules",len(list_rules),table_name,table_index)
|
|
# print("len_rules",len(list_rules),table_name,table_index)
|
|
list_rules.sort(key=lambda x:x["confidence"],reverse=True)
|
|
list_rules.sort(key=lambda x:x["confidence"],reverse=True)
|
|
|
|
+
|
|
|
|
+ log("dumplicate %s rules:%d"%(str(item.get(document_tmp_docid)),len(list_rules)))
|
|
list_rules = list_rules[:30]
|
|
list_rules = list_rules[:30]
|
|
_i = 0
|
|
_i = 0
|
|
step = 5
|
|
step = 5
|
|
|
|
|
|
- print("here 1")
|
|
|
|
|
|
|
|
item["confidence"] = 999
|
|
item["confidence"] = 999
|
|
if item.get(document_tmp_docid) not in set_docid:
|
|
if item.get(document_tmp_docid) not in set_docid:
|
|
@@ -3951,17 +3997,17 @@ class Dataflow_dumplicate(Dataflow):
|
|
must_not_q = [TermQuery("docid",a) for a in list(set_docid)[-100:]]
|
|
must_not_q = [TermQuery("docid",a) for a in list(set_docid)[-100:]]
|
|
_query = BoolQuery(should_queries=[_rule["query"] for _rule in list_rules[_i:_i+step]],
|
|
_query = BoolQuery(should_queries=[_rule["query"] for _rule in list_rules[_i:_i+step]],
|
|
must_not_queries=must_not_q)
|
|
must_not_queries=must_not_q)
|
|
|
|
+
|
|
_rule = list_rules[_i]
|
|
_rule = list_rules[_i]
|
|
confidence = _rule["confidence"]
|
|
confidence = _rule["confidence"]
|
|
singleNum_keys = _rule["singleNum_keys"]
|
|
singleNum_keys = _rule["singleNum_keys"]
|
|
contain_keys = _rule["contain_keys"]
|
|
contain_keys = _rule["contain_keys"]
|
|
multiNum_keys = _rule["multiNum_keys"]
|
|
multiNum_keys = _rule["multiNum_keys"]
|
|
- self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle])
|
|
|
|
|
|
+ self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle],b_log=b_log)
|
|
_i += step
|
|
_i += step
|
|
|
|
|
|
|
|
|
|
- print("here 2")
|
|
|
|
- b_log = False if upgrade else True
|
|
|
|
|
|
+
|
|
_time = time.time()
|
|
_time = time.time()
|
|
# log("%d start final check with length:%d"%(item["docid"],len(base_list)))
|
|
# log("%d start final check with length:%d"%(item["docid"],len(base_list)))
|
|
final_list = self.dumplicate_fianl_check(base_list,b_log)
|
|
final_list = self.dumplicate_fianl_check(base_list,b_log)
|
|
@@ -4125,26 +4171,27 @@ class Dataflow_dumplicate(Dataflow):
|
|
schedule.start()
|
|
schedule.start()
|
|
|
|
|
|
def changeSaveStatus(self,list_dict):
|
|
def changeSaveStatus(self,list_dict):
|
|
- for _dict in list_dict:
|
|
|
|
- if isinstance(_dict,dict):
|
|
|
|
- if _dict.get(document_tmp_save,1)==1:
|
|
|
|
- _d = {"partitionkey":_dict["partitionkey"],
|
|
|
|
- "docid":_dict["docid"],
|
|
|
|
|
|
+ if list_dict is not None:
|
|
|
|
+ for _dict in list_dict:
|
|
|
|
+ if isinstance(_dict,dict):
|
|
|
|
+ if _dict.get(document_tmp_save,1)==1:
|
|
|
|
+ _d = {"partitionkey":_dict["partitionkey"],
|
|
|
|
+ "docid":_dict["docid"],
|
|
|
|
+ document_tmp_save:0
|
|
|
|
+ }
|
|
|
|
+ _d_tmp = Document_tmp(_d)
|
|
|
|
+ if _d_tmp.exists_row(self.ots_client):
|
|
|
|
+ _d_tmp.update_row(self.ots_client)
|
|
|
|
+ elif isinstance(_dict,int):
|
|
|
|
+ _d = {"partitionkey":_dict%500+1,
|
|
|
|
+ "docid":_dict,
|
|
document_tmp_save:0
|
|
document_tmp_save:0
|
|
}
|
|
}
|
|
_d_tmp = Document_tmp(_d)
|
|
_d_tmp = Document_tmp(_d)
|
|
- if _d_tmp.exists_row(self.ots_client):
|
|
|
|
- _d_tmp.update_row(self.ots_client)
|
|
|
|
- elif isinstance(_dict,int):
|
|
|
|
- _d = {"partitionkey":_dict%500+1,
|
|
|
|
- "docid":_dict,
|
|
|
|
- document_tmp_save:0
|
|
|
|
- }
|
|
|
|
- _d_tmp = Document_tmp(_d)
|
|
|
|
- if _d_tmp.fix_columns(self.ots_client,["status"],True):
|
|
|
|
- if _d_tmp.getProperties().get("status")==1:
|
|
|
|
- _d_tmp.setValue("status",0,True)
|
|
|
|
- _d_tmp.update_row(self.ots_client)
|
|
|
|
|
|
+ if _d_tmp.fix_columns(self.ots_client,["status"],True):
|
|
|
|
+ if _d_tmp.getProperties().get("status")==1:
|
|
|
|
+ _d_tmp.setValue("status",0,True)
|
|
|
|
+ _d_tmp.update_row(self.ots_client)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -4165,7 +4212,7 @@ class Dataflow_dumplicate(Dataflow):
|
|
list_dict = getRow_ots(rows)
|
|
list_dict = getRow_ots(rows)
|
|
|
|
|
|
for item in list_dict:
|
|
for item in list_dict:
|
|
- self.dumplicate_comsumer_handle(item,None,self.ots_client,get_all=True,upgrade=False)
|
|
|
|
|
|
+ self.dumplicate_comsumer_handle(item,None,self.ots_client,get_all=False,upgrade=False)
|
|
return
|
|
return
|
|
|
|
|
|
def test_merge(self,list_docid_less,list_docid_greater):
|
|
def test_merge(self,list_docid_less,list_docid_greater):
|
|
@@ -4252,10 +4299,10 @@ if __name__ == '__main__':
|
|
df_dump = Dataflow_dumplicate(start_delete_listener=False)
|
|
df_dump = Dataflow_dumplicate(start_delete_listener=False)
|
|
# df_dump.start_flow_dumplicate()
|
|
# df_dump.start_flow_dumplicate()
|
|
a = time.time()
|
|
a = time.time()
|
|
- df_dump.test_dumplicate(397656324
|
|
|
|
|
|
+ df_dump.test_dumplicate(400929607
|
|
)
|
|
)
|
|
- # df_dump.test_merge([385521167
|
|
|
|
- # ],[385521113])
|
|
|
|
|
|
+ # df_dump.test_merge([242672995,235300429,240009762
|
|
|
|
+ # ],[243240169,])
|
|
# df_dump.flow_remove_project_tmp()
|
|
# df_dump.flow_remove_project_tmp()
|
|
print("takes",time.time()-a)
|
|
print("takes",time.time()-a)
|
|
# df_dump.fix_doc_which_not_in_project()
|
|
# df_dump.fix_doc_which_not_in_project()
|