@@ -7,6 +7,7 @@ from BaseDataMaintenance.dataSource.source import getConnect_ots,getConnect_ots_
 from tablestore import *
 from BaseDataMaintenance.common.Utils import *
 from BaseDataMaintenance.common.multiThread import MultiThreadHandler
+from BaseDataMaintenance.common.multiProcess import MultiProcessHandler
 from queue import Queue
 
 from BaseDataMaintenance.model.ots.document_tmp import *
@@ -25,6 +26,7 @@ from apscheduler.schedulers.blocking import BlockingScheduler
 from BaseDataMaintenance.maintenance.dataflow_settings import *
 from threading import Thread
 import oss2
+from BaseDataMaintenance.maintenance.documentDumplicate import *
 
 def getSet(list_dict,key):
     _set = set()
@@ -150,7 +152,6 @@ class Dataflow():
                 for _dict in list_dict:
                     self.queue_init.put(_dict)
                 _count += len(list_dict)
-                print("%d/%d"%(_count,total_count))
         def comsumer():
             mt = MultiThreadHandler(self.queue_init,comsumer_handle,None,30,1,ots_client=self.ots_client)
             mt.run()
@@ -336,9 +337,9 @@ class Dataflow():
 
 
 
-    def generate_dumplicate_query(self,_dict,_dict_must_not,set_match=set(["project_code","product"]),set_nested=set(["win_tenderer","bidding_budget","win_bid_price"]),
-                                  set_term=set(["project_name","doctitle_refine","docchannel","tenderee","agency","web_source_no","fingerprint","save"]),
-                                  set_range=set(["page_time","status"])):
+    def generate_dumplicate_query(self,_dict,_dict_must_not,set_match=set(["project_code","project_codes","product"]),set_nested=set(["win_tenderer","bidding_budget","win_bid_price"]),
+                                  set_term=set(["project_name","doctitle_refine","docchannel","tenderee","agency","web_source_no","fingerprint","save","docid"]),
+                                  set_range=set(["page_time","status"]),set_phrase=set(["doctitle"])):
         list_must_queries = []
         list_must_no_queries = []
         for k,v in _dict.items():
@@ -355,6 +356,8 @@ class Dataflow():
                     list_must_queries.append(NestedQuery("sub_docs_json",TermQuery("sub_docs_json.%s"%k,_v)))
             elif k in set_term:
                 list_must_queries.append(TermQuery(k,v))
+            elif k in set_phrase:
+                list_must_queries.append(MatchPhraseQuery(k,v))
             elif k in set_range:
                 if len(v)==1:
                     list_must_queries.append(RangeQuery(k,v[0]))
@@ -394,7 +397,11 @@ class Dataflow():
             if agency is not None and agency!="":
                 extract_count += 1
             if sub_docs_json is not None:
-                for sub_docs in json.loads(sub_docs_json):
+                list_sub_docs = json.loads(sub_docs_json)
+                list_sub_docs.sort(key=lambda x:x.get("bidding_budget",0),reverse=True)
+                list_sub_docs.sort(key=lambda x:x.get("win_bid_price",0),reverse=True)
+                # log("==%s"%(str(list_sub_docs)))
+                for sub_docs in list_sub_docs:
                     for _key_sub_docs in sub_docs.keys():
                         extract_count += 1
                         if _key_sub_docs in columns:
@@ -1251,12 +1258,10 @@ class Dataflow():
                     _dict["save"] = 1
                 else:
                     _dict["save"] = 0
-                print(item)
                 if item.get("status")>=status_from[0] and item.get("status")<=status_from[1]:
                     _dict["status"] = random.randint(status_to[0],status_to[1])
                 list_dict.append(_dict)
             for _dict in list_dict:
-                print(_dict)
                 dtmp = Document_tmp(_dict)
                 dtmp.update_row(self.ots_client)
 
@@ -1299,7 +1304,6 @@ class Dataflow():
 
         def comsumer_handle(item,result_queue,ots_client):
             # print(item)
-            print("docid",item.get(document_tmp_docid))
             dtmp = Document_tmp(item)
 
             dtmp.setValue(document_tmp_status,random.randint(*status_to),True)
@@ -1373,19 +1377,26 @@ class Dataflow():
                  }
            dtmp = Document_tmp(_d)
 
+            dup_docid = set()
+            for _dict in final_list:
+                dup_docid.add(_dict.get(document_tmp_docid))
+            if item.get(document_tmp_docid) in dup_docid:
+                dup_docid.remove(item.get(document_tmp_docid))
 
            if len(final_list)==0 or best_docid==item.get(document_tmp_docid):
                dtmp.setValue(document_tmp_save,1,True)
                dtmp.setValue(document_tmp_merge_uuid,self.merge_document(item,flow_dumplicate_status_to),True)
+                dmp_docid = ",".join([str(a) for a in list(dup_docid)])
            else:
                dtmp.setValue(document_tmp_save,0,True)
+                if best_docid in dup_docid:
+                    dup_docid.remove(best_docid)
+                    dmp_docid = ",".join([str(a) for a in list(dup_docid)])
+                    dmp_docid = "%d,%s"%(best_docid,dmp_docid)
+                else:
+                    dmp_docid = ",".join([str(a) for a in list(dup_docid)])
+
 
-            dup_docid = set()
-            for _dict in final_list:
-                dup_docid.add(_dict.get(document_tmp_docid))
-            if item.get(document_tmp_docid) in dup_docid:
-                dup_docid.remove(item.get(document_tmp_docid))
-            dmp_docid = ",".join([str(a) for a in list(dup_docid)])
            dtmp.setValue(document_tmp_dup_docid,dmp_docid,True)
            dtmp.update_row(self.ots_client)
@@ -1422,7 +1433,7 @@ class Dataflow():
                                          TermQuery("project_name",project_name)])
                 list_should_q.append(_q)
             if len(list_should_q)>0:
-                list_data = self.search_data_by_query(item,list_should_q,100,table_name="project2",table_index="project2_index_formerge",sort_column="tenderee",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=["tenderee","win_tenderer"])
+                list_data = self.search_data_by_query(item,list_should_q,100,merge=True,table_name="project2",table_index="project2_index_formerge",sort_column="tenderee",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=["tenderee","win_tenderer"])
 
                 if len(list_data)==1:
                     dtmp.setValue("merge_uuid",list_data[0]["uuid"],True)
@@ -2057,18 +2068,778 @@ class Dataflow_extract(Dataflow):
         schedule.add_job(self.flow_extract,"cron",second="*/10")
         schedule.start()
 
+class Dataflow_dumplicate(Dataflow):
+
+    def __init__(self):
+        Dataflow.__init__(self)
+        self.c_f_get_extractCount = f_get_extractCount()
+        self.c_f_get_package = f_get_package()
+        logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def get_dict_time(self,_extract,keys=["time_bidclose","time_bidopen","time_bidstart","time_commencement","time_completion","time_earnestMoneyEnd","time_earnestMoneyStart","time_getFileEnd","time_getFileStart","time_publicityEnd","time_publicityStart","time_registrationEnd","time_registrationStart","time_release"]):
+        dict_time = {}
+        for k in keys:
+            dict_time[k] = _extract.get(k)
+        return dict_time
+
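+    # Flatten the extraction payload onto the row dict: decode sub_docs_json
+    # into win_tenderer/bidding_budget/win_bid_price and pull product,
+    # fingerprint, project_codes, package, extract_count and the time fields
+    # out of extract_json so the dedup rules can query them as plain columns.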
+    def post_extract(self,_dict):
+        win_tenderer,bidding_budget,win_bid_price,_ = self.f_decode_sub_docs_json(_dict.get(document_tmp_project_code),_dict.get(document_tmp_project_name),_dict.get(document_tmp_tenderee),_dict.get(document_tmp_agency),_dict.get(document_tmp_sub_docs_json))
+        _dict["win_tenderer"] = win_tenderer
+        _dict["bidding_budget"] = bidding_budget
+        _dict["win_bid_price"] = win_bid_price
+        extract_json = _dict.get(document_tmp_extract_json,"{}")
+        _extract = json.loads(extract_json)
+        _dict["product"] = ",".join(_extract.get("product",[]))
+        _dict["fingerprint"] = _extract.get("fingerprint","")
+        _dict["project_codes"] = _extract.get("code",[])
+        if len(_dict["project_codes"])>0:
+            _dict["project_code"] = _dict["project_codes"][0]
+        else:
+            _dict["project_code"] = ""
+        _dict["doctitle_refine"] = _extract.get("doctitle_refine","")
+        _dict["nlp_enterprise"] = str({"indoctextcon":_extract.get("nlp_enterprise",[]),
+                                       "notindoctextcon":_extract.get("nlp_enterprise_attachment",[])})
+        _dict["extract_count"] = self.c_f_get_extractCount.evaluate(extract_json)
+        _dict["package"] = self.c_f_get_package.evaluate(extract_json)
+        _dict["project_name"] = _extract.get("name","")
+        _dict["dict_time"] = self.get_dict_time(_extract)
+
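+    # Final screen of the candidate group: sorted by confidence, keep the
+    # longest prefix whose members still check out pairwise (each candidate is
+    # re-checked against up to 5 higher-ranked ones; a single score<=0.1 cuts
+    # the group off there).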
+    def dumplicate_fianl_check(self,base_list):
+        the_group = base_list
+        the_group.sort(key=lambda x:x["confidence"],reverse=True)
+
+        _index = 0
+        base_fingerprint = "None"
+        if len(base_list)>0:
+            base_fingerprint = base_list[0]["fingerprint"]
+        for _i in range(1,len(base_list)):
+            _dict1 = base_list[_i]
+            fingerprint_less = _dict1["fingerprint"]
+            _pass = True
+            if fingerprint_less==base_fingerprint:
+                _index = _i
+                continue
+            for _j in range(min(_i,5)):
+                _dict2 = base_list[_j]
+                _prob = self.dumplicate_check(_dict1,_dict2,_dict2.get("min_counts",10),b_log=True)
+                # print("_prob:",_prob)
+                if _prob<=0.1:
+                    _pass = False
+                    break
+            log("checking index:%d"%(_i))
+            _index = _i
+            if not _pass:
+                _index -= 1
+                break
+
+        if _index>=1:
+            return the_group[:_index+1]
+        return []
+
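+    # Pairwise duplicate probability. Equal fingerprints short-circuit to 1;
+    # otherwise a base probability (higher when the candidate set is small) is
+    # scaled by how many of 8 key fields agree, and the rule checks below
+    # (doctitle/codes/product/entity/money/package/time) can veto the match.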
+    def dumplicate_check(self,_dict1,_dict2,min_counts,b_log=False):
+        document_less = _dict1
+        docid_less = _dict1["docid"]
+        docchannel_less = document_less["docchannel"]
+        page_time_less = document_less["page_time"]
+        doctitle_refine_less = document_less["doctitle_refine"]
+        project_codes_less = document_less["project_codes"]
+        nlp_enterprise_less = document_less["nlp_enterprise"]
+        tenderee_less = document_less["tenderee"]
+        agency_less = document_less["agency"]
+        win_tenderer_less = document_less["win_tenderer"]
+        bidding_budget_less = document_less["bidding_budget"]
+        win_bid_price_less = document_less["win_bid_price"]
+        product_less = document_less["product"]
+        package_less = document_less["package"]
+        json_time_less = document_less["dict_time"]
+        project_name_less = document_less["project_name"]
+        fingerprint_less = document_less["fingerprint"]
+        extract_count_less = document_less["extract_count"]
+
+        document_greater = _dict2
+        docid_greater = _dict2["docid"]
+        page_time_greater = document_greater["page_time"]
+        doctitle_refine_greater = document_greater["doctitle_refine"]
+        project_codes_greater = document_greater["project_codes"]
+        nlp_enterprise_greater = document_greater["nlp_enterprise"]
+        tenderee_greater = document_greater["tenderee"]
+        agency_greater = document_greater["agency"]
+        win_tenderer_greater = document_greater["win_tenderer"]
+        bidding_budget_greater = document_greater["bidding_budget"]
+        win_bid_price_greater = document_greater["win_bid_price"]
+        product_greater = document_greater["product"]
+        package_greater = document_greater["package"]
+        json_time_greater = document_greater["dict_time"]
+        project_name_greater = document_greater["project_name"]
+        fingerprint_greater = document_greater["fingerprint"]
+        extract_count_greater = document_greater["extract_count"]
+
+        if fingerprint_less==fingerprint_greater:
+            return 1
+
+        same_count = 0
+        all_count = 8
+        if len(set(project_codes_less) & set(project_codes_greater))>0:
+            same_count += 1
+        if getLength(tenderee_less)>0 and tenderee_less==tenderee_greater:
+            same_count += 1
+        if getLength(agency_less)>0 and agency_less==agency_greater:
+            same_count += 1
+        if getLength(win_tenderer_less)>0 and win_tenderer_less==win_tenderer_greater:
+            same_count += 1
+        if getLength(bidding_budget_less)>0 and bidding_budget_less==bidding_budget_greater:
+            same_count += 1
+        if getLength(win_bid_price_less)>0 and win_bid_price_less==win_bid_price_greater:
+            same_count += 1
+        if getLength(project_name_less)>0 and project_name_less==project_name_greater:
+            same_count += 1
+        if getLength(doctitle_refine_less)>0 and doctitle_refine_less==doctitle_refine_greater:
+            same_count += 1
+        base_prob = 0
+        if min_counts<3:
+            base_prob = 0.9
+        elif min_counts<5:
+            base_prob = 0.8
+        elif min_counts<8:
+            base_prob = 0.7
+        else:
+            base_prob = 0.6
+        _prob = base_prob*same_count/all_count
+        if _prob<0.1 and min(extract_count_less,extract_count_greater)<=3:
+            _prob = 0.15
+        if _prob<0.1:
+            return _prob
+
+        check_result = {"pass":1}
+        if docchannel_less in (51,102,103,104,115,116,117):
+            if doctitle_refine_less!=doctitle_refine_greater:
+                if page_time_less!=page_time_greater:
+                    check_result["docchannel"] = 0
+                    check_result["pass"] = 0
+        else:
+            check_result["docchannel"] = 2
+        if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,project_codes_less,project_codes_greater):
+            check_result["doctitle"] = 0
+            check_result["pass"] = 0
+            if b_log:
+                logging.info("%d-%d,check_doctitle_failed:%s==%s"%(docid_less,docid_greater,str(doctitle_refine_less),str(doctitle_refine_greater)))
+        else:
+            check_result["doctitle"] = 2
+
+        #added check
+        if not check_codes(project_codes_less,project_codes_greater):
+            check_result["code"] = 0
+            check_result["pass"] = 0
+            if b_log:
+                logging.info("%d-%d,check_code_failed:%s==%s"%(docid_less,docid_greater,str(project_codes_less),str(project_codes_greater)))
+        else:
+            if getLength(project_codes_less)>0 and getLength(project_codes_greater)>0 and len(set(project_codes_less) & set(project_codes_greater))>0:
+                check_result["code"] = 2
+            else:
+                check_result["code"] = 1
+
+        if not check_product(product_less,product_greater):
+            check_result["product"] = 0
+            check_result["pass"] = 0
+            if b_log:
+                logging.info("%d-%d,check_product_failed:%s==%s"%(docid_less,docid_greater,str(product_less),str(product_greater)))
+        else:
+            if getLength(product_less)>0 and getLength(product_greater)>0:
+                check_result["product"] = 2
+            else:
+                check_result["product"] = 1
+
+        if not check_demand():
+            check_result["pass"] = 0
+
+        if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
+                            tenderee_less,tenderee_greater,
+                            agency_less,agency_greater,
+                            win_tenderer_less,win_tenderer_greater):
+            check_result["entity"] = 0
+            check_result["pass"] = 0
+            if b_log:
+                logging.info("%d-%d,check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(docid_less,docid_greater,str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
+        else:
+            if docchannel_less in (51,52,103,105,114,118) and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
+                check_result["entity"] = 2
+            elif docchannel_less in (101,119,120) and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
+                check_result["entity"] = 2
+            else:
+                check_result["entity"] = 1
+
+        if not check_money(bidding_budget_less,bidding_budget_greater,
+                           win_bid_price_less,win_bid_price_greater):
+            if b_log:
+                logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
+            check_result["money"] = 0
+            check_result["pass"] = 0
+        else:
+            if docchannel_less in (51,52,103,105,114,118) and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
+                check_result["money"] = 2
+            elif docchannel_less in (101,119,120) and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
+                check_result["money"] = 2
+            else:
+                check_result["money"] = 1
+
+        #added check
+        if not check_package(package_less,package_greater):
+            if b_log:
+                logging.info("%d-%d,check_package_failed:%s==%s"%(docid_less,docid_greater,str(package_less),str(package_greater)))
+            check_result["package"] = 0
+            check_result["pass"] = 0
+        else:
+            if getLength(package_less)>0 and getLength(package_greater)>0:
+                check_result["package"] = 2
+            else:
+                check_result["package"] = 1
+
+        #added check
+        if not check_time(json_time_less,json_time_greater):
+            if b_log:
+                logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
+                if isinstance(json_time_less,dict):
+                    time_less = json_time_less
+                else:
+                    time_less = json.loads(json_time_less)
+                if isinstance(json_time_greater,dict):
+                    time_greater = json_time_greater
+                else:
+                    time_greater = json.loads(json_time_greater)
+                for k,v in time_less.items():
+                    if getLength(v)>0:
+                        v1 = time_greater.get(k,"")
+                        if getLength(v1)>0:
+                            if v!=v1:
+                                log("%d-%d,key:%s"%(docid_less,docid_greater,str(k)))
+
+            check_result["time"] = 0
+            check_result["pass"] = 0
+        else:
+            if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
+                check_result["time"] = 2
+            else:
+                check_result["time"] = 1
+
+        if check_result.get("pass",0)==0:
+            if b_log:
+                logging.info(str(check_result))
+
+            if check_result.get("money",1)==0:
+                return 0
+
+            if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2 and check_result.get("money",0)==2:
+                return _prob
+            else:
+                return 0
+        if check_result.get("time",1)==0:
+            return 0
+        return _prob
+
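+    # Execute one composed rule query against OTS (retrying on errors) and
+    # score every hit against `item` with dumplicate_check; with merge=True
+    # the hits are returned unscored for project merging.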
+    def search_data_by_query(self,item,_query,confidence,retry_times=3,merge=False,table_name="document_tmp",table_index="document_tmp_index",sort_column="docid",singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count]):
+
+        for _ in range(retry_times):
+            try:
+                _time = time.time()
+                check_time = 0
+                if isinstance(_query,list):
+                    bool_query = BoolQuery(should_queries=_query)
+                else:
+                    bool_query = _query
+                rows,next_token,total_count,is_all_succeed = self.ots_client.search(table_name,table_index,
+                                                                                    SearchQuery(bool_query,sort=Sort(sorters=[FieldSort(sort_column)]),limit=30,get_total_count=True),
+                                                                                    ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+                list_dict = getRow_ots(rows)
+                list_data = []
+                for _dict in list_dict:
+                    self.post_extract(_dict)
+                    _docid = _dict.get(document_tmp_docid)
+                    if merge:
+                        list_data.append(_dict)
+                    else:
+                        if _docid!=item.get(document_tmp_docid):
+                            _time1 = time.time()
+                            confidence = self.dumplicate_check(item,_dict,total_count,b_log=True)
+                            check_time += time.time()-_time1
+
+                            _dict["confidence"] = confidence
+                            _dict["min_counts"] = total_count
+                            list_data.append(_dict)
+                all_time = time.time()-_time
+                # log("check:%d rows takes%.4f,check%.4f"%(len(list_dict),all_time-check_time,check_time))
+                return list_data
+            except Exception as e:
+                traceback.print_exc()
+        return []
+
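+    # Fold query hits into base_list: only candidates with confidence>0.1 are
+    # kept, and every seen docid goes into set_docid so later rule queries can
+    # exclude it.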
+    def add_data_by_query(self,item,base_list,set_docid,_query,confidence,table_name,table_index,singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"],columns=[document_tmp_save,document_tmp_status,document_tmp_product,document_tmp_docchannel,document_tmp_web_source_no,document_tmp_doctitle_refine,document_tmp_project_code,document_tmp_project_name,document_tmp_tenderee,document_tmp_agency,document_tmp_sub_docs_json,document_tmp_extract_count]):
+        list_dict = self.search_data_by_query(item,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,notlike_keys=notlike_keys,columns=columns)
+        for _dict in list_dict:
+            _docid = _dict.get(document_tmp_docid)
+            confidence = _dict["confidence"]
+            if confidence>0.1:
+                if _docid not in set_docid:
+                    base_list.append(_dict)
+            set_docid.add(_docid)
+
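+    # Build one dedup rule from a field combination: dropped silently if any
+    # field is empty, otherwise merged with base_dict into a query plus its
+    # confidence.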
+    def appendRule(self,list_rules,_dict,base_dict,must_not_dict,confidence,item,to_log=True):
+        for k,v in _dict.items():
+            if getLength(v)==0:
+                return
+        _dict.update(base_dict)
+        if to_log:
+            log(str(_dict))
+        _query = self.generate_dumplicate_query(_dict,must_not_dict)
+        _rule = {"confidence":confidence,
+                 "item":item,
+                 "query":_query,
+                 "singleNum_keys":[],
+                 "contain_keys":[],
+                 "multiNum_keys":[]}
+        list_rules.append(_rule)
+
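+    # Expand a document into rule queries ordered by confidence: 100 for the
+    # fingerprint, 90 for three-field combinations, 85 for two-field
+    # combinations, 80/70 for single fields. Documents within the last two
+    # days are matched against document_tmp, older ones against document.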
+    def translate_dumplicate_rules(self,status_from,item,get_all=False,to_log=False):
+        docchannel,project_code,project_name,tenderee,agency,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product = self.get_dump_columns(item)
+        current_date = getCurrent_date("%Y-%m-%d")
+        if page_time=='':
+            page_time = current_date
+
+        if page_time>=timeAdd(current_date,-2):
+            table_name = "document_tmp"
+            table_index = "document_tmp_index"
+            base_dict = {
+                "docchannel":item["docchannel"],
+                "status":[status_from[0]],
+                "page_time":[timeAdd(page_time,-2),timeAdd(page_time,2)]
+            }
+            must_not_dict = {"save":0,"docid":item.get("docid")}
+            doctitle_refine_name = "doctitle_refine"
+        else:
+            table_name = "document"
+            table_index = "document_index"
+            if get_all:
+                _status = [201,450]
+            else:
+                _status = [201,300]
+            base_dict = {
+                "docchannel":item["docchannel"],
+                "status":_status,
+                "page_time":[timeAdd(page_time,-2),timeAdd(page_time,2)]
+            }
+            must_not_dict = {"docid":item.get("docid")}
+            doctitle_refine_name = "doctitle"
+
+        list_rules = []
+        singleNum_keys = ["tenderee","win_tenderer"]
+
+        confidence = 100
+        self.appendRule(list_rules,{document_tmp_fingerprint:fingerprint},base_dict,must_not_dict,confidence,item)
+        confidence = 90
+        _dict = {document_tmp_agency:agency,
+                 "win_tenderer":win_tenderer,
+                 "win_bid_price":win_bid_price}
+        self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
+        _dict = {document_tmp_agency:agency,
+                 "win_tenderer":win_tenderer,
+                 "bidding_budget":bidding_budget}
+        self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
+        _dict = {document_tmp_agency:agency,
+                 "win_bid_price":win_bid_price,
+                 "bidding_budget":bidding_budget}
+        self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
+        _dict = {"win_tenderer":win_tenderer,
+                 "win_bid_price":win_bid_price,
+                 "bidding_budget":bidding_budget}
+        self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
+ _dict = {"tenderee":tenderee,
|
|
|
+ "win_tenderer":win_tenderer,
|
|
|
+ "win_bid_price":win_bid_price}
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+ _dict = {"tenderee":tenderee,
|
|
|
+ "win_tenderer":win_tenderer,
|
|
|
+ "bidding_budget":bidding_budget}
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"tenderee":tenderee,
|
|
|
+ "win_bid_price":win_bid_price,
|
|
|
+ "bidding_budget":bidding_budget}
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+ _dict = {"tenderee":tenderee,
|
|
|
+ "agency":agency,
|
|
|
+ "win_tenderer":win_tenderer}
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+ _dict = {"tenderee":tenderee,
|
|
|
+ "agency":agency,
|
|
|
+ "win_bid_price":win_bid_price}
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"tenderee":tenderee,
|
|
|
+ "agency":agency,
|
|
|
+ "bidding_budget":bidding_budget}
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ confidence=85
|
|
|
+ _dict = {"tenderee":tenderee,
|
|
|
+ "agency":agency
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+ _dict = {"tenderee":tenderee,
|
|
|
+ "project_codes":project_code
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+ _dict = {"tenderee":tenderee,
|
|
|
+ "project_name":project_name
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ if getLength(product)>0:
|
|
|
+ l_p = product.split(",")
|
|
|
+ _dict = {"tenderee":tenderee,
|
|
|
+ "product":l_p[0]
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"tenderee":tenderee,
|
|
|
+ "win_tenderer":win_tenderer
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"tenderee":tenderee,
|
|
|
+ "win_bid_price":win_bid_price
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"tenderee":tenderee,
|
|
|
+ "bidding_budget":bidding_budget
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"tenderee":tenderee,
|
|
|
+ doctitle_refine_name:doctitle_refine
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"agency":agency,
|
|
|
+ "project_codes":project_code
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"agency":agency,
|
|
|
+ "project_name":project_name
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"project_codes":project_code,
|
|
|
+ "project_name":project_name
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"project_codes":project_code,
|
|
|
+ "win_tenderer":win_tenderer
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"project_codes":project_code,
|
|
|
+ "win_bid_price":win_bid_price
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"project_codes":project_code,
|
|
|
+ "bidding_budget":bidding_budget
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
|
|
|
+ _dict = {"project_codes":project_code,
|
|
|
+ doctitle_refine_name:doctitle_refine
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"project_name":project_name,
|
|
|
+ "win_tenderer":win_tenderer
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"project_name":project_name,
|
|
|
+ "win_bid_price":win_bid_price
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"project_name":project_name,
|
|
|
+ "bidding_budget":bidding_budget
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"project_name":project_name,
|
|
|
+ doctitle_refine_name:doctitle_refine
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"win_tenderer":win_tenderer,
|
|
|
+ "win_bid_price":win_bid_price
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"win_tenderer":win_tenderer,
|
|
|
+ "bidding_budget":bidding_budget
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"win_tenderer":win_tenderer,
|
|
|
+ doctitle_refine_name:doctitle_refine
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"win_bid_price":win_bid_price,
|
|
|
+ "bidding_budget":bidding_budget
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"win_bid_price":win_bid_price,
|
|
|
+ doctitle_refine_name:doctitle_refine
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ _dict = {"bidding_budget":bidding_budget,
|
|
|
+ doctitle_refine_name:doctitle_refine
|
|
|
+ }
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ confidence=80
|
|
|
+ _dict = {doctitle_refine_name:doctitle_refine}
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+ _dict = {"project_codes":project_code}
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ confidence=70
|
|
|
+ _dict = {"project_name":project_name}
|
|
|
+ self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item)
|
|
|
+
|
|
|
+ return list_rules,table_name,table_index
|
|
|
+
|
|
|
+
|
|
|
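+    # Scheduled entry point: page rows in the dedup status range (up to
+    # flow_process_count of them) into queue_dumplicate, then drain the queue
+    # with a MultiThreadHandler pool.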
+    def flow_dumplicate(self,process_count=flow_process_count,status_from=flow_dumplicate_status_from):
+        def producer(columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json]):
+            bool_query = BoolQuery(must_queries=[
+                RangeQuery(document_tmp_status,*status_from,True,True),
+                # TermQuery("docid",246433488)
+            ])
+            rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
+                                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
+                                                                                ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+            log("flow_dumplicate producer total_count:%d"%total_count)
+            list_dict = getRow_ots(rows)
+            for _dict in list_dict:
+                self.queue_dumplicate.put(_dict)
+            _count = len(list_dict)
+            while next_token and _count<process_count:
+                rows,next_token,total_count,is_all_succeed = self.ots_client.search("document_tmp","document_tmp_index",
+                                                                                    SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                                    ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+                list_dict = getRow_ots(rows)
+                for _dict in list_dict:
+                    self.queue_dumplicate.put(_dict)
+                _count += len(list_dict)
+        def comsumer():
+            mt = MultiThreadHandler(self.queue_dumplicate,self.dumplicate_comsumer_handle,None,60,1,ots_client=self.ots_client)
+            mt.run()
+
+        producer()
+        comsumer()
+
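+    # Per-document handler: collect candidates rule-by-rule (5 rules per
+    # query, excluding already-seen docids), final-check the group, pick the
+    # best docid, then write save/dup_docid/status back and downgrade the
+    # replaced rows.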
+    def dumplicate_comsumer_handle(self,item,result_queue,ots_client,get_all=False,upgrade=True):
+        start_time = time.time()
+        self.post_extract(item)
+
+        base_list = []
+        set_docid = set()
+
+        list_rules,table_name,table_index = self.translate_dumplicate_rules(flow_dumplicate_status_from,item,get_all=get_all,to_log=True)
+
+        list_rules.sort(key=lambda x:x["confidence"],reverse=True)
+        _i = 0
+        step = 5
+
+        item["confidence"] = 999
+        if item.get(document_tmp_docid) not in set_docid:
+            base_list.append(item)
+            set_docid.add(item.get(document_tmp_docid))
+
+        while _i<len(list_rules):
+            must_not_q = []
+            if len(base_list)>0:
+                must_not_q = [TermQuery("docid",a) for a in list(set_docid)[-100:]]
+            _query = BoolQuery(should_queries=[_rule["query"] for _rule in list_rules[_i:_i+step]],
+                               must_not_queries=must_not_q)
+            _rule = list_rules[_i]
+            confidence = _rule["confidence"]
+            singleNum_keys = _rule["singleNum_keys"]
+            contain_keys = _rule["contain_keys"]
+            multiNum_keys = _rule["multiNum_keys"]
+            self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json])
+            _i += step
+
+        _time = time.time()
+        log("%d start final check with length:%d"%(item["docid"],len(base_list)))
+        final_list = self.dumplicate_fianl_check(base_list)
+        log("%d final_check takes:%.2f"%(item["docid"],time.time()-_time))
+        best_docid = self.get_best_docid(final_list)
+
+        final_list_docid = [a["docid"] for a in final_list]
+        log("%d:final_list_docid:%s"%(item["docid"],str(final_list_docid)))
+        _d = {"partitionkey":item["partitionkey"],
+              "docid":item["docid"],
+              "status":random.randint(*flow_dumplicate_status_to),
+              document_tmp_opertime:getCurrent_date(format="%Y-%m-%d %H:%M:%S")
+              }
+        dtmp = Document_tmp(_d)
+
+        dup_docid = set()
+        for _dict in final_list:
+            dup_docid.add(_dict.get(document_tmp_docid))
+        if item.get(document_tmp_docid) in dup_docid:
+            dup_docid.remove(item.get(document_tmp_docid))
+
+        remove_list = []
+
+        if len(final_list)==0 or best_docid==item.get(document_tmp_docid):
+            dtmp.setValue(document_tmp_save,1,True)
+            dtmp.setValue(document_tmp_merge_uuid,self.merge_document(item,flow_dumplicate_status_to),True)
+            dmp_docid = ",".join([str(a) for a in list(dup_docid)])
+            for _dict in final_list:
+                if _dict.get(document_tmp_docid) in dup_docid:
+                    remove_list.append(_dict)
+        else:
+            dtmp.setValue(document_tmp_save,0,True)
+            if best_docid in dup_docid:
+                dup_docid.remove(best_docid)
+                for _dict in final_list:
+                    if _dict.get(document_tmp_docid) in dup_docid:
+                        remove_list.append(_dict)
+                dmp_docid = ",".join([str(a) for a in list(dup_docid)])
+                dmp_docid = "%d,%s"%(best_docid,dmp_docid)
+            else:
+                dmp_docid = ",".join([str(a) for a in list(dup_docid)])
+                for _dict in final_list:
+                    if _dict.get(document_tmp_docid) in dup_docid:
+                        remove_list.append(_dict)
+        log("save:%s:docid:%d,final_list:%d,rules:%d,best_docid:%s,dmp_docid:%s"%(dtmp.getProperties().get(document_tmp_save),item.get(document_tmp_docid),len(final_list),len(list_rules),str(best_docid),dmp_docid))
+        if upgrade:
+            self.changeSaveStatus(remove_list)
+
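+        # note: this join recomputes dmp_docid from dup_docid, so the
+        # "%d,%s" form with the best_docid prefix built above is overwritten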
+        dmp_docid = ",".join([str(a) for a in list(dup_docid)])
+        dtmp.setValue(document_tmp_dup_docid,dmp_docid,True)
+        dtmp.update_row(self.ots_client)
+
+        # log("dump takes %.2f"%(time.time()-start_time))
+
+    def start_flow_dumplicate(self):
+        schedule = BlockingScheduler()
+        schedule.add_job(self.flow_dumplicate,"cron",second="*/10")
+        schedule.start()
+
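+    # Downgrade rows that were previously kept: any row in the list that still
+    # has save=1 is rewritten with save=0.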
+    def changeSaveStatus(self,list_dict):
+        for _dict in list_dict:
+            if _dict.get(document_tmp_save,1)==1:
+                _d = {"partitionkey":_dict["partitionkey"],
+                      "docid":_dict["docid"],
+                      document_tmp_save:0
+                      }
+                _d_tmp = Document_tmp(_d)
+                _d_tmp.update_row(self.ots_client)
+
+    def test_dumplicate(self,docid):
+        columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json]
+        bool_query = BoolQuery(must_queries=[
+            TermQuery("docid",docid)
+        ])
+        rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
+                                                                            SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
+                                                                            ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+        log("flow_dumplicate producer total_count:%d"%total_count)
+        list_dict = getRow_ots(rows)
+
+        for item in list_dict:
+            self.dumplicate_comsumer_handle(item,None,self.ots_client,get_all=True,upgrade=False)
+        return
+
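+    # Read-only replay of the dedup pipeline for one docid: returns the docid
+    # that would be kept for its duplicate group (best_docid), or None if the
+    # document cannot be found.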
+    def getRemainDoc(self,docid):
+        columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json]
+        bool_query = BoolQuery(must_queries=[
+            TermQuery("docid",docid)
+        ])
+        rows,next_token,total_count,is_all_succeed = self.ots_client.search("document","document_index",
+                                                                            SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid")]),limit=100,get_total_count=True),
+                                                                            ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
+        list_dict = getRow_ots(rows)
+
+        if len(list_dict)>0:
+            item = list_dict[0]
+            start_time = time.time()
+            self.post_extract(item)
+
+            base_list = []
+            set_docid = set()
+
+            list_rules,table_name,table_index = self.translate_dumplicate_rules(flow_dumplicate_status_from,item,to_log=True)
+
+            list_rules.sort(key=lambda x:x["confidence"],reverse=True)
+            _i = 0
+            step = 5
+
+            item["confidence"] = 999
+            if item.get(document_tmp_docid) not in set_docid:
+                base_list.append(item)
+                set_docid.add(item.get(document_tmp_docid))
+
+            while _i<len(list_rules):
+                must_not_q = []
+                if len(base_list)>0:
+                    must_not_q = [TermQuery("docid",a) for a in list(set_docid)[-100:]]
+                _query = BoolQuery(should_queries=[_rule["query"] for _rule in list_rules[_i:_i+step]],
+                                   must_not_queries=must_not_q)
+                _rule = list_rules[_i]
+                confidence = _rule["confidence"]
+                singleNum_keys = _rule["singleNum_keys"]
+                contain_keys = _rule["contain_keys"]
+                multiNum_keys = _rule["multiNum_keys"]
+                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json])
+                _i += step
+
+            _time = time.time()
+            log("%d start final check with length:%d"%(item["docid"],len(base_list)))
+            final_list = self.dumplicate_fianl_check(base_list)
+            log("%d final_check takes:%.2f"%(item["docid"],time.time()-_time))
+            best_docid = self.get_best_docid(final_list)
+            return best_docid
+        return None
 
 if __name__ == '__main__':
-    df = Dataflow()
+    # df = Dataflow()
     # df.flow_init()
     # df.flow_test()
     # df.test_merge()
-    df.start_flow_attachment()
+    # df.start_flow_attachment()
     # df.start_flow_extract()
     # df.start_flow_dumplicate()
     # # df.start_flow_merge()
     # df.start_flow_remove()
 
     # download_attachment()
-    test_attachment_interface()
+    # test_attachment_interface()
+    df_dump = Dataflow_dumplicate()
+    # df_dump.start_flow_dumplicate()
+    df_dump.test_dumplicate(25126084)