浏览代码

公告去重标题对比规则优化;products对比规则优化

znj 4 天之前
父节点
当前提交
afa6dae0fb
共有 2 个文件被更改,包括 26 次插入、8 次删除
  1. +11 −4  BaseDataMaintenance/maintenance/dataflow.py
  2. +15 −4  BaseDataMaintenance/maxcompute/documentDumplicate.py

+ 11 - 4
BaseDataMaintenance/maintenance/dataflow.py

@@ -448,6 +448,7 @@ class Dataflow():
         project_name = _dict.get(document_tmp_project_name,"")
         tenderee = _dict.get(document_tmp_tenderee,"")
         agency = _dict.get(document_tmp_agency,"")
+        doctitle = _dict.get(document_tmp_doctitle,"")
         doctitle_refine = _dict.get(document_tmp_doctitle_refine,"")
         win_tenderer = _dict.get("win_tenderer","")
         bidding_budget = _dict.get("bidding_budget","")
@@ -459,7 +460,7 @@ class Dataflow():
         page_time = _dict.get(document_tmp_page_time,"")
         fingerprint = _dict.get(document_tmp_fingerprint,"")
         product = _dict.get(document_tmp_product,"")
-        return docchannel,project_code,project_name,tenderee,agency,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product
+        return docchannel,project_code,project_name,tenderee,agency,doctitle,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product
 
     def f_set_docid_limitNum_contain(self,item, _split,singleNum_keys=["tenderee","win_tenderer"],contain_keys=[],multiNum_keys=[],notlike_keys=["project_code"]):
         flag = True
@@ -550,7 +551,7 @@ class Dataflow():
                 set_docid.add(_docid)
 
     def translate_dumplicate_rules(self,status_from,item):
-        docchannel,project_code,project_name,tenderee,agency,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product = self.get_dump_columns(item)
+        docchannel,project_code,project_name,tenderee,agency,doctitle,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product = self.get_dump_columns(item)
         if page_time=='':
             page_time = getCurrent_date("%Y-%m-%d")
         base_dict = {
@@ -1463,7 +1464,7 @@ class Dataflow():
 
     def merge_document(self,item,status_to=None):
         self.post_extract(item)
-        docchannel,project_code,project_name,tenderee,agency,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product = self.get_dump_columns(item)
+        docchannel,project_code,project_name,tenderee,agency,doctitle,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product = self.get_dump_columns(item)
 
 
         _d = {"partitionkey":item["partitionkey"],
@@ -2274,6 +2275,9 @@ class Dataflow_dumplicate(Dataflow):
         _dict["dict_time"] = self.get_dict_time(_extract)
         _dict["punish"] = _extract.get("punish",{})
         _dict["approval"] = _extract.get("approval",[])
+        _dict["products_original"] = _extract.get("product_attrs_original", {}).get("data",[])
+        _dict["products"] = _dict.get("products") if _dict.get("products") is not None else []
+        _dict["products"] = _dict["products"] if isinstance(_dict["products"], list) else json.loads(_dict["products"])
 
         # 专项债字段
         issue_details = _extract.get("debt_dic",{}).get("issue_details",[])
@@ -2680,7 +2684,7 @@ class Dataflow_dumplicate(Dataflow):
         list_rules.append(_rule)
 
     def translate_dumplicate_rules(self,status_from,item,get_all=False,to_log=False,day_dis=7,table_name ="document_tmp",table_index="document_tmp_index"):
-        docchannel,project_code,project_name,tenderee,agency,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product = self.get_dump_columns(item)
+        docchannel,project_code,project_name,tenderee,agency,doctitle,doctitle_refine,win_tenderer,bidding_budget,win_bid_price,page_time,fingerprint,product = self.get_dump_columns(item)
         current_date = getCurrent_date("%Y-%m-%d")
         if page_time=='':
             page_time = current_date
@@ -2715,6 +2719,7 @@ class Dataflow_dumplicate(Dataflow):
                 }
                 must_not_dict = {"docid":item.get("docid")}
                 doctitle_refine_name = "doctitle"
+                doctitle_refine = doctitle
         else:
             _status = [201,300]
             base_dict = {
@@ -2724,6 +2729,7 @@ class Dataflow_dumplicate(Dataflow):
             }
             must_not_dict = {"docid":item.get("docid")}
             doctitle_refine_name = "doctitle"
+            doctitle_refine = doctitle
 
 
 
@@ -5256,6 +5262,7 @@ class Dataflow_dumplicate(Dataflow):
         if item:
             log("start dumplicate_comsumer_handle")
             self.dumplicate_comsumer_handle(item,None,self.ots_client,get_all=False,upgrade=False)
+            # self.dumplicate_comsumer_handle(item,None,self.ots_client,get_all=True,upgrade=False)
             return
 
     def test_merge(self,list_docid_less,list_docid_greater):

+ 15 - 4
BaseDataMaintenance/maxcompute/documentDumplicate.py

@@ -1233,11 +1233,11 @@ def check_time(json_time_less,json_time_greater):
 
 def check_products(products_less,products_greater):
     if isinstance(products_less, list):
-        products_less = products_less
+        pass
     else:
         products_less = json.loads(products_less) if products_less else []
     if isinstance(products_greater, list):
-        products_greater = products_greater
+        pass
     else:
         products_greater = json.loads(products_greater) if products_greater else []
     # if len(products_less)>0 and len(products_greater)>0:
@@ -1298,6 +1298,7 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
     detail_link_less = document_less.get("detail_link")
     is_special_bonds_less = document_less.get("is_special_bonds")
     products_less = document_less.get("products")
+    products_original_less = document_less.get("products_original",[])
 
 
     docid_greater = document_greater["docid"]
@@ -1325,6 +1326,7 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
     detail_link_greater = document_greater.get("detail_link")
     is_special_bonds_greater = document_greater.get("is_special_bonds")
     products_greater = document_greater.get("products")
+    products_original_greater = document_greater.get("products_original", [])
 
     moneys_greater = document_greater.get("moneys")
     moneys_attachment_greater = document_greater.get("moneys_attachment")
@@ -1339,7 +1341,7 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
         # print('fingerprint same')
         return 1
 
-    # # 专项债去重
+    # 专项债去重
     if is_special_bonds_greater==is_special_bonds_less==1:
         detail_link_less = detail_link_less.strip() if detail_link_less else ""
         detail_link_greater = detail_link_greater.strip() if detail_link_greater else ""
@@ -1380,7 +1382,16 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
 
     # 采购产品products对比
     if getLength(products_less)>0 and getLength(products_greater)>0:
-        if not check_products(products_less,products_greater):
+        if products_original_less:# products不是AI补充提取的
+            _products_less = products_original_less
+        else:
+            _products_less = products_less
+        if products_original_greater:
+            _products_greater = products_original_greater
+        else:
+            _products_greater = products_greater
+        if not check_products(_products_less,_products_greater):
+            # print("check_products error")
             return 0
 
     #一篇要素都在附件,且两篇附件md5有重叠