Ver Fonte

公告去重优化,调整时间和产品校验,增加附件及全金额判断

luojiehua há 1 ano atrás
pai
commit
f01faccc36

+ 3 - 3
BaseDataMaintenance/maintenance/check_log.py

@@ -6,7 +6,7 @@ def test_speed(logfile):
     a = open(logfile,"r",encoding="utf8").read()
     set_a = set()
     _c = 0
-    for a in re.split("\n",s):
+    for a in re.split("\n",a):
         a = a.strip()
         if a=="":
             continue
@@ -41,5 +41,5 @@ def check_start_end(logfile):
 
 if __name__ == '__main__':
     logfile = "log.txt"
-    # test_speed(logfile)
-    check_start_end(logfile)
+    test_speed(logfile)
+    # check_start_end(logfile)

+ 24 - 15
BaseDataMaintenance/maintenance/dataflow.py

@@ -2227,8 +2227,10 @@ class Dataflow_dumplicate(Dataflow):
         _dict["doctitle_refine"] = _extract.get("doctitle_refine","")
         if _dict["doctitle_refine"]=="":
             _dict["doctitle_refine"] = _dict.get("doctitle")
-        _dict["nlp_enterprise"] = str({"indoctextcon":_extract.get("nlp_enterprise",[]),
-                                       "notindoctextcon":_extract.get("nlp_enterprise_attachment",[])})
+        _dict["moneys"] = set(_extract.get("moneys",[]))
+        _dict["moneys_attachment"] = set(_extract.get("moneys_attachment",[]))
+        _dict["nlp_enterprise"] = json.dumps({"indoctextcon":_extract.get("nlp_enterprise",[]),
+                                       "notindoctextcon":_extract.get("nlp_enterprise_attachment",[])},ensure_ascii=False)
         _dict["extract_count"] = self.c_f_get_extractCount.evaluate(extract_json)
         _dict["package"] = self.c_f_get_package.evaluate(extract_json)
         _dict["project_name"] = _extract.get("name","")
@@ -2299,6 +2301,10 @@ class Dataflow_dumplicate(Dataflow):
         province_less = document_less.get("province")
         city_less = document_less.get("city")
         district_less = document_less.get("district")
+        moneys_less = document_less.get("moneys")
+        moneys_attachment_less = document_less.get("moneys_attachment")
+        page_attachments_less = document_less.get(document_tmp_attachment_path,"[]")
+
 
         document_greater = _dict2
         docid_greater = _dict2["docid"]
@@ -2323,12 +2329,16 @@ class Dataflow_dumplicate(Dataflow):
         city_greater = document_greater.get("city")
         district_greater = document_greater.get("district")
 
+        moneys_greater = document_greater.get("moneys")
+        moneys_attachment_greater = document_greater.get("moneys_attachment")
+        page_attachments_greater = document_greater.get(document_tmp_attachment_path,"[]")
+
         hard_level=1
         if web_source_no_less==web_source_no_greater=="17397-3":
             hard_level=2
 
         if self.check_rule==1:
-            _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=b_log,hard_level=hard_level,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater)
+            _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=b_log,hard_level=hard_level,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
         else:
             _prob = check_dumplicate_rule_test(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=b_log,hard_level=hard_level,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater)
 
@@ -2449,7 +2459,7 @@ class Dataflow_dumplicate(Dataflow):
                 check_result["code"] = 1
 
 
-        if not check_product(product_less,product_greater):
+        if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
             check_result["product"] = 0
             check_result["pass"] = 0
             if b_log:
@@ -2853,7 +2863,7 @@ class Dataflow_dumplicate(Dataflow):
 
         return list_rules,table_name,table_index
 
-    def producer_flow_dumplicate(self,process_count,status_from,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district]):
+    def producer_flow_dumplicate(self,process_count,status_from,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path]):
         q_size = self.queue_dumplicate.qsize()
         log("dumplicate queue size %d"%(q_size))
 
@@ -4008,7 +4018,7 @@ class Dataflow_dumplicate(Dataflow):
                 singleNum_keys = _rule["singleNum_keys"]
                 contain_keys = _rule["contain_keys"]
                 multiNum_keys = _rule["multiNum_keys"]
-                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle],b_log=b_log)
+                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle,document_tmp_attachment_path],b_log=b_log)
                 _i += step
 
 
@@ -4092,9 +4102,8 @@ class Dataflow_dumplicate(Dataflow):
                             dtmp.setValue(document_tmp_projects,json.dumps(list_proj[:len(list_proj)//2]),True)
                             if dtmp.update_row(self.ots_client):
                                 break
-                if table_name=="document_tmp":
-                    self.changeSaveStatus(remove_list)
-                    self.changeSaveStatus(list_merge_dump)
+                self.changeSaveStatus(remove_list)
+                self.changeSaveStatus(list_merge_dump)
             else:
                 return list_docids
 
@@ -4205,7 +4214,7 @@ class Dataflow_dumplicate(Dataflow):
 
     def test_dumplicate(self,docid):
         # columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status]
-        columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district]
+        columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path]
         bool_query = BoolQuery(must_queries=[
             TermQuery("docid",docid)
         ])
@@ -4394,12 +4403,12 @@ if __name__ == '__main__':
 
     # download_attachment()
     # test_attachment_interface()
-    # df_dump = Dataflow_dumplicate(start_delete_listener=False)
-    # # df_dump.start_flow_dumplicate()
+    df_dump = Dataflow_dumplicate(start_delete_listener=False)
+    # df_dump.start_flow_dumplicate()
+    df_dump.test_dumplicate(405004237
+                            )
 
-    # df_dump.test_dumplicate(400929607
-    #                         )
-    compare_dumplicate_check()
+    # compare_dumplicate_check()
     # df_dump.test_merge([242672995,235300429,240009762
     #                     ],[243240169,])
     # df_dump.flow_remove_project_tmp()

+ 97 - 24
BaseDataMaintenance/maxcompute/documentDumplicate.py

@@ -779,7 +779,9 @@ def getLength(_str):
     return len(_str if _str is not None else "")
 
 def check_money(bidding_budget_less,bidding_budget_greater,
-                win_bid_price_less,win_bid_price_greater):
+                win_bid_price_less,win_bid_price_greater,
+                moneys_less,moneys_greater,
+                moneys_attachment_less,moneys_attachment_greater):
 
     #只判断最高前六位
     if getLength(bidding_budget_less)>0:
@@ -799,6 +801,8 @@ def check_money(bidding_budget_less,bidding_budget_greater,
     #check saming
     budget_is_same = ""
     price_is_same = ""
+    logging.info("moneys_less"+str(moneys_less)+"---"+str(moneys_attachment_less))
+    logging.info("moneys_less"+str(moneys_greater)+"---"+str(moneys_attachment_greater))
     if getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
         budget_less = float(bidding_budget_less)
         budget_greater = float(bidding_budget_greater)
@@ -811,6 +815,10 @@ def check_money(bidding_budget_less,bidding_budget_greater,
                     budget_is_same = True
             if budget_less>10000 and budget_greater>10000 and round(budget_less/10000,2)==round(budget_greater/10000,2):
                 budget_is_same = True
+            if budget_less in moneys_greater or budget_less in moneys_attachment_greater:
+                budget_is_same = True
+            if budget_greater in moneys_less or budget_greater in moneys_attachment_less:
+                budget_is_same = True
             if budget_is_same=="":
                 return False
 
@@ -824,6 +832,10 @@ def check_money(bidding_budget_less,bidding_budget_greater,
                     price_is_same = True
             if price_less>10000 and price_greater>10000 and round(price_less/10000,2)==round(price_greater/10000,2):
                 price_is_same = True
+            if price_less in moneys_greater or price_less in moneys_attachment_greater:
+                price_is_same = True
+            if price_greater in moneys_less or price_greater in moneys_attachment_less:
+                price_is_same = True
             if price_is_same=="":
                 return False
     return True
@@ -985,7 +997,7 @@ def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[],
                     return False
     return True
 
-def check_product(product_less,product_greater,split_char=","):
+def check_product(product_less,product_greater,split_char=",",doctitle_refine_less='',doctitle_refine_greater=''):
     if getLength(product_less)>0 and getLength(product_greater)>0:
 
         _product_l = product_less.split(split_char)
@@ -997,7 +1009,7 @@ def check_product(product_less,product_greater,split_char=","):
             _product_l = a
         for _l in _product_l:
             for _g in _product_g:
-                if getSimilarityOfString(_l,_g)>=0.8:
+                if getSimilarityOfString(_l,_g)>=0.8 or doctitle_refine_greater.find(_l)>=0 or doctitle_refine_less.find(_g)>=0:
                     same_count += 1
                     break
         if same_count/len(_product_l)>=0.5:
@@ -1019,6 +1031,8 @@ def check_package(package_less,package_greater,split_char=","):
     return True
 
 def check_time(json_time_less,json_time_greater):
+    has_same = False
+    has_diff = False
     if getLength(json_time_less)>0 and getLength(json_time_greater)>0:
         if isinstance(json_time_less,dict):
             time_less = json_time_less
@@ -1033,12 +1047,52 @@ def check_time(json_time_less,json_time_greater):
                 v1 = time_greater.get(k,"")
                 if getLength(v1)>0:
                     if v[:10]!=v1[:10]:
-                        return False
-    return True
+                        has_diff = True
+                    else:
+                        has_same = True
+    if has_same:
+        if has_diff:
+            return 1
+        return 2
+    if has_diff:
+        return 0
+    return 1
 
-def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater=""):
+def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater="",moneys_less=set(),moneys_greater=set(),moneys_attachment_less=set(),moneys_attachment_greater=set(),page_attachments_less="[]",page_attachments_greater="[]"):
     if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
         return 1
+
+
+    #一篇要素都在附件,且两篇附件md5有重叠
+    set_md5_less = set()
+    set_md5_greater = set()
+    list_md5_less = json.loads(page_attachments_less)
+    list_md5_greater = json.loads(page_attachments_greater)
+    for _l in list_md5_less:
+        _md5 = _l.get("fileMd5")
+        if _md5 is not None:
+            set_md5_less.add(_md5)
+    for _l in list_md5_greater:
+        _md5 = _l.get("fileMd5")
+        if _md5 is not None:
+            set_md5_greater.add(_md5)
+    if len(set_md5_less&set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==len(set_md5_less):
+        one_in_attach = False
+        dict_enterprise_less = json.loads(nlp_enterprise_less)
+        dict_enterprise_greater = json.loads(nlp_enterprise_greater)
+        indoctextcon_less = dict_enterprise_less.get("indoctextcon",[])
+        notindoctextcon_less = dict_enterprise_less.get("notindoctextcon",[])
+        indoctextcon_greater = dict_enterprise_greater.get("indoctextcon",[])
+        notindoctextcon_greater = dict_enterprise_greater.get("notindoctextcon",[])
+        if len(indoctextcon_less)<=1 and len(notindoctextcon_less)>=2:
+            one_in_attach = True
+        if len(indoctextcon_greater)<=1 and len(notindoctextcon_greater)>=2:
+            one_in_attach = True
+        if one_in_attach:
+            if check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
+                return 1
+
+
     if isinstance(project_codes_less,str):
         project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
     elif project_codes_less is None:
@@ -1081,7 +1135,7 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
     if min(extract_count_less,extract_count_greater)<=3:
         if _prob<0.1:
             _prob = 0.15
-        if province_less!=province_greater:
+        if getLength(province_less)>0 and getLength(province_greater)>0 and province_less not in ("全国","未知") and province_greater not in ("全国","未知") and province_less!=province_greater:
             return 0
     if _prob<0.1:
         return _prob
@@ -1115,7 +1169,7 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
             check_result["code"] = 1
 
 
-    if not check_product(product_less,product_greater):
+    if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
         check_result["product"] = 0
         check_result["pass"] = 0
         if b_log:
@@ -1145,8 +1199,12 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
         else:
             check_result["entity"] = 1
 
+    logging.info("moneys_less"+str(moneys_less)+"---"+str(moneys_attachment_less))
+    logging.info("moneys_less"+str(moneys_greater)+"---"+str(moneys_attachment_greater))
     if not check_money(bidding_budget_less,bidding_budget_greater,
-                       win_bid_price_less,win_bid_price_greater):
+                       win_bid_price_less,win_bid_price_greater,
+                       moneys_less,moneys_greater,
+                       moneys_attachment_less,moneys_attachment_greater):
         if b_log:
             logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
         check_result["money"] = 0
@@ -1172,7 +1230,8 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
             check_result["package"] = 1
 
     #added check
-    if not check_time(json_time_less,json_time_greater):
+    _time_check = check_time(json_time_less,json_time_greater)
+    if not _time_check or (_time_check==1 and docchannel_less in (51,103)):
         if b_log:
             logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
             if isinstance(json_time_less,dict):
@@ -1211,8 +1270,6 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
             return _prob
         else:
             return 0
-        if check_result.get("time",1)==0:
-            return 0
     return _prob
 
 def check_dumplicate_rule_test(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater=""):
@@ -1401,7 +1458,7 @@ def check_dumplicate_rule_test(docid_less,docid_greater,fingerprint_less,fingerp
             return 0
     return _prob
 
-@annotate("bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string->double")
+@annotate("bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->double")
 class f_dumplicate_check(BaseUDTF):
     def __init__(self):
         import logging
@@ -1414,18 +1471,34 @@ class f_dumplicate_check(BaseUDTF):
                 project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,
                 extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,
                 page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,
-                package_less,package_greater,json_time_less,json_time_greater,json_context):
-        _context = json.loads(json_context)
+                package_less,package_greater,json_time_less,json_time_greater,json_context,
+                province_less,province_greater,city_less,city_greater,district_less,district_greater,
+                web_source_no_less,web_source_no_greater,
+                extract_json_less,extract_json_greater,page_attachments_less,page_attachments_greater):
 
         min_counts = 100
-
-
-
-        for item in _context:
-            if item["counts"]<min_counts:
-                min_counts = item["counts"]
-
-        _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,min_counts,b_log=False)
+        if json_context is not None:
+            _context = json.loads(json_context)
+
+            for item in _context:
+                if item.get("counts",0)>0 and item.get("counts",0)<min_counts:
+                    min_counts = item["counts"]
+        _extract_less = {}
+        if extract_json_less is not None:
+            _extract_less = json.loads(extract_json_less)
+        _extract_greater = {}
+        if extract_json_greater is not None:
+            _extract_greater = json.loads(extract_json_greater)
+        moneys_less = set(_extract_less.get("moneys",[]))
+        moneys_attachment_less = set(_extract_less.get("moneys_attachment",[]))
+        moneys_greater = set(_extract_greater.get("moneys",[]))
+        moneys_attachment_greater = set(_extract_greater.get("moneys_attachment",[]))
+
+        if page_attachments_less is None:
+            page_attachments_less = '[]'
+        if page_attachments_greater is None:
+            page_attachments_greater = '[]'
+        _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
         self.forward(_prob)
 
 @annotate("string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string,double")
@@ -1472,7 +1545,7 @@ class f_dumplicate_featureMatrix(BaseUDTF):
                 _error += str(a)
             self.forward("[6-%s]"%_error,0)
             return
-        if not check_product(product_less,product_greater):
+        if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
             _error = "%s=%s"%(str(product_less),str(product_greater))
             self.forward("7-%s"%_error,0)
             return