Explorar el Código

Merge remote-tracking branch 'origin/master'

# Conflicts:
#	BaseDataMaintenance/maintenance/dataflow.py
luojiehua hace 1 mes
padre
commit
c18ccf9298

+ 51 - 11
BaseDataMaintenance/maintenance/dataflow.py

@@ -1273,6 +1273,20 @@ class Dataflow():
                 dict_source_count[_web_source].add(_fingerprint)
                 if len(dict_source_count[_web_source])>=2:
                     to_reverse=True
+        # 专项债
+        if len(base_list)>0 and base_list[0].get("is_special_bonds")==1:
+            for _item in base_list:
+                detail_link = _item.get("detail_link")
+                detail_link = detail_link.strip() if detail_link else ""
+                if "bondId=" in detail_link:
+                    bondId = detail_link.split("bondId=")[1]
+                    bondId = bondId.split(",") if bondId else []
+                else:
+                    bondId = []
+                _item['bondId_num'] = len(bondId)
+            # print([i.get("bondId_num") for i in base_list])
+            base_list.sort(key=lambda x:x["bondId_num"],reverse=True)
+            return base_list[0]["docid"]
         if len(base_list)>0:
             base_list.sort(key=lambda x:x["docid"],reverse=to_reverse)
             base_list.sort(key=lambda x:x.get(document_attachment_extract_status,0),reverse=True)
@@ -2209,7 +2223,9 @@ class Dataflow_dumplicate(Dataflow):
     def get_dict_time(self,_extract,keys=["time_bidclose","time_bidopen","time_bidstart","time_commencement","time_completion","time_earnestMoneyEnd","time_earnestMoneyStart","time_getFileEnd","time_getFileStart","time_publicityEnd","time_publicityStart","time_registrationEnd","time_registrationStart"]):
         dict_time = {}
         for k in keys:
-            dict_time[k] = _extract.get(k)
+            _time = _extract.get(k)
+            _time = _time[:10] if _time else ""
+            dict_time[k] = _time
         return dict_time
 
 
@@ -2258,6 +2274,15 @@ class Dataflow_dumplicate(Dataflow):
         _dict["dict_time"] = self.get_dict_time(_extract)
         _dict["punish"] = _extract.get("punish",{})
         _dict["approval"] = _extract.get("approval",[])
+
+        # 专项债字段
+        issue_details = _extract.get("debt_dic",{}).get("issue_details",[])
+        _dict["is_special_bonds"] = 1 if _dict.get(document_tmp_docchannel)==302 and _dict.get(document_tmp_web_source_name)=='专项债券信息网' and issue_details else 0
+        # 采购意向字段
+        if _dict.get("docchannel")==114:
+            _dict["demand_info"] = _extract.get("demand_info",{}).get("data",[])
+        else:
+            _dict["demand_info"] = []
         return _dict
 
     def dumplicate_fianl_check(self,base_list,b_log=False):
@@ -2371,11 +2396,14 @@ class Dataflow_dumplicate(Dataflow):
         pagetime_stamp_greater = getTimeStamp(page_time_greater)
         
         day_dis = abs(pagetime_stamp_greater-pagetime_stamp_less)//86400
-        if day_dis>7:
-            _prob = 0
-        elif day_dis>3:
-            if _prob<0.4:
+        if document_less.get("is_special_bonds",0)==document_greater.get("is_special_bonds",0)==1:
+            pass
+        else:
+            if day_dis>7:
                 _prob = 0
+            elif day_dis>3:
+                if _prob<0.4:
+                    _prob = 0
 
         return _prob,day_dis
 
@@ -2661,7 +2689,7 @@ class Dataflow_dumplicate(Dataflow):
 
         if table_name in {"document_tmp","document"}:
 
-            if page_time>=timeAdd(current_date,-7):
+            if page_time>=timeAdd(current_date,-7) and item.get("is_special_bonds")!=1:
                 table_name = "document_tmp"
                 table_index = "document_tmp_index"
                 base_dict = {
@@ -2891,6 +2919,17 @@ class Dataflow_dumplicate(Dataflow):
         confidence=80
         _dict = {doctitle_refine_name:doctitle_refine}
         self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
+        # 专项债
+        if item.get("is_special_bonds")==1:
+            confidence = 90
+            _dict = {doctitle_refine_name: doctitle_refine,
+                     document_tmp_web_source_name:"专项债券信息网"}
+            tmp_base_dict = {
+                "docchannel": item["docchannel"],
+                "status": [201, 450],
+                # "page_time": [timeAdd(page_time, -365), timeAdd(page_time, 365)]
+            }
+            self.appendRule(list_rules, _dict, tmp_base_dict, must_not_dict, confidence, item, b_log=to_log)
 
 
         confidence=70
@@ -2900,7 +2939,7 @@ class Dataflow_dumplicate(Dataflow):
 
         return list_rules,table_name,table_index
 
-    def producer_flow_dumplicate(self,process_count,status_from,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type]):
+    def producer_flow_dumplicate(self,process_count,status_from,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type,"detail_link"]):
         q_size = self.queue_dumplicate.qsize()
         log("dumplicate queue size %d"%(q_size))
 
@@ -4424,7 +4463,7 @@ class Dataflow_dumplicate(Dataflow):
                 singleNum_keys = _rule["singleNum_keys"]
                 contain_keys = _rule["contain_keys"]
                 multiNum_keys = _rule["multiNum_keys"]
-                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle,document_tmp_attachment_path,document_tmp_source_stage,document_tmp_source_type,document_update_document],b_log=b_log)
+                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle,document_tmp_attachment_path,document_tmp_source_stage,document_tmp_source_type,document_update_document,document_tmp_web_source_name,'detail_link'],b_log=b_log)
                 _i += step
 
 
@@ -4874,12 +4913,13 @@ class Dataflow_dumplicate(Dataflow):
 
     def test_dumplicate(self,docid):
         # columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status]
-        columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type]
+        columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type,'detail_link']
+        # print('columns',columns)
         item = self.get_attrs_before_dump(docid,columns)
 
         if item:
             log("start dumplicate_comsumer_handle")
-            self.dumplicate_comsumer_handle(item,None,self.ots_client,get_all=False,upgrade=True)
+            self.dumplicate_comsumer_handle(item,None,self.ots_client,get_all=False,upgrade=False)
             return
 
     def test_merge(self,list_docid_less,list_docid_greater):
@@ -5118,7 +5158,7 @@ if __name__ == '__main__':
     # test_attachment_interface()
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
-    df_dump.test_dumplicate(613075691
+    df_dump.test_dumplicate(400075415256
                             )
     # df_dump.dumplicate_comsumer_handle_interface(603504420,document_table="document_0000",document_table_index="document_0000_index",project_table="project_0000",project_table_index="project_0000_index_formerge")
     # compare_dumplicate_check()

+ 79 - 5
BaseDataMaintenance/maxcompute/documentDumplicate.py

@@ -783,7 +783,7 @@ def check_money(bidding_budget_less,bidding_budget_greater,
                 win_bid_price_less,win_bid_price_greater,
                 moneys_less,moneys_greater,
                 moneys_attachment_less,moneys_attachment_greater):
-
+    # print('bidding_budget_less',bidding_budget_less,'bidding_budget_greater',bidding_budget_greater)
     bidding_budget_less_source = bidding_budget_less
     bidding_budget_greater_source = bidding_budget_greater
     win_bid_price_less_source = win_bid_price_less
@@ -816,9 +816,11 @@ def check_money(bidding_budget_less,bidding_budget_greater,
 
 
         if budget_less!=budget_greater:
-
             if min(budget_less,budget_greater)>0:
-                if max(budget_less,budget_greater)/min(budget_less,budget_greater)==10000:
+                # if max(budget_less,budget_greater)/min(budget_less,budget_greater)==10000:
+                # 金额单位错误,对比时为一万倍,考虑部分小数点后的数,9999<x<10001
+                if (max(budget_less,budget_greater)/min(budget_less,budget_greater)>9999 and max(budget_less,budget_greater)/min(budget_less,budget_greater)<10001)\
+                        or (max(bidding_budget_less_source,bidding_budget_greater_source)/min(bidding_budget_less_source,bidding_budget_greater_source)>9999 and max(bidding_budget_less_source,bidding_budget_greater_source)/min(bidding_budget_less_source,bidding_budget_greater_source)<10001):
                     budget_is_same = True
             if budget_less>10000 and budget_greater>10000 and round(budget_less/10000,2)==round(budget_greater/10000,2):
                 budget_is_same = True
@@ -842,7 +844,9 @@ def check_money(bidding_budget_less,bidding_budget_greater,
         if price_less!=price_greater:
 
             if min(price_less,price_greater)>0:
-                if max(price_less,price_greater)/min(price_less,price_greater)==10000:
+                # if max(price_less,price_greater)/min(price_less,price_greater)==10000:
+                if (max(price_less,price_greater)/min(price_less,price_greater)>9999 and max(price_less,price_greater)/min(price_less,price_greater)<10001)\
+                        or (max(win_bid_price_less_source,win_bid_price_greater_source)/min(win_bid_price_less_source,win_bid_price_greater_source)>9999 and max(win_bid_price_less_source,win_bid_price_greater_source)/min(win_bid_price_less_source,win_bid_price_greater_source)<10001):
                     price_is_same = True
             if price_less>10000 and price_greater>10000 and round(price_less/10000,2)==round(price_greater/10000,2):
                 price_is_same = True
@@ -1241,6 +1245,8 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
     punish_less = document_less.get("punish",{})
     approval_less = document_less.get("approval",[])
     source_type_less = document_less.get("source_type")
+    detail_link_less = document_less.get("detail_link")
+    is_special_bonds_less = document_less.get("is_special_bonds")
 
 
     docid_greater = document_greater["docid"]
@@ -1264,6 +1270,8 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
     province_greater = document_greater.get("province")
     city_greater = document_greater.get("city")
     district_greater = document_greater.get("district")
+    detail_link_greater = document_greater.get("detail_link")
+    is_special_bonds_greater = document_greater.get("is_special_bonds")
 
     moneys_greater = document_greater.get("moneys")
     moneys_attachment_greater = document_greater.get("moneys_attachment")
@@ -1322,7 +1330,6 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
         if b_log:
             logging.info("same web_site,both has attach but not same web_source_no_less:%s,web_source_no_greater:%s"%(web_source_no_less,web_source_no_greater))
         return 0
-
     if isinstance(project_codes_less,str):
         project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
     elif project_codes_less is None:
@@ -1333,6 +1340,73 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
     elif project_codes_greater is None:
         project_codes_greater = []
 
+    # 采购意向去重
+    if docchannel_greater==docchannel_less==114:
+        sign = True
+        demand_info_less = document_less.get("demand_info",[])
+        demand_info_greater = document_greater.get("demand_info",[])
+        # if demand_info_less and not demand_info_greater:
+        #     sign = False
+        # elif not demand_info_less and demand_info_greater:
+        #     sign = False
+        # elif demand_info_less and demand_info_greater:
+        if demand_info_less and demand_info_greater:
+            # 重新确定demand_info的数量排序,按大小排序
+            if len(demand_info_greater)<len(demand_info_less):
+                _demand_info_less = demand_info_greater
+                _demand_info_greater = demand_info_less
+                demand_info_less = _demand_info_less
+                demand_info_greater = _demand_info_greater
+            for item1 in demand_info_less:
+                tmp_project_name_less = re.sub("\s","",item1.get("project_name","").strip())
+                tmp_project_name_less = tmp_project_name_less.replace("(","(").replace(")",")")
+                tmp_budget_less = float(item1.get("budget",0) if item1.get("budget",0) else 0)
+                tmp_order_begin_less = item1.get("order_begin","")
+                tmp_order_end_less = item1.get("order_end", "")
+                get_same = False
+                for item2 in demand_info_greater:
+                    tmp_project_name_greater = re.sub("\s", "", item2.get("project_name", "").strip())
+                    tmp_project_name_greater = tmp_project_name_greater.replace("(", "(").replace(")", ")")
+                    tmp_budget_greater = float(item2.get("budget",0) if item2.get("budget",0) else 0)
+                    tmp_order_begin_greater = item2.get("order_begin", "")
+                    tmp_order_end_greater = item2.get("order_end", "")
+                    # 项目名称相同或包含关系,预算金额对比,预计采购时间开始或结束相等(只对比到月份)
+                    if (tmp_project_name_less==tmp_project_name_greater or
+                        (len(tmp_project_name_less)>0 and len(tmp_project_name_greater)>0 and (tmp_project_name_less.find(tmp_project_name_greater)>=0 or tmp_project_name_greater.find(tmp_project_name_less)>=0))) and \
+                            check_money(tmp_budget_less,tmp_budget_greater,0,0,[],[],[],[]) and \
+                            (tmp_order_begin_less[:7]==tmp_order_begin_greater[:7] or tmp_order_end_less[:7]==tmp_order_end_greater[:7]):
+                        get_same = True
+                        break
+                if not get_same:
+                    sign = False
+                    break
+        if not sign:
+            return 0
+        else:
+            if demand_info_greater and len(demand_info_greater)==len(demand_info_less):# demand_info完全相同
+                return 1
+
+    # 专项债去重
+    if is_special_bonds_greater==is_special_bonds_less==1:
+        detail_link_less = detail_link_less.strip() if detail_link_less else ""
+        detail_link_greater = detail_link_greater.strip() if detail_link_greater else ""
+        if "bondId=" in detail_link_less:
+            bondId_less = detail_link_less.split("bondId=")[1]
+            bondId_less = bondId_less.split(",") if bondId_less else []
+        else:
+            bondId_less = []
+        if "bondId=" in detail_link_greater:
+            bondId_greater = detail_link_greater.split("bondId=")[1]
+            bondId_greater = bondId_greater.split(",") if bondId_greater else []
+        else:
+            bondId_greater = []
+        # print('bondId_less',bondId_less)
+        # print('bondId_greater',bondId_greater)
+        if bondId_less and bondId_greater:
+            bondId_less = set(bondId_less)
+            bondId_greater = set(bondId_greater)
+            if bondId_less.issubset(bondId_greater) or bondId_greater.issubset(bondId_less):
+                return 1
 
     same_count = 0
     all_count = 8