
Modify the data quality check workflow

znj 2 months ago
parent
revision
16185490fb
1 changed file with 17 additions and 47 deletions
      BaseDataMaintenance/maintenance/dataflow.py

+ 17 - 47
BaseDataMaintenance/maintenance/dataflow.py

@@ -4555,14 +4555,9 @@ class Dataflow_dumplicate(Dataflow):
                 for error_type,error_sample in error_rule.items():
                     tmp_data = {
                           "WEB_SOURCE_NO": web_source_no,
-                          "TITLE": "",
-                          "COUNT": 5,
-                          "WEBTYPE": label2channel.get(original_docchannel,""),
-                          "TYPE": error_type,
-                          "FILEMD5": ",".join([str(docid) for docid in error_sample]),
-                          "PUBDATE": "",
-                          "REGDATE": "",
-                          "ENDDATE": ""
+                          "WEBTYPE": label2channel.get(original_docchannel, ""),
+                          "TYPE": error_type,
+                          "ITEMS": error_sample
                         }
                     res_json['data'].append(tmp_data)
                     res_json['count'] += 1
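
This first hunk trims each error record down to WEB_SOURCE_NO, WEBTYPE, TYPE, and a new ITEMS field, dropping the placeholder TITLE/COUNT/FILEMD5/date columns. A minimal sketch of the new record construction, with hypothetical sample values for label2channel, error_rule, web_source_no, and original_docchannel (only the key names and the loop shape come from the diff):

    # Hypothetical inputs; only the key names and loop shape come from the diff.
    label2channel = {101: "招标公告"}              # hypothetical channel-label map
    error_rule = {"标题缺失": [123456, 234567]}    # hypothetical error_type -> sample docids
    web_source_no, original_docchannel = "DX-0001", 101

    res_json = {"data": [], "count": 0}
    for error_type, error_sample in error_rule.items():
        res_json["data"].append({
            "WEB_SOURCE_NO": web_source_no,
            "WEBTYPE": label2channel.get(original_docchannel, ""),
            "TYPE": error_type,
            # error_sample now travels as a structured list instead of a
            # comma-joined FILEMD5 string with a hard-coded COUNT of 5
            "ITEMS": error_sample,
        })
        res_json["count"] += 1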
@@ -4574,18 +4569,14 @@ class Dataflow_dumplicate(Dataflow):
         if down_res:
             df = pd.read_excel(LocalPath)
             tmp_list = []
-            for web_source_no,fingerprint,original_docchannel,doctitle,cnt in zip(df['web_source_no'], df['fingerprint'],
-                                                                      df['original_docchannel'],df['doctitle'],df['cnt']):
+            for web_source_no,fingerprint,original_docchannel,cnt,res in zip(df['web_source_no'], df['fingerprint'],
+                                                                      df['original_docchannel'],df['cnt'],df['res']):
                 tmp_data = {
                     "WEB_SOURCE_NO": web_source_no,
-                    "TITLE": doctitle,
-                    "COUNT": cnt,
                     "WEBTYPE": label2channel.get(original_docchannel, ""),
                     "TYPE": "编号公告重复",
-                    "FILEMD5": fingerprint,
-                    "PUBDATE": "",
-                    "REGDATE": "",
-                    "ENDDATE": ""
+                    "FINGERPRINT": fingerprint,
+                    "ITEMS": json.loads(res)
                 }
                 tmp_list.append(tmp_data)
             tmp_list.sort(key=lambda x: x['WEB_SOURCE_NO'])
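
Here, and in the near-identical hunk for 编号附件重复 that follows, the spreadsheet gains a pre-serialized res column that is decoded per row into ITEMS. A sketch of that round trip, assuming res holds a JSON array (the column names match the zip(...) call in the diff; the row contents are invented):

    import json
    import pandas as pd

    # Stand-in for pd.read_excel(LocalPath); column names match the diff,
    # the values are hypothetical.
    df = pd.DataFrame([{
        "web_source_no": "DX-0001",
        "fingerprint": "a1b2c3d4",
        "original_docchannel": 101,
        "cnt": 3,
        "res": json.dumps([{"docid": 123456, "doctitle": "某项目招标公告"}]),
    }])

    tmp_list = []
    for web_source_no, fingerprint, res in zip(
            df["web_source_no"], df["fingerprint"], df["res"]):
        tmp_list.append({
            "WEB_SOURCE_NO": web_source_no,
            "FINGERPRINT": fingerprint,
            "ITEMS": json.loads(res),  # json.JSONDecodeError if res is malformed
        })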
@@ -4601,18 +4592,14 @@ class Dataflow_dumplicate(Dataflow):
         if down_res:
             df = pd.read_excel(LocalPath)
             tmp_list = []
-            for web_source_no,filemd5,original_docchannel,cnt in zip(df['web_source_no'],df['filemd5'],
-                                                                      df['original_docchannel'],df['cnt']):
+            for web_source_no,filemd5,original_docchannel,cnt,res in zip(df['web_source_no'],df['filemd5'],
+                                                                      df['original_docchannel'],df['cnt'],df['res']):
                 tmp_data = {
                     "WEB_SOURCE_NO": web_source_no,
-                    "TITLE": "",
-                    "COUNT": cnt,
                     "WEBTYPE": label2channel.get(original_docchannel, ""),
                     "TYPE": "编号附件重复",
                     "FILEMD5": filemd5,
-                    "PUBDATE": "",
-                    "REGDATE": "",
-                    "ENDDATE": ""
+                    "ITEMS": json.loads(res)
                 }
                 tmp_list.append(tmp_data)
             tmp_list.sort(key=lambda x: x['WEB_SOURCE_NO'])
@@ -4627,18 +4614,13 @@ class Dataflow_dumplicate(Dataflow):
         down_res = downloadFile(self.bucket, ObjectName, LocalPath, retry=3)
         if down_res:
             df = pd.read_excel(LocalPath)
-            for web_source_no,original_docchannel,error_ratio,error_sample in zip(df['web_source_no'], df['original_docchannel'],
-                                                                        df['error_ratio'], df['error_sample']):
+            for web_source_no,original_docchannel,error_ratio,error_sample,res in zip(df['web_source_no'], df['original_docchannel'],
+                                                                        df['error_ratio'],df['error_sample'],df['res']):
                 tmp_data = {
                     "WEB_SOURCE_NO": web_source_no,
-                    "TITLE": "",
-                    "COUNT": 1,
                     "WEBTYPE": label2channel.get(original_docchannel, ""),
                     "TYPE": "附件识别异常",
-                    "FILEMD5": ",".join(json.loads(error_sample)[:5]),
-                    "PUBDATE": "",
-                    "REGDATE": "",
-                    "ENDDATE": ""
+                    "ITEMS": json.loads(res)
                 }
                 res_json['data'].append(tmp_data)
                 res_json['count'] += 1
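
The attachment-anomaly hunk makes the same swap: instead of truncating error_sample to five MD5s and joining them into one string, the full decoded res is forwarded. A before/after sketch with invented payloads:

    import json

    # Hypothetical column values read from the spreadsheet.
    error_sample = json.dumps(["md5_a", "md5_b", "md5_c", "md5_d", "md5_e", "md5_f"])
    res = json.dumps([{"filemd5": "md5_a", "reason": "parse_failed"}])

    old_filemd5 = ",".join(json.loads(error_sample)[:5])  # lossy: capped at 5, flattened to a string
    new_items = json.loads(res)                           # structured records, no truncation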
@@ -4650,28 +4632,16 @@ class Dataflow_dumplicate(Dataflow):
         if down_res:
             df = pd.read_excel(LocalPath)
             tmp_list = []
-            for docid,doctitle,web_source_no,original_docchannel,page_time,time_bidclose,time_registration_end in zip(df['docid'],
-                                                                                     df['doctitle'],df['web_source_no'],df['original_docchannel'],
-                                                                                     df['page_time'],df['time_bidclose'],df['time_registration_end']):
-                time_registration_end = time_registration_end if str(time_registration_end) and str(time_registration_end) != 'nan' else ""
-                time_bidclose = time_bidclose if str(time_bidclose) and str(time_bidclose) != 'nan' else ""
+            for web_source_no,original_docchannel,res in zip(df['web_source_no'],df['original_docchannel'],df['res']):
                 tmp_data = {
                     "WEB_SOURCE_NO": web_source_no,
-                    "TITLE": doctitle,
-                    "COUNT": 1,
                     "WEBTYPE": label2channel.get(original_docchannel, ""),
                     "TYPE": "截止日期在发布日期之前",
-                    "FILEMD5": str(docid),
-                    "PUBDATE": page_time[:10],
-                    "REGDATE": time_registration_end[:10],
-                    "ENDDATE": time_bidclose[:10]
+                    "ITEMS": json.loads(res)
                 }
                 tmp_list.append(tmp_data)
-            tmp_list.sort(key=lambda x: x['WEB_SOURCE_NO'])
-            for key, group in groupby(tmp_list, lambda x: (x['WEB_SOURCE_NO'])):
-                group = list(group)[:5]
-                res_json['data'].extend(group)
-                res_json['count'] += len(group)
+            res_json['data'].extend(tmp_list)
+            res_json['count'] += len(tmp_list)
             os.remove(LocalPath)
 
         # url = "http://120.132.118.205:17090/saveQualityListData"
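
The last hunk also deletes the per-source sampling: previously rows were sorted, grouped by WEB_SOURCE_NO with itertools.groupby, and capped at five per source; now tmp_list is forwarded whole. A sketch of the dropped pattern for contrast (the rows are hypothetical):

    from itertools import groupby

    tmp_list = [
        {"WEB_SOURCE_NO": "DX-0001", "TYPE": "截止日期在发布日期之前"},
        {"WEB_SOURCE_NO": "DX-0001", "TYPE": "截止日期在发布日期之前"},
        {"WEB_SOURCE_NO": "DX-0002", "TYPE": "截止日期在发布日期之前"},
    ]

    # Old behaviour: groupby only merges adjacent keys, hence the sort,
    # and each source contributed at most 5 rows.
    tmp_list.sort(key=lambda x: x["WEB_SOURCE_NO"])
    sampled = [row for _, grp in groupby(tmp_list, key=lambda x: x["WEB_SOURCE_NO"])
               for row in list(grp)[:5]]

    # New behaviour: every row is kept.
    forwarded = list(tmp_list)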