|
@@ -4734,14 +4734,9 @@ class Dataflow_dumplicate(Dataflow):
|
|
for error_type,error_sample in error_rule.items():
|
|
for error_type,error_sample in error_rule.items():
|
|
tmp_data = {
|
|
tmp_data = {
|
|
"WEB_SOURCE_NO": web_source_no,
|
|
"WEB_SOURCE_NO": web_source_no,
|
|
- "TITLE": "",
|
|
|
|
- "COUNT": 5,
|
|
|
|
- "WEBTYPE": label2channel.get(original_docchannel,""),
|
|
|
|
- "TYPE": error_type,
|
|
|
|
- "FILEMD5": ",".join([str(docid) for docid in error_sample]),
|
|
|
|
- "PUBDATE": "",
|
|
|
|
- "REGDATE": "",
|
|
|
|
- "ENDDATE": ""
|
|
|
|
|
|
+ "WEBTYPE": label2channel.get(original_docchannel, ""),
|
|
|
|
+ "TYPE": error_type,
|
|
|
|
+ "ITEMS": error_sample
|
|
}
|
|
}
|
|
res_json['data'].append(tmp_data)
|
|
res_json['data'].append(tmp_data)
|
|
res_json['count'] += 1
|
|
res_json['count'] += 1
|
|
@@ -4753,18 +4748,14 @@ class Dataflow_dumplicate(Dataflow):
|
|
if down_res:
|
|
if down_res:
|
|
df = pd.read_excel(LocalPath)
|
|
df = pd.read_excel(LocalPath)
|
|
tmp_list = []
|
|
tmp_list = []
|
|
- for web_source_no,fingerprint,original_docchannel,doctitle,cnt in zip(df['web_source_no'], df['fingerprint'],
|
|
|
|
- df['original_docchannel'],df['doctitle'],df['cnt']):
|
|
|
|
|
|
+ for web_source_no,fingerprint,original_docchannel,cnt,res in zip(df['web_source_no'], df['fingerprint'],
|
|
|
|
+ df['original_docchannel'],df['cnt'],df['res']):
|
|
tmp_data = {
|
|
tmp_data = {
|
|
"WEB_SOURCE_NO": web_source_no,
|
|
"WEB_SOURCE_NO": web_source_no,
|
|
- "TITLE": doctitle,
|
|
|
|
- "COUNT": cnt,
|
|
|
|
"WEBTYPE": label2channel.get(original_docchannel, ""),
|
|
"WEBTYPE": label2channel.get(original_docchannel, ""),
|
|
"TYPE": "编号公告重复",
|
|
"TYPE": "编号公告重复",
|
|
- "FILEMD5": fingerprint,
|
|
|
|
- "PUBDATE": "",
|
|
|
|
- "REGDATE": "",
|
|
|
|
- "ENDDATE": ""
|
|
|
|
|
|
+ "FINGERPRINT": fingerprint,
|
|
|
|
+ "ITEMS": json.loads(res)
|
|
}
|
|
}
|
|
tmp_list.append(tmp_data)
|
|
tmp_list.append(tmp_data)
|
|
tmp_list.sort(key=lambda x: x['WEB_SOURCE_NO'])
|
|
tmp_list.sort(key=lambda x: x['WEB_SOURCE_NO'])
|
|
@@ -4780,18 +4771,14 @@ class Dataflow_dumplicate(Dataflow):
|
|
if down_res:
|
|
if down_res:
|
|
df = pd.read_excel(LocalPath)
|
|
df = pd.read_excel(LocalPath)
|
|
tmp_list = []
|
|
tmp_list = []
|
|
- for web_source_no,filemd5,original_docchannel,cnt in zip(df['web_source_no'],df['filemd5'],
|
|
|
|
- df['original_docchannel'],df['cnt']):
|
|
|
|
|
|
+ for web_source_no,filemd5,original_docchannel,cnt,res in zip(df['web_source_no'],df['filemd5'],
|
|
|
|
+ df['original_docchannel'],df['cnt'],df['res']):
|
|
tmp_data = {
|
|
tmp_data = {
|
|
"WEB_SOURCE_NO": web_source_no,
|
|
"WEB_SOURCE_NO": web_source_no,
|
|
- "TITLE": "",
|
|
|
|
- "COUNT": cnt,
|
|
|
|
"WEBTYPE": label2channel.get(original_docchannel, ""),
|
|
"WEBTYPE": label2channel.get(original_docchannel, ""),
|
|
"TYPE": "编号附件重复",
|
|
"TYPE": "编号附件重复",
|
|
"FILEMD5": filemd5,
|
|
"FILEMD5": filemd5,
|
|
- "PUBDATE": "",
|
|
|
|
- "REGDATE": "",
|
|
|
|
- "ENDDATE": ""
|
|
|
|
|
|
+ "ITEMS": json.loads(res)
|
|
}
|
|
}
|
|
tmp_list.append(tmp_data)
|
|
tmp_list.append(tmp_data)
|
|
tmp_list.sort(key=lambda x: x['WEB_SOURCE_NO'])
|
|
tmp_list.sort(key=lambda x: x['WEB_SOURCE_NO'])
|
|
@@ -4806,18 +4793,13 @@ class Dataflow_dumplicate(Dataflow):
|
|
down_res = downloadFile(self.bucket, ObjectName, LocalPath, retry=3)
|
|
down_res = downloadFile(self.bucket, ObjectName, LocalPath, retry=3)
|
|
if down_res:
|
|
if down_res:
|
|
df = pd.read_excel(LocalPath)
|
|
df = pd.read_excel(LocalPath)
|
|
- for web_source_no,original_docchannel,error_ratio,error_sample in zip(df['web_source_no'], df['original_docchannel'],
|
|
|
|
- df['error_ratio'], df['error_sample']):
|
|
|
|
|
|
+ for web_source_no,original_docchannel,error_ratio,error_sample,res in zip(df['web_source_no'], df['original_docchannel'],
|
|
|
|
+ df['error_ratio'],df['error_sample'],df['res']):
|
|
tmp_data = {
|
|
tmp_data = {
|
|
"WEB_SOURCE_NO": web_source_no,
|
|
"WEB_SOURCE_NO": web_source_no,
|
|
- "TITLE": "",
|
|
|
|
- "COUNT": 1,
|
|
|
|
"WEBTYPE": label2channel.get(original_docchannel, ""),
|
|
"WEBTYPE": label2channel.get(original_docchannel, ""),
|
|
"TYPE": "附件识别异常",
|
|
"TYPE": "附件识别异常",
|
|
- "FILEMD5": ",".join(json.loads(error_sample)[:5]),
|
|
|
|
- "PUBDATE": "",
|
|
|
|
- "REGDATE": "",
|
|
|
|
- "ENDDATE": ""
|
|
|
|
|
|
+ "ITEMS": json.loads(res)
|
|
}
|
|
}
|
|
res_json['data'].append(tmp_data)
|
|
res_json['data'].append(tmp_data)
|
|
res_json['count'] += 1
|
|
res_json['count'] += 1
|
|
@@ -4829,28 +4811,16 @@ class Dataflow_dumplicate(Dataflow):
|
|
if down_res:
|
|
if down_res:
|
|
df = pd.read_excel(LocalPath)
|
|
df = pd.read_excel(LocalPath)
|
|
tmp_list = []
|
|
tmp_list = []
|
|
- for docid,doctitle,web_source_no,original_docchannel,page_time,time_bidclose,time_registration_end in zip(df['docid'],
|
|
|
|
- df['doctitle'],df['web_source_no'],df['original_docchannel'],
|
|
|
|
- df['page_time'],df['time_bidclose'],df['time_registration_end']):
|
|
|
|
- time_registration_end = time_registration_end if str(time_registration_end) and str(time_registration_end) != 'nan' else ""
|
|
|
|
- time_bidclose = time_bidclose if str(time_bidclose) and str(time_bidclose) != 'nan' else ""
|
|
|
|
|
|
+ for web_source_no,original_docchannel,res in zip(df['web_source_no'],df['original_docchannel'],df['res']):
|
|
tmp_data = {
|
|
tmp_data = {
|
|
"WEB_SOURCE_NO": web_source_no,
|
|
"WEB_SOURCE_NO": web_source_no,
|
|
- "TITLE": doctitle,
|
|
|
|
- "COUNT": 1,
|
|
|
|
"WEBTYPE": label2channel.get(original_docchannel, ""),
|
|
"WEBTYPE": label2channel.get(original_docchannel, ""),
|
|
"TYPE": "截止日期在发布日期之前",
|
|
"TYPE": "截止日期在发布日期之前",
|
|
- "FILEMD5": str(docid),
|
|
|
|
- "PUBDATE": page_time[:10],
|
|
|
|
- "REGDATE": time_registration_end[:10],
|
|
|
|
- "ENDDATE": time_bidclose[:10]
|
|
|
|
|
|
+ "ITEMS": json.loads(res)
|
|
}
|
|
}
|
|
tmp_list.append(tmp_data)
|
|
tmp_list.append(tmp_data)
|
|
- tmp_list.sort(key=lambda x: x['WEB_SOURCE_NO'])
|
|
|
|
- for key, group in groupby(tmp_list, lambda x: (x['WEB_SOURCE_NO'])):
|
|
|
|
- group = list(group)[:5]
|
|
|
|
- res_json['data'].extend(group)
|
|
|
|
- res_json['count'] += len(group)
|
|
|
|
|
|
+ res_json['data'].extend(tmp_list)
|
|
|
|
+ res_json['count'] += len(tmp_list)
|
|
os.remove(LocalPath)
|
|
os.remove(LocalPath)
|
|
|
|
|
|
# url = "http://120.132.118.205:17090/saveQualityListData"
|
|
# url = "http://120.132.118.205:17090/saveQualityListData"
|