|
@@ -106,7 +106,7 @@ def split_with_time(list_dict,sort_key,timedelta=86400*120):
|
|
|
_group = []
|
|
|
for j in range(_begin,len(list_dict)):
|
|
|
_group.append(list_dict[j])
|
|
|
- if len(_group)>1:
|
|
|
+ if len(_group)>0:
|
|
|
list_group.append(_group)
|
|
|
return list_group
|
|
|
return [list_dict]
|
|
@@ -531,18 +531,28 @@ class f_remege_limit_num_contain_bychannel(BaseUDAF):
|
|
|
_channel = otherChannel
|
|
|
if _channel not in dict_channel_id:
|
|
|
dict_channel_id[_channel] = []
|
|
|
- dict_channel_id[_channel].append([_docid,dict_docid_doc[_docid].get("page_time_stamp"),dict_docid_doc[_docid].get("extract_count")])
|
|
|
- channel_dict = {}
|
|
|
+ dict_channel_id[_channel].append({"docid":_docid,"page_time_stamp":dict_docid_doc[_docid].get("page_time_stamp"),"extract_count":dict_docid_doc[_docid].get("extract_count")})
|
|
|
+
|
|
|
+ #根据日期进行切分
|
|
|
+ new_dict_channel_id = {}
|
|
|
+ print(dict_channel_id)
|
|
|
for k,v in dict_channel_id.items():
|
|
|
- v.sort(key=lambda x:x[1])
|
|
|
- v.sort(key=lambda x:x[2],reverse=True)
|
|
|
- channel_dict[v[0][0]] = []
|
|
|
+ list_time_docids = split_with_time(v,"page_time_stamp",86400*5)
|
|
|
+ print(list_time_docids)
|
|
|
+ for _l in list_time_docids:
|
|
|
+ otherChannel += 1
|
|
|
+ new_dict_channel_id[otherChannel] = _l
|
|
|
+ print(new_dict_channel_id)
|
|
|
+ channel_dict = {}
|
|
|
+ for k,v in new_dict_channel_id.items():
|
|
|
+ v.sort(key=lambda x:x["page_time_stamp"])
|
|
|
+ v.sort(key=lambda x:x["extract_count"],reverse=True)
|
|
|
+ channel_dict[v[0]["docid"]] = []
|
|
|
for _docs in v[1:]:
|
|
|
- channel_dict[v[0][0]].append(_docs[0])
|
|
|
+ channel_dict[v[0]["docid"]].append(_docs["docid"])
|
|
|
_d = {"data":channel_dict,"process_time":getCurrent_date()}
|
|
|
final_group_channel.append(_d)
|
|
|
|
|
|
-
|
|
|
return json.dumps(final_group_channel)
|
|
|
|
|
|
@annotate('string -> string')
|
|
@@ -1153,3 +1163,13 @@ class f_get_merge_docids(BaseUDAF):
|
|
|
for _docid in list_docid:
|
|
|
list_docid_str.append(str(_docid))
|
|
|
return ",".join(list_docid_str)
|
|
|
+
|
|
|
+
|
|
|
+if __name__ == '__main__':
|
|
|
+ a = f_remege_limit_num_contain_bychannel()
|
|
|
+ buffer = a.new_buffer()
|
|
|
+ a.iterate(buffer,1,1,86400*1,"1","1","1","1","1","1","1",5,5)
|
|
|
+ a.iterate(buffer,3,1,86400*4,"1","1","1","1","1","1","1",5,5)
|
|
|
+ a.iterate(buffer,5,1,86400*10,"1","1","1","1","1","1","1",5,5)
|
|
|
+ print(a.terminate(buffer))
|
|
|
+ print(1)
|