Explorar o código

md5计算有误的问题

luojiehua hai 1 ano
pai
achega
9ba50c02d3

+ 3 - 3
BaseDataMaintenance/common/documentFingerprint.py

@@ -13,9 +13,9 @@ def getHtmlText(sourceHtml):
         _href = _a.attrs.get("href","")
         if _href.find("www.bidizhaobiao.com")>0:
             _a.decompose()
-    richText = _soup.find("div",attrs={"class":"richTextFetch"})
-    if richText is not None:
-        richText.decompose()
+    # richText = _soup.find("div",attrs={"class":"richTextFetch"})
+    # if richText is not None:
+    #     richText.decompose()
     _text = _soup.get_text()
 
     _text = re.sub("\s*",'',_text)

+ 7 - 7
BaseDataMaintenance/maxcompute/documentDumplicate.py

@@ -479,7 +479,7 @@ class f_set_docid(BaseUDAF):
             defind_count = list_docs[0]["defind_count"]
         print(defind_count)
         for i in range(len(list_docs)-1):
-            if abs(list_docs[i]["page_time_stamp"]-list_docs[i+1]["page_time_stamp"])<=86400*2:
+            if abs(list_docs[i]["page_time_stamp"]-list_docs[i+1]["page_time_stamp"])<=86400*7:
                 continue
             else:
                 _group = []
@@ -590,10 +590,10 @@ class f_group_fingerprint(BaseUDAF):
         buffer[0].append(docid)
 
     def merge(self, buffer, pbuffer):
-        buffer[0].extend(pbuffer[0])
+        buffer[0].extend(pbuffer[0][:100000])
 
     def terminate(self, buffer):
-        list_docid = buffer[0]
+        list_docid = buffer[0][:100000]
         list_docid.sort(key=lambda x:x)
         return ",".join([str(a) for a in list_docid])
 
@@ -635,7 +635,7 @@ class f_dump_probability(BaseUDAF):
             list_data.append(_dict)
             if len(list_data)>10000:
                 break
-        list_group = split_with_time(list_data,sort_key="page_time_stamp",timedelta=86400*2)
+        list_group = split_with_time(list_data,sort_key="page_time_stamp",timedelta=86400*7)
         return json.dumps(list_group)
 
 
@@ -1958,7 +1958,7 @@ class f_set_docid_binaryChart(BaseUDAF):
 
     def terminate(self, buffer):
         list_docs = buffer[0]
-        list_timeGroups = split_with_time(list_docs,"page_time_stamp",86400*2)
+        list_timeGroups = split_with_time(list_docs,"page_time_stamp",86400*7)
 
         list_group = []
 
@@ -2001,7 +2001,7 @@ class f_set_docid_binaryChart(BaseUDAF):
 
 
 
-def split_with_time(list_dict,sort_key,timedelta=86400*2):
+def split_with_time(list_dict,sort_key,timedelta=86400*7):
     if len(list_dict)>0:
         if sort_key in list_dict[0]:
             list_dict.sort(key=lambda x:x[sort_key])
@@ -2116,7 +2116,7 @@ class f_stamp_squence(BaseUDAF):
         list_stamp.sort(key=lambda x:x)
         list_stamp_final = []
         _begin = 0
-        _time_decase = 86400*2
+        _time_decase = 86400*7
         logging.info(str(list_stamp))
         for _index in range(len(list_stamp)-1):
             if list_stamp[_index+1]-list_stamp[_index]<_time_decase:

+ 6 - 0
BaseDataMaintenance/model/oracle/T_SHEN_PI_XIANG_MU.py

@@ -54,8 +54,11 @@ class T_SHEN_PI_XIANG_MU(BaseModel):
 
         new_dict["partition_key"] = partition_key
         new_dict["docid"] = docid
+        new_dict["original_id"] = str(new_dict.get(T_SHEN_PI_XIANG_MU_ID))
         new_dict.pop(T_SHEN_PI_XIANG_MU_ID)
 
+        new_dict["uuid"] = str(new_dict.get(T_SHEN_PI_XIANG_MU_ID))
+
         new_dict["crtime"] = new_dict.get(T_SHEN_PI_XIANG_MU_CREATE_TIME)
         new_dict["docchannel"] = 302
 
@@ -65,6 +68,9 @@ class T_SHEN_PI_XIANG_MU(BaseModel):
         new_dict["dochtmlcon"] = new_dict.get(T_SHEN_PI_XIANG_MU_PAGE_CONTENT)
         new_dict.pop(T_SHEN_PI_XIANG_MU_PAGE_CONTENT)
 
+        new_dict["detail_link"] = new_dict.get(T_SHEN_PI_XIANG_MU_DETAILLINK)
+        new_dict.pop(T_SHEN_PI_XIANG_MU_DETAILLINK)
+
         new_dict[T_SHEN_PI_XIANG_MU_PAGE_ATTACHMENTS] = new_dict.get(T_SHEN_PI_XIANG_MU_ATTACHMENT_PATH,"[]")
 
         opertime = getCurrent_date(format="%Y-%m-%d %H:%M:%S")