|
@@ -0,0 +1,52 @@
|
|
|
+
|
|
|
+
|
|
|
+import hashlib
|
|
|
+import codecs
|
|
|
+from bs4 import BeautifulSoup
|
|
|
+import re
|
|
|
+
|
|
|
+
|
|
|
def getHtmlText(sourceHtml):
    """Return the whitespace-free text content of an HTML document.

    The markup is parsed with BeautifulSoup using the ``lxml`` parser. Before
    the text is extracted, two kinds of elements are removed from the tree:

    * every ``<a>`` whose ``href`` contains ``www.bidizhaobiao.com``;
    * the first ``<div>`` carrying the class ``richTextFetch``, if present.

    All whitespace is then stripped from the extracted text.

    :param sourceHtml: HTML markup to parse.
    :return: the extracted text with all whitespace removed; if that text is
        empty, the string form of the (already cleaned) parse tree instead.
    """
    soup = BeautifulSoup(sourceHtml, "lxml")

    for anchor in soup.find_all("a"):
        href = anchor.attrs.get("href", "")
        # Substring test instead of ``find(...) > 0``: the old comparison
        # skipped hrefs that *start* with the host name (match at index 0).
        if "www.bidizhaobiao.com" in href:
            anchor.decompose()

    rich_text = soup.find("div", attrs={"class": "richTextFetch"})
    if rich_text is not None:
        rich_text.decompose()

    # Raw string avoids the invalid "\s" escape; removing ``\s+`` runs yields
    # the same result as the former ``\s*`` pattern.
    text = re.sub(r"\s+", "", soup.get_text())
    if not text:
        # No textual content at all: fall back to the serialized markup.
        text = str(soup)
    return text
|
|
|
+
|
|
|
def getMD5(sourceHtml):
    """Return the hex MD5 digest of the text extracted from ``sourceHtml``.

    :param sourceHtml: HTML markup; ``None`` or a zero-length value yields "".
    :return: the hexadecimal MD5 digest of ``getHtmlText(sourceHtml)``, or an
        empty string when the input is empty or the extracted value is
        neither ``str`` nor ``bytes``.
    """
    # Guard clause: nothing to hash for missing or zero-length input.
    if sourceHtml is None or len(sourceHtml) == 0:
        return ""

    content = getHtmlText(sourceHtml)
    if isinstance(content, str):
        payload = content.encode()
    elif isinstance(content, bytes):
        payload = content
    else:
        return ""

    return hashlib.md5(payload).hexdigest()
|
|
|
+
|
|
|
+
|
|
|
def getFingerprint(sourceHtml):
    """Build the fingerprint string for ``sourceHtml``.

    :param sourceHtml: HTML markup handed on to ``getMD5``.
    :return: ``"md5=<digest>"`` when ``getMD5`` returns a non-empty digest,
        otherwise an empty string.
    """
    digest = getMD5(sourceHtml)
    return "md5=%s" % (digest) if digest != "" else ""
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Manual smoke test: fingerprint a local HTML file and print the result.
    # The path evaluates to the same string as the original literal (including
    # the doubled separator after "Users"), but is now written with valid
    # escapes only; the previous spelling relied on the invalid escape "\A".
    _path = "C:\\Users\\\\Administrator\\Desktop\\2.html"
    # Context manager so the file handle is closed once the content is read.
    with codecs.open(_path, "rb", encoding="utf8") as _fh:
        sourceHtml = _fh.read()
    # sourceHtml = "abcddafafffffffffffffffffffffffff你"
    print(getFingerprint(sourceHtml))
|