"""Compute a content fingerprint (MD5 of the visible text) for an HTML document."""

import codecs
import hashlib
import re

from bs4 import BeautifulSoup

# Anchors whose href contains this host are removed before text extraction.
_EXCLUDED_LINK_HOST = "www.bidizhaobiao.com"

# Raw string: the original "\s*" relied on an invalid escape sequence. "\s+"
# strips exactly the same characters without also matching empty strings.
_WHITESPACE_RE = re.compile(r"\s+")


def getHtmlText(sourceHtml):
    """Return the whitespace-free text content of *sourceHtml*.

    The markup is parsed with BeautifulSoup using the ``lxml`` parser. Every
    ``<a>`` element whose ``href`` contains ``www.bidizhaobiao.com`` is removed
    from the tree, then all text is extracted and every whitespace character
    is stripped out.

    Args:
        sourceHtml: HTML markup to parse.

    Returns:
        The extracted text with all whitespace removed. If that text is empty,
        the serialized parse tree (after link removal) is returned instead.
    """
    soup = BeautifulSoup(sourceHtml, "lxml")

    for anchor in soup.find_all("a"):
        # Substring test instead of ``find(...) > 0``: the latter skipped
        # hrefs that *start* with the host (match at index 0).
        if _EXCLUDED_LINK_HOST in anchor.attrs.get("href", ""):
            anchor.decompose()

    # NOTE: removal of the <div class="richTextFetch"> element was commented
    # out in the original code and remains disabled here.

    text = _WHITESPACE_RE.sub("", soup.get_text())
    if not text:
        # No visible text at all: fall back to the markup itself.
        text = str(soup)
    return text


def getMD5(sourceHtml):
    """Return the hex MD5 digest of the text extracted from *sourceHtml*.

    Args:
        sourceHtml: HTML markup, or ``None``.

    Returns:
        The 32-character hexadecimal digest of the UTF-8 encoded result of
        ``getHtmlText(sourceHtml)``, or ``""`` when *sourceHtml* is ``None``
        or empty.
    """
    if not sourceHtml:
        return ""
    # getHtmlText always returns ``str``, so a single explicit UTF-8 encode
    # replaces the original str/bytes/other type dispatch.
    text = getHtmlText(sourceHtml)
    return hashlib.md5(text.encode("utf-8")).hexdigest()


def getFingerprint(sourceHtml):
    """Return the fingerprint string ``"md5=<digest>"`` for *sourceHtml*.

    Returns:
        ``"md5=<hex digest>"``, or ``""`` when no digest could be computed
        (``getMD5`` returned an empty string).
    """
    digest = getMD5(sourceHtml)
    return f"md5={digest}" if digest else ""


if __name__ == "__main__":
    # Same runtime path as the original literal, written with valid escapes.
    sample_path = "C:\\Users\\\\Administrator\\Desktop\\2.html"
    # Context manager closes the file handle the original left open.
    with codecs.open(sample_path, "rb", encoding="utf8") as sample_file:
        sourceHtml = sample_file.read()
    print(getFingerprint(sourceHtml))