"""Compute an MD5-based fingerprint of the visible text of an HTML document."""
import codecs
import hashlib
import re
import sys

from bs4 import BeautifulSoup

# Runs of whitespace. Compiled once and written as a raw string so that "\s"
# is a regex escape rather than an (invalid) Python string escape.
_WHITESPACE_RE = re.compile(r"\s+")


def getHtmlText(sourceHtml):
    """Return the text content of *sourceHtml* with all whitespace removed.

    The markup is parsed with BeautifulSoup using the "lxml" parser and the
    text is taken from ``get_text()``. If nothing is left once whitespace has
    been stripped, *sourceHtml* itself is returned unchanged.
    """
    text = BeautifulSoup(sourceHtml, "lxml").get_text()
    text = _WHITESPACE_RE.sub("", text)
    if not text:
        # No text could be extracted: fall back to the raw input.
        return sourceHtml
    return text


def getMD5(sourceHtml):
    """Return the hex MD5 digest of the whitespace-free text of *sourceHtml*.

    Returns an empty string when *sourceHtml* is ``None`` or empty, or when
    the value produced by ``getHtmlText`` is neither ``str`` nor ``bytes``.
    ``str`` values are encoded with ``str.encode()`` defaults before hashing.
    """
    if sourceHtml is None or len(sourceHtml) == 0:
        return ""

    text = getHtmlText(sourceHtml)
    if isinstance(text, str):
        payload = text.encode()
    elif isinstance(text, bytes):
        # getHtmlText may hand back the raw input, which can be bytes.
        payload = text
    else:
        return ""

    return hashlib.md5(payload).hexdigest()


def getFingerprint(sourceHtml):
    """Return ``"md5=<digest>"`` for *sourceHtml*, or ``""`` if no digest exists."""
    digest = getMD5(sourceHtml)
    if digest == "":
        return ""
    return "md5=%s" % digest


if __name__ == "__main__":
    # The original sample path stays the default so that running the script
    # without arguments behaves as before; a path may be given as argv[1].
    if len(sys.argv) > 1:
        htmlPath = sys.argv[1]
    else:
        htmlPath = "C:\\Users\\User\\Desktop\\2.html"

    # Context manager guarantees the file handle is closed after reading.
    with codecs.open(htmlPath, "rb", encoding="utf8") as htmlFile:
        sourceHtml = htmlFile.read()

    print(getFingerprint(sourceHtml))