12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152 |
- import hashlib
- import codecs
- from bs4 import BeautifulSoup
- import re
def getHtmlText(sourceHtml):
    """Return the text of an HTML document with all whitespace removed.

    The markup is parsed with BeautifulSoup using the ``lxml`` parser.
    Every ``<a>`` element whose ``href`` contains ``www.bidizhaobiao.com``
    is removed from the tree before the text is extracted.

    Args:
        sourceHtml: HTML markup to parse (passed straight to BeautifulSoup).

    Returns:
        str: The document text with every whitespace character stripped.
        If that text is empty, the serialized (already filtered) document
        is returned instead.
    """
    _soup = BeautifulSoup(sourceHtml, "lxml")
    for _a in _soup.find_all("a"):
        # Substring test rather than ``find(...) > 0``: ``find`` returns 0
        # when the href *starts* with the host, which the old comparison
        # treated as "not found" and therefore left the link in place.
        if "www.bidizhaobiao.com" in _a.attrs.get("href", ""):
            _a.decompose()
    # NOTE: removing the ``div.richTextFetch`` container was previously
    # tried at this point and is currently disabled.
    # Raw string avoids the invalid "\s" escape; "+" removes the same
    # characters as the former "*" without matching empty strings.
    _text = re.sub(r"\s+", "", _soup.get_text())
    if len(_text) == 0:
        # No visible text at all: fall back to the serialized markup so the
        # caller still gets something document-specific.
        _text = str(_soup)
    return _text
def getMD5(sourceHtml):
    """Return the hex MD5 digest of the text extracted from ``sourceHtml``.

    Args:
        sourceHtml: HTML markup; ``None`` or an empty value yields ``""``.

    Returns:
        str: The 32-character hexadecimal digest of the value returned by
        ``getHtmlText``, or ``""`` when the input is ``None``/empty or the
        extracted value is neither ``str`` nor ``bytes``.
    """
    # Guard clause: nothing to hash.
    if sourceHtml is None or len(sourceHtml) == 0:
        return ""

    extracted = getHtmlText(sourceHtml)
    if isinstance(extracted, bytes):
        payload = extracted
    elif isinstance(extracted, str):
        # str.encode() with no arguments uses UTF-8.
        payload = extracted.encode()
    else:
        return ""

    return hashlib.md5(payload).hexdigest()
def getFingerprint(sourceHtml):
    """Build the fingerprint string for ``sourceHtml``.

    Args:
        sourceHtml: HTML markup forwarded to ``getMD5``.

    Returns:
        str: ``"md5=<digest>"`` when ``getMD5`` returns a non-empty digest,
        otherwise ``""``.
    """
    digest = getMD5(sourceHtml)
    return "md5=%s" % (digest) if digest != "" else ""
if __name__ == "__main__":
    # Manual smoke test: fingerprint a local HTML file and print the result.
    # For a quick check without a file, assign any HTML string to
    # ``sourceHtml`` instead.
    # Raw string keeps the Windows path free of invalid escape sequences
    # (the previous literal contained "\A" and a doubled separator).
    # The ``with`` block closes the file handle once it has been read.
    with codecs.open(r"C:\Users\Administrator\Desktop\2.html", "rb", encoding="utf8") as _fh:
        sourceHtml = _fh.read()
    print(getFingerprint(sourceHtml))
|