documentFingerprint.py 1.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152
  1. import hashlib
  2. import codecs
  3. from bs4 import BeautifulSoup
  4. import re
  5. def getHtmlText(sourceHtml):
  6. _soup = BeautifulSoup(sourceHtml,"lxml")
  7. list_a = _soup.find_all("a")
  8. for _a in list_a:
  9. _href = _a.attrs.get("href","")
  10. if _href.find("www.bidizhaobiao.com")>0:
  11. _a.decompose()
  12. # richText = _soup.find("div",attrs={"class":"richTextFetch"})
  13. # if richText is not None:
  14. # richText.decompose()
  15. _text = _soup.get_text()
  16. _text = re.sub("\s*",'',_text)
  17. if len(_text)==0:
  18. _text = str(_soup)
  19. return _text
  20. def getMD5(sourceHtml):
  21. if sourceHtml is not None and len(sourceHtml)>0:
  22. _text = getHtmlText(sourceHtml)
  23. if isinstance(_text,str):
  24. bs = _text.encode()
  25. elif isinstance(_text,bytes):
  26. bs = _text
  27. else:
  28. return ""
  29. md5 = hashlib.md5()
  30. md5.update(bs)
  31. return md5.hexdigest()
  32. return ""
  33. def getFingerprint(sourceHtml):
  34. md5 = getMD5(sourceHtml)
  35. if md5!="":
  36. _fingerprint = "md5=%s"%(md5)
  37. else:
  38. _fingerprint = ""
  39. return _fingerprint
  40. if __name__=="__main__":
  41. sourceHtml = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","rb",encoding="utf8").read()
  42. # sourceHtml = "abcddafafffffffffffffffffffffffff你"
  43. print(getFingerprint(sourceHtml))