# documentFingerprint.py
# Builds an MD5-based fingerprint from the whitespace-stripped text of an HTML document.
  1. import hashlib
  2. import codecs
  3. from bs4 import BeautifulSoup
  4. import re
  5. def getHtmlText(sourceHtml):
  6. _text = BeautifulSoup(sourceHtml,"lxml").get_text()
  7. _text = re.sub("\s*",'',_text)
  8. if len(_text)==0:
  9. _text = sourceHtml
  10. return _text
  11. def getMD5(sourceHtml):
  12. if sourceHtml is not None and len(sourceHtml)>0:
  13. _text = getHtmlText(sourceHtml)
  14. if isinstance(_text,str):
  15. bs = _text.encode()
  16. elif isinstance(_text,bytes):
  17. bs = _text
  18. else:
  19. return ""
  20. md5 = hashlib.md5()
  21. md5.update(bs)
  22. return md5.hexdigest()
  23. return ""
  24. def getFingerprint(sourceHtml):
  25. md5 = getMD5(sourceHtml)
  26. if md5!="":
  27. _fingerprint = "md5=%s"%(md5)
  28. else:
  29. _fingerprint = ""
  30. return _fingerprint
  31. if __name__=="__main__":
  32. sourceHtml = codecs.open("C:\\Users\\User\\Desktop\\2.html","rb",encoding="utf8").read()
  33. # sourceHtml = "abcddafafffffffffffffffffffffffff你"
  34. print(getFingerprint(sourceHtml))