# Source repository: luojiehua / BIDI_ML_INFO_EXTRACTION

import hashlib
import codecs
from bs4 import BeautifulSoup
import re


def getHtmlText(sourceHtml):
    """Return the text content of an HTML document with all whitespace removed.

    Every ``<a>`` element whose ``href`` contains ``www.bidizhaobiao.com`` is
    removed from the parsed tree before the text is extracted.

    Args:
        sourceHtml: HTML markup, handed to BeautifulSoup with the ``lxml``
            parser.

    Returns:
        str: The document text with every whitespace character stripped. If
        that leaves an empty string, the serialized parse tree (after the
        anchor removal) is returned instead.
    """
    soup = BeautifulSoup(sourceHtml, "lxml")
    for anchor in soup.find_all("a"):
        href = anchor.attrs.get("href", "")
        # Substring test instead of ``find(...) > 0``: the old comparison
        # skipped links whose href *starts* with the host (match at index 0).
        if "www.bidizhaobiao.com" in href:
            anchor.decompose()
    # NOTE: removal of the ``div.richTextFetch`` element used to be sketched
    # here as commented-out code; it was disabled and remains disabled.

    # Raw string avoids the invalid-escape warning that "\s" triggers in a
    # normal literal; ``\s+`` removes the same characters as ``\s*`` did.
    text = re.sub(r"\s+", "", soup.get_text())
    if not text:
        # No visible text at all: fall back to the markup itself.
        text = str(soup)
    return text

def getMD5(sourceHtml):
    """Return the hex MD5 digest of the text extracted from ``sourceHtml``.

    The markup is first reduced by ``getHtmlText``; the resulting text is
    encoded with ``str.encode()`` defaults (or used directly if it is already
    bytes) and hashed.

    Args:
        sourceHtml: HTML markup; ``None`` and empty values are accepted.

    Returns:
        str: The hexadecimal MD5 digest, or ``""`` when the input is ``None``
        or empty, or when the extracted text is neither ``str`` nor ``bytes``.
    """
    # Guard clause: nothing to hash.
    if sourceHtml is None or len(sourceHtml) <= 0:
        return ""

    extracted = getHtmlText(sourceHtml)
    if isinstance(extracted, str):
        payload = extracted.encode()
    elif isinstance(extracted, bytes):
        payload = extracted
    else:
        return ""
    return hashlib.md5(payload).hexdigest()


def getFingerprint(sourceHtml):
    """Build the fingerprint string for an HTML document.

    Args:
        sourceHtml: HTML markup, forwarded unchanged to ``getMD5``.

    Returns:
        str: ``"md5=<digest>"`` when ``getMD5`` yields a non-empty digest,
        otherwise ``""``.
    """
    digest = getMD5(sourceHtml)
    if digest == "":
        return ""
    return "md5=%s" % (digest)

if __name__=="__main__":
    # Manual smoke check: print the fingerprint of a local HTML file.
    # The path previously contained "\\\A" (an invalid escape that also
    # produced a doubled backslash); it is now a correctly escaped literal.
    _path = "C:\\Users\\Administrator\\Desktop\\2.html"
    # Context manager so the file handle is closed after reading.
    with codecs.open(_path,"rb",encoding="utf8") as _f:
        sourceHtml = _f.read()
    print(getFingerprint(sourceHtml))