2 years ago · df253dea2d
--- a/BaseDataMaintenance/common/documentFingerprint.py
+++ b/BaseDataMaintenance/common/documentFingerprint.py
@@ -0,0 +1,52 @@
 
				+
			
 
				+
			
 
				+import hashlib
			
 
				+import codecs
			
 
				+from bs4 import BeautifulSoup
			
 
				+import re
			
 
				+
			
 
				+
			
 
				+def getHtmlText(sourceHtml):
			
 
				+    _soup = BeautifulSoup(sourceHtml,"lxml")
			
 
				+    list_a = _soup.find_all("a")
			
 
				+    for _a in list_a:
			
 
				+        _href = _a.attrs.get("href","")
			
 
				+        if _href.find("www.bidizhaobiao.com")>0:
			
 
				+            _a.decompose()
			
 
				+    richText = _soup.find("div",attrs={"class":"richTextFetch"})
			
 
				+    if richText is not None:
			
 
				+        richText.decompose()
			
 
				+    _text = _soup.get_text()
			
 
				+
			
 
				+    _text = re.sub("\s*",'',_text)
			
 
				+    if len(_text)==0:
			
 
				+        _text = str(_soup)
			
 
				+    return _text
			
 
				+
			
 
				+def getMD5(sourceHtml):
			
 
				+    if sourceHtml is not None and len(sourceHtml)>0:
			
 
				+        _text = getHtmlText(sourceHtml)
			
 
				+        if isinstance(_text,str):
			
 
				+            bs = _text.encode()
			
 
				+        elif isinstance(_text,bytes):
			
 
				+            bs = _text
			
 
				+        else:
			
 
				+            return ""
			
 
				+        md5 = hashlib.md5()
			
 
				+        md5.update(bs)
			
 
				+        return md5.hexdigest()
			
 
				+    return ""
			
 
				+
			
 
				+
			
 
				+def getFingerprint(sourceHtml):
			
 
				+    md5 = getMD5(sourceHtml)
			
 
				+    if md5!="":
			
 
				+        _fingerprint = "md5=%s"%(md5)
			
 
				+    else:
			
 
				+        _fingerprint = ""
			
 
				+    return _fingerprint
			
 
				+
			
 
				+if __name__=="__main__":
			
 
				+    sourceHtml = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","rb",encoding="utf8").read()
			
 
				+    # sourceHtml = "abcddafafffffffffffffffffffffffff你"
			
 
				+    print(getFingerprint(sourceHtml))
			
--- a/BaseDataMaintenance/model/postgres/document_extract.py
+++ b/BaseDataMaintenance/model/postgres/document_extract.py