Browse Source

要素提取对重复入库的数据直接查库,以增加提取速度

luojiehua 2 years ago
parent
commit
df253dea2d

+ 52 - 0
BaseDataMaintenance/common/documentFingerprint.py

@@ -0,0 +1,52 @@
+
+
+import hashlib
+import codecs
+from bs4 import BeautifulSoup
+import re
+
+
+def getHtmlText(sourceHtml):
+    _soup = BeautifulSoup(sourceHtml,"lxml")
+    list_a = _soup.find_all("a")
+    for _a in list_a:
+        _href = _a.attrs.get("href","")
+        if _href.find("www.bidizhaobiao.com")>0:
+            _a.decompose()
+    richText = _soup.find("div",attrs={"class":"richTextFetch"})
+    if richText is not None:
+        richText.decompose()
+    _text = _soup.get_text()
+
+    _text = re.sub("\s*",'',_text)
+    if len(_text)==0:
+        _text = str(_soup)
+    return _text
+
+def getMD5(sourceHtml):
+    if sourceHtml is not None and len(sourceHtml)>0:
+        _text = getHtmlText(sourceHtml)
+        if isinstance(_text,str):
+            bs = _text.encode()
+        elif isinstance(_text,bytes):
+            bs = _text
+        else:
+            return ""
+        md5 = hashlib.md5()
+        md5.update(bs)
+        return md5.hexdigest()
+    return ""
+
+
+def getFingerprint(sourceHtml):
+    md5 = getMD5(sourceHtml)
+    if md5!="":
+        _fingerprint = "md5=%s"%(md5)
+    else:
+        _fingerprint = ""
+    return _fingerprint
+
+if __name__=="__main__":
+    sourceHtml = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","rb",encoding="utf8").read()
+    # sourceHtml = "abcddafafffffffffffffffffffffffff你"
+    print(getFingerprint(sourceHtml))

File diff suppressed because it is too large
+ 47 - 0
BaseDataMaintenance/model/postgres/document_extract.py


Some files were not shown because too many files changed in this diff