浏览代码

要素提取增加web_source_name

luojiehua 2 年之前
父节点
当前提交
9b889a8e9f

+ 3 - 2
BaseDataMaintenance/maintenance/2.py

@@ -13,9 +13,10 @@ import codecs
 if __name__ == '__main__':
 
     text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
-    content = str(BeautifulSoup(text).find("div",id="pcontent"))
+    content = str(BeautifulSoup(text).find("div"))
     _soup = BeautifulSoup(content,"lxml")
     print(len(str(_soup)))
-    _soup = article_limit(_soup,3000)
+    _soup = article_limit(_soup,100)
+
     print(len(str(_soup)))
     print(str(_soup))

+ 1 - 1
BaseDataMaintenance/maintenance/dataflow.py

@@ -3892,7 +3892,7 @@ if __name__ == '__main__':
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
     a = time.time()
-    df_dump.test_dumplicate(183573001)
+    df_dump.test_dumplicate(284480111)
     print("takes",time.time()-a)
     # df_dump.fix_doc_which_not_in_project()
     # df_dump.delete_projects_by_document(16288036)

+ 2 - 1
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -2,7 +2,7 @@
 
 from BaseDataMaintenance.maintenance.dataflow import *
 from BaseDataMaintenance.common.activateMQUtils import *
-from BaseDataMaintenance.dataSource.source import getConnect_activateMQ,getConnection_postgres,getConnection_mysql,getConnection_oracle
+from BaseDataMaintenance.dataSource.source import getConnect_activateMQ,getConnection_postgres,getConnection_mysql,getConnection_oracle,getConnect_ots_capacity
 from BaseDataMaintenance.dataSource.setttings import *
 
 from BaseDataMaintenance.model.postgres.attachment import Attachment_postgres
@@ -711,6 +711,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
                 data.pop(document_tmp_dochtmlcon)
             data["title"] = data.get(document_tmp_doctitle,"")
             data["web_source_no"] = item.get(document_tmp_web_source_no,"")
+            data["web_source_name"] = item.get(document_tmp_web_source_name,"")
             data["original_docchannel"] = item.get(document_tmp_original_docchannel,"")
 
             _fingerprint = getFingerprint(str(data["title"])+str(data["content"]))

+ 1 - 0
BaseDataMaintenance/maxcompute/documentMerge.py

@@ -177,6 +177,7 @@ document_tmp_projects = "projects"
 document_tmp_page_time = "page_time"
 document_tmp_attachment_extract_status = "attachment_extract_status"
 document_tmp_web_source_no = "web_source_no"
+document_tmp_web_source_name = "web_source_name"
 document_tmp_fingerprint = "fingerprint"
 document_tmp_opertime = "opertime"
 document_tmp_docchannel = "docchannel"