Bron bekijken

修复配置提取效率问题;附件识别完之后删除比地链接

luojiehua 1 jaar geleden
bovenliggende
commit
0f8e58fa24
2 gewijzigde bestanden met 2 toevoegingen en 1 verwijdering
  1. 1 0
      BaseDataMaintenance/maintenance/dataflow_mq.py
  2. 1 1
      BaseDataMaintenance/model/ots/document_html.py

+ 1 - 0
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -210,6 +210,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                 ackMsg(conn,message_id)
             log("document:%d get attachments with result:%s %s retry_times:%d"%(item.get("docid"),str(_succeed),str(_to_ack),_retry_times))
         except Exception as e:
+            traceback.print_exc()
             if time.time()-start_time<10:
                 item["retry_times"] -= 1
             if send_msg_toacmq(self.pool_mq,json.dumps(item,cls=MyEncoder,ensure_ascii=False),self.mq_attachment):

+ 1 - 1
BaseDataMaintenance/model/ots/document_html.py

@@ -55,7 +55,7 @@ class Document_html(BaseModel):
         _dochtmlcon = re.sub("<html>|</html>|<body>|</body>","",_dochtmlcon)
         _soup = BeautifulSoup(_dochtmlcon,"lxml")
         for a in _soup.find_all("a"):
-            if a.attrs["href"].startswith("http://www.bidizhaobiao.com"):
+            if a.attrs.get("href","").startswith("http://www.bidizhaobiao.com"):
                 a.decompose()
         self.setValue(document_dochtmlcon,re.sub("<html>|</html>|<body>|</body>","",str(_soup)),True)