@@ -1464,6 +1464,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
for article in articles:
doc_id = article[0]
sourceContent = article[1]
+ sourceContent = re.sub("<html>|</html>|<body>|</body>","",sourceContent)
_send_doc_id = article[3]
_title = article[4]
#表格处理