|
@@ -1,5 +1,6 @@
|
|
|
import inspect
|
|
|
import os
|
|
|
+import re
|
|
|
import sys
|
|
|
from bs4 import BeautifulSoup
|
|
|
sys.path.append(os.path.dirname(__file__) + "/../")
|
|
@@ -41,9 +42,10 @@ class DocConvert:
|
|
|
try:
|
|
|
with open(self.path, 'r') as f:
|
|
|
html_str = f.read()
|
|
|
- soup = BeautifulSoup(html_str, 'lxml')
|
|
|
- text = soup.text
|
|
|
- is_html_doc = True
|
|
|
+ if re.search('<div|<html|<body|<head|<tr|<br|<table|<td', html_str):
|
|
|
+ soup = BeautifulSoup(html_str, 'lxml')
|
|
|
+ text = soup.text
|
|
|
+ is_html_doc = True
|
|
|
except:
|
|
|
pass
|
|
|
|