|
@@ -1775,7 +1775,12 @@ def article_limit(soup,limit_words=30000):
|
|
|
while len(_soup.find_all(recursive=False)) == 1 and \
|
|
|
_soup.get_text(strip=True) == _soup.find_all(recursive=False)[0].get_text(strip=True):
|
|
|
_soup = _soup.find_all(recursive=False)[0]
|
|
|
- try:
|
|
|
+ if len(_soup.find_all(recursive=False)) == 0:
|
|
|
+ _soup.string = str(_soup.get_text())[:max_count-_count]
|
|
|
+ _count += len(re.sub(sub_space, "", _soup.string))
|
|
|
+ _gap = _count - max_count
|
|
|
+ next_soup = None
|
|
|
+ else:
|
|
|
for _soup_part in _soup.find_all(recursive=False):
|
|
|
if not _is_skip:
|
|
|
_count += len(re.sub(sub_space, "", _soup_part.get_text()))
|
|
@@ -1784,13 +1789,12 @@ def article_limit(soup,limit_words=30000):
|
|
|
if _gap <= max_gap:
|
|
|
_is_skip = True
|
|
|
else:
|
|
|
+ _is_skip = True
|
|
|
next_soup = _soup_part
|
|
|
_count -= len(re.sub(sub_space, "", _soup_part.get_text()))
|
|
|
- break
|
|
|
+ continue
|
|
|
else:
|
|
|
_soup_part.decompose()
|
|
|
- except:
|
|
|
- return _count,_gap,None
|
|
|
return _count,_gap,next_soup
|
|
|
|
|
|
text_count = 0
|
|
@@ -1809,6 +1813,7 @@ def article_limit(soup,limit_words=30000):
|
|
|
text_count,gap,n_soup = soup_limit(soup,text_count,max_count=limit_words,max_gap=500)
|
|
|
while n_soup:
|
|
|
text_count, gap, n_soup = soup_limit(n_soup, text_count, max_count=limit_words, max_gap=500)
|
|
|
+
|
|
|
else:
|
|
|
# 有附件
|
|
|
_text = re.sub(sub_space, "", soup.get_text())
|