luojiehua
/
BaseDataMaintenance


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536
							#coding:utf8

from bs4 import BeautifulSoup
import re

def html2text_with_tablehtml(_html):
    """Convert HTML to plain text while keeping tables as raw HTML.

    The node tree is walked recursively. ``<table>`` and ``<tbody>`` elements
    are emitted unchanged as HTML markup; every other element is reduced to
    its text. Text that sits directly next to child elements (for example
    ``foo`` and ``baz`` in ``<p>foo<b>bar</b>baz</p>``) is kept as well, in
    document order.

    Args:
        _html: Either an HTML string, which is parsed with BeautifulSoup's
            ``lxml`` parser, or an already-parsed BeautifulSoup node.

    Returns:
        A single string in which the collected parts are joined by newlines.
    """
    # Imported locally so the function brings in the node classes it needs;
    # the module itself only imports ``BeautifulSoup``.
    from bs4 import CData, NavigableString, Tag

    # Parse raw markup; any other input is used as an already-parsed node.
    if isinstance(_html, str):
        _soup = BeautifulSoup(_html, "lxml")
    else:
        _soup = _html

    # Leaf case: without child tags the node's own text is the whole result.
    if not _soup.find_all(recursive=False):
        return str(_soup.get_text())

    result_parts = []
    # Iterate over *all* direct children (tags and text nodes); iterating
    # over tags only would silently drop text placed between sibling tags.
    for child in _soup.children:
        if isinstance(child, Tag):
            if child.name in ("table", "tbody"):
                # Tables are preserved verbatim as HTML.
                result_parts.append(str(child))
            else:
                # Everything else is flattened to text recursively.
                result_parts.append(html2text_with_tablehtml(child))
        elif type(child) in (NavigableString, CData):
            # Exact type match on purpose: comments, doctypes and processing
            # instructions are NavigableString subclasses and must not leak
            # into the text output.
            text = child.strip()
            if text:
                # Whitespace-only nodes between tags carry no content.
                result_parts.append(text)

    return "\n".join(result_parts)

if __name__ == '__main__':
    # Quick manual check: a paragraph followed by a table. The paragraph is
    # printed as plain text and the table is printed as HTML.
    sample_html = "<div><p>这是一个p</p><table><tr><td>这是一个td</td></tr></table></div>"
    converted = html2text_with_tablehtml(sample_html)
    print(converted)