#coding:utf8
"""Helpers that flatten HTML, and HTML tables in particular, into plain text.

Table rows are rendered one per line with cells separated by ``|``; all
whitespace inside extracted text is removed.
"""

import re

from bs4 import BeautifulSoup

# Compiled once and shared by every helper; matches one whitespace character.
# A raw string is used so that ``\s`` is not treated as an (invalid) string
# escape sequence by the Python compiler.
_WHITESPACE_RE = re.compile(r'\s')


def _strip_whitespace(text):
    """Return *text* with every whitespace character removed."""
    return _WHITESPACE_RE.sub('', text)


def _row_cell_groups(tr):
    """Return the cell texts of one ``<tr>`` as up to two lists.

    The first list holds the ``<th>`` cells and the second the ``<td>`` cells;
    a list is omitted when the row has no cell of that kind.  Cell text has
    all whitespace removed.
    """
    groups = []
    for tag_name in ("th", "td"):
        cells = tr.find_all(tag_name)
        if cells:
            groups.append([_strip_whitespace(cell.get_text()) for cell in cells])
    return groups


def _parse_if_string(_html):
    """Parse *_html* with the lxml parser when it is a string, else return it as is."""
    if isinstance(_html, str):
        return BeautifulSoup(_html, "lxml")
    return _html


def html2text(_html):
    """Convert HTML into whitespace-free text, rendering tables row by row.

    Args:
        _html: an HTML string, or an already parsed BeautifulSoup node.

    Returns:
        The extracted text.  A node that contains a ``table``/``tbody``
        somewhere below it is processed child by child and the results are
        joined with newlines.  A ``table``/``tbody`` node is rendered as one
        line per header/data cell group with cells joined by ``|`` and is
        followed by a blank line.  Any other node yields its text with all
        whitespace removed.

    Note:
        The processed nodes are removed from the tree with ``decompose()``,
        so a parsed node passed in by the caller is destroyed by this call.
    """
    _soup = _parse_if_string(_html)

    if _soup.find(["table", "tbody"]) is not None:
        # A table lives somewhere below this node: descend through the direct
        # child tags only (text nodes directly under this node are skipped).
        children = _soup.find_all(recursive=False)
        return "\n".join([html2text(child) for child in children])

    if _soup.name in ("table", "tbody"):
        trs = _soup.find_all("tr")
        if trs:
            row_lines = [
                "|".join(cells)
                for tr in trs
                for cells in _row_cell_groups(tr)
            ]
            text = "%s\n\n" % "\n".join(row_lines)
        else:
            # No rows at all: fall back to the raw text of the table instead
            # of returning only the trailing blank line.
            text = _soup.get_text()
    else:
        text = _strip_whitespace(_soup.get_text().strip())

    _soup.decompose()
    return text


def table2list(_html):
    """Extract the rows of a table as lists of whitespace-free cell strings.

    Args:
        _html: an HTML string, or an already parsed BeautifulSoup node.

    Returns:
        A list of cell-text lists.  Each ``<tr>`` contributes one list for
        its ``<th>`` cells (if any) followed by one list for its ``<td>``
        cells (if any).  When the input is not itself a ``table``/``tbody``
        node (for example a parsed string), the first ``table``/``tbody``
        found inside it is used; an empty list is returned when there is none.
    """
    _soup = _parse_if_string(_html)

    if _soup.name not in ("table", "tbody"):
        # A parsed document is never a table node itself, so look inside it.
        _soup = _soup.find(["table", "tbody"])
        if _soup is None:
            return []

    rows = []
    for tr in _soup.find_all("tr"):
        rows.extend(_row_cell_groups(tr))
    return rows


def tableList2text(table_list):
    """Render a list of rows (lists of cell strings) as ``|``-separated text.

    Args:
        table_list: an iterable of rows, each row a sequence of strings.

    Returns:
        One line per non-empty row, cells joined by ``|`` with all whitespace
        removed from each cell, followed by a blank line (``"\\n\\n"``).
        Empty rows are skipped.
    """
    row_lines = [
        "|".join(_strip_whitespace(td) for td in tr)
        for tr in table_list
        if len(tr) > 0
    ]
    return "%s\n\n" % "\n".join(row_lines)