| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485 |
- #coding:utf8
- from bs4 import BeautifulSoup
- import re
def html2text(_html):
    """Flatten HTML into plain text, rendering tables as pipe-separated rows.

    Args:
        _html: Either an HTML string (parsed here with BeautifulSoup's
            "lxml" parser) or an already-parsed bs4 element.

    Returns:
        A string. Elements that still contain a <table>/<tbody> are
        processed child by child and the pieces joined with newlines.
        A leaf <table>/<tbody> becomes one line per group of <th> cells
        and one line per group of <td> cells in each <tr>, cells joined
        with "|", followed by a blank line. Any other element becomes its
        text with all whitespace removed.

    Note:
        Leaf elements are decomposed (destroyed) after their text has been
        extracted, so a parsed tree passed in is consumed by this call.
    """
    if isinstance(_html, str):
        _soup = BeautifulSoup(_html, "lxml")
    else:
        _soup = _html

    # Anything that still contains a table is only a container: recurse into
    # its direct children so every table is rendered on its own.
    if _soup.find(["table", "tbody"]) is not None:
        # find_all() returns a fully built list, so it is safe that the
        # recursive calls decompose children while we walk it.
        children = _soup.find_all(recursive=False)
        return "\n".join(html2text(child) for child in children)

    if _soup.name in ("table", "tbody"):
        row_texts = []
        for tr in _soup.find_all("tr"):
            # Header cells and data cells of the same row are emitted as
            # two separate lines, header cells first.
            for cell_tag in ("th", "td"):
                cells = tr.find_all(cell_tag)
                if cells:
                    row_texts.append(
                        "|".join(re.sub(r"\s", "", cell.get_text()) for cell in cells)
                    )
        if row_texts:
            _table_text = "%s\n\n" % "\n".join(row_texts)
        else:
            # No rows at all: fall back to the raw text of the element.
            # (The previous check compared the already "\n\n"-terminated
            # string against "", so this fallback could never trigger.)
            _table_text = _soup.get_text()
        _soup.decompose()
        return _table_text

    # \s already covers leading/trailing whitespace, so no separate strip().
    _text = re.sub(r"\s", "", _soup.get_text())
    _soup.decompose()
    return _text
def table2list(_html):
    """Extract a table's rows as lists of whitespace-free cell strings.

    Args:
        _html: Either an HTML string (parsed here with BeautifulSoup's
            "lxml" parser) or an already-parsed bs4 element.

    Returns:
        A list of rows, each row a list of cell strings with all
        whitespace removed. For every <tr>, its <th> cells (if any) form
        one row and its <td> cells (if any) form another, header cells
        first. Returns None when there is no <table>/<tbody> to read:
        a non-string argument that is not itself a <table>/<tbody>, or a
        string containing neither.
    """
    if isinstance(_html, str):
        # The root of a freshly parsed document is the document object, not
        # a <table>, so look for the first table inside it; otherwise string
        # input could never produce any rows.
        _soup = BeautifulSoup(_html, "lxml").find(["table", "tbody"])
    else:
        _soup = _html

    if _soup is None or _soup.name not in ("table", "tbody"):
        return None

    rows = []
    for tr in _soup.find_all("tr"):
        for cell_tag in ("th", "td"):
            cells = [
                re.sub(r"\s", "", cell.get_text())
                for cell in tr.find_all(cell_tag)
            ]
            if cells:
                rows.append(cells)
    return rows
def tableList2text(table_list):
    """Render a list of table rows as pipe-separated text.

    Args:
        table_list: Iterable of rows, each row a sequence of cell strings.

    Returns:
        One line per non-empty row, with all whitespace removed from every
        cell and the cells joined by "|". Lines are separated by a newline
        and the result always ends with two newlines, so an empty
        table_list yields just those two newlines. Empty rows are skipped.
    """
    row_texts = [
        "|".join(re.sub(r"\s", "", cell) for cell in row)
        for row in table_list
        if len(row) > 0
    ]
    return "%s\n\n" % "\n".join(row_texts)
|