chatUtil.py 2.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. #coding:utf8
  2. from bs4 import BeautifulSoup
  3. import re
  4. def html2text(_html):
  5. if type(_html)==str:
  6. _soup = BeautifulSoup(_html,"lxml")
  7. else:
  8. _soup = _html
  9. list_table = _soup.find_all("table")
  10. list_tbody = _soup.find_all("tbody")
  11. if len(list_table)>0 or len(list_tbody)>0:
  12. list_childs = _soup.find_all(recursive=False)
  13. list_child_text = []
  14. for child in list_childs:
  15. list_child_text.append(html2text(child))
  16. return "\n".join(list_child_text)
  17. else:
  18. if _soup.name=="table" or _soup.name=="tbody":
  19. _table_text = ""
  20. trs = _soup.find_all("tr")
  21. list_tr_text = []
  22. for tr in trs:
  23. tds = tr.find_all("th")
  24. if len(tds)>0:
  25. list_td_text = []
  26. for td in tds:
  27. list_td_text.append(re.sub('\s','',td.get_text()))
  28. list_tr_text.append("|".join(list_td_text))
  29. tds = tr.find_all("td")
  30. if len(tds)>0:
  31. list_td_text = []
  32. for td in tds:
  33. list_td_text.append(re.sub('\s','',td.get_text()))
  34. list_tr_text.append("|".join(list_td_text))
  35. _table_text = "%s\n\n"%"\n".join(list_tr_text)
  36. if _table_text == "":
  37. _table_text = _soup.get_text()
  38. _soup.decompose()
  39. return _table_text
  40. else:
  41. _text = re.sub('\s','',_soup.get_text().strip())
  42. _soup.decompose()
  43. return _text
  44. def table2list(_html):
  45. if type(_html)==str:
  46. _soup = BeautifulSoup(_html,'lxml')
  47. else:
  48. _soup = _html
  49. if _soup.name=="table" or _soup.name=="tbody":
  50. _table_text = ""
  51. trs = _soup.find_all("tr")
  52. list_tr_text = []
  53. for tr in trs:
  54. tds = tr.find_all("th")
  55. if len(tds)>0:
  56. list_td_text = []
  57. for td in tds:
  58. list_td_text.append(re.sub('\s','',td.get_text()))
  59. if len(list_td_text)>0:
  60. list_tr_text.append(list_td_text)
  61. tds = tr.find_all("td")
  62. if len(tds)>0:
  63. list_td_text = []
  64. for td in tds:
  65. list_td_text.append(re.sub('\s','',td.get_text()))
  66. if len(list_td_text)>0:
  67. list_tr_text.append(list_td_text)
  68. return list_tr_text
  69. def tableList2text(table_list):
  70. list_tr_text = []
  71. for tr in table_list:
  72. tds = tr
  73. if len(tds)>0:
  74. list_td_text = []
  75. for td in tds:
  76. list_td_text.append(re.sub('\s','',td))
  77. list_tr_text.append("|".join(list_td_text))
  78. _table_text = "%s\n\n"%"\n".join(list_tr_text)
  79. return _table_text