# html2text.py
  1. #coding:utf8
  2. from bs4 import BeautifulSoup
  3. import re
  4. def html2text_with_tablehtml(_html):
  5. # 如果输入是字符串,使用 BeautifulSoup 解析
  6. if isinstance(_html, str):
  7. _soup = BeautifulSoup(_html, "lxml")
  8. else:
  9. _soup = _html
  10. # 用于存储处理后的文本
  11. result_parts = []
  12. _find = False
  13. # 遍历所有直接子元素
  14. for child in _soup.contents:
  15. if child.name:
  16. if child.name in ["table", "tbody"]:
  17. #仅仅保存rowspan和colspan属性的标签
  18. for c in child.find_all():
  19. new_attrs = {}
  20. for k,v in c.attrs.items():
  21. if k in ["rowspan","colspan"]:
  22. new_attrs[k] = v
  23. c.attrs = new_attrs
  24. # 如果是表格或表格主体,保留 HTML 代码
  25. result_parts.append("\n"+str(child)+"\n")
  26. else:
  27. # 递归处理其他元素并转换为文本
  28. text = html2text_with_tablehtml(child)
  29. result_parts.append(text)
  30. elif child.string and child.string.strip():
  31. _text = child.string.strip()
  32. result_parts.append(_text)
  33. _find = True
  34. if not _find:
  35. _text = str(_soup.get_text())
  36. if len(_text)>0:
  37. if _soup.name in {"p","div","li"}:
  38. _text += "\n"
  39. result_parts.append(_text)
  40. # 将所有处理后的部分连接成一个字符串
  41. result = "".join(result_parts)
  42. return result
  43. if __name__ == '__main__':
  44. _html = '''
  45. <div>
  46. <div>
  47. <div>
  48. <p>一、 <span>*</span>采购人名称:<span><a target="_blank" class="markBlue" href="/bdqyhx/785132502616420352.html" style="color: #3083EB !important;text-decoration: underline;">合肥经济技术开发区锦绣社区服务中心</a></span><br></p>
  49. <p>二、 <span>*</span>履约供应商名称:<span><a target="_blank" class="markBlue" href="/bdqyhx/627394723685289984.html" style="color: #3083EB !important;text-decoration: underline;">安徽今辉科技有限公司</a></span></p>
  50. <p>三、 <span>*</span>采购项目编号:<span>2071451000000740505</span></p>
  51. <p>四、 <span>*</span>合同编号:</p>
  52. <p>五、 <span>*</span>验收单位:<span><a target="_blank" class="markBlue" href="/bdqyhx/785132502616420352.html" style="color: #3083EB !important;text-decoration: underline;">合肥经济技术开发区锦绣社区服务中心</a></span></p>
  53. <p>六、 <span>*</span>验收日期:<span>2025年3月26日</span></p>
  54. <p>七、 <span>*</span>验收结果: </p>
  55. <table width="100%">
  56. <tbody>
  57. <tr width="100%">
  58. <td width="14.29%">序号</td>
  59. <td width="14.29%">服务内容</td>
  60. <td width="14.29%">验收数量</td>
  61. <td width="14.29%">验收金额(元)</td>
  62. <td width="14.29%">验收标准\规格型号\技术标准</td>
  63. <td width="14.29%">验收结果</td>
  64. <td width="14.29%" colspan="1">备注</td>
  65. </tr>
  66. <tr width="100%">
  67. <td width="14.29%">1</td>
  68. <td width="14.29%">无毒彩泥黏土太空橡皮泥手工儿童玩具男女孩24色超轻粘土糖果袋装</td>
  69. <td width="14.29%">8</td>
  70. <td width="14.29%">144.0</td>
  71. <td width="14.29%">无品牌\花泥</td>
  72. <td width="14.29%">验收通过</td>
  73. <td width="14.29%" colspan="1"></td>
  74. </tr>
  75. <tr width="100%">
  76. <td width="14.29%">2</td>
  77. <td width="14.29%">红双喜 赛顶一星乒乓球40+ 乒乓球</td>
  78. <td width="14.29%">3</td>
  79. <td width="14.29%">120.0</td>
  80. <td width="14.29%">红双喜/DHS\赛顶一星乒乓球40+</td>
  81. <td width="14.29%">验收通过</td>
  82. <td width="14.29%" colspan="1"></td>
  83. </tr>
  84. <tr width="100%">
  85. <td width="14.29%">3</td>
  86. <td width="14.29%">新年春节红灯笼户外连串蜂窝小灯笼塑纸灯笼串节日庆典元旦装饰</td>
  87. <td width="14.29%">30</td>
  88. <td width="14.29%">360.0</td>
  89. <td width="14.29%">无品牌\灯笼</td>
  90. <td width="14.29%">验收通过</td>
  91. <td width="14.29%" colspan="1"></td>
  92. </tr>
  93. <tr width="100%">
  94. <td width="14.29%">4</td>
  95. <td width="14.29%">奥妙 3kg/瓶 洗衣液/洗衣粉</td>
  96. <td width="14.29%">6</td>
  97. <td width="14.29%">270.0</td>
  98. <td width="14.29%">奥妙/OMO\3kg/瓶</td>
  99. <td width="14.29%">验收通过</td>
  100. <td width="14.29%" colspan="1"></td>
  101. </tr>
  102. <tr width="100%">
  103. <td width="14.29%">5</td>
  104. <td width="14.29%">维达 V2182 抽纸</td>
  105. <td width="14.29%">2</td>
  106. <td width="14.29%">480.0</td>
  107. <td width="14.29%">维达/Vinda\V2182</td>
  108. <td width="14.29%">验收通过</td>
  109. <td width="14.29%" colspan="1"></td>
  110. </tr>
  111. <tr width="100%">
  112. <td width="14.29%">6</td>
  113. <td width="14.29%">横幅</td>
  114. <td width="14.29%">5</td>
  115. <td width="14.29%">40.0</td>
  116. <td width="14.29%">无品牌\横幅</td>
  117. <td width="14.29%">验收通过</td>
  118. <td width="14.29%" colspan="1"></td>
  119. </tr>
  120. <tr width="100%">
  121. <td width="14.29%">7</td>
  122. <td width="14.29%">【运费】</td>
  123. <td width="14.29%">1</td>
  124. <td width="14.29%">0.0</td>
  125. <td width="14.29%"></td>
  126. <td width="14.29%">验收通过</td>
  127. <td width="14.29%" colspan="1"></td>
  128. </tr>
  129. </tbody>
  130. </table>
  131. <br>
  132. <br>验收报告:
  133. <br>验收人员名单:
  134. <span>王玲</span>
  135. <p></p>
  136. <p><br></p>
  137. <p><br></p>
  138. <p><br></p>
  139. <p><br></p>
  140. </div>
  141. </div>
  142. </div>
  143. '''
  144. print(html2text_with_tablehtml(_html))