#coding:utf8
import copy
import re

from bs4 import BeautifulSoup
# Tags whose markup is kept verbatim (as HTML) instead of being flattened.
_TABLE_TAGS = frozenset({"table", "tbody"})
# The only attributes that survive on kept table markup.
_KEPT_TABLE_ATTRS = frozenset({"rowspan", "colspan"})
# Tags whose text is terminated with a newline.
_BLOCK_TAGS = frozenset({"p", "div", "li"})


def _table_to_html(table):
    """Serialize *table* to HTML, keeping only rowspan/colspan attributes.

    Works on a copy so the tree handed in by the caller is not modified.
    """
    clone = copy.copy(table)
    # find_all() only yields descendants, so the root tag is added explicitly.
    for tag in [clone, *clone.find_all()]:
        tag.attrs = {
            key: value
            for key, value in tag.attrs.items()
            if key in _KEPT_TABLE_ATTRS
        }
    return str(clone)


def html2text_with_tablehtml(_html):
    """Convert HTML to plain text while keeping tables as HTML markup.

    Args:
        _html: Either an HTML string (parsed with BeautifulSoup's "lxml"
            parser) or an already parsed BeautifulSoup node.

    Returns:
        A string in which:
          * every ``table``/``tbody`` child is emitted as HTML on its own
            line, with all attributes except ``rowspan``/``colspan`` removed;
          * every other element is flattened recursively to its text;
          * text nodes are stripped and whitespace-only ones are dropped;
          * the text of a ``p``/``div``/``li`` ends with exactly one newline
            (nothing is added when the element produced no text).
    """
    soup = BeautifulSoup(_html, "lxml") if isinstance(_html, str) else _html

    parts = []
    for child in soup.contents:
        if child.name in _TABLE_TAGS:
            # Keep the table structure; surround it with line breaks.
            parts.append("\n" + _table_to_html(child) + "\n")
        elif child.name:
            # Any other element: flatten it recursively.
            parts.append(html2text_with_tablehtml(child))
        else:
            # Text node (its name is None).
            text = (child.string or "").strip()
            if text:
                parts.append(text)

    result = "".join(parts)
    # Each piece of text is collected exactly once above, so the block-level
    # line break is added here rather than by re-reading get_text(), which
    # would repeat text already gathered from the children.
    if result and soup.name in _BLOCK_TAGS and not result.endswith("\n"):
        result += "\n"
    return result
if __name__ == '__main__':
    # Demo: convert a sample acceptance notice and print the result.
    # The literal is a raw string because the table cells contain
    # backslashes (e.g. "OMO\3kg"); in a normal literal "\3" would be read
    # as an octal escape and the other sequences are invalid escapes.
    _html = r'''
<div>
<div>
<div>
<p>一、 <span>*</span>采购人名称:<span><a target="_blank" class="markBlue" href="/bdqyhx/785132502616420352.html" style="color: #3083EB !important;text-decoration: underline;">合肥经济技术开发区锦绣社区服务中心</a></span><br></p>
<p>二、 <span>*</span>履约供应商名称:<span><a target="_blank" class="markBlue" href="/bdqyhx/627394723685289984.html" style="color: #3083EB !important;text-decoration: underline;">安徽今辉科技有限公司</a></span></p>
<p>三、 <span>*</span>采购项目编号:<span>2071451000000740505</span></p>
<p>四、 <span>*</span>合同编号:</p>
<p>五、 <span>*</span>验收单位:<span><a target="_blank" class="markBlue" href="/bdqyhx/785132502616420352.html" style="color: #3083EB !important;text-decoration: underline;">合肥经济技术开发区锦绣社区服务中心</a></span></p>
<p>六、 <span>*</span>验收日期:<span>2025年3月26日</span></p>
<p>七、 <span>*</span>验收结果: </p>
<table width="100%">
<tbody>
<tr width="100%">
<td width="14.29%">序号</td>
<td width="14.29%">服务内容</td>
<td width="14.29%">验收数量</td>
<td width="14.29%">验收金额(元)</td>
<td width="14.29%">验收标准\规格型号\技术标准</td>
<td width="14.29%">验收结果</td>
<td width="14.29%" colspan="1">备注</td>
</tr>
<tr width="100%">
<td width="14.29%">1</td>
<td width="14.29%">无毒彩泥黏土太空橡皮泥手工儿童玩具男女孩24色超轻粘土糖果袋装</td>
<td width="14.29%">8</td>
<td width="14.29%">144.0</td>
<td width="14.29%">无品牌\花泥</td>
<td width="14.29%">验收通过</td>
<td width="14.29%" colspan="1"></td>
</tr>
<tr width="100%">
<td width="14.29%">2</td>
<td width="14.29%">红双喜 赛顶一星乒乓球40+ 乒乓球</td>
<td width="14.29%">3</td>
<td width="14.29%">120.0</td>
<td width="14.29%">红双喜/DHS\赛顶一星乒乓球40+</td>
<td width="14.29%">验收通过</td>
<td width="14.29%" colspan="1"></td>
</tr>
<tr width="100%">
<td width="14.29%">3</td>
<td width="14.29%">新年春节红灯笼户外连串蜂窝小灯笼塑纸灯笼串节日庆典元旦装饰</td>
<td width="14.29%">30</td>
<td width="14.29%">360.0</td>
<td width="14.29%">无品牌\灯笼</td>
<td width="14.29%">验收通过</td>
<td width="14.29%" colspan="1"></td>
</tr>
<tr width="100%">
<td width="14.29%">4</td>
<td width="14.29%">奥妙 3kg/瓶 洗衣液/洗衣粉</td>
<td width="14.29%">6</td>
<td width="14.29%">270.0</td>
<td width="14.29%">奥妙/OMO\3kg/瓶</td>
<td width="14.29%">验收通过</td>
<td width="14.29%" colspan="1"></td>
</tr>
<tr width="100%">
<td width="14.29%">5</td>
<td width="14.29%">维达 V2182 抽纸</td>
<td width="14.29%">2</td>
<td width="14.29%">480.0</td>
<td width="14.29%">维达/Vinda\V2182</td>
<td width="14.29%">验收通过</td>
<td width="14.29%" colspan="1"></td>
</tr>
<tr width="100%">
<td width="14.29%">6</td>
<td width="14.29%">横幅</td>
<td width="14.29%">5</td>
<td width="14.29%">40.0</td>
<td width="14.29%">无品牌\横幅</td>
<td width="14.29%">验收通过</td>
<td width="14.29%" colspan="1"></td>
</tr>
<tr width="100%">
<td width="14.29%">7</td>
<td width="14.29%">【运费】</td>
<td width="14.29%">1</td>
<td width="14.29%">0.0</td>
<td width="14.29%"></td>
<td width="14.29%">验收通过</td>
<td width="14.29%" colspan="1"></td>
</tr>
</tbody>
</table>
<br>
<br>验收报告:
<br>验收人员名单:
<span>王玲</span>
<p></p>
<p><br></p>
<p><br></p>
<p><br></p>
<p><br></p>
</div>
</div>
</div>
'''
    print(html2text_with_tablehtml(_html))
|