document_html.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. #coding:utf8
  2. from BaseDataMaintenance.model.ots.BaseModel import BaseModel
  3. from tablestore import *
  4. from BaseDataMaintenance.common.Utils import *
  5. from bs4 import BeautifulSoup
  6. document_partitionkey = "partitionkey"
  7. document_docid = "docid"
  8. document_dochtmlcon = "dochtmlcon"
  9. class Document_html(BaseModel):
  10. def __init__(self,_dict):
  11. BaseModel.__init__(self)
  12. for k,v in _dict.items():
  13. if k=="all_columns":
  14. continue
  15. self.setValue(k,v,True)
  16. self.table_name = "document_html"
  17. self.prefixs = ["www.bidizhaobiao.com","bxkc.oss-cn-shanghai.aliyuncs.com"]
  18. def getPrimary_keys(self):
  19. return ["partitionkey","docid"]
  20. def isLegalUrl(self,_url,_type):
  21. _flag = False
  22. for _prefix in self.prefixs:
  23. if _url.find(_prefix)>=0:
  24. _flag = True
  25. if _type==0:
  26. if _flag:
  27. return True
  28. else:
  29. return False
  30. else:
  31. if _flag:
  32. return False
  33. else:
  34. return True
  35. def updateSWFImages(self,swf_urls):
  36. if len(swf_urls)>0:
  37. _dochtmlcon = self.getProperties().get(document_dochtmlcon)
  38. _dochtmlcon = re.sub("<html>|</html>|<body>|</body>","",_dochtmlcon)
  39. _soup = BeautifulSoup(_dochtmlcon,"lxml")
  40. if _soup.find("img",{"src":swf_urls[0]}) is None:
  41. _div = "<div>"
  42. for _url in swf_urls:
  43. _div += '<p><img src="%s"/></p>'%(_url)
  44. _div += "</div>"
  45. _dochtmlcon += _div
  46. self.setValue(document_dochtmlcon,_dochtmlcon,True)
  47. def delete_bidi_a(self):
  48. _dochtmlcon = self.getProperties().get(document_dochtmlcon)
  49. _dochtmlcon = re.sub("<html>|</html>|<body>|</body>","",_dochtmlcon)
  50. _soup = BeautifulSoup(_dochtmlcon,"lxml")
  51. for a in _soup.find_all("a"):
  52. if a.attrs.get("href","").startswith("http://www.bidizhaobiao.com"):
  53. a.decompose()
  54. self.setValue(document_dochtmlcon,re.sub("<html>|</html>|<body>|</body>","",str(_soup)),True)
  55. def getRichTextFetch(self,list_html):
  56. _text = ""
  57. for _ht in list_html:
  58. if isinstance(_ht,str):
  59. _text += "<div>%s</div>"%(_ht)
  60. elif isinstance(_ht,dict):
  61. _filemd5 = _ht.get("filemd5","")
  62. _html = _ht.get("html","")
  63. _text += '<div filemd5="%s">%s</div>'%(_filemd5,_html)
  64. return _text
  65. def updateAttachment(self,list_html):
  66. if len(list_html)>0:
  67. _dochtmlcon = self.getProperties().get(document_dochtmlcon,"")
  68. _dochtmlcon = re.sub("<html>|</html>|<body>|</body>","",_dochtmlcon)
  69. _dochtmlcon_len = len(bytes(_dochtmlcon,encoding="utf8"))
  70. fix_len = self.COLUMN_MAX_SIZE-_dochtmlcon_len-100
  71. # _text = '\n<div style="display:none;" class="richTextFetch">%s</div>'%("\n".join(list_html))
  72. _text = '\n<div style="display:none;" class="richTextFetch">%s</div>'%(self.getRichTextFetch(list_html))
  73. if _dochtmlcon is not None:
  74. _soup = BeautifulSoup(_dochtmlcon,"lxml")
  75. _node = _soup.find("div",attrs={"class":"richTextFetch"})
  76. if _node is not None:
  77. _node.decompose()
  78. self.setValue(document_dochtmlcon,str(_soup)+_text,True)
  79. def getTitleFromHtml(self,filemd5,_html):
  80. _soup = BeautifulSoup(_html,"lxml")
  81. _find = _soup.find("a",attrs={"data":filemd5})
  82. _title = ""
  83. if _find is not None:
  84. _title = _find.get_text()
  85. return _title
  86. def getSourceLinkFromHtml(self,filemd5,_html):
  87. _soup = BeautifulSoup(_html,"lxml")
  88. _find = _soup.find("a",attrs={"filelink":filemd5})
  89. filelink = ""
  90. if _find is None:
  91. _find = _soup.find("img",attrs={"filelink":filemd5})
  92. if _find is not None:
  93. filelink = _find.attrs.get("src","")
  94. else:
  95. filelink = _find.attrs.get("href","")
  96. return filelink
  97. if __name__ == '__main__':
  98. _html = '''
  99. <div id="pcontent" class="pcontent"><div>
  100. 贵阳市白云区房屋征收管理局征收劳务服务采购项目-贵阳市白云区房屋征收管理局征收劳务服务采购项目合同公示
  101. <br> 详见合同公示附件
  102. <br>文件预览:
  103. <br>
  104. <a target="_blank" class="markBlue" filelink="6683172010dcb9029f93e84148b4024b" href="http://attachment-hub.oss-cn-hangzhou.aliyuncs.com/6683/20230927/2023-09-27/00219/1695779639111.pdf?Expires=1695784428&amp;OSSAccessKeyId=LTAI5tHoEUDSy6FnZjMKsNiZ&amp;Signature=JsHtdlrV%2BWGA1595%2BzTgckLIIm0%3D" original="https://ggzy.guizhou.gov.cn/hallweb/hall/attach/nosession/download?attachId=8a8bb7458aaccc3b018ad45340a844f0" rel="noreferrer">合同网签及备案.<mark data-markjs="true">pdf</mark></a>
  105. <a target="_blank" class="markBlue" data="6683172010dcb9029f93e84148b4024b" href="http://www.bidizhaobiao.com/file/20230927/2023-09-27/00219/1695779639111.pdf" style="display:none">合同网签及备案.<mark data-markjs="true">pdf</mark></a>
  106. <br>
  107. </div>
  108. <div style="display:none;" class="richTextFetch"><div filemd5="6683172010dcb9029f93e84148b4024b"><div><a target="_blank" class="markBlue" href="/bdqyhx/266427705213665280.html" style="color: #3083EB !important;text-decoration: underline;">贵阳市白云区房屋征收管理局</a>征收劳务服务采购项目(贵阳</div><div>市白云区房屋征收管理局征收劳务服务采购项目)采购合同</div><div>公告</div><div>一、合同编号:P520113202300071M001</div><div>二、合同名称:</div><div>三、项目编号:P520113202300071M</div><div>四、项目名称:<a target="_blank" class="markBlue" href="/bdqyhx/266427705213665280.html" style="color: #3083EB !important;text-decoration: underline;">贵阳市白云区房屋征收管理局</a>征收劳务服务采购项目贵阳市白云区房屋征收</div><div>管理局征收劳务服务采购项</div><div>五、合同主体:心</div><div>采购人(甲方:<a target="_blank" class="markBlue" href="/bdqyhx/266427705213665280.html" style="color: #3083EB !important;text-decoration: underline;">贵阳市白云区房屋征收管理局</a></div><div>地址:贵阳市区房屋征收管理局</div><div>联系方式:0851-84603386</div><div>供应商(乙方)::<a target="_blank" class="markBlue" href="/bdqyhx/627082318162223104.html" style="color: #3083EB !important;text-decoration: underline;">贵阳市白之云城教育咨询有限公司</a></div><div>地址:贵司省贵阳阳市的路区云城尚品A3-3组团19栋数字内容产业园</div><div>联系方式:18817313599</div><div>六、合同主要信息:</div><div>1.主要标的信息:</div><div>主要标的名称:<a target="_blank" class="markBlue" href="/bdqyhx/266427705213665280.html" style="color: #3083EB !important;text-decoration: underline;">贵阳市白云区房屋征收管理局</a>征收劳务服务采购项目</div><div>数量:1</div><div>单价:7500000(单位:以采购文件约定的计价单位为准)</div><div>规格型号(或服务要求):劳务服务</div><div>2.合同金额:7500000.00(单位:以采购文件约定的计价单位为准)</div><div>3.履约期限、地点等简要信息:服务期限3年,采购人指定地点</div><div>4.采购方式:公开招标</div><div>七、合同签订日期:2023-09-01</div><div>八、合同公告日期:2023-08-30</div><div>九、其他补充事宜:</div><div>行业划分:其他服务业</div><div>产品类型:服务</div><div>PPP项目:查</div><div>是否联合体:查</div><div>产品供应商:贵阳市白云区云城教育咨询有限公司</div><div>代理机构名称:<a target="_blank" class="markBlue" href="/bdqyhx/557886106142765056.html" style="color: #3083EB !important;text-decoration: underline;">贵州黔诚麟云咨询有限责任公司</a></div><div>十、附件:</div><div>附件:上传合同(采购人应当按照《政府采购法实施条例》有关要求,将政府采购合同中涉</div><div>及国家秘密、商业秘密的内容删除后予以公开)</div><div>合同附件:</div><div>合同协议书.<mark data-markjs="true">pdf</mark></div><div>专用合同条款:</div><div>专用合同条款(服务)20230927091129<mark data-markjs="true">pdf</mark></div></div></div></div>
  109. '''
  110. _d = {"dochtmlcon":_html}
  111. dhtml = Document_html(_d)
  112. dhtml.delete_bidi_a()
  113. print(dhtml.getProperties().get("dochtmlcon"))