123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133 |
- #coding:utf8
- from BaseDataMaintenance.model.ots.BaseModel import BaseModel
- from tablestore import *
- from BaseDataMaintenance.common.Utils import *
- from bs4 import BeautifulSoup
# Column names of the ``document_html`` table: the two primary-key columns
# followed by the column that holds the page HTML.
document_partitionkey, document_docid, document_dochtmlcon = (
    "partitionkey",
    "docid",
    "dochtmlcon",
)
class Document_html(BaseModel):
    """Model object for the ``document_html`` table plus HTML editing helpers.

    The instance is populated from a plain dict of column values.  The helper
    methods read the ``dochtmlcon`` column, edit the markup with BeautifulSoup
    and write the result back through ``setValue``.
    """

    # BeautifulSoup's lxml builder wraps a fragment in <html><body>...; these
    # literal wrapper tags are removed before parsing and before storing so
    # the column always holds a bare fragment.
    _WRAPPER_TAGS_PATTERN = "<html>|</html>|<body>|</body>"

    def __init__(self, _dict):
        """Copy every entry of ``_dict`` except ``all_columns`` onto the model.

        Args:
            _dict: Mapping of column name to value.
        """
        BaseModel.__init__(self)
        for k, v in _dict.items():
            if k == "all_columns":
                continue
            self.setValue(k, v, True)
        self.table_name = "document_html"
        # Host fragments that isLegalUrl() treats as "known" hosts.
        self.prefixs = ["www.bidizhaobiao.com", "bxkc.oss-cn-shanghai.aliyuncs.com"]

    def _strip_wrapper_tags(self, _html):
        """Return ``_html`` with literal html/body wrapper tags removed."""
        return re.sub(self._WRAPPER_TAGS_PATTERN, "", _html)

    def getPrimary_keys(self):
        """Return the names of the primary-key columns."""
        return [document_partitionkey, document_docid]

    def isLegalUrl(self, _url, _type):
        """Check ``_url`` against the known host fragments.

        Args:
            _url: URL string to test.
            _type: ``0`` means the URL must contain one of ``self.prefixs``;
                any other value means it must contain none of them.

        Returns:
            bool: Whether the URL satisfies the rule selected by ``_type``.
        """
        _known = any(_url.find(_prefix) >= 0 for _prefix in self.prefixs)
        return _known if _type == 0 else not _known

    def updateSWFImages(self, swf_urls):
        """Append ``<img>`` tags for ``swf_urls`` to the stored HTML.

        Nothing is changed when ``swf_urls`` is empty, when the ``dochtmlcon``
        column is absent, or when an ``<img>`` whose ``src`` equals the first
        URL is already present (the images were appended earlier).

        Args:
            swf_urls: Image URLs to append, in display order.
        """
        if not swf_urls:
            return
        _raw = self.getProperties().get(document_dochtmlcon)
        if _raw is None:
            # Column not loaded: leave the model untouched.
            return
        _dochtmlcon = self._strip_wrapper_tags(_raw)
        _soup = BeautifulSoup(_dochtmlcon, "lxml")
        if _soup.find("img", {"src": swf_urls[0]}) is not None:
            return
        _images = "".join('<p><img src="%s"/></p>' % _url for _url in swf_urls)
        self.setValue(document_dochtmlcon, _dochtmlcon + "<div>" + _images + "</div>", True)

    def delete_bidi_a(self):
        """Remove every ``<a>`` whose href starts with ``http://www.bidizhaobiao.com``.

        The re-serialised markup, without html/body wrapper tags, replaces the
        ``dochtmlcon`` column.  Nothing is changed when the column is absent.
        """
        _raw = self.getProperties().get(document_dochtmlcon)
        if _raw is None:
            # Column not loaded: leave the model untouched.
            return
        _soup = BeautifulSoup(self._strip_wrapper_tags(_raw), "lxml")
        for a in _soup.find_all("a"):
            if a.attrs.get("href", "").startswith("http://www.bidizhaobiao.com"):
                a.decompose()
        self.setValue(document_dochtmlcon, self._strip_wrapper_tags(str(_soup)), True)

    def getRichTextFetch(self, list_html):
        """Render attachment texts as a sequence of ``<div>`` elements.

        Args:
            list_html: Items that are either an HTML string or a dict with the
                keys ``filemd5`` and ``html``.  Items of any other type are
                ignored.

        Returns:
            str: One ``<div>`` per accepted item; dict items carry their
            ``filemd5`` as an attribute of the div.
        """
        _parts = []
        for _ht in list_html:
            if isinstance(_ht, str):
                _parts.append("<div>%s</div>" % (_ht))
            elif isinstance(_ht, dict):
                _filemd5 = _ht.get("filemd5", "")
                _html = _ht.get("html", "")
                _parts.append('<div filemd5="%s">%s</div>' % (_filemd5, _html))
        return "".join(_parts)

    def updateAttachment(self, list_html):
        """Replace the hidden ``richTextFetch`` div with freshly rendered attachments.

        An existing ``div.richTextFetch`` is dropped from the stored HTML and a
        new hidden one built from ``list_html`` is appended.  Nothing is
        changed when ``list_html`` is empty.

        Args:
            list_html: Attachment items as accepted by ``getRichTextFetch``.
        """
        if not list_html:
            return
        _dochtmlcon = self._strip_wrapper_tags(
            self.getProperties().get(document_dochtmlcon, "") or ""
        )
        # NOTE(review): an earlier revision computed a remaining byte budget
        # from self.COLUMN_MAX_SIZE here but never applied it, so no size
        # capping is performed - confirm whether truncation is required.
        _text = '\n<div style="display:none;" class="richTextFetch">%s</div>' % (
            self.getRichTextFetch(list_html)
        )
        _soup = BeautifulSoup(_dochtmlcon, "lxml")
        _node = _soup.find("div", attrs={"class": "richTextFetch"})
        if _node is not None:
            _node.decompose()
        # Strip the wrapper the parser added so the new div is not placed
        # after a closing </html>, matching what delete_bidi_a() stores.
        self.setValue(document_dochtmlcon, self._strip_wrapper_tags(str(_soup)) + _text, True)

    def getTitleFromHtml(self, filemd5, _html):
        """Return the text of the ``<a>`` whose ``data`` attribute equals ``filemd5``.

        Args:
            filemd5: Value to look for in the ``data`` attribute.
            _html: HTML to search.

        Returns:
            str: The anchor text, or ``""`` when no such anchor exists.
        """
        _soup = BeautifulSoup(_html, "lxml")
        _find = _soup.find("a", attrs={"data": filemd5})
        if _find is None:
            return ""
        return _find.get_text()

    def getSourceLinkFromHtml(self, filemd5, _html):
        """Return the link of the element whose ``filelink`` attribute equals ``filemd5``.

        An ``<a>`` is preferred and yields its ``href``; otherwise an ``<img>``
        is looked up and yields its ``src``.

        Args:
            filemd5: Value to look for in the ``filelink`` attribute.
            _html: HTML to search.

        Returns:
            str: The link, or ``""`` when no element matches or the matching
            element lacks the attribute.
        """
        _soup = BeautifulSoup(_html, "lxml")
        _anchor = _soup.find("a", attrs={"filelink": filemd5})
        if _anchor is not None:
            return _anchor.attrs.get("href", "")
        _image = _soup.find("img", attrs={"filelink": filemd5})
        if _image is not None:
            return _image.attrs.get("src", "")
        return ""
- if __name__ == '__main__':
# Manual smoke test: run delete_bidi_a() on a sample page and print the result.
#
# The fixture below holds two anchors for the same attachment - a visible one
# carrying a ``filelink`` attribute and a hidden one carrying a ``data``
# attribute whose href starts with http://www.bidizhaobiao.com (the kind of
# anchor delete_bidi_a() removes) - followed by a hidden ``richTextFetch`` div.
# NOTE(review): the first href is a pre-signed URL that includes
# OSSAccessKeyId and Signature query parameters - consider scrubbing it.
- _html = '''
- <div id="pcontent" class="pcontent"><div>
- 贵阳市白云区房屋征收管理局征收劳务服务采购项目-贵阳市白云区房屋征收管理局征收劳务服务采购项目合同公示
- <br> 详见合同公示附件
- <br>文件预览:
- <br>
- <a target="_blank" class="markBlue" filelink="6683172010dcb9029f93e84148b4024b" href="http://attachment-hub.oss-cn-hangzhou.aliyuncs.com/6683/20230927/2023-09-27/00219/1695779639111.pdf?Expires=1695784428&OSSAccessKeyId=LTAI5tHoEUDSy6FnZjMKsNiZ&Signature=JsHtdlrV%2BWGA1595%2BzTgckLIIm0%3D" original="https://ggzy.guizhou.gov.cn/hallweb/hall/attach/nosession/download?attachId=8a8bb7458aaccc3b018ad45340a844f0" rel="noreferrer">合同网签及备案.<mark data-markjs="true">pdf</mark></a>
- <a target="_blank" class="markBlue" data="6683172010dcb9029f93e84148b4024b" href="http://www.bidizhaobiao.com/file/20230927/2023-09-27/00219/1695779639111.pdf" style="display:none">合同网签及备案.<mark data-markjs="true">pdf</mark></a>
- <br>
- </div>
- <div style="display:none;" class="richTextFetch"><div filemd5="6683172010dcb9029f93e84148b4024b"><div><a target="_blank" class="markBlue" href="/bdqyhx/266427705213665280.html" style="color: #3083EB !important;text-decoration: underline;">贵阳市白云区房屋征收管理局</a>征收劳务服务采购项目(贵阳</div><div>市白云区房屋征收管理局征收劳务服务采购项目)采购合同</div><div>公告</div><div>一、合同编号:P520113202300071M001</div><div>二、合同名称:</div><div>三、项目编号:P520113202300071M</div><div>四、项目名称:<a target="_blank" class="markBlue" href="/bdqyhx/266427705213665280.html" style="color: #3083EB !important;text-decoration: underline;">贵阳市白云区房屋征收管理局</a>征收劳务服务采购项目贵阳市白云区房屋征收</div><div>管理局征收劳务服务采购项</div><div>五、合同主体:心</div><div>采购人(甲方:<a target="_blank" class="markBlue" href="/bdqyhx/266427705213665280.html" style="color: #3083EB !important;text-decoration: underline;">贵阳市白云区房屋征收管理局</a></div><div>地址:贵阳市区房屋征收管理局</div><div>联系方式:0851-84603386</div><div>供应商(乙方)::<a target="_blank" class="markBlue" href="/bdqyhx/627082318162223104.html" style="color: #3083EB !important;text-decoration: underline;">贵阳市白之云城教育咨询有限公司</a></div><div>地址:贵司省贵阳阳市的路区云城尚品A3-3组团19栋数字内容产业园</div><div>联系方式:18817313599</div><div>六、合同主要信息:</div><div>1.主要标的信息:</div><div>主要标的名称:<a target="_blank" class="markBlue" href="/bdqyhx/266427705213665280.html" style="color: #3083EB !important;text-decoration: underline;">贵阳市白云区房屋征收管理局</a>征收劳务服务采购项目</div><div>数量:1</div><div>单价:7500000(单位:以采购文件约定的计价单位为准)</div><div>规格型号(或服务要求):劳务服务</div><div>2.合同金额:7500000.00(单位:以采购文件约定的计价单位为准)</div><div>3.履约期限、地点等简要信息:服务期限3年,采购人指定地点</div><div>4.采购方式:公开招标</div><div>七、合同签订日期:2023-09-01</div><div>八、合同公告日期:2023-08-30</div><div>九、其他补充事宜:</div><div>行业划分:其他服务业</div><div>产品类型:服务</div><div>PPP项目:查</div><div>是否联合体:查</div><div>产品供应商:贵阳市白云区云城教育咨询有限公司</div><div>代理机构名称:<a target="_blank" class="markBlue" href="/bdqyhx/557886106142765056.html" style="color: #3083EB !important;text-decoration: 
underline;">贵州黔诚麟云咨询有限责任公司</a></div><div>十、附件:</div><div>附件:上传合同(采购人应当按照《政府采购法实施条例》有关要求,将政府采购合同中涉</div><div>及国家秘密、商业秘密的内容删除后予以公开)</div><div>合同附件:</div><div>合同协议书.<mark data-markjs="true">pdf</mark></div><div>专用合同条款:</div><div>专用合同条款(服务)20230927091129<mark data-markjs="true">pdf</mark></div></div></div></div>
- '''
# Wrap the fixture as the only column of a Document_html instance.
- _d = {"dochtmlcon":_html}
- dhtml = Document_html(_d)
# Strip the bidizhaobiao anchors, then show the stored HTML.
- dhtml.delete_bidi_a()
- print(dhtml.getProperties().get("dochtmlcon"))
|