#coding:utf8
"""Model for rows of the ``document_html`` table and helpers that edit the
HTML stored in the ``dochtmlcon`` column.

NOTE(review): in the copy of this file that was reviewed, the HTML markup
inside several string literals had been stripped (format strings had lost
their ``%s`` placeholders and the wrapper-tag regex was reduced to ``|||``).
Those literals are reconstructed below from what the surrounding code relies
on; each one is marked and should be confirmed against the canonical source.
"""
import re

from BaseDataMaintenance.model.ots.BaseModel import BaseModel
from tablestore import *
from BaseDataMaintenance.common.Utils import *
from bs4 import BeautifulSoup

document_partitionkey = "partitionkey"
document_docid = "docid"
document_dochtmlcon = "dochtmlcon"

# BeautifulSoup's "lxml" parser wraps fragments in <html><body>...</body></html>;
# these wrapper tags are removed before the HTML is parsed again or stored.
# NOTE(review): reconstructed pattern -- confirm against the canonical source.
_WRAPPER_TAG_PATTERN = re.compile("<html>|</html>|<body>|</body>")


def _strip_wrapper_tags(html):
    """Return *html* without the html/body wrapper tags."""
    return _WRAPPER_TAG_PATTERN.sub("", html)


class Document_html(BaseModel):
    """A single ``document_html`` row, populated from a plain dict."""

    def __init__(self, _dict):
        """Copy every entry of *_dict* except ``all_columns`` into the model."""
        BaseModel.__init__(self)
        for k, v in _dict.items():
            if k == "all_columns":
                continue
            self.setValue(k, v, True)
        self.table_name = "document_html"
        # Substrings that mark a URL as belonging to one of our own hosts.
        self.prefixs = ["www.bidizhaobiao.com", "bxkc.oss-cn-shanghai.aliyuncs.com"]

    def getPrimary_keys(self):
        """Return the names of the primary-key columns."""
        return ["partitionkey", "docid"]

    def isLegalUrl(self, _url, _type):
        """Check *_url* against the known host prefixes.

        With ``_type == 0`` the result is True when the URL contains one of
        ``self.prefixs``; with any other ``_type`` the result is inverted.
        """
        _flag = any(_prefix in _url for _prefix in self.prefixs)
        if _type == 0:
            return _flag
        return not _flag

    def updateSWFImages(self, swf_urls):
        """Append one ``<img>`` per URL in *swf_urls* to ``dochtmlcon``.

        Nothing is changed when *swf_urls* is empty or when an image whose
        ``src`` equals the first URL is already present in the document.
        """
        if not swf_urls:
            return
        # A missing/None column is treated as an empty document instead of
        # crashing inside the regex substitution.
        _dochtmlcon = self.getProperties().get(document_dochtmlcon) or ""
        _dochtmlcon = _strip_wrapper_tags(_dochtmlcon)
        _soup = BeautifulSoup(_dochtmlcon, "lxml")
        if _soup.find("img", {"src": swf_urls[0]}) is not None:
            return
        # NOTE(review): markup reconstructed. The guard above only works if the
        # appended fragment contains <img src="...">; the wrapper elements and
        # any attributes they carried originally need to be confirmed.
        _imgs = "".join('<p><img src="%s"/></p>' % (_url,) for _url in swf_urls)
        _dochtmlcon += "<div>" + _imgs + "</div>"
        # NOTE(review): the collapsed source does not show whether this write
        # sat inside or outside the "image not yet present" guard; it is
        # performed only when images were actually appended.
        self.setValue(document_dochtmlcon, _dochtmlcon, True)

    def delete_bidi_a(self):
        """Remove every ``<a>`` whose href starts with http://www.bidizhaobiao.com."""
        _dochtmlcon = self.getProperties().get(document_dochtmlcon)
        if _dochtmlcon is None:
            # No HTML stored: nothing to clean up.
            return
        _soup = BeautifulSoup(_strip_wrapper_tags(_dochtmlcon), "lxml")
        for a in _soup.find_all("a"):
            if a.attrs.get("href", "").startswith("http://www.bidizhaobiao.com"):
                a.decompose()
        # str(_soup) carries the parser's wrapper tags again, so strip them
        # once more before storing.
        self.setValue(document_dochtmlcon, _strip_wrapper_tags(str(_soup)), True)

    def getRichTextFetch(self, list_html):
        """Render attachment contents as a string of HTML fragments.

        Each item of *list_html* is either a plain string, or a dict whose
        ``filemd5`` and ``html`` entries are used. Items of any other type
        are skipped.
        """
        _parts = []
        for _ht in list_html:
            if isinstance(_ht, str):
                # NOTE(review): markup reconstructed -- confirm.
                _parts.append("<div>%s</div>" % (_ht,))
            elif isinstance(_ht, dict):
                _filemd5 = _ht.get("filemd5", "")
                _html = _ht.get("html", "")
                # NOTE(review): markup reconstructed; two placeholders are
                # required for (filemd5, html) but the attribute name is an
                # assumption -- confirm.
                _parts.append('<div filemd5="%s">%s</div>' % (_filemd5, _html))
        return "".join(_parts)

    def updateAttachment(self, list_html):
        """Replace the ``richTextFetch`` section of ``dochtmlcon``.

        Any existing ``<div class="richTextFetch">`` is removed and a new one
        built from *list_html* is appended. Does nothing when *list_html* is
        empty.
        """
        if not list_html:
            return
        # Normalise a stored None to "" so the substitution below cannot fail.
        _dochtmlcon = self.getProperties().get(document_dochtmlcon, "") or ""
        _dochtmlcon = _strip_wrapper_tags(_dochtmlcon)
        # NOTE(review): the previous code computed a remaining-size budget from
        # self.COLUMN_MAX_SIZE but never applied it; no truncation is done here
        # either.
        # NOTE(review): markup reconstructed. The class must be "richTextFetch"
        # for the lookup below to find it on the next call; the hidden style is
        # an assumption -- confirm.
        _text = '\n<div style="display:none;" class="richTextFetch">\n%s\n</div>' % (
            self.getRichTextFetch(list_html),
        )
        _soup = BeautifulSoup(_dochtmlcon, "lxml")
        _node = _soup.find("div", attrs={"class": "richTextFetch"})
        if _node is not None:
            _node.decompose()
        self.setValue(document_dochtmlcon, str(_soup) + _text, True)

    def getTitleFromHtml(self, filemd5, _html):
        """Return the text of the ``<a data=filemd5>`` element, or "" if absent."""
        _soup = BeautifulSoup(_html, "lxml")
        _find = _soup.find("a", attrs={"data": filemd5})
        if _find is None:
            return ""
        return _find.get_text()

    def getSourceLinkFromHtml(self, filemd5, _html):
        """Return the link of the element whose ``filelink`` equals *filemd5*.

        An ``<a>`` is preferred (its ``href`` is returned); otherwise an
        ``<img>`` is looked up (its ``src`` is returned). Returns "" when
        neither exists.
        """
        _soup = BeautifulSoup(_html, "lxml")
        _find = _soup.find("a", attrs={"filelink": filemd5})
        if _find is not None:
            return _find.attrs.get("href", "")
        _find = _soup.find("img", attrs={"filelink": filemd5})
        if _find is not None:
            return _find.attrs.get("src", "")
        return ""


if __name__ == '__main__':
    # Small manual check of delete_bidi_a.
    _html = '''
贵阳市白云区房屋征收管理局征收劳务服务采购项目-贵阳市白云区房屋征收管理局征收劳务服务采购项目合同公示
详见合同公示附件
文件预览:
合同网签及备案.pdf
'''
    _d = {"dochtmlcon": _html}
    dhtml = Document_html(_d)
    dhtml.delete_bidi_a()
    print(dhtml.getProperties().get("dochtmlcon"))