#coding:utf8
"""Model object for rows of the ``document_html`` table."""

from BaseDataMaintenance.model.ots.BaseModel import BaseModel
from tablestore import *
from BaseDataMaintenance.common.Utils import *
from bs4 import BeautifulSoup

# Column names used by this model.
document_partitionkey = "partitionkey"
document_docid = "docid"
document_dochtmlcon = "dochtmlcon"


class Document_html(BaseModel):
    """A single ``document_html`` row, populated from a plain dict of columns."""

    def __init__(self, _dict):
        """Copy every entry of ``_dict`` onto the model via ``setValue``.

        Args:
            _dict: Mapping of column name to value. The ``"all_columns"`` key
                is skipped and never stored.
        """
        BaseModel.__init__(self)
        for k, v in _dict.items():
            if k == "all_columns":
                continue
            # NOTE(review): the meaning of the third argument is defined by
            # BaseModel.setValue, which is not visible here -- confirm.
            self.setValue(k, v, True)
        self.table_name = "document_html"
        # Host fragments that isLegalUrl() looks for inside a URL.
        self.prefixs = ["www.bidizhaobiao.com", "bxkc.oss-cn-shanghai.aliyuncs.com"]

    def getPrimary_keys(self):
        """Return the primary-key column names, partition key first."""
        # Use the module constants so the key names are spelled in one place.
        return [document_partitionkey, document_docid]

    def isLegalUrl(self, _url, _type):
        """Classify ``_url`` by whether it contains one of ``self.prefixs``.

        The check is a substring test (the fragment may appear anywhere in
        the URL), not a ``startswith`` test.

        Args:
            _url: URL string to inspect.
            _type: ``0`` accepts URLs that contain a known fragment; any other
                value accepts URLs that contain none of them.

        Returns:
            ``True`` when ``_url`` is acceptable for the requested ``_type``,
            otherwise ``False``.
        """
        _flag = any(_prefix in _url for _prefix in self.prefixs)
        if _type == 0:
            return _flag
        return not _flag

    # NOTE(review): the original line continues with the head of
    # updateSWFImages. That method runs past the end of the visible source and
    # its regex literal is split across the line break (and looks damaged), so
    # it is intentionally not rewritten. Its visible head is kept verbatim:
    # def updateSWFImages(self,swf_urls): if len(swf_urls)>0: _dochtmlcon = self.getProperties().get(document_dochtmlcon) _dochtmlcon = re.sub("||
|","",_dochtmlcon) _soup = BeautifulSoup(_dochtmlcon,"lxml") if _soup.find("img",{"src":swf_urls[0]}) is None: _div = "