1234567891011121314151617181920212223242526272829303132333435 |
- import time
- import gunicorn
- import requests
- from bs4 import BeautifulSoup
- from BaseDataMaintenance.common.Utils import article_limit
- import codecs
def getAttachPath(filemd5, _dochtmlcon):
    """Return the attachment path found after "/file" in the matching link.

    The HTML is parsed with BeautifulSoup using the "lxml" parser. For each
    marker attribute ("data", then "filelink") the first ``<a>`` tag whose
    attribute equals ``filemd5`` is looked up and its ``href`` is used; when no
    such ``<a>`` exists, the first matching ``<img>`` tag is tried and its
    ``src`` is used. Only links containing "bidizhaobiao" are considered.

    Args:
        filemd5: Attribute value used to locate the attachment tag.
        _dochtmlcon: HTML markup to search.

    Returns:
        Everything after the first "/file" in the link, or ``None`` when no
        tag matches, the link does not contain "bidizhaobiao", or the link
        has no "/file" segment.
    """
    _soup = BeautifulSoup(_dochtmlcon, "lxml")
    for _mark in ("data", "filelink"):
        _find = _soup.find("a", attrs={_mark: filemd5})
        if _find is not None:
            filelink = _find.attrs.get("href", "")
        else:
            # No anchor carries the marker; fall back to an image tag.
            _find = _soup.find("img", attrs={_mark: filemd5})
            filelink = _find.attrs.get("src", "") if _find is not None else ""
        if "bidizhaobiao" not in filelink:
            continue
        # Split at the FIRST "/file" only, so a later "/file..." inside the
        # path (e.g. ".../file/2021/file_a.pdf") does not truncate the result.
        _, _sep, _tail = filelink.partition("/file")
        if _sep:
            return _tail
    return None
if __name__ == '__main__':
    # Manual check: read a local HTML sample and print the attachment path
    # resolved for a fixed md5 value.
    # A raw string avoids the invalid "\A" escape sequence, and the context
    # manager closes the file handle once the content has been read.
    with codecs.open(r"C:\Users\Administrator\Desktop\2.html", "r", encoding="utf8") as _f:
        text = _f.read()
    filemd5 = '61393b5ef3d460b3714eb9667682144f'
    print(getAttachPath(filemd5, text))
|