# 2.py (977 B)
import codecs
import time
from typing import Optional

import gunicorn
import requests
from bs4 import BeautifulSoup

from BaseDataMaintenance.common.Utils import article_limit
  7. def getAttachPath(filemd5,_dochtmlcon):
  8. _soup = BeautifulSoup(_dochtmlcon,"lxml")
  9. list_mark = ["data","filelink"]
  10. for _mark in list_mark:
  11. _find = _soup.find("a",attrs={_mark:filemd5})
  12. filelink = ""
  13. if _find is None:
  14. _find = _soup.find("img",attrs={_mark:filemd5})
  15. if _find is not None:
  16. filelink = _find.attrs.get("src","")
  17. else:
  18. filelink = _find.attrs.get("href","")
  19. if filelink.find("bidizhaobiao")>=0:
  20. _path = filelink.split("/file")
  21. if len(_path)>1:
  22. return _path[1]
  23. if __name__ == '__main__':
  24. text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
  25. filemd5='61393b5ef3d460b3714eb9667682144f'
  26. print(getAttachPath(filemd5,text))