123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518 |
- '''
- Created on 2018年12月26日
- @author: User
- '''
- import sys
- import os
- import json
- import codecs
- import re
- sys.path.append(os.path.abspath("../.."))
- import requests
- import time
# Hand-collected list-page URLs used as sample input for the content-extraction
# service.  The only visible consumer is the commented-out batch snippet that
# follows, which sends an entry as the "listpage_url" field.
list_url = [
    "http://www.csssyxx.com/xwgk/tzgg",
    "http://nyj.yueyang.gov.cn/nyj/7869/7871/default.htm",
    "http://www.xp.gov.cn/zwgk/072011/govmore.html",
    "http://www.yiyang.gov.cn/sjkzx/4464/4469/default.htm",
    "http://lsyshjj.leshan.gov.cn/szwfwzx/tzgg/list.shtml",
    "http://szrzyj.ziyang.gov.cn/catalog.aspx?id=10",
    "http://sww.chengdu.gov.cn/cdswh/gsgg/gdlist1526618933789.shtml",
    "http://cdaudit.chengdu.gov.cn/cdsjj/c113200/list_1.shtml",
    "http://nyj.yueyang.gov.cn/nyj/7869/7871/default.htm",
    "http://yyjt.yiyang.gov.cn/yyjtj/6270/6279/6288/6303/default.htm",
    "http://www.xp.gov.cn/zwgk/072011/govmore.html",
    "http://www.yiyang.gov.cn/sjkzx/4464/4469/default.htm",
    "http://lsyshjj.leshan.gov.cn/szwfwzx/tzgg/list.shtml",
    "http://szrzyj.ziyang.gov.cn/catalog.aspx?id=10",
    "http://jtys.chengdu.gov.cn/cdjt/c108497/xw_list.shtml",
    "http://sww.chengdu.gov.cn/cdswh/gsgg/gdlist1526618933789.shtml",
    "http://sthj.chengdu.gov.cn/cdhbj/c110798/list_1.shtml",
    "http://www.djy.gov.cn/djyszfmhwz/c130537/xwzx_list.shtml",
    "http://jhj.my.gov.cn//xwzx/tzgg/index.html",
    "http://cl.my.gov.cn/tzgg/index.html",
    "http://dag.my.gov.cn/tzgg/index.html",
    "http://fgw.my.gov.cn/tzgg/index.html",
    "http://nyncj.my.gov.cn/tzgg/index.html",
    "http://jyfwzx.my.gov.cn/gggs/index.html",
    "http://jxj.my.gov.cn/gsgg/index.html",
    "http://mymz.my.gov.cn/zwgk/tzgg/index.html",
    "http://wgl.my.gov.cn/gggs/index.html",
    "http://scjg.my.gov.cn/zwdt/gsgg/index.html",
    "http://www.zitong.gov.cn/content/column/4704911?pageIndex=1",
    "http://www.hunancatv.com/tender.aspx?class=64&channel=57",
    "http://www.yantaibank.net/publish/ytbank/31388/31548/index.html#main",
    "http://www.tjbhb.com/bhyhww/jzcg/zjgg/index.html",
    "http://jw.guiyang.gov.cn/c8346/",
    "http://ggzyjy.lsz.gov.cn/TPFront/jyxx/005001/005001003/",
    "http://ggzyjy.lsz.gov.cn/TPFront/jyxx/005001/005001008/",
    "http://www.hflyzx.net/Nav_dongtai.shtml?whichpage=1&SS_ID=11",
    "http://hwcz.snxjyj.cn/xwdt/tzgg.htm",
    "http://www.jtdzpt.com/expsteel/exp/tender/sell/bout/moreTdBoutResult.htm?type=1&status=3",
    "http://www.cinda.com.cn/xdjt/xdjtpd/cgxxgs/list.shtml",
    "http://www.sdebank.com/cms/S101_21/infoCenter/pmxx/index.html",
    "http://bid.9to.com/list.php/catid-236/",
    "http://www.bankcomm.com/BankCommSite/shtml/jyjr/cn/7804/2600473/2600509/list_1.shtml?channelId=7804",
    "http://www.xtjfjt.com/a/xinwenzhongxin/tongzhigonggao/",
    "http://taochonghu.hf168.net/Home/News/67?pageindex=1",
    "http://www.hfsjcxx.com/SortHtml/1/List_19.html",
    "http://www.cimeec.com/mxshop/web/goods.do",
    "https://www.znjjzx.net/col.jsp?id=116",
    "http://www.fcd.com.cn/tzgg/",
    "http://www.bnds.cn/news/4.html",
    "http://www.tjflis.com/webDisplay/ESite/More.jsp?strID=1117&num=3",
    "http://www.nyjyedu.cn/content/init.action?channelId=channel_302&page=1",
    "http://www.ouswgd.cn/swou/xnzb/list2.shtml",
    "http://www.lsgjsyxx.com/gonewlist?columnId=32",
    "https://www.hbdx.gov.cn/info_infoCategory.jsp?data_name=tzgg_new",
    "http://www.hnswdx.gov.cn/news/tzgg/tzgg/index.html",
    "http://sdx.maoming.gov.cn/newslist.aspx?ID=57",
    "http://fjclzz.com/new_list.aspx?fid=9",
    "http://fd4zh.30edu.com.cn/Article/46a834fb-c5e2-4805-983d-8162e9887e7e/1.shtml",
    "http://www.zhgxqrmyy.com/news/93/",
    "http://bdzc.sasu.edu.cn/list.php?pageone=0&cid=28",
    "http://sccyy.chuzhou.gov.cn/3903378.html",
    "http://www.ymgfgs.cn/index.php/news/admin/1/cn/104/104.html",
    "http://www.tjbhb.com/bhyhww/jzcg/zbjg/index.html",
    "http://www.mdzx.net/index.php/xsxwin/tzgg/",
    "http://www.zzsfybjy.com/index.php?m=content&c=index&a=lists&catid=24",
    "http://www.cndnce.com/bidding/barginlist/List_1.aspx",
    "http://www.cndnce.com/bidding/notice/List_1.aspx",
    "http://www.bankcomm.com/BankCommSite/shtml/jyjr/cn/7804/2600473/2600510/list_1.shtml?channelId=7804",
    "http://www.hljnkzyy.org.cn/index.php?p=news_list&lanmu=12",
    "http://www.hsxrmyy.com/zhaobiao/menu.aspx",
    "http://www.zhswdx.cn/portal/rest/menu/ummenuTwoList?menu=zytz&siteType=zytz",
    "https://www.bjsx.com.cn/list.jsp?totalpage=19&PAGENUM=2&urltype=tree.TreeTempUrl&wbtreeid=1049",
    "http://swdx.np.gov.cn/cms/sitemanage/index.shtml?siteId=100422578395950000&page=1",
    "http://www.gdsgznx.com/xwgk/Index.html",
    "http://www.nj13zhs.cn/zhxyjz/articlelist.aspx?id=14&page=1",
    "http://www.99snsn.com/zwgk.html",
    "http://kfyy.changde.gov.cn/col/col32819/index.html",
    "http://gaj.yanan.gov.cn/info/iList.jsp?cat_id=10194",
    "http://www.yanhe.gov.cn/xwzx/tzgg/index.html",
    "http://www.fdfzjt.com/announcement/2",
    "http://www.xagj.com.cn/item?pid=712a95df750c40a48854e25c95bfaa65",
    "http://www.xasrlgs.com/index/news/index/id/3.html",
    "http://www.xazls.com//zbpt/index.htm",
    "http://www.xazls.com//gsgg/index.htm",
    "http://www.aspd.gov.cn/zwgk/zdlygk/ggzypz/ggzyjy/",
    "http://www.aspd.gov.cn/xwzx/tzgg/index.html",
    "http://www.anshunjkq.gov.cn/xxgk/xxgkml/zdlyxxgk/zfcg/zbgg/list.html",
    "http://www.anshunjkq.gov.cn/xxgk/xxgkml/zdlyxxgk/ggzypz/list.html",
    "http://jtj.anshun.gov.cn/jtxw/tzgg/index.html",
    "http://nyncj.anshun.gov.cn/gzdt/tzgg/",
    "http://wgl.anshun.gov.cn/gzdt/tzgg/index.html",
    "http://hgsgwh.anshun.gov.cn/xxgk/xxgkml/zdlyxx/zfcg/zbgg/",
    "http://www.anshunjkq.gov.cn/xxgk/xxgkml/zdlyxxgk/zfcg/zbgg/list.html",
    "http://www.gzzn.gov.cn/xxgk/zxgk/tzgg_40284/index.html",
    "http://www.gzsmzmuseum.cn/list-7.html",
    "http://dsjw.guiyang.gov.cn/c9576/index.html",
    "http://gaj.guiyang.gov.cn/jszx/tzgg/jtgll/",
    "http://jw.guiyang.gov.cn/c8346/",
    "http://jyj.guiyang.gov.cn/pd_jydt/node_2948.htm?current=2951",
    "http://kjj.guiyang.gov.cn/a/xxgk/tzgg/list_25_1.html",
    "http://nync.guiyang.gov.cn/c8121/",
    "http://rfb.guiyang.gov.cn/c6686/index.html",
    "http://gzw.guiyang.gov.cn/c7905/",
    "http://sfj.guiyang.gov.cn/zwpd/list_infor_dtxx.aspx?tid=7&lx=2&pg=0",
    "http://tyj.guiyang.gov.cn/c17124/",
    "http://wjw.guiyang.gov.cn/c7124/",
    "http://gsj.guiyang.gov.cn/tzgg.jsp?syscode=AE01&id=11",
    "http://tzcj.guiyang.gov.cn/c8588/",
    "http://www.gzspm.com/new.asp?anclassid=4",
    "http://snyncj.gzlps.gov.cn/gzdt_42339/gsgg/index.html",
    "http://rfb.gzlps.gov.cn/gzdt/tzgg/index.html",
    "http://mwr.gzlps.gov.cn/gzdt/tzgg/index.html",
    "http://lpstsjyxx.chinayunnet.com/xwgk/tzgg/index.html",
    "http://swjj.gzlps.gov.cn/gzdt/tzgg/index.html",
    "http://www.bjq.gov.cn/xxgk/zfxxgkml/zdxxgk/ggzypz/zfcg_57661/cggg/",
    "http://www.bjq.gov.cn/xwzx/tzgg/index.html",
    "http://www.shiqian.gov.cn/zwgk/xxgkml/zdlygk/tzgg/list.html",
    "http://www.shiqian.gov.cn/zwgk/xxgkml/zdlygk/ggzypz/list.html",
    "http://www.trws.gov.cn/zwgk/xxgkml/zdlygk/ggzypz/zbgg/list.html",
    "http://www.trws.gov.cn/xwzx/tzgg/index.html",
    "http://www.dalong.gov.cn/html/zhengwugongkai/tongzhigonggao/index.html",
    "http://www.trs.gov.cn/xxgk/zdlygk/zfcg/zbgg_59709/index.html",
    "http://www.trs.gov.cn/xwzx/tzgg/gsgg/index.html",
    "http://www.yanhe.gov.cn/xwzx/tzgg/index.html",
    "http://www.yinjiang.gov.cn/xwzx/tzgg/index.html",
    "http://www.yinjiang.gov.cn/xxgk/zdxxgk/ggzypz/tdzy/index.html",
    "http://www.yinjiang.gov.cn/xxgk/zdxxgk/zdjsxm/zbtb/",
    "http://www.yuping.gov.cn/zwgk/xxgkml/zdlygk/zfcg/zbgg/",
    "http://www.yuping.gov.cn/zwgk/xxgkml/zdlygk/zdjsxm/ztb/",
    "http://www.gzcz.gov.cn/xwzx/tzgg/index.html",
    "http://jr.guizhou.gov.cn/tzgg/index.html",
    "http://fpb.guizhou.gov.cn/xwzx/tzgg/index.html",
    "http://www.gzcoop.gov.cn/xwzx/tzgg/",
    "http://www.gzsjyt.gov.cn/xwzx/tzgg/index.html",
    "http://kjt.gzst.gov.cn/xwzx/tzgg_73876/index.html",
    "http://www.gzaas.org.cn/xxgk/zdgk/tzgg/",
    "http://rfb.guizhou.gov.cn/xwzx/tzgg/index.html",
    "http://www.suiyang.gov.cn/xwzx/tzgg/index.html",
    "http://daj.zunyi.gov.cn/gzdt/tzgg/",
    "http://www.zyredcross.cn/news-1-1-0.html",
    "http://www.zysjwssc.com/emall/zunyi/jingjiaInfo.aspx",
    "http://rsj.zunyi.gov.cn/web/12731/index.html",
    "http://zyepb.zunyi.gov.cn/news34/news_more.asp?page=1&word=&lm=&lm2=340&lmname=&open=&n=&hot=&tj=",
    "http://scjgj.zunyi.gov.cn/gsgg/qt/",
    "http://mts.zmu.edu.cn/tzxw/tzgg.htm",
    "http://www.chidi.com.cn/col/col6870/index.html",
    "http://www.chidi.com.cn/col/col6872/index.html",
    "http://mpnr.chengdu.gov.cn/second/zpgjg.aspx?ClassID=001002002006001",
    "http://mpnr.chengdu.gov.cn/List.aspx?ClassID=001002001003",
    "http://gzw.chengdu.gov.cn/cdgzw/c107965/list.shtml",
    "http://www.cfyy.net/tender_sub/",
    "http://www.hxdental.cn/news/bid/",
    "http://www.chinawestagr.com/homepage/list_aff.asp",
    "http://sctcm.sc.gov.cn/get/class/scszyyglj/gggs/index.html",
    "http://www.dzzjcs.com/jyxx/tradeInfo.html",
    "http://gzw.deyang.gov.cn/list.asp?id=1&smallid=248&bname=%E7%BB%BC%E5%90%88%E4%BF%A1%E6%81%AF&Sname=%E9%80%9A%E7%9F%A5%E9%80%9A%E5%91%8A",
    "http://www.zjxzyyy.cn/plus/list.php?tid=45",
    "http://www.scdl.gov.cn/xwzx/tzgg.htm",
    "http://www.dp.gov.cn/dpxw/zwgz/gsgg.htm",
    "http://www.rs.gov.cn/xxgk/gsgg.htm",
    "http://www.ncjttz.com/index.php?m=content&c=index&a=lists&catid=18",
    "http://gsj.nanchong.gov.cn/portal/123",
    "http://www.xichong.gov.cn/news/notice/department/index.html",
    "http://www.xichong.gov.cn/catalog/378/index.html",
    "http://www.xichong.gov.cn/catalog/428/index.html",
    "http://gzw.panzhihua.gov.cn/zwgk/tzgg/index.shtml",
    "http://ctel.invest.com.cn/news/bid/index_1.html",
    "http://nync.guiyang.gov.cn/c8121/",
    "http://www.scjianke.com/news1.aspx?t=38",
    "http://www.sc2zyy.com/list.asp?id=9",
    "http://www.cd7yy.com/news/news-18p1.html?tdsourcetag=s_pctim_aiomsg",
    "http://jw.guiyang.gov.cn/c8346/",
    "http://www.xacbdc.com/index.php?s=zbzs&c=category&id=1",
    "http://gaj.yanan.gov.cn/info/iList.jsp?cat_id=10194",
    "http://www.gzsjyt.gov.cn/xwzx/tzgg/index.html",
    "http://ztbgl.yangtzeu.edu.cn/hwcg.htm?tdsourcetag=s_pctim_aiomsg",
    "http://www.ncct.cc/html/list_11.html",
    "http://www.zqdh.gov.cn/gzjg/zqsdhqzfhcxjsj/zbtb/",
    "http://www.fucai.cn/art/gsgg/",
    "http://www.gscq.com.cn/index.php?s=xm&c=category&id=1",
    "http://yh.yali.edu.cn/index.php?m=web&c=list&id=20&pid=0",
    "http://www.cscjedu.com/News/Module.aspx?id=c5d5fc0b-70d6-412f-a81c-2ed45c1dd3e5",
    "http://www.csssyxx.com/gonewlist?columnId=36",
    "http://www.cstsjy.cn/tongzhigonggao",
    "http://www.hbhtzx.com/view-18.html",
    "http://yh.yali.edu.cn/index.php?m=web&c=list&id=20&pid=0",
    "http://www.cscjedu.com/News/Module.aspx?id=c5d5fc0b-70d6-412f-a81c-2ed45c1dd3e5",
    "http://www.csssyxx.com/gonewlist?columnId=36",
    "http://www.cstsjy.cn/tongzhigonggao",
    "http://www.hbhtzx.com/view-18.html",
    "http://www.yalisy.cn/xxgg",
    "http://www.hiec.cn/e/action/ListInfo/?classid=25",
    "http://www.hsdlzx.net/a/xiaowugongkai/tongzhigonggao/",
    "http://www.zhounanshiyan.com/gongg/xwgk/",
    "https://cyyzxxx.30edu.com.cn/Article/f29e0d5e-04f7-68cf-26bd-d61532162df8/",
    "http://www.myfls.com.cn/pcweb/article_list/DynamicNotice.htm",
    "http://www.wenshang.gov.cn/module/xxgk/search.jsp?divid=div23663&infotypeId=27030301&jdid=104&area=&sortfield=createdatetime:0,orderid:0",
    "http://www.liangshan.gov.cn/module/xxgk/search.jsp?divid=div32242&infotypeId=LSA09100403&jdid=112&area=&currpage=1",
    "http://www.csx.gov.cn/csxjiaoyuju/xxgk8592/ztb23/",
    "http://www.sdde.gov.cn/ywdt/gggs/",
    "http://www.ahjzyy.cn/Nav_wangshang.asp?SS_ID=85",
    "http://www.ahjzyy.cn/SortHtml/1/3465299358.html",
    "http://www.sdde.gov.cn/dexxgk/xzfbm/daxnyj/",
    "http://www.sdde.gov.cn/dexxgk/xzfbm/daxzjj/",
    "http://www.yutai.gov.cn/module/xxgk/search.jsp?divid=div55&infotypeId=YTA060201&jdid=111&area=&currpage=1",
    "http://lxxxgk.bozhou.gov.cn/opennessTarget/?branch_id=53fe9198cbb812e0d509f771&column_code=170302&page=1",
    "http://www.eszwdx.com/index.php?m=content&c=index&a=lists&catid=17",
    "http://www.hbei.com.cn/news/mess/",
    "http://pl.km.gov.cn/qsbmsz/qzfgbm/qwsj/tzgg/",
    "http://gd.km.gov.cn/zfxxgkml/zdlyxxgk/czzjxx/zfcg/",
    "http://jn.km.gov.cn/jrjnol/gsggol/",
    "http://yl.km.gov.cn/tzgg/",
    "http://gbdsj.gd.gov.cn/zwgk/zfcg/index.html",
    "https://www.glutnn.cn/article.aspx?classid=7&page=1",
    "http://safety.lyg.gov.cn/tzgg/tzgg.html",
    "http://www.fenxi.gov.cn/channels/2300.html",
    "http://www.hatjy.com/index.php?m=content&c=index&a=lists&catid=197",
    "http://ggzy.wulanchabu.gov.cn/jyxx/jsgczbhxrgs",
    "http://ledong.hainan.gov.cn/ledong/0400/right.shtml",
    "http://xdp.shuanghui.net:8010/webportal/index/bidnotice/list/2.do",
    "http://ggzyjy.yanbian.gov.cn/jyxx/005006/005006002/aboutsub.html",
    "http://ggzyjy.yanbian.gov.cn/jyxx/005006/005006001/aboutsub.html",
    "http://www.icbc.com.cn/ICBC/%E6%B1%9F%E8%8B%8F%E5%88%86%E8%A1%8C/%E6%9C%80%E6%96%B0%E4%B8%9A%E5%8A%A1/%E9%9B%86%E4%B8%AD%E9%87%87%E8%B4%AD%E4%BF%A1%E6%81%AF%E5%85%AC%E5%BC%80/",
    "http://tiyuju.tangshan.gov.cn/tiyu/tongzhigonggao_tyj/",
    "http://www.jj.gov.cn/col/col1326300/",
    "http://www.jj.gov.cn/col/col1375711/",
    "http://zrghj.sjz.gov.cn/sjz/gsgg/tdsc/",
    "http://tiyuju.tangshan.gov.cn/tiyu/tongzhigonggao_tyj/",
    "http://jkq.nanning.gov.cn/html/xxgkml/ggzypz/kyqcr/",
    "http://jgswj.jiaxing.gov.cn/col/col1537381/",
    "http://www.hbxl.gov.cn/info/index.jsp?id=293&name=%E5%85%AC%E5%91%8A%E5%85%AC%E7%A4%BA&type=DIMMENSION_A",
    "http://zrzyhghj.jiaxing.gov.cn/col/col1541584/",
    "http://jtj.lf.gov.cn/main.php?action=displaymore&s=133&p=1",
    "http://ztbgl.yangtzeu.edu.cn/hwcg.htm",
    "http://ztbgl.yangtzeu.edu.cn/index/yx_bm_cg.htm",
    "http://zbb.lzu.edu.cn/lzupage/B20160106020352.html",
    "http://zbb.lzu.edu.cn/lzupage/B20160106020426.html",
    "http://ggzyjy.yanbian.gov.cn/jyxx/005005/005005002/aboutsub.html",
    "http://www.gdzyy.cn/xinxi/zhaobiao/",
    "https://www.glutnn.cn/article.aspx?classid=7&page=1",
    "http://news.hict.org.cn/Html/xiaonagonggao/",
    "http://www.tjbhb.com/bhyhww/jzcg/zjgg/index.html",
    "http://www.zzsfybjy.com/index.php?m=content&c=index&a=lists&catid=166",
    "http://oa.xjtu.edu.cn/zxgg_index.jsp",
    "http://www.sdjxxrmyy.com/list/?3_1.html",
    "http://v8033290.72119.30la.com.cn/Article/ShowClass.asp?ClassID=27&page=1",
    "http://www.hljzy.org.cn/ten?page",
    "http://www.sgsyy.com/notice/",
    "http://www.hnsyzxyy.com/news/zhaobiao/",
    "http://scjgj.qinghai.gov.cn/channel/gg/index.htm",
    "http://www.hljnkzyy.org.cn/index.php?p=news_list&lanmu=12",
    "http://www.xjxrmyy.net/Home/InformationNote1",
    "http://www.taszlyy.com/findTypeByTypeIdIndex.do?typeid=444",
    "https://www.szcp.com/Roam/Announce/List_17.html",
    "http://www.lnmu3h.com/plus/list.php?tid=340",
    "http://27.24.159.155/gz/list.jsp?a6t=5&a6p=1&a6c=10&urltype=tree.TreeTempUrl&wbtreeid=1019",
    "http://www.cqyfyl.cn/list/535",
    "http://www.pzhzxy66.com/article/lists/category/zbgg.html",
    "http://www.cha.org.cn/plus/list.php?tid=68",
    "http://news.hict.org.cn/Html/xiaonagonggao/",
    "http://www.cinda.com.cn/xdjt/xdjtpd/syhzjh/list.shtml",
    "http://jr.chengdu.gov.cn/jinrongban/c139013/list.shtml",
    "http://cocenter.casicloud.com/xcl/searchRelease.ht",
    "http://www.hbfxly.org/fenlei/?idh=443",
    "http://jyj.changsha.gov.cn/zwgk/czxx/",
    "http://sfj.bozhou.gov.cn/content/channel/59278c4aceab064621611981/",
    "http://yyyzyy.cn/xin-wen-zhong-xin/xin-xi-gong-gao",
    "http://www.motmti.cn/tzgg/index.jhtml",
    "http://www.yhwgyxx.cn/html/yhwx/list.html?id=%E9%80%9A%E7%9F%A5%E5%85%AC%E5%91%8A&pageNo=1&pageSize=10",
    "http://hunan.chinatax.gov.cn/sy/lists/20190719078813",
    "http://scjg.sjz.gov.cn/col/1490159811930/",
    "http://www.youyang.gov.cn/html/xxgk/gcjslyxxgk/xmjszb/",
    "http://hunan.chinatax.gov.cn/yi/county/20190719078353/lists/20190719078446",
    "http://www.bestzx.net/index.php/welcome/article/4/176",
    "http://ezszy.hbfy.gov.cn/DocManage/getDocsByFolder?folderNo=0502",
    "http://shbj.leshan.gov.cn/SiteHuanbaoju/List.aspx?acID=7",
    "http://www.zjsjtysj.gov.cn/Class.asp?ID=99&page=1",
    "http://www.gsbtn96333.com.cn/news-41-1.html",
    "http://www.wusheng.gov.cn/wsxrmzf/c100460/list.shtml",
    "http://tzwjjq.zjtz.gov.cn/col/col25734/index.html",
    "http://www.bestzx.net/index.php/welcome/article/4/176",
    "http://czj.changde.gov.cn/col/col6503/index.html",
    "http://sthjj.changde.gov.cn/col/col7389/index.html",
    "http://fgj.changde.gov.cn/col/col27394/index.html",
    "http://cdjdw.changde.gov.cn/col/col15666/index.html",
    "http://cdgxq.changde.gov.cn/col/col32114/index.html",
    "http://swj.jingzhou.gov.cn/z/zhengwugongkai/tongzhigonggao/",
    "http://www.lhgtj.gov.cn/article.asp?ClassID=33",
    "http://slj.qz.gov.cn/col/col1597457/",
    "http://ezszy.hbfy.gov.cn/DocManage/getDocsByFolder?folderNo=0502",
    "http://jtj.jiujiang.gov.cn/colB/colB6/",
    "http://slj.yq.gov.cn/12693/",
    "http://tyj.jiaxing.gov.cn/col/col1591242/",
    "http://rfb.jiaxing.gov.cn/col/col1537251/",
    "http://www.jxsgxs.cn/zwgk.asp?lmid=11",
    "http://gjj.jiaxing.gov.cn/col/col1629879/",
    "http://www.qz.gov.cn/col/col1525311/",
    "http://www.gsbtn96333.com.cn/news-41-1.html",
    "http://ec.gslq.com/portal/list.do?chnlcode=result",
    "http://ct.yichun.gov.cn/index.php?s=news&c=category&id=12",
    "http://yjglj.beijing.gov.cn/col/col573/index.html#!uid=8268&pageNum=1",
    "http://jn.km.gov.cn/jrjnol/gsggol/",
    "http://www.hanshou.gov.cn/zwgk/tzgg/",
    "http://www.gzdf.gov.cn/14166/14258/14268/index.shtml",
    "http://www.jxjaxzf.gov.cn/Category_318/Index.aspx",
    "http://www.zhuji.gov.cn/col/col1450965/index.html",
    "http://www.zhuji.gov.cn/col/col1453321/index.html",
    "http://www.gdd.gov.cn/hp/zfcg/list.shtml",
    "http://www.ccx.gov.cn/syscolumn/ztzl/ztbzl/",
    "http://www.sjzlq.gov.cn/syscolumn/36/82/index_1.html",
    "http://www.longquan.gov.cn/xxgk/bm/758064163/03/zfcg/",
    "http://www.shengsi.gov.cn/col/col1354811/",
    "http://www.shengsi.gov.cn/col/col1354812/",
    "http://www.zj-xd.cn/news/class/?106.html",
    "http://www.gdveren.com/index.php?controller=List&action=index&id=50&uid=49",
    "http://dct.jiangxi.gov.cn/col/col14522/index.html",
    "http://www.jxxgj.gov.cn/xxgk_1/ztbxx/",
    "http://www.fjrtvu.edu.cn/xxgk1/zbcg.htm",
    "http://www.nj13zhs.cn/zhxyjz/articlelist.aspx?id=14&page=1",
    "http://rmyy.maoming.gov.cn/index-24.html",
    "http://nfzxy.com/yydt/zbxm/",
    "http://www.sqswdx.cn/Category_36/Index.aspx",
    "http://zcc.lcu.edu.cn/zbcg/index.htm",
    "http://zcc.lcu.edu.cn/zccz/index.htm",
    "http://wyglzx.lcu.edu.cn/dtgg/index.htm",
    "http://www.jxhg510.com/jxadmin/news_more.asp?page=1&word=&lm=&lm2=98&lmname=&open=_blank&n=&hot=0&tj=0",
    "http://www.hbsyxx.cn/Item/list.asp?id=1563&page=1",
    "http://cyxxgk.jmu.edu.cn/gkml/cggg.htm",
]
# The literal above was pasted together from several sources and repeats many
# entries.  Collapse repeats while keeping first-occurrence order so that
# list_url[0] and the relative order of the remaining entries are unchanged.
list_url = list(dict.fromkeys(list_url))

# Counters that are only touched by the commented-out batch snippet below:
# _sum counts requests sent, _count counts replies whose "flag" field is truthy.
_sum = 0
_count = 0
- # with codecs.open("errorLink.txt","r",encoding="utf8") as f:
- # while(True):
- # line = f.readline().strip()
- # if not line:
- # break
- #
- # a = time.time()
- # # user = {"listpage_url":list_url[0]}
- # user = {"listpage_url":"http://www.gsbtn96333.com.cn/news-41-1.html"}
- # #_resp = requests.post("http://192.168.2.52:15015/content_extract", json=user, verify=True)
- # _resp = requests.post("http://127.0.0.1:15015/content_extract", json=user, verify=True)
- # resp_json = _resp.content.decode("utf-8")
- # _resp = json.loads(resp_json)
- # print(resp_json)
- # _sum += 1
- # if "flag" in _resp and _resp["flag"]:
- # _count += 1
- # print("take:",time.time()-a,json.dumps(_resp,sort_keys=True,indent=4,ensure_ascii=False))
- # print(_count,_sum)
def get_rs(url, timeout=None, endpoint="http://192.168.2.102:15015/content_extract"):
    """Send a list-page URL to the content-extraction service and return the raw reply.

    Args:
        url: List-page URL; posted as the ``listpage_url`` field of a JSON body.
        timeout: Seconds forwarded to ``requests.post``.  The default ``None``
            keeps the original behaviour of waiting indefinitely; pass a number
            to make an unresponsive service raise instead of hanging.
        endpoint: URL of the extraction service.  Defaults to the address that
            was previously hard-coded.  The original inline note also listed
            ``127.0.0.1`` and ``177`` as alternates (presumably other
            deployments of the same service -- TODO confirm).

    Returns:
        The response body decoded as UTF-8 text.  It is not parsed here.

    Raises:
        requests.RequestException: on connection failure or timeout.
        UnicodeDecodeError: if the response body is not valid UTF-8.
    """
    payload = {"listpage_url": url}
    response = requests.post(endpoint, json=payload, verify=True, timeout=timeout)
    return response.content.decode("utf-8")
- # _resp = json.loads(resp_json)
- # print(resp_json)
- # print(_resp)
- # url = 'http://www.clrmyy.com/Newslist/NewsList.aspx?code=ZPXX'
- # url = 'http://ec.chongchi.com.cn:8080/Ec468Web/ysxjcggg.jsp' # 列表页太长 js 溢出 #已设置超时
- # url = 'https://tyj.huangshan.gov.cn/content/column/6794951?pageIndex=1'
- # url = 'http://www.yangdong.gov.cn/xwzx/gggs/index.html' # 获取详情页报错
- # url = 'https://www.guit.edu.cn/xwzx/tzgg.htm ' # 日志报错
- # rs = get_rs(url)
- # print(rs)
- # url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=b8273cd5944b41c1b6f5aeb88194340f&bmcode=KA024&showlmmc=1&showbm=0&currentPage=2' # 翻页提取失败
- # url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # 所有要素提取失败, 重跑正常
- # url = 'http://www.gztaijiang.gov.cn/zwgk/zdlygk/zfcg/zbgg/index.html' # 所有要素提取失败, 重跑正常
- # url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # 所有要素提取失败, bug 已修复
- # url = 'http://www.yangzhou.gov.cn/cnyzfront/newxxgk/bmfdgklist.jsp?zjclassid=aedecc7ea4cb4fbdb34df0d57db50c62&bmcode=11321000014407012K' # 所有要素提取失败, 重跑正常
- # url = 'http://www.chengan.gov.cn/main/newsMore.action?subjectid=9052&pagenum=1' # 所有要素提取失败, bug 已修复
- # url = 'http://hsxzwgk.mas.gov.cn/opennessTarget/?branch_id=57a3df762c262ea9a00aadae&column_code=280200' #主页提取失败 #网页打不开# 404
- # url = 'http://www.crra.org.cn/news/tongzhi/o1' # 执行js完毕 getRule_A_Date done 后卡住 已修复
- # url = 'http://www.ptstjxx.org.cn/pttsjyxx_lists-16.html' # 翻页超时错误 已修复 提取正常
- # url = 'https://www.neias.cn/news_list.jsp?id=10775' # 报 201 浏览器打开正常 重新提取 #翻页链接不匹配##下一页规则未获取#
- # url = 'https://www.gzmedri.com/xwdt/list_14_page_1.html' # 报 201 浏览器打开很慢,有时正常
- # # url = 'http://www.wjqwhg.cn/Article?pageIndex=1' #列表页规则未获取# 网页打开报错 504
- #
- # # url = 'http://gxs.yun.liuzhou.gov.cn/xwzx/tzgg/index.shtml' # 所有要素提取失败, bug 已修复 列表页xpath预测错误
- # # url = 'http://sz.nxeduyun.com/index.php?r=space/school/portal/content/index&sid=6ce9765e85694be7838c7f7272199346&cid=50160' #列表页获取失败 已修复
- # # url = 'https://www.nbzjy.cn/list-gsgg.html' # #列表页规则未获取# 已解决
- # # url = 'http://www.gdhealth.net.cn/index.php?m=content&c=index&a=lists&catid=38' # # #列表页规则未获取# chrome浏览器打开异常 换另一个浏览器正常
- # # url = 'http://www.kbs.gov.cn/ywdt/tzgg/index.html' #列表页规则未获取# iframe报错 已处理
- # # url = 'http://www.xs9z.com/News.asp?PageNo=1&classid=17' #包含iframe 报错 已处理
- # # url = 'http://www.tdxbmj.cn/html/qyxw1/index.html' #列表页规则未获取# 已优化处理,详情页时间没日期报错,标签id重复导致只提取到一个链接
- # # url = 'http://www.sxsltlyy.com/newslist.php?cid=29' # 列表页获取失败,详情页xpath错误 浏览器打开界面与selenium 的不一样 ua问题已修复
- # # url = 'http://view.landtz.com:8092/jj/index' # #列表页规则未获取# 拍卖多个图标纵向列表 content_xpath of listpage is //*[@class="wp"]/div[2]/div[1]/a[1]/div[2] 预测错误
- # # url = 'http://www.hbbidcloud.cn/suizhou/jyxx/004003/004003006/about.html' # #翻页链接不匹配##下一页规则未获取# 网页本身无翻页机制
- # # url = 'http://www.cqcjda.com/ShowList.aspx?pkey=3&p=3' #翻页链接不匹配##下一页规则未获取##详情页列表页区分长度未识别#
- # # url = 'https://www.sxeec.com/gpgg/p4.html' ##翻页链接不匹配##下一页规则未获取# 下一页在标签<i>,链接在父节点<a>标签
- # # url = 'http://sthjj.liaoyuan.gov.cn/xxgk/tzgg/' #翻页链接不匹配 第二页开始规律 翻页超时导致拿不到翻页规则 无头模式打开网页超时, 正常模式不超时
- # # url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/index_3.html' #翻页链接不匹配
- # # url = 'http://bj.sxggzyjy.cn/jydt/001001/001001004/001001004001/subPage.html' #翻页链接不匹配##下一页规则未获取#
- # # url = 'http://www.tlgljs.com/cpzs.html'
- # # url = 'http://zrzyj.jlbc.gov.cn/xxgk/tzgg/'
- # # url = 'http://www.zqcyl.cn/zlzx/ggl/' #抛出异常导致返回结果失败,
- # # url = 'http://www.cqcjda.com/ShowList.aspx?pkey=3'
- # # url = 'http://www.cqcjda.com/ShowList.aspx?pkey=3&p=1'
- # # url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
- # # url = 'http://www.sxeec.com/gpgg.html'
- # url = 'http://zrzyj.jlbc.gov.cn/xxgk/tzgg/'
- # url = 'http://bbkx.bb.ah.cn/kxxw/tzgg/index.html'
- # url = 'http://www.lzwhg.com/tongzhigonggao/'
- # url = 'http://www.slwr.gov.cn/zfxxgk/gkml/216/240/257/list_640.htm' # 列表页脚本异常
- # url = 'http://view.landtz.com:8091/xh/index?resourceStatus=0&useType=&orderBy=0&title='
- # url = 'http://ggzy.yueqing.gov.cn/yqwebnew/jyxx/001009/001009010/'
- # url = 'http://ggzy.xjbt.gov.cn/TPFront/bt5/083003/083003002/083003002006/'
- # url = 'http://www.longmen.gov.cn/xzfbm/xcl/zwgk/bmwj/tzgg/index.html'
- # url = 'http://nyncj.yq.gov.cn/tzgg/'
# Manual single-URL run: send one list page to the extraction service, then
# print the raw reply and the elapsed wall-clock time.
#
# Targets tried earlier.  The first three used to be live assignments that were
# overwritten before use; they are kept here as comments for reference.
# url = 'http://www.yrcc.gov.cn/zwzc/gzgb/gg/index.html'
# url = 'http://www.hzsq.gov.cn/index.php?r=article/Category/index&class_id=61'
# url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
# url = 'http://www.lzwhg.com/tongzhigonggao/'  # pagination failed
# url = 'http://jmxhfy.chinacourt.gov.cn/article/index/id/M8xNNjBINzAwNiACAAA.shtml'  # JS error in getpath
# url = 'http://www.heshanshi.gov.cn/xxgk/zdlyxxgk/zdjsxmpzhss/ztbgcgk/'  # only two pages, pagination failed; since optimized

# Active target: only two pages, and the paging pattern is irregular.
url = 'http://www.hustp.com/index.php?s=/Index/noticeInfoList/type_id/11.html'

# url = 'http://www.scncggzy.com.cn/TPFront/front_zfcg/071009/'
# url = 'http://www.stjs.org.cn/zbtb/zbtb_zhongbiaogg.aspx?page=1'
# url = 'http://www.ccgp-hebei.gov.cn/province/cggg/dyly/'
# url = 'http://www.ccgp-hebei.gov.cn/province/cggg/dyly/'
# url = 'http://zyjy.huizhou.gov.cn/ggfw/jyxx/gycqjy/gpjggg/'
# url = 'http://www.gztpc.com/category/bidding.html?id=230'  # attachments lost; to be optimized
# url = 'https://www.nbzjy.cn/list-gsgg.html'  # "list-page rule not obtained" -- resolved

t1 = time.time()
rs = get_rs(url)
t2 = time.time()
print(rs)
print("耗时:", t2-t1)
- import pandas as pd
- import time
# Result accumulator.  Its only visible users are in the commented-out batch
# loop further down, which appends one entry per processed DataFrame row and
# then stores the list as a new column.
- l = []
def get_url_root(text):
    """Return the site-root prefix of the first URL found in *text*.

    The pattern matches ``http:`` or ``https:`` followed by a run of lowercase
    letters, digits, ``-``, ``.`` and ``/``, ending at one of the listed
    top-level-domain suffixes.  A port (``:8010``) or query string is never
    part of the result because ``:`` and ``?`` are outside the character class.

    NOTE(review): the run is greedy and includes ``/``, so the match ends at the
    *last* ``.<tld>`` inside it.  A path segment such as ``/b.cn/`` therefore
    extends the returned "root" into the path.

    Args:
        text: String to search, e.g. a URL or a JSON blob containing one.

    Returns:
        The matched prefix, or ``''`` when nothing matches or *text* is not a
        ``str`` (for example ``None`` or a pandas NaN cell).
    """
    # Cells read from CSV/Excel can be None/NaN; re.search would raise TypeError.
    if not isinstance(text, str):
        return ''
    # Raw string: the original non-raw '\.' is an invalid string escape.
    found = re.search(r'https?:[a-z0-9-./]+\.(cn|com|org|net|gov|edu|biz|cc|mil|top|pub|info)', text)
    return found.group(0) if found else ''
def get_url(text):
    """Extract the ``ruleLink`` value from a JSON-encoded crawler-link record.

    Args:
        text: JSON text expected to decode to an object (dict).

    Returns:
        The value stored under ``'ruleLink'``, or ``''`` when the key is
        absent.  If *text* is not valid JSON, is not a string-like value, or
        decodes to something without ``.get`` (list, number, null), a
        diagnostic line is printed and ``''`` is returned.
    """
    try:
        return json.loads(text).get('ruleLink', '')
    # TypeError: text is None/NaN/other non-string.
    # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
    # AttributeError: decoded value is not a dict.
    # A bare ``except`` would also swallow KeyboardInterrupt/SystemExit.
    except (TypeError, ValueError, AttributeError):
        print('CRAWLER_LINK json loads 出错:', text)
        return ''
- # df = pd.read_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8.csv')[:]
- # df = pd.read_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict100-200.csv')[:]
- # df = pd.read_excel('E:\crawl_data/新建 XLS 工作表.xls')
- # df = pd.read_excel('E:\crawl_data/20220526new_url_2test.xlsx')
- # df.drop_duplicates(subset=['首页网址'], inplace=True)
- #
- # df['url_root'] = df['CRAWLER_LINK '].apply(lambda x:get_url_root(x))
- # df['url'] = df['CRAWLER_LINK '].apply(lambda x:get_url(x))
- # df = df[df['url']!=""]
- # print(len(df))
- # df.drop_duplicates(subset=['url_root'], inplace=True)
- # print(len(df))
- # df.drop_duplicates(subset=['DETAIL_CONTENT_NODE'], inplace=True)
- # # # df = df[100:200]
- # df.reset_index(drop=True, inplace=True)
- # print(len(df), df.columns)
- # t0 = time.time()
- # for i in df.index:
- # # if '#列表页规则未获取#' not in df.loc[i, 'rs']:
- # # continue
- # t1 = time.time()
- # # url = df.loc[i, 'url']
- # url = df.loc[i, '列表页链接']
- # if not re.match('http', url):
- # l.append('')
- # print(url)
- # continue
- # print(url)
- # rs = get_rs(url)
- # # try:
- # # url = json.loads(df.loc[i, 'CRAWLER_LINK ']).get('ruleLink', '')
- # # print(url)
- # # rs = get_rs(url)
- # # except:
- # # rs = json.dumps({'err_msg': 'json loads link error'})
- # print('耗时:', time.time()-t1)
- # print(rs)
- # l.append(rs)
- # df['rs3'] = pd.Series(l)
- # print('完成,总耗时:', time.time()-t0)
- # # # df.to_csv('E:/crawl_data/crawler.BXKC_CRAWLING_RULES_INFO_utf8_predict后1000-900.csv', encoding='utf-8')
- # # df.to_excel('E:/crawl_data/20220526new_url_0531.xlsx', encoding='utf-8')
- # print('写入完成,总耗时:', time.time()-t0)
- # # #
|