'''
Created on 2019-08-19
@author: User
'''
- import module.listpage.extractor as ext_listpage
- import module.detail.extractor as ext_detail
- from module.Utils import mergeDict,log,add_err_msg
- import module.htmlDrawing as hd
def ruleExtract(listpage_url):
    '''
    @summary: Starting from a list-page URL, extract the list-page rules
              (per the original author: <a> links, date and pagination) and
              the detail-page rules (title, date, content), then merge them
              into a single result.

    @param listpage_url: URL of the list page; it is concatenated to a str
                         for logging, so it is expected to be a str.
    @return: the object produced by mergeDict(...) with a "status_code" entry
             set to "404" (page status was 404) or "201" (all other handled
             paths). If any exception is raised along the way, it is logged
             and whatever result had been built so far is returned; before
             any merge happens that is {"flag": False, "status_code": 201}.
             NOTE(review): the fallback uses the int 201 while every other
             path uses a string -- confirm which form callers expect.
    '''
    # Fallback payload handed back when an exception interrupts the pipeline
    # before a merged result exists.
    outcome = {"flag": False, "status_code": 201}
    try:
        http_status = hd.getStatus(listpage_url)
        log("listpage_url:" + listpage_url)

        # Guard clause: the page cannot be opened, so no rules can be derived.
        if http_status == 404:
            outcome = mergeDict([None, None])
            outcome["status_code"] = "404"
            add_err_msg(outcome, "#网页打不开#")
            return outcome

        print('准备取列表页 ')
        listpage_data = ext_listpage.getRule_listpage(listpage_url)
        print('完成列表页处理')

        if listpage_data is None:
            # No list-page rule was obtained: merge two empty rules and
            # attach an error message instead.
            log("data_listpage is None")
            outcome = mergeDict([None, None])
            add_err_msg(outcome, "#列表页规则未获取#")
        else:
            # The list-page extractor yields its rule together with the
            # hrefs that are fed to the detail-page extractor.
            listpage_rule, detail_links = listpage_data
            print('准备处理详情页')
            detail_rule = ext_detail.getRule_detail(detail_links)
            print('详情页处理完毕')
            outcome = mergeDict([listpage_rule, detail_rule])

        # Both non-404 outcomes are reported with the same status code.
        outcome["status_code"] = "201"
    except Exception as err:
        # Best effort: log the failure and fall through to return whatever
        # result exists at this point.
        log(str(err))
    return outcome
-
if __name__ == "__main__":
    # Manual smoke test: a pool of sample list-page URLs. Only the
    # second-to-last entry is actually processed below; the rest are kept
    # (duplicates included) so a different one can be picked by index.
    sample_urls = [
        "http://www.glls.gov.cn/zwgk/zdly/zdxm/zbtb/",
        "http://www.hbdsyy.com/info.php?class_id=108101",
        "http://1496y4r296.iok.la/portal/list/index/id/10.html",
        "http://xgxz.xiaogan.gov.cn/zbgg/index.jhtml",
        "http://www.2823333.com/channels/270.html",
        "http://wedz.changsha.gov.cn/xxgk/cgzb/",
        "https://bd.ispacechina.com/exp/bidding/sell/signup/index.do?typ=1",
        "http://www.zycqjy.com/wcm/zycqjy/html/gycq/index.html",
        "http://www.120cq.com.cn/tender_job_tender/",
        "http://www.cqcfe.com/type/37020502.html",
        "http://www.zg.gov.cn/web/sgzw/dep_tzgg?tdsourcetag=s_pctim_aiomsg",
        "http://dsjw.guiyang.gov.cn/c9576/index.html",
        "http://gaj.guiyang.gov.cn/jszx/tzgg/jtgll/",
        "http://jw.guiyang.gov.cn/c8346/",
        "http://jyj.guiyang.gov.cn/pd_jydt/node_2948.htm?current=2951",
        "http://kjj.guiyang.gov.cn/a/xxgk/tzgg/list_25_1.html",
        "http://rfb.guiyang.gov.cn/c6686/index.html",
        "http://gzw.guiyang.gov.cn/c7905/",
        "http://www.gyzyyfy.com/list-51-1.html",
        "http://www.gzspm.com/new.asp?anclassid=4",
        "http://www.fdfzjt.com/announcement/2?tdsourcetag=s_pctim_aiomsg",
        "http://fgw.gzlps.gov.cn/gzdt_42194/tzgg_42196/index.html",
        "http://rfb.gzlps.gov.cn/gzdt/tzgg/index.html",
        "http://www.yn-mj.cn/list/ynmjPC/1/38/auto/20/0.html",
        "http://kmyc.yn-tobacco.com/zwgk/gsgg/zbxx/",
        "http://www.ljgucheng.gov.cn/zwgk_14144/zdlygk/ggzyjyxx/",
        "http://ljyc.yn-tobacco.com/zwgk/gsgg/zbxx/",
        "http://www.kmctgs.com/a/tongzhigonggao/",
        "http://zzgs.yn-tobacco.com/zwgk/gggs/",
        "http://www.yn.csg.cn/news.asp?page=1&dispnum=15&label=%D5%D0%CD%B6%B1%EA%D0%C5%CF%A2&sname=&structure=&startdate=&enddate=",
        "http://www.ynbit.com/index.php?c=category&id=44",
        "http://www.ynshrq.com/a/chengpinyingcai/zhaotoubiao/",
        "http://www.suijiang.gov.cn/subsiteIndex/toPage?subsiteFlag=suijiangpc&subsiteId=1&newsClassId=175&pageType=auto&pageSize=20&start=0&objectId=",
        "http://www.ztkt.net/html/tongzhigonggao/index.html",
        "http://www.jiangkou.gov.cn/xxgk/xxgkml/zdlygk/ggzypz/czzfbzajgc/index.html",
        "http://www.shiqian.gov.cn/zwgk/xxgkml/zdlygk/tzgg/list.html",
        "http://www.shiqian.gov.cn/zwgk/xxgkml/zdlygk/tzgg/list.html",
        "http://www.trs.gov.cn/xxgk/zdlygk/zfcg/zbgg_59709/index.html",
        "http://www.yanhe.gov.cn/zwgk/xxgkml/zdlygk/zfcg/cgzb/list.html",
        "http://www.trws.gov.cn/zwgk/xxgkml/zdlygk/ggzypz/zbgg/list_1.html?tdsourcetag=s_pctim_aiomsg",
        "http://jr.guizhou.gov.cn/tzgg/index.html",
        "http://www.yanhe.gov.cn/xwzx/tzgg/index.html",
        "http://czt.guizhou.gov.cn/xwzx/tzgg/index.html",
        "http://jgsw.guizhou.gov.cn/xwzx/tzgg/index.html",
        "http://rfb.guizhou.gov.cn/xwzx/tzgg/index.html",
        "http://www.suiyang.gov.cn/xxgk/zdxxgk/zfcg/index.html",
        "http://daj.zunyi.gov.cn/gzdt/tzgg/",
        "http://www.zysjwssc.com/emall/zunyi/jingjiaInfo.aspx",
        "http://zyepb.zunyi.gov.cn/news34/news_more.asp?page=1&word=&lm=&lm2=340&lmname=&open=&n=&hot=&tj=",
        "http://www.xinpu.gov.cn/xwzx/tzgg/index.html",
        "http://www.gzza.gov.cn/xwzx/tzgg/index.html",
        "http://www.lqzyy.com/new1.html",
        "http://www.cd7yy.com/news/news-18p1.html",
        "http://mpnr.chengdu.gov.cn/second/zpgjg.aspx?ClassID=001002002006001",
        "http://www.cfyy.net/tender_sub/",
        "http://www.hxdental.cn/news/bid/",
        "http://www.sclib.org/list.htm?m=1521488440060363&c=1521488440060375&type=1",
        "http://sctcm.sc.gov.cn/get/class/scszyyglj/gggs/index.html",
        "http://gzw.deyang.gov.cn/list.asp?id=1&smallid=248&bname=综合信息&Sname=通知通告",
        "http://www.gycjtz.com/subject.asp?typeid=28&mode=2&page=1",
        "http://www.lsfybjy.com/list/34.html",
        "http://www.gzsmzmuseum.cn/list-7.html",
        "http://jw.guiyang.gov.cn/c8346/index.html",
        "http://sfj.guiyang.gov.cn/zwpd/list_infor_dtxx.aspx?tid=7&lx=2&pg=1",
        "http://ggzyjy.yanbian.gov.cn/jyxx/005002/005002004/aboutsubgc.html",
        "http://whlyj.sh.gov.cn/node2/n2029/n2031/n2085/n2087/index.html",
        "http://ggzyjy.yanbian.gov.cn/jyxx/005005/005005004/aboutsub.html",
        "http://www.ha.hrss.gov.cn/viewCmsCac.do?cacId=4aef140825e3728f01261be1dc1a01ba&offset=0&",
        "http://www.henanmz.gov.cn/xxgk/xxgkml/gggs/index.html",
        "http://www.jaas.com.cn/index/list_view.php?pn=1&sortid=",
        "http://jlzy.jlsfy.gov.cn/zrbxx/index.jhtml",
        "https://www.ytetc.edu.cn/info/190-1.html",
        "http://www.sccn.gov.cn/zwgk/zwdt/tzgg/",
        "http://xa3yuan.com/3/10/list.aspx",
        "http://sfj.bozhou.gov.cn/content/channel/59278c4aceab064621611981/",
        "http://nyncj.bozhou.gov.cn/content/channel/5928d632ceab066c7361197f/",
        "http://wlt.bozhou.gov.cn/zw/html/type/list-0104-1.html",
        "http://jyecc.net/zxnews/gonggao/",
        "http://www.baotou.gov.cn/01xxgk/xxgk_list.jsp?urltype=egovinfo.EgovInfoList&wbtreeid=1001&sccode=zfcg&subtype=1&gilevel=1",
        "http://www.zlj.gov.cn/web/zlj/gzgg",
        "http://www.zgda.gov.cn/web/daq/gzgg",
        "http://rsj.gz.gov.cn/hrssgz/zwdt_tzgg/list.shtml",
        "http://www.gzeec.org/SortHtml/1/List_14.html",
        "http://www.zhsi.gov.cn/zhsi_web/zhengwu/gggs/subpage_list.jsp",
        "http://www.gaozhou.gov.cn/gaozhou/jrgz/TextInfo.aspx?id=28",
        "http://www.huazhou.gov.cn/site/goverment_list?groups_id=19",
        "http://wenhua.huizhou.gov.cn/pages/cms/hzwhj/html/artList.html?sn=hzwhj&cataId=b2ce7aafeb514d1b92daf29fc1e4fb28&pageNo=1",
        "http://hzzj.huizhou.gov.cn/pages/cms/hzzljdj/html/artList.html?cataId=03c59d0bb9df4bd4b4c2756a10b9db7d",
        "http://www.jiaoling.gov.cn/html/gsgg/index.html",
        "http://www.yfyunchengqu.gov.cn/menhuwangzhan/jcxxgk/zfcg",
        "http://www.luoding.gov.cn/menhuwangzhan/zwgk/tzgg",
        "http://wwj.beijing.gov.cn/bjww/362690/362730/zbgg19/index.html",
        "http://www.cy-edu.net/zwgk/xmxx",
        "http://www.hbsti.ac.cn/html/18050/page.html",
        "http://www.fjptfda.gov.cn/zwgk/gsgg/qt/index.shtml",
        "http://www.hnly.gov.cn/sitesources/hnslyt/page_pc/xxgk/zfxxgkml/gggs/list1.html",
        "http://www.gsgrain.com/businessCenter.aspx?mid=440",
        "http://www.cereal.com.cn/channel/NEWS_CATEGORY/01",
        "http://www.sxgrain.com.cn/html/jyzx/jyjg/",
        "http://www.piduqu120.com/article.php?act=list&catid=99",
        "http://www.xiangya5.com/Column.aspx?ColId=15",
        "http://www.jshbank.com/jsyh/cgzx/index.html",
        "http://ggzyjy.yanbian.gov.cn/jyxx/005005/005005002/aboutsub.html",
        "http://www.xydwrmyy.com/Item/list.asp?id=1664",
        "http://www.cjbjedu.com/Channel.aspx?pddm=0012",
        "http://qz.1203.org/1069/index.jhtml",
        "http://www.hnxyzx.org/news_list/&newsCategoryId=5.html",
        "http://zcc.lcu.edu.cn/zbcg/index.htm",
        "http://wyglzx.lcu.edu.cn/dtgg/index.htm",
        "http://www.ntzlyy.cn/news/13/",
        "http://61.134.48.218:8077/category_22.html?page=1",
        "http://www.hnzfzx.com/ic/TongZhiGongGao.html",
        "http://www.sdjxxrmyy.com/list/?3_1.html",
        "http://www.ysyiyuan.cn/index.aspx?lanmuid=100&sublanmuid=831",
        "http://www.jzmu1h.com/newcent/index.php?c=content&a=list&catid=26",
        "http://www.zydzyy.org.cn/newslist.asp?Id=486",
        "http://www.zyxzyy.cn/news.asp?Page=1&myid=75",
        "http://www.wjrcb.com/wjrcb/gdgg/xwgg/index.html?v=1562635453576",
        "http://www.qnzyy.com/c/cggs.html",
        "http://www.laszyy.cn/list/?id=1644",
        "https://www.ahzjyy.com/cn/list_48.aspx",
        "http://www.wzsfy.com/news/54.html",
        "http://www.gyrmyy.com/showclass.asp?id=44",
        "http://www.zjjss.net/xw.html?category_id=80&channel_id=6",
        "http://www.motmti.cn/tzgg/index.jhtml",
        "http://www.gzstvs.com.cn/list.jsp?cItemId=16&itemId=4&page=1",
        "http://www.simc.cn/gsgg/list.htm",
        "http://www.yxtxzs.cn/DisplayList?ClassID=ouPt2u2teTyEZ8YAMpBGd68cweyi7fja3pfaLK6qxRk5qd8z954VQwiuMiqe1LuO&Page=1",
        "http://www.sdsfjy.com/news_list/newsCategoryId=54.html",
        "http://www.peczzu.edu.cn/index/yngg.htm",
        "http://www.hufe.edu.cn/hncywz/listsvl?bmid=207&lmid=287&mbid=7",
        "http://27.24.159.155/cgw/cggg.htm",
        "http://www.ygu.edu.cn/xxgk2.htm",
        "http://www.gdyvc.cn/announce/",
        "http://news.hict.org.cn/Html/xiaonagonggao/",
        "https://www.szcp.com/Roam/Announce/List_17.html",
        "http://www.hebzgfw.cn/gg/index.html",
        "http://rst.sc.gov.cn/zwgk/gsgg/index.html",
        "http://www.aqjjzx.com/info.php?class_id=102103",
        "http://yjj.huaian.gov.cn/xwzx/tzgg/list.html",
        "http://txjkq.changsha.gov.cn/xxgk/zfcg/",
        "http://nyj.yueyang.gov.cn/nyj/7869/7871/default.htm",
        "http://yyjt.yiyang.gov.cn/yyjtj/6270/6279/6288/6303/default.htm",
        "http://www.yiyang.gov.cn/sjkzx/4464/4469/default.htm",
        "http://szrzyj.ziyang.gov.cn/catalog.aspx?id=10",
        "http://yongfu.gxdlr.gov.cn/list.aspx?id=68",
        "http://txjkq.changsha.gov.cn/xxgk/zfcg/",
        "http://szrzyj.ziyang.gov.cn/catalog.aspx?id=10",
        "http://jtys.chengdu.gov.cn/cdjt/c108497/xw_list.shtml",
        "http://fd4zh.30edu.com.cn/Article/46a834fb-c5e2-4805-983d-8162e9887e7e/1.shtml",
        "http://www.fyxzyy.com/tong-zhi-gong-gao.html?page=1",
        "http://www.zhgxqrmyy.com/news/93/",
        "http://www.njfybjy.com/ywgk/ywgk.asp?ClassID=43&PageNo=1",
        "http://www.jahg.com.cn/Info-news_list-cat_id-43.html",
        "http://www.bzsba.com/info.asp?second_id=2001",
        "http://www.ymgfgs.cn/index.php/news/admin/1/cn/104/104.html",
        "http://www.bankcomm.com/BankCommSite/shtml/jyjr/cn/7804/2600473/2600510/list_1.shtml?channelId=7804",
        "http://www.cinda.com.cn/xdjt/xdjtpd/syhzjh/list.shtml",
        "http://gaj.yanan.gov.cn/info/iList.jsp?cat_id=10194",
        "http://www.yanhe.gov.cn/xwzx/tzgg/index.html",
        "http://www.yanhe.gov.cn/zwgk/xxgkml/zdlygk/zfcg/cgzb/list.html",
        "http://rfb.guizhou.gov.cn/xwzx/tzgg/index.html",
        "http://www.zjxzyyy.cn/plus/list.php?tid=45",
        "http://an.km.gov.cn/xxgk/zdlyxxgkzl/ggzyjyxxgk/",
        "http://www.xazls.com//gsgg/index.htm",
        "http://www.wenwu.gov.cn/overt?classCode=zwgk_tzgg_zbcg",
        "http://www.zyhos.com/news/index.asp?D_CataID=I0003&pageno=1",
        "http://www.ks2ndhospital.com/node/82.jspx",
        "http://www.xjzj.gov.cn/info/iList.jsp?cat_id=11037",
        "http://www.aspd.gov.cn/zwgk/zdlygk/ggzypz/tdzy/index.html",
        "http://www.dalong.gov.cn/html/zhengwugongkai/tongzhigonggao/index.html",
        "http://www.dp.gov.cn/dpxw/zwgz/gsgg.htm",
        "http://nync.guiyang.gov.cn/c8121/",
        "http://www.wenwu.gov.cn/overt?classCode=zwgk_tzgg_zbcg",
        "http://www.sxfu.cn/list/312/314/p/1.html",
        "http://ztbgl.yangtzeu.edu.cn/hwcg.htm?tdsourcetag=s_pctim_aiomsg",
        "http://www.cstsjy.cn/tongzhigonggao",
        "http://lxxxgk.bozhou.gov.cn/opennessTarget/?branch_id=53fe9198cbb812e0d509f771&column_code=170302&page=1",
        "http://dtxzwgk.mas.gov.cn/opennessContent/?branch_id=57a3df762c262ea9a00aadf4",
        "http://lxxxgk.bozhou.gov.cn/opennessTarget/?branch_id=53fe9198cbb812e0d509f764&column_code=230200",
        "http://www.hfbh.com.cn/merchants2.asp",
        "http://www.hfbh.com.cn/merchants.asp?page=1",
        "http://www.hfbus.cn/Welcome/Onlinework.aspx?NewsClassID=32&page=1",
        "http://hfxz.hefei.gov.cn/UserData/SortHtml/1/28460823507.html",
        "http://www.hfbus.cn/Welcome/Onlinework.aspx?NewsClassID=38",
        "http://ggzy.weifang.gov.cn/wfggzy/showinfo/moreinfo_gg_zfcg_cgxq.aspx?address=016&categorynum=004002017",
        "http://ggzy.weifang.gov.cn/wfggzy/showinfo/moreinfo_gg_zfcg.aspx?address=016&type=&categorynum=004002011",
        "http://ggzy.weifang.gov.cn/wfggzy/showinfo/moreinfo_gg_zfcg.aspx?address=017&type=&categorynum=004002011",
        "http://ggzy.weifang.gov.cn/wfggzy/showinfo/moreinfo_gg.aspx?address=006&type=&categorynum=004012006",
        "http://ep.btsteel.com/erp/mqm/jsp/mqmj001X.jsp",
        "http://xmzwggzy.xlgl.gov.cn/xmweb/ggzyjy/009002/009002006/009002006004/MoreInfo.aspx?CategoryNum=009002006004",
        "http://jndeggzy.jinan.gov.cn/lwwznew/jyxx/044001/044001001/044001001005/MoreInfo.aspx?CategoryNum=044001001005",
        "http://ggzy.weifang.gov.cn/wfggzy/showinfo/moreinfo_gg.aspx?address=013&type=&categorynum=004012005",
        "http://xmzwggzy.xlgl.gov.cn/xmweb/ggzyjy/009001/009001005/009001005005/MoreInfo.aspx?CategoryNum=009001005005",
        "http://xmzwggzy.xlgl.gov.cn/xmweb/ggzyjy/009001/009001006/009001006001/MoreInfo.aspx?CategoryNum=009001006001",
        "http://ggzy.weifang.gov.cn/wfggzy/showinfo/moreinfo_gg.aspx?address=013&type=&categorynum=004012010",
        "http://ggzy.weifang.gov.cn/wfggzy/showinfo/moreinfo_gg_zfcg_cgxq.aspx?address=001&categorynum=004002017",
        "http://ggzy.weifang.gov.cn/wfggzy/showinfo/moreinfo_gg_ylsb.aspx?address=&categorynum=004006008&Paging=1",
        "http://ggzy.weifang.gov.cn/wfggzy/showinfo/moreinfo_gg_qt.aspx?address=&categorynum=004007004&Paging=1",
        "http://xmzwggzy.xlgl.gov.cn/xmweb/ggzyjy/009004/009004003/009004003001/MoreInfo.aspx?CategoryNum=009004003001",
        "http://www.lygzxjt.com/list.aspx?type=98",
        "http://www.hbei.com.cn/news/mess/",
        "http://ledong.hainan.gov.cn/ledong/0400/right.shtml",
        "http://ggzyjy.yanbian.gov.cn/jyxx/005006/005006002/aboutsub.html",
        "http://xdp.shuanghui.net:8010/webportal/index/bidnotice/list/2.do",
        "http://ggzyjy.yanbian.gov.cn/jyxx/005006/005006002/aboutsub.html",
        "http://ggzyjy.yanbian.gov.cn/jyxx/005006/005006001/aboutsub.html",
        "http://ggzyjy.lsz.gov.cn/TPFront/jyxx/005004/005004001/",
        "http://bid.9to.com/list.php/catid-629/",
        "https://jyj.yuncheng.gov.cn/item/1513_1.shtml",
        "http://www.sjzkq.gov.cn/col/1539754975467/",
        "http://www.byefy.com/cn/News/list_36.aspx",
        "http://www.baotou.gov.cn/01xxgk/xxgk_list.jsp?urltype=egovinfo.EgovInfoList&wbtreeid=1001&sccode=zfcg&subtype=1&gilevel=1",
        "http://www.qyggfw.cn/w/bid/qualiInqueryResult/morePageList?filterparam=%7B%22assortment%22%3A%223%22%2C%22areaCode%22%3A%22621000%22%2C%22workNotice%22%3A%7B%22noticeNature%22%3A%221%22%2C%22bulletinType%22%3A%221%22%7D%7D",
        "http://news.hict.org.cn/Html/xiaonagonggao/",
        "http://zwgk.gz.gov.cn/GZ46/8.1/list1.shtml",
        "https://www.glutnn.cn/article.aspx?classid=7&page=1",
        "http://www.sjzkq.gov.cn/col/1539754975467/",
        "http://swj.jiaxing.gov.cn/col/col1497228/",
        "http://www.tjbhb.com/bhyhww/jzcg/zjgg/index.html",
        "http://www.mcxzyy.com/list.aspx?fcid=91&cid=103",
        "http://www.tcsrmyy.cn/Infor/lists/category/107.html",
        "http://sfj.bozhou.gov.cn/content/channel/59278c4aceab064621611981/",
        "http://yyyzyy.cn/xin-wen-zhong-xin/xin-xi-gong-gao",
    ]

    # Run the extractor on the second-to-last sample and dump every field of
    # the result in sorted key order.
    extracted = ruleExtract(sample_urls[-2])
    for field in sorted(extracted.keys()):
        print(field, extracted[field])
-
-
|