extractFlow.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281
'''
Created on 2019-08-19
@author: User
'''
  5. import module.listpage.extractor as ext_listpage
  6. import module.detail.extractor as ext_detail
  7. from module.Utils import mergeDict,log,add_err_msg
  8. import module.htmlDrawing as hd
  9. def ruleExtract(listpage_url):
  10. '''
  11. @summary: 从列表页链接开始,将列表页<a>、date和翻页以及详情页title、date、content的规则抽取出来
  12. '''
  13. try:
  14. result = {"flag":False,"status_code":201}
  15. status_code = hd.getStatus(listpage_url)
  16. log("listpage_url:"+listpage_url)
  17. if status_code==404:
  18. rule_listpage = None
  19. rule_detail = None
  20. result = mergeDict([rule_listpage,rule_detail])
  21. result["status_code"] = "404"
  22. add_err_msg(result, "#网页打不开#")
  23. return result
  24. print('准备取列表页 ')
  25. data_listpage = ext_listpage.getRule_listpage(listpage_url)
  26. print('完成列表页处理')
  27. # print('data_listpage:', data_listpage)
  28. if data_listpage is None:
  29. log("data_listpage is None")
  30. rule_listpage = None
  31. rule_detail = None
  32. result = mergeDict([rule_listpage,rule_detail])
  33. add_err_msg(result, "#列表页规则未获取#")
  34. result["status_code"] = "201"
  35. else:
  36. rule_listpage,list_hrefs = data_listpage
  37. print('准备处理详情页')
  38. rule_detail = ext_detail.getRule_detail(list_hrefs)
  39. print('详情页处理完毕')
  40. result = mergeDict([rule_listpage,rule_detail])
  41. result["status_code"] = "201"
  42. except Exception as e:
  43. log(str(e))
  44. return result
  45. return result
  46. if __name__=="__main__":
  47. listpage_url = ["http://www.glls.gov.cn/zwgk/zdly/zdxm/zbtb/",
  48. "http://www.hbdsyy.com/info.php?class_id=108101",
  49. "http://1496y4r296.iok.la/portal/list/index/id/10.html",
  50. "http://xgxz.xiaogan.gov.cn/zbgg/index.jhtml",
  51. "http://www.2823333.com/channels/270.html",
  52. "http://wedz.changsha.gov.cn/xxgk/cgzb/",
  53. "https://bd.ispacechina.com/exp/bidding/sell/signup/index.do?typ=1",
  54. "http://www.zycqjy.com/wcm/zycqjy/html/gycq/index.html",
  55. "http://www.120cq.com.cn/tender_job_tender/",
  56. "http://www.cqcfe.com/type/37020502.html",
  57. "http://www.zg.gov.cn/web/sgzw/dep_tzgg?tdsourcetag=s_pctim_aiomsg",
  58. "http://dsjw.guiyang.gov.cn/c9576/index.html",
  59. "http://gaj.guiyang.gov.cn/jszx/tzgg/jtgll/",
  60. "http://jw.guiyang.gov.cn/c8346/",
  61. "http://jyj.guiyang.gov.cn/pd_jydt/node_2948.htm?current=2951",
  62. "http://kjj.guiyang.gov.cn/a/xxgk/tzgg/list_25_1.html",
  63. "http://rfb.guiyang.gov.cn/c6686/index.html",
  64. "http://gzw.guiyang.gov.cn/c7905/",
  65. "http://www.gyzyyfy.com/list-51-1.html",
  66. "http://www.gzspm.com/new.asp?anclassid=4",
  67. "http://www.fdfzjt.com/announcement/2?tdsourcetag=s_pctim_aiomsg",
  68. "http://fgw.gzlps.gov.cn/gzdt_42194/tzgg_42196/index.html",
  69. "http://rfb.gzlps.gov.cn/gzdt/tzgg/index.html",
  70. "http://www.yn-mj.cn/list/ynmjPC/1/38/auto/20/0.html",
  71. "http://kmyc.yn-tobacco.com/zwgk/gsgg/zbxx/",
  72. "http://www.ljgucheng.gov.cn/zwgk_14144/zdlygk/ggzyjyxx/",
  73. "http://ljyc.yn-tobacco.com/zwgk/gsgg/zbxx/",
  74. "http://www.kmctgs.com/a/tongzhigonggao/",
  75. "http://zzgs.yn-tobacco.com/zwgk/gggs/",
  76. "http://www.yn.csg.cn/news.asp?page=1&dispnum=15&label=%D5%D0%CD%B6%B1%EA%D0%C5%CF%A2&sname=&structure=&startdate=&enddate=",
  77. "http://www.ynbit.com/index.php?c=category&id=44",
  78. "http://www.ynshrq.com/a/chengpinyingcai/zhaotoubiao/",
  79. "http://www.suijiang.gov.cn/subsiteIndex/toPage?subsiteFlag=suijiangpc&subsiteId=1&newsClassId=175&pageType=auto&pageSize=20&start=0&objectId=",
  80. "http://www.ztkt.net/html/tongzhigonggao/index.html",
  81. "http://www.jiangkou.gov.cn/xxgk/xxgkml/zdlygk/ggzypz/czzfbzajgc/index.html",
  82. "http://www.shiqian.gov.cn/zwgk/xxgkml/zdlygk/tzgg/list.html",
  83. "http://www.shiqian.gov.cn/zwgk/xxgkml/zdlygk/tzgg/list.html",
  84. "http://www.trs.gov.cn/xxgk/zdlygk/zfcg/zbgg_59709/index.html",
  85. "http://www.yanhe.gov.cn/zwgk/xxgkml/zdlygk/zfcg/cgzb/list.html",
  86. "http://www.trws.gov.cn/zwgk/xxgkml/zdlygk/ggzypz/zbgg/list_1.html?tdsourcetag=s_pctim_aiomsg",
  87. "http://jr.guizhou.gov.cn/tzgg/index.html",
  88. "http://www.yanhe.gov.cn/xwzx/tzgg/index.html",
  89. "http://czt.guizhou.gov.cn/xwzx/tzgg/index.html",
  90. "http://jgsw.guizhou.gov.cn/xwzx/tzgg/index.html",
  91. "http://rfb.guizhou.gov.cn/xwzx/tzgg/index.html",
  92. "http://www.suiyang.gov.cn/xxgk/zdxxgk/zfcg/index.html",
  93. "http://daj.zunyi.gov.cn/gzdt/tzgg/",
  94. "http://www.zysjwssc.com/emall/zunyi/jingjiaInfo.aspx",
  95. "http://zyepb.zunyi.gov.cn/news34/news_more.asp?page=1&word=&lm=&lm2=340&lmname=&open=&n=&hot=&tj=",
  96. "http://www.xinpu.gov.cn/xwzx/tzgg/index.html",
  97. "http://www.gzza.gov.cn/xwzx/tzgg/index.html",
  98. "http://www.lqzyy.com/new1.html",
  99. "http://www.cd7yy.com/news/news-18p1.html",
  100. "http://mpnr.chengdu.gov.cn/second/zpgjg.aspx?ClassID=001002002006001",
  101. "http://www.cfyy.net/tender_sub/",
  102. "http://www.hxdental.cn/news/bid/",
  103. "http://www.sclib.org/list.htm?m=1521488440060363&c=1521488440060375&type=1",
  104. "http://sctcm.sc.gov.cn/get/class/scszyyglj/gggs/index.html",
  105. "http://gzw.deyang.gov.cn/list.asp?id=1&smallid=248&bname=综合信息&Sname=通知通告",
  106. "http://www.gycjtz.com/subject.asp?typeid=28&mode=2&page=1",
  107. "http://www.lsfybjy.com/list/34.html",
  108. "http://www.gzsmzmuseum.cn/list-7.html",
  109. "http://jw.guiyang.gov.cn/c8346/index.html",
  110. "http://sfj.guiyang.gov.cn/zwpd/list_infor_dtxx.aspx?tid=7&lx=2&pg=1",
  111. "http://ggzyjy.yanbian.gov.cn/jyxx/005002/005002004/aboutsubgc.html",
  112. "http://whlyj.sh.gov.cn/node2/n2029/n2031/n2085/n2087/index.html",
  113. "http://ggzyjy.yanbian.gov.cn/jyxx/005005/005005004/aboutsub.html",
  114. "http://www.ha.hrss.gov.cn/viewCmsCac.do?cacId=4aef140825e3728f01261be1dc1a01ba&offset=0&",
  115. "http://www.henanmz.gov.cn/xxgk/xxgkml/gggs/index.html",
  116. "http://www.jaas.com.cn/index/list_view.php?pn=1&sortid=",
  117. "http://jlzy.jlsfy.gov.cn/zrbxx/index.jhtml",
  118. "https://www.ytetc.edu.cn/info/190-1.html",
  119. "http://www.sccn.gov.cn/zwgk/zwdt/tzgg/",
  120. "http://xa3yuan.com/3/10/list.aspx",
  121. "http://sfj.bozhou.gov.cn/content/channel/59278c4aceab064621611981/",
  122. "http://nyncj.bozhou.gov.cn/content/channel/5928d632ceab066c7361197f/",
  123. "http://wlt.bozhou.gov.cn/zw/html/type/list-0104-1.html",
  124. "http://jyecc.net/zxnews/gonggao/",
  125. "http://www.baotou.gov.cn/01xxgk/xxgk_list.jsp?urltype=egovinfo.EgovInfoList&wbtreeid=1001&sccode=zfcg&subtype=1&gilevel=1",
  126. "http://www.zlj.gov.cn/web/zlj/gzgg",
  127. "http://www.zgda.gov.cn/web/daq/gzgg",
  128. "http://rsj.gz.gov.cn/hrssgz/zwdt_tzgg/list.shtml",
  129. "http://www.gzeec.org/SortHtml/1/List_14.html",
  130. "http://www.zhsi.gov.cn/zhsi_web/zhengwu/gggs/subpage_list.jsp",
  131. "http://www.gaozhou.gov.cn/gaozhou/jrgz/TextInfo.aspx?id=28",
  132. "http://www.huazhou.gov.cn/site/goverment_list?groups_id=19",
  133. "http://wenhua.huizhou.gov.cn/pages/cms/hzwhj/html/artList.html?sn=hzwhj&cataId=b2ce7aafeb514d1b92daf29fc1e4fb28&pageNo=1",
  134. "http://hzzj.huizhou.gov.cn/pages/cms/hzzljdj/html/artList.html?cataId=03c59d0bb9df4bd4b4c2756a10b9db7d",
  135. "http://www.jiaoling.gov.cn/html/gsgg/index.html",
  136. "http://www.yfyunchengqu.gov.cn/menhuwangzhan/jcxxgk/zfcg",
  137. "http://www.luoding.gov.cn/menhuwangzhan/zwgk/tzgg",
  138. "http://wwj.beijing.gov.cn/bjww/362690/362730/zbgg19/index.html",
  139. "http://www.cy-edu.net/zwgk/xmxx",
  140. "http://www.hbsti.ac.cn/html/18050/page.html",
  141. "http://www.fjptfda.gov.cn/zwgk/gsgg/qt/index.shtml",
  142. "http://www.hnly.gov.cn/sitesources/hnslyt/page_pc/xxgk/zfxxgkml/gggs/list1.html",
  143. "http://www.gsgrain.com/businessCenter.aspx?mid=440",
  144. "http://www.cereal.com.cn/channel/NEWS_CATEGORY/01",
  145. "http://www.sxgrain.com.cn/html/jyzx/jyjg/",
  146. "http://www.piduqu120.com/article.php?act=list&catid=99",
  147. "http://www.xiangya5.com/Column.aspx?ColId=15",
  148. "http://www.jshbank.com/jsyh/cgzx/index.html",
  149. "http://ggzyjy.yanbian.gov.cn/jyxx/005005/005005002/aboutsub.html",
  150. "http://www.xydwrmyy.com/Item/list.asp?id=1664",
  151. "http://www.cjbjedu.com/Channel.aspx?pddm=0012",
  152. "http://qz.1203.org/1069/index.jhtml",
  153. "http://www.hnxyzx.org/news_list/&newsCategoryId=5.html",
  154. "http://zcc.lcu.edu.cn/zbcg/index.htm",
  155. "http://wyglzx.lcu.edu.cn/dtgg/index.htm",
  156. "http://www.ntzlyy.cn/news/13/",
  157. "http://61.134.48.218:8077/category_22.html?page=1",
  158. "http://www.hnzfzx.com/ic/TongZhiGongGao.html",
  159. "http://www.sdjxxrmyy.com/list/?3_1.html",
  160. "http://www.ysyiyuan.cn/index.aspx?lanmuid=100&sublanmuid=831",
  161. "http://www.jzmu1h.com/newcent/index.php?c=content&a=list&catid=26",
  162. "http://www.zydzyy.org.cn/newslist.asp?Id=486",
  163. "http://www.zyxzyy.cn/news.asp?Page=1&myid=75",
  164. "http://www.wjrcb.com/wjrcb/gdgg/xwgg/index.html?v=1562635453576",
  165. "http://www.qnzyy.com/c/cggs.html",
  166. "http://www.laszyy.cn/list/?id=1644",
  167. "https://www.ahzjyy.com/cn/list_48.aspx",
  168. "http://www.wzsfy.com/news/54.html",
  169. "http://www.gyrmyy.com/showclass.asp?id=44",
  170. "http://www.zjjss.net/xw.html?category_id=80&channel_id=6",
  171. "http://www.motmti.cn/tzgg/index.jhtml",
  172. "http://www.gzstvs.com.cn/list.jsp?cItemId=16&itemId=4&page=1",
  173. "http://www.simc.cn/gsgg/list.htm",
  174. "http://www.yxtxzs.cn/DisplayList?ClassID=ouPt2u2teTyEZ8YAMpBGd68cweyi7fja3pfaLK6qxRk5qd8z954VQwiuMiqe1LuO&Page=1",
  175. "http://www.sdsfjy.com/news_list/newsCategoryId=54.html",
  176. "http://www.peczzu.edu.cn/index/yngg.htm",
  177. "http://www.hufe.edu.cn/hncywz/listsvl?bmid=207&lmid=287&mbid=7",
  178. "http://27.24.159.155/cgw/cggg.htm",
  179. "http://www.ygu.edu.cn/xxgk2.htm",
  180. "http://www.gdyvc.cn/announce/",
  181. "http://news.hict.org.cn/Html/xiaonagonggao/",
  182. "https://www.szcp.com/Roam/Announce/List_17.html",
  183. "http://www.hebzgfw.cn/gg/index.html",
  184. "http://rst.sc.gov.cn/zwgk/gsgg/index.html",
  185. "http://www.aqjjzx.com/info.php?class_id=102103",
  186. "http://yjj.huaian.gov.cn/xwzx/tzgg/list.html",
  187. "http://txjkq.changsha.gov.cn/xxgk/zfcg/",
  188. "http://nyj.yueyang.gov.cn/nyj/7869/7871/default.htm",
  189. "http://yyjt.yiyang.gov.cn/yyjtj/6270/6279/6288/6303/default.htm",
  190. "http://www.yiyang.gov.cn/sjkzx/4464/4469/default.htm",
  191. "http://szrzyj.ziyang.gov.cn/catalog.aspx?id=10",
  192. "http://yongfu.gxdlr.gov.cn/list.aspx?id=68",
  193. "http://txjkq.changsha.gov.cn/xxgk/zfcg/",
  194. "http://szrzyj.ziyang.gov.cn/catalog.aspx?id=10",
  195. "http://jtys.chengdu.gov.cn/cdjt/c108497/xw_list.shtml",
  196. "http://fd4zh.30edu.com.cn/Article/46a834fb-c5e2-4805-983d-8162e9887e7e/1.shtml",
  197. "http://www.fyxzyy.com/tong-zhi-gong-gao.html?page=1",
  198. "http://www.zhgxqrmyy.com/news/93/",
  199. "http://www.njfybjy.com/ywgk/ywgk.asp?ClassID=43&PageNo=1",
  200. "http://www.jahg.com.cn/Info-news_list-cat_id-43.html",
  201. "http://www.bzsba.com/info.asp?second_id=2001",
  202. "http://www.ymgfgs.cn/index.php/news/admin/1/cn/104/104.html",
  203. "http://www.bankcomm.com/BankCommSite/shtml/jyjr/cn/7804/2600473/2600510/list_1.shtml?channelId=7804",
  204. "http://www.cinda.com.cn/xdjt/xdjtpd/syhzjh/list.shtml",
  205. "http://gaj.yanan.gov.cn/info/iList.jsp?cat_id=10194",
  206. "http://www.yanhe.gov.cn/xwzx/tzgg/index.html",
  207. "http://www.yanhe.gov.cn/zwgk/xxgkml/zdlygk/zfcg/cgzb/list.html",
  208. "http://rfb.guizhou.gov.cn/xwzx/tzgg/index.html",
  209. "http://www.zjxzyyy.cn/plus/list.php?tid=45",
  210. "http://an.km.gov.cn/xxgk/zdlyxxgkzl/ggzyjyxxgk/",
  211. "http://www.xazls.com//gsgg/index.htm",
  212. "http://www.wenwu.gov.cn/overt?classCode=zwgk_tzgg_zbcg",
  213. "http://www.zyhos.com/news/index.asp?D_CataID=I0003&pageno=1",
  214. "http://www.ks2ndhospital.com/node/82.jspx",
  215. "http://www.xjzj.gov.cn/info/iList.jsp?cat_id=11037",
  216. "http://www.aspd.gov.cn/zwgk/zdlygk/ggzypz/tdzy/index.html",
  217. "http://www.dalong.gov.cn/html/zhengwugongkai/tongzhigonggao/index.html",
  218. "http://www.dp.gov.cn/dpxw/zwgz/gsgg.htm",
  219. "http://nync.guiyang.gov.cn/c8121/",
  220. "http://www.wenwu.gov.cn/overt?classCode=zwgk_tzgg_zbcg",
  221. "http://www.sxfu.cn/list/312/314/p/1.html",
  222. "http://ztbgl.yangtzeu.edu.cn/hwcg.htm?tdsourcetag=s_pctim_aiomsg",
  223. "http://www.cstsjy.cn/tongzhigonggao",
  224. "http://lxxxgk.bozhou.gov.cn/opennessTarget/?branch_id=53fe9198cbb812e0d509f771&column_code=170302&page=1",
  225. "http://dtxzwgk.mas.gov.cn/opennessContent/?branch_id=57a3df762c262ea9a00aadf4",
  226. "http://lxxxgk.bozhou.gov.cn/opennessTarget/?branch_id=53fe9198cbb812e0d509f764&column_code=230200",
  227. "http://www.hfbh.com.cn/merchants2.asp",
  228. "http://www.hfbh.com.cn/merchants.asp?page=1",
  229. "http://www.hfbus.cn/Welcome/Onlinework.aspx?NewsClassID=32&page=1",
  230. "http://hfxz.hefei.gov.cn/UserData/SortHtml/1/28460823507.html",
  231. "http://www.hfbus.cn/Welcome/Onlinework.aspx?NewsClassID=38",
  232. "http://ggzy.weifang.gov.cn/wfggzy/showinfo/moreinfo_gg_zfcg_cgxq.aspx?address=016&categorynum=004002017",
  233. "http://ggzy.weifang.gov.cn/wfggzy/showinfo/moreinfo_gg_zfcg.aspx?address=016&type=&categorynum=004002011",
  234. "http://ggzy.weifang.gov.cn/wfggzy/showinfo/moreinfo_gg_zfcg.aspx?address=017&type=&categorynum=004002011",
  235. "http://ggzy.weifang.gov.cn/wfggzy/showinfo/moreinfo_gg.aspx?address=006&type=&categorynum=004012006",
  236. "http://ep.btsteel.com/erp/mqm/jsp/mqmj001X.jsp",
  237. "http://xmzwggzy.xlgl.gov.cn/xmweb/ggzyjy/009002/009002006/009002006004/MoreInfo.aspx?CategoryNum=009002006004",
  238. "http://jndeggzy.jinan.gov.cn/lwwznew/jyxx/044001/044001001/044001001005/MoreInfo.aspx?CategoryNum=044001001005",
  239. "http://ggzy.weifang.gov.cn/wfggzy/showinfo/moreinfo_gg.aspx?address=013&type=&categorynum=004012005",
  240. "http://xmzwggzy.xlgl.gov.cn/xmweb/ggzyjy/009001/009001005/009001005005/MoreInfo.aspx?CategoryNum=009001005005",
  241. "http://xmzwggzy.xlgl.gov.cn/xmweb/ggzyjy/009001/009001006/009001006001/MoreInfo.aspx?CategoryNum=009001006001",
  242. "http://ggzy.weifang.gov.cn/wfggzy/showinfo/moreinfo_gg.aspx?address=013&type=&categorynum=004012010",
  243. "http://ggzy.weifang.gov.cn/wfggzy/showinfo/moreinfo_gg_zfcg_cgxq.aspx?address=001&categorynum=004002017",
  244. "http://ggzy.weifang.gov.cn/wfggzy/showinfo/moreinfo_gg_ylsb.aspx?address=&categorynum=004006008&Paging=1",
  245. "http://ggzy.weifang.gov.cn/wfggzy/showinfo/moreinfo_gg_qt.aspx?address=&categorynum=004007004&Paging=1",
  246. "http://xmzwggzy.xlgl.gov.cn/xmweb/ggzyjy/009004/009004003/009004003001/MoreInfo.aspx?CategoryNum=009004003001",
  247. "http://www.lygzxjt.com/list.aspx?type=98",
  248. "http://www.hbei.com.cn/news/mess/",
  249. "http://ledong.hainan.gov.cn/ledong/0400/right.shtml",
  250. "http://ggzyjy.yanbian.gov.cn/jyxx/005006/005006002/aboutsub.html",
  251. "http://xdp.shuanghui.net:8010/webportal/index/bidnotice/list/2.do",
  252. "http://ggzyjy.yanbian.gov.cn/jyxx/005006/005006002/aboutsub.html",
  253. "http://ggzyjy.yanbian.gov.cn/jyxx/005006/005006001/aboutsub.html",
  254. "http://ggzyjy.lsz.gov.cn/TPFront/jyxx/005004/005004001/",
  255. "http://bid.9to.com/list.php/catid-629/",
  256. "https://jyj.yuncheng.gov.cn/item/1513_1.shtml",
  257. "http://www.sjzkq.gov.cn/col/1539754975467/",
  258. "http://www.byefy.com/cn/News/list_36.aspx",
  259. "http://www.baotou.gov.cn/01xxgk/xxgk_list.jsp?urltype=egovinfo.EgovInfoList&wbtreeid=1001&sccode=zfcg&subtype=1&gilevel=1",
  260. "http://www.qyggfw.cn/w/bid/qualiInqueryResult/morePageList?filterparam=%7B%22assortment%22%3A%223%22%2C%22areaCode%22%3A%22621000%22%2C%22workNotice%22%3A%7B%22noticeNature%22%3A%221%22%2C%22bulletinType%22%3A%221%22%7D%7D",
  261. "http://news.hict.org.cn/Html/xiaonagonggao/",
  262. "http://zwgk.gz.gov.cn/GZ46/8.1/list1.shtml",
  263. "https://www.glutnn.cn/article.aspx?classid=7&page=1",
  264. "http://www.sjzkq.gov.cn/col/1539754975467/",
  265. "http://swj.jiaxing.gov.cn/col/col1497228/",
  266. "http://www.tjbhb.com/bhyhww/jzcg/zjgg/index.html",
  267. "http://www.mcxzyy.com/list.aspx?fcid=91&cid=103",
  268. "http://www.tcsrmyy.cn/Infor/lists/category/107.html",
  269. "http://sfj.bozhou.gov.cn/content/channel/59278c4aceab064621611981/",
  270. "http://yyyzyy.cn/xin-wen-zhong-xin/xin-xi-gong-gao"]
  271. data = ruleExtract(listpage_url[-2])
  272. list_keys = list(data.keys())
  273. list_keys.sort(key=lambda x:x)
  274. for item in list_keys:
  275. print(item,data[item])