'''
Created on 2019-08-19

@author: User
'''
- from module import predictor
- from module.listpage.content import featureEngine
- from module.listpage.pageTurn import engine
- import module.htmlDrawing as hd
- from module.Utils import mergeDict,log,add_err_msg,error, debug
- import re
# Module-level predictor instance, created once at import time.
# getRule_listpage() calls its predict() method to choose which candidate
# xpath group holds the list-page content.
listpageContentPredictor = predictor.ListpageContentPredictor()
def _collect_url_tokens(urls, pattern):
    """Tokenise each URL with ``pattern`` and summarise what was seen.

    ``None`` entries are skipped.

    Returns a tuple ``(all_tokens, shared_tokens, lengths)``:
      * all_tokens    - every token that occurs in at least one URL
      * shared_tokens - tokens that occur in every URL (empty if no URL)
      * lengths       - the set of distinct URL string lengths
    """
    all_tokens = set()
    # None means "no URL seen yet". An empty set must stay empty: once the
    # intersection is exhausted, later URLs cannot make a token common again.
    shared_tokens = None
    lengths = set()
    for url in urls:
        if url is None:
            continue
        tokens = set(re.split(pattern, url))
        all_tokens |= tokens
        shared_tokens = tokens if shared_tokens is None else shared_tokens & tokens
        lengths.add(len(url))
    return all_tokens, shared_tokens or set(), lengths


def getRecognize_detail_listpage(list_listpage_url, list_detail_hrefs):
    """Build a rule that tells list-page URLs apart from detail-page URLs.

    Each URL is split on runs of separator/digit characters (the separators
    are kept as tokens because the pattern has a capturing group).  Two
    independent signals are then derived:

    * token signal  - tokens present in every list-page URL (resp. every
      detail URL) that never occur in the other group;
    * length signal - usable only when all list-page URLs share one length,
      all detail URLs share one length, and the two differ by more than 1.

    Args:
        list_listpage_url: iterable of list-page URL strings (None skipped).
        list_detail_hrefs: iterable of detail-page URL strings (None skipped).

    Returns:
        dict with keys
          ``recog_pattern``  - the split regex,
          ``recog_listpage`` - sorted list of list-page-only tokens, or None,
          ``recog_detail``   - sorted list of detail-only tokens, or None,
          ``recog_length``   - ``[detail_length, listpage_length]`` or None,
          ``flag``           - True when at least one signal is usable,
        plus whatever ``add_err_msg`` records when a signal is missing.
        Returns None if an unexpected exception occurs (it is logged via
        ``error``).
    """
    try:
        # Raw string: same regex text as before, without invalid-escape warnings.
        pattern = r"([/&=\.\|\?\-_\d]+)"
        rule_recog = {"recog_pattern": pattern}

        listpage_tokens, listpage_shared, listpage_lengths = _collect_url_tokens(
            list_listpage_url, pattern)
        detail_tokens, detail_shared, detail_lengths = _collect_url_tokens(
            list_detail_hrefs, pattern)

        # Tokens seen on both sides cannot discriminate between the two kinds.
        ambiguous_tokens = listpage_tokens & detail_tokens
        # sorted() keeps the rule deterministic across interpreter runs.
        rule_recog["recog_listpage"] = sorted(listpage_shared - ambiguous_tokens) or None
        rule_recog["recog_detail"] = sorted(detail_shared - ambiguous_tokens) or None

        _recog1 = not (rule_recog["recog_listpage"] is None
                       and rule_recog["recog_detail"] is None)
        if not _recog1:
            add_err_msg(rule_recog, "#详情页列表页区分字符串未识别#")

        _recog2 = False
        rule_recog["recog_length"] = None
        if len(listpage_lengths) == 1 and len(detail_lengths) == 1:
            (listpage_length,) = listpage_lengths
            (detail_length,) = detail_lengths
            if abs(listpage_length - detail_length) > 1:
                _recog2 = True
                rule_recog["recog_length"] = [detail_length, listpage_length]
        if not _recog2:
            add_err_msg(rule_recog, "#详情页列表页区分长度未识别#")

        rule_recog["flag"] = _recog1 or _recog2
        return rule_recog
    except Exception as e:
        # Best-effort: log the failure and signal it to the caller with None.
        error(str(e))
        return None
-
-
def getRule_listpage(listpage_url, try_times=3):
    """Extract the crawling rules for a list page, retrying up to ``try_times``.

    Each attempt takes a driver from ``hd.getdriver()``, loads the page, asks
    ``featureEngine.getInput_byJS`` for candidate content features, lets
    ``listpageContentPredictor`` choose a candidate, and then collects the
    link/date rule, the page-turn rule and the list/detail recognition rule.

    The driver is handed back through ``hd.adddriver()`` in a ``finally``
    clause, so it is released on every path: success, a failed page load,
    a missing intermediate result, or an exception.

    NOTE(review): the original indentation of this file was lost. This
    version assumes the driver is released once per attempt and that the
    final ``return None`` comes after the retry loop - confirm against the
    upstream source.
    NOTE(review): a driver whose page load failed is also returned to
    ``hd.adddriver()``; confirm that this is what the driver pool expects.

    Args:
        listpage_url: URL of the list page to analyse.
        try_times: maximum number of attempts.

    Returns:
        ``(mergeDict([link/date rule, page-turn rule, recognition rule]),
        list_hrefs)`` on the first successful attempt, otherwise None.
    """
    for _attempt in range(try_times):
        browser = hd.getdriver()
        debug("get driver")
        try:
            if not hd.loadPage(browser, listpage_url):
                log('加载列表主页失败, 重新请求网页。')
                continue

            log('准备执行获取列表页内容标签脚本')
            data_listpage = featureEngine.getInput_byJS(browser, listpage_url, "")
            log('获取列表页内容标签成功')
            if data_listpage is None:
                continue

            x, _, list_xpath = data_listpage
            _index = listpageContentPredictor.predict(x)
            log('模型预测列表页标签完毕')
            if len(list_xpath[_index]) == 0:
                continue

            content_xpath = list_xpath[_index][0]
            log("the content_xpath of listpage is " + str(content_xpath))
            data_rule = featureEngine.getRule_A_Date(browser, listpage_url, content_xpath)
            log('执行脚本获取列表页链接及日期完毕')
            if data_rule is None:
                continue

            dict_rule_A_Date, list_hrefs = data_rule
            log('begin getTurnRule')
            turn_data = engine.getTurnRule(browser, listpage_url)
            log('获取翻页内容完毕')
            dict_rule_pageTurn, list_listpage_url = turn_data
            dict_rule_recog = getRecognize_detail_listpage(list_listpage_url, list_hrefs)
            log('解析列表页规则完毕')
            return mergeDict([dict_rule_A_Date, dict_rule_pageTurn, dict_rule_recog]), list_hrefs
        finally:
            # Runs for `continue`, `return` and exceptions alike.
            hd.adddriver(browser)
            debug("release driver")
    return None
-
-
if __name__ == "__main__":
    # Manual smoke test: extract the rules for one sample list page and dump them.
    listpage_url = "http://www.qyggfw.cn/w/bid/qualiInqueryResult/morePageList?filterparam=%7B%22assortment%22%3A%223%22%2C%22areaCode%22%3A%22621000%22%2C%22workNotice%22%3A%7B%22noticeNature%22%3A%221%22%2C%22bulletinType%22%3A%221%22%7D%7D"
    result = getRule_listpage(listpage_url)
    if result is None:
        # getRule_listpage returns None when every attempt fails; indexing
        # that result would raise TypeError.
        print("no rule could be extracted for", listpage_url)
    else:
        data = result[0]
        for item, value in data.items():
            print(item, value)
-
|