'''
Created on 2019-08-19

@author: User
'''
import re

from module import predictor
from module.detail.content import featureEngine as featureEngine_content
from module.detail.title import featureEngine as featureEngine_title
from module.Utils import mergeDict, getCommonXpath
import numpy as np
import module.htmlDrawing as hd
from module.Utils import *

detailContentPredictor = predictor.DetailContentPredictor()
detailTitlePredictor = predictor.DetailTitlePredictor()


def getCommonXpath_time(data_time):
    '''
    Pick the most likely publish-date xpath: group the candidates by xpath,
    count occurrences per xpath and break ties by the smallest vertical
    distance to the title.
    '''
    list_position_list_xpath = []
    for item in data_time:
        _left = item[0]
        _top = item[1]
        _xpath = item[2][0]
        _fontSize = item[3]
        _offset_top_title = item[4]
        _distance = abs(_top - _offset_top_title)
        if _fontSize is None:
            _fontSize = 16
        _find_flag = False
        for _position_list_xpath in list_position_list_xpath:
            if _position_list_xpath[2] == _xpath:
                # merge into the existing candidate: average position and
                # distance, keep the smallest font size, bump the count
                _position_list_xpath[0] = (_position_list_xpath[0] + _left) / 2
                _position_list_xpath[1] = (_position_list_xpath[1] + _top) / 2
                if _fontSize < _position_list_xpath[3]:
                    _position_list_xpath[3] = _fontSize
                _position_list_xpath[4] += 1
                _position_list_xpath[5] = (_position_list_xpath[5] + _distance) / 2
                _find_flag = True
                break
        if not _find_flag:
            list_position_list_xpath.append([_left, _top, _xpath, _fontSize, 1, _distance])
    date_xpath = None
    _max_len = 0
    _max_len_index = None
    _max_distance = 10000
    _score = 0
    for i in range(len(list_position_list_xpath)):
        item = list_position_list_xpath[i]
        '''
        tmp_score = item[4]*0.8-item[5]*0.5
        if tmp_score>_score:
            _score = tmp_score
            _max_len_index = i
            date_xpath = item[2]
        '''
        if item[4] > _max_len:
            _max_len = item[4]
            date_xpath = item[2]
            _max_len_index = i
            _max_distance = item[5]
        # if the counts are equal, prefer the candidate closest to the title
        elif item[4] == _max_len:
            if item[5] < _max_distance:
                date_xpath = item[2]
                _max_len_index = i
                _max_distance = item[5]
    return date_xpath
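# Illustrative call with made-up coordinates (not real crawler output): each
# entry of ``data_time`` is expected to look like
# [left, top, [xpath, ...], font_size, offset_top_title]; the xpath seen most
# often wins, with ties broken by vertical distance to the title, e.g.
#
#   getCommonXpath_time([
#       [100, 320, ["/html/body/div[2]/span[1]"], 12, 280],
#       [102, 318, ["/html/body/div[2]/span[1]"], 12, 280],
#       [400, 900, ["/html/body/div[5]/em[1]"], 14, 280],
#   ])
#   # -> "/html/body/div[2]/span[1]"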
def getRule_detail(list_hrefs, try_times=3, MAX_HREFS=10):
    list_xpaths_content = []
    list_xpath_remove_content = []
    list_data_time = []
    list_xpaths_title = []
    list_title_top = []
    count_hrefs = 0
    dict_rule_detail = dict()
    dict_rule_detail["hasDrew"] = False
    for _url in list_hrefs:
        if _url is None:
            continue
        list_legal_time = []
        _flag = -2
        browser = hd.getdriver()
        debug("get driver")
        loadsucess = hd.loadPage(browser, _url)
        if not loadsucess:
            browser = hd.getdriver()
        # browser.maximize_window()
        flag, data = featureEngine_content.getInput_byJS(browser, _url)
        hasGotten = True
        if flag:
            x, inner_html, list_xpath, data_time = data
            _index = detailContentPredictor.predict(x)
            pt = ''  # attachment-link pattern; empty in this version
            total_annex = len(re.findall(pt, browser.page_source))
            extract_annex = len(re.findall(pt, inner_html[_index]))
            # if the predicted node misses attachments, walk back through up
            # to 4 earlier candidates to find one that still contains them all
            if total_annex > extract_annex and _index > 5 and len(list_xpath[_index]) > 0:
                extract_xpath = list_xpath[_index][0][0]
                for i in range(_index - 1, _index - 5, -1):
                    if len(re.findall(pt, inner_html[i])) == total_annex:
                        log('rule adjusted: model-extracted content missed attachments')
                        _index = i
                        break
                    elif len(list_xpath[i]) > 0 and list_xpath[i][0][0] not in extract_xpath:
                        break
            _xpath = list_xpath[_index]
            _xpath.reverse()
            list_xpath_remove_content.append(_xpath)
            for item in data_time:
                list_legal_time.append(item)
            _flag += 1
        else:
            hasGotten = False
            add_err_msg(dict_rule_detail, data)
        flag, data_title = featureEngine_title.getInput_byJS(browser, _url)
        hd.adddriver(browser)
        debug("release driver")
        if flag:
            x, _, list_xpath, list_top = data_title
            log('detail title extracted successfully')
            _index = detailTitlePredictor.predict(x)
            _xpath = list_xpath[_index]
            _xpath.reverse()
            list_xpaths_title.append(_xpath)
            list_title_top.append(list_top[_index])
            # append the title's offset_top to every legal time candidate
            for i in range(len(list_legal_time)):
                list_legal_time[i].append(list_top[_index])
                list_data_time.append(list_legal_time[i])
            _flag += 1
        else:
            add_err_msg(dict_rule_detail, data_title)
        if _flag == 0:
            count_hrefs += 1
        if count_hrefs >= MAX_HREFS:
            break
    for item in list_xpath_remove_content:
        _xpath = []
        for _xpath_remove in item:
            _xpath.append(_xpath_remove[0])
        list_xpaths_content.append(_xpath)
    dict_rule_detail["detail_content"] = getCommonXpath(list_xpaths_content)
    # intersect the remove-lists of every page whose content xpath matches
    set_remove_list = None
    for item in list_xpath_remove_content:
        for _xpath_remove in item:
            if _xpath_remove[0] == dict_rule_detail["detail_content"]:
                if set_remove_list is None:
                    set_remove_list = set(_xpath_remove[1])
                else:
                    set_remove_list = set(_xpath_remove[1]) & set_remove_list
    dict_rule_detail["detail_removeList"] = list(set_remove_list) if set_remove_list is not None else []
    dict_rule_detail["detail_date"] = getCommonXpath_time(list_data_time)
    dict_rule_detail["detail_title"] = getCommonXpath(list_xpaths_title)
    # try:
    browser = hd.getdriver()
    debug("get driver")
    if len(list_hrefs) > 0:
        loadsucess = hd.loadPage(browser, list_hrefs[-1])
        log('loadPage: ')
        if not loadsucess:
            browser = hd.getdriver()
        dict_rule_detail["hasDrew"] = dict_rule_detail["hasDrew"] or hd.hasDrew(list_hrefs[0],
                                                                                [{"type": "xpath", "rule": dict_rule_detail["detail_content"]},
                                                                                 {"type": "xpath", "rule": dict_rule_detail["detail_date"]},
                                                                                 {"type": "xpath", "rule": dict_rule_detail["detail_title"]}])
        # shorten over-long absolute xpaths (more than 6 segments) via JS
        if dict_rule_detail["detail_content"] is not None and len(dict_rule_detail["detail_content"].split("/")) > 6:
            log("before being replaced xpath of detail_content" + dict_rule_detail["detail_content"])
            # dict_rule_detail["detail_content"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_content"])
            dict_rule_detail["detail_content"] = get_js_rs(browser, scripts_replaceXpath, dict_rule_detail["detail_content"])
            log("after being replaced xpath of detail_content" + dict_rule_detail["detail_content"])
        if dict_rule_detail["detail_date"] is not None and len(dict_rule_detail["detail_date"].split("/")) > 6:
            # dict_rule_detail["detail_date"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_date"])
            dict_rule_detail["detail_date"] = get_js_rs(browser, scripts_replaceXpath, dict_rule_detail["detail_date"])
        if dict_rule_detail["detail_title"] is not None and len(dict_rule_detail["detail_title"].split("/")) > 6:
            # dict_rule_detail["detail_title"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_title"])
            dict_rule_detail["detail_title"] = get_js_rs(browser, scripts_replaceXpath, dict_rule_detail["detail_title"])
    # finally:
    hd.adddriver(browser)
    debug("release driver")
    if dict_rule_detail["detail_content"] is not None and dict_rule_detail["detail_date"] is not None and dict_rule_detail["detail_title"] is not None:
        dict_rule_detail["flag"] = True
    else:
        dict_rule_detail["flag"] = False
        add_err_msg(dict_rule_detail, "#detail rule incomplete#")
    return dict_rule_detail


if __name__ == "__main__":
    list_hrefs = ['http://www.beian.miit.gov.cn/',
                  'http://www.beian.gov.cn/portal/registerSystemInfo?recordcode=22240102000130',
                  'http://bszs.conac.cn/sitename?method=show&id=072BBE1E1F301B54E053022819AC1765',
                  'http://121.43.68.40/exposure/jiucuo.html?site_code=2224000007&url=http%3A%2F%2Fwww.ybzfcg.gov.cn%2Fnet%2Fcgxx.jsp%3Fopstatus%3Dxjgg']
    print(getRule_detail(list_hrefs))
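# For reference, a successful run is expected to produce a dict shaped roughly
# like the sketch below; the xpath values are placeholders for illustration,
# not output from the URLs above:
#
#   {
#       "hasDrew": False,
#       "detail_content": "/html/body/div[3]/div[1]",
#       "detail_removeList": [],
#       "detail_date": "/html/body/div[3]/span[2]",
#       "detail_title": "/html/body/div[3]/h1[1]",
#       "flag": True,
#   }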