extractor.py

'''
Created on 2019-08-19
@author: User
'''
import re  # used for the attachment regex in getRule_detail

from module import predictor
from module.detail.content import featureEngine as featureEngine_content
from module.detail.title import featureEngine as featureEngine_title
from module.Utils import mergeDict, getCommonXpath
import numpy as np
import module.htmlDrawing as hd
from module.Utils import *

detailContentPredictor = predictor.DetailContentPredictor()
detailTitlePredictor = predictor.DetailTitlePredictor()
def getCommonXpath_time(data_time):
    # Vote for the date xpath: merge candidates that share an xpath, then pick
    # the one seen most often, breaking ties by distance to the title.
    list_position_list_xpath = []
    for item in data_time:
        _left = item[0]
        _top = item[1]
        _xpath = item[2][0]
        _fontSize = item[3]
        _offset_top_title = item[4]
        _distance = abs(_top - _offset_top_title)
        if _fontSize is None:
            _fontSize = 16
        _find_flag = False
        for _position_list_xpath in list_position_list_xpath:
            if _position_list_xpath[2] == _xpath:
                # merge into the existing candidate: average position and
                # distance, keep the smallest font size, increment the count
                _position_list_xpath[0] = (_position_list_xpath[0] + _left) / 2
                _position_list_xpath[1] = (_position_list_xpath[1] + _top) / 2
                if _fontSize < _position_list_xpath[3]:
                    _position_list_xpath[3] = _fontSize
                _position_list_xpath[4] += 1
                _position_list_xpath[5] = (_position_list_xpath[5] + _distance) / 2
                _find_flag = True
                break
        if not _find_flag:
            list_position_list_xpath.append([_left, _top, _xpath, _fontSize, 1, _distance])
    date_xpath = None
    _max_len = 0
    _max_len_index = None
    _max_distance = 10000
    _score = 0
    for i in range(len(list_position_list_xpath)):
        item = list_position_list_xpath[i]
        # alternative weighted scoring, kept for reference:
        # tmp_score = item[4]*0.8 - item[5]*0.5
        # if tmp_score > _score:
        #     _score = tmp_score
        #     _max_len_index = i
        #     date_xpath = item[2]
        if item[4] > _max_len:
            _max_len = item[4]
            date_xpath = item[2]
            _max_len_index = i
            _max_distance = item[5]
        # if the counts are equal, choose the candidate closest to the title
        elif item[4] == _max_len:
            if item[5] < _max_distance:
                date_xpath = item[2]
                _max_len_index = i
                _max_distance = item[5]
    return date_xpath
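
# A minimal illustration of getCommonXpath_time (hypothetical values): each
# entry of data_time is [left, top, [xpath, ...], fontSize, offset_top_title].
# The xpath that appears most often wins; ties go to the candidate vertically
# closest to the title.
#
#   data_time = [
#       [100, 420, ['/html/body/div[2]/span'], 12, 400],
#       [105, 425, ['/html/body/div[2]/span'], 12, 400],
#       [300, 900, ['/html/body/div[5]/p'], 12, 400],
#   ]
#   getCommonXpath_time(data_time)  # -> '/html/body/div[2]/span'
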
def getRule_detail(list_hrefs, try_times=3, MAX_HREFS=10):
    list_xpaths_content = []
    list_xpath_remove_content = []
    list_data_time = []
    list_xpaths_title = []
    list_title_top = []
    count_hrefs = 0
    dict_rule_detail = dict()
    dict_rule_detail["hasDrew"] = False
    for _url in list_hrefs:
        if _url is None:
            continue
        list_legal_time = []
        _flag = -2
        browser = hd.getdriver()
        debug("get driver")
        loadsucess = hd.loadPage(browser, _url)
        if not loadsucess:
            browser = hd.getdriver()
        # browser.maximize_window()
        flag, data = featureEngine_content.getInput_byJS(browser, _url)
        hasGotten = True
        if flag:
            x, inner_html, list_xpath, data_time = data
            _index = detailContentPredictor.predict(x)
            # check whether the predicted content block contains all attachment links
            pt = r'<a.*?\.(zip|rar|tar|7z|wim|docx|doc|xlsx|xls|pdf|txt|hnzf|bmp|tif).*?</a>'
            total_annex = len(re.findall(pt, browser.page_source))
            extract_annex = len(re.findall(pt, inner_html[_index]))
            if total_annex > extract_annex and _index > 5 and len(list_xpath[_index]) > 0:
                extract_xpath = list_xpath[_index][0][0]
                for i in range(_index - 1, _index - 5, -1):
                    if len(re.findall(pt, inner_html[i])) == total_annex:
                        log('rule adjusted: model-extracted content missed attachments')
                        _index = i
                        break
                    elif len(list_xpath[i]) > 0 and list_xpath[i][0][0] not in extract_xpath:
                        break
            _xpath = list_xpath[_index]
            _xpath.reverse()
            list_xpath_remove_content.append(_xpath)
            for item in data_time:
                list_legal_time.append(item)
            _flag += 1
        else:
            hasGotten = False
            add_err_msg(dict_rule_detail, data)
        flag, data_title = featureEngine_title.getInput_byJS(browser, _url)
        hd.adddriver(browser)
        debug("release driver")
        if flag:
            x, _, list_xpath, list_top = data_title
            log('detail title extracted successfully')
            _index = detailTitlePredictor.predict(x)
            _xpath = list_xpath[_index]
            _xpath.reverse()
            list_xpaths_title.append(_xpath)
            list_title_top.append(list_top[_index])
            # append the title's offset_top to each date candidate
            for i in range(len(list_legal_time)):
                list_legal_time[i].append(list_top[_index])
                list_data_time.append(list_legal_time[i])
            _flag += 1
        else:
            add_err_msg(dict_rule_detail, data_title)
        # _flag == 0 means both content and title were extracted for this url
        if _flag == 0:
            count_hrefs += 1
            if count_hrefs >= MAX_HREFS:
                break
    for item in list_xpath_remove_content:
        _xpath = []
        for _xpath_remove in item:
            _xpath.append(_xpath_remove[0])
        list_xpaths_content.append(_xpath)
    dict_rule_detail["detail_content"] = getCommonXpath(list_xpaths_content)
    # intersect the remove-lists of every page whose content xpath matches the common one
    set_remove_list = None
    for item in list_xpath_remove_content:
        for _xpath_remove in item:
            if _xpath_remove[0] == dict_rule_detail["detail_content"]:
                if set_remove_list is None:
                    set_remove_list = set(_xpath_remove[1])
                else:
                    set_remove_list = set(_xpath_remove[1]) & set_remove_list
    dict_rule_detail["detail_removeList"] = list(set_remove_list) if set_remove_list is not None else []
    dict_rule_detail["detail_date"] = getCommonXpath_time(list_data_time)
    dict_rule_detail["detail_title"] = getCommonXpath(list_xpaths_title)
    # try:
    browser = hd.getdriver()
    debug("get driver")
    if len(list_hrefs) > 0:
        loadsucess = hd.loadPage(browser, list_hrefs[-1])
        log('loadPage: ')
        if not loadsucess:
            browser = hd.getdriver()
        dict_rule_detail["hasDrew"] = dict_rule_detail["hasDrew"] or hd.hasDrew(list_hrefs[0], [{"type": "xpath", "rule": dict_rule_detail["detail_content"]},
                                                                                               {"type": "xpath", "rule": dict_rule_detail["detail_date"]},
                                                                                               {"type": "xpath", "rule": dict_rule_detail["detail_title"]}])
        # shorten over-long xpaths (more than six segments) with the replacement script
        if dict_rule_detail["detail_content"] is not None and len(dict_rule_detail["detail_content"].split("/")) > 6:
            log("before being replaced xpath of detail_content" + dict_rule_detail["detail_content"])
            # dict_rule_detail["detail_content"] = browser.execute_script(scripts_replaceXpath, dict_rule_detail["detail_content"])
            dict_rule_detail["detail_content"] = get_js_rs(browser, scripts_replaceXpath, dict_rule_detail["detail_content"])
            log("after being replaced xpath of detail_content" + dict_rule_detail["detail_content"])
        if dict_rule_detail["detail_date"] is not None and len(dict_rule_detail["detail_date"].split("/")) > 6:
            # dict_rule_detail["detail_date"] = browser.execute_script(scripts_replaceXpath, dict_rule_detail["detail_date"])
            dict_rule_detail["detail_date"] = get_js_rs(browser, scripts_replaceXpath, dict_rule_detail["detail_date"])
        if dict_rule_detail["detail_title"] is not None and len(dict_rule_detail["detail_title"].split("/")) > 6:
            # dict_rule_detail["detail_title"] = browser.execute_script(scripts_replaceXpath, dict_rule_detail["detail_title"])
            dict_rule_detail["detail_title"] = get_js_rs(browser, scripts_replaceXpath, dict_rule_detail["detail_title"])
    # finally:
    hd.adddriver(browser)
    debug("release driver")
    if dict_rule_detail["detail_content"] is not None and dict_rule_detail["detail_date"] is not None and dict_rule_detail["detail_title"] is not None:
        dict_rule_detail["flag"] = True
    else:
        dict_rule_detail["flag"] = False
        add_err_msg(dict_rule_detail, "#detail rule incomplete#")
    return dict_rule_detail
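
# The returned rule dict has this shape (the values below are illustrative
# placeholders, not real output):
#
#   {
#       "hasDrew": False,
#       "detail_content": "/html/body/div[3]/div[1]",
#       "detail_removeList": [],
#       "detail_date": "/html/body/div[3]/span[2]",
#       "detail_title": "/html/body/div[3]/h1",
#       "flag": True,
#   }
#
# add_err_msg may also attach an error-message field when extraction fails.
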
if __name__ == "__main__":
    list_hrefs = ['http://www.beian.miit.gov.cn/', 'http://www.beian.gov.cn/portal/registerSystemInfo?recordcode=22240102000130', 'http://bszs.conac.cn/sitename?method=show&id=072BBE1E1F301B54E053022819AC1765', 'http://121.43.68.40/exposure/jiucuo.html?site_code=2224000007&url=http%3A%2F%2Fwww.ybzfcg.gov.cn%2Fnet%2Fcgxx.jsp%3Fopstatus%3Dxjgg']
    print(getRule_detail(list_hrefs))