# extractor.py
'''
Created on 2019-08-19
@author: User
'''
  5. from module import predictor
  6. from module.detail.content import featureEngine as featureEngine_content
  7. from module.detail.title import featureEngine as featureEngine_title
  8. from module.Utils import mergeDict,getCommonXpath
  9. import numpy as np
  10. import module.htmlDrawing as hd
  11. from module.Utils import *
# Module-level singleton predictors shared by getRule_detail, so the
# underlying models are loaded once per process.
# NOTE(review): assumed not safe for concurrent use if the predictors keep
# mutable state -- confirm before calling from multiple threads.
detailContentPredictor = predictor.DetailContentPredictor()
detailTitlePredictor = predictor.DetailTitlePredictor()
  14. def getCommonXpath_time(data_time):
  15. list_position_list_xpath = []
  16. for item in data_time:
  17. _left = item[0]
  18. _top = item[1]
  19. _xpath = item[2][0]
  20. _fontSize = item[3]
  21. _offset_top_title = item[4]
  22. _distance = abs(_top-_offset_top_title)
  23. if _fontSize is None:
  24. _fontSize = 16
  25. _find_flag = False
  26. for _position_list_xpath in list_position_list_xpath:
  27. if _position_list_xpath[2]==_xpath:
  28. _position_left = _position_list_xpath[0]
  29. _position_top = _position_list_xpath[1]
  30. _position_left = (_position_left+_left)/2
  31. _position_top = (_position_top+_top)/2
  32. if _fontSize<_position_list_xpath[3]:
  33. _position_list_xpath[3] = _fontSize
  34. _position_list_xpath[4] += 1
  35. _position_distance = _position_list_xpath[5]
  36. _position_distance = (_position_distance+_distance)/2
  37. _find_flag = True
  38. break
  39. if not _find_flag:
  40. list_position_list_xpath.append([_left,_top,_xpath,_fontSize,1,_distance])
  41. date_xpath = None
  42. _max_len = 0
  43. _max_len_index = None
  44. _max_distance = 10000
  45. _score = 0
  46. for i in range(len(list_position_list_xpath)):
  47. item = list_position_list_xpath[i]
  48. '''
  49. tmp_score = item[4]*0.8-item[5]*0.5
  50. if tmp_score>_score:
  51. _score = tmp_score
  52. _max_len_index = i
  53. date_xpath = item[2]
  54. '''
  55. if item[4]>_max_len:
  56. _max_len = item[4]
  57. date_xpath = item[2]
  58. _max_len_index = i
  59. _max_distance = item[5]
  60. #如果长度一样,则选择距离标题最近的
  61. elif item[4]==_max_len:
  62. if item[5]<_max_distance:
  63. date_xpath = item[2]
  64. _max_len_index = i
  65. _max_distance = item[5]
  66. return date_xpath
def getRule_detail(list_hrefs,try_times=3,MAX_HREFS=10):
    """Derive extraction rules (content/title/date XPaths) for detail pages.

    Loads each URL of ``list_hrefs`` in a pooled webdriver, runs the content
    and title predictors on the rendered page, and aggregates the per-page
    XPaths into common rules via ``getCommonXpath``/``getCommonXpath_time``.

    :param list_hrefs: detail-page URLs to sample; ``None`` entries are skipped
    :param try_times: not referenced in this body -- TODO confirm whether
        callers rely on it before removing
    :param MAX_HREFS: stop sampling once this many pages had BOTH content
        and title extracted successfully
    :return: dict with keys ``detail_content``, ``detail_title``,
        ``detail_date``, ``detail_removeList``, ``hasDrew`` and ``flag``
        (``flag`` is True only when all three XPaths were found); error
        messages may be accumulated into it by ``add_err_msg``
    """
    list_xpaths_content = []
    list_xpath_remove_content = []   # per-page reversed [xpath, removeList] chains for the content node
    list_data_time = []              # date candidates across pages, fed to getCommonXpath_time
    list_xpaths_title = []
    list_xpaths_time = []            # per-page lists of date-node xpaths, for the intersection below
    list_title_top = []
    count_hrefs = 0                  # pages where both content and title succeeded
    dict_rule_detail = dict()
    dict_rule_detail["hasDrew"] = False
    for _url in list_hrefs:
        if _url is None:
            continue
        list_legal_time = []
        # _flag reaches 0 only when both the content step and the title step succeed
        _flag = -2
        browser = hd.getdriver()
        debug("get driver")
        loadsucess = hd.loadPage(browser, _url)
        if not loadsucess:
            # NOTE(review): a fresh driver is fetched but the failed one is
            # never returned to the pool -- looks like a driver leak; confirm.
            browser = hd.getdriver()
        # browser.maximize_window()
        flag,data = featureEngine_content.getInput_byJS(browser,_url)
        # NOTE(review): hasGotten is assigned but never read afterwards.
        hasGotten = True
        if flag:
            x,inner_html,list_xpath,data_time = data
            _index = detailContentPredictor.predict(x)
            # Pattern for attachment links (archive/document extensions, or
            # anchor text mentioning notice/contract/file/attachment).
            pt = '<a.*?\.(zip|rar|tar|7z|wim|docx|doc|xlsx|xls|pdf|txt|hnzf|bmp|tif|PDF|DOC|DOCX|XLS|XLSX).*?</a>|<a[^<]*?(中标通知书|合同|文件|附件).*?</a>'
            total_annex = len(re.findall(pt, browser.page_source))
            extract_annex = len(re.findall(pt, inner_html[_index]))
            if total_annex > extract_annex and _index>5 and len(list_xpath[_index])>0:
                extract_xpath = list_xpath[_index][0][0]
                # The predicted node missed attachments: walk up to 4 earlier
                # (enclosing) candidates for one that contains them all.
                for i in range(_index-1, _index-5, -1):
                    if len(re.findall(pt, inner_html[i]))== total_annex and inner_html[_index] in inner_html[i]:
                        log('规则调整模型正文提取附件不完整情况')  # rule adjusted: model content missed attachments
                        _index = i
                        break
            _xpath = list_xpath[_index]
            _xpath.reverse()
            list_xpath_remove_content.append(_xpath)
            tmp_xpaths_time = []
            for item in data_time:
                list_legal_time.append(item)
                tmp_xpaths_time.append(item[2][0])
            list_xpaths_time.append(tmp_xpaths_time)
            _flag += 1
        else:
            hasGotten = False
            add_err_msg(dict_rule_detail, data)
        flag,data_title = featureEngine_title.getInput_byJS(browser,_url)
        hd.adddriver(browser)
        debug("release driver")
        if flag:
            x,_,list_xpath,list_top = data_title
            log('详情标题获取成功')  # detail title fetched successfully
            _index = detailTitlePredictor.predict(x)
            _xpath = list_xpath[_index]
            _xpath.reverse()
            list_xpaths_title.append(_xpath)
            list_title_top.append(list_top[_index])
            # Append the title's offset_top to each date candidate so
            # getCommonXpath_time can measure distance-to-title.
            for i in range(len(list_legal_time)):
                list_legal_time[i].append(list_top[_index])
                list_data_time.append(list_legal_time[i])
            _flag += 1
        else:
            add_err_msg(dict_rule_detail, data_title)
        if _flag==0:
            count_hrefs += 1
            if count_hrefs>=MAX_HREFS:
                break
    # Collapse each page's [xpath, removeList] chain to a plain xpath list.
    for item in list_xpath_remove_content:
        _xpath = []
        for _xpath_remove in item:
            _xpath.append(_xpath_remove[0])
        list_xpaths_content.append(_xpath)
    dict_rule_detail["detail_content"] = getCommonXpath(list_xpaths_content)
    # Intersect the removeLists of every page whose chain contains the
    # chosen content xpath.
    set_remove_list = None
    for item in list_xpath_remove_content:
        for _xpath_remove in item:
            if _xpath_remove[0]==dict_rule_detail["detail_content"]:
                if set_remove_list is None:
                    set_remove_list = set(_xpath_remove[1])
                else:
                    set_remove_list = set(_xpath_remove[1])&set_remove_list
    # Legacy date-xpath intersection; its result is no longer used (see the
    # commented-out assignment below) -- getCommonXpath_time is used instead.
    commonxpath_time = None
    if len(list_xpaths_time)>2:
        xpath_time_set = set(list_xpaths_time[0])
        for i in range(1, len(list_xpaths_time)):
            xpath_time_set = xpath_time_set&set(list_xpaths_time[i])
        if len(xpath_time_set)==1:
            commonxpath_time = xpath_time_set.pop()
    # dict_rule_detail["detail_date"] = commonxpath_time
    dict_rule_detail["detail_removeList"] = list(set_remove_list) if set_remove_list!=None else []
    dict_rule_detail["detail_date"] = getCommonXpath_time(list_data_time)
    dict_rule_detail["detail_title"] = getCommonXpath(list_xpaths_title)
    # NOTE(review): the try/finally guarding this driver was commented out;
    # an exception below would skip hd.adddriver and leak the driver.
    # try:
    browser = hd.getdriver()
    debug("get driver")
    if len(list_hrefs)>0:
        loadsucess = hd.loadPage(browser, list_hrefs[-1],)
        log('logPage: ')
        if loadsucess==False:
            browser = hd.getdriver()
        dict_rule_detail["hasDrew"] = dict_rule_detail["hasDrew"] or hd.hasDrew(list_hrefs[0], [{"type":"xpath","rule":dict_rule_detail["detail_content"]},
                                                                                               {"type":"xpath","rule":dict_rule_detail["detail_date"]},
                                                                                               {"type":"xpath","rule":dict_rule_detail["detail_title"]}])
        # Overly deep xpaths (>6 path segments) are rewritten in-page via JS
        # into a more robust form.
        if dict_rule_detail["detail_content"] is not None and len(dict_rule_detail["detail_content"].split("/"))>6:
            log("before being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
            # dict_rule_detail["detail_content"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_content"])
            dict_rule_detail["detail_content"] = get_js_rs(browser, scripts_replaceXpath,dict_rule_detail["detail_content"])
            log("after being replaced xpath of detail_content"+dict_rule_detail["detail_content"])
        if dict_rule_detail["detail_date"] is not None and len(dict_rule_detail["detail_date"].split("/"))>6:
            # dict_rule_detail["detail_date"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_date"])
            dict_rule_detail["detail_date"] = get_js_rs(browser, scripts_replaceXpath,dict_rule_detail["detail_date"])
        if dict_rule_detail["detail_title"] is not None and len(dict_rule_detail["detail_title"].split("/"))>6:
            # dict_rule_detail["detail_title"] = browser.execute_script(scripts_replaceXpath,dict_rule_detail["detail_title"])
            dict_rule_detail["detail_title"] = get_js_rs(browser, scripts_replaceXpath,dict_rule_detail["detail_title"])
    # finally:
    hd.adddriver(browser)
    debug("release driver")
    if dict_rule_detail["detail_content"] is not None and dict_rule_detail["detail_date"] is not None and dict_rule_detail["detail_title"] is not None:
        dict_rule_detail["flag"] = True
    else:
        dict_rule_detail["flag"] = False
        add_err_msg(dict_rule_detail, "#详情规则不完整#")
    return dict_rule_detail
if __name__=="__main__":
    # Manual smoke test: derive detail-page rules for a few sample URLs and
    # print the resulting rule dict.
    list_hrefs = ['http://www.beian.miit.gov.cn/', 'http://www.beian.gov.cn/portal/registerSystemInfo?recordcode=22240102000130', 'http://bszs.conac.cn/sitename?method=show&id=072BBE1E1F301B54E053022819AC1765', 'http://121.43.68.40/exposure/jiucuo.html?site_code=2224000007&url=http%3A%2F%2Fwww.ybzfcg.gov.cn%2Fnet%2Fcgxx.jsp%3Fopstatus%3Dxjgg']
    print(getRule_detail(list_hrefs))