"""Created on August 8, 2019.

@author: User

Demo script: locate the page number inside a "next page" URL by comparing it
with the main URL, then split the next-page URL into the text before and
after that number.
"""
import re

# Characters that delimit URL segments. Inside a character class ``|`` is a
# literal pipe, so this pattern splits on ``/``, ``|``, ``&`` and ``?``.
_SEPARATOR_RE = re.compile(r'[/|&|?]')
_NUMBER_RE = re.compile(r'\d+')


def unique_segments(url, other_url):
    """Return the segments of *url* that do not occur in *other_url*.

    Segments are de-duplicated and returned in the order they appear in
    *url*, so the result is deterministic (a plain set difference is not).
    """
    other = set(_SEPARATOR_RE.split(other_url))
    ordered = dict.fromkeys(_SEPARATOR_RE.split(url))
    return [segment for segment in ordered if segment not in other]


def split_url_at_page_number(main_url, next_page_url):
    """Split *next_page_url* around the number that sets it apart from *main_url*.

    The page number is the first run of digits, inside a segment that exists
    only in *next_page_url*, that does not also appear as a number in the
    segments that exist only in *main_url*.

    Args:
        main_url: URL of the reference (main) page.
        next_page_url: URL of the following page.

    Returns:
        A ``(prefix, suffix)`` tuple: the parts of *next_page_url* before and
        after the page number.

    Raises:
        ValueError: if no such number can be found.
    """
    main_segments = set(_SEPARATOR_RE.split(main_url))

    # Numbers that already occur in the main URL's own distinguishing
    # segments are not treated as the page number.
    main_numbers = set()
    for segment in unique_segments(main_url, next_page_url):
        main_numbers.update(_NUMBER_RE.findall(segment))

    offset = 0
    for segment in _SEPARATOR_RE.split(next_page_url):
        if segment not in main_segments:
            # Use match spans rather than str.find so that a page number
            # such as "2" is never located inside a longer number like "12".
            for match in _NUMBER_RE.finditer(segment):
                if match.group() not in main_numbers:
                    return (next_page_url[:offset + match.start()],
                            next_page_url[offset + match.end():])
        # Every separator matched by _SEPARATOR_RE is exactly one character.
        offset += len(segment) + 1

    raise ValueError(
        "no distinguishing page number found in %r relative to %r"
        % (next_page_url, main_url)
    )


def main():
    """Run the demo on two sample URLs and print the intermediate results."""
    main_url = "www.baidu.com/343/acb/36.psp"
    next_page_url = "www.baidu.com/343/ab/1.html"
    print(re.split(r'(\d+)', main_url))
    print(unique_segments(next_page_url, main_url))
    print(unique_segments(main_url, next_page_url))
    first_part, second_part = split_url_at_page_number(main_url, next_page_url)
    print(first_part, second_part)


if __name__ == "__main__":
    main()