import copy
import os
import random
import re
import sys
import time
from bs4 import BeautifulSoup
from datetime import datetime
from multiprocessing import Process
import datetime as dt

sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
from format_convert.utils import file_lock


def run():
    # Acquire the shared file lock, print a few ticks for this process, then release it.
    f = file_lock(os.path.abspath(os.path.dirname(__file__)) + '/19022.lock')
    print("acquire file_lock! process " + str(os.getpid()))
    for i in range(10):
        print("process " + str(os.getpid()) + " " + str(i))
        time.sleep(random.randint(0, 1))
    f.close()
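

# NOTE: the real format_convert.utils.file_lock imported above is not shown in
# this file. The class below is only a hypothetical sketch of what such a
# helper could look like, based on atomic exclusive creation of the lock file;
# it is an assumption, not the project's actual implementation.
class simple_file_lock:
    """Hypothetical stand-in: block until the lock file can be created exclusively."""

    def __init__(self, path, poll_interval=0.1):
        self.path = path
        # os.O_CREAT | os.O_EXCL fails if the file already exists, so only one
        # process can hold the lock at a time; the others poll until it is freed.
        while True:
            try:
                self.fd = os.open(path, os.O_CREAT | os.O_EXCL | os.O_RDWR)
                break
            except FileExistsError:
                time.sleep(poll_interval)

    def close(self):
        # Release the lock: close the descriptor and delete the lock file.
        os.close(self.fd)
        try:
            os.remove(self.path)
        except OSError:
            pass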


def merge_table():
    # Merge runs of adjacent <table> blocks that have the same number of
    # columns, padding with spaces so the overall string length is preserved.
    with open(r'C:\Users\Administrator\Desktop\2.html', 'r') as f:
        html_str = f.read()
    html_str_origin = copy.deepcopy(html_str)
    try:
        # Record the start/end offsets of every <table ...> ... </table> pair.
        match1 = re.finditer('<table', html_str)
        match2 = re.finditer('</table>', html_str)
        table_index_list = []
        for m1, m2 in zip(match1, match2):
            table_index_list.append([m1.span()[0], m1.span()[1], m2.span()[0], m2.span()[1]])
        print(table_index_list)

        # Count the <td> cells in the first row of each table.
        soup = BeautifulSoup(html_str, 'html.parser')
        tables = soup.find_all('table')
        table_td_cnt_list = []
        for table in tables:
            tds = table.tr.find_all('td')
            table_td_cnt_list.append(len(list(tds)))
        print(table_td_cnt_list)

        if len(table_index_list) == len(table_td_cnt_list):
            # Group runs of tables that are directly adjacent (nothing between
            # them) and share the same column count.
            merge_index_list = []
            temp_index = []
            for i in range(1, len(table_index_list)):
                last_index = table_index_list[i - 1]
                index = table_index_list[i]
                last_tds = table_td_cnt_list[i - 1]
                tds = table_td_cnt_list[i]
                if index[0] - last_index[-1] == 0 and last_tds == tds:
                    temp_index += [i - 1, i]
                    # sorted() keeps the indices ordered after deduplication.
                    temp_index = sorted(set(temp_index))
                else:
                    if temp_index:
                        merge_index_list.append(temp_index)
                    temp_index = []
            if temp_index:
                merge_index_list.append(temp_index)
            print(merge_index_list)

            print('before len(html_str)', len(html_str))
            for merge in merge_index_list:
                # Strip the inner <table>/</table> tags of the run, re-wrap the
                # rows in a single table, and pad with spaces so the replaced
                # slice keeps its original length.
                start_index = table_index_list[merge[0]][0]
                end_index = table_index_list[merge[-1]][-1]
                table_replace = re.sub('<table[^>]*>|</table>', '', html_str[start_index:end_index])
                table_replace = '<table border="1">' + table_replace + '</table>'
                table_replace += ' ' * (end_index - start_index - len(table_replace))
                html_str = html_str[:start_index] + table_replace + html_str[end_index:]
            print('after len(html_str)', len(html_str))

            if len(html_str_origin) == len(html_str):
                with open(r'C:\Users\Administrator\Desktop\3.html', 'w') as f:
                    f.write(html_str)
                return html_str
            else:
                return html_str_origin
        else:
            return html_str_origin
    except Exception:
        # On any failure, fall back to the untouched HTML.
        return html_str_origin
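

# Illustration only: merge_table() above works on a fixed file path, so the
# hypothetical demo below applies the same "strip inner tags and re-wrap" step
# to an inline snippet of two adjacent tables with matching column counts.
# It is not part of the original script.
def _merge_table_demo():
    html = ('<table border="1"><tr><td>a</td><td>b</td></tr></table>'
            '<table border="1"><tr><td>c</td><td>d</td></tr></table>')
    # Same substitution merge_table() uses on each merge span: drop every
    # <table ...> and </table> tag, then wrap the remaining rows in one table.
    merged = '<table border="1">' + re.sub('<table[^>]*>|</table>', '', html) + '</table>'
    print(merged)  # -> one <table> holding both rows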


if __name__ == '__main__':
    # process_list = []
    # for j in range(10):
    #     p1 = Process(target=run,)
    #     p1.start()
    #     process_list.append(p1)
    #
    # for p in process_list:
    #     p.join()

    print('|'.join(['a', 'n']))

    # Date arithmetic scratch work: difference in days between two dates,
    # then a date shifted forward by ten days.
    _t = datetime.strptime('2023-04-26', '%Y-%m-%d')
    _t2 = datetime.strptime('2023-04-02', '%Y-%m-%d')
    print(abs((_t2 - _t).days))
    print(datetime.strftime(_t + dt.timedelta(days=10), '%Y-%m-%d'))

    # merge_table()
    print(datetime.now())