# test_walk.py — scratch tests: file_lock demo, HTML table merging, datetime checks.
  1. import copy
  2. import os
  3. import random
  4. import re
  5. import sys
  6. import time
  7. from bs4 import BeautifulSoup
  8. from datetime import datetime
  9. from multiprocessing import Process
  10. import datetime as dt
  11. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  12. from format_convert.utils import file_lock
  13. def run():
  14. f = file_lock(os.path.abspath(os.path.dirname(__file__)) + '/19022.lock')
  15. print("acquire file_lock! process " + str(os.getpid()))
  16. for i in range(10):
  17. print("process " + str(os.getpid()) + " " + str(i))
  18. time.sleep(random.randint(0, 1))
  19. f.close()
  20. def merge_table():
  21. with open(r'C:\Users\Administrator\Desktop\2.html', 'r') as f:
  22. html_str = f.read()
  23. html_str_origin = copy.deepcopy(html_str)
  24. try:
  25. match1 = re.finditer('<table', html_str)
  26. match2 = re.finditer('</table>', html_str)
  27. table_index_list = []
  28. for m1, m2 in zip(match1, match2):
  29. table_index_list.append([m1.span()[0], m1.span()[1], m2.span()[0], m2.span()[1]])
  30. print(table_index_list)
  31. soup = BeautifulSoup(html_str)
  32. tables = soup.find_all('table')
  33. table_td_cnt_list = []
  34. for table in tables:
  35. tds = table.tr.find_all('td')
  36. table_td_cnt_list.append(len(list(tds)))
  37. print(table_td_cnt_list)
  38. if len(table_index_list) == len(table_td_cnt_list):
  39. merge_index_list = []
  40. temp_index = []
  41. for i in range(1, len(table_index_list)):
  42. last_index = table_index_list[i-1]
  43. index = table_index_list[i]
  44. last_tds = table_td_cnt_list[i-1]
  45. tds = table_td_cnt_list[i]
  46. if index[0] - last_index[-1] == 0 and last_tds == tds:
  47. temp_index += [i-1, i]
  48. temp_index = list(set(temp_index))
  49. else:
  50. if temp_index:
  51. merge_index_list.append(temp_index)
  52. temp_index = []
  53. if temp_index:
  54. merge_index_list.append(temp_index)
  55. print(merge_index_list)
  56. print('before len(html_str)', len(html_str))
  57. for merge in merge_index_list:
  58. start_index = table_index_list[merge[0]][0]
  59. end_index = table_index_list[merge[-1]][-1]
  60. table_replace = re.sub('<table[^>]*>|</table>', '', html_str[start_index:end_index])
  61. table_replace = '<table border="1">' + table_replace + '</table>'
  62. table_replace += ' '*(end_index-start_index-len(table_replace))
  63. html_str = html_str[:start_index] + table_replace + html_str[end_index:]
  64. print('after len(html_str)', len(html_str))
  65. if len(html_str_origin) == len(html_str):
  66. with open(r'C:\Users\Administrator\Desktop\3.html', 'w') as f:
  67. f.write(html_str)
  68. return html_str
  69. else:
  70. return html_str_origin
  71. else:
  72. return html_str_origin
  73. except:
  74. return html_str_origin
  75. if __name__ == '__main__':
  76. # process_list = []
  77. # for j in range(10):
  78. # p1 = Process(target=run,)
  79. # p1.start()
  80. # process_list.append(p1)
  81. #
  82. # for p in process_list:
  83. # p.join()
  84. print('|'.join(['a', 'n']))
  85. _t = datetime.strptime('2023-04-26', '%Y-%m-%d')
  86. _t2 = datetime.strptime('2023-04-02', '%Y-%m-%d')
  87. print(abs((_t2-_t).days))
  88. print(datetime.strftime(_t + dt.timedelta(days=10), '%Y-%m-%d'))
  89. # merge_table()
  90. print(datetime.now())