123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314 |
- import datetime
- import logging
- import os
- import re
- import sys
- import time
- import psutil
- sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
- from format_convert.utils import get_ip_port, get_intranet_ip, get_args_from_config, get_all_ip, get_using_ip
# Parse the configuration file and resolve the address this host is using.
ip_port_dict = get_ip_port()
ip = get_using_ip()
print("local ip:", ip)

# Shell suffixes appended to the launch commands: append stdout/stderr to a
# log file and run the command in the background.
std_out = " >>/convert.out 2>&1 &"
std_out_gpu = " >>/gpu.out 2>&1 &"
std_out_schedule = " >>/schedule.out 2>&1 &"

# Read the per-host settings and pre-build the launch command of every
# interface, so that restarting later is only a dictionary lookup.
python_path = get_args_from_config(ip_port_dict, ip, "python_path")[0]
project_path = get_args_from_config(ip_port_dict, ip, "project_path")[0]
gunicorn_path = get_args_from_config(ip_port_dict, ip, "gunicorn_path")[0]

interface_list = ['convert', 'ocr', 'otr', 'idc', 'isr', 'atc', 'yolo', 'office']

# interface name -> list of shell commands (one per configured port).
comm_dict = {}
# interface name -> [port_list, num_list, gpu_list] as read from the config.
interface_port_dict = {}

for name in interface_list:
    # Use the 'MASTER' entry of the interface when the config provides one,
    # otherwise fall back to the plain entry.
    _args = get_args_from_config(ip_port_dict, ip, name, 'MASTER')
    if not _args:
        _args = get_args_from_config(ip_port_dict, ip, name)
    port_list, num_list, gpu_list = _args[0]
    interface_port_dict[name] = [port_list, num_list, gpu_list]

    for i, port in enumerate(port_list):
        port_num = num_list[i]
        # A count of 0 means nothing has to be started for this port.
        if int(port_num) == 0:
            continue

        # Select the GPU for this port; -1 is used when no GPU list is configured.
        gpu = gpu_list[i] if gpu_list else -1
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu)
        gpu_comm = 'export CUDA_VISIBLE_DEVICES=' + str(gpu) + ' && '

        if name == 'office':
            # One docker container per port in [port, port + port_num); '#'
            # is the placeholder for the published host port.
            comm = "docker run --init -itd --log-opt max-size=10m --log-opt max-file=3 -p #:16000 soffice:v2 bash"
            first_port = int(port)
            office_port_comm_list = [
                comm.replace("#", str(office_port))
                for office_port in range(first_port, first_port + int(port_num))
            ]
            # Accumulate a real list of commands: storing the substituted
            # string itself would make later lookups iterate over characters.
            comm_dict.setdefault(name, []).extend(office_port_comm_list)
            continue

        # Every other interface is a gunicorn app with port_num workers.
        gunicorn_comm = "nohup " + gunicorn_path + " -w " + str(port_num) \
            + " -t 300 --keep-alive 600 -b 0.0.0.0:" + str(port) \
            + " --chdir " + project_path
        if name == 'convert':
            # The "/" separator matches the other branches; without it the
            # directory is only valid when project_path ends with a slash.
            comm = gunicorn_comm + "/format_convert" + ' ' + name + ":app" + std_out
        elif name == 'yolo':
            comm = gunicorn_comm + "/botr/yolov8" + ' ' + name + "_interface:app" + std_out_gpu
        else:
            comm = gunicorn_comm + "/" + name + ' ' + name + "_interface:app" + std_out_gpu

        comm_dict.setdefault(name, []).append(gpu_comm + comm)
- # print(name, port_list, num_list, gpu_list)
- # convert_port_list = get_args_from_config(ip_port_dict, ip, "convert", "MASTER")
- # if convert_port_list:
- # convert_port_list = convert_port_list[0]
- # ocr_port_list = get_args_from_config(ip_port_dict, ip, "ocr")
- # otr_port_list = get_args_from_config(ip_port_dict, ip, "otr")
- # idc_port_list = get_args_from_config(ip_port_dict, ip, "idc")
- # isr_port_list = get_args_from_config(ip_port_dict, ip, "isr")
- # atc_port_list = get_args_from_config(ip_port_dict, ip, "atc")
- # yolo_port_list = get_args_from_config(ip_port_dict, ip, "yolo")
- # soffice_port_list = get_args_from_config(ip_port_dict, ip, "office", "MASTER")
- # if soffice_port_list:
- # soffice_port_list = soffice_port_list[0]
- # python_path_list = get_args_from_config(ip_port_dict, ip, "python_path")
- # project_path_list = get_args_from_config(ip_port_dict, ip, "project_path")
- # gunicorn_path_list = get_args_from_config(ip_port_dict, ip, "gunicorn_path")
- # std_out = " >>/convert.out 2>&1 &"
- # std_out_gpu = " >>/gpu.out 2>&1 &"
- # std_out_schedule = " >>/schedule.out 2>&1 &"
- #
- # print("convert_port_list", convert_port_list)
- # print("ocr_port_list", ocr_port_list)
- # print("otr_port_list", otr_port_list)
- # print("idc_port_list", idc_port_list)
- # print("isr_port_list", isr_port_list)
- # print("atc_port_list", atc_port_list)
- # print("yolo_port_list", yolo_port_list)
- # print("soffice_port_list", soffice_port_list)
- #
- # # 根据port生成gunicorn语句
- # ocr_comm_list = []
- # otr_comm_list = []
- # isr_comm_list = []
- # idc_comm_list = []
- # atc_comm_list = []
- # yolo_comm_list = []
- # for i in range(len(ocr_port_list)):
- # ocr_comm_list.append("nohup " + gunicorn_path_list[i] + " -w " + str(len(ocr_port_list[i]))
- # + " -t 300 --keep-alive 600 -b 0.0.0.0:# --chdir "
- # + project_path_list[i] + "/ocr ocr_interface:app" + std_out_gpu)
- # for i in range(len(otr_port_list)):
- # otr_comm_list.append("nohup " + gunicorn_path_list[i] + " -w " + str(len(otr_port_list[i]))
- # + " -t 300 --keep-alive 600 -b 0.0.0.0:# --chdir "
- # + project_path_list[i] + "/otr otr_interface:app" + std_out_gpu)
- # for i in range(len(idc_port_list)):
- # idc_comm_list.append("nohup " + gunicorn_path_list[i] + " -w " + str(len(idc_port_list[i]))
- # + " -t 300 --keep-alive 600 -b 0.0.0.0:# --chdir "
- # + project_path_list[i] + "/idc idc_interface:app" + std_out_gpu)
- # for i in range(len(isr_port_list)):
- # isr_comm_list.append("nohup " + gunicorn_path_list[i] + " -w " + str(len(isr_port_list[i]))
- # + " -t 300 --keep-alive 600 -b 0.0.0.0:# --chdir "
- # + project_path_list[i] + "/isr isr_interface:app" + std_out_gpu)
- # for i in range(len(atc_port_list)):
- # atc_comm_list.append("nohup " + gunicorn_path_list[i] + " -w " + str(len(atc_port_list[i]))
- # + " -t 300 --keep-alive 600 -b 0.0.0.0:# --chdir "
- # + project_path_list[i] + "/atc atc_interface:app" + std_out_gpu)
- # for i in range(len(yolo_port_list)):
- # yolo_comm_list.append("nohup " + gunicorn_path_list[i] + " -w " + str(len(yolo_port_list[i]))
- # + " -t 300 --keep-alive 600 -b 0.0.0.0:# --chdir "
- # + project_path_list[i] + "/botr/yolov8 yolo_interface:app" + std_out_gpu)
- #
- # convert_comm = "nohup " + gunicorn_path_list[0] + " -w " + str(len(convert_port_list)) + " -t 300 -b 0.0.0.0:# --chdir " \
- # + project_path_list[0] + "/format_convert convert:app" + std_out
- # soffice_comm = "docker run --init -itd --log-opt max-size=10m --log-opt max-file=3 -p #:16000 soffice:v2 bash"
def get_port():
    """Return the local ports currently present in the connection table.

    Every connection reported by ``psutil.net_connections()`` contributes its
    local port, whatever its state.

    Returns:
        A list of unique port numbers as strings, sorted as strings.
    """
    unique_ports = {
        str(connection.laddr.port)
        for connection in psutil.net_connections()
    }
    return sorted(unique_ports)
def restart(interface_type, port, index=0):
    """Run the pre-built launch command(s) of an interface for one port.

    The commands generated for ``interface_type`` are looked up in
    ``comm_dict`` and every command whose text contains ``port`` is executed
    through the shell.

    Args:
        interface_type: Interface name, used as key of ``comm_dict``.
        port: Port of the service to start; compared as ``str(port)``.
        index: Unused; kept so that existing callers keep working.

    Raises:
        RuntimeError: If no command is configured for ``interface_type``.
    """
    _comm_list = comm_dict.get(interface_type)
    if not _comm_list:
        message = 'monitor_process_config restart command error! check config!'
        print(message)
        # A bare ``raise`` has no active exception here and would only report
        # "No active exception to reraise"; raise the same exception type
        # explicitly so the traceback carries the real reason.
        raise RuntimeError(message)

    # Tolerate a single command stored as a plain string: iterating it
    # directly would walk over its characters and never match a port.
    if isinstance(_comm_list, str):
        _comm_list = [_comm_list]

    port_text = str(port)
    for _comm in _comm_list:
        # NOTE: this is a substring test on the whole command line.
        if port_text in _comm:
            print(datetime.datetime.now(), "restart comm", _comm)
            os.system(_comm)
def kill_soffice(limit_sec=30):
    """Kill soffice processes that have been running for too long.

    Every process is inspected: the command line of zombie processes is
    printed, and each process whose executable path contains "soffice" and
    whose age is at least ``limit_sec`` seconds is killed.

    The scan is best-effort and never raises. Errors are handled per process,
    so one process that disappears or cannot be inspected no longer stops the
    scan of the remaining ones.

    Args:
        limit_sec: Minimum age, in seconds, for a soffice process to be killed.
    """
    try:
        pid_list = psutil.pids()
    except Exception as e:
        print(datetime.datetime.now(), "kill_soffice cannot list processes", repr(e))
        return

    for pid in pid_list:
        try:
            process = psutil.Process(pid)
            process_cmd = "".join(c + " " for c in process.cmdline())
            # Processes without a command line are ignored.
            if process_cmd.strip() == "":
                continue

            if process.status() == "zombie":
                print("zombie cmd", process_cmd)

            if re.search("soffice", process.exe()):
                run_time = time.time() - process.create_time()
                if run_time >= limit_sec:
                    print(datetime.datetime.now(), "kill process ", str(pid), str(process.exe()), str(run_time), ">", limit_sec)
                    # SIGKILL through psutil instead of spawning "kill -9".
                    process.kill()
        except psutil.Error:
            # The process exited meanwhile or may not be inspected; skip it.
            continue
        except Exception as e:
            print(datetime.datetime.now(), "kill_soffice error", str(pid), repr(e))
def kill_nested_timeout_process():
    """Kill stray "convert:app" processes that were re-parented to PID 1.

    All processes whose command line contains "convert:app" and whose parent
    PID is 1 are collected. The longest-running one is kept, because it is
    the main process of the interface; every other one is killed.

    The scan is best-effort and never raises. Errors are handled per process,
    so one process that disappears or cannot be inspected no longer stops the
    scan of the remaining ones.
    """
    try:
        pid_list = psutil.pids()
    except Exception as e:
        print(datetime.datetime.now(), "kill_nested_timeout_process cannot list processes", repr(e))
        return

    # Entries are [process, run_time, process_cmd]; the command line is kept
    # so that the kill log shows the process actually being killed.
    suspect_pid_list = []
    for pid in pid_list:
        try:
            process = psutil.Process(pid)
            process_cmd = "".join(c + " " for c in process.cmdline())
            if process_cmd.strip() == "":
                continue
            if not re.search("convert:app", process_cmd):
                continue
            if str(process.ppid()) != "1":
                continue
            run_time = time.time() - process.create_time()
            suspect_pid_list.append([process, float(run_time), process_cmd])
        except psutil.Error:
            # The process exited meanwhile or may not be inspected; skip it.
            continue
        except Exception as e:
            print(datetime.datetime.now(), "kill_nested_timeout_process error", str(pid), repr(e))

    # The longest-running process whose parent is PID 1 must not be killed:
    # it is the main process of the interface.
    if len(suspect_pid_list) <= 1:
        return

    suspect_pid_list.sort(key=lambda x: x[1], reverse=True)
    for process, run_time, process_cmd in suspect_pid_list[1:]:
        print(datetime.datetime.now(), "kill process ", str(process.pid), "father is 1", process_cmd)
        try:
            # SIGKILL through psutil instead of spawning "kill -9".
            process.kill()
        except psutil.Error:
            continue
def monitor():
    """Run one monitoring pass over all configured interfaces.

    For every interface in ``interface_list`` the configured ports are
    compared with the ports currently present on the machine, and
    ``restart`` is called for each enabled port that is missing. Afterwards
    long-running soffice processes and stray "convert:app" processes are
    cleaned up.

    Raises:
        RuntimeError: Propagated from ``restart`` when an enabled port has no
            launch command configured.
    """
    for _name in interface_list:
        _port_args = interface_port_dict.get(_name)
        if not _port_args:
            continue
        _port_list, _num_list, _gpu_list = _port_args

        current_ports = set(get_port())
        for j, p in enumerate(_port_list):
            # Ports configured with a count of 0 get no launch command when
            # the commands are generated, so there is nothing to restart.
            if int(_num_list[j]) == 0:
                continue
            if str(p) not in current_ports:
                restart(_name, p)

    kill_soffice()
    kill_nested_timeout_process()
- # if schedule_port_list:
- # for p in schedule_port_list:
- # if p not in current_port_list:
- # restart("schedule", p)
if __name__ == "__main__":
    # Three monitoring passes per invocation, each followed by a 10 second pause.
    remaining_rounds = 3
    while remaining_rounds > 0:
        monitor()
        time.sleep(10)
        remaining_rounds -= 1
|