From 6b99d86eadbcb7f800e14a1c924359a322ceab89 Mon Sep 17 00:00:00 2001 From: Altan Orhon Date: Tue, 19 Sep 2023 09:57:02 -0700 Subject: [PATCH] saving work on modularized --- .gitignore | 4 ++- hyakvnc/__main__.py | 86 ++++++++++++++++++++++++++++++++++++++++++-- hyakvnc/config.py | 47 ------------------------ hyakvnc/slurmutil.py | 12 +++---- 4 files changed, 91 insertions(+), 58 deletions(-) delete mode 100755 hyakvnc/config.py diff --git a/.gitignore b/.gitignore index fbffe1a..6510bee 100644 --- a/.gitignore +++ b/.gitignore @@ -158,7 +158,9 @@ cython_debug/ # be foud at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ +.idea/ # User-specific stuff *.whl + + diff --git a/hyakvnc/__main__.py b/hyakvnc/__main__.py index 94a0bc4..cd3d64f 100644 --- a/hyakvnc/__main__.py +++ b/hyakvnc/__main__.py @@ -1,7 +1,9 @@ +#! /usr/bin/env python3 +# -*- coding: utf-8 -*- + import os import json import base64 -# from config import APPTAINER_CONFIGDIR, HYAKVNC_APPTAINER_INSTANCE_PREFIX from pathlib import Path from typing import Optional, Iterable, Union, List import re @@ -9,13 +11,76 @@ import subprocess +from dataclasses import dataclass +from .version import VERSION +from slurmutil import get_slurm_cluster, get_slurm_partitions, get_slurm_default_account, get_slurm_job_details + +# Base VNC port cannot be changed due to vncserver not having a stable argument +# interface: +BASE_VNC_PORT = os.environ.setdefault("HYAKVNC_BASE_VNC_PORT", "5900") + +# List of Klone login node hostnames +LOGIN_NODE_LIST = os.environ.get("HYAKVNC_LOGIN_NODES", "klone-login01,klone1.hyak.uw.edu,klone2.hyak.uw.edu").split( + ",") + +# Name of Apptainer binary (formerly Singularity) +APPTAINER_BIN = os.environ.setdefault("HYAKVNC_APPTAINER_BIN", "apptainer") + +# Checked to see if klone is authorized for intracluster access +AUTH_KEYS_FILEPATH = Path(os.environ.setdefault("HYAKVNC_AUTH_KEYS_FILEPATH", "~/.ssh/authorized_keys")).expanduser() + +# Apptainer bindpaths can be overwritten if $APPTAINER_BINDPATH is defined. +# Bindpaths are used to mount storage paths to containerized environment. +APPTAINER_BINDPATH = os.environ.setdefault("APPTAINER_BINDPATH", + os.environ.get("HYAKVNC_APPTAINER_BINDPATH", + os.environ.get("SINGULARITY_BINDPATH", + "/tmp,$HOME,$PWD,/gscratch,/opt,/:/hyak_root,/sw,/mmfs1"))) + +APPTAINER_CONFIGDIR = Path(os.getenv("APPTAINER_CONFIGDIR", "~/.apptainer")).expanduser() +APPTAINER_INSTANCES_DIR = APPTAINER_CONFIGDIR / "instances" + +# # SLURM UTILS + +# Slurm configuration variables: +SLURM_CLUSTER = os.getenv("HYAKVNC_SLURM_CLUSTER", os.getenv("SBATCH_CLUSTERS", get_slurm_cluster()).split(",")[0]) +SLURM_ACCOUNT = os.environ.get("HYAKVNC_SLURM_ACCOUNT", os.environ.setdefault("SBATCH_ACCOUNT", + get_slurm_default_account( + cluster=SLURM_CLUSTER))) +SLURM_GPUS = os.environ.setdefault("SBATCH_GPUS", "0") +SLURM_CPUS_PER_TASK = os.environ.setdefault("HYAKVNC_SLURM_CPUS_PER_TASK", "1") +SBATCH_GPUS = os.environ.setdefault("SBATCH_GPUS", "0") +SBATCH_TIMELIMIT = os.environ.setdefault("SBATCH_TIMELIMIT", "1:00:00") + +HYAKVNC_SLURM_JOBNAME_PREFIX = os.getenv("HYAKVNC_SLURM_JOBNAME_PREFIX", "hyakvnc-") +HYAKVNC_APPTAINER_INSTANCE_PREFIX = os.getenv("HYAKVNC_APPTAINER_INSTANCE_PREFIX", HYAKVNC_APPTAINER_INSTANCE_PREFIX + "vncserver-") + + +SBATCH_CLUSTERS = os.environ.setdefault("SBATCH_CLUSTERS", SLURM_CLUSTER) + +found_sbatch_partitions = get_slurm_partitions(account=SBATCH_ACCOUNT, cluster=SBATCH_CLUSTERS) +if found_sbatch_partitions: + HYAKVNC_SLURM_PARTITION = os.environ.get("HYAKVNC_SLURM_PARTITION", os.environ.setdefault("SBATCH_ACCOUNT", + get_slurm_default_account( + cluster=SLURM_CLUSTER))) + +SB + +if any(SBATCH_PARTITION := x for x in get_slurm_partitions(account=SBATCH_ACCOUNT, cluster=SBATCH_CLUSTERS)): + os.environ.setdefault("SBATCH_PARTITION", SBATCH_PARTITION) + +SBATCH_GPUS = os.environ.setdefault("SBATCH_GPUS", "0") +SBATCH_TIMELIMIT = os.environ.setdefault("SBATCH_TIMELIMIT", "1:00:00") + +HYAKVNC_SLURM_JOBNAME_PREFIX = os.getenv("HYAKVNC_SLURM_JOBNAME_PREFIX", "hyakvnc-") +HYAKVNC_APPTAINER_INSTANCE_PREFIX = os.getenv("HYAKVNC_APPTAINER_INSTANCE_PREFIX", "hyakvnc-vncserver-") + def check_remote_pid_exists_and_port_open(host: str, pid: int, port: int) -> bool: cmd = f"ssh {host} ps -p {pid} && nc -z localhost {port}".split() res = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) return res.returncode == 0 -def get_apptainer_vnc_instances(apptainer_config_dir="~/.apptainer", instance_prefix: Optional[str] = "", +def get_apptainer_vnc_instances(apptainer_config_dir="~/.apptainer", instance_prefix: str ="hyakvnc-", read_apptainer_config: bool = False): appdir = Path(apptainer_config_dir).expanduser() / 'instances' / 'app' assert appdir.exists(), f"Could not find apptainer instances dir at {appdir}" @@ -27,7 +92,7 @@ def get_apptainer_vnc_instances(apptainer_config_dir="~/.apptainer", instance_pr all_instance_json_files = appdir.rglob(instance_prefix + '*.json') running_hyakvnc_json_files = {p: r.groupdict() for p in all_instance_json_files if ( - r := re.match(r'(?Phyakvnc-)(?P\d+)-(?P.*)\.json', p.name)) + r := re.match(rf'(?P{instance_prefix})(?P\d+)-(?P.*)\.json', p.name)) } outs = [] # frr := re.search(r'\s+-rfbport\s+(?P\d+\b', fr) @@ -95,3 +160,18 @@ def get_openssh_connection_string_for_instance(instance: dict, login_host: str, port_on_client = port_on_client or port_on_node s = f"ssh -v -f -o StrictHostKeyChecking=no -J {login_host} {compute_node} -L {port_on_client}:localhost:{port_on_node} sleep 10; vncviewer localhost:{port_on_client}" return s + + + + +def create_job_with_container(container_path: str): + #sbatch -A escience --job-name hyakvnc-xubuntu -p gpu-a40 -c 4 --mem=8G --time=1:00:00 --wrap "apptainer instance start --cleanenv --writable-tmpfs + + cmds = ["sbatch"] + + + + + cmd = f"ssh {host} ps -p {pid} && nc -z localhost {port}".split() + res = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + return res.returncode == 0 \ No newline at end of file diff --git a/hyakvnc/config.py b/hyakvnc/config.py deleted file mode 100755 index 77e2341..0000000 --- a/hyakvnc/config.py +++ /dev/null @@ -1,47 +0,0 @@ -import os -from pathlib import Path -from dataclasses import dataclass -from .version import VERSION -from slurmutil import get_slurm_cluster, get_slurm_partitions, get_slurm_default_account, get_slurm_job_details - -# Base VNC port cannot be changed due to vncserver not having a stable argument -# interface: -BASE_VNC_PORT = os.environ.setdefault("HYAKVNC_BASE_VNC_PORT", "5900") - -# List of Klone login node hostnames -LOGIN_NODE_LIST = os.environ.get("HYAKVNC_LOGIN_NODES", "klone-login01,klone1.hyak.uw.edu,klone2.hyak.uw.edu").split( - ",") - -# Name of Apptainer binary (formerly Singularity) -APPTAINER_BIN = os.environ.setdefault("HYAKVNC_APPTAINER_BIN", "apptainer") - -# Checked to see if klone is authorized for intracluster access -AUTH_KEYS_FILEPATH = Path(os.environ.setdefault("HYAKVNC_AUTH_KEYS_FILEPATH", "~/.ssh/authorized_keys")).expanduser() - -# Apptainer bindpaths can be overwritten if $APPTAINER_BINDPATH is defined. -# Bindpaths are used to mount storage paths to containerized environment. -APPTAINER_BINDPATH = os.environ.setdefault("APPTAINER_BINDPATH", - os.environ.get("HYAKVNC_APPTAINER_BINDPATH", - os.environ.get("SINGULARITY_BINDPATH", - "/tmp,$HOME,$PWD,/gscratch,/opt,/:/hyak_root,/sw,/mmfs1"))) - -APPTAINER_CONFIGDIR = Path(os.getenv("APPTAINER_CONFIGDIR", "~/.apptainer")).expanduser() -APPTAINER_INSTANCES_DIR = APPTAINER_CONFIGDIR / "instances" - -# # SLURM UTILS - -# Slurm configuration variables: -SLURM_CLUSTER = os.getenv("HYAKVNC_SLURM_CLUSTER", os.getenv("SBATCH_CLUSTERS", get_slurm_cluster()).split(",")[0]) -SBATCH_CLUSTERS = os.environ.setdefault("SBATCH_CLUSTERS", SLURM_CLUSTER) -SBATCH_ACCOUNT = os.environ.get("HYAKVNC_SLURM_ACCOUNT", os.environ.setdefault("SBATCH_ACCOUNT", - get_slurm_default_account( - cluster=SLURM_CLUSTER))) - -if any(SBATCH_PARTITION := x for x in get_slurm_partitions(account=SBATCH_ACCOUNT, cluster=SBATCH_CLUSTERS)): - os.environ.setdefault("SBATCH_PARTITION", SBATCH_PARTITION) - -SBATCH_GPUS = os.environ.setdefault("SBATCH_GPUS", "0") -SBATCH_TIMELIMIT = os.environ.setdefault("SBATCH_TIMELIMIT", "1:00:00") - -HYAKVNC_SLURM_JOBNAME_PREFIX = os.getenv("HYAKVNC_SLURM_JOBNAME_PREFIX", "hyakvnc-") -HYAKVNC_APPTAINER_INSTANCE_PREFIX = os.getenv("HYAKVNC_APPTAINER_INSTANCE_PREFIX", "hyakvnc-vncserver-") diff --git a/hyakvnc/slurmutil.py b/hyakvnc/slurmutil.py index 1729d86..41d64dd 100755 --- a/hyakvnc/slurmutil.py +++ b/hyakvnc/slurmutil.py @@ -5,31 +5,29 @@ import re from typing import Optional, Iterable, Union, List -from .config import APPTAINER_CONFIGDIR import json from pathlib import Path def get_slurmd_config(): cmd = f"slurmd -C".split() - res = subprocess.run(cmd, capture_output=True, encoding='utf-8').stdout.splitlines() + res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8').stdout.splitlines() return dict([re.match(r'([^=]+)+=(.*)', k).groups() for k in res.split()]) def get_slurm_cluster(): cmd = f"sacctmgr show cluster -nPs format=Cluster".split() - res = subprocess.run(cmd, capture_output=True, encoding='utf-8').stdout.splitlines() + res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8').stdout.splitlines() if any(cluster := x for x in res): return cluster else: raise ValueError("Could not find cluster name") - def get_slurm_default_account(user: Optional[str] = None, cluster: Optional[str] = None): user = user or os.getlogin() cluster = cluster or get_slurm_cluster() cmd = f"sacctmgr show user -nPs {user} format=defaultaccount where cluster={cluster}".split() - res = subprocess.run(cmd, capture_output=True, encoding='utf-8').stdout.splitlines() + res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8').stdout.splitlines() if any(default_account := x for x in res): return default_account else: @@ -41,7 +39,7 @@ def get_slurm_partitions(user: Optional[str] = None, account: Optional[str] = No cluster = cluster or get_slurm_cluster() account = account or get_slurm_default_account(user=user, cluster=cluster) cmd = f"sacctmgr show -nPs user {user} format=qos where account={account} cluster={cluster}".split() - res = subprocess.run(cmd, capture_output=True, encoding='utf-8').stdout.splitlines() + res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8').stdout.splitlines() if any(partitions := x for x in res): return {x.strip(f"{account}-") for x in partitions.split(',')} else: @@ -67,6 +65,6 @@ def get_slurm_job_details(user: Optional[str] = None, jobs: Optional[Union[int, jobs = [jobs] jobs = ','.join([str(x) for x in jobs]) cmds += ['--jobs', jobs] - res = subprocess.run(cmds, capture_output=True, encoding="utf-8").stdout.splitlines() + res = subprocess.run(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding="utf-8").stdout.splitlines() out = {x["JobId"]: x for x in [dict(zip(fields, line.split())) for line in res if line.strip()]} return out