Skip to content

Commit

Permalink
saving work on modularized
Browse files Browse the repository at this point in the history
  • Loading branch information
maouw committed Sep 19, 2023
1 parent 2510e58 commit 6b99d86
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 58 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,9 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/

# User-specific stuff
*.whl


86 changes: 83 additions & 3 deletions hyakvnc/__main__.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,86 @@
#! /usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import json
import base64
# from config import APPTAINER_CONFIGDIR, HYAKVNC_APPTAINER_INSTANCE_PREFIX
from pathlib import Path
from typing import Optional, Iterable, Union, List
import re
import logging
import subprocess


from dataclasses import dataclass
from .version import VERSION
from slurmutil import get_slurm_cluster, get_slurm_partitions, get_slurm_default_account, get_slurm_job_details

# Base VNC port cannot be changed due to vncserver not having a stable argument
# interface:
BASE_VNC_PORT = os.environ.setdefault("HYAKVNC_BASE_VNC_PORT", "5900")
# ^ Base VNC port cannot be changed due to vncserver not having a stable
#   argument interface; the default is also written back to the environment.

# Klone login node hostnames, given as a comma-separated list.
LOGIN_NODE_LIST = os.getenv(
    "HYAKVNC_LOGIN_NODES",
    "klone-login01,klone1.hyak.uw.edu,klone2.hyak.uw.edu",
).split(",")

# Apptainer (formerly Singularity) binary name.
APPTAINER_BIN = os.environ.setdefault("HYAKVNC_APPTAINER_BIN", "apptainer")

# File checked to see if klone is authorized for intracluster access.
AUTH_KEYS_FILEPATH = Path(
    os.environ.setdefault("HYAKVNC_AUTH_KEYS_FILEPATH", "~/.ssh/authorized_keys")
).expanduser()

# Bindpaths mount storage paths into the containerized environment.
# Precedence (highest first): $APPTAINER_BINDPATH, $HYAKVNC_APPTAINER_BINDPATH,
# $SINGULARITY_BINDPATH, then the built-in list. The winning value is stored
# back into $APPTAINER_BINDPATH when that variable was not already defined.
_bindpath = "/tmp,$HOME,$PWD,/gscratch,/opt,/:/hyak_root,/sw,/mmfs1"
for _bindpath_var in ("SINGULARITY_BINDPATH", "HYAKVNC_APPTAINER_BINDPATH"):
    _bindpath = os.environ.get(_bindpath_var, _bindpath)
APPTAINER_BINDPATH = os.environ.setdefault("APPTAINER_BINDPATH", _bindpath)
del _bindpath, _bindpath_var

# Apptainer configuration directory and the instances directory beneath it.
APPTAINER_CONFIGDIR = Path(os.environ.get("APPTAINER_CONFIGDIR", "~/.apptainer")).expanduser()
APPTAINER_INSTANCES_DIR = APPTAINER_CONFIGDIR.joinpath("instances")

# # SLURM UTILS

# Slurm configuration variables:
# NOTE: Python evaluates default arguments eagerly, so the sacctmgr-backed
# helpers below (get_slurm_cluster, get_slurm_default_account) are called at
# import time even when the environment already supplies a value.
SLURM_CLUSTER = os.getenv("HYAKVNC_SLURM_CLUSTER", os.getenv("SBATCH_CLUSTERS", get_slurm_cluster()).split(",")[0])
SLURM_ACCOUNT = os.environ.get(
    "HYAKVNC_SLURM_ACCOUNT",
    os.environ.setdefault("SBATCH_ACCOUNT", get_slurm_default_account(cluster=SLURM_CLUSTER)),
)
SLURM_GPUS = os.environ.setdefault("SBATCH_GPUS", "0")
SLURM_CPUS_PER_TASK = os.environ.setdefault("HYAKVNC_SLURM_CPUS_PER_TASK", "1")

# Module-level mirrors of the SBATCH_* environment variables. Each setdefault()
# keeps a value the user already exported and otherwise publishes the default.
SBATCH_CLUSTERS = os.environ.setdefault("SBATCH_CLUSTERS", SLURM_CLUSTER)
# Always present here: the setdefault() inside SLURM_ACCOUNT above populated it.
SBATCH_ACCOUNT = os.environ["SBATCH_ACCOUNT"]
SBATCH_GPUS = os.environ.setdefault("SBATCH_GPUS", "0")
SBATCH_TIMELIMIT = os.environ.setdefault("SBATCH_TIMELIMIT", "1:00:00")

HYAKVNC_SLURM_JOBNAME_PREFIX = os.getenv("HYAKVNC_SLURM_JOBNAME_PREFIX", "hyakvnc-")
# NOTE(review): an earlier draft of this line referenced
# HYAKVNC_APPTAINER_INSTANCE_PREFIX inside its own default (a NameError). The
# literal default is kept; confirm whether it should instead be derived from
# HYAKVNC_SLURM_JOBNAME_PREFIX + "vncserver-".
HYAKVNC_APPTAINER_INSTANCE_PREFIX = os.getenv("HYAKVNC_APPTAINER_INSTANCE_PREFIX", "hyakvnc-vncserver-")

# Pick a default partition from those available to the account.
found_sbatch_partitions = get_slurm_partitions(account=SBATCH_ACCOUNT, cluster=SBATCH_CLUSTERS)
# Sort so the chosen default is deterministic; drop empty names.
_partition_candidates = sorted(p for p in (found_sbatch_partitions or ()) if p)
if _partition_candidates:
    SBATCH_PARTITION = _partition_candidates[0]
    # NOTE(review): presumably meant to mirror SLURM_ACCOUNT's lookup chain
    # (HYAKVNC_* override, then SBATCH_*, then the discovered default); the
    # previous draft looked up the *account* here by copy-paste. Confirm.
    HYAKVNC_SLURM_PARTITION = os.environ.get(
        "HYAKVNC_SLURM_PARTITION",
        os.environ.setdefault("SBATCH_PARTITION", SBATCH_PARTITION),
    )

def check_remote_pid_exists_and_port_open(host: str, pid: int, port: int) -> bool:
    """Return True if process ``pid`` exists on ``host`` and ``port`` is open there.

    Runs ``ps -p <pid> && nc -z localhost <port>`` on ``host`` through ``ssh``
    and discards all output. No local shell is involved, so the ``&&`` is
    handed to ssh and interpreted on the remote side.

    Args:
        host: Host name passed to ``ssh`` as a single argument.
        pid: Process ID to look up with ``ps -p``.
        port: Port probed with ``nc -z localhost``.

    Returns:
        True when the ssh command exits with status 0, False for any other
        exit status.
    """
    remote_check = f"ps -p {pid} && nc -z localhost {port}"
    # Build argv explicitly rather than splitting an interpolated string, so a
    # host value containing whitespace cannot be broken into extra arguments.
    cmd = ["ssh", host, remote_check]
    res = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return res.returncode == 0


def get_apptainer_vnc_instances(apptainer_config_dir="~/.apptainer", instance_prefix: Optional[str] = "",
def get_apptainer_vnc_instances(apptainer_config_dir="~/.apptainer", instance_prefix: str ="hyakvnc-",
read_apptainer_config: bool = False):
appdir = Path(apptainer_config_dir).expanduser() / 'instances' / 'app'
assert appdir.exists(), f"Could not find apptainer instances dir at {appdir}"
Expand All @@ -27,7 +92,7 @@ def get_apptainer_vnc_instances(apptainer_config_dir="~/.apptainer", instance_pr
all_instance_json_files = appdir.rglob(instance_prefix + '*.json')

running_hyakvnc_json_files = {p: r.groupdict() for p in all_instance_json_files if (
r := re.match(r'(?P<prefix>hyakvnc-)(?P<jobid>\d+)-(?P<appinstance>.*)\.json', p.name))
r := re.match(rf'(?P<prefix>{instance_prefix})(?P<jobid>\d+)-(?P<appinstance>.*)\.json', p.name))
}
outs = []
# frr := re.search(r'\s+-rfbport\s+(?P<rfbport>\d+\b', fr)
Expand Down Expand Up @@ -95,3 +160,18 @@ def get_openssh_connection_string_for_instance(instance: dict, login_host: str,
port_on_client = port_on_client or port_on_node
s = f"ssh -v -f -o StrictHostKeyChecking=no -J {login_host} {compute_node} -L {port_on_client}:localhost:{port_on_node} sleep 10; vncviewer localhost:{port_on_client}"
return s




# NOTE(review): this function appears to be an unfinished work-in-progress stub.
# Judging by the example command below it is presumably meant to submit an
# sbatch job that starts an Apptainer instance -- confirm the intended design.
def create_job_with_container(container_path: str):
#sbatch -A escience --job-name hyakvnc-xubuntu -p gpu-a40 -c 4 --mem=8G --time=1:00:00 --wrap "apptainer instance start --cleanenv --writable-tmpfs

# NOTE(review): `container_path` is not referenced anywhere in the lines below,
# and `cmds` is never used after this assignment.
cmds = ["sbatch"]




# NOTE(review): `host`, `pid` and `port` are not defined in the visible code of
# this function; these three lines duplicate the body of
# check_remote_pid_exists_and_port_open and look like leftover paste -- confirm.
cmd = f"ssh {host} ps -p {pid} && nc -z localhost {port}".split()
res = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
return res.returncode == 0
47 changes: 0 additions & 47 deletions hyakvnc/config.py

This file was deleted.

12 changes: 5 additions & 7 deletions hyakvnc/slurmutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,31 +5,29 @@
import re
from typing import Optional, Iterable, Union, List

from .config import APPTAINER_CONFIGDIR
import json
from pathlib import Path


def get_slurmd_config():
    """Return the configuration printed by ``slurmd -C`` as a dict.

    The command output is treated as whitespace-separated ``KEY=VALUE``
    tokens, possibly spread over several lines. Each token is split at its
    first ``=``.

    Returns:
        dict mapping each key to its value, both as strings. Tokens without
        an ``=`` or with an empty key are ignored. The dict is empty when the
        command prints nothing.
    """
    cmd = ["slurmd", "-C"]
    output = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8').stdout
    config = {}
    # str.split() with no argument splits on any whitespace, newlines included,
    # so there is no need to break the output into lines first.
    for token in output.split():
        key, sep, value = token.partition("=")
        if sep and key:
            config[key] = value
    return config


def get_slurm_cluster():
    """Return the first cluster name reported by ``sacctmgr``.

    Runs ``sacctmgr show cluster -nPs format=Cluster`` once and returns the
    first non-empty line of its standard output.

    Raises:
        ValueError: If the command prints no non-empty line.
    """
    cmd = ["sacctmgr", "show", "cluster", "-nPs", "format=Cluster"]
    lines = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8').stdout.splitlines()
    cluster = next((line for line in lines if line), None)
    if cluster is None:
        raise ValueError("Could not find cluster name")
    return cluster


def get_slurm_default_account(user: Optional[str] = None, cluster: Optional[str] = None):
user = user or os.getlogin()
cluster = cluster or get_slurm_cluster()
cmd = f"sacctmgr show user -nPs {user} format=defaultaccount where cluster={cluster}".split()
res = subprocess.run(cmd, capture_output=True, encoding='utf-8').stdout.splitlines()
res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8').stdout.splitlines()
if any(default_account := x for x in res):
return default_account
else:
Expand All @@ -41,7 +39,7 @@ def get_slurm_partitions(user: Optional[str] = None, account: Optional[str] = No
cluster = cluster or get_slurm_cluster()
account = account or get_slurm_default_account(user=user, cluster=cluster)
cmd = f"sacctmgr show -nPs user {user} format=qos where account={account} cluster={cluster}".split()
res = subprocess.run(cmd, capture_output=True, encoding='utf-8').stdout.splitlines()
res = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8').stdout.splitlines()
if any(partitions := x for x in res):
return {x.strip(f"{account}-") for x in partitions.split(',')}
else:
Expand All @@ -67,6 +65,6 @@ def get_slurm_job_details(user: Optional[str] = None, jobs: Optional[Union[int,
jobs = [jobs]
jobs = ','.join([str(x) for x in jobs])
cmds += ['--jobs', jobs]
res = subprocess.run(cmds, capture_output=True, encoding="utf-8").stdout.splitlines()
res = subprocess.run(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding="utf-8").stdout.splitlines()
out = {x["JobId"]: x for x in [dict(zip(fields, line.split())) for line in res if line.strip()]}
return out

0 comments on commit 6b99d86

Please sign in to comment.