From f75e3a50e670bea8e44360ea69d9a7a3d4f86a64 Mon Sep 17 00:00:00 2001
From: Satya Ortiz-Gagne <satya.ortiz-gagne@mila.quebec>
Date: Tue, 17 Sep 2024 21:42:25 -0400
Subject: [PATCH] Add slurm system setup

---
 config/base.yaml                              |   7 +-
 config/cloud-multinodes-system.yaml           |  13 +
 config/cloud-system.yaml                      |  24 +
 config/examples/cloud-multinodes-system.yaml  |  10 +
 config/examples/cloud-system.yaml             |  11 +
 docs/usage.rst                                |  89 +++-
 milabench/cli/cloud.py                        |  51 +-
 milabench/commands/__init__.py                |  11 +-
 milabench/scripts/covalent/__main__.py        | 463 ++++++++++++++----
 milabench/scripts/covalent/covalent_bashrc.sh |  46 ++
 .../scripts/covalent/milabench_bashrc.sh      |  31 ++
 .../scripts/covalent/python3/__main__.py      |  20 +
 milabench/scripts/covalent/requirements.txt   |   3 +-
 milabench/scripts/utils.py                    |  15 +-
 14 files changed, 669 insertions(+), 125 deletions(-)
 create mode 100644 milabench/scripts/covalent/covalent_bashrc.sh
 create mode 100644 milabench/scripts/covalent/milabench_bashrc.sh
 create mode 100644 milabench/scripts/covalent/python3/__main__.py

diff --git a/config/base.yaml b/config/base.yaml
index f15333458..2509a5132 100644
--- a/config/base.yaml
+++ b/config/base.yaml
@@ -28,6 +28,7 @@ _torchvision:
     --loader: pytorch
     --data: "{milabench_data}/FakeImageNet"
 
+
 _torchvision_ddp:
   inherits: _defaults
   definition: ../benchmarks/torchvision_ddp
@@ -113,6 +114,7 @@ _timm:
     --dataset: "FakeImageNet"
     --workers: "auto({n_worker}, 8)"
 
+
 _accelerate_opt:
   inherits: _defaults
   tags:
@@ -149,6 +151,7 @@ _accelerate_opt:
   use_deepspeed: true
   num_machines: 1
 
+
 fp16:
   inherits: _flops
 
@@ -388,6 +391,7 @@ brax:
     --num-minibatches: 32
     --num-envs: 8192
 
+
 _diffusion:
   inherits: _defaults
   definition: ../benchmarks/diffusion
@@ -530,11 +534,11 @@ _llm:
   definition: ../benchmarks/llm
   install_group: torch
 
+
 llm-lora-single:
   inherits: _llm
   plan:
     method: per_gpu
-  
   argv:
     "{milabench_code}/recipes/lora_finetune_single_device.py": true
     --config: "{milabench_code}/configs/llama3_8B_lora_single_device.yaml"
@@ -596,6 +600,7 @@ llm-lora-ddp-nodes:
   requires_capabilities:
     - "len(nodes) >= ${num_machines}"
 
+
 llm-lora-mp-gpus:
   inherits: _llm
   plan:
diff --git a/config/cloud-multinodes-system.yaml b/config/cloud-multinodes-system.yaml
index 4f7fae391..dd6e8712c 100644
--- a/config/cloud-multinodes-system.yaml
+++ b/config/cloud-multinodes-system.yaml
@@ -38,3 +38,16 @@ system:
       size: Standard_NV72ads_A10_v5
       location: eastus2
       disk_size: 512
+    slurm__a100_x2:
+      address: localhost
+      bashrc_path: "{bashrc_path}"
+      remote_workdir: "scratch/cov-{job_uuid}-workdir"
+      use_srun: null
+      options:
+        ntasks-per-node: 1
+        gpus-per-task: a100l:2
+        cpus-per-task: 12
+        time: "3:0:0"
+        mem: 500000
+        partition: short-unkillable
+        nodelist: cn-g[001-029]
diff --git a/config/cloud-system.yaml b/config/cloud-system.yaml
index 056ef3640..8401be63c 100644
--- a/config/cloud-system.yaml
+++ b/config/cloud-system.yaml
@@ -38,3 +38,27 @@ system:
       size: Standard_NV72ads_A10_v5
       location: eastus2
       disk_size: 512
+    slurm__a100_x1:
+      address: localhost
+      bashrc_path: "{bashrc_path}"
+      remote_workdir: "scratch/cov-{job_uuid}-workdir"
+      use_srun: null
+      options:
+        ntasks-per-node: 1
+        gpus-per-task: a100l:1
+        cpus-per-task: 6
+        time: "3:0:0"
+        mem: 32000
+        partition: unkillable
+    slurm__a100_x4:
+      address: localhost
+      bashrc_path: "{bashrc_path}"
+      remote_workdir: "scratch/cov-{job_uuid}-workdir"
+      use_srun: null
+      options:
+        ntasks-per-node: 1
+        gpus-per-task: a100l:4
+        cpus-per-task: 24
+        time: "3:0:0"
+        mem: 1000000
+        partition: short-unkillable
diff --git a/config/examples/cloud-multinodes-system.yaml b/config/examples/cloud-multinodes-system.yaml
index 5066af5eb..27f1e41b1 100644
--- a/config/examples/cloud-multinodes-system.yaml
+++ b/config/examples/cloud-multinodes-system.yaml
@@ -35,3 +35,13 @@ system:
       volume_size: 8
       region: us-east-2
       state_id: 71669879043a3864225aabb94f91a2d4
+    slurm:
+      address: localhost
+      bashrc_path: "{bashrc_path}"
+      remote_workdir: "scratch/cov-{job_uuid}-workdir"
+      use_srun: null
+      options:
+        ntasks-per-node: 1
+        cpus-per-task: 1
+        time: "0:30:0"
+        mem: 1000
diff --git a/config/examples/cloud-system.yaml b/config/examples/cloud-system.yaml
index b3d1f70aa..d3d3942cc 100644
--- a/config/examples/cloud-system.yaml
+++ b/config/examples/cloud-system.yaml
@@ -28,3 +28,14 @@ system:
       instance_type: t2.micro
       volume_size: 8
       region: us-east-2
+    slurm:
+      # covalent-slurm-plugin args
+      address: localhost
+      bashrc_path: "{bashrc_path}"
+      remote_workdir: "scratch/cov-{job_uuid}-workdir"
+      use_srun: null
+      options:
+        ntasks-per-node: 1
+        cpus-per-task: 1
+        time: "0:30:0"
+        mem: 1000
diff --git a/docs/usage.rst b/docs/usage.rst
index b2a25d85d..4f0fa6f13 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -102,7 +102,7 @@ Create a cloud system configuration
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Add a ``cloud_profiles`` section to the ``system`` configuration which lists the
-supported cloud profiles.
+supported cloud and slurm profiles.
 
 .. notes::
 
@@ -150,14 +150,95 @@ Run milabench on the cloud
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 1. | Initialize the cloud instances
-   | ``milabench cloud --system {{SYSTEM_CONFIG.YAML}} --setup --run-on {{PROFILE}} >{{SYSTEM_CLOUD_CONFIG.YAML}}``
+   | ``milabench cloud --setup --system {{SYSTEM_CONFIG.YAML}} --run-on {{PROFILE}} >{{SYSTEM_CLOUD_CONFIG.YAML}}``
 
 2. | Prepare, install and run milabench
    | ``milabench [prepare|install|run] --system {{SYSTEM_CLOUD_CONFIG.YAML}}``
 
 3. | Destroy the cloud instances
-   | ``milabench teardown --system {{SYSTEM_CLOUD_CONFIG.YAML}} --run-on {{PROFILE}}``
+   | ``milabench cloud --teardown --system {{SYSTEM_CLOUD_CONFIG.YAML}} --run-on {{PROFILE}}``
    | or
-   | ``milabench teardown --system {{SYSTEM_CLOUD_CONFIG.YAML}} --run-on {{PLATFORM}} --all``
+   | ``milabench cloud --teardown --system {{SYSTEM_CLOUD_CONFIG.YAML}} --run-on {{PLATFORM}} --all``
    | to destroy not just a single cloud instance but all instances on a
    specified platform that were instanced from the current local machine
+
+
+Use milabench on slurm
+~~~~~~~~~~~~~~~~~~~~~~
+
+
+Create a slurm system configuration
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Add a ``cloud_profiles`` section to the ``system`` configuration which lists the
+supported cloud and slurm profiles.
+
+.. notes::
+
+  Nodes that should be created on the cloud should have the ``1.1.1.1`` ip
+  address placeholder. Other ip addresses will be used as-is and no cloud
+  instance will be created for that node
+
+.. notes::
+
+  A cloud profile entry needs to start with a covalent plugin (e.g. `slurm`). To
+  define multiple profiles on the same cloud platform, use the form
+  ``{PLATFORM}__{PROFILE_NAME}`` (e.g. ``slurm__profile``). All cloud profile
+  attributes will be used as is as argument for the target covalent plugin
+
+.. code-block:: yaml
+
+  system:
+    nodes:
+      - name: manager
+        # Use 1.1.1.1 as an ip placeholder
+        ip: 1.1.1.1
+        main: true
+        user: <username>
+      - name: node1
+        ip: 1.1.1.1
+        main: false
+        user: <username>
+  
+    # Cloud instances profiles
+    cloud_profiles:
+      # The cloud platform to use in the form of {PLATFORM} or
+      # {PLATFORM}__{PROFILE_NAME}
+      slurm:
+        username: usename
+        address: localhost
+        ssh_key_file: ssh_key_file
+        # bashrc_path will be replaced by the content of
+        # milabench/scripts/covalent/covalent_bashrc.sh
+        bashrc_path: "{bashrc_path}"
+        # job_uuid will be replaced by the generated job's uuid
+        remote_workdir: "cov-{job_uuid}-workdir"
+        use_srun: null
+        options:
+          ntasks-per-node: 1
+          cpus-per-task: 1
+          time: "0:30:0"
+          mem: 1000
+
+
+Run milabench on slurm
+^^^^^^^^^^^^^^^^^^^^^^
+
+1. | Initialize the slurm instances
+   | ``milabench cloud --setup --system {{SYSTEM_CONFIG.YAML}} --run-on {{PROFILE}} >{{SYSTEM_SLURM_CONFIG.YAML}}``
+
+2. | Prepare, install and run milabench
+   | ``milabench [prepare|install|run] --system {{SYSTEM_SLURM_CONFIG.YAML}}``
+
+3. | Destroy the slurm instances
+   | ``milabench cloud --teardown --system {{SYSTEM_SLURM_CONFIG.YAML}} --run-on {{PROFILE}}``
+
+.. notes::
+
+  Because the milabench's path is expected to be the same on local machine and
+  the remote machine, it's currently necessary to run the commands from the
+  slurm cluster. As the ``milabench cloud --[setup|teardown]`` commands requires
+  a covalent server to run and to avoid overloading the login nodes resources,
+  it's preferable to request a cpu compute node which will host to the covalent
+  server. An allocation with minimal resources like ``--nodes 1 --cpus-per-task
+  1 --mem 2000`` should be enough.
diff --git a/milabench/cli/cloud.py b/milabench/cli/cloud.py
index 859cdab87..1d89b69e4 100644
--- a/milabench/cli/cloud.py
+++ b/milabench/cli/cloud.py
@@ -60,29 +60,33 @@ def manage_cloud(pack, run_on, action="setup"):
         "private_ip":(lambda v: ("internal_ip",v)),
         "username":(lambda v: ("user",v)),
         "ssh_key_file":(lambda v: ("key",v)),
-        # "env":(lambda v: ("env",[".", v, ";", "conda", "activate", "milabench", "&&"])),
+        "env":(lambda v: ("env",[".", v, "milabench", "&&"])),
+        "slurm_job_id":(lambda v: ("slurm_job_id",v)),
     }
-    plan_params = deepcopy(pack.config["system"]["cloud_profiles"][run_on])
+    plan_params = pack.config["system"]["cloud_profiles"][run_on]
     run_on, *profile = run_on.split("__")
     profile = profile[0] if profile else ""
     default_state_prefix = profile or run_on
     default_state_id = "_".join((pack.config["hash"][:6], blabla()))
 
-    local_base = pack.dirs.base.absolute()
-    local_data_dir = _get_common_dir(ROOT_FOLDER.parent, local_base.parent)
-    if local_data_dir is None:
-        local_data_dir = local_base.parent
-    remote_data_dir = XPath("/data") / local_data_dir.name
+    plan_params["state_prefix"] = plan_params.get("state_prefix", default_state_prefix)
+    plan_params["state_id"] = plan_params.get("state_id", default_state_id)
+    plan_params["keep_alive"] = None
+
+    # local_base = pack.dirs.base.absolute()
+    # local_data_dir = _get_common_dir(ROOT_FOLDER.parent, local_base.parent)
+    # if local_data_dir is None:
+    #     local_data_dir = local_base.parent
+    # remote_data_dir = XPath("/data") / local_data_dir.name
+
+    plan_params_copy = deepcopy(plan_params)
 
     nodes = iter(enumerate(pack.config["system"]["nodes"]))
     for i, n in nodes:
-        if n["ip"] != "1.1.1.1":
+        if n["ip"] != "1.1.1.1" and action == _SETUP:
             continue
 
-        plan_params["state_prefix"] = plan_params.get("state_prefix", default_state_prefix)
-        plan_params["state_id"] = plan_params.get("state_id", default_state_id)
-        plan_params["cluster_size"] = max(len(pack.config["system"]["nodes"]), i + 1)
-        plan_params["keep_alive"] = None
+        plan_params_copy["cluster_size"] = max(len(pack.config["system"]["nodes"]), i + 1)
 
         import milabench.scripts.covalent as cv
 
@@ -101,17 +105,17 @@ def manage_cloud(pack, run_on, action="setup"):
             "-m", cv.__name__,
             run_on,
             f"--{action}",
-            *_flatten_cli_args(**plan_params)
+            *_flatten_cli_args(**plan_params_copy)
         ]
-        if action == _SETUP:
-            cmd += [
-                "--",
-                "bash", "-c",
-                _or_sudo(f"mkdir -p '{local_data_dir.parent}'") +
-                " && " + _or_sudo(f"chmod a+rwX '{local_data_dir.parent}'") +
-                f" && mkdir -p '{remote_data_dir}'"
-                f" && ln -sfT '{remote_data_dir}' '{local_data_dir}'"
-            ]
+        # if action == _SETUP:
+        #     cmd += [
+        #         "--",
+        #         "bash", "-c",
+        #         _or_sudo(f"mkdir -p '{local_data_dir.parent}'") +
+        #         " && " + _or_sudo(f"chmod a+rwX '{local_data_dir.parent}'") +
+        #         f" && mkdir -p '{remote_data_dir}'"
+        #         f" && ln -sfT '{remote_data_dir}' '{local_data_dir}'"
+        #     ]
         p = subprocess.Popen(
             cmd,
             stdout=subprocess.PIPE,
@@ -155,6 +159,9 @@ def manage_cloud(pack, run_on, action="setup"):
                 stderr
             )
 
+        if action == _TEARDOWN:
+            break
+
     return pack.config["system"]
 
 
diff --git a/milabench/commands/__init__.py b/milabench/commands/__init__.py
index ad281b23b..fcd6ccbf4 100644
--- a/milabench/commands/__init__.py
+++ b/milabench/commands/__init__.py
@@ -450,6 +450,11 @@ def _find_node_config(self) -> Dict:
                 return n
         return {}
 
+    def _load_env(self, node):
+        if node.get("env", None):
+            return node["env"]
+        return []
+
     def is_local(self):
         localnode = self.pack.config["system"]["self"]
 
@@ -484,7 +489,7 @@ def _argv(self, **kwargs) -> List:
         argv.append(f"-p{self.port}")
         argv.append(host)
 
-        return argv # + ["env", "-i"]
+        return argv + self._load_env(node)
 
 
 class SCPCommand(SSHCommand, CmdCommand):
@@ -505,6 +510,10 @@ def __init__(
         self.src = src
         self.dest = dest if dest is not None else self.src
 
+    def _load_env(self, node):
+        del node
+        return []
+
     def _argv(self, **kwargs) -> List:
         argv = super()._argv(**kwargs)
 
diff --git a/milabench/scripts/covalent/__main__.py b/milabench/scripts/covalent/__main__.py
index d4de9d932..689eda8fa 100644
--- a/milabench/scripts/covalent/__main__.py
+++ b/milabench/scripts/covalent/__main__.py
@@ -1,9 +1,50 @@
 import argparse
+import ast
 import os
 import pathlib
 import subprocess
 import sys
 import tempfile
+from time import sleep
+import uuid
+
+
+def _arg_pop(args:argparse.Namespace, key:str):
+    value = args.__getattribute__(key)
+    args.__delattr__(key)
+    return value
+
+ARGS_DEFAULT_SETUP = {
+    "slurm": {
+        "state_prefix": {},
+        "state_id": {},
+        "cluster_size": {},
+        "keep_alive": {"action": "store_true"},
+    }
+}
+
+ARGS_MAP = {
+    "slurm": {
+        "state_prefix": lambda args, k:_arg_pop(args, k),
+        "state_id": lambda args, k:args.options.setdefault("job-name", _arg_pop(args, k)),
+        "cluster_size": lambda args, k:args.options.setdefault("nodes", _arg_pop(args, k)),
+        "keep_alive": lambda args, k:_arg_pop(args, k),
+    }
+}
+
+_SETUP = {}
+
+_TEARDOWN = {}
+
+_CONNECTION_ATTRIBUTES = {
+    "hostname": None,
+    "username": None,
+    "ssh_key_file": None,
+    "private_ip": None,
+    "env": None,
+    "python_path": None,
+    "slurm_job_id": None
+}
 
 
 def serve(*argv):
@@ -19,69 +60,328 @@ def _get_executor_kwargs(args):
     }
 
 
-def executor(executor_cls, args, *argv):
+def _wait_for_any(*dispatch_ids):
     import covalent as ct
 
-    def _popen(cmd, *args, _env=None, **kwargs):
-        _env = _env if _env is not None else {}
+    dispatch_ids = set(dispatch_ids)
+    while True:
+        for dispatch_id in set(dispatch_ids):
+            status = ct.get_result(
+                dispatch_id=dispatch_id,
+                wait=False,
+                status_only=True
+            )["status"]
+            if status in [ct.status.COMPLETED]:
+                yield dispatch_id
+                dispatch_ids.remove(dispatch_id)
+            elif status in [ct.status.FAILED, ct.status.CANCELLED]:
+                raise RuntimeError(f"Job {dispatch_id} failed")
+        sleep(5)
+
+
+def _format(lines:list, **template_kv):
+    for l in lines:
+        for k, v in template_kv.items():
+            if "{{" + k + "}}" in l:
+                yield l[:l.find("{{")] + v + l[l.find("}}")+2:]
+                break
+        else:
+            yield l
 
-        for envvar in _env.keys():
-            envvar_val = _env[envvar]
 
-            if not envvar_val:
-                continue
+def _popen(cmd, *args, _env=None, **kwargs):
+    _env = _env if _env is not None else {}
 
-            envvar_val = pathlib.Path(envvar_val).expanduser()
-            if str(envvar_val) != _env[envvar]:
-                _env[envvar] = str(envvar_val)
+    for envvar in _env.keys():
+        envvar_val = _env[envvar]
 
-        if "MILABENCH_CONFIG_CONTENT" in _env:
-            _config_dir = pathlib.Path(_env["MILABENCH_CONFIG"]).parent
-            with tempfile.NamedTemporaryFile("wt", dir=str(_config_dir), suffix=".yaml", delete=False) as _f:
-                _f.write(_env["MILABENCH_CONFIG_CONTENT"])
-                _env["MILABENCH_CONFIG"] = _f.name
+        if not envvar_val:
+            continue
 
-        try:
-            cmd = (str(pathlib.Path(cmd[0]).expanduser()), *cmd[1:])
-        except IndexError:
-            pass
-
-        cwd = kwargs.pop("cwd", None)
-        if cwd is not None:
-            cwd = str(pathlib.Path(cwd).expanduser())
-            kwargs["cwd"] = cwd
-
-        _env = {**os.environ.copy(), **kwargs.pop("env", {}), **_env}
-
-        kwargs = {
-            **kwargs,
-            "env": _env,
-            "stdout": subprocess.PIPE,
-            "stderr": subprocess.PIPE,
-        }
-        p = subprocess.Popen(cmd, *args, **kwargs)
+        envvar_val = pathlib.Path(envvar_val).expanduser()
+        if str(envvar_val) != _env[envvar]:
+            _env[envvar] = str(envvar_val)
+
+    if "MILABENCH_CONFIG_CONTENT" in _env:
+        _config_dir = pathlib.Path(_env["MILABENCH_CONFIG"]).parent
+        with tempfile.NamedTemporaryFile("wt", dir=str(_config_dir), suffix=".yaml", delete=False) as _f:
+            _f.write(_env["MILABENCH_CONFIG_CONTENT"])
+            _env["MILABENCH_CONFIG"] = _f.name
+
+    try:
+        cmd = (str(pathlib.Path(cmd[0]).expanduser()), *cmd[1:])
+    except IndexError:
+        pass
+
+    cwd = kwargs.pop("cwd", None)
+    if cwd is not None:
+        cwd = str(pathlib.Path(cwd).expanduser())
+        kwargs["cwd"] = cwd
+
+    _env = {**os.environ.copy(), **kwargs.pop("env", {}), **_env}
+
+    kwargs = {
+        **kwargs,
+        "env": _env,
+        "stdout": subprocess.PIPE,
+        "stderr": subprocess.PIPE,
+    }
+    p = subprocess.Popen(cmd, *args, **kwargs)
+
+    stdout_chunks = []
+    while True:
+        line = p.stdout.readline()
+        if not line:
+            break
+        line_str = line.decode("utf-8").strip()
+        stdout_chunks.append(line_str)
+        print(line_str)
+
+    _, stderr = p.communicate()
+    stderr = stderr.decode("utf-8").strip()
+    stdout = os.linesep.join(stdout_chunks)
+
+    if p.returncode != 0:
+        raise subprocess.CalledProcessError(
+            p.returncode,
+            (cmd, args, kwargs),
+            stdout,
+            stderr
+        )
+    return p.returncode, stdout, stderr
+
+
+def _setup_terraform(executor:"ct.executor.BaseExecutor"):
+    import covalent as ct
+
+    result = ct.dispatch_sync(
+        ct.lattice(executor.get_connection_attributes)
+    )().result
+
+    assert result and result[0]
+
+    all_connection_attributes, _ = result
+    master_host:str = next(iter(all_connection_attributes))
+
+    if len(all_connection_attributes) > 1:
+        # Add master node to known host to avoid unknown host error The
+        # authenticity of host '[hostname] ([IP address])' can't be established.
+        new_host = subprocess.run(
+            ["ssh-keyscan", master_host],
+            stdout=subprocess.PIPE,
+            check=True
+        ).stdout.decode("utf8")
+        known_hosts = pathlib.Path("~/.ssh/known_hosts").expanduser()
+        with known_hosts.open("at") as _f:
+            _f.write(new_host)
+
+        # Add ssh file to master node to allow connections to worker nodes
+        ssh_key_file = all_connection_attributes[master_host]["ssh_key_file"]
+        fn = pathlib.Path(ssh_key_file)
+        result = ct.dispatch_sync(
+            ct.lattice(executor.cp_to_remote)
+        )(f".ssh/{fn.name.split('.')[0]}", str(fn))
+
+        assert result.status == ct.status.COMPLETED
+
+    return all_connection_attributes
+
+
+def _teardown_terraform(executor:"ct.executor.BaseExecutor"):
+    result = executor.stop_cloud_instance().result
+    assert result is not None
+
+
+def _slurm_executor(executor:"ct.executor.SlurmExecutor", job_uuid:uuid.UUID):
+    import covalent as ct
+
+    _executor = ct.executor.SlurmExecutor()
+    _executor.from_dict(executor.to_dict())
+
+    executor = _executor
+    executor.conda_env = executor.conda_env or "covalent"
+    bashrc_path = f"""''
+{pathlib.Path(__file__).with_name("covalent_bashrc.sh").read_text()}
+"""
+    bashrc_path = "\n".join(
+        _format(
+            bashrc_path.splitlines(),
+            conda_env=executor.conda_env,
+            python_version=f"{sys.version_info.major}.{sys.version_info.minor}",
+            covalent_version=ct.__version__,
+        )
+    )
+    executor.bashrc_path = executor.bashrc_path or "{bashrc_path}"
+    if "{bashrc_path}" in executor.bashrc_path:
+        executor.bashrc_path = executor.bashrc_path.format(bashrc_path=bashrc_path)
+    executor.remote_workdir = executor.remote_workdir or "cov-{job_uuid}-workdir"
+    executor.remote_workdir = executor.remote_workdir.format(job_uuid=job_uuid)
+    executor.options["job-name"] = (
+        executor.options.get("job-name", None) or f"cov-{job_uuid}"
+    )
+
+    return executor
 
-        stdout_chunks = []
+
+def _setup_slurm(executor:"ct.executor.SlurmExecutor"):
+    import covalent as ct
+
+    job_uuid = uuid.uuid4()
+    job_file = f"covalent_job_{job_uuid}"
+
+    _executor = ct.executor.SlurmExecutor()
+    _executor.from_dict(executor.to_dict())
+
+    executor = _slurm_executor(executor, job_uuid)
+
+    job_connection_executor = ct.executor.SlurmExecutor()
+    job_connection_executor.from_dict(executor.to_dict())
+    # Store job connection attributes
+    job_connection_executor.prerun_commands = f"""
+# print connection attributes
+printenv |
+    grep -E ".*SLURM.*NODENAME|.*SLURM.*JOB_ID" |
+    sort -u >>"{job_file}" &&
+srun printenv |
+    grep -E ".*SLURM.*NODENAME|.*SLURM.*JOB_ID" |
+    sort -u >>"{job_file}" &&
+echo "USERNAME=$USER" >>"{job_file}" &&
+echo "{job_uuid}" >>"{job_file}"
+""".splitlines()
+
+    query_executor = ct.executor.SlurmExecutor()
+    query_executor.from_dict(executor.to_dict())
+    query_executor.options = {
+        "nodes": 1,
+        "cpus-per-task": 1,
+        "mem": 1000,
+        "job-name": executor.options["job-name"],
+    }
+
+    @ct.electron()
+    def _empty():
+        pass
+
+    @ct.electron()
+    def _keep_alive():
         while True:
-            line = p.stdout.readline()
-            if not line:
-                break
-            line_str = line.decode("utf-8").strip()
-            stdout_chunks.append(line_str)
-            print(line_str)
-
-        _, stderr = p.communicate()
-        stderr = stderr.decode("utf-8").strip()
-        stdout = os.linesep.join(stdout_chunks)
-
-        if p.returncode != 0:
-            raise subprocess.CalledProcessError(
-                p.returncode,
-                (cmd, args, kwargs),
-                stdout,
-                stderr
+            sleep(60)
+
+    @ct.electron()
+    def _query_connection_attributes(milabench_bashrc:str=""):
+        _job_file = pathlib.Path(job_file).expanduser()
+        _job_file.touch()
+        content = _job_file.read_text().splitlines()
+        while (not content or content[-1].strip() != f"{job_uuid}"):
+            sleep(5)
+            content = _job_file.read_text().splitlines()
+
+        nodes = []
+        connection_attributes = _CONNECTION_ATTRIBUTES.copy()
+
+        milabench_bashrc = "\n".join(
+            _format(
+                milabench_bashrc.splitlines(),
+                milabench_env="cov-slurm-milabench",
+                python_version=f"{sys.version_info.major}.{sys.version_info.minor}",
             )
-        return p.returncode, stdout, stderr
+        )
+        if milabench_bashrc:
+            milabench_bashrc_file = _job_file.with_name("milabench_bashrc.sh").resolve()
+            milabench_bashrc_file.write_text(milabench_bashrc)
+            connection_attributes["env"] = str(milabench_bashrc_file)
+
+        for l in _job_file.read_text().splitlines():
+            try:
+                key, value = l.strip().split("=")
+            except ValueError:
+                # end flag
+                break
+            if "NODENAME" in key and value not in nodes:
+                nodes.append(value)
+            elif "USERNAME" in key:
+                connection_attributes["username"] = value
+            elif "JOB_ID" in key:
+                connection_attributes["slurm_job_id"] = value
+
+        return {
+            hostname: {
+                **connection_attributes,
+                **{
+                    "hostname": hostname,
+                    "private_ip": hostname,
+                },
+            }
+            for hostname in nodes
+        }
+
+    try:
+        # setup covalent for jobs
+        next(_wait_for_any(ct.dispatch(ct.lattice(_empty, executor=query_executor))()))
+        # setup nodes and retrieve connection attributes 
+        job_dispatch_id = ct.dispatch(
+            ct.lattice(
+                lambda:_keep_alive(),
+                executor=job_connection_executor
+            ),
+            disable_run=False
+        )()
+        query_dispatch_id = ct.dispatch(
+            ct.lattice(
+                _query_connection_attributes,
+                executor=query_executor
+            ),
+            disable_run=False
+        )(
+            milabench_bashrc=pathlib.Path(__file__).with_name("milabench_bashrc.sh").read_text()
+        )
+        next(_wait_for_any(job_dispatch_id, query_dispatch_id))
+        all_connection_attributes = ct.get_result(
+            dispatch_id=query_dispatch_id,
+            wait=False
+        ).result
+
+        assert all_connection_attributes
+
+    except:
+        _teardown_slurm(query_executor)
+        raise
+
+    return all_connection_attributes
+
+
+def _teardown_slurm(executor:"ct.executor.SlurmExecutor"):
+    import covalent as ct
+
+    @ct.electron()
+    def _empty():
+        pass
+
+    assert executor.options["job-name"], "Jobs to teardown must have an explicit name"
+
+    _exec = _slurm_executor(executor, "DELETE")
+    _exec.options = {
+        "nodes": 1,
+        "cpus-per-task": 1,
+        "mem": 1000,
+        "job-name": executor.options["job-name"],
+    }
+    _exec.prerun_commands = f"""
+# cancel jobs
+scancel --jobname="{_exec.options['job-name']}"
+""".splitlines()
+    ct.dispatch_sync(ct.lattice(_empty, executor=_exec))()
+
+
+def executor(executor_cls, args:argparse.Namespace, *argv):
+    import covalent as ct
+
+    _SETUP[ct.executor.AzureExecutor] = _setup_terraform
+    _SETUP[ct.executor.EC2Executor] = _setup_terraform
+    _SETUP[ct.executor.SlurmExecutor] = _setup_slurm
+    _TEARDOWN[ct.executor.AzureExecutor] = _teardown_terraform
+    _TEARDOWN[ct.executor.EC2Executor] = _teardown_terraform
+    _TEARDOWN[ct.executor.SlurmExecutor] = _teardown_slurm
 
     executor:ct.executor.BaseExecutor = executor_cls(
         **_get_executor_kwargs(args),
@@ -89,45 +389,13 @@ def _popen(cmd, *args, _env=None, **kwargs):
     return_code = 0
     try:
         if args.setup:
-            result = ct.dispatch_sync(
-                ct.lattice(executor.get_connection_attributes)
-            )().result
-
-            assert result and result[0]
-
-            all_connection_attributes, _ = result
-            master_host:str = None
-            for hostname, connection_attributes in all_connection_attributes.items():
+            for hostname, connection_attributes in _SETUP[executor_cls](executor).items():
                 print(f"hostname::>{hostname}")
                 for attribute,value in connection_attributes.items():
-                    if attribute == "hostname":
+                    if attribute == "hostname" or value is None:
                         continue
                     print(f"{attribute}::>{value}")
 
-                master_host = master_host or hostname
-
-            if len(all_connection_attributes) > 1:
-                # Add master node to known host to avoid unknown host error
-                # The authenticity of host '[hostname] ([IP address])' can't be established.
-                new_host = subprocess.run(
-                    ["ssh-keyscan", master_host],
-                    stdout=subprocess.PIPE,
-                    check=True
-                ).stdout.decode("utf8")
-                known_hosts = pathlib.Path("~/.ssh/known_hosts").expanduser()
-                with known_hosts.open("at") as _f:
-                    _f.write(new_host)
-
-                # Add ssh file to master node to allow connections to worker
-                # nodes
-                ssh_key_file = all_connection_attributes[master_host]["ssh_key_file"]
-                fn = pathlib.Path(ssh_key_file)
-                result = ct.dispatch_sync(
-                    ct.lattice(executor.cp_to_remote)
-                )(f".ssh/{fn.name.split('.')[0]}", str(fn))
-
-                assert result.status == ct.status.COMPLETED
-
         if argv:
             result = ct.dispatch_sync(
                 ct.lattice(executor.list_running_instances)
@@ -166,8 +434,7 @@ def _popen(cmd, *args, _env=None, **kwargs):
 
     finally:
         if args.teardown:
-            result = executor.stop_cloud_instance().result
-            assert result is not None
+            _TEARDOWN[executor_cls](executor)
 
     return return_code
 
@@ -187,7 +454,7 @@ def main(argv=None):
     subparsers = parser.add_subparsers()
     subparser = subparsers.add_parser("serve")
     subparser.add_argument(f"argv", nargs=argparse.REMAINDER)
-    for p in ("azure","ec2"):
+    for p in ("azure", "ec2", "slurm"):
         try:
             config = ct.get_config(f"executors.{p}")
         except KeyError:
@@ -201,10 +468,21 @@ def main(argv=None):
             add_argument_kwargs = {}
             if isinstance(default, bool):
                 add_argument_kwargs["action"] = "store_false" if default else "store_true"
+            elif any(isinstance(default, t) for t in [dict, list]):
+                add_argument_kwargs["type"] = ast.literal_eval
+                add_argument_kwargs["default"] = str(default)
             else:
                 add_argument_kwargs["default"] = default
             subparser.add_argument(f"--{param.replace('_', '-')}", **add_argument_kwargs)
 
+        for param, add_argument_kwargs in ARGS_DEFAULT_SETUP.get(p, {}).items():
+            if param in config:
+                raise ValueError(
+                    f"Found existing argument {param} in both {config} and"
+                    f" {ARGS_DEFAULT_SETUP}"
+                )
+            subparser.add_argument(f"--{param.replace('_', '-')}", **add_argument_kwargs)
+
     try:
         cv_argv, argv = argv[:argv.index("--")], argv[argv.index("--")+1:]
     except ValueError:
@@ -212,6 +490,9 @@ def main(argv=None):
 
     args = parser.parse_args(cv_argv)
 
+    for arg, _map in ARGS_MAP.get(cv_argv[0], {}).items():
+        _map(args, arg)
+
     if cv_argv[0] == "serve":
         assert not argv
         return serve(*args.argv)
@@ -219,6 +500,8 @@ def main(argv=None):
         executor_cls = ct.executor.AzureExecutor
     elif cv_argv[0] == "ec2":
         executor_cls = ct.executor.EC2Executor
+    elif cv_argv[0] == "slurm":
+        executor_cls = ct.executor.SlurmExecutor
     else:
         raise
 
diff --git a/milabench/scripts/covalent/covalent_bashrc.sh b/milabench/scripts/covalent/covalent_bashrc.sh
new file mode 100644
index 000000000..197620c1c
--- /dev/null
+++ b/milabench/scripts/covalent/covalent_bashrc.sh
@@ -0,0 +1,46 @@
+function _get_options {
+	set +o | cut -d' ' -f2- | while read set_option
+	do
+		echo "${set_option}"
+	done
+}
+
+function _set_options {
+	while [[ $# -gt 0 ]]
+	do
+		local _arg="$1"; shift
+		case "${_arg}" in
+			-o) set -o "$1"; shift ;;
+			+o) set +o "$1"; shift ;;
+			-h | --help | *)
+			exit 1
+			;;
+		esac
+	done
+}
+
+_options=$(_get_options)
+set -o errexit -o pipefail
+
+_NAME="{{conda_env}}"
+conda --version >&2 2>/dev/null || module load anaconda/3
+# {{python_version}} needs to be formatted
+conda activate ${_NAME} || conda create -y -n ${_NAME} "python={{python_version}}" virtualenv
+conda activate ${_NAME}
+
+function conda {
+    echo "DEACTIVATED conda" "$@" >&2
+}
+
+if [ ! -f venv/bin/activate ]
+then
+	python3 -m virtualenv venv/
+	. venv/bin/activate
+	# {{covalent_version}} needs to be formatted
+	python3 -m pip install "covalent=={{covalent_version}}"
+else
+	. venv/bin/activate
+fi
+
+_set_options $_options
+unset _options
diff --git a/milabench/scripts/covalent/milabench_bashrc.sh b/milabench/scripts/covalent/milabench_bashrc.sh
new file mode 100644
index 000000000..1ee96d788
--- /dev/null
+++ b/milabench/scripts/covalent/milabench_bashrc.sh
@@ -0,0 +1,31 @@
+function _get_options {
+	set +o | cut -d' ' -f2- | while read set_option
+	do
+		echo "${set_option}"
+	done
+}
+
+function _set_options {
+	while [[ $# -gt 0 ]]
+	do
+		local _arg="$1"; shift
+		case "${_arg}" in
+			-o) set -o "$1"; shift ;;
+			+o) set +o "$1"; shift ;;
+			-h | --help | *)
+			exit 1
+			;;
+		esac
+	done
+}
+
+_options=$(_get_options)
+set -o errexit -o pipefail
+
+_NAME="{{milabench_env}}"
+conda --version >&2 2>/dev/null || module load anaconda/3
+conda activate ${_NAME} || conda create -y -n ${_NAME} "python={{python_version}}" virtualenv
+conda activate ${_NAME}
+
+_set_options $_options
+unset _options
diff --git a/milabench/scripts/covalent/python3/__main__.py b/milabench/scripts/covalent/python3/__main__.py
new file mode 100644
index 000000000..186d9a170
--- /dev/null
+++ b/milabench/scripts/covalent/python3/__main__.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+
+import subprocess
+import sys
+
+
+def main(argv=None):
+    if argv is None:
+        argv = sys.argv[1:]
+
+    from ...utils import get_module_venv
+    from .. import __main__
+    check_if_module = "import covalent"
+    python3, env = get_module_venv(__main__.__file__, check_if_module)
+
+    return subprocess.call([python3, *argv], env=env)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/milabench/scripts/covalent/requirements.txt b/milabench/scripts/covalent/requirements.txt
index b70efc793..4b322dc3f 100644
--- a/milabench/scripts/covalent/requirements.txt
+++ b/milabench/scripts/covalent/requirements.txt
@@ -1,3 +1,4 @@
 covalent==0.232
 covalent-ec2-plugin @ git+https://github.com/satyaog/covalent-ec2-plugin.git@feature/milabench
-covalent-azure-plugin @ git+https://github.com/satyaog/covalent-azure-plugin.git@feature/milabench
\ No newline at end of file
+covalent-azure-plugin @ git+https://github.com/satyaog/covalent-azure-plugin.git@feature/milabench
+covalent-slurm-plugin
diff --git a/milabench/scripts/utils.py b/milabench/scripts/utils.py
index 5aec72d06..022481089 100644
--- a/milabench/scripts/utils.py
+++ b/milabench/scripts/utils.py
@@ -1,3 +1,4 @@
+import getpass
 import json
 import pathlib
 import subprocess
@@ -16,9 +17,9 @@ def get_venv(venv:pathlib.Path) -> dict:
     return json.loads(env)
 
 
-def run_in_module_venv(module_main:str, check_if_module:str, argv:list=None):
+def get_module_venv(module_main:str, check_if_module:str):
     module = pathlib.Path(module_main).resolve().parent
-    cache_dir = pathlib.Path(f"/tmp/milabench/{module.name}_venv")
+    cache_dir = pathlib.Path(f"/tmp/{getpass.getuser()}/milabench/{module.name}_venv")
     python3 = str(cache_dir / "bin/python3")
     try:
         subprocess.run([python3, "-c", check_if_module], check=True,
@@ -38,7 +39,9 @@ def run_in_module_venv(module_main:str, check_if_module:str, argv:list=None):
             str(module / "requirements.txt")
         ], stdout=sys.stderr, check=True)
         subprocess.run([python3, "-c", check_if_module], check=True, stdout=sys.stderr)
-    return subprocess.call(
-        [python3, module_main, *argv],
-        env=get_venv(cache_dir)
-    )
\ No newline at end of file
+    return python3, get_venv(cache_dir)
+
+
+def run_in_module_venv(module_main:str, check_if_module:str, argv:list=None):
+    python3, env = get_module_venv(module_main, check_if_module)
+    return subprocess.call([python3, module_main, *argv], env=env)
\ No newline at end of file