Matgenix · ml-evs · Jan 31, 2024
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
@@ -56,6 +56,12 @@ def slurm_ssh_port():
     yield _get_free_port()
 
 
+@pytest.fixture(scope="session")
+def slurm_ssh_mfa_port():
+    """Yield the local port (from ``_get_free_port``) used for SSH to the Slurm + MFA container."""
+    yield _get_free_port()
+
+
 @pytest.fixture(scope="session")
 def db_port():
     """The exposed local port for connections to the MongoDB stores."""
@@ -72,6 +78,7 @@ def build_and_launch_container(
     dockerfile: Path | None = None,
     image_name: str | None = None,
     ports: dict[str, int] | None = None,
+    target: str | None = None,
 ):
     """Builds and/or launches a container, returning the container object.
 
@@ -81,6 +88,7 @@ def build_and_launch_container(
         image_name: Either the tag to attach to the built image, or an image
             name to pull from the web (may require authenticated docker client).
         ports: A port specification to use for the launched container.
+        target: The docker build target to use.
 
     Yields:
         The launched container object, then stops the container after use.
@@ -95,6 +103,7 @@ def build_and_launch_container(
             tag=image_name,
             rm=True,
             quiet=False,
+            target=target,
         )
 
         for step in logs:
@@ -139,7 +148,22 @@ def slurm_container(docker_client, slurm_ssh_port):
     yield from build_and_launch_container(
         docker_client,
         Path("./tests/integration/dockerfiles/Dockerfile.slurm"),
-        "jobflow-slurm:latest",
+        image_name="jobflow-slurm:latest",
+        ports=ports,
+    )
+
+
+@pytest.fixture(scope="session", autouse=True)
+def slurm_container_mfa(docker_client, slurm_ssh_mfa_port):
+    """Build and launch the Slurm + SSH container with multi-factor authentication.
+
+    The container's SSH port (22/tcp) is mapped to ``slurm_ssh_mfa_port``.
+    """
+    ports = {"22/tcp": slurm_ssh_mfa_port}
+    yield from build_and_launch_container(
+        docker_client,
+        Path("./tests/integration/dockerfiles/Dockerfile.slurm.mfa"),
+        image_name="jobflow-slurm-mfa:latest",
         ports=ports,
     )
 
@@ -171,6 +195,7 @@ def write_tmp_settings(
     random_project_name,
     store_database_name,
     slurm_ssh_port,
+    slurm_ssh_mfa_port,
     db_port,
 ):
     """Collects the various sub-configs and writes them to a temporary file in a temporary directory."""
@@ -228,6 +253,18 @@ def write_tmp_settings(
                 resources={"partition": "debug", "ntasks": 1, "time": "00:01:00"},
                 connect_kwargs={"allow_agent": False, "look_for_keys": False},
             ),
+            "test_remote_worker_mfa": dict(
+                type="remote",
+                host="localhost",
+                port=slurm_ssh_mfa_port,
+                scheduler_type="slurm",
+                work_dir="/home/jobflow/jfr",
+                user="jobflow",
+                password="jobflow",
+                pre_run="source /home/jobflow/.venv/bin/activate",
+                resources={"partition": "debug", "ntasks": 1, "time": "00:01:00"},
+                connect_kwargs={"allow_agent": False, "look_for_keys": False},
+            ),
         },
         exec_config={"test": {"export": {"TESTING_ENV_VAR": random_project_name}}},
         runner=dict(

diff --git a/tests/integration/dockerfiles/Dockerfile.slurm.mfa b/tests/integration/dockerfiles/Dockerfile.slurm.mfa
@@ -0,0 +1,82 @@
+# syntax=docker/dockerfile:experimental
+# NB: this image is identical to the slurm base image
+# except that it has MFA enabled for ssh
+# Unfortunately this cannot be converted into a single
+# multi-stage build dockerfile until the Docker Python SDK
+# supports BuildKit (see https://github.com/docker/docker-py/issues/2230)
+
+# Using the slurm base image, run an ssh server and install jobflow-remote
+FROM nathanhess/slurm:full AS base
+ARG USERNAME=jobflow
+ARG PASSWORD=jobflow
+WORKDIR /opt
+USER root
+
+# Install OpenSSH server and set it to run on startup
+RUN apt update && apt install -y openssh-server && apt clean && rm -rf /var/lib/apt/lists/*
+RUN sed -i 's/#PasswordAuthentication no/PasswordAuthentication yes/g' /etc/ssh/sshd_config
+RUN sed -ie 's/^SCRIPT/service ssh start\nSCRIPT/g' /etc/startup.sh
+
+# Create the user, set its password from the PASSWORD build arg, then let it run the startup script via sudo without a password
+# See https://github.com/nathan-hess/docker-slurm/blob/a62133d66d624d9ff0ccefbd41a0b1b2abcb9925/dockerfile_base/Dockerfile#L62C1-L65C1
+RUN useradd -rm -d /home/${USERNAME} -s /bin/bash ${USERNAME} && usermod -a -G sudo ${USERNAME}
+RUN echo ${USERNAME}:${PASSWORD} | chpasswd
+RUN printf "${USERNAME} ALL=(root:root) NOPASSWD: /etc/startup.sh\n" >> /etc/sudoers.d/startup \
+    && chmod 0440 /etc/sudoers.d/startup \
+    && visudo -c
+
+# Switch to the user's home directory, drop root, and run later RUN steps in a bash login shell
+WORKDIR /home/${USERNAME}
+USER ${USERNAME}
+SHELL ["/bin/bash", "--login", "-c"]
+
+# Install jobflow-remote from directory, assuming the build
+# context is the root of the jobflow-remote repo
+RUN mkdir jobflow-remote
+COPY src/jobflow_remote jobflow-remote/src/jobflow_remote
+COPY pyproject.toml jobflow-remote/
+
+# versioningit refuses to install a package without its full git history
+# so here we remove versioningit config from pyproject.toml as we don't need
+# the full version number (which allows us to cache many more layers)
+RUN sed -i '/\[tool.versioningit.vcs\]/,+3d' jobflow-remote/pyproject.toml
+
+# Annoyingly we want to use this with the Python SDK
+# which does not support buildkit yet
+# so cannot use --chmod in the copy directly and
+# we have to become root for this step
+USER root
+RUN sudo chmod -R 0777 jobflow-remote
+USER ${USERNAME}
+
+# Install jobflow-remote (editable) in a local native virtualenv
+WORKDIR /home/${USERNAME}/jobflow-remote
+# RUN git config --global --add safe.directory /home/${USERNAME}/jobflow-remote
+RUN python3 -m venv /home/${USERNAME}/.venv
+RUN /home/${USERNAME}/.venv/bin/pip install -U pip
+RUN /home/${USERNAME}/.venv/bin/pip install --verbose -e .
+
+# Make a job directory for jobflow
+WORKDIR /home/${USERNAME}
+RUN mkdir jfr
+
+# Install and configure MFA
+USER root
+RUN apt update && apt install -y libpam-google-authenticator && apt clean && rm -rf /var/lib/apt/lists/*
+# Add MFA to sshd pam config: `nullok` comes from the README here: https://github.com/google/google-authenticator-libpam?tab=readme-ov-file#setting-up-a-user
+RUN echo "auth required pam_google_authenticator.so nullok echo_verification_code" >> /etc/pam.d/sshd
+RUN sed -i 's/#PasswordAuthentication yes/ChallengeResponseAuthentication yes/g' /etc/ssh/sshd_config
+RUN cat /etc/ssh/sshd_config
+
+# Configure MFA for jobflow user
+USER ${USERNAME}
+# Secret key
+RUN echo "3GWUG4NXEEG7KQEXBYOT4AJH3Q" > /home/${USERNAME}/.google_authenticator
+# Settings (each option line is written with a literal leading `" ` prefix, hence the odd quoting)
+RUN echo '" WINDOW_SIZE 17' >> /home/${USERNAME}/.google_authenticator
+RUN echo '" TOTP_AUTH' >> /home/${USERNAME}/.google_authenticator
+# Emergency backup (scratch) code
+RUN echo "13802822" >> /home/${USERNAME}/.google_authenticator
+RUN cat /home/${USERNAME}/.google_authenticator
+# Set appropriate permissions otherwise PAM fails
+RUN chmod 600 /home/${USERNAME}/.google_authenticator
diff --git a/tests/integration/test_slurm.py b/tests/integration/test_slurm.py
@@ -15,10 +15,10 @@ def test_project_init(random_project_name):
     assert len(cm.projects) == 1
     assert cm.projects[random_project_name]
     project = cm.get_project()
-    assert len(project.workers) == 2
+    assert len(project.workers) == 3
 
 
-def test_paramiko_ssh_connection(job_controller, slurm_ssh_port):
+def test_paramiko_ssh_connection(slurm_ssh_port):
     from paramiko import SSHClient
     from paramiko.client import WarningPolicy
 
@@ -34,6 +34,22 @@ def test_paramiko_ssh_connection(job_controller, slurm_ssh_port):
     )
 
 
+def test_paramiko_ssh_mfa_connection(slurm_ssh_mfa_port):
+    from paramiko import SSHClient
+    from paramiko.client import WarningPolicy
+
+    with SSHClient() as client:  # closed on exit, even if connect() raises
+        client.set_missing_host_key_policy(WarningPolicy)
+        client.connect(
+            "localhost",
+            port=slurm_ssh_mfa_port,
+            username="jobflow",
+            password="jobflow",
+            look_for_keys=False,
+            allow_agent=False,
+        )
+
+
 def test_project_check(job_controller, capsys):
     from jobflow_remote.cli.project import check
 
@@ -43,6 +59,7 @@ def test_project_check(job_controller, capsys):
     expected = [
         "✓ Worker test_local_worker",
         "✓ Worker test_remote_worker",
+        "✓ Worker test_remote_worker_mfa",
         "✓ Jobstore",
         "✓ Queue store",
     ]
@@ -52,7 +69,7 @@ def test_project_check(job_controller, capsys):
 
 @pytest.mark.parametrize(
     "worker",
-    ["test_local_worker", "test_remote_worker"],
+    ["test_local_worker", "test_remote_worker", "test_remote_worker_mfa"],
 )
 def test_submit_flow(worker, job_controller):
     from jobflow import Flow
@@ -90,7 +107,7 @@ def test_submit_flow(worker, job_controller):
 
 @pytest.mark.parametrize(
     "worker",
-    ["test_local_worker", "test_remote_worker"],
+    ["test_local_worker", "test_remote_worker", "test_remote_worker_mfa"],
 )
 def test_submit_flow_with_dependencies(worker, job_controller):
     from jobflow import Flow
@@ -136,7 +153,7 @@ def test_submit_flow_with_dependencies(worker, job_controller):
 
 @pytest.mark.parametrize(
     "worker",
-    ["test_local_worker", "test_remote_worker"],
+    ["test_local_worker", "test_remote_worker", "test_remote_worker_mfa"],
 )
 def test_job_with_callable_kwarg(worker, job_controller):
     """Test whether a callable can be successfully provided as a keyword
@@ -180,7 +197,7 @@ def test_job_with_callable_kwarg(worker, job_controller):
 
 @pytest.mark.parametrize(
     "worker",
-    ["test_local_worker", "test_remote_worker"],
+    ["test_local_worker", "test_remote_worker", "test_remote_worker_mfa"],
 )
 def test_expected_failure(worker, job_controller):
     from jobflow import Flow
@@ -209,7 +226,7 @@ def test_expected_failure(worker, job_controller):
 
 @pytest.mark.parametrize(
     "worker",
-    ["test_local_worker", "test_remote_worker"],
+    ["test_local_worker", "test_remote_worker", "test_remote_worker_mfa"],
 )
 def test_exec_config(worker, job_controller, random_project_name):
     """Tests that an environment variable set in the exec config