From abb2f615b730c4b42517d2a3da8af1d58e0a7b6c Mon Sep 17 00:00:00 2001
From: Mike Henry <11765982+mikemhenry@users.noreply.github.com>
Date: Fri, 2 Jun 2023 08:31:42 -0700
Subject: [PATCH] report mode GPU is in and write out gpu UUID (#699)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* report mode GPU is in and write out gpu UUID

* check for Exclusive_Process instead of Default

* use try/except to be more robust

* Update openmmtools/multistate/multistatesampler.py

Co-authored-by: Iván Pulido <2949729+ijpulidos@users.noreply.github.com>

* make the subprocess throw an exception if there is an error, fix logic in compute mode detection

* make debug message more helpful

* fix UnboundLocalError: local variable 'cuda_query_output' referenced before assignment

* just check the error code

* warn method has been deprecated since version 3.2

---------

Co-authored-by: EC2 Default User
Co-authored-by: Iván Pulido <2949729+ijpulidos@users.noreply.github.com>
---
 openmmtools/multistate/multistatesampler.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/openmmtools/multistate/multistatesampler.py b/openmmtools/multistate/multistatesampler.py
index 31d49ffa..5f6594cc 100644
--- a/openmmtools/multistate/multistatesampler.py
+++ b/openmmtools/multistate/multistatesampler.py
@@ -35,6 +35,7 @@
 import inspect
 import logging
 import datetime
+import subprocess
 
 import numpy as np
 
@@ -1773,11 +1774,19 @@ def _update_timing(self, iteration_time, partial_total_time, run_initial_iterati
     @staticmethod
     def _display_cuda_devices():
         """Query system nvidia-smi to get available GPUs indices and names in debug log."""
-        # Read nvidia-smi query, should return empty strip if no GPU is found.
-        cuda_query_output = os.popen("nvidia-smi --query-gpu=index,gpu_name --format=csv,noheader").read().strip()
-        # Split by line jump and comma
-        cuda_devices_list = [entry.split(',') for entry in cuda_query_output.split('\n')]
-        logger.debug(f"CUDA devices available: {*cuda_devices_list,}")
+
+        cuda_query_output = subprocess.run("nvidia-smi --query-gpu=gpu_uuid,gpu_name,compute_mode --format=csv", shell=True, capture_output=True, text=True)
+        # Check if command worked
+        if cuda_query_output.returncode == 0:
+            # Split by line jump and comma
+            cuda_devices_list = [entry for entry in cuda_query_output.stdout.splitlines()]
+            logger.debug(f"CUDA devices available: {*cuda_devices_list,}")
+            # We only support "Default" and not "Exclusive_Process" for the compute mode
+            if "Default" not in cuda_query_output.stdout:
+                logger.warning(f"GPU in 'Exclusive_Process' mode (or Prohibited), one context is allowed per device. This may prevent some openmmtools features from working. GPU must be in 'Default' compute mode")
+        # Handel the case where the command had some error
+        else:
+            logger.debug(f"nvidia-smi command failed: {cuda_query_output.stderr}, this is expected if there is no GPU available")
 
     def _flatten_moves_iterator(self):
         """Recursively flatten MCMC moves. Handles the cases where each move can be a set of moves, for example with
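
Editor's note: for readers who want to try the GPU-detection logic outside the sampler, the following is a minimal standalone sketch of the approach this patch introduces. It assumes nvidia-smi is available on the PATH; the logger name and the function name display_cuda_devices are illustrative and not part of the patch itself.

# A minimal, standalone sketch of the detection logic added by this patch.
# Assumption: nvidia-smi is on PATH; logger and function names are illustrative.
import logging
import subprocess

logger = logging.getLogger("cuda_device_check")


def display_cuda_devices():
    """Log available CUDA devices (UUID, name, compute mode) via nvidia-smi."""
    result = subprocess.run(
        "nvidia-smi --query-gpu=gpu_uuid,gpu_name,compute_mode --format=csv",
        shell=True, capture_output=True, text=True,
    )
    if result.returncode == 0:
        # One CSV row per device (plus a header row): uuid, name, compute mode.
        devices = result.stdout.splitlines()
        logger.debug("CUDA devices available: %s", devices)
        # Only the 'Default' compute mode allows multiple contexts per device.
        if "Default" not in result.stdout:
            logger.warning(
                "GPU is in 'Exclusive_Process' (or 'Prohibited') compute mode; "
                "only one context is allowed per device, which may prevent some "
                "openmmtools features from working. Set the GPU to 'Default' mode."
            )
    else:
        # Expected on machines without an NVIDIA GPU or without nvidia-smi installed.
        logger.debug("nvidia-smi command failed: %s", result.stderr)


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    display_cuda_devices()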