report mode GPU is in and write out gpu UUID (#699)

* report mode GPU is in and write out gpu UUID * check for Exclusive_Process instead of Default * use try/except to be more robust * Update openmmtools/multistate/multistatesampler.py Co-authored-by: Iván Pulido <[email protected]> * make the subprocess throw an exception if there is an error, fix logic in compute mode detection * make debug message more helpful * fix UnboundLocalError: local variable 'cuda_query_output' referenced before assignment * just check the error code * warn method has been deprecated since version 3.2 --------- Co-authored-by: EC2 Default User <[email protected]> Co-authored-by: Iván Pulido <[email protected]>
choderalab · Jun 2, 2023 · abb2f61 · abb2f61
1 parent 3cf98eb
commit abb2f61
Showing 1 changed file with 14 additions and 5 deletions.
diff --git a/openmmtools/multistate/multistatesampler.py b/openmmtools/multistate/multistatesampler.py
@@ -35,6 +35,7 @@
 import inspect
 import logging
 import datetime
+import subprocess
 
 import numpy as np
 
@@ -1773,11 +1774,19 @@ def _update_timing(self, iteration_time, partial_total_time, run_initial_iterati
     @staticmethod
     def _display_cuda_devices():
         """Query system nvidia-smi to get available GPUs indices and names in debug log."""
-        # Read nvidia-smi query, should return empty strip if no GPU is found.
-        cuda_query_output = os.popen("nvidia-smi --query-gpu=index,gpu_name --format=csv,noheader").read().strip()
-        # Split by line jump and comma
-        cuda_devices_list = [entry.split(',') for entry in cuda_query_output.split('\n')]
-        logger.debug(f"CUDA devices available: {*cuda_devices_list,}")
+
+        cuda_query_output = subprocess.run("nvidia-smi --query-gpu=gpu_uuid,gpu_name,compute_mode  --format=csv", shell=True, capture_output=True, text=True)
+        # Check if command worked
+        if cuda_query_output.returncode == 0:
+            # Split the output into one entry per line (the first entry is the CSV header)
+            cuda_devices_list = [entry for entry in cuda_query_output.stdout.splitlines()]
+            logger.debug(f"CUDA devices available: {*cuda_devices_list,}")
+            # We only support "Default" and not "Exclusive_Process" for the compute mode
+            if "Default" not in cuda_query_output.stdout:
+                logger.warning(f"GPU in 'Exclusive_Process' mode (or Prohibited), one context is allowed per device. This may prevent some openmmtools features from working. GPU must be in 'Default' compute mode")
+        # Handle the case where the command had some error
+        else:
+            logger.debug(f"nvidia-smi command failed: {cuda_query_output.stderr}, this is expected if there is no GPU available")
 
     def _flatten_moves_iterator(self):
         """Recursively flatten MCMC moves. Handles the cases where each move can be a set of moves, for example with