From 0251840c723223659d054efcbaf41642eca05f54 Mon Sep 17 00:00:00 2001 From: Guido Petretto Date: Wed, 21 Feb 2024 20:43:07 +0100 Subject: [PATCH] map all slurm states --- src/qtoolkit/core/data_objects.py | 14 ++++++++++++++ src/qtoolkit/io/slurm.py | 24 +++++++++++++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/src/qtoolkit/core/data_objects.py b/src/qtoolkit/core/data_objects.py index 6a0f440..c06e4fe 100644 --- a/src/qtoolkit/core/data_objects.py +++ b/src/qtoolkit/core/data_objects.py @@ -58,6 +58,20 @@ class QState(QTKEnum): queue manager (e.g. PBS, SLURM, ...) needs to be defined. + UNDETERMINED: The job status cannot be determined. This is a permanent + issue that cannot be resolved by querying the job state again. + QUEUED: The job is queued to be scheduled and executed. + QUEUED_HELD: The job has been placed on hold by the system, the + administrator, or the submitting user. + RUNNING: The job is running on an execution host. + SUSPENDED: The job has been suspended by the user, the system or the + administrator. + REQUEUED: The job was re-queued by the DRM system, and is eligible to run. + REQUEUED_HELD: The job was re-queued by the DRM system, and is currently + placed on hold by the system, the administrator, or the submitting user. + DONE: The job finished without an error. + FAILED: The job exited abnormally before finishing. + Note that not all these standardized states are available in the actual queue manager implementations. 
""" diff --git a/src/qtoolkit/io/slurm.py b/src/qtoolkit/io/slurm.py index 8254f2a..08f48d3 100644 --- a/src/qtoolkit/io/slurm.py +++ b/src/qtoolkit/io/slurm.py @@ -88,6 +88,7 @@ class SlurmState(QSubState): + BOOT_FAIL = "BOOT_FAIL", "BF" CANCELLED = "CANCELLED", "CA" COMPLETING = "COMPLETING", "CG" COMPLETED = "COMPLETED", "CD" @@ -97,7 +98,17 @@ class SlurmState(QSubState): NODE_FAIL = "NODE_FAIL", "NF" OUT_OF_MEMORY = "OUT_OF_MEMORY", "OOM" PENDING = "PENDING", "PD" + PREEMPTED = "PREEMPTED", "PR" + RESV_DEL_HOLD = "RESV_DEL_HOLD", "RD" + REQUEUE_FED = "REQUEUE_FED", "RF" + REQUEUE_HOLD = "REQUEUE_HOLD", "RH" + RESIZING = "RESIZING", "RS" + REVOKED = "REVOKED", "RV" RUNNING = "RUNNING", "R" + SIGNALING = "SIGNALING", "SI" + SPECIAL_EXIT = "SPECIAL_EXIT", "SE" + STAGE_OUT = "STAGE_OUT", "SO" + STOPPED = "STOPPED", "ST" SUSPENDED = "SUSPENDED", "S" TIMEOUT = "TIMEOUT", "TO" @@ -108,7 +119,8 @@ def qstate(self) -> QState: _STATUS_MAPPING = { - SlurmState.CANCELLED: QState.SUSPENDED, # Should this be failed ? + SlurmState.BOOT_FAIL: QState.FAILED, + SlurmState.CANCELLED: QState.FAILED, SlurmState.COMPLETING: QState.RUNNING, SlurmState.COMPLETED: QState.DONE, SlurmState.CONFIGURING: QState.QUEUED, @@ -117,7 +129,17 @@ def qstate(self) -> QState: SlurmState.NODE_FAIL: QState.FAILED, SlurmState.OUT_OF_MEMORY: QState.FAILED, SlurmState.PENDING: QState.QUEUED, + SlurmState.PREEMPTED: QState.FAILED, + SlurmState.RESV_DEL_HOLD: QState.QUEUED_HELD, + SlurmState.REQUEUE_FED: QState.QUEUED, + SlurmState.REQUEUE_HOLD: QState.QUEUED, + SlurmState.RESIZING: QState.RUNNING, + SlurmState.REVOKED: QState.FAILED, SlurmState.RUNNING: QState.RUNNING, + SlurmState.SIGNALING: QState.RUNNING, + SlurmState.SPECIAL_EXIT: QState.FAILED, + SlurmState.STAGE_OUT: QState.RUNNING, + SlurmState.STOPPED: QState.RUNNING, SlurmState.SUSPENDED: QState.SUSPENDED, SlurmState.TIMEOUT: QState.FAILED, }