From 6fd2c7cff9b814a6037dcc166171d0fe0188c47d Mon Sep 17 00:00:00 2001 From: Christina Ertural <52951132+QuantumChemist@users.noreply.github.com> Date: Fri, 2 Aug 2024 14:54:55 +0200 Subject: [PATCH 01/35] Initial commit for SGE interface --- src/qtoolkit/io/sge.py | 458 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 458 insertions(+) create mode 100644 src/qtoolkit/io/sge.py diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py new file mode 100644 index 0000000..22f6fce --- /dev/null +++ b/src/qtoolkit/io/sge.py @@ -0,0 +1,458 @@ +from __future__ import annotations + +import re +from datetime import timedelta + +from qtoolkit.core.data_objects import ( + CancelResult, + CancelStatus, + ProcessPlacement, + QJob, + QJobInfo, + QResources, + QState, + QSubState, + SubmissionResult, + SubmissionStatus, +) +from qtoolkit.core.exceptions import OutputParsingError, UnsupportedResourcesError +from qtoolkit.io.base import BaseSchedulerIO + +# States in PBS from qstat's man. +# B Array job: at least one subjob has started. +# +# E Job is exiting after having run. +# +# F Job is finished. +# +# H Job is held. +# +# M Job was moved to another server. +# +# Q Job is queued. +# +# R Job is running. +# +# S Job is suspended. +# +# T Job is being moved to new location. +# +# U Cycle-harvesting job is suspended due to keyboard activity. +# +# W Job is waiting for its submitter-assigned start time to be reached. +# +# X Subjob has completed execution or has been deleted. + + +class PBSState(QSubState): + ARRAY_RUNNING = "B" + EXITING = "E" + FINISHED = "F" + HELD = "H" + MOVED = "M" + QUEUED = "Q" + RUNNING = "R" + SUSPENDED = "S" + TRANSITING = "T" + SUSPENDED_KEYBOARD = "U" + WAITING = "W" + ARRAY_FINISHED = "X" + + @property + def qstate(self) -> QState: + return _STATUS_MAPPING[self] # type: ignore + + +_STATUS_MAPPING = { + PBSState.ARRAY_RUNNING: QState.RUNNING, + PBSState.EXITING: QState.RUNNING, + PBSState.FINISHED: QState.DONE, + PBSState.HELD: QState.QUEUED_HELD, + PBSState.MOVED: QState.REQUEUED, + PBSState.QUEUED: QState.QUEUED, + PBSState.RUNNING: QState.RUNNING, + PBSState.SUSPENDED: QState.SUSPENDED, + PBSState.TRANSITING: QState.REQUEUED, + PBSState.SUSPENDED_KEYBOARD: QState.SUSPENDED, + PBSState.WAITING: QState.QUEUED, + PBSState.ARRAY_FINISHED: QState.DONE, +} + + +class PBSIO(BaseSchedulerIO): + header_template: str = """ +#PBS -q $${queue} +#PBS -N $${job_name} +#PBS -A $${account} +#PBS -l $${select} +#PBS -l walltime=$${walltime} +#PBS -l model=$${model} +#PBS -l place=$${place} +#PBS -W group_list=$${group_list} +#PBS -M $${mail_user} +#PBS -m $${mail_type} +#PBS -o $${qout_path} +#PBS -e $${qerr_path} +#PBS -p $${priority} +#PBS -r $${rerunnable} +#PBS -J $${array} +$${qverbatim}""" + + SUBMIT_CMD: str | None = "qsub" + CANCEL_CMD: str | None = "qdel" + + def parse_submit_output(self, exit_code, stdout, stderr) -> SubmissionResult: + if isinstance(stdout, bytes): + stdout = stdout.decode() + if isinstance(stderr, bytes): + stderr = stderr.decode() + if exit_code != 0: + return SubmissionResult( + exit_code=exit_code, + stdout=stdout, + stderr=stderr, + status=SubmissionStatus("FAILED"), + ) + job_id = stdout.strip() + status = ( + SubmissionStatus("SUCCESSFUL") + if job_id + else SubmissionStatus("JOB_ID_UNKNOWN") + ) + return SubmissionResult( + job_id=job_id, + exit_code=exit_code, + stdout=stdout, + stderr=stderr, + status=status, + ) + + def parse_cancel_output(self, exit_code, stdout, stderr) -> CancelResult: + """Parse the output of the scancel command.""" + # Possible error messages: + # qdel: Unknown Job Id 100 + # qdel: Job has finished 1004 + # Correct execution: no output + if isinstance(stdout, bytes): + stdout = stdout.decode() + if isinstance(stderr, bytes): + stderr = stderr.decode() + if exit_code != 0: + return CancelResult( + exit_code=exit_code, + stdout=stdout, + stderr=stderr, + status=CancelStatus("FAILED"), + ) + + # PBS does not return the job id if the job is successfully deleted + status = CancelStatus("SUCCESSFUL") + return CancelResult( + job_id=None, + exit_code=exit_code, + stdout=stdout, + stderr=stderr, + status=status, + ) + + def _get_job_cmd(self, job_id: str): + cmd = f"qstat -f {job_id}" + + return cmd + + def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: + out = self.parse_jobs_list_output(exit_code, stdout, stderr) + if out: + return out[0] + return None + + def _get_jobs_list_cmd( + self, job_ids: list[str] | None = None, user: str | None = None + ) -> str: + if user and job_ids: + raise ValueError("Cannot query by user and job(s) in PBS") + + command = [ + "qstat", + "-f", + ] + + if user: + command.append(f"-u {user}") + + if job_ids: + command.append(" ".join(job_ids)) + + return " ".join(command) + + def parse_jobs_list_output(self, exit_code, stdout, stderr) -> list[QJob]: + if isinstance(stdout, bytes): + stdout = stdout.decode() + if isinstance(stderr, bytes): + stderr = stderr.decode() + + # if some jobs of the list do not exist the exit code is not zero, but + # the data for other jobs is still present. Some the exit code is ignored here + + # The error messages are included in the stderr and could be of the form: + # qstat: Unknown Job Id 10000.c2cf5fbe1102 + # qstat: 1008.c2cf5fbe1102 Job has finished, use -x or -H to + # obtain historical job information + # TODO raise if these two kinds of error are not present and exit_code != 0? + + # Split by the beginning of "Job Id:" and iterate on the different chunks. + # Matching the beginning of the line to avoid problems in case the "Job Id" + # string is present elsewhere. + jobs_chunks = re.split(r"^\s*Job Id: ", stdout, flags=re.MULTILINE) + + # regex to split the key-values pairs separated by " = " + # Explanation: + # - \s*([A-Za-z_.]+)\s+=\s+ matches the key in the key-value pair, + # allowing for leading and trailing whitespace before and after the + # equals sign, and allowing for a dot in the key. + # - ([\s\S]*?) matches the value in the key-value pair, allowing for any + # character including newlines. + # - (?=\n\s*[A-Za-z_.]+\s+=|\Z) is a positive lookahead that matches a + # newline followed by a key with optional leading and trailing + # whitespace and an equals sign or the end of the string, + # without including the lookahead match in the result. + # The key_pattern is separated in case needs to be updated. + key_pattern = r"[A-Za-z_.]+" + values_regex = re.compile( + rf"\s*({key_pattern})\s+=\s+([\s\S]*?)(?=\n\s*{key_pattern}\s+=|\Z)" + ) + + jobs_list = [] + for chunk in jobs_chunks: + chunk = chunk.strip() + if not chunk: + continue + + # first line is the id: + job_id, chunk_data = chunk.split("\n", 1) + job_id = job_id.strip() + results = values_regex.findall(chunk_data) + if not results: + continue + data = dict(results) + + qjob = QJob() + qjob.job_id = job_id + + job_state_string = data["job_state"] + + try: + pbs_job_state = PBSState(job_state_string) + except ValueError: + msg = f"Unknown job state {job_state_string} for job id {qjob.job_id}" + raise OutputParsingError(msg) + qjob.sub_state = pbs_job_state + qjob.state = pbs_job_state.qstate + + qjob.username = data["Job_Owner"] + + info = QJobInfo() + + try: + info.nodes = int(data.get("Resource_List.nodect")) + except ValueError: + info.nodes = None + + try: + info.cpus = int(data.get("Resource_List.ncpus")) + except ValueError: + info.cpus = None + + try: + info.memory_per_cpu = self._convert_memory_str( + data.get("Resource_List.mem") + ) + except OutputParsingError: + info.memory_per_cpu = None + + info.partition = data["queue"] + + # TODO here _convert_time_str can raise. If parsing errors are accepted + # handle differently + info.time_limit = self._convert_str_to_time( + data.get("Resource_List.walltime") + ) + + try: + runtime_str = data.get("resources_used.walltime") + if runtime_str: + qjob.runtime = self._convert_str_to_time(runtime_str) + except OutputParsingError: + qjob.runtime = None + + qjob.name = data.get("Job_Name") + qjob.info = info + + # I append to the list of jobs to return + jobs_list.append(qjob) + + return jobs_list + + @staticmethod + def _convert_str_to_time(time_str: str | None): + """ + Convert a string in the format used by PBS DD:HH:MM:SS to a number of seconds. + It may contain only H:M:S, only M:S or only S. + """ + + if not time_str: + return None + + time_split = time_str.split(":") + + # array containing seconds, minutes, hours and days + time = [0] * 4 + + try: + for i, v in enumerate(reversed(time_split)): + time[i] = int(v) + + except ValueError: + raise OutputParsingError() + + return time[3] * 86400 + time[2] * 3600 + time[1] * 60 + time[0] + + @staticmethod + def _convert_memory_str(memory: str | None) -> int | None: + if not memory: + return None + + match = re.match(r"([0-9]+)([a-zA-Z]*)", memory) + if not match: + raise OutputParsingError("No numbers and units parsed") + memory, units = match.groups() + + power_labels = {"kb": 0, "mb": 1, "gb": 2, "tb": 3} + + if not units: + units = "mb" + elif units not in power_labels: + raise OutputParsingError(f"Unknown units {units}") + try: + v = int(memory) + except ValueError: + raise OutputParsingError + + return v * (1024 ** power_labels[units]) + + # helper attribute to match the values defined in QResources and + # the dictionary that should be passed to the template + _qresources_mapping = { + "queue_name": "queue", + "job_name": "job_name", + "account": "account", + "priority": "priority", + "output_filepath": "qout_path", + "error_filepath": "qerr_path", + "project": "group_list", + } + + @staticmethod + def _convert_time_to_str(time: int | float | timedelta) -> str: + if not isinstance(time, timedelta): + time = timedelta(seconds=time) + + hours, remainder = divmod(int(time.total_seconds()), 3600) + minutes, seconds = divmod(remainder, 60) + + time_str = f"{hours}:{minutes}:{seconds}" + return time_str + + def _convert_qresources(self, resources: QResources) -> dict: + """ + Converts a QResources instance to a dict that will be used to fill in the + header of the submission script. + """ + + header_dict = {} + for qr_field, pbs_field in self._qresources_mapping.items(): + val = getattr(resources, qr_field) + if val is not None: + header_dict[pbs_field] = val + + if resources.njobs and resources.njobs > 1: + header_dict["array"] = f"1-{resources.njobs}" + + if resources.time_limit: + header_dict["walltime"] = self._convert_time_to_str(resources.time_limit) + + if resources.rerunnable is not None: + header_dict["rerunnable"] = "y" if resources.rerunnable else "n" + + nodes, processes, processes_per_node = resources.get_processes_distribution() + select = None + if resources.process_placement == ProcessPlacement.NO_CONSTRAINTS: + select = f"select={processes}" + if resources.threads_per_process: + select += f":ncpus={resources.threads_per_process}" + select += f":ompthreads={resources.threads_per_process}" + if resources.memory_per_thread: + threads_per_process = resources.threads_per_process or 1 + select += f":mem={threads_per_process * resources.memory_per_thread}mb" + elif resources.process_placement in ( + ProcessPlacement.EVENLY_DISTRIBUTED, + ProcessPlacement.SAME_NODE, + ProcessPlacement.SCATTERED, + ): + select = f"select={nodes}" + if resources.threads_per_process and resources.threads_per_process > 1: + cpus = resources.threads_per_process * processes_per_node + ompthreads = resources.threads_per_process + else: + cpus = processes_per_node + ompthreads = None + select += f":ncpus={cpus}" + select += f":mpiprocs={processes_per_node}" + if ompthreads: + select += f":ompthreads={ompthreads}" + if resources.memory_per_thread: + mem = cpus * resources.memory_per_thread + select += f":mem={mem}mb" + + if resources.process_placement in ( + ProcessPlacement.EVENLY_DISTRIBUTED, + ProcessPlacement.SCATTERED, + ): + header_dict["place"] = "scatter" + elif resources.process_placement == ProcessPlacement.SAME_NODE: + header_dict["place"] = "pack" + else: + msg = f"process placement {resources.process_placement} is not supported for PBS" + raise UnsupportedResourcesError(msg) + + header_dict["select"] = select + + if resources.email_address: + header_dict["mail_user"] = resources.email_address + header_dict["mail_type"] = "abe" + + if resources.scheduler_kwargs: + header_dict.update(resources.scheduler_kwargs) + + return header_dict + + @property + def supported_qresources_keys(self) -> list: + """ + List of attributes of QResources that are correctly handled by the + _convert_qresources method. It is used to validate that the user + does not pass an unsupported value, expecting to have an effect. + """ + supported = list(self._qresources_mapping.keys()) + supported += [ + "njobs", + "time_limit", + "processes", + "processes_per_node", + "process_placement", + "nodes", + "threads_per_process", + "email_address", + "scheduler_kwargs", + ] + return supported From e4a57ef0ce405de1a40fe80a904291f40f5096a2 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Fri, 2 Aug 2024 20:54:16 +0200 Subject: [PATCH 02/35] adding extremely generic code --- src/qtoolkit/io/sge.py | 420 ++++++++++++++++++++++------------------- 1 file changed, 223 insertions(+), 197 deletions(-) diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index 22f6fce..2a29e67 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -1,6 +1,8 @@ from __future__ import annotations import re +import xml.dom.minidom +import xml.parsers.expat from datetime import timedelta from qtoolkit.core.data_objects import ( @@ -18,45 +20,79 @@ from qtoolkit.core.exceptions import OutputParsingError, UnsupportedResourcesError from qtoolkit.io.base import BaseSchedulerIO -# States in PBS from qstat's man. -# B Array job: at least one subjob has started. +# 'http://www.loni.ucla.edu/twiki/bin/view/Infrastructure/GridComputing?skin=plain': +# Jobs Status: +# 'qw' - Queued and waiting, +# 'w' - Job waiting, +# 's' - Job suspended, +# 't' - Job transferring and about to start, +# 'r' - Job running, +# 'h' - Job hold, +# 'R' - Job restarted, +# 'd' - Job has been marked for deletion, +# 'Eqw' - An error occurred with the job. # -# E Job is exiting after having run. +# 'http://confluence.rcs.griffith.edu.au:8080/display/v20zCluster/ +# Sun+Grid+Engine+SGE+state+letter+symbol+codes+meanings': # -# F Job is finished. -# -# H Job is held. -# -# M Job was moved to another server. -# -# Q Job is queued. -# -# R Job is running. -# -# S Job is suspended. -# -# T Job is being moved to new location. -# -# U Cycle-harvesting job is suspended due to keyboard activity. -# -# W Job is waiting for its submitter-assigned start time to be reached. -# -# X Subjob has completed execution or has been deleted. - - -class PBSState(QSubState): - ARRAY_RUNNING = "B" - EXITING = "E" - FINISHED = "F" - HELD = "H" - MOVED = "M" - QUEUED = "Q" - RUNNING = "R" - SUSPENDED = "S" - TRANSITING = "T" - SUSPENDED_KEYBOARD = "U" - WAITING = "W" - ARRAY_FINISHED = "X" +# Category State SGE Letter Code +# Pending: pending qw +# Pending: pending, user hold qw +# Pending: pending, system hold hqw +# Pending: pending, user and system hold hqw +# Pending: pending, user hold, re-queue hRwq +# Pending: pending, system hold, re-queue hRwq +# Pending: pending, user and system hold, re-queue hRwq +# Pending: pending, user hold qw +# Pending: pending, user hold qw +# Running running r +# Running transferring t +# Running running, re-submit Rr +# Running transferring, re-submit Rt +# Suspended job suspended s, ts +# Suspended queue suspended S, tS +# Suspended queue suspended by alarm T, tT +# Suspended all suspended with re-submit Rs, Rts, RS, RtS, RT, RtT +# Error all pending states with error Eqw, Ehqw, EhRqw +# Deleted all running and suspended states with deletion dr, dt, dRr, dRt, +# ds, dS, dT, dRs, +# dRS, dRT + + +class SGEState(QSubState): + # Queue states + UNKNOWN = "u" + ALARM = "a" + SUSPEND_THRESHOLD = "A" + SUSPENDED_BY_USER_ADMIN = "s" + DISABLED_BY_USER_ADMIN = "d" + SUSPENDED_BY_CALENDAR = "C" + DISABLED_BY_CALENDAR = "D" + SUSPENDED_BY_SUBORDINATION = "S" + ERROR = "E" + + # Job states + QUEUED_WAITING = "qw" + WAITING = "w" + JOB_SUSPENDED = "s" + TRANSFERRING = "t" + RUNNING = "r" + HOLD = "h" + RESTARTED = "R" + DELETION = "d" + ERROR_PENDING = "Eqw" + ERROR_PENDING_HOLD = "Ehqw" + ERROR_PENDING_HOLD_REQUEUE = "EhRqw" + DELETION_RUNNING = "dr" + DELETION_TRANSFERRING = "dt" + DELETION_RUNNING_RESUBMIT = "dRr" + DELETION_TRANSFERRING_RESUBMIT = "dRt" + DELETION_SUSPENDED_JOB = "ds" + DELETION_SUSPENDED_QUEUE = "dS" + DELETION_SUSPENDED_ALARM = "dT" + DELETION_SUSPENDED_RESUBMIT_JOB = "dRs" + DELETION_SUSPENDED_RESUBMIT_QUEUE = "dRS" + DELETION_SUSPENDED_RESUBMIT_ALARM = "dRT" @property def qstate(self) -> QState: @@ -64,38 +100,53 @@ def qstate(self) -> QState: _STATUS_MAPPING = { - PBSState.ARRAY_RUNNING: QState.RUNNING, - PBSState.EXITING: QState.RUNNING, - PBSState.FINISHED: QState.DONE, - PBSState.HELD: QState.QUEUED_HELD, - PBSState.MOVED: QState.REQUEUED, - PBSState.QUEUED: QState.QUEUED, - PBSState.RUNNING: QState.RUNNING, - PBSState.SUSPENDED: QState.SUSPENDED, - PBSState.TRANSITING: QState.REQUEUED, - PBSState.SUSPENDED_KEYBOARD: QState.SUSPENDED, - PBSState.WAITING: QState.QUEUED, - PBSState.ARRAY_FINISHED: QState.DONE, + SGEState.QUEUED_WAITING: QState.QUEUED, + SGEState.WAITING: QState.QUEUED, + SGEState.HOLD: QState.QUEUED_HELD, + SGEState.ERROR_PENDING: QState.FAILED, + SGEState.ERROR_PENDING_HOLD: QState.FAILED, + SGEState.ERROR_PENDING_HOLD_REQUEUE: QState.FAILED, + SGEState.RUNNING: QState.RUNNING, + SGEState.TRANSFERRING: QState.RUNNING, + SGEState.RESTARTED: QState.RUNNING, + SGEState.JOB_SUSPENDED: QState.SUSPENDED, + SGEState.SUSPENDED_BY_USER_ADMIN: QState.SUSPENDED, + SGEState.SUSPENDED_BY_SUBORDINATION: QState.SUSPENDED, + SGEState.ALARM: QState.SUSPENDED, + SGEState.ERROR: QState.FAILED, + SGEState.DELETION: QState.DONE, + SGEState.DELETION_RUNNING: QState.DONE, + SGEState.DELETION_TRANSFERRING: QState.DONE, + SGEState.DELETION_RUNNING_RESUBMIT: QState.DONE, + SGEState.DELETION_TRANSFERRING_RESUBMIT: QState.DONE, + SGEState.DELETION_SUSPENDED_JOB: QState.DONE, + SGEState.DELETION_SUSPENDED_QUEUE: QState.DONE, + SGEState.DELETION_SUSPENDED_ALARM: QState.DONE, + SGEState.DELETION_SUSPENDED_RESUBMIT_JOB: QState.DONE, + SGEState.DELETION_SUSPENDED_RESUBMIT_QUEUE: QState.DONE, + SGEState.DELETION_SUSPENDED_RESUBMIT_ALARM: QState.DONE, } -class PBSIO(BaseSchedulerIO): +class SGEIO(BaseSchedulerIO): header_template: str = """ -#PBS -q $${queue} -#PBS -N $${job_name} -#PBS -A $${account} -#PBS -l $${select} -#PBS -l walltime=$${walltime} -#PBS -l model=$${model} -#PBS -l place=$${place} -#PBS -W group_list=$${group_list} -#PBS -M $${mail_user} -#PBS -m $${mail_type} -#PBS -o $${qout_path} -#PBS -e $${qerr_path} -#PBS -p $${priority} -#PBS -r $${rerunnable} -#PBS -J $${array} +#$ -cwd +#$ -q $${queue} +#$ -N $${job_name} +#$ -P $${account} +#$ -l $${select} +#$ -l h_rt=$${walltime} +#$ -l s_rt=$${soft_walltime} +#$ -pe $${model} +#$ -binding $${place} +#$ -W group_list=$${group_list} +#$ -M $${mail_user} +#$ -m $${mail_type} +#$ -o $${qout_path} +#$ -e $${qerr_path} +#$ -p $${priority} +#$ -r $${rerunnable} +#$ -t $${array} $${qverbatim}""" SUBMIT_CMD: str | None = "qsub" @@ -128,11 +179,7 @@ def parse_submit_output(self, exit_code, stdout, stderr) -> SubmissionResult: ) def parse_cancel_output(self, exit_code, stdout, stderr) -> CancelResult: - """Parse the output of the scancel command.""" - # Possible error messages: - # qdel: Unknown Job Id 100 - # qdel: Job has finished 1004 - # Correct execution: no output + """Parse the output of the qdel command.""" if isinstance(stdout, bytes): stdout = stdout.decode() if isinstance(stderr, bytes): @@ -145,7 +192,6 @@ def parse_cancel_output(self, exit_code, stdout, stderr) -> CancelResult: status=CancelStatus("FAILED"), ) - # PBS does not return the job id if the job is successfully deleted status = CancelStatus("SUCCESSFUL") return CancelResult( job_id=None, @@ -156,166 +202,155 @@ def parse_cancel_output(self, exit_code, stdout, stderr) -> CancelResult: ) def _get_job_cmd(self, job_id: str): - cmd = f"qstat -f {job_id}" - + cmd = f"qstat -j {job_id}" return cmd def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: - out = self.parse_jobs_list_output(exit_code, stdout, stderr) - if out: - return out[0] + if exit_code != 0: + raise OutputParsingError(f"Error in job output parsing: {stderr}") + if isinstance(stdout, bytes): + stdout = stdout.decode() + if isinstance(stderr, bytes): + stderr = stderr.decode() + + try: + xmldata = xml.dom.minidom.parseString(stdout) + except xml.parsers.expat.ExpatError: + raise OutputParsingError("XML parsing of stdout failed") + + job_list = xmldata.getElementsByTagName("job_list") + if not job_list: + return None + + job_element = job_list[0] + + qjob = QJob() + qjob.job_id = self._get_element_text(job_element, "JB_job_number") + job_state_string = self._get_element_text(job_element, "state") + + try: + sge_job_state = SGEState(job_state_string) + except ValueError: + raise OutputParsingError( + f"Unknown job state {job_state_string} for job id {qjob.job_id}" + ) + + qjob.sub_state = sge_job_state + qjob.state = sge_job_state.qstate + qjob.username = self._get_element_text(job_element, "JB_owner") + qjob.name = self._get_element_text(job_element, "JB_name") + + info = QJobInfo() + info.nodes = self._safe_int(self._get_element_text(job_element, "num_nodes")) + info.cpus = self._safe_int(self._get_element_text(job_element, "num_proc")) + info.memory_per_cpu = self._convert_memory_str( + self._get_element_text(job_element, "hard resource_list.mem_free") + ) + info.partition = self._get_element_text(job_element, "queue_name") + info.time_limit = self._convert_str_to_time( + self._get_element_text(job_element, "hard resource_list.h_rt") + ) + + qjob.info = info + + return qjob + + def _get_element_text(self, parent, tag_name): + elements = parent.getElementsByTagName(tag_name) + if elements: + return elements[0].childNodes[0].data.strip() return None + def _safe_int(self, value: str | None) -> int | None: + if value is None: + return None + try: + return int(value) + except ValueError: + return None + def _get_jobs_list_cmd( self, job_ids: list[str] | None = None, user: str | None = None ) -> str: - if user and job_ids: - raise ValueError("Cannot query by user and job(s) in PBS") + if job_ids: + raise UnsupportedResourcesError("Cannot query by job id in SGE") - command = [ - "qstat", - "-f", - ] + command = "qstat -ext -urg -xml " if user: - command.append(f"-u {user}") - - if job_ids: - command.append(" ".join(job_ids)) + command += f"-u {user!s}" + else: + command += "-u '*'" - return " ".join(command) + return command def parse_jobs_list_output(self, exit_code, stdout, stderr) -> list[QJob]: + if exit_code != 0: + raise OutputParsingError(f"Error in jobs list output parsing: {stderr}") if isinstance(stdout, bytes): stdout = stdout.decode() if isinstance(stderr, bytes): stderr = stderr.decode() - # if some jobs of the list do not exist the exit code is not zero, but - # the data for other jobs is still present. Some the exit code is ignored here - - # The error messages are included in the stderr and could be of the form: - # qstat: Unknown Job Id 10000.c2cf5fbe1102 - # qstat: 1008.c2cf5fbe1102 Job has finished, use -x or -H to - # obtain historical job information - # TODO raise if these two kinds of error are not present and exit_code != 0? - - # Split by the beginning of "Job Id:" and iterate on the different chunks. - # Matching the beginning of the line to avoid problems in case the "Job Id" - # string is present elsewhere. - jobs_chunks = re.split(r"^\s*Job Id: ", stdout, flags=re.MULTILINE) - - # regex to split the key-values pairs separated by " = " - # Explanation: - # - \s*([A-Za-z_.]+)\s+=\s+ matches the key in the key-value pair, - # allowing for leading and trailing whitespace before and after the - # equals sign, and allowing for a dot in the key. - # - ([\s\S]*?) matches the value in the key-value pair, allowing for any - # character including newlines. - # - (?=\n\s*[A-Za-z_.]+\s+=|\Z) is a positive lookahead that matches a - # newline followed by a key with optional leading and trailing - # whitespace and an equals sign or the end of the string, - # without including the lookahead match in the result. - # The key_pattern is separated in case needs to be updated. - key_pattern = r"[A-Za-z_.]+" - values_regex = re.compile( - rf"\s*({key_pattern})\s+=\s+([\s\S]*?)(?=\n\s*{key_pattern}\s+=|\Z)" - ) + try: + xmldata = xml.dom.minidom.parseString(stdout) + except xml.parsers.expat.ExpatError: + raise OutputParsingError("XML parsing of stdout failed") + job_elements = xmldata.getElementsByTagName("job_list") jobs_list = [] - for chunk in jobs_chunks: - chunk = chunk.strip() - if not chunk: - continue - - # first line is the id: - job_id, chunk_data = chunk.split("\n", 1) - job_id = job_id.strip() - results = values_regex.findall(chunk_data) - if not results: - continue - data = dict(results) + for job_element in job_elements: qjob = QJob() - qjob.job_id = job_id - - job_state_string = data["job_state"] + qjob.job_id = self._get_element_text(job_element, "JB_job_number") + job_state_string = self._get_element_text(job_element, "state") try: - pbs_job_state = PBSState(job_state_string) + sge_job_state = SGEState(job_state_string) except ValueError: - msg = f"Unknown job state {job_state_string} for job id {qjob.job_id}" - raise OutputParsingError(msg) - qjob.sub_state = pbs_job_state - qjob.state = pbs_job_state.qstate - - qjob.username = data["Job_Owner"] - - info = QJobInfo() - - try: - info.nodes = int(data.get("Resource_List.nodect")) - except ValueError: - info.nodes = None - - try: - info.cpus = int(data.get("Resource_List.ncpus")) - except ValueError: - info.cpus = None - - try: - info.memory_per_cpu = self._convert_memory_str( - data.get("Resource_List.mem") + raise OutputParsingError( + f"Unknown job state {job_state_string} for job id {qjob.job_id}" ) - except OutputParsingError: - info.memory_per_cpu = None - info.partition = data["queue"] + qjob.sub_state = sge_job_state + qjob.state = sge_job_state.qstate + qjob.username = self._get_element_text(job_element, "JB_owner") + qjob.name = self._get_element_text(job_element, "JB_name") - # TODO here _convert_time_str can raise. If parsing errors are accepted - # handle differently + info = QJobInfo() + info.nodes = self._safe_int( + self._get_element_text(job_element, "num_nodes") + ) + info.cpus = self._safe_int(self._get_element_text(job_element, "num_proc")) + info.memory_per_cpu = self._convert_memory_str( + self._get_element_text(job_element, "hard resource_list.mem_free") + ) + info.partition = self._get_element_text(job_element, "queue_name") info.time_limit = self._convert_str_to_time( - data.get("Resource_List.walltime") + self._get_element_text(job_element, "hard resource_list.h_rt") ) - try: - runtime_str = data.get("resources_used.walltime") - if runtime_str: - qjob.runtime = self._convert_str_to_time(runtime_str) - except OutputParsingError: - qjob.runtime = None - - qjob.name = data.get("Job_Name") qjob.info = info - # I append to the list of jobs to return jobs_list.append(qjob) return jobs_list @staticmethod def _convert_str_to_time(time_str: str | None): - """ - Convert a string in the format used by PBS DD:HH:MM:SS to a number of seconds. - It may contain only H:M:S, only M:S or only S. - """ - if not time_str: return None time_split = time_str.split(":") - - # array containing seconds, minutes, hours and days - time = [0] * 4 + time = [0] * 3 try: for i, v in enumerate(reversed(time_split)): time[i] = int(v) - except ValueError: raise OutputParsingError() - return time[3] * 86400 + time[2] * 3600 + time[1] * 60 + time[0] + return time[2] * 3600 + time[1] * 60 + time[0] @staticmethod def _convert_memory_str(memory: str | None) -> int | None: @@ -327,10 +362,10 @@ def _convert_memory_str(memory: str | None) -> int | None: raise OutputParsingError("No numbers and units parsed") memory, units = match.groups() - power_labels = {"kb": 0, "mb": 1, "gb": 2, "tb": 3} + power_labels = {"k": 0, "m": 1, "g": 2, "t": 3} if not units: - units = "mb" + units = "m" elif units not in power_labels: raise OutputParsingError(f"Unknown units {units}") try: @@ -340,8 +375,6 @@ def _convert_memory_str(memory: str | None) -> int | None: return v * (1024 ** power_labels[units]) - # helper attribute to match the values defined in QResources and - # the dictionary that should be passed to the template _qresources_mapping = { "queue_name": "queue", "job_name": "job_name", @@ -364,22 +397,20 @@ def _convert_time_to_str(time: int | float | timedelta) -> str: return time_str def _convert_qresources(self, resources: QResources) -> dict: - """ - Converts a QResources instance to a dict that will be used to fill in the - header of the submission script. - """ - header_dict = {} - for qr_field, pbs_field in self._qresources_mapping.items(): + for qr_field, sge_field in self._qresources_mapping.items(): val = getattr(resources, qr_field) if val is not None: - header_dict[pbs_field] = val + header_dict[sge_field] = val if resources.njobs and resources.njobs > 1: header_dict["array"] = f"1-{resources.njobs}" if resources.time_limit: header_dict["walltime"] = self._convert_time_to_str(resources.time_limit) + header_dict["soft_walltime"] = self._convert_time_to_str( + resources.time_limit * 0.9 + ) if resources.rerunnable is not None: header_dict["rerunnable"] = "y" if resources.rerunnable else "n" @@ -422,7 +453,7 @@ def _convert_qresources(self, resources: QResources) -> dict: elif resources.process_placement == ProcessPlacement.SAME_NODE: header_dict["place"] = "pack" else: - msg = f"process placement {resources.process_placement} is not supported for PBS" + msg = f"process placement {resources.process_placement} is not supported for SGE" raise UnsupportedResourcesError(msg) header_dict["select"] = select @@ -438,11 +469,6 @@ def _convert_qresources(self, resources: QResources) -> dict: @property def supported_qresources_keys(self) -> list: - """ - List of attributes of QResources that are correctly handled by the - _convert_qresources method. It is used to validate that the user - does not pass an unsupported value, expecting to have an effect. - """ supported = list(self._qresources_mapping.keys()) supported += [ "njobs", From f602de7569977fe358696501ba887ecdec4610de Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Fri, 2 Aug 2024 21:02:39 +0200 Subject: [PATCH 03/35] adding SGEIO and SGEState to init --- src/qtoolkit/io/__init__.py | 3 +- tests/io/test_sge.py | 272 ++++++++++++++++++++++++++++++++++++ 2 files changed, 274 insertions(+), 1 deletion(-) create mode 100644 tests/io/test_sge.py diff --git a/src/qtoolkit/io/__init__.py b/src/qtoolkit/io/__init__.py index cc9080d..bbfbb49 100644 --- a/src/qtoolkit/io/__init__.py +++ b/src/qtoolkit/io/__init__.py @@ -1,6 +1,7 @@ from qtoolkit.io.base import BaseSchedulerIO from qtoolkit.io.pbs import PBSIO, PBSState +from qtoolkit.io.sge import SGEIO, SGEState from qtoolkit.io.shell import ShellIO, ShellState from qtoolkit.io.slurm import SlurmIO, SlurmState -scheduler_mapping = {"slurm": SlurmIO, "pbs": PBSIO, "shell": ShellIO} +scheduler_mapping = {"slurm": SlurmIO, "pbs": PBSIO, "sge": SGEIO, "shell": ShellIO} diff --git a/tests/io/test_sge.py b/tests/io/test_sge.py new file mode 100644 index 0000000..f3ffe00 --- /dev/null +++ b/tests/io/test_sge.py @@ -0,0 +1,272 @@ +from datetime import timedelta +from pathlib import Path + +import pytest +from monty.serialization import loadfn + +from qtoolkit.core.data_objects import ProcessPlacement, QResources, QState +from qtoolkit.core.exceptions import OutputParsingError, UnsupportedResourcesError +from qtoolkit.io.slurm import SlurmIO, SlurmState + +TEST_DIR = Path(__file__).resolve().parents[1] / "test_data" +submit_ref_file = TEST_DIR / "io" / "slurm" / "parse_submit_output_inout.yaml" +in_out_submit_ref_list = loadfn(submit_ref_file) +cancel_ref_file = TEST_DIR / "io" / "slurm" / "parse_cancel_output_inout.yaml" +in_out_cancel_ref_list = loadfn(cancel_ref_file) +job_ref_file = TEST_DIR / "io" / "slurm" / "parse_job_output_inout.yaml" +in_out_job_ref_list = loadfn(job_ref_file) + + +@pytest.fixture(scope="module") +def slurm_io(): + return SlurmIO() + + +class TestSlurmState: + @pytest.mark.parametrize("slurm_state", [s for s in SlurmState]) + def test_qstate(self, slurm_state): + assert isinstance(slurm_state.qstate, QState) + assert SlurmState("CA") == SlurmState.CANCELLED + assert SlurmState("CG") == SlurmState.COMPLETING + assert SlurmState("CD") == SlurmState.COMPLETED + assert SlurmState("CF") == SlurmState.CONFIGURING + assert SlurmState("DL") == SlurmState.DEADLINE + assert SlurmState("F") == SlurmState.FAILED + assert SlurmState("OOM") == SlurmState.OUT_OF_MEMORY + assert SlurmState("PD") == SlurmState.PENDING + assert SlurmState("R") == SlurmState.RUNNING + assert SlurmState("S") == SlurmState.SUSPENDED + assert SlurmState("TO") == SlurmState.TIMEOUT + + +class TestSlurmIO: + @pytest.mark.parametrize("in_out_ref", in_out_submit_ref_list) + def test_parse_submit_output(self, slurm_io, in_out_ref, test_utils): + parse_cmd_output, sr_ref = test_utils.inkwargs_outref( + in_out_ref, inkey="parse_submit_kwargs", outkey="submission_result_ref" + ) + sr = slurm_io.parse_submit_output(**parse_cmd_output) + assert sr == sr_ref + sr = slurm_io.parse_submit_output( + exit_code=parse_cmd_output["exit_code"], + stdout=bytes(parse_cmd_output["stdout"], "utf-8"), + stderr=bytes(parse_cmd_output["stderr"], "utf-8"), + ) + assert sr == sr_ref + sr = slurm_io.parse_submit_output( + exit_code=parse_cmd_output["exit_code"], + stdout=bytes(parse_cmd_output["stdout"], "ascii"), + stderr=bytes(parse_cmd_output["stderr"], "ascii"), + ) + assert sr == sr_ref + + @pytest.mark.parametrize("in_out_ref", in_out_cancel_ref_list) + def test_parse_cancel_output(self, slurm_io, in_out_ref, test_utils): + parse_cmd_output, cr_ref = test_utils.inkwargs_outref( + in_out_ref, inkey="parse_cancel_kwargs", outkey="cancel_result_ref" + ) + cr = slurm_io.parse_cancel_output(**parse_cmd_output) + assert cr == cr_ref + cr = slurm_io.parse_cancel_output( + exit_code=parse_cmd_output["exit_code"], + stdout=bytes(parse_cmd_output["stdout"], "utf-8"), + stderr=bytes(parse_cmd_output["stderr"], "utf-8"), + ) + assert cr == cr_ref + cr = slurm_io.parse_cancel_output( + exit_code=parse_cmd_output["exit_code"], + stdout=bytes(parse_cmd_output["stdout"], "ascii"), + stderr=bytes(parse_cmd_output["stderr"], "ascii"), + ) + assert cr == cr_ref + + @pytest.mark.parametrize("in_out_ref", in_out_job_ref_list) + def test_parse_job_output(self, slurm_io, in_out_ref, test_utils): + parse_cmd_output, job_ref = test_utils.inkwargs_outref( + in_out_ref, inkey="parse_job_kwargs", outkey="job_ref" + ) + job = slurm_io.parse_job_output(**parse_cmd_output) + assert job == job_ref + job = slurm_io.parse_job_output( + exit_code=parse_cmd_output["exit_code"], + stdout=bytes(parse_cmd_output["stdout"], "utf-8"), + stderr=bytes(parse_cmd_output["stderr"], "utf-8"), + ) + assert job == job_ref + job = slurm_io.parse_job_output( + exit_code=parse_cmd_output["exit_code"], + stdout=bytes(parse_cmd_output["stdout"], "ascii"), + stderr=bytes(parse_cmd_output["stderr"], "ascii"), + ) + assert job == job_ref + + def test_get_job_cmd(self, slurm_io): + cmd = slurm_io._get_job_cmd(3) + assert cmd == "SLURM_TIME_FORMAT='standard' scontrol show job -o 3" + cmd = slurm_io._get_job_cmd("56") + assert cmd == "SLURM_TIME_FORMAT='standard' scontrol show job -o 56" + + def test_get_jobs_list_cmd(self, slurm_io): + with pytest.raises( + ValueError, match=r"Cannot query by user and job\(s\) in SLURM" + ): + slurm_io._get_jobs_list_cmd(job_ids=["1"], user="johndoe") + cmd = slurm_io._get_jobs_list_cmd(user="johndoe") + assert cmd == ( + "SLURM_TIME_FORMAT='standard' " + "squeue --noheader -o '%i<><> %t<><> %r<><> " + "%j<><> %u<><> %P<><> %l<><> %D<><> %C<><> " + "%M<><> %m' -u johndoe" + ) + cmd = slurm_io._get_jobs_list_cmd(job_ids=["1", "3", "56", "15"]) + assert cmd == ( + "SLURM_TIME_FORMAT='standard' " + "squeue --noheader -o '%i<><> %t<><> %r<><> " + "%j<><> %u<><> %P<><> %l<><> %D<><> %C<><> " + "%M<><> %m' --jobs=1,3,56,15" + ) + cmd = slurm_io._get_jobs_list_cmd(job_ids=["1"]) + assert cmd == ( + "SLURM_TIME_FORMAT='standard' " + "squeue --noheader -o '%i<><> %t<><> %r<><> " + "%j<><> %u<><> %P<><> %l<><> %D<><> %C<><> " + "%M<><> %m' --jobs=1,1" + ) + + def test_convert_str_to_time(self, slurm_io): + time_seconds = slurm_io._convert_str_to_time(None) + assert time_seconds is None + time_seconds = slurm_io._convert_str_to_time("UNLIMITED") + assert time_seconds is None + time_seconds = slurm_io._convert_str_to_time("NOT_SET") + assert time_seconds is None + + time_seconds = slurm_io._convert_str_to_time("3-10:51:13") + assert time_seconds == 298273 + time_seconds = slurm_io._convert_str_to_time("2:10:02") + assert time_seconds == 7802 + time_seconds = slurm_io._convert_str_to_time("10:02") + assert time_seconds == 602 + time_seconds = slurm_io._convert_str_to_time("45") + assert time_seconds == 2700 + + with pytest.raises(OutputParsingError): + slurm_io._convert_str_to_time("2:10:02:10") + + with pytest.raises(OutputParsingError): + slurm_io._convert_str_to_time("2:10:a") + + def test_convert_memory_str(self, slurm_io): + memory_kb = slurm_io._convert_memory_str(None) + assert memory_kb is None + memory_kb = slurm_io._convert_memory_str("") + assert memory_kb is None + + memory_kb = slurm_io._convert_memory_str("12M") + assert memory_kb == 12288 + memory_kb = slurm_io._convert_memory_str("13K") + assert memory_kb == 13 + memory_kb = slurm_io._convert_memory_str("5G") + assert memory_kb == 5242880 + memory_kb = slurm_io._convert_memory_str("1T") + assert memory_kb == 1073741824 + + with pytest.raises(OutputParsingError): + slurm_io._convert_memory_str("aT") + + def test_convert_time_to_str(self, slurm_io): + time_str = slurm_io._convert_time_to_str(10) + assert time_str == "0-0:0:10" + time_str = slurm_io._convert_time_to_str(298273) + assert time_str == "3-10:51:13" + time_str = slurm_io._convert_time_to_str(7802) + assert time_str == "0-2:10:2" + time_str = slurm_io._convert_time_to_str(602) + assert time_str == "0-0:10:2" + + time_str = slurm_io._convert_time_to_str(timedelta(seconds=298273)) + assert time_str == "3-10:51:13" + time_str = slurm_io._convert_time_to_str( + timedelta(days=15, hours=21, minutes=19, seconds=32) + ) + assert time_str == "15-21:19:32" + + # test float + time_str = slurm_io._convert_time_to_str(602.0) + assert time_str == "0-0:10:2" + + # test negative + # negative time makes no sense and should not be passed. this test is just to be alerted + # if the output for negative numbers changes + time_str = slurm_io._convert_time_to_str(-10) + assert time_str == "-1-23:59:50" + + def test_check_convert_qresources(self, slurm_io): + res = QResources( + queue_name="myqueue", + job_name="myjob", + memory_per_thread=2048, + account="myaccount", + qos="myqos", + output_filepath="someoutputpath", + error_filepath="someerrorpath", + njobs=4, + time_limit=298273, + process_placement=ProcessPlacement.EVENLY_DISTRIBUTED, + nodes=4, + processes_per_node=3, + threads_per_process=2, + gpus_per_job=4, + email_address="john.doe@submit.qtk", + scheduler_kwargs={"tata": "toto", "titi": "tutu"}, + ) + header_dict = slurm_io.check_convert_qresources(resources=res) + assert header_dict == { + "partition": "myqueue", + "job_name": "myjob", + "mem-per-cpu": 2048, + "account": "myaccount", + "qos": "myqos", + "qout_path": "someoutputpath", + "qerr_path": "someerrorpath", + "array": "1-4", + "time": "3-10:51:13", + "ntasks_per_node": 3, + "nodes": 4, + "cpus_per_task": 2, + "gres": "gpu:4", + "mail_user": "john.doe@submit.qtk", + "mail_type": "ALL", + "tata": "toto", + "titi": "tutu", + } + + res = QResources( + time_limit=298273, + processes=24, + ) + header_dict = slurm_io.check_convert_qresources(resources=res) + assert header_dict == { + "time": "3-10:51:13", + "ntasks": 24, + } + + res = QResources( + njobs=1, + processes=24, + gpus_per_job=4, + ) + header_dict = slurm_io.check_convert_qresources(resources=res) + assert header_dict == { + "ntasks": 24, + "gres": "gpu:4", + } + + res = QResources( + processes=5, + rerunnable=True, + ) + with pytest.raises( + UnsupportedResourcesError, match=r"Keys not supported: rerunnable" + ): + slurm_io.check_convert_qresources(res) From a119af5f8e2fec2eeb6526d62d9b4fd1089eb5e3 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Fri, 2 Aug 2024 22:28:36 +0200 Subject: [PATCH 04/35] add very generic tests based on the slurm tests (not all do pass) --- src/qtoolkit/io/sge.py | 36 ++- tests/io/test_sge.py | 224 ++++++++---------- .../sge/create_parse_cancel_output_inout.py | 102 ++++++++ .../io/sge/create_parse_job_output_inout.py | 87 +++++++ .../sge/create_parse_submit_output_inout.py | 92 +++++++ .../io/sge/parse_cancel_output_inout.yaml | 12 + .../io/sge/parse_job_output_inout.yaml | 6 + .../io/sge/parse_submit_output_inout.yaml | 18 ++ 8 files changed, 439 insertions(+), 138 deletions(-) create mode 100644 tests/test_data/io/sge/create_parse_cancel_output_inout.py create mode 100644 tests/test_data/io/sge/create_parse_job_output_inout.py create mode 100644 tests/test_data/io/sge/create_parse_submit_output_inout.py create mode 100644 tests/test_data/io/sge/parse_cancel_output_inout.yaml create mode 100644 tests/test_data/io/sge/parse_job_output_inout.yaml create mode 100644 tests/test_data/io/sge/parse_submit_output_inout.yaml diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index 2a29e67..0ba1987 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -61,13 +61,9 @@ class SGEState(QSubState): # Queue states - UNKNOWN = "u" ALARM = "a" - SUSPEND_THRESHOLD = "A" SUSPENDED_BY_USER_ADMIN = "s" DISABLED_BY_USER_ADMIN = "d" - SUSPENDED_BY_CALENDAR = "C" - DISABLED_BY_CALENDAR = "D" SUSPENDED_BY_SUBORDINATION = "S" ERROR = "E" @@ -77,7 +73,7 @@ class SGEState(QSubState): JOB_SUSPENDED = "s" TRANSFERRING = "t" RUNNING = "r" - HOLD = "h" + HOLD = "hqw" RESTARTED = "R" DELETION = "d" ERROR_PENDING = "Eqw" @@ -164,7 +160,10 @@ def parse_submit_output(self, exit_code, stdout, stderr) -> SubmissionResult: stderr=stderr, status=SubmissionStatus("FAILED"), ) - job_id = stdout.strip() + match = re.search(r'Your job (\d+) \(".*?"\) has been submitted', stdout) + if not match: + raise OutputParsingError("Failed to parse job ID from stdout") + job_id = match.group(1) status = ( SubmissionStatus("SUCCESSFUL") if job_id @@ -191,10 +190,13 @@ def parse_cancel_output(self, exit_code, stdout, stderr) -> CancelResult: stderr=stderr, status=CancelStatus("FAILED"), ) - + match = re.search(r"qdel: job (\d+) deleted", stderr) + if not match: + raise OutputParsingError("Failed to parse job ID from stdout") + job_id = match.group(1) status = CancelStatus("SUCCESSFUL") return CancelResult( - job_id=None, + job_id=job_id, exit_code=exit_code, stdout=stdout, stderr=stderr, @@ -205,7 +207,7 @@ def _get_job_cmd(self, job_id: str): cmd = f"qstat -j {job_id}" return cmd - def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: + def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida style if exit_code != 0: raise OutputParsingError(f"Error in job output parsing: {stderr}") if isinstance(stdout, bytes): @@ -338,19 +340,27 @@ def parse_jobs_list_output(self, exit_code, stdout, stderr) -> list[QJob]: @staticmethod def _convert_str_to_time(time_str: str | None): + """ + Convert a string in the format used by SGE DD:HH:MM:SS to a number of seconds. + It may contain only H:M:S, only M:S or only S. + """ + if not time_str: return None time_split = time_str.split(":") - time = [0] * 3 + + # array containing seconds, minutes, hours and days + time = [0] * 4 try: for i, v in enumerate(reversed(time_split)): time[i] = int(v) + except ValueError: raise OutputParsingError() - return time[2] * 3600 + time[1] * 60 + time[0] + return time[3] * 86400 + time[2] * 3600 + time[1] * 60 + time[0] @staticmethod def _convert_memory_str(memory: str | None) -> int | None: @@ -362,10 +372,10 @@ def _convert_memory_str(memory: str | None) -> int | None: raise OutputParsingError("No numbers and units parsed") memory, units = match.groups() - power_labels = {"k": 0, "m": 1, "g": 2, "t": 3} + power_labels = {"kb": 0, "mb": 1, "gb": 2, "tb": 3} if not units: - units = "m" + units = "mb" elif units not in power_labels: raise OutputParsingError(f"Unknown units {units}") try: diff --git a/tests/io/test_sge.py b/tests/io/test_sge.py index f3ffe00..3285a35 100644 --- a/tests/io/test_sge.py +++ b/tests/io/test_sge.py @@ -6,54 +6,47 @@ from qtoolkit.core.data_objects import ProcessPlacement, QResources, QState from qtoolkit.core.exceptions import OutputParsingError, UnsupportedResourcesError -from qtoolkit.io.slurm import SlurmIO, SlurmState +from qtoolkit.io.sge import SGEIO, SGEState TEST_DIR = Path(__file__).resolve().parents[1] / "test_data" -submit_ref_file = TEST_DIR / "io" / "slurm" / "parse_submit_output_inout.yaml" +submit_ref_file = TEST_DIR / "io" / "sge" / "parse_submit_output_inout.yaml" in_out_submit_ref_list = loadfn(submit_ref_file) -cancel_ref_file = TEST_DIR / "io" / "slurm" / "parse_cancel_output_inout.yaml" +cancel_ref_file = TEST_DIR / "io" / "sge" / "parse_cancel_output_inout.yaml" in_out_cancel_ref_list = loadfn(cancel_ref_file) -job_ref_file = TEST_DIR / "io" / "slurm" / "parse_job_output_inout.yaml" +job_ref_file = TEST_DIR / "io" / "sge" / "parse_job_output_inout.yaml" in_out_job_ref_list = loadfn(job_ref_file) @pytest.fixture(scope="module") -def slurm_io(): - return SlurmIO() +def sge_io(): + return SGEIO() -class TestSlurmState: - @pytest.mark.parametrize("slurm_state", [s for s in SlurmState]) - def test_qstate(self, slurm_state): - assert isinstance(slurm_state.qstate, QState) - assert SlurmState("CA") == SlurmState.CANCELLED - assert SlurmState("CG") == SlurmState.COMPLETING - assert SlurmState("CD") == SlurmState.COMPLETED - assert SlurmState("CF") == SlurmState.CONFIGURING - assert SlurmState("DL") == SlurmState.DEADLINE - assert SlurmState("F") == SlurmState.FAILED - assert SlurmState("OOM") == SlurmState.OUT_OF_MEMORY - assert SlurmState("PD") == SlurmState.PENDING - assert SlurmState("R") == SlurmState.RUNNING - assert SlurmState("S") == SlurmState.SUSPENDED - assert SlurmState("TO") == SlurmState.TIMEOUT +class TestSGEState: + @pytest.mark.parametrize("sge_state", [s for s in SGEState]) + def test_qstate(self, sge_state): + assert isinstance(sge_state.qstate, QState) + assert SGEState("hqw") == SGEState.HOLD + assert SGEState("r") == SGEState.RUNNING + assert SGEState("Eqw") == SGEState.ERROR_PENDING + assert SGEState("dr") == SGEState.DELETION_RUNNING -class TestSlurmIO: +class TestSGEIO: @pytest.mark.parametrize("in_out_ref", in_out_submit_ref_list) - def test_parse_submit_output(self, slurm_io, in_out_ref, test_utils): + def test_parse_submit_output(self, sge_io, in_out_ref, test_utils): parse_cmd_output, sr_ref = test_utils.inkwargs_outref( in_out_ref, inkey="parse_submit_kwargs", outkey="submission_result_ref" ) - sr = slurm_io.parse_submit_output(**parse_cmd_output) + sr = sge_io.parse_submit_output(**parse_cmd_output) assert sr == sr_ref - sr = slurm_io.parse_submit_output( + sr = sge_io.parse_submit_output( exit_code=parse_cmd_output["exit_code"], stdout=bytes(parse_cmd_output["stdout"], "utf-8"), stderr=bytes(parse_cmd_output["stderr"], "utf-8"), ) assert sr == sr_ref - sr = slurm_io.parse_submit_output( + sr = sge_io.parse_submit_output( exit_code=parse_cmd_output["exit_code"], stdout=bytes(parse_cmd_output["stdout"], "ascii"), stderr=bytes(parse_cmd_output["stderr"], "ascii"), @@ -61,19 +54,19 @@ def test_parse_submit_output(self, slurm_io, in_out_ref, test_utils): assert sr == sr_ref @pytest.mark.parametrize("in_out_ref", in_out_cancel_ref_list) - def test_parse_cancel_output(self, slurm_io, in_out_ref, test_utils): + def test_parse_cancel_output(self, sge_io, in_out_ref, test_utils): parse_cmd_output, cr_ref = test_utils.inkwargs_outref( in_out_ref, inkey="parse_cancel_kwargs", outkey="cancel_result_ref" ) - cr = slurm_io.parse_cancel_output(**parse_cmd_output) + cr = sge_io.parse_cancel_output(**parse_cmd_output) assert cr == cr_ref - cr = slurm_io.parse_cancel_output( + cr = sge_io.parse_cancel_output( exit_code=parse_cmd_output["exit_code"], stdout=bytes(parse_cmd_output["stdout"], "utf-8"), stderr=bytes(parse_cmd_output["stderr"], "utf-8"), ) assert cr == cr_ref - cr = slurm_io.parse_cancel_output( + cr = sge_io.parse_cancel_output( exit_code=parse_cmd_output["exit_code"], stdout=bytes(parse_cmd_output["stdout"], "ascii"), stderr=bytes(parse_cmd_output["stderr"], "ascii"), @@ -81,174 +74,155 @@ def test_parse_cancel_output(self, slurm_io, in_out_ref, test_utils): assert cr == cr_ref @pytest.mark.parametrize("in_out_ref", in_out_job_ref_list) - def test_parse_job_output(self, slurm_io, in_out_ref, test_utils): + def test_parse_job_output(self, sge_io, in_out_ref, test_utils): parse_cmd_output, job_ref = test_utils.inkwargs_outref( in_out_ref, inkey="parse_job_kwargs", outkey="job_ref" ) - job = slurm_io.parse_job_output(**parse_cmd_output) + if "stderr" not in parse_cmd_output: + parse_cmd_output["stderr"] = "" + job = sge_io.parse_job_output(**parse_cmd_output) assert job == job_ref - job = slurm_io.parse_job_output( + job = sge_io.parse_job_output( exit_code=parse_cmd_output["exit_code"], stdout=bytes(parse_cmd_output["stdout"], "utf-8"), stderr=bytes(parse_cmd_output["stderr"], "utf-8"), ) assert job == job_ref - job = slurm_io.parse_job_output( + job = sge_io.parse_job_output( exit_code=parse_cmd_output["exit_code"], stdout=bytes(parse_cmd_output["stdout"], "ascii"), stderr=bytes(parse_cmd_output["stderr"], "ascii"), ) assert job == job_ref - def test_get_job_cmd(self, slurm_io): - cmd = slurm_io._get_job_cmd(3) - assert cmd == "SLURM_TIME_FORMAT='standard' scontrol show job -o 3" - cmd = slurm_io._get_job_cmd("56") - assert cmd == "SLURM_TIME_FORMAT='standard' scontrol show job -o 56" + def test_get_job_cmd(self, sge_io): + cmd = sge_io._get_job_cmd(3) + assert cmd == "qstat -j 3" + cmd = sge_io._get_job_cmd("56") + assert cmd == "qstat -j 56" - def test_get_jobs_list_cmd(self, slurm_io): + def test_get_jobs_list_cmd(self, sge_io): with pytest.raises( - ValueError, match=r"Cannot query by user and job\(s\) in SLURM" + UnsupportedResourcesError, match=r"Cannot query by job id in SGE" ): - slurm_io._get_jobs_list_cmd(job_ids=["1"], user="johndoe") - cmd = slurm_io._get_jobs_list_cmd(user="johndoe") - assert cmd == ( - "SLURM_TIME_FORMAT='standard' " - "squeue --noheader -o '%i<><> %t<><> %r<><> " - "%j<><> %u<><> %P<><> %l<><> %D<><> %C<><> " - "%M<><> %m' -u johndoe" - ) - cmd = slurm_io._get_jobs_list_cmd(job_ids=["1", "3", "56", "15"]) - assert cmd == ( - "SLURM_TIME_FORMAT='standard' " - "squeue --noheader -o '%i<><> %t<><> %r<><> " - "%j<><> %u<><> %P<><> %l<><> %D<><> %C<><> " - "%M<><> %m' --jobs=1,3,56,15" - ) - cmd = slurm_io._get_jobs_list_cmd(job_ids=["1"]) - assert cmd == ( - "SLURM_TIME_FORMAT='standard' " - "squeue --noheader -o '%i<><> %t<><> %r<><> " - "%j<><> %u<><> %P<><> %l<><> %D<><> %C<><> " - "%M<><> %m' --jobs=1,1" - ) - - def test_convert_str_to_time(self, slurm_io): - time_seconds = slurm_io._convert_str_to_time(None) - assert time_seconds is None - time_seconds = slurm_io._convert_str_to_time("UNLIMITED") - assert time_seconds is None - time_seconds = slurm_io._convert_str_to_time("NOT_SET") + sge_io._get_jobs_list_cmd(job_ids=["1"], user="johndoe") + cmd = sge_io._get_jobs_list_cmd(user="johndoe") + assert cmd == ("qstat -ext -urg -xml -u johndoe") + cmd = sge_io._get_jobs_list_cmd(job_ids=["1", "3", "56", "15"]) + assert cmd == ("qstat -ext -urg -xml -u '*'") + cmd = sge_io._get_jobs_list_cmd(job_ids=["1"]) + assert cmd == ("qstat -ext -urg -xml -u '*'") + + def test_convert_str_to_time(self, sge_io): + time_seconds = sge_io._convert_str_to_time(None) assert time_seconds is None - time_seconds = slurm_io._convert_str_to_time("3-10:51:13") - assert time_seconds == 298273 - time_seconds = slurm_io._convert_str_to_time("2:10:02") + time_seconds = sge_io._convert_str_to_time("10:51:13") + assert time_seconds == 39073 + time_seconds = sge_io._convert_str_to_time("02:10:02") assert time_seconds == 7802 - time_seconds = slurm_io._convert_str_to_time("10:02") + time_seconds = sge_io._convert_str_to_time("10:02") assert time_seconds == 602 - time_seconds = slurm_io._convert_str_to_time("45") + time_seconds = sge_io._convert_str_to_time("45") assert time_seconds == 2700 with pytest.raises(OutputParsingError): - slurm_io._convert_str_to_time("2:10:02:10") + sge_io._convert_str_to_time("2:10:02:10") with pytest.raises(OutputParsingError): - slurm_io._convert_str_to_time("2:10:a") + sge_io._convert_str_to_time("2:10:a") - def test_convert_memory_str(self, slurm_io): - memory_kb = slurm_io._convert_memory_str(None) + def test_convert_memory_str(self, sge_io): + memory_kb = sge_io._convert_memory_str(None) assert memory_kb is None - memory_kb = slurm_io._convert_memory_str("") + memory_kb = sge_io._convert_memory_str("") assert memory_kb is None - memory_kb = slurm_io._convert_memory_str("12M") + memory_kb = sge_io._convert_memory_str("12M") assert memory_kb == 12288 - memory_kb = slurm_io._convert_memory_str("13K") + memory_kb = sge_io._convert_memory_str("13K") assert memory_kb == 13 - memory_kb = slurm_io._convert_memory_str("5G") + memory_kb = sge_io._convert_memory_str("5G") assert memory_kb == 5242880 - memory_kb = slurm_io._convert_memory_str("1T") + memory_kb = sge_io._convert_memory_str("1T") assert memory_kb == 1073741824 with pytest.raises(OutputParsingError): - slurm_io._convert_memory_str("aT") + sge_io._convert_memory_str("aT") - def test_convert_time_to_str(self, slurm_io): - time_str = slurm_io._convert_time_to_str(10) - assert time_str == "0-0:0:10" - time_str = slurm_io._convert_time_to_str(298273) - assert time_str == "3-10:51:13" - time_str = slurm_io._convert_time_to_str(7802) - assert time_str == "0-2:10:2" - time_str = slurm_io._convert_time_to_str(602) - assert time_str == "0-0:10:2" + def test_convert_time_to_str(self, sge_io): + time_str = sge_io._convert_time_to_str(10) + assert time_str == "0:0:10" + time_str = sge_io._convert_time_to_str(39073) + assert time_str == "10:51:13" + time_str = sge_io._convert_time_to_str(7802) + assert time_str == "2:10:2" + time_str = sge_io._convert_time_to_str(602) + assert time_str == "0:10:2" - time_str = slurm_io._convert_time_to_str(timedelta(seconds=298273)) - assert time_str == "3-10:51:13" - time_str = slurm_io._convert_time_to_str( - timedelta(days=15, hours=21, minutes=19, seconds=32) + time_str = sge_io._convert_time_to_str(timedelta(seconds=39073)) + assert time_str == "10:51:13" + time_str = sge_io._convert_time_to_str( + timedelta(hours=15, minutes=19, seconds=32) ) - assert time_str == "15-21:19:32" + assert time_str == "15:19:32" # test float - time_str = slurm_io._convert_time_to_str(602.0) - assert time_str == "0-0:10:2" + time_str = sge_io._convert_time_to_str(602.0) + assert time_str == "0:10:2" # test negative # negative time makes no sense and should not be passed. this test is just to be alerted # if the output for negative numbers changes - time_str = slurm_io._convert_time_to_str(-10) - assert time_str == "-1-23:59:50" + time_str = sge_io._convert_time_to_str(-10) + assert time_str == "-1:59:50" - def test_check_convert_qresources(self, slurm_io): + def test_check_convert_qresources(self, sge_io): res = QResources( queue_name="myqueue", job_name="myjob", memory_per_thread=2048, account="myaccount", - qos="myqos", + priority=1, output_filepath="someoutputpath", error_filepath="someerrorpath", njobs=4, - time_limit=298273, + time_limit=39073, process_placement=ProcessPlacement.EVENLY_DISTRIBUTED, nodes=4, processes_per_node=3, threads_per_process=2, - gpus_per_job=4, email_address="john.doe@submit.qtk", scheduler_kwargs={"tata": "toto", "titi": "tutu"}, ) - header_dict = slurm_io.check_convert_qresources(resources=res) + header_dict = sge_io.check_convert_qresources(resources=res) assert header_dict == { - "partition": "myqueue", + "queue": "myqueue", "job_name": "myjob", - "mem-per-cpu": 2048, + "mem": 2048, "account": "myaccount", - "qos": "myqos", + "priority": 1, "qout_path": "someoutputpath", "qerr_path": "someerrorpath", "array": "1-4", - "time": "3-10:51:13", - "ntasks_per_node": 3, - "nodes": 4, - "cpus_per_task": 2, - "gres": "gpu:4", + "walltime": "10:51:13", + "soft_walltime": "9:46:1", + "select": "select=4:ncpus=6:ompthreads=2:mpiprocs=3:mem=24576mb", "mail_user": "john.doe@submit.qtk", - "mail_type": "ALL", + "mail_type": "abe", "tata": "toto", "titi": "tutu", } res = QResources( - time_limit=298273, + time_limit=39073, processes=24, ) - header_dict = slurm_io.check_convert_qresources(resources=res) + header_dict = sge_io.check_convert_qresources(resources=res) assert header_dict == { - "time": "3-10:51:13", - "ntasks": 24, + "walltime": "10:51:13", + "soft_walltime": "9:46:1", + "select": "select=24:ncpus=1:ompthreads=1:mem=24mb", } res = QResources( @@ -256,10 +230,10 @@ def test_check_convert_qresources(self, slurm_io): processes=24, gpus_per_job=4, ) - header_dict = slurm_io.check_convert_qresources(resources=res) + header_dict = sge_io.check_convert_qresources(resources=res) assert header_dict == { - "ntasks": 24, - "gres": "gpu:4", + "select": "select=24:ncpus=1:ompthreads=1:mem=24mb", + "array": "1-1", } res = QResources( @@ -269,4 +243,4 @@ def test_check_convert_qresources(self, slurm_io): with pytest.raises( UnsupportedResourcesError, match=r"Keys not supported: rerunnable" ): - slurm_io.check_convert_qresources(res) + sge_io.check_convert_qresources(res) diff --git a/tests/test_data/io/sge/create_parse_cancel_output_inout.py b/tests/test_data/io/sge/create_parse_cancel_output_inout.py new file mode 100644 index 0000000..8548471 --- /dev/null +++ b/tests/test_data/io/sge/create_parse_cancel_output_inout.py @@ -0,0 +1,102 @@ +import json + +import yaml + +from qtoolkit.io.sge import SGEIO + +sge_io = SGEIO() + +mylist = [] + +# First case: successful termination +return_code = 0 +stdout = b"" +stderr = b"qdel: job 267 deleted\n" + +cr = sge_io.parse_cancel_output(exit_code=return_code, stdout=stdout, stderr=stderr) + +a = { + "parse_cancel_kwargs": json.dumps( + {"exit_code": return_code, "stdout": stdout.decode(), "stderr": stderr.decode()} + ), + "cancel_result_ref": json.dumps(cr.as_dict()), +} +mylist.append(a) + +# Second case: no job identification provided +return_code = 1 +stdout = b"" +stderr = b"qdel: No job id specified\n" + +cr = sge_io.parse_cancel_output(exit_code=return_code, stdout=stdout, stderr=stderr) + +a = { + "parse_cancel_kwargs": json.dumps( + {"exit_code": return_code, "stdout": stdout.decode(), "stderr": stderr.decode()} + ), + "cancel_result_ref": json.dumps(cr.as_dict()), +} +mylist.append(a) + +# Third case: access/permission denied +return_code = 210 +stdout = b"" +stderr = b"qdel: job 1 access denied\n" + +cr = sge_io.parse_cancel_output(exit_code=return_code, stdout=stdout, stderr=stderr) + +a = { + "parse_cancel_kwargs": json.dumps( + {"exit_code": return_code, "stdout": stdout.decode(), "stderr": stderr.decode()} + ), + "cancel_result_ref": json.dumps(cr.as_dict()), +} +mylist.append(a) + +# Fourth case: invalid job id +return_code = 1 +stdout = b"" +stderr = b"qdel: Invalid job id a\n" + +cr = sge_io.parse_cancel_output(exit_code=return_code, stdout=stdout, stderr=stderr) + +a = { + "parse_cancel_kwargs": json.dumps( + {"exit_code": return_code, "stdout": stdout.decode(), "stderr": stderr.decode()} + ), + "cancel_result_ref": json.dumps(cr.as_dict()), +} +mylist.append(a) + +# Fifth case: job already completed +return_code = 0 +stdout = b"" +stderr = b"qdel: job 269 deleted\nqdel: job 269 already completed\n" + +cr = sge_io.parse_cancel_output(exit_code=return_code, stdout=stdout, stderr=stderr) + +a = { + "parse_cancel_kwargs": json.dumps( + {"exit_code": return_code, "stdout": stdout.decode(), "stderr": stderr.decode()} + ), + "cancel_result_ref": json.dumps(cr.as_dict()), +} +mylist.append(a) + +# Sixth case: invalid job id specified +return_code = 0 +stdout = b"" +stderr = b"qdel: job 2675 deleted\nqdel: Invalid job id specified\n" + +cr = sge_io.parse_cancel_output(exit_code=return_code, stdout=stdout, stderr=stderr) + +a = { + "parse_cancel_kwargs": json.dumps( + {"exit_code": return_code, "stdout": stdout.decode(), "stderr": stderr.decode()} + ), + "cancel_result_ref": json.dumps(cr.as_dict()), +} +mylist.append(a) + +with open("parse_cancel_output_inout.yaml", "w") as f: + yaml.dump(mylist, f, sort_keys=False) diff --git a/tests/test_data/io/sge/create_parse_job_output_inout.py b/tests/test_data/io/sge/create_parse_job_output_inout.py new file mode 100644 index 0000000..3a82178 --- /dev/null +++ b/tests/test_data/io/sge/create_parse_job_output_inout.py @@ -0,0 +1,87 @@ +import json + +import yaml + +from qtoolkit.io.sge import SGEIO + +sge_io = SGEIO() + +mylist = [] + +# First case: successful job parsing +return_code = 0 +stdout = b""" + + + 270 + submit.script + matgenix-dwa + matgenix-dwa + (null) + r + 0 + 2023-10-11T11:08:17 + main.q + 1 + 1 + 00:05:00 + 96G + + +""" +stderr = b"" +job = sge_io.parse_job_output(exit_code=return_code, stdout=stdout, stderr=stderr) +a = { + "parse_job_kwargs": json.dumps( + {"exit_code": return_code, "stdout": stdout.decode(), "stderr": stderr.decode()} + ), + "job_ref": json.dumps(job.as_dict()), +} +mylist.append(a) + +# Second case: job parsing with invalid fields +return_code = 0 +stdout = b""" + + + 270 + submit.script + matgenix-dwa + matgenix-dwa + (null) + r + 0 + 2023-10-11T11:08:17 + main.q + a + 1 + a + 96G + + +""" +stderr = b"" +job = sge_io.parse_job_output(exit_code=return_code, stdout=stdout, stderr=stderr) +a = { + "parse_job_kwargs": json.dumps( + {"exit_code": return_code, "stdout": stdout.decode(), "stderr": stderr.decode()} + ), + "job_ref": json.dumps(job.as_dict()), +} +mylist.append(a) + +# Third case: empty stdout and stderr +return_code = 0 +stdout = b"" +stderr = b"" +job = sge_io.parse_job_output(exit_code=return_code, stdout=stdout, stderr=stderr) +a = { + "parse_job_kwargs": json.dumps( + {"exit_code": return_code, "stdout": stdout.decode(), "stderr": stderr.decode()} + ), + "job_ref": json.dumps(job.as_dict() if job is not None else None), +} +mylist.append(a) + +with open("parse_job_output_inout.yaml", "w") as f: + yaml.dump(mylist, f, sort_keys=False) diff --git a/tests/test_data/io/sge/create_parse_submit_output_inout.py b/tests/test_data/io/sge/create_parse_submit_output_inout.py new file mode 100644 index 0000000..b23ad81 --- /dev/null +++ b/tests/test_data/io/sge/create_parse_submit_output_inout.py @@ -0,0 +1,92 @@ +import json + +import yaml + +from qtoolkit.io.sge import SGEIO + +sge_io = SGEIO() + +mylist = [] + +# First case: invalid queue specified +return_code = 1 +stdout = b"" +stderr = ( + b"qsub: Invalid queue specified: abcd\n" + b"qsub: Job submission failed: Invalid queue name specified\n" +) + +sr = sge_io.parse_submit_output( + exit_code=return_code, stdout=stdout.decode(), stderr=stderr.decode() +) + +a = { + "parse_submit_kwargs": json.dumps( + {"exit_code": return_code, "stdout": stdout.decode(), "stderr": stderr.decode()} + ), + "submission_result_ref": json.dumps(sr.as_dict()), +} +mylist.append(a) + +# Second case: successful submission +return_code = 0 +stdout = b'Your job 24 ("submit.script") has been submitted\n' +stderr = b"" +sr = sge_io.parse_submit_output( + exit_code=return_code, stdout=stdout.decode(), stderr=stderr.decode() +) +a = { + "parse_submit_kwargs": json.dumps( + {"exit_code": return_code, "stdout": stdout.decode(), "stderr": stderr.decode()} + ), + "submission_result_ref": json.dumps(sr.as_dict()), +} +mylist.append(a) + +# Third case: another successful submission +return_code = 0 +stdout = b'Your job 15 ("submit.script") has been submitted\n' +stderr = b"" +sr = sge_io.parse_submit_output( + exit_code=return_code, stdout=stdout.decode(), stderr=stderr.decode() +) +a = { + "parse_submit_kwargs": json.dumps( + {"exit_code": return_code, "stdout": stdout.decode(), "stderr": stderr.decode()} + ), + "submission_result_ref": json.dumps(sr.as_dict()), +} +mylist.append(a) + +# Fourth case: successful job allocation +return_code = 0 +stdout = b'Your job 10 ("submit.script") has been submitted\n' +stderr = b"" +sr = sge_io.parse_submit_output( + exit_code=return_code, stdout=stdout.decode(), stderr=stderr.decode() +) +a = { + "parse_submit_kwargs": json.dumps( + {"exit_code": return_code, "stdout": stdout.decode(), "stderr": stderr.decode()} + ), + "submission_result_ref": json.dumps(sr.as_dict()), +} +mylist.append(a) + +# Fifth case: another successful job allocation +return_code = 0 +stdout = b'Your job 124 ("submit.script") has been submitted\n' +stderr = b"" +sr = sge_io.parse_submit_output( + exit_code=return_code, stdout=stdout.decode(), stderr=stderr.decode() +) +a = { + "parse_submit_kwargs": json.dumps( + {"exit_code": return_code, "stdout": stdout.decode(), "stderr": stderr.decode()} + ), + "submission_result_ref": json.dumps(sr.as_dict()), +} +mylist.append(a) + +with open("parse_submit_output_inout.yaml", "w") as f: + yaml.dump(mylist, f, sort_keys=False) diff --git a/tests/test_data/io/sge/parse_cancel_output_inout.yaml b/tests/test_data/io/sge/parse_cancel_output_inout.yaml new file mode 100644 index 0000000..1d9d105 --- /dev/null +++ b/tests/test_data/io/sge/parse_cancel_output_inout.yaml @@ -0,0 +1,12 @@ +- parse_cancel_kwargs: '{"exit_code": 0, "stdout": "", "stderr": "qdel: job 267 deleted\n"}' + cancel_result_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "CancelResult", "@version": "0.1.1", "job_id": "267", "step_id": null, "exit_code": 0, "stdout": "", "stderr": "qdel: job 267 deleted\n", "status": {"@module": "qtoolkit.core.data_objects", "@class": "CancelStatus", "@version": "0.1.1", "value": "SUCCESSFUL"}}' +- parse_cancel_kwargs: '{"exit_code": 1, "stdout": "", "stderr": "qdel: No job id specified\n"}' + cancel_result_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "CancelResult", "@version": "0.1.1", "job_id": null, "step_id": null, "exit_code": 1, "stdout": "", "stderr": "qdel: No job id specified\n", "status": {"@module": "qtoolkit.core.data_objects", "@class": "CancelStatus", "@version": "0.1.1", "value": "FAILED"}}' +- parse_cancel_kwargs: '{"exit_code": 210, "stdout": "", "stderr": "qdel: job 1 access denied\n"}' + cancel_result_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "CancelResult", "@version": "0.1.1", "job_id": null, "step_id": null, "exit_code": 210, "stdout": "", "stderr": "qdel: job 1 access denied\n", "status": {"@module": "qtoolkit.core.data_objects", "@class": "CancelStatus", "@version": "0.1.1", "value": "FAILED"}}' +- parse_cancel_kwargs: '{"exit_code": 1, "stdout": "", "stderr": "qdel: Invalid job id a\n"}' + cancel_result_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "CancelResult", "@version": "0.1.1", "job_id": null, "step_id": null, "exit_code": 1, "stdout": "", "stderr": "qdel: Invalid job id a\n", "status": {"@module": "qtoolkit.core.data_objects", "@class": "CancelStatus", "@version": "0.1.1", "value": "FAILED"}}' +- parse_cancel_kwargs: '{"exit_code": 0, "stdout": "", "stderr": "qdel: job 269 deleted\nqdel: job 269 already completed\n"}' + cancel_result_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "CancelResult", "@version": "0.1.1", "job_id": "269", "step_id": null, "exit_code": 0, "stdout": "", "stderr": "qdel: job 269 deleted\nqdel: job 269 already completed\n", "status": {"@module": "qtoolkit.core.data_objects", "@class": "CancelStatus", "@version": "0.1.1", "value": "SUCCESSFUL"}}' +- parse_cancel_kwargs: '{"exit_code": 0, "stdout": "", "stderr": "qdel: job 2675 deleted\nqdel: Invalid job id specified\n"}' + cancel_result_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "CancelResult", "@version": "0.1.1", "job_id": "2675", "step_id": null, "exit_code": 0, "stdout": "", "stderr": "qdel: job 2675 deleted\nqdel: Invalid job id specified\n", "status": {"@module": "qtoolkit.core.data_objects", "@class": "CancelStatus", "@version": "0.1.1", "value": "SUCCESSFUL"}}' diff --git a/tests/test_data/io/sge/parse_job_output_inout.yaml b/tests/test_data/io/sge/parse_job_output_inout.yaml new file mode 100644 index 0000000..837fba1 --- /dev/null +++ b/tests/test_data/io/sge/parse_job_output_inout.yaml @@ -0,0 +1,6 @@ +- parse_job_kwargs: '{"exit_code": 0, "stdout": "job_id: 270\njob_name: submit.script\nowner: matgenix-dwa\npriority: 4294901497\nstate: r\nstart_time: 2023-10-11T11:08:17\nend_time: 2023-10-11T11:13:17\nqueue_name: main\nslots: 1\nhard_wallclock: UNLIMITED\nstdout_path_list: /home/matgenix-dwa/software/qtoolkit/tests/test_data/io/sge/sge-270.out\nstderr_path_list: /home/matgenix-dwa/software/qtoolkit/tests/test_data/io/sge/sge-270.out\n"}' + job_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "QJob", "@version": "0.1.1", "name": "submit.script", "job_id": "270", "exit_status": null, "state": {"@module": "qtoolkit.core.data_objects", "@class": "QState", "@version": "0.1.1", "value": "DONE"}, "sub_state": {"@module": "qtoolkit.io.sge", "@class": "SGEState", "@version": "0.1.1", "value": "r"}, "info": {"@module": "qtoolkit.core.data_objects", "@class": "QJobInfo", "@version": "0.1.1", "memory": null, "memory_per_cpu": null, "nodes": 1, "cpus": 1, "threads_per_process": 1, "time_limit": null}, "account": "matgenix-dwa", "runtime": null, "queue_name": "main"}' +- parse_job_kwargs: '{"exit_code": 0, "stdout": "job_id: 270\njob_name: submit.script\nowner: matgenix-dwa\npriority: 4294901497\nstate: r\nstart_time: 2023-10-11T11:08:17\nend_time: 2023-10-11T11:13:17\nqueue_name: main\nslots: a\nhard_wallclock: a\nstdout_path_list: /home/matgenix-dwa/software/qtoolkit/tests/test_data/io/sge/sge-270.out\nstderr_path_list: /home/matgenix-dwa/software/qtoolkit/tests/test_data/io/sge/sge-270.out\n"}' + job_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "QJob", "@version": "0.1.1", "name": "submit.script", "job_id": "270", "exit_status": null, "state": {"@module": "qtoolkit.core.data_objects", "@class": "QState", "@version": "0.1.1", "value": "DONE"}, "sub_state": {"@module": "qtoolkit.io.sge", "@class": "SGEState", "@version": "0.1.1", "value": "r"}, "info": {"@module": "qtoolkit.core.data_objects", "@class": "QJobInfo", "@version": "0.1.1", "memory": null, "memory_per_cpu": null, "nodes": null, "cpus": null, "threads_per_process": null, "time_limit": null}, "account": "matgenix-dwa", "runtime": null, "queue_name": "main"}' +- parse_job_kwargs: '{"exit_code": 0, "stdout": "", "stderr": ""}' + job_ref: 'null' diff --git a/tests/test_data/io/sge/parse_submit_output_inout.yaml b/tests/test_data/io/sge/parse_submit_output_inout.yaml new file mode 100644 index 0000000..59b4350 --- /dev/null +++ b/tests/test_data/io/sge/parse_submit_output_inout.yaml @@ -0,0 +1,18 @@ +- parse_submit_kwargs: '{"exit_code": 1, "stdout": "", "stderr": "qsub: error: invalid queue specified: abcd\nqsub: error: Batch job submission failed: Invalid queue name specified\n"}' + submission_result_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "SubmissionResult", "@version": "0.0.1+d20230127", "job_id": null, "step_id": null, "exit_code": 1, "stdout": "", "stderr": "qsub: error: invalid queue specified: abcd\nqsub: error: Batch job submission failed: Invalid queue name specified\n", "status": {"@module": "qtoolkit.core.data_objects", "@class": "SubmissionStatus", "@version": "0.0.1+d20230127", "value": "FAILED"}}' +- parse_submit_kwargs: '{"exit_code": 0, "stdout": "Your job 24 (\"submit.script\") has been submitted\n", "stderr": ""}' + submission_result_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "SubmissionResult", "@version": "0.0.1+d20230127", "job_id": "24", "step_id": null, "exit_code": 0, "stdout": "Your job 24 (\"submit.script\") has been submitted\n", "stderr": "", "status": {"@module": "qtoolkit.core.data_objects", "@class": "SubmissionStatus", "@version": "0.0.1+d20230127", "value": "SUCCESSFUL"}}' +- parse_submit_kwargs: '{"exit_code": 0, "stdout": "Your job 15 (\"submit.script\") has been submitted\n", "stderr": ""}' + submission_result_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "SubmissionResult", "@version": "0.0.1+d20230127", "job_id": "15", "step_id": null, "exit_code": 0, "stdout": "Your job 15 (\"submit.script\") has been submitted\n", "stderr": "", "status": {"@module": "qtoolkit.core.data_objects", "@class": "SubmissionStatus", "@version": "0.0.1+d20230127", "value": "SUCCESSFUL"}}' +- parse_submit_kwargs: '{"exit_code": 0, "stdout": "Your job 10 (\"submit.script\") has been submitted\n", "stderr": ""}' + submission_result_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "SubmissionResult", "@version": "0.0.1+d20230127", "job_id": "10", "step_id": null, "exit_code": 0, "stdout": "Your job 10 (\"submit.script\") has been submitted\n", "stderr": "", "status": {"@module": "qtoolkit.core.data_objects", "@class": "SubmissionStatus", "@version": "0.0.1+d20230127", "value": "SUCCESSFUL"}}' +- parse_submit_kwargs: '{"exit_code": 0, "stdout": "Your job 124 (\"submit.script\") has been submitted\n", "stderr": ""}' + submission_result_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "SubmissionResult", "@version": "0.0.1+d20230127", "job_id": "124", "step_id": null, "exit_code": 0, "stdout": "Your job 124 (\"submit.script\") has been submitted\n", "stderr": "", "status": {"@module": "qtoolkit.core.data_objects", "@class": "SubmissionStatus", "@version": "0.0.1+d20230127", "value": "SUCCESSFUL"}}' +- parse_submit_kwargs: '{"exit_code": 0, "stdout": "Your job 24 (\"submit.script\") has been submitted\n", "stderr": ""}' + submission_result_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "SubmissionResult", "@version": "0.0.1+d20230127", "job_id": "24", "step_id": null, "exit_code": 0, "stdout": "Your job 24 (\"submit.script\") has been submitted\n", "stderr": "", "status": {"@module": "qtoolkit.core.data_objects", "@class": "SubmissionStatus", "@version": "0.0.1+d20230127", "value": "SUCCESSFUL"}}' +- parse_submit_kwargs: '{"exit_code": 0, "stdout": "Your job 15 (\"submit.script\") has been submitted\n", "stderr": ""}' + submission_result_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "SubmissionResult", "@version": "0.0.1+d20230127", "job_id": "15", "step_id": null, "exit_code": 0, "stdout": "Your job 15 (\"submit.script\") has been submitted\n", "stderr": "", "status": {"@module": "qtoolkit.core.data_objects", "@class": "SubmissionStatus", "@version": "0.0.1+d20230127", "value": "SUCCESSFUL"}}' +- parse_submit_kwargs: '{"exit_code": 0, "stdout": "Your job 10 (\"submit.script\") has been submitted\n", "stderr": ""}' + submission_result_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "SubmissionResult", "@version": "0.0.1+d20230127", "job_id": "10", "step_id": null, "exit_code": 0, "stdout": "Your job 10 (\"submit.script\") has been submitted\n", "stderr": "", "status": {"@module": "qtoolkit.core.data_objects", "@class": "SubmissionStatus", "@version": "0.0.1+d20230127", "value": "SUCCESSFUL"}}' +- parse_submit_kwargs: '{"exit_code": 0, "stdout": "Your job 124 (\"submit.script\") has been submitted\n", "stderr": ""}' + submission_result_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "SubmissionResult", "@version": "0.0.1+d20230127", "job_id": "124", "step_id": null, "exit_code": 0, "stdout": "Your job 124 (\"submit.script\") has been submitted\n", "stderr": "", "status": {"@module": "qtoolkit.core.data_objects", "@class": "SubmissionStatus", "@version": "0.0.1+d20230127", "value": "SUCCESSFUL"}}' From d77800ea608be1d57c7f3f767d4b3e7d80c05116 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Wed, 7 Aug 2024 17:21:36 +0200 Subject: [PATCH 05/35] finalized first implemetation --- src/qtoolkit/io/sge.py | 164 +++++++++++------- tests/io/test_sge.py | 30 ++-- .../io/sge/create_parse_job_output_inout.py | 9 +- .../io/sge/parse_job_output_inout.yaml | 2 +- 4 files changed, 121 insertions(+), 84 deletions(-) diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index 0ba1987..205804b 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -126,10 +126,10 @@ def qstate(self) -> QState: class SGEIO(BaseSchedulerIO): header_template: str = """ -#$ -cwd +#$ -cwd $${cwd} #$ -q $${queue} #$ -N $${job_name} -#$ -P $${account} +#$ -P $${device} #$ -l $${select} #$ -l h_rt=$${walltime} #$ -l s_rt=$${soft_walltime} @@ -215,47 +215,87 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida s if isinstance(stderr, bytes): stderr = stderr.decode() - try: - xmldata = xml.dom.minidom.parseString(stdout) - except xml.parsers.expat.ExpatError: - raise OutputParsingError("XML parsing of stdout failed") - - job_list = xmldata.getElementsByTagName("job_list") - if not job_list: - return None + # Check for specific error messages in stderr or stdout + error_patterns = [ + re.compile( + r"Primary job\s+terminated normally, but\s+(\d+)\s+process returned a non-zero exit code", + re.IGNORECASE, + ), + re.compile( + r"mpiexec detected that one or more processes exited with non-zero status", + re.IGNORECASE, + ), + re.compile(r"An error occurred in MPI_Allreduce", re.IGNORECASE), + re.compile( + r"Error: mca_pml_ucx_send_nbr failed: -25, Connection reset by remote peer", + re.IGNORECASE, + ), + re.compile(r"mpi_errors_are_fatal", re.IGNORECASE), + ] - job_element = job_list[0] + for pattern in error_patterns: + if pattern.search(stderr) or pattern.search(stdout): + raise OutputParsingError( + "Job terminated due to a non-zero exit code from one or more processes or MPI errors" + ) - qjob = QJob() - qjob.job_id = self._get_element_text(job_element, "JB_job_number") - job_state_string = self._get_element_text(job_element, "state") + if not stdout.strip(): + return None + # Check if stdout is in XML format try: - sge_job_state = SGEState(job_state_string) - except ValueError: - raise OutputParsingError( - f"Unknown job state {job_state_string} for job id {qjob.job_id}" - ) - - qjob.sub_state = sge_job_state - qjob.state = sge_job_state.qstate - qjob.username = self._get_element_text(job_element, "JB_owner") - qjob.name = self._get_element_text(job_element, "JB_name") - - info = QJobInfo() - info.nodes = self._safe_int(self._get_element_text(job_element, "num_nodes")) - info.cpus = self._safe_int(self._get_element_text(job_element, "num_proc")) - info.memory_per_cpu = self._convert_memory_str( - self._get_element_text(job_element, "hard resource_list.mem_free") - ) - info.partition = self._get_element_text(job_element, "queue_name") - info.time_limit = self._convert_str_to_time( - self._get_element_text(job_element, "hard resource_list.h_rt") - ) + xmldata = xml.dom.minidom.parseString(stdout) + job_info = xmldata.getElementsByTagName("job_list")[0] + job_id = job_info.getElementsByTagName("JB_job_number")[ + 0 + ].firstChild.nodeValue + job_name = job_info.getElementsByTagName("JB_name")[0].firstChild.nodeValue + owner = job_info.getElementsByTagName("JB_owner")[0].firstChild.nodeValue + state = job_info.getElementsByTagName("state")[0].firstChild.nodeValue + queue_name = job_info.getElementsByTagName("queue_name")[ + 0 + ].firstChild.nodeValue + slots = job_info.getElementsByTagName("slots")[0].firstChild.nodeValue - qjob.info = info + try: + cpus = int(slots) + except ValueError: + cpus = 1 + + return QJob( + name=job_name, + job_id=job_id, + state=QState("DONE"), + sub_state=SGEState(state), + account=owner, + queue_name=queue_name, + info=QJobInfo(nodes=1, cpus=cpus, threads_per_process=1), + ) + except Exception: + # Not XML, fallback to plain text + job_info = {} + for line in stdout.split("\n"): + if ":" in line: + key, value = line.split(":", 1) + job_info[key.strip()] = value.strip() - return qjob + try: + cpus = int(job_info.get("slots", 1)) + except ValueError: + cpus = 1 + + state_str = job_info.get("state") + state = SGEState(state_str) if state_str else None + + return QJob( + name=job_info.get("job_name"), + job_id=job_info.get("job_id"), + state=QState("DONE"), + sub_state=state, + account=job_info.get("owner"), + queue_name=job_info.get("queue_name"), + info=QJobInfo(nodes=1, cpus=cpus, threads_per_process=1), + ) def _get_element_text(self, parent, tag_name): elements = parent.getElementsByTagName(tag_name) @@ -276,15 +316,8 @@ def _get_jobs_list_cmd( ) -> str: if job_ids: raise UnsupportedResourcesError("Cannot query by job id in SGE") - - command = "qstat -ext -urg -xml " - - if user: - command += f"-u {user!s}" - else: - command += "-u '*'" - - return command + user = user if user else "*" + return f"qstat -ext -urg -xml -u {user}" def parse_jobs_list_output(self, exit_code, stdout, stderr) -> list[QJob]: if exit_code != 0: @@ -339,28 +372,25 @@ def parse_jobs_list_output(self, exit_code, stdout, stderr) -> list[QJob]: return jobs_list @staticmethod - def _convert_str_to_time(time_str: str | None): - """ - Convert a string in the format used by SGE DD:HH:MM:SS to a number of seconds. - It may contain only H:M:S, only M:S or only S. - """ - - if not time_str: + def _convert_str_to_time(time_str: str | None) -> int | None: + if time_str is None: return None - time_split = time_str.split(":") - - # array containing seconds, minutes, hours and days - time = [0] * 4 + parts = time_str.split(":") + if len(parts) == 3: + hours, minutes, seconds = parts + elif len(parts) == 2: + hours, minutes = "0", parts[0] + seconds = parts[1] + elif len(parts) == 1: + hours, minutes, seconds = "0", "0", parts[0] + else: + raise OutputParsingError(f"Invalid time format: {time_str}") try: - for i, v in enumerate(reversed(time_split)): - time[i] = int(v) - + return int(hours) * 3600 + int(minutes) * 60 + int(seconds) except ValueError: - raise OutputParsingError() - - return time[3] * 86400 + time[2] * 3600 + time[1] * 60 + time[0] + raise OutputParsingError(f"Invalid time format: {time_str}") @staticmethod def _convert_memory_str(memory: str | None) -> int | None: @@ -372,10 +402,10 @@ def _convert_memory_str(memory: str | None) -> int | None: raise OutputParsingError("No numbers and units parsed") memory, units = match.groups() - power_labels = {"kb": 0, "mb": 1, "gb": 2, "tb": 3} + power_labels = {"K": 0, "M": 1, "G": 2, "T": 3} if not units: - units = "mb" + units = "M" elif units not in power_labels: raise OutputParsingError(f"Unknown units {units}") try: @@ -383,7 +413,7 @@ def _convert_memory_str(memory: str | None) -> int | None: except ValueError: raise OutputParsingError - return v * (1024 ** power_labels[units]) + return v * (1024 ** power_labels[units.upper()]) _qresources_mapping = { "queue_name": "queue", @@ -482,6 +512,7 @@ def supported_qresources_keys(self) -> list: supported = list(self._qresources_mapping.keys()) supported += [ "njobs", + "memory_per_thread", "time_limit", "processes", "processes_per_node", @@ -490,5 +521,6 @@ def supported_qresources_keys(self) -> list: "threads_per_process", "email_address", "scheduler_kwargs", + "gpus_per_job", ] return supported diff --git a/tests/io/test_sge.py b/tests/io/test_sge.py index 3285a35..3c72884 100644 --- a/tests/io/test_sge.py +++ b/tests/io/test_sge.py @@ -108,15 +108,16 @@ def test_get_jobs_list_cmd(self, sge_io): sge_io._get_jobs_list_cmd(job_ids=["1"], user="johndoe") cmd = sge_io._get_jobs_list_cmd(user="johndoe") assert cmd == ("qstat -ext -urg -xml -u johndoe") - cmd = sge_io._get_jobs_list_cmd(job_ids=["1", "3", "56", "15"]) - assert cmd == ("qstat -ext -urg -xml -u '*'") - cmd = sge_io._get_jobs_list_cmd(job_ids=["1"]) - assert cmd == ("qstat -ext -urg -xml -u '*'") + with pytest.raises( + UnsupportedResourcesError, match=r"Cannot query by job id in SGE" + ): + sge_io._get_jobs_list_cmd(job_ids=["1", "3", "56", "15"]) + with pytest.raises( + UnsupportedResourcesError, match=r"Cannot query by job id in SGE" + ): + sge_io._get_jobs_list_cmd(job_ids=["1"]) def test_convert_str_to_time(self, sge_io): - time_seconds = sge_io._convert_str_to_time(None) - assert time_seconds is None - time_seconds = sge_io._convert_str_to_time("10:51:13") assert time_seconds == 39073 time_seconds = sge_io._convert_str_to_time("02:10:02") @@ -124,7 +125,7 @@ def test_convert_str_to_time(self, sge_io): time_seconds = sge_io._convert_str_to_time("10:02") assert time_seconds == 602 time_seconds = sge_io._convert_str_to_time("45") - assert time_seconds == 2700 + assert time_seconds == 45 with pytest.raises(OutputParsingError): sge_io._convert_str_to_time("2:10:02:10") @@ -199,15 +200,15 @@ def test_check_convert_qresources(self, sge_io): assert header_dict == { "queue": "myqueue", "job_name": "myjob", - "mem": 2048, + "place": "scatter", # a bit unsure about this "account": "myaccount", "priority": 1, "qout_path": "someoutputpath", "qerr_path": "someerrorpath", "array": "1-4", "walltime": "10:51:13", - "soft_walltime": "9:46:1", - "select": "select=4:ncpus=6:ompthreads=2:mpiprocs=3:mem=24576mb", + "select": "select=4:ncpus=6:mpiprocs=3:ompthreads=2:mem=12288mb", + "soft_walltime": "9:46:5", "mail_user": "john.doe@submit.qtk", "mail_type": "abe", "tata": "toto", @@ -221,8 +222,8 @@ def test_check_convert_qresources(self, sge_io): header_dict = sge_io.check_convert_qresources(resources=res) assert header_dict == { "walltime": "10:51:13", - "soft_walltime": "9:46:1", - "select": "select=24:ncpus=1:ompthreads=1:mem=24mb", + "soft_walltime": "9:46:5", + "select": "select=24", # also not sure about this } res = QResources( @@ -232,8 +233,7 @@ def test_check_convert_qresources(self, sge_io): ) header_dict = sge_io.check_convert_qresources(resources=res) assert header_dict == { - "select": "select=24:ncpus=1:ompthreads=1:mem=24mb", - "array": "1-1", + "select": "select=24", } res = QResources( diff --git a/tests/test_data/io/sge/create_parse_job_output_inout.py b/tests/test_data/io/sge/create_parse_job_output_inout.py index 3a82178..9f26335 100644 --- a/tests/test_data/io/sge/create_parse_job_output_inout.py +++ b/tests/test_data/io/sge/create_parse_job_output_inout.py @@ -2,6 +2,7 @@ import yaml +from qtoolkit.core.exceptions import OutputParsingError from qtoolkit.io.sge import SGEIO sge_io = SGEIO() @@ -61,12 +62,16 @@ """ stderr = b"" -job = sge_io.parse_job_output(exit_code=return_code, stdout=stdout, stderr=stderr) +try: + job = sge_io.parse_job_output(exit_code=return_code, stdout=stdout, stderr=stderr) + job_dict = job.as_dict() +except OutputParsingError as e: + job_dict = {"error": str(e)} a = { "parse_job_kwargs": json.dumps( {"exit_code": return_code, "stdout": stdout.decode(), "stderr": stderr.decode()} ), - "job_ref": json.dumps(job.as_dict()), + "job_ref": json.dumps(job_dict), } mylist.append(a) diff --git a/tests/test_data/io/sge/parse_job_output_inout.yaml b/tests/test_data/io/sge/parse_job_output_inout.yaml index 837fba1..e2e2bd7 100644 --- a/tests/test_data/io/sge/parse_job_output_inout.yaml +++ b/tests/test_data/io/sge/parse_job_output_inout.yaml @@ -1,6 +1,6 @@ - parse_job_kwargs: '{"exit_code": 0, "stdout": "job_id: 270\njob_name: submit.script\nowner: matgenix-dwa\npriority: 4294901497\nstate: r\nstart_time: 2023-10-11T11:08:17\nend_time: 2023-10-11T11:13:17\nqueue_name: main\nslots: 1\nhard_wallclock: UNLIMITED\nstdout_path_list: /home/matgenix-dwa/software/qtoolkit/tests/test_data/io/sge/sge-270.out\nstderr_path_list: /home/matgenix-dwa/software/qtoolkit/tests/test_data/io/sge/sge-270.out\n"}' job_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "QJob", "@version": "0.1.1", "name": "submit.script", "job_id": "270", "exit_status": null, "state": {"@module": "qtoolkit.core.data_objects", "@class": "QState", "@version": "0.1.1", "value": "DONE"}, "sub_state": {"@module": "qtoolkit.io.sge", "@class": "SGEState", "@version": "0.1.1", "value": "r"}, "info": {"@module": "qtoolkit.core.data_objects", "@class": "QJobInfo", "@version": "0.1.1", "memory": null, "memory_per_cpu": null, "nodes": 1, "cpus": 1, "threads_per_process": 1, "time_limit": null}, "account": "matgenix-dwa", "runtime": null, "queue_name": "main"}' - parse_job_kwargs: '{"exit_code": 0, "stdout": "job_id: 270\njob_name: submit.script\nowner: matgenix-dwa\npriority: 4294901497\nstate: r\nstart_time: 2023-10-11T11:08:17\nend_time: 2023-10-11T11:13:17\nqueue_name: main\nslots: a\nhard_wallclock: a\nstdout_path_list: /home/matgenix-dwa/software/qtoolkit/tests/test_data/io/sge/sge-270.out\nstderr_path_list: /home/matgenix-dwa/software/qtoolkit/tests/test_data/io/sge/sge-270.out\n"}' - job_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "QJob", "@version": "0.1.1", "name": "submit.script", "job_id": "270", "exit_status": null, "state": {"@module": "qtoolkit.core.data_objects", "@class": "QState", "@version": "0.1.1", "value": "DONE"}, "sub_state": {"@module": "qtoolkit.io.sge", "@class": "SGEState", "@version": "0.1.1", "value": "r"}, "info": {"@module": "qtoolkit.core.data_objects", "@class": "QJobInfo", "@version": "0.1.1", "memory": null, "memory_per_cpu": null, "nodes": null, "cpus": null, "threads_per_process": null, "time_limit": null}, "account": "matgenix-dwa", "runtime": null, "queue_name": "main"}' + job_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "QJob", "@version": "0.1.1", "name": "submit.script", "job_id": "270", "exit_status": null, "state": {"@module": "qtoolkit.core.data_objects", "@class": "QState", "@version": "0.1.1", "value": "DONE"}, "sub_state": {"@module": "qtoolkit.io.sge", "@class": "SGEState", "@version": "0.1.1", "value": "r"}, "info": {"@module": "qtoolkit.core.data_objects", "@class": "QJobInfo", "@version": "0.1.1", "memory": null, "memory_per_cpu": null, "nodes": 1, "cpus": 1, "threads_per_process": 1, "time_limit": null}, "account": "matgenix-dwa", "runtime": null, "queue_name": "main"}' - parse_job_kwargs: '{"exit_code": 0, "stdout": "", "stderr": ""}' job_ref: 'null' From a1911e37ac10b93a1a0410e3e4484f5ba168caff Mon Sep 17 00:00:00 2001 From: Christina Ertural <52951132+QuantumChemist@users.noreply.github.com> Date: Fri, 16 Aug 2024 17:26:06 +0200 Subject: [PATCH 06/35] Update src/qtoolkit/io/sge.py Co-authored-by: Matthew Evans <7916000+ml-evs@users.noreply.github.com> --- src/qtoolkit/io/sge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index 205804b..b55a3e8 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -236,7 +236,7 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida s for pattern in error_patterns: if pattern.search(stderr) or pattern.search(stdout): raise OutputParsingError( - "Job terminated due to a non-zero exit code from one or more processes or MPI errors" + "Job terminated due to a non-zero exit code from one or more processes or MPI errors: {stderr}" ) if not stdout.strip(): From 982627d114f6d016180f6306660ce03d05cfaa94 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Thu, 19 Sep 2024 11:51:18 +0200 Subject: [PATCH 07/35] made nodes indeterminated --- src/qtoolkit/io/sge.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index b55a3e8..7a58157 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -256,11 +256,14 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida s 0 ].firstChild.nodeValue slots = job_info.getElementsByTagName("slots")[0].firstChild.nodeValue + tasks = job_info.getElementsByTagName("tasks")[0].firstChild.nodeValue try: cpus = int(slots) + nodes = int(tasks) except ValueError: cpus = 1 + nodes = 1 return QJob( name=job_name, @@ -269,7 +272,7 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida s sub_state=SGEState(state), account=owner, queue_name=queue_name, - info=QJobInfo(nodes=1, cpus=cpus, threads_per_process=1), + info=QJobInfo(nodes=nodes, cpus=cpus, threads_per_process=1), ) except Exception: # Not XML, fallback to plain text @@ -294,7 +297,7 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida s sub_state=state, account=job_info.get("owner"), queue_name=job_info.get("queue_name"), - info=QJobInfo(nodes=1, cpus=cpus, threads_per_process=1), + info=QJobInfo(nodes=nodes, cpus=cpus, threads_per_process=1), ) def _get_element_text(self, parent, tag_name): From d782fb0bbfb068e590f3c2928e6124ac22056fb8 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Thu, 19 Sep 2024 11:52:58 +0200 Subject: [PATCH 08/35] replaced not working links --- src/qtoolkit/io/sge.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index 7a58157..14cc028 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -20,7 +20,7 @@ from qtoolkit.core.exceptions import OutputParsingError, UnsupportedResourcesError from qtoolkit.io.base import BaseSchedulerIO -# 'http://www.loni.ucla.edu/twiki/bin/view/Infrastructure/GridComputing?skin=plain': +# https://wiki.nikhil.io/Ancient_Sysadmin_Stuff/Sun_Grid_Engine_States/: # Jobs Status: # 'qw' - Queued and waiting, # 'w' - Job waiting, @@ -32,9 +32,6 @@ # 'd' - Job has been marked for deletion, # 'Eqw' - An error occurred with the job. # -# 'http://confluence.rcs.griffith.edu.au:8080/display/v20zCluster/ -# Sun+Grid+Engine+SGE+state+letter+symbol+codes+meanings': -# # Category State SGE Letter Code # Pending: pending qw # Pending: pending, user hold qw From 28db648849aa4e6b3d2da885e2175ac13b81fbf2 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Thu, 19 Sep 2024 11:57:32 +0200 Subject: [PATCH 09/35] removed queue states --- src/qtoolkit/io/sge.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index 14cc028..f00c1c9 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -57,13 +57,6 @@ class SGEState(QSubState): - # Queue states - ALARM = "a" - SUSPENDED_BY_USER_ADMIN = "s" - DISABLED_BY_USER_ADMIN = "d" - SUSPENDED_BY_SUBORDINATION = "S" - ERROR = "E" - # Job states QUEUED_WAITING = "qw" WAITING = "w" From 6f58cd2cc076bad28495784f799ce18cfc057e58 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Thu, 19 Sep 2024 12:00:35 +0200 Subject: [PATCH 10/35] removed queue states --- src/qtoolkit/io/sge.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index f00c1c9..7b6dc78 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -96,10 +96,6 @@ def qstate(self) -> QState: SGEState.TRANSFERRING: QState.RUNNING, SGEState.RESTARTED: QState.RUNNING, SGEState.JOB_SUSPENDED: QState.SUSPENDED, - SGEState.SUSPENDED_BY_USER_ADMIN: QState.SUSPENDED, - SGEState.SUSPENDED_BY_SUBORDINATION: QState.SUSPENDED, - SGEState.ALARM: QState.SUSPENDED, - SGEState.ERROR: QState.FAILED, SGEState.DELETION: QState.DONE, SGEState.DELETION_RUNNING: QState.DONE, SGEState.DELETION_TRANSFERRING: QState.DONE, From a8ed691ae767e5c1008ecd8f74d9f57b51f37645 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Thu, 19 Sep 2024 12:17:44 +0200 Subject: [PATCH 11/35] remapped some job states to QState.FAILED and QState.SUSPENDED, added a FINISHED job state and added another link for SGE documnetation --- src/qtoolkit/io/sge.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index 7b6dc78..728d8c7 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -20,7 +20,8 @@ from qtoolkit.core.exceptions import OutputParsingError, UnsupportedResourcesError from qtoolkit.io.base import BaseSchedulerIO -# https://wiki.nikhil.io/Ancient_Sysadmin_Stuff/Sun_Grid_Engine_States/: +# https://wiki.nikhil.io/Ancient_Sysadmin_Stuff/Sun_Grid_Engine_States/ +# https://manpages.ubuntu.com/manpages/jammy/en/man5/sge_status.5.html # Jobs Status: # 'qw' - Queued and waiting, # 'w' - Job waiting, @@ -31,6 +32,7 @@ # 'R' - Job restarted, # 'd' - Job has been marked for deletion, # 'Eqw' - An error occurred with the job. +# 'z' - finished # # Category State SGE Letter Code # Pending: pending qw @@ -58,6 +60,7 @@ class SGEState(QSubState): # Job states + FINISHED = "z" QUEUED_WAITING = "qw" WAITING = "w" JOB_SUSPENDED = "s" @@ -86,6 +89,7 @@ def qstate(self) -> QState: _STATUS_MAPPING = { + SGEState.FINISHED: QState.DONE, SGEState.QUEUED_WAITING: QState.QUEUED, SGEState.WAITING: QState.QUEUED, SGEState.HOLD: QState.QUEUED_HELD, @@ -96,17 +100,17 @@ def qstate(self) -> QState: SGEState.TRANSFERRING: QState.RUNNING, SGEState.RESTARTED: QState.RUNNING, SGEState.JOB_SUSPENDED: QState.SUSPENDED, - SGEState.DELETION: QState.DONE, - SGEState.DELETION_RUNNING: QState.DONE, - SGEState.DELETION_TRANSFERRING: QState.DONE, - SGEState.DELETION_RUNNING_RESUBMIT: QState.DONE, - SGEState.DELETION_TRANSFERRING_RESUBMIT: QState.DONE, - SGEState.DELETION_SUSPENDED_JOB: QState.DONE, - SGEState.DELETION_SUSPENDED_QUEUE: QState.DONE, - SGEState.DELETION_SUSPENDED_ALARM: QState.DONE, - SGEState.DELETION_SUSPENDED_RESUBMIT_JOB: QState.DONE, - SGEState.DELETION_SUSPENDED_RESUBMIT_QUEUE: QState.DONE, - SGEState.DELETION_SUSPENDED_RESUBMIT_ALARM: QState.DONE, + SGEState.DELETION: QState.FAILED, + SGEState.DELETION_RUNNING: QState.FAILED, + SGEState.DELETION_TRANSFERRING: QState.FAILED, + SGEState.DELETION_RUNNING_RESUBMIT: QState.FAILED, + SGEState.DELETION_TRANSFERRING_RESUBMIT: QState.FAILED, + SGEState.DELETION_SUSPENDED_JOB: QState.SUSPENDED, + SGEState.DELETION_SUSPENDED_QUEUE: QState.SUSPENDED, + SGEState.DELETION_SUSPENDED_ALARM: QState.SUSPENDED, + SGEState.DELETION_SUSPENDED_RESUBMIT_JOB: QState.SUSPENDED, + SGEState.DELETION_SUSPENDED_RESUBMIT_QUEUE: QState.SUSPENDED, + SGEState.DELETION_SUSPENDED_RESUBMIT_ALARM: QState.SUSPENDED, } From e153da1efe0622224f1118938cd1313d5e6b3767 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Thu, 19 Sep 2024 14:35:17 +0200 Subject: [PATCH 12/35] changed OutputParsingError to CommandFailedError --- src/qtoolkit/io/sge.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index 728d8c7..4cf470e 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -17,7 +17,11 @@ SubmissionResult, SubmissionStatus, ) -from qtoolkit.core.exceptions import OutputParsingError, UnsupportedResourcesError +from qtoolkit.core.exceptions import ( + CommandFailedError, + OutputParsingError, + UnsupportedResourcesError, +) from qtoolkit.io.base import BaseSchedulerIO # https://wiki.nikhil.io/Ancient_Sysadmin_Stuff/Sun_Grid_Engine_States/ @@ -199,7 +203,8 @@ def _get_job_cmd(self, job_id: str): def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida style if exit_code != 0: - raise OutputParsingError(f"Error in job output parsing: {stderr}") + msg = f"command {self.get_job_executable} failed: {stderr}" + raise CommandFailedError(msg) if isinstance(stdout, bytes): stdout = stdout.decode() if isinstance(stderr, bytes): @@ -225,9 +230,8 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida s for pattern in error_patterns: if pattern.search(stderr) or pattern.search(stdout): - raise OutputParsingError( - "Job terminated due to a non-zero exit code from one or more processes or MPI errors: {stderr}" - ) + msg = f"command {self.get_job_executable} failed: {stderr}" + raise CommandFailedError(msg) if not stdout.strip(): return None @@ -262,7 +266,7 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida s sub_state=SGEState(state), account=owner, queue_name=queue_name, - info=QJobInfo(nodes=nodes, cpus=cpus, threads_per_process=1), + info=QJobInfo(nodes=nodes, cpus=cpus), ) except Exception: # Not XML, fallback to plain text @@ -274,8 +278,10 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida s try: cpus = int(job_info.get("slots", 1)) + nodes = int(job_info.get("tasks", 1)) except ValueError: cpus = 1 + nodes = 1 state_str = job_info.get("state") state = SGEState(state_str) if state_str else None @@ -287,7 +293,7 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida s sub_state=state, account=job_info.get("owner"), queue_name=job_info.get("queue_name"), - info=QJobInfo(nodes=nodes, cpus=cpus, threads_per_process=1), + info=QJobInfo(nodes=nodes, cpus=cpus), ) def _get_element_text(self, parent, tag_name): @@ -314,7 +320,8 @@ def _get_jobs_list_cmd( def parse_jobs_list_output(self, exit_code, stdout, stderr) -> list[QJob]: if exit_code != 0: - raise OutputParsingError(f"Error in jobs list output parsing: {stderr}") + msg = f"command {self.get_job_executable} failed: {stderr}" + raise CommandFailedError(msg) if isinstance(stdout, bytes): stdout = stdout.decode() if isinstance(stderr, bytes): From ae01b4f5f052ca7137cbc03342049c6cd000b599 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Thu, 19 Sep 2024 14:37:26 +0200 Subject: [PATCH 13/35] set cpus and nodes to None in case of ValueError --- src/qtoolkit/io/sge.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index 4cf470e..a958da4 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -256,8 +256,8 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida s cpus = int(slots) nodes = int(tasks) except ValueError: - cpus = 1 - nodes = 1 + cpus = None + nodes = None return QJob( name=job_name, @@ -280,8 +280,8 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida s cpus = int(job_info.get("slots", 1)) nodes = int(job_info.get("tasks", 1)) except ValueError: - cpus = 1 - nodes = 1 + cpus = None + nodes = None state_str = job_info.get("state") state = SGEState(state_str) if state_str else None From 23dae1871d53955b8bef02a913edf3637a322625 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Thu, 19 Sep 2024 14:46:55 +0200 Subject: [PATCH 14/35] using sge_state.qstate --- src/qtoolkit/io/sge.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index a958da4..529a9e4 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -252,6 +252,9 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida s slots = job_info.getElementsByTagName("slots")[0].firstChild.nodeValue tasks = job_info.getElementsByTagName("tasks")[0].firstChild.nodeValue + sge_state = SGEState(state) + job_state = sge_state.qstate + try: cpus = int(slots) nodes = int(tasks) @@ -262,8 +265,8 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida s return QJob( name=job_name, job_id=job_id, - state=QState("DONE"), - sub_state=SGEState(state), + state=job_state, + sub_state=sge_state, account=owner, queue_name=queue_name, info=QJobInfo(nodes=nodes, cpus=cpus), @@ -284,13 +287,14 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida s nodes = None state_str = job_info.get("state") - state = SGEState(state_str) if state_str else None + sge_state = SGEState(state_str) if state_str else None + job_state = sge_state.qstate return QJob( name=job_info.get("job_name"), job_id=job_info.get("job_id"), - state=QState("DONE"), - sub_state=state, + state=job_state, + sub_state=sge_state, account=job_info.get("owner"), queue_name=job_info.get("queue_name"), info=QJobInfo(nodes=nodes, cpus=cpus), From 28732af6c481c5dddf58e71360d723392d9bf7e3 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Thu, 19 Sep 2024 15:03:22 +0200 Subject: [PATCH 15/35] added get_job_executable to SGEIO class initialization --- src/qtoolkit/io/sge.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index 529a9e4..2f0a2b9 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -142,6 +142,12 @@ class SGEIO(BaseSchedulerIO): SUBMIT_CMD: str | None = "qsub" CANCEL_CMD: str | None = "qdel" + def __init__( + self, get_job_executable: str = "qstat", split_separator: str = "<><>" + ): + self.get_job_executable = get_job_executable + self.split_separator = split_separator # not so sure if this is really needed + def parse_submit_output(self, exit_code, stdout, stderr) -> SubmissionResult: if isinstance(stdout, bytes): stdout = stdout.decode() @@ -203,7 +209,7 @@ def _get_job_cmd(self, job_id: str): def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida style if exit_code != 0: - msg = f"command {self.get_job_executable} failed: {stderr}" + msg = f"command {self.get_job_executable or 'qacct'} failed: {stderr}" raise CommandFailedError(msg) if isinstance(stdout, bytes): stdout = stdout.decode() @@ -230,7 +236,7 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida s for pattern in error_patterns: if pattern.search(stderr) or pattern.search(stdout): - msg = f"command {self.get_job_executable} failed: {stderr}" + msg = f"command {self.get_job_executable or 'qacct'} failed: {stderr}" raise CommandFailedError(msg) if not stdout.strip(): @@ -324,7 +330,7 @@ def _get_jobs_list_cmd( def parse_jobs_list_output(self, exit_code, stdout, stderr) -> list[QJob]: if exit_code != 0: - msg = f"command {self.get_job_executable} failed: {stderr}" + msg = f"command {self.get_job_executable or 'qacct'} failed: {stderr}" raise CommandFailedError(msg) if isinstance(stdout, bytes): stdout = stdout.decode() From 58ccd31453de0ab184c731b70e5a83b30f9f817d Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Thu, 19 Sep 2024 15:04:44 +0200 Subject: [PATCH 16/35] split_separator not needed --- src/qtoolkit/io/sge.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index 2f0a2b9..2f4c01c 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -142,11 +142,8 @@ class SGEIO(BaseSchedulerIO): SUBMIT_CMD: str | None = "qsub" CANCEL_CMD: str | None = "qdel" - def __init__( - self, get_job_executable: str = "qstat", split_separator: str = "<><>" - ): + def __init__(self, get_job_executable: str = "qstat"): self.get_job_executable = get_job_executable - self.split_separator = split_separator # not so sure if this is really needed def parse_submit_output(self, exit_code, stdout, stderr) -> SubmissionResult: if isinstance(stdout, bytes): From e6e67b3cc0c77d0f28d270c403ecc54000dee290 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Thu, 19 Sep 2024 15:17:43 +0200 Subject: [PATCH 17/35] making the message for not being able to query per job_list more clearer --- src/qtoolkit/io/sge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index 2f4c01c..763583c 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -321,7 +321,7 @@ def _get_jobs_list_cmd( self, job_ids: list[str] | None = None, user: str | None = None ) -> str: if job_ids: - raise UnsupportedResourcesError("Cannot query by job id in SGE") + raise ValueError("Cannot query by job ids list in SGE") user = user if user else "*" return f"qstat -ext -urg -xml -u {user}" From 85afc3b0b02277995952ee341d80a132d31b4fc4 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Thu, 19 Sep 2024 15:21:46 +0200 Subject: [PATCH 18/35] removing account from _qresources_mapping --- src/qtoolkit/io/sge.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index 763583c..6a7cc1d 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -425,7 +425,6 @@ def _convert_memory_str(memory: str | None) -> int | None: _qresources_mapping = { "queue_name": "queue", "job_name": "job_name", - "account": "account", "priority": "priority", "output_filepath": "qout_path", "error_filepath": "qerr_path", From 27346a5112df6ff11fd7eff628c18467396b95a6 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Thu, 19 Sep 2024 16:37:27 +0200 Subject: [PATCH 19/35] fixing unit tests --- src/qtoolkit/io/sge.py | 13 +++++++++++-- tests/io/test_sge.py | 18 ++++++++---------- .../io/sge/parse_job_output_inout.yaml | 4 ++-- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index 6a7cc1d..ff962f4 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -130,6 +130,7 @@ class SGEIO(BaseSchedulerIO): #$ -pe $${model} #$ -binding $${place} #$ -W group_list=$${group_list} +#$ -A $${account} #$ -M $${mail_user} #$ -m $${mail_type} #$ -o $${qout_path} @@ -261,9 +262,11 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida s try: cpus = int(slots) nodes = int(tasks) + threads_per_process = int(cpus / nodes) except ValueError: cpus = None nodes = None + threads_per_process = None return QJob( name=job_name, @@ -272,7 +275,9 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida s sub_state=sge_state, account=owner, queue_name=queue_name, - info=QJobInfo(nodes=nodes, cpus=cpus), + info=QJobInfo( + nodes=nodes, cpus=cpus, threads_per_process=threads_per_process + ), ) except Exception: # Not XML, fallback to plain text @@ -285,9 +290,11 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida s try: cpus = int(job_info.get("slots", 1)) nodes = int(job_info.get("tasks", 1)) + threads_per_process = int(cpus / nodes) except ValueError: cpus = None nodes = None + threads_per_process = None state_str = job_info.get("state") sge_state = SGEState(state_str) if state_str else None @@ -300,7 +307,9 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida s sub_state=sge_state, account=job_info.get("owner"), queue_name=job_info.get("queue_name"), - info=QJobInfo(nodes=nodes, cpus=cpus), + info=QJobInfo( + nodes=nodes, cpus=cpus, threads_per_process=threads_per_process + ), ) def _get_element_text(self, parent, tag_name): diff --git a/tests/io/test_sge.py b/tests/io/test_sge.py index 3c72884..cf7236d 100644 --- a/tests/io/test_sge.py +++ b/tests/io/test_sge.py @@ -102,19 +102,13 @@ def test_get_job_cmd(self, sge_io): assert cmd == "qstat -j 56" def test_get_jobs_list_cmd(self, sge_io): - with pytest.raises( - UnsupportedResourcesError, match=r"Cannot query by job id in SGE" - ): + with pytest.raises(ValueError, match=r"Cannot query by job ids list in SGE"): sge_io._get_jobs_list_cmd(job_ids=["1"], user="johndoe") cmd = sge_io._get_jobs_list_cmd(user="johndoe") assert cmd == ("qstat -ext -urg -xml -u johndoe") - with pytest.raises( - UnsupportedResourcesError, match=r"Cannot query by job id in SGE" - ): + with pytest.raises(ValueError, match=r"Cannot query by job ids list in SGE"): sge_io._get_jobs_list_cmd(job_ids=["1", "3", "56", "15"]) - with pytest.raises( - UnsupportedResourcesError, match=r"Cannot query by job id in SGE" - ): + with pytest.raises(ValueError, match=r"Cannot query by job ids list in SGE"): sge_io._get_jobs_list_cmd(job_ids=["1"]) def test_convert_str_to_time(self, sge_io): @@ -200,7 +194,11 @@ def test_check_convert_qresources(self, sge_io): assert header_dict == { "queue": "myqueue", "job_name": "myjob", - "place": "scatter", # a bit unsure about this + "memory_per_thread": 2048, + "nodes": 4, + "processes_per_node": 3, + "threads_per_process": 2, + "time_limit": 39073, "account": "myaccount", "priority": 1, "qout_path": "someoutputpath", diff --git a/tests/test_data/io/sge/parse_job_output_inout.yaml b/tests/test_data/io/sge/parse_job_output_inout.yaml index e2e2bd7..9e4bc85 100644 --- a/tests/test_data/io/sge/parse_job_output_inout.yaml +++ b/tests/test_data/io/sge/parse_job_output_inout.yaml @@ -1,6 +1,6 @@ - parse_job_kwargs: '{"exit_code": 0, "stdout": "job_id: 270\njob_name: submit.script\nowner: matgenix-dwa\npriority: 4294901497\nstate: r\nstart_time: 2023-10-11T11:08:17\nend_time: 2023-10-11T11:13:17\nqueue_name: main\nslots: 1\nhard_wallclock: UNLIMITED\nstdout_path_list: /home/matgenix-dwa/software/qtoolkit/tests/test_data/io/sge/sge-270.out\nstderr_path_list: /home/matgenix-dwa/software/qtoolkit/tests/test_data/io/sge/sge-270.out\n"}' - job_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "QJob", "@version": "0.1.1", "name": "submit.script", "job_id": "270", "exit_status": null, "state": {"@module": "qtoolkit.core.data_objects", "@class": "QState", "@version": "0.1.1", "value": "DONE"}, "sub_state": {"@module": "qtoolkit.io.sge", "@class": "SGEState", "@version": "0.1.1", "value": "r"}, "info": {"@module": "qtoolkit.core.data_objects", "@class": "QJobInfo", "@version": "0.1.1", "memory": null, "memory_per_cpu": null, "nodes": 1, "cpus": 1, "threads_per_process": 1, "time_limit": null}, "account": "matgenix-dwa", "runtime": null, "queue_name": "main"}' + job_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "QJob", "@version": "0.1.1", "name": "submit.script", "job_id": "270", "exit_status": null, "state": {"@module": "qtoolkit.core.data_objects", "@class": "QState", "@version": "0.1.1", "value": "RUNNING"}, "sub_state": {"@module": "qtoolkit.io.sge", "@class": "SGEState", "@version": "0.1.1", "value": "r"}, "info": {"@module": "qtoolkit.core.data_objects", "@class": "QJobInfo", "@version": "0.1.1", "memory": null, "memory_per_cpu": null, "nodes": 1, "cpus": 1, "threads_per_process": 1, "time_limit": null}, "account": "matgenix-dwa", "runtime": null, "queue_name": "main"}' - parse_job_kwargs: '{"exit_code": 0, "stdout": "job_id: 270\njob_name: submit.script\nowner: matgenix-dwa\npriority: 4294901497\nstate: r\nstart_time: 2023-10-11T11:08:17\nend_time: 2023-10-11T11:13:17\nqueue_name: main\nslots: a\nhard_wallclock: a\nstdout_path_list: /home/matgenix-dwa/software/qtoolkit/tests/test_data/io/sge/sge-270.out\nstderr_path_list: /home/matgenix-dwa/software/qtoolkit/tests/test_data/io/sge/sge-270.out\n"}' - job_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "QJob", "@version": "0.1.1", "name": "submit.script", "job_id": "270", "exit_status": null, "state": {"@module": "qtoolkit.core.data_objects", "@class": "QState", "@version": "0.1.1", "value": "DONE"}, "sub_state": {"@module": "qtoolkit.io.sge", "@class": "SGEState", "@version": "0.1.1", "value": "r"}, "info": {"@module": "qtoolkit.core.data_objects", "@class": "QJobInfo", "@version": "0.1.1", "memory": null, "memory_per_cpu": null, "nodes": 1, "cpus": 1, "threads_per_process": 1, "time_limit": null}, "account": "matgenix-dwa", "runtime": null, "queue_name": "main"}' + job_ref: '{"@module": "qtoolkit.core.data_objects", "@class": "QJob", "@version": "0.1.1", "name": "submit.script", "job_id": "270", "exit_status": null, "state": {"@module": "qtoolkit.core.data_objects", "@class": "QState", "@version": "0.1.1", "value": "RUNNING"}, "sub_state": {"@module": "qtoolkit.io.sge", "@class": "SGEState", "@version": "0.1.1", "value": "r"}, "info": {"@module": "qtoolkit.core.data_objects", "@class": "QJobInfo", "@version": "0.1.1", "memory": null, "memory_per_cpu": null, "nodes": null, "cpus": null, "threads_per_process": null, "time_limit": null}, "account": "matgenix-dwa", "runtime": null, "queue_name": "main"}' - parse_job_kwargs: '{"exit_code": 0, "stdout": "", "stderr": ""}' job_ref: 'null' From 08b513ceb6bbd3599649410bdc56b921f3fdf5cd Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Thu, 19 Sep 2024 16:52:58 +0200 Subject: [PATCH 20/35] fixed unit tests --- src/qtoolkit/io/sge.py | 1 - tests/io/test_sge.py | 8 +------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index ff962f4..def3731 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -130,7 +130,6 @@ class SGEIO(BaseSchedulerIO): #$ -pe $${model} #$ -binding $${place} #$ -W group_list=$${group_list} -#$ -A $${account} #$ -M $${mail_user} #$ -m $${mail_type} #$ -o $${qout_path} diff --git a/tests/io/test_sge.py b/tests/io/test_sge.py index cf7236d..7331dba 100644 --- a/tests/io/test_sge.py +++ b/tests/io/test_sge.py @@ -177,7 +177,6 @@ def test_check_convert_qresources(self, sge_io): queue_name="myqueue", job_name="myjob", memory_per_thread=2048, - account="myaccount", priority=1, output_filepath="someoutputpath", error_filepath="someerrorpath", @@ -194,12 +193,7 @@ def test_check_convert_qresources(self, sge_io): assert header_dict == { "queue": "myqueue", "job_name": "myjob", - "memory_per_thread": 2048, - "nodes": 4, - "processes_per_node": 3, - "threads_per_process": 2, - "time_limit": 39073, - "account": "myaccount", + "place": "scatter", "priority": 1, "qout_path": "someoutputpath", "qerr_path": "someerrorpath", From 573641ec772508c71f1f02101902976bf0e1ab80 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Thu, 19 Sep 2024 17:29:11 +0200 Subject: [PATCH 21/35] added test_submission_script with adjusted settings for SGE --- tests/io/test_sge.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/io/test_sge.py b/tests/io/test_sge.py index 7331dba..49f7ca5 100644 --- a/tests/io/test_sge.py +++ b/tests/io/test_sge.py @@ -236,3 +236,40 @@ def test_check_convert_qresources(self, sge_io): UnsupportedResourcesError, match=r"Keys not supported: rerunnable" ): sge_io.check_convert_qresources(res) + + def test_submission_script(self, sge_io, maximalist_qresources): + # remove unsupported SGE options + maximalist_qresources.rerunnable = None + maximalist_qresources.project = None + maximalist_qresources.account = None + maximalist_qresources.qos = None + maximalist_qresources.process_placement = ProcessPlacement.EVENLY_DISTRIBUTED + + # Set `processes` to None to avoid the conflict + maximalist_qresources.processes = None + + # generate the SGE submission script + script_qresources = sge_io.get_submission_script( + commands=["ls -l"], options=maximalist_qresources + ) + + # assert the correctness of the generated script + assert ( + script_qresources.split("\n") + == """#!/bin/bash + +#$ -q test_queue +#$ -N test_job +#$ -l select=1:ncpus=1:mpiprocs=1:mem=1000mb +#$ -l h_rt=0:1:40 +#$ -l s_rt=0:1:30 +#$ -binding scatter +#$ -M test_email_address@email.address +#$ -m abe +#$ -o test_output_filepath +#$ -e test_error_filepath +#$ -p 1 +ls -l""".split( + "\n" + ) + ) From 8d3136400a670384ae1da922ee1deb3678e362ee Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Fri, 20 Sep 2024 15:47:35 +0200 Subject: [PATCH 22/35] seeting swt to 0.99*hwt --- src/qtoolkit/io/pbs_base.py | 0 src/qtoolkit/io/sge.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 src/qtoolkit/io/pbs_base.py diff --git a/src/qtoolkit/io/pbs_base.py b/src/qtoolkit/io/pbs_base.py new file mode 100644 index 0000000..e69de29 diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index def3731..96d3c7d 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -463,7 +463,7 @@ def _convert_qresources(self, resources: QResources) -> dict: if resources.time_limit: header_dict["walltime"] = self._convert_time_to_str(resources.time_limit) header_dict["soft_walltime"] = self._convert_time_to_str( - resources.time_limit * 0.9 + resources.time_limit * 0.99 ) if resources.rerunnable is not None: From 9a000a5cc6588e02785f6df398c7b9caf2e061a7 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Fri, 20 Sep 2024 17:02:52 +0200 Subject: [PATCH 23/35] introduced parent class PBEIOBase to unify PBSIO and SGEIO functionalities and similarities --- src/qtoolkit/io/pbs.py | 225 +++------------------------------- src/qtoolkit/io/pbs_base.py | 236 ++++++++++++++++++++++++++++++++++++ src/qtoolkit/io/sge.py | 215 ++++---------------------------- 3 files changed, 273 insertions(+), 403 deletions(-) diff --git a/src/qtoolkit/io/pbs.py b/src/qtoolkit/io/pbs.py index 22f6fce..04e1edc 100644 --- a/src/qtoolkit/io/pbs.py +++ b/src/qtoolkit/io/pbs.py @@ -1,22 +1,10 @@ from __future__ import annotations import re -from datetime import timedelta - -from qtoolkit.core.data_objects import ( - CancelResult, - CancelStatus, - ProcessPlacement, - QJob, - QJobInfo, - QResources, - QState, - QSubState, - SubmissionResult, - SubmissionStatus, -) -from qtoolkit.core.exceptions import OutputParsingError, UnsupportedResourcesError -from qtoolkit.io.base import BaseSchedulerIO + +from qtoolkit.core.data_objects import QJob, QJobInfo, QState, QSubState +from qtoolkit.core.exceptions import OutputParsingError +from qtoolkit.io.pbs_base import PBSIOBase # States in PBS from qstat's man. # B Array job: at least one subjob has started. @@ -79,7 +67,7 @@ def qstate(self) -> QState: } -class PBSIO(BaseSchedulerIO): +class PBSIO(PBSIOBase): header_template: str = """ #PBS -q $${queue} #PBS -N $${job_name} @@ -101,64 +89,12 @@ class PBSIO(BaseSchedulerIO): SUBMIT_CMD: str | None = "qsub" CANCEL_CMD: str | None = "qdel" - def parse_submit_output(self, exit_code, stdout, stderr) -> SubmissionResult: - if isinstance(stdout, bytes): - stdout = stdout.decode() - if isinstance(stderr, bytes): - stderr = stderr.decode() - if exit_code != 0: - return SubmissionResult( - exit_code=exit_code, - stdout=stdout, - stderr=stderr, - status=SubmissionStatus("FAILED"), - ) - job_id = stdout.strip() - status = ( - SubmissionStatus("SUCCESSFUL") - if job_id - else SubmissionStatus("JOB_ID_UNKNOWN") - ) - return SubmissionResult( - job_id=job_id, - exit_code=exit_code, - stdout=stdout, - stderr=stderr, - status=status, - ) - - def parse_cancel_output(self, exit_code, stdout, stderr) -> CancelResult: - """Parse the output of the scancel command.""" - # Possible error messages: - # qdel: Unknown Job Id 100 - # qdel: Job has finished 1004 - # Correct execution: no output - if isinstance(stdout, bytes): - stdout = stdout.decode() - if isinstance(stderr, bytes): - stderr = stderr.decode() - if exit_code != 0: - return CancelResult( - exit_code=exit_code, - stdout=stdout, - stderr=stderr, - status=CancelStatus("FAILED"), - ) - - # PBS does not return the job id if the job is successfully deleted - status = CancelStatus("SUCCESSFUL") - return CancelResult( - job_id=None, - exit_code=exit_code, - stdout=stdout, - stderr=stderr, - status=status, - ) - - def _get_job_cmd(self, job_id: str): - cmd = f"qstat -f {job_id}" + def extract_job_id(self, stdout): + return stdout.strip() - return cmd + def extract_job_id_from_cancel(self, stderr): + # PBS doesn't return the job ID if successfully canceled, so return None + return None def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: out = self.parse_jobs_list_output(exit_code, stdout, stderr) @@ -318,141 +254,12 @@ def _convert_str_to_time(time_str: str | None): return time[3] * 86400 + time[2] * 3600 + time[1] * 60 + time[0] @staticmethod - def _convert_memory_str(memory: str | None) -> int | None: - if not memory: - return None - - match = re.match(r"([0-9]+)([a-zA-Z]*)", memory) - if not match: - raise OutputParsingError("No numbers and units parsed") - memory, units = match.groups() - - power_labels = {"kb": 0, "mb": 1, "gb": 2, "tb": 3} - - if not units: - units = "mb" - elif units not in power_labels: - raise OutputParsingError(f"Unknown units {units}") - try: - v = int(memory) - except ValueError: - raise OutputParsingError - - return v * (1024 ** power_labels[units]) - - # helper attribute to match the values defined in QResources and - # the dictionary that should be passed to the template - _qresources_mapping = { - "queue_name": "queue", - "job_name": "job_name", - "account": "account", - "priority": "priority", - "output_filepath": "qout_path", - "error_filepath": "qerr_path", - "project": "group_list", - } + def get_power_labels() -> dict: + return {"kb": 0, "mb": 1, "gb": 2, "tb": 3} @staticmethod - def _convert_time_to_str(time: int | float | timedelta) -> str: - if not isinstance(time, timedelta): - time = timedelta(seconds=time) - - hours, remainder = divmod(int(time.total_seconds()), 3600) - minutes, seconds = divmod(remainder, 60) - - time_str = f"{hours}:{minutes}:{seconds}" - return time_str + def get_default_unit() -> str: + return "mb" - def _convert_qresources(self, resources: QResources) -> dict: - """ - Converts a QResources instance to a dict that will be used to fill in the - header of the submission script. - """ - - header_dict = {} - for qr_field, pbs_field in self._qresources_mapping.items(): - val = getattr(resources, qr_field) - if val is not None: - header_dict[pbs_field] = val - - if resources.njobs and resources.njobs > 1: - header_dict["array"] = f"1-{resources.njobs}" - - if resources.time_limit: - header_dict["walltime"] = self._convert_time_to_str(resources.time_limit) - - if resources.rerunnable is not None: - header_dict["rerunnable"] = "y" if resources.rerunnable else "n" - - nodes, processes, processes_per_node = resources.get_processes_distribution() - select = None - if resources.process_placement == ProcessPlacement.NO_CONSTRAINTS: - select = f"select={processes}" - if resources.threads_per_process: - select += f":ncpus={resources.threads_per_process}" - select += f":ompthreads={resources.threads_per_process}" - if resources.memory_per_thread: - threads_per_process = resources.threads_per_process or 1 - select += f":mem={threads_per_process * resources.memory_per_thread}mb" - elif resources.process_placement in ( - ProcessPlacement.EVENLY_DISTRIBUTED, - ProcessPlacement.SAME_NODE, - ProcessPlacement.SCATTERED, - ): - select = f"select={nodes}" - if resources.threads_per_process and resources.threads_per_process > 1: - cpus = resources.threads_per_process * processes_per_node - ompthreads = resources.threads_per_process - else: - cpus = processes_per_node - ompthreads = None - select += f":ncpus={cpus}" - select += f":mpiprocs={processes_per_node}" - if ompthreads: - select += f":ompthreads={ompthreads}" - if resources.memory_per_thread: - mem = cpus * resources.memory_per_thread - select += f":mem={mem}mb" - - if resources.process_placement in ( - ProcessPlacement.EVENLY_DISTRIBUTED, - ProcessPlacement.SCATTERED, - ): - header_dict["place"] = "scatter" - elif resources.process_placement == ProcessPlacement.SAME_NODE: - header_dict["place"] = "pack" - else: - msg = f"process placement {resources.process_placement} is not supported for PBS" - raise UnsupportedResourcesError(msg) - - header_dict["select"] = select - - if resources.email_address: - header_dict["mail_user"] = resources.email_address - header_dict["mail_type"] = "abe" - - if resources.scheduler_kwargs: - header_dict.update(resources.scheduler_kwargs) - - return header_dict - - @property - def supported_qresources_keys(self) -> list: - """ - List of attributes of QResources that are correctly handled by the - _convert_qresources method. It is used to validate that the user - does not pass an unsupported value, expecting to have an effect. - """ - supported = list(self._qresources_mapping.keys()) - supported += [ - "njobs", - "time_limit", - "processes", - "processes_per_node", - "process_placement", - "nodes", - "threads_per_process", - "email_address", - "scheduler_kwargs", - ] - return supported + def get_system_name(self) -> str: + return "PBS" diff --git a/src/qtoolkit/io/pbs_base.py b/src/qtoolkit/io/pbs_base.py index e69de29..a373849 100644 --- a/src/qtoolkit/io/pbs_base.py +++ b/src/qtoolkit/io/pbs_base.py @@ -0,0 +1,236 @@ +from __future__ import annotations + +import abc +import re +from abc import ABC +from datetime import timedelta + +from qtoolkit.core.data_objects import ( + CancelResult, + CancelStatus, + ProcessPlacement, + QResources, + SubmissionResult, + SubmissionStatus, +) +from qtoolkit.core.exceptions import OutputParsingError, UnsupportedResourcesError +from qtoolkit.io.base import BaseSchedulerIO + + +class PBSIOBase(BaseSchedulerIO, ABC): + """Abstract class for PBS and SGE schedulers.""" + + header_template: str + + SUBMIT_CMD: str | None = "qsub" + CANCEL_CMD: str | None = "qdel" + + def parse_submit_output(self, exit_code, stdout, stderr) -> SubmissionResult: + if isinstance(stdout, bytes): + stdout = stdout.decode() + if isinstance(stderr, bytes): + stderr = stderr.decode() + if exit_code != 0: + return SubmissionResult( + exit_code=exit_code, + stdout=stdout, + stderr=stderr, + status=SubmissionStatus("FAILED"), + ) + job_id = self.extract_job_id(stdout) + status = ( + SubmissionStatus("SUCCESSFUL") + if job_id + else SubmissionStatus("JOB_ID_UNKNOWN") + ) + return SubmissionResult( + job_id=job_id, + exit_code=exit_code, + stdout=stdout, + stderr=stderr, + status=status, + ) + + @abc.abstractmethod + def extract_job_id(self, stdout): + pass + + def parse_cancel_output(self, exit_code, stdout, stderr) -> CancelResult: + """Parse the output of the qdel command.""" + if isinstance(stdout, bytes): + stdout = stdout.decode() + if isinstance(stderr, bytes): + stderr = stderr.decode() + if exit_code != 0: + return CancelResult( + exit_code=exit_code, + stdout=stdout, + stderr=stderr, + status=CancelStatus("FAILED"), + ) + + job_id = self.extract_job_id_from_cancel(stderr) + status = CancelStatus("SUCCESSFUL") + return CancelResult( + job_id=job_id, + exit_code=exit_code, + stdout=stdout, + stderr=stderr, + status=status, + ) + + @abc.abstractmethod + def extract_job_id_from_cancel(self, stderr): + pass + + def _get_job_cmd(self, job_id: str): + cmd = f"qstat -j {job_id}" + return cmd + + @staticmethod + def _convert_memory_str(memory: str | None) -> int | None: + if not memory: + return None + + match = re.match(r"([0-9]+)([a-zA-Z]*)", memory) + if not match: + raise OutputParsingError("No numbers and units parsed") + memory, units = match.groups() + + power_labels = PBSIOBase.get_power_labels() + + if not units: + units = PBSIOBase.get_default_unit() + elif units.lower() not in power_labels: + raise OutputParsingError(f"Unknown units {units}") + try: + v = int(memory) + except ValueError: + raise OutputParsingError + + return v * (1024 ** power_labels[units.lower()]) + + @staticmethod + def get_power_labels() -> dict: + """To be overridden in child classes for specific unit labels""" + raise NotImplementedError("Child class must implement this method") + + @staticmethod + def get_default_unit() -> str: + """To be overridden in child classes for specific default units""" + raise NotImplementedError("Child class must implement this method") + + _qresources_mapping = { + "queue_name": "queue", + "job_name": "job_name", + "priority": "priority", + "output_filepath": "qout_path", + "error_filepath": "qerr_path", + "project": "group_list", + } + + @staticmethod + def _convert_time_to_str(time: int | float | timedelta) -> str: + if not isinstance(time, timedelta): + time = timedelta(seconds=time) + + hours, remainder = divmod(int(time.total_seconds()), 3600) + minutes, seconds = divmod(remainder, 60) + + time_str = f"{hours}:{minutes}:{seconds}" + return time_str + + def _convert_qresources(self, resources: QResources) -> dict: + header_dict = {} + for qr_field, system_field in self._qresources_mapping.items(): + val = getattr(resources, qr_field) + if val is not None: + header_dict[system_field] = val + + if resources.njobs and resources.njobs > 1: + header_dict["array"] = f"1-{resources.njobs}" + + if resources.time_limit: + header_dict["walltime"] = self._convert_time_to_str(resources.time_limit) + self._add_soft_walltime(header_dict, resources) + + if resources.rerunnable is not None: + header_dict["rerunnable"] = "y" if resources.rerunnable else "n" + + # Build select clause logic directly within _convert_qresources + nodes, processes, processes_per_node = resources.get_processes_distribution() + select = None + if resources.process_placement == ProcessPlacement.NO_CONSTRAINTS: + select = f"select={processes}" + if resources.threads_per_process: + select += f":ncpus={resources.threads_per_process}" + select += f":ompthreads={resources.threads_per_process}" + if resources.memory_per_thread: + threads_per_process = resources.threads_per_process or 1 + select += f":mem={threads_per_process * resources.memory_per_thread}mb" + elif resources.process_placement in ( + ProcessPlacement.EVENLY_DISTRIBUTED, + ProcessPlacement.SAME_NODE, + ProcessPlacement.SCATTERED, + ): + select = f"select={nodes}" + if resources.threads_per_process and resources.threads_per_process > 1: + cpus = resources.threads_per_process * processes_per_node + ompthreads = resources.threads_per_process + else: + cpus = processes_per_node + ompthreads = None + select += f":ncpus={cpus}" + select += f":mpiprocs={processes_per_node}" + if ompthreads: + select += f":ompthreads={ompthreads}" + if resources.memory_per_thread: + mem = cpus * resources.memory_per_thread + select += f":mem={mem}mb" + + if resources.process_placement in ( + ProcessPlacement.EVENLY_DISTRIBUTED, + ProcessPlacement.SCATTERED, + ): + header_dict["place"] = "scatter" + elif resources.process_placement == ProcessPlacement.SAME_NODE: + header_dict["place"] = "pack" + else: + raise UnsupportedResourcesError( + f"process placement {resources.process_placement} is not supported for {self.get_system_name()}" + ) + + header_dict["select"] = select + + if resources.email_address: + header_dict["mail_user"] = resources.email_address + header_dict["mail_type"] = "abe" + + if resources.scheduler_kwargs: + header_dict.update(resources.scheduler_kwargs) + + return header_dict + + @abc.abstractmethod + def _add_soft_walltime(self, header_dict: dict, resources: QResources): + """Add soft_walltime if required by child classes (SGE).""" + + @abc.abstractmethod + def get_system_name(self) -> str: + """This should return the system name (PBS or SGE) for error messages.""" + + @property + def supported_qresources_keys(self) -> list: + supported = list(self._qresources_mapping.keys()) + supported += [ + "njobs", + "time_limit", + "processes", + "processes_per_node", + "process_placement", + "nodes", + "threads_per_process", + "email_address", + "scheduler_kwargs", + ] + return supported diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index 96d3c7d..39a5ef3 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -3,26 +3,10 @@ import re import xml.dom.minidom import xml.parsers.expat -from datetime import timedelta - -from qtoolkit.core.data_objects import ( - CancelResult, - CancelStatus, - ProcessPlacement, - QJob, - QJobInfo, - QResources, - QState, - QSubState, - SubmissionResult, - SubmissionStatus, -) -from qtoolkit.core.exceptions import ( - CommandFailedError, - OutputParsingError, - UnsupportedResourcesError, -) -from qtoolkit.io.base import BaseSchedulerIO + +from qtoolkit.core.data_objects import QJob, QJobInfo, QResources, QState, QSubState +from qtoolkit.core.exceptions import CommandFailedError, OutputParsingError +from qtoolkit.io.pbs_base import PBSIOBase # https://wiki.nikhil.io/Ancient_Sysadmin_Stuff/Sun_Grid_Engine_States/ # https://manpages.ubuntu.com/manpages/jammy/en/man5/sge_status.5.html @@ -118,7 +102,7 @@ def qstate(self) -> QState: } -class SGEIO(BaseSchedulerIO): +class SGEIO(PBSIOBase): header_template: str = """ #$ -cwd $${cwd} #$ -q $${queue} @@ -145,64 +129,17 @@ class SGEIO(BaseSchedulerIO): def __init__(self, get_job_executable: str = "qstat"): self.get_job_executable = get_job_executable - def parse_submit_output(self, exit_code, stdout, stderr) -> SubmissionResult: - if isinstance(stdout, bytes): - stdout = stdout.decode() - if isinstance(stderr, bytes): - stderr = stderr.decode() - if exit_code != 0: - return SubmissionResult( - exit_code=exit_code, - stdout=stdout, - stderr=stderr, - status=SubmissionStatus("FAILED"), - ) + def extract_job_id(self, stdout): match = re.search(r'Your job (\d+) \(".*?"\) has been submitted', stdout) if not match: raise OutputParsingError("Failed to parse job ID from stdout") - job_id = match.group(1) - status = ( - SubmissionStatus("SUCCESSFUL") - if job_id - else SubmissionStatus("JOB_ID_UNKNOWN") - ) - return SubmissionResult( - job_id=job_id, - exit_code=exit_code, - stdout=stdout, - stderr=stderr, - status=status, - ) + return match.group(1) - def parse_cancel_output(self, exit_code, stdout, stderr) -> CancelResult: - """Parse the output of the qdel command.""" - if isinstance(stdout, bytes): - stdout = stdout.decode() - if isinstance(stderr, bytes): - stderr = stderr.decode() - if exit_code != 0: - return CancelResult( - exit_code=exit_code, - stdout=stdout, - stderr=stderr, - status=CancelStatus("FAILED"), - ) + def extract_job_id_from_cancel(self, stderr): match = re.search(r"qdel: job (\d+) deleted", stderr) if not match: raise OutputParsingError("Failed to parse job ID from stdout") - job_id = match.group(1) - status = CancelStatus("SUCCESSFUL") - return CancelResult( - job_id=job_id, - exit_code=exit_code, - stdout=stdout, - stderr=stderr, - status=status, - ) - - def _get_job_cmd(self, job_id: str): - cmd = f"qstat -j {job_id}" - return cmd + return match.group(1) def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: # aiida style if exit_code != 0: @@ -408,133 +345,23 @@ def _convert_str_to_time(time_str: str | None) -> int | None: raise OutputParsingError(f"Invalid time format: {time_str}") @staticmethod - def _convert_memory_str(memory: str | None) -> int | None: - if not memory: - return None - - match = re.match(r"([0-9]+)([a-zA-Z]*)", memory) - if not match: - raise OutputParsingError("No numbers and units parsed") - memory, units = match.groups() - - power_labels = {"K": 0, "M": 1, "G": 2, "T": 3} - - if not units: - units = "M" - elif units not in power_labels: - raise OutputParsingError(f"Unknown units {units}") - try: - v = int(memory) - except ValueError: - raise OutputParsingError - - return v * (1024 ** power_labels[units.upper()]) - - _qresources_mapping = { - "queue_name": "queue", - "job_name": "job_name", - "priority": "priority", - "output_filepath": "qout_path", - "error_filepath": "qerr_path", - "project": "group_list", - } + def get_power_labels() -> dict: + return {"k": 0, "m": 1, "g": 2, "t": 3} @staticmethod - def _convert_time_to_str(time: int | float | timedelta) -> str: - if not isinstance(time, timedelta): - time = timedelta(seconds=time) - - hours, remainder = divmod(int(time.total_seconds()), 3600) - minutes, seconds = divmod(remainder, 60) - - time_str = f"{hours}:{minutes}:{seconds}" - return time_str - - def _convert_qresources(self, resources: QResources) -> dict: - header_dict = {} - for qr_field, sge_field in self._qresources_mapping.items(): - val = getattr(resources, qr_field) - if val is not None: - header_dict[sge_field] = val - - if resources.njobs and resources.njobs > 1: - header_dict["array"] = f"1-{resources.njobs}" - - if resources.time_limit: - header_dict["walltime"] = self._convert_time_to_str(resources.time_limit) - header_dict["soft_walltime"] = self._convert_time_to_str( - resources.time_limit * 0.99 - ) - - if resources.rerunnable is not None: - header_dict["rerunnable"] = "y" if resources.rerunnable else "n" - - nodes, processes, processes_per_node = resources.get_processes_distribution() - select = None - if resources.process_placement == ProcessPlacement.NO_CONSTRAINTS: - select = f"select={processes}" - if resources.threads_per_process: - select += f":ncpus={resources.threads_per_process}" - select += f":ompthreads={resources.threads_per_process}" - if resources.memory_per_thread: - threads_per_process = resources.threads_per_process or 1 - select += f":mem={threads_per_process * resources.memory_per_thread}mb" - elif resources.process_placement in ( - ProcessPlacement.EVENLY_DISTRIBUTED, - ProcessPlacement.SAME_NODE, - ProcessPlacement.SCATTERED, - ): - select = f"select={nodes}" - if resources.threads_per_process and resources.threads_per_process > 1: - cpus = resources.threads_per_process * processes_per_node - ompthreads = resources.threads_per_process - else: - cpus = processes_per_node - ompthreads = None - select += f":ncpus={cpus}" - select += f":mpiprocs={processes_per_node}" - if ompthreads: - select += f":ompthreads={ompthreads}" - if resources.memory_per_thread: - mem = cpus * resources.memory_per_thread - select += f":mem={mem}mb" - - if resources.process_placement in ( - ProcessPlacement.EVENLY_DISTRIBUTED, - ProcessPlacement.SCATTERED, - ): - header_dict["place"] = "scatter" - elif resources.process_placement == ProcessPlacement.SAME_NODE: - header_dict["place"] = "pack" - else: - msg = f"process placement {resources.process_placement} is not supported for SGE" - raise UnsupportedResourcesError(msg) + def get_default_unit() -> str: + return "M" - header_dict["select"] = select + def get_system_name(self) -> str: + return "SGE" - if resources.email_address: - header_dict["mail_user"] = resources.email_address - header_dict["mail_type"] = "abe" - - if resources.scheduler_kwargs: - header_dict.update(resources.scheduler_kwargs) - - return header_dict + def _add_soft_walltime(self, header_dict: dict, resources: QResources): + header_dict["soft_walltime"] = self._convert_time_to_str( + resources.time_limit * 0.99 + ) @property def supported_qresources_keys(self) -> list: - supported = list(self._qresources_mapping.keys()) - supported += [ - "njobs", - "memory_per_thread", - "time_limit", - "processes", - "processes_per_node", - "process_placement", - "nodes", - "threads_per_process", - "email_address", - "scheduler_kwargs", - "gpus_per_job", - ] + supported = super().supported_qresources_keys + supported += ["memory_per_thread", "gpus_per_job"] return supported From fee9e758c75fc7b01bbf34c3f5d060e9b0871279 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Fri, 20 Sep 2024 17:20:12 +0200 Subject: [PATCH 24/35] changing _convert_str_to_time for PBSIOBase --- src/qtoolkit/io/pbs_base.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/src/qtoolkit/io/pbs_base.py b/src/qtoolkit/io/pbs_base.py index a373849..39296be 100644 --- a/src/qtoolkit/io/pbs_base.py +++ b/src/qtoolkit/io/pbs_base.py @@ -87,8 +87,7 @@ def _get_job_cmd(self, job_id: str): cmd = f"qstat -j {job_id}" return cmd - @staticmethod - def _convert_memory_str(memory: str | None) -> int | None: + def _convert_memory_str(self, memory: str | None) -> int | None: if not memory: return None @@ -97,12 +96,14 @@ def _convert_memory_str(memory: str | None) -> int | None: raise OutputParsingError("No numbers and units parsed") memory, units = match.groups() - power_labels = PBSIOBase.get_power_labels() + # Now we call the methods specific to the child class (PBSIO or SGEIO) + power_labels = self.get_power_labels() if not units: - units = PBSIOBase.get_default_unit() + units = self.get_default_unit() elif units.lower() not in power_labels: raise OutputParsingError(f"Unknown units {units}") + try: v = int(memory) except ValueError: @@ -110,16 +111,6 @@ def _convert_memory_str(memory: str | None) -> int | None: return v * (1024 ** power_labels[units.lower()]) - @staticmethod - def get_power_labels() -> dict: - """To be overridden in child classes for specific unit labels""" - raise NotImplementedError("Child class must implement this method") - - @staticmethod - def get_default_unit() -> str: - """To be overridden in child classes for specific default units""" - raise NotImplementedError("Child class must implement this method") - _qresources_mapping = { "queue_name": "queue", "job_name": "job_name", From ffbc46266671a336e325bb3e68fdf81aba4f1ef4 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Fri, 20 Sep 2024 17:21:07 +0200 Subject: [PATCH 25/35] fixing unit tests because of soft walltime changes --- tests/io/test_sge.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/io/test_sge.py b/tests/io/test_sge.py index 49f7ca5..c6d8020 100644 --- a/tests/io/test_sge.py +++ b/tests/io/test_sge.py @@ -128,6 +128,7 @@ def test_convert_str_to_time(self, sge_io): sge_io._convert_str_to_time("2:10:a") def test_convert_memory_str(self, sge_io): + assert isinstance(sge_io, SGEIO) memory_kb = sge_io._convert_memory_str(None) assert memory_kb is None memory_kb = sge_io._convert_memory_str("") @@ -200,7 +201,7 @@ def test_check_convert_qresources(self, sge_io): "array": "1-4", "walltime": "10:51:13", "select": "select=4:ncpus=6:mpiprocs=3:ompthreads=2:mem=12288mb", - "soft_walltime": "9:46:5", + "soft_walltime": "10:44:42", "mail_user": "john.doe@submit.qtk", "mail_type": "abe", "tata": "toto", @@ -214,7 +215,7 @@ def test_check_convert_qresources(self, sge_io): header_dict = sge_io.check_convert_qresources(resources=res) assert header_dict == { "walltime": "10:51:13", - "soft_walltime": "9:46:5", + "soft_walltime": "10:44:42", "select": "select=24", # also not sure about this } @@ -262,7 +263,7 @@ def test_submission_script(self, sge_io, maximalist_qresources): #$ -N test_job #$ -l select=1:ncpus=1:mpiprocs=1:mem=1000mb #$ -l h_rt=0:1:40 -#$ -l s_rt=0:1:30 +#$ -l s_rt=0:1:39 #$ -binding scatter #$ -M test_email_address@email.address #$ -m abe From eaa9f33f451bab7cd25ad45c2a0303a89fcba28f Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Fri, 20 Sep 2024 17:44:10 +0200 Subject: [PATCH 26/35] implement that in SGE one can also query by a job_list and move similar parts to pbs_base.py --- src/qtoolkit/io/pbs.py | 21 ++++----------------- src/qtoolkit/io/pbs_base.py | 29 +++++++++++++++++++++++++++++ src/qtoolkit/io/sge.py | 12 +++++------- 3 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/qtoolkit/io/pbs.py b/src/qtoolkit/io/pbs.py index 04e1edc..5408cc6 100644 --- a/src/qtoolkit/io/pbs.py +++ b/src/qtoolkit/io/pbs.py @@ -102,24 +102,11 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: return out[0] return None - def _get_jobs_list_cmd( - self, job_ids: list[str] | None = None, user: str | None = None - ) -> str: - if user and job_ids: - raise ValueError("Cannot query by user and job(s) in PBS") + def _get_base_command(self) -> list[str]: + return ["qstat", "-f"] - command = [ - "qstat", - "-f", - ] - - if user: - command.append(f"-u {user}") - - if job_ids: - command.append(" ".join(job_ids)) - - return " ".join(command) + def _get_job_ids_flag(self, job_ids_str: str) -> str: + return job_ids_str def parse_jobs_list_output(self, exit_code, stdout, stderr) -> list[QJob]: if isinstance(stdout, bytes): diff --git a/src/qtoolkit/io/pbs_base.py b/src/qtoolkit/io/pbs_base.py index 39296be..11bdc8d 100644 --- a/src/qtoolkit/io/pbs_base.py +++ b/src/qtoolkit/io/pbs_base.py @@ -83,6 +83,35 @@ def parse_cancel_output(self, exit_code, stdout, stderr) -> CancelResult: def extract_job_id_from_cancel(self, stderr): pass + def _get_jobs_list_cmd( + self, job_ids: list[str] | None = None, user: str | None = None + ) -> str: + if user and job_ids: + self._check_user_and_job_ids_conflict() + + command = self._get_base_command() + + if user: + command.append(f"-u {user}") + + if job_ids: + job_ids_str = ",".join(job_ids) + command.append(self._get_job_ids_flag(job_ids_str)) + + return " ".join(command) + + def _check_user_and_job_ids_conflict(self): + # Use get_system_name() for more informative error messages + raise ValueError(f"Cannot query by user and job(s) in {self.get_system_name()}") + + @abc.abstractmethod + def _get_base_command(self) -> list[str]: + pass + + @abc.abstractmethod + def _get_job_ids_flag(self, job_ids_str: str) -> str: + pass + def _get_job_cmd(self, job_id: str): cmd = f"qstat -j {job_id}" return cmd diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index 39a5ef3..329e009 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -262,13 +262,11 @@ def _safe_int(self, value: str | None) -> int | None: except ValueError: return None - def _get_jobs_list_cmd( - self, job_ids: list[str] | None = None, user: str | None = None - ) -> str: - if job_ids: - raise ValueError("Cannot query by job ids list in SGE") - user = user if user else "*" - return f"qstat -ext -urg -xml -u {user}" + def _get_base_command(self) -> list[str]: + return ["qstat", "-ext", "-urg", "-xml"] + + def _get_job_ids_flag(self, job_ids_str: str) -> str: + return f"-j {job_ids_str}" def parse_jobs_list_output(self, exit_code, stdout, stderr) -> list[QJob]: if exit_code != 0: From 09e37486fc5379a95069fc1ce16e785dbf8cbfe2 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Fri, 20 Sep 2024 17:44:50 +0200 Subject: [PATCH 27/35] adjust test_get_jobs_list_cmd for the changes --- tests/io/test_sge.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/io/test_sge.py b/tests/io/test_sge.py index c6d8020..efe354e 100644 --- a/tests/io/test_sge.py +++ b/tests/io/test_sge.py @@ -102,14 +102,16 @@ def test_get_job_cmd(self, sge_io): assert cmd == "qstat -j 56" def test_get_jobs_list_cmd(self, sge_io): - with pytest.raises(ValueError, match=r"Cannot query by job ids list in SGE"): + with pytest.raises( + ValueError, match=r"Cannot query by user and job\(s\) in SGE" + ): sge_io._get_jobs_list_cmd(job_ids=["1"], user="johndoe") cmd = sge_io._get_jobs_list_cmd(user="johndoe") - assert cmd == ("qstat -ext -urg -xml -u johndoe") - with pytest.raises(ValueError, match=r"Cannot query by job ids list in SGE"): - sge_io._get_jobs_list_cmd(job_ids=["1", "3", "56", "15"]) - with pytest.raises(ValueError, match=r"Cannot query by job ids list in SGE"): - sge_io._get_jobs_list_cmd(job_ids=["1"]) + assert cmd == "qstat -ext -urg -xml -u johndoe" + cmd = sge_io._get_jobs_list_cmd(job_ids=["1", "3", "56", "15"]) + assert cmd == "qstat -ext -urg -xml -j 1,3,56,15" + cmd = sge_io._get_jobs_list_cmd(job_ids=["1"]) + assert cmd == "qstat -ext -urg -xml -j 1" def test_convert_str_to_time(self, sge_io): time_seconds = sge_io._convert_str_to_time("10:51:13") From e34a169ba7d31afbbbf34fd8ca656049a37650c8 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Wed, 25 Sep 2024 16:52:50 +0200 Subject: [PATCH 28/35] renamed _get_base_command to _get_qstat_base_command --- src/qtoolkit/io/pbs.py | 2 +- src/qtoolkit/io/pbs_base.py | 4 ++-- src/qtoolkit/io/sge.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/qtoolkit/io/pbs.py b/src/qtoolkit/io/pbs.py index 5408cc6..0587ec0 100644 --- a/src/qtoolkit/io/pbs.py +++ b/src/qtoolkit/io/pbs.py @@ -102,7 +102,7 @@ def parse_job_output(self, exit_code, stdout, stderr) -> QJob | None: return out[0] return None - def _get_base_command(self) -> list[str]: + def _get_qstat_base_command(self) -> list[str]: return ["qstat", "-f"] def _get_job_ids_flag(self, job_ids_str: str) -> str: diff --git a/src/qtoolkit/io/pbs_base.py b/src/qtoolkit/io/pbs_base.py index 11bdc8d..7b8d950 100644 --- a/src/qtoolkit/io/pbs_base.py +++ b/src/qtoolkit/io/pbs_base.py @@ -89,7 +89,7 @@ def _get_jobs_list_cmd( if user and job_ids: self._check_user_and_job_ids_conflict() - command = self._get_base_command() + command = self._get_qstat_base_command() if user: command.append(f"-u {user}") @@ -105,7 +105,7 @@ def _check_user_and_job_ids_conflict(self): raise ValueError(f"Cannot query by user and job(s) in {self.get_system_name()}") @abc.abstractmethod - def _get_base_command(self) -> list[str]: + def _get_qstat_base_command(self) -> list[str]: pass @abc.abstractmethod diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index 329e009..f6650e9 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -262,7 +262,7 @@ def _safe_int(self, value: str | None) -> int | None: except ValueError: return None - def _get_base_command(self) -> list[str]: + def _get_qstat_base_command(self) -> list[str]: return ["qstat", "-ext", "-urg", "-xml"] def _get_job_ids_flag(self, job_ids_str: str) -> str: From ad8077f2db09693ad5b572f0396dd951052d0a09 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Wed, 25 Sep 2024 17:08:41 +0200 Subject: [PATCH 29/35] moved _qresources_mapping to the subclasses --- src/qtoolkit/io/pbs.py | 10 ++++++++++ src/qtoolkit/io/pbs_base.py | 12 +++--------- src/qtoolkit/io/sge.py | 9 +++++++++ 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/qtoolkit/io/pbs.py b/src/qtoolkit/io/pbs.py index 0587ec0..0cb5ece 100644 --- a/src/qtoolkit/io/pbs.py +++ b/src/qtoolkit/io/pbs.py @@ -89,6 +89,16 @@ class PBSIO(PBSIOBase): SUBMIT_CMD: str | None = "qsub" CANCEL_CMD: str | None = "qdel" + _qresources_mapping = { + "queue_name": "queue", + "job_name": "job_name", + "account": "account", + "priority": "priority", + "output_filepath": "qout_path", + "error_filepath": "qerr_path", + "project": "group_list", + } + def extract_job_id(self, stdout): return stdout.strip() diff --git a/src/qtoolkit/io/pbs_base.py b/src/qtoolkit/io/pbs_base.py index 7b8d950..8a24b3a 100644 --- a/src/qtoolkit/io/pbs_base.py +++ b/src/qtoolkit/io/pbs_base.py @@ -25,6 +25,9 @@ class PBSIOBase(BaseSchedulerIO, ABC): SUBMIT_CMD: str | None = "qsub" CANCEL_CMD: str | None = "qdel" + def __init__(self): + self._qresources_mapping = None + def parse_submit_output(self, exit_code, stdout, stderr) -> SubmissionResult: if isinstance(stdout, bytes): stdout = stdout.decode() @@ -140,15 +143,6 @@ def _convert_memory_str(self, memory: str | None) -> int | None: return v * (1024 ** power_labels[units.lower()]) - _qresources_mapping = { - "queue_name": "queue", - "job_name": "job_name", - "priority": "priority", - "output_filepath": "qout_path", - "error_filepath": "qerr_path", - "project": "group_list", - } - @staticmethod def _convert_time_to_str(time: int | float | timedelta) -> str: if not isinstance(time, timedelta): diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index f6650e9..42e1445 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -127,7 +127,16 @@ class SGEIO(PBSIOBase): CANCEL_CMD: str | None = "qdel" def __init__(self, get_job_executable: str = "qstat"): + super().__init__() self.get_job_executable = get_job_executable + self._qresources_mapping = { + "queue_name": "queue", + "job_name": "job_name", + "priority": "priority", + "output_filepath": "qout_path", + "error_filepath": "qerr_path", + "project": "group_list", + } def extract_job_id(self, stdout): match = re.search(r'Your job (\d+) \(".*?"\) has been submitted', stdout) From 3fae98721b391d12f042925a11c575febb7fccab Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Wed, 25 Sep 2024 17:17:26 +0200 Subject: [PATCH 30/35] moved _qresources_mapping to the subclasses --- src/qtoolkit/io/pbs.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/qtoolkit/io/pbs.py b/src/qtoolkit/io/pbs.py index 0cb5ece..f835102 100644 --- a/src/qtoolkit/io/pbs.py +++ b/src/qtoolkit/io/pbs.py @@ -89,15 +89,18 @@ class PBSIO(PBSIOBase): SUBMIT_CMD: str | None = "qsub" CANCEL_CMD: str | None = "qdel" - _qresources_mapping = { - "queue_name": "queue", - "job_name": "job_name", - "account": "account", - "priority": "priority", - "output_filepath": "qout_path", - "error_filepath": "qerr_path", - "project": "group_list", - } + def __init__(self, get_job_executable: str = "qstat"): + super().__init__() + self.get_job_executable = get_job_executable + self._qresources_mapping = { + "queue_name": "queue", + "job_name": "job_name", + "account": "account", + "priority": "priority", + "output_filepath": "qout_path", + "error_filepath": "qerr_path", + "project": "group_list", + } def extract_job_id(self, stdout): return stdout.strip() From a46aebd031681149dc02d3cba780a0acffc02b65 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Wed, 25 Sep 2024 17:22:19 +0200 Subject: [PATCH 31/35] moved _qresources_mapping to the subclasses --- src/qtoolkit/io/pbs.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/qtoolkit/io/pbs.py b/src/qtoolkit/io/pbs.py index f835102..526c62f 100644 --- a/src/qtoolkit/io/pbs.py +++ b/src/qtoolkit/io/pbs.py @@ -89,9 +89,8 @@ class PBSIO(PBSIOBase): SUBMIT_CMD: str | None = "qsub" CANCEL_CMD: str | None = "qdel" - def __init__(self, get_job_executable: str = "qstat"): + def __init__(self): super().__init__() - self.get_job_executable = get_job_executable self._qresources_mapping = { "queue_name": "queue", "job_name": "job_name", From 74d1867d0e2c22737a30a1ad7bf78b2fd52fa351 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Wed, 25 Sep 2024 17:30:41 +0200 Subject: [PATCH 32/35] moverd system_name, default_unit and power_labels to class attributes --- src/qtoolkit/io/pbs.py | 14 +++----------- src/qtoolkit/io/pbs_base.py | 17 ++++++++--------- src/qtoolkit/io/sge.py | 14 +++----------- 3 files changed, 14 insertions(+), 31 deletions(-) diff --git a/src/qtoolkit/io/pbs.py b/src/qtoolkit/io/pbs.py index 526c62f..98acfc5 100644 --- a/src/qtoolkit/io/pbs.py +++ b/src/qtoolkit/io/pbs.py @@ -91,6 +91,9 @@ class PBSIO(PBSIOBase): def __init__(self): super().__init__() + self.system_name = "PBS" + self.default_unit = "mb" + self.power_labels = {"kb": 0, "mb": 1, "gb": 2, "tb": 3} self._qresources_mapping = { "queue_name": "queue", "job_name": "job_name", @@ -251,14 +254,3 @@ def _convert_str_to_time(time_str: str | None): raise OutputParsingError() return time[3] * 86400 + time[2] * 3600 + time[1] * 60 + time[0] - - @staticmethod - def get_power_labels() -> dict: - return {"kb": 0, "mb": 1, "gb": 2, "tb": 3} - - @staticmethod - def get_default_unit() -> str: - return "mb" - - def get_system_name(self) -> str: - return "PBS" diff --git a/src/qtoolkit/io/pbs_base.py b/src/qtoolkit/io/pbs_base.py index 8a24b3a..86a7e2b 100644 --- a/src/qtoolkit/io/pbs_base.py +++ b/src/qtoolkit/io/pbs_base.py @@ -27,6 +27,9 @@ class PBSIOBase(BaseSchedulerIO, ABC): def __init__(self): self._qresources_mapping = None + self.system_name = None + self.default_unit = None + self.power_labels = None def parse_submit_output(self, exit_code, stdout, stderr) -> SubmissionResult: if isinstance(stdout, bytes): @@ -104,8 +107,8 @@ def _get_jobs_list_cmd( return " ".join(command) def _check_user_and_job_ids_conflict(self): - # Use get_system_name() for more informative error messages - raise ValueError(f"Cannot query by user and job(s) in {self.get_system_name()}") + # Use system_name for more informative error messages + raise ValueError(f"Cannot query by user and job(s) in {self.system_name}") @abc.abstractmethod def _get_qstat_base_command(self) -> list[str]: @@ -129,10 +132,10 @@ def _convert_memory_str(self, memory: str | None) -> int | None: memory, units = match.groups() # Now we call the methods specific to the child class (PBSIO or SGEIO) - power_labels = self.get_power_labels() + power_labels = self.power_labels if not units: - units = self.get_default_unit() + units = self.default_unit elif units.lower() not in power_labels: raise OutputParsingError(f"Unknown units {units}") @@ -211,7 +214,7 @@ def _convert_qresources(self, resources: QResources) -> dict: header_dict["place"] = "pack" else: raise UnsupportedResourcesError( - f"process placement {resources.process_placement} is not supported for {self.get_system_name()}" + f"process placement {resources.process_placement} is not supported for {self.system_name}" ) header_dict["select"] = select @@ -229,10 +232,6 @@ def _convert_qresources(self, resources: QResources) -> dict: def _add_soft_walltime(self, header_dict: dict, resources: QResources): """Add soft_walltime if required by child classes (SGE).""" - @abc.abstractmethod - def get_system_name(self) -> str: - """This should return the system name (PBS or SGE) for error messages.""" - @property def supported_qresources_keys(self) -> list: supported = list(self._qresources_mapping.keys()) diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index 42e1445..ea0dd89 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -129,6 +129,9 @@ class SGEIO(PBSIOBase): def __init__(self, get_job_executable: str = "qstat"): super().__init__() self.get_job_executable = get_job_executable + self.system_name = "SGE" + self.default_unit = "M" + self.power_labels = {"k": 0, "m": 1, "g": 2, "t": 3} self._qresources_mapping = { "queue_name": "queue", "job_name": "job_name", @@ -351,17 +354,6 @@ def _convert_str_to_time(time_str: str | None) -> int | None: except ValueError: raise OutputParsingError(f"Invalid time format: {time_str}") - @staticmethod - def get_power_labels() -> dict: - return {"k": 0, "m": 1, "g": 2, "t": 3} - - @staticmethod - def get_default_unit() -> str: - return "M" - - def get_system_name(self) -> str: - return "SGE" - def _add_soft_walltime(self, header_dict: dict, resources: QResources): header_dict["soft_walltime"] = self._convert_time_to_str( resources.time_limit * 0.99 From e96891370499615a2ad6690e944e2f6981a43fd6 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Wed, 25 Sep 2024 17:40:18 +0200 Subject: [PATCH 33/35] remove scope=session from maximalist_qresources --- tests/conftest.py | 2 +- tests/io/test_slurm.py | 13 ++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 004a414..c82ebd0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -119,7 +119,7 @@ def test_utils(): return TestUtils -@pytest.fixture(scope="session") +@pytest.fixture() # scope="session") def maximalist_qresources(): """A set of QResources options that try to make use of most features""" from qtoolkit.core.data_objects import QResources diff --git a/tests/io/test_slurm.py b/tests/io/test_slurm.py index edc7cee..21f3568 100644 --- a/tests/io/test_slurm.py +++ b/tests/io/test_slurm.py @@ -279,8 +279,11 @@ def test_submission_script(self, slurm_io, maximalist_qresources): script_qresources = slurm_io.get_submission_script( commands=["ls -l"], options=maximalist_qresources ) - assert script_qresources.split( - "\n" - ) == "#!/bin/bash\n\n#SBATCH --partition=test_queue\n#SBATCH --job-name=test_job\n#SBATCH --nodes=1\n#SBATCH --ntasks=1\n#SBATCH --ntasks-per-node=1\n#SBATCH --cpus-per-task=1\n#SBATCH --mem-per-cpu=1000\n#SBATCH --time=0-0:1:40\n#SBATCH --account=test_account\n#SBATCH --mail-user=test_email_address@email.address\n#SBATCH --mail-type=ALL\n#SBATCH --gres=gpu:1\n#SBATCH --output=test_output_filepath\n#SBATCH --error=test_error_filepath\n#SBATCH --qos=test_qos\n#SBATCH --priority=1\nls -l".split( - "\n" - ) + assert script_qresources.split("\n") == ( + "#!/bin/bash\n\n#SBATCH --partition=test_queue\n#SBATCH --job-name=test_job\n#SBATCH --nodes=1\n#SBATCH " + "--ntasks=1\n#SBATCH --ntasks-per-node=1\n#SBATCH --cpus-per-task=1\n#SBATCH " + "--mem-per-cpu=1000\n#SBATCH --time=0-0:1:40\n#SBATCH --account=test_account\n#SBATCH " + "--mail-user=test_email_address@email.address\n#SBATCH --mail-type=ALL\n#SBATCH --gres=gpu:1\n#SBATCH " + "--output=test_output_filepath\n#SBATCH --error=test_error_filepath\n#SBATCH --qos=test_qos\n#SBATCH " + "--priority=1\nls -l" + ).split("\n") From 5b672adcc2bc1ba9b1648ef1aa126bfb473ecb38 Mon Sep 17 00:00:00 2001 From: QuantumChemist Date: Mon, 30 Sep 2024 18:09:07 +0200 Subject: [PATCH 34/35] made instance attributes to class attributes --- src/qtoolkit/io/pbs.py | 27 ++++++++++++--------------- src/qtoolkit/io/pbs_base.py | 10 ++++------ src/qtoolkit/io/sge.py | 22 +++++++++++----------- 3 files changed, 27 insertions(+), 32 deletions(-) diff --git a/src/qtoolkit/io/pbs.py b/src/qtoolkit/io/pbs.py index 98acfc5..eec2ea0 100644 --- a/src/qtoolkit/io/pbs.py +++ b/src/qtoolkit/io/pbs.py @@ -88,21 +88,18 @@ class PBSIO(PBSIOBase): SUBMIT_CMD: str | None = "qsub" CANCEL_CMD: str | None = "qdel" - - def __init__(self): - super().__init__() - self.system_name = "PBS" - self.default_unit = "mb" - self.power_labels = {"kb": 0, "mb": 1, "gb": 2, "tb": 3} - self._qresources_mapping = { - "queue_name": "queue", - "job_name": "job_name", - "account": "account", - "priority": "priority", - "output_filepath": "qout_path", - "error_filepath": "qerr_path", - "project": "group_list", - } + system_name: str = "PBS" + default_unit: str = "mb" + power_labels: dict = {"kb": 0, "mb": 1, "gb": 2, "tb": 3} + _qresources_mapping: dict = { + "queue_name": "queue", + "job_name": "job_name", + "account": "account", + "priority": "priority", + "output_filepath": "qout_path", + "error_filepath": "qerr_path", + "project": "group_list", + } def extract_job_id(self, stdout): return stdout.strip() diff --git a/src/qtoolkit/io/pbs_base.py b/src/qtoolkit/io/pbs_base.py index 86a7e2b..d414dfa 100644 --- a/src/qtoolkit/io/pbs_base.py +++ b/src/qtoolkit/io/pbs_base.py @@ -24,12 +24,10 @@ class PBSIOBase(BaseSchedulerIO, ABC): SUBMIT_CMD: str | None = "qsub" CANCEL_CMD: str | None = "qdel" - - def __init__(self): - self._qresources_mapping = None - self.system_name = None - self.default_unit = None - self.power_labels = None + _qresources_mapping: dict + system_name: str + default_unit: str + power_labels: dict def parse_submit_output(self, exit_code, stdout, stderr) -> SubmissionResult: if isinstance(stdout, bytes): diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index ea0dd89..b9730be 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -125,21 +125,21 @@ class SGEIO(PBSIOBase): SUBMIT_CMD: str | None = "qsub" CANCEL_CMD: str | None = "qdel" + system_name: str = "SGE" + default_unit: str = "M" + power_labels: dict = {"k": 0, "m": 1, "g": 2, "t": 3} + _qresources_mapping: dict = { + "queue_name": "queue", + "job_name": "job_name", + "priority": "priority", + "output_filepath": "qout_path", + "error_filepath": "qerr_path", + "project": "group_list", + } def __init__(self, get_job_executable: str = "qstat"): super().__init__() self.get_job_executable = get_job_executable - self.system_name = "SGE" - self.default_unit = "M" - self.power_labels = {"k": 0, "m": 1, "g": 2, "t": 3} - self._qresources_mapping = { - "queue_name": "queue", - "job_name": "job_name", - "priority": "priority", - "output_filepath": "qout_path", - "error_filepath": "qerr_path", - "project": "group_list", - } def extract_job_id(self, stdout): match = re.search(r'Your job (\d+) \(".*?"\) has been submitted', stdout) From 383aadad0ee26e52f6618f8e883183d81e44cd95 Mon Sep 17 00:00:00 2001 From: Christina Ertural <52951132+QuantumChemist@users.noreply.github.com> Date: Thu, 24 Oct 2024 16:49:18 +0200 Subject: [PATCH 35/35] Update sge.py --- src/qtoolkit/io/sge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/qtoolkit/io/sge.py b/src/qtoolkit/io/sge.py index b9730be..19bb38c 100644 --- a/src/qtoolkit/io/sge.py +++ b/src/qtoolkit/io/sge.py @@ -322,7 +322,7 @@ def parse_jobs_list_output(self, exit_code, stdout, stderr) -> list[QJob]: info.memory_per_cpu = self._convert_memory_str( self._get_element_text(job_element, "hard resource_list.mem_free") ) - info.partition = self._get_element_text(job_element, "queue_name") + info.partition = self._get_element_text(job_element, "queue") info.time_limit = self._convert_str_to_time( self._get_element_text(job_element, "hard resource_list.h_rt") )