From 0c32aaaafb0d1c0d29025afacb04cd863926b96c Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 2 Jun 2020 07:02:36 -0400 Subject: [PATCH] retry slurm commands when failing In my experience, dpgen sometimes fails to execute Slurm commands. With this commit, dpgen will sleep 60 s and retry up to 3 times. --- dpgen/dispatcher/SSHContext.py | 10 +++++++++- dpgen/dispatcher/Slurm.py | 7 ++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/dpgen/dispatcher/SSHContext.py b/dpgen/dispatcher/SSHContext.py index 4347eb321..a6ac3c1d4 100644 --- a/dpgen/dispatcher/SSHContext.py +++ b/dpgen/dispatcher/SSHContext.py @@ -148,11 +148,19 @@ def download(self, os.chdir(cwd) def block_checkcall(self, - cmd) : + cmd, + retry=0) : self.ssh_session.ensure_alive() stdin, stdout, stderr = self.ssh.exec_command(('cd %s ;' % self.remote_root) + cmd) exit_status = stdout.channel.recv_exit_status() if exit_status != 0: + if retry<3: + # sleep 60 s + dlog.warning("Get error code %d in calling %s through ssh with job: %s . message: %s" % + (exit_status, cmd, self.job_uuid, stderr.read().decode('utf-8'))) + dlog.warning("Sleep 60 s and retry the command...") + time.sleep(60) + return self.block_checkcall(cmd, retry=retry+1) raise RuntimeError("Get error code %d in calling %s through ssh with job: %s . 
message: %s" % (exit_status, cmd, self.job_uuid, stderr.read().decode('utf-8'))) return stdin, stdout, stderr diff --git a/dpgen/dispatcher/Slurm.py b/dpgen/dispatcher/Slurm.py index c2595d6e2..9d409dbcc 100644 --- a/dpgen/dispatcher/Slurm.py +++ b/dpgen/dispatcher/Slurm.py @@ -149,7 +149,7 @@ def _get_job_id(self) : else: return "" - def _check_status_inner(self, job_id): + def _check_status_inner(self, job_id, retry=0): ret, stdin, stdout, stderr\ = self.context.block_call ('squeue -o "%.18i %.2t" -j ' + job_id) if (ret != 0) : @@ -160,6 +160,11 @@ def _check_status_inner(self, job_id): else : return JobStatus.terminated else : + # retry 3 times + if retry < 3: + # rest 60s + time.sleep(60) + return self._check_status_inner(job_id, retry=retry+1) raise RuntimeError\ ("status command squeue fails to execute\nerror message:%s\nreturn code %d\n" % (err_str, ret)) status_line = stdout.read().decode('utf-8').split ('\n')[-2]