Skip to content

Commit

Permalink
retry slurm commands when failing
Browse files Browse the repository at this point in the history
In my experience, dpgen sometimes fails to execute Slurm commands. With this commit, dpgen sleeps for 60 s and retries the failed command, at most 3 times, before raising an error.
  • Loading branch information
njzjz committed Jun 2, 2020
1 parent d09681a commit 0c32aaa
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 2 deletions.
10 changes: 9 additions & 1 deletion dpgen/dispatcher/SSHContext.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,11 +148,19 @@ def download(self,
os.chdir(cwd)

def block_checkcall(self,
                    cmd,
                    retry=0) :
    """Run `cmd` under `self.remote_root` over ssh and require exit status 0.

    The command is executed with `self.ssh.exec_command` after changing
    into `self.remote_root`. If it exits with a non-zero status, the error
    is logged as a warning, the call sleeps 60 s and the command is run
    again. Once the retry counter has reached 3, a failure raises instead.

    Parameters
    ----------
    cmd
        Command string appended after the ``cd <remote_root> ;`` prefix.
    retry
        Number of retries already consumed. Callers normally leave this at
        0, which allows up to 3 retries (4 executions in total); passing 3
        or more disables retrying.

    Returns
    -------
    tuple
        The ``(stdin, stdout, stderr)`` triple returned by
        `self.ssh.exec_command` for the successful execution.

    Raises
    ------
    RuntimeError
        If the command still exits with a non-zero status after the
        retries are exhausted. The message carries the exit status, the
        command, `self.job_uuid` and the decoded stderr output.
    """
    max_retry = 3         # a failure at this retry count is fatal
    retry_interval = 60   # seconds to wait between attempts
    attempt = retry
    while True:
        # Re-check the session before every attempt, including retries.
        self.ssh_session.ensure_alive()
        stdin, stdout, stderr = self.ssh.exec_command(('cd %s ;' % self.remote_root) + cmd)
        exit_status = stdout.channel.recv_exit_status()
        if exit_status == 0:
            return stdin, stdout, stderr
        # Build the message once; stderr is read a single time per attempt.
        message = "Get error code %d in calling %s through ssh with job: %s . message: %s" % (
            exit_status, cmd, self.job_uuid, stderr.read().decode('utf-8'))
        if attempt >= max_retry:
            raise RuntimeError(message)
        dlog.warning(message)
        dlog.warning("Sleep %d s and retry the command..." % retry_interval)
        time.sleep(retry_interval)
        attempt += 1
Expand Down
7 changes: 6 additions & 1 deletion dpgen/dispatcher/Slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def _get_job_id(self) :
else:
return ""

def _check_status_inner(self, job_id):
def _check_status_inner(self, job_id, retry=0):
ret, stdin, stdout, stderr\
= self.context.block_call ('squeue -o "%.18i %.2t" -j ' + job_id)
if (ret != 0) :
Expand All @@ -160,6 +160,11 @@ def _check_status_inner(self, job_id):
else :
return JobStatus.terminated
else :
# retry 3 times
if retry < 3:
# rest 60s
time.sleep(60)
return self._check_status_inner(job_id, retry=retry+1)
raise RuntimeError\
("status command squeue fails to execute\nerror message:%s\nreturn code %d\n" % (err_str, ret))
status_line = stdout.read().decode('utf-8').split ('\n')[-2]
Expand Down

0 comments on commit 0c32aaa

Please sign in to comment.