Skip to content

Commit

Permalink
Merge pull request #276 from njzjz/slurm
Browse files (browse the repository at this point in the history)
retry slurm commands when failing
  • Loading branch information
amcadmus authored Jun 3, 2020
2 parents 0ae4579 + 0c32aaa commit 0d04d5b
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 2 deletions.
10 changes: 9 additions & 1 deletion dpgen/dispatcher/SSHContext.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,11 +148,19 @@ def download(self,
os.chdir(cwd)

def block_checkcall(self,
                    cmd,
                    retry=0) :
    """Run ``cmd`` over SSH from the remote root and require exit status 0.

    The command is executed as ``cd <remote_root> ;<cmd>``.  When it exits
    with a non-zero status, a warning is logged, the call sleeps for a
    fixed interval and the command is issued again, until the retry budget
    is used up.

    Parameters
    ----------
    cmd : str
        Shell command to execute on the remote side.
    retry : int, optional
        Number of retries already consumed.  Callers normally leave this
        at 0; passing 3 or more disables retrying altogether.

    Returns
    -------
    tuple
        The ``(stdin, stdout, stderr)`` triple returned by
        ``self.ssh.exec_command`` for the attempt that succeeded.

    Raises
    ------
    RuntimeError
        If the command still exits with a non-zero status once the retry
        budget is exhausted.  The message carries the exit status, the
        command, the job uuid and the decoded stderr output.
    """
    max_retry = 3        # failed attempts that are retried before giving up
    retry_interval = 60  # seconds to wait between two attempts
    while True:
        # Checked before every attempt, including the ones that follow
        # the sleep between retries.
        self.ssh_session.ensure_alive()
        stdin, stdout, stderr = self.ssh.exec_command(('cd %s ;' % self.remote_root) + cmd)
        exit_status = stdout.channel.recv_exit_status()
        if exit_status == 0:
            return stdin, stdout, stderr
        # stderr is read exactly once per failed attempt; the same text is
        # used for either the warning or the final exception.
        message = "Get error code %d in calling %s through ssh with job: %s . message: %s" % \
                  (exit_status, cmd, self.job_uuid, stderr.read().decode('utf-8'))
        if retry >= max_retry:
            raise RuntimeError(message)
        dlog.warning(message)
        dlog.warning("Sleep %d s and retry the command..." % retry_interval)
        time.sleep(retry_interval)
        retry += 1
Expand Down
7 changes: 6 additions & 1 deletion dpgen/dispatcher/Slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def _get_job_id(self) :
else:
return ""

def _check_status_inner(self, job_id):
def _check_status_inner(self, job_id, retry=0):
ret, stdin, stdout, stderr\
= self.context.block_call ('squeue -o "%.18i %.2t" -j ' + job_id)
if (ret != 0) :
Expand All @@ -160,6 +160,11 @@ def _check_status_inner(self, job_id):
else :
return JobStatus.terminated
else :
# retry 3 times
if retry < 3:
# rest 60s
time.sleep(60)
return self._check_status_inner(job_id, retry=retry+1)
raise RuntimeError\
("status command squeue fails to execute\nerror message:%s\nreturn code %d\n" % (err_str, ret))
status_line = stdout.read().decode('utf-8').split ('\n')[-2]
Expand Down

0 comments on commit 0d04d5b

Please sign in to comment.