From b2a0b2e33a08671b59d694f195a8712d2ae6c7ad Mon Sep 17 00:00:00 2001 From: David Schultz Date: Sun, 27 Oct 2024 12:13:20 -0500 Subject: [PATCH 1/2] add more error reasons to reset, with comments --- iceprod/server/plugins/condor.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/iceprod/server/plugins/condor.py b/iceprod/server/plugins/condor.py index 13fbcd9c..5f5da478 100644 --- a/iceprod/server/plugins/condor.py +++ b/iceprod/server/plugins/condor.py @@ -79,25 +79,37 @@ def from_condor_status(num): RESET_CONDOR_REASONS = [ +# condor file transfer plugin failed '_condor_stdout: (errno 2) No such file', 'transfer input files failure', 'transfer output files failure', +# resource limits 'cpu consumption limit exceeded', 'memory limit exceeded', 'cgroup memory limit', 'local storage limit on worker node exceeded', 'execution time limit exceeded', +# general retries 'exceeded max iceprod queue time', + 'job has failed', + 'python-initiated action (by user ice3simusr)', ] RESET_STDERR_REASONS = [ +# glidein died 'sigterm', 'killed', +# hopefully transient errors 'bus error (core dumped)', 'segmentation fault (core dumped)', 'operation timed out', 'connection timed out', +# GPU errors + 'opencl error: could not set up context', +# CVMFS errors + 'python: command not found', + 'cannot read file data: Stale file handle', ] @@ -704,7 +716,7 @@ async def wait(self, timeout): if new_status is not None and job.status != new_status: job.status = new_status if new_status == JobStatus.FAILED: - self.submitter.remove(job_id, reason=event.get('HoldReason', None)) + self.submitter.remove(job_id, reason=event.get('HoldReason', 'Job has failed')) else: await self.job_update(job) except Exception: @@ -806,7 +818,7 @@ async def check(self): if job_id not in old_jobs or job.status != old_jobs[job_id].status: if job.status == JobStatus.FAILED: extra = job.extra if job.extra else {} - reason = extra.get('HoldReason', None) + reason = extra.get('HoldReason', 'Job has failed') logger.info("job %s %s.%s removed from cross-check: %r", job_id, job.dataset_id, job.task_id, reason) self.submitter.remove(job_id, reason=reason) From 18feefa3f945fe2a52799efaa81c6a4d929195af Mon Sep 17 00:00:00 2001 From: David Schultz Date: Sun, 27 Oct 2024 12:14:30 -0500 Subject: [PATCH 2/2] fix flake8 --- iceprod/server/plugins/condor.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/iceprod/server/plugins/condor.py b/iceprod/server/plugins/condor.py index 5f5da478..a2bbc9b0 100644 --- a/iceprod/server/plugins/condor.py +++ b/iceprod/server/plugins/condor.py @@ -79,17 +79,17 @@ def from_condor_status(num): RESET_CONDOR_REASONS = [ -# condor file transfer plugin failed + # condor file transfer plugin failed '_condor_stdout: (errno 2) No such file', 'transfer input files failure', 'transfer output files failure', -# resource limits + # resource limits 'cpu consumption limit exceeded', 'memory limit exceeded', 'cgroup memory limit', 'local storage limit on worker node exceeded', 'execution time limit exceeded', -# general retries + # general retries 'exceeded max iceprod queue time', 'job has failed', 'python-initiated action (by user ice3simusr)', @@ -97,17 +97,17 @@ def from_condor_status(num): RESET_STDERR_REASONS = [ -# glidein died + # glidein died 'sigterm', 'killed', -# hopefully transient errors + # hopefully transient errors 'bus error (core dumped)', 'segmentation fault (core dumped)', 'operation timed out', 'connection timed out', -# GPU errors + # GPU errors 'opencl error: could not set up context', -# CVMFS errors + # CVMFS errors 'python: command not found', 'cannot read file data: Stale file handle', ]