diff --git a/iceprod/server/plugins/condor.py b/iceprod/server/plugins/condor.py index e90da5e4..2325c6e7 100644 --- a/iceprod/server/plugins/condor.py +++ b/iceprod/server/plugins/condor.py @@ -77,18 +77,23 @@ def from_condor_status(num): } -RESET_REASONS = [ +RESET_CONDOR_REASONS = [ '_condor_stdout: (errno 2) No such file', - 'sigterm', - 'killed', 'transfer input files failure', 'transfer output files failure', 'cpu consumption limit exceeded', + 'memory limit exceeded', + 'cgroup memory limit', + 'local storage limit on worker node exceeded', + 'execution time limit exceeded', +] + + +RESET_STDERR_REASONS = [ + 'sigterm', + 'killed', 'operation timed out', ] -# 'memory limit exceeded', -# 'local storage limit on worker node exceeded', -# 'execution time limit exceeded', def parse_usage(usage: str) -> int: @@ -741,14 +746,14 @@ async def finish(self, job_id: CondorJobId, success: bool = True, resources: dic if reason: stats['error_summary'] = reason # check condor error for reset reason - for text in RESET_REASONS: + for text in RESET_CONDOR_REASONS: if text.lower() in reason.lower(): future = self.task_reset(job, stats=stats, reason=reason) break if future is None and stderr and stderr.is_file(): # check stderr for reset reason reason = stderr.open().read() - for text in RESET_REASONS: + for text in RESET_STDERR_REASONS: if text.lower() in reason.lower(): future = self.task_reset(job, stats=stats, reason=reason) break diff --git a/tests/server/plugins/condor_test.py b/tests/server/plugins/condor_test.py index b928d47f..b61a9edf 100644 --- a/tests/server/plugins/condor_test.py +++ b/tests/server/plugins/condor_test.py @@ -825,7 +825,7 @@ async def test_reset_task(schedd, i3prod_path, set_time): g.task_reset = AsyncMock() g.task_failure = AsyncMock() - await g.finish(jobid, success=False, reason=iceprod.server.plugins.condor.RESET_REASONS[0]) + await g.finish(jobid, success=False, reason=iceprod.server.plugins.condor.RESET_CONDOR_REASONS[0]) assert g.task_success.call_count == 0 assert g.task_reset.call_count == 1 @@ -838,7 +838,7 @@ async def test_reset_task(schedd, i3prod_path, set_time): g.task_reset = AsyncMock() g.task_failure = AsyncMock() - (p / 'condor.err').open('w').write(iceprod.server.plugins.condor.RESET_REASONS[-1]) + (p / 'condor.err').open('w').write(iceprod.server.plugins.condor.RESET_STDERR_REASONS[0]) await g.finish(jobid, success=False)