diff --git a/src/jobflow_remote/cli/job.py b/src/jobflow_remote/cli/job.py
index b84a4551..847c8d2b 100644
--- a/src/jobflow_remote/cli/job.py
+++ b/src/jobflow_remote/cli/job.py
@@ -297,6 +297,14 @@ def rerun(
         ),
     ] = False,
     raise_on_error: raise_on_error_opt = False,
+    err_regex: Annotated[
+        str,
+        typer.Option(
+            "--err-regex",
+            "-er",
+            help="A regular expression to match against job error messages. Only jobs with matching error messages will be rerun.",
+        ),
+    ] = None,
 ) -> None:
     """
     Rerun a Job. By default, this is limited to jobs that failed and children did
@@ -333,6 +341,7 @@ def rerun(
         break_lock=break_lock,
         force=force,
         raise_on_error=raise_on_error,
+        err_regex=err_regex,
     )
 
 
diff --git a/src/jobflow_remote/jobs/jobcontroller.py b/src/jobflow_remote/jobs/jobcontroller.py
index 22ab0062..35914426 100644
--- a/src/jobflow_remote/jobs/jobcontroller.py
+++ b/src/jobflow_remote/jobs/jobcontroller.py
@@ -766,6 +766,7 @@ def rerun_jobs(
         force: bool = False,
         wait: int | None = None,
         break_lock: bool = False,
+        err_regex: str | None = None,
     ) -> list[str]:
         """
         Rerun a list of selected Jobs, i.e. bring their state back to READY.
@@ -811,6 +812,9 @@ def rerun_jobs(
             Forcibly break the lock on locked documents. Use with care and verify
             that the lock has been set by a process that is not running anymore.
             Doing otherwise will likely lead to inconsistencies in the DB.
+        err_regex : str, optional
+            A regular expression to match against job error messages.
+            Only jobs with matching error messages will be rerun.
 
         Returns
         -------
@@ -829,7 +833,13 @@ def rerun_jobs(
             name=name,
             metadata=metadata,
             workers=workers,
-            custom_query=custom_query,
+            # Merge the error filter into the custom query only when a regex is
+            # given; guards against custom_query being None, which `|` rejects.
+            custom_query=(
+                {**(custom_query or {}), "error": {"$regex": err_regex}}
+                if err_regex
+                else custom_query
+            ),
             raise_on_error=raise_on_error,
             force=force,
             wait=wait,