Skip to content

Commit

Permalink
Add job list command (#244)
Browse files Browse the repository at this point in the history
* Add job ls command

* Add tests for job ls command

* Remove optional --namespace flag from 'job ls' command

* Assert job list output

* Minor changes

* Update README file

* Remove 'Slurm' term from descriptions

* Minor changes
  • Loading branch information
IrvingMg authored Nov 15, 2024
1 parent 42a4088 commit a7c544b
Show file tree
Hide file tree
Showing 5 changed files with 118 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/build_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@ jobs:
run: python3 xpk.py workload delete --workload $WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
- name: Delete the Pathways workload on the cluster
run: python3 xpk.py workload delete --workload $PATHWAYS_WORKLOAD_NAME --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
- name: Run a batch job on the cluster
run: python3 xpk.py batch --cluster $TPU_CLUSTER_NAME --zone=us-central2-b test.sh
- name: List out the jobs on the cluster
run: python3 xpk.py job ls | grep 'xpk-def-app-profile-slurm-'
- name: Delete the cluster created
if: always()
run: python xpk.py cluster delete --cluster $TPU_CLUSTER_NAME --zone=us-central2-b
Expand Down
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,24 @@ when creating the workload otherwise the workload will always finish with `Compl
`125`: Workload finished but did not complete successfully.
`1`: Other failure.
## Job List
* Job List (lists the jobs submitted via the `batch` command):
```shell
python3 xpk.py job ls
```
* Example Job List Output:
```
NAME PROFILE LOCAL QUEUE COMPLETIONS DURATION AGE
xpk-def-app-profile-slurm-74kbv xpk-def-app-profile 1/1 15s 17h
xpk-def-app-profile-slurm-brcsg xpk-def-app-profile 1/1 9s 3h56m
xpk-def-app-profile-slurm-kw99l xpk-def-app-profile 1/1 5s 3h54m
xpk-def-app-profile-slurm-x99nx xpk-def-app-profile 3/3 29s 17h
```
## Inspector
* Inspector provides debug info to understand cluster health, and why workloads are not running.
Inspector output is saved to a file.
Expand Down
53 changes: 53 additions & 0 deletions src/xpk/commands/job.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""
Copyright 2024 Google LLC
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from ..utils.console import xpk_exit, xpk_print
from ..core.core import add_zone_and_project
from ..core.app_profile import APP_PROFILE_TEMPLATE_DEFAULT_NAME
from ..core.commands import (
run_command_with_updates,
)


def job_list(args) -> None:
  """Entry point for the 'job ls' command.

  Fills in the zone and project on ``args``, prints a header line naming
  them, runs the job listing command and then exits through ``xpk_exit``.

  Args:
    args: user provided arguments for running the command.

  Returns:
    None
  """
  add_zone_and_project(args)
  header = f'Listing jobs for project {args.project} and zone {args.zone}:'
  xpk_print(header, flush=True)

  # A truthy result from the listing helper is treated as a failure.
  listing_failed = bool(run_slurm_job_list_command(args))
  if listing_failed:
    xpk_exit(1)
  xpk_exit(0)


def run_slurm_job_list_command(args) -> int:
  """Run the kjob command that lists jobs for the default app profile.

  Args:
    args: user provided arguments for running the command.

  Returns:
    The return code reported by run_command_with_updates; 0 means the
    listing succeeded, any other value is reported as an error.
  """
  cmd = (
      f'kubectl-kjob list slurm --profile {APP_PROFILE_TEMPLATE_DEFAULT_NAME}'
  )

  return_code = run_command_with_updates(cmd, 'list slurm jobs', args)
  if return_code != 0:
    xpk_print(f'Listing jobs returned ERROR {return_code}')
  # Hand the code back instead of exiting here: job_list tests this result to
  # choose the process exit status, which a function that always returned
  # None could never trigger.
  return return_code
8 changes: 8 additions & 0 deletions src/xpk/parser/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from .workload import set_workload_parsers
from .batch import set_batch_parser
from .info import set_info_parser
from .job import set_job_parser


def set_parser(parser: argparse.ArgumentParser):
Expand All @@ -47,6 +48,9 @@ def set_parser(parser: argparse.ArgumentParser):
"batch",
help="Run batch job.",
)
job_parser = xpk_subcommands.add_parser(
"job", help="commands around listing and cancelling jobs"
)

def default_subcommand_function(
_args,
Expand All @@ -65,16 +69,20 @@ def default_subcommand_function(
workload_parser.print_help()
batch_parser.print_help()
info_parser.print_help()
job_parser.print_help()

return 0

parser.set_defaults(func=default_subcommand_function)
workload_parser.set_defaults(func=default_subcommand_function)
cluster_parser.set_defaults(func=default_subcommand_function)
batch_parser.set_defaults(func=default_subcommand_function)
info_parser.set_defaults(func=default_subcommand_function)
job_parser.set_defaults(func=default_subcommand_function)

set_workload_parsers(workload_parser=workload_parser)
set_cluster_parser(cluster_parser=cluster_parser)
set_inspector_parser(inspector_parser=inspector_parser)
set_batch_parser(batch_parser=batch_parser)
set_info_parser(info_parser=info_parser)
set_job_parser(job_parser=job_parser)
35 changes: 35 additions & 0 deletions src/xpk/parser/job.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""
Copyright 2024 Google LLC
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
https://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

from .common import add_shared_arguments
from ..commands.job import job_list


def set_job_parser(job_parser):
  """Attach the 'job' subcommands to the given parser.

  Currently registers a single subcommand, 'ls', whose default handler is
  ``job_list``.

  Args:
    job_parser: parser for the 'job' command that receives the subparsers.
  """
  subcommands_help = (
      'These are commands related to job management. Look at help for'
      ' specific subcommands for more details.'
  )
  job_subcommands = job_parser.add_subparsers(
      title='job subcommands',
      dest='xpk_job_subcommands',
      help=subcommands_help,
  )

  # "job ls": list jobs.
  ls_parser = job_subcommands.add_parser('ls', help='List jobs.')
  add_shared_arguments(ls_parser)
  ls_parser.set_defaults(func=job_list)

0 comments on commit a7c544b

Please sign in to comment.