From 8d762ddf0d52410f42338a1b77868f39b1e4baca Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 2 Jan 2025 08:08:33 -0800 Subject: [PATCH 01/10] [Docs] Refactor pod_config docs (#4427) * refactor pod_config docs * Update docs/source/reference/kubernetes/kubernetes-getting-started.rst Co-authored-by: Zongheng Yang * Update docs/source/reference/kubernetes/kubernetes-getting-started.rst Co-authored-by: Zongheng Yang --------- Co-authored-by: Zongheng Yang --- .../kubernetes/kubernetes-getting-started.rst | 93 ++++++++++++------- 1 file changed, 61 insertions(+), 32 deletions(-) diff --git a/docs/source/reference/kubernetes/kubernetes-getting-started.rst b/docs/source/reference/kubernetes/kubernetes-getting-started.rst index e4bbb2c8915..3323559bb36 100644 --- a/docs/source/reference/kubernetes/kubernetes-getting-started.rst +++ b/docs/source/reference/kubernetes/kubernetes-getting-started.rst @@ -258,6 +258,67 @@ After launching the cluster with :code:`sky launch -c myclus task.yaml`, you can To learn more about opening ports in SkyPilot tasks, see :ref:`Opening Ports `. +Customizing SkyPilot pods +------------------------- + +You can override the pod configuration used by SkyPilot by setting the :code:`pod_config` key in :code:`~/.sky/config.yaml`. +The value of :code:`pod_config` should be a dictionary that follows the `Kubernetes Pod API `_. This will apply to all pods created by SkyPilot. + +For example, to set custom environment variables and use GPUDirect RDMA, you can add the following to your :code:`~/.sky/config.yaml` file: + +.. 
code-block:: yaml + + # ~/.sky/config.yaml + kubernetes: + pod_config: + spec: + containers: + - env: # Custom environment variables to set in pod + - name: MY_ENV_VAR + value: MY_ENV_VALUE + resources: # Custom resources for GPUDirect RDMA + requests: + rdma/rdma_shared_device_a: 1 + limits: + rdma/rdma_shared_device_a: 1 + + +Similarly, you can attach `Kubernetes volumes `_ (e.g., an `NFS volume `_) directly to your SkyPilot pods: + +.. code-block:: yaml + + # ~/.sky/config.yaml + kubernetes: + pod_config: + spec: + containers: + - volumeMounts: # Custom volume mounts for the pod + - mountPath: /data + name: nfs-volume + volumes: + - name: nfs-volume + nfs: # Alternatively, use hostPath if your NFS is directly attached to the nodes + server: nfs.example.com + path: /nfs + + +.. tip:: + + As an alternative to setting ``pod_config`` globally, you can also set it on a per-task basis directly in your task YAML with the ``config_overrides`` :ref:`field `. + + .. code-block:: yaml + + # task.yaml + run: | + python myscript.py + + # Set pod_config for this task + experimental: + config_overrides: + pod_config: + ... + + FAQs ---- @@ -293,38 +354,6 @@ FAQs You can use your existing observability tools to filter resources with the label :code:`parent=skypilot` (:code:`kubectl get pods -l 'parent=skypilot'`). As an example, follow the instructions :ref:`here ` to deploy the Kubernetes Dashboard on your cluster. -* **How can I specify custom configuration for the pods created by SkyPilot?** - - You can override the pod configuration used by SkyPilot by setting the :code:`pod_config` key in :code:`~/.sky/config.yaml`. - The value of :code:`pod_config` should be a dictionary that follows the `Kubernetes Pod API `_. - - For example, to set custom environment variables and attach a volume on your pods, you can add the following to your :code:`~/.sky/config.yaml` file: - - .. 
code-block:: yaml - - kubernetes: - pod_config: - spec: - containers: - - env: - - name: MY_ENV_VAR - value: MY_ENV_VALUE - volumeMounts: # Custom volume mounts for the pod - - mountPath: /foo - name: example-volume - resources: # Custom resource requests and limits - requests: - rdma/rdma_shared_device_a: 1 - limits: - rdma/rdma_shared_device_a: 1 - volumes: - - name: example-volume - hostPath: - path: /tmp - type: Directory - - For more details refer to :ref:`config-yaml`. - * **I am using a custom image. How can I speed up the pod startup time?** You can pre-install SkyPilot dependencies in your custom image to speed up the pod startup time. Simply add these lines at the end of your Dockerfile: From 6a6d6671958c9fce56ffa314144daa1156a43342 Mon Sep 17 00:00:00 2001 From: Hysun He Date: Fri, 3 Jan 2025 15:45:50 +0800 Subject: [PATCH 02/10] [OCI] Set default image to ubuntu LTS 22.04 (#4517) * set default gpu image to skypilot:gpu-ubuntu-2204 * add example * remove comment line * set cpu default image to 2204 * update change history --- examples/oci/gpu-oraclelinux9.yaml | 33 ++++++++++++++++++++++++++++++ examples/oci/gpu-ubuntu-2204.yaml | 33 ++++++++++++++++++++++++++++++ sky/clouds/utils/oci_utils.py | 8 ++++++-- 3 files changed, 72 insertions(+), 2 deletions(-) create mode 100644 examples/oci/gpu-oraclelinux9.yaml create mode 100644 examples/oci/gpu-ubuntu-2204.yaml diff --git a/examples/oci/gpu-oraclelinux9.yaml b/examples/oci/gpu-oraclelinux9.yaml new file mode 100644 index 00000000000..cc7b05ea0fc --- /dev/null +++ b/examples/oci/gpu-oraclelinux9.yaml @@ -0,0 +1,33 @@ +name: gpu-task + +resources: + # Optional; if left out, automatically pick the cheapest cloud. + cloud: oci + + accelerators: A10:1 + + disk_size: 1024 + + disk_tier: high + + image_id: skypilot:gpu-oraclelinux9 + + +# Working directory (optional) containing the project codebase. +# Its contents are synced to ~/sky_workdir/ on the cluster. +workdir: . 
+ +num_nodes: 1 + +# Typical use: pip install -r requirements.txt +# Invoked under the workdir (i.e., can use its files). +setup: | + echo "*** Running setup. ***" + +# Typical use: make use of resources, such as running training. +# Invoked under the workdir (i.e., can use its files). +run: | + echo "*** Running the task on OCI ***" + echo "hello, world" + nvidia-smi + echo "The task is completed." diff --git a/examples/oci/gpu-ubuntu-2204.yaml b/examples/oci/gpu-ubuntu-2204.yaml new file mode 100644 index 00000000000..e0012a31a1a --- /dev/null +++ b/examples/oci/gpu-ubuntu-2204.yaml @@ -0,0 +1,33 @@ +name: gpu-task + +resources: + # Optional; if left out, automatically pick the cheapest cloud. + cloud: oci + + accelerators: A10:1 + + disk_size: 1024 + + disk_tier: high + + image_id: skypilot:gpu-ubuntu-2204 + + +# Working directory (optional) containing the project codebase. +# Its contents are synced to ~/sky_workdir/ on the cluster. +workdir: . + +num_nodes: 1 + +# Typical use: pip install -r requirements.txt +# Invoked under the workdir (i.e., can use its files). +setup: | + echo "*** Running setup. ***" + +# Typical use: make use of resources, such as running training. +# Invoked under the workdir (i.e., can use its files). +run: | + echo "*** Running the task on OCI ***" + echo "hello, world" + nvidia-smi + echo "The task is completed." diff --git a/sky/clouds/utils/oci_utils.py b/sky/clouds/utils/oci_utils.py index 0cd4f33e647..581d4d72d3c 100644 --- a/sky/clouds/utils/oci_utils.py +++ b/sky/clouds/utils/oci_utils.py @@ -6,6 +6,10 @@ configuration. 
- Hysun He (hysun.he@oracle.com) @ Nov.12, 2024: Add the constant SERVICE_PORT_RULE_TAG + - Hysun He (hysun.he@oracle.com) @ Jan.01, 2025: Set the default image + from ubuntu 20.04 to ubuntu 22.04, including: + - GPU: skypilot:gpu-ubuntu-2004 -> skypilot:gpu-ubuntu-2204 + - CPU: skypilot:cpu-ubuntu-2004 -> skypilot:cpu-ubuntu-2204 """ import os @@ -117,7 +121,7 @@ def get_default_gpu_image_tag(cls) -> str: # the sky's user-config file (if not specified, use the hardcode one at # last) return skypilot_config.get_nested(('oci', 'default', 'image_tag_gpu'), - 'skypilot:gpu-ubuntu-2004') + 'skypilot:gpu-ubuntu-2204') @classmethod def get_default_image_tag(cls) -> str: @@ -125,7 +129,7 @@ def get_default_image_tag(cls) -> str: # set the default image tag in the sky's user-config file. (if not # specified, use the hardcode one at last) return skypilot_config.get_nested( - ('oci', 'default', 'image_tag_general'), 'skypilot:cpu-ubuntu-2004') + ('oci', 'default', 'image_tag_general'), 'skypilot:cpu-ubuntu-2204') @classmethod def get_sky_user_config_file(cls) -> str: From 459c5ae9d56239d90658bf782c56cc6ad36f5dfe Mon Sep 17 00:00:00 2001 From: Hysun He Date: Fri, 3 Jan 2025 21:54:04 +0800 Subject: [PATCH 03/10] [OCI] 1. Support specify OS with custom image id. 2. Corner case fix (#4524) * Support specify os type with custom image id. * trim space * nit * comment --- sky/clouds/oci.py | 29 ++++++++++++++++++++--------- sky/provision/oci/query_utils.py | 7 +++++-- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py index d4ae6f298d2..b0234e2802c 100644 --- a/sky/clouds/oci.py +++ b/sky/clouds/oci.py @@ -232,6 +232,14 @@ def make_deploy_resources_variables( listing_id = None res_ver = None + os_type = None + if ':' in image_id: + # OS type provided in the --image-id. This is the case where + # custom image's ocid provided in the --image-id parameter. 
+ # - ocid1.image...aaa:oraclelinux (os type is oraclelinux) + # - ocid1.image...aaa (OS not provided) + image_id, os_type = image_id.replace(' ', '').split(':') + cpus = resources.cpus instance_type_arr = resources.instance_type.split( oci_utils.oci_config.INSTANCE_TYPE_RES_SPERATOR) @@ -297,15 +305,18 @@ def make_deploy_resources_variables( cpus=None if cpus is None else float(cpus), disk_tier=resources.disk_tier) - image_str = self._get_image_str(image_id=resources.image_id, - instance_type=resources.instance_type, - region=region.name) - - # pylint: disable=import-outside-toplevel - from sky.clouds.service_catalog import oci_catalog - os_type = oci_catalog.get_image_os_from_tag(tag=image_str, - region=region.name) - logger.debug(f'OS type for the image {image_str} is {os_type}') + if os_type is None: + # OS type is not determined yet. So try to get it from vms.csv + image_str = self._get_image_str( + image_id=resources.image_id, + instance_type=resources.instance_type, + region=region.name) + + # pylint: disable=import-outside-toplevel + from sky.clouds.service_catalog import oci_catalog + os_type = oci_catalog.get_image_os_from_tag(tag=image_str, + region=region.name) + logger.debug(f'OS type for the image {image_id} is {os_type}') return { 'instance_type': instance_type, diff --git a/sky/provision/oci/query_utils.py b/sky/provision/oci/query_utils.py index 8cca0629305..3037fcc2703 100644 --- a/sky/provision/oci/query_utils.py +++ b/sky/provision/oci/query_utils.py @@ -506,8 +506,11 @@ def find_nsg(cls, region: str, nsg_name: str, raise exceptions.ResourcesUnavailableError( 'The VCN is not available') - # Get the primary vnic. - assert len(list_vcns_resp.data) > 0 + # Get the primary vnic. The vnic might be an empty list for the + # corner case when the cluster was exited during provision. 
+ if not list_vcns_resp.data: + return None + vcn = list_vcns_resp.data[0] list_nsg_resp = net_client.list_network_security_groups( From 0db9846889dbea6e60440b090c6590eb7016d713 Mon Sep 17 00:00:00 2001 From: zpoint Date: Fri, 3 Jan 2025 23:38:48 +0800 Subject: [PATCH 04/10] Update intermediate bucket related doc (#4521) * doc * Update docs/source/examples/managed-jobs.rst Co-authored-by: Romil Bhardwaj * Update docs/source/examples/managed-jobs.rst Co-authored-by: Romil Bhardwaj * Update docs/source/examples/managed-jobs.rst Co-authored-by: Romil Bhardwaj * Update docs/source/examples/managed-jobs.rst Co-authored-by: Romil Bhardwaj * Update docs/source/examples/managed-jobs.rst Co-authored-by: Romil Bhardwaj * Update docs/source/examples/managed-jobs.rst Co-authored-by: Romil Bhardwaj * add tip * minor changes --------- Co-authored-by: Romil Bhardwaj --- docs/source/examples/managed-jobs.rst | 42 ++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/docs/source/examples/managed-jobs.rst b/docs/source/examples/managed-jobs.rst index 99fa461249d..2cd99b6c24b 100644 --- a/docs/source/examples/managed-jobs.rst +++ b/docs/source/examples/managed-jobs.rst @@ -152,6 +152,7 @@ The :code:`MOUNT` mode in :ref:`SkyPilot bucket mounting ` ensures Note that the application code should save program checkpoints periodically and reload those states when the job is restarted. This is typically achieved by reloading the latest checkpoint at the beginning of your program. + .. _spot-jobs-end-to-end: An End-to-End Example @@ -455,6 +456,46 @@ especially useful when there are many in-progress jobs to monitor, which the terminal-based CLI may need more than one page to display. +.. _intermediate-bucket: + +Intermediate storage for files +------------------------------ + +For managed jobs, SkyPilot requires an intermediate bucket to store files used in the task, such as local file mounts, temporary files, and the workdir. 
+If you do not configure a bucket, SkyPilot will automatically create a temporary bucket named :code:`skypilot-filemounts-{username}-{run_id}` for each job launch. SkyPilot automatically deletes the bucket after the job completes. + +Alternatively, you can pre-provision a bucket and use it as an intermediate for storing file by setting :code:`jobs.bucket` in :code:`~/.sky/config.yaml`: + +.. code-block:: yaml + + # ~/.sky/config.yaml + jobs: + bucket: s3://my-bucket # Supports s3://, gs://, https://.blob.core.windows.net/, r2://, cos:/// + + +If you choose to specify a bucket, ensure that the bucket already exists and that you have the necessary permissions. + +When using a pre-provisioned intermediate bucket with :code:`jobs.bucket`, SkyPilot creates job-specific directories under the bucket root to store files. They are organized in the following structure: + +.. code-block:: text + + # cloud bucket, s3://my-bucket/ for example + my-bucket/ + ├── job-15891b25/ # Job-specific directory + │ ├── local-file-mounts/ # Files from local file mounts + │ ├── tmp-files/ # Temporary files + │ └── workdir/ # Files from workdir + └── job-cae228be/ # Another job's directory + ├── local-file-mounts/ + ├── tmp-files/ + └── workdir/ + +When using a custom bucket (:code:`jobs.bucket`), the job-specific directories (e.g., :code:`job-15891b25/`) created by SkyPilot are removed when the job completes. + +.. tip:: + Multiple users can share the same intermediate bucket. Each user's jobs will have their own unique job-specific directories, ensuring that files are kept separate and organized. + + Concept: Jobs Controller ------------------------ @@ -505,4 +546,3 @@ The :code:`resources` field has the same spec as a normal SkyPilot job; see `her These settings will not take effect if you have an existing controller (either stopped or live). For them to take effect, tear down the existing controller first, which requires all in-progress jobs to finish or be canceled. 
- From 2ccbbffb99cda5894f294d6db98b453eb430da5b Mon Sep 17 00:00:00 2001 From: Aylei Date: Sat, 4 Jan 2025 02:50:29 +0800 Subject: [PATCH 05/10] [aws] cache user identity by 'aws configure list' (#4507) * [aws] cache user identity by 'aws configure list' Signed-off-by: Aylei * refine get_user_identities docstring Signed-off-by: Aylei * address review comments Signed-off-by: Aylei --------- Signed-off-by: Aylei --- sky/clouds/aws.py | 126 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 87 insertions(+), 39 deletions(-) diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index cafc789c5be..c665263e22e 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -2,6 +2,8 @@ import enum import fnmatch import functools +import hashlib +import json import os import re import subprocess @@ -16,6 +18,7 @@ from sky import skypilot_config from sky.adaptors import aws from sky.clouds import service_catalog +from sky.clouds.service_catalog import common as catalog_common from sky.clouds.utils import aws_utils from sky.skylet import constants from sky.utils import common_utils @@ -624,14 +627,10 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]: @classmethod def _current_identity_type(cls) -> Optional[AWSIdentityType]: - proc = subprocess.run('aws configure list', - shell=True, - check=False, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - if proc.returncode != 0: + stdout = cls._aws_configure_list() + if stdout is None: return None - stdout = proc.stdout.decode() + output = stdout.decode() # We determine the identity type by looking at the output of # `aws configure list`. The output looks like: @@ -646,10 +645,10 @@ def _current_identity_type(cls) -> Optional[AWSIdentityType]: def _is_access_key_of_type(type_str: str) -> bool: # The dot (.) does not match line separators. 
- results = re.findall(fr'access_key.*{type_str}', stdout) + results = re.findall(fr'access_key.*{type_str}', output) if len(results) > 1: raise RuntimeError( - f'Unexpected `aws configure list` output:\n{stdout}') + f'Unexpected `aws configure list` output:\n{output}') return len(results) == 1 if _is_access_key_of_type(AWSIdentityType.SSO.value): @@ -664,37 +663,20 @@ def _is_access_key_of_type(type_str: str) -> bool: return AWSIdentityType.SHARED_CREDENTIALS_FILE @classmethod - @functools.lru_cache(maxsize=1) # Cache since getting identity is slow. - def get_user_identities(cls) -> Optional[List[List[str]]]: - """Returns a [UserId, Account] list that uniquely identifies the user. - - These fields come from `aws sts get-caller-identity`. We permit the same - actual user to: - - - switch between different root accounts (after which both elements - of the list will be different) and have their clusters owned by - each account be protected; or - - - within the same root account, switch between different IAM - users, and treat [user_id=1234, account=A] and - [user_id=4567, account=A] to be the *same*. Namely, switching - between these IAM roles within the same root account will cause - the first element of the returned list to differ, and will allow - the same actual user to continue to interact with their clusters. - Note: this is not 100% safe, since the IAM users can have very - specific permissions, that disallow them to access the clusters - but it is a reasonable compromise as that could be rare. - - Returns: - A list of strings that uniquely identifies the user on this cloud. - For identity check, we will fallback through the list of strings - until we find a match, and print a warning if we fail for the - first string. 
+ @functools.lru_cache(maxsize=1) + def _aws_configure_list(cls) -> Optional[bytes]: + proc = subprocess.run('aws configure list', + shell=True, + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + if proc.returncode != 0: + return None + return proc.stdout - Raises: - exceptions.CloudUserIdentityError: if the user identity cannot be - retrieved. - """ + @classmethod + @functools.lru_cache(maxsize=1) # Cache since getting identity is slow. + def _sts_get_caller_identity(cls) -> Optional[List[List[str]]]: try: sts = aws.client('sts') # The caller identity contains 3 fields: UserId, Account, Arn. @@ -773,6 +755,72 @@ def get_user_identities(cls) -> Optional[List[List[str]]]: # automatic switching for AWS. Currently we only support one identity. return [user_ids] + @classmethod + @functools.lru_cache(maxsize=1) # Cache since getting identity is slow. + def get_user_identities(cls) -> Optional[List[List[str]]]: + """Returns a [UserId, Account] list that uniquely identifies the user. + + These fields come from `aws sts get-caller-identity` and are cached + locally by `aws configure list` output. The identities are assumed to + be stable for the duration of the `sky` process. Modifying the + credentials while the `sky` process is running will not affect the + identity returned by this function. + + We permit the same actual user to: + + - switch between different root accounts (after which both elements + of the list will be different) and have their clusters owned by + each account be protected; or + + - within the same root account, switch between different IAM + users, and treat [user_id=1234, account=A] and + [user_id=4567, account=A] to be the *same*. Namely, switching + between these IAM roles within the same root account will cause + the first element of the returned list to differ, and will allow + the same actual user to continue to interact with their clusters. 
+ Note: this is not 100% safe, since the IAM users can have very + specific permissions, that disallow them to access the clusters + but it is a reasonable compromise as that could be rare. + + Returns: + A list of strings that uniquely identifies the user on this cloud. + For identity check, we will fallback through the list of strings + until we find a match, and print a warning if we fail for the + first string. + + Raises: + exceptions.CloudUserIdentityError: if the user identity cannot be + retrieved. + """ + stdout = cls._aws_configure_list() + if stdout is None: + # `aws configure list` is not available, possible reasons: + # - awscli is not installed but credentials are valid, e.g. run from + # an EC2 instance with IAM role + # - aws credentials are not set, proceed anyway to get unified error + # message for users + return cls._sts_get_caller_identity() + config_hash = hashlib.md5(stdout).hexdigest()[:8] + # Getting aws identity cost ~1s, so we cache the result with the output of + # `aws configure list` as cache key. Different `aws configure list` output + # can have same aws identity, our assumption is the output would be stable + # in real world, so the number of cache files would be limited. + # TODO(aylei): consider using a more stable cache key and evalute eviction. 
+ cache_path = catalog_common.get_catalog_path( + f'aws/.cache/user-identity-{config_hash}.txt') + if os.path.exists(cache_path): + try: + with open(cache_path, 'r', encoding='utf-8') as f: + return json.loads(f.read()) + except json.JSONDecodeError: + # cache is invalid, ignore it and fetch identity again + pass + + result = cls._sts_get_caller_identity() + with open(cache_path, 'w', encoding='utf-8') as f: + f.write(json.dumps(result)) + return result + @classmethod def get_active_user_identity_str(cls) -> Optional[str]: user_identity = cls.get_active_user_identity() From 061d4bd998739e16fa704a855233e62290f6cbdb Mon Sep 17 00:00:00 2001 From: Chester Li Date: Sat, 4 Jan 2025 03:29:33 +0800 Subject: [PATCH 06/10] [k8s] Add validation for pod_config #4206 (#4466) * [k8s] Add validation for pod_config #4206 Check pod_config when run 'sky check k8s' by using k8s api * update: check pod_config when launch check merged pod_config during launch using k8s api * fix test * ignore check failed when test with dryrun if there is no kube config in env, ignore ValueError when launch with dryrun. For now, we don't support check schema offline. 
* use deserialize api to check pod_config schema * test * create another api_client with no kubeconfig * test * update error message * update test * test * test * Update sky/backends/backend_utils.py --------- Co-authored-by: Romil Bhardwaj --- sky/backends/backend_utils.py | 7 +++++ sky/provision/kubernetes/utils.py | 46 +++++++++++++++++++++++++++++++ tests/test_config.py | 46 +++++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 1de799e7cf8..6e79469a819 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -926,6 +926,13 @@ def write_cluster_config( tmp_yaml_path, cluster_config_overrides=to_provision.cluster_config_overrides) kubernetes_utils.combine_metadata_fields(tmp_yaml_path) + yaml_obj = common_utils.read_yaml(tmp_yaml_path) + pod_config = yaml_obj['available_node_types']['ray_head_default'][ + 'node_config'] + valid, message = kubernetes_utils.check_pod_config(pod_config) + if not valid: + raise exceptions.InvalidCloudConfigs( + f'Invalid pod_config. Details: {message}') if dryrun: # If dryrun, return the unfinished tmp yaml path. diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 487868d1d9e..14b6b42aa58 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -893,6 +893,52 @@ def check_credentials(context: Optional[str], return True, None +def check_pod_config(pod_config: dict) \ + -> Tuple[bool, Optional[str]]: + """Check if the pod_config is a valid pod config + + Using deserialize api to check the pod_config is valid or not. + + Returns: + bool: True if pod_config is valid. + str: Error message about why the pod_config is invalid, None otherwise. 
+ """ + errors = [] + # This api_client won't be used to send any requests, so there is no need to + # load kubeconfig + api_client = kubernetes.kubernetes.client.ApiClient() + + # Used for kubernetes api_client deserialize function, the function will use + # data attr, the detail ref: + # https://github.com/kubernetes-client/python/blob/master/kubernetes/client/api_client.py#L244 + class InnerResponse(): + + def __init__(self, data: dict): + self.data = json.dumps(data) + + try: + # Validate metadata if present + if 'metadata' in pod_config: + try: + value = InnerResponse(pod_config['metadata']) + api_client.deserialize( + value, kubernetes.kubernetes.client.V1ObjectMeta) + except ValueError as e: + errors.append(f'Invalid metadata: {str(e)}') + # Validate spec if present + if 'spec' in pod_config: + try: + value = InnerResponse(pod_config['spec']) + api_client.deserialize(value, + kubernetes.kubernetes.client.V1PodSpec) + except ValueError as e: + errors.append(f'Invalid spec: {str(e)}') + return len(errors) == 0, '.'.join(errors) + except Exception as e: # pylint: disable=broad-except + errors.append(f'Validation error: {str(e)}') + return False, '.'.join(errors) + + def is_kubeconfig_exec_auth( context: Optional[str] = None) -> Tuple[bool, Optional[str]]: """Checks if the kubeconfig file uses exec-based authentication diff --git a/tests/test_config.py b/tests/test_config.py index 5789214dc61..d3eaeb261bc 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -7,6 +7,7 @@ import sky from sky import skypilot_config +import sky.exceptions from sky.skylet import constants from sky.utils import common_utils from sky.utils import kubernetes_enums @@ -99,6 +100,29 @@ def _create_task_yaml_file(task_file_path: pathlib.Path) -> None: """)) +def _create_invalid_config_yaml_file(task_file_path: pathlib.Path) -> None: + task_file_path.write_text( + textwrap.dedent("""\ + experimental: + config_overrides: + kubernetes: + pod_config: + metadata: + labels: + 
test-key: test-value + annotations: + abc: def + spec: + containers: + - name: + imagePullSecrets: + - name: my-secret-2 + + setup: echo 'Setting up...' + run: echo 'Running...' + """)) + + def test_nested_config(monkeypatch) -> None: """Test that the nested config works.""" config = skypilot_config.Config() @@ -335,6 +359,28 @@ def test_k8s_config_with_override(monkeypatch, tmp_path, assert cluster_pod_config['spec']['runtimeClassName'] == 'nvidia' +def test_k8s_config_with_invalid_config(monkeypatch, tmp_path, + enable_all_clouds) -> None: + config_path = tmp_path / 'config.yaml' + _create_config_file(config_path) + monkeypatch.setattr(skypilot_config, 'CONFIG_PATH', config_path) + + _reload_config() + task_path = tmp_path / 'task.yaml' + _create_invalid_config_yaml_file(task_path) + task = sky.Task.from_yaml(task_path) + + # Test Kubernetes pod_config invalid + cluster_name = 'test_k8s_config_with_invalid_config' + task.set_resources_override({'cloud': sky.Kubernetes()}) + exception_occurred = False + try: + sky.launch(task, cluster_name=cluster_name, dryrun=True) + except sky.exceptions.ResourcesUnavailableError: + exception_occurred = True + assert exception_occurred + + def test_gcp_config_with_override(monkeypatch, tmp_path, enable_all_clouds) -> None: config_path = tmp_path / 'config.yaml' From 4ab8e1668053fef8ae87ba9c832073c444078e49 Mon Sep 17 00:00:00 2001 From: Christopher Cooper Date: Fri, 3 Jan 2025 21:03:39 -0800 Subject: [PATCH 07/10] [core] fix wheel timestamp check (#4488) Previously, we were only taking the max timestamp of all the subdirectories of the given directory. So the timestamp could be incorrect if only a file changed, and no directory changed. This fixes the issue by looking at all directories and files given by os.walk(). 
--- sky/backends/wheel_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sky/backends/wheel_utils.py b/sky/backends/wheel_utils.py index ed580569e0b..805117ee2a3 100644 --- a/sky/backends/wheel_utils.py +++ b/sky/backends/wheel_utils.py @@ -153,7 +153,10 @@ def _get_latest_modification_time(path: pathlib.Path) -> float: if not path.exists(): return -1. try: - return max(os.path.getmtime(root) for root, _, _ in os.walk(path)) + return max( + os.path.getmtime(os.path.join(root, f)) + for root, dirs, files in os.walk(path) + for f in (*dirs, *files)) except ValueError: return -1. From e4939f9fafde4985836689211d9e5f67731e792a Mon Sep 17 00:00:00 2001 From: Hysun He Date: Mon, 6 Jan 2025 09:20:33 +0800 Subject: [PATCH 08/10] [docs] Add image_id doc in task YAML for OCI (#4526) * Add image_id doc for OCI * nit * Update docs/source/reference/yaml-spec.rst Co-authored-by: Tian Xia --------- Co-authored-by: Tian Xia --- docs/source/reference/yaml-spec.rst | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/docs/source/reference/yaml-spec.rst b/docs/source/reference/yaml-spec.rst index 8a490b7e817..d2f0506993a 100644 --- a/docs/source/reference/yaml-spec.rst +++ b/docs/source/reference/yaml-spec.rst @@ -176,9 +176,9 @@ Available fields: # tpu_vm: True # True to use TPU VM (the default); False to use TPU node. # Custom image id (optional, advanced). The image id used to boot the - # instances. Only supported for AWS and GCP (for non-docker image). If not - # specified, SkyPilot will use the default debian-based image suitable for - # machine learning tasks. + # instances. Only supported for AWS, GCP, OCI and IBM (for non-docker image). + # If not specified, SkyPilot will use the default debian-based image + # suitable for machine learning tasks. 
# # Docker support # You can specify docker image to use by setting the image_id to @@ -204,7 +204,7 @@ Available fields: # image_id: # us-east-1: ami-0729d913a335efca7 # us-west-2: ami-050814f384259894c - image_id: ami-0868a20f5a3bf9702 + # # GCP # To find GCP images: https://cloud.google.com/compute/docs/images # image_id: projects/deeplearning-platform-release/global/images/common-cpu-v20230615-debian-11-py310 @@ -215,6 +215,24 @@ Available fields: # To find Azure images: https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage # image_id: microsoft-dsvm:ubuntu-2004:2004:21.11.04 # + # OCI + # To find OCI images: https://docs.oracle.com/en-us/iaas/images + # You can choose the image with OS version from the following image tags + # provided by SkyPilot: + # image_id: skypilot:gpu-ubuntu-2204 + # image_id: skypilot:gpu-ubuntu-2004 + # image_id: skypilot:gpu-oraclelinux9 + # image_id: skypilot:gpu-oraclelinux8 + # image_id: skypilot:cpu-ubuntu-2204 + # image_id: skypilot:cpu-ubuntu-2004 + # image_id: skypilot:cpu-oraclelinux9 + # image_id: skypilot:cpu-oraclelinux8 + # + # It is also possible to specify your custom image's OCID with OS type, + # for example: + # image_id: ocid1.image.oc1.us-sanjose-1.aaaaaaaaywwfvy67wwe7f24juvjwhyjn3u7g7s3wzkhduxcbewzaeki2nt5q:oraclelinux + # image_id: ocid1.image.oc1.us-sanjose-1.aaaaaaaa5tnuiqevhoyfnaa5pqeiwjv6w5vf6w4q2hpj3atyvu3yd6rhlhyq:ubuntu + # # IBM # Create a private VPC image and paste its ID in the following format: # image_id: @@ -224,6 +242,7 @@ Available fields: # https://www.ibm.com/cloud/blog/use-ibm-packer-plugin-to-create-custom-images-on-ibm-cloud-vpc-infrastructure # To use a more limited but easier to manage tool: # https://github.com/IBM/vpc-img-inst + image_id: ami-0868a20f5a3bf9702 # Labels to apply to the instances (optional). 
# From 9828f6b9b3ea50a35352c2b530c6717c6eef82b4 Mon Sep 17 00:00:00 2001 From: Hong Date: Mon, 6 Jan 2025 10:26:59 +0800 Subject: [PATCH 09/10] [UX] warning before launching jobs/serve when using a reauth required credentials (#4479) * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * wip * Update sky/backends/cloud_vm_ray_backend.py Minor fix * Update sky/clouds/aws.py Co-authored-by: Romil Bhardwaj * wip * minor changes * wip --------- Co-authored-by: hong Co-authored-by: Romil Bhardwaj --- sky/backends/backend_utils.py | 36 ++++++++++++++++++++++++++++ sky/backends/cloud_vm_ray_backend.py | 17 +++++++++++++ sky/clouds/aws.py | 24 +++++++++++++++++++ sky/clouds/cloud.py | 4 ++++ sky/clouds/gcp.py | 9 +++++++ 5 files changed, 90 insertions(+) diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 6e79469a819..bf92f442d2f 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -650,6 +650,42 @@ def _restore_block(new_block: Dict[str, Any], old_block: Dict[str, Any]): return common_utils.dump_yaml_str(new_config) +def get_expirable_clouds( + enabled_clouds: Sequence[clouds.Cloud]) -> List[clouds.Cloud]: + """Returns a list of clouds that use local credentials and whose credentials can expire. + + This function checks each cloud in the provided sequence to determine if it uses local credentials + and if its credentials can expire. If both conditions are met, the cloud is added to the list of + expirable clouds. + + Args: + enabled_clouds (Sequence[clouds.Cloud]): A sequence of cloud objects to check. + + Returns: + list[clouds.Cloud]: A list of cloud objects that use local credentials and whose credentials can expire. 
+ """ + expirable_clouds = [] + local_credentials_value = schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value + for cloud in enabled_clouds: + remote_identities = skypilot_config.get_nested( + (str(cloud).lower(), 'remote_identity'), None) + if remote_identities is None: + remote_identities = schemas.get_default_remote_identity( + str(cloud).lower()) + + local_credential_expiring = cloud.can_credential_expire() + if isinstance(remote_identities, str): + if remote_identities == local_credentials_value and local_credential_expiring: + expirable_clouds.append(cloud) + elif isinstance(remote_identities, list): + for profile in remote_identities: + if list(profile.values( + ))[0] == local_credentials_value and local_credential_expiring: + expirable_clouds.append(cloud) + break + return expirable_clouds + + # TODO: too many things happening here - leaky abstraction. Refactor. @timeline.event def write_cluster_config( diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 156f43181b2..c972928cd7d 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -26,6 +26,7 @@ import sky from sky import backends +from sky import check as sky_check from sky import cloud_stores from sky import clouds from sky import exceptions @@ -1996,6 +1997,22 @@ def provision_with_retries( skip_unnecessary_provisioning else None) failover_history: List[Exception] = list() + # If the user is using local credentials which may expire, the + # controller may leak resources if the credentials expire while a job + # is running. Here we check the enabled clouds and expiring credentials + # and raise a warning to the user. + if task.is_controller_task(): + enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh() + expirable_clouds = backend_utils.get_expirable_clouds( + enabled_clouds) + + if len(expirable_clouds) > 0: + warnings = (f'\033[93mWarning: Credentials used for ' + f'{expirable_clouds} may expire. 
Clusters may be ' f'leaked if the credentials expire while jobs ' f'are running. It is recommended to use credentials' f' that never expire or a service account.\033[0m') logger.warning(warnings) # Retrying launchable resources. while True: diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index c665263e22e..a86a87f4feb 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -103,6 +103,24 @@ class AWSIdentityType(enum.Enum): # region us-east-1 config-file ~/.aws/config SHARED_CREDENTIALS_FILE = 'shared-credentials-file' + def can_credential_expire(self) -> bool: + """Check if the AWS identity type can expire. + + SSO,IAM_ROLE and CONTAINER_ROLE are temporary credentials and refreshed + automatically. ENV and SHARED_CREDENTIALS_FILE are short-lived + credentials without refresh. + IAM ROLE: + https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html + SSO/Container-role refresh token: + https://docs.aws.amazon.com/solutions/latest/dea-api/auth-refreshtoken.html + """ + # TODO(hong): Add a CLI based check for the expiration of the temporary + # credentials + expirable_types = { + AWSIdentityType.ENV, AWSIdentityType.SHARED_CREDENTIALS_FILE + } + return self in expirable_types + @clouds.CLOUD_REGISTRY.register class AWS(clouds.Cloud): @@ -860,6 +878,12 @@ def get_credential_file_mounts(self) -> Dict[str, str]: if os.path.exists(os.path.expanduser(f'~/.aws/{filename}')) } + @functools.lru_cache(maxsize=1) + def can_credential_expire(self) -> bool: + identity_type = self._current_identity_type() + return identity_type is not None and identity_type.can_credential_expire( + ) + def instance_type_exists(self, instance_type): return service_catalog.instance_type_exists(instance_type, clouds='aws') diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index 455baeaf5d9..2cb45ca14fc 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -536,6 +536,10 @@ def get_credential_file_mounts(self) -> Dict[str, str]: """ raise
NotImplementedError + def can_credential_expire(self) -> bool: + """Returns whether the cloud credential can expire.""" + return False + @classmethod def get_image_size(cls, image_id: str, region: Optional[str]) -> float: """Check the image size from the cloud. diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index ff200f84147..3502fee8e1c 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -132,6 +132,9 @@ class GCPIdentityType(enum.Enum): SHARED_CREDENTIALS_FILE = '' + def can_credential_expire(self) -> bool: + return self == GCPIdentityType.SHARED_CREDENTIALS_FILE + @clouds.CLOUD_REGISTRY.register class GCP(clouds.Cloud): @@ -863,6 +866,12 @@ def get_credential_file_mounts(self) -> Dict[str, str]: pass return credentials + @functools.lru_cache(maxsize=1) + def can_credential_expire(self) -> bool: + identity_type = self._get_identity_type() + return identity_type is not None and identity_type.can_credential_expire( + ) + @classmethod def _get_identity_type(cls) -> Optional[GCPIdentityType]: try: From 38a822ac6b553df0e784e559715ee4269c21f780 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Sun, 5 Jan 2025 22:51:04 -0800 Subject: [PATCH 10/10] [GCP] Activate service account for storage and controller (#4529) * Activate service account for storage * disable logging if not using service account * Activate for controller as well. 
* revert controller activate * Add comments * format * fix smoke --- sky/cloud_stores.py | 12 ++++++++++-- sky/data/data_utils.py | 12 ++++++++---- tests/smoke_tests/test_managed_job.py | 2 +- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/sky/cloud_stores.py b/sky/cloud_stores.py index 108f33f2c1f..e24c4f3ad03 100644 --- a/sky/cloud_stores.py +++ b/sky/cloud_stores.py @@ -113,8 +113,16 @@ class GcsCloudStorage(CloudStorage): @property def _gsutil_command(self): gsutil_alias, alias_gen = data_utils.get_gsutil_command() - return (f'{alias_gen}; GOOGLE_APPLICATION_CREDENTIALS=' - f'{gcp.DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH} {gsutil_alias}') + return ( + f'{alias_gen}; GOOGLE_APPLICATION_CREDENTIALS=' + f'{gcp.DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH}; ' + # Explicitly activate service account. Unlike the gcp packages + # and other GCP commands, gsutil does not automatically pick up + # the default credential keys when it is a service account. + 'gcloud auth activate-service-account ' + '--key-file=$GOOGLE_APPLICATION_CREDENTIALS ' + '2> /dev/null || true; ' + f'{gsutil_alias}') def is_directory(self, url: str) -> bool: """Returns whether 'url' is a directory. diff --git a/sky/data/data_utils.py b/sky/data/data_utils.py index 05c2b42c844..e8dcaa83017 100644 --- a/sky/data/data_utils.py +++ b/sky/data/data_utils.py @@ -523,10 +523,14 @@ def get_gsutil_command() -> Tuple[str, str]: def run_upload_cli(command: str, access_denied_message: str, bucket_name: str, log_path: str): - returncode, stdout, stderr = log_lib.run_with_log(command, - log_path, - shell=True, - require_outputs=True) + returncode, stdout, stderr = log_lib.run_with_log( + command, + log_path, + shell=True, + require_outputs=True, + # We need to use bash as some of the cloud commands uses bash syntax, + # such as [[ ... 
+ ]] + executable='/bin/bash') if access_denied_message in stderr: with ux_utils.print_exception_no_traceback(): raise PermissionError('Failed to upload files to ' diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py index 22381fc45e3..5c930724523 100644 --- a/tests/smoke_tests/test_managed_job.py +++ b/tests/smoke_tests/test_managed_job.py @@ -365,7 +365,7 @@ def test_managed_jobs_pipeline_recovery_gcp(): # separated by `-`. (f'MANAGED_JOB_ID=`cat /tmp/{name}-run-id | rev | ' f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'), - smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), + smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name), f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"', smoke_tests_utils. get_cmd_wait_until_managed_job_status_contains_matching_job_name(