diff --git a/docs/source/examples/managed-jobs.rst b/docs/source/examples/managed-jobs.rst
index 99fa461249d..2cd99b6c24b 100644
--- a/docs/source/examples/managed-jobs.rst
+++ b/docs/source/examples/managed-jobs.rst
@@ -152,6 +152,7 @@ The :code:`MOUNT` mode in :ref:`SkyPilot bucket mounting ` ensures
 Note that the application code should save program checkpoints periodically and reload those states when the job is restarted. This is typically achieved by reloading the latest checkpoint at the beginning of your program.
 
+
 .. _spot-jobs-end-to-end:
 
 An End-to-End Example
@@ -455,6 +456,46 @@ especially useful when there are many in-progress jobs to monitor, which the
 terminal-based CLI may need more than one page to display.
 
+.. _intermediate-bucket:
+
+Intermediate storage for files
+------------------------------
+
+For managed jobs, SkyPilot requires an intermediate bucket to store files used in the task, such as local file mounts, temporary files, and the workdir.
+If you do not configure a bucket, SkyPilot will automatically create a temporary bucket named :code:`skypilot-filemounts-{username}-{run_id}` for each job launch. SkyPilot automatically deletes the bucket after the job completes.
+
+Alternatively, you can pre-provision a bucket and use it as an intermediate store for files by setting :code:`jobs.bucket` in :code:`~/.sky/config.yaml`:
+
+.. code-block:: yaml
+
+    # ~/.sky/config.yaml
+    jobs:
+      bucket: s3://my-bucket  # Supports s3://, gs://, https://<azure_storage_account>.blob.core.windows.net/<container>, r2://, cos://<region>/<bucket>
+
+
+If you choose to specify a bucket, ensure that the bucket already exists and that you have the necessary permissions.
+
+When using a pre-provisioned intermediate bucket with :code:`jobs.bucket`, SkyPilot creates job-specific directories under the bucket root to store files. They are organized in the following structure:
+
+.. code-block:: text
+
+    # cloud bucket, s3://my-bucket/ for example
+    my-bucket/
+    ├── job-15891b25/            # Job-specific directory
+    │   ├── local-file-mounts/   # Files from local file mounts
+    │   ├── tmp-files/           # Temporary files
+    │   └── workdir/             # Files from workdir
+    └── job-cae228be/            # Another job's directory
+        ├── local-file-mounts/
+        ├── tmp-files/
+        └── workdir/
+
+When using a custom bucket (:code:`jobs.bucket`), the job-specific directories (e.g., :code:`job-15891b25/`) created by SkyPilot are removed when the job completes.
+
+.. tip::
+  Multiple users can share the same intermediate bucket. Each user's jobs
+  will have their own unique job-specific directories, ensuring that files
+  are kept separate and organized.
+
+
 Concept: Jobs Controller
 ------------------------
@@ -505,4 +546,3 @@ The :code:`resources` field has the same spec as a normal SkyPilot job; see `her
 These settings will not take effect if you have an existing controller (either
 stopped or live). For them to take effect, tear down the existing controller
 first, which requires all in-progress jobs to finish or be canceled.
-
diff --git a/docs/source/reference/kubernetes/kubernetes-getting-started.rst b/docs/source/reference/kubernetes/kubernetes-getting-started.rst
index e4bbb2c8915..3323559bb36 100644
--- a/docs/source/reference/kubernetes/kubernetes-getting-started.rst
+++ b/docs/source/reference/kubernetes/kubernetes-getting-started.rst
@@ -258,6 +258,67 @@ After launching the cluster with :code:`sky launch -c myclus task.yaml`, you can
 
 To learn more about opening ports in SkyPilot tasks, see :ref:`Opening Ports `.
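+
+For quick reference, ports can also be requested programmatically. A minimal
+sketch with the Python API (assuming :code:`sky.Resources` accepts a
+:code:`ports` field, mirroring the :code:`resources: ports:` syntax in task
+YAML; the values here are illustrative):
+
+.. code-block:: python
+
+   import sky
+
+   # Serve a simple HTTP server on port 8888 and request the port be opened.
+   task = sky.Task(run='python -m http.server 8888')
+   task.set_resources(sky.Resources(cloud=sky.Kubernetes(), ports=8888))
+   sky.launch(task, cluster_name='myclus')
+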
+Customizing SkyPilot pods +------------------------- + +You can override the pod configuration used by SkyPilot by setting the :code:`pod_config` key in :code:`~/.sky/config.yaml`. +The value of :code:`pod_config` should be a dictionary that follows the `Kubernetes Pod API `_. This will apply to all pods created by SkyPilot. + +For example, to set custom environment variables and use GPUDirect RDMA, you can add the following to your :code:`~/.sky/config.yaml` file: + +.. code-block:: yaml + + # ~/.sky/config.yaml + kubernetes: + pod_config: + spec: + containers: + - env: # Custom environment variables to set in pod + - name: MY_ENV_VAR + value: MY_ENV_VALUE + resources: # Custom resources for GPUDirect RDMA + requests: + rdma/rdma_shared_device_a: 1 + limits: + rdma/rdma_shared_device_a: 1 + + +Similarly, you can attach `Kubernetes volumes `_ (e.g., an `NFS volume `_) directly to your SkyPilot pods: + +.. code-block:: yaml + + # ~/.sky/config.yaml + kubernetes: + pod_config: + spec: + containers: + - volumeMounts: # Custom volume mounts for the pod + - mountPath: /data + name: nfs-volume + volumes: + - name: nfs-volume + nfs: # Alternatively, use hostPath if your NFS is directly attached to the nodes + server: nfs.example.com + path: /nfs + + +.. tip:: + + As an alternative to setting ``pod_config`` globally, you can also set it on a per-task basis directly in your task YAML with the ``config_overrides`` :ref:`field `. + + .. code-block:: yaml + + # task.yaml + run: | + python myscript.py + + # Set pod_config for this task + experimental: + config_overrides: + pod_config: + ... + + FAQs ---- @@ -293,38 +354,6 @@ FAQs You can use your existing observability tools to filter resources with the label :code:`parent=skypilot` (:code:`kubectl get pods -l 'parent=skypilot'`). As an example, follow the instructions :ref:`here ` to deploy the Kubernetes Dashboard on your cluster. -* **How can I specify custom configuration for the pods created by SkyPilot?** - - You can override the pod configuration used by SkyPilot by setting the :code:`pod_config` key in :code:`~/.sky/config.yaml`. - The value of :code:`pod_config` should be a dictionary that follows the `Kubernetes Pod API `_. - - For example, to set custom environment variables and attach a volume on your pods, you can add the following to your :code:`~/.sky/config.yaml` file: - - .. code-block:: yaml - - kubernetes: - pod_config: - spec: - containers: - - env: - - name: MY_ENV_VAR - value: MY_ENV_VALUE - volumeMounts: # Custom volume mounts for the pod - - mountPath: /foo - name: example-volume - resources: # Custom resource requests and limits - requests: - rdma/rdma_shared_device_a: 1 - limits: - rdma/rdma_shared_device_a: 1 - volumes: - - name: example-volume - hostPath: - path: /tmp - type: Directory - - For more details refer to :ref:`config-yaml`. - * **I am using a custom image. How can I speed up the pod startup time?** You can pre-install SkyPilot dependencies in your custom image to speed up the pod startup time. Simply add these lines at the end of your Dockerfile: diff --git a/docs/source/reference/yaml-spec.rst b/docs/source/reference/yaml-spec.rst index 8a490b7e817..d2f0506993a 100644 --- a/docs/source/reference/yaml-spec.rst +++ b/docs/source/reference/yaml-spec.rst @@ -176,9 +176,9 @@ Available fields: # tpu_vm: True # True to use TPU VM (the default); False to use TPU node. # Custom image id (optional, advanced). The image id used to boot the - # instances. Only supported for AWS and GCP (for non-docker image). 
If not - # specified, SkyPilot will use the default debian-based image suitable for - # machine learning tasks. + # instances. Only supported for AWS, GCP, OCI and IBM (for non-docker image). + # If not specified, SkyPilot will use the default debian-based image + # suitable for machine learning tasks. # # Docker support # You can specify docker image to use by setting the image_id to @@ -204,7 +204,7 @@ Available fields: # image_id: # us-east-1: ami-0729d913a335efca7 # us-west-2: ami-050814f384259894c - image_id: ami-0868a20f5a3bf9702 + # # GCP # To find GCP images: https://cloud.google.com/compute/docs/images # image_id: projects/deeplearning-platform-release/global/images/common-cpu-v20230615-debian-11-py310 @@ -215,6 +215,24 @@ Available fields: # To find Azure images: https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage # image_id: microsoft-dsvm:ubuntu-2004:2004:21.11.04 # + # OCI + # To find OCI images: https://docs.oracle.com/en-us/iaas/images + # You can choose the image with OS version from the following image tags + # provided by SkyPilot: + # image_id: skypilot:gpu-ubuntu-2204 + # image_id: skypilot:gpu-ubuntu-2004 + # image_id: skypilot:gpu-oraclelinux9 + # image_id: skypilot:gpu-oraclelinux8 + # image_id: skypilot:cpu-ubuntu-2204 + # image_id: skypilot:cpu-ubuntu-2004 + # image_id: skypilot:cpu-oraclelinux9 + # image_id: skypilot:cpu-oraclelinux8 + # + # It is also possible to specify your custom image's OCID with OS type, + # for example: + # image_id: ocid1.image.oc1.us-sanjose-1.aaaaaaaaywwfvy67wwe7f24juvjwhyjn3u7g7s3wzkhduxcbewzaeki2nt5q:oraclelinux + # image_id: ocid1.image.oc1.us-sanjose-1.aaaaaaaa5tnuiqevhoyfnaa5pqeiwjv6w5vf6w4q2hpj3atyvu3yd6rhlhyq:ubuntu + # # IBM # Create a private VPC image and paste its ID in the following format: # image_id: @@ -224,6 +242,7 @@ Available fields: # https://www.ibm.com/cloud/blog/use-ibm-packer-plugin-to-create-custom-images-on-ibm-cloud-vpc-infrastructure # To use a more limited but easier to manage tool: # https://github.com/IBM/vpc-img-inst + image_id: ami-0868a20f5a3bf9702 # Labels to apply to the instances (optional). # diff --git a/examples/oci/gpu-oraclelinux9.yaml b/examples/oci/gpu-oraclelinux9.yaml new file mode 100644 index 00000000000..cc7b05ea0fc --- /dev/null +++ b/examples/oci/gpu-oraclelinux9.yaml @@ -0,0 +1,33 @@ +name: gpu-task + +resources: + # Optional; if left out, automatically pick the cheapest cloud. + cloud: oci + + accelerators: A10:1 + + disk_size: 1024 + + disk_tier: high + + image_id: skypilot:gpu-oraclelinux9 + + +# Working directory (optional) containing the project codebase. +# Its contents are synced to ~/sky_workdir/ on the cluster. +workdir: . + +num_nodes: 1 + +# Typical use: pip install -r requirements.txt +# Invoked under the workdir (i.e., can use its files). +setup: | + echo "*** Running setup. ***" + +# Typical use: make use of resources, such as running training. +# Invoked under the workdir (i.e., can use its files). +run: | + echo "*** Running the task on OCI ***" + echo "hello, world" + nvidia-smi + echo "The task is completed." diff --git a/examples/oci/gpu-ubuntu-2204.yaml b/examples/oci/gpu-ubuntu-2204.yaml new file mode 100644 index 00000000000..e0012a31a1a --- /dev/null +++ b/examples/oci/gpu-ubuntu-2204.yaml @@ -0,0 +1,33 @@ +name: gpu-task + +resources: + # Optional; if left out, automatically pick the cheapest cloud. 
+ cloud: oci + + accelerators: A10:1 + + disk_size: 1024 + + disk_tier: high + + image_id: skypilot:gpu-ubuntu-2204 + + +# Working directory (optional) containing the project codebase. +# Its contents are synced to ~/sky_workdir/ on the cluster. +workdir: . + +num_nodes: 1 + +# Typical use: pip install -r requirements.txt +# Invoked under the workdir (i.e., can use its files). +setup: | + echo "*** Running setup. ***" + +# Typical use: make use of resources, such as running training. +# Invoked under the workdir (i.e., can use its files). +run: | + echo "*** Running the task on OCI ***" + echo "hello, world" + nvidia-smi + echo "The task is completed." diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 1de799e7cf8..bf92f442d2f 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -650,6 +650,42 @@ def _restore_block(new_block: Dict[str, Any], old_block: Dict[str, Any]): return common_utils.dump_yaml_str(new_config) +def get_expirable_clouds( + enabled_clouds: Sequence[clouds.Cloud]) -> List[clouds.Cloud]: + """Returns a list of clouds that use local credentials and whose credentials can expire. + + This function checks each cloud in the provided sequence to determine if it uses local credentials + and if its credentials can expire. If both conditions are met, the cloud is added to the list of + expirable clouds. + + Args: + enabled_clouds (Sequence[clouds.Cloud]): A sequence of cloud objects to check. + + Returns: + list[clouds.Cloud]: A list of cloud objects that use local credentials and whose credentials can expire. + """ + expirable_clouds = [] + local_credentials_value = schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value + for cloud in enabled_clouds: + remote_identities = skypilot_config.get_nested( + (str(cloud).lower(), 'remote_identity'), None) + if remote_identities is None: + remote_identities = schemas.get_default_remote_identity( + str(cloud).lower()) + + local_credential_expiring = cloud.can_credential_expire() + if isinstance(remote_identities, str): + if remote_identities == local_credentials_value and local_credential_expiring: + expirable_clouds.append(cloud) + elif isinstance(remote_identities, list): + for profile in remote_identities: + if list(profile.values( + ))[0] == local_credentials_value and local_credential_expiring: + expirable_clouds.append(cloud) + break + return expirable_clouds + + # TODO: too many things happening here - leaky abstraction. Refactor. @timeline.event def write_cluster_config( @@ -926,6 +962,13 @@ def write_cluster_config( tmp_yaml_path, cluster_config_overrides=to_provision.cluster_config_overrides) kubernetes_utils.combine_metadata_fields(tmp_yaml_path) + yaml_obj = common_utils.read_yaml(tmp_yaml_path) + pod_config = yaml_obj['available_node_types']['ray_head_default'][ + 'node_config'] + valid, message = kubernetes_utils.check_pod_config(pod_config) + if not valid: + raise exceptions.InvalidCloudConfigs( + f'Invalid pod_config. Details: {message}') if dryrun: # If dryrun, return the unfinished tmp yaml path. 
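The new :code:`get_expirable_clouds` helper above treats :code:`remote_identity`
as either a plain string or a list of single-key mappings. A standalone sketch
of just that matching logic (a hypothetical stand-in for the real function,
which operates on sky cloud objects and :code:`skypilot_config`):

.. code-block:: python

   from typing import Dict, List, Union

   LOCAL_CREDENTIALS = 'LOCAL_CREDENTIALS'

   def uses_local_credentials(
           remote_identities: Union[str, List[Dict[str, str]]]) -> bool:
       """Mirrors the two config shapes handled by get_expirable_clouds()."""
       if isinstance(remote_identities, str):
           return remote_identities == LOCAL_CREDENTIALS
       # List form: each entry is a single-key dict whose value holds the
       # identity option for that profile.
       return any(
           list(profile.values())[0] == LOCAL_CREDENTIALS
           for profile in remote_identities)

   assert uses_local_credentials('LOCAL_CREDENTIALS')
   assert uses_local_credentials([{'my-profile': 'LOCAL_CREDENTIALS'}])
   assert not uses_local_credentials('SERVICE_ACCOUNT')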
diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 156f43181b2..c972928cd7d 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -26,6 +26,7 @@ import sky from sky import backends +from sky import check as sky_check from sky import cloud_stores from sky import clouds from sky import exceptions @@ -1996,6 +1997,22 @@ def provision_with_retries( skip_unnecessary_provisioning else None) failover_history: List[Exception] = list() + # If the user is using local credentials which may expire, the + # controller may leak resources if the credentials expire while a job + # is running. Here we check the enabled clouds and expiring credentials + # and raise a warning to the user. + if task.is_controller_task(): + enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh() + expirable_clouds = backend_utils.get_expirable_clouds( + enabled_clouds) + + if len(expirable_clouds) > 0: + warnings = (f'\033[93mWarning: Credentials used for ' + f'{expirable_clouds} may expire. Clusters may be ' + f'leaked if the credentials expire while jobs ' + f'are running. It is recommended to use credentials' + f' that never expire or a service account.\033[0m') + logger.warning(warnings) # Retrying launchable resources. while True: diff --git a/sky/backends/wheel_utils.py b/sky/backends/wheel_utils.py index ed580569e0b..805117ee2a3 100644 --- a/sky/backends/wheel_utils.py +++ b/sky/backends/wheel_utils.py @@ -153,7 +153,10 @@ def _get_latest_modification_time(path: pathlib.Path) -> float: if not path.exists(): return -1. try: - return max(os.path.getmtime(root) for root, _, _ in os.walk(path)) + return max( + os.path.getmtime(os.path.join(root, f)) + for root, dirs, files in os.walk(path) + for f in (*dirs, *files)) except ValueError: return -1. diff --git a/sky/cloud_stores.py b/sky/cloud_stores.py index 108f33f2c1f..e24c4f3ad03 100644 --- a/sky/cloud_stores.py +++ b/sky/cloud_stores.py @@ -113,8 +113,16 @@ class GcsCloudStorage(CloudStorage): @property def _gsutil_command(self): gsutil_alias, alias_gen = data_utils.get_gsutil_command() - return (f'{alias_gen}; GOOGLE_APPLICATION_CREDENTIALS=' - f'{gcp.DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH} {gsutil_alias}') + return ( + f'{alias_gen}; GOOGLE_APPLICATION_CREDENTIALS=' + f'{gcp.DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH}; ' + # Explicitly activate service account. Unlike the gcp packages + # and other GCP commands, gsutil does not automatically pick up + # the default credential keys when it is a service account. + 'gcloud auth activate-service-account ' + '--key-file=$GOOGLE_APPLICATION_CREDENTIALS ' + '2> /dev/null || true; ' + f'{gsutil_alias}') def is_directory(self, url: str) -> bool: """Returns whether 'url' is a directory. 
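The :code:`wheel_utils` change above matters because a directory's mtime only
changes when entries are created, deleted, or renamed inside it; editing an
existing file's contents does not touch the parent directory. A minimal sketch
of the fixed logic in isolation:

.. code-block:: python

   import os
   import pathlib

   def latest_modification_time(path: pathlib.Path) -> float:
       """Latest mtime of any file or directory under `path`.

       Restates the fixed logic: the old code took mtimes of directory
       paths only, and so missed in-place edits to existing files.
       """
       if not path.exists():
           return -1.
       try:
           return max(
               os.path.getmtime(os.path.join(root, name))
               for root, dirs, files in os.walk(path)
               for name in (*dirs, *files))
       except ValueError:
           # `path` is an empty directory: the generator yields nothing.
           return -1.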
diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py
index cafc789c5be..a86a87f4feb 100644
--- a/sky/clouds/aws.py
+++ b/sky/clouds/aws.py
@@ -2,6 +2,8 @@
 import enum
 import fnmatch
 import functools
+import hashlib
+import json
 import os
 import re
 import subprocess
@@ -16,6 +18,7 @@
 from sky import skypilot_config
 from sky.adaptors import aws
 from sky.clouds import service_catalog
+from sky.clouds.service_catalog import common as catalog_common
 from sky.clouds.utils import aws_utils
 from sky.skylet import constants
 from sky.utils import common_utils
@@ -100,6 +103,24 @@ class AWSIdentityType(enum.Enum):
     # region us-east-1 config-file ~/.aws/config
     SHARED_CREDENTIALS_FILE = 'shared-credentials-file'
 
+    def can_credential_expire(self) -> bool:
+        """Check if the AWS identity type can expire.
+
+        SSO, IAM_ROLE and CONTAINER_ROLE are temporary credentials that are
+        refreshed automatically. ENV and SHARED_CREDENTIALS_FILE are
+        short-lived credentials without automatic refresh.
+        IAM ROLE:
+        https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html
+        SSO/Container-role refresh token:
+        https://docs.aws.amazon.com/solutions/latest/dea-api/auth-refreshtoken.html
+        """
+        # TODO(hong): Add a CLI-based check for the expiration of the
+        # temporary credentials.
+        expirable_types = {
+            AWSIdentityType.ENV, AWSIdentityType.SHARED_CREDENTIALS_FILE
+        }
+        return self in expirable_types
+
 
 @clouds.CLOUD_REGISTRY.register
 class AWS(clouds.Cloud):
@@ -624,14 +645,10 @@ def check_credentials(cls) -> Tuple[bool, Optional[str]]:
 
     @classmethod
     def _current_identity_type(cls) -> Optional[AWSIdentityType]:
-        proc = subprocess.run('aws configure list',
-                              shell=True,
-                              check=False,
-                              stdout=subprocess.PIPE,
-                              stderr=subprocess.PIPE)
-        if proc.returncode != 0:
+        stdout = cls._aws_configure_list()
+        if stdout is None:
             return None
-        stdout = proc.stdout.decode()
+        output = stdout.decode()
 
         # We determine the identity type by looking at the output of
         # `aws configure list`. The output looks like:
@@ -646,10 +663,10 @@ def _current_identity_type(cls) -> Optional[AWSIdentityType]:
 
         def _is_access_key_of_type(type_str: str) -> bool:
             # The dot (.) does not match line separators.
-            results = re.findall(fr'access_key.*{type_str}', stdout)
+            results = re.findall(fr'access_key.*{type_str}', output)
             if len(results) > 1:
                 raise RuntimeError(
-                    f'Unexpected `aws configure list` output:\n{stdout}')
+                    f'Unexpected `aws configure list` output:\n{output}')
             return len(results) == 1
 
         if _is_access_key_of_type(AWSIdentityType.SSO.value):
@@ -664,37 +681,20 @@ def _is_access_key_of_type(type_str: str) -> bool:
             return AWSIdentityType.SHARED_CREDENTIALS_FILE
 
     @classmethod
-    @functools.lru_cache(maxsize=1)  # Cache since getting identity is slow.
-    def get_user_identities(cls) -> Optional[List[List[str]]]:
-        """Returns a [UserId, Account] list that uniquely identifies the user.
-
-        These fields come from `aws sts get-caller-identity`. We permit the same
-        actual user to:
-
-        - switch between different root accounts (after which both elements
-          of the list will be different) and have their clusters owned by
-          each account be protected; or
-
-        - within the same root account, switch between different IAM
-          users, and treat [user_id=1234, account=A] and
-          [user_id=4567, account=A] to be the *same*. Namely, switching
-          between these IAM roles within the same root account will cause
-          the first element of the returned list to differ, and will allow
-          the same actual user to continue to interact with their clusters.
-          Note: this is not 100% safe, since the IAM users can have very
-          specific permissions, that disallow them to access the clusters
-          but it is a reasonable compromise as that could be rare.
-
-        Returns:
-            A list of strings that uniquely identifies the user on this cloud.
-            For identity check, we will fallback through the list of strings
-            until we find a match, and print a warning if we fail for the
-            first string.
+    @functools.lru_cache(maxsize=1)
+    def _aws_configure_list(cls) -> Optional[bytes]:
+        proc = subprocess.run('aws configure list',
+                              shell=True,
+                              check=False,
+                              stdout=subprocess.PIPE,
+                              stderr=subprocess.PIPE)
+        if proc.returncode != 0:
+            return None
+        return proc.stdout
 
-        Raises:
-            exceptions.CloudUserIdentityError: if the user identity cannot be
-                retrieved.
-        """
+    @classmethod
+    @functools.lru_cache(maxsize=1)  # Cache since getting identity is slow.
+    def _sts_get_caller_identity(cls) -> Optional[List[List[str]]]:
         try:
             sts = aws.client('sts')
             # The caller identity contains 3 fields: UserId, Account, Arn.
@@ -773,6 +773,72 @@ def get_user_identities(cls) -> Optional[List[List[str]]]:
         # automatic switching for AWS. Currently we only support one identity.
         return [user_ids]
 
+    @classmethod
+    @functools.lru_cache(maxsize=1)  # Cache since getting identity is slow.
+    def get_user_identities(cls) -> Optional[List[List[str]]]:
+        """Returns a [UserId, Account] list that uniquely identifies the user.
+
+        These fields come from `aws sts get-caller-identity`, and the result
+        is cached locally, keyed by the output of `aws configure list`. The
+        identities are assumed to be stable for the duration of the `sky`
+        process. Modifying the credentials while the `sky` process is running
+        will not affect the identity returned by this function.
+
+        We permit the same actual user to:
+
+        - switch between different root accounts (after which both elements
+          of the list will be different) and have their clusters owned by
+          each account be protected; or
+
+        - within the same root account, switch between different IAM
+          users, and treat [user_id=1234, account=A] and
+          [user_id=4567, account=A] to be the *same*. Namely, switching
+          between these IAM roles within the same root account will cause
+          the first element of the returned list to differ, and will allow
+          the same actual user to continue to interact with their clusters.
+          Note: this is not 100% safe, since IAM users can have very
+          specific permissions that disallow them from accessing the
+          clusters, but it is a reasonable compromise as that should be rare.
+
+        Returns:
+            A list of strings that uniquely identifies the user on this cloud.
+            For the identity check, we will fall back through the list of
+            strings until we find a match, and print a warning if we fail for
+            the first string.
+
+        Raises:
+            exceptions.CloudUserIdentityError: if the user identity cannot be
+                retrieved.
+        """
+        stdout = cls._aws_configure_list()
+        if stdout is None:
+            # `aws configure list` is not available; possible reasons:
+            # - awscli is not installed but credentials are valid, e.g. run
+            #   from an EC2 instance with an IAM role
+            # - aws credentials are not set; proceed anyway to get a unified
+            #   error message for users
+            return cls._sts_get_caller_identity()
+        config_hash = hashlib.md5(stdout).hexdigest()[:8]
+        # Getting the aws identity costs ~1s, so we cache the result, keyed by
+        # the output of `aws configure list`. Different `aws configure list`
+        # outputs can map to the same aws identity; our assumption is that the
+        # output is stable in the real world, so the number of cache files is
+        # limited.
+        # TODO(aylei): consider using a more stable cache key and evaluate
+        # eviction.
+        cache_path = catalog_common.get_catalog_path(
+            f'aws/.cache/user-identity-{config_hash}.txt')
+        if os.path.exists(cache_path):
+            try:
+                with open(cache_path, 'r', encoding='utf-8') as f:
+                    return json.loads(f.read())
+            except json.JSONDecodeError:
+                # The cache is invalid; ignore it and fetch the identity again.
+                pass
+
+        result = cls._sts_get_caller_identity()
+        with open(cache_path, 'w', encoding='utf-8') as f:
+            f.write(json.dumps(result))
+        return result
+
     @classmethod
     def get_active_user_identity_str(cls) -> Optional[str]:
         user_identity = cls.get_active_user_identity()
@@ -812,6 +878,12 @@ def get_credential_file_mounts(self) -> Dict[str, str]:
             if os.path.exists(os.path.expanduser(f'~/.aws/{filename}'))
         }
 
+    @functools.lru_cache(maxsize=1)
+    def can_credential_expire(self) -> bool:
+        identity_type = self._current_identity_type()
+        return identity_type is not None and identity_type.can_credential_expire(
+        )
+
     def instance_type_exists(self, instance_type):
         return service_catalog.instance_type_exists(instance_type,
                                                     clouds='aws')
diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py
index 455baeaf5d9..2cb45ca14fc 100644
--- a/sky/clouds/cloud.py
+++ b/sky/clouds/cloud.py
@@ -536,6 +536,10 @@ def get_credential_file_mounts(self) -> Dict[str, str]:
         """
         raise NotImplementedError
 
+    def can_credential_expire(self) -> bool:
+        """Returns whether the cloud credential can expire."""
+        return False
+
     @classmethod
     def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
         """Check the image size from the cloud.
diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py
index ff200f84147..3502fee8e1c 100644
--- a/sky/clouds/gcp.py
+++ b/sky/clouds/gcp.py
@@ -132,6 +132,9 @@ class GCPIdentityType(enum.Enum):
 
     SHARED_CREDENTIALS_FILE = ''
 
+    def can_credential_expire(self) -> bool:
+        return self == GCPIdentityType.SHARED_CREDENTIALS_FILE
+
 
 @clouds.CLOUD_REGISTRY.register
 class GCP(clouds.Cloud):
@@ -863,6 +866,12 @@ def get_credential_file_mounts(self) -> Dict[str, str]:
             pass
         return credentials
 
+    @functools.lru_cache(maxsize=1)
+    def can_credential_expire(self) -> bool:
+        identity_type = self._get_identity_type()
+        return identity_type is not None and identity_type.can_credential_expire(
+        )
+
     @classmethod
     def _get_identity_type(cls) -> Optional[GCPIdentityType]:
         try:
diff --git a/sky/clouds/oci.py b/sky/clouds/oci.py
index d4ae6f298d2..b0234e2802c 100644
--- a/sky/clouds/oci.py
+++ b/sky/clouds/oci.py
@@ -232,6 +232,14 @@ def make_deploy_resources_variables(
             listing_id = None
             res_ver = None
 
+        os_type = None
+        if ':' in image_id:
+            # The OS type is provided in the --image-id. This is the case
+            # where a custom image's OCID is given in the --image-id
+            # parameter, e.g.:
+            #   - ocid1.image...aaa:oraclelinux (OS type is oraclelinux)
+            #   - ocid1.image...aaa (OS type not provided)
+            image_id, os_type = image_id.replace(' ', '').split(':')
+
         cpus = resources.cpus
         instance_type_arr = resources.instance_type.split(
             oci_utils.oci_config.INSTANCE_TYPE_RES_SPERATOR)
@@ -297,15 +305,18 @@ def make_deploy_resources_variables(
             cpus=None if cpus is None else float(cpus),
             disk_tier=resources.disk_tier)
 
-        image_str = self._get_image_str(image_id=resources.image_id,
-                                        instance_type=resources.instance_type,
-                                        region=region.name)
-
-        # pylint: disable=import-outside-toplevel
-        from sky.clouds.service_catalog import oci_catalog
-        os_type = oci_catalog.get_image_os_from_tag(tag=image_str,
-                                                    region=region.name)
-        logger.debug(f'OS type for the image {image_str} is {os_type}')
+        if os_type is None:
+            # The OS type is not determined yet, so try to get it from
+            # vms.csv.
+            image_str = self._get_image_str(
+                image_id=resources.image_id,
+                instance_type=resources.instance_type,
+                region=region.name)
+
+            # pylint: disable=import-outside-toplevel
+            from sky.clouds.service_catalog import oci_catalog
+            os_type = oci_catalog.get_image_os_from_tag(tag=image_str,
+                                                        region=region.name)
+        logger.debug(f'OS type for the image {image_id} is {os_type}')
 
         return {
             'instance_type': instance_type,
diff --git a/sky/clouds/utils/oci_utils.py b/sky/clouds/utils/oci_utils.py
index 0cd4f33e647..581d4d72d3c 100644
--- a/sky/clouds/utils/oci_utils.py
+++ b/sky/clouds/utils/oci_utils.py
@@ -6,6 +6,10 @@
     configuration.
  - Hysun He (hysun.he@oracle.com) @ Nov.12, 2024: Add the constant
    SERVICE_PORT_RULE_TAG
+ - Hysun He (hysun.he@oracle.com) @ Jan.01, 2025: Set the default image
+   from ubuntu 20.04 to ubuntu 22.04, including:
+   - GPU: skypilot:gpu-ubuntu-2004 -> skypilot:gpu-ubuntu-2204
+   - CPU: skypilot:cpu-ubuntu-2004 -> skypilot:cpu-ubuntu-2204
 """
 import os
@@ -117,7 +121,7 @@ def get_default_gpu_image_tag(cls) -> str:
         # the sky's user-config file (if not specified, use the hardcode one at
         # last)
         return skypilot_config.get_nested(('oci', 'default', 'image_tag_gpu'),
-                                          'skypilot:gpu-ubuntu-2004')
+                                          'skypilot:gpu-ubuntu-2204')
 
     @classmethod
     def get_default_image_tag(cls) -> str:
@@ -125,7 +129,7 @@ def get_default_image_tag(cls) -> str:
         # set the default image tag in the sky's user-config file. (if not
         # specified, use the hardcode one at last)
         return skypilot_config.get_nested(
-            ('oci', 'default', 'image_tag_general'), 'skypilot:cpu-ubuntu-2004')
+            ('oci', 'default', 'image_tag_general'), 'skypilot:cpu-ubuntu-2204')
 
     @classmethod
     def get_sky_user_config_file(cls) -> str:
diff --git a/sky/data/data_utils.py b/sky/data/data_utils.py
index 05c2b42c844..e8dcaa83017 100644
--- a/sky/data/data_utils.py
+++ b/sky/data/data_utils.py
@@ -523,10 +523,14 @@ def get_gsutil_command() -> Tuple[str, str]:
 
 def run_upload_cli(command: str, access_denied_message: str, bucket_name: str,
                    log_path: str):
-    returncode, stdout, stderr = log_lib.run_with_log(command,
-                                                      log_path,
-                                                      shell=True,
-                                                      require_outputs=True)
+    returncode, stdout, stderr = log_lib.run_with_log(
+        command,
+        log_path,
+        shell=True,
+        require_outputs=True,
+        # We need to use bash, as some of the cloud commands use bash syntax,
+        # such as [[ ... ]].
+        executable='/bin/bash')
     if access_denied_message in stderr:
         with ux_utils.print_exception_no_traceback():
             raise PermissionError('Failed to upload files to '
diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py
index 487868d1d9e..14b6b42aa58 100644
--- a/sky/provision/kubernetes/utils.py
+++ b/sky/provision/kubernetes/utils.py
@@ -893,6 +893,52 @@ def check_credentials(context: Optional[str],
     return True, None
 
 
+def check_pod_config(pod_config: dict) \
+    -> Tuple[bool, Optional[str]]:
+    """Check if the pod_config is a valid pod config.
+
+    Uses the deserialize API to check whether the pod_config is valid.
+
+    Returns:
+        bool: True if pod_config is valid.
+        str: Error message about why the pod_config is invalid, None otherwise.
+    """
+    errors = []
+    # This api_client won't be used to send any requests, so there is no need
+    # to load a kubeconfig.
+    api_client = kubernetes.kubernetes.client.ApiClient()
+
+    # Used by the kubernetes api_client deserialize function, which reads the
+    # `data` attribute of its argument. For details, see:
+    # https://github.com/kubernetes-client/python/blob/master/kubernetes/client/api_client.py#L244
+    class InnerResponse():
+
+        def __init__(self, data: dict):
+            self.data = json.dumps(data)
+
+    try:
+        # Validate metadata if present
+        if 'metadata' in pod_config:
+            try:
+                value = InnerResponse(pod_config['metadata'])
+                api_client.deserialize(
+                    value, kubernetes.kubernetes.client.V1ObjectMeta)
+            except ValueError as e:
+                errors.append(f'Invalid metadata: {str(e)}')
+        # Validate spec if present
+        if 'spec' in pod_config:
+            try:
+                value = InnerResponse(pod_config['spec'])
+                api_client.deserialize(value,
+                                       kubernetes.kubernetes.client.V1PodSpec)
+            except ValueError as e:
+                errors.append(f'Invalid spec: {str(e)}')
+        return len(errors) == 0, '.'.join(errors)
+    except Exception as e:  # pylint: disable=broad-except
+        errors.append(f'Validation error: {str(e)}')
+        return False, '.'.join(errors)
+
+
 def is_kubeconfig_exec_auth(
         context: Optional[str] = None) -> Tuple[bool, Optional[str]]:
     """Checks if the kubeconfig file uses exec-based authentication
diff --git a/sky/provision/oci/query_utils.py b/sky/provision/oci/query_utils.py
index 8cca0629305..3037fcc2703 100644
--- a/sky/provision/oci/query_utils.py
+++ b/sky/provision/oci/query_utils.py
@@ -506,8 +506,11 @@ def find_nsg(cls, region: str, nsg_name: str,
                 raise exceptions.ResourcesUnavailableError(
                     'The VCN is not available')
 
-        # Get the primary vnic.
-        assert len(list_vcns_resp.data) > 0
+        # Get the primary VNIC. The VCN list might be empty in the corner
+        # case where the cluster was terminated during provisioning.
+        if not list_vcns_resp.data:
+            return None
+
         vcn = list_vcns_resp.data[0]
 
         list_nsg_resp = net_client.list_network_security_groups(
diff --git a/tests/smoke_tests/test_managed_job.py b/tests/smoke_tests/test_managed_job.py
index 22381fc45e3..5c930724523 100644
--- a/tests/smoke_tests/test_managed_job.py
+++ b/tests/smoke_tests/test_managed_job.py
@@ -365,7 +365,7 @@ def test_managed_jobs_pipeline_recovery_gcp():
             # separated by `-`.
             (f'MANAGED_JOB_ID=`cat /tmp/{name}-run-id | rev | '
              f'cut -d\'_\' -f1 | rev | cut -d\'-\' -f1`; {terminate_cmd}'),
-            smoke_tests_utils.zJOB_WAIT_NOT_RUNNING.format(job_name=name),
+            smoke_tests_utils.JOB_WAIT_NOT_RUNNING.format(job_name=name),
             f'{smoke_tests_utils.GET_JOB_QUEUE} | grep {name} | head -n1 | grep "RECOVERING"',
             smoke_tests_utils.
get_cmd_wait_until_managed_job_status_contains_matching_job_name( diff --git a/tests/test_config.py b/tests/test_config.py index 5789214dc61..d3eaeb261bc 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -7,6 +7,7 @@ import sky from sky import skypilot_config +import sky.exceptions from sky.skylet import constants from sky.utils import common_utils from sky.utils import kubernetes_enums @@ -99,6 +100,29 @@ def _create_task_yaml_file(task_file_path: pathlib.Path) -> None: """)) +def _create_invalid_config_yaml_file(task_file_path: pathlib.Path) -> None: + task_file_path.write_text( + textwrap.dedent("""\ + experimental: + config_overrides: + kubernetes: + pod_config: + metadata: + labels: + test-key: test-value + annotations: + abc: def + spec: + containers: + - name: + imagePullSecrets: + - name: my-secret-2 + + setup: echo 'Setting up...' + run: echo 'Running...' + """)) + + def test_nested_config(monkeypatch) -> None: """Test that the nested config works.""" config = skypilot_config.Config() @@ -335,6 +359,28 @@ def test_k8s_config_with_override(monkeypatch, tmp_path, assert cluster_pod_config['spec']['runtimeClassName'] == 'nvidia' +def test_k8s_config_with_invalid_config(monkeypatch, tmp_path, + enable_all_clouds) -> None: + config_path = tmp_path / 'config.yaml' + _create_config_file(config_path) + monkeypatch.setattr(skypilot_config, 'CONFIG_PATH', config_path) + + _reload_config() + task_path = tmp_path / 'task.yaml' + _create_invalid_config_yaml_file(task_path) + task = sky.Task.from_yaml(task_path) + + # Test Kubernetes pod_config invalid + cluster_name = 'test_k8s_config_with_invalid_config' + task.set_resources_override({'cloud': sky.Kubernetes()}) + exception_occurred = False + try: + sky.launch(task, cluster_name=cluster_name, dryrun=True) + except sky.exceptions.ResourcesUnavailableError: + exception_occurred = True + assert exception_occurred + + def test_gcp_config_with_override(monkeypatch, tmp_path, enable_all_clouds) -> None: config_path = tmp_path / 'config.yaml'
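A minimal sketch of exercising the new validator directly, outside of
:code:`sky.launch` (assumes the :code:`kubernetes` Python client is installed;
:code:`check_pod_config` only deserializes the dict against the generated Pod
API models and never contacts a cluster, so the exact error text below is
illustrative):

.. code-block:: python

   from sky.provision.kubernetes import utils as k8s_utils

   # `name: None` violates V1Container, so deserialization raises ValueError
   # internally and the config is reported as invalid.
   valid, message = k8s_utils.check_pod_config(
       {'spec': {'containers': [{'name': None}]}})
   print(valid, message)  # e.g.: False 'Invalid spec: ...'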