Skip to content

Commit

Permalink
[Core][Docker] Support docker login on RunPod. (#4287)
Browse files Browse the repository at this point in the history
* [Core][Docker] Support docker login on RunPod.

* nit

* works

* remove unnecessary

* delete template and registry after termination

* move to graphql api & remove ephemeral resourcdes

* nits
  • Loading branch information
cblmemo authored Jan 8, 2025
1 parent bee4647 commit 2fa37ec
Show file tree
Hide file tree
Showing 7 changed files with 210 additions and 26 deletions.
2 changes: 1 addition & 1 deletion docs/source/getting-started/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -304,7 +304,7 @@ RunPod

.. code-block:: shell
pip install "runpod>=1.5.1"
pip install "runpod>=1.6.1"
runpod config
Expand Down
11 changes: 8 additions & 3 deletions sky/provision/docker_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@ class DockerLoginConfig:
password: str
server: str

def format_image(self, image: str) -> str:
"""Format the image name with the server prefix."""
server_prefix = f'{self.server}/'
if not image.startswith(server_prefix):
return f'{server_prefix}{image}'
return image

@classmethod
def from_env_vars(cls, d: Dict[str, str]) -> 'DockerLoginConfig':
return cls(
Expand Down Expand Up @@ -220,9 +227,7 @@ def initialize(self) -> str:
wait_for_docker_daemon=True)
# We automatically add the server prefix to the image name if
# the user did not add it.
server_prefix = f'{docker_login_config.server}/'
if not specific_image.startswith(server_prefix):
specific_image = f'{server_prefix}{specific_image}'
specific_image = docker_login_config.format_image(specific_image)

if self.docker_config.get('pull_before_run', True):
assert specific_image, ('Image must be included in config if ' +
Expand Down
11 changes: 10 additions & 1 deletion sky/provision/runpod/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
node_type = 'head' if head_instance_id is None else 'worker'
try:
instance_id = utils.launch(
name=f'{cluster_name_on_cloud}-{node_type}',
cluster_name=cluster_name_on_cloud,
node_type=node_type,
instance_type=config.node_config['InstanceType'],
region=region,
disk_size=config.node_config['DiskSize'],
Expand All @@ -92,6 +93,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
public_key=config.node_config['PublicKey'],
preemptible=config.node_config['Preemptible'],
bid_per_gpu=config.node_config['BidPerGPU'],
docker_login_config=config.provider_config.get(
'docker_login_config'),
)
except Exception as e: # pylint: disable=broad-except
logger.warning(f'run_instances error: {e}')
Expand Down Expand Up @@ -145,6 +148,8 @@ def terminate_instances(
"""See sky/provision/__init__.py"""
del provider_config # unused
instances = _filter_instances(cluster_name_on_cloud, None)
template_name, registry_auth_id = utils.get_registry_auth_resources(
cluster_name_on_cloud)
for inst_id, inst in instances.items():
logger.debug(f'Terminating instance {inst_id}: {inst}')
if worker_only and inst['name'].endswith('-head'):
Expand All @@ -157,6 +162,10 @@ def terminate_instances(
f'Failed to terminate instance {inst_id}: '
f'{common_utils.format_exception(e, use_bracket=False)}'
) from e
if template_name is not None:
utils.delete_pod_template(template_name)
if registry_auth_id is not None:
utils.delete_register_auth(registry_auth_id)


def get_cluster_info(
Expand Down
183 changes: 170 additions & 13 deletions sky/provision/runpod/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@

import base64
import time
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Tuple

from sky import sky_logging
from sky.adaptors import runpod
from sky.provision import docker_utils
import sky.provision.runpod.api.commands as runpod_commands
from sky.skylet import constants
from sky.utils import common_utils
Expand Down Expand Up @@ -47,6 +48,11 @@
}


def _construct_docker_login_template_name(cluster_name: str) -> str:
"""Constructs the registry auth template name."""
return f'{cluster_name}-docker-login-template'


def retry(func):
"""Decorator to retry a function."""

Expand All @@ -66,9 +72,83 @@ def wrapper(*args, **kwargs):
return wrapper


# Adapted from runpod.api.queries.pods.py::QUERY_POD.
# Adding containerRegistryAuthId to the query.
_QUERY_POD = """
query myPods {
myself {
pods {
id
containerDiskInGb
containerRegistryAuthId
costPerHr
desiredStatus
dockerArgs
dockerId
env
gpuCount
imageName
lastStatusChange
machineId
memoryInGb
name
podType
port
ports
uptimeSeconds
vcpuCount
volumeInGb
volumeMountPath
runtime {
ports{
ip
isIpPublic
privatePort
publicPort
type
}
}
machine {
gpuDisplayName
}
}
}
}
"""


def _sky_get_pods() -> dict:
"""List all pods with extra registry auth information.
Adapted from runpod.get_pods() to include containerRegistryAuthId.
"""
raw_return = runpod.runpod.api.graphql.run_graphql_query(_QUERY_POD)
cleaned_return = raw_return['data']['myself']['pods']
return cleaned_return


_QUERY_POD_TEMPLATE_WITH_REGISTRY_AUTH = """
query myself {
myself {
podTemplates {
name
containerRegistryAuthId
}
}
}
"""


def _list_pod_templates_with_container_registry() -> dict:
"""List all pod templates."""
raw_return = runpod.runpod.api.graphql.run_graphql_query(
_QUERY_POD_TEMPLATE_WITH_REGISTRY_AUTH)
return raw_return['data']['myself']['podTemplates']


def list_instances() -> Dict[str, Dict[str, Any]]:
"""Lists instances associated with API key."""
instances = runpod.runpod.get_pods()
instances = _sky_get_pods()

instance_dict: Dict[str, Dict[str, Any]] = {}
for instance in instances:
Expand Down Expand Up @@ -100,14 +180,75 @@ def list_instances() -> Dict[str, Dict[str, Any]]:
return instance_dict


def launch(name: str, instance_type: str, region: str, disk_size: int,
image_name: str, ports: Optional[List[int]], public_key: str,
preemptible: Optional[bool], bid_per_gpu: float) -> str:
def delete_pod_template(template_name: str) -> None:
"""Deletes a pod template."""
try:
runpod.runpod.api.graphql.run_graphql_query(
f'mutation {{deleteTemplate(templateName: "{template_name}")}}')
except runpod.runpod.error.QueryError as e:
logger.warning(f'Failed to delete template {template_name}: {e}'
'Please delete it manually.')


def delete_register_auth(registry_auth_id: str) -> None:
"""Deletes a registry auth."""
try:
runpod.runpod.delete_container_registry_auth(registry_auth_id)
except runpod.runpod.error.QueryError as e:
logger.warning(f'Failed to delete registry auth {registry_auth_id}: {e}'
'Please delete it manually.')


def _create_template_for_docker_login(
cluster_name: str,
image_name: str,
docker_login_config: Optional[Dict[str, str]],
) -> Tuple[str, Optional[str]]:
"""Creates a template for the given image with the docker login config.
Returns:
formatted_image_name: The formatted image name.
template_id: The template ID. None for no docker login config.
"""
if docker_login_config is None:
return image_name, None
login_config = docker_utils.DockerLoginConfig(**docker_login_config)
container_registry_auth_name = f'{cluster_name}-registry-auth'
container_template_name = _construct_docker_login_template_name(
cluster_name)
# The `name` argument is only for display purpose and the registry server
# will be splitted from the docker image name (Tested with AWS ECR).
# Here we only need the username and password to create the registry auth.
# TODO(tian): Now we create a template and a registry auth for each cluster.
# Consider create one for each server and reuse them. Challenges including
# calculate the reference count and delete them when no longer needed.
create_auth_resp = runpod.runpod.create_container_registry_auth(
name=container_registry_auth_name,
username=login_config.username,
password=login_config.password,
)
registry_auth_id = create_auth_resp['id']
create_template_resp = runpod.runpod.create_template(
name=container_template_name,
image_name=None,
registry_auth_id=registry_auth_id,
)
return login_config.format_image(image_name), create_template_resp['id']


def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
disk_size: int, image_name: str, ports: Optional[List[int]],
public_key: str, preemptible: Optional[bool], bid_per_gpu: float,
docker_login_config: Optional[Dict[str, str]]) -> str:
"""Launches an instance with the given parameters.
Converts the instance_type to the RunPod GPU name, finds the specs for the
GPU, and launches the instance.
Returns:
instance_id: The instance ID.
"""
name = f'{cluster_name}-{node_type}'
gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]]
gpu_quantity = int(instance_type.split('_')[0].replace('x', ''))
cloud_type = instance_type.split('_')[2]
Expand Down Expand Up @@ -139,31 +280,35 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
# Use base64 to deal with the tricky quoting issues caused by runpod API.
encoded = base64.b64encode(setup_cmd.encode('utf-8')).decode('utf-8')

docker_args = (f'bash -c \'echo {encoded} | base64 --decode > init.sh; '
f'bash init.sh\'')

# Port 8081 is occupied for nginx in the base image.
custom_ports_str = ''
if ports is not None:
custom_ports_str = ''.join([f'{p}/tcp,' for p in ports])
ports_str = (f'22/tcp,'
f'{custom_ports_str}'
f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,'
f'{constants.SKY_REMOTE_RAY_PORT}/http')

docker_args = (f'bash -c \'echo {encoded} | base64 --decode > init.sh; '
f'bash init.sh\'')
ports = (f'22/tcp,'
f'{custom_ports_str}'
f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,'
f'{constants.SKY_REMOTE_RAY_PORT}/http')
image_name_formatted, template_id = _create_template_for_docker_login(
cluster_name, image_name, docker_login_config)

params = {
'name': name,
'image_name': image_name,
'image_name': image_name_formatted,
'gpu_type_id': gpu_type,
'cloud_type': cloud_type,
'container_disk_in_gb': disk_size,
'min_vcpu_count': 4 * gpu_quantity,
'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity,
'gpu_count': gpu_quantity,
'country_code': region,
'ports': ports,
'ports': ports_str,
'support_public_ip': True,
'docker_args': docker_args,
'template_id': template_id,
}

if preemptible is None or not preemptible:
Expand All @@ -177,6 +322,18 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
return new_instance['id']


def get_registry_auth_resources(
cluster_name: str) -> Tuple[Optional[str], Optional[str]]:
"""Gets the registry auth resources."""
container_registry_auth_name = _construct_docker_login_template_name(
cluster_name)
for template in _list_pod_templates_with_container_registry():
if template['name'] == container_registry_auth_name:
return container_registry_auth_name, template[
'containerRegistryAuthId']
return None, None


def remove(instance_id: str) -> None:
"""Terminates the given instance."""
runpod.runpod.terminate_pod(instance_id)
Expand Down
4 changes: 3 additions & 1 deletion sky/setup_files/dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,9 @@
'oci': ['oci'] + local_ray,
'kubernetes': ['kubernetes>=20.0.0'],
'remote': remote,
'runpod': ['runpod>=1.5.1'],
# For the container registry auth api. Reference:
# https://github.com/runpod/runpod-python/releases/tag/1.6.1
'runpod': ['runpod>=1.6.1'],
'fluidstack': [], # No dependencies needed for fluidstack
'cudo': ['cudo-compute>=0.1.10'],
'paperspace': [], # No dependencies needed for paperspace
Expand Down
12 changes: 5 additions & 7 deletions sky/skylet/providers/command_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def docker_start_cmds(
docker_cmd,
):
"""Generating docker start command without --rm.
The code is borrowed from `ray.autoscaler._private.docker`.
Changes we made:
Expand Down Expand Up @@ -159,19 +159,17 @@ def run_init(self, *, as_head: bool, file_mounts: Dict[str, str],
return True

# SkyPilot: Docker login if user specified a private docker registry.
if "docker_login_config" in self.docker_config:
if 'docker_login_config' in self.docker_config:
# TODO(tian): Maybe support a command to get the login password?
docker_login_config: docker_utils.DockerLoginConfig = self.docker_config[
"docker_login_config"]
docker_login_config: docker_utils.DockerLoginConfig = (
self.docker_config['docker_login_config'])
self._run_with_retry(
f'{self.docker_cmd} login --username '
f'{docker_login_config.username} --password '
f'{docker_login_config.password} {docker_login_config.server}')
# We automatically add the server prefix to the image name if
# the user did not add it.
server_prefix = f'{docker_login_config.server}/'
if not specific_image.startswith(server_prefix):
specific_image = f'{server_prefix}{specific_image}'
specific_image = docker_login_config.format_image(specific_image)

if self.docker_config.get('pull_before_run', True):
assert specific_image, ('Image must be included in config if '
Expand Down
13 changes: 13 additions & 0 deletions sky/templates/runpod-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,19 @@ provider:
module: sky.provision.runpod
region: "{{region}}"
disable_launch_config_check: true
# For RunPod, we directly set the image id for the docker as runtime environment
# support, thus we need to avoid the DockerInitializer detects the docker field
# and performs the initialization. Therefore we put the docker login config in
# the provider config here.
{%- if docker_login_config is not none %}
docker_login_config:
username: |-
{{docker_login_config.username}}
password: |-
{{docker_login_config.password}}
server: |-
{{docker_login_config.server}}
{%- endif %}

auth:
ssh_user: root
Expand Down

0 comments on commit 2fa37ec

Please sign in to comment.