diff --git a/sky/authentication.py b/sky/authentication.py index 91d90444076..db884187b61 100644 --- a/sky/authentication.py +++ b/sky/authentication.py @@ -6,7 +6,6 @@ import socket import subprocess import sys -import textwrap import time from typing import Any, Dict, Tuple import uuid @@ -15,6 +14,7 @@ from cryptography.hazmat.primitives import serialization from cryptography.hazmat.primitives.asymmetric import rsa from cryptography.hazmat.backends import default_backend +import yaml from sky import clouds from sky import sky_logging @@ -88,26 +88,22 @@ def get_or_generate_keys() -> Tuple[str, str]: return private_key_path, public_key_path +def _replace_cloud_init_ssh_info_in_config(config: Dict[str, Any], + public_key: str) -> Dict[str, Any]: + config_str = common_utils.dump_yaml_str(config) + config_str = config_str.replace('skypilot:ssh_user', + config['auth']['ssh_user']) + config_str = config_str.replace('skypilot:ssh_public_key_content', + public_key) + config = yaml.safe_load(config_str) + return config + + def setup_aws_authentication(config: Dict[str, Any]) -> Dict[str, Any]: _, public_key_path = get_or_generate_keys() with open(public_key_path, 'r') as f: public_key = f.read() - # Use cloud init in UserData to set up the authorized_keys to get - # around the number of keys limit and permission issues with - # ec2.describe_key_pairs. - # Note that sudo and shell need to be specified to ensure setup works. 
- # Reference: https://cloudinit.readthedocs.io/en/latest/reference/modules.html#users-and-groups # pylint: disable=line-too-long - for node_type in config['available_node_types']: - config['available_node_types'][node_type]['node_config']['UserData'] = ( - textwrap.dedent(f"""\ - #cloud-config - users: - - name: {config['auth']['ssh_user']} - shell: /bin/bash - sudo: ALL=(ALL) NOPASSWD:ALL - ssh-authorized-keys: - - {public_key} - """)) + config = _replace_cloud_init_ssh_info_in_config(config, public_key) return config diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 3bf61f1f3f7..acef79cbd09 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -127,8 +127,13 @@ # it's possible to failover to 1b, which leaves a leaked instance in 1a. Here, # we use the new yaml's zone field, which is guaranteed to be the existing zone # '1a'. +# - UserData: The UserData field of the old yaml may be outdated, and we want to +# use the new yaml's UserData field, which contains the authorized key setup as +# well as the disabling of the auto-update with apt-get. _RAY_YAML_KEYS_TO_RESTORE_EXCEPTIONS = [ ('provider', 'availability_zone'), + ('available_node_types', 'ray.head.default', 'node_config', 'UserData'), + ('available_node_types', 'ray.worker.default', 'node_config', 'UserData'), ] diff --git a/sky/templates/aws-ray.yml.j2 b/sky/templates/aws-ray.yml.j2 index 717a35ac1bd..75f3f956ed0 100644 --- a/sky/templates/aws-ray.yml.j2 +++ b/sky/templates/aws-ray.yml.j2 @@ -59,6 +59,26 @@ available_node_types: # SpotOptions: # MaxPrice: MAX_HOURLY_PRICE {% endif %} + # Use cloud init in UserData to set up the authorized_keys to get + # around the number of keys limit and permission issues with + # ec2.describe_key_pairs. + # Note that sudo and shell need to be specified to ensure setup works. 
+ # Reference: https://cloudinit.readthedocs.io/en/latest/reference/modules.html#users-and-groups + # The bootcmd disables automatic APT updates, to avoid the apt/dpkg lock + # being held when a user calls `apt install` on the node. + # Reference: https://unix.stackexchange.com/a/471192 + UserData: | + #cloud-config + users: + - name: skypilot:ssh_user + shell: /bin/bash + sudo: ALL=(ALL) NOPASSWD:ALL + ssh_authorized_keys: + - skypilot:ssh_public_key_content + bootcmd: + - echo 'APT::Periodic::Enable "0";' > /etc/apt/apt.conf.d/10cloudinit-disable + - apt-get -y purge update-notifier-common ubuntu-release-upgrader-core landscape-common unattended-upgrades + - echo "Removed APT" | systemd-cat TagSpecifications: - ResourceType: instance Tags: @@ -88,6 +108,18 @@ available_node_types: # SpotOptions: # MaxPrice: MAX_HOURLY_PRICE {% endif %} + UserData: | + #cloud-config + users: + - name: skypilot:ssh_user + shell: /bin/bash + sudo: ALL=(ALL) NOPASSWD:ALL + ssh_authorized_keys: + - skypilot:ssh_public_key_content + bootcmd: + - echo 'APT::Periodic::Enable "0";' > /etc/apt/apt.conf.d/10cloudinit-disable + - apt-get -y purge update-notifier-common ubuntu-release-upgrader-core landscape-common unattended-upgrades + - echo "Removed APT" | systemd-cat TagSpecifications: - ResourceType: instance Tags: @@ -127,17 +159,7 @@ setup_commands: # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit stucking issue when the number of running ray jobs increase. 
# Line 'mkdir -p ..': disable host key check # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys` - - function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; }; - sudo systemctl stop unattended-upgrades || true; - sudo systemctl disable unattended-upgrades || true; - sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true; - p=$(mylsof "/var/lib/dpkg/lock-frontend"); echo "$p"; - sudo kill -9 `echo "$p" | tail -n 1` || true; - sudo rm /var/lib/dpkg/lock-frontend; - sudo pkill -9 dpkg; - sudo pkill -9 apt-get; - sudo dpkg --configure --force-overwrite -a; - mkdir -p ~/.ssh; touch ~/.ssh/config; + - mkdir -p ~/.ssh; touch ~/.ssh/config; pip3 --version > /dev/null 2>&1 || (curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && echo "PATH=$HOME/.local/bin:$PATH" >> ~/.bashrc); (type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc; (type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc; diff --git a/tests/test_smoke.py b/tests/test_smoke.py index f7edc896804..5f294767107 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -1375,7 +1375,7 @@ def test_spot_failed_setup(generic_cloud: str): 'spot-failed-setup', [ f'sky spot launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml', - 'sleep 300', + 'sleep 330', # Make sure the job failed quickly. f'{_SPOT_QUEUE_WAIT} | grep {name} | head -n1 | grep "FAILED_SETUP"', ],