Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AWS] Disable unattended-upgrade with cloud-init #1949

Merged
merged 12 commits into from
May 11, 2023
30 changes: 13 additions & 17 deletions sky/authentication.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import socket
import subprocess
import sys
import textwrap
import time
from typing import Any, Dict, Tuple
import uuid
Expand All @@ -15,6 +14,7 @@
from cryptography.hazmat.primitives import serialization
from cryptography.hazmat.primitives.asymmetric import rsa
from cryptography.hazmat.backends import default_backend
import yaml

from sky import clouds
from sky import sky_logging
Expand Down Expand Up @@ -88,26 +88,22 @@ def get_or_generate_keys() -> Tuple[str, str]:
return private_key_path, public_key_path


def _replace_cloud_init_ssh_info_in_config(config: Dict[str, Any],
                                           public_key: str) -> Dict[str, Any]:
    """Fills the SSH placeholders in a cluster config.

    The config is serialized to a YAML string, the placeholders are
    substituted textually, and the result is parsed back with
    `yaml.safe_load`. Because the substitution works on the serialized text,
    every occurrence of a placeholder anywhere in the config is replaced.

    Args:
        config: The cluster config; `config['auth']['ssh_user']` supplies the
            value substituted for 'skypilot:ssh_user'.
        public_key: The text substituted for
            'skypilot:ssh_public_key_content'.

    Returns:
        A new config parsed from the substituted YAML text.
    """
    # Serialize first, then look up the SSH user, mirroring the order in
    # which failures would surface.
    rendered = common_utils.dump_yaml_str(config)
    substitutions = (
        ('skypilot:ssh_user', config['auth']['ssh_user']),
        ('skypilot:ssh_public_key_content', public_key),
    )
    for placeholder, value in substitutions:
        rendered = rendered.replace(placeholder, value)
    return yaml.safe_load(rendered)


def setup_aws_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
_, public_key_path = get_or_generate_keys()
with open(public_key_path, 'r') as f:
public_key = f.read()
# Use cloud init in UserData to set up the authorized_keys to get
# around the number of keys limit and permission issues with
# ec2.describe_key_pairs.
# Note that sudo and shell need to be specified to ensure setup works.
# Reference: https://cloudinit.readthedocs.io/en/latest/reference/modules.html#users-and-groups # pylint: disable=line-too-long
for node_type in config['available_node_types']:
config['available_node_types'][node_type]['node_config']['UserData'] = (
textwrap.dedent(f"""\
#cloud-config
users:
- name: {config['auth']['ssh_user']}
shell: /bin/bash
sudo: ALL=(ALL) NOPASSWD:ALL
ssh-authorized-keys:
- {public_key}
"""))
config = _replace_cloud_init_ssh_info_in_config(config, public_key)
return config


Expand Down
5 changes: 5 additions & 0 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,13 @@
# it's possible to failover to 1b, which leaves a leaked instance in 1a. Here,
# we use the new yaml's zone field, which is guaranteed to be the existing zone
# '1a'.
# - UserData: The UserData field of the old yaml may be outdated, and we want to
# use the new yaml's UserData field, which contains the authorized key setup as
# well as the disabling of the auto-update with apt-get.
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
_RAY_YAML_KEYS_TO_RESTORE_EXCEPTIONS = [
    # Zone: use the new yaml's zone field, which is the existing zone.
    ('provider', 'availability_zone'),
    # UserData (head and worker node types): the old yaml's value may be
    # outdated, so use the new yaml's UserData instead.
    ('available_node_types', 'ray.head.default', 'node_config', 'UserData'),
    ('available_node_types', 'ray.worker.default', 'node_config', 'UserData'),
]


Expand Down
44 changes: 33 additions & 11 deletions sky/templates/aws-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,26 @@ available_node_types:
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
{% endif %}
# Use cloud init in UserData to set up the authorized_keys to get
# around the number of keys limit and permission issues with
# ec2.describe_key_pairs.
# Note that sudo and shell need to be specified to ensure setup works.
# Reference: https://cloudinit.readthedocs.io/en/latest/reference/modules.html#users-and-groups
# The bootcmd disables automatic APT updates, to avoid hitting the apt
# lock when the user calls `apt install` on the node.
# Reference: https://unix.stackexchange.com/a/471192
UserData: |
#cloud-config
users:
- name: skypilot:ssh_user
shell: /bin/bash
sudo: ALL=(ALL) NOPASSWD:ALL
ssh_authorized_keys:
- skypilot:ssh_public_key_content
bootcmd:
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
- echo 'APT::Periodic::Enable "0";' > /etc/apt/apt.conf.d/10cloudinit-disable
- apt-get -y purge update-notifier-common ubuntu-release-upgrader-core landscape-common unattended-upgrades
- echo "Removed APT" | systemd-cat
TagSpecifications:
- ResourceType: instance
Tags:
Expand Down Expand Up @@ -88,6 +108,18 @@ available_node_types:
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
{% endif %}
UserData: |
#cloud-config
users:
- name: skypilot:ssh_user
shell: /bin/bash
sudo: ALL=(ALL) NOPASSWD:ALL
ssh_authorized_keys:
- skypilot:ssh_public_key_content
bootcmd:
- echo 'APT::Periodic::Enable "0";' > /etc/apt/apt.conf.d/10cloudinit-disable
- apt-get -y purge update-notifier-common ubuntu-release-upgrader-core landscape-common unattended-upgrades
- echo "Removed APT" | systemd-cat
TagSpecifications:
- ResourceType: instance
Tags:
Expand Down Expand Up @@ -127,17 +159,7 @@ setup_commands:
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
# Line 'mkdir -p ..': disable host key check
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
- function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
sudo systemctl stop unattended-upgrades || true;
sudo systemctl disable unattended-upgrades || true;
sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
p=$(mylsof "/var/lib/dpkg/lock-frontend"); echo "$p";
sudo kill -9 `echo "$p" | tail -n 1` || true;
sudo rm /var/lib/dpkg/lock-frontend;
sudo pkill -9 dpkg;
sudo pkill -9 apt-get;
sudo dpkg --configure --force-overwrite -a;
mkdir -p ~/.ssh; touch ~/.ssh/config;
- mkdir -p ~/.ssh; touch ~/.ssh/config;
pip3 --version > /dev/null 2>&1 || (curl -sSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3 get-pip.py && echo "PATH=$HOME/.local/bin:$PATH" >> ~/.bashrc);
(type -a python | grep -q python3) || echo 'alias python=python3' >> ~/.bashrc;
(type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc;
Expand Down
2 changes: 1 addition & 1 deletion tests/test_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -1375,7 +1375,7 @@ def test_spot_failed_setup(generic_cloud: str):
'spot-failed-setup',
[
f'sky spot launch -n {name} --cloud {generic_cloud} -y -d tests/test_yamls/failed_setup.yaml',
'sleep 300',
'sleep 330',
# Make sure the job failed quickly.
f'{_SPOT_QUEUE_WAIT} | grep {name} | head -n1 | grep "FAILED_SETUP"',
],
Expand Down