Skip to content

Commit

Permalink
Add scripts for building VM images
Browse files Browse the repository at this point in the history
  • Loading branch information
hcho3 committed Dec 9, 2024
1 parent 2d81c47 commit 7ec773a
Show file tree
Hide file tree
Showing 9 changed files with 403 additions and 0 deletions.
57 changes: 57 additions & 0 deletions vm_images/linux/bootstrap.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/bin/bash
# Provisioning script: installs Python, Docker, the NVIDIA Container Toolkit,
# the AWS CLI and jq/yq. Any failing command aborts the run.
set -euo pipefail

## Install Python3
# Distribution packages first, then Python tooling through pip.
python_apt_packages=(python3 python3-pip python3-venv)
python_pip_packages=('pip>=23' 'wheel>=0.42' pydistcheck)
sudo apt-get update
sudo apt-get install -y "${python_apt_packages[@]}"
# The pip packages go into the system-wide Python environment.
sudo pip3 install --break-system-packages "${python_pip_packages[@]}"

## Install Docker
# Fetch Docker's official GPG key into the apt keyring directory and make it
# world-readable.
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
sudo chmod a+r /etc/apt/keyrings/docker.asc
# Write the apt source entry; the architecture and release codename are
# resolved on the machine being provisioned.
sudo tee /etc/apt/sources.list.d/docker.list > /dev/null <<EOF
deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable
EOF
docker_packages=(
  docker-ce
  docker-ce-cli
  containerd.io
  docker-buildx-plugin
  docker-compose-plugin
)
sudo apt-get update
sudo apt-get install -y "${docker_packages[@]}"
# Let the 'ubuntu' user talk to Docker without sudo
sudo usermod -aG docker ubuntu

# Start the Docker daemon now (if not already running) and enable it at boot
# (if not already enabled).
if ! sudo systemctl is-active --quiet docker.service; then
  sudo systemctl start docker.service
fi
if ! sudo systemctl is-enabled --quiet docker.service; then
  sudo systemctl enable docker.service
fi
# The daemon takes time to come up after installation; wait before probing it.
sleep 10
sudo docker info

## Install NVIDIA Container Toolkit
# Download the repository signing key and store it de-armored for apt.
# The key download and the list download are separate statements on purpose:
# in an `a && b` list, `set -e` does not abort when `a` fails, so a failed key
# download would otherwise be skipped over without stopping the script.
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | \
  sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg
# Register the apt source, pinning every `deb` entry to the key above.
# -f makes curl fail on an HTTP error instead of piping an error page into
# the list file.
curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
  sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
  sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sudo apt-get update
sudo apt-get install -y nvidia-container-toolkit
# Configure Docker to use the NVIDIA runtime, then restart the daemon.
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker

# Wait after the restart before using the daemon.
sleep 10
# Smoke test: nvidia-smi must run inside a container with all GPUs attached.
sudo docker run --rm --gpus all ubuntu nvidia-smi
sudo systemctl stop docker

## Install AWS CLI v2
# Download the installer archive into the current directory, run the bundled
# installer, then remove both the archive and the unpacked tree.
awscli_archive=awscliv2.zip
wget -nv https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip -O "${awscli_archive}"
unzip -q "${awscli_archive}"
sudo ./aws/install
rm -rf ./aws/ "./${awscli_archive}"

## Install jq and yq
# apt-get with -y: the build is unattended, so the install must never wait
# for a confirmation prompt. Update and install are separate statements so
# that `set -e` aborts if either one fails.
sudo apt-get update
sudo apt-get install -y jq
# Unpack the yq release in a scratch directory and install only the binary.
mkdir yq/
pushd yq/
# Download/extract and the move are separate statements: in an `a && b` list,
# `set -e` ignores a failing `a`, so a broken download would otherwise leave
# the script exiting successfully without yq installed.
wget -nv https://github.com/mikefarah/yq/releases/download/v4.44.3/yq_linux_amd64.tar.gz -O - | \
  tar xz
sudo mv ./yq_linux_amd64 /usr/bin/yq
popd
rm -rf yq/
14 changes: 14 additions & 0 deletions vm_images/linux/install_drivers.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
# Provisioning script: installs basic build tools, the CUDA toolkit and the
# NVIDIA driver packages. Any failing command aborts the run.
set -euo pipefail

## Install basic tools
# Switch debconf to its non-interactive frontend before installing packages.
sudo debconf-set-selections <<< 'debconf debconf/frontend select Noninteractive'
base_tools=(cmake git build-essential wget ca-certificates curl unzip)
sudo apt-get update
sudo apt-get install -y "${base_tools[@]}"

## Install CUDA Toolkit 12.6 together with the 565 driver packages
# The cuda-keyring package is downloaded from NVIDIA's CUDA apt repository
# for Ubuntu 24.04 and installed before the CUDA packages are requested.
cuda_keyring_deb=cuda-keyring_1.1-1_all.deb
wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/${cuda_keyring_deb}"
sudo dpkg -i "${cuda_keyring_deb}"
sudo apt-get update
sudo apt-get -y install cuda-toolkit-12-6 cuda-drivers-565
rm "${cuda_keyring_deb}"
79 changes: 79 additions & 0 deletions vm_images/linux/linux.pkr.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Packer settings: this template needs the Amazon plugin (any 1.x release).
packer {
  required_plugins {
    amazon = {
      version = "~> 1"
      source  = "github.com/hashicorp/amazon"
    }
  }
}

# Values shared by the source and build definitions.
locals {
  # Naming
  ami_name_prefix = "xgboost-ci"
  image_name      = "RunsOn worker with Ubuntu 24.04 + CUDA driver"

  # Build time with every '-', ' ', 'T', 'Z' and ':' removed, leaving digits only
  timestamp = regex_replace(timestamp(), "[- TZ:]", "")

  # Placement and disk
  region      = "us-west-2"
  volume_size = 40
}

# Base image lookup: the most recent Amazon-owned AMI whose name matches the
# Ubuntu 24.04 (noble) amd64 server pattern, EBS-backed and HVM-virtualized.
data "amazon-ami" "aws-ubuntu-x64" {
  owners      = ["amazon"]
  most_recent = true

  filters = {
    name                = "ubuntu/images/hvm-ssd-gp3/ubuntu-noble-24.04-amd64-server-*"
    root-device-type    = "ebs"
    virtualization-type = "hvm"
  }
}

# EBS-backed AMI builder: launches a g4dn.xlarge instance from the Ubuntu base
# image, connects over SSH as 'ubuntu', and registers the result as a
# timestamped AMI in the configured region.
source "amazon-ebs" "runs-on-linux" {
  # Resulting image
  ami_name                = "${local.ami_name_prefix}-runs-on-linux-${local.timestamp}"
  ami_description         = local.image_name
  ami_regions             = [local.region]
  ami_virtualization_type = "hvm"

  # Build instance
  source_ami                  = data.amazon-ami.aws-ubuntu-x64.id
  region                      = local.region
  instance_type               = "g4dn.xlarge"
  associate_public_ip_address = true
  user_data_file              = "setup_ssh.sh"

  # Connection
  communicator             = "ssh"
  ssh_username             = "ubuntu"
  ssh_timeout              = "10m"
  ssh_file_transfer_method = "sftp"

  # Root volume of the build instance
  launch_block_device_mappings {
    device_name           = "/dev/sda1"
    volume_type           = "gp3"
    volume_size           = local.volume_size
    delete_on_termination = true
  }

  # Wait up to 1 hour until the AMI is ready (240 attempts x 15 seconds)
  aws_polling {
    delay_seconds = 15
    max_attempts  = 240
  }

  # The AMI and its snapshot carry the same tags
  tags = {
    Name      = local.image_name
    BuildTime = local.timestamp
  }
  snapshot_tags = {
    Name      = local.image_name
    BuildTime = local.timestamp
  }
}

# Build pipeline: install drivers, reboot, then run the bootstrap script.
build {
  sources = ["source.amazon-ebs.runs-on-linux"]

  # Step 1: basic tools, CUDA toolkit and driver; pause 30s afterwards.
  provisioner "shell" {
    script      = "install_drivers.sh"
    pause_after = "30s"
  }

  # Step 2: reboot the VM; the dropped connection is expected.
  provisioner "shell" {
    inline = [
      "echo 'Reboot VM'",
      "sudo reboot",
    ]
    expect_disconnect = true
  }

  # Step 3: wait a minute after the reboot, then install the remaining tooling.
  provisioner "shell" {
    script       = "bootstrap.sh"
    pause_before = "1m0s"
  }
}
2 changes: 2 additions & 0 deletions vm_images/linux/setup_ssh.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
# Starts the 'ssh' systemd unit.
# NOTE(review): linux.pkr.hcl passes a file of this name as user_data_file and
# connects with the SSH communicator, so this presumably runs at first boot to
# make the instance reachable by Packer -- confirm.
systemctl start ssh
75 changes: 75 additions & 0 deletions vm_images/windows/bootstrap.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
## Install packages from Chocolatey
##
## Native commands (choco, conda) report failure through their exit code,
## so each one is followed by an explicit $LASTEXITCODE check that aborts
## the script.

# jq & yq
Write-Output "Installing jq and yq..."
choco install jq --version=1.7.1
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
choco install yq --version=4.40.2
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }

# AWS CLI
Write-Output "Installing AWS CLI..."
choco install awscli --version=2.18.11
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }

# Git
Write-Host '>>> Installing Git...'
choco install git --version=2.47.0
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }

# CMake
Write-Host '>>> Installing CMake 3.30.5...'
choco install cmake --version 3.30.5 --installargs "ADD_CMAKE_TO_PATH=System"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }

# Notepad++
Write-Host '>>> Installing Notepad++...'
choco install notepadplusplus
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }

# Miniforge3
Write-Host '>>> Installing Miniforge3...'
choco install miniforge3 --params="'/InstallationType:AllUsers /RegisterPython:1 /D:C:\tools\miniforge3'"
# Stop here if the installer failed, before calling the conda.exe it provides
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
C:\tools\miniforge3\Scripts\conda.exe init --user --system
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Dot-source the all-users PowerShell profile so that `conda` resolves in this session
# NOTE(review): presumably `conda init --system` is what writes this profile -- confirm
. "C:\Windows\System32\WindowsPowerShell\v1.0\profile.ps1"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
conda config --set auto_activate_base false
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
conda install -n base 'pip>=23' 'wheel>=0.42' pydistcheck
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }

# Java 11
Write-Host '>>> Installing Java 11...'
choco install openjdk11
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }

# Maven
Write-Host '>>> Installing Maven...'
choco install maven
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }

# GraphViz
Write-Host '>>> Installing GraphViz...'
choco install graphviz
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }

# Visual Studio 2022 Community
Write-Host '>>> Installing Visual Studio 2022 Community...'
choco install visualstudio2022community `
    --params "--wait --passive --norestart"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Native desktop workload, including its optional components
choco install visualstudio2022-workload-nativedesktop --params `
    "--wait --passive --norestart --includeOptional"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }

# CUDA 12.5
Write-Host '>>> Installing CUDA 12.5...'
choco install cuda --version=12.5.1.555
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }

# R 4.3
Write-Host '>>> Installing R...'
choco install r.project --version=4.3.2
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
choco install rtools --version=4.3.5550
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
14 changes: 14 additions & 0 deletions vm_images/windows/install_choco.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
## Adapted from https://github.com/chorrell/packer-aws-windows-openssh/blob/20c40aa60b54469b3d85650a2e2e45e35ed83bc7/files/InstallChoco.ps1
## Author: Christopher Horrell (https://github.com/chorrell)

# Abort on the first error
$ErrorActionPreference = "Stop"

# Install Chocolatey
# See https://chocolatey.org/install#individual
Set-ExecutionPolicy Bypass -Scope Process -Force

# Add protocol flag 3072 (TLS 1.2) to the protocols already enabled
$tls12 = 3072
[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor $tls12

# Download the official installer script and run it in this session
$installerUrl = "https://community.chocolatey.org/install.ps1"
$webClient = New-Object System.Net.WebClient
$installerScript = $webClient.DownloadString($installerUrl)
Invoke-Expression $installerScript

# Globally auto-confirm every action, so later `choco install` calls do not prompt
# See: https://docs.chocolatey.org/en-us/faqs#why-do-i-have-to-confirm-packages-now-is-there-a-way-to-remove-this
choco feature enable -n allowGlobalConfirmation
58 changes: 58 additions & 0 deletions vm_images/windows/setup_ssh.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
<powershell>
## Adapted from https://github.com/chorrell/packer-aws-windows-openssh/blob/20c40aa60b54469b3d85650a2e2e45e35ed83bc7/files/SetupSsh.ps1
## Author: Christopher Horrell (https://github.com/chorrell)
##
## Installs and starts OpenSSH (agent and server), opens port 22, makes
## PowerShell the default SSH shell, and registers a startup task that
## downloads the instance's public key into administrators_authorized_keys.

# Don't display progress bars
# See: https://learn.microsoft.com/en-us/powershell/module/microsoft.powershell.core/about/about_preference_variables?view=powershell-7.3#progresspreference
$ProgressPreference = "SilentlyContinue"
$ErrorActionPreference = "Stop"

# Install OpenSSH using Add-WindowsCapability
# See: https://learn.microsoft.com/en-us/windows-server/administration/openssh/openssh_install_firstuse?tabs=powershell#install-openssh-for-windows

Write-Host "Installing and starting ssh-agent"
Add-WindowsCapability -Online -Name OpenSSH.Client~~~~0.0.1.0
Set-Service -Name ssh-agent -StartupType Automatic
Start-Service ssh-agent

Write-Host "Installing and starting sshd"
Add-WindowsCapability -Online -Name OpenSSH.Server~~~~0.0.1.0
Set-Service -Name sshd -StartupType Automatic
Start-Service sshd

# Confirm the firewall rule is configured. It should be created automatically by setup; create it if it is missing.
if (!(Get-NetFirewallRule -Name "OpenSSH-Server-In-TCP" -ErrorAction SilentlyContinue | Select-Object Name, Enabled)) {
    Write-Output "Firewall Rule 'OpenSSH-Server-In-TCP' does not exist, creating it..."
    New-NetFirewallRule -Name "OpenSSH-Server-In-TCP" -DisplayName "OpenSSH Server (sshd)" -Enabled True -Direction Inbound -Protocol TCP -Action Allow -LocalPort 22
} else {
    Write-Output "Firewall rule 'OpenSSH-Server-In-TCP' has been created and exists."
}

# Set default shell to PowerShell
New-ItemProperty -Path "HKLM:\SOFTWARE\OpenSSH" -Name DefaultShell -Value "C:\Windows\System32\WindowsPowerShell\v1.0\powershell.exe" -PropertyType String -Force

$keyDownloadScript = Join-Path $env:ProgramData "ssh\download-key.ps1"

# Write the key download script to disk. The here-string is single-quoted, so
# nothing inside it is expanded now; it is evaluated when the script runs.
@'
# Download public key to $env:ProgramData\ssh\administrators_authorized_keys
$openSSHAuthorizedKeys = Join-Path $env:ProgramData "ssh\administrators_authorized_keys"
# Request an IMDSv2 session token first, so the download also works on
# instances that require IMDSv2 (token-less requests are rejected there)
$tokenUrl = "http://169.254.169.254/latest/api/token"
$token = Invoke-RestMethod -Method Put -Uri $tokenUrl -Headers @{ "X-aws-ec2-metadata-token-ttl-seconds" = "60" }
$keyUrl = "http://169.254.169.254/latest/meta-data/public-keys/0/openssh-key"
Invoke-WebRequest $keyUrl -Headers @{ "X-aws-ec2-metadata-token" = $token } -OutFile $openSSHAuthorizedKeys
# Ensure ACL for administrators_authorized_keys is correct
# See https://learn.microsoft.com/en-us/windows-server/administration/openssh/openssh_server_configuration#authorizedkeysfile
icacls.exe $openSSHAuthorizedKeys /inheritance:r /grant "Administrators:F" /grant "SYSTEM:F"
'@ | Out-File $keyDownloadScript

# Create a scheduled task that runs the download script as SYSTEM at every startup
$taskName = "DownloadKey"
$principal = New-ScheduledTaskPrincipal -UserID "NT AUTHORITY\SYSTEM" -LogonType ServiceAccount -RunLevel Highest
$action = New-ScheduledTaskAction -Execute "Powershell.exe" -Argument "-NoProfile -File ""$keyDownloadScript"""
$trigger = New-ScheduledTaskTrigger -AtStartup
Register-ScheduledTask -Action $action -Trigger $trigger -Principal $principal -TaskName $taskName -Description $taskName

# Fetch the key once right now via $keyDownloadScript
& Powershell.exe -ExecutionPolicy Bypass -File $keyDownloadScript

</powershell>
14 changes: 14 additions & 0 deletions vm_images/windows/sysprep.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
## Adapted from https://github.com/chorrell/packer-aws-windows-openssh/blob/20c40aa60b54469b3d85650a2e2e45e35ed83bc7/files/PrepareImage.ps1
## Author: Christopher Horrell (https://github.com/chorrell)

# Abort on the first error
$ErrorActionPreference = "Stop"

# Delete the authorized_keys file
Write-Output "Cleaning up keys"
$authorizedKeysPath = Join-Path -Path $env:ProgramData -ChildPath "ssh\administrators_authorized_keys"
Remove-Item -Path $authorizedKeysPath -Recurse -Force

# Make sure the "DownloadKey" scheduled task is enabled
Enable-ScheduledTask -TaskName "DownloadKey"

# Run EC2Launch's sysprep command
Write-Output "Running Sysprep"
$ec2LaunchExe = "$Env:Programfiles\Amazon\EC2Launch\ec2launch.exe"
& $ec2LaunchExe sysprep
Loading

0 comments on commit 7ec773a

Please sign in to comment.