diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml
index 947f9410f..e15937099 100644
--- a/.github/workflows/fatimage.yml
+++ b/.github/workflows/fatimage.yml
@@ -15,37 +15,26 @@ jobs:
   openstack:
     name: openstack-imagebuild
     concurrency:
-      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build
+      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.builds.label }} # to branch/PR + build label
       cancel-in-progress: true
     runs-on: ubuntu-22.04
     strategy:
       fail-fast: false # allow other matrix jobs to continue even if one fails
       matrix: # build RL8+OFED, RL9+OFED, RL9+OFED+CUDA versions
-        os_version:
-          - RL8
-          - RL9
-        build:
-          - openstack.openhpc
-          - openstack.openhpc-cuda
-        exclude:
-          - os_version: RL8
-            build: openstack.openhpc-cuda
+        builds:
+          - label: openhpc-RL8-ofed
+            source_image_name: RL8-ofed
+            inventory_groups: 'control,login,compute'
+          - label: openhpc-RL9-ofed
+            source_image_name: RL9-ofed
+            inventory_groups: 'control,login,compute'
+          - label: openhpc-RL9-cuda
+            source_image_name: RL9-cuda
+            inventory_groups: 'control,login,compute'
     env:
       ANSIBLE_FORCE_COLOR: True
       OS_CLOUD: openstack
       CI_CLOUD: ${{ github.event.inputs.ci_cloud }}
-      SOURCE_IMAGES_MAP: |
-        {
-          "RL8": {
-            "openstack.openhpc": "rocky-latest-RL8",
-            "openstack.openhpc-cuda": "rocky-latest-cuda-RL8"
-          },
-          "RL9": {
-            "openstack.openhpc": "rocky-latest-RL9",
-            "openstack.openhpc-cuda": "rocky-latest-cuda-RL9"
-          }
-        }
-
 
     steps:
       - uses: actions/checkout@v2
@@ -79,6 +68,20 @@ jobs:
           . venv/bin/activate
           . environments/.stackhpc/activate
 
+      - name: Select branch-specific or latest nightly image
+        id: select_source_image
+        run: |
+          . venv/bin/activate
+          . environments/.stackhpc/activate
+          BRANCH="$GITHUB_REF_NAME" # runner-provided env var: interpolating github.ref_name into the script would allow shell injection
+          BRANCH_VERSION=${BRANCH//\//-} # replace '/' with '-' using bash parameter expansion
+          NIGHTLY_IMAGE_ID=$( \
+            openstack image show -c id -f value ${{ matrix.builds.source_image_name }}-${BRANCH_VERSION} || \
+            openstack image show -c id -f value ${{ matrix.builds.source_image_name }}-latest \
+          )
+          echo selected source_image $NIGHTLY_IMAGE_ID: $(openstack image show -c name -f value $NIGHTLY_IMAGE_ID)
+          echo "source_image_id=$NIGHTLY_IMAGE_ID" >> "$GITHUB_OUTPUT"
+
       - name: Build fat image with packer
         id: packer_build
         run: |
@@ -88,15 +91,15 @@ jobs:
           cd packer/
           packer init .
 
-          PACKER_LOG=1 packer build \
+          packer build \
           -on-error=${{ vars.PACKER_ON_ERROR }} \
-          -only=${{ matrix.build }} \
           -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
-          -var "source_image_name=${{ env.SOURCE_IMAGE }}" \
+          -var source_image=${{ steps.select_source_image.outputs.source_image_id }} \
+          -var image_name=${{ matrix.builds.label }} \
+          -var inventory_groups=${{ matrix.builds.inventory_groups }} \
           openstack.pkr.hcl
         env:
-          PKR_VAR_os_version: ${{ matrix.os_version }}
-          SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version][matrix.build] }}
+          PACKER_LOG: '1'
 
       - name: Get created image names from manifest
         id: manifest
@@ -113,7 +116,7 @@ jobs:
       - name: Upload manifest artifact
         uses: actions/upload-artifact@v4
         with:
-          name: image-details-${{ matrix.build }}-${{ matrix.os_version }}
+          name: image-details-${{ matrix.builds.label }}
           path: |
             ./image-id.txt
             ./image-name.txt
diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml
index 607dabd2e..5bdaa9999 100644
--- a/.github/workflows/nightlybuild.yml
+++ b/.github/workflows/nightlybuild.yml
@@ -1,3 +1,5 @@
+# NB: When run on a non-main branch (via workflow_dispatch), image scanning and distribution to other clouds do not happen,
+# on the basis that in this case a fatimage must be built and will be scanned.
 name: Build nightly image
 on:
   workflow_dispatch:
@@ -14,34 +16,30 @@ on:
     - cron: '0 0 * * *' # Run at midnight
 
 jobs:
-  openstack:
-    name: openstack-imagebuild
+  build:
+    name: nightly-imagebuild
     concurrency:
-      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.build }} # to branch/PR + OS + build
+      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.builds.label }}
       cancel-in-progress: true
     runs-on: ubuntu-22.04
     strategy:
       fail-fast: false # allow other matrix jobs to continue even if one fails
-      matrix: # build RL8, RL9, RL9+CUDA versions
-        os_version:
-          - RL8
-          - RL9
-        build:
-          - openstack.rocky-latest
-          - openstack.rocky-latest-cuda
-        exclude:
-          - os_version: RL8
-            build: openstack.rocky-latest-cuda
-
+      matrix:
+        builds:
+          - label: RL8-ofed
+            source_image_name: Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2
+            inventory_groups: 'update,ofed'
+          - label: RL9-ofed
+            source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2
+            inventory_groups: 'update,ofed'
+          - label: RL9-cuda
+            source_image_name: Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2
+            inventory_groups: 'update,ofed,cuda'
     env:
       ANSIBLE_FORCE_COLOR: True
       OS_CLOUD: openstack
       CI_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }}
-      SOURCE_IMAGES_MAP: |
-        {
-          "RL8": "Rocky-8-GenericCloud-Base-8.9-20231119.0.x86_64.qcow2",
-          "RL9": "Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2"
-        }
+      IMAGE_VERSION: ${{ github.event_name == 'schedule' && 'latest' || github.ref_name }}
 
     steps:
       - uses: actions/checkout@v2
@@ -85,18 +83,19 @@ jobs:
           cd packer/
           packer init .
 
-          PACKER_LOG=1 packer build \
+          packer build \
           -on-error=${{ vars.PACKER_ON_ERROR }} \
-          -only=${{ matrix.build }} \
           -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
-          -var "source_image_name=${{ env.SOURCE_IMAGE }}" \
+          -var source_image_name=${{ matrix.builds.source_image_name }} \
+          -var image_name=${{ matrix.builds.label }} \
+          -var image_version=${{ env.IMAGE_VERSION }} \
+          -var inventory_groups=${{ matrix.builds.inventory_groups }} \
           openstack.pkr.hcl
 
         env:
-          PKR_VAR_os_version: ${{ matrix.os_version }}
-          SOURCE_IMAGE: ${{ fromJSON(env.SOURCE_IMAGES_MAP)[matrix.os_version] }}
+          PACKER_LOG: '1' # must sit under the step's env: key (kept as context above), matching fatimage.yml
 
-      - name: Get created image names from manifest
+      - name: Get image info and ensure it can be used for subsequent builds
         id: manifest
         run: |
           . venv/bin/activate
@@ -105,8 +104,10 @@ jobs:
             sleep 5
           done
           IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID)
+          echo image: ${IMAGE_NAME} ${IMAGE_ID}
           echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"
           echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT"
+          openstack image unset --property signature_verified $IMAGE_ID
 
       - name: Delete old latest image
         run: |
@@ -122,9 +123,10 @@ jobs:
 
   upload:
     name: upload-nightly-targets
-    needs: openstack
+    needs: build
+    if: github.ref_name == 'main'
     concurrency:
-      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }}-${{ matrix.image }}-${{ matrix.target_cloud }}
+      group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.builds.image }}-${{ matrix.target_cloud }} # this job's matrix only defines builds.image, not builds.label
       cancel-in-progress: true
     runs-on: ubuntu-22.04
     strategy:
@@ -134,21 +136,16 @@ jobs:
           - LEAFCLOUD
           - SMS
           - ARCUS
-        os_version:
-          - RL8
-          - RL9
-        image:
-          - rocky-latest
-          - rocky-latest-cuda
+        builds:
+          - image: RL8-ofed-latest
+          - image: RL9-ofed-latest
+          - image: RL9-cuda-latest
         exclude:
-          - os_version: RL8
-            image: rocky-latest-cuda
-          - target_cloud: LEAFCLOUD
+          - target_cloud: LEAFCLOUD # TODO: why hard-coded? Should this exclude the source cloud (vars.CI_CLOUD) instead?
     env:
       OS_CLOUD: openstack
       SOURCE_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }}
       TARGET_CLOUD: ${{ matrix.target_cloud }}
-      IMAGE_NAME: "${{ matrix.image }}-${{ matrix.os_version }}"
 
     steps:
       - uses: actions/checkout@v2
@@ -176,7 +173,7 @@ jobs:
         run: |
           . venv/bin/activate
           export OS_CLIENT_CONFIG_FILE=~/.config/openstack/source_clouds.yaml
-          openstack image save --file ${{ env.IMAGE_NAME }} ${{ env.IMAGE_NAME }}
+          openstack image save --file ${{ matrix.builds.image }} ${{ matrix.builds.image }}
         shell: bash
 
       - name: Upload to target cloud
@@ -184,8 +181,8 @@ jobs:
           . venv/bin/activate
           export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml
 
-          openstack image create "${{ env.IMAGE_NAME }}" \
-          --file "${{ env.IMAGE_NAME }}" \
+          openstack image create "${{ matrix.builds.image }}" \
+          --file "${{ matrix.builds.image }}" \
           --disk-format qcow2 \
         shell: bash
 
@@ -194,9 +191,9 @@ jobs:
           . venv/bin/activate
           export OS_CLIENT_CONFIG_FILE=~/.config/openstack/target_clouds.yaml
 
-          IMAGE_COUNT=$(openstack image list --name ${{ env.IMAGE_NAME }} -f value -c ID | wc -l)
+          IMAGE_COUNT=$(openstack image list --name ${{ matrix.builds.image }} -f value -c ID | wc -l)
           if [ "$IMAGE_COUNT" -gt 1 ]; then
-            OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ env.IMAGE_NAME }}" -f value -c ID | head -n 1)
+            OLD_IMAGE_ID=$(openstack image list --sort created_at:asc --name "${{ matrix.builds.image }}" -f value -c ID | head -n 1)
             openstack image delete "$OLD_IMAGE_ID"
           else
             echo "Only one image exists, skipping deletion."
diff --git a/ansible/.gitignore b/ansible/.gitignore
index f6f5c5f4d..c8296cc8c 100644
--- a/ansible/.gitignore
+++ b/ansible/.gitignore
@@ -60,3 +60,5 @@ roles/*
 !roles/tuned/**
 !roles/lustre/
 !roles/lustre/**
+!roles/builder/
+!roles/builder/**
diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml
index 7cad2dc59..0b0577289 100644
--- a/ansible/fatimage.yml
+++ b/ansible/fatimage.yml
@@ -207,8 +207,14 @@
   gather_facts: yes
   tags: finalise
   tasks:
-    - name: Cleanup image
-      import_tasks: cleanup.yml
+    - name: Carry out checks on image
+      import_role:
+        name: builder
+        tasks_from: checks.yml
+    - name: Finalise image
+      import_role:
+        name: builder
+        tasks_from: finalise.yml
 
     - name: Shutdown Packer VM
       community.general.shutdown:
diff --git a/ansible/roles/builder/defaults/main.yml b/ansible/roles/builder/defaults/main.yml
new file mode 100644
index 000000000..605761c04
--- /dev/null
+++ b/ansible/roles/builder/defaults/main.yml
@@ -0,0 +1 @@
+builder_delete_syslog: false
diff --git a/ansible/roles/builder/tasks/checks.yml b/ansible/roles/builder/tasks/checks.yml
new file mode 100644
index 000000000..4452c384c
--- /dev/null
+++ b/ansible/roles/builder/tasks/checks.yml
@@ -0,0 +1,29 @@
+- name: Check whether OFED is installed
+  command: ofed_info
+  changed_when: false
+  failed_when:
+    - _ofed_info.rc > 0
+    - "'No such file or directory' not in _ofed_info.msg"
+  register: _ofed_info
+
+- name: Get package facts
+  package_facts:
+
+- name: Check e.g. libfabric package hasn't downgraded OFED-installed packages
+  assert:
+    that: "'mlnx' in ansible_facts.packages[item].0.version"
+    fail_msg: "OFED is installed but package {{ item }} has a non-OFED version: {{ ansible_facts.packages[item].0.version }}"
+  when: "'MLNX_OFED_LINUX-' in _ofed_info.stdout"
+  loop: "{{ builder_ofed_check_packages }}"
+  vars:
+    builder_ofed_check_packages:
+      - ibacm
+      - infiniband-diags
+      - libibumad
+      - libibverbs
+      - libibverbs-utils
+      - librdmacm
+      - librdmacm-utils
+      - rdma-core-devel
+      - rdma-core # didn't actually see this one get downgraded
+
diff --git a/ansible/roles/builder/tasks/finalise.yml b/ansible/roles/builder/tasks/finalise.yml
new file mode 100644
index 000000000..5f46f1161
--- /dev/null
+++ b/ansible/roles/builder/tasks/finalise.yml
@@ -0,0 +1,72 @@
+# Finalise a Packer build VM. Shutting the VM down is left to the calling play (ansible/fatimage.yml does it).
+
+- meta: flush_handlers
+
+- name: Remove dnf caches
+  command: dnf clean all
+
+# If image build happens on a Neutron subnet with property dns_nameservers defined, then cloud-init
+# disables NetworkManager's control of /etc/resolv.conf and appends nameservers itself.
+# We don't want network configuration during instance boot to depend on the configuration
+# of the network the builder was on, so we reset these aspects.
+- name: Delete /etc/resolv.conf
+  file:
+    path: /etc/resolv.conf
+    state: absent
+  when: "'resolv_conf' not in group_names" # if it's been overridden, deleting it is the wrong thing to do
+
+- name: Reenable NetworkManager control of resolv.conf
+  # NB: This *doesn't* delete the 90-dns-none.conf file created by the resolv_conf role
+  # as if nameservers are explicitly being set by that role we don't want to allow NM
+  # to override it again.
+  file:
+    path: /etc/NetworkManager/conf.d/99-cloud-init.conf
+    state: absent
+
+- name: Get remote environment for ansible_user
+  setup:
+    gather_subset: env
+  become: false
+
+- name: Delete any injected ssh config for ansible_user
+  file:
+    path: "{{ ansible_env.HOME }}/.ssh/"
+    state: absent
+
+- name: Run cloud-init cleanup
+  command: cloud-init clean --logs --seed
+
+- name: Cleanup /tmp
+  shell: rm -rf /tmp/* # shell, not command: the glob is only expanded by a shell
+
+- name: Get package facts
+  package_facts:
+
+- name: Ensure image summary directory exists
+  file:
+    path: /var/lib/image/
+    state: directory
+    owner: root
+    group: root
+    mode: u=rwX,go=rX
+
+- name: Write image summary
+  copy:
+    content: "{{ image_info | to_nice_json }}"
+    dest: /var/lib/image/image.json
+  vars:
+    image_info:
+      branch: "{{ lookup('pipe', 'git rev-parse --abbrev-ref HEAD') }}"
+      build: "{{ ansible_nodename | split('.') | first }}" # hostname is image name, which contains build info
+      os: "{{ ansible_distribution }} {{ ansible_distribution_version }}"
+      kernel: "{{ ansible_kernel }}"
+      ofed: "{{ ansible_facts.packages['mlnx-ofa_kernel'].0.version | default('-') }}"
+      cuda: "{{ ansible_facts.packages['cuda'].0.version | default('-') }}"
+      slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}"
+      ondemand: "{{ ansible_facts.packages['ondemand'].0.version | default('-') }}"
+
+- name: Clear system logs
+  file:
+    path: /var/log/messages
+    state: absent
+  when: builder_delete_syslog | bool # bare expression: 'when' is already evaluated as Jinja
diff --git a/ansible/roles/ofed/defaults/main.yml b/ansible/roles/ofed/defaults/main.yml
index 0d040b55e..46902ded9 100644
--- a/ansible/roles/ofed/defaults/main.yml
+++ b/ansible/roles/ofed/defaults/main.yml
@@ -1,4 +1,4 @@
-ofed_version: '23.10-3.2.2.0' # LTS
+ofed_version: '24.07-0.6.1.0'
 ofed_download_url: https://content.mellanox.com/ofed/MLNX_OFED-{{ ofed_version }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}.tgz
 ofed_distro: rhel # NB: not expected to work on other distros due to installation differences
 ofed_distro_version: "{{ ansible_distribution_version }}" # e.g. '8.9'
diff --git a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl
index 5adf4199c..06bc27ebf 100644
--- a/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl
+++ b/environments/.stackhpc/LEAFCLOUD.pkrvars.hcl
@@ -5,3 +5,4 @@ ssh_keypair_name = "slurm-app-ci"
 ssh_private_key_file = "~/.ssh/id_rsa"
 security_groups = ["default", "SSH"]
 floating_ip_network = "external"
+image_name_suffix = "-ofed24"
diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
index 9f396e964..9e5b2fdfb 100644
--- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
+++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json
@@ -1,7 +1,7 @@
 {
     "cluster_image": {
-        "RL8": "openhpc-RL8-241024-1439-177083b1",
-        "RL9": "openhpc-RL9-241024-1438-177083b1",
-        "RL9-cuda": "openhpc-cuda-RL9-241024-1628-177083b1"
+        "RL8": "openhpc-RL8-241101-1318-986306fe-ofed24",
+        "RL9": "openhpc-RL9-241101-1318-986306fe-ofed24",
+        "RL9-cuda": "openhpc-cuda-RL9-241101-1319-986306fe-ofed24"
     }
 }
\ No newline at end of file
diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl
index fae0bf7b2..d3a0e24e0 100644
--- a/packer/openstack.pkr.hcl
+++ b/packer/openstack.pkr.hcl
@@ -23,6 +23,7 @@ data "git-commit" "cwd-head" { }
 locals {
     git_commit = data.git-commit.cwd-head.hash
     timestamp = formatdate("YYMMDD-hhmm", timestamp())
+    image_version = var.image_version == "auto" ? "${local.timestamp}-${substr(local.git_commit, 0, 8)}" : var.image_version
 }
 
 # Path pointing to root of repository - automatically set by environment variable PKR_VAR_repo_root
@@ -39,12 +40,6 @@ variable "networks" {
   type = list(string)
 }
 
-variable "os_version" {
-  type = string
-  description = "'RL8' or 'RL9' with default source_image_* mappings"
-  default = "RL9"
-}
-
 # Must supply either source_image_name or source_image_id
 variable "source_image_name" {
   type = string
@@ -123,19 +118,13 @@ variable "volume_type" {
 }
 
 variable "volume_size" {
-  type = map(number)
-  default = {
-    # fat image builds, GB:
-    rocky-latest = 15
-    rocky-latest-cuda = 30
-    openhpc = 15
-    openhpc-cuda = 30
-  }
+  type = number
+  default = 15 # same as default non-CUDA build
 }
 
-variable "extra_build_volume_size" {
+variable "volume_size_cuda" {
   type = number
-  default = 15 # same as default non-CUDA build
+  default = 30
 }
 
 variable "image_disk_format" {
@@ -148,27 +137,22 @@ variable "metadata" {
   default = {}
 }
 
-variable "groups" {
-  type = map(list(string))
-  description = "Additional inventory groups (other than 'builder') to add build VM to, keyed by source name"
-  default = {
-    # fat image builds:
-    rocky-latest = ["update", "ofed"]
-    rocky-latest-cuda = ["update", "ofed", "cuda"]
-    openhpc = ["control", "compute", "login"]
-    openhpc-cuda = ["control", "compute", "login"]
-  }
+variable "inventory_groups" {
+  type = string
+  description = "comma-separated list of inventory groups, in addition to 'builder'"
+  default = ""
 }
 
-variable "extra_build_groups" {
-  type = list(string)
-  default = []
+variable "image_name" {
+  type = string
+  description = "Built image name."
+  default = "openhpc"
 }
 
-variable "extra_build_image_name" {
+variable "image_version" {
   type = string
-  description = "Infix for 'extra' build image name"
-  default = "extra"
+  description = "Suffix for built image names. Default special value 'auto' uses a timestamp+git commit"
+  default = "auto"
 }
 
 source "openstack" "openhpc" {
@@ -176,7 +160,7 @@ source "openstack" "openhpc" {
   flavor = var.flavor
   use_blockstorage_volume = var.use_blockstorage_volume
   volume_type = var.volume_type
-  volume_size = lookup(var.volume_size, source.name, var.extra_build_volume_size)
+  volume_size = contains(split(",", var.inventory_groups), "cuda") ? var.volume_size_cuda : var.volume_size
   metadata = var.metadata
   instance_metadata = {ansible_init_disable = "true"}
   networks = var.networks
@@ -204,39 +188,14 @@ source "openstack" "openhpc" {
 
 build {
 
-  # latest nightly image:
   source "source.openstack.openhpc" {
     name = "rocky-latest"
-    image_name = "${source.name}-${var.os_version}"
-  }
-
-  # latest nightly cuda image:
-  source "source.openstack.openhpc" {
-    name = "rocky-latest-cuda"
-    image_name = "${source.name}-${var.os_version}"
-  }
-
-  # OFED fat image:
-  source "source.openstack.openhpc" {
-    name = "openhpc"
-    image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}"
-  }
-
-  # CUDA fat image:
-  source "source.openstack.openhpc" {
-    name = "openhpc-cuda"
-    image_name = "${source.name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}"
-  }
-
-  # Extended site-specific image, built on fat image:
-  source "source.openstack.openhpc" {
-    name = "openhpc-extra"
-    image_name = "openhpc-${var.extra_build_image_name}-${var.os_version}-${local.timestamp}-${substr(local.git_commit, 0, 8)}"
+    image_name = replace(join("-", [var.image_name, local.image_version]), "/", "-")
   }
 
   provisioner "ansible" {
     playbook_file = "${var.repo_root}/ansible/fatimage.yml"
-    groups = concat(["builder"], lookup(var.groups, source.name, var.extra_build_groups))
+    groups = concat(["builder"], compact(split(",", var.inventory_groups))) # compact() drops the "" that split() returns for the empty default
     keep_inventory_file = true # for debugging
     use_proxy = false # see https://www.packer.io/docs/provisioners/ansible#troubleshooting
     extra_arguments = [