Skip to content

Fix issues when using GenericCloud image #532

Fix issues when using GenericCloud image

Fix issues when using GenericCloud image #532

Workflow file for this run

name: Test deployment and reimage on OpenStack
on:
workflow_dispatch:
push:
branches:
- main
pull_request:
jobs:
openstack:
name: openstack-ci
concurrency: ${{ github.ref }} # to branch/PR
runs-on: ubuntu-20.04
env:
ANSIBLE_FORCE_COLOR: True
OS_CLOUD: openstack
TF_VAR_cluster_name: ci${{ github.run_id }}
CI_CLOUD: ${{ vars.CI_CLOUD }}
steps:
- uses: actions/checkout@v2
- name: Record which cloud CI is running on
run: |
echo CI_CLOUD: ${{ vars.CI_CLOUD }}
- name: Setup ssh
run: |
set -x
mkdir ~/.ssh
echo "${{ secrets[format('{0}_SSH_KEY', vars.CI_CLOUD)] }}" > ~/.ssh/id_rsa
chmod 0600 ~/.ssh/id_rsa
shell: bash
- name: Add bastion's ssh key to known_hosts
run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts
shell: bash
- name: Install ansible etc
run: dev/setup-env.sh
- name: Install terraform
uses: hashicorp/setup-terraform@v1
with:
terraform: v1.5.5
- name: Initialise terraform
run: terraform init
working-directory: ${{ github.workspace }}/environments/.stackhpc/terraform
- name: Write clouds.yaml
run: |
mkdir -p ~/.config/openstack/
echo "${{ secrets[format('{0}_CLOUDS_YAML', vars.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml
shell: bash
- name: Setup environment-specific inventory/terraform inputs
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook ansible/adhoc/generate-passwords.yml
echo vault_testuser_password: "$TESTUSER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml
env:
TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
- name: Provision nodes using fat image
id: provision_servers
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
terraform apply -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"
- name: Delete infrastructure if provisioning failed
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"
if: failure() && steps.provision_servers.outcome == 'failure'
- name: Configure cluster
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible all -m wait_for_connection
ansible-playbook -v ansible/site.yml
ansible-playbook -v ansible/ci/check_slurm.yml
- name: Run MPI-based tests
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -vv ansible/adhoc/hpctests.yml
# - name: Run EESSI tests
# run: |
# . venv/bin/activate
# . environments/.stackhpc/activate
# ansible-playbook -vv ansible/ci/check_eessi.yml
- name: Confirm Open Ondemand is up (via SOCKS proxy)
run: |
. venv/bin/activate
. environments/.stackhpc/activate
# load ansible variables into shell:
ansible-playbook ansible/ci/output_vars.yml \
-e output_vars_hosts=openondemand \
-e output_vars_path=$APPLIANCES_ENVIRONMENT_ROOT/vars.txt \
-e output_vars_items=bastion_ip,bastion_user,openondemand_servername
source $APPLIANCES_ENVIRONMENT_ROOT/vars.txt
# setup ssh proxying:
sudo apt-get --yes install proxychains
echo proxychains installed
ssh -v -fN -D 9050 ${bastion_user}@${bastion_ip}
echo port 9050 forwarded
# check OOD server returns 200:
statuscode=$(proxychains wget \
--quiet \
--spider \
--server-response \
--no-check-certificate \
--http-user=testuser \
--http-password=${TESTUSER_PASSWORD} https://${openondemand_servername} \
2>&1)
(echo $statuscode | grep "200 OK") || (echo $statuscode && exit 1)
env:
TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
# - name: Build environment-specific compute image
# id: packer_build
# run: |
# . venv/bin/activate
# . environments/.stackhpc/activate
# cd packer/
# packer init
# PACKER_LOG=1 packer build -except openstack.fatimage -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
# ../dev/output_manifest.py packer-manifest.json # Sets NEW_COMPUTE_IMAGE_ID outputs
# - name: Test reimage of compute nodes to new environment-specific image (via slurm)
# run: |
# . venv/bin/activate
# . environments/.stackhpc/activate
# ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]"
# ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
# ansible-playbook -v ansible/ci/check_slurm.yml
- name: Test reimage of login and control nodes (via rebuild adhoc)
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml
ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down
ansible-playbook -v ansible/site.yml
ansible-playbook -v ansible/ci/check_slurm.yml
- name: Check sacct state survived reimage
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml
- name: Check MPI-based tests are shown in Grafana
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook -vv ansible/ci/check_grafana.yml
- name: Delete infrastructure
run: |
. venv/bin/activate
. environments/.stackhpc/activate
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform
terraform destroy -auto-approve -var-file="${{ vars.CI_CLOUD }}.tfvars"
if: ${{ success() || cancelled() }}
# - name: Delete images
# run: |
# . venv/bin/activate
# . environments/.stackhpc/activate
# ansible-playbook -vv ansible/ci/delete_images.yml