From 70fcff66598282318e4259fa50d6253c233a150f Mon Sep 17 00:00:00 2001 From: Chris Scott Date: Wed, 24 Apr 2024 13:37:02 +1200 Subject: [PATCH 01/10] add pre_pull option to ood_apps --- .gitignore | 1 - deployment-checklist.md | 19 +++++++++++++++++++ .../kuberenetes/tasks/pod-pre-puller.yml | 4 +++- terraform/terraform.tfvars | 10 ++++++++++ vars/ondemand-config.yml.example | 8 +++++++- 5 files changed, 39 insertions(+), 3 deletions(-) create mode 100644 deployment-checklist.md create mode 100644 terraform/terraform.tfvars diff --git a/.gitignore b/.gitignore index 88c1d17..f7805dd 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,6 @@ host.ini terraform/terraform.tfstate terraform/terraform.tfstate.d/* terraform/terraform.tfstate.backup -terraform/terraform.tfvars env.sh .terraform .terraform.lock.hcl diff --git a/deployment-checklist.md b/deployment-checklist.md new file mode 100644 index 0000000..6629548 --- /dev/null +++ b/deployment-checklist.md @@ -0,0 +1,19 @@ +# Deployment checklist + +In *vars/ondemand-config.yml.example*: + +- adjust `num_users_create` and `num_trainers_create` +- adjust `ood_apps` + - check `version` and `k8s_container` + - enable required apps + - set which images to pre-pull +- set `enable_pod_pre_pull` if desired (should probably default to on) +- set `control_plane_flavor`, usually to `balanced1.4cpu8ram` for production +- set `cluster_worker_count` and `worker_flavor` to have enough capacity for the number of users + +In *terraform/terraform.tfvars*: + +- adjust `services_flavor_id` (usually use the id for *8cpu16ram* for production) +- adjust `services_volume_size`, must be big enough for all the user home directories +- adjust `webnode_flavor_id` (the id for *8cpu16ram* works well for up to 30-40 users, not tested past that) +- adjust `webnode_volume_size`, usually leave at 30 GB diff --git a/roles/openondemand-k8s/kuberenetes/tasks/pod-pre-puller.yml b/roles/openondemand-k8s/kuberenetes/tasks/pod-pre-puller.yml index
dbb66ff..77c120e 100644 --- a/roles/openondemand-k8s/kuberenetes/tasks/pod-pre-puller.yml +++ b/roles/openondemand-k8s/kuberenetes/tasks/pod-pre-puller.yml @@ -4,7 +4,9 @@ ood_enabled_apps: "{{ ood_enabled_apps + [{ 'name': item.key, 'container_url': item.value.k8s_container }] }}" with_items: "{{ ood_apps | dict2items }}" - when: item.value.enabled == true + when: + - item.value.enabled == true + - item.value.pre_pull | default(false) | bool vars: ood_enabled_apps: [] diff --git a/terraform/terraform.tfvars b/terraform/terraform.tfvars new file mode 100644 index 0000000..27e700c --- /dev/null +++ b/terraform/terraform.tfvars @@ -0,0 +1,10 @@ +# flavours: +# - 4cpu8ram: e07cfee1-43af-4bf6-baac-3bdf7c1b88f8 +# - 8cpu16ram: 2d02e6a4-3937-4ed3-951a-8e27867ff53e +# - 16cpu32ram: 674fa81a-69c7-4bf7-b3a9-59989fb63618 + +services_flavor_id = "e07cfee1-43af-4bf6-baac-3bdf7c1b88f8" +services_volume_size = 30 + +webnode_flavor_id = "e07cfee1-43af-4bf6-baac-3bdf7c1b88f8" +webnode_volume_size = 30 diff --git a/vars/ondemand-config.yml.example b/vars/ondemand-config.yml.example index a9cd773..192908f 100644 --- a/vars/ondemand-config.yml.example +++ b/vars/ondemand-config.yml.example @@ -84,36 +84,42 @@ ood_apps: repo: https://github.com/nesi/training-environment-jupyter-ml101-app.git version: 'v0.2.2' enabled: true + pre_pull: false jupyter_ml102: k8s_container: ghcr.io/nesi/training-environment-jupyter-ml102-app:v0.2.0 repo: https://github.com/nesi/training-environment-jupyter-ml102-app.git version: 'v0.2.0' enabled: true + pre_pull: false rstudio_rnaseq: k8s_container: ghcr.io/nesi/training-environment-rstudio-rnaseq-app:v0.2.2 repo: https://github.com/nesi/training-environment-rstudio-rnaseq-app.git version: 'v0.2.2' enabled: true + pre_pull: false rstudio: k8s_container: ghcr.io/nesi/training-environment-rstudio-app:v0.3.0 repo: https://github.com/nesi/training-environment-rstudio-app.git version: 'v0.3.0' enabled: true + pre_pull: false shell4b: k8s_container:
ghcr.io/nesi/training-environment-jupyter-intermediate-shell-app:v0.3.3 repo: https://github.com/nesi/training-environment-jupyter-intermediate-shell-app.git version: 'v0.3.3' enabled: true + pre_pull: false containers: k8s_container: ghcr.io/nesi/training-environment-jupyter-containers-app:v0.1.0 repo: https://github.com/nesi/training-environment-jupyter-containers-app.git version: 'v0.1.0' - enabled: true + enabled: false + pre_pull: false # this is currently required for containers app to run properly (fakeroot) # Note: you should probably set to false unless you are running a containers workshop From 1d1b39ee341ea3e410c6a8f23c8b002b81aa3d0f Mon Sep 17 00:00:00 2001 From: Chris Scott Date: Wed, 24 Apr 2024 13:37:13 +1200 Subject: [PATCH 02/10] make "tmp_dir" more persistent --- roles/capi-cluster/get-nodes/defaults/main.yml | 2 +- roles/capi-cluster/workload/defaults/main.yml | 4 ++-- roles/openondemand-k8s/kuberenetes/defaults/main.yml | 4 ++-- roles/openondemand-k8s/web-node/defaults/main.yml | 4 ++-- roles/pre-checks/certs/defaults/main.yml | 4 ++-- roles/pre-checks/install-cert/defaults/main.yml | 4 ++-- roles/pre-checks/kubernetes/defaults/main.yml | 4 ++-- roles/pre-checks/openstack/defaults/main.yml | 4 ++-- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/roles/capi-cluster/get-nodes/defaults/main.yml b/roles/capi-cluster/get-nodes/defaults/main.yml index 65e13b0..e49732c 100644 --- a/roles/capi-cluster/get-nodes/defaults/main.yml +++ b/roles/capi-cluster/get-nodes/defaults/main.yml @@ -4,4 +4,4 @@ k8s_ood_enable: false cluster_name: capi-cluster capi_ssh_user: cloud-user -tmp_dir: "/tmp/{{ cluster_name }}" \ No newline at end of file +tmp_dir: "/var/nesi/{{ cluster_name }}" diff --git a/roles/capi-cluster/workload/defaults/main.yml b/roles/capi-cluster/workload/defaults/main.yml index 198eec3..6b83827 100644 --- a/roles/capi-cluster/workload/defaults/main.yml +++ b/roles/capi-cluster/workload/defaults/main.yml @@ -39,7 +39,7 @@ 
cluster_settle_timeout_base: 3 bin_dir: /usr/local/bin -tmp_dir: "/tmp/{{ cluster_name }}" +tmp_dir: "/var/nesi/{{ cluster_name }}" clouds_yaml_local_location: ~/.config/openstack/clouds.yaml clouds_yaml_location: "{{ tmp_dir }}/clouds.yaml" @@ -72,4 +72,4 @@ kube_oidc_username_prefix: "-" kube_oidc_groups_claim: groups kube_oidc_groups_prefix: 'oidc:' # Copy oidc CA file to the following path if needed -kube_oidc_ca_file: "{{ ca_cert_file }}" \ No newline at end of file +kube_oidc_ca_file: "{{ ca_cert_file }}" diff --git a/roles/openondemand-k8s/kuberenetes/defaults/main.yml b/roles/openondemand-k8s/kuberenetes/defaults/main.yml index 41bbe65..3683f04 100644 --- a/roles/openondemand-k8s/kuberenetes/defaults/main.yml +++ b/roles/openondemand-k8s/kuberenetes/defaults/main.yml @@ -2,7 +2,7 @@ k8s_ood_enable: false bin_dir: /usr/local/bin -tmp_dir: "/tmp/{{ cluster_name }}" +tmp_dir: "/var/nesi/{{ cluster_name }}" kube_oidc_auth: true @@ -17,4 +17,4 @@ ood_apps: k8s_container: ghcr.io/nesi/training-environment-jupyter-ml101-app:v0.2.1 repo: https://github.com/nesi/training-environment-jupyter-ml101-app.git version: 'v0.2.1' - enabled: false \ No newline at end of file + enabled: false diff --git a/roles/openondemand-k8s/web-node/defaults/main.yml b/roles/openondemand-k8s/web-node/defaults/main.yml index f81f4d9..295ead8 100644 --- a/roles/openondemand-k8s/web-node/defaults/main.yml +++ b/roles/openondemand-k8s/web-node/defaults/main.yml @@ -4,7 +4,7 @@ k8s_ood_enable: false cluster_name: capi-cluster bin_dir: /usr/local/bin -tmp_dir: "/tmp/{{ cluster_name }}" +tmp_dir: "/var/nesi/{{ cluster_name }}" kube_root_ca_location: /etc/pki/ood-certs @@ -26,4 +26,4 @@ k8s_namespace_prefix: "user-" kube_oidc_url: https://ood-idp.training.data.nesi.org.nz/realms/ondemand kube_oidc_client_id: kubernetes kube_oidc_client_secret_id: secret -kube_oidc_username_prefix: "-" \ No newline at end of file +kube_oidc_username_prefix: "-" diff --git a/roles/pre-checks/certs/defaults/main.yml 
b/roles/pre-checks/certs/defaults/main.yml index c0e6933..6859c08 100644 --- a/roles/pre-checks/certs/defaults/main.yml +++ b/roles/pre-checks/certs/defaults/main.yml @@ -1,11 +1,11 @@ --- cluster_name: capi-cluster -tmp_dir: "/tmp/{{ cluster_name }}" +tmp_dir: "/var/nesi/{{ cluster_name }}" ca_file_path: /usr/local/share/ca-certificates clouds_yaml_local_location: ~/.config/openstack/clouds.yaml clouds_yaml_location: "{{ tmp_dir }}/clouds.yaml" -kube_config_location: "~/.kube" \ No newline at end of file +kube_config_location: "~/.kube" diff --git a/roles/pre-checks/install-cert/defaults/main.yml b/roles/pre-checks/install-cert/defaults/main.yml index 6466196..2e2070f 100644 --- a/roles/pre-checks/install-cert/defaults/main.yml +++ b/roles/pre-checks/install-cert/defaults/main.yml @@ -1,11 +1,11 @@ --- cluster_name: capi-cluster -tmp_dir: "/tmp/{{ cluster_name }}" +tmp_dir: "/var/nesi/{{ cluster_name }}" cert_file_path: /etc/ssl clouds_yaml_local_location: ~/.config/openstack/clouds.yaml clouds_yaml_location: "{{ tmp_dir }}/clouds.yaml" -kube_config_location: "~/.kube" \ No newline at end of file +kube_config_location: "~/.kube" diff --git a/roles/pre-checks/kubernetes/defaults/main.yml b/roles/pre-checks/kubernetes/defaults/main.yml index 160f72b..0d37b39 100644 --- a/roles/pre-checks/kubernetes/defaults/main.yml +++ b/roles/pre-checks/kubernetes/defaults/main.yml @@ -3,11 +3,11 @@ k8s_ood_enable: false cluster_name: capi-cluster -tmp_dir: "/tmp/{{ cluster_name }}" +tmp_dir: "/var/nesi/{{ cluster_name }}" ca_file_path: /usr/local/share/ca-certificates clouds_yaml_local_location: ~/.config/openstack/clouds.yaml clouds_yaml_location: "{{ tmp_dir }}/clouds.yaml" -kube_config_location: "~/.kube" \ No newline at end of file +kube_config_location: "~/.kube" diff --git a/roles/pre-checks/openstack/defaults/main.yml b/roles/pre-checks/openstack/defaults/main.yml index 160f72b..0d37b39 100644 --- a/roles/pre-checks/openstack/defaults/main.yml +++ 
b/roles/pre-checks/openstack/defaults/main.yml @@ -3,11 +3,11 @@ k8s_ood_enable: false cluster_name: capi-cluster -tmp_dir: "/tmp/{{ cluster_name }}" +tmp_dir: "/var/nesi/{{ cluster_name }}" ca_file_path: /usr/local/share/ca-certificates clouds_yaml_local_location: ~/.config/openstack/clouds.yaml clouds_yaml_location: "{{ tmp_dir }}/clouds.yaml" -kube_config_location: "~/.kube" \ No newline at end of file +kube_config_location: "~/.kube" From e9a71cb2f86dc0d720d712c1d02c2224ac500686 Mon Sep 17 00:00:00 2001 From: Chris Scott Date: Wed, 24 Apr 2024 14:16:19 +1200 Subject: [PATCH 03/10] test pre pull --- vars/ondemand-config.yml.example | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vars/ondemand-config.yml.example b/vars/ondemand-config.yml.example index 192908f..3d00b09 100644 --- a/vars/ondemand-config.yml.example +++ b/vars/ondemand-config.yml.example @@ -91,7 +91,7 @@ ood_apps: repo: https://github.com/nesi/training-environment-jupyter-ml102-app.git version: 'v0.2.0' enabled: true - pre_pull: false + pre_pull: true rstudio_rnaseq: k8s_container: ghcr.io/nesi/training-environment-rstudio-rnaseq-app:v0.2.2 @@ -127,7 +127,7 @@ enable_privileged_pods: false # pull the images defined in ood_apps onto all k8s worker nodes # Note: make sure the worker nodes have enough disk space (especially if many apps are enabled) -enable_pod_pre_pull: false +enable_pod_pre_pull: true # HTTPS LetsEncrypt Settings From 048e82f1bdf9c93e57ea1a28fb96e7cf9c5769d9 Mon Sep 17 00:00:00 2001 From: Chris Scott Date: Wed, 24 Apr 2024 14:36:52 +1200 Subject: [PATCH 04/10] become yes for k8s plays --- setup-training-environment.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup-training-environment.yml b/setup-training-environment.yml index 41b215b..8bb2f3e 100644 --- a/setup-training-environment.yml +++ b/setup-training-environment.yml @@ -39,7 +39,7 @@ name: "{{ inventory_hostname }}.flexi.nesi" - hosts: servers - become_user: root + become: 
yes any_errors_fatal: "true" vars_files: - ondemand-config.yml @@ -49,7 +49,7 @@ - { role: kubectl } - hosts: servicesnode - become_user: root + become: yes any_errors_fatal: "true" vars_files: - ondemand-config.yml From 97c2825c21fdff9f9f73434fd0cd1917994e2272 Mon Sep 17 00:00:00 2001 From: Chris Scott Date: Fri, 26 Apr 2024 09:01:32 +1200 Subject: [PATCH 05/10] back to using /tmp for tmpdir --- roles/capi-cluster/get-nodes/defaults/main.yml | 2 +- roles/capi-cluster/workload/defaults/main.yml | 2 +- roles/openondemand-k8s/kuberenetes/defaults/main.yml | 2 +- roles/openondemand-k8s/web-node/defaults/main.yml | 2 +- roles/pre-checks/certs/defaults/main.yml | 2 +- roles/pre-checks/install-cert/defaults/main.yml | 2 +- roles/pre-checks/kubernetes/defaults/main.yml | 2 +- roles/pre-checks/openstack/defaults/main.yml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/roles/capi-cluster/get-nodes/defaults/main.yml b/roles/capi-cluster/get-nodes/defaults/main.yml index e49732c..b47a4e7 100644 --- a/roles/capi-cluster/get-nodes/defaults/main.yml +++ b/roles/capi-cluster/get-nodes/defaults/main.yml @@ -4,4 +4,4 @@ k8s_ood_enable: false cluster_name: capi-cluster capi_ssh_user: cloud-user -tmp_dir: "/var/nesi/{{ cluster_name }}" +tmp_dir: "/tmp/{{ cluster_name }}" diff --git a/roles/capi-cluster/workload/defaults/main.yml b/roles/capi-cluster/workload/defaults/main.yml index 6b83827..a314fd2 100644 --- a/roles/capi-cluster/workload/defaults/main.yml +++ b/roles/capi-cluster/workload/defaults/main.yml @@ -39,7 +39,7 @@ cluster_settle_timeout_base: 3 bin_dir: /usr/local/bin -tmp_dir: "/var/nesi/{{ cluster_name }}" +tmp_dir: "/tmp/{{ cluster_name }}" clouds_yaml_local_location: ~/.config/openstack/clouds.yaml clouds_yaml_location: "{{ tmp_dir }}/clouds.yaml" diff --git a/roles/openondemand-k8s/kuberenetes/defaults/main.yml b/roles/openondemand-k8s/kuberenetes/defaults/main.yml index 3683f04..94cf78f 100644 --- 
a/roles/openondemand-k8s/kuberenetes/defaults/main.yml +++ b/roles/openondemand-k8s/kuberenetes/defaults/main.yml @@ -2,7 +2,7 @@ k8s_ood_enable: false bin_dir: /usr/local/bin -tmp_dir: "/var/nesi/{{ cluster_name }}" +tmp_dir: "/tmp/{{ cluster_name }}" kube_oidc_auth: true diff --git a/roles/openondemand-k8s/web-node/defaults/main.yml b/roles/openondemand-k8s/web-node/defaults/main.yml index 295ead8..a53bd58 100644 --- a/roles/openondemand-k8s/web-node/defaults/main.yml +++ b/roles/openondemand-k8s/web-node/defaults/main.yml @@ -4,7 +4,7 @@ k8s_ood_enable: false cluster_name: capi-cluster bin_dir: /usr/local/bin -tmp_dir: "/var/nesi/{{ cluster_name }}" +tmp_dir: "/tmp/{{ cluster_name }}" kube_root_ca_location: /etc/pki/ood-certs diff --git a/roles/pre-checks/certs/defaults/main.yml b/roles/pre-checks/certs/defaults/main.yml index 6859c08..f2bb39d 100644 --- a/roles/pre-checks/certs/defaults/main.yml +++ b/roles/pre-checks/certs/defaults/main.yml @@ -1,7 +1,7 @@ --- cluster_name: capi-cluster -tmp_dir: "/var/nesi/{{ cluster_name }}" +tmp_dir: "/tmp/{{ cluster_name }}" ca_file_path: /usr/local/share/ca-certificates diff --git a/roles/pre-checks/install-cert/defaults/main.yml b/roles/pre-checks/install-cert/defaults/main.yml index 2e2070f..6daa3ee 100644 --- a/roles/pre-checks/install-cert/defaults/main.yml +++ b/roles/pre-checks/install-cert/defaults/main.yml @@ -1,7 +1,7 @@ --- cluster_name: capi-cluster -tmp_dir: "/var/nesi/{{ cluster_name }}" +tmp_dir: "/tmp/{{ cluster_name }}" cert_file_path: /etc/ssl diff --git a/roles/pre-checks/kubernetes/defaults/main.yml b/roles/pre-checks/kubernetes/defaults/main.yml index 0d37b39..5e5370d 100644 --- a/roles/pre-checks/kubernetes/defaults/main.yml +++ b/roles/pre-checks/kubernetes/defaults/main.yml @@ -3,7 +3,7 @@ k8s_ood_enable: false cluster_name: capi-cluster -tmp_dir: "/var/nesi/{{ cluster_name }}" +tmp_dir: "/tmp/{{ cluster_name }}" ca_file_path: /usr/local/share/ca-certificates diff --git 
a/roles/pre-checks/openstack/defaults/main.yml b/roles/pre-checks/openstack/defaults/main.yml index 0d37b39..5e5370d 100644 --- a/roles/pre-checks/openstack/defaults/main.yml +++ b/roles/pre-checks/openstack/defaults/main.yml @@ -3,7 +3,7 @@ k8s_ood_enable: false cluster_name: capi-cluster -tmp_dir: "/var/nesi/{{ cluster_name }}" +tmp_dir: "/tmp/{{ cluster_name }}" ca_file_path: /usr/local/share/ca-certificates From 32cc9177a43fe55231fd167ab34de097536be832 Mon Sep 17 00:00:00 2001 From: Chris Scott Date: Fri, 26 Apr 2024 09:15:37 +1200 Subject: [PATCH 06/10] Revert "become yes for k8s plays" This reverts commit 048e82f1bdf9c93e57ea1a28fb96e7cf9c5769d9. --- setup-training-environment.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup-training-environment.yml b/setup-training-environment.yml index 8bb2f3e..41b215b 100644 --- a/setup-training-environment.yml +++ b/setup-training-environment.yml @@ -39,7 +39,7 @@ name: "{{ inventory_hostname }}.flexi.nesi" - hosts: servers - become: yes + become_user: root any_errors_fatal: "true" vars_files: - ondemand-config.yml @@ -49,7 +49,7 @@ - { role: kubectl } - hosts: servicesnode - become: yes + become_user: root any_errors_fatal: "true" vars_files: - ondemand-config.yml From a127b74b4ed8c2249f9fad03494b8accf310bd2e Mon Sep 17 00:00:00 2001 From: Chris Scott Date: Fri, 26 Apr 2024 09:17:04 +1200 Subject: [PATCH 07/10] become_user not doing anything --- setup-training-environment.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup-training-environment.yml b/setup-training-environment.yml index 41b215b..b5c13bf 100644 --- a/setup-training-environment.yml +++ b/setup-training-environment.yml @@ -39,7 +39,6 @@ name: "{{ inventory_hostname }}.flexi.nesi" - hosts: servers - become_user: root any_errors_fatal: "true" vars_files: - ondemand-config.yml @@ -49,7 +48,6 @@ - { role: kubectl } - hosts: servicesnode - become_user: root any_errors_fatal: "true" vars_files: - ondemand-config.yml From 
e64323ebf0e39df60327a3dbe5ca391fc575ea06 Mon Sep 17 00:00:00 2001 From: Chris Scott Date: Fri, 26 Apr 2024 09:17:22 +1200 Subject: [PATCH 08/10] copy cluster kubeconfig to persistent location --- roles/capi-cluster/get-nodes/tasks/prerequisites.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/roles/capi-cluster/get-nodes/tasks/prerequisites.yml b/roles/capi-cluster/get-nodes/tasks/prerequisites.yml index 345fc98..3c9a307 100644 --- a/roles/capi-cluster/get-nodes/tasks/prerequisites.yml +++ b/roles/capi-cluster/get-nodes/tasks/prerequisites.yml @@ -36,3 +36,10 @@ kubectl get secret -n {{ cluster_namespace }} {{ cluster_name }}-kubeconfig -o jsonpath='{.data.value}'|base64 -d > {{ tmp_dir }}/{{ cluster_name }}.kubeconfig when: not new_cluster_config.stat.exists + +- name: Copy {{ cluster_name }} cluster kubeconfig yaml to persistent location + ansible.builtin.copy: + src: "{{ tmp_dir }}/{{ cluster_name }}.kubeconfig" + dest: "~/.kube/{{ cluster_name }}.kubeconfig" + mode: '0600' + remote_src: true From 26189fb7d563cdd73a629fe134e08a2ba3fe5937 Mon Sep 17 00:00:00 2001 From: Chris Scott Date: Fri, 26 Apr 2024 09:55:54 +1200 Subject: [PATCH 09/10] remove obsolete ml102 workflows --- .github/workflows/jsm-redeploy-ml102.yml | 133 ----------------------- .github/workflows/redeploy-ml102.yml | 125 --------------------- redeploy-ml102.yml | 39 ------- 3 files changed, 297 deletions(-) delete mode 100644 .github/workflows/jsm-redeploy-ml102.yml delete mode 100644 .github/workflows/redeploy-ml102.yml delete mode 100644 redeploy-ml102.yml diff --git a/.github/workflows/jsm-redeploy-ml102.yml b/.github/workflows/jsm-redeploy-ml102.yml deleted file mode 100644 index ce99a92..0000000 --- a/.github/workflows/jsm-redeploy-ml102.yml +++ /dev/null @@ -1,133 +0,0 @@ -name: JSM Redeploy ML102 - -on: - workflow_dispatch: - inputs: - environment: - description: 'Training environment name' - type: string - required: true - -jobs: - redeploy_ml102: - concurrency: - group: ${{
github.workflow }}-${{ github.ref }} - cancel-in-progress: true - runs-on: ubuntu-22.04 - env: - OS_APPLICATION_CREDENTIAL_ID: ${{ secrets.OS_APPLICATION_CREDENTIAL_ID }} - OS_APPLICATION_CREDENTIAL_NAME: ${{ secrets.OS_APPLICATION_CREDENTIAL_NAME }} - OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }} - AWS_ROUTE53_KEY_ID: ${{ secrets.AWS_ROUTE53_KEY_ID }} - AWS_ROUTE53_SECRET_KEY: ${{ secrets.AWS_ROUTE53_SECRET_KEY }} - TF_VAR_key_pair: ${{ secrets.KEY_PAIR_NAME }} - TF_VAR_extra_public_keys: '${{ vars.EXTRA_PUBLIC_KEYS }}' - TF_VAR_vm_user: ubuntu - steps: - - uses: actions/checkout@v3 - - uses: hashicorp/setup-terraform@v2 - with: - terraform_version: "1.6.3" - terraform_wrapper: false - - run: python --version - - - name: Write private key file - run: | - import os - with open(os.environ['TF_VAR_key_file'], 'w') as fh: - fh.write(os.environ['PRIVATE_KEY_CONTENT']) - shell: python - env: - TF_VAR_key_file: "${{ runner.temp }}/my_ci_private_key" - PRIVATE_KEY_CONTENT: ${{ secrets.PRIVATE_KEY_CONTENT }} - - run: chmod 400 ${TF_VAR_key_file} - env: - TF_VAR_key_file: "${{ runner.temp }}/my_ci_private_key" - - - name: Write clouds.yaml to temp dir - run: | - import os - with open(os.environ['CLOUDS_YAML_FILE'], 'w') as fh: - fh.write(os.environ['CLOUDS_YAML_CONTENT']) - shell: python - env: - CLOUDS_YAML_CONTENT: "${{ secrets.CLOUDS_YAML }}" - CLOUDS_YAML_FILE: "${{ runner.temp }}/clouds.yaml" - - name: Link clouds.yaml - run: | - mkdir -p ~/.config/openstack - ln -s ${CLOUDS_YAML_FILE} ~/.config/openstack/clouds.yaml - env: - CLOUDS_YAML_FILE: "${{ runner.temp }}/clouds.yaml" - - - run: echo "GITHUB_REF = $GITHUB_REF" - - run: echo "GITHUB_REF_NAME = $GITHUB_REF_NAME" - - run: echo "GITHUB_HEAD_REF = $GITHUB_HEAD_REF" - - run: echo "GITHUB_BASE_REF = $GITHUB_BASE_REF" - - run: echo "GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME" - - - name: 
Write ansible config file to temp dir - run: | - cp ondemand-config.yml.example ${ONDEMAND_CONFIG_FILE} - sed -i'' "s/CHANGEME_KEYCLOAK_ADMIN_PASSWORD/$KEYCLOAK_ADMIN_PASSWORD/" ${ONDEMAND_CONFIG_FILE} - sed -i'' "s/CHANGEME_LDAP_ADMIN_PASSWORD/$LDAP_ADMIN_PASSWORD/" ${ONDEMAND_CONFIG_FILE} - sed -i'' "s/CHANGEME_OIDC_CRYPTO_PASSPHRASE/$OIDC_CRYPTO_PASSPHRASE/" ${ONDEMAND_CONFIG_FILE} - sed -i'' "s/CHANGEME_OPENSTACK_SSH_KEY_NAME/$KEY_PAIR_NAME/" ${ONDEMAND_CONFIG_FILE} - working-directory: vars - env: - KEYCLOAK_ADMIN_PASSWORD: '${{ secrets.KEYCLOAK_ADMIN_PASSWORD }}' - LDAP_ADMIN_PASSWORD: '${{ secrets.LDAP_ADMIN_PASSWORD }}' - OIDC_CRYPTO_PASSPHRASE: '${{ secrets.OIDC_CRYPTO_PASSPHRASE }}' - KEY_PAIR_NAME: '${{ secrets.KEY_PAIR_NAME }}' - ONDEMAND_CONFIG_FILE: '${{ runner.temp }}/ondemand-config.yml' - - name: Link ondemand config - run: | - ln -s ${ONDEMAND_CONFIG_FILE} ondemand-config.yml - working-directory: vars - env: - ONDEMAND_CONFIG_FILE: '${{ runner.temp }}/ondemand-config.yml' - - - name: Install ansible dependencies - run: ansible-galaxy install -r requirements.yml - - name: Terraform initialise - run: terraform init -input=false - env: - TF_VAR_key_file: "${{ runner.temp }}/my_ci_private_key" - working-directory: terraform - - - name: Switch terraform workspace (making sure it exists) - run: terraform workspace select -or-create=true ${ENVIRONMENT} - env: - TF_VAR_key_file: "${{ runner.temp }}/my_ci_private_key" - ENVIRONMENT: ${{ inputs.environment }} - working-directory: terraform - - - name: Install required Python library (botocore and boto3) - run: | - pip install botocore - pip install boto3 - working-directory: terraform - - - name: Ensure infrastructure is setup - run: ansible-playbook setup-infra.yml -e operation=create -e terraform_workspace="${ENVIRONMENT}" - env: - TF_VAR_key_file: "${{ runner.temp }}/my_ci_private_key" - ANSIBLE_HOST_KEY_CHECKING: "False" - CLOUDS_YAML_BASE64: "${{ secrets.CLOUDS_YAML_BASE64 }}" - KUBE_CONFIG_BASE64: 
"${{ secrets.KUBE_CONFIG_BASE64 }}" - WILD_CERT_BASE64: "${{ secrets.WILD_CERT_BASE64 }}" - WILD_CERT_KEY_BASE64: "${{ secrets.WILD_CERT_KEY_BASE64 }}" - ENVIRONMENT: ${{ inputs.environment }} - - - name: Redeploy ML102 - run: ansible-playbook -i host.ini redeploy-ml102.yml -u "${TF_VAR_vm_user}" --key-file '${TF_VAR_key_file}' -e terraform_workspace="${ENVIRONMENT}" - env: - TF_VAR_key_file: "${{ runner.temp }}/my_ci_private_key" - ANSIBLE_HOST_KEY_CHECKING: "False" - CLOUDS_YAML_BASE64: "${{ secrets.CLOUDS_YAML_BASE64 }}" - KUBE_CONFIG_BASE64: "${{ secrets.KUBE_CONFIG_BASE64 }}" - WILD_CERT_BASE64: "${{ secrets.WILD_CERT_BASE64 }}" - WILD_CERT_KEY_BASE64: "${{ secrets.WILD_CERT_KEY_BASE64 }}" - ENVIRONMENT: ${{ inputs.environment }} diff --git a/.github/workflows/redeploy-ml102.yml b/.github/workflows/redeploy-ml102.yml deleted file mode 100644 index 2d04399..0000000 --- a/.github/workflows/redeploy-ml102.yml +++ /dev/null @@ -1,125 +0,0 @@ -name: Redeploy ML102 - -on: - workflow_dispatch: - -jobs: - redeploy_ml102: - concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - runs-on: ubuntu-22.04 - env: - OS_APPLICATION_CREDENTIAL_ID: ${{ secrets.OS_APPLICATION_CREDENTIAL_ID }} - OS_APPLICATION_CREDENTIAL_NAME: ${{ secrets.OS_APPLICATION_CREDENTIAL_NAME }} - OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }} - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} - AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }} - AWS_ROUTE53_KEY_ID: ${{ secrets.AWS_ROUTE53_KEY_ID }} - AWS_ROUTE53_SECRET_KEY: ${{ secrets.AWS_ROUTE53_SECRET_KEY }} - TF_VAR_key_pair: ${{ secrets.KEY_PAIR_NAME }} - TF_VAR_extra_public_keys: '${{ vars.EXTRA_PUBLIC_KEYS }}' - TF_VAR_vm_user: ubuntu - steps: - - uses: actions/checkout@v3 - - uses: hashicorp/setup-terraform@v2 - with: - terraform_version: "1.6.3" - terraform_wrapper: false - - run: python --version - - - name: Write private key file - run: | - import os - with 
open(os.environ['TF_VAR_key_file'], 'w') as fh: - fh.write(os.environ['PRIVATE_KEY_CONTENT']) - shell: python - env: - TF_VAR_key_file: "${{ runner.temp }}/my_ci_private_key" - PRIVATE_KEY_CONTENT: ${{ secrets.PRIVATE_KEY_CONTENT }} - - run: chmod 400 ${TF_VAR_key_file} - env: - TF_VAR_key_file: "${{ runner.temp }}/my_ci_private_key" - - - name: Write clouds.yaml to temp dir - run: | - import os - with open(os.environ['CLOUDS_YAML_FILE'], 'w') as fh: - fh.write(os.environ['CLOUDS_YAML_CONTENT']) - shell: python - env: - CLOUDS_YAML_CONTENT: "${{ secrets.CLOUDS_YAML }}" - CLOUDS_YAML_FILE: "${{ runner.temp }}/clouds.yaml" - - name: Link clouds.yaml - run: | - mkdir -p ~/.config/openstack - ln -s ${CLOUDS_YAML_FILE} ~/.config/openstack/clouds.yaml - env: - CLOUDS_YAML_FILE: "${{ runner.temp }}/clouds.yaml" - - - run: echo "GITHUB_REF = $GITHUB_REF" - - run: echo "GITHUB_REF_NAME = $GITHUB_REF_NAME" - - run: echo "GITHUB_HEAD_REF = $GITHUB_HEAD_REF" - - run: echo "GITHUB_BASE_REF = $GITHUB_BASE_REF" - - run: echo "GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME" - - - name: Write ansible config file to temp dir - run: | - cp ondemand-config.yml.example ${ONDEMAND_CONFIG_FILE} - sed -i'' "s/CHANGEME_KEYCLOAK_ADMIN_PASSWORD/$KEYCLOAK_ADMIN_PASSWORD/" ${ONDEMAND_CONFIG_FILE} - sed -i'' "s/CHANGEME_LDAP_ADMIN_PASSWORD/$LDAP_ADMIN_PASSWORD/" ${ONDEMAND_CONFIG_FILE} - sed -i'' "s/CHANGEME_OIDC_CRYPTO_PASSPHRASE/$OIDC_CRYPTO_PASSPHRASE/" ${ONDEMAND_CONFIG_FILE} - sed -i'' "s/CHANGEME_OPENSTACK_SSH_KEY_NAME/$KEY_PAIR_NAME/" ${ONDEMAND_CONFIG_FILE} - working-directory: vars - env: - KEYCLOAK_ADMIN_PASSWORD: '${{ secrets.KEYCLOAK_ADMIN_PASSWORD }}' - LDAP_ADMIN_PASSWORD: '${{ secrets.LDAP_ADMIN_PASSWORD }}' - OIDC_CRYPTO_PASSPHRASE: '${{ secrets.OIDC_CRYPTO_PASSPHRASE }}' - KEY_PAIR_NAME: '${{ secrets.KEY_PAIR_NAME }}' - ONDEMAND_CONFIG_FILE: '${{ runner.temp }}/ondemand-config.yml' - - name: Link ondemand config - run: | - ln -s ${ONDEMAND_CONFIG_FILE} ondemand-config.yml - 
working-directory: vars - env: - ONDEMAND_CONFIG_FILE: '${{ runner.temp }}/ondemand-config.yml' - - - name: Install ansible dependencies - run: ansible-galaxy install -r requirements.yml - - name: Terraform initialise - run: terraform init -input=false - env: - TF_VAR_key_file: "${{ runner.temp }}/my_ci_private_key" - working-directory: terraform - - - name: Switch terraform workspace (making sure it exists) - run: terraform workspace select -or-create=true ${GITHUB_BASE_REF:-$GITHUB_REF_NAME} - env: - TF_VAR_key_file: "${{ runner.temp }}/my_ci_private_key" - working-directory: terraform - - - name: Install required Python library (botocore and boto3) - run: | - pip install botocore - pip install boto3 - working-directory: terraform - - - name: Ensure infrastructure is setup - run: ansible-playbook setup-infra.yml -e operation=create -e terraform_workspace="${GITHUB_BASE_REF:-$GITHUB_REF_NAME}" - env: - TF_VAR_key_file: "${{ runner.temp }}/my_ci_private_key" - ANSIBLE_HOST_KEY_CHECKING: "False" - CLOUDS_YAML_BASE64: "${{ secrets.CLOUDS_YAML_BASE64 }}" - KUBE_CONFIG_BASE64: "${{ secrets.KUBE_CONFIG_BASE64 }}" - WILD_CERT_BASE64: "${{ secrets.WILD_CERT_BASE64 }}" - WILD_CERT_KEY_BASE64: "${{ secrets.WILD_CERT_KEY_BASE64 }}" - - - name: Redeploy ML102 - run: ansible-playbook -i host.ini redeploy-ml102.yml -u "${TF_VAR_vm_user}" --key-file '${TF_VAR_key_file}' -e terraform_workspace="${GITHUB_BASE_REF:-$GITHUB_REF_NAME}" - env: - TF_VAR_key_file: "${{ runner.temp }}/my_ci_private_key" - ANSIBLE_HOST_KEY_CHECKING: "False" - CLOUDS_YAML_BASE64: "${{ secrets.CLOUDS_YAML_BASE64 }}" - KUBE_CONFIG_BASE64: "${{ secrets.KUBE_CONFIG_BASE64 }}" - WILD_CERT_BASE64: "${{ secrets.WILD_CERT_BASE64 }}" - WILD_CERT_KEY_BASE64: "${{ secrets.WILD_CERT_KEY_BASE64 }}" diff --git a/redeploy-ml102.yml b/redeploy-ml102.yml deleted file mode 100644 index fe48730..0000000 --- a/redeploy-ml102.yml +++ /dev/null @@ -1,39 +0,0 @@ -# workflow to redploy ML102 on the training environment -# -# 1. 
Get the list of all training/trainer home directories -# 2. Remove any "ml102_workshop" directories in the homes -# 3. Remove the ML102 apptainer image -# 4. Pull the apptainer image -# -# Following this, any user that starts a new ML102 session -# will get the version from the newly downloaded image -# -# Note: this doesn't redeploy the OOD ML102 app ---- -- name: Clean ML102 directories in homes - become: yes - hosts: servicesnode - tasks: - - name: List home directories - find: - paths: /srv/homes - file_type: directory - recurse: no - register: homes_find - - - name: Remove any ML102 directories in homes - ansible.builtin.file: - path: "{{ item.path }}/ml102_workshop" - state: absent - with_items: "{{ homes_find.files }}" - -- name: Redeploy ML102 image - become: yes - hosts: webnode - pre_tasks: - - name: Remove the ML102 apptainer image - ansible.builtin.file: - path: /opt/ml102_workshop/ML102.sif - state: absent - roles: - - ml102_workshop From 8e1b5a10541eca744cf57abee0a41a52dfb8558f Mon Sep 17 00:00:00 2001 From: Chris Scott Date: Fri, 26 Apr 2024 10:05:45 +1200 Subject: [PATCH 10/10] fix variable name in destroy k8s cluster playbook --- .github/workflows/destroy.yml | 2 +- destroy-k8s-cluster.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/destroy.yml b/.github/workflows/destroy.yml index af095a8..edda30b 100644 --- a/.github/workflows/destroy.yml +++ b/.github/workflows/destroy.yml @@ -85,7 +85,7 @@ jobs: ONDEMAND_CONFIG_FILE: '${{ runner.temp }}/ondemand-config.yml' - name: Install ansible dependencies - run: ansible-galaxy install -r requirements.yml + run: ansible-galaxy install --force -r requirements.yml - name: Terraform initialise run: terraform init -input=false env: diff --git a/destroy-k8s-cluster.yml b/destroy-k8s-cluster.yml index ef08317..10eb701 100644 --- a/destroy-k8s-cluster.yml +++ b/destroy-k8s-cluster.yml @@ -8,5 +8,5 @@ shell: >- kubectl delete cluster {{ terraform_workspace }} when: - - 
"enable_k8s_openondemand|default(false)|bool == true" + - "k8s_ood_enable|default(false)|bool == true" register: destroy