slurm-infra.yml
---
# Provision the infrastructure using Terraform
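# The cluster_infra role applies the Terraform configuration and adds the
# resulting hosts to the inventory groups targeted by the plays below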
- name: Provision infrastructure
  hosts: openstack
  roles:
    - cluster_infra

# Setup tasks now that all hosts have been added to the correct groups
- hosts: cluster
  become: yes
  tasks:
    # Ensure that the hosts in the cluster can all refer to each other by their hostname
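    # Keying lineinfile on the hostname regexp means reruns update the existing
    # entry for each host rather than appending duplicate lines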
    - name: Populate /etc/hosts with cluster hosts
      lineinfile:
        path: /etc/hosts
        regexp: "{{ hostvars[host].inventory_hostname }}"
        line: "{{ hostvars[host].ansible_default_ipv4.address }} {{ hostvars[host].inventory_hostname }}"
      loop: "{{ ansible_play_hosts }}"
      loop_control:
        loop_var: host

# Ensure that the secrets are generated and persisted on the control host
- name: Generate and persist secrets
  hosts: control
  gather_facts: no
  become: yes
  roles:
    - persist_openhpc_secrets

# validate.yml asserts presence of a control group which doesn't exist when
# destroying infra, so only validate when we're not destroying
- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/validate.yml
  when: cluster_state is not defined or cluster_state != "absent"

# The first task in the bootstrap playbook causes the home directory of the rocky user to be moved on the first run
# This can disrupt the SSH connection, particularly because we use the login host as a jump host
# So we move the home directory on the login node and reset the connections first
- hosts: login
  gather_facts: false
  tasks:
    - name: Set up Ansible user
      user: "{{ (appliances_local_users_default | selectattr('user.name', 'eq', appliances_local_users_ansible_user_name))[0]['user'] }}"
      become_method: "sudo"
      # Need to change working directory otherwise we try to switch back to non-existent directory.
      become_flags: '-i'
      become: true

- hosts: cluster
  gather_facts: no
  tasks:
    - name: Reset persistent SSH connections
      meta: reset_connection
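
# Run the plays from the vendored StackHPC Slurm appliance to configure the cluster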
- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/bootstrap.yml
- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/filesystems.yml
- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/slurm.yml
- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/portal.yml
- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/iam.yml
- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/monitoring.yml

- name: Persist login hostkey across rebuilds
  # Need NFS for this so can't do it before the appliance plays
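  # Keeping the hostkey on the NFS share means SSH clients do not see
  # host-key-changed warnings when the login node is rebuilt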
  hosts: login
  gather_facts: no
  become: yes
  roles:
    - persist_hostkeys

# Configure the Zenith clients that are required
# First, ensure that podman is installed on all hosts that will run Zenith clients
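# The pattern 'zenith,!podman' targets hosts in the zenith group that are not also in the podman group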
- hosts: zenith,!podman
  tasks:
    - import_role:
        name: podman
        tasks_from: prereqs.yml
    - import_role:
        name: podman
        tasks_from: config.yml

- hosts: grafana
  tasks:
    - name: Deploy the Zenith client for Grafana
      include_role:
        name: zenith_proxy
      vars:
        zenith_proxy_service_name: zenith-monitoring
        # Use the IP address for the upstream host
        zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}"
        zenith_proxy_upstream_port: "{{ grafana_port }}"
        zenith_proxy_client_token: "{{ zenith_token_monitoring }}"
        zenith_proxy_client_auth_params:
          tenancy-id: "{{ openstack_project_id }}"
        zenith_proxy_mitm_enabled: yes
        zenith_proxy_mitm_auth_inject: basic
        zenith_proxy_mitm_auth_basic_username: "{{ grafana_security.admin_user }}"
        zenith_proxy_mitm_auth_basic_password: "{{ grafana_security.admin_password }}"
      when: zenith_subdomain_monitoring is defined

- hosts: openondemand
  tasks:
    - name: Deploy the Zenith client for OOD
      include_role:
        name: zenith_proxy
      vars:
        zenith_proxy_service_name: zenith-ood
        # Use the IP address for the upstream host
        zenith_proxy_upstream_scheme: https
        zenith_proxy_upstream_host: "{{ ansible_default_ipv4.address }}"
        zenith_proxy_upstream_port: 443
        zenith_proxy_client_token: "{{ zenith_token_ood }}"
        zenith_proxy_client_auth_params:
          tenancy-id: "{{ openstack_project_id }}"
        zenith_proxy_mitm_enabled: yes
        zenith_proxy_mitm_auth_inject: basic
        zenith_proxy_mitm_auth_basic_username: azimuth
        zenith_proxy_mitm_auth_basic_password: "{{ vault_azimuth_user_password }}"
      when: zenith_subdomain_ood is defined
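
# Run the appliance's ad hoc HPC tests to verify the deployed cluster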
- import_playbook: vendor/stackhpc/ansible-slurm-appliance/ansible/adhoc/hpctests.yml
# Write the outputs as the final task
- hosts: localhost
  tasks:
    - debug: var=outputs
      vars:
        # Ansible has a fit when there are two 'hostvars' evaluations in a resolution chain,
        # so we have to repeat logic here unfortunately
        outputs: >-
          {{-
            { "cluster_access_ip": hostvars[groups['openstack'][0]].cluster_gateway_ip } |
              combine(
                {
                  "openondemand_url": "https://" ~ (hostvars[groups['openstack'][0]].cluster_gateway_ip | replace('.', '-')) ~ ".sslip.io",
                  "azimuth_user_password": hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_azimuth_user_password
                }
                if zenith_fqdn_ood is not defined
                else {}
              )
          }}
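
# Illustrative shape of the final debug output when Zenith is not configured for OOD
# (the IP address is a documentation example, not a real deployment value):
#
#   outputs:
#     cluster_access_ip: 192.0.2.10
#     openondemand_url: https://192-0-2-10.sslip.io
#     azimuth_user_password: <secret generated by persist_openhpc_secrets>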