From 419c4af13b0091c153ddb727eb1143931a8baf61 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 9 Aug 2023 11:09:34 +0000 Subject: [PATCH 1/7] bump opensearch version to 2.9.0 --- ansible/roles/opensearch/defaults/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/opensearch/defaults/main.yml b/ansible/roles/opensearch/defaults/main.yml index d07a9e333..69e7f9c25 100644 --- a/ansible/roles/opensearch/defaults/main.yml +++ b/ansible/roles/opensearch/defaults/main.yml @@ -3,7 +3,7 @@ #opensearch_internal_users_path: opensearch_podman_user: "{{ ansible_user }}" -opensearch_version: '2.4.0' # https://hub.docker.com/r/opensearchproject/opensearch/tags +opensearch_version: '2.9.0' # https://hub.docker.com/r/opensearchproject/opensearch/tags opensearch_config_path: /usr/share/opensearch/config opensearch_data_path: /usr/share/opensearch/data opensearch_state: started # will be restarted if required From 3dfe9242622cc0602086f198ec7f703a0da80642 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 9 Aug 2023 12:44:23 +0000 Subject: [PATCH 2/7] pre-pull opensearch container --- ansible/roles/opensearch/tasks/main.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ansible/roles/opensearch/tasks/main.yml b/ansible/roles/opensearch/tasks/main.yml index 65da51404..7f5f0a801 100644 --- a/ansible/roles/opensearch/tasks/main.yml +++ b/ansible/roles/opensearch/tasks/main.yml @@ -74,6 +74,12 @@ become: true notify: Restart opensearch service +- name: Pull container + containers.podman.podman_image: + name: "opensearchproject/opensearch:{{ opensearch_version }}" + become: true + become_user: "{{ opensearch_podman_user }}" + - name: Flush handlers meta: flush_handlers From 2aa0706151e6a0a3fc942a811b13baf64228ac30 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 9 Aug 2023 13:51:08 +0000 Subject: [PATCH 3/7] break opensearch into install and runtime task books --- ansible/fatimage.yml | 5 +++++ ansible/monitoring.yml | 5 +++++ ansible/roles/opensearch/tasks/install.yml | 15 +++++++++++++ .../tasks/{main.yml => runtime.yml} | 21 ------------------- 4 files changed, 25 insertions(+), 21 deletions(-) create mode 100644 ansible/roles/opensearch/tasks/install.yml rename ansible/roles/opensearch/tasks/{main.yml => runtime.yml} (73%) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index c236700f2..36b608499 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -59,6 +59,11 @@ tasks_from: vnc_compute.yml # - import_playbook: monitoring.yml: + - import_role: + name: opensearch + tasks_from: install.yml + become: true + # opensearch - containerised, nothing to do # slurm_stats - nothing to do # filebeat - containerised - nothing to do diff --git a/ansible/monitoring.yml b/ansible/monitoring.yml index f60678874..b8d5fc0a5 100644 --- a/ansible/monitoring.yml +++ b/ansible/monitoring.yml @@ -7,6 +7,11 @@ tasks: - import_role: name: opensearch + tasks_from: install.yml + become: true + - import_role: + name: opensearch + tasks_from: runtime.yml become: true - name: Setup slurm stats diff --git a/ansible/roles/opensearch/tasks/install.yml b/ansible/roles/opensearch/tasks/install.yml new file mode 100644 index 000000000..640064afc --- /dev/null +++ b/ansible/roles/opensearch/tasks/install.yml @@ -0,0 +1,15 @@ +# safe to use during build + +- name: Increase maximum number of virtual memory maps + # see https://opensearch.org/docs/2.0/opensearch/install/important-settings/ + ansible.posix.sysctl: + name: vm.max_map_count + value: '262144' + state: present + reload: yes + +- name: Create systemd unit file + template: + dest: /etc/systemd/system/opensearch.service + src: opensearch.service.j2 + notify: Restart opensearch service diff --git a/ansible/roles/opensearch/tasks/main.yml b/ansible/roles/opensearch/tasks/runtime.yml similarity index 73% rename from ansible/roles/opensearch/tasks/main.yml rename to ansible/roles/opensearch/tasks/runtime.yml index 7f5f0a801..e79cefc01 100644 --- a/ansible/roles/opensearch/tasks/main.yml +++ b/ansible/roles/opensearch/tasks/runtime.yml @@ -15,15 +15,6 @@ path: /etc/systemd/system/opendistro.service state: absent -- name: Increase maximum number of virtual memory maps - # see https://opensearch.org/docs/2.0/opensearch/install/important-settings/ - ansible.posix.sysctl: - name: vm.max_map_count - value: '262144' - state: present - reload: yes - become: true - - name: Ensure required opensearch host directories exist file: state: directory @@ -35,11 +26,9 @@ loop: - "{{ opensearch_config_path }}" - "{{ opensearch_data_path }}" - when: "'started' in opensearch_state" # don't run during image build - name: Create certs import_tasks: certs.yml - when: "'started' in opensearch_state" # don't run during image build - name: Template general configuration ansible.builtin.template: @@ -52,7 +41,6 @@ mode: 0660 notify: Restart opensearch service become: true - when: "'started' in opensearch_state" # don't run during image build - name: Template internal user configuration template: @@ -65,19 +53,10 @@ mode: 0660 notify: Restart opensearch service become: true - when: "'started' in opensearch_state" # don't run during image build - -- name: Create systemd unit file - template: - dest: /etc/systemd/system/opensearch.service - src: opensearch.service.j2 - become: true - notify: Restart opensearch service - name: Pull container containers.podman.podman_image: name: "opensearchproject/opensearch:{{ opensearch_version }}" - become: true become_user: "{{ opensearch_podman_user }}" - name: Flush handlers From 2783bda943a0b5551a678f0c085cb74cc0c02684 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 9 Aug 2023 14:40:06 +0000 Subject: [PATCH 4/7] bump image for CI --- environments/.stackhpc/terraform/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index d9a28d526..9433810bd 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -13,7 +13,7 @@ variable "cluster_name" { variable "cluster_image" { description = "single image for all cluster nodes - a convenience for CI" type = string - default = "openhpc-230804-1754-80b8d714" # https://github.com/stackhpc/ansible-slurm-appliance/pull/298 + default = "openhpc-230809-1401-2aa07061" # https://github.com/stackhpc/ansible-slurm-appliance/pull/299 # default = "Rocky-8-GenericCloud-Base-8.7-20221130.0.x86_64.qcow2" # default = "Rocky-8-GenericCloud-8.6.20220702.0.x86_64.qcow2" } From 8c65ec7588da3a4c0203b9a6e74f3ce507096c28 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 9 Aug 2023 16:00:13 +0000 Subject: [PATCH 5/7] don't enable/start opensearch during build --- ansible/roles/opensearch/handlers/main.yml | 1 - ansible/roles/opensearch/tasks/install.yml | 6 +++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ansible/roles/opensearch/handlers/main.yml b/ansible/roles/opensearch/handlers/main.yml index 539a06125..d3a040dbb 100644 --- a/ansible/roles/opensearch/handlers/main.yml +++ b/ansible/roles/opensearch/handlers/main.yml @@ -5,5 +5,4 @@ name: opensearch.service state: "{{ 'restarted' if 'started' in opensearch_state else opensearch_state }}" enabled: "{{ opensearch_systemd_service_enabled }}" - daemon_reload: "{{ 'started' in opensearch_state }}" become: true diff --git a/ansible/roles/opensearch/tasks/install.yml b/ansible/roles/opensearch/tasks/install.yml index 640064afc..902c71d1f 100644 --- a/ansible/roles/opensearch/tasks/install.yml +++ b/ansible/roles/opensearch/tasks/install.yml @@ -12,4 +12,8 @@ template: dest: /etc/systemd/system/opensearch.service src: opensearch.service.j2 - notify: Restart opensearch service + register: _opensearch_unit + +- name: Reload opensearch unit file + command: systemctl daemon-reload + when: _opensearch_unit.changed From 884df2a366530475a3b945b0a53cd4ca8f753337 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 9 Aug 2023 21:48:58 +0000 Subject: [PATCH 6/7] bump CI image --- environments/.stackhpc/terraform/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/.stackhpc/terraform/main.tf b/environments/.stackhpc/terraform/main.tf index 9433810bd..4061485c3 100644 --- a/environments/.stackhpc/terraform/main.tf +++ b/environments/.stackhpc/terraform/main.tf @@ -13,7 +13,7 @@ variable "cluster_name" { variable "cluster_image" { description = "single image for all cluster nodes - a convenience for CI" type = string - default = "openhpc-230809-1401-2aa07061" # https://github.com/stackhpc/ansible-slurm-appliance/pull/299 + default = "openhpc-230809-1602-2250239e" # https://github.com/stackhpc/ansible-slurm-appliance/pull/299 # default = "Rocky-8-GenericCloud-Base-8.7-20221130.0.x86_64.qcow2" # default = "Rocky-8-GenericCloud-8.6.20220702.0.x86_64.qcow2" } From 293772541a07cfe308be4c5b4b82127a4177bbae Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Thu, 10 Aug 2023 14:47:15 +0000 Subject: [PATCH 7/7] use slurm jobid for opensearch index and archive old data --- .../roles/opensearch/tasks/archive_data.yml | 17 ++++++++++++++++ ansible/roles/opensearch/tasks/runtime.yml | 20 +++++++++++++++++++ .../common/files/filebeat/filebeat.yml | 5 +++++ 3 files changed, 42 insertions(+) create mode 100644 ansible/roles/opensearch/tasks/archive_data.yml diff --git a/ansible/roles/opensearch/tasks/archive_data.yml b/ansible/roles/opensearch/tasks/archive_data.yml new file mode 100644 index 000000000..298f66a8e --- /dev/null +++ b/ansible/roles/opensearch/tasks/archive_data.yml @@ -0,0 +1,17 @@ +# Remove data which was NOT indexed by Slurm Job ID +# It will be re-ingested by filebeat from the slurmdbd, with that index + +- name: Ensure opensearch stopped + systemd: + name: opensearch + state: stopped + register: _opensearch_stop + until: "_opensearch_stop.status.ActiveState in ['inactive', 'failed']" + retries: 15 + delay: 5 + +- name: Archive existing data + community.general.archive: + path: "{{ opensearch_data_path }}" + dest: "{{ opensearch_data_path | dirname }}/data-{{ lookup('pipe', 'date --iso-8601=minutes') }}.tar.gz" + remove: true diff --git a/ansible/roles/opensearch/tasks/runtime.yml b/ansible/roles/opensearch/tasks/runtime.yml index e79cefc01..b2cdeb456 100644 --- a/ansible/roles/opensearch/tasks/runtime.yml +++ b/ansible/roles/opensearch/tasks/runtime.yml @@ -15,6 +15,17 @@ path: /etc/systemd/system/opendistro.service state: absent +- name: Enumerate files in data directory + find: + path: "{{ opensearch_data_path }}" + register: _find_opensearch_data + +- name: Archive incorrectly indexed data + import_tasks: archive_data.yml + when: + - _find_opensearch_data.files | length > 0 + - "'slurm_jobid_index' not in _find_opensearch_data.files | map(attribute='path') | map('basename')" + - name: Ensure required opensearch host directories exist file: state: directory @@ -27,6 +38,15 @@ - "{{ opensearch_config_path }}" - "{{ opensearch_data_path }}" +- name: Set indexed data flag + copy: + dest: "{{ opensearch_data_path }}/slurm_jobid_index" + content: | + This is a flag file to indicate that filebeat is pushing data + indexed by Slurm JobID to prevent duplicate OpenSearch records + owner: "{{ opensearch_podman_user }}" + group: "{{ opensearch_podman_user }}" + - name: Create certs import_tasks: certs.yml diff --git a/environments/common/files/filebeat/filebeat.yml b/environments/common/files/filebeat/filebeat.yml index 82c22a840..0f7186b3a 100644 --- a/environments/common/files/filebeat/filebeat.yml +++ b/environments/common/files/filebeat/filebeat.yml @@ -22,6 +22,11 @@ filebeat.inputs: fields_under_root: true processors: + # Want to use the Slurm JobID as the ElasticSearch id to avoid duplicated records + # Don't use filebeat.inputs:json.document_id as this removes the JobID from the record + - fingerprint: + fields: ["json.JobID"] + target_field: "@metadata._id" - timestamp: field: json.End layouts: