stackhpc · sjpb · Aug 10, 2023 · Aug 9, 2023 · Aug 9, 2023 · Aug 9, 2023
@@ -59,6 +59,11 @@
         tasks_from: vnc_compute.yml
 
     # - import_playbook: monitoring.yml:
+    - become: true  # run the imported install tasks with privilege escalation
+      import_role:
+        tasks_from: install.yml
+        name: opensearch
+
     #   opensearch - containerised, nothing to do
     # slurm_stats - nothing to do
     # filebeat - containerised - nothing to do

@@ -7,6 +7,11 @@
   tasks:
     - import_role:
         name: opensearch
+        tasks_from: install.yml
+      become: true
+    - import_role:
+        name: opensearch
+        tasks_from: runtime.yml
       become: true
 
 - name: Setup slurm stats

@@ -3,7 +3,7 @@
 #opensearch_internal_users_path:
 
 opensearch_podman_user: "{{ ansible_user }}"
-opensearch_version: '2.4.0' # https://hub.docker.com/r/opensearchproject/opensearch/tags
+opensearch_version: '2.9.0' # https://hub.docker.com/r/opensearchproject/opensearch/tags
 opensearch_config_path: /usr/share/opensearch/config
 opensearch_data_path: /usr/share/opensearch/data
 opensearch_state: started # will be restarted if required

@@ -5,5 +5,4 @@
     name: opensearch.service
     state: "{{ 'restarted' if 'started' in opensearch_state else opensearch_state }}"
     enabled: "{{ opensearch_systemd_service_enabled }}"
-    daemon_reload: "{{ 'started' in opensearch_state }}"
   become: true
@@ -0,0 +1,17 @@
+# Archive, then remove, data which was NOT indexed by Slurm Job ID.
+# It will be re-ingested by filebeat from the slurmdbd, using that index.
+
+- name: Ensure opensearch stopped
+  register: _opensearch_stop
+  retries: 15  # together with delay: 5 this allows roughly 75 seconds of waiting
+  delay: 5
+  until: "_opensearch_stop.status.ActiveState in ['inactive', 'failed']"  # either state counts as stopped
+  systemd:
+    state: stopped
+    name: opensearch
+
+- name: Archive existing data
+  community.general.archive:
+    path: "{{ opensearch_data_path }}"
+    # The timestamp comes from the Ansible controller (lookup plugins run there).
+    # It is deliberately colon-free: GNU tar treats an archive name containing ':'
+    # as host:path unless --force-local is given, which breaks later extraction.
+    dest: "{{ opensearch_data_path | dirname }}/data-{{ lookup('pipe', 'date +%Y-%m-%dT%H%M%z') }}.tar.gz"
+    remove: true  # delete the source data once it has been added to the archive
@@ -0,0 +1,19 @@
+# safe to use during build
+
+- name: Increase maximum number of virtual memory maps
+  # see https://opensearch.org/docs/2.0/opensearch/install/important-settings/
+  ansible.posix.sysctl:
+    state: present
+    name: vm.max_map_count
+    value: "262144"
+    reload: true  # run sysctl -p if the sysctl file was updated
+
+- name: Create systemd unit file
+  register: _opensearch_unit  # .changed decides whether systemd must re-read its units
+  template:
+    src: opensearch.service.j2
+    dest: /etc/systemd/system/opensearch.service
+
+- name: Reload opensearch unit file
+  # Use the systemd module rather than shelling out to systemctl; only needed
+  # when the unit file template above actually changed on disk.
+  systemd:
+    daemon_reload: true
+  when: _opensearch_unit.changed
@@ -15,14 +15,16 @@
     path: /etc/systemd/system/opendistro.service
     state: absent
 
-- name: Increase maximum number of virtual memory maps
-  # see https://opensearch.org/docs/2.0/opensearch/install/important-settings/
-  ansible.posix.sysctl:
-    name: vm.max_map_count
-    value: '262144'
-    state: present
-    reload: yes
-  become: true
+- name: Enumerate files in data directory
+  register: _find_opensearch_data  # .files is inspected by the archive condition below
+  find:
+    path: "{{ opensearch_data_path }}"
+
+- name: Archive incorrectly indexed data
+  when: >-  # static import: this condition is attached to every task in archive_data.yml
+    (_find_opensearch_data.files | length > 0) and
+    ('slurm_jobid_index' not in _find_opensearch_data.files | map(attribute='path') | map('basename'))
+  import_tasks: archive_data.yml
 
 - name: Ensure required opensearch host directories exist
   file:
@@ -35,11 +37,18 @@
   loop:
     - "{{ opensearch_config_path }}"
     - "{{ opensearch_data_path }}"
-  when: "'started' in opensearch_state" # don't run during image build
+
+- name: Set indexed data flag
+  copy:  # this file's presence is what the 'slurm_jobid_index' archive check looks for
+    owner: "{{ opensearch_podman_user }}"
+    group: "{{ opensearch_podman_user }}"
+    dest: "{{ opensearch_data_path }}/slurm_jobid_index"
+    content: |
+      This is a flag file to indicate that filebeat is pushing data
+      indexed by Slurm JobID to prevent duplicate OpenSearch records
 
 - name: Create certs
   import_tasks: certs.yml
-  when: "'started' in opensearch_state" # don't run during image build
 
 - name: Template general configuration
   ansible.builtin.template:
@@ -52,7 +61,6 @@
     mode: 0660
   notify: Restart opensearch service
   become: true
-  when: "'started' in opensearch_state" # don't run during image build
 
 - name: Template internal user configuration
   template:
@@ -65,14 +73,11 @@
       mode: 0660
   notify: Restart opensearch service
   become: true
-  when: "'started' in opensearch_state" # don't run during image build
 
-- name: Create systemd unit file
-  template:
-    dest: /etc/systemd/system/opensearch.service
-    src: opensearch.service.j2
-  become: true
-  notify: Restart opensearch service
+- name: Pull container
+  become_user: "{{ opensearch_podman_user }}"  # run the pull as the podman user
+  containers.podman.podman_image:
+    name: "opensearchproject/opensearch:{{ opensearch_version }}"
 
 - name: Flush handlers
   meta: flush_handlers

@@ -13,7 +13,7 @@ variable "cluster_name" {
 variable "cluster_image" {
     description = "single image for all cluster nodes - a convenience for CI"
     type = string
-    default = "openhpc-230804-1754-80b8d714" # https://github.com/stackhpc/ansible-slurm-appliance/pull/298
+    default = "openhpc-230809-1602-2250239e" # https://github.com/stackhpc/ansible-slurm-appliance/pull/299
     # default = "Rocky-8-GenericCloud-Base-8.7-20221130.0.x86_64.qcow2"
     # default = "Rocky-8-GenericCloud-8.6.20220702.0.x86_64.qcow2"
 }

@@ -22,6 +22,11 @@ filebeat.inputs:
     fields_under_root: true
 
 processors:
+  # Derive the document id (@metadata._id) from the Slurm JobID to avoid duplicated records.
+  # filebeat.inputs json.document_id is not used, as that removes the JobID from the record.
+  - fingerprint:
+      target_field: "@metadata._id"
+      fields: ["json.JobID"]
   - timestamp:
       field: json.End
       layouts: