Add blackbox exporter to k3s monitoring stack with probe for OOD #466

Open · wants to merge 11 commits into base: feature/k3s-monitoring
@@ -9,3 +9,4 @@ image_list:
- { name: "quay.io/kiwigrid/k8s-sidecar", tag: "{{ grafana_sidecar_image_tag }}" }
- { name: "registry.k8s.io/kube-state-metrics/kube-state-metrics", tag: "{{ kube_prometheus_stack_metrics_image_tag }}" }
- { name: "registry.k8s.io/ingress-nginx/kube-webhook-certgen", tag: "{{ kube_prometheus_stack_patch_image_tag }}" }
- { name: "quay.io/prometheus/blackbox-exporter", tag: "{{ kube_prometheus_stack_blackbox_exporter_image_tag }}" }
6 changes: 6 additions & 0 deletions ansible/roles/kube_prometheus_stack/defaults/main/main.yml
@@ -20,6 +20,12 @@ kube_prometheus_stack_wait_timeout: 5m
kube_prometheus_stack_metrics_image_tag: v2.12.0
kube_prometheus_stack_patch_image_tag: v20221220-controller-v1.5.1-58-g787ea74b6

kube_prometheus_stack_blackbox_exporter_release_version: 9.0.1
kube_prometheus_stack_blackbox_exporter_image_tag: v0.25.0
kube_prometheus_stack_blackbox_exporter_release_name: blackbox-exporter

kube_prometheus_stack_blackbox_modules: {}

control_ip: "{{ hostvars[groups['control'].0].ansible_host }}"

grafana_auth_anonymous: false
20 changes: 20 additions & 0 deletions ansible/roles/kube_prometheus_stack/tasks/main.yml
@@ -176,6 +176,26 @@
ansible.builtin.import_role:
name: grafana-dashboards

- name: Install blackbox exporter helm chart
no_log: true # may expose testuser password
kubernetes.core.helm:
chart_ref: prometheus-blackbox-exporter
chart_repo_url: https://prometheus-community.github.io/helm-charts
chart_version: "{{ kube_prometheus_stack_blackbox_exporter_release_version }}"
release_name: "{{ kube_prometheus_stack_blackbox_exporter_release_name }}"
release_namespace: "{{ kube_prometheus_stack_release_namespace }}"
release_values:
nodeSelector:
clusterrole: "server"
config:
modules: "{{ kube_prometheus_stack_blackbox_modules }}"
configReloader:
image:
tag: "{{ kube_prometheus_stack_app_version }}" # keeps consistent with pre-pulled image for kube-prometheus-stack
image:
tag: "{{ kube_prometheus_stack_blackbox_exporter_image_tag }}"
wait: yes

- name: Install kube-prometheus-stack on target Kubernetes cluster
kubernetes.core.helm:
chart_ref: "{{ kube_prometheus_stack_chart_name }}"
19 changes: 19 additions & 0 deletions docs/monitoring-and-logging.md
@@ -28,6 +28,10 @@ Metrics are scraped from exporters. Exporters are services which expose HTTP end

Tool which parses slurm accounting data and produces a log file that is suitable for ingest by filebeat.

### [blackbox-exporter](https://github.com/prometheus/blackbox_exporter)

Tool which allows blackbox probing of endpoints over HTTP, HTTPS, DNS, TCP, ICMP and gRPC.

## Definition of terms

In this section we define any terms that may not be widely understood.
@@ -290,6 +294,21 @@ slurm-stats is configured `slurm-stats` role in the [slurm_openstack_tools colle
The `slurm_stats` group controls the placement of the `slurm_stats` service.
This should be configured to be a group with a single host. That host must be co-located on the same host as the `filebeat` service that scrapes its output.

## blackbox-exporter

### Defaults and adding jobs

Blackbox exporter is configured using role variables in the [kube_prometheus_stack role](../ansible/roles/kube_prometheus_stack/defaults/main). Blackbox exporter uses modules to
probe service endpoints. Modules can be configured by overriding the maps in [environments/common/inventory/group_vars/all/blackbox_exporter.yml](../environments/common/inventory/group_vars/all/blackbox_exporter.yml); see the [upstream docs](https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md)
and the [underlying Helm chart values](https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-blackbox-exporter/values.yaml#L162) for module configuration options. Probes are defined through Prometheus scrape jobs, which can be added in [environments/common/inventory/group_vars/all/prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml). See the upstream docs for configuring blackbox-exporter scrape jobs.
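As a minimal sketch (illustrative only: the `http_2xx_insecure` module name, the job name and the target below are hypothetical, and this assumes scrape jobs are added via `prometheus_scrape_configs` as noted in `blackbox_exporter.yml`), an extra module could look like:

```yaml
# Hypothetical override of the module defaults in blackbox_exporter.yml
kube_prometheus_stack_blackbox_modules_defaults:
  http_2xx_insecure:              # hypothetical module name
    prober: http
    timeout: 5s
    http:
      tls_config:
        insecure_skip_verify: true
```

with a matching scrape job fragment for `prometheus_scrape_configs`, mirroring the relabelling used by the default Open OnDemand probe:

```yaml
# Hypothetical scrape job appended to prometheus_scrape_configs in prometheus.yml
- job_name: "example-blackbox-probe"   # hypothetical job name
  metrics_path: /probe
  params:
    module: [http_2xx_insecure]
  static_configs:
    - targets:
        - "https://example.internal"   # hypothetical target
  relabel_configs:
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: target
    # point the scrape at the blackbox exporter service rather than the target itself
    - target_label: __address__
      replacement: "{{ kube_prometheus_stack_blackbox_exporter_release_name }}-prometheus-blackbox-exporter:9115"
```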

By default, an HTTPS probe for Open OnDemand is added if there are hosts in the `openondemand` group. The module and scrape job for this are defined in
[environments/common/inventory/group_vars/all/openondemand.yml](../environments/common/inventory/group_vars/all/openondemand.yml) (these are merged into the config in [blackbox_exporter.yml](../environments/common/inventory/group_vars/all/blackbox_exporter.yml) and [prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml) respectively).

### Placement

Installed as part of the `kube_prometheus_stack` role, whose placement is controlled by the `prometheus` group. As above, there is currently no load balancing support, so it should only be placed on a single node, which is configured to be the Slurm control node by default.

### Access

Probes can be viewed through the `Prometheus Blackbox Exporter` Grafana dashboard.
@@ -1,7 +1,7 @@
{
"cluster_image": {
"RL8": "openhpc-RL8-241029-0905-f23c2fca",
"RL9": "openhpc-RL9-241029-0949-f23c2fca",
"RL9-cuda": "openhpc-cuda-RL9-241029-0905-f23c2fca"
"RL8": "openhpc-RL8-241106-1149-6e780c0d",
"RL9": "openhpc-RL9-241106-1149-6e780c0d",
"RL9-cuda": "openhpc-cuda-RL9-241106-1149-6e780c0d"
}
}
@@ -0,0 +1,7 @@
# Note: the underlying Helm chart is configured with a default module 'http_2xx'; defining a module with this name here will merge its values with the existing module,
# see https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-blackbox-exporter/values.yaml#L163
kube_prometheus_stack_blackbox_modules_defaults: {}
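# For example (illustrative sketch only, not set by default): extending the chart's
# built-in 'http_2xx' module here would merge these values into it rather than replace it:
# kube_prometheus_stack_blackbox_modules_defaults:
#   http_2xx:
#     http:
#       preferred_ip_protocol: "ip4"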

kube_prometheus_stack_blackbox_modules: "{{ kube_prometheus_stack_blackbox_modules_defaults if ( groups['openondemand'] | count == 0 ) else ( kube_prometheus_stack_blackbox_modules_defaults | combine(openondemand_blackbox_modules) ) }}"

# See prometheus_scrape_configs in prometheus.yml to add additional scrape jobs to probe services
6 changes: 6 additions & 0 deletions environments/common/inventory/group_vars/all/grafana.yml
@@ -38,6 +38,12 @@ grafana_dashboards_default:
- placeholder: DS_PROMETHEUS
replacement: prometheus
revision_id: 3
# blackbox probes
- dashboard_id: 14928
replacements:
- placeholder: DS_PROMETHEUS
replacement: prometheus
revision_id: 6
grafana_dashboards: "{{ grafana_dashboards_default + (openondemand_dashboard if groups.get('openondemand') else []) }}"

# Configmap names of kube prometheus stack's default dashboards to exclude
@@ -2,5 +2,6 @@ kube_prometheus_stack_chart_version: 59.1.0
kube_prometheus_stack_release_namespace: monitoring-system
kube_prometheus_stack_release_name: kube-prometheus-stack
kube_prometheus_stack_wait_timeout: 5m
kube_prometheus_stack_blackbox_exporter_release_name: blackbox-exporter

# See prometheus.yml, grafana.yml and alertmanager.yml for config of individual monitoring services
28 changes: 28 additions & 0 deletions environments/common/inventory/group_vars/all/openondemand.yml
@@ -182,6 +182,34 @@ openondemand_scrape_configs:
labels:
environment: "{{ appliances_environment_name }}"
service: "openondemand"
- job_name: "ood-blackbox-probe"
metrics_path: /probe
params:
module: [ood_http_2xx]
static_configs:
- targets:
- "{{ openondemand_address }}"
relabel_configs:
- source_labels: [__address__]
target_label: __param_target
- source_labels: [__param_target]
target_label: target
- target_label: __address__
replacement: "{{ kube_prometheus_stack_blackbox_exporter_release_name }}-prometheus-blackbox-exporter:9115"

openondemand_blackbox_modules:
ood_http_2xx:
prober: http
timeout: 5s
http:
valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
follow_redirects: true
preferred_ip_protocol: "ip4"
tls_config:
insecure_skip_verify: true
basic_auth:
username: "testuser"
password: "{{ vault_testuser_password }}"

openondemand_dashboard:
- dashboard_id: 13465
48 changes: 48 additions & 0 deletions environments/common/inventory/group_vars/all/prometheus.yml
@@ -28,6 +28,54 @@ prometheus_extra_rules:
expr: "slurm_nodes_down > 0\n"
labels:
severity: critical
- alert: BlackboxProbeFailed
expr: probe_success == 0
for: 0m
labels:
severity: critical
annotations:
summary: '{% raw %}Blackbox probe failed (target {{ $labels.target }}){% endraw %}'
description: "{% raw %}Blackbox probe '{{ $labels.target }}' failed{% endraw %}"
- alert: BlackboxSlowProbe
expr: avg_over_time(probe_duration_seconds[1m]) > 1.25 # around 1.14s expected due to indirection in the cluster
for: 1m
labels:
severity: warning
annotations:
summary: '{% raw %}Blackbox slow probe (target {{ $labels.target }}){% endraw %}'
description: "{% raw %}Blackbox probe '{{ $labels.target }}' took more than 1s to complete - {{ $value }}{% endraw %}"
- alert: BlackboxProbeHttpFailure
expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
for: 0m
labels:
severity: critical
annotations:
summary: '{% raw %}Blackbox probe HTTP failure (target {{ $labels.target }}){% endraw %}'
description: "{% raw %}Blackbox probe '{{ $labels.target }}' returned an HTTP error status - {{ $value }}{% endraw %}"
- alert: BlackboxSslCertificateWillExpireSoon
expr: (7 * 24 * 3600) <= (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < (30 * 24 * 3600)
for: 0m
labels:
severity: warning
annotations:
summary: '{% raw %}Blackbox SSL certificate will expire soon (target {{ $labels.target }}){% endraw %}'
description: "{% raw %}SSL certificate for blackbox probe '{{ $labels.target }}' expires in {{ $value | humanizeDuration }}{% endraw %}"
- alert: BlackboxSslCertificateWillExpireVerySoon
expr: 0 <= (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < (7 * 24 * 3600)
for: 0m
labels:
severity: critical
annotations:
summary: '{% raw %}Blackbox SSL certificate will expire very soon (target {{ $labels.target }}){% endraw %}'
description: "{% raw %}SSL certificate for blackbox probe '{{ $labels.target }}' expires in {{ $value | humanizeDuration }}{% endraw %}"
- alert: BlackboxSslCertificateExpired
expr: (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < 0
for: 0m
labels:
severity: critical
annotations:
summary: '{% raw %}Blackbox SSL certificate expired (target {{ $labels.target }}){% endraw %}'
description: "{% raw %}SSL certificate for blackbox probe '{{ $labels.target }}' has expired{% endraw %}"
- record: node_cpu_system_seconds:record
expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode="system",job="node-exporter"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node-exporter"}[60s])))
- record: node_cpu_user_seconds:record