diff --git a/charts/lumigo-operator/templates/cluster-agent-service.yaml b/charts/lumigo-operator/templates/cluster-agent-service.yaml
index 7152b1b..386116b 100644
--- a/charts/lumigo-operator/templates/cluster-agent-service.yaml
+++ b/charts/lumigo-operator/templates/cluster-agent-service.yaml
@@ -18,5 +18,6 @@ spec:
       protocol: TCP
       port: {{ .Values.prometheusNodeExporter.service.port }}
       targetPort: {{ .Values.prometheusNodeExporter.service.port }}
-  type: ClusterIP
+      nodePort: {{ .Values.prometheusNodeExporter.service.nodePort }}
+  type: NodePort
 {{- end }}
\ No newline at end of file
diff --git a/charts/lumigo-operator/templates/controller-deployment-and-webhooks.yaml b/charts/lumigo-operator/templates/controller-deployment-and-webhooks.yaml
index 8479cbe..31c5175 100644
--- a/charts/lumigo-operator/templates/controller-deployment-and-webhooks.yaml
+++ b/charts/lumigo-operator/templates/controller-deployment-and-webhooks.yaml
@@ -258,7 +258,7 @@ spec:
        - name: LUMIGO_CLUSTER_AGENT_SERVICE
          value: "{{ include "helm.fullname" . }}-cluster-agent-service.{{ .Release.Namespace }}.svc.cluster.local"
        - name: LUMIGO_PROM_NODE_EXPORTER_PORT
-         value: "{{ .Values.prometheusNodeExporter.service.port }}"
+         value: "{{ .Values.prometheusNodeExporter.service.nodePort }}"
        - name: LUMIGO_KUBE_STATE_METRICS_SERVICE
          value: "{{ .Release.Name }}-kube-state-metrics.{{ .Release.Namespace }}.svc.cluster.local"
        - name: LUMIGO_KUBE_STATE_METRICS_PORT
diff --git a/charts/lumigo-operator/values.yaml b/charts/lumigo-operator/values.yaml
index 25a4b27..20eb659 100644
--- a/charts/lumigo-operator/values.yaml
+++ b/charts/lumigo-operator/values.yaml
@@ -59,6 +59,7 @@ prometheusNodeExporter:
     tag: v1.8.2
   service:
     port: 9100
+    nodePort: 30090
   resources:
     limits:
       cpu: 500m
diff --git a/telemetryproxy/docker/etc/config.yaml.tpl b/telemetryproxy/docker/etc/config.yaml.tpl
index 8e14118..d7085b8 100644
--- a/telemetryproxy/docker/etc/config.yaml.tpl
+++ b/telemetryproxy/docker/etc/config.yaml.tpl
@@ -46,10 +46,21 @@ receivers:
         authorization:
           credentials_file: "/var/run/secrets/kubernetes.io/serviceaccount/token"
       - job_name: 'prometheus-node-exporter'
-        metrics_path: /metrics
-        scrape_interval: {{ $infraMetricsFrequency }}
-        static_configs:
-        - targets: ['{{ getenv "LUMIGO_CLUSTER_AGENT_SERVICE" }}:{{ getenv "LUMIGO_PROM_NODE_EXPORTER_PORT" }}']
+        kubernetes_sd_configs:
+        - role: node
+        relabel_configs:
+        - source_labels: [__meta_kubernetes_node_address_InternalIP]
+          action: replace
+          target_label: __address__
+          # Scrape a custom port provided by LUMIGO_PROM_NODE_EXPORTER_PORT.
+          # '$$1' escapes '$1', as Gomplate otherwise thinks it's an environment variable.
+          replacement: '$$1:$LUMIGO_PROM_NODE_EXPORTER_PORT'
+        - source_labels: [__meta_kubernetes_node_name]
+          action: replace
+          target_label: node
+        metrics_path: "/metrics"
+        authorization:
+          credentials_file: "/var/run/secrets/kubernetes.io/serviceaccount/token"
       - job_name: 'kube-state-metrics'
         metrics_path: /metrics
         scrape_interval: {{ $infraMetricsFrequency }}
diff --git a/tests/kubernetes-distros/kind/lumigooperator_metrics_test.go b/tests/kubernetes-distros/kind/lumigooperator_metrics_test.go
index 28680f2..bd8ea22 100644
--- a/tests/kubernetes-distros/kind/lumigooperator_metrics_test.go
+++ b/tests/kubernetes-distros/kind/lumigooperator_metrics_test.go
@@ -71,24 +71,47 @@ func TestLumigoOperatorInfraMetrics(t *testing.T) {
 			}
 		}
 
-		allMetricNames := strings.Join(uniqueMetricNames, " ")
-		expectedSampleMetrics := []string{
-			// A sample for cadvisor metrics
-			"container_fs_usage_bytes",
-			// A sample for kube-state-metrics metrics
-			"kube_pod_status_scheduled",
-			// A sample for Prometheus Node Exporter metrics
-			"node_cpu_seconds_total",
-		}
+		prometheusNodeExporterMetricsFound := false
+		cadvisorMetricsFound := false
+		kubeStateMetricsFound := false
+
+		for _, metric := range metrics {
+			if metric.Name() == "node_cpu_seconds_total" {
+				prometheusNodeExporterMetricsFound = true
+				for i := 0; i < metric.Sum().DataPoints().Len(); i++ {
+					attributes := metric.Sum().DataPoints().At(i).Attributes()
+					_, nodeAttributeExists := attributes.Get("node")
+					if !nodeAttributeExists {
+						t.Logf("could not find attribute 'node' for metric 'node_cpu_seconds_total'")
+						return false, nil
+					}
+				}
+			}
+
+			if metric.Name() == "container_fs_usage_bytes" {
+				cadvisorMetricsFound = true
+			}
 
-		t.Logf("Collected metrics so far: %v\n", uniqueMetricNames)
-		for _, expectedSampleMetric := range expectedSampleMetrics {
-			if !strings.Contains(allMetricNames, expectedSampleMetric) {
-				t.Logf("could not find %s among collected metrics", expectedSampleMetric)
-				return false, nil
+			if metric.Name() == "kube_pod_status_scheduled" {
+				kubeStateMetricsFound = true
 			}
 		}
 
+		if !prometheusNodeExporterMetricsFound {
+			t.Logf("could not find Prometheus Node Exporter metrics. Seen metrics: %v", uniqueMetricNames)
+			return false, nil
+		}
+
+		if !cadvisorMetricsFound {
+			t.Logf("could not find cAdvisor metrics. Seen metrics: %v", uniqueMetricNames)
+			return false, nil
+		}
+
+		if !kubeStateMetricsFound {
+			t.Logf("could not find kube-state-metrics. Seen metrics: %v", uniqueMetricNames)
+			return false, nil
+		}
+
 		return true, nil
 	}); err != nil {
 		t.Fatalf("Failed to wait for metrics: %v", err)
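Note (illustration, not part of the patch): with the default nodePort of 30090 from values.yaml, and assuming '$$1' survives template rendering as a literal '$1' the way the inline comment above describes, the new prometheus-node-exporter scrape job should render roughly as sketched below. Prometheus then substitutes '$1' with the InternalIP captured from node discovery, so each node is scraped directly at <InternalIP>:30090 instead of through the single ClusterIP service, and every sample carries the node name the kind test asserts on:

    - job_name: 'prometheus-node-exporter'
      kubernetes_sd_configs:
      - role: node                  # discover every cluster node via the Kubernetes API
      relabel_configs:
      - source_labels: [__meta_kubernetes_node_address_InternalIP]
        action: replace
        target_label: __address__
        replacement: '$1:30090'     # scrape target becomes <node InternalIP>:<nodePort>
      - source_labels: [__meta_kubernetes_node_name]
        action: replace
        target_label: node          # the 'node' attribute checked by lumigooperator_metrics_test.go
      metrics_path: "/metrics"
      authorization:
        credentials_file: "/var/run/secrets/kubernetes.io/serviceaccount/token"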