diff --git a/deckard/iaac/gcp/kepler/deployment.yaml b/deckard/iaac/gcp/kepler/deployment.yaml index 57657727..df9240c5 100644 --- a/deckard/iaac/gcp/kepler/deployment.yaml +++ b/deckard/iaac/gcp/kepler/deployment.yaml @@ -9,6 +9,64 @@ metadata: sustainable-computing.io/app: kepler name: kepler --- +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + sustainable-computing.io/app: kepler + name: prometheus-k8s + namespace: kepler +rules: +- apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch +- apiGroups: + - extensions + resources: + - ingresses + verbs: + - get + - list + - watch +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/instance: k8s + app.kubernetes.io/name: prometheus + sustainable-computing.io/app: kepler + name: prometheus-k8s + namespace: kepler +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +--- apiVersion: v1 kind: ServiceAccount metadata: @@ -60,16 +118,17 @@ data: ENABLE_GPU: "true" ENABLE_PROCESS_METRICS: "false" ENABLE_QAT: "false" - EXPOSE_CGROUP_METRICS: "false" + EXPOSE_CGROUP_METRICS: "true" EXPOSE_HW_COUNTER_METRICS: "true" EXPOSE_IRQ_COUNTER_METRICS: "true" - EXPOSE_KUBELET_METRICS: "false" + EXPOSE_KUBELET_METRICS: "true" KEPLER_LOG_LEVEL: "1" KEPLER_NAMESPACE: kepler METRIC_PATH: /metrics MODEL_CONFIG: | CONTAINER_COMPONENTS_ESTIMATOR=false - PROMETHEUS_SCRAPE_INTERVAL: 30s + PROMETHEUS_SCRAPE_INTERVAL: 5s + CONTAINER_COMPONENTS_INIT_URL=https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/main/tests/test_models/DynComponentModelWeight/CgroupOnly/ScikitMixed/ScikitMixed.json REDFISH_PROBE_INTERVAL_IN_SECONDS: "60" REDFISH_SKIP_SSL_VERIFY: "true" kind: ConfigMap @@ -134,8 +193,7 @@ spec: spec: containers: - args: - - /usr/bin/kepler -v=1 -kernel-source-dir=/usr/share/kepler/kernel_sources - -redfish-cred-file-path=/etc/redfish/redfish.csv + - /usr/bin/kepler -v=1 command: - /bin/sh - -c @@ -193,6 +251,10 @@ spec: tolerations: - effect: NoSchedule key: node-role.kubernetes.io/master + - effect: NoSchedule + operator: "Equal" + value: present + key: nvidia.com/gpu volumes: - hostPath: path: /lib/modules @@ -216,3 +278,33 @@ spec: - name: redfish secret: secretName: redfish-4kh9d7bc7m +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kepler-exporter + sustainable-computing.io/app: kepler + name: kepler-exporter + namespace: monitoring +spec: + endpoints: + - interval: 3s + port: http + relabelings: + - action: replace + regex: (.*) + replacement: $1 + sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: instance + scheme: http + jobLabel: app.kubernetes.io/name + namespaceSelector: + matchNames: + - kepler + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kepler-exporter diff --git a/deckard/iaac/gcp/prometheus/alertmanager-serviceMonitor.yaml b/deckard/iaac/gcp/prometheus/alertmanager-serviceMonitor.yaml index b9bb00c6..3b77741e 100644 --- a/deckard/iaac/gcp/prometheus/alertmanager-serviceMonitor.yaml +++ b/deckard/iaac/gcp/prometheus/alertmanager-serviceMonitor.yaml @@ -11,9 +11,9 @@ metadata: namespace: monitoring spec: endpoints: - - interval: 30s + - interval: 5s port: web - - interval: 30s + - interval: 5s port: reloader-web selector: matchLabels: diff --git a/deckard/iaac/gcp/prometheus/blackboxExporter-serviceMonitor.yaml b/deckard/iaac/gcp/prometheus/blackboxExporter-serviceMonitor.yaml index 64c78d27..4cd33df6 100644 --- a/deckard/iaac/gcp/prometheus/blackboxExporter-serviceMonitor.yaml +++ b/deckard/iaac/gcp/prometheus/blackboxExporter-serviceMonitor.yaml @@ -11,7 +11,7 @@ metadata: spec: endpoints: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - interval: 30s + interval: 5s path: /metrics port: https scheme: https diff --git a/deckard/iaac/gcp/prometheus/kubeStateMetrics-serviceMonitor.yaml b/deckard/iaac/gcp/prometheus/kubeStateMetrics-serviceMonitor.yaml index 93ca4bd8..b2acbe66 100644 --- a/deckard/iaac/gcp/prometheus/kubeStateMetrics-serviceMonitor.yaml +++ b/deckard/iaac/gcp/prometheus/kubeStateMetrics-serviceMonitor.yaml @@ -12,7 +12,7 @@ spec: endpoints: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token honorLabels: true - interval: 30s + interval: 5s metricRelabelings: - action: drop regex: kube_endpoint_address_not_ready|kube_endpoint_address_available @@ -27,7 +27,7 @@ spec: tlsConfig: insecureSkipVerify: true - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - interval: 30s + interval: 5s port: https-self scheme: https tlsConfig: diff --git a/deckard/iaac/gcp/prometheus/kubernetesControlPlane-serviceMonitorApiserver.yaml b/deckard/iaac/gcp/prometheus/kubernetesControlPlane-serviceMonitorApiserver.yaml index bfc1f315..0163f3da 100644 --- a/deckard/iaac/gcp/prometheus/kubernetesControlPlane-serviceMonitorApiserver.yaml +++ b/deckard/iaac/gcp/prometheus/kubernetesControlPlane-serviceMonitorApiserver.yaml @@ -9,7 +9,7 @@ metadata: spec: endpoints: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - interval: 30s + interval: 5s metricRelabelings: - action: drop regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) diff --git a/deckard/iaac/gcp/prometheus/kubernetesControlPlane-serviceMonitorKubeControllerManager.yaml b/deckard/iaac/gcp/prometheus/kubernetesControlPlane-serviceMonitorKubeControllerManager.yaml index 1a71e8e4..22f98cb0 100644 --- a/deckard/iaac/gcp/prometheus/kubernetesControlPlane-serviceMonitorKubeControllerManager.yaml +++ b/deckard/iaac/gcp/prometheus/kubernetesControlPlane-serviceMonitorKubeControllerManager.yaml @@ -9,7 +9,7 @@ metadata: spec: endpoints: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - interval: 30s + interval: 5s metricRelabelings: - action: drop regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) diff --git a/deckard/iaac/gcp/prometheus/kubernetesControlPlane-serviceMonitorKubeScheduler.yaml b/deckard/iaac/gcp/prometheus/kubernetesControlPlane-serviceMonitorKubeScheduler.yaml index 7fd84fc3..588e37b9 100644 --- a/deckard/iaac/gcp/prometheus/kubernetesControlPlane-serviceMonitorKubeScheduler.yaml +++ b/deckard/iaac/gcp/prometheus/kubernetesControlPlane-serviceMonitorKubeScheduler.yaml @@ -9,7 +9,7 @@ metadata: spec: endpoints: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - interval: 30s + interval: 5s port: https-metrics scheme: https tlsConfig: diff --git a/deckard/iaac/gcp/prometheus/kubernetesControlPlane-serviceMonitorKubelet.yaml b/deckard/iaac/gcp/prometheus/kubernetesControlPlane-serviceMonitorKubelet.yaml index 96bbdbab..1c60e2c1 100644 --- a/deckard/iaac/gcp/prometheus/kubernetesControlPlane-serviceMonitorKubelet.yaml +++ b/deckard/iaac/gcp/prometheus/kubernetesControlPlane-serviceMonitorKubelet.yaml @@ -10,7 +10,7 @@ spec: endpoints: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token honorLabels: true - interval: 30s + interval: 5s metricRelabelings: - action: drop regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) @@ -56,7 +56,7 @@ spec: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token honorLabels: true honorTimestamps: false - interval: 30s + interval: 5s metricRelabelings: - action: drop regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) @@ -85,7 +85,7 @@ spec: insecureSkipVerify: true - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token honorLabels: true - interval: 30s + interval: 5s path: /metrics/probes port: https-metrics relabelings: diff --git a/deckard/iaac/gcp/prometheus/prometheus-serviceMonitor.yaml b/deckard/iaac/gcp/prometheus/prometheus-serviceMonitor.yaml index 6800a948..957ba404 100644 --- a/deckard/iaac/gcp/prometheus/prometheus-serviceMonitor.yaml +++ b/deckard/iaac/gcp/prometheus/prometheus-serviceMonitor.yaml @@ -11,9 +11,9 @@ metadata: namespace: monitoring spec: endpoints: - - interval: 30s + - interval: 5s port: web - - interval: 30s + - interval: 5s port: reloader-web selector: matchLabels: diff --git a/deckard/iaac/gcp/prometheus/prometheusAdapter-serviceMonitor.yaml b/deckard/iaac/gcp/prometheus/prometheusAdapter-serviceMonitor.yaml index a6e3e035..eb6faed1 100644 --- a/deckard/iaac/gcp/prometheus/prometheusAdapter-serviceMonitor.yaml +++ b/deckard/iaac/gcp/prometheus/prometheusAdapter-serviceMonitor.yaml @@ -11,7 +11,7 @@ metadata: spec: endpoints: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - interval: 30s + interval: 5s metricRelabelings: - action: drop regex: (apiserver_client_certificate_.*|apiserver_envelope_.*|apiserver_flowcontrol_.*|apiserver_storage_.*|apiserver_webhooks_.*|workqueue_.*) diff --git a/deckard/iaac/gcp/prometheus/setup/0prometheusCustomResourceDefinition.yaml b/deckard/iaac/gcp/prometheus/setup/0prometheusCustomResourceDefinition.yaml index 30b57926..bb09666f 100644 --- a/deckard/iaac/gcp/prometheus/setup/0prometheusCustomResourceDefinition.yaml +++ b/deckard/iaac/gcp/prometheus/setup/0prometheusCustomResourceDefinition.yaml @@ -1929,8 +1929,8 @@ spec: format: int64 type: integer evaluationInterval: - default: 30s - description: 'Interval between rule evaluations. Default: "30s"' + default: 5s + description: 'Interval between rule evaluations. Default: "5s"' pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ type: string excludedFromEnforcement: @@ -4038,8 +4038,8 @@ spec: type: object x-kubernetes-map-type: atomic scrapeInterval: - default: 30s - description: "Interval between consecutive scrapes. \n Default: \"30s\"" + default: 5s + description: "Interval between consecutive scrapes. \n Default: \"5s\"" pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ type: string scrapeTimeout: diff --git a/deckard/iaac/gcp/prometheus/setup/0prometheusagentCustomResourceDefinition.yaml b/deckard/iaac/gcp/prometheus/setup/0prometheusagentCustomResourceDefinition.yaml index 013c2fb8..c75effe6 100644 --- a/deckard/iaac/gcp/prometheus/setup/0prometheusagentCustomResourceDefinition.yaml +++ b/deckard/iaac/gcp/prometheus/setup/0prometheusagentCustomResourceDefinition.yaml @@ -3306,8 +3306,8 @@ spec: type: object x-kubernetes-map-type: atomic scrapeInterval: - default: 30s - description: "Interval between consecutive scrapes. \n Default: \"30s\"" + default: 5s + description: "Interval between consecutive scrapes. \n Default: \"5s\"" pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ type: string scrapeTimeout: