Skip to content

Commit

Permalink
fix kepler zero values and reduce scrape interval
Browse files (browse the repository at this point in the history)
  • Loading branch information
salehsedghpour committed Nov 27, 2023
1 parent 5fa5b03 commit d48048f
Show file tree
Hide file tree
Showing 12 changed files with 117 additions and 25 deletions.
102 changes: 97 additions & 5 deletions deckard/iaac/gcp/kepler/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,64 @@ metadata:
sustainable-computing.io/app: kepler
name: kepler
---
# Namespaced Role in `kepler` granting read-only access (get/list/watch)
# to services, endpoints, pods and ingresses.
# NOTE: the preceding line already carries the `---` document separator, so
# no second separator is emitted here (a doubled `---` yields an empty
# YAML document).
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: prometheus-k8s
  namespace: kepler
  labels:
    app.kubernetes.io/component: prometheus
    app.kubernetes.io/instance: k8s
    app.kubernetes.io/name: prometheus
    sustainable-computing.io/app: kepler
rules:
  # Core API group (empty string): discovery targets.
  - apiGroups: [""]
    resources: [services, endpoints, pods]
    verbs: [get, list, watch]
  # Ingresses under the `extensions` API group.
  - apiGroups: [extensions]
    resources: [ingresses]
    verbs: [get, list, watch]
  # Ingresses under the `networking.k8s.io` API group.
  - apiGroups: [networking.k8s.io]
    resources: [ingresses]
    verbs: [get, list, watch]
---
# Binds the `prometheus-k8s` Role (namespace `kepler`) to the
# `prometheus-k8s` ServiceAccount located in the `monitoring` namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: prometheus-k8s
  namespace: kepler
  labels:
    app.kubernetes.io/component: prometheus
    app.kubernetes.io/instance: k8s
    app.kubernetes.io/name: prometheus
    sustainable-computing.io/app: kepler
subjects:
  # The subject lives in a different namespace than the binding itself.
  - kind: ServiceAccount
    name: prometheus-k8s
    namespace: monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: prometheus-k8s
---
apiVersion: v1
kind: ServiceAccount
metadata:
Expand Down Expand Up @@ -60,16 +118,17 @@ data:
ENABLE_GPU: "true"
ENABLE_PROCESS_METRICS: "false"
ENABLE_QAT: "false"
EXPOSE_CGROUP_METRICS: "false"
EXPOSE_CGROUP_METRICS: "true"
EXPOSE_HW_COUNTER_METRICS: "true"
EXPOSE_IRQ_COUNTER_METRICS: "true"
EXPOSE_KUBELET_METRICS: "false"
EXPOSE_KUBELET_METRICS: "true"
KEPLER_LOG_LEVEL: "1"
KEPLER_NAMESPACE: kepler
METRIC_PATH: /metrics
MODEL_CONFIG: |
CONTAINER_COMPONENTS_ESTIMATOR=false
PROMETHEUS_SCRAPE_INTERVAL: 30s
PROMETHEUS_SCRAPE_INTERVAL: 5s
CONTAINER_COMPONENTS_INIT_URL=https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/main/tests/test_models/DynComponentModelWeight/CgroupOnly/ScikitMixed/ScikitMixed.json
REDFISH_PROBE_INTERVAL_IN_SECONDS: "60"
REDFISH_SKIP_SSL_VERIFY: "true"
kind: ConfigMap
Expand Down Expand Up @@ -134,8 +193,7 @@ spec:
spec:
containers:
- args:
- /usr/bin/kepler -v=1 -kernel-source-dir=/usr/share/kepler/kernel_sources
-redfish-cred-file-path=/etc/redfish/redfish.csv
- /usr/bin/kepler -v=1
command:
- /bin/sh
- -c
Expand Down Expand Up @@ -193,6 +251,10 @@ spec:
tolerations:
- effect: NoSchedule
key: node-role.kubernetes.io/master
- effect: NoSchedule
operator: "Equal"
value: present
key: nvidia.com/gpu
volumes:
- hostPath:
path: /lib/modules
Expand All @@ -216,3 +278,33 @@ spec:
- name: redfish
secret:
secretName: redfish-4kh9d7bc7m
---
# ServiceMonitor created in `monitoring` that matches the kepler-exporter
# labels inside the `kepler` namespace and scrapes the `http` port over
# plain HTTP every 3s.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kepler-exporter
  namespace: monitoring
  labels:
    app.kubernetes.io/component: exporter
    app.kubernetes.io/name: kepler-exporter
    sustainable-computing.io/app: kepler
spec:
  jobLabel: app.kubernetes.io/name
  namespaceSelector:
    matchNames: [kepler]
  selector:
    matchLabels:
      app.kubernetes.io/component: exporter
      app.kubernetes.io/name: kepler-exporter
  endpoints:
    - port: http
      scheme: http
      interval: 3s
      relabelings:
        # Copy the pod's node name into the `instance` label.
        - action: replace
          sourceLabels: [__meta_kubernetes_pod_node_name]
          regex: '(.*)'
          replacement: '$1'
          targetLabel: instance
4 changes: 2 additions & 2 deletions deckard/iaac/gcp/prometheus/alertmanager-serviceMonitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ metadata:
namespace: monitoring
spec:
endpoints:
- interval: 30s
- interval: 5s
port: web
- interval: 30s
- interval: 5s
port: reloader-web
selector:
matchLabels:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ metadata:
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
interval: 5s
path: /metrics
port: https
scheme: https
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
interval: 5s
metricRelabelings:
- action: drop
regex: kube_endpoint_address_not_ready|kube_endpoint_address_available
Expand All @@ -27,7 +27,7 @@ spec:
tlsConfig:
insecureSkipVerify: true
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
interval: 5s
port: https-self
scheme: https
tlsConfig:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ metadata:
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
interval: 5s
metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ metadata:
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
interval: 5s
metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ metadata:
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
interval: 5s
port: https-metrics
scheme: https
tlsConfig:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
interval: 5s
metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
Expand Down Expand Up @@ -56,7 +56,7 @@ spec:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
honorTimestamps: false
interval: 30s
interval: 5s
metricRelabelings:
- action: drop
regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
Expand Down Expand Up @@ -85,7 +85,7 @@ spec:
insecureSkipVerify: true
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s
interval: 5s
path: /metrics/probes
port: https-metrics
relabelings:
Expand Down
4 changes: 2 additions & 2 deletions deckard/iaac/gcp/prometheus/prometheus-serviceMonitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ metadata:
namespace: monitoring
spec:
endpoints:
- interval: 30s
- interval: 5s
port: web
- interval: 30s
- interval: 5s
port: reloader-web
selector:
matchLabels:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ metadata:
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s
interval: 5s
metricRelabelings:
- action: drop
regex: (apiserver_client_certificate_.*|apiserver_envelope_.*|apiserver_flowcontrol_.*|apiserver_storage_.*|apiserver_webhooks_.*|workqueue_.*)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1929,8 +1929,8 @@ spec:
format: int64
type: integer
evaluationInterval:
default: 30s
description: 'Interval between rule evaluations. Default: "30s"'
default: 5s
description: 'Interval between rule evaluations. Default: "5s"'
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
excludedFromEnforcement:
Expand Down Expand Up @@ -4038,8 +4038,8 @@ spec:
type: object
x-kubernetes-map-type: atomic
scrapeInterval:
default: 30s
description: "Interval between consecutive scrapes. \n Default: \"30s\""
default: 5s
description: "Interval between consecutive scrapes. \n Default: \"5s\""
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
scrapeTimeout:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3306,8 +3306,8 @@ spec:
type: object
x-kubernetes-map-type: atomic
scrapeInterval:
default: 30s
description: "Interval between consecutive scrapes. \n Default: \"30s\""
default: 5s
description: "Interval between consecutive scrapes. \n Default: \"5s\""
pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$
type: string
scrapeTimeout:
Expand Down

0 comments on commit d48048f

Please sign in to comment.