Skip to content

Commit

Permalink
install kepler and monitoring tools
Browse files Browse the repository at this point in the history
  • Loading branch information
salehsedghpour committed Nov 12, 2023
1 parent aeacf29 commit 0fbc381
Show file tree
Hide file tree
Showing 97 changed files with 73,343 additions and 0 deletions.
8 changes: 8 additions & 0 deletions deckard/iaac/gcp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,14 @@ You can simply take a look at `pod.yaml` file for defining a pod. Just to check,
kubectl apply -f ./IaaC/gcp/pod.yaml
```

## Install Kepler and monitoring tools
Kepler is the component that collects power-consumption metrics per container, namespace, and node and exports them to Prometheus:
```bash
kubectl apply --server-side -f ./IaaC/gcp/prometheus/setup
kubectl apply -f ./IaaC/gcp/prometheus/
kubectl apply -f ./IaaC/gcp/kepler/deployment.yaml
```

## Prepare the access values in the shared volume (optional):
First of all we need to create a vm by running:
```
Expand Down
218 changes: 218 additions & 0 deletions deckard/iaac/gcp/kepler/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
# Namespace that holds the Kepler resources defined in this manifest.
apiVersion: v1
kind: Namespace
metadata:
  name: kepler
  labels:
    sustainable-computing.io/app: kepler
    # Pod Security admission levels: all three modes are set to "privileged".
    # The exporter DaemonSet in this manifest runs a privileged container.
    pod-security.kubernetes.io/enforce: privileged
    pod-security.kubernetes.io/audit: privileged
    pod-security.kubernetes.io/warn: privileged
    # NOTE(review): presumably opts this namespace out of OpenShift's automatic
    # pod-security label synchronisation; only relevant on OpenShift -- confirm.
    security.openshift.io/scc.podSecurityLabelSync: "false"
---
# Service account the Kepler exporter pods run as (see serviceAccountName in
# the DaemonSet) and the subject of the ClusterRoleBinding in this manifest.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: kepler-sa
  namespace: kepler
  labels:
    sustainable-computing.io/app: kepler
---
# Cluster-wide, read-only (get/watch/list) access to pods and to the
# metrics/proxy/stats sub-resources of nodes.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kepler-clusterrole
  labels:
    sustainable-computing.io/app: kepler
rules:
  - apiGroups:
      - ""  # the empty string selects the core API group
    resources:
      - nodes/metrics
      - nodes/proxy
      - nodes/stats
      - pods
    verbs:
      - get
      - watch
      - list
---
# Grants the kepler-clusterrole permissions to the kepler-sa service account
# in the kepler namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: kepler-clusterrole-binding
  labels:
    sustainable-computing.io/app: kepler
subjects:
  - kind: ServiceAccount
    name: kepler-sa
    namespace: kepler
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kepler-clusterrole
---
# Kepler exporter settings. The DaemonSet in this manifest mounts this
# ConfigMap at /etc/kepler/kepler.config (one file per key).
apiVersion: v1
kind: ConfigMap
metadata:
  name: kepler-cfm
  namespace: kepler
  labels:
    sustainable-computing.io/app: kepler
data:
  # ConfigMap values must be strings, hence the quoted booleans and numbers.
  # The port in BIND_ADDRESS matches the containerPort and Service port (9102).
  BIND_ADDRESS: 0.0.0.0:9102
  CGROUP_METRICS: '*'
  CPU_ARCH_OVERRIDE: ""
  ENABLE_EBPF_CGROUPID: "true"
  ENABLE_GPU: "true"
  ENABLE_PROCESS_METRICS: "false"
  ENABLE_QAT: "false"
  EXPOSE_CGROUP_METRICS: "false"
  EXPOSE_HW_COUNTER_METRICS: "true"
  EXPOSE_IRQ_COUNTER_METRICS: "true"
  EXPOSE_KUBELET_METRICS: "false"
  KEPLER_LOG_LEVEL: "1"
  KEPLER_NAMESPACE: kepler
  METRIC_PATH: /metrics
  # Literal block scalar: the value is the single line below plus a newline.
  MODEL_CONFIG: |
    CONTAINER_COMPONENTS_ESTIMATOR=false
  PROMETHEUS_SCRAPE_INTERVAL: 30s
  REDFISH_PROBE_INTERVAL_IN_SECONDS: "60"
  REDFISH_SKIP_SSL_VERIFY: "true"
---
# Redfish credentials file, mounted by the DaemonSet at /etc/redfish.
apiVersion: v1
kind: Secret
type: Opaque
metadata:
  name: redfish-4kh9d7bc7m
  namespace: kepler
  labels:
    sustainable-computing.io/app: kepler
data:
  # Base64 payload split over two lines. It decodes to the placeholder row
  # "your_kubelet_node_name,redfish_username,redfish_password,https://redfish_ip_or_hostname",
  # i.e. no real credentials are stored here.
  redfish.csv: |
    eW91cl9rdWJlbGV0X25vZGVfbmFtZSxyZWRmaXNoX3VzZXJuYW1lLHJlZGZpc2hfcGFzc3
    dvcmQsaHR0cHM6Ly9yZWRmaXNoX2lwX29yX2hvc3RuYW1lCg==
---
# Headless Service (clusterIP: None) in front of the Kepler exporter pods.
apiVersion: v1
kind: Service
metadata:
  name: kepler-exporter
  namespace: kepler
  labels:
    app.kubernetes.io/component: exporter
    app.kubernetes.io/name: kepler-exporter
    sustainable-computing.io/app: kepler
spec:
  clusterIP: None
  # Same label set as the DaemonSet's pod template.
  selector:
    app.kubernetes.io/component: exporter
    app.kubernetes.io/name: kepler-exporter
    sustainable-computing.io/app: kepler
  ports:
    - name: http
      port: 9102
      # Refers to the container port named "http" in the DaemonSet.
      targetPort: http
---
# DaemonSet for the Kepler exporter. The container is privileged, shares the
# host PID namespace, and mounts several host directories (/lib/modules, /sys,
# /proc, /var/run) in addition to the kepler-cfm ConfigMap and the Redfish
# credentials Secret.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: kepler-exporter
  namespace: kepler
  labels:
    sustainable-computing.io/app: kepler
spec:
  selector:
    matchLabels:
      app.kubernetes.io/component: exporter
      app.kubernetes.io/name: kepler-exporter
      sustainable-computing.io/app: kepler
  template:
    metadata:
      # Must stay in sync with spec.selector above and the Service selector.
      labels:
        app.kubernetes.io/component: exporter
        app.kubernetes.io/name: kepler-exporter
        sustainable-computing.io/app: kepler
    spec:
      serviceAccountName: kepler-sa
      dnsPolicy: ClusterFirstWithHostNet
      hostPID: true
      tolerations:
        # Legacy control-plane taint key, kept for older clusters.
        - key: node-role.kubernetes.io/master
          effect: NoSchedule
        # Current control-plane taint key; without this toleration the
        # exporter is not scheduled on control-plane nodes of newer clusters.
        - key: node-role.kubernetes.io/control-plane
          effect: NoSchedule
      containers:
        - name: kepler-exporter
          # NOTE(review): floating ":latest" tag combined with
          # imagePullPolicy: Always means the running version can change on
          # any pod restart; consider pinning a release tag or digest.
          image: quay.io/sustainable_computing_io/kepler:latest
          imagePullPolicy: Always
          # The single args entry is the command line handed to "/bin/sh -c".
          command:
            - /bin/sh
            - -c
          args:
            - >-
              /usr/bin/kepler -v=1 -kernel-source-dir=/usr/share/kepler/kernel_sources
              -redfish-cred-file-path=/etc/redfish/redfish.csv
          env:
            - name: NODE_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.hostIP
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          ports:
            - name: http
              containerPort: 9102
          livenessProbe:
            httpGet:
              path: /healthz
              port: 9102
              scheme: HTTP
            initialDelaySeconds: 10
            periodSeconds: 60
            timeoutSeconds: 10
            successThreshold: 1
            failureThreshold: 5
          resources:
            requests:
              cpu: 100m
              memory: 400Mi
          securityContext:
            privileged: true
          volumeMounts:
            - name: lib-modules
              mountPath: /lib/modules
              readOnly: true
            - name: tracing
              mountPath: /sys
              readOnly: true
            - name: proc
              mountPath: /proc
            - name: var-run
              mountPath: /var/run
            - name: cfm
              mountPath: /etc/kepler/kepler.config
              readOnly: true
            # Directory containing the redfish.csv passed via
            # -redfish-cred-file-path above.
            - name: redfish
              mountPath: /etc/redfish
              readOnly: true
      volumes:
        - name: lib-modules
          hostPath:
            path: /lib/modules
            type: Directory
        - name: tracing
          hostPath:
            path: /sys
            type: Directory
        - name: proc
          hostPath:
            path: /proc
            type: Directory
        - name: var-run
          hostPath:
            path: /var/run
            type: Directory
        - name: cfm
          configMap:
            name: kepler-cfm
        - name: redfish
          secret:
            secretName: redfish-4kh9d7bc7m
37 changes: 37 additions & 0 deletions deckard/iaac/gcp/prometheus/alertmanager-alertmanager.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# Alertmanager custom resource (monitoring.coreos.com/v1) named "main" in the
# monitoring namespace: three replicas of Alertmanager v0.26.0.
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
  name: main
  namespace: monitoring
  labels:
    app.kubernetes.io/component: alert-router
    app.kubernetes.io/instance: main
    app.kubernetes.io/name: alertmanager
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: "0.26.0"
spec:
  # Version strings are quoted so they can never be read as numbers.
  version: "0.26.0"
  image: quay.io/prometheus/alertmanager:v0.26.0
  replicas: 3
  serviceAccountName: alertmanager-main
  nodeSelector:
    kubernetes.io/os: linux
  podMetadata:
    labels:
      app.kubernetes.io/component: alert-router
      app.kubernetes.io/instance: main
      app.kubernetes.io/name: alertmanager
      app.kubernetes.io/part-of: kube-prometheus
      app.kubernetes.io/version: "0.26.0"
  resources:
    requests:
      cpu: 4m
      memory: 100Mi
    limits:
      cpu: 100m
      memory: 100Mi
  secrets: []
  securityContext:
    fsGroup: 2000
    runAsNonRoot: true
    runAsUser: 1000
42 changes: 42 additions & 0 deletions deckard/iaac/gcp/prometheus/alertmanager-networkPolicy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Network policy for the Alertmanager pods: egress is unrestricted, ingress is
# limited to the two peer/port groups listed below.
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: alertmanager-main
  namespace: monitoring
  labels:
    app.kubernetes.io/component: alert-router
    app.kubernetes.io/instance: main
    app.kubernetes.io/name: alertmanager
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 0.26.0
spec:
  # Pods this policy applies to.
  podSelector:
    matchLabels:
      app.kubernetes.io/component: alert-router
      app.kubernetes.io/instance: main
      app.kubernetes.io/name: alertmanager
      app.kubernetes.io/part-of: kube-prometheus
  policyTypes:
    - Egress
    - Ingress
  # A single empty rule allows all outbound traffic.
  egress:
    - {}
  ingress:
    # Pods labelled as prometheus may reach TCP 9093 and 8080.
    - from:
        - podSelector:
            matchLabels:
              app.kubernetes.io/name: prometheus
      ports:
        - port: 9093
          protocol: TCP
        - port: 8080
          protocol: TCP
    # Pods labelled as alertmanager may reach port 9094 over TCP and UDP.
    - from:
        - podSelector:
            matchLabels:
              app.kubernetes.io/name: alertmanager
      ports:
        - port: 9094
          protocol: TCP
        - port: 9094
          protocol: UDP
19 changes: 19 additions & 0 deletions deckard/iaac/gcp/prometheus/alertmanager-podDisruptionBudget.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Disruption budget for the Alertmanager pods: at most one selected pod may
# be unavailable at a time.
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: alertmanager-main
  namespace: monitoring
  labels:
    app.kubernetes.io/component: alert-router
    app.kubernetes.io/instance: main
    app.kubernetes.io/name: alertmanager
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 0.26.0
spec:
  maxUnavailable: 1
  selector:
    matchLabels:
      app.kubernetes.io/component: alert-router
      app.kubernetes.io/instance: main
      app.kubernetes.io/name: alertmanager
      app.kubernetes.io/part-of: kube-prometheus
Loading

0 comments on commit 0fbc381

Please sign in to comment.