Skip to content

Commit

Permalink
Use specific label for selecting monitors and rules
Browse files Browse the repository at this point in the history
  • Loading branch information
jeff-french committed Jan 10, 2024
1 parent b9aa5a9 commit b994e84
Show file tree
Hide file tree
Showing 2 changed files with 140 additions and 54 deletions.
2 changes: 1 addition & 1 deletion charts/moonswitch-agent/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ icon: https://static.moonswitch.com/logos/color/icon.svg
sources:
- https://github.com/moonswitch/charts

version: 0.10.0
version: 0.10.1

dependencies:
- name: teleport-kube-agent
Expand Down
192 changes: 139 additions & 53 deletions charts/moonswitch-agent/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ teleport-kube-agent:
enabled: true
minAvailable: 1
roles: kube
podMonitor:
enabled: true
additionalLabels:
monitored-by: moonswitch-agent
joinParams:
method: "token"
tokenName:
Expand Down Expand Up @@ -41,22 +45,33 @@ teleport-kube-agent:

cloudflare-tunnel-remote:
enabled: true
# TODO: Write our own ServiceMonitor for this endpoint (:2000/metrics)

kube-prometheus-stack:
enabled: true
commonLabels:
"moonswitch.io/app": moonswitch-agent

alertmanager:
serviceMonitor:
additionalLabels:
monitored-by: moonswitch-agent
alertmanagerSpec:
storage:
volumeClaimTemplate:
spec:
resources:
requests:
storage: 40Gi
alertmanagerConfigSelector:
matchLabels:
monitored-by: moonswitch-agent


grafana:
serviceMonitor:
labels:
monitored-by: moonswitch-agent
defaultDashboardsTimezone: browser
persistence:
enabled: true
Expand Down Expand Up @@ -84,34 +99,81 @@ kube-prometheus-stack:

cleanPrometheusOperatorObjectNames: true

defaultRules:
labels:
monitored-by: moonswitch-agent

kubeApiServer:
serviceMonitor:
additionalLabels:
monitored-by: moonswitch-agent

kubelet:
serviceMonitor:
additionalLabels:
monitored-by: moonswitch-agent

kubeControllerManager:
serviceMonitor:
additionalLabels:
monitored-by: moonswitch-agent

coreDns:
serviceMonitor:
additionalLabels:
monitored-by: moonswitch-agent

kubeDns:
serviceMonitor:
additionalLabels:
monitored-by: moonswitch-agent

kubeEtcd:
serviceMonitor:
additionalLabels:
monitored-by: moonswitch-agent

kubeScheduler:
serviceMonitor:
additionalLabels:
monitored-by: moonswitch-agent

kubeProxy:
serviceMonitor:
additionalLabels:
monitored-by: moonswitch-agent

prometheusOperator:
serviceMonitor:
additionalLabels:
monitored-by: moonswitch-agent

prometheus:
serviceMonitor:
additionalLabels:
monitored-by: moonswitch-agent
prometheusSpec:
storageSpec:
volumeClaimTemplate:
spec:
resources:
requests:
storage: 100Gi
additionalScrapeConfigs:
- job_name: kubecost
honor_labels: true
scrape_interval: 1m
scrape_timeout: 60s
metrics_path: /metrics
scheme: http
dns_sd_configs:
- names:
- moonswitch-agent-cost-analyzer
type: 'A'
port: 9003
- job_name: kubecost-networking
kubernetes_sd_configs:
- role: pod
relabel_configs:
# These relabel rules will need to be updated when we upgrade the chart version past 1.106.x
- source_labels: [__meta_kubernetes_pod_label_app]
action: keep
regex: moonswitch-agent-network-costs
ruleSelector:
matchLabels:
monitored-by: moonswitch-agent
serviceMonitorSelector:
matchLabels:
monitored-by: moonswitch-agent
podMonitorSelector:
matchLabels:
monitored-by: moonswitch-agent
probeSelector:
matchLabels:
monitored-by: moonswitch-agent
scrapeConfigSelector:
matchLabels:
monitored-by: moonswitch-agent

prometheus-node-exporter:
affinity:
Expand All @@ -124,39 +186,11 @@ kube-prometheus-stack:
values:
- fargate

additionalPrometheusRulesMap:
kubecost:
groups:
- name: CPU
rules:
- expr: sum(rate(container_cpu_usage_seconds_total{container!=""}[5m]))
record: cluster:cpu_usage:rate5m
- expr: rate(container_cpu_usage_seconds_total{container!=""}[5m])
record: cluster:cpu_usage_nosum:rate5m
- expr: avg(irate(container_cpu_usage_seconds_total{container!="POD", container!=""}[5m])) by (container,pod,namespace)
record: kubecost_container_cpu_usage_irate
- expr: sum(container_memory_working_set_bytes{container!="POD",container!=""}) by (container,pod,namespace)
record: kubecost_container_memory_working_set_bytes
- expr: sum(container_memory_working_set_bytes{container!="POD",container!=""})
record: kubecost_cluster_memory_working_set_bytes
- name: Savings
rules:
- expr: sum(avg(kube_pod_owner{owner_kind!="DaemonSet"}) by (pod) * sum(container_cpu_allocation) by (pod))
record: kubecost_savings_cpu_allocation
labels:
daemonset: "false"
- expr: sum(avg(kube_pod_owner{owner_kind="DaemonSet"}) by (pod) * sum(container_cpu_allocation) by (pod)) / sum(kube_node_info)
record: kubecost_savings_cpu_allocation
labels:
daemonset: "true"
- expr: sum(avg(kube_pod_owner{owner_kind!="DaemonSet"}) by (pod) * sum(container_memory_allocation_bytes) by (pod))
record: kubecost_savings_memory_allocation_bytes
labels:
daemonset: "false"
- expr: sum(avg(kube_pod_owner{owner_kind="DaemonSet"}) by (pod) * sum(container_memory_allocation_bytes) by (pod)) / sum(kube_node_info)
record: kubecost_savings_memory_allocation_bytes
labels:
daemonset: "true"
kube-state-metrics:
prometheus:
monitor:
additionalLabels:
monitored-by: moonswitch-agent

weave-gitops:
enabled: true
Expand All @@ -165,6 +199,17 @@ weave-gitops:
"moonswitch.io/app": moonswitch-agent
additionalArgs:
- --insecure-no-authentication-user=gitops-dashboard-user
metrics:
enabled: true
annotations: {}
# TODO: Write our own ServiceMonitor for this :2112/metrics
resources:
requests:
cpu: 10m
memory: 100M
limits:
cpu: 10m
memory: 100M

cost-analyzer:
enabled: true
Expand Down Expand Up @@ -208,12 +253,26 @@ cost-analyzer:
enabled: false
kube-state-metrics:
disabled: true
serviceMonitor:
enabled: true
additionalLabels:
monitored-by: moonswitch-agent
networkCosts:
enabled: true
additionalLabels:
monitored-by: moonswitch-agent
prometheusRule:
enabled: true
additionalLabels:
monitored-by: moonswitch-agent
kubecostMetrics:
emitKsmV1Metrics: false
emitKsmV1MetricsOnly: true
exporter:
serviceMonitor:
enabled: true
additionalLabels:
monitored-by: moonswitch-agent
kubecostModel:
image: "public.ecr.aws/kubecost/cost-model"
outOfClusterPromMetricsEnabled: false
Expand All @@ -234,6 +293,10 @@ cost-analyzer:
config:
services:
amazon-web-services: true
podMonitor:
enabled: true
additionalLabels:
monitored-by: moonswitch-agent

nginx:
enabled: true
Expand All @@ -244,6 +307,12 @@ nginx:
staticSiteConfigmap: nginx-static-site
service:
type: ClusterIP
metrics:
enabled: true
serviceMonitor:
enabled: true
labels:
monitored-by: moonswitch-agent

loki:
enabled: true
Expand All @@ -269,6 +338,13 @@ loki:
installOperator: false
serviceMonitor:
enabled: true
labels:
monitored-by: moonswitch-agent
metricsInstance:
enabled: false
rules:
labels:
monitored-by: moonswitch-agent

promtail:
enabled: true
Expand All @@ -285,8 +361,12 @@ promtail:
expression: "(default)" # Use this to drop logs from client app namespaces, e.g. "(default|client-app1|cool-app-namespace)"
serviceMonitor:
enabled: true
labels:
monitored-by: moonswitch-agent
prometheusRule:
enabled: true
additionalLabels:
monitored-by: moonswitch-agent
rules:
- alert: PromtailRequestErrors
expr: 100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10
Expand Down Expand Up @@ -360,6 +440,10 @@ kubernetes-dashboard:
"moonswitch.io/app": moonswitch-agent
serviceAccount:
name: moonswitch-agent-kubernetes-dashboard
serviceMonitor:
enabled: true
labels:
monitored-by: moonswitch-agent

helm-dashboard:
enabled: true
Expand All @@ -374,6 +458,8 @@ trivy-operator:
builtInTrivyServer: true
serviceMonitor:
enabled: true
labels:
monitored-by: moonswitch-agent
trivy:
resources:
requests:
Expand Down

0 comments on commit b994e84

Please sign in to comment.