Skip to content

Commit

Permalink
Increase PVC size for production (#340)
Browse files (browse the repository at this point in the history)
* Increase PVC size for production
Fix the Alertmanager matcher syntax.
Add namespace to the Prometheus rule aggregations so that the resulting alerts keep their namespace label and can be caught by the alert routing.

* #335 requests and limits should be the same
  • Loading branch information
johnml1135 authored Mar 1, 2024
1 parent 3c548cf commit 45626b3
Show file tree
Hide file tree
Showing 11 changed files with 44 additions and 36 deletions.
4 changes: 2 additions & 2 deletions deploy/mongo/templates/mongo-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ spec:
memory: "2000Mi"
cpu: "1000m"
requests:
memory: "100Mi"
cpu: "100m"
memory: "2000Mi"
cpu: "1000m"
volumeMounts:
- mountPath: /data/db
name: mongo-data
Expand Down
12 changes: 7 additions & 5 deletions deploy/qa-ext-values.yaml
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
externalHost: qa.serval-api.org
environment: Production
deploymentVersion: '1.1.QA2'
deploymentVersion: '1.2.QA1'
alertEmail: [email protected]
emailsToAlert: '[email protected], [email protected]'
emailsToAlert: [email protected]
enableTls: true
namespace: serval
auth0Domain: dev-sillsdev.auth0.com
lokiTenent: serval-tenant
lokiUrl: http://loki-distributed-gateway.loki.svc.cluster.local
servalImage: ghcr.io/sillsdev/serval:1.1.1
machineImage: ghcr.io/sillsdev/machine:3.6.3
ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.1.3
servalImage: ghcr.io/sillsdev/serval:1.2.0
machineImage: ghcr.io/sillsdev/machine:3.7.0
ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.2.0
ClearMLQueue: production
SharedFileLocation: s3://aqua-ml-data/ext-qa/
servalClaimSize: 2Gi
machineClaimSize: 10Gi
enableEcho: true
8 changes: 5 additions & 3 deletions deploy/qa-int-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@ namespace: nlp
auth0Domain: sil-appbuilder.auth0.com
lokiTenent: nlp-tenant
lokiUrl: http://loki-distributed-gateway.loki.svc.cluster.local
servalImage: ghcr.io/sillsdev/serval:1.1.1
machineImage: ghcr.io/sillsdev/machine:3.6.3
ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.1.3
servalImage: ghcr.io/sillsdev/serval:1.2.0
machineImage: ghcr.io/sillsdev/machine:3.7.0
ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.2.0
ClearMLQueue: lambert_24gb
SharedFileLocation: s3://aqua-ml-data/int-qa/
servalClaimSize: 1Gi
machineClaimSize: 2Gi
enableEcho: true
4 changes: 2 additions & 2 deletions deploy/serval-pvc/templates/persistent-volume-claims.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ spec:
- ReadWriteMany
resources:
requests:
storage: 5Gi
storage: {{ .Values.servalClaimSize }}
---
apiVersion: v1
kind: PersistentVolumeClaim
Expand All @@ -22,7 +22,7 @@ spec:
- ReadWriteMany
resources:
requests:
storage: 20Gi
storage: {{ .Values.machineClaimSize }}
---
apiVersion: v1
kind: PersistentVolumeClaim
Expand Down
4 changes: 3 additions & 1 deletion deploy/serval/templates/alert-manager-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,5 +26,7 @@ spec:
repeatInterval: 4h
routes:
- matchers:
- alertname =~ "KubeQuotaAlmostFull|KubeQuotaFullyUsed|InfoInhibitor|Watchdog"
- name: alertname
value: "CPUThrottlingHigh|KubeQuotaAlmostFull|KubeQuotaFullyUsed|InfoInhibitor|Watchdog"
matchType: =~
receiver: 'null'
2 changes: 1 addition & 1 deletion deploy/serval/templates/echo-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ spec:
memory: "100Mi"
cpu: "100m"
requests:
memory: "50Mi"
memory: "100Mi"
cpu: "100m"
volumeMounts:
- mountPath: /var/lib/serval
Expand Down
4 changes: 2 additions & 2 deletions deploy/serval/templates/machine-engine-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ spec:
memory: "2500Mi"
cpu: "500m"
requests:
memory: "500Mi"
cpu: "100m"
memory: "2500Mi"
cpu: "500m"
volumeMounts:
- mountPath: /var/lib/machine
name: machine-mount
Expand Down
4 changes: 2 additions & 2 deletions deploy/serval/templates/machine-job-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ spec:
memory: "2500Mi"
cpu: "1.9"
requests:
memory: "100Mi"
cpu: "100m"
memory: "2500Mi"
cpu: "1.9"
volumeMounts:
- mountPath: /var/lib/machine
name: machine-mount
Expand Down
24 changes: 12 additions & 12 deletions deploy/serval/templates/prometheus-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@ spec:
description: >-
'{{ "{{ $labels.container }}" }} has high CPU for over 3 minutes.'
expr: >-
min (rate (container_cpu_usage_seconds_total {image!="", namespace="{{ .Values.namespace }}", container!~"POD|machine-job" } [3m]))
by (container) / on (container)
min (kube_pod_container_resource_limits{resource="cpu", namespace="{{ .Values.namespace }}", container!~"POD|machine-job"})
by (container) * 100 >= 80
min (rate (container_cpu_usage_seconds_total {image!="", namespace="{{ .Values.namespace }}", container!~"POD|machine-job" } [3m]))
by (container, namespace) / on (container, namespace)
min (kube_pod_container_resource_limits{resource="cpu", namespace="{{ .Values.namespace }}", container!~"POD|machine-job"})
by (container, namespace) * 100 >= 80
for: 0s
labels:
severity: warning
Expand All @@ -26,10 +26,10 @@ spec:
description: >-
'{{ "{{ $labels.container }}" }} has a job running over 3 hours.'
expr: >-
min (rate (container_cpu_usage_seconds_total {image!="", namespace="{{ .Values.namespace }}", container="machine-job" } [3h]))
by (container) / on (container)
min (kube_pod_container_resource_limits{resource="cpu", namespace="{{ .Values.namespace }}", container="machine-job"})
by (container) * 100 >= 80
min (rate (container_cpu_usage_seconds_total {image!="", namespace="{{ .Values.namespace }}", container="machine-job" } [3h]))
by (container, namespace) / on (container, namespace)
min (kube_pod_container_resource_limits{resource="cpu", namespace="{{ .Values.namespace }}", container="machine-job"})
by (container, namespace) * 100 >= 80
for: 0s
labels:
severity: warning
Expand All @@ -41,9 +41,9 @@ spec:
'{{ "{{ $labels.container }}" }} is over 80% memory.'
expr: >-
min (container_memory_working_set_bytes{image!="", namespace="{{ .Values.namespace }}", container!="POD" })
by (container) / on (container)
by (container, namespace) / on (container, namespace)
min (kube_pod_container_resource_limits{resource="memory", namespace="{{ .Values.namespace }}", container!="POD"})
by (container) * 100 >= 80
by (container, namespace) * 100 >= 80
for: 0s
labels:
severity: warning
Expand All @@ -55,9 +55,9 @@ spec:
'{{ "{{ $labels.container }}" }} is over 80% disk space utilization.'
expr: >-
min (kubelet_volume_stats_used_bytes{namespace="{{ .Values.namespace }}"})
by (persistentvolumeclaim) / on (persistentvolumeclaim)
by (persistentvolumeclaim, namespace) / on (persistentvolumeclaim, namespace)
min (kube_persistentvolumeclaim_resource_requests_storage_bytes{namespace="{{ .Values.namespace }}"})
by (persistentvolumeclaim) * 100 > 80
by (persistentvolumeclaim, namespace) * 100 > 80
for: 0s
labels:
severity: warning
4 changes: 2 additions & 2 deletions deploy/serval/templates/serval-api-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ spec:
memory: "400Mi"
cpu: "500m"
requests:
memory: "100Mi"
cpu: "100m"
memory: "400Mi"
cpu: "500m"
volumeMounts:
- mountPath: /var/lib/serval
name: serval-mount
Expand Down
10 changes: 6 additions & 4 deletions deploy/values.yaml
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
externalHost: prod.serval-api.org
environment: Production
deploymentVersion: '1.1.1'
deploymentVersion: '1.2.0'
alertEmail: [email protected]
emailsToAlert: '[email protected], [email protected]'
enableTls: true
namespace: serval
auth0Domain: languagetechnology.auth0.com
lokiTenent: nlp-tenant
lokiUrl: http://loki-distributed-gateway.loki.svc.cluster.local
servalImage: ghcr.io/sillsdev/serval:1.1.1
machineImage: ghcr.io/sillsdev/machine:3.6.3
ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.1.3
servalImage: ghcr.io/sillsdev/serval:1.2.0
machineImage: ghcr.io/sillsdev/machine:3.7.0
ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.2.0
ClearMLQueue: production
SharedFileLocation: s3://aqua-ml-data/production/
servalClaimSize: 5Gi
machineClaimSize: 40Gi
enableEcho: true

0 comments on commit 45626b3

Please sign in to comment.