Increase PVC size for production (#340)

* Increase PVC size for production.
* Fix alertmanager syntax.
* Add namespace to prometheus rules to catch the alerting logic.
* #335: requests and limits should be the same.
sillsdev · Mar 1, 2024 · 45626b3
1 parent 3c548cf
commit 45626b3
Show file tree

Hide file tree

Showing 11 changed files with 44 additions and 36 deletions.
diff --git a/deploy/mongo/templates/mongo-deployment.yaml b/deploy/mongo/templates/mongo-deployment.yaml
@@ -30,8 +30,8 @@ spec:
               memory: "2000Mi"
               cpu: "1000m"
             requests:
-              memory: "100Mi"
-              cpu: "100m"
+              memory: "2000Mi"
+              cpu: "1000m"
           volumeMounts:
             - mountPath: /data/db
               name: mongo-data

diff --git a/deploy/qa-ext-values.yaml b/deploy/qa-ext-values.yaml
@@ -1,16 +1,18 @@
 externalHost: qa.serval-api.org
 environment: Production
-deploymentVersion: '1.1.QA2'
+deploymentVersion: '1.2.QA1'
 alertEmail: [email protected]
-emailsToAlert: '[email protected], [email protected]'
+emailsToAlert: [email protected]
 enableTls: true
 namespace: serval
 auth0Domain: dev-sillsdev.auth0.com
 lokiTenent: serval-tenant
 lokiUrl: http://loki-distributed-gateway.loki.svc.cluster.local
-servalImage: ghcr.io/sillsdev/serval:1.1.1
-machineImage: ghcr.io/sillsdev/machine:3.6.3
-ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.1.3
+servalImage: ghcr.io/sillsdev/serval:1.2.0
+machineImage: ghcr.io/sillsdev/machine:3.7.0
+ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.2.0
 ClearMLQueue: production
 SharedFileLocation: s3://aqua-ml-data/ext-qa/
+servalClaimSize: 2Gi
+machineClaimSize: 10Gi
 enableEcho: true
diff --git a/deploy/qa-int-values.yaml b/deploy/qa-int-values.yaml
@@ -8,9 +8,11 @@ namespace: nlp
 auth0Domain: sil-appbuilder.auth0.com
 lokiTenent: nlp-tenant
 lokiUrl: http://loki-distributed-gateway.loki.svc.cluster.local
-servalImage: ghcr.io/sillsdev/serval:1.1.1
-machineImage: ghcr.io/sillsdev/machine:3.6.3
-ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.1.3
+servalImage: ghcr.io/sillsdev/serval:1.2.0
+machineImage: ghcr.io/sillsdev/machine:3.7.0
+ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.2.0
 ClearMLQueue: lambert_24gb
 SharedFileLocation: s3://aqua-ml-data/int-qa/
+servalClaimSize: 1Gi
+machineClaimSize: 2Gi
 enableEcho: true
diff --git a/deploy/serval-pvc/templates/persistent-volume-claims.yaml b/deploy/serval-pvc/templates/persistent-volume-claims.yaml
@@ -9,7 +9,7 @@ spec:
       - ReadWriteMany
   resources:
     requests:
-      storage: 5Gi
+      storage: {{ .Values.servalClaimSize }}
 ---
 apiVersion: v1
 kind: PersistentVolumeClaim
@@ -22,7 +22,7 @@ spec:
       - ReadWriteMany
   resources:
     requests:
-      storage: 20Gi
+      storage: {{ .Values.machineClaimSize }}
 ---
 apiVersion: v1
 kind: PersistentVolumeClaim

diff --git a/deploy/serval/templates/alert-manager-config.yaml b/deploy/serval/templates/alert-manager-config.yaml
@@ -26,5 +26,7 @@ spec:
     repeatInterval: 4h
     routes:
     - matchers:
-        - alertname =~ "KubeQuotaAlmostFull|KubeQuotaFullyUsed|InfoInhibitor|Watchdog"
+        - name: alertname
+          value: "CPUThrottlingHigh|KubeQuotaAlmostFull|KubeQuotaFullyUsed|InfoInhibitor|Watchdog"
+          matchType: =~
       receiver: 'null'
diff --git a/deploy/serval/templates/echo-deployment.yaml b/deploy/serval/templates/echo-deployment.yaml
@@ -42,7 +42,7 @@ spec:
               memory: "100Mi"
               cpu: "100m"
             requests:
-              memory: "50Mi"
+              memory: "100Mi"
               cpu: "100m"
           volumeMounts:
             - mountPath: /var/lib/serval

diff --git a/deploy/serval/templates/machine-engine-deployment.yaml b/deploy/serval/templates/machine-engine-deployment.yaml
@@ -47,8 +47,8 @@ spec:
               memory: "2500Mi"
               cpu: "500m"
             requests:
-              memory: "500Mi"
-              cpu: "100m"
+              memory: "2500Mi"
+              cpu: "500m"
           volumeMounts:
             - mountPath: /var/lib/machine
               name: machine-mount

diff --git a/deploy/serval/templates/machine-job-deployment.yaml b/deploy/serval/templates/machine-job-deployment.yaml
@@ -47,8 +47,8 @@ spec:
               memory: "2500Mi"
               cpu: "1.9"
             requests:
-              memory: "100Mi"
-              cpu: "100m"
+              memory: "2500Mi"
+              cpu: "1.9"
           volumeMounts:
             - mountPath: /var/lib/machine
               name: machine-mount

diff --git a/deploy/serval/templates/prometheus-rules.yaml b/deploy/serval/templates/prometheus-rules.yaml
@@ -12,10 +12,10 @@ spec:
             description: >-
               '{{ "{{ $labels.container }}" }} has high CPU for over 3 minutes.'
           expr: >-
-            min (rate (container_cpu_usage_seconds_total {image!="", namespace="{{ .Values.namespace }}", container!~"POD|machine-job" } [3m])) 
-            by (container) / on (container) 
-            min (kube_pod_container_resource_limits{resource="cpu", namespace="{{ .Values.namespace }}", container!~"POD|machine-job"}) 
-            by (container) * 100 >= 80
+            min (rate (container_cpu_usage_seconds_total {image!="", namespace="{{ .Values.namespace }}", container!~"POD|machine-job" } [3m]))
+            by (container, namespace) / on (container, namespace)
+            min (kube_pod_container_resource_limits{resource="cpu", namespace="{{ .Values.namespace }}", container!~"POD|machine-job"})
+            by (container, namespace) * 100 >= 80
           for: 0s
           labels:
             severity: warning
@@ -26,10 +26,10 @@ spec:
             description: >-
               '{{ "{{ $labels.container }}" }} has a job running over 3 hours.'
           expr: >-
-            min (rate (container_cpu_usage_seconds_total {image!="", namespace="{{ .Values.namespace }}", container="machine-job" } [3h])) 
-            by (container) / on (container) 
-            min (kube_pod_container_resource_limits{resource="cpu", namespace="{{ .Values.namespace }}", container="machine-job"}) 
-            by (container) * 100 >= 80
+            min (rate (container_cpu_usage_seconds_total {image!="", namespace="{{ .Values.namespace }}", container="machine-job" } [3h]))
+            by (container, namespace) / on (container, namespace)
+            min (kube_pod_container_resource_limits{resource="cpu", namespace="{{ .Values.namespace }}", container="machine-job"})
+            by (container, namespace) * 100 >= 80
           for: 0s
           labels:
             severity: warning
@@ -41,9 +41,9 @@ spec:
               '{{ "{{ $labels.container }}" }} is over 80% memory.'
           expr: >-
             min (container_memory_working_set_bytes{image!="", namespace="{{ .Values.namespace }}", container!="POD" })
-            by (container) / on (container)
+            by (container, namespace) / on (container, namespace)
             min (kube_pod_container_resource_limits{resource="memory", namespace="{{ .Values.namespace }}", container!="POD"})
-            by (container) * 100 >= 80
+            by (container, namespace) * 100 >= 80
           for: 0s
           labels:
             severity: warning
@@ -55,9 +55,9 @@ spec:
               '{{ "{{ $labels.container }}" }} is over 80% disk space utilization.'
           expr: >-
             min (kubelet_volume_stats_used_bytes{namespace="{{ .Values.namespace }}"})
-            by (persistentvolumeclaim) / on (persistentvolumeclaim)
+            by (persistentvolumeclaim, namespace) / on (persistentvolumeclaim, namespace)
             min (kube_persistentvolumeclaim_resource_requests_storage_bytes{namespace="{{ .Values.namespace }}"})
-            by (persistentvolumeclaim) * 100 > 80
+            by (persistentvolumeclaim, namespace) * 100 > 80
           for: 0s
           labels:
             severity: warning
diff --git a/deploy/serval/templates/serval-api-deployment.yaml b/deploy/serval/templates/serval-api-deployment.yaml
@@ -62,8 +62,8 @@ spec:
               memory: "400Mi"
               cpu: "500m"
             requests:
-              memory: "100Mi"
-              cpu: "100m"
+              memory: "400Mi"
+              cpu: "500m"
           volumeMounts:
             - mountPath: /var/lib/serval
               name: serval-mount

diff --git a/deploy/values.yaml b/deploy/values.yaml
@@ -1,16 +1,18 @@
 externalHost: prod.serval-api.org
 environment: Production
-deploymentVersion: '1.1.1'
+deploymentVersion: '1.2.0'
 alertEmail: [email protected]
 emailsToAlert: '[email protected], [email protected]'
 enableTls: true
 namespace: serval
 auth0Domain: languagetechnology.auth0.com
 lokiTenent: nlp-tenant
 lokiUrl: http://loki-distributed-gateway.loki.svc.cluster.local
-servalImage: ghcr.io/sillsdev/serval:1.1.1
-machineImage: ghcr.io/sillsdev/machine:3.6.3
-ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.1.3
+servalImage: ghcr.io/sillsdev/serval:1.2.0
+machineImage: ghcr.io/sillsdev/machine:3.7.0
+ClearMLDockerImage: ghcr.io/sillsdev/machine.py:1.2.0
 ClearMLQueue: production
 SharedFileLocation: s3://aqua-ml-data/production/
+servalClaimSize: 5Gi
+machineClaimSize: 40Gi
 enableEcho: true