diff --git a/charts/retool/ci/telemetry-enabled-values.yaml b/charts/retool/ci/telemetry-enabled-values.yaml
new file mode 100644
index 00000000..8f4e1298
--- /dev/null
+++ b/charts/retool/ci/telemetry-enabled-values.yaml
@@ -0,0 +1,66 @@
+# ================================================
+# === boilerplate to get a bootable deployment ===
+# ================================================
+config:
+  encryptionKey: '000000000'
+  jwtSecret: '000000000'
+
+  licenseKey: "EXPIRED-LICENSE-KEY-TRIAL"
+
+image:
+  tag: 'latest'
+
+resources:
+  requests:
+    cpu: 500m
+    memory: 1024Mi
+  limits:
+    cpu: 1000m
+    memory: 4096Mi
+
+workflows:
+  resources:
+    requests:
+      cpu: 500m
+      memory: 1024Mi
+    limits:
+      cpu: 1000m
+      memory: 2048Mi
+
+env:
+  NUM_WORKERS: '1'
+  # NODE_DEBUG: 'module'
+
+ingress:
+  kubernetes.io/ingress.class: nginx
+  hosts:
+    - host: retool.example.com
+      paths:
+        - path: /
+
+livenessProbe:
+  timeoutSeconds: 60
+  periodSeconds: 2
+  failureThreshold: 60
+readinessProbe:
+  timeoutSeconds: 30
+  periodSeconds: 5
+
+replicaCount: 1
+
+persistentVolumeClaim:
+  size: '3Gi'
+# ================================================
+
+# === New telemetry stuff ===
+telemetry:
+  enabled: true
+  image:
+    repository: 'tryretool/telemetry'
+    # tag:
+  sendToRetool:
+    enabled: true
+  customVectorConfig:
+    foo: bar
+  customGrafanaAgentConfig: |
+    // wee woo custom text
diff --git a/charts/retool/templates/NOTES.txt b/charts/retool/templates/NOTES.txt
index 26f209b8..51ad134a 100644
--- a/charts/retool/templates/NOTES.txt
+++ b/charts/retool/templates/NOTES.txt
@@ -22,6 +22,7 @@
 {{- end }}
 
 {{- if and .Values.postgresql.enabled (and (not .Values.postgresql.auth.postgresPassword) (eq .Values.postgresql.auth.username "postgres")) }}
+
 ***************************************************************************
 Warning: Using in-cluster postgresql setup.
 `.Values.postgresql.auth.username username` is set to the default admin username "postgres", but the admin password field `.Values.postgresql.auth.postgresPassword` is not set, so a random password is generated and used. This wouldn't affect your usage, but if you choose to uninstall and reinstall this helm chart, please make sure you remove the existing PersistentVolumeClaim backing the in-cluster postgresql by running: kubectl --namespace {{ .Release.Namespace }} delete pvc/data-{{ include "retool.fullname" . }}-postgresql-0
diff --git a/charts/retool/templates/_telemetry_helpers.tpl b/charts/retool/templates/_telemetry_helpers.tpl
new file mode 100644
index 00000000..b14b4d2f
--- /dev/null
+++ b/charts/retool/templates/_telemetry_helpers.tpl
@@ -0,0 +1,74 @@
+{{/*
+Reusable name for telemetry-related chart resources.
+*/}}
+{{- define "retool.telemetry.fullname" -}}
+{{- $name := default "telemetry" .Values.telemetry.nameOverride -}}
+{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
+{{- end -}}
+
+{{/*
+Labels to include on telemetry pods.
+*/}}
+{{- define "retool.telemetry.labels" -}}
+helm.sh/chart: {{ include "retool.chart" . }}
+{{ include "retool.telemetry.selectorLabels" . }}
+app.kubernetes.io/version: {{ .Chart.Version | quote }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end -}}
+
+{{/*
+Labels to use as selector for telemetry pods and deployment. Note these become
+immutable once deployed, so changes here will require recreating the deployment.
+*/}}
+{{- define "retool.telemetry.selectorLabels" -}}
+app.kubernetes.io/name: {{ default "telemetry" .Values.telemetry.nameOverride }}
+app.kubernetes.io/instance: {{ .Release.Name }}
+{{- end -}}
+
+
+{{/*
+The name of the service account to use.
+*/}}
+{{- define "retool.telemetry.serviceAccountName" -}}
+{{- if .Values.telemetry.serviceAccount.create }}
+{{- default (include "retool.telemetry.fullname" .) .Values.telemetry.serviceAccount.name }}
+{{- else }}
+{{- default "default" .Values.telemetry.serviceAccount.name }}
+{{- end }}
+{{- end }}
+
+
+{{/*
+The labels to use for scoping log collection to only the pods in the same
+release, as a single comma-separated string. The label(s) below should be
+present on all relevant pods, or else logs won't be collected. */}}
+{{- define "retool.telemetry.logSourcePodLabels" -}}
+app.kubernetes.io/instance={{ .Release.Name }}
+{{- end }}
+
+
+{{/*
+Env vars to include on retool pods to collect telemetry via telemetry pod.
+*/}}
+{{- define "retool.telemetry.includeEnvVars" -}}
+{{- if .Values.telemetry.enabled }}
+- name: RTEL_ENABLED
+  value: 'true'
+- name: RTEL_SERVICE_NAME
+  valueFrom:
+    fieldRef:
+      fieldPath: metadata.labels['telemetry.retool.com/service-name']
+- name: K8S_POD_NAME
+  valueFrom:
+    fieldRef:
+      fieldPath: metadata.name
+- name: K8S_NODE_NAME
+  valueFrom:
+    fieldRef:
+      fieldPath: spec.nodeName
+- name: STATSD_HOST
+  value: {{ printf "%s.%s" (include "retool.telemetry.fullname" .) .Release.Namespace | quote }}
+- name: STATSD_PORT
+  value: "9125"
+{{- end }}
+{{- end }}
diff --git a/charts/retool/templates/deployment_backend.yaml b/charts/retool/templates/deployment_backend.yaml
index 6c965e5a..04d9d454 100644
--- a/charts/retool/templates/deployment_backend.yaml
+++ b/charts/retool/templates/deployment_backend.yaml
@@ -141,6 +141,9 @@ spec:
           - name: CODE_EXECUTOR_INGRESS_DOMAIN
             value: http://{{ template "retool.codeExecutor.name" . }}
           {{- end }}
+
+          {{- include "retool.telemetry.includeEnvVars" . | nindent 10 }}
+
           {{- if and (not .Values.externalSecrets.enabled) (not .Values.externalSecrets.externalSecretsOperator.enabled) }}
           - name: LICENSE_KEY
             valueFrom:
@@ -245,6 +248,7 @@ spec:
             port: {{ .Values.service.internalPort }}
           initialDelaySeconds: {{ .Values.livenessProbe.initialDelaySeconds }}
           timeoutSeconds: {{ .Values.livenessProbe.timeoutSeconds }}
+          periodSeconds: {{ .Values.livenessProbe.periodSeconds }}
           failureThreshold: {{ .Values.livenessProbe.failureThreshold }}
         {{- end }}
         {{- if .Values.readinessProbe.enabled }}
diff --git a/charts/retool/templates/deployment_code_executor.yaml b/charts/retool/templates/deployment_code_executor.yaml
index 916f58f8..0c4b7d31 100644
--- a/charts/retool/templates/deployment_code_executor.yaml
+++ b/charts/retool/templates/deployment_code_executor.yaml
@@ -69,6 +69,9 @@ spec:
             value: production
           - name: NODE_OPTIONS
             value: {{(.Values.codeExecutor.config).nodeOptions | default "--max_old_space_size=1024" }}
+
+          {{- include "retool.telemetry.includeEnvVars" . | nindent 10 }}
+
           {{- range $key, $value := .Values.env }}
           - name: "{{ $key }}"
             value: "{{ $value }}"
diff --git a/charts/retool/templates/deployment_jobs.yaml b/charts/retool/templates/deployment_jobs.yaml
index fd06e21c..accbfc7f 100644
--- a/charts/retool/templates/deployment_jobs.yaml
+++ b/charts/retool/templates/deployment_jobs.yaml
@@ -82,6 +82,9 @@ spec:
             value: {{ template "retool.postgresql.user" . }}
           - name: POSTGRES_SSL_ENABLED
             value: {{ template "retool.postgresql.ssl_enabled" . }}
+
+          {{- include "retool.telemetry.includeEnvVars" . | nindent 10 }}
+
           {{- if and (not .Values.externalSecrets.enabled) (not .Values.externalSecrets.externalSecretsOperator.enabled) }}
           - name: LICENSE_KEY
             valueFrom:
diff --git a/charts/retool/templates/deployment_workflows.yaml b/charts/retool/templates/deployment_workflows.yaml
index 2a8a0ed0..f2d1cc7d 100644
--- a/charts/retool/templates/deployment_workflows.yaml
+++ b/charts/retool/templates/deployment_workflows.yaml
@@ -110,6 +110,9 @@ spec:
           {{- end }}
           {{- end }}
           {{- end }}
+
+          {{- include "retool.telemetry.includeEnvVars" . | nindent 10 }}
+
           - name: CLIENT_ID
             value: {{ default "" .Values.config.auth.google.clientId }}
           - name: COOKIE_INSECURE
diff --git a/charts/retool/templates/deployment_workflows_worker.yaml b/charts/retool/templates/deployment_workflows_worker.yaml
index 7ed9fd69..6de64518 100644
--- a/charts/retool/templates/deployment_workflows_worker.yaml
+++ b/charts/retool/templates/deployment_workflows_worker.yaml
@@ -135,6 +135,9 @@ spec:
           - name: CODE_EXECUTOR_INGRESS_DOMAIN
             value: http://{{ template "retool.codeExecutor.name" . }}
           {{- end }}
+
+          {{- include "retool.telemetry.includeEnvVars" . | nindent 10 }}
+
           {{- if and (((.Values.workflows.config).otelCollector).enabled) (((.Values.workflows.config).otelCollector).endpoint) }}
           - name: OTEL_EXPORTER_OTLP_ENDPOINT
             value: {{ ((.Values.workflows.config).otelCollector).endpoint }}
diff --git a/charts/retool/templates/telemetry_configmap.yaml b/charts/retool/templates/telemetry_configmap.yaml
new file mode 100644
index 00000000..41acb096
--- /dev/null
+++ b/charts/retool/templates/telemetry_configmap.yaml
@@ -0,0 +1,22 @@
+{{- if .Values.telemetry.enabled }}
+
+{{- if or .Values.telemetry.customVectorConfig .Values.telemetry.customGrafanaAgentConfig }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "retool.fullname" . }}-telemetry
+  labels:
+    {{- include "retool.telemetry.labels" . | nindent 4 }}
+data:
+  {{- if .Values.telemetry.customVectorConfig }}
+  vector-custom.yaml: |
+    {{- tpl (toYaml .Values.telemetry.customVectorConfig) . | nindent 4 }}
+  {{- end }}
+  {{- if .Values.telemetry.customGrafanaAgentConfig }}
+  grafana-agent-custom.river: |
+    {{- .Values.telemetry.customGrafanaAgentConfig | nindent 4 }}
+  {{- end }}
+
+{{- end }}
+
+{{- end }}
diff --git a/charts/retool/templates/telemetry_deployment.yaml b/charts/retool/templates/telemetry_deployment.yaml
new file mode 100644
index 00000000..eb894d70
--- /dev/null
+++ b/charts/retool/templates/telemetry_deployment.yaml
@@ -0,0 +1,166 @@
+{{- if .Values.telemetry.enabled }}
+{{/*
+Telemetry collector: a Deployment running the retool-telemetry container, plus
+the ClusterIP Service that the other retool pods send StatsD metrics to.
+*/}}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "retool.fullname" . }}-telemetry
+  labels:
+    {{- include "retool.telemetry.labels" . | nindent 4 }}
+spec:
+  selector:
+    matchLabels:
+      {{- include "retool.telemetry.selectorLabels" . | nindent 6 }}
+  template:
+    metadata:
+      labels:
+        {{- include "retool.telemetry.selectorLabels" . | nindent 8 }}
+      {{- if or .Values.telemetry.customVectorConfig .Values.telemetry.customGrafanaAgentConfig }}
+      annotations:
+        # Hash of the rendered custom-config ConfigMap; a config change alters the pod template.
+        checksum/custom-config: {{ include (print $.Template.BasePath "/telemetry_configmap.yaml") . | sha256sum | quote }}
+      {{- end }}
+    spec:
+      serviceAccountName: {{ include "retool.telemetry.serviceAccountName" . }}
+      containers:
+        - name: telemetry
+          image: '{{ .Values.telemetry.image.repository }}:{{ .Values.telemetry.image.tag | default .Values.image.tag }}'
+          imagePullPolicy: {{ .Values.telemetry.image.pullPolicy }}
+          command: ['retool-telemetry']
+          resources:
+            {{- toYaml .Values.telemetry.resources | nindent 12 }}
+          env:
+            - name: VECTOR_SELF_NODE_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: spec.nodeName
+            - name: VECTOR_SELF_POD_NAME
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.name
+            - name: VECTOR_SELF_POD_NAMESPACE
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.namespace
+            - name: PROCFS_ROOT
+              value: "/host/proc"
+            - name: SYSFS_ROOT
+              value: "/host/sys"
+            {{- if and (not .Values.externalSecrets.enabled) (not .Values.externalSecrets.externalSecretsOperator.enabled) }}
+            - name: LICENSE_KEY
+              valueFrom:
+                secretKeyRef:
+                  {{- if .Values.config.licenseKeySecretName }}
+                  name: {{ .Values.config.licenseKeySecretName }}
+                  key: {{ .Values.config.licenseKeySecretKey | default "license-key" }}
+                  {{- else }}
+                  name: {{ template "retool.fullname" . }}
+                  key: license-key
+                  {{- end }}
+            {{- end }}
+            - name: RTEL_DEPLOYMENT_MODE
+              value: 'kubernetes-helm'
+            - name: RTEL_HELM_RELEASE_NAME
+              value: {{ .Release.Name | quote }}
+            - name: RTEL_KUBE_POD_PREFIX
+              value: {{ include "retool.fullname" . | quote }}
+            - name: RTEL_KUBE_NAMESPACE
+              value: {{ .Release.Namespace | quote }}
+            - name: RTEL_KUBE_LABEL_SELECTOR
+              value: {{ include "retool.telemetry.logSourcePodLabels" . | quote }}
+            - name: RTEL_SEND_TO_RETOOL
+              value: {{ .Values.telemetry.sendToRetool.enabled | quote }}
+          {{- /* One envFrom list for both secret sources; two envFrom keys would be a duplicate mapping key. */}}
+          {{- if or .Values.externalSecrets.enabled .Values.externalSecrets.externalSecretsOperator.enabled }}
+          envFrom:
+            {{- if .Values.externalSecrets.enabled }}
+            - secretRef:
+                name: {{ .Values.externalSecrets.name }}
+            {{- end }}
+            {{- if .Values.externalSecrets.externalSecretsOperator.enabled }}
+            {{- range .Values.externalSecrets.externalSecretsOperator.secretRef }}
+            - secretRef:
+                name: {{ .name }}
+            {{- end }}
+            {{- end }}
+          {{- end }}
+          volumeMounts:
+            {{- /* Mount at the custom vector `data_dir` when one is set; a null customVectorConfig falls back to the default. */}}
+            - name: vector-data
+              mountPath: {{ dig "data_dir" "/vector-data" (.Values.telemetry.customVectorConfig | default dict) | quote }}
+            - name: grafana-agent-data
+              mountPath: /grafana-agent-data
+            {{- if .Values.telemetry.customVectorConfig }}
+            - name: custom-config
+              mountPath: "/etc/vector-custom/vector-custom.yaml"
+              subPath: "vector-custom.yaml"
+              readOnly: true
+            {{- end }}
+            {{- if .Values.telemetry.customGrafanaAgentConfig }}
+            - name: custom-config
+              mountPath: "/etc/grafana-agent-custom/grafana-agent-custom.river"
+              subPath: "grafana-agent-custom.river"
+              readOnly: true
+            {{- end }}
+            - name: var-log
+              mountPath: "/var/log/"
+              readOnly: true
+            - name: var-lib
+              mountPath: "/var/lib"
+              readOnly: true
+            - name: procfs
+              mountPath: "/host/proc"
+              readOnly: true
+            - name: sysfs
+              mountPath: "/host/sys"
+              readOnly: true
+          ports:
+            - containerPort: 9125
+              # hostPort: 9125
+              name: statsd-udp
+              protocol: UDP
+      volumes:
+        - name: vector-data
+          emptyDir:
+            sizeLimit: 100Mi
+        - name: grafana-agent-data
+          emptyDir:
+            sizeLimit: 100Mi
+        {{- if or .Values.telemetry.customVectorConfig .Values.telemetry.customGrafanaAgentConfig }}
+        - name: custom-config
+          configMap:
+            name: {{ include "retool.fullname" . }}-telemetry
+        {{- end }}
+        - name: var-log
+          hostPath:
+            path: "/var/log/"
+        - name: var-lib
+          hostPath:
+            path: "/var/lib/"
+        - name: procfs
+          hostPath:
+            path: "/proc"
+        - name: sysfs
+          hostPath:
+            path: "/sys"
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "retool.telemetry.fullname" . }}
+  labels:
+    {{- include "retool.telemetry.labels" . | nindent 4 }}
+spec:
+  ports:
+    - name: statsd-udp
+      port: 9125
+      protocol: UDP
+  selector:
+    {{- include "retool.telemetry.selectorLabels" . | nindent 4 }}
+  type: ClusterIP
+
+
+{{- end }}
diff --git a/charts/retool/templates/telemetry_rbac.yaml b/charts/retool/templates/telemetry_rbac.yaml
new file mode 100644
index 00000000..135ed529
--- /dev/null
+++ b/charts/retool/templates/telemetry_rbac.yaml
@@ -0,0 +1,74 @@
+{{- if .Values.telemetry.enabled }}
+
+{{- if .Values.telemetry.serviceAccount.create }}
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ include "retool.telemetry.serviceAccountName" . | quote }}
+  labels:
+    {{- include "retool.telemetry.labels" . | nindent 4 }}
+  {{- with .Values.telemetry.serviceAccount.annotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+automountServiceAccountToken: {{ .Values.telemetry.serviceAccount.automountToken }}
+{{- end }}
+
+
+{{- if .Values.telemetry.rbac.create }}
+---
+{{/*
+Permissions to use Kubernetes API.
+Requires that RBAC authorization is enabled.
+*/}}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "retool.telemetry.fullname" . }}
+  labels:
+    {{- include "retool.telemetry.labels" . | nindent 4 }}
+rules:
+  - apiGroups:
+      - ""
+    resources:
+      - namespaces
+      - nodes
+      - nodes/metrics
+      - pods
+      - pods/log
+    verbs:
+      - get
+      - list
+      - watch
+{{/*
+# PodSecurityPolicy was deprecated in k8s v1.21 and removed in v1.25
+{{- if and .Values.psp.create (.Capabilities.APIVersions.Has "policy/v1beta1") }}
+  - apiGroups:
+      - policy
+    resources:
+      - podsecuritypolicies
+    verbs:
+      - use
+    resourceNames:
+      - {{ include "telemetry.fullname" . }}
+{{- end }}
+*/}}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ include "retool.telemetry.fullname" . }}
+  labels:
+    {{- include "retool.telemetry.labels" . | nindent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: {{ include "retool.telemetry.fullname" . }}
+subjects:
+  - kind: ServiceAccount
+    name: {{ include "retool.telemetry.serviceAccountName" . }}
+    namespace: {{ .Release.Namespace }}
+{{- end }}
+
+{{- end }}
diff --git a/charts/retool/values.yaml b/charts/retool/values.yaml
index 93a78574..cf7c39de 100644
--- a/charts/retool/values.yaml
+++ b/charts/retool/values.yaml
@@ -610,3 +610,80 @@ hostAliases: {}
 #   hostnames:
 #   - test.com
 #   - anothertest.com
+
+telemetry:
+
+  # When enabled, will collect metrics and logs from the other pods in the
+  # chart. These will be forwarded to Retool for proactive monitoring and bug
+  # identification when `telemetry.sendToRetool.enabled = true`, and can also
+  # optionally be sent to a user-managed destination via
+  # `telemetry.customVectorConfig`.
+  enabled: false
+
+  # Overrides the `telemetry` suffix used in telemetry resource names and in
+  # the `app.kubernetes.io/name` label. Leave empty to use the default.
+  nameOverride: ""
+
+  sendToRetool:
+
+    # Only relevant when `telemetry.enabled = true`. When enabled, telemetry
+    # from pods in this chart will be forwarded to Retool for proactive
+    # monitoring and bug identification.
+    enabled: true
+
+    # Should not be changed except for chart testing.
+    address: "https://telemetry.retool.com:443"
+
+  image:
+
+    repository: "tryretool/telemetry"
+
+    # Default to same as top level `image.tag`.
+    tag: null
+
+    pullPolicy: 'IfNotPresent'
+
+  resources:
+    requests:
+      cpu: 128m
+      memory: 128Mi
+    limits:
+      cpu: 256m
+      memory: 256Mi
+
+  # When present, any vector config here gets added to the telemetry pod (via a
+  # created configmap) and added to the internal [vector](https://vector.dev/)
+  # instance which runs inside. This can enable adding additional user-specified
+  # sinks for collected telemetry data.
+  #
+  # See [vector sinks
+  # documentation](https://vector.dev/docs/reference/configuration/sinks/) for
+  # more details.
+  customVectorConfig: {}
+
+  # When present, any grafana-agent config here gets added to the telemetry pod
+  # (via a created configmap) and added to the internal
+  # [grafana-agent](https://grafana.com/docs/agent/latest/) instance which runs
+  # inside. This can enable adding additional user-specified sources of extra
+  # telemetry data.
+  #
+  # The internal grafana-agent runs in Flow mode, so the config here must use
+  # river syntax. See [grafana-agent Flow mode
+  # documentation](https://grafana.com/docs/agent/latest/flow/) for more
+  # details.
+  customGrafanaAgentConfig: ""
+
+  serviceAccount:
+    # Create a dedicated service account for the telemetry pod.
+    create: true
+    # Defaults to the telemetry fullname when `create` is true, otherwise
+    # to `default`.
+    name: ""
+    # Extra annotations to add to the created service account.
+    annotations: {}
+    automountToken: true
+
+  rbac:
+    # Create the ClusterRole and ClusterRoleBinding that grant the telemetry
+    # service account read access to namespaces, nodes and pods (incl. logs).
+    create: true
diff --git a/values.yaml b/values.yaml
index 93a78574..cf7c39de 100644
--- a/values.yaml
+++ b/values.yaml
@@ -610,3 +610,80 @@ hostAliases: {}
 #   hostnames:
 #   - test.com
 #   - anothertest.com
+
+telemetry:
+
+  # When enabled, will collect metrics and logs from the other pods in the
+  # chart. These will be forwarded to Retool for proactive monitoring and bug
+  # identification when `telemetry.sendToRetool.enabled = true`, and can also
+  # optionally be sent to a user-managed destination via
+  # `telemetry.customVectorConfig`.
+  enabled: false
+
+  # Overrides the `telemetry` suffix used in telemetry resource names and in
+  # the `app.kubernetes.io/name` label. Leave empty to use the default.
+  nameOverride: ""
+
+  sendToRetool:
+
+    # Only relevant when `telemetry.enabled = true`. When enabled, telemetry
+    # from pods in this chart will be forwarded to Retool for proactive
+    # monitoring and bug identification.
+    enabled: true
+
+    # Should not be changed except for chart testing.
+    address: "https://telemetry.retool.com:443"
+
+  image:
+
+    repository: "tryretool/telemetry"
+
+    # Default to same as top level `image.tag`.
+    tag: null
+
+    pullPolicy: 'IfNotPresent'
+
+  resources:
+    requests:
+      cpu: 128m
+      memory: 128Mi
+    limits:
+      cpu: 256m
+      memory: 256Mi
+
+  # When present, any vector config here gets added to the telemetry pod (via a
+  # created configmap) and added to the internal [vector](https://vector.dev/)
+  # instance which runs inside. This can enable adding additional user-specified
+  # sinks for collected telemetry data.
+  #
+  # See [vector sinks
+  # documentation](https://vector.dev/docs/reference/configuration/sinks/) for
+  # more details.
+  customVectorConfig: {}
+
+  # When present, any grafana-agent config here gets added to the telemetry pod
+  # (via a created configmap) and added to the internal
+  # [grafana-agent](https://grafana.com/docs/agent/latest/) instance which runs
+  # inside. This can enable adding additional user-specified sources of extra
+  # telemetry data.
+  #
+  # The internal grafana-agent runs in Flow mode, so the config here must use
+  # river syntax. See [grafana-agent Flow mode
+  # documentation](https://grafana.com/docs/agent/latest/flow/) for more
+  # details.
+  customGrafanaAgentConfig: ""
+
+  serviceAccount:
+    # Create a dedicated service account for the telemetry pod.
+    create: true
+    # Defaults to the telemetry fullname when `create` is true, otherwise
+    # to `default`.
+    name: ""
+    # Extra annotations to add to the created service account.
+    annotations: {}
+    automountToken: true
+
+  rbac:
+    # Create the ClusterRole and ClusterRoleBinding that grant the telemetry
+    # service account read access to namespaces, nodes and pods (incl. logs).
+    create: true