From dccf86f64e543ff9fcad2285133e8cd79f923acd Mon Sep 17 00:00:00 2001
From: Aidan Hilt <11202897+AidanHilt@users.noreply.github.com>
Date: Tue, 14 Jan 2025 12:02:15 -0500
Subject: [PATCH] First commit of cluster-level-resources publicly (#233)
---
helm/cluster-level-resources/.helmignore | 23 +
helm/cluster-level-resources/App.yaml | 17 +
helm/cluster-level-resources/Chart.yaml | 9 +
helm/cluster-level-resources/README.md | 77 ++
.../templates/alb-controller.yaml | 38 +
.../templates/alloy-configmap.yaml | 9 +
.../templates/aws-s3-mountpoint.yaml | 36 +
.../templates/calico.yaml | 37 +
.../templates/coreDNS.yaml | 57 ++
.../templates/ebs-csi-driver.yaml | 30 +
.../templates/fluentd.yaml | 43 +
.../templates/grafana-alloy.yaml | 66 ++
.../templates/karpenter-crd-default.yaml | 291 +++++++
.../templates/karpenter-crd-jupyter.yaml | 294 +++++++
.../templates/karpenter-crd-workflow.yaml | 307 +++++++
.../templates/karpenter-templates.yaml | 30 +
.../templates/karpenter.yaml | 53 ++
.../templates/kube-state-metrics.yaml | 36 +
.../templates/vpc-cni.yaml | 36 +
helm/cluster-level-resources/values.yaml | 788 ++++++++++++++++++
20 files changed, 2277 insertions(+)
create mode 100644 helm/cluster-level-resources/.helmignore
create mode 100644 helm/cluster-level-resources/App.yaml
create mode 100644 helm/cluster-level-resources/Chart.yaml
create mode 100644 helm/cluster-level-resources/README.md
create mode 100644 helm/cluster-level-resources/templates/alb-controller.yaml
create mode 100644 helm/cluster-level-resources/templates/alloy-configmap.yaml
create mode 100644 helm/cluster-level-resources/templates/aws-s3-mountpoint.yaml
create mode 100644 helm/cluster-level-resources/templates/calico.yaml
create mode 100644 helm/cluster-level-resources/templates/coreDNS.yaml
create mode 100644 helm/cluster-level-resources/templates/ebs-csi-driver.yaml
create mode 100644 helm/cluster-level-resources/templates/fluentd.yaml
create mode 100644 helm/cluster-level-resources/templates/grafana-alloy.yaml
create mode 100644 helm/cluster-level-resources/templates/karpenter-crd-default.yaml
create mode 100644 helm/cluster-level-resources/templates/karpenter-crd-jupyter.yaml
create mode 100644 helm/cluster-level-resources/templates/karpenter-crd-workflow.yaml
create mode 100644 helm/cluster-level-resources/templates/karpenter-templates.yaml
create mode 100644 helm/cluster-level-resources/templates/karpenter.yaml
create mode 100644 helm/cluster-level-resources/templates/kube-state-metrics.yaml
create mode 100644 helm/cluster-level-resources/templates/vpc-cni.yaml
create mode 100644 helm/cluster-level-resources/values.yaml
diff --git a/helm/cluster-level-resources/.helmignore b/helm/cluster-level-resources/.helmignore
new file mode 100644
index 00000000..0e8a0eb3
--- /dev/null
+++ b/helm/cluster-level-resources/.helmignore
@@ -0,0 +1,23 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
diff --git a/helm/cluster-level-resources/App.yaml b/helm/cluster-level-resources/App.yaml
new file mode 100644
index 00000000..872e58d2
--- /dev/null
+++ b/helm/cluster-level-resources/App.yaml
@@ -0,0 +1,17 @@
+kind: Application  # Argo CD Application pointing at the cluster-level-resources path of the gen3-gitops repo
+apiVersion: argoproj.io/v1alpha1
+metadata:
+  namespace: argocd
+  name: cluster-level-resources
+spec:
+  project: default
+  source:
+    path: cluster-level-resources
+    repoURL: 'https://github.com/uc-cdis/gen3-gitops.git'
+    targetRevision: master
+  destination:
+    server: 'https://kubernetes.default.svc'  # the cluster Argo CD itself runs in
+    namespace: argocd
+  syncPolicy:
+    automated:
+      selfHeal: true  # revert manual drift back to the state in git
diff --git a/helm/cluster-level-resources/Chart.yaml b/helm/cluster-level-resources/Chart.yaml
new file mode 100644
index 00000000..c16adaab
--- /dev/null
+++ b/helm/cluster-level-resources/Chart.yaml
@@ -0,0 +1,9 @@
+apiVersion: v2
+name: cluster-level-resources
+description: "An app-of-apps Helm chart that allows for flexible deployment of resources that support Gen3"
+# Installable chart (as opposed to a library chart).
+type: application
+# Version of the chart itself; the README badge is generated from this value.
+version: "0.5.3"
+# Informational application version. NOTE(review): confirm what 1.17.0 refers to for an app-of-apps chart.
+appVersion: '1.17.0'
diff --git a/helm/cluster-level-resources/README.md b/helm/cluster-level-resources/README.md
new file mode 100644
index 00000000..39b9bd1a
--- /dev/null
+++ b/helm/cluster-level-resources/README.md
@@ -0,0 +1,77 @@
+# cluster-level-resources
+
+![Version: 0.5.3](https://img.shields.io/badge/Version-0.5.3-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.17.0](https://img.shields.io/badge/AppVersion-1.17.0-informational?style=flat-square)
+
+An app-of-apps Helm chart that allows for flexible deployment of resources that support Gen3
+
+## Values
+
+| Key | Type | Default | Description |
+|-----|------|---------|-------------|
+| accountNumber | string | `"xxxxxxxxxxxx"` | |
+| alb-controller.configuration.enabled | bool | `false` | |
+| alb-controller.enabled | bool | `false` | |
+| alb-controller.targetRevision | string | `"1.7.1"` | |
+| alloy-configmap-data | string | `"logging {\n level = \"info\"\n format = \"json\"\n write_to = [loki.write.endpoint.receiver]\n}\n\n/////////////////////// OTLP START ///////////////////////\n\notelcol.receiver.otlp \"default\" {\n grpc {}\n http {}\n\n output {\n metrics = [otelcol.processor.batch.default.input]\n traces = [otelcol.processor.batch.default.input]\n }\n}\n\notelcol.processor.batch \"default\" {\n output {\n metrics = [otelcol.exporter.prometheus.default.input]\n traces = [otelcol.exporter.otlp.tempo.input]\n }\n}\n\notelcol.exporter.prometheus \"default\" {\n forward_to = [prometheus.remote_write.default.receiver]\n}\n\notelcol.exporter.otlp \"tempo\" {\n client {\n endpoint = \"http://monitoring-tempo-distributor.monitoring:4317\"\n // Configure TLS settings for communicating with the endpoint.\n tls {\n // The connection is insecure.\n insecure = true\n // Do not verify TLS certificates when connecting.\n insecure_skip_verify = true\n }\n }\n}\n\n\n/////////////////////// OTLP END ///////////////////////\n\n// discover all pods, to be used later in this config\ndiscovery.kubernetes \"pods\" {\n role = \"pod\"\n}\n\n// discover all services, to be used later in this config\ndiscovery.kubernetes \"services\" {\n role = \"service\"\n}\n\n// discover all nodes, to be used later in this config\ndiscovery.kubernetes \"nodes\" {\n role = \"node\"\n}\n\n// Generic scrape of any pod with Annotation \"prometheus.io/scrape: true\"\ndiscovery.relabel \"annotation_autodiscovery_pods\" {\n targets = discovery.kubernetes.pods.targets\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_scrape\"]\n regex = \"true\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_job\"]\n action = \"replace\"\n target_label = \"job\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_instance\"]\n action = \"replace\"\n target_label = \"instance\"\n }\n rule {\n 
source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_path\"]\n action = \"replace\"\n target_label = \"__metrics_path__\"\n }\n\n // Choose the pod port\n // The discovery generates a target for each declared container port of the pod.\n // If the metricsPortName annotation has value, keep only the target where the port name matches the one of the annotation.\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_port_name\"]\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_portName\"]\n regex = \"(.+)\"\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_port_name\"]\n action = \"keepequal\"\n target_label = \"__tmp_port\"\n }\n\n // If the metrics port number annotation has a value, override the target address to use it, regardless whether it is\n // one of the declared ports on that Pod.\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_port\", \"__meta_kubernetes_pod_ip\"]\n regex = \"(\\\\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})\"\n replacement = \"[$2]:$1\" // IPv6\n target_label = \"__address__\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_port\", \"__meta_kubernetes_pod_ip\"]\n regex = \"(\\\\d+);((([0-9]+?)(\\\\.|$)){4})\" // IPv4, takes priority over IPv6 when both exists\n replacement = \"$2:$1\"\n target_label = \"__address__\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_scheme\"]\n action = \"replace\"\n target_label = \"__scheme__\"\n }\n\n\n // add labels\n rule {\n source_labels = [\"__meta_kubernetes_pod_name\"]\n target_label = \"pod\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_name\"]\n target_label = \"container\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_controller_name\"]\n target_label = \"controller\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n 
target_label = \"namespace\"\n }\n\n\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app\"]\n target_label = \"app\"\n }\n\n // map all labels\n rule {\n action = \"labelmap\"\n regex = \"__meta_kubernetes_pod_label_(.+)\"\n }\n}\n\n// Generic scrape of any service with\n// Annotation Autodiscovery\ndiscovery.relabel \"annotation_autodiscovery_services\" {\n targets = discovery.kubernetes.services.targets\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_scrape\"]\n regex = \"true\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_job\"]\n action = \"replace\"\n target_label = \"job\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_instance\"]\n action = \"replace\"\n target_label = \"instance\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_path\"]\n action = \"replace\"\n target_label = \"__metrics_path__\"\n }\n\n // Choose the service port\n rule {\n source_labels = [\"__meta_kubernetes_service_port_name\"]\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_portName\"]\n regex = \"(.+)\"\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_port_name\"]\n action = \"keepequal\"\n target_label = \"__tmp_port\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_service_port_number\"]\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_port\"]\n regex = \"(.+)\"\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_port_number\"]\n action = \"keepequal\"\n target_label = \"__tmp_port\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_scheme\"]\n action = \"replace\"\n target_label = \"__scheme__\"\n }\n}\n\nprometheus.scrape \"metrics\" {\n job_name = 
\"integrations/autodiscovery_metrics\"\n targets = concat(discovery.relabel.annotation_autodiscovery_pods.output, discovery.relabel.annotation_autodiscovery_services.output)\n honor_labels = true\n clustering {\n enabled = true\n }\n forward_to = [prometheus.relabel.metrics_service.receiver]\n}\n\n\n// Node Exporter\n// TODO: replace with https://grafana.com/docs/alloy/latest/reference/components/prometheus.exporter.unix/\ndiscovery.relabel \"node_exporter\" {\n targets = discovery.kubernetes.pods.targets\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app_kubernetes_io_instance\"]\n regex = \"monitoring-extras\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app_kubernetes_io_name\"]\n regex = \"node-exporter\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_node_name\"]\n action = \"replace\"\n target_label = \"instance\"\n }\n}\n\nprometheus.scrape \"node_exporter\" {\n job_name = \"integrations/node_exporter\"\n targets = discovery.relabel.node_exporter.output\n scrape_interval = \"60s\"\n clustering {\n enabled = true\n }\n forward_to = [prometheus.relabel.node_exporter.receiver]\n}\n\nprometheus.relabel \"node_exporter\" {\n rule {\n source_labels = [\"__name__\"]\n regex = \"up|node_cpu.*|node_network.*|node_exporter_build_info|node_filesystem.*|node_memory.*|process_cpu_seconds_total|process_resident_memory_bytes\"\n action = \"keep\"\n }\n forward_to = [prometheus.relabel.metrics_service.receiver]\n}\n\n\n// cAdvisor\n// discovery.relabel \"cadvisor\" {\n// targets = discovery.kubernetes.nodes.targets\n// rule {\n// target_label = \"__address__\"\n// replacement = \"kubernetes.default.svc.cluster.local:443\"\n// }\n// rule {\n// source_labels = [\"__meta_kubernetes_node_name\"]\n// regex = \"(.+)\"\n// replacement = \"/api/v1/nodes/${1}/proxy/metrics/cadvisor\"\n// target_label = \"__metrics_path__\"\n// }\n// }\n\n// prometheus.scrape \"cadvisor\" {\n// job_name = 
\"integrations/kubernetes/cadvisor\"\n// targets = discovery.relabel.cadvisor.output\n// scheme = \"https\"\n// scrape_interval = \"60s\"\n// bearer_token_file = \"/var/run/secrets/kubernetes.io/serviceaccount/token\"\n// tls_config {\n// insecure_skip_verify = true\n// }\n// clustering {\n// enabled = true\n// }\n// forward_to = [prometheus.relabel.cadvisor.receiver]\n//}\n\n//prometheus.relabel \"cadvisor\" {\n// rule {\n// source_labels = [\"__name__\"]\n// regex = \"up|container_cpu_cfs_periods_total|container_cpu_cfs_throttled_periods_total|container_cpu_usage_seconds_total|container_fs_reads_bytes_total|container_fs_reads_total|container_fs_writes_bytes_total|container_fs_writes_total|container_memory_cache|container_memory_rss|container_memory_swap|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_receive_packets_dropped_total|container_network_receive_packets_total|container_network_transmit_bytes_total|container_network_transmit_packets_dropped_total|container_network_transmit_packets_total|machine_memory_bytes\"\n// action = \"keep\"\n// }\n// forward_to = [prometheus.relabel.metrics_service.receiver]\n// }\n\n// Logs from all pods\ndiscovery.relabel \"all_pods\" {\n targets = discovery.kubernetes.pods.targets\n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n target_label = \"namespace\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_name\"]\n target_label = \"pod\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_name\"]\n target_label = \"container\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_controller_name\"]\n target_label = \"controller\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app\"]\n target_label = \"app\"\n }\n\n // map all labels\n rule {\n action = \"labelmap\"\n regex = \"__meta_kubernetes_pod_label_(.+)\"\n }\n\n}\n\nloki.source.kubernetes \"pods\" {\n targets = discovery.relabel.all_pods.output\n forward_to = 
[loki.write.endpoint.receiver]\n}\n\n// kube-state-metrics\ndiscovery.relabel \"relabel_kube_state_metrics\" {\n targets = discovery.kubernetes.services.targets\n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n regex = \"monitoring\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_name\"]\n regex = \"monitoring-extras-kube-state-metrics\"\n action = \"keep\"\n }\n}\n\nprometheus.scrape \"kube_state_metrics\" {\n targets = discovery.relabel.relabel_kube_state_metrics.output\n job_name = \"kube-state-metrics\"\n metrics_path = \"/metrics\"\n forward_to = [prometheus.remote_write.default.receiver]\n}\n\n// Kubelet\ndiscovery.relabel \"kubelet\" {\n targets = discovery.kubernetes.nodes.targets\n rule {\n target_label = \"__address__\"\n replacement = \"kubernetes.default.svc.cluster.local:443\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_node_name\"]\n regex = \"(.+)\"\n replacement = \"/api/v1/nodes/${1}/proxy/metrics\"\n target_label = \"__metrics_path__\"\n }\n}\n\nprometheus.scrape \"kubelet\" {\n job_name = \"integrations/kubernetes/kubelet\"\n targets = discovery.relabel.kubelet.output\n scheme = \"https\"\n scrape_interval = \"60s\"\n bearer_token_file = \"/var/run/secrets/kubernetes.io/serviceaccount/token\"\n tls_config {\n insecure_skip_verify = true\n }\n clustering {\n enabled = true\n }\n forward_to = [prometheus.relabel.kubelet.receiver]\n}\n\nprometheus.relabel \"kubelet\" {\n rule {\n source_labels = [\"__name__\"]\n regex = 
\"up|container_cpu_usage_seconds_total|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_certificate_manager_client_ttl_seconds|kubelet_certificate_manager_server_ttl_seconds|kubelet_cgroup_manager_duration_seconds_bucket|kubelet_cgroup_manager_duration_seconds_count|kubelet_node_config_error|kubelet_node_name|kubelet_pleg_relist_duration_seconds_bucket|kubelet_pleg_relist_duration_seconds_count|kubelet_pleg_relist_interval_seconds_bucket|kubelet_pod_start_duration_seconds_bucket|kubelet_pod_start_duration_seconds_count|kubelet_pod_worker_duration_seconds_bucket|kubelet_pod_worker_duration_seconds_count|kubelet_running_container_count|kubelet_running_containers|kubelet_running_pod_count|kubelet_running_pods|kubelet_runtime_operations_errors_total|kubelet_runtime_operations_total|kubelet_server_expiration_renew_errors|kubelet_volume_stats_available_bytes|kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_inodes|kubelet_volume_stats_inodes_used|kubernetes_build_info|namespace_workload_pod|rest_client_requests_total|storage_operation_duration_seconds_count|storage_operation_errors_total|volume_manager_total_volumes\"\n action = \"keep\"\n }\n forward_to = [prometheus.relabel.metrics_service.receiver]\n}\n\n// Cluster Events\nloki.source.kubernetes_events \"cluster_events\" {\n job_name = \"integrations/kubernetes/eventhandler\"\n log_format = \"logfmt\"\n forward_to = [loki.write.endpoint.receiver]\n}\n\n\n// Why is this needed?\nprometheus.relabel \"metrics_service\" {\n forward_to = [prometheus.remote_write.default.receiver]\n}\n\n\n// Write Endpoints\n// prometheus write endpoint\nprometheus.remote_write \"default\" {\n external_labels = {\n cluster = \"{{ .Values.cluster }}\",\n project = \"{{ .Values.project }}\",\n }\n endpoint {\n url = \"https://mimir.planx-pla.net/api/v1/push\"\n\n headers = {\n \"X-Scope-OrgID\" = \"anonymous\",\n }\n\n }\n}\n\n// loki write endpoint\nloki.write \"endpoint\" {\n external_labels = {\n cluster = \"{{ 
.Values.cluster }}\",\n project = \"{{ .Values.project }}\",\n }\n endpoint {\n url = \"https://loki.planx-pla.net/loki/api/v1/push\"\n }\n}\n"` | |
+| aws-s3-mountpoint.configuration.enabled | bool | `false` | |
+| aws-s3-mountpoint.enabled | bool | `false` | |
+| aws-s3-mountpoint.targetRevision | string | `"1.8.0"` | |
+| calico.configuration.enabled | bool | `false` | |
+| calico.enabled | bool | `false` | |
+| calico.targetRevision | string | `"v3.27.0"` | |
+| cluster | string | `"unfunded"` | |
+| configuration.configurationRepo | string | `"https://github.com/uc-cdis/gen3-gitops"` | |
+| configuration.configurationRevision | string | `"master"` | |
+| coreDNS.configuration.enabled | bool | `false` | |
+| coreDNS.enabled | bool | `false` | |
+| coreDNS.targetRevision | string | `"v1.29.0"` | |
+| ebs-csi-driver.configuration.enabled | bool | `false` | |
+| ebs-csi-driver.enabled | bool | `false` | |
+| ebs-csi-driver.targetRevision | string | `"2.36.0"` | |
+| fluentd-configmap-data | string | `"\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n @type kubernetes_metadata\n @id filter_kube_metadata\n kubernetes_url \"#{ENV['FLUENT_FILTER_KUBERNETES_URL'] || 'https://' + ENV.fetch('KUBERNETES_SERVICE_HOST') + ':' + ENV.fetch('KUBERNETES_SERVICE_PORT') + '/api'}\"\n verify_ssl \"#{ENV['KUBERNETES_VERIFY_SSL'] || true}\"\n ca_file \"#{ENV['KUBERNETES_CA_FILE']}\"\n skip_labels \"#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_LABELS'] || 'false'}\"\n skip_container_metadata \"#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_CONTAINER_METADATA'] || 'false'}\"\n skip_master_url \"#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_MASTER_URL'] || 'false'}\"\n skip_namespace_metadata \"#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_NAMESPACE_METADATA'] || 'false'}\"\n\n\n\n @type null\n\n\n\n @type null\n\n\n\n @type rewrite_tag_filter\n \n key $._HOSTNAME\n pattern ^(.+)$\n tag $1.docker\n \n\n\n\n @type rewrite_tag_filter\n \n key $._HOSTNAME\n pattern ^(.+)$\n tag $1.kubelet\n \n\n\n\n @type rewrite_tag_filter\n \n key $.host\n pattern ^(.+)$\n tag $1.messages\n \n\n\n\n @type rewrite_tag_filter\n \n key $.host\n pattern ^(.+)$\n tag $1.secure\n \n\n\n\n @type rewrite_tag_filter\n \n # json structured log - consider adoption a standard json schema:\n # https://github.com/timberio/log-event-json-schema\n key message\n pattern /^\\{\\s*\"gen3log\":/\n tag kubernetes.gen3.json.${tag}\n \n \n # combined log format - default Apache and nginx structure\n # https://httpd.apache.org/docs/1.3/logs.html#combined\n key message\n pattern /^(((\\d+\\.\\d+\\.\\d+\\.\\d+)|-)\\s+){2}\\S+\\s+\\[\\d\\d?\\//\n tag kubernetes.gen3.combined.${tag}\n \n \n # unstructured log line\n key message\n pattern /\\S/\n tag kubernetes.gen3.raw.${tag}\n \n\n\n\n\n @type record_transformer\n \n log_type json\n # This one doesn't work for whatever reason, if you do ${record[\"kubernetes\"]} the whole blob would be added, but can't access subobjects\n #container_name 
${record[\"kubernetes\"][\"container_name\"]}\n \n\n\n\n @type record_transformer\n \n log_type combined\n \n\n\n\n @type record_transformer\n \n log_type raw\n \n\n\n\n @type rewrite_tag_filter\n \n key $.kubernetes.pod_name\n pattern ^(.+)$\n tag \"#{Time.now.strftime('%Y-%m-%d')}.$1\"\n \n# \n# key $.kubernetes\n# pattern ^(.+)$\n# tag $1.container_name\n# \n\n\n#\n# @type rewrite_tag_filter\n# \n# key $.kubernetes.container_name\n# pattern ^(.+)$\n #tag $1.${tag}\n# tag ${tag}.$1\n# \n#\n\n# TODO:\n# * python stack traces: \"Traceback (most recent call last):\"\"\n# https://docs.fluentd.org/v0.12/articles/parser_multiline#formatn\n#\n# Idea: add `visitor` cookie to revproxy ...\n\n\n\n @type cloudwatch_logs\n @id out_cloudwatch_logs\n log_group_name \"#{ENV['LOG_GROUP_NAME']}\"\n auto_create_stream true\n use_tag_as_stream true\n retention_in_days \"#{ENV['RETENTION_IN_DAYS'] || 'nil'}\"\n json_handler yajl # To avoid UndefinedConversionError\n log_rejected_request \"#{ENV['LOG_REJECTED_REQUEST']}\" # Log rejected request for missing parts\n\n"` | |
+| fluentd.configuration.enabled | bool | `false` | |
+| fluentd.enabled | bool | `false` | |
+| fluentd.targetRevision | string | `"0.5.2"` | |
+| grafana-alloy.configuration.enabled | bool | `false` | |
+| grafana-alloy.enabled | bool | `false` | |
+| grafana-alloy.targetRevision | string | `"0.4.0"` | |
+| karpenter-crds.amiSelectorName | string | `"EKS-FIPS*"` | |
+| karpenter-crds.default.consolidateAfter | string | `"30s"` | |
+| karpenter-crds.default.consolidation | bool | `true` | |
+| karpenter-crds.default.consolidationPolicy | string | `"WhenEmpty"` | |
+| karpenter-crds.default.enabled | bool | `true` | |
+| karpenter-crds.default.expireAfter | string | `"168h"` | |
+| karpenter-crds.enabled | bool | `false` | |
+| karpenter-crds.jupyter.consolidateAfter | string | `"30s"` | |
+| karpenter-crds.jupyter.consolidation | bool | `true` | |
+| karpenter-crds.jupyter.consolidationPolicy | string | `"WhenEmpty"` | |
+| karpenter-crds.jupyter.enabled | bool | `true` | |
+| karpenter-crds.jupyter.expireAfter | string | `"168h"` | |
+| karpenter-crds.migration | bool | `false` | |
+| karpenter-crds.selectorTag | string | `""` | |
+| karpenter-crds.targetRevision | string | `"master"` | |
+| karpenter-crds.workflow.consolidateAfter | string | `"30s"` | |
+| karpenter-crds.workflow.consolidation | bool | `true` | |
+| karpenter-crds.workflow.consolidationPolicy | string | `"WhenEmpty"` | |
+| karpenter-crds.workflow.enabled | bool | `true` | |
+| karpenter-crds.workflow.expireAfter | string | `"168h"` | |
+| karpenter-crds.workflow.sgSelector | string | `""` | |
+| karpenter-templates.configuration.enabled | bool | `false` | |
+| karpenter-templates.enabled | bool | `false` | |
+| karpenter-templates.targetRevision | string | `"feat/karpenter-templates"` | |
+| karpenter.configuration.enabled | bool | `false` | |
+| karpenter.controller.image.digest | string | `"sha256:0c142050d872cb0ac7b30a188ec36aa765b449718cde0c7e49f7495b28f47c29"` | |
+| karpenter.controller.image.tag | string | `"v0.32.9"` | |
+| karpenter.enabled | bool | `false` | |
+| karpenter.resources.limits.cpu | string | `"1"` | |
+| karpenter.resources.limits.memory | string | `"1Gi"` | |
+| karpenter.resources.requests.cpu | string | `"1"` | |
+| karpenter.resources.requests.memory | string | `"1Gi"` | |
+| karpenter.targetRevision | string | `"v0.32.9"` | |
+| kube-state-metrics.configuration.enabled | bool | `false` | |
+| kube-state-metrics.enabled | bool | `false` | |
+| kube-state-metrics.targetRevision | string | `"5.28.0"` | |
+| project | string | `"unfunded"` | |
+| vpc-cni.configuration.enabled | bool | `false` | |
+| vpc-cni.enabled | bool | `false` | |
+| vpc-cni.targetRevision | string | `"v1.16.2"` | |
diff --git a/helm/cluster-level-resources/templates/alb-controller.yaml b/helm/cluster-level-resources/templates/alb-controller.yaml
new file mode 100644
index 00000000..bc7745dc
--- /dev/null
+++ b/helm/cluster-level-resources/templates/alb-controller.yaml
@@ -0,0 +1,38 @@
+{{ if index .Values "alb-controller" "enabled" }}{{/* Argo CD Application for the AWS Load Balancer Controller chart; rendered only when alb-controller.enabled is true. */}}
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: alb-controller
+  namespace: argocd
+spec:
+  project: default
+  sources:
+    - chart: aws-load-balancer-controller
+      repoURL: https://aws.github.io/eks-charts
+      targetRevision: {{ index .Values "alb-controller" "targetRevision" }}
+      helm:
+        releaseName: alb-controller
+        {{- if index .Values "alb-controller" "configuration" "enabled" }}{{/* External configuration: read a per-cluster values file from the configuration repo via the "values" ref source below. */}}
+        valueFiles:
+          - $values/{{ .Values.cluster }}/cluster-values/alb-controller.yaml
+    - repoURL: {{ .Values.configuration.configurationRepo }}
+      targetRevision: {{ .Values.configuration.configurationRevision }}  # nested under "configuration", next to configurationRepo
+      ref: values
+        {{- else }}{{/* Built-in defaults: cluster name falls back to .Values.cluster when eksClusterName is unset. */}}
+        values: |
+          clusterName: {{ .Values.eksClusterName | default .Values.cluster }}
+          serviceAccount:
+            create: true
+            annotations:
+              eks.amazonaws.com/role-arn: arn:aws:iam::{{ .Values.accountNumber }}:role/gen3-service/{{ .Values.eksClusterName | default .Values.cluster }}-aws-load-balancer-controller-sa
+            name: aws-load-balancer-controller
+        {{- end }}
+  destination:
+    server: "https://kubernetes.default.svc"
+    namespace: kube-system
+  syncPolicy:
+    syncOptions:
+      - CreateNamespace=false
+    automated:
+      selfHeal: true
+{{ end }}
diff --git a/helm/cluster-level-resources/templates/alloy-configmap.yaml b/helm/cluster-level-resources/templates/alloy-configmap.yaml
new file mode 100644
index 00000000..52df5312
--- /dev/null
+++ b/helm/cluster-level-resources/templates/alloy-configmap.yaml
@@ -0,0 +1,9 @@
+{{ if index .Values "grafana-alloy" "enabled" }}{{/* ConfigMap holding the Alloy configuration; rendered only when grafana-alloy.enabled is true. */}}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: alloy-gen3
+ namespace: monitoring
+data: # single "config" key; the alloy-configmap-data value is run through tpl so template expressions embedded in it are expanded, then serialized with toYaml
+ config: {{ tpl (index .Values "alloy-configmap-data") . | toYaml | indent 2}}
+{{ end }}
diff --git a/helm/cluster-level-resources/templates/aws-s3-mountpoint.yaml b/helm/cluster-level-resources/templates/aws-s3-mountpoint.yaml
new file mode 100644
index 00000000..f503b766
--- /dev/null
+++ b/helm/cluster-level-resources/templates/aws-s3-mountpoint.yaml
@@ -0,0 +1,36 @@
+{{/* Argo CD Application installing the aws-mountpoint-s3-csi-driver chart; emitted only when aws-s3-mountpoint.enabled is set. */}}{{ if index .Values "aws-s3-mountpoint" "enabled" }}
+kind: Application
+apiVersion: argoproj.io/v1alpha1
+metadata:
+  namespace: argocd
+  name: aws-s3-mountpoint
+spec:
+  project: default
+  sources:
+    - repoURL: https://awslabs.github.io/mountpoint-s3-csi-driver
+      chart: aws-mountpoint-s3-csi-driver
+      targetRevision: {{ index .Values "aws-s3-mountpoint" "targetRevision" }}
+      helm:
+        releaseName: aws-s3-mountpoint
+        {{- if index .Values "aws-s3-mountpoint" "configuration" "enabled" }}{{/* per-cluster values file taken from the configuration repo */}}
+        valueFiles:
+          - $values/{{ .Values.cluster }}/cluster-values/aws-s3-mountpoint.yaml
+    - ref: values
+      repoURL: {{ .Values.configuration.configurationRepo }}
+      targetRevision: {{ .Values.configuration.configurationRevision }}
+        {{- else }}{{/* inline default: annotate the node service account with the IAM role */}}
+        values: |
+          node:
+            serviceAccount:
+              annotations:
+                "eks.amazonaws.com/role-arn": "arn:aws:iam::{{ .Values.accountNumber }}:role/AmazonEKS_S3_CSI_DriverRole-{{ .Values.cluster }}"
+        {{- end }}
+  destination:
+    namespace: kube-system
+    server: 'https://kubernetes.default.svc'
+  syncPolicy:
+    automated:
+      selfHeal: true
+    syncOptions:
+      - CreateNamespace=false
+{{ end }}
diff --git a/helm/cluster-level-resources/templates/calico.yaml b/helm/cluster-level-resources/templates/calico.yaml
new file mode 100644
index 00000000..5d7aabbe
--- /dev/null
+++ b/helm/cluster-level-resources/templates/calico.yaml
@@ -0,0 +1,37 @@
+{{/* Argo CD Application installing the tigera-operator chart (Calico); emitted only when calico.enabled is set. */}}{{ if .Values.calico.enabled }}
+kind: Application
+apiVersion: argoproj.io/v1alpha1
+metadata:
+  namespace: argocd
+  name: calico
+spec:
+  project: default
+  sources:
+    - repoURL: https://docs.tigera.io/calico/charts
+      chart: tigera-operator
+      targetRevision: {{ .Values.calico.targetRevision }}
+      helm:
+        releaseName: calico
+        {{- if .Values.calico.configuration.enabled }}{{/* per-cluster values file taken from the configuration repo */}}
+        valueFiles:
+          - $values/{{ .Values.cluster }}/cluster-values/calico.yaml
+    - ref: values
+      repoURL: {{ .Values.configuration.configurationRepo }}
+      targetRevision: {{ .Values.configuration.configurationRevision }}
+        {{- else }}{{/* inline defaults: EKS provider, images pulled from quay.io */}}
+        values: |
+          installation:
+            kubernetesProvider: EKS
+            registry: quay.io/
+        {{- end }}
+  destination:
+    namespace: kube-system
+    server: 'https://kubernetes.default.svc'
+  syncPolicy:
+    automated:
+      selfHeal: true
+    syncOptions:
+      - Force=true
+      - ServerSideApply=true
+      - CreateNamespace=false
+{{ end }}
diff --git a/helm/cluster-level-resources/templates/coreDNS.yaml b/helm/cluster-level-resources/templates/coreDNS.yaml
new file mode 100644
index 00000000..ed90c920
--- /dev/null
+++ b/helm/cluster-level-resources/templates/coreDNS.yaml
@@ -0,0 +1,57 @@
+{{ if index .Values "coreDNS" "enabled" }}{{/* Argo CD Application for the coredns chart; rendered only when coreDNS.enabled is true. */}}
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+ name: coredns
+ namespace: argocd
+spec:
+ project: default
+ sources:
+ - chart: coredns
+ repoURL: https://coredns.github.io/helm
+ targetRevision: {{ index .Values "coreDNS" "targetRevision" }}
+ helm:
+ releaseName: coredns
+ {{- if index .Values "coreDNS" "configuration" "enabled" }}{{/* External configuration: per-cluster values file read from the configuration repo through the "values" ref source. */}}
+ valueFiles:
+ - $values/{{ .Values.cluster }}/cluster-values/coreDNS.yaml
+ - repoURL: {{ .Values.configuration.configurationRepo }}
+ targetRevision: {{ .Values.configuration.configurationRevision }}
+ ref: values
+ {{- else }}{{/* Built-in defaults passed inline to the coredns chart. NOTE(review): the service clusterIP below is hard-coded; confirm it matches the cluster's service CIDR. */}}
+ values: |
+ service:
+ clusterIP: "10.100.0.10"
+ name: "kube-dns"
+
+ autoscaler:
+ enabled: true
+ replicas: 2
+
+ k8sAppLabelOverride: "kube-dns"
+
+ deployment:
+ name: "coredns-argo"
+
+ affinity:
+ podAntiAffinity:
+ preferredDuringSchedulingIgnoredDuringExecution:
+ - weight: 100
+ podAffinityTerm:
+ labelSelector:
+ matchExpressions:
+ - key: k8s-app
+ operator: In
+ values:
+ - kube-dns
+ topologyKey: kubernetes.io/hostname
+ {{- end }}
+ destination:
+ server: "https://kubernetes.default.svc"
+ namespace: kube-system
+ syncPolicy:
+ syncOptions:
+ - CreateNamespace=false
+ automated:
+ selfHeal: false # NOTE(review): the other Applications in this chart set selfHeal: true; confirm this difference is intentional
+{{ end }}
diff --git a/helm/cluster-level-resources/templates/ebs-csi-driver.yaml b/helm/cluster-level-resources/templates/ebs-csi-driver.yaml
new file mode 100644
index 00000000..d323c09b
--- /dev/null
+++ b/helm/cluster-level-resources/templates/ebs-csi-driver.yaml
@@ -0,0 +1,30 @@
+{{ if index .Values "ebs-csi-driver" "enabled" }}
+apiVersion: argoproj.io/v1alpha1
+kind: Application # Argo CD Application for the aws-ebs-csi-driver chart; rendered only when ebs-csi-driver.enabled is truthy
+metadata:
+  name: ebs-csi-driver
+  namespace: argocd
+spec:
+  project: default
+  sources:
+    - chart: aws-ebs-csi-driver
+      repoURL: https://kubernetes-sigs.github.io/aws-ebs-csi-driver
+      targetRevision: {{ index .Values "ebs-csi-driver" "targetRevision" }}
+      helm:
+        releaseName: ebs-csi-driver
+        {{- if index .Values "ebs-csi-driver" "configuration" "enabled" }}
+        valueFiles: # only emitted when ebs-csi-driver.configuration.enabled is truthy; otherwise no overrides are passed
+          - $values/{{ .Values.cluster }}/cluster-values/ebs-csi-driver.yaml
+    - repoURL: {{ .Values.configuration.configurationRepo }}
+      targetRevision: {{ .Values.configuration.configurationRevision }}
+      ref: values
+        {{- end }}
+  destination:
+    server: "https://kubernetes.default.svc"
+    namespace: kube-system # chart resources are deployed into kube-system
+  syncPolicy:
+    syncOptions:
+    - CreateNamespace=false
+    automated:
+      selfHeal: true
+{{ end }}
diff --git a/helm/cluster-level-resources/templates/fluentd.yaml b/helm/cluster-level-resources/templates/fluentd.yaml
new file mode 100644
index 00000000..c61d98cf
--- /dev/null
+++ b/helm/cluster-level-resources/templates/fluentd.yaml
@@ -0,0 +1,43 @@
+{{ if index .Values "fluentd" "enabled" }}
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+ name: fluentd
+ namespace: argocd
+spec:
+ project: default
+ sources:
+ - chart: fluentd
+ repoURL: https://fluent.github.io/helm-charts
+ targetRevision: {{ index .Values "fluentd" "targetRevision" }}
+ helm:
+ releaseName: fluentd
+ {{- if index .Values "fluentd" "configuration" "enabled" }}
+ valueFiles:
+ - $values/{{ .Values.cluster }}/cluster-values/fluentd.yaml
+    - repoURL: {{ .Values.configuration.configurationRepo }} # second source: the configuration repo holding per-cluster value files
+      targetRevision: {{ .Values.configuration.configurationRevision }} # was .Values.configurationRevision, a key no sibling template uses
+      ref: values
+ {{- else }}
+ values: |
+ fileConfigs:
+ gen3.conf: | {{ index .Values "fluentd-configmap-data" | nindent 14 }}
+ env:
+ - name: "FLUENTD_CONF"
+ value: "../../../etc/fluent/config.d/gen3.conf"
+ - name: FLUENT_CONTAINER_TAIL_PARSER_TYPE
+ value: "cri"
+ - name: AWS_REGION
+ value: "us-east-1"
+ image:
+ tag: v1.15.3-debian-cloudwatch-1.0
+ {{- end }}
+ destination:
+ server: "https://kubernetes.default.svc"
+ namespace: kube-system
+ syncPolicy:
+ syncOptions:
+ - CreateNamespace=false
+ automated:
+ selfHeal: true
+{{ end }}
diff --git a/helm/cluster-level-resources/templates/grafana-alloy.yaml b/helm/cluster-level-resources/templates/grafana-alloy.yaml
new file mode 100644
index 00000000..27717072
--- /dev/null
+++ b/helm/cluster-level-resources/templates/grafana-alloy.yaml
@@ -0,0 +1,66 @@
+{{ if index .Values "grafana-alloy" "enabled" }}
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+ name: grafana-alloy
+ namespace: argocd
+spec:
+ project: default
+ sources:
+ - repoURL: https://grafana.github.io/helm-charts
+ chart: alloy
+ targetRevision: {{ index .Values "grafana-alloy" "targetRevision" }}
+ helm:
+ releaseName: alloy
+ {{- if index .Values "grafana-alloy" "configuration" "enabled" }}
+ valueFiles:
+ - $values/{{ .Values.cluster }}/cluster-values/grafana-alloy.yaml
+ - repoURL: {{ .Values.configuration.configurationRepo }}
+ targetRevision: {{ .Values.configuration.configurationRevision }}
+ ref: values
+ {{- else }}
+ values: |
+ controller:
+ type: "deployment"
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: topology.kubernetes.io/zone
+ operator: In
+ values:
+ - us-east-1a
+
+ alloy:
+ stabilityLevel: "public-preview"
+ uiPathPrefix: /alloy
+ # -- Extra ports to expose on the Alloy container.
+ extraPorts:
+ - name: "otel-grpc"
+ port: 4317
+ targetPort: 4317
+ protocol: "TCP"
+ - name: "otel-http"
+ port: 4318
+ targetPort: 4318
+ protocol: "TCP"
+ clustering:
+ enabled: true
+ configMap:
+ name: alloy-gen3
+ key: config
+ resources:
+ requests:
+ cpu: 1000m
+ memory: 1Gi
+ {{- end }}
+ syncPolicy:
+ syncOptions:
+ - CreateNamespace=true
+ automated:
+ selfHeal: true
+ destination:
+ server: https://kubernetes.default.svc
+ namespace: monitoring
+{{ end }}
diff --git a/helm/cluster-level-resources/templates/karpenter-crd-default.yaml b/helm/cluster-level-resources/templates/karpenter-crd-default.yaml
new file mode 100644
index 00000000..a2cdfef4
--- /dev/null
+++ b/helm/cluster-level-resources/templates/karpenter-crd-default.yaml
@@ -0,0 +1,291 @@
+{{ if and (index .Values "karpenter-crds" "enabled") (index .Values "karpenter-crds" "default" "enabled") }}
+ {{ if eq (index .Values "karpenter-crds" "migration") true }}
+---
+apiVersion: karpenter.k8s.aws/v1beta1
+kind: EC2NodeClass
+metadata:
+ name: default
+spec:
+ amiFamily: AL2
+ amiSelectorTerms:
+ - name: {{ index .Values "karpenter-crds" "amiSelectorName" }}
+ owner: "143731057154"
+ blockDeviceMappings:
+ - deviceName: /dev/xvda
+ ebs:
+ deleteOnTermination: true
+ encrypted: true
+ volumeSize: 50Gi
+ volumeType: gp3
+ metadataOptions:
+ httpEndpoint: enabled
+ httpProtocolIPv6: disabled
+ httpPutResponseHopLimit: 2
+ httpTokens: optional
+ role: eks_{{ index .Values "karpenter-crds" "selectorTag" }}_workers_role
+
+ securityGroupSelectorTerms:
+ - tags:
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+
+ subnetSelectorTerms:
+ - tags:
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+
+ tags:
+ Environment: {{ .Values.cluster }}
+ Name: eks-{{ .Values.cluster }}-karpenter
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+ purpose: default
+
+ userData: |
+ MIME-Version: 1.0
+ Content-Type: multipart/mixed; boundary="BOUNDARY"
+
+ --BOUNDARY
+ Content-Type: text/x-shellscript; charset="us-ascii"
+
+ #!/bin/bash -x
+ instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId)
+ curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys
+ echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json
+ sysctl -w fs.inotify.max_user_watches=12000
+
+ sudo yum update -y
+
+ --BOUNDARY--
+---
+apiVersion: karpenter.sh/v1beta1
+kind: NodePool # v1beta1 pool for general workloads; only rendered in the karpenter-crds.migration branch of this template
+metadata:
+  name: default
+spec:
+  disruption:
+    {{ if eq (index .Values "karpenter-crds" "default" "consolidationPolicy" ) "WhenEmpty" }}
+    consolidateAfter: {{ index .Values "karpenter-crds" "default" "consolidateAfter" }} # emitted only when consolidationPolicy is WhenEmpty
+    {{ end }}
+    consolidationPolicy: {{ index .Values "karpenter-crds" "default" "consolidationPolicy" }}
+    expireAfter: {{ index .Values "karpenter-crds" "default" "expireAfter" }}
+  limits: # hard-coded caps for this pool; not driven by values
+    cpu: "1000"
+    memory: 1000Gi
+  template:
+    metadata:
+      labels:
+        role: default
+    spec:
+      kubelet:
+        evictionHard:
+          memory.available: 5%
+        evictionSoft:
+          memory.available: 10%
+        evictionSoftGracePeriod:
+          memory.available: 5m
+        kubeReserved:
+          cpu: 480m
+          ephemeral-storage: 3Gi
+          memory: 1632Mi
+      nodeClassRef: # points at the EC2NodeClass named "default" declared earlier in this template
+        apiVersion: karpenter.k8s.aws/v1beta1
+        kind: EC2NodeClass
+        name: default
+      requirements:
+        - key: karpenter.sh/capacity-type # both on-demand and spot capacity are allowed
+          operator: In
+          values:
+            - on-demand
+            - spot
+        - key: kubernetes.io/arch
+          operator: In
+          values:
+            - amd64
+        - key: karpenter.k8s.aws/instance-category
+          operator: In
+          values:
+            - c
+            - m
+            - r
+            - t
+---
+apiVersion: karpenter.k8s.aws/v1alpha1
+kind: AWSNodeTemplate
+metadata:
+ name: default
+spec:
+ amiSelector:
+ aws::name: {{ index .Values "karpenter-crds" "amiSelectorName" }}
+ aws::owners: "143731057154"
+ subnetSelector:
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+ securityGroupSelector:
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+ tags:
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+ Environment: {{ .Values.cluster }}
+ Name: eks-{{ .Values.cluster }}-karpenter
+ purpose: default
+ metadataOptions:
+ httpEndpoint: enabled
+ httpProtocolIPv6: disabled
+ httpPutResponseHopLimit: 2
+ httpTokens: optional
+ userData: |
+ MIME-Version: 1.0
+ Content-Type: multipart/mixed; boundary="BOUNDARY"
+
+ --BOUNDARY
+ Content-Type: text/x-shellscript; charset="us-ascii"
+
+ #!/bin/bash -x
+ instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId)
+ curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys
+
+ echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json
+
+ sysctl -w fs.inotify.max_user_watches=12000
+
+ # --BOUNDARY
+ # Content-Type: text/cloud-config; charset="us-ascii"
+
+ # mounts:
+ # - ['fstype': 'bpf', 'mountpoint': '/sys/fs/bpf', 'opts': 'rw,relatime']
+
+ --BOUNDARY--
+ blockDeviceMappings:
+ - deviceName: /dev/xvda
+ ebs:
+ volumeSize: 50Gi
+ volumeType: gp2
+ encrypted: true
+ deleteOnTermination: true
+---
+apiVersion: karpenter.sh/v1alpha5
+kind: Provisioner # legacy v1alpha5 Provisioner rendered next to the v1beta1 NodePool while karpenter-crds.migration is true
+metadata:
+  name: default
+spec:
+  # Allow both spot and on-demand instances
+  requirements:
+    - key: karpenter.sh/capacity-type
+      operator: In
+      values: ["on-demand", "spot"]
+    - key: kubernetes.io/arch
+      operator: In
+      values:
+        - amd64
+    - key: karpenter.k8s.aws/instance-category
+      operator: In
+      values:
+        - c
+        - m
+        - r
+        - t
+  taints:
+    - key: karpenter.sh/legacy # NoSchedule taint present only in the migration branch; pods need a matching toleration to land here
+      value: "true"
+      effect: NoSchedule
+  # Set a limit of 1000 vCPUs across nodes created by this Provisioner
+  limits:
+    resources:
+      cpu: 1000
+  # Use the AWSNodeTemplate named "default"
+  providerRef:
+    name: default
+  # Allow pods to be rearranged (consolidation), driven by karpenter-crds.default.consolidation
+  consolidation:
+    enabled: {{ index .Values "karpenter-crds" "default" "consolidation" }}
+  # Kill nodes after 7 days (604800 s) to ensure they stay up to date
+  ttlSecondsUntilExpired: 604800
+ {{ else }}
+---
+apiVersion: karpenter.k8s.aws/v1alpha1
+kind: AWSNodeTemplate
+metadata:
+ name: default
+spec:
+ amiSelector:
+ aws::name: {{ index .Values "karpenter-crds" "amiSelectorName" }}
+ aws::owners: "143731057154"
+ subnetSelector:
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+ securityGroupSelector:
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+ tags:
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+ Environment: {{ .Values.cluster }}
+ Name: eks-{{ .Values.cluster }}-karpenter
+ purpose: default
+ metadataOptions:
+ httpEndpoint: enabled
+ httpProtocolIPv6: disabled
+ httpPutResponseHopLimit: 2
+ httpTokens: optional
+ userData: |
+ MIME-Version: 1.0
+ Content-Type: multipart/mixed; boundary="BOUNDARY"
+
+ --BOUNDARY
+ Content-Type: text/x-shellscript; charset="us-ascii"
+
+ #!/bin/bash -x
+ instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId)
+ curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys
+
+ echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json
+
+ sysctl -w fs.inotify.max_user_watches=12000
+
+ # --BOUNDARY
+ # Content-Type: text/cloud-config; charset="us-ascii"
+
+ # mounts:
+ # - ['fstype': 'bpf', 'mountpoint': '/sys/fs/bpf', 'opts': 'rw,relatime']
+
+ --BOUNDARY--
+ blockDeviceMappings:
+ - deviceName: /dev/xvda
+ ebs:
+ volumeSize: 50Gi
+ volumeType: gp2
+ encrypted: true
+ deleteOnTermination: true
+---
+apiVersion: karpenter.sh/v1alpha5
+kind: Provisioner # v1alpha5 Provisioner rendered when karpenter-crds.migration is not true
+metadata:
+  name: default
+spec:
+  # Allow both spot and on-demand instances
+  requirements:
+    - key: karpenter.sh/capacity-type
+      operator: In
+      values: ["on-demand", "spot"]
+    - key: kubernetes.io/arch
+      operator: In
+      values:
+        - amd64
+    - key: karpenter.k8s.aws/instance-category
+      operator: In
+      values:
+        - c
+        - m
+        - r
+        - t
+    - key: karpenter.k8s.aws/instance-cpu # Gt "7": only instance types with more than 7 vCPUs
+      operator: Gt
+      values:
+        - "7"
+  # Set a limit of 1000 vCPUs across nodes created by this Provisioner
+  limits:
+    resources:
+      cpu: 1000
+  # Use the AWSNodeTemplate named "default"
+  providerRef:
+    name: default
+  # Allow pods to be rearranged (consolidation), driven by karpenter-crds.default.consolidation
+  consolidation:
+    enabled: {{ index .Values "karpenter-crds" "default" "consolidation" }}
+  # Kill nodes after 7 days (604800 s) to ensure they stay up to date
+  ttlSecondsUntilExpired: 604800
+ {{ end }}
+{{ end }}
diff --git a/helm/cluster-level-resources/templates/karpenter-crd-jupyter.yaml b/helm/cluster-level-resources/templates/karpenter-crd-jupyter.yaml
new file mode 100644
index 00000000..e9877f64
--- /dev/null
+++ b/helm/cluster-level-resources/templates/karpenter-crd-jupyter.yaml
@@ -0,0 +1,294 @@
+{{ if and (index .Values "karpenter-crds" "enabled") (index .Values "karpenter-crds" "jupyter" "enabled") }}
+ {{ if eq (index .Values "karpenter-crds" "migration") true }}
+---
+apiVersion: karpenter.k8s.aws/v1beta1
+kind: EC2NodeClass
+metadata:
+ name: jupyter
+spec:
+ amiFamily: AL2
+ amiSelectorTerms:
+ - name: {{ index .Values "karpenter-crds" "amiSelectorName" }}
+ owner: "143731057154"
+ blockDeviceMappings:
+ - deviceName: /dev/xvda
+ ebs:
+ deleteOnTermination: true
+ encrypted: true
+ volumeSize: 50Gi
+ volumeType: gp3
+ metadataOptions:
+ httpEndpoint: enabled
+ httpProtocolIPv6: disabled
+ httpPutResponseHopLimit: 2
+ httpTokens: optional
+ role: eks_{{ index .Values "karpenter-crds" "selectorTag" }}_workers_role
+
+ securityGroupSelectorTerms:
+ - tags:
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}-jupyter
+
+ subnetSelectorTerms:
+ - tags:
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+
+ tags:
+ Environment: {{ .Values.cluster }}
+ Name: eks-{{ .Values.cluster }}-karpenter
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+ purpose: jupyter
+
+ userData: |
+ MIME-Version: 1.0
+ Content-Type: multipart/mixed; boundary="BOUNDARY"
+
+ --BOUNDARY
+ Content-Type: text/x-shellscript; charset="us-ascii"
+
+ #!/bin/bash -x
+ instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId)
+ curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys
+ echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json
+ sysctl -w fs.inotify.max_user_watches=12000
+
+ sudo yum update -y
+
+ --BOUNDARY--
+---
+apiVersion: karpenter.sh/v1beta1
+kind: NodePool # v1beta1 pool for jupyter workloads; only rendered in the karpenter-crds.migration branch of this template
+metadata:
+  name: jupyter
+spec:
+  disruption:
+    {{ if eq (index .Values "karpenter-crds" "jupyter" "consolidationPolicy" ) "WhenEmpty" }}
+    consolidateAfter: {{ index .Values "karpenter-crds" "jupyter" "consolidateAfter" }} # emitted only when consolidationPolicy is WhenEmpty
+    {{ end }}
+    consolidationPolicy: {{ index .Values "karpenter-crds" "jupyter" "consolidationPolicy" }}
+    expireAfter: {{ index .Values "karpenter-crds" "jupyter" "expireAfter" }}
+  limits: # hard-coded caps for this pool; not driven by values
+    cpu: "1000"
+    memory: 1000Gi
+  template:
+    metadata:
+      labels:
+        role: jupyter
+    spec:
+      kubelet:
+        evictionHard:
+          memory.available: 5%
+        evictionSoft:
+          memory.available: 10%
+        evictionSoftGracePeriod:
+          memory.available: 5m
+        kubeReserved:
+          cpu: 480m
+          ephemeral-storage: 3Gi
+          memory: 1632Mi
+      nodeClassRef: # points at the EC2NodeClass named "jupyter" declared earlier in this template
+        apiVersion: karpenter.k8s.aws/v1beta1
+        kind: EC2NodeClass
+        name: jupyter
+      requirements:
+        - key: karpenter.sh/capacity-type # on-demand only; spot is not listed for this pool
+          operator: In
+          values:
+            - on-demand
+        - key: kubernetes.io/arch
+          operator: In
+          values:
+            - amd64
+        - key: karpenter.k8s.aws/instance-category
+          operator: In
+          values:
+            - c
+            - m
+            - r
+            - t
+      taints: # NoSchedule taint role=jupyter; pods need a matching toleration to schedule on these nodes
+        - effect: NoSchedule
+          key: role
+          value: jupyter
+---
+apiVersion: karpenter.k8s.aws/v1alpha1
+kind: AWSNodeTemplate
+metadata:
+ name: jupyter
+spec:
+ subnetSelector:
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+ securityGroupSelector:
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}-jupyter
+ tags:
+ Environment: {{ .Values.cluster }}
+ Name: eks-{{ .Values.cluster }}-jupyter-karpenter
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+ purpose: jupyter
+ metadataOptions:
+ httpEndpoint: enabled
+ httpProtocolIPv6: disabled
+ httpPutResponseHopLimit: 2
+ httpTokens: optional
+ userData: |
+ MIME-Version: 1.0
+ Content-Type: multipart/mixed; boundary="BOUNDARY"
+
+ --BOUNDARY
+ Content-Type: text/x-shellscript; charset="us-ascii"
+
+ #!/bin/bash -x
+ instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId)
+ curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys
+
+ echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json
+
+ sysctl -w fs.inotify.max_user_watches=12000
+
+ # --BOUNDARY
+ # Content-Type: text/cloud-config; charset="us-ascii"
+
+ # mounts:
+ # - ['fstype': 'bpf', 'mountpoint': '/sys/fs/bpf', 'opts': 'rw,relatime']
+
+ --BOUNDARY--
+ blockDeviceMappings:
+ - deviceName: /dev/xvda
+ ebs:
+ volumeSize: 50Gi
+ volumeType: gp2
+ encrypted: true
+ deleteOnTermination: true
+---
+apiVersion: karpenter.sh/v1alpha5
+kind: Provisioner # legacy v1alpha5 Provisioner rendered next to the v1beta1 NodePool while karpenter-crds.migration is true
+metadata:
+  name: jupyter
+spec:
+  requirements:
+    - key: karpenter.sh/capacity-type # on-demand only
+      operator: In
+      values: ["on-demand"]
+    - key: kubernetes.io/arch
+      operator: In
+      values:
+        - amd64
+    - key: karpenter.k8s.aws/instance-category
+      operator: In
+      values:
+        - c
+        - m
+        - r
+        - t
+  taints:
+    - key: role
+      value: jupyter
+      effect: NoSchedule
+    - key: karpenter.sh/legacy # extra NoSchedule taint present only in the migration branch
+      value: "true"
+      effect: NoSchedule
+  labels:
+    role: jupyter
+  # TODO: this CPU limit could be parameterized
+  limits:
+    resources:
+      cpu: 1000
+  providerRef:
+    name: jupyter
+  # Allow pods to be rearranged; index needs one argument per path segment, a single dotted string matches no key
+  consolidation:
+    enabled: {{ index .Values "karpenter-crds" "jupyter" "consolidation" }}
+  # Kill nodes after 7 days (604800 s) to ensure they stay up to date
+  # TODO: this could be parameterized
+  ttlSecondsUntilExpired: 604800
+ {{ else }}
+---
+apiVersion: karpenter.k8s.aws/v1alpha1
+kind: AWSNodeTemplate
+metadata:
+ name: jupyter
+spec:
+ subnetSelector:
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+ securityGroupSelector:
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}-jupyter
+ tags:
+ Environment: {{ .Values.cluster }}
+ Name: eks-{{ .Values.cluster }}-jupyter-karpenter
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+ purpose: jupyter
+ metadataOptions:
+ httpEndpoint: enabled
+ httpProtocolIPv6: disabled
+ httpPutResponseHopLimit: 2
+ httpTokens: optional
+ userData: |
+ MIME-Version: 1.0
+ Content-Type: multipart/mixed; boundary="BOUNDARY"
+
+ --BOUNDARY
+ Content-Type: text/x-shellscript; charset="us-ascii"
+
+ #!/bin/bash -x
+ instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId)
+ curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys
+
+ echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json
+
+ sysctl -w fs.inotify.max_user_watches=12000
+
+ # --BOUNDARY
+ # Content-Type: text/cloud-config; charset="us-ascii"
+
+ # mounts:
+ # - ['fstype': 'bpf', 'mountpoint': '/sys/fs/bpf', 'opts': 'rw,relatime']
+
+ --BOUNDARY--
+ blockDeviceMappings:
+ - deviceName: /dev/xvda
+ ebs:
+ volumeSize: 50Gi
+ volumeType: gp2
+ encrypted: true
+ deleteOnTermination: true
+---
+apiVersion: karpenter.sh/v1alpha5
+kind: Provisioner # v1alpha5 Provisioner rendered when karpenter-crds.migration is not true
+metadata:
+  name: jupyter
+spec:
+  # Only allow on-demand instances
+  requirements:
+    - key: karpenter.sh/capacity-type
+      operator: In
+      values: ["on-demand"]
+    - key: kubernetes.io/arch
+      operator: In
+      values:
+        - amd64
+    - key: karpenter.k8s.aws/instance-category
+      operator: In
+      values:
+        - c
+        - m
+        - r
+        - t
+  # Set a taint for jupyter pods
+  taints:
+    - key: role
+      value: jupyter
+      effect: NoSchedule
+  labels:
+    role: jupyter
+  # TODO: this CPU limit could be parameterized
+  limits:
+    resources:
+      cpu: 1000
+  providerRef:
+    name: jupyter
+  # Allow pods to be rearranged; index needs one argument per path segment, a single dotted string matches no key
+  consolidation:
+    enabled: {{ index .Values "karpenter-crds" "jupyter" "consolidation" }}
+  # Kill nodes after 7 days (604800 s) to ensure they stay up to date
+  ttlSecondsUntilExpired: 604800
+ {{ end }}
+{{ end }}
diff --git a/helm/cluster-level-resources/templates/karpenter-crd-workflow.yaml b/helm/cluster-level-resources/templates/karpenter-crd-workflow.yaml
new file mode 100644
index 00000000..c0137670
--- /dev/null
+++ b/helm/cluster-level-resources/templates/karpenter-crd-workflow.yaml
@@ -0,0 +1,307 @@
+{{ if and (index .Values "karpenter-crds" "enabled") (index .Values "karpenter-crds" "workflow" "enabled") }}
+ {{ if eq (index .Values "karpenter-crds" "migration") true }}
+---
+apiVersion: karpenter.k8s.aws/v1beta1
+kind: EC2NodeClass
+metadata:
+ name: workflow
+spec:
+ amiFamily: AL2
+ amiSelectorTerms:
+ - name: {{ index .Values "karpenter-crds" "amiSelectorName" }}
+ owner: "143731057154"
+ blockDeviceMappings:
+ - deviceName: /dev/xvda
+ ebs:
+ deleteOnTermination: true
+ encrypted: true
+ volumeSize: 50Gi
+ volumeType: gp3
+ metadataOptions:
+ httpEndpoint: enabled
+ httpProtocolIPv6: disabled
+ httpPutResponseHopLimit: 2
+ httpTokens: optional
+ role: eks_{{ index .Values "karpenter-crds" "selectorTag" }}_workers_role
+
+ securityGroupSelectorTerms:
+ - tags:
+ {{- if ne (index .Values "karpenter-crds" "workflow" "sgSelector") "" }}
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "workflow" "sgSelector" }}
+ {{- else }}
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}-workflow
+ {{- end }}
+
+ subnetSelectorTerms:
+ - tags:
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+
+ tags:
+ Environment: {{ .Values.cluster }}
+ Name: eks-{{ .Values.cluster }}-karpenter
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+ purpose: workflow
+
+ userData: |
+ MIME-Version: 1.0
+ Content-Type: multipart/mixed; boundary="BOUNDARY"
+
+ --BOUNDARY
+ Content-Type: text/x-shellscript; charset="us-ascii"
+
+ #!/bin/bash -x
+ instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId)
+ curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys
+ echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json
+ sysctl -w fs.inotify.max_user_watches=12000
+
+ sudo yum update -y
+
+ --BOUNDARY--
+---
+apiVersion: karpenter.sh/v1beta1
+kind: NodePool # v1beta1 pool for workflow workloads; only rendered in the karpenter-crds.migration branch of this template
+metadata:
+  name: workflow
+spec:
+  disruption:
+    {{ if eq (index .Values "karpenter-crds" "workflow" "consolidationPolicy" ) "WhenEmpty" }}
+    consolidateAfter: {{ index .Values "karpenter-crds" "workflow" "consolidateAfter" }} # emitted only when consolidationPolicy is WhenEmpty
+    {{ end }}
+    consolidationPolicy: {{ index .Values "karpenter-crds" "workflow" "consolidationPolicy" }}
+    expireAfter: {{ index .Values "karpenter-crds" "workflow" "expireAfter" }}
+  limits: # hard-coded caps for this pool; not driven by values
+    cpu: "1000"
+    memory: 1000Gi
+  template:
+    metadata:
+      labels:
+        role: workflow
+    spec:
+      kubelet:
+        evictionHard:
+          memory.available: 5%
+        evictionSoft:
+          memory.available: 10%
+        evictionSoftGracePeriod:
+          memory.available: 5m
+        kubeReserved:
+          cpu: 480m
+          ephemeral-storage: 3Gi
+          memory: 1632Mi
+      nodeClassRef: # points at the EC2NodeClass named "workflow" declared earlier in this template
+        apiVersion: karpenter.k8s.aws/v1beta1
+        kind: EC2NodeClass
+        name: workflow
+      requirements:
+        - key: karpenter.sh/capacity-type # on-demand only; spot is not listed for this pool
+          operator: In
+          values:
+            - on-demand
+        - key: kubernetes.io/arch
+          operator: In
+          values:
+            - amd64
+        - key: karpenter.k8s.aws/instance-category
+          operator: In
+          values:
+            - c
+            - m
+            - r
+            - t
+      taints: # NoSchedule taint role=workflow; pods need a matching toleration to schedule on these nodes
+        - effect: NoSchedule
+          key: role
+          value: workflow
+---
+apiVersion: karpenter.k8s.aws/v1alpha1
+kind: AWSNodeTemplate
+metadata:
+ name: workflow
+spec:
+ subnetSelector:
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+ securityGroupSelector:
+ # TODO this is an example of how to parameterize this, we should expand this when helpful
+ {{- if ne (index .Values "karpenter-crds" "workflow" "sgSelector") "" }}
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "workflow" "sgSelector" }}
+ {{- else }}
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}-workflow
+ {{- end }}
+ tags:
+ Environment: {{ .Values.cluster }}
+ Name: eks-{{ .Values.cluster }}-workflow-karpenter
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+ purpose: workflow
+ metadataOptions:
+ httpEndpoint: enabled
+ httpProtocolIPv6: disabled
+ httpPutResponseHopLimit: 2
+ httpTokens: optional
+ userData: |
+ MIME-Version: 1.0
+ Content-Type: multipart/mixed; boundary="BOUNDARY"
+
+ --BOUNDARY
+ Content-Type: text/x-shellscript; charset="us-ascii"
+
+ #!/bin/bash -x
+ instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId)
+ curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys
+
+ echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json
+
+ sysctl -w fs.inotify.max_user_watches=12000
+
+ # --BOUNDARY
+ # Content-Type: text/cloud-config; charset="us-ascii"
+
+ # mounts:
+ # - ['fstype': 'bpf', 'mountpoint': '/sys/fs/bpf', 'opts': 'rw,relatime']
+
+ --BOUNDARY--
+ blockDeviceMappings:
+ - deviceName: /dev/xvda
+ ebs:
+ volumeSize: 50Gi
+ volumeType: gp2
+ encrypted: true
+ deleteOnTermination: true
+---
+apiVersion: karpenter.sh/v1alpha5
+kind: Provisioner # legacy v1alpha5 Provisioner rendered next to the v1beta1 NodePool while karpenter-crds.migration is true
+metadata:
+  name: workflow
+spec:
+  requirements:
+    - key: karpenter.sh/capacity-type # on-demand only
+      operator: In
+      values: ["on-demand"]
+    - key: kubernetes.io/arch
+      operator: In
+      values:
+        - amd64
+    - key: karpenter.k8s.aws/instance-category
+      operator: In
+      values:
+        - c
+        - m
+        - r
+        - t
+  taints:
+    - key: role
+      value: workflow
+      effect: NoSchedule
+    - key: karpenter.sh/legacy # extra NoSchedule taint present only in the migration branch
+      value: "true"
+      effect: NoSchedule
+  labels:
+    role: workflow
+  # TODO: this CPU limit could be parameterized
+  limits:
+    resources:
+      cpu: 1000
+  providerRef:
+    name: workflow
+  # Allow pods to be rearranged; index needs one argument per path segment, a single dotted string matches no key
+  consolidation:
+    enabled: {{ index .Values "karpenter-crds" "workflow" "consolidation" }}
+  # Kill nodes after 7 days (604800 s) to ensure they stay up to date
+  # TODO: this could be parameterized
+  ttlSecondsUntilExpired: 604800
+ {{ else }}
+---
+apiVersion: karpenter.k8s.aws/v1alpha1
+kind: AWSNodeTemplate
+metadata:
+ name: workflow
+spec:
+ subnetSelector:
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+ securityGroupSelector:
+ # TODO this is an example of how to parameterize this, we should expand this when helpful
+ {{- if ne (index .Values "karpenter-crds" "workflow" "sgSelector") "" }}
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "workflow" "sgSelector" }}
+ {{- else }}
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}-workflow
+ {{- end }}
+ tags:
+ Environment: {{ .Values.cluster }}
+ Name: eks-{{ .Values.cluster }}-workflow-karpenter
+ karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}
+ purpose: workflow
+ metadataOptions:
+ httpEndpoint: enabled
+ httpProtocolIPv6: disabled
+ httpPutResponseHopLimit: 2
+ httpTokens: optional
+ userData: |
+ MIME-Version: 1.0
+ Content-Type: multipart/mixed; boundary="BOUNDARY"
+
+ --BOUNDARY
+ Content-Type: text/x-shellscript; charset="us-ascii"
+
+ #!/bin/bash -x
+ instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId)
+ curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys
+
+ echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json
+
+ sysctl -w fs.inotify.max_user_watches=12000
+
+ # --BOUNDARY
+ # Content-Type: text/cloud-config; charset="us-ascii"
+
+ # mounts:
+ # - ['fstype': 'bpf', 'mountpoint': '/sys/fs/bpf', 'opts': 'rw,relatime']
+
+ --BOUNDARY--
+ blockDeviceMappings:
+ - deviceName: /dev/xvda
+ ebs:
+ volumeSize: 50Gi
+ volumeType: gp2
+ encrypted: true
+ deleteOnTermination: true
+---
+apiVersion: karpenter.sh/v1alpha5
+kind: Provisioner # v1alpha5 Provisioner rendered when karpenter-crds.migration is not true
+metadata:
+  name: workflow
+spec:
+  requirements:
+    - key: karpenter.sh/capacity-type # on-demand only
+      operator: In
+      values: ["on-demand"]
+    - key: kubernetes.io/arch
+      operator: In
+      values:
+        - amd64
+    - key: karpenter.k8s.aws/instance-category
+      operator: In
+      values:
+        - c
+        - m
+        - r
+        - t
+  taints:
+    - key: role
+      value: workflow
+      effect: NoSchedule
+  labels:
+    role: workflow
+  # TODO: this CPU limit could be parameterized
+  limits:
+    resources:
+      cpu: 1000
+  providerRef:
+    name: workflow
+  # Allow pods to be rearranged; index needs one argument per path segment, a single dotted string matches no key
+  consolidation:
+    enabled: {{ index .Values "karpenter-crds" "workflow" "consolidation" }}
+  # Kill nodes after 7 days (604800 s) to ensure they stay up to date
+  # TODO: this could be parameterized
+  ttlSecondsUntilExpired: 604800
+ {{ end }}
+{{ end }}
diff --git a/helm/cluster-level-resources/templates/karpenter-templates.yaml b/helm/cluster-level-resources/templates/karpenter-templates.yaml
new file mode 100644
index 00000000..1b184ed4
--- /dev/null
+++ b/helm/cluster-level-resources/templates/karpenter-templates.yaml
@@ -0,0 +1,30 @@
+{{ if index .Values "karpenter-templates" "enabled" }}
+apiVersion: argoproj.io/v1alpha1
+kind: Application # Argo CD Application for the helm/karpenter-templates chart; rendered only when karpenter-templates.enabled is truthy
+metadata:
+  name: karpenter-templates
+  namespace: argocd
+spec:
+  project: default
+  sources:
+    - path: helm/karpenter-templates # chart is read from a path in the git repo below, not from a chart repository
+      repoURL: https://github.com/uc-cdis/gen3-helm
+      targetRevision: {{ index .Values "karpenter-templates" "targetRevision" }}
+      helm:
+        releaseName: karpenter-templates
+        {{- if index .Values "karpenter-templates" "configuration" "enabled" }}
+        valueFiles: # only emitted when karpenter-templates.configuration.enabled is truthy; otherwise no overrides are passed
+          - $values/{{ .Values.cluster }}/cluster-values/karpenter-templates.yaml
+    - repoURL: {{ .Values.configuration.configurationRepo }}
+      targetRevision: {{ .Values.configuration.configurationRevision }}
+      ref: values
+        {{- end }}
+  destination:
+    server: "https://kubernetes.default.svc"
+    namespace: karpenter
+  syncPolicy:
+    syncOptions:
+    - CreateNamespace=true
+    automated:
+      selfHeal: true
+{{ end }}
diff --git a/helm/cluster-level-resources/templates/karpenter.yaml b/helm/cluster-level-resources/templates/karpenter.yaml
new file mode 100644
index 00000000..45a5666d
--- /dev/null
+++ b/helm/cluster-level-resources/templates/karpenter.yaml
@@ -0,0 +1,53 @@
+{{ if index .Values "karpenter" "enabled" }}
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+ name: karpenter
+ namespace: argocd
+spec:
+ project: default
+ sources:
+ - repoURL: 'https://github.com/aws/karpenter-provider-aws'
+ targetRevision: {{ .Values.karpenter.targetRevision }}
+ path: charts/karpenter
+ helm:
+ releaseName: karpenter
+ {{- if index .Values "karpenter" "configuration" "enabled" }}
+ valueFiles:
+ - "$values/{{ .Values.cluster }}/cluster-values/karpenter.yaml"
+ - repoURL: {{ .Values.configuration.configurationRepo }}
+ targetRevision: {{ .Values.configuration.configurationRevision }}
+ ref: values
+ {{- else }}
+ values: |
+ serviceAccount:
+ name: karpenter
+ create: true
+ annotations:
+ eks.amazonaws.com/role-arn: "arn:aws:iam::{{ .Values.accountNumber }}:role/{{ .Values.eksClusterName | default .Values.cluster }}-karpenter-sa"
+ settings:
+ clusterName: {{ .Values.eksClusterName | default .Values.cluster }}
+ controller:
+ image:
+ tag: {{ .Values.karpenter.controller.image.tag | default .Values.karpenter.targetRevision }}
+ digest: {{ .Values.karpenter.controller.image.digest }}
+ env:
+ - name: AWS_REGION
+ value: us-east-1
+ resources:
+ requests:
+ memory: {{ .Values.karpenter.resources.requests.memory }}
+ cpu: {{ .Values.karpenter.resources.requests.cpu }}
+ limits:
+ memory: {{ .Values.karpenter.resources.limits.memory }}
+ cpu: {{ .Values.karpenter.resources.limits.cpu }}
+ {{- end }}
+ destination:
+ server: "https://kubernetes.default.svc"
+ namespace: karpenter
+ syncPolicy:
+ syncOptions:
+ - CreateNamespace=true
+ automated:
+ selfHeal: true
+{{ end }}
diff --git a/helm/cluster-level-resources/templates/kube-state-metrics.yaml b/helm/cluster-level-resources/templates/kube-state-metrics.yaml
new file mode 100644
index 00000000..248d7993
--- /dev/null
+++ b/helm/cluster-level-resources/templates/kube-state-metrics.yaml
@@ -0,0 +1,36 @@
+{{ if index .Values "kube-state-metrics" "enabled" }}{{/* Argo CD Application installing the kube-state-metrics chart; rendered only when kube-state-metrics.enabled is true. */}}
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: kube-state-metrics
+  namespace: argocd
+spec:
+  project: default
+  sources:
+    - repoURL: https://prometheus-community.github.io/helm-charts
+      chart: kube-state-metrics
+      targetRevision: {{ index .Values "kube-state-metrics" "targetRevision" }}
+      helm:
+        releaseName: kube-state-metrics  # was left blank; made explicit like the sibling templates (same name Argo CD falls back to: the Application name)
+      {{- if index .Values "kube-state-metrics" "configuration" "enabled" }}
+        valueFiles:  # external configuration: values file read from the second source, referenced as "values"
+          - "$values/{{ .Values.cluster }}/cluster-values/kube-state-metrics.yaml"
+    - repoURL: {{ .Values.configuration.configurationRepo }}
+      targetRevision: {{ .Values.configuration.configurationRevision }}
+      ref: values
+      {{- else }}
+        values: |
+          podAnnotations:
+            prometheus.io/path: "/metrics"
+            prometheus.io/scrape: "true"
+            prometheus.io/port: "8080"
+      {{- end }}
+  syncPolicy:
+    syncOptions:
+    - CreateNamespace=true  # the monitoring namespace is created by Argo CD on first sync
+    automated:
+      selfHeal: true
+  destination:
+    server: https://kubernetes.default.svc
+    namespace: monitoring
+{{ end }}
diff --git a/helm/cluster-level-resources/templates/vpc-cni.yaml b/helm/cluster-level-resources/templates/vpc-cni.yaml
new file mode 100644
index 00000000..e535cce7
--- /dev/null
+++ b/helm/cluster-level-resources/templates/vpc-cni.yaml
@@ -0,0 +1,36 @@
+{{ if index .Values "vpc-cni" "enabled" }}{{/* Argo CD Application installing the aws-vpc-cni chart; rendered only when vpc-cni.enabled is true. */}}
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: vpc-cni
+  namespace: argocd
+spec:
+  project: default
+  sources:
+    - chart: aws-vpc-cni
+      repoURL: https://aws.github.io/eks-charts
+      targetRevision: {{ index .Values "vpc-cni" "targetRevision" }}
+      helm:
+        releaseName: vpc-cni
+      {{- if index .Values "vpc-cni" "configuration" "enabled" }}
+        valueFiles:  # external configuration: values file read from the second source, referenced as "values"
+          - "$values/{{ .Values.cluster }}/cluster-values/vpc-cni.yaml"
+    - repoURL: {{ .Values.configuration.configurationRepo }}
+      targetRevision: {{ .Values.configuration.configurationRevision }}  # was hard-coded to "master"; now follows the shared configuration revision like the other addons
+      ref: values
+      {{- else }}
+        values: |
+          enableNetworkPolicy: false
+          originalMatchLabels: true
+          env:
+            ANNOTATE_POD_IP: "true"
+      {{- end }}
+  destination:
+    server: "https://kubernetes.default.svc"
+    namespace: kube-system
+  syncPolicy:
+    syncOptions:
+    - CreateNamespace=false  # kube-system is the destination namespace, so none is created
+    automated:
+      selfHeal: true
+{{ end }}
diff --git a/helm/cluster-level-resources/values.yaml b/helm/cluster-level-resources/values.yaml
new file mode 100644
index 00000000..b5cef8da
--- /dev/null
+++ b/helm/cluster-level-resources/values.yaml
@@ -0,0 +1,788 @@
+# Name of the target cluster. Templates build value-file paths of the form
+# "<cluster>/cluster-values/<addon>.yaml" from it, so it must match the directory name in the configuration repo.
+cluster: "unfunded"
+project: "unfunded"  # sent, together with `cluster`, as an external label by the Alloy configuration further down
+
+# ID of the AWS account that hosts this cluster (kept quoted so it is always read as a string)
+accountNumber: "xxxxxxxxxxxx"
+
+# Shared by every addon: all external values files are assumed to live in this one repository, at this one
+# revision. Addons whose `configuration.enabled` is true read their values from here.
+configuration:
+  configurationRepo: "https://github.com/uc-cdis/gen3-gitops"
+  configurationRevision: "master"
+
+alb-controller:  # NOTE(review): this and the six addons below are assumed to follow the enabled / targetRevision / configuration.enabled pattern of the karpenter and vpc-cni templates — confirm
+  enabled: false
+  targetRevision: "1.7.1"
+  configuration:
+    enabled: false
+
+aws-s3-mountpoint:
+  enabled: false
+  targetRevision: "1.8.0"
+  configuration:
+    enabled: false
+
+calico:
+  enabled: false
+  targetRevision: "v3.27.0"
+  configuration:
+    enabled: false
+
+coreDNS:
+  enabled: false
+  targetRevision: "v1.29.0"
+  configuration:
+    enabled: false
+
+ebs-csi-driver:
+  enabled: false
+  targetRevision: "2.36.0"
+  configuration:
+    enabled: false
+
+fluentd:
+  enabled: false
+  targetRevision: "0.5.2"
+  configuration:
+    enabled: false
+
+grafana-alloy:
+  enabled: false
+  targetRevision: "0.4.0"
+  configuration:
+    enabled: false
+
+karpenter:
+  enabled: false               # the karpenter Argo CD Application is rendered only when true
+  targetRevision: "v0.32.9"    # revision of aws/karpenter-provider-aws whose charts/karpenter directory is installed
+  configuration:
+    enabled: false             # true: values come from <cluster>/cluster-values/karpenter.yaml in the configuration repo; false: inline values built from the keys below
+  controller:
+    image:
+      tag: "v0.32.9"           # if empty, the template falls back to targetRevision
+      digest: "sha256:0c142050d872cb0ac7b30a188ec36aa765b449718cde0c7e49f7495b28f47c29"  # pins the exact image; update together with tag
+  resources:                   # copied into the controller's resources in the inline values
+    requests:
+      cpu: "1"
+      memory: "1Gi"
+    limits:
+      cpu: "1"
+      memory: "1Gi"
+
+karpenter-templates:  # NOTE(review): consumed by templates/karpenter-templates.yaml, which is outside this view — confirm key semantics there
+  enabled: false
+  targetRevision: "feat/karpenter-templates"  # NOTE(review): default is a feature branch, not a release or master — confirm this is intended
+  configuration:
+    enabled: false
+
+karpenter-crds:  # NOTE(review): consumed by the karpenter-crd-*.yaml templates, which are outside this view; the notes below are inferred from key names only — confirm
+ enabled: false
+ targetRevision: master
+ amiSelectorName: "EKS-FIPS*"  # presumably a name pattern for selecting node AMIs — confirm
+ selectorTag: ""  # NOTE(review): purpose not visible from here — confirm
+ migration: false  # NOTE(review): purpose not visible from here — confirm
+ default:  # first of three identically shaped groups (default, jupyter, workflow), matching the three CRD template files
+ enabled: true
+ consolidation: true
+ consolidateAfter: "30s"
+ consolidationPolicy: "WhenEmpty"
+ expireAfter: "168h"
+ jupyter:  # same keys and defaults as the default group
+ enabled: true
+ consolidation: true
+ consolidateAfter: "30s"
+ consolidationPolicy: "WhenEmpty"
+ expireAfter: "168h"
+ workflow:  # same keys and defaults as the default group
+ enabled: true
+ consolidation: true
+ consolidateAfter: "30s"
+ consolidationPolicy: "WhenEmpty"
+ expireAfter: "168h"
+ sgSelector: ""  # NOTE(review): name suggests a security-group selector; which level it belongs to cannot be told from here — confirm
+
+kube-state-metrics:
+  enabled: false             # the kube-state-metrics Argo CD Application is rendered only when true
+  targetRevision: "5.28.0"   # version of the kube-state-metrics chart from the prometheus-community repository
+  configuration:
+    enabled: false           # true: values come from <cluster>/cluster-values/kube-state-metrics.yaml; false: inline pod annotations only
+
+vpc-cni:
+  enabled: false               # the vpc-cni Argo CD Application is rendered only when true
+  targetRevision: "v1.16.2"    # version of the aws-vpc-cni chart from the eks-charts repository
+  configuration:
+    enabled: false             # true: values come from <cluster>/cluster-values/vpc-cni.yaml; false: inline defaults in the template
+
+# =============================================================================================
+# THIS IS THE CONFIGURATION THAT GOES INTO THE ALLOY CONFIGMAP. CUSTOMIZE AT YOUR OWN PERIL!!!!
+# =============================================================================================
+alloy-configmap-data: |
+ logging { // Alloy's own runtime logs
+   format   = "json"
+   level    = "info"
+   write_to = [loki.write.endpoint.receiver] // sent through the loki.write "endpoint" component declared at the end of this config
+ }
+
+ /////////////////////// OTLP START ///////////////////////
+
+ otelcol.receiver.otlp "default" { // OTLP intake over HTTP and gRPC, both with the component's default settings
+   http {}
+   grpc {}
+
+   output {
+     traces  = [otelcol.processor.batch.default.input]
+     metrics = [otelcol.processor.batch.default.input]
+   }
+ }
+
+ otelcol.processor.batch "default" { // batches, then splits: traces to the Tempo exporter, metrics to the Prometheus exporter
+   output {
+     traces  = [otelcol.exporter.otlp.tempo.input]
+     metrics = [otelcol.exporter.prometheus.default.input]
+   }
+ }
+
+ otelcol.exporter.prometheus "default" { // hands OTLP metrics to the same remote_write used by the scrape pipelines
+   forward_to = [prometheus.remote_write.default.receiver]
+ }
+
+ otelcol.exporter.otlp "tempo" {
+   client {
+     // Destination for traces (by its name, presumably the Tempo distributor service in the monitoring namespace).
+     endpoint = "http://monitoring-tempo-distributor.monitoring:4317"
+     tls {
+       // Do not verify TLS certificates when connecting.
+       insecure_skip_verify = true
+       // The connection itself is made without transport security.
+       insecure = true
+     }
+   }
+ }
+
+
+ /////////////////////// OTLP END ///////////////////////
+
+ // Discover all pods; these targets feed the pod autodiscovery, node-exporter and pod-log relabel components below.
+ discovery.kubernetes "pods" {
+ role = "pod"
+ }
+
+ // Discover all services; these targets feed the service autodiscovery and kube-state-metrics relabel components below.
+ discovery.kubernetes "services" {
+ role = "service"
+ }
+
+ // Discover all nodes; these targets feed the kubelet relabel component below.
+ discovery.kubernetes "nodes" {
+ role = "node"
+ }
+
+ // Annotation-driven pod discovery: only pods annotated "prometheus.io/scrape: true" survive the first rule.
+ discovery.relabel "annotation_autodiscovery_pods" {
+   targets = discovery.kubernetes.pods.targets
+   rule {
+     action        = "keep"
+     source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_scrape"]
+     regex         = "true"
+   }
+   rule {
+     action        = "replace"
+     source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_job"]
+     target_label  = "job"
+   }
+   rule {
+     action        = "replace"
+     source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_instance"]
+     target_label  = "instance"
+   }
+   rule {
+     action        = "replace"
+     source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_path"]
+     target_label  = "__metrics_path__"
+   }
+
+   // Port selection by name.
+   // Discovery yields one target per declared container port. __tmp_port starts as that port's name and is
+   // overridden by the portName annotation when it is set; keepequal then drops targets whose port name differs.
+   rule {
+     target_label  = "__tmp_port"
+     source_labels = ["__meta_kubernetes_pod_container_port_name"]
+   }
+   rule {
+     target_label  = "__tmp_port"
+     source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_portName"]
+     regex         = "(.+)"
+   }
+   rule {
+     action        = "keepequal"
+     source_labels = ["__meta_kubernetes_pod_container_port_name"]
+     target_label  = "__tmp_port"
+   }
+
+   // Port selection by number: when the port annotation is set, rewrite the scrape address to <pod ip>:<port>,
+   // whether or not that port is declared on the pod.
+   rule {
+     target_label  = "__address__"
+     source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_port", "__meta_kubernetes_pod_ip"]
+     regex         = "(\\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})"
+     replacement   = "[$2]:$1" // IPv6
+   }
+   rule {
+     target_label  = "__address__"
+     source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_port", "__meta_kubernetes_pod_ip"]
+     regex         = "(\\d+);((([0-9]+?)(\\.|$)){4})" // IPv4, takes priority over IPv6 when both exists
+     replacement   = "$2:$1"
+   }
+
+   rule {
+     action        = "replace"
+     source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_scheme"]
+     target_label  = "__scheme__"
+   }
+
+
+   // Identifying labels copied from pod metadata.
+   rule {
+     target_label  = "pod"
+     source_labels = ["__meta_kubernetes_pod_name"]
+   }
+   rule {
+     target_label  = "container"
+     source_labels = ["__meta_kubernetes_pod_container_name"]
+   }
+   rule {
+     target_label  = "controller"
+     source_labels = ["__meta_kubernetes_pod_controller_name"]
+   }
+
+   rule {
+     target_label  = "namespace"
+     source_labels = ["__meta_kubernetes_namespace"]
+   }
+
+
+   rule {
+     target_label  = "app"
+     source_labels = ["__meta_kubernetes_pod_label_app"]
+   }
+
+   // Expose every pod label as a target label of the same name.
+   rule {
+     regex  = "__meta_kubernetes_pod_label_(.+)"
+     action = "labelmap"
+   }
+ }
+
+ // Annotation-driven service discovery: only services annotated
+ // "prometheus.io/scrape: true" survive the first rule.
+ discovery.relabel "annotation_autodiscovery_services" {
+   targets = discovery.kubernetes.services.targets
+   rule {
+     action        = "keep"
+     source_labels = ["__meta_kubernetes_service_annotation_prometheus_io_scrape"]
+     regex         = "true"
+   }
+   rule {
+     action        = "replace"
+     source_labels = ["__meta_kubernetes_service_annotation_prometheus_io_job"]
+     target_label  = "job"
+   }
+   rule {
+     action        = "replace"
+     source_labels = ["__meta_kubernetes_service_annotation_prometheus_io_instance"]
+     target_label  = "instance"
+   }
+   rule {
+     action        = "replace"
+     source_labels = ["__meta_kubernetes_service_annotation_prometheus_io_path"]
+     target_label  = "__metrics_path__"
+   }
+
+   // Port selection, first by name (portName annotation), then by number (port annotation); same __tmp_port / keepequal pattern each time.
+   rule {
+     target_label  = "__tmp_port"
+     source_labels = ["__meta_kubernetes_service_port_name"]
+   }
+   rule {
+     target_label  = "__tmp_port"
+     source_labels = ["__meta_kubernetes_service_annotation_prometheus_io_portName"]
+     regex         = "(.+)"
+   }
+   rule {
+     action        = "keepequal"
+     source_labels = ["__meta_kubernetes_service_port_name"]
+     target_label  = "__tmp_port"
+   }
+
+   rule {
+     target_label  = "__tmp_port"
+     source_labels = ["__meta_kubernetes_service_port_number"]
+   }
+   rule {
+     target_label  = "__tmp_port"
+     source_labels = ["__meta_kubernetes_service_annotation_prometheus_io_port"]
+     regex         = "(.+)"
+   }
+   rule {
+     action        = "keepequal"
+     source_labels = ["__meta_kubernetes_service_port_number"]
+     target_label  = "__tmp_port"
+   }
+
+   rule {
+     action        = "replace"
+     source_labels = ["__meta_kubernetes_service_annotation_prometheus_io_scheme"]
+     target_label  = "__scheme__"
+   }
+ }
+
+ prometheus.scrape "metrics" { // scrapes the union of annotated pods and annotated services
+   targets      = concat(discovery.relabel.annotation_autodiscovery_pods.output, discovery.relabel.annotation_autodiscovery_services.output)
+   job_name     = "integrations/autodiscovery_metrics"
+   honor_labels = true
+   forward_to   = [prometheus.relabel.metrics_service.receiver]
+   clustering {
+     enabled = true
+   }
+ }
+
+
+ // Node Exporter: select the node-exporter pods, scrape them, keep an allow-list of metrics.
+ // TODO: replace with https://grafana.com/docs/alloy/latest/reference/components/prometheus.exporter.unix/
+ discovery.relabel "node_exporter" {
+   targets = discovery.kubernetes.pods.targets
+   rule {
+     action        = "keep"
+     source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_instance"]
+     regex         = "monitoring-extras"
+   }
+   rule {
+     action        = "keep"
+     source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
+     regex         = "node-exporter"
+   }
+   rule {
+     action        = "replace"
+     source_labels = ["__meta_kubernetes_pod_node_name"]
+     target_label  = "instance"
+   }
+ }
+
+ prometheus.scrape "node_exporter" {
+   targets         = discovery.relabel.node_exporter.output
+   job_name        = "integrations/node_exporter"
+   scrape_interval = "60s"
+   forward_to      = [prometheus.relabel.node_exporter.receiver]
+   clustering {
+     enabled = true
+   }
+ }
+
+ prometheus.relabel "node_exporter" { // metric allow-list applied before the shared metrics_service hop
+   rule {
+     action        = "keep"
+     source_labels = ["__name__"]
+     regex         = "up|node_cpu.*|node_network.*|node_exporter_build_info|node_filesystem.*|node_memory.*|process_cpu_seconds_total|process_resident_memory_bytes"
+   }
+   forward_to = [prometheus.relabel.metrics_service.receiver]
+ }
+
+
+ // cAdvisor
+ // discovery.relabel "cadvisor" {
+ // targets = discovery.kubernetes.nodes.targets
+ // rule {
+ // target_label = "__address__"
+ // replacement = "kubernetes.default.svc.cluster.local:443"
+ // }
+ // rule {
+ // source_labels = ["__meta_kubernetes_node_name"]
+ // regex = "(.+)"
+ // replacement = "/api/v1/nodes/${1}/proxy/metrics/cadvisor"
+ // target_label = "__metrics_path__"
+ // }
+ // }
+
+ // prometheus.scrape "cadvisor" {
+ // job_name = "integrations/kubernetes/cadvisor"
+ // targets = discovery.relabel.cadvisor.output
+ // scheme = "https"
+ // scrape_interval = "60s"
+ // bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+ // tls_config {
+ // insecure_skip_verify = true
+ // }
+ // clustering {
+ // enabled = true
+ // }
+ // forward_to = [prometheus.relabel.cadvisor.receiver]
+ //}
+
+ //prometheus.relabel "cadvisor" {
+ // rule {
+ // source_labels = ["__name__"]
+ // regex = "up|container_cpu_cfs_periods_total|container_cpu_cfs_throttled_periods_total|container_cpu_usage_seconds_total|container_fs_reads_bytes_total|container_fs_reads_total|container_fs_writes_bytes_total|container_fs_writes_total|container_memory_cache|container_memory_rss|container_memory_swap|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_receive_packets_dropped_total|container_network_receive_packets_total|container_network_transmit_bytes_total|container_network_transmit_packets_dropped_total|container_network_transmit_packets_total|machine_memory_bytes"
+ // action = "keep"
+ // }
+ // forward_to = [prometheus.relabel.metrics_service.receiver]
+ // }
+
+ // Pod logs: label every discovered pod target, then tail its logs into Loki.
+ discovery.relabel "all_pods" {
+   targets = discovery.kubernetes.pods.targets
+   rule {
+     target_label  = "namespace"
+     source_labels = ["__meta_kubernetes_namespace"]
+   }
+   rule {
+     target_label  = "pod"
+     source_labels = ["__meta_kubernetes_pod_name"]
+   }
+   rule {
+     target_label  = "container"
+     source_labels = ["__meta_kubernetes_pod_container_name"]
+   }
+   rule {
+     target_label  = "controller"
+     source_labels = ["__meta_kubernetes_pod_controller_name"]
+   }
+
+   rule {
+     target_label  = "app"
+     source_labels = ["__meta_kubernetes_pod_label_app"]
+   }
+
+   // Expose every pod label as a target label of the same name.
+   rule {
+     regex  = "__meta_kubernetes_pod_label_(.+)"
+     action = "labelmap"
+   }
+
+ }
+
+ loki.source.kubernetes "pods" {
+   forward_to = [loki.write.endpoint.receiver]
+   targets    = discovery.relabel.all_pods.output
+ }
+
+ // kube-state-metrics: keep one specific service in the monitoring namespace and scrape it.
+ discovery.relabel "relabel_kube_state_metrics" {
+   targets = discovery.kubernetes.services.targets
+   rule {
+     action        = "keep"
+     source_labels = ["__meta_kubernetes_namespace"]
+     regex         = "monitoring"
+   }
+   rule {
+     action        = "keep"
+     source_labels = ["__meta_kubernetes_service_name"]
+     regex         = "monitoring-extras-kube-state-metrics" // NOTE(review): fixed service name — confirm it matches how kube-state-metrics is actually released
+   }
+ }
+
+ prometheus.scrape "kube_state_metrics" { // NOTE(review): unlike the other scrapes, this one has no clustering block — confirm that is intentional
+   job_name     = "kube-state-metrics"
+   targets      = discovery.relabel.relabel_kube_state_metrics.output
+   metrics_path = "/metrics"
+   forward_to   = [prometheus.remote_write.default.receiver] // goes straight to remote_write, bypassing metrics_service
+ }
+
+ // Kubelet: each node's metrics are scraped through the API server's node proxy path rather than from the node directly.
+ discovery.relabel "kubelet" {
+   targets = discovery.kubernetes.nodes.targets
+   rule {
+     replacement  = "kubernetes.default.svc.cluster.local:443"
+     target_label = "__address__"
+   }
+   rule {
+     target_label  = "__metrics_path__"
+     source_labels = ["__meta_kubernetes_node_name"]
+     regex         = "(.+)"
+     replacement   = "/api/v1/nodes/${1}/proxy/metrics"
+   }
+ }
+
+ prometheus.scrape "kubelet" {
+   targets           = discovery.relabel.kubelet.output
+   job_name          = "integrations/kubernetes/kubelet"
+   scrape_interval   = "60s"
+   scheme            = "https"
+   bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token"
+   forward_to        = [prometheus.relabel.kubelet.receiver]
+   tls_config {
+     insecure_skip_verify = true
+   }
+   clustering {
+     enabled = true
+   }
+ }
+
+ prometheus.relabel "kubelet" { // metric allow-list applied before the shared metrics_service hop
+   rule {
+     action        = "keep"
+     source_labels = ["__name__"]
+     regex         = "up|container_cpu_usage_seconds_total|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_certificate_manager_client_ttl_seconds|kubelet_certificate_manager_server_ttl_seconds|kubelet_cgroup_manager_duration_seconds_bucket|kubelet_cgroup_manager_duration_seconds_count|kubelet_node_config_error|kubelet_node_name|kubelet_pleg_relist_duration_seconds_bucket|kubelet_pleg_relist_duration_seconds_count|kubelet_pleg_relist_interval_seconds_bucket|kubelet_pod_start_duration_seconds_bucket|kubelet_pod_start_duration_seconds_count|kubelet_pod_worker_duration_seconds_bucket|kubelet_pod_worker_duration_seconds_count|kubelet_running_container_count|kubelet_running_containers|kubelet_running_pod_count|kubelet_running_pods|kubelet_runtime_operations_errors_total|kubelet_runtime_operations_total|kubelet_server_expiration_renew_errors|kubelet_volume_stats_available_bytes|kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_inodes|kubelet_volume_stats_inodes_used|kubernetes_build_info|namespace_workload_pod|rest_client_requests_total|storage_operation_duration_seconds_count|storage_operation_errors_total|volume_manager_total_volumes"
+   }
+   forward_to = [prometheus.relabel.metrics_service.receiver]
+ }
+
+ // Kubernetes cluster events, written to Loki in logfmt.
+ loki.source.kubernetes_events "cluster_events" {
+   forward_to = [loki.write.endpoint.receiver]
+   log_format = "logfmt"
+   job_name   = "integrations/kubernetes/eventhandler"
+ }
+
+
+ // Rule-less pass-through: the autodiscovery, node-exporter and kubelet pipelines all forward here, giving one shared hop before remote_write.
+ prometheus.relabel "metrics_service" {
+   forward_to = [prometheus.remote_write.default.receiver] // no rules are defined, so samples pass through unchanged
+ }
+
+
+ // Write endpoints. The cluster/project label values are Helm expressions (NOTE(review): presumably expanded when the configmap template renders this string — confirm).
+ // Metrics: Prometheus remote write.
+ prometheus.remote_write "default" {
+   external_labels = {
+     project = "{{ .Values.project }}",
+     cluster = "{{ .Values.cluster }}",
+   }
+   endpoint {
+     url = "https://mimir.planx-pla.net/api/v1/push"
+
+     headers = {
+       "X-Scope-OrgID" = "anonymous",
+     }
+
+   }
+ }
+
+ // Logs: Loki push API.
+ loki.write "endpoint" {
+   external_labels = {
+     project = "{{ .Values.project }}",
+     cluster = "{{ .Values.cluster }}",
+   }
+   endpoint {
+     url = "https://loki.planx-pla.net/loki/api/v1/push"
+   }
+ }
+
+# =======================================================================
+# THIS IS THE CONFIGURATION FOR FLUENTD. CUSTOMIZE AT YOUR OWN PERIL!!!!!
+# =======================================================================
+fluentd-configmap-data: |
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ @type kubernetes_metadata
+ @id filter_kube_metadata
+ kubernetes_url "#{ENV['FLUENT_FILTER_KUBERNETES_URL'] || 'https://' + ENV.fetch('KUBERNETES_SERVICE_HOST') + ':' + ENV.fetch('KUBERNETES_SERVICE_PORT') + '/api'}"
+ verify_ssl "#{ENV['KUBERNETES_VERIFY_SSL'] || true}"
+ ca_file "#{ENV['KUBERNETES_CA_FILE']}"
+ skip_labels "#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_LABELS'] || 'false'}"
+ skip_container_metadata "#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_CONTAINER_METADATA'] || 'false'}"
+ skip_master_url "#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_MASTER_URL'] || 'false'}"
+ skip_namespace_metadata "#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_NAMESPACE_METADATA'] || 'false'}"
+
+
+
+ @type null
+
+
+
+ @type null
+
+
+
+ @type rewrite_tag_filter
+
+ key $._HOSTNAME
+ pattern ^(.+)$
+ tag $1.docker
+
+
+
+
+ @type rewrite_tag_filter
+
+ key $._HOSTNAME
+ pattern ^(.+)$
+ tag $1.kubelet
+
+
+
+
+ @type rewrite_tag_filter
+
+ key $.host
+ pattern ^(.+)$
+ tag $1.messages
+
+
+
+
+ @type rewrite_tag_filter
+
+ key $.host
+ pattern ^(.+)$
+ tag $1.secure
+
+
+
+
+ @type rewrite_tag_filter
+
+ # json structured log - consider adoption a standard json schema:
+ # https://github.com/timberio/log-event-json-schema
+ key message
+ pattern /^\{\s*"gen3log":/
+ tag kubernetes.gen3.json.${tag}
+
+
+ # combined log format - default Apache and nginx structure
+ # https://httpd.apache.org/docs/1.3/logs.html#combined
+ key message
+ pattern /^(((\d+\.\d+\.\d+\.\d+)|-)\s+){2}\S+\s+\[\d\d?\//
+ tag kubernetes.gen3.combined.${tag}
+
+
+ # unstructured log line
+ key message
+ pattern /\S/
+ tag kubernetes.gen3.raw.${tag}
+
+
+
+
+
+ @type record_transformer
+
+ log_type json
+ # This one doesn't work for whatever reason, if you do ${record["kubernetes"]} the whole blob would be added, but can't access subobjects
+ #container_name ${record["kubernetes"]["container_name"]}
+
+
+
+
+ @type record_transformer
+
+ log_type combined
+
+
+
+
+ @type record_transformer
+
+ log_type raw
+
+
+
+
+ @type rewrite_tag_filter
+
+ key $.kubernetes.pod_name
+ pattern ^(.+)$
+ tag "#{Time.now.strftime('%Y-%m-%d')}.$1"
+
+ #
+ # key $.kubernetes
+ # pattern ^(.+)$
+ # tag $1.container_name
+ #
+
+
+ #
+ # @type rewrite_tag_filter
+ #
+ # key $.kubernetes.container_name
+ # pattern ^(.+)$
+ #tag $1.${tag}
+ # tag ${tag}.$1
+ #
+ #
+
+ # TODO:
+ # * python stack traces: "Traceback (most recent call last):""
+ # https://docs.fluentd.org/v0.12/articles/parser_multiline#formatn
+ #
+ # Idea: add `visitor` cookie to revproxy ...
+
+
+
+ @type cloudwatch_logs
+ @id out_cloudwatch_logs
+ log_group_name "#{ENV['LOG_GROUP_NAME']}"
+ auto_create_stream true
+ use_tag_as_stream true
+ retention_in_days "#{ENV['RETENTION_IN_DAYS'] || 'nil'}"
+ json_handler yajl # To avoid UndefinedConversionError
+ log_rejected_request "#{ENV['LOG_REJECTED_REQUEST']}" # Log rejected request for missing parts
+