From dccf86f64e543ff9fcad2285133e8cd79f923acd Mon Sep 17 00:00:00 2001 From: Aidan Hilt <11202897+AidanHilt@users.noreply.github.com> Date: Tue, 14 Jan 2025 12:02:15 -0500 Subject: [PATCH] First commit of cluster-level-resources publicly (#233) --- helm/cluster-level-resources/.helmignore | 23 + helm/cluster-level-resources/App.yaml | 17 + helm/cluster-level-resources/Chart.yaml | 9 + helm/cluster-level-resources/README.md | 77 ++ .../templates/alb-controller.yaml | 38 + .../templates/alloy-configmap.yaml | 9 + .../templates/aws-s3-mountpoint.yaml | 36 + .../templates/calico.yaml | 37 + .../templates/coreDNS.yaml | 57 ++ .../templates/ebs-csi-driver.yaml | 30 + .../templates/fluentd.yaml | 43 + .../templates/grafana-alloy.yaml | 66 ++ .../templates/karpenter-crd-default.yaml | 291 +++++++ .../templates/karpenter-crd-jupyter.yaml | 294 +++++++ .../templates/karpenter-crd-workflow.yaml | 307 +++++++ .../templates/karpenter-templates.yaml | 30 + .../templates/karpenter.yaml | 53 ++ .../templates/kube-state-metrics.yaml | 36 + .../templates/vpc-cni.yaml | 36 + helm/cluster-level-resources/values.yaml | 788 ++++++++++++++++++ 20 files changed, 2277 insertions(+) create mode 100644 helm/cluster-level-resources/.helmignore create mode 100644 helm/cluster-level-resources/App.yaml create mode 100644 helm/cluster-level-resources/Chart.yaml create mode 100644 helm/cluster-level-resources/README.md create mode 100644 helm/cluster-level-resources/templates/alb-controller.yaml create mode 100644 helm/cluster-level-resources/templates/alloy-configmap.yaml create mode 100644 helm/cluster-level-resources/templates/aws-s3-mountpoint.yaml create mode 100644 helm/cluster-level-resources/templates/calico.yaml create mode 100644 helm/cluster-level-resources/templates/coreDNS.yaml create mode 100644 helm/cluster-level-resources/templates/ebs-csi-driver.yaml create mode 100644 helm/cluster-level-resources/templates/fluentd.yaml create mode 100644 helm/cluster-level-resources/templates/grafana-alloy.yaml create mode 100644 helm/cluster-level-resources/templates/karpenter-crd-default.yaml create mode 100644 helm/cluster-level-resources/templates/karpenter-crd-jupyter.yaml create mode 100644 helm/cluster-level-resources/templates/karpenter-crd-workflow.yaml create mode 100644 helm/cluster-level-resources/templates/karpenter-templates.yaml create mode 100644 helm/cluster-level-resources/templates/karpenter.yaml create mode 100644 helm/cluster-level-resources/templates/kube-state-metrics.yaml create mode 100644 helm/cluster-level-resources/templates/vpc-cni.yaml create mode 100644 helm/cluster-level-resources/values.yaml diff --git a/helm/cluster-level-resources/.helmignore b/helm/cluster-level-resources/.helmignore new file mode 100644 index 00000000..0e8a0eb3 --- /dev/null +++ b/helm/cluster-level-resources/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/helm/cluster-level-resources/App.yaml b/helm/cluster-level-resources/App.yaml new file mode 100644 index 00000000..872e58d2 --- /dev/null +++ b/helm/cluster-level-resources/App.yaml @@ -0,0 +1,17 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: cluster-level-resources + namespace: argocd +spec: + project: default + destination: + namespace: argocd + server: https://kubernetes.default.svc + source: + repoURL: https://github.com/uc-cdis/gen3-gitops.git + targetRevision: master + path: cluster-level-resources + syncPolicy: + automated: + selfHeal: true diff --git a/helm/cluster-level-resources/Chart.yaml b/helm/cluster-level-resources/Chart.yaml new file mode 100644 index 00000000..c16adaab --- /dev/null +++ b/helm/cluster-level-resources/Chart.yaml @@ -0,0 +1,9 @@ +apiVersion: v2 +name: cluster-level-resources +description: An app-of-apps Helm chart that allows for flexible deployment of resources that support Gen3 + +type: application + +version: 0.5.3 + +appVersion: "1.17.0" diff --git a/helm/cluster-level-resources/README.md b/helm/cluster-level-resources/README.md new file mode 100644 index 00000000..39b9bd1a --- /dev/null +++ b/helm/cluster-level-resources/README.md @@ -0,0 +1,77 @@ +# cluster-level-resources + +![Version: 0.5.3](https://img.shields.io/badge/Version-0.5.3-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.17.0](https://img.shields.io/badge/AppVersion-1.17.0-informational?style=flat-square) + +An app-of-apps Helm chart that allows for flexible deployment of resources that support Gen3 + +## Values + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| accountNumber | string | `"xxxxxxxxxxxx"` | | +| alb-controller.configuration.enabled | bool | `false` | | +| alb-controller.enabled | bool | `false` | | +| alb-controller.targetRevision | string | `"1.7.1"` | | +| alloy-configmap-data | string | `"logging {\n level = \"info\"\n format = \"json\"\n write_to = [loki.write.endpoint.receiver]\n}\n\n/////////////////////// OTLP START ///////////////////////\n\notelcol.receiver.otlp \"default\" {\n grpc {}\n http {}\n\n output {\n metrics = [otelcol.processor.batch.default.input]\n traces = [otelcol.processor.batch.default.input]\n }\n}\n\notelcol.processor.batch \"default\" {\n output {\n metrics = [otelcol.exporter.prometheus.default.input]\n traces = [otelcol.exporter.otlp.tempo.input]\n }\n}\n\notelcol.exporter.prometheus \"default\" {\n forward_to = [prometheus.remote_write.default.receiver]\n}\n\notelcol.exporter.otlp \"tempo\" {\n client {\n endpoint = \"http://monitoring-tempo-distributor.monitoring:4317\"\n // Configure TLS settings for communicating with the endpoint.\n tls {\n // The connection is insecure.\n insecure = true\n // Do not verify TLS certificates when connecting.\n insecure_skip_verify = true\n }\n }\n}\n\n\n/////////////////////// OTLP END ///////////////////////\n\n// discover all pods, to be used later in this config\ndiscovery.kubernetes \"pods\" {\n role = \"pod\"\n}\n\n// discover all services, to be used later in this config\ndiscovery.kubernetes \"services\" {\n role = \"service\"\n}\n\n// discover all nodes, to be used later in this config\ndiscovery.kubernetes \"nodes\" {\n role = 
\"node\"\n}\n\n// Generic scrape of any pod with Annotation \"prometheus.io/scrape: true\"\ndiscovery.relabel \"annotation_autodiscovery_pods\" {\n targets = discovery.kubernetes.pods.targets\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_scrape\"]\n regex = \"true\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_job\"]\n action = \"replace\"\n target_label = \"job\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_instance\"]\n action = \"replace\"\n target_label = \"instance\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_path\"]\n action = \"replace\"\n target_label = \"__metrics_path__\"\n }\n\n // Choose the pod port\n // The discovery generates a target for each declared container port of the pod.\n // If the metricsPortName annotation has value, keep only the target where the port name matches the one of the annotation.\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_port_name\"]\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_portName\"]\n regex = \"(.+)\"\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_port_name\"]\n action = \"keepequal\"\n target_label = \"__tmp_port\"\n }\n\n // If the metrics port number annotation has a value, override the target address to use it, regardless whether it is\n // one of the declared ports on that Pod.\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_port\", \"__meta_kubernetes_pod_ip\"]\n regex = \"(\\\\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})\"\n replacement = \"[$2]:$1\" // IPv6\n target_label = \"__address__\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_port\", \"__meta_kubernetes_pod_ip\"]\n regex = \"(\\\\d+);((([0-9]+?)(\\\\.|$)){4})\" // IPv4, takes priority over IPv6 when both exists\n replacement = \"$2:$1\"\n target_label = \"__address__\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_pod_annotation_prometheus_io_scheme\"]\n action = \"replace\"\n target_label = \"__scheme__\"\n }\n\n\n // add labels\n rule {\n source_labels = [\"__meta_kubernetes_pod_name\"]\n target_label = \"pod\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_name\"]\n target_label = \"container\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_controller_name\"]\n target_label = \"controller\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n target_label = \"namespace\"\n }\n\n\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app\"]\n target_label = \"app\"\n }\n\n // map all labels\n rule {\n action = \"labelmap\"\n regex = \"__meta_kubernetes_pod_label_(.+)\"\n }\n}\n\n// Generic scrape of any service with\n// Annotation Autodiscovery\ndiscovery.relabel \"annotation_autodiscovery_services\" {\n targets = discovery.kubernetes.services.targets\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_scrape\"]\n regex = \"true\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_job\"]\n action = \"replace\"\n target_label = \"job\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_instance\"]\n action = \"replace\"\n target_label = \"instance\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_path\"]\n 
action = \"replace\"\n target_label = \"__metrics_path__\"\n }\n\n // Choose the service port\n rule {\n source_labels = [\"__meta_kubernetes_service_port_name\"]\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_portName\"]\n regex = \"(.+)\"\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_port_name\"]\n action = \"keepequal\"\n target_label = \"__tmp_port\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_service_port_number\"]\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_port\"]\n regex = \"(.+)\"\n target_label = \"__tmp_port\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_port_number\"]\n action = \"keepequal\"\n target_label = \"__tmp_port\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_service_annotation_prometheus_io_scheme\"]\n action = \"replace\"\n target_label = \"__scheme__\"\n }\n}\n\nprometheus.scrape \"metrics\" {\n job_name = \"integrations/autodiscovery_metrics\"\n targets = concat(discovery.relabel.annotation_autodiscovery_pods.output, discovery.relabel.annotation_autodiscovery_services.output)\n honor_labels = true\n clustering {\n enabled = true\n }\n forward_to = [prometheus.relabel.metrics_service.receiver]\n}\n\n\n// Node Exporter\n// TODO: replace with https://grafana.com/docs/alloy/latest/reference/components/prometheus.exporter.unix/\ndiscovery.relabel \"node_exporter\" {\n targets = discovery.kubernetes.pods.targets\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app_kubernetes_io_instance\"]\n regex = \"monitoring-extras\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app_kubernetes_io_name\"]\n regex = \"node-exporter\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_node_name\"]\n action = \"replace\"\n target_label = \"instance\"\n }\n}\n\nprometheus.scrape \"node_exporter\" {\n job_name = \"integrations/node_exporter\"\n targets = discovery.relabel.node_exporter.output\n scrape_interval = \"60s\"\n clustering {\n enabled = true\n }\n forward_to = [prometheus.relabel.node_exporter.receiver]\n}\n\nprometheus.relabel \"node_exporter\" {\n rule {\n source_labels = [\"__name__\"]\n regex = \"up|node_cpu.*|node_network.*|node_exporter_build_info|node_filesystem.*|node_memory.*|process_cpu_seconds_total|process_resident_memory_bytes\"\n action = \"keep\"\n }\n forward_to = [prometheus.relabel.metrics_service.receiver]\n}\n\n\n// cAdvisor\n// discovery.relabel \"cadvisor\" {\n// targets = discovery.kubernetes.nodes.targets\n// rule {\n// target_label = \"__address__\"\n// replacement = \"kubernetes.default.svc.cluster.local:443\"\n// }\n// rule {\n// source_labels = [\"__meta_kubernetes_node_name\"]\n// regex = \"(.+)\"\n// replacement = \"/api/v1/nodes/${1}/proxy/metrics/cadvisor\"\n// target_label = \"__metrics_path__\"\n// }\n// }\n\n// prometheus.scrape \"cadvisor\" {\n// job_name = \"integrations/kubernetes/cadvisor\"\n// targets = discovery.relabel.cadvisor.output\n// scheme = \"https\"\n// scrape_interval = \"60s\"\n// bearer_token_file = \"/var/run/secrets/kubernetes.io/serviceaccount/token\"\n// tls_config {\n// insecure_skip_verify = true\n// }\n// clustering {\n// enabled = true\n// }\n// forward_to = [prometheus.relabel.cadvisor.receiver]\n//}\n\n//prometheus.relabel \"cadvisor\" {\n// rule {\n// source_labels = [\"__name__\"]\n// regex = 
\"up|container_cpu_cfs_periods_total|container_cpu_cfs_throttled_periods_total|container_cpu_usage_seconds_total|container_fs_reads_bytes_total|container_fs_reads_total|container_fs_writes_bytes_total|container_fs_writes_total|container_memory_cache|container_memory_rss|container_memory_swap|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_receive_packets_dropped_total|container_network_receive_packets_total|container_network_transmit_bytes_total|container_network_transmit_packets_dropped_total|container_network_transmit_packets_total|machine_memory_bytes\"\n// action = \"keep\"\n// }\n// forward_to = [prometheus.relabel.metrics_service.receiver]\n// }\n\n// Logs from all pods\ndiscovery.relabel \"all_pods\" {\n targets = discovery.kubernetes.pods.targets\n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n target_label = \"namespace\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_name\"]\n target_label = \"pod\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_container_name\"]\n target_label = \"container\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_pod_controller_name\"]\n target_label = \"controller\"\n }\n\n rule {\n source_labels = [\"__meta_kubernetes_pod_label_app\"]\n target_label = \"app\"\n }\n\n // map all labels\n rule {\n action = \"labelmap\"\n regex = \"__meta_kubernetes_pod_label_(.+)\"\n }\n\n}\n\nloki.source.kubernetes \"pods\" {\n targets = discovery.relabel.all_pods.output\n forward_to = [loki.write.endpoint.receiver]\n}\n\n// kube-state-metrics\ndiscovery.relabel \"relabel_kube_state_metrics\" {\n targets = discovery.kubernetes.services.targets\n rule {\n source_labels = [\"__meta_kubernetes_namespace\"]\n regex = \"monitoring\"\n action = \"keep\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_service_name\"]\n regex = \"monitoring-extras-kube-state-metrics\"\n action = \"keep\"\n }\n}\n\nprometheus.scrape \"kube_state_metrics\" {\n targets = discovery.relabel.relabel_kube_state_metrics.output\n job_name = \"kube-state-metrics\"\n metrics_path = \"/metrics\"\n forward_to = [prometheus.remote_write.default.receiver]\n}\n\n// Kubelet\ndiscovery.relabel \"kubelet\" {\n targets = discovery.kubernetes.nodes.targets\n rule {\n target_label = \"__address__\"\n replacement = \"kubernetes.default.svc.cluster.local:443\"\n }\n rule {\n source_labels = [\"__meta_kubernetes_node_name\"]\n regex = \"(.+)\"\n replacement = \"/api/v1/nodes/${1}/proxy/metrics\"\n target_label = \"__metrics_path__\"\n }\n}\n\nprometheus.scrape \"kubelet\" {\n job_name = \"integrations/kubernetes/kubelet\"\n targets = discovery.relabel.kubelet.output\n scheme = \"https\"\n scrape_interval = \"60s\"\n bearer_token_file = \"/var/run/secrets/kubernetes.io/serviceaccount/token\"\n tls_config {\n insecure_skip_verify = true\n }\n clustering {\n enabled = true\n }\n forward_to = [prometheus.relabel.kubelet.receiver]\n}\n\nprometheus.relabel \"kubelet\" {\n rule {\n source_labels = [\"__name__\"]\n regex = 
\"up|container_cpu_usage_seconds_total|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_certificate_manager_client_ttl_seconds|kubelet_certificate_manager_server_ttl_seconds|kubelet_cgroup_manager_duration_seconds_bucket|kubelet_cgroup_manager_duration_seconds_count|kubelet_node_config_error|kubelet_node_name|kubelet_pleg_relist_duration_seconds_bucket|kubelet_pleg_relist_duration_seconds_count|kubelet_pleg_relist_interval_seconds_bucket|kubelet_pod_start_duration_seconds_bucket|kubelet_pod_start_duration_seconds_count|kubelet_pod_worker_duration_seconds_bucket|kubelet_pod_worker_duration_seconds_count|kubelet_running_container_count|kubelet_running_containers|kubelet_running_pod_count|kubelet_running_pods|kubelet_runtime_operations_errors_total|kubelet_runtime_operations_total|kubelet_server_expiration_renew_errors|kubelet_volume_stats_available_bytes|kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_inodes|kubelet_volume_stats_inodes_used|kubernetes_build_info|namespace_workload_pod|rest_client_requests_total|storage_operation_duration_seconds_count|storage_operation_errors_total|volume_manager_total_volumes\"\n action = \"keep\"\n }\n forward_to = [prometheus.relabel.metrics_service.receiver]\n}\n\n// Cluster Events\nloki.source.kubernetes_events \"cluster_events\" {\n job_name = \"integrations/kubernetes/eventhandler\"\n log_format = \"logfmt\"\n forward_to = [loki.write.endpoint.receiver]\n}\n\n\n// Why is this needed?\nprometheus.relabel \"metrics_service\" {\n forward_to = [prometheus.remote_write.default.receiver]\n}\n\n\n// Write Endpoints\n// prometheus write endpoint\nprometheus.remote_write \"default\" {\n external_labels = {\n cluster = \"{{ .Values.cluster }}\",\n project = \"{{ .Values.project }}\",\n }\n endpoint {\n url = \"https://mimir.planx-pla.net/api/v1/push\"\n\n headers = {\n \"X-Scope-OrgID\" = \"anonymous\",\n }\n\n }\n}\n\n// loki write endpoint\nloki.write \"endpoint\" {\n external_labels = {\n cluster = \"{{ .Values.cluster }}\",\n project = \"{{ .Values.project }}\",\n }\n endpoint {\n url = \"https://loki.planx-pla.net/loki/api/v1/push\"\n }\n}\n"` | | +| aws-s3-mountpoint.configuration.enabled | bool | `false` | | +| aws-s3-mountpoint.enabled | bool | `false` | | +| aws-s3-mountpoint.targetRevision | string | `"1.8.0"` | | +| calico.configuration.enabled | bool | `false` | | +| calico.enabled | bool | `false` | | +| calico.targetRevision | string | `"v3.27.0"` | | +| cluster | string | `"unfunded"` | | +| configuration.configurationRepo | string | `"https://github.com/uc-cdis/gen3-gitops"` | | +| configuration.configurationRevision | string | `"master"` | | +| coreDNS.configuration.enabled | bool | `false` | | +| coreDNS.enabled | bool | `false` | | +| coreDNS.targetRevision | string | `"v1.29.0"` | | +| ebs-csi-driver.configuration.enabled | bool | `false` | | +| ebs-csi-driver.enabled | bool | `false` | | +| ebs-csi-driver.targetRevision | string | `"2.36.0"` | | +| fluentd-configmap-data | string | `"\n\n\n\n @type tail\n @id in_tail_container_logs\n path /var/log/containers/*.log\n pos_file /var/log/fluentd-containers.log.pos\n tag \"#{ENV['FLUENT_CONTAINER_TAIL_TAG'] || 'kubernetes.*'}\"\n exclude_path \"#{ENV['FLUENT_CONTAINER_TAIL_EXCLUDE_PATH'] || use_default}\"\n read_from_head true\n \n @type \"#{ENV['FLUENT_CONTAINER_TAIL_PARSER_TYPE'] || 'json'}\"\n time_format %Y-%m-%dT%H:%M:%S.%NZ\n \n\n\n\n @type tail\n path /var/log/messages\n pos_file /var/log/host-messages.log.pos\n \n @type syslog\n \n tag host.messages\n\n\n\n\n 
@type tail\n path /var/log/secure\n pos_file /var/log/host-secure.log.pos\n \n @type syslog\n \n tag host.secure\n\n\n\n @type tail\n @id in_tail_docker\n path /var/log/docker.log\n pos_file /var/log/fluentd-docker.log.pos\n tag docker\n \n @type regexp\n expression /^time=\"(?\n\n\n\n\n @type tail\n @id in_tail_kubelet\n multiline_flush_interval 5s\n path /var/log/kubelet.log\n pos_file /var/log/fluentd-kubelet.log.pos\n tag kubelet\n \n @type kubernetes\n \n\n\n\n @type kubernetes_metadata\n @id filter_kube_metadata\n kubernetes_url \"#{ENV['FLUENT_FILTER_KUBERNETES_URL'] || 'https://' + ENV.fetch('KUBERNETES_SERVICE_HOST') + ':' + ENV.fetch('KUBERNETES_SERVICE_PORT') + '/api'}\"\n verify_ssl \"#{ENV['KUBERNETES_VERIFY_SSL'] || true}\"\n ca_file \"#{ENV['KUBERNETES_CA_FILE']}\"\n skip_labels \"#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_LABELS'] || 'false'}\"\n skip_container_metadata \"#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_CONTAINER_METADATA'] || 'false'}\"\n skip_master_url \"#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_MASTER_URL'] || 'false'}\"\n skip_namespace_metadata \"#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_NAMESPACE_METADATA'] || 'false'}\"\n\n\n\n @type null\n\n\n\n @type null\n\n\n\n @type rewrite_tag_filter\n \n key $._HOSTNAME\n pattern ^(.+)$\n tag $1.docker\n \n\n\n\n @type rewrite_tag_filter\n \n key $._HOSTNAME\n pattern ^(.+)$\n tag $1.kubelet\n \n\n\n\n @type rewrite_tag_filter\n \n key $.host\n pattern ^(.+)$\n tag $1.messages\n \n\n\n\n @type rewrite_tag_filter\n \n key $.host\n pattern ^(.+)$\n tag $1.secure\n \n\n\n\n @type rewrite_tag_filter\n \n # json structured log - consider adoption a standard json schema:\n # https://github.com/timberio/log-event-json-schema\n key message\n pattern /^\\{\\s*\"gen3log\":/\n tag kubernetes.gen3.json.${tag}\n \n \n # combined log format - default Apache and nginx structure\n # https://httpd.apache.org/docs/1.3/logs.html#combined\n key message\n pattern /^(((\\d+\\.\\d+\\.\\d+\\.\\d+)|-)\\s+){2}\\S+\\s+\\[\\d\\d?\\//\n tag kubernetes.gen3.combined.${tag}\n \n \n # unstructured log line\n key message\n pattern /\\S/\n tag kubernetes.gen3.raw.${tag}\n \n\n\n\n\n @type record_transformer\n \n log_type json\n # This one doesn't work for whatever reason, if you do ${record[\"kubernetes\"]} the whole blob would be added, but can't access subobjects\n #container_name ${record[\"kubernetes\"][\"container_name\"]}\n \n\n\n\n @type record_transformer\n \n log_type combined\n \n\n\n\n @type record_transformer\n \n log_type raw\n \n\n\n\n @type rewrite_tag_filter\n \n key $.kubernetes.pod_name\n pattern ^(.+)$\n tag \"#{Time.now.strftime('%Y-%m-%d')}.$1\"\n \n# \n# key $.kubernetes\n# pattern ^(.+)$\n# tag $1.container_name\n# \n\n\n#\n# @type rewrite_tag_filter\n# \n# key $.kubernetes.container_name\n# pattern ^(.+)$\n #tag $1.${tag}\n# tag ${tag}.$1\n# \n#\n\n# TODO:\n# * python stack traces: \"Traceback (most recent call last):\"\"\n# https://docs.fluentd.org/v0.12/articles/parser_multiline#formatn\n#\n# Idea: add `visitor` cookie to revproxy ...\n\n\n\n @type cloudwatch_logs\n @id out_cloudwatch_logs\n log_group_name \"#{ENV['LOG_GROUP_NAME']}\"\n auto_create_stream true\n use_tag_as_stream true\n retention_in_days \"#{ENV['RETENTION_IN_DAYS'] || 'nil'}\"\n json_handler yajl # To avoid UndefinedConversionError\n log_rejected_request \"#{ENV['LOG_REJECTED_REQUEST']}\" # Log rejected request for missing parts\n\n"` | | +| fluentd.configuration.enabled | bool | `false` | | +| fluentd.enabled | bool | `false` | | +| fluentd.targetRevision | string | 
`"0.5.2"` | | +| grafana-alloy.configuration.enabled | bool | `false` | | +| grafana-alloy.enabled | bool | `false` | | +| grafana-alloy.targetRevision | string | `"0.4.0"` | | +| karpenter-crds.amiSelectorName | string | `"EKS-FIPS*"` | | +| karpenter-crds.default.consolidateAfter | string | `"30s"` | | +| karpenter-crds.default.consolidation | bool | `true` | | +| karpenter-crds.default.consolidationPolicy | string | `"WhenEmpty"` | | +| karpenter-crds.default.enabled | bool | `true` | | +| karpenter-crds.default.expireAfter | string | `"168h"` | | +| karpenter-crds.enabled | bool | `false` | | +| karpenter-crds.jupyter.consolidateAfter | string | `"30s"` | | +| karpenter-crds.jupyter.consolidation | bool | `true` | | +| karpenter-crds.jupyter.consolidationPolicy | string | `"WhenEmpty"` | | +| karpenter-crds.jupyter.enabled | bool | `true` | | +| karpenter-crds.jupyter.expireAfter | string | `"168h"` | | +| karpenter-crds.migration | bool | `false` | | +| karpenter-crds.selectorTag | string | `""` | | +| karpenter-crds.targetRevision | string | `"master"` | | +| karpenter-crds.workflow.consolidateAfter | string | `"30s"` | | +| karpenter-crds.workflow.consolidation | bool | `true` | | +| karpenter-crds.workflow.consolidationPolicy | string | `"WhenEmpty"` | | +| karpenter-crds.workflow.enabled | bool | `true` | | +| karpenter-crds.workflow.expireAfter | string | `"168h"` | | +| karpenter-crds.workflow.sgSelector | string | `""` | | +| karpenter-templates.configuration.enabled | bool | `false` | | +| karpenter-templates.enabled | bool | `false` | | +| karpenter-templates.targetRevision | string | `"feat/karpenter-templates"` | | +| karpenter.configuration.enabled | bool | `false` | | +| karpenter.controller.image.digest | string | `"sha256:0c142050d872cb0ac7b30a188ec36aa765b449718cde0c7e49f7495b28f47c29"` | | +| karpenter.controller.image.tag | string | `"v0.32.9"` | | +| karpenter.enabled | bool | `false` | | +| karpenter.resources.limits.cpu | string | `"1"` | | +| karpenter.resources.limits.memory | string | `"1Gi"` | | +| karpenter.resources.requests.cpu | string | `"1"` | | +| karpenter.resources.requests.memory | string | `"1Gi"` | | +| karpenter.targetRevision | string | `"v0.32.9"` | | +| kube-state-metrics.configuration.enabled | bool | `false` | | +| kube-state-metrics.enabled | bool | `false` | | +| kube-state-metrics.targetRevision | string | `"5.28.0"` | | +| project | string | `"unfunded"` | | +| vpc-cni.configuration.enabled | bool | `false` | | +| vpc-cni.enabled | bool | `false` | | +| vpc-cni.targetRevision | string | `"v1.16.2"` | | diff --git a/helm/cluster-level-resources/templates/alb-controller.yaml b/helm/cluster-level-resources/templates/alb-controller.yaml new file mode 100644 index 00000000..bc7745dc --- /dev/null +++ b/helm/cluster-level-resources/templates/alb-controller.yaml @@ -0,0 +1,38 @@ +{{ if index .Values "alb-controller" "enabled" }} +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: alb-controller + namespace: argocd +spec: + project: default + sources: + - chart: aws-load-balancer-controller + repoURL: https://aws.github.io/eks-charts + targetRevision: {{ index .Values "alb-controller" "targetRevision" }} + helm: + releaseName: alb-controller + {{- if index .Values "alb-controller" "configuration" "enabled" }} + valueFiles: + - $values/{{ .Values.cluster }}/cluster-values/alb-controller.yaml + - repoURL: {{ .Values.configuration.configurationRepo }} + targetRevision: {{ .Values.configurationRevision }} + ref: values + {{- else 
}} + values: | + clusterName: {{ .Values.eksClusterName | default .Values.cluster }} + serviceAccount: + create: true + annotations: + eks.amazonaws.com/role-arn: arn:aws:iam::{{ .Values.accountNumber }}:role/gen3-service/{{ .Values.eksClusterName | default .Values.cluster }}-aws-load-balancer-controller-sa + name: aws-load-balancer-controller + {{- end }} + destination: + server: "https://kubernetes.default.svc" + namespace: kube-system + syncPolicy: + syncOptions: + - CreateNamespace=false + automated: + selfHeal: true +{{ end }} diff --git a/helm/cluster-level-resources/templates/alloy-configmap.yaml b/helm/cluster-level-resources/templates/alloy-configmap.yaml new file mode 100644 index 00000000..52df5312 --- /dev/null +++ b/helm/cluster-level-resources/templates/alloy-configmap.yaml @@ -0,0 +1,9 @@ +{{ if index .Values "grafana-alloy" "enabled" }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: alloy-gen3 + namespace: monitoring +data: + config: {{ tpl (index .Values "alloy-configmap-data") . | toYaml | indent 2}} +{{ end }} diff --git a/helm/cluster-level-resources/templates/aws-s3-mountpoint.yaml b/helm/cluster-level-resources/templates/aws-s3-mountpoint.yaml new file mode 100644 index 00000000..f503b766 --- /dev/null +++ b/helm/cluster-level-resources/templates/aws-s3-mountpoint.yaml @@ -0,0 +1,36 @@ +{{ if index .Values "aws-s3-mountpoint" "enabled" }} +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: aws-s3-mountpoint + namespace: argocd +spec: + project: default + sources: + - chart: aws-mountpoint-s3-csi-driver + repoURL: https://awslabs.github.io/mountpoint-s3-csi-driver + targetRevision: {{ index .Values "aws-s3-mountpoint" "targetRevision" }} + helm: + releaseName: aws-s3-mountpoint + {{- if index .Values "aws-s3-mountpoint" "configuration" "enabled" }} + valueFiles: + - $values/{{ .Values.cluster }}/cluster-values/aws-s3-mountpoint.yaml + - repoURL: {{ .Values.configuration.configurationRepo }} + targetRevision: {{ .Values.configuration.configurationRevision }} + ref: values + {{- else }} + values: | + node: + serviceAccount: + annotations: + "eks.amazonaws.com/role-arn": "arn:aws:iam::{{ .Values.accountNumber }}:role/AmazonEKS_S3_CSI_DriverRole-{{ .Values.cluster }}" + {{- end }} + destination: + server: "https://kubernetes.default.svc" + namespace: kube-system + syncPolicy: + syncOptions: + - CreateNamespace=false + automated: + selfHeal: true +{{ end }} diff --git a/helm/cluster-level-resources/templates/calico.yaml b/helm/cluster-level-resources/templates/calico.yaml new file mode 100644 index 00000000..5d7aabbe --- /dev/null +++ b/helm/cluster-level-resources/templates/calico.yaml @@ -0,0 +1,37 @@ +{{ if index .Values "calico" "enabled" }} +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: calico + namespace: argocd +spec: + project: default + sources: + - chart: tigera-operator + repoURL: https://docs.tigera.io/calico/charts + targetRevision: {{ index .Values "calico" "targetRevision" }} + helm: + releaseName: calico + {{- if index .Values "calico" "configuration" "enabled" }} + valueFiles: + - $values/{{ .Values.cluster }}/cluster-values/calico.yaml + - repoURL: {{ .Values.configuration.configurationRepo }} + targetRevision: {{ .Values.configuration.configurationRevision }} + ref: values + {{- else }} + values: | + installation: + kubernetesProvider: EKS + registry: quay.io/ + {{- end }} + destination: + server: "https://kubernetes.default.svc" + namespace: kube-system + syncPolicy: + syncOptions: + - Force=true + - 
ServerSideApply=true + - CreateNamespace=false + automated: + selfHeal: true +{{ end }} diff --git a/helm/cluster-level-resources/templates/coreDNS.yaml b/helm/cluster-level-resources/templates/coreDNS.yaml new file mode 100644 index 00000000..ed90c920 --- /dev/null +++ b/helm/cluster-level-resources/templates/coreDNS.yaml @@ -0,0 +1,57 @@ +{{ if index .Values "coreDNS" "enabled" }} +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: coredns + namespace: argocd +spec: + project: default + sources: + - chart: coredns + repoURL: https://coredns.github.io/helm + targetRevision: {{ index .Values "coreDNS" "targetRevision" }} + helm: + releaseName: coredns + {{- if index .Values "coreDNS" "configuration" "enabled" }} + valueFiles: + - $values/{{ .Values.cluster }}/cluster-values/coreDNS.yaml + - repoURL: {{ .Values.configuration.configurationRepo }} + targetRevision: {{ .Values.configuration.configurationRevision }} + ref: values + {{- else }} + values: | + service: + clusterIP: "10.100.0.10" + name: "kube-dns" + + autoscaler: + enabled: true + replicas: 2 + + k8sAppLabelOverride: "kube-dns" + + deployment: + name: "coredns-argo" + + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 100 + podAffinityTerm: + labelSelector: + matchExpressions: + - key: k8s-app + operator: In + values: + - kube-dns + topologyKey: kubernetes.io/hostname + {{- end }} + destination: + server: "https://kubernetes.default.svc" + namespace: kube-system + syncPolicy: + syncOptions: + - CreateNamespace=false + automated: + selfHeal: false +{{ end }} diff --git a/helm/cluster-level-resources/templates/ebs-csi-driver.yaml b/helm/cluster-level-resources/templates/ebs-csi-driver.yaml new file mode 100644 index 00000000..d323c09b --- /dev/null +++ b/helm/cluster-level-resources/templates/ebs-csi-driver.yaml @@ -0,0 +1,30 @@ +{{ if index .Values "ebs-csi-driver" "enabled" }} +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: ebs-csi-driver + namespace: argocd +spec: + project: default + sources: + - chart: aws-ebs-csi-driver + repoURL: https://kubernetes-sigs.github.io/aws-ebs-csi-driver + targetRevision: {{ index .Values "ebs-csi-driver" "targetRevision" }} + helm: + releaseName: ebs-csi-driver + {{- if index .Values "ebs-csi-driver" "configuration" "enabled" }} + valueFiles: + - $values/{{ .Values.cluster }}/cluster-values/ebs-csi-driver.yaml + - repoURL: {{ .Values.configuration.configurationRepo }} + targetRevision: {{ .Values.configuration.configurationRevision }} + ref: values + {{- end }} + destination: + server: "https://kubernetes.default.svc" + namespace: kube-system + syncPolicy: + syncOptions: + - CreateNamespace=false + automated: + selfHeal: true +{{ end }} diff --git a/helm/cluster-level-resources/templates/fluentd.yaml b/helm/cluster-level-resources/templates/fluentd.yaml new file mode 100644 index 00000000..c61d98cf --- /dev/null +++ b/helm/cluster-level-resources/templates/fluentd.yaml @@ -0,0 +1,43 @@ +{{ if index .Values "fluentd" "enabled" }} +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: fluentd + namespace: argocd +spec: + project: default + sources: + - chart: fluentd + repoURL: https://fluent.github.io/helm-charts + targetRevision: {{ index .Values "fluentd" "targetRevision" }} + helm: + releaseName: fluentd + {{- if index .Values "fluentd" "configuration" "enabled" }} + valueFiles: + - $values/{{ .Values.cluster }}/cluster-values/fluentd.yaml + - repoURL: {{ 
.Values.configuration.configurationRepo }} + targetRevision: {{ .Values.configuration.configurationRevision }} + ref: values + {{- else }} + values: | + fileConfigs: + gen3.conf: | {{ index .Values "fluentd-configmap-data" | nindent 14 }} + env: + - name: "FLUENTD_CONF" + value: "../../../etc/fluent/config.d/gen3.conf" + - name: FLUENT_CONTAINER_TAIL_PARSER_TYPE + value: "cri" + - name: AWS_REGION + value: "us-east-1" + image: + tag: v1.15.3-debian-cloudwatch-1.0 + {{- end }} + destination: + server: "https://kubernetes.default.svc" + namespace: kube-system + syncPolicy: + syncOptions: + - CreateNamespace=false + automated: + selfHeal: true +{{ end }} diff --git a/helm/cluster-level-resources/templates/grafana-alloy.yaml b/helm/cluster-level-resources/templates/grafana-alloy.yaml new file mode 100644 index 00000000..27717072 --- /dev/null +++ b/helm/cluster-level-resources/templates/grafana-alloy.yaml @@ -0,0 +1,66 @@ +{{ if index .Values "grafana-alloy" "enabled" }} +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: grafana-alloy + namespace: argocd +spec: + project: default + sources: + - repoURL: https://grafana.github.io/helm-charts + chart: alloy + targetRevision: {{ index .Values "grafana-alloy" "targetRevision" }} + helm: + releaseName: alloy + {{- if index .Values "grafana-alloy" "configuration" "enabled" }} + valueFiles: + - $values/{{ .Values.cluster }}/cluster-values/grafana-alloy.yaml + - repoURL: {{ .Values.configuration.configurationRepo }} + targetRevision: {{ .Values.configuration.configurationRevision }} + ref: values + {{- else }} + values: | + controller: + type: "deployment" + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-1a + + alloy: + stabilityLevel: "public-preview" + uiPathPrefix: /alloy + # -- Extra ports to expose on the Alloy container. 
+ extraPorts: + - name: "otel-grpc" + port: 4317 + targetPort: 4317 + protocol: "TCP" + - name: "otel-http" + port: 4318 + targetPort: 4318 + protocol: "TCP" + clustering: + enabled: true + configMap: + name: alloy-gen3 + key: config + resources: + requests: + cpu: 1000m + memory: 1Gi + {{- end }} + syncPolicy: + syncOptions: + - CreateNamespace=true + automated: + selfHeal: true + destination: + server: https://kubernetes.default.svc + namespace: monitoring +{{ end }} diff --git a/helm/cluster-level-resources/templates/karpenter-crd-default.yaml b/helm/cluster-level-resources/templates/karpenter-crd-default.yaml new file mode 100644 index 00000000..a2cdfef4 --- /dev/null +++ b/helm/cluster-level-resources/templates/karpenter-crd-default.yaml @@ -0,0 +1,291 @@ +{{ if and (index .Values "karpenter-crds" "enabled") (index .Values "karpenter-crds" "default" "enabled") }} + {{ if eq (index .Values "karpenter-crds" "migration") true }} +--- +apiVersion: karpenter.k8s.aws/v1beta1 +kind: EC2NodeClass +metadata: + name: default +spec: + amiFamily: AL2 + amiSelectorTerms: + - name: {{ index .Values "karpenter-crds" "amiSelectorName" }} + owner: "143731057154" + blockDeviceMappings: + - deviceName: /dev/xvda + ebs: + deleteOnTermination: true + encrypted: true + volumeSize: 50Gi + volumeType: gp3 + metadataOptions: + httpEndpoint: enabled + httpProtocolIPv6: disabled + httpPutResponseHopLimit: 2 + httpTokens: optional + role: eks_{{ index .Values "karpenter-crds" "selectorTag" }}_workers_role + + securityGroupSelectorTerms: + - tags: + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + + subnetSelectorTerms: + - tags: + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + + tags: + Environment: {{ .Values.cluster }} + Name: eks-{{ .Values.cluster }}-karpenter + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + purpose: default + + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + #!/bin/bash -x + instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId) + curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys + echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json + sysctl -w fs.inotify.max_user_watches=12000 + + sudo yum update -y + + --BOUNDARY-- +--- +apiVersion: karpenter.sh/v1beta1 +kind: NodePool +metadata: + name: default +spec: + disruption: + {{ if eq (index .Values "karpenter-crds" "default" "consolidationPolicy" ) "WhenEmpty" }} + consolidateAfter: {{ index .Values "karpenter-crds" "default" "consolidateAfter" }} + {{ end }} + consolidationPolicy: {{ index .Values "karpenter-crds" "default" "consolidationPolicy" }} + expireAfter: {{ index .Values "karpenter-crds" "default" "expireAfter" }} + limits: + cpu: "1000" + memory: 1000Gi + template: + metadata: + labels: + role: default + spec: + kubelet: + evictionHard: + memory.available: 5% + evictionSoft: + memory.available: 10% + evictionSoftGracePeriod: + memory.available: 5m + kubeReserved: + cpu: 480m + ephemeral-storage: 3Gi + memory: 1632Mi + nodeClassRef: + apiVersion: karpenter.k8s.aws/v1beta1 + kind: EC2NodeClass + name: default + requirements: + - key: karpenter.sh/capacity-type + operator: In + values: + - on-demand + - spot + - key: 
kubernetes.io/arch + operator: In + values: + - amd64 + - key: karpenter.k8s.aws/instance-category + operator: In + values: + - c + - m + - r + - t +--- +apiVersion: karpenter.k8s.aws/v1alpha1 +kind: AWSNodeTemplate +metadata: + name: default +spec: + amiSelector: + aws::name: {{ index .Values "karpenter-crds" "amiSelectorName" }} + aws::owners: "143731057154" + subnetSelector: + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + securityGroupSelector: + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + tags: + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + Environment: {{ .Values.cluster }} + Name: eks-{{ .Values.cluster }}-karpenter + purpose: default + metadataOptions: + httpEndpoint: enabled + httpProtocolIPv6: disabled + httpPutResponseHopLimit: 2 + httpTokens: optional + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + #!/bin/bash -x + instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId) + curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys + + echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json + + sysctl -w fs.inotify.max_user_watches=12000 + + # --BOUNDARY + # Content-Type: text/cloud-config; charset="us-ascii" + + # mounts: + # - ['fstype': 'bpf', 'mountpoint': '/sys/fs/bpf', 'opts': 'rw,relatime'] + + --BOUNDARY-- + blockDeviceMappings: + - deviceName: /dev/xvda + ebs: + volumeSize: 50Gi + volumeType: gp2 + encrypted: true + deleteOnTermination: true +--- +apiVersion: karpenter.sh/v1alpha5 +kind: Provisioner +metadata: + name: default +spec: + # Allow for spot and on demand instances + requirements: + - key: karpenter.sh/capacity-type + operator: In + values: ["on-demand", "spot"] + - key: kubernetes.io/arch + operator: In + values: + - amd64 + - key: karpenter.k8s.aws/instance-category + operator: In + values: + - c + - m + - r + - t + taints: + - key: karpenter.sh/legacy + value: "true" + effect: NoSchedule + # Set a limit of 1000 vcpus + limits: + resources: + cpu: 1000 + # Use the default node template + providerRef: + name: default + # Allow pods to be rearranged + consolidation: + enabled: {{ index .Values "karpenter-crds" "default" "consolidation" }} + # Kill nodes after 7 days to ensure they stay up to date + ttlSecondsUntilExpired: 604800 + {{ else }} +--- +apiVersion: karpenter.k8s.aws/v1alpha1 +kind: AWSNodeTemplate +metadata: + name: default +spec: + amiSelector: + aws::name: {{ index .Values "karpenter-crds" "amiSelectorName" }} + aws::owners: "143731057154" + subnetSelector: + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + securityGroupSelector: + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + tags: + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + Environment: {{ .Values.cluster }} + Name: eks-{{ .Values.cluster }}-karpenter + purpose: default + metadataOptions: + httpEndpoint: enabled + httpProtocolIPv6: disabled + httpPutResponseHopLimit: 2 + httpTokens: optional + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + #!/bin/bash -x + instanceId=$(curl -s 
http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId) + curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys + + echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json + + sysctl -w fs.inotify.max_user_watches=12000 + + # --BOUNDARY + # Content-Type: text/cloud-config; charset="us-ascii" + + # mounts: + # - ['fstype': 'bpf', 'mountpoint': '/sys/fs/bpf', 'opts': 'rw,relatime'] + + --BOUNDARY-- + blockDeviceMappings: + - deviceName: /dev/xvda + ebs: + volumeSize: 50Gi + volumeType: gp2 + encrypted: true + deleteOnTermination: true +--- +apiVersion: karpenter.sh/v1alpha5 +kind: Provisioner +metadata: + name: default +spec: + # Allow for spot and on demand instances + requirements: + - key: karpenter.sh/capacity-type + operator: In + values: ["on-demand", "spot"] + - key: kubernetes.io/arch + operator: In + values: + - amd64 + - key: karpenter.k8s.aws/instance-category + operator: In + values: + - c + - m + - r + - t + - key: karpenter.k8s.aws/instance-cpu + operator: Gt + values: + - "7" + # Set a limit of 1000 vcpus + limits: + resources: + cpu: 1000 + # Use the default node template + providerRef: + name: default + # Allow pods to be rearranged + consolidation: + enabled: {{ index .Values "karpenter-crds" "default" "consolidation" }} + # Kill nodes after 7 days to ensure they stay up to date + ttlSecondsUntilExpired: 604800 + {{ end }} +{{ end }} diff --git a/helm/cluster-level-resources/templates/karpenter-crd-jupyter.yaml b/helm/cluster-level-resources/templates/karpenter-crd-jupyter.yaml new file mode 100644 index 00000000..e9877f64 --- /dev/null +++ b/helm/cluster-level-resources/templates/karpenter-crd-jupyter.yaml @@ -0,0 +1,294 @@ +{{ if and (index .Values "karpenter-crds" "enabled") (index .Values "karpenter-crds" "jupyter" "enabled") }} + {{ if eq (index .Values "karpenter-crds" "migration") true }} +--- +apiVersion: karpenter.k8s.aws/v1beta1 +kind: EC2NodeClass +metadata: + name: jupyter +spec: + amiFamily: AL2 + amiSelectorTerms: + - name: {{ index .Values "karpenter-crds" "amiSelectorName" }} + owner: "143731057154" + blockDeviceMappings: + - deviceName: /dev/xvda + ebs: + deleteOnTermination: true + encrypted: true + volumeSize: 50Gi + volumeType: gp3 + metadataOptions: + httpEndpoint: enabled + httpProtocolIPv6: disabled + httpPutResponseHopLimit: 2 + httpTokens: optional + role: eks_{{ index .Values "karpenter-crds" "selectorTag" }}_workers_role + + securityGroupSelectorTerms: + - tags: + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}-jupyter + + subnetSelectorTerms: + - tags: + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + + tags: + Environment: {{ .Values.cluster }} + Name: eks-{{ .Values.cluster }}-karpenter + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + purpose: jupyter + + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + #!/bin/bash -x + instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId) + curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys + echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > 
/etc/kubernetes/kubelet/kubelet-config.json + sysctl -w fs.inotify.max_user_watches=12000 + + sudo yum update -y + + --BOUNDARY-- +--- +apiVersion: karpenter.sh/v1beta1 +kind: NodePool +metadata: + name: jupyter +spec: + disruption: + {{ if eq (index .Values "karpenter-crds" "jupyter" "consolidationPolicy" ) "WhenEmpty" }} + consolidateAfter: {{ index .Values "karpenter-crds" "jupyter" "consolidateAfter" }} + {{ end }} + consolidationPolicy: {{ index .Values "karpenter-crds" "jupyter" "consolidationPolicy" }} + expireAfter: {{ index .Values "karpenter-crds" "jupyter" "expireAfter" }} + limits: + cpu: "1000" + memory: 1000Gi + template: + metadata: + labels: + role: jupyter + spec: + kubelet: + evictionHard: + memory.available: 5% + evictionSoft: + memory.available: 10% + evictionSoftGracePeriod: + memory.available: 5m + kubeReserved: + cpu: 480m + ephemeral-storage: 3Gi + memory: 1632Mi + nodeClassRef: + apiVersion: karpenter.k8s.aws/v1beta1 + kind: EC2NodeClass + name: jupyter + requirements: + - key: karpenter.sh/capacity-type + operator: In + values: + - on-demand + - key: kubernetes.io/arch + operator: In + values: + - amd64 + - key: karpenter.k8s.aws/instance-category + operator: In + values: + - c + - m + - r + - t + taints: + - effect: NoSchedule + key: role + value: jupyter +--- +apiVersion: karpenter.k8s.aws/v1alpha1 +kind: AWSNodeTemplate +metadata: + name: jupyter +spec: + subnetSelector: + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + securityGroupSelector: + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}-jupyter + tags: + Environment: {{ .Values.cluster }} + Name: eks-{{ .Values.cluster }}-jupyter-karpenter + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + purpose: jupyter + metadataOptions: + httpEndpoint: enabled + httpProtocolIPv6: disabled + httpPutResponseHopLimit: 2 + httpTokens: optional + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + #!/bin/bash -x + instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId) + curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys + + echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json + + sysctl -w fs.inotify.max_user_watches=12000 + + # --BOUNDARY + # Content-Type: text/cloud-config; charset="us-ascii" + + # mounts: + # - ['fstype': 'bpf', 'mountpoint': '/sys/fs/bpf', 'opts': 'rw,relatime'] + + --BOUNDARY-- + blockDeviceMappings: + - deviceName: /dev/xvda + ebs: + volumeSize: 50Gi + volumeType: gp2 + encrypted: true + deleteOnTermination: true +--- +apiVersion: karpenter.sh/v1alpha5 +kind: Provisioner +metadata: + name: jupyter +spec: + requirements: + - key: karpenter.sh/capacity-type + operator: In + values: ["on-demand"] + - key: kubernetes.io/arch + operator: In + values: + - amd64 + - key: karpenter.k8s.aws/instance-category + operator: In + values: + - c + - m + - r + - t + taints: + - key: role + value: jupyter + effect: NoSchedule + - key: karpenter.sh/legacy + value: "true" + effect: NoSchedule + labels: + role: jupyter + #TODO this could be paramaterized + limits: + resources: + cpu: 1000 + providerRef: + name: jupyter + # Allow pods to be rearranged + consolidation: + enabled: {{ index .Values 
"karpenter-crds.jupyter.consolidation" }} + # Kill nodes after 7 days to ensure they stay up to date + # TODO This could be paramaterized + ttlSecondsUntilExpired: 604800 + {{ else }} +--- +apiVersion: karpenter.k8s.aws/v1alpha1 +kind: AWSNodeTemplate +metadata: + name: jupyter +spec: + subnetSelector: + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + securityGroupSelector: + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}-jupyter + tags: + Environment: {{ .Values.cluster }} + Name: eks-{{ .Values.cluster }}-jupyter-karpenter + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + purpose: jupyter + metadataOptions: + httpEndpoint: enabled + httpProtocolIPv6: disabled + httpPutResponseHopLimit: 2 + httpTokens: optional + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + #!/bin/bash -x + instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId) + curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys + + echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json + + sysctl -w fs.inotify.max_user_watches=12000 + + # --BOUNDARY + # Content-Type: text/cloud-config; charset="us-ascii" + + # mounts: + # - ['fstype': 'bpf', 'mountpoint': '/sys/fs/bpf', 'opts': 'rw,relatime'] + + --BOUNDARY-- + blockDeviceMappings: + - deviceName: /dev/xvda + ebs: + volumeSize: 50Gi + volumeType: gp2 + encrypted: true + deleteOnTermination: true +--- +apiVersion: karpenter.sh/v1alpha5 +kind: Provisioner +metadata: + name: jupyter +spec: + # Only allow on demand instance + requirements: + - key: karpenter.sh/capacity-type + operator: In + values: ["on-demand"] + - key: kubernetes.io/arch + operator: In + values: + - amd64 + - key: karpenter.k8s.aws/instance-category + operator: In + values: + - c + - m + - r + - t + # Set a taint for jupyter pods + taints: + - key: role + value: jupyter + effect: NoSchedule + labels: + role: jupyter + #TODO this could be paramaterized + limits: + resources: + cpu: 1000 + providerRef: + name: jupyter + # Allow pods to be rearranged + consolidation: + enabled: {{ index .Values "karpenter-crds.jupyter.consolidation" }} + # Kill nodes after 7 days to ensure they stay up to date + ttlSecondsUntilExpired: 604800 + {{ end }} +{{ end }} diff --git a/helm/cluster-level-resources/templates/karpenter-crd-workflow.yaml b/helm/cluster-level-resources/templates/karpenter-crd-workflow.yaml new file mode 100644 index 00000000..c0137670 --- /dev/null +++ b/helm/cluster-level-resources/templates/karpenter-crd-workflow.yaml @@ -0,0 +1,307 @@ +{{ if and (index .Values "karpenter-crds" "enabled") (index .Values "karpenter-crds" "workflow" "enabled") }} + {{ if eq (index .Values "karpenter-crds" "migration") true }} +--- +apiVersion: karpenter.k8s.aws/v1beta1 +kind: EC2NodeClass +metadata: + name: workflow +spec: + amiFamily: AL2 + amiSelectorTerms: + - name: {{ index .Values "karpenter-crds" "amiSelectorName" }} + owner: "143731057154" + blockDeviceMappings: + - deviceName: /dev/xvda + ebs: + deleteOnTermination: true + encrypted: true + volumeSize: 50Gi + volumeType: gp3 + metadataOptions: + httpEndpoint: enabled + httpProtocolIPv6: disabled + httpPutResponseHopLimit: 2 + httpTokens: optional + role: eks_{{ index .Values 
"karpenter-crds" "selectorTag" }}_workers_role + + securityGroupSelectorTerms: + - tags: + {{- if ne (index .Values "karpenter-crds" "workflow" "sgSelector") "" }} + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "workflow" "sgSelector" }} + {{- else }} + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}-workflow + {{- end }} + + subnetSelectorTerms: + - tags: + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + + tags: + Environment: {{ .Values.cluster }} + Name: eks-{{ .Values.cluster }}-karpenter + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + purpose: workflow + + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + #!/bin/bash -x + instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId) + curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys + echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json + sysctl -w fs.inotify.max_user_watches=12000 + + sudo yum update -y + + --BOUNDARY-- +--- +apiVersion: karpenter.sh/v1beta1 +kind: NodePool +metadata: + name: workflow +spec: + disruption: + {{ if eq (index .Values "karpenter-crds" "workflow" "consolidationPolicy" ) "WhenEmpty" }} + consolidateAfter: {{ index .Values "karpenter-crds" "workflow" "consolidateAfter" }} + {{ end }} + consolidationPolicy: {{ index .Values "karpenter-crds" "workflow" "consolidationPolicy" }} + expireAfter: {{ index .Values "karpenter-crds" "workflow" "expireAfter" }} + limits: + cpu: "1000" + memory: 1000Gi + template: + metadata: + labels: + role: workflow + spec: + kubelet: + evictionHard: + memory.available: 5% + evictionSoft: + memory.available: 10% + evictionSoftGracePeriod: + memory.available: 5m + kubeReserved: + cpu: 480m + ephemeral-storage: 3Gi + memory: 1632Mi + nodeClassRef: + apiVersion: karpenter.k8s.aws/v1beta1 + kind: EC2NodeClass + name: workflow + requirements: + - key: karpenter.sh/capacity-type + operator: In + values: + - on-demand + - key: kubernetes.io/arch + operator: In + values: + - amd64 + - key: karpenter.k8s.aws/instance-category + operator: In + values: + - c + - m + - r + - t + taints: + - effect: NoSchedule + key: role + value: workflow +--- +apiVersion: karpenter.k8s.aws/v1alpha1 +kind: AWSNodeTemplate +metadata: + name: workflow +spec: + subnetSelector: + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + securityGroupSelector: + # TODO this is an example of how to parameterize this, we should expand this when helpful + {{- if ne (index .Values "karpenter-crds" "workflow" "sgSelector") "" }} + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "workflow" "sgSelector" }} + {{- else }} + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}-workflow + {{- end }} + tags: + Environment: {{ .Values.cluster }} + Name: eks-{{ .Values.cluster }}-workflow-karpenter + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + purpose: workflow + metadataOptions: + httpEndpoint: enabled + httpProtocolIPv6: disabled + httpPutResponseHopLimit: 2 + httpTokens: optional + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + 
+ #!/bin/bash -x + instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId) + curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys + + echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json + + sysctl -w fs.inotify.max_user_watches=12000 + + # --BOUNDARY + # Content-Type: text/cloud-config; charset="us-ascii" + + # mounts: + # - ['fstype': 'bpf', 'mountpoint': '/sys/fs/bpf', 'opts': 'rw,relatime'] + + --BOUNDARY-- + blockDeviceMappings: + - deviceName: /dev/xvda + ebs: + volumeSize: 50Gi + volumeType: gp2 + encrypted: true + deleteOnTermination: true +--- +apiVersion: karpenter.sh/v1alpha5 +kind: Provisioner +metadata: + name: workflow +spec: + requirements: + - key: karpenter.sh/capacity-type + operator: In + values: ["on-demand"] + - key: kubernetes.io/arch + operator: In + values: + - amd64 + - key: karpenter.k8s.aws/instance-category + operator: In + values: + - c + - m + - r + - t + taints: + - key: role + value: workflow + effect: NoSchedule + - key: karpenter.sh/legacy + value: "true" + effect: NoSchedule + labels: + role: workflow + #TODO this could be paramaterized + limits: + resources: + cpu: 1000 + providerRef: + name: workflow + # Allow pods to be rearranged + consolidation: + enabled: {{ index .Values "karpenter-crds.workflow.consolidation" }} + # Kill nodes after 7 days to ensure they stay up to date + # TODO This could be paramaterized + ttlSecondsUntilExpired: 604800 + {{ else }} +--- +apiVersion: karpenter.k8s.aws/v1alpha1 +kind: AWSNodeTemplate +metadata: + name: workflow +spec: + subnetSelector: + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + securityGroupSelector: + # TODO this is an example of how to parameterize this, we should expand this when helpful + {{- if ne (index .Values "karpenter-crds" "workflow" "sgSelector") "" }} + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "workflow" "sgSelector" }} + {{- else }} + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }}-workflow + {{- end }} + tags: + Environment: {{ .Values.cluster }} + Name: eks-{{ .Values.cluster }}-workflow-karpenter + karpenter.sh/discovery: {{ index .Values "karpenter-crds" "selectorTag" }} + purpose: workflow + metadataOptions: + httpEndpoint: enabled + httpProtocolIPv6: disabled + httpPutResponseHopLimit: 2 + httpTokens: optional + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + #!/bin/bash -x + instanceId=$(curl -s http://169.254.169.254/latest/dynamic/instance-identity/document | jq -r .instanceId) + curl https://raw.githubusercontent.com/uc-cdis/cloud-automation/master/files/authorized_keys/ops_team >> /home/ec2-user/.ssh/authorized_keys + + echo "$(jq '.registryPullQPS=0' /etc/kubernetes/kubelet/kubelet-config.json)" > /etc/kubernetes/kubelet/kubelet-config.json + + sysctl -w fs.inotify.max_user_watches=12000 + + # --BOUNDARY + # Content-Type: text/cloud-config; charset="us-ascii" + + # mounts: + # - ['fstype': 'bpf', 'mountpoint': '/sys/fs/bpf', 'opts': 'rw,relatime'] + + --BOUNDARY-- + blockDeviceMappings: + - deviceName: /dev/xvda + ebs: + volumeSize: 50Gi + volumeType: gp2 + encrypted: true + deleteOnTermination: true +--- +apiVersion: karpenter.sh/v1alpha5 +kind: Provisioner +metadata: + name: workflow +spec: + 
requirements: + - key: karpenter.sh/capacity-type + operator: In + values: ["on-demand"] + - key: kubernetes.io/arch + operator: In + values: + - amd64 + - key: karpenter.k8s.aws/instance-category + operator: In + values: + - c + - m + - r + - t + taints: + - key: role + value: workflow + effect: NoSchedule + labels: + role: workflow + #TODO this could be paramaterized + limits: + resources: + cpu: 1000 + providerRef: + name: workflow + # Allow pods to be rearranged + consolidation: + enabled: {{ index .Values "karpenter-crds.workflow.consolidation" }} + # Kill nodes after 7 days to ensure they stay up to date + # TODO This could be paramaterized + ttlSecondsUntilExpired: 604800 + {{ end }} +{{ end }} diff --git a/helm/cluster-level-resources/templates/karpenter-templates.yaml b/helm/cluster-level-resources/templates/karpenter-templates.yaml new file mode 100644 index 00000000..1b184ed4 --- /dev/null +++ b/helm/cluster-level-resources/templates/karpenter-templates.yaml @@ -0,0 +1,30 @@ +{{ if index .Values "karpenter-templates" "enabled" }} +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: karpenter-templates + namespace: argocd +spec: + project: default + sources: + - path: helm/karpenter-templates + repoURL: https://github.com/uc-cdis/gen3-helm + targetRevision: {{ index .Values "karpenter-templates" "targetRevision" }} + helm: + releaseName: karpenter-templates + {{- if index .Values "karpenter-templates" "configuration" "enabled" }} + valueFiles: + - $values/{{ .Values.cluster }}/cluster-values/karpenter-templates.yaml + - repoURL: {{ .Values.configuration.configurationRepo }} + targetRevision: {{ .Values.configuration.configurationRevision }} + ref: values + {{- end }} + destination: + server: "https://kubernetes.default.svc" + namespace: karpenter + syncPolicy: + syncOptions: + - CreateNamespace=true + automated: + selfHeal: true +{{ end }} diff --git a/helm/cluster-level-resources/templates/karpenter.yaml b/helm/cluster-level-resources/templates/karpenter.yaml new file mode 100644 index 00000000..45a5666d --- /dev/null +++ b/helm/cluster-level-resources/templates/karpenter.yaml @@ -0,0 +1,53 @@ +{{ if index .Values "karpenter" "enabled" }} +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: karpenter + namespace: argocd +spec: + project: default + sources: + - repoURL: 'https://github.com/aws/karpenter-provider-aws' + targetRevision: {{ .Values.karpenter.targetRevision }} + path: charts/karpenter + helm: + releaseName: karpenter + {{- if index .Values "karpenter" "configuration" "enabled" }} + valueFiles: + - "$values/{{ .Values.cluster }}/cluster-values/karpenter.yaml" + - repoURL: {{ .Values.configuration.configurationRepo }} + targetRevision: {{ .Values.configuration.configurationRevision }} + ref: values + {{- else }} + values: | + serviceAccount: + name: karpenter + create: true + annotations: + eks.amazonaws.com/role-arn: "arn:aws:iam::{{ .Values.accountNumber }}:role/{{ .Values.eksClusterName | default .Values.cluster }}-karpenter-sa" + settings: + clusterName: {{ .Values.eksClusterName | default .Values.cluster }} + controller: + image: + tag: {{ .Values.karpenter.controller.image.tag | default .Values.karpenter.targetRevision }} + digest: {{ .Values.karpenter.controller.image.digest }} + env: + - name: AWS_REGION + value: us-east-1 + resources: + requests: + memory: {{ .Values.karpenter.resources.requests.memory }} + cpu: {{ .Values.karpenter.resources.requests.cpu }} + limits: + memory: {{ .Values.karpenter.resources.limits.memory 
}} + cpu: {{ .Values.karpenter.resources.limits.cpu }} + {{- end }} + destination: + server: "https://kubernetes.default.svc" + namespace: karpenter + syncPolicy: + syncOptions: + - CreateNamespace=true + automated: + selfHeal: true +{{ end }} diff --git a/helm/cluster-level-resources/templates/kube-state-metrics.yaml b/helm/cluster-level-resources/templates/kube-state-metrics.yaml new file mode 100644 index 00000000..248d7993 --- /dev/null +++ b/helm/cluster-level-resources/templates/kube-state-metrics.yaml @@ -0,0 +1,36 @@ +{{ if index .Values "kube-state-metrics" "enabled" }} +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: kube-state-metrics + namespace: argocd +spec: + project: default + sources: + - repoURL: https://prometheus-community.github.io/helm-charts + chart: kube-state-metrics + targetRevision: {{ index .Values "kube-state-metrics" "targetRevision" }} + helm: + releaseName: + {{- if index .Values "kube-state-metrics" "configuration" "enabled" }} + valueFiles: + - "$values/{{ .Values.cluster }}/cluster-values/kube-state-metrics.yaml" + - repoURL: {{ .Values.configuration.configurationRepo }} + targetRevision: {{ .Values.configuration.configurationRevision }} + ref: values + {{- else}} + values: | + podAnnotations: + prometheus.io/path: "/metrics" + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + {{- end }} + syncPolicy: + syncOptions: + - CreateNamespace=true + automated: + selfHeal: true + destination: + server: https://kubernetes.default.svc + namespace: monitoring +{{ end }} diff --git a/helm/cluster-level-resources/templates/vpc-cni.yaml b/helm/cluster-level-resources/templates/vpc-cni.yaml new file mode 100644 index 00000000..e535cce7 --- /dev/null +++ b/helm/cluster-level-resources/templates/vpc-cni.yaml @@ -0,0 +1,36 @@ +{{ if index .Values "vpc-cni" "enabled" }} +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: vpc-cni + namespace: argocd +spec: + project: default + sources: + - chart: aws-vpc-cni + repoURL: https://aws.github.io/eks-charts + targetRevision: {{ index .Values "vpc-cni" "targetRevision" }} + helm: + releaseName: vpc-cni + {{- if index .Values "vpc-cni" "configuration" "enabled" }} + valueFiles: + - $values/{{ .Values.cluster }}/cluster-values/vpc-cni.yaml + - repoURL: {{ .Values.configuration.configurationRepo }} + targetRevision: master + ref: values + {{- else }} + values: | + enableNetworkPolicy: false + originalMatchLabels: true + env: + ANNOTATE_POD_IP: "true" + {{- end }} + destination: + server: "https://kubernetes.default.svc" + namespace: kube-system + syncPolicy: + syncOptions: + - CreateNamespace=false + automated: + selfHeal: true +{{ end }} diff --git a/helm/cluster-level-resources/values.yaml b/helm/cluster-level-resources/values.yaml new file mode 100644 index 00000000..b5cef8da --- /dev/null +++ b/helm/cluster-level-resources/values.yaml @@ -0,0 +1,788 @@ +# The name of the cluster this configuration is going to. This should match the name of the directory configuration +# is stored in +cluster: "unfunded" +project: unfunded + +# AWS account number that this cluster lives in +accountNumber: "xxxxxxxxxxxx" + +# This is universal for all of our configuration, we assume that all of the configuration (i.e. 
values files) +# live in the same repo, on the same branch +configuration: + configurationRepo: https://github.com/uc-cdis/gen3-gitops + configurationRevision: master + +alb-controller: + enabled: false + targetRevision: 1.7.1 + configuration: + enabled: false + +aws-s3-mountpoint: + enabled: false + targetRevision: 1.8.0 + configuration: + enabled: false + +calico: + enabled: false + targetRevision: v3.27.0 + configuration: + enabled: false + +coreDNS: + enabled: false + targetRevision: v1.29.0 + configuration: + enabled: false + +ebs-csi-driver: + enabled: false + targetRevision: 2.36.0 + configuration: + enabled: false + +fluentd: + enabled: false + targetRevision: "0.5.2" + configuration: + enabled: false + +grafana-alloy: + enabled: false + targetRevision: 0.4.0 + configuration: + enabled: false + +karpenter: + enabled: false + targetRevision: v0.32.9 + configuration: + enabled: false + resources: + requests: + memory: "1Gi" + cpu: "1" + limits: + memory: "1Gi" + cpu: "1" + controller: + image: + tag: "v0.32.9" + digest: "sha256:0c142050d872cb0ac7b30a188ec36aa765b449718cde0c7e49f7495b28f47c29" + +karpenter-templates: + enabled: false + targetRevision: feat/karpenter-templates + configuration: + enabled: false + +karpenter-crds: + enabled: false + targetRevision: master + amiSelectorName: "EKS-FIPS*" + selectorTag: "" + migration: false + default: + enabled: true + consolidation: true + consolidateAfter: "30s" + consolidationPolicy: "WhenEmpty" + expireAfter: "168h" + jupyter: + enabled: true + consolidation: true + consolidateAfter: "30s" + consolidationPolicy: "WhenEmpty" + expireAfter: "168h" + workflow: + enabled: true + consolidation: true + consolidateAfter: "30s" + consolidationPolicy: "WhenEmpty" + expireAfter: "168h" + sgSelector: "" + +kube-state-metrics: + enabled: false + configuration: + enabled: false + targetRevision: 5.28.0 + +vpc-cni: + enabled: false + targetRevision: v1.16.2 + configuration: + enabled: false + +# ============================================================================================= +# THIS IS THE CONFIGURATION THAT GOES INTO THE ALLOY CONFIGMAP. CUSTOMIZE AT YOUR OWN PERIL!!!! +# ============================================================================================= +alloy-configmap-data: | + logging { + level = "info" + format = "json" + write_to = [loki.write.endpoint.receiver] + } + + /////////////////////// OTLP START /////////////////////// + + otelcol.receiver.otlp "default" { + grpc {} + http {} + + output { + metrics = [otelcol.processor.batch.default.input] + traces = [otelcol.processor.batch.default.input] + } + } + + otelcol.processor.batch "default" { + output { + metrics = [otelcol.exporter.prometheus.default.input] + traces = [otelcol.exporter.otlp.tempo.input] + } + } + + otelcol.exporter.prometheus "default" { + forward_to = [prometheus.remote_write.default.receiver] + } + + otelcol.exporter.otlp "tempo" { + client { + endpoint = "http://monitoring-tempo-distributor.monitoring:4317" + // Configure TLS settings for communicating with the endpoint. + tls { + // The connection is insecure. + insecure = true + // Do not verify TLS certificates when connecting. 
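+        // Both settings stay on because traces are sent to the in-cluster Tempo
+        // distributor over plain gRPC, so there is no certificate to verify.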
+ insecure_skip_verify = true + } + } + } + + + /////////////////////// OTLP END /////////////////////// + + // discover all pods, to be used later in this config + discovery.kubernetes "pods" { + role = "pod" + } + + // discover all services, to be used later in this config + discovery.kubernetes "services" { + role = "service" + } + + // discover all nodes, to be used later in this config + discovery.kubernetes "nodes" { + role = "node" + } + + // Generic scrape of any pod with Annotation "prometheus.io/scrape: true" + discovery.relabel "annotation_autodiscovery_pods" { + targets = discovery.kubernetes.pods.targets + rule { + source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_scrape"] + regex = "true" + action = "keep" + } + rule { + source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_job"] + action = "replace" + target_label = "job" + } + rule { + source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_instance"] + action = "replace" + target_label = "instance" + } + rule { + source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_path"] + action = "replace" + target_label = "__metrics_path__" + } + + // Choose the pod port + // The discovery generates a target for each declared container port of the pod. + // If the metricsPortName annotation has value, keep only the target where the port name matches the one of the annotation. + rule { + source_labels = ["__meta_kubernetes_pod_container_port_name"] + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_portName"] + regex = "(.+)" + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_pod_container_port_name"] + action = "keepequal" + target_label = "__tmp_port" + } + + // If the metrics port number annotation has a value, override the target address to use it, regardless whether it is + // one of the declared ports on that Pod. 
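+    // Example (hypothetical values): a pod with IP 10.0.12.34 and the annotations
+    // prometheus.io/scrape: "true", prometheus.io/port: "9113" ends up scraped at
+    // 10.0.12.34:9113. The first rule below renders IPv6 pod IPs as "[addr]:port",
+    // the second renders IPv4 pod IPs as "addr:port".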
+ rule { + source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_port", "__meta_kubernetes_pod_ip"] + regex = "(\\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})" + replacement = "[$2]:$1" // IPv6 + target_label = "__address__" + } + rule { + source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_port", "__meta_kubernetes_pod_ip"] + regex = "(\\d+);((([0-9]+?)(\\.|$)){4})" // IPv4, takes priority over IPv6 when both exists + replacement = "$2:$1" + target_label = "__address__" + } + + rule { + source_labels = ["__meta_kubernetes_pod_annotation_prometheus_io_scheme"] + action = "replace" + target_label = "__scheme__" + } + + + // add labels + rule { + source_labels = ["__meta_kubernetes_pod_name"] + target_label = "pod" + } + rule { + source_labels = ["__meta_kubernetes_pod_container_name"] + target_label = "container" + } + rule { + source_labels = ["__meta_kubernetes_pod_controller_name"] + target_label = "controller" + } + + rule { + source_labels = ["__meta_kubernetes_namespace"] + target_label = "namespace" + } + + + rule { + source_labels = ["__meta_kubernetes_pod_label_app"] + target_label = "app" + } + + // map all labels + rule { + action = "labelmap" + regex = "__meta_kubernetes_pod_label_(.+)" + } + } + + // Generic scrape of any service with + // Annotation Autodiscovery + discovery.relabel "annotation_autodiscovery_services" { + targets = discovery.kubernetes.services.targets + rule { + source_labels = ["__meta_kubernetes_service_annotation_prometheus_io_scrape"] + regex = "true" + action = "keep" + } + rule { + source_labels = ["__meta_kubernetes_service_annotation_prometheus_io_job"] + action = "replace" + target_label = "job" + } + rule { + source_labels = ["__meta_kubernetes_service_annotation_prometheus_io_instance"] + action = "replace" + target_label = "instance" + } + rule { + source_labels = ["__meta_kubernetes_service_annotation_prometheus_io_path"] + action = "replace" + target_label = "__metrics_path__" + } + + // Choose the service port + rule { + source_labels = ["__meta_kubernetes_service_port_name"] + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_service_annotation_prometheus_io_portName"] + regex = "(.+)" + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_service_port_name"] + action = "keepequal" + target_label = "__tmp_port" + } + + rule { + source_labels = ["__meta_kubernetes_service_port_number"] + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_service_annotation_prometheus_io_port"] + regex = "(.+)" + target_label = "__tmp_port" + } + rule { + source_labels = ["__meta_kubernetes_service_port_number"] + action = "keepequal" + target_label = "__tmp_port" + } + + rule { + source_labels = ["__meta_kubernetes_service_annotation_prometheus_io_scheme"] + action = "replace" + target_label = "__scheme__" + } + } + + prometheus.scrape "metrics" { + job_name = "integrations/autodiscovery_metrics" + targets = concat(discovery.relabel.annotation_autodiscovery_pods.output, discovery.relabel.annotation_autodiscovery_services.output) + honor_labels = true + clustering { + enabled = true + } + forward_to = [prometheus.relabel.metrics_service.receiver] + } + + + // Node Exporter + // TODO: replace with https://grafana.com/docs/alloy/latest/reference/components/prometheus.exporter.unix/ + discovery.relabel "node_exporter" { + targets = discovery.kubernetes.pods.targets + rule { + source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_instance"] + 
regex = "monitoring-extras" + action = "keep" + } + rule { + source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"] + regex = "node-exporter" + action = "keep" + } + rule { + source_labels = ["__meta_kubernetes_pod_node_name"] + action = "replace" + target_label = "instance" + } + } + + prometheus.scrape "node_exporter" { + job_name = "integrations/node_exporter" + targets = discovery.relabel.node_exporter.output + scrape_interval = "60s" + clustering { + enabled = true + } + forward_to = [prometheus.relabel.node_exporter.receiver] + } + + prometheus.relabel "node_exporter" { + rule { + source_labels = ["__name__"] + regex = "up|node_cpu.*|node_network.*|node_exporter_build_info|node_filesystem.*|node_memory.*|process_cpu_seconds_total|process_resident_memory_bytes" + action = "keep" + } + forward_to = [prometheus.relabel.metrics_service.receiver] + } + + + // cAdvisor + // discovery.relabel "cadvisor" { + // targets = discovery.kubernetes.nodes.targets + // rule { + // target_label = "__address__" + // replacement = "kubernetes.default.svc.cluster.local:443" + // } + // rule { + // source_labels = ["__meta_kubernetes_node_name"] + // regex = "(.+)" + // replacement = "/api/v1/nodes/${1}/proxy/metrics/cadvisor" + // target_label = "__metrics_path__" + // } + // } + + // prometheus.scrape "cadvisor" { + // job_name = "integrations/kubernetes/cadvisor" + // targets = discovery.relabel.cadvisor.output + // scheme = "https" + // scrape_interval = "60s" + // bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + // tls_config { + // insecure_skip_verify = true + // } + // clustering { + // enabled = true + // } + // forward_to = [prometheus.relabel.cadvisor.receiver] + //} + + //prometheus.relabel "cadvisor" { + // rule { + // source_labels = ["__name__"] + // regex = "up|container_cpu_cfs_periods_total|container_cpu_cfs_throttled_periods_total|container_cpu_usage_seconds_total|container_fs_reads_bytes_total|container_fs_reads_total|container_fs_writes_bytes_total|container_fs_writes_total|container_memory_cache|container_memory_rss|container_memory_swap|container_memory_working_set_bytes|container_network_receive_bytes_total|container_network_receive_packets_dropped_total|container_network_receive_packets_total|container_network_transmit_bytes_total|container_network_transmit_packets_dropped_total|container_network_transmit_packets_total|machine_memory_bytes" + // action = "keep" + // } + // forward_to = [prometheus.relabel.metrics_service.receiver] + // } + + // Logs from all pods + discovery.relabel "all_pods" { + targets = discovery.kubernetes.pods.targets + rule { + source_labels = ["__meta_kubernetes_namespace"] + target_label = "namespace" + } + rule { + source_labels = ["__meta_kubernetes_pod_name"] + target_label = "pod" + } + rule { + source_labels = ["__meta_kubernetes_pod_container_name"] + target_label = "container" + } + rule { + source_labels = ["__meta_kubernetes_pod_controller_name"] + target_label = "controller" + } + + rule { + source_labels = ["__meta_kubernetes_pod_label_app"] + target_label = "app" + } + + // map all labels + rule { + action = "labelmap" + regex = "__meta_kubernetes_pod_label_(.+)" + } + + } + + loki.source.kubernetes "pods" { + targets = discovery.relabel.all_pods.output + forward_to = [loki.write.endpoint.receiver] + } + + // kube-state-metrics + discovery.relabel "relabel_kube_state_metrics" { + targets = discovery.kubernetes.services.targets + rule { + source_labels = ["__meta_kubernetes_namespace"] + regex = 
"monitoring" + action = "keep" + } + rule { + source_labels = ["__meta_kubernetes_service_name"] + regex = "monitoring-extras-kube-state-metrics" + action = "keep" + } + } + + prometheus.scrape "kube_state_metrics" { + targets = discovery.relabel.relabel_kube_state_metrics.output + job_name = "kube-state-metrics" + metrics_path = "/metrics" + forward_to = [prometheus.remote_write.default.receiver] + } + + // Kubelet + discovery.relabel "kubelet" { + targets = discovery.kubernetes.nodes.targets + rule { + target_label = "__address__" + replacement = "kubernetes.default.svc.cluster.local:443" + } + rule { + source_labels = ["__meta_kubernetes_node_name"] + regex = "(.+)" + replacement = "/api/v1/nodes/${1}/proxy/metrics" + target_label = "__metrics_path__" + } + } + + prometheus.scrape "kubelet" { + job_name = "integrations/kubernetes/kubelet" + targets = discovery.relabel.kubelet.output + scheme = "https" + scrape_interval = "60s" + bearer_token_file = "/var/run/secrets/kubernetes.io/serviceaccount/token" + tls_config { + insecure_skip_verify = true + } + clustering { + enabled = true + } + forward_to = [prometheus.relabel.kubelet.receiver] + } + + prometheus.relabel "kubelet" { + rule { + source_labels = ["__name__"] + regex = "up|container_cpu_usage_seconds_total|kubelet_certificate_manager_client_expiration_renew_errors|kubelet_certificate_manager_client_ttl_seconds|kubelet_certificate_manager_server_ttl_seconds|kubelet_cgroup_manager_duration_seconds_bucket|kubelet_cgroup_manager_duration_seconds_count|kubelet_node_config_error|kubelet_node_name|kubelet_pleg_relist_duration_seconds_bucket|kubelet_pleg_relist_duration_seconds_count|kubelet_pleg_relist_interval_seconds_bucket|kubelet_pod_start_duration_seconds_bucket|kubelet_pod_start_duration_seconds_count|kubelet_pod_worker_duration_seconds_bucket|kubelet_pod_worker_duration_seconds_count|kubelet_running_container_count|kubelet_running_containers|kubelet_running_pod_count|kubelet_running_pods|kubelet_runtime_operations_errors_total|kubelet_runtime_operations_total|kubelet_server_expiration_renew_errors|kubelet_volume_stats_available_bytes|kubelet_volume_stats_capacity_bytes|kubelet_volume_stats_inodes|kubelet_volume_stats_inodes_used|kubernetes_build_info|namespace_workload_pod|rest_client_requests_total|storage_operation_duration_seconds_count|storage_operation_errors_total|volume_manager_total_volumes" + action = "keep" + } + forward_to = [prometheus.relabel.metrics_service.receiver] + } + + // Cluster Events + loki.source.kubernetes_events "cluster_events" { + job_name = "integrations/kubernetes/eventhandler" + log_format = "logfmt" + forward_to = [loki.write.endpoint.receiver] + } + + + // Why is this needed? + prometheus.relabel "metrics_service" { + forward_to = [prometheus.remote_write.default.receiver] + } + + + // Write Endpoints + // prometheus write endpoint + prometheus.remote_write "default" { + external_labels = { + cluster = "{{ .Values.cluster }}", + project = "{{ .Values.project }}", + } + endpoint { + url = "https://mimir.planx-pla.net/api/v1/push" + + headers = { + "X-Scope-OrgID" = "anonymous", + } + + } + } + + // loki write endpoint + loki.write "endpoint" { + external_labels = { + cluster = "{{ .Values.cluster }}", + project = "{{ .Values.project }}", + } + endpoint { + url = "https://loki.planx-pla.net/loki/api/v1/push" + } + } + +# ======================================================================= +# THIS IS THE CONFIGURATION FOR FLUENTD. CUSTOMIZE AT YOUR OWN PERIL!!!!! 
+# ======================================================================= +fluentd-configmap-data: | + + + + + @type tail + @id in_tail_container_logs + path /var/log/containers/*.log + pos_file /var/log/fluentd-containers.log.pos + tag "#{ENV['FLUENT_CONTAINER_TAIL_TAG'] || 'kubernetes.*'}" + exclude_path "#{ENV['FLUENT_CONTAINER_TAIL_EXCLUDE_PATH'] || use_default}" + read_from_head true + + @type "#{ENV['FLUENT_CONTAINER_TAIL_PARSER_TYPE'] || 'json'}" + time_format %Y-%m-%dT%H:%M:%S.%NZ + + + + + @type tail + path /var/log/messages + pos_file /var/log/host-messages.log.pos + + @type syslog + + tag host.messages + + + + + @type tail + path /var/log/secure + pos_file /var/log/host-secure.log.pos + + @type syslog + + tag host.secure + + + + @type tail + @id in_tail_docker + path /var/log/docker.log + pos_file /var/log/fluentd-docker.log.pos + tag docker + + @type regexp + expression /^time="(? + + + + + @type tail + @id in_tail_kubelet + multiline_flush_interval 5s + path /var/log/kubelet.log + pos_file /var/log/fluentd-kubelet.log.pos + tag kubelet + + @type kubernetes + + + + + @type kubernetes_metadata + @id filter_kube_metadata + kubernetes_url "#{ENV['FLUENT_FILTER_KUBERNETES_URL'] || 'https://' + ENV.fetch('KUBERNETES_SERVICE_HOST') + ':' + ENV.fetch('KUBERNETES_SERVICE_PORT') + '/api'}" + verify_ssl "#{ENV['KUBERNETES_VERIFY_SSL'] || true}" + ca_file "#{ENV['KUBERNETES_CA_FILE']}" + skip_labels "#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_LABELS'] || 'false'}" + skip_container_metadata "#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_CONTAINER_METADATA'] || 'false'}" + skip_master_url "#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_MASTER_URL'] || 'false'}" + skip_namespace_metadata "#{ENV['FLUENT_KUBERNETES_METADATA_SKIP_NAMESPACE_METADATA'] || 'false'}" + + + + @type null + + + + @type null + + + + @type rewrite_tag_filter + + key $._HOSTNAME + pattern ^(.+)$ + tag $1.docker + + + + + @type rewrite_tag_filter + + key $._HOSTNAME + pattern ^(.+)$ + tag $1.kubelet + + + + + @type rewrite_tag_filter + + key $.host + pattern ^(.+)$ + tag $1.messages + + + + + @type rewrite_tag_filter + + key $.host + pattern ^(.+)$ + tag $1.secure + + + + + @type rewrite_tag_filter + + # json structured log - consider adoption a standard json schema: + # https://github.com/timberio/log-event-json-schema + key message + pattern /^\{\s*"gen3log":/ + tag kubernetes.gen3.json.${tag} + + + # combined log format - default Apache and nginx structure + # https://httpd.apache.org/docs/1.3/logs.html#combined + key message + pattern /^(((\d+\.\d+\.\d+\.\d+)|-)\s+){2}\S+\s+\[\d\d?\// + tag kubernetes.gen3.combined.${tag} + + + # unstructured log line + key message + pattern /\S/ + tag kubernetes.gen3.raw.${tag} + + + + + + @type record_transformer + + log_type json + # This one doesn't work for whatever reason, if you do ${record["kubernetes"]} the whole blob would be added, but can't access subobjects + #container_name ${record["kubernetes"]["container_name"]} + + + + + @type record_transformer + + log_type combined + + + + + @type record_transformer + + log_type raw + + + + + @type rewrite_tag_filter + + key $.kubernetes.pod_name + pattern ^(.+)$ + tag "#{Time.now.strftime('%Y-%m-%d')}.$1" + + # + # key $.kubernetes + # pattern ^(.+)$ + # tag $1.container_name + # + + + # + # @type rewrite_tag_filter + # + # key $.kubernetes.container_name + # pattern ^(.+)$ + #tag $1.${tag} + # tag ${tag}.$1 + # + # + + # TODO: + # * python stack traces: "Traceback (most recent call last):"" + # 
https://docs.fluentd.org/v0.12/articles/parser_multiline#formatn + # + # Idea: add `visitor` cookie to revproxy ... + + + + @type cloudwatch_logs + @id out_cloudwatch_logs + log_group_name "#{ENV['LOG_GROUP_NAME']}" + auto_create_stream true + use_tag_as_stream true + retention_in_days "#{ENV['RETENTION_IN_DAYS'] || 'nil'}" + json_handler yajl # To avoid UndefinedConversionError + log_rejected_request "#{ENV['LOG_REJECTED_REQUEST']}" # Log rejected request for missing parts +
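+    # LOG_GROUP_NAME, RETENTION_IN_DAYS and LOG_REJECTED_REQUEST are read from the
+    # fluentd container's environment when the config is loaded (embedded Ruby
+    # "#{ENV[...]}" interpolation); they are not templated by Helm.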