diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml deleted file mode 100644 index 34158a4d..00000000 --- a/.github/workflows/ci.yaml +++ /dev/null @@ -1,56 +0,0 @@ -name: Run-Tests -on: [pull_request, push] - -jobs: - lint: - name: Lint - runs-on: ubuntu-latest - strategy: - matrix: - python: [3.8, 3.9] - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Install dependencies - run: python3 -m pip install tox - - name: Run linters - run: tox -vve lint - unit-test: - name: Unit tests - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Install dependencies - run: python -m pip install tox - - name: Run tests - run: tox -vve unit - # - name: Push coverage report to CodeCov - # uses: codecov/codecov-action@v2 - # with: - # token: ${{ secrets.CODECOV_TOKEN }} - # files: ./coverage.xml - # fail_ci_if_error: true - # # verbose: true - security: - name: Security tests - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Install dependencies - run: python -m pip install tox - - name: Run tests - run: tox -vve security - integration-test-microk8s: - name: Integration tests (microk8s) - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Setup operator environment - uses: charmed-kubernetes/actions-operator@main - with: - provider: microk8s - - name: Run integration tests - run: tox -vve integration diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e4875332..d1d69027 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,22 +2,53 @@ ## Overview -This documents explains the processes and practices recommended for contributing enhancements to -the Kafka K8s charm. - -- Generally, before developing enhancements to this charm, you should consider [opening an issue - ](https://github.com/canonical/kafka-k8s-operator/issues) explaining your use case. -- If you would like to chat with us about your use-cases or proposed implementation, you can reach - us at [Canonical Mattermost public channel](https://chat.charmhub.io/charmhub/channels/charm-dev) - or [Discourse](https://discourse.charmhub.io/). -- Familiarising yourself with the [Charmed Operator Framework](https://juju.is/docs/sdk) library - will help you a lot when working on new features or bug fixes. +This documents explains the processes and practices recommended for contributing enhancements to this operator. + +- Generally, before developing enhancements to this charm, you should consider [opening an issue](https://github.com/canonical/kafka-operator/issues) explaining your problem with examples, and your desired use case. +- If you would like to chat with us about your use-cases or proposed implementation, you can reach us at [Canonical Mattermost public channel](https://chat.charmhub.io/charmhub/channels/charm-dev) or [Discourse](https://discourse.charmhub.io/). +- Familiarising yourself with the [Charmed Operator Framework](https://juju.is/docs/sdk) library will help you a lot when working on new features or bug fixes. - All enhancements require review before being merged. Code review typically examines - code quality - test coverage - user experience for Juju administrators this charm. -- Please help us out in ensuring easy to review branches by rebasing your pull request branch onto - the `main` branch. This also avoids merge commits and creates a linear Git commit history. +- Please help us out in ensuring easy to review branches by rebasing your pull request branch onto the `main` branch. 
This also avoids merge commits and creates a linear Git commit history. + +## Requirements + +To build the charm locally, you will need to install [Charmcraft](https://juju.is/docs/sdk/install-charmcraft). + +To run the charm locally with Juju, it is recommended to use [LXD](https://linuxcontainers.org/lxd/introduction/) as your virtual machine manager. Instructions for running Juju on LXD can be found [here](https://juju.is/docs/olm/lxd). + +## Build and Deploy + +Once you have Juju set up locally, you can download, build and deploy the charm from the root of the repository by running the following: + +### Deploy + +```bash +# Clone and enter the repository +git clone https://github.com/canonical/kafka-k8s-operator.git +cd kafka-k8s-operator/ + +# Create a working model +juju add-model kafka-k8s + +# Enable DEBUG logging for the model +juju model-config logging-config="=INFO;unit=DEBUG" + +# Build the charm locally +charmcraft pack + +# Deploy the latest ZooKeeper release +juju deploy zookeeper-k8s --channel edge -n 3 + +# Deploy the charm +juju deploy ./*.charm -n 3 + +# After ZooKeeper has initialised, relate the applications +juju relate kafka-k8s zookeeper-k8s +``` ## Developing @@ -38,27 +69,6 @@ tox -e integration # integration tests tox # runs 'lint' and 'unit' environments ``` -## Build charm - -Build the charm in this git repository using: - -```shell -charmcraft pack -``` - -### Deploy - -```bash -# Create a model -juju add-model test-kafka-k8s -# Enable DEBUG logging -juju model-config logging-config="=INFO;unit=DEBUG" -# Deploy the charm -juju deploy ./kafka-k8s_ubuntu-20.04-amd64.charm \ - --resource kafka-image=confluentinc/cp-kafka:7.0.1 \ - --resource jmx-prometheus-jar=./jmx_prometheus_javaagent-0.15.0.jar -``` - ## Canonical Contributor Agreement -Canonical welcomes contributions to the Charmed Kafka K8s Operator. Please check out our [contributor agreement](https://ubuntu.com/legal/contributors) if you're interested in contributing to the solution. \ No newline at end of file +Canonical welcomes contributions to the Charmed Kafka Kubernetes Operator. Please check out our [contributor agreement](https://ubuntu.com/legal/contributors) if you're interested in contributing to the solution. diff --git a/README.md b/README.md index f737a54a..9fa626fa 100644 --- a/README.md +++ b/README.md @@ -1,108 +1,43 @@ - +## Kafka K8s Operator - a Charmed Operator for running Apache Kafka on Kubernetes from Canonical -# Kafka K8s Operator +This repository hosts the Kubernetes Python Operator for [Apache Kafka](https://kafka.apache.org). +The Kafka K8s Operator runs the latest upstream Kafka binaries released by the Apache Software Foundation, made available using the [`ubuntu/kafka` OCI image](https://registry.hub.docker.com/r/ubuntu/kafka) distributed by Canonical. -[![code style](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black/tree/main) -[![Run-Tests](https://github.com/canonical/kafka-k8s-operator/actions/workflows/ci.yaml/badge.svg)](https://github.com/canonical/kafka-k8s-operator/actions/workflows/ci.yaml) +As Kafka currently requires a paired ZooKeeper deployment in production, this operator makes use of the [ZooKeeper K8s Operator](https://github.com/canonical/zookeeper-k8s-operator) for various essential functions.
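As an illustrative aside (not part of the files changed in this diff): because the charm only becomes fully operational once the ZooKeeper relation is in place, a quick way to confirm that the deploy and relate steps above have settled is to inspect the model and its relations with the Juju CLI, for example:

```bash
# Illustrative check only (assumed workflow, not taken from the repository docs):
# wait for the kafka-k8s and zookeeper-k8s units to report active/idle, and
# confirm the relation between the two applications is listed.
juju status --relations
watch -c juju status --color
```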
-[![Kafka K8s](https://charmhub.io/kafka-k8s/badge.svg)](https://charmhub.io/kafka-k8s) +### Usage -## Description +The Kafka and ZooKeeper operators can both be deployed and connected to each other using the Juju command line as follows: -Apache Kafka is an open-source distributed event streaming platform used by thousands of companies for high-performance data pipelines, streaming analytics, data integration, and mission-critical applications. - -This repository contains a Charm Operator for deploying the Kafka in a Kubernetes cluster. - - - -## How-to guides - -### Deploy Kafka - -The Kafka K8s Operator may be deployed using the Juju command line as in - -```shell -$ juju add-model kafka-k8s -$ juju deploy kafka-k8s --trust -$ juju deploy zookeeper-k8s +```bash +$ juju deploy zookeeper-k8s -n 3 +$ juju deploy kafka-k8s -n 3 $ juju relate kafka-k8s zookeeper-k8s ``` -### Scale Kafka - -Scale Kafka by executing the following command - -```shell -$ juju scale-application kafka-k8s 3 -``` - -## Integration with Canonical Observability Stack - -This exporter can very easily be integrated with Canonical Observability Stack (COS). - -To do so, after following the steps from the previous section, execute the following commands: - -```shell -$ juju deploy cos-lite --channel beta --trust -$ juju upgrade-charm grafana --channel edge -$ juju relate grafana kafka-k8s -$ juju relate prometheus kafka-k8s -``` - -Wait until everything is deployed: -```shell -$ watch -c juju status --color -Model Controller Cloud/Region Version SLA Timestamp -kafka-k8s microk8s-localhost microk8s/localhost 2.9.25 unsupported 15:33:09+01:00 - -App Version Status Scale Charm Store Channel Rev OS Address Message -alertmanager waiting 1 alertmanager-k8s charmhub beta 9 kubernetes 10.152.183.185 -grafana active 1 grafana-k8s charmhub edge 28 kubernetes 10.152.183.247 -kafka-k8s active 1 kafka-k8s charmhub edge 0 kubernetes 10.152.183.25 -loki active 1 loki-k8s charmhub beta 13 kubernetes 10.152.183.32 -prometheus active 1 prometheus-k8s charmhub beta 19 kubernetes 10.152.183.211 -zookeeper-k8s active 1 zookeeper-k8s charmhub edge 10 kubernetes 10.152.183.153 - -Unit Workload Agent Address Ports Message -alertmanager/0* active idle 10.1.245.86 -grafana/0* active idle 10.1.245.101 -kafka-k8s/0* active idle 10.1.245.68 -loki/0* active idle 10.1.245.107 -prometheus/0* active idle 10.1.245.82 -zookeeper-k8s/0* active idle 10.1.245.81 -``` - -To see the metrics, you can get the grafana admin password as follows: - -```shell -$ juju run-action grafana/0 get-admin-password --wait -unit-grafana-0: - UnitId: grafana/0 - id: "2" - results: - admin-password: ************* - status: completed - timing: - completed: 2022-03-02 14:31:49 +0000 UTC - enqueued: 2022-03-02 14:31:39 +0000 UTC - started: 2022-03-02 14:31:48 +0000 UTC -``` +## A fast and fault-tolerant, real-time event streaming platform! -Open your browser and go to the Grafana dashboard at port 3000. +Manual, Day 2 operations for deploying and operating Apache Kafka, topic creation, client authentication, ACL management and more are all handled automatically using the [Juju Operator Lifecycle Manager](https://juju.is/docs/olm). -## Reference +### Key Features +- SASL/SCRAM auth for Broker-Broker and Client-Broker authenticaion enabled by default. +- Access control management supported with user-provided ACL lists. +- Fault-tolerance, replication and high-availability out-of-the-box. 
+- Streamlined topic-creation through [Juju Actions](https://juju.is/docs/olm/working-with-actions) and [application relations](https://juju.is/docs/olm/relations) -- [Kafka documentation](https://kafka.apache.org/documentation/) -- [OCI image](https://hub.docker.com/r/confluentinc/cp-kafka): currently using tag `7.0.1`. -## Explanation +### Checklist -- [What is Apache Kafka?](https://kafka.apache.org/intro) +- [x] Super-user creation +- [x] Inter-broker auth +- [x] Horizontally scale brokers +- [x] Username/Password creation for related applications +- [ ] Automatic topic creation with associated user ACLs +- [ ] Partition rebalancing during broker scaling +- [ ] Rack awareness support +- [ ] Persistent storage support with [Juju Storage](https://juju.is/docs/olm/defining-and-using-persistent-storage) +- [ ] TLS/SSL support -## Contributing +## Usage -Please see the [Juju SDK docs](https://juju.is/docs/sdk) for guidelines -on enhancements to this charm following best practice guidelines, and -`CONTRIBUTING.md` for developer guidance. +This charm is still in active development. If you would like to contribute, please refer to [CONTRIBUTING.md](https://github.com/canonical/kafka-k8s-operator/blob/main/CONTRIBUTING.md) diff --git a/config.yaml b/config.yaml index ff82c3eb..2b79250a 100644 --- a/config.yaml +++ b/config.yaml @@ -1,99 +1,108 @@ -# Copyright 2022 David Garcia +# Copyright 2022 Canonical Ltd. # See LICENSE file for licensing details. options: - kafka-properties: - description: kafka.properties configuration file. + data-dir: + description: filepath for setting the Kafka dataDir option type: string - default: | - clientPort=2181 - broker.id.generation.enable=true - listeners=PLAINTEXT://:9092 - advertised.listeners=PLAINTEXT://:9092 - log.dirs=/var/lib/kafka/data - auto.create.topics.enable=true - auto.leader.rebalance.enable=true - background.threads=10 - compression.type=producer - delete.topic.enable=false - leader.imbalance.check.interval.seconds=300 - leader.imbalance.per.broker.percentage=10 - log.flush.interval.messages=9223372036854775807 - log.flush.offset.checkpoint.interval.ms=60000 - log.flush.scheduler.interval.ms=9223372036854775807 - log.retention.bytes=-1 - log.retention.hours=168 - log.roll.hours=168 - log.roll.jitter.hours=0 - log.segment.bytes=1073741824 - log.segment.delete.delay.ms=60000 - message.max.bytes=1000012 - min.insync.replicas=1 - num.io.threads=8 - num.network.threads=1 - num.recovery.threads.per.data.dir=1 - num.replica.fetchers=1 - offset.metadata.max.bytes=4096 - offsets.commit.required.acks=-1 - offsets.commit.timeout.ms=5000 - offsets.load.buffer.size=5242880 - offsets.retention.check.interval.ms=600000 - offsets.retention.minutes=1440 - offsets.topic.compression.codec=0 - offsets.topic.num.partitions=50 - offsets.topic.replication.factor=1 - offsets.topic.segment.bytes=104857600 - queued.max.requests=500 - quota.consumer.default=9223372036854775807 - quota.producer.default=9223372036854775807 - replica.fetch.min.bytes=1 - replica.fetch.wait.max.ms=500 - replica.high.watermark.checkpoint.interval.ms=5000 - replica.lag.time.max.ms=10000 - replica.socket.receive.buffer.bytes=65536 - replica.socket.timeout.ms=30000 - request.timeout.ms=30000 - socket.receive.buffer.bytes=102400 - socket.request.max.bytes=104857600 - socket.send.buffer.bytes=102400 - unclean.leader.election.enable=true - zookeeper.session.timeout.ms=6000 - zookeeper.set.acl=false - broker.id.generation.enable=true - connections.max.idle.ms=600000 - controlled.shutdown.enable=true 
- controlled.shutdown.max.retries=3 - controlled.shutdown.retry.backoff.ms=5000 - controller.socket.timeout.ms=30000 - default.replication.factor=1 - fetch.purgatory.purge.interval.requests=1000 - group.max.session.timeout.ms=300000 - group.min.session.timeout.ms=6000 - log.cleaner.backoff.ms=15000 - log.cleaner.dedupe.buffer.size=134217728 - log.cleaner.delete.retention.ms=86400000 - log.cleaner.enable=true - log.cleaner.io.buffer.load.factor=0.9 - log.cleaner.io.buffer.size=524288 - log.cleaner.io.max.bytes.per.second=1.7976931348623157E308 - log.cleaner.min.cleanable.ratio=0.5 - log.cleaner.min.compaction.lag.ms=0 - log.cleaner.threads=1 - log.cleanup.policy=delete - log.index.interval.bytes=4096 - log.index.size.max.bytes=10485760 - log.message.timestamp.difference.max.ms=9223372036854775807 - log.message.timestamp.type=CreateTime - log.preallocate=false - log.retention.check.interval.ms=300000 - max.connections.per.ip=2147483647 - num.partitions=1 - producer.purgatory.purge.interval.requests=1000 - replica.fetch.backoff.ms=1000 - replica.fetch.max.bytes=1048576 - replica.fetch.response.max.bytes=10485760 - reserved.broker.max.id=1000 - metrics: - description: Enable/disable metrics. + default: "/data/kafka" + log-dir: + description: filepath for setting the Kafka dataLogDir option + type: string + default: "/logs/kafka" + offsets-retention-minutes: + description: the number of minutes offsets will be kept before getting discarded + type: int + default: 10080 + log-retention-hours: + description: the number of hours to keep a log file before deleting it + type: int + default: 168 + auto-create-topics: + description: enables auto creation of topic on the server type: boolean - default: true \ No newline at end of file + default: false + + # log.dirs=/var/snap/kafka/common/log + # + # # networking + # clientPort=2181 + # listeners=SASL_PLAINTEXT://:9092 + # + # # offsets + # offsets.topic.num.partitions=50 + # offsets.commit.required.acks=-1 + # offsets.retention.minutes=10080 + # + # # topic + # auto.leader.rebalance.enable=true + # # to be changed when necessary + # delete.topic.enable=true + # unclean.leader.election.enable=false + # auto.create.topics.enable=false + # # helpful + # group.initial.rebalance.delay.ms=3000 + # + # # auth + # sasl.enabled.mechanisms=SCRAM-SHA-512 + # sasl.mechanism.inter.broker.protocol=SCRAM-SHA-512 + # security.inter.broker.protocol=SASL_PLAINTEXT + # authorizer.class.name=kafka.security.authorizer.AclAuthorizer + # allow.everyone.if.no.acl.found=false + # super.users=User:sync + # listener.name.sasl_plaintext.sasl.enabled.mechanisms=SCRAM-SHA-512 + # # zookeeper.set.acl=true + + + + ## Backup + # background.threads=10 + # compression.type=producer + # leader.imbalance.check.interval.seconds=300 + # leader.imbalance.per.broker.percentage=10 + # log.retention.bytes=-1 + # log.roll.hours=168 + # log.roll.jitter.hours=0 + # log.segment.bytes=1073741824 + # log.segment.delete.delay.ms=60000 + # message.max.bytes=1000012 + # num.io.threads=8 + # num.network.threads=3 + # num.recovery.threads.per.data.dir=1 + # num.replica.fetchers=1 + # offset.metadata.max.bytes=4096 + # offsets.commit.timeout.ms=5000 + # offsets.load.buffer.size=5242880 + # offsets.retention.check.interval.ms=600000 + # offsets.topic.compression.codec=0 + # offsets.topic.segment.bytes=104857600 + # queued.max.requests=500 + # quota.consumer.default=9223372036854775807 + # quota.producer.default=9223372036854775807 + # replica.fetch.min.bytes=1 + # replica.fetch.wait.max.ms=500 + # 
replica.high.watermark.checkpoint.interval.ms=5000 + # replica.lag.time.max.ms=10000 + # replica.socket.receive.buffer.bytes=65536 + # replica.socket.timeout.ms=30000 + # request.timeout.ms=30000 + # socket.receive.buffer.bytes=102400 + # socket.request.max.bytes=104857600 + # socket.send.buffer.bytes=102400 + # zookeeper.session.timeout.ms=6000 + # connections.max.idle.ms=600000 + # controlled.shutdown.enable=true + # controlled.shutdown.max.retries=3 + # controlled.shutdown.retry.backoff.ms=5000 + # controller.socket.timeout.ms=30000 + # fetch.purgatory.purge.interval.requests=1000 + # group.max.session.timeout.ms=300000 + # group.min.session.timeout.ms=600 + # producer.purgatory.purge.interval.requests=1000 + # replica.fetch.backoff.ms=1000 + # replica.fetch.max.bytes=1048576 + # replica.fetch.response.max.bytes=10485760 + # reserved.broker.max.id=1000 + # num.partitions=1 + # group.initial.rebalance.delay.ms=0 + # zookeeper.connection.timeout.ms=18000 diff --git a/icon.svg b/icon.svg deleted file mode 100644 index de5ab1be..00000000 --- a/icon.svg +++ /dev/null @@ -1,21 +0,0 @@ - - - kafka - Created with Sketch. - - - - - - - - - - - - - - - - - diff --git a/jmx_prometheus_javaagent-0.15.0.jar b/jmx_prometheus_javaagent-0.15.0.jar deleted file mode 100644 index d896a217..00000000 Binary files a/jmx_prometheus_javaagent-0.15.0.jar and /dev/null differ diff --git a/lib/charms/grafana_k8s/v0/grafana_dashboard.py b/lib/charms/grafana_k8s/v0/grafana_dashboard.py deleted file mode 100644 index f981ed60..00000000 --- a/lib/charms/grafana_k8s/v0/grafana_dashboard.py +++ /dev/null @@ -1,1380 +0,0 @@ -# Copyright 2021 Canonical Ltd. -# See LICENSE file for licensing details. - -"""## Overview. - -This document explains how to integrate with the Grafana charm -for the purpose of providing a dashboard which can be used by -end users. It also explains the structure of the data -expected by the `grafana-dashboard` interface, and may provide a -mechanism or reference point for providing a compatible interface -or library by providing a definitive reference guide to the -structure of relation data which is shared between the Grafana -charm and any charm providing datasource information. - -## Provider Library Usage - -The Grafana charm interacts with its dashboards using its charm -library. The goal of this library is to be as simple to use as -possible, and instantiation of the class with or without changing -the default arguments provides a complete use case. For the simplest -use case of a charm which bundles dashboards and provides a -`provides: grafana-dashboard` interface, creation of a -`GrafanaDashboardProvider` object with the default arguments is -sufficient. - -:class:`GrafanaDashboardProvider` expects that bundled dashboards should -be included in your charm with a default path of: - - path/to/charm.py - path/to/src/grafana_dashboards/*.tmpl - -Where the `*.tmpl` files are Grafana dashboard JSON data either from the -Grafana marketplace, or directly exported from a a Grafana instance. - -The default arguments are: - - `charm`: `self` from the charm instantiating this library - `relation_name`: grafana-dashboard - `dashboards_path`: "/src/grafana_dashboards" - -If your configuration requires any changes from these defaults, they -may be set from the class constructor. It may be instantiated as -follows: - - from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider - - class FooCharm: - def __init__(self, *args): - super().__init__(*args, **kwargs) - ... 
- self.grafana_dashboard_provider = GrafanaDashboardProvider(self) - ... - -The first argument (`self`) should be a reference to the parent (providing -dashboards), as this charm's lifecycle events will be used to re-submit -dashboard information if a charm is upgraded, the pod is restarted, or other. - -An instantiated `GrafanaDashboardProvider` validates that the path specified -in the constructor (or the default) exists, reads the file contents, then -compresses them with LZMA and adds them to the application relation data -when a relation is established with Grafana. - -Provided dashboards will be checked by Grafana, and a series of dropdown menus -providing the ability to select query targets by Juju Model, application instance, -and unit will be added if they do not exist. - -To avoid requiring `jinja` in `GrafanaDashboardProvider` users, template validation -and rendering occurs on the other side of the relation, and relation data in -the form of: - - { - "event": { - "valid": `true|false`, - "errors": [], - } - } - -Will be returned if rendering or validation fails. In this case, the -`GrafanaDashboardProvider` object will emit a `dashboard_status_changed` event -of the type :class:`GrafanaDashboardEvent`, which will contain information -about the validation error. - -This information is added to the relation data for the charms as serialized JSON -from a dict, with a structure of: -``` -{ - "application": { - "dashboards": { - "uuid": a uuid generated to ensure a relation event triggers, - "templates": { - "file:{hash}": { - "content": `{compressed_template_data}`, - "charm": `charm.meta.name`, - "juju_topology": { - "model": `charm.model.name`, - "model_uuid": `charm.model.uuid`, - "application": `charm.app.name`, - "unit": `charm.unit.name`, - } - }, - "file:{other_file_hash}": { - ... - }, - }, - }, - }, -} -``` - -This is ingested by :class:`GrafanaDashboardConsumer`, and is sufficient for configuration. - -The [COS Configuration Charm](https://charmhub.io/cos-configuration-k8s) can be used to -add dashboards which are bundled with charms. - -## Consumer Library Usage - -The `GrafanaDashboardConsumer` object may be used by Grafana -charms to manage relations with available dashboards. For this -purpose, a charm consuming Grafana dashboard information should do -the following things: - -1. Instantiate the `GrafanaDashboardConsumer` object by providing it a -reference to the parent (Grafana) charm and, optionally, the name of -the relation that the Grafana charm uses to interact with dashboards. -This relation must confirm to the `grafana-dashboard` interface. - -For example a Grafana charm may instantiate the -`GrafanaDashboardConsumer` in its constructor as follows - - from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardConsumer - - def __init__(self, *args): - super().__init__(*args) - ... - self.grafana_dashboard_consumer = GrafanaDashboardConsumer(self) - ... - -2. 
A Grafana charm also needs to listen to the -`GrafanaDashboardConsumer` events emitted by the `GrafanaDashboardConsumer` -by adding itself as an observer for these events: - - self.framework.observe( - self.grafana_source_consumer.on.sources_changed, - self._on_dashboards_changed, - ) - -Dashboards can be retrieved the :meth:`dashboards`: - -It will be returned in the format of: - -``` -[ - { - "id": unique_id, - "relation_id": relation_id, - "charm": the name of the charm which provided the dashboard, - "content": compressed_template_data - }, -] -``` - -The consuming charm should decompress the dashboard. -""" - -import base64 -import json -import logging -import lzma -import os -import re -import uuid -from pathlib import Path -from typing import Any, Dict, List, Optional, Union - -from ops.charm import ( - CharmBase, - HookEvent, - RelationBrokenEvent, - RelationChangedEvent, - RelationCreatedEvent, - RelationEvent, - RelationRole, -) -from ops.framework import ( - EventBase, - EventSource, - Object, - ObjectEvents, - StoredDict, - StoredList, - StoredState, -) -from ops.model import Relation - -# The unique Charmhub library identifier, never change it -LIBID = "c49eb9c7dfef40c7b6235ebd67010a3f" - -# Increment this major API version when introducing breaking changes -LIBAPI = 0 - -# Increment this PATCH version before using `charmcraft publish-lib` or reset -# to 0 if you are raising the major API version -LIBPATCH = 10 - -logger = logging.getLogger(__name__) - - -DEFAULT_RELATION_NAME = "grafana-dashboard" -RELATION_INTERFACE_NAME = "grafana_dashboard" - -TEMPLATE_DROPDOWNS = [ - { - "allValue": None, - "datasource": "${prometheusds}", - "definition": "label_values(up,juju_model)", - "description": None, - "error": None, - "hide": 0, - "includeAll": False, - "label": "Juju model", - "multi": False, - "name": "juju_model", - "query": { - "query": "label_values(up,juju_model)", - "refId": "StandardVariableQuery", - }, - "refresh": 1, - "regex": "", - "skipUrlSync": False, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": False, - }, - { - "allValue": None, - "datasource": "${prometheusds}", - "definition": 'label_values(up{juju_model="$juju_model"},juju_model_uuid)', - "description": None, - "error": None, - "hide": 0, - "includeAll": False, - "label": "Juju model uuid", - "multi": False, - "name": "juju_model_uuid", - "query": { - "query": 'label_values(up{juju_model="$juju_model"},juju_model_uuid)', - "refId": "StandardVariableQuery", - }, - "refresh": 1, - "regex": "", - "skipUrlSync": False, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": False, - }, - { - "allValue": None, - "datasource": "${prometheusds}", - "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid"},juju_application)', - "description": None, - "error": None, - "hide": 0, - "includeAll": False, - "label": "Juju application", - "multi": False, - "name": "juju_application", - "query": { - "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid"},juju_application)', - "refId": "StandardVariableQuery", - }, - "refresh": 1, - "regex": "", - "skipUrlSync": False, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": False, - }, - { - "allValue": None, - "datasource": "${prometheusds}", - "definition": 
'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},juju_unit)', - "description": None, - "error": None, - "hide": 0, - "includeAll": False, - "label": "Juju unit", - "multi": False, - "name": "juju_unit", - "query": { - "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},juju_unit)', - "refId": "StandardVariableQuery", - }, - "refresh": 1, - "regex": "", - "skipUrlSync": False, - "sort": 0, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": False, - }, - { - "description": None, - "error": None, - "hide": 0, - "includeAll": False, - "label": None, - "multi": False, - "name": "prometheusds", - "options": [], - "query": "prometheus", - "refresh": 1, - "regex": "", - "skipUrlSync": False, - "type": "datasource", - }, -] - -REACTIVE_CONVERTER = { # type: ignore - "allValue": None, - "datasource": "${prometheusds}", - "definition": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},host)', - "description": None, - "error": None, - "hide": 0, - "includeAll": False, - "label": "hosts", - "multi": True, - "name": "host", - "options": [], - "query": { - "query": 'label_values(up{juju_model="$juju_model",juju_model_uuid="$juju_model_uuid",juju_application="$juju_application"},host)', - "refId": "StandardVariableQuery", - }, - "refresh": 1, - "regex": "", - "skipUrlSync": False, - "sort": 1, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": False, -} - - -class RelationNotFoundError(Exception): - """Raised if there is no relation with the given name.""" - - def __init__(self, relation_name: str): - self.relation_name = relation_name - self.message = "No relation named '{}' found".format(relation_name) - - super().__init__(self.message) - - -class RelationInterfaceMismatchError(Exception): - """Raised if the relation with the given name has a different interface.""" - - def __init__( - self, - relation_name: str, - expected_relation_interface: str, - actual_relation_interface: str, - ): - self.relation_name = relation_name - self.expected_relation_interface = expected_relation_interface - self.actual_relation_interface = actual_relation_interface - self.message = ( - "The '{}' relation has '{}' as " - "interface rather than the expected '{}'".format( - relation_name, actual_relation_interface, expected_relation_interface - ) - ) - - super().__init__(self.message) - - -class RelationRoleMismatchError(Exception): - """Raised if the relation with the given name has a different direction.""" - - def __init__( - self, - relation_name: str, - expected_relation_role: RelationRole, - actual_relation_role: RelationRole, - ): - self.relation_name = relation_name - self.expected_relation_interface = expected_relation_role - self.actual_relation_role = actual_relation_role - self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format( - relation_name, repr(actual_relation_role), repr(expected_relation_role) - ) - - super().__init__(self.message) - - -class InvalidDirectoryPathError(Exception): - """Raised if the grafana dashboards folder cannot be found or is otherwise invalid.""" - - def __init__( - self, - grafana_dashboards_absolute_path: str, - message: str, - ): - self.grafana_dashboards_absolute_path = grafana_dashboards_absolute_path - self.message = message - - super().__init__(self.message) - - -def 
_resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> str: - """Resolve the provided path items against the directory of the main file. - - Look up the directory of the charmed operator file being executed. This is normally - going to be the charm.py file of the charm including this library. Then, resolve - the provided path elements and return its absolute path. - - Raises: - InvalidDirectoryPathError if the resolved path does not exist or it is not a directory - - """ - charm_dir = Path(str(charm.charm_dir)) - if not charm_dir.exists() or not charm_dir.is_dir(): - # Operator Framework does not currently expose a robust - # way to determine the top level charm source directory - # that is consistent across deployed charms and unit tests - # Hence for unit tests the current working directory is used - # TODO: updated this logic when the following ticket is resolved - # https://github.com/canonical/operator/issues/643 - charm_dir = Path(os.getcwd()) - - dir_path = charm_dir.absolute().joinpath(*path_elements) - - if not dir_path.exists(): - raise InvalidDirectoryPathError(str(dir_path), "directory does not exist") - if not dir_path.is_dir(): - raise InvalidDirectoryPathError(str(dir_path), "is not a directory") - - return str(dir_path) - - -def _validate_relation_by_interface_and_direction( - charm: CharmBase, - relation_name: str, - expected_relation_interface: str, - expected_relation_role: RelationRole, -) -> None: - """Verifies that a relation has the necessary characteristics. - - Verifies that the `relation_name` provided: (1) exists in metadata.yaml, - (2) declares as interface the interface name passed as `relation_interface` - and (3) has the right "direction", i.e., it is a relation that `charm` - provides or requires. - - Args: - charm: a `CharmBase` object to scan for the matching relation. - relation_name: the name of the relation to be verified. - expected_relation_interface: the interface name to be matched by the - relation named `relation_name`. - expected_relation_role: whether the `relation_name` must be either - provided or required by `charm`. - - Raises: - RelationNotFoundError: If there is no relation in the charm's metadata.yaml - named like the value of the `relation_name` argument. - RelationInterfaceMismatchError: If the relation interface of the - relation named as the provided `relation_name` argument does not - match the `expected_relation_interface` argument. - RelationRoleMismatchError: If the relation named as the provided `relation_name` - argument has a different role than what is specified by the - `expected_relation_role` argument. 
- """ - if relation_name not in charm.meta.relations: - raise RelationNotFoundError(relation_name) - - relation = charm.meta.relations[relation_name] - - actual_relation_interface = relation.interface_name - if actual_relation_interface != expected_relation_interface: - raise RelationInterfaceMismatchError( - relation_name, expected_relation_interface, actual_relation_interface - ) - - if expected_relation_role == RelationRole.provides: - if relation_name not in charm.meta.provides: - raise RelationRoleMismatchError( - relation_name, RelationRole.provides, RelationRole.requires - ) - elif expected_relation_role == RelationRole.requires: - if relation_name not in charm.meta.requires: - raise RelationRoleMismatchError( - relation_name, RelationRole.requires, RelationRole.provides - ) - else: - raise Exception("Unexpected RelationDirection: {}".format(expected_relation_role)) - - -def _encode_dashboard_content(content: Union[str, bytes]) -> str: - if isinstance(content, str): - content = bytes(content, "utf-8") - - return base64.b64encode(lzma.compress(content)).decode("utf-8") - - -def _decode_dashboard_content(encoded_content: str) -> str: - return lzma.decompress(base64.b64decode(encoded_content.encode("utf-8"))).decode() - - -def _inject_dashboard_dropdowns(content: str) -> str: - """Make sure dropdowns are present for Juju topology.""" - dict_content = json.loads(content) - if "templating" not in content: - dict_content["templating"] = {"list": [d for d in TEMPLATE_DROPDOWNS]} - else: - for d in TEMPLATE_DROPDOWNS: - if d not in dict_content["templating"]["list"]: - dict_content["templating"]["list"].insert(0, d) - - return json.dumps(dict_content) - - -def _type_convert_stored(obj): - """Convert Stored* to their appropriate types, recursively.""" - if isinstance(obj, StoredList): - return list(map(_type_convert_stored, obj)) - elif isinstance(obj, StoredDict): - rdict = {} # type: Dict[Any, Any] - for k in obj.keys(): - rdict[k] = _type_convert_stored(obj[k]) - return rdict - else: - return obj - - -class GrafanaDashboardsChanged(EventBase): - """Event emitted when Grafana dashboards change.""" - - def __init__(self, handle, data=None): - super().__init__(handle) - self.data = data - - def snapshot(self) -> Dict: - """Save grafana source information.""" - return {"data": self.data} - - def restore(self, snapshot): - """Restore grafana source information.""" - self.data = snapshot["data"] - - -class GrafanaDashboardEvents(ObjectEvents): - """Events raised by :class:`GrafanaSourceEvents`.""" - - dashboards_changed = EventSource(GrafanaDashboardsChanged) - - -class GrafanaDashboardEvent(EventBase): - """Event emitted when Grafana dashboards cannot be resolved. - - Enables us to set a clear status on the provider. 
- """ - - def __init__(self, handle, error_message: str = "", valid: bool = False): - super().__init__(handle) - self.error_message = error_message - self.valid = valid - - def snapshot(self) -> Dict: - """Save grafana source information.""" - return {"error_message": self.error_message, "valid": self.valid} - - def restore(self, snapshot): - """Restore grafana source information.""" - self.error_message = snapshot["error_message"] - self.valid = snapshot["valid"] - - -class GrafanaProviderEvents(ObjectEvents): - """Events raised by :class:`GrafanaSourceEvents`.""" - - dashboard_status_changed = EventSource(GrafanaDashboardEvent) - - -class GrafanaDashboardProvider(Object): - """An API to provide Grafana dashboards to a Grafana charm.""" - - _stored = StoredState() - on = GrafanaProviderEvents() - - def __init__( - self, - charm: CharmBase, - relation_name: str = DEFAULT_RELATION_NAME, - dashboards_path: str = "src/grafana_dashboards", - ) -> None: - """API to provide Grafana dashboard to a Grafana charmed operator. - - The :class:`GrafanaDashboardProvider` object provides an API - to upload dashboards to a Grafana charm. In its most streamlined - usage, the :class:`GrafanaDashboardProvider` is integrated in a - charmed operator as follows: - - self.grafana = GrafanaDashboardProvider(self) - - The :class:`GrafanaDashboardProvider` will look for dashboard - templates in the `/grafana_dashboards` folder. - Additionally, dashboard templates can be uploaded programmatically - via the :method:`GrafanaDashboardProvider.add_dashboard` method. - - To use the :class:`GrafanaDashboardProvider` API, you need a relation - defined in your charm operator's metadata.yaml as follows: - - provides: - grafana-dashboard: - interface: grafana_dashboard - - If you would like to use relation name other than `grafana-dashboard`, - you will need to specify the relation name via the `relation_name` - argument when instantiating the :class:`GrafanaDashboardProvider` object. - However, it is strongly advised to keep the the default relation name, - so that people deploying your charm will have a consistent experience - with all other charms that provide Grafana dashboards. - - It is possible to provide a different file path for the Grafana dashboards - to be automatically managed by the :class:`GrafanaDashboardProvider` object - via the `dashboards_path` argument. This may be necessary when the directory - structure of your charmed operator repository is not the "usual" one as - generated by `charmcraft init`, for example when adding the charmed operator - in a Java repository managed by Maven or Gradle. However, unless there are - such constraints with other tooling, it is strongly advised to store the - Grafana dashboards in the default `/grafana_dashboards` - folder, in order to provide a consistent experience for other charmed operator - authors. - - Args: - charm: a :class:`CharmBase` object which manages this - :class:`GrafanaProvider` object. Generally this is - `self` in the instantiating class. - relation_name: a :string: name of the relation managed by this - :class:`GrafanaDashboardProvider`; it defaults to "grafana-dashboard". - dashboards_path: a filesystem path relative to the charm root - where dashboard templates can be located. By default, the library - expects dashboard files to be in the `/grafana_dashboards` - directory. 
- """ - _validate_relation_by_interface_and_direction( - charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides - ) - - try: - dashboards_path = _resolve_dir_against_charm_path(charm, dashboards_path) - except InvalidDirectoryPathError as e: - logger.warning( - "Invalid Grafana dashboards folder at %s: %s", - e.grafana_dashboards_absolute_path, - e.message, - ) - - super().__init__(charm, relation_name) - - self._charm = charm - self._relation_name = relation_name - self._dashboards_path = dashboards_path - self._stored.set_default(dashboard_templates={}) - - self.framework.observe(self._charm.on.leader_elected, self._update_all_dashboards_from_dir) - self.framework.observe(self._charm.on.upgrade_charm, self._update_all_dashboards_from_dir) - - self.framework.observe( - self._charm.on[self._relation_name].relation_created, - self._on_grafana_dashboard_relation_created, - ) - self.framework.observe( - self._charm.on[self._relation_name].relation_changed, - self._on_grafana_dashboard_relation_changed, - ) - - def add_dashboard(self, content: str) -> None: - """Add a dashboard to the relation managed by this :class:`GrafanaDashboardProvider`. - - Args: - content: a string representing a Jinja template. Currently, no - global variables are added to the Jinja template evaluation - context. - """ - # Update of storage must be done irrespective of leadership, so - # that the stored state is there when this unit becomes leader. - stored_dashboard_templates = self._stored.dashboard_templates - - encoded_dashboard = _encode_dashboard_content(content) - - # Use as id the first chars of the encoded dashboard, so that - # it is predictable across units. - id = "prog:{}".format(encoded_dashboard[-24:-16]) - stored_dashboard_templates[id] = self._content_to_dashboard_object(encoded_dashboard) - - if self._charm.unit.is_leader(): - for dashboard_relation in self._charm.model.relations[self._relation_name]: - self._upset_dashboards_on_relation(dashboard_relation) - - def remove_non_builtin_dashboards(self) -> None: - """Remove all dashboards to the relation added via :method:`add_dashboard`.""" - # Update of storage must be done irrespective of leadership, so - # that the stored state is there when this unit becomes leader. - stored_dashboard_templates = self._stored.dashboard_templates - - for dashboard_id in list(stored_dashboard_templates.keys()): - if dashboard_id.startswith("prog:"): - del stored_dashboard_templates[dashboard_id] - self._stored.dashboard_templates = stored_dashboard_templates - - if self._charm.unit.is_leader(): - for dashboard_relation in self._charm.model.relations[self._relation_name]: - self._upset_dashboards_on_relation(dashboard_relation) - - def update_dashboards(self) -> None: - """Trigger the re-evaluation of the data on all relations.""" - if self._charm.unit.is_leader(): - for dashboard_relation in self._charm.model.relations[self._relation_name]: - self._upset_dashboards_on_relation(dashboard_relation) - - def _update_all_dashboards_from_dir(self, _: Optional[HookEvent] = None) -> None: - """Scans the built-in dashboards and updates relations with changes.""" - # Update of storage must be done irrespective of leadership, so - # that the stored state is there when this unit becomes leader. - - # Ensure we do not leave outdated dashboards by removing from stored all - # the encoded dashboards that start with "file/". 
- if self._dashboards_path: - stored_dashboard_templates = self._stored.dashboard_templates - - for dashboard_id in list(stored_dashboard_templates.keys()): - if dashboard_id.startswith("file:"): - del stored_dashboard_templates[dashboard_id] - - for path in filter(Path.is_file, Path(self._dashboards_path).glob("*.tmpl")): - id = "file:{}".format(path.stem) - stored_dashboard_templates[id] = self._content_to_dashboard_object( - _encode_dashboard_content(path.read_bytes()) - ) - - self._stored.dashboard_templates = stored_dashboard_templates - - if self._charm.unit.is_leader(): - for dashboard_relation in self._charm.model.relations[self._relation_name]: - self._upset_dashboards_on_relation(dashboard_relation) - - def _reinitialize_dashboard_data(self) -> None: - """Triggers a reload of dashboard outside of an eventing workflow. - - This will destroy any existing relation data. - """ - try: - _resolve_dir_against_charm_path(self._charm, self._dashboards_path) - self._update_all_dashboards_from_dir() - - except InvalidDirectoryPathError as e: - logger.warning( - "Invalid Grafana dashboards folder at %s: %s", - e.grafana_dashboards_absolute_path, - e.message, - ) - stored_dashboard_templates = self._stored.dashboard_templates - - for dashboard_id in list(stored_dashboard_templates.keys()): - if dashboard_id.startswith("file:"): - del stored_dashboard_templates[dashboard_id] - self._stored.dashboard_templates = stored_dashboard_templates - - # With all of the file-based dashboards cleared out, force a refresh - # of relation data - if self._charm.unit.is_leader(): - for dashboard_relation in self._charm.model.relations[self._relation_name]: - self._upset_dashboards_on_relation(dashboard_relation) - - def _on_grafana_dashboard_relation_created(self, event: RelationCreatedEvent) -> None: - """Watch for a relation being created and automatically send dashboards. - - Args: - event: The :class:`RelationJoinedEvent` sent when a - `grafana_dashboaard` relationship is joined - """ - if self._charm.unit.is_leader(): - self._upset_dashboards_on_relation(event.relation) - - def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> None: - """Watch for changes so we know if there's an error to signal back to the parent charm. - - Args: - event: The `RelationChangedEvent` that triggered this handler. 
- """ - if self._charm.unit.is_leader(): - data = json.loads(event.relation.data[event.app].get("event", "{}")) - - if not data: - return - - valid = bool(data.get("valid", True)) - errors = data.get("errors", []) - if valid and not errors: - self.on.dashboard_status_changed.emit(valid=valid) - else: - self.on.dashboard_status_changed.emit(valid=valid, errors=errors) - - def _upset_dashboards_on_relation(self, relation: Relation) -> None: - """Update the dashboards in the relation data bucket.""" - # It's completely ridiculous to add a UUID, but if we don't have some - # pseudo-random value, this never makes it across 'juju set-state' - stored_data = { - "templates": _type_convert_stored(self._stored.dashboard_templates), - "uuid": str(uuid.uuid4()), - } - - relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) - - def _content_to_dashboard_object(self, content: str) -> Dict: - return { - "charm": self._charm.meta.name, - "content": content, - "juju_topology": self._juju_topology, - } - - # This is not actually used in the dashboards, but is present to provide a secondary - # salt to ensure uniqueness in the dict keys in case individual charm units provide - # dashboards - @property - def _juju_topology(self) -> Dict: - return { - "model": self._charm.model.name, - "model_uuid": self._charm.model.uuid, - "application": self._charm.app.name, - "unit": self._charm.unit.name, - } - - @property - def dashboard_templates(self) -> List: - """Return a list of the known dashboard templates.""" - return [v for v in self._stored.dashboard_templates.values()] - - -class GrafanaDashboardConsumer(Object): - """A consumer object for working with Grafana Dashboards.""" - - on = GrafanaDashboardEvents() - _stored = StoredState() - - def __init__(self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME) -> None: - """API to receive Grafana dashboards from charmed operators. - - The :class:`GrafanaDashboardConsumer` object provides an API - to consume dashboards provided by a charmed operator using the - :class:`GrafanaDashboardProvider` library. The - :class:`GrafanaDashboardConsumer` is integrated in a - charmed operator as follows: - - self.grafana = GrafanaDashboardConsumer(self) - - To use this library, you need a relation defined as follows in - your charm operator's metadata.yaml: - - requires: - grafana-dashboard: - interface: grafana_dashboard - - If you would like to use a different relation name than - `grafana-dashboard`, you need to specify the relation name via the - `relation_name` argument. However, it is strongly advised not to - change the default, so that people deploying your charm will have - a consistent experience with all other charms that consume Grafana - dashboards. - - Args: - charm: a :class:`CharmBase` object which manages this - :class:`GrafanaProvider` object. Generally this is - `self` in the instantiating class. - relation_name: a :string: name of the relation managed by this - :class:`GrafanaDashboardConsumer`; it defaults to "grafana-dashboard". 
- """ - _validate_relation_by_interface_and_direction( - charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires - ) - - super().__init__(charm, relation_name) - self._charm = charm - self._relation_name = relation_name - - self._stored.set_default(dashboards=dict()) - - self.framework.observe( - self._charm.on[self._relation_name].relation_changed, - self._on_grafana_dashboard_relation_changed, - ) - self.framework.observe( - self._charm.on[self._relation_name].relation_broken, - self._on_grafana_dashboard_relation_broken, - ) - - def get_dashboards_from_relation(self, relation_id: int) -> List: - """Get a list of known dashboards for one instance of the monitored relation. - - Args: - relation_id: the identifier of the relation instance, as returned by - :method:`ops.model.Relation.id`. - - Returns: a list of known dashboards coming from the provided relation instance. - """ - return [ - self._to_external_object(relation_id, dashboard) - for dashboard in self._stored.dashboards.get(relation_id, []) - ] - - def _on_grafana_dashboard_relation_changed(self, event: RelationChangedEvent) -> None: - """Handle relation changes in related providers. - - If there are changes in relations between Grafana dashboard consumers - and providers, this event handler (if the unit is the leader) will - get data for an incoming grafana-dashboard relation through a - :class:`GrafanaDashboardsChanged` event, and make the relation data - available in the app's datastore object. The Grafana charm can - then respond to the event to update its configuration. - """ - # TODO Are we sure this is right? It sounds like every Grafana unit - # should create files with the dashboards in its container. - if not self._charm.unit.is_leader(): - return - - self._render_dashboards_and_emit_event(event.relation) - - def update_dashboards(self, relation: Optional[Relation] = None) -> None: - """Re-establish dashboards on one or more relations. - - If something changes between this library and a datasource, try to re-establish - invalid dashboards and invalidate active ones. - - Args: - relation: a specific relation for which the dashboards have to be - updated. If not specified, all relations managed by this - :class:`GrafanaDashboardConsumer` will be updated. - """ - if not self._charm.unit.is_leader(): - return - - relations = [relation] if relation else self._charm.model.relations[self._relation_name] - - for relation in relations: - self._render_dashboards_and_emit_event(relation) - - def _on_grafana_dashboard_relation_broken(self, event: RelationBrokenEvent) -> None: - """Update job config when providers depart. - - When a Grafana dashboard provider departs, the configuration - for that provider is removed from the list of dashboards - """ - if not self._charm.unit.is_leader(): - return - - self._remove_all_dashboards_for_relation(event.relation) - - def _render_dashboards_and_emit_event(self, relation: Relation) -> None: - """Validate a given dashboard. - - Verify that the passed dashboard data is able to be found in our list - of datasources and will render. If they do, let the charm know by - emitting an event. - - Args: - relation: Relation; The relation the dashboard is associated with. 
- """ - other_app = relation.app - - raw_data = relation.data[other_app].get("dashboards", {}) - - if not raw_data: - logger.warning( - "No dashboard data found in the %s:%s relation", - self._relation_name, - str(relation.id), - ) - return - - data = json.loads(raw_data) - - # The only piece of data needed on this side of the relations is "templates" - templates = data.pop("templates") - - # Import only if a charmed operator uses the consumer, we don't impose these - # dependencies on the client - from jinja2 import Template # type: ignore - from jinja2.exceptions import TemplateSyntaxError # type: ignore - - # The dashboards are WAY too big since this ultimately calls out to Juju to - # set the relation data, and it overflows the maximum argument length for - # subprocess, so we have to use b64, annoyingly. - # Worse, Python3 expects absolutely everything to be a byte, and a plain - # `base64.b64encode()` is still too large, so we have to go through hoops - # of encoding to byte, compressing with lzma, converting to base64 so it - # can be converted to JSON, then all the way back. - - rendered_dashboards = [] - relation_has_invalid_dashboards = False - - for _, (fname, template) in enumerate(templates.items()): - decoded_content = _decode_dashboard_content(template["content"]) - - content = None - error = None - try: - content = Template(decoded_content).render() - content = _encode_dashboard_content(_inject_dashboard_dropdowns(content)) - except TemplateSyntaxError as e: - error = str(e) - relation_has_invalid_dashboards = True - - # Prepend the relation name and ID to the dashboard ID to avoid clashes with - # multiple relations with apps from the same charm, or having dashboards with - # the same ids inside their charm operators - rendered_dashboards.append( - { - "id": "{}:{}/{}".format(relation.name, relation.id, fname), - "original_id": fname, - "content": content if content else None, - "template": template, - "valid": (error is None), - "error": error, - } - ) - - if relation_has_invalid_dashboards: - self._remove_all_dashboards_for_relation(relation) - - invalid_templates = [ - data["original_id"] for data in rendered_dashboards if not data["valid"] - ] - - logger.warning( - "Cannot add one or more Grafana dashboards from relation '{}:{}': the following " - "templates are invalid: {}".format( - relation.name, - relation.id, - invalid_templates, - ) - ) - - relation.data[self._charm.app]["event"] = json.dumps( - { - "errors": [ - { - "dashboard_id": rendered_dashboard["original_id"], - "error": rendered_dashboard["error"], - } - for rendered_dashboard in rendered_dashboards - if rendered_dashboard["error"] - ] - } - ) - - # Dropping dashboards for a relation needs to be signalled - self.on.dashboards_changed.emit() - else: - stored_data = rendered_dashboards - currently_stored_data = self._stored.dashboards.get(relation.id, {}) - - coerced_data = ( - _type_convert_stored(currently_stored_data) if currently_stored_data else {} - ) - - if not coerced_data == stored_data: - self._stored.dashboards[relation.id] = stored_data - self.on.dashboards_changed.emit() - - def _remove_all_dashboards_for_relation(self, relation: Relation) -> None: - """If an errored dashboard is in stored data, remove it and trigger a deletion.""" - if self._stored.dashboards.pop(relation.id, None): - self.on.dashboards_changed.emit() - - def _to_external_object(self, relation_id, dashboard): - return { - "id": dashboard["original_id"], - "relation_id": relation_id, - "charm": dashboard["template"]["charm"], - 
"content": _decode_dashboard_content(dashboard["content"]), - } - - @property - def dashboards(self) -> List[Dict]: - """Get a list of known dashboards across all instances of the monitored relation. - - Returns: a list of known dashboards. The JSON of each of the dashboards is available - in the `content` field of the corresponding `dict`. - """ - dashboards = [] - - for _, (relation_id, dashboards_for_relation) in enumerate( - self._stored.dashboards.items() - ): - for dashboard in dashboards_for_relation: - dashboards.append(self._to_external_object(relation_id, dashboard)) - - return dashboards - - -class GrafanaDashboardAggregator(Object): - """API to retrieve Grafana dashboards from machine dashboards. - - The :class:`GrafanaDashboardAggregator` object provides a way to - collate and aggregate Grafana dashboards from reactive/machine charms - and transport them into Charmed Operators, using Juju topology. - - For detailed usage instructions, see the documentation for - :module:`lma-proxy-operator`, as this class is intended for use as a - single point of intersection rather than use in individual charms. - - Since :class:`GrafanaDashboardAggregator` serves as a bridge between - Canonical Observability Stack Charmed Operators and Reactive Charms, - deployed in a Reactive Juju model, both a target relation which is - used to collect events from Reactive charms and a `grafana_relation` - which is used to send the collected data back to the Canonical - Observability Stack are required. - - In its most streamlined usage, :class:`GrafanaDashboardAggregator` is - integrated in a charmed operator as follows: - - self.grafana = GrafanaDashboardAggregator(self) - - Args: - charm: a :class:`CharmBase` object which manages this - :class:`GrafanaProvider` object. Generally this is - `self` in the instantiating class. - target_relation: a :string: name of a relation managed by this - :class:`GrafanaDashboardAggregator`, which is used to communicate - with reactive/machine charms it defaults to "dashboards". - grafana_relation: a :string: name of a relation used by this - :class:`GrafanaDashboardAggregator`, which is used to communicate - with charmed grafana. 
It defaults to "downstream-grafana-dashboard" - """ - - _stored = StoredState() - on = GrafanaProviderEvents() - - def __init__( - self, - charm: CharmBase, - target_relation: str = "dashboards", - grafana_relation: str = "downstream-grafana-dashboard", - ): - super().__init__(charm, grafana_relation) - self._stored.set_default( - dashboard_templates={}, - id_mappings={}, - ) - - self._charm = charm - self._target_relation = target_relation - self._grafana_relation = grafana_relation - - self.framework.observe( - self._charm.on[self._grafana_relation].relation_joined, - self._update_remote_grafana, - ) - self.framework.observe( - self._charm.on[self._grafana_relation].relation_changed, - self._update_remote_grafana, - ) - self.framework.observe( - self._charm.on[self._target_relation].relation_changed, - self.update_dashboards, - ) - self.framework.observe( - self._charm.on[self._target_relation].relation_broken, - self.remove_dashboards, - ) - - def update_dashboards(self, event: RelationEvent) -> None: - """If we get a dashboard from a reactive charm, parse it out and update.""" - if self._charm.unit.is_leader(): - self._upset_dashboards_on_event(event) - - def _upset_dashboards_on_event(self, event: RelationEvent) -> None: - """Update the dashboards in the relation data bucket.""" - dashboards = self._handle_reactive_dashboards(event) - - if not dashboards: - logger.warning( - "Could not find dashboard data after a relation change for {}".format(event.app) - ) - return - - for id in dashboards: - self._stored.dashboard_templates[id] = self._content_to_dashboard_object( - dashboards[id], event - ) - - self._stored.id_mappings[event.app.name] = dashboards - self._update_remote_grafana(event) - - def _update_remote_grafana(self, _: Optional[RelationEvent] = None) -> None: - """Push dashboards to the downstream Grafana relation.""" - # It's still ridiculous to add a UUID here, but needed - stored_data = { - "templates": _type_convert_stored(self._stored.dashboard_templates), - "uuid": str(uuid.uuid4()), - } - - for grafana_relation in self.model.relations[self._grafana_relation]: - grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) - - def remove_dashboards(self, event: RelationBrokenEvent) -> None: - """Remove a dashboard if the relation is broken.""" - app_ids = _type_convert_stored(self._stored.id_mappings[event.app.name]) - - del self._stored.id_mappings[event.app.name] - for id in app_ids: - del self._stored.dashboard_templates[id] - - stored_data = { - "templates": _type_convert_stored(self._stored.dashboard_templates), - "uuid": str(uuid.uuid4()), - } - - for grafana_relation in self.model.relations[self._grafana_relation]: - grafana_relation.data[self._charm.app]["dashboards"] = json.dumps(stored_data) - - # Yes, this has a fair amount of branching. It's not that complex, though - def _strip_existing_datasources(self, template: dict) -> dict: # noqa: C901 - """Remove existing reactive charm datasource templating out. - - This method iterates through *known* places where reactive charms may set - data in contributed dashboards and removes them. - - `dashboard["__inputs"]` is a property sometimes set when exporting dashboards from - the Grafana UI. It is not present in earlier Grafana versions, and can be disabled - in 5.3.4 and above (optionally). If set, any values present will be substituted on - import. Some reactive charms use this for Prometheus. 
LMA2 uses dropdown selectors - for datasources, and leaving this present results in "default" datasource values - which are broken. - - Similarly, `dashboard["templating"]["list"][N]["name"] == "host"` can be used to - set a `host` variable for use in dashboards which is not meaningful in the context - of Juju topology and will yield broken dashboards. - - Further properties may be discovered. - """ - dash = template["dashboard"] - try: - if "list" in dash["templating"]: - for i in range(len(dash["templating"]["list"])): - if ( - "datasource" in dash["templating"]["list"][i] - and "Juju" in dash["templating"]["list"][i]["datasource"] - ): - dash["templating"]["list"][i]["datasource"] = r"${prometheusds}" - if ( - "name" in dash["templating"]["list"][i] - and dash["templating"]["list"][i]["name"] == "host" - ): - dash["templating"]["list"][i] = REACTIVE_CONVERTER - except KeyError: - logger.debug("No existing templating data in dashboard") - - if "__inputs" in dash: - inputs = dash - for i in range(len(dash["__inputs"])): - if dash["__inputs"][i]["pluginName"] == "Prometheus": - del inputs["__inputs"][i] - if inputs: - dash["__inputs"] = inputs["__inputs"] - else: - del dash["__inputs"] - - template["dashboard"] = dash - return template - - def _handle_reactive_dashboards(self, event: RelationEvent) -> Optional[Dict]: - """Look for a dashboard in relation data (during a reactive hook) or builtin by name.""" - templates = [] - id = "" - - # Reactive data can reliably be pulled out of events. In theory, if we got an event, - # it's on the bucket, but using event explicitly keeps the mental model in - # place for reactive - for k in event.relation.data[event.unit].keys(): - if k.startswith("request_"): - templates.append(json.loads(event.relation.data[event.unit][k])["dashboard"]) - - for k in event.relation.data[event.app].keys(): - if k.startswith("request_"): - templates.append(json.loads(event.relation.data[event.app][k])["dashboard"]) - - builtins = self._maybe_get_builtin_dashboards(event) - - if not templates and not builtins: - return {} - - dashboards = {} - for t in templates: - # Replace values with LMA-style templating - t = self._strip_existing_datasources(t) - - # This seems ridiculous, too, but to get it from a "dashboards" key in serialized JSON - # in the bucket back out to the actual "dashboard" we _need_, this is the way - # This is not a mistake -- there's a double nesting in reactive charms, and - # Grafana won't load it. We have to unbox: - # event.relation.data[event.]["request_*"]["dashboard"]["dashboard"], - # and the final unboxing is below. - dash = json.dumps(t["dashboard"]) - - # Replace the old-style datasource templates - dash = re.sub(r"<< datasource >>", r"${prometheusds}", dash) - dash = re.sub(r'"datasource": "prom.*?"', r'"datasource": "${prometheusds}"', dash) - - from jinja2 import Template - - content = _encode_dashboard_content( - Template(dash).render(host=event.unit.name, datasource="prometheus") - ) - id = "prog:{}".format(content[-24:-16]) - - dashboards[id] = content - return {**builtins, **dashboards} - - def _maybe_get_builtin_dashboards(self, event: RelationEvent) -> Dict: - """Tries to match the event with an included dashboard. - - Scans dashboards packed with the charm instantiating this class, and tries to match - one with the event. There is no guarantee that any given event will match a builtin, - since each charm instantiating this class may include a different set of dashboards, - or none. 
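To make the datasource rewriting and payload handling described in `_handle_reactive_dashboards` above concrete, the following standalone sketch (not library code) applies the same substitutions and an lzma-plus-base64 round trip of the kind the comments mention. The helper names are local stand-ins for the library's `_encode_dashboard_content` and `_decode_dashboard_content`, and the sample dashboard is invented.

```python
import base64
import json
import lzma
import re


def encode_dashboard_content(content: str) -> str:
    # Compress, then base64-encode, so the payload stays small enough for relation data
    return base64.b64encode(lzma.compress(content.encode("utf-8"))).decode("utf-8")


def decode_dashboard_content(encoded: str) -> str:
    return lzma.decompress(base64.b64decode(encoded.encode("utf-8"))).decode("utf-8")


dashboard = {"title": "demo", "panels": [{"datasource": "prometheus - Juju generated"}]}
dash = json.dumps(dashboard)

# Rewrite old-style datasource references to the ${prometheusds} dropdown variable
dash = re.sub(r"<< datasource >>", r"${prometheusds}", dash)
dash = re.sub(r'"datasource": "prom.*?"', r'"datasource": "${prometheusds}"', dash)

encoded = encode_dashboard_content(dash)
decoded = json.loads(decode_dashboard_content(encoded))
assert decoded["panels"][0]["datasource"] == "${prometheusds}"
```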
- """ - builtins = {} - dashboards_path = None - - try: - dashboards_path = _resolve_dir_against_charm_path( - self._charm, "src/grafana_dashboards" - ) - except InvalidDirectoryPathError as e: - logger.warning( - "Invalid Grafana dashboards folder at %s: %s", - e.grafana_dashboards_absolute_path, - e.message, - ) - - if dashboards_path: - for path in filter(Path.is_file, Path(dashboards_path).glob("*.tmpl")): - if event.app.name in path.name: - id = "file:{}".format(path.stem) - builtins[id] = self._content_to_dashboard_object( - _encode_dashboard_content(path.read_bytes()), event - ) - - return builtins - - def _content_to_dashboard_object(self, content: str, event: RelationEvent) -> Dict: - return { - "charm": event.app.name, - "content": content, - "juju_topology": self._juju_topology(event), - } - - # This is not actually used in the dashboards, but is present to provide a secondary - # salt to ensure uniqueness in the dict keys in case individual charm units provide - # dashboards - def _juju_topology(self, event: RelationEvent) -> Dict: - return { - "model": self._charm.model.name, - "model_uuid": self._charm.model.uuid, - "application": event.app.name, - "unit": event.unit.name, - } diff --git a/lib/charms/kafka_k8s/v0/kafka.py b/lib/charms/kafka_k8s/v0/kafka.py deleted file mode 100644 index 3439a64f..00000000 --- a/lib/charms/kafka_k8s/v0/kafka.py +++ /dev/null @@ -1,200 +0,0 @@ -# Copyright 2022 Canonical Ltd. -# See LICENSE file for licensing details. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -# implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Kafka library. - -This [library](https://juju.is/docs/sdk/libraries) implements both sides of the -`kafka` [interface](https://juju.is/docs/sdk/relations). - -The *provider* side of this interface is implemented by the -[kafka-k8s Charmed Operator](https://charmhub.io/kafka-k8s). - -Any Charmed Operator that *requires* Kafka for providing its -service should implement the *requirer* side of this interface. - -In a nutshell using this library to implement a Charmed Operator *requiring* -Kafka would look like - -``` -$ charmcraft fetch-lib charms.kafka_k8s.v0.kafka -``` - -`metadata.yaml`: - -``` -requires: - kafka: - interface: kafka - limit: 1 -``` - -`src/charm.py`: - -``` -from charms.kafka_k8s.v0.kafka import KafkaEvents, KafkaRequires -from ops.charm import CharmBase - - -class MyCharm(CharmBase): - - on = KafkaEvents() - - def __init__(self, *args): - super().__init__(*args) - self.kafka = KafkaRequires(self) - self.framework.observe( - self.on.kafka_available, - self._on_kafka_available, - ) - self.framework.observe( - self.on["kafka"].relation_broken, - self._on_kafka_broken, - ) - - def _on_kafka_available(self, event): - # Get Kafka host and port - host: str = self.kafka.host - port: int = self.kafka.port - # host => "kafka-k8s" - # port => 9092 - - def _on_kafka_broken(self, event): - # Stop service - # ... - self.unit.status = BlockedStatus("need kafka relation") -``` - -You can file bugs -[here](https://github.com/canonical/kafka-k8s-operator/issues)! 
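The usage example above covers only the requirer side. As a hedged sketch (illustrative, not taken from this repository), the provides side of the same interface, using the `KafkaProvides` class defined later in this file, could publish the connection details as follows; note that only the leader may write application relation data:

```python
from charms.kafka_k8s.v0.kafka import KafkaProvides
from ops.charm import CharmBase
from ops.main import main


class KafkaK8sCharm(CharmBase):
    def __init__(self, *args):
        super().__init__(*args)
        self.kafka_provider = KafkaProvides(self)
        self.framework.observe(
            self.on["kafka"].relation_joined, self._on_kafka_relation_joined
        )

    def _on_kafka_relation_joined(self, event):
        # set_host_info() writes application relation data, so only the leader may call it
        if self.unit.is_leader():
            self.kafka_provider.set_host_info("kafka-k8s", 9092, event.relation)


if __name__ == "__main__":
    main(KafkaK8sCharm)
```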
-""" - -from typing import Optional - -from ops.charm import CharmBase, CharmEvents -from ops.framework import EventBase, EventSource, Object - -# The unique Charmhub library identifier, never change it -from ops.model import Relation - -LIBID = "eacc8c85082347c9aae740e0220b8376" - -# Increment this major API version when introducing breaking changes -LIBAPI = 0 - -# Increment this PATCH version before using `charmcraft publish-lib` or reset -# to 0 if you are raising the major API version -LIBPATCH = 4 - - -KAFKA_HOST_APP_KEY = "host" -KAFKA_PORT_APP_KEY = "port" - - -class _KafkaAvailableEvent(EventBase): - """Event emitted when Kafka is available.""" - - -class KafkaEvents(CharmEvents): - """Kafka events. - - This class defines the events that Kafka can emit. - - Events: - kafka_available (_KafkaAvailableEvent) - """ - - kafka_available = EventSource(_KafkaAvailableEvent) - - -class KafkaRequires(Object): - """Requires-side of the Kafka relation.""" - - def __init__(self, charm: CharmBase, endpoint_name: str = "kafka") -> None: - super().__init__(charm, endpoint_name) - self.charm = charm - self._endpoint_name = endpoint_name - - # Observe relation events - event_observe_mapping = { - charm.on[self._endpoint_name].relation_changed: self._on_relation_changed, - } - for event, observer in event_observe_mapping.items(): - self.framework.observe(event, observer) - - def _on_relation_changed(self, event) -> None: - if event.relation.app and all( - key in event.relation.data[event.relation.app] - for key in (KAFKA_HOST_APP_KEY, KAFKA_PORT_APP_KEY) - ): - self.charm.on.kafka_available.emit() - - @property - def host(self) -> str: - """Get kafka hostname.""" - relation: Relation = self.model.get_relation(self._endpoint_name) - return ( - relation.data[relation.app].get(KAFKA_HOST_APP_KEY) - if relation and relation.app - else None - ) - - @property - def port(self) -> int: - """Get kafka port number.""" - relation: Relation = self.model.get_relation(self._endpoint_name) - return ( - int(relation.data[relation.app].get(KAFKA_PORT_APP_KEY)) - if relation and relation.app - else None - ) - - -class KafkaProvides(Object): - """Provides-side of the Kafka relation.""" - - def __init__(self, charm: CharmBase, endpoint_name: str = "kafka") -> None: - super().__init__(charm, endpoint_name) - self._endpoint_name = endpoint_name - - def set_host_info(self, host: str, port: int, relation: Optional[Relation] = None) -> None: - """Set Kafka host and port. - - This function writes in the application data of the relation, therefore, - only the unit leader can call it. - - Args: - host (str): Kafka hostname or IP address. - port (int): Kafka port. - relation (Optional[Relation]): Relation to update. - If not specified, all relations will be updated. - - Raises: - Exception: if a non-leader unit calls this function. 
- """ - if not self.model.unit.is_leader(): - raise Exception("only the leader set host information.") - - if relation: - self._update_relation_data(host, port, relation) - return - - for relation in self.model.relations[self._endpoint_name]: - self._update_relation_data(host, port, relation) - - def _update_relation_data(self, host: str, port: int, relation: Relation) -> None: - """Update data in relation if needed.""" - relation.data[self.model.app][KAFKA_HOST_APP_KEY] = host - relation.data[self.model.app][KAFKA_PORT_APP_KEY] = str(port) diff --git a/lib/charms/observability_libs/v1/kubernetes_service_patch.py b/lib/charms/observability_libs/v1/kubernetes_service_patch.py deleted file mode 100644 index d1d7eb22..00000000 --- a/lib/charms/observability_libs/v1/kubernetes_service_patch.py +++ /dev/null @@ -1,290 +0,0 @@ -# Copyright 2021 Canonical Ltd. -# See LICENSE file for licensing details. - -"""# KubernetesServicePatch Library. - -This library is designed to enable developers to more simply patch the Kubernetes Service created -by Juju during the deployment of a sidecar charm. When sidecar charms are deployed, Juju creates a -service named after the application in the namespace (named after the Juju model). This service by -default contains a "placeholder" port, which is 65536/TCP. - -When modifying the default set of resources managed by Juju, one must consider the lifecycle of the -charm. In this case, any modifications to the default service (created during deployment), will be -overwritten during a charm upgrade. - -When initialised, this library binds a handler to the parent charm's `install` and `upgrade_charm` -events which applies the patch to the cluster. This should ensure that the service ports are -correct throughout the charm's life. - -The constructor simply takes a reference to the parent charm, and a list of -[`lightkube`](https://github.com/gtsystem/lightkube) ServicePorts that each define a port for the -service. For information regarding the `lightkube` `ServicePort` model, please visit the -`lightkube` [docs](https://gtsystem.github.io/lightkube-models/1.23/models/core_v1/#serviceport). - -Optionally, a name of the service (in case service name needs to be patched as well), labels, -selectors, and annotations can be provided as keyword arguments. - -## Getting Started - -To get started using the library, you just need to fetch the library using `charmcraft`. **Note -that you also need to add `lightkube` and `lightkube-models` to your charm's `requirements.txt`.** - -```shell -cd some-charm -charmcraft fetch-lib charms.observability_libs.v0.kubernetes_service_patch -echo <<-EOF >> requirements.txt -lightkube -lightkube-models -EOF -``` - -Then, to initialise the library: - -For `ClusterIP` services: - -```python -# ... -from charms.observability_libs.v0.kubernetes_service_patch import KubernetesServicePatch -from lightkube.models.core_v1 import ServicePort - -class SomeCharm(CharmBase): - def __init__(self, *args): - # ... - port = ServicePort(443, name=f"{self.app.name}") - self.service_patcher = KubernetesServicePatch(self, [port]) - # ... -``` - -For `LoadBalancer`/`NodePort` services: - -```python -# ... -from charms.observability_libs.v0.kubernetes_service_patch import KubernetesServicePatch -from lightkube.models.core_v1 import ServicePort - -class SomeCharm(CharmBase): - def __init__(self, *args): - # ... 
- port = ServicePort(443, name=f"{self.app.name}", targetPort=443, nodePort=30666) - self.service_patcher = KubernetesServicePatch( - self, [port], "LoadBalancer" - ) - # ... -``` - -Port protocols can also be specified. Valid protocols are `"TCP"`, `"UDP"`, and `"SCTP"` - -```python -# ... -from charms.observability_libs.v0.kubernetes_service_patch import KubernetesServicePatch -from lightkube.models.core_v1 import ServicePort - -class SomeCharm(CharmBase): - def __init__(self, *args): - # ... - tcp = ServicePort(443, name=f"{self.app.name}-tcp", protocol="TCP") - udp = ServicePort(443, name=f"{self.app.name}-udp", protocol="UDP") - sctp = ServicePort(443, name=f"{self.app.name}-sctp", protocol="SCTP") - self.service_patcher = KubernetesServicePatch(self, [tcp, udp, sctp]) - # ... -``` - -Additionally, you may wish to use mocks in your charm's unit testing to ensure that the library -does not try to make any API calls, or open any files during testing that are unlikely to be -present, and could break your tests. The easiest way to do this is during your test `setUp`: - -```python -# ... - -@patch("charm.KubernetesServicePatch", lambda x, y: None) -def setUp(self, *unused): - self.harness = Harness(SomeCharm) - # ... -``` -""" - -import logging -from types import MethodType -from typing import List, Literal - -from lightkube import ApiError, Client -from lightkube.models.core_v1 import ServicePort, ServiceSpec -from lightkube.models.meta_v1 import ObjectMeta -from lightkube.resources.core_v1 import Service -from lightkube.types import PatchType -from ops.charm import CharmBase -from ops.framework import Object - -logger = logging.getLogger(__name__) - -# The unique Charmhub library identifier, never change it -LIBID = "0042f86d0a874435adef581806cddbbb" - -# Increment this major API version when introducing breaking changes -LIBAPI = 1 - -# Increment this PATCH version before using `charmcraft publish-lib` or reset -# to 0 if you are raising the major API version -LIBPATCH = 1 - -ServiceType = Literal["ClusterIP", "LoadBalancer"] - - -class KubernetesServicePatch(Object): - """A utility for patching the Kubernetes service set up by Juju.""" - - def __init__( - self, - charm: CharmBase, - ports: List[ServicePort], - service_name: str = None, - service_type: ServiceType = "ClusterIP", - additional_labels: dict = None, - additional_selectors: dict = None, - additional_annotations: dict = None, - ): - """Constructor for KubernetesServicePatch. - - Args: - charm: the charm that is instantiating the library. - ports: a list of ServicePorts - service_name: allows setting custom name to the patched service. If none given, - application name will be used. - service_type: desired type of K8s service. Default value is in line with ServiceSpec's - default value. - additional_labels: Labels to be added to the kubernetes service (by default only - "app.kubernetes.io/name" is set to the service name) - additional_selectors: Selectors to be added to the kubernetes service (by default only - "app.kubernetes.io/name" is set to the service name) - additional_annotations: Annotations to be added to the kubernetes service. 
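The constructor arguments listed above that the earlier examples do not exercise (a custom service name, extra labels, annotations) can be combined as in the following hedged sketch; the values are invented, and the import path is assumed to follow this file's `v1` location:

```python
from charms.observability_libs.v1.kubernetes_service_patch import KubernetesServicePatch
from lightkube.models.core_v1 import ServicePort
from ops.charm import CharmBase


class SomeCharm(CharmBase):
    def __init__(self, *args):
        super().__init__(*args)
        port = ServicePort(9092, name=f"{self.app.name}-broker")
        self.service_patcher = KubernetesServicePatch(
            self,
            [port],
            service_name=f"{self.app.name}-external",  # also rename the patched service
            additional_labels={"app.kubernetes.io/part-of": self.app.name},
            additional_annotations={"example.com/owner": "platform-team"},
        )
```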
- """ - super().__init__(charm, "kubernetes-service-patch") - self.charm = charm - self.service_name = service_name if service_name else self._app - self.service = self._service_object( - ports, - service_name, - service_type, - additional_labels, - additional_selectors, - additional_annotations, - ) - - # Make mypy type checking happy that self._patch is a method - assert isinstance(self._patch, MethodType) - # Ensure this patch is applied during the 'install' and 'upgrade-charm' events - self.framework.observe(charm.on.install, self._patch) - self.framework.observe(charm.on.upgrade_charm, self._patch) - - def _service_object( - self, - ports: List[ServicePort], - service_name: str = None, - service_type: ServiceType = "ClusterIP", - additional_labels: dict = None, - additional_selectors: dict = None, - additional_annotations: dict = None, - ) -> Service: - """Creates a valid Service representation. - - Args: - ports: a list of ServicePorts - service_name: allows setting custom name to the patched service. If none given, - application name will be used. - service_type: desired type of K8s service. Default value is in line with ServiceSpec's - default value. - additional_labels: Labels to be added to the kubernetes service (by default only - "app.kubernetes.io/name" is set to the service name) - additional_selectors: Selectors to be added to the kubernetes service (by default only - "app.kubernetes.io/name" is set to the service name) - additional_annotations: Annotations to be added to the kubernetes service. - - Returns: - Service: A valid representation of a Kubernetes Service with the correct ports. - """ - if not service_name: - service_name = self._app - labels = {"app.kubernetes.io/name": self._app} - if additional_labels: - labels.update(additional_labels) - selector = {"app.kubernetes.io/name": self._app} - if additional_selectors: - selector.update(additional_selectors) - return Service( - apiVersion="v1", - kind="Service", - metadata=ObjectMeta( - namespace=self._namespace, - name=service_name, - labels=labels, - annotations=additional_annotations, # type: ignore[arg-type] - ), - spec=ServiceSpec( - selector=selector, - ports=ports, - type=service_type, - ), - ) - - def _patch(self, _) -> None: - """Patch the Kubernetes service created by Juju to map the correct port. - - Raises: - PatchFailed: if patching fails due to lack of permissions, or otherwise. - """ - if not self.charm.unit.is_leader(): - return - - client = Client() - try: - if self.service_name != self._app: - self._delete_and_create_service(client) - client.patch(Service, self.service_name, self.service, patch_type=PatchType.MERGE) - except ApiError as e: - if e.status.code == 403: - logger.error("Kubernetes service patch failed: `juju trust` this application.") - else: - logger.error("Kubernetes service patch failed: %s", str(e)) - else: - logger.info("Kubernetes service '%s' patched successfully", self._app) - - def _delete_and_create_service(self, client: Client): - service = client.get(Service, self._app, namespace=self._namespace) - service.metadata.name = self.service_name # type: ignore[attr-defined] - service.metadata.resourceVersion = service.metadata.uid = None # type: ignore[attr-defined] # noqa: E501 - client.delete(Service, self._app, namespace=self._namespace) - client.create(service) - - def is_patched(self) -> bool: - """Reports if the service patch has been applied. - - Returns: - bool: A boolean indicating if the service patch has been applied. 
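For reference, a standalone sketch (assumed values, no cluster access) of the kind of `Service` object that `_service_object` builds, together with the `(port, targetPort)` comparison that `is_patched` performs:

```python
from lightkube.models.core_v1 import ServicePort, ServiceSpec
from lightkube.models.meta_v1 import ObjectMeta
from lightkube.resources.core_v1 import Service

app_name = "kafka-k8s"  # hypothetical application name
ports = [ServicePort(9092, name=f"{app_name}-broker", protocol="TCP")]

service = Service(
    apiVersion="v1",
    kind="Service",
    metadata=ObjectMeta(
        namespace="kafka-model",  # hypothetical model/namespace name
        name=app_name,
        labels={"app.kubernetes.io/name": app_name},
    ),
    spec=ServiceSpec(
        selector={"app.kubernetes.io/name": app_name},
        ports=ports,
        type="ClusterIP",
    ),
)

# is_patched() effectively compares these pairs against the in-cluster service
expected_ports = [(p.port, p.targetPort) for p in service.spec.ports]
print(expected_ports)  # [(9092, None)]
```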
- """ - client = Client() - # Get the relevant service from the cluster - service = client.get(Service, name=self.service_name, namespace=self._namespace) - # Construct a list of expected ports, should the patch be applied - expected_ports = [(p.port, p.targetPort) for p in self.service.spec.ports] - # Construct a list in the same manner, using the fetched service - fetched_ports = [(p.port, p.targetPort) for p in service.spec.ports] # type: ignore[attr-defined] # noqa: E501 - return expected_ports == fetched_ports - - @property - def _app(self) -> str: - """Name of the current Juju application. - - Returns: - str: A string containing the name of the current Juju application. - """ - return self.charm.app.name - - @property - def _namespace(self) -> str: - """The Kubernetes namespace we're running in. - - Returns: - str: A string containing the name of the current Kubernetes namespace. - """ - with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") as f: - return f.read().strip() diff --git a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py b/lib/charms/prometheus_k8s/v0/prometheus_scrape.py deleted file mode 100644 index 994b4302..00000000 --- a/lib/charms/prometheus_k8s/v0/prometheus_scrape.py +++ /dev/null @@ -1,2261 +0,0 @@ -# Copyright 2021 Canonical Ltd. -# See LICENSE file for licensing details. -"""## Overview. - -This document explains how to integrate with the Prometheus charm -for the purpose of providing a metrics endpoint to Prometheus. It -also explains how alternative implementations of the Prometheus charms -may maintain the same interface and be backward compatible with all -currently integrated charms. Finally this document is the -authoritative reference on the structure of relation data that is -shared between Prometheus charms and any other charm that intends to -provide a scrape target for Prometheus. - -## Provider Library Usage - -This Prometheus charm interacts with its scrape targets using its -charm library. Charms seeking to expose metric endpoints for the -Prometheus charm, must do so using the `MetricsEndpointProvider` -object from this charm library. For the simplest use cases, using the -`MetricsEndpointProvider` object only requires instantiating it, -typically in the constructor of your charm (the one which exposes a -metrics endpoint). The `MetricsEndpointProvider` constructor requires -the name of the relation over which a scrape target (metrics endpoint) -is exposed to the Prometheus charm. This relation must use the -`prometheus_scrape` interface. By default address of the metrics -endpoint is set to the unit IP address, by each unit of the -`MetricsEndpointProvider` charm. These units set their address in -response to the `PebbleReady` event of each container in the unit, -since container restarts of Kubernetes charms can result in change of -IP addresses. The default name for the metrics endpoint relation is -`metrics-endpoint`. It is strongly recommended to use the same -relation name for consistency across charms and doing so obviates the -need for an additional constructor argument. The -`MetricsEndpointProvider` object may be instantiated as follows - - from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider - - def __init__(self, *args): - super().__init__(*args) - ... - self.metrics_endpoint = MetricsEndpointProvider(self) - ... - -Note that the first argument (`self`) to `MetricsEndpointProvider` is -always a reference to the parent (scrape target) charm. 
- -An instantiated `MetricsEndpointProvider` object will ensure that each -unit of its parent charm, is a scrape target for the -`MetricsEndpointConsumer` (Prometheus) charm. By default -`MetricsEndpointProvider` assumes each unit of the consumer charm -exports its metrics at a path given by `/metrics` on port 80. These -defaults may be changed by providing the `MetricsEndpointProvider` -constructor an optional argument (`jobs`) that represents a -Prometheus scrape job specification using Python standard data -structures. This job specification is a subset of Prometheus' own -[scrape -configuration](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config) -format but represented using Python data structures. More than one job -may be provided using the `jobs` argument. Hence `jobs` accepts a list -of dictionaries where each dictionary represents one `` -object as described in the Prometheus documentation. The currently -supported configuration subset is: `job_name`, `metrics_path`, -`static_configs` - -Suppose it is required to change the port on which scraped metrics are -exposed to 8000. This may be done by providing the following data -structure as the value of `jobs`. - -``` -[ - { - "static_configs": [ - { - "targets": ["*:8000"] - } - ] - } -] -``` - -The wildcard ("*") host specification implies that the scrape targets -will automatically be set to the host addresses advertised by each -unit of the consumer charm. - -It is also possible to change the metrics path and scrape multiple -ports, for example - -``` -[ - { - "metrics_path": "/my-metrics-path", - "static_configs": [ - { - "targets": ["*:8000", "*:8081"], - } - ] - } -] -``` - -More complex scrape configurations are possible. For example - -``` -[ - { - "static_configs": [ - { - "targets": ["10.1.32.215:7000", "*:8000"], - "labels": { - "some-key": "some-value" - } - } - ] - } -] -``` - -This example scrapes the target "10.1.32.215" at port 7000 in addition -to scraping each unit at port 8000. There is however one difference -between wildcard targets (specified using "*") and fully qualified -targets (such as "10.1.32.215"). The Prometheus charm automatically -associates labels with metrics generated by each target. These labels -localise the source of metrics within the Juju topology by specifying -its "model name", "model UUID", "application name" and "unit -name". However unit name is associated only with wildcard targets but -not with fully qualified targets. - -Multiple jobs with different metrics paths and labels are allowed, but -each job must be given a unique name. For example - -``` -[ - { - "job_name": "my-first-job", - "metrics_path": "one-path", - "static_configs": [ - { - "targets": ["*:7000"], - "labels": { - "some-key": "some-value" - } - } - ] - }, - { - "job_name": "my-second-job", - "metrics_path": "another-path", - "static_configs": [ - { - "targets": ["*:8000"], - "labels": { - "some-other-key": "some-other-value" - } - } - ] - } -] -``` - -It is also possible to configure other scrape related parameters using -these job specifications as described by the Prometheus -[documentation](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config). 
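For example, a job that also tunes scraping cadence (an illustrative spec, not taken from any charm) could look like:

```
[
    {
        "job_name": "my-tuned-job",
        "metrics_path": "/metrics",
        "scrape_interval": "30s",
        "scrape_timeout": "10s",
        "static_configs": [
            {
                "targets": ["*:8000"]
            }
        ]
    }
]
```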
-The permissible subset of job specific scrape configuration parameters -supported in a `MetricsEndpointProvider` job specification are: - -- `job_name` -- `metrics_path` -- `static_configs` -- `scrape_interval` -- `scrape_timeout` -- `proxy_url` -- `relabel_configs` -- `metrics_relabel_configs` -- `sample_limit` -- `label_limit` -- `label_name_length_limit` -- `label_value_length_limit` - -## Consumer Library Usage - -The `MetricsEndpointConsumer` object may be used by Prometheus -charms to manage relations with their scrape targets. For this -purposes a Prometheus charm needs to do two things - -1. Instantiate the `MetricsEndpointConsumer` object by providing it a -reference to the parent (Prometheus) charm and optionally the name of -the relation that the Prometheus charm uses to interact with scrape -targets. This relation must confirm to the `prometheus_scrape` -interface and it is strongly recommended that this relation be named -`metrics-endpoint` which is its default value. - -For example a Prometheus charm may instantiate the -`MetricsEndpointConsumer` in its constructor as follows - - from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointConsumer - - def __init__(self, *args): - super().__init__(*args) - ... - self.metrics_consumer = MetricsEndpointConsumer(self) - ... - -2. A Prometheus charm also needs to respond to the -`TargetsChangedEvent` event of the `MetricsEndpointConsumer` by adding itself as -an observer for these events, as in - - self.framework.observe( - self.metrics_consumer.on.targets_changed, - self._on_scrape_targets_changed, - ) - -In responding to the `TargetsChangedEvent` event the Prometheus -charm must update the Prometheus configuration so that any new scrape -targets are added and/or old ones removed from the list of scraped -endpoints. For this purpose the `MetricsEndpointConsumer` object -exposes a `jobs()` method that returns a list of scrape jobs. Each -element of this list is the Prometheus scrape configuration for that -job. In order to update the Prometheus configuration, the Prometheus -charm needs to replace the current list of jobs with the list provided -by `jobs()` as follows - - def _on_scrape_targets_changed(self, event): - ... - scrape_jobs = self.metrics_consumer.jobs() - for job in scrape_jobs: - prometheus_scrape_config.append(job) - ... - -## Alerting Rules - -This charm library also supports gathering alerting rules from all -related `MetricsEndpointProvider` charms and enabling corresponding alerts within the -Prometheus charm. Alert rules are automatically gathered by `MetricsEndpointProvider` -charms when using this library, from a directory conventionally named -`prometheus_alert_rules`. This directory must reside at the top level -in the `src` folder of the consumer charm. Each file in this directory -is assumed to be in one of two formats: -- the official prometheus alert rule format, conforming to the -[Prometheus docs](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) -- a single rule format, which is a simplified subset of the official format, -comprising a single alert rule per file, using the same YAML fields. - -The file name must have the `.rule` extension. - -An example of the contents of such a file in the custom single rule -format is shown below. - -``` -alert: HighRequestLatency -expr: job:request_latency_seconds:mean5m{my_key=my_value} > 0.5 -for: 10m -labels: - severity: Medium - type: HighLatency -annotations: - summary: High request latency for {{ $labels.instance }}. 
-``` - -The `MetricsEndpointProvider` will read all available alert rules and -also inject "filtering labels" into the alert expressions. The -filtering labels ensure that alert rules are localised to the metrics -provider charm's Juju topology (application, model and its UUID). Such -a topology filter is essential to ensure that alert rules submitted by -one provider charm generates alerts only for that same charm. When -alert rules are embedded in a charm, and the charm is deployed as a -Juju application, the alert rules from that application have their -expressions automatically updated to filter for metrics coming from -the units of that application alone. This remove risk of spurious -evaluation, e.g., when you have multiple deployments of the same charm -monitored by the same Prometheus. - -Not all alerts one may want to specify can be embedded in a -charm. Some alert rules will be specific to a user's use case. This is -the case, for example, of alert rules that are based on business -constraints, like expecting a certain amount of requests to a specific -API every five minutes. Such alert rules can be specified via the -[COS Config Charm](https://charmhub.io/cos-configuration-k8s), -which allows importing alert rules and other settings like dashboards -from a Git repository. - -Gathering alert rules and generating rule files within the Prometheus -charm is easily done using the `alerts()` method of -`MetricsEndpointConsumer`. Alerts generated by Prometheus will -automatically include Juju topology labels in the alerts. These labels -indicate the source of the alert. The following labels are -automatically included with each alert - -- `juju_model` -- `juju_model_uuid` -- `juju_application` - -## Relation Data - -The Prometheus charm uses both application and unit relation data to -obtain information regarding its scrape jobs, alert rules and scrape -targets. This relation data is in JSON format and it closely resembles -the YAML structure of Prometheus [scrape configuration] -(https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config). - -Units of Metrics provider charms advertise their names and addresses -over unit relation data using the `prometheus_scrape_unit_name` and -`prometheus_scrape_unit_address` keys. While the `scrape_metadata`, -`scrape_jobs` and `alert_rules` keys in application relation data -of Metrics provider charms hold eponymous information. 
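As a concrete sketch of the relation data shapes just described (values invented for illustration):

```python
import json

# Unit relation data, set by each metrics provider unit
unit_data = {
    "prometheus_scrape_unit_name": "my-charm/0",
    "prometheus_scrape_unit_address": "10.1.32.215",
}

# Application relation data, set by the metrics provider application
app_data = {
    "scrape_metadata": json.dumps(
        {
            "model": "demo",
            "model_uuid": "00000000-0000-4000-8000-000000000000",
            "application": "my-charm",
            "unit": "my-charm/0",
        }
    ),
    "scrape_jobs": json.dumps(
        [{"metrics_path": "/metrics", "static_configs": [{"targets": ["*:8000"]}]}]
    ),
    "alert_rules": json.dumps({"groups": []}),
}

assert json.loads(app_data["scrape_jobs"])[0]["static_configs"][0]["targets"] == ["*:8000"]
```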
- -""" - -import json -import logging -import os -import platform -import subprocess -from collections import OrderedDict -from pathlib import Path -from typing import Dict, List, Optional, Union - -import yaml -from ops.charm import CharmBase, RelationRole -from ops.framework import EventBase, EventSource, Object, ObjectEvents - -# The unique Charmhub library identifier, never change it -from ops.model import ModelError - -LIBID = "bc84295fef5f4049878f07b131968ee2" - -# Increment this major API version when introducing breaking changes -LIBAPI = 0 - -# Increment this PATCH version before using `charmcraft publish-lib` or reset -# to 0 if you are raising the major API version -LIBPATCH = 17 - -logger = logging.getLogger(__name__) - - -ALLOWED_KEYS = { - "job_name", - "metrics_path", - "static_configs", - "scrape_interval", - "scrape_timeout", - "proxy_url", - "relabel_configs", - "metrics_relabel_configs", - "sample_limit", - "label_limit", - "label_name_length_limit", - "label_value_lenght_limit", -} -DEFAULT_JOB = { - "metrics_path": "/metrics", - "static_configs": [{"targets": ["*:80"]}], -} - - -DEFAULT_RELATION_NAME = "metrics-endpoint" -RELATION_INTERFACE_NAME = "prometheus_scrape" - -DEFAULT_ALERT_RULES_RELATIVE_PATH = "./src/prometheus_alert_rules" - - -class RelationNotFoundError(Exception): - """Raised if there is no relation with the given name is found.""" - - def __init__(self, relation_name: str): - self.relation_name = relation_name - self.message = "No relation named '{}' found".format(relation_name) - - super().__init__(self.message) - - -class RelationInterfaceMismatchError(Exception): - """Raised if the relation with the given name has a different interface.""" - - def __init__( - self, - relation_name: str, - expected_relation_interface: str, - actual_relation_interface: str, - ): - self.relation_name = relation_name - self.expected_relation_interface = expected_relation_interface - self.actual_relation_interface = actual_relation_interface - self.message = ( - "The '{}' relation has '{}' as interface rather than the expected '{}'".format( - relation_name, actual_relation_interface, expected_relation_interface - ) - ) - - super().__init__(self.message) - - -class RelationRoleMismatchError(Exception): - """Raised if the relation with the given name has a different role.""" - - def __init__( - self, - relation_name: str, - expected_relation_role: RelationRole, - actual_relation_role: RelationRole, - ): - self.relation_name = relation_name - self.expected_relation_interface = expected_relation_role - self.actual_relation_role = actual_relation_role - self.message = "The '{}' relation has role '{}' rather than the expected '{}'".format( - relation_name, repr(actual_relation_role), repr(expected_relation_role) - ) - - super().__init__(self.message) - - -def _validate_relation_by_interface_and_direction( - charm: CharmBase, - relation_name: str, - expected_relation_interface: str, - expected_relation_role: RelationRole, -): - """Verifies that a relation has the necessary characteristics. - - Verifies that the `relation_name` provided: (1) exists in metadata.yaml, - (2) declares as interface the interface name passed as `relation_interface` - and (3) has the right "direction", i.e., it is a relation that `charm` - provides or requires. - - Args: - charm: a `CharmBase` object to scan for the matching relation. - relation_name: the name of the relation to be verified. - expected_relation_interface: the interface name to be matched by the - relation named `relation_name`. 
- expected_relation_role: whether the `relation_name` must be either - provided or required by `charm`. - - Raises: - RelationNotFoundError: If there is no relation in the charm's metadata.yaml - with the same name as provided via `relation_name` argument. - RelationInterfaceMismatchError: The relation with the same name as provided - via `relation_name` argument does not have the same relation interface - as specified via the `expected_relation_interface` argument. - RelationRoleMismatchError: If the relation with the same name as provided - via `relation_name` argument does not have the same role as specified - via the `expected_relation_role` argument. - """ - if relation_name not in charm.meta.relations: - raise RelationNotFoundError(relation_name) - - relation = charm.meta.relations[relation_name] - - actual_relation_interface = relation.interface_name - if actual_relation_interface != expected_relation_interface: - raise RelationInterfaceMismatchError( - relation_name, expected_relation_interface, actual_relation_interface - ) - - if expected_relation_role == RelationRole.provides: - if relation_name not in charm.meta.provides: - raise RelationRoleMismatchError( - relation_name, RelationRole.provides, RelationRole.requires - ) - elif expected_relation_role == RelationRole.requires: - if relation_name not in charm.meta.requires: - raise RelationRoleMismatchError( - relation_name, RelationRole.requires, RelationRole.provides - ) - else: - raise Exception("Unexpected RelationDirection: {}".format(expected_relation_role)) - - -def _sanitize_scrape_configuration(job) -> dict: - """Restrict permissible scrape configuration options. - - If job is empty then a default job is returned. The - default job is - - ``` - { - "metrics_path": "/metrics", - "static_configs": [{"targets": ["*:80"]}], - } - ``` - - Args: - job: a dict containing a single Prometheus job - specification. - - Returns: - a dictionary containing a sanitized job specification. - """ - sanitized_job = DEFAULT_JOB.copy() - sanitized_job.update({key: value for key, value in job.items() if key in ALLOWED_KEYS}) - return sanitized_job - - -class JujuTopology: - """Class for storing and formatting juju topology information.""" - - STUB = "%%juju_topology%%" - - def __new__(cls, *args, **kwargs): - """Reject instantiation of a base JujuTopology class. Children only.""" - if cls is JujuTopology: - raise TypeError("only children of '{}' may be instantiated".format(cls.__name__)) - return object.__new__(cls) - - def __init__( - self, - model: str, - model_uuid: str, - application: str, - unit: Optional[str] = "", - charm_name: Optional[str] = "", - ): - """Build a JujuTopology object. - - A `JujuTopology` object is used for storing and transforming - Juju Topology information. This information is used to - annotate Prometheus scrape jobs and alert rules. Such - annotation when applied to scrape jobs helps in identifying - the source of the scrapped metrics. On the other hand when - applied to alert rules topology information ensures that - evaluation of alert expressions is restricted to the source - (charm) from which the alert rules were obtained. - - Args: - model: a string name of the Juju model - model_uuid: a globally unique string identifier for the Juju model - application: an application name as a string - unit: a unit name as a string - charm_name: name of charm as a string - - Note: - `JujuTopology` should not be constructed directly by charm code. Please - use `ProviderTopology` or `AggregatorTopology`. 
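The filtering described above can be illustrated in a few lines (values invented): the `%%juju_topology%%` placeholder inside an alert expression is replaced by the rendered topology labels.

```python
# Minimal sketch of topology label injection into an alert expression
labels = {
    "juju_model": "demo",
    "juju_model_uuid": "00000000-0000-4000-8000-000000000000",
    "juju_application": "my-charm",
    "juju_charm": "my-charm",
}
promql_labels = ", ".join('{}="{}"'.format(key, value) for key, value in labels.items())

expr = "up{%%juju_topology%%} < 1"
rendered = expr.replace("%%juju_topology%%", promql_labels)
print(rendered)
```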
- """ - self.model = model - self.model_uuid = model_uuid - self.application = application - self.charm_name = charm_name - self.unit = unit - - @classmethod - def from_charm(cls, charm): - """Factory method for creating `JujuTopology` children from a given charm. - - Args: - charm: a `CharmBase` object for which the `JujuTopology` has to be constructed - - Returns: - a `JujuTopology` object. - """ - return cls( - model=charm.model.name, - model_uuid=charm.model.uuid, - application=charm.model.app.name, - unit=charm.model.unit.name, - charm_name=charm.meta.name, - ) - - @classmethod - def from_relation_data(cls, data: dict): - """Factory method for creating `JujuTopology` children from a dictionary. - - Args: - data: a dictionary with four keys providing topology information. The keys are - - "model" - - "model_uuid" - - "application" - - "unit" - - "charm_name" - - `unit` and `charm_name` may be empty, but will result in more limited - labels. However, this allows us to support payload-only charms. - - Returns: - a `JujuTopology` object. - """ - return cls( - model=data["model"], - model_uuid=data["model_uuid"], - application=data["application"], - unit=data.get("unit", ""), - charm_name=data.get("charm_name", ""), - ) - - @property - def identifier(self) -> str: - """Format the topology information into a terse string.""" - # This is odd, but may have `None` as a model key - return "_".join([str(val) for val in self.as_promql_label_dict().values()]).replace( - "/", "_" - ) - - @property - def promql_labels(self) -> str: - """Format the topology information into a verbose string.""" - return ", ".join( - ['{}="{}"'.format(key, value) for key, value in self.as_promql_label_dict().items()] - ) - - def as_dict(self, rename_keys: Optional[Dict[str, str]] = None) -> OrderedDict: - """Format the topology information into a dict. - - Use an OrderedDict so we can rely on the insertion order on Python 3.5 (and 3.6, - which still does not guarantee it). - - Args: - rename_keys: A dictionary mapping old key names to new key names, which will - be substituted when invoked. - """ - ret = OrderedDict( - [ - ("model", self.model), - ("model_uuid", self.model_uuid), - ("application", self.application), - ("unit", self.unit), - ("charm_name", self.charm_name), - ] - ) - - ret["unit"] or ret.pop("unit") - ret["charm_name"] or ret.pop("charm_name") - - # If a key exists in `rename_keys`, replace the value - if rename_keys: - ret = OrderedDict( - (rename_keys.get(k), v) if rename_keys.get(k) else (k, v) for k, v in ret.items() # type: ignore - ) - - return ret - - def as_promql_label_dict(self): - """Format the topology information into a dict with keys having 'juju_' as prefix.""" - vals = { - "juju_{}".format(key): val - for key, val in self.as_dict(rename_keys={"charm_name": "charm"}).items() - } - # The leader is the only unit that sets alert rules, if "juju_unit" is present, - # then the rules will only be evaluated for that unit - if "juju_unit" in vals: - vals.pop("juju_unit") - - return vals - - def render(self, template: str): - """Render a juju-topology template string with topology info.""" - return template.replace(JujuTopology.STUB, self.promql_labels) - - -class AggregatorTopology(JujuTopology): - """Class for initializing topology information for MetricsEndpointAggregator.""" - - @classmethod - def create(cls, model: str, model_uuid: str, application: str, unit: str): - """Factory method for creating the `AggregatorTopology` dataclass from a given charm. 
- - Args: - model: a string representing the model - model_uuid: the model UUID as a string - application: the application name - unit: the unit name - - Returns: - a `AggregatorTopology` object. - """ - return cls( - model=model, - model_uuid=model_uuid, - application=application, - unit=unit, - ) - - def as_promql_label_dict(self): - """Format the topology information into a dict with keys having 'juju_' as prefix.""" - vals = {"juju_{}".format(key): val for key, val in self.as_dict().items()} - - # FIXME: Why is this different? I have no idea. The uuid length should be the same - vals["juju_model_uuid"] = vals["juju_model_uuid"][:7] - - return vals - - -class ProviderTopology(JujuTopology): - """Class for initializing topology information for MetricsEndpointProvider.""" - - @property - def scrape_identifier(self): - """Format the topology information into a scrape identifier.""" - # This is used only by Metrics[Consumer|Provider] and does not need a - # unit name, so only check for the charm name - return "juju_{}_prometheus_scrape".format(self.identifier) - - -class InvalidAlertRulePathError(Exception): - """Raised if the alert rules folder cannot be found or is otherwise invalid.""" - - def __init__( - self, - alert_rules_absolute_path: Path, - message: str, - ): - self.alert_rules_absolute_path = alert_rules_absolute_path - self.message = message - - super().__init__(self.message) - - -def _is_official_alert_rule_format(rules_dict: dict) -> bool: - """Are alert rules in the upstream format as supported by Prometheus. - - Alert rules in dictionary format are in "official" form if they - contain a "groups" key, since this implies they contain a list of - alert rule groups. - - Args: - rules_dict: a set of alert rules in Python dictionary format - - Returns: - True if alert rules are in official Prometheus file format. - """ - return "groups" in rules_dict - - -def _is_single_alert_rule_format(rules_dict: dict) -> bool: - """Are alert rules in single rule format. - - The Prometheus charm library supports reading of alert rules in a - custom format that consists of a single alert rule per file. This - does not conform to the official Prometheus alert rule file format - which requires that each alert rules file consists of a list of - alert rule groups and each group consists of a list of alert - rules. - - Alert rules in dictionary form are considered to be in single rule - format if in the least it contains two keys corresponding to the - alert rule name and alert expression. - - Returns: - True if alert rule is in single rule file format. - """ - # one alert rule per file - return set(rules_dict) >= {"alert", "expr"} - - -class AlertRules: - """Utility class for amalgamating prometheus alert rule files and injecting juju topology. - - An `AlertRules` object supports aggregating alert rules from files and directories in both - official and single rule file formats using the `add_path()` method. All the alert rules - read are annotated with Juju topology labels and amalgamated into a single data structure - in the form of a Python dictionary using the `as_dict()` method. Such a dictionary can be - easily dumped into JSON format and exchanged over relation data. The dictionary can also - be dumped into YAML format and written directly into an alert rules file that is read by - Prometheus. Note that multiple `AlertRules` objects must not be written into the same file, - since Prometheus allows only a single list of alert rule groups per alert rules file. 
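A hedged usage sketch of the flow just described (the paths, topology values, and charm layout are assumptions, not taken from this charm):

```python
import yaml

from charms.prometheus_k8s.v0.prometheus_scrape import AlertRules, ProviderTopology

topology = ProviderTopology(
    model="demo",
    model_uuid="00000000-0000-4000-8000-000000000000",
    application="my-charm",
    unit="my-charm/0",
    charm_name="my-charm",
)

alert_rules = AlertRules(topology)
# Read *.rule / *.rules files, annotating each group with the topology above
alert_rules.add_path("./src/prometheus_alert_rules", recursive=True)

# A single rules file suitable for Prometheus
rules_file_contents = yaml.safe_dump(alert_rules.as_dict())
```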
- - The official Prometheus format is a YAML file conforming to the Prometheus documentation - (https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/). - The custom single rule format is a subsection of the official YAML, having a single alert - rule, effectively "one alert per file". - """ - - # This class uses the following terminology for the various parts of a rule file: - # - alert rules file: the entire groups[] yaml, including the "groups:" key. - # - alert groups (plural): the list of groups[] (a list, i.e. no "groups:" key) - it is a list - # of dictionaries that have the "name" and "rules" keys. - # - alert group (singular): a single dictionary that has the "name" and "rules" keys. - # - alert rules (plural): all the alerts in a given alert group - a list of dictionaries with - # the "alert" and "expr" keys. - # - alert rule (singular): a single dictionary that has the "alert" and "expr" keys. - - def __init__(self, topology: Optional[JujuTopology] = None): - """Build and alert rule object. - - Args: - topology: an optional `JujuTopology` instance that is used to annotate all alert rules. - """ - self.topology = topology - self.alert_groups = [] # type: List[dict] - - def _from_file(self, root_path: Path, file_path: Path) -> List[dict]: - """Read a rules file from path, injecting juju topology. - - Args: - root_path: full path to the root rules folder (used only for generating group name) - file_path: full path to a *.rule file. - - Returns: - A list of dictionaries representing the rules file, if file is valid (the structure is - formed by `yaml.safe_load` of the file); an empty list otherwise. - """ - with file_path.open() as rf: - # Load a list of rules from file then add labels and filters - try: - rule_file = yaml.safe_load(rf) - - except Exception as e: - logger.error("Failed to read alert rules from %s: %s", file_path.name, e) - return [] - - if _is_official_alert_rule_format(rule_file): - alert_groups = rule_file["groups"] - elif _is_single_alert_rule_format(rule_file): - # convert to list of alert groups - # group name is made up from the file name - alert_groups = [{"name": file_path.stem, "rules": [rule_file]}] - else: - # invalid/unsupported - logger.error("Invalid rules file: %s", file_path.name) - return [] - - # update rules with additional metadata - for alert_group in alert_groups: - # update group name with topology and sub-path - alert_group["name"] = self._group_name( - str(root_path), - str(file_path), - alert_group["name"], - ) - - # add "juju_" topology labels - for alert_rule in alert_group["rules"]: - if "labels" not in alert_rule: - alert_rule["labels"] = {} - - if self.topology: - alert_rule["labels"].update(self.topology.as_promql_label_dict()) - # insert juju topology filters into a prometheus alert rule - alert_rule["expr"] = self.topology.render(alert_rule["expr"]) - - return alert_groups - - def _group_name(self, root_path: str, file_path: str, group_name: str) -> str: - """Generate group name from path and topology. - - The group name is made up of the relative path between the root dir_path, the file path, - and topology identifier. - - Args: - root_path: path to the root rules dir. - file_path: path to rule file. - group_name: original group name to keep as part of the new augmented group name - - Returns: - New group name, augmented by juju topology and relative path. - """ - rel_path = os.path.relpath(os.path.dirname(file_path), root_path) - rel_path = "" if rel_path == "." 
else rel_path.replace(os.path.sep, "_") - - # Generate group name: - # - name, from juju topology - # - suffix, from the relative path of the rule file; - group_name_parts = [self.topology.identifier] if self.topology else [] - group_name_parts.extend([rel_path, group_name, "alerts"]) - # filter to remove empty strings - return "_".join(filter(None, group_name_parts)) - - @classmethod - def _multi_suffix_glob( - cls, dir_path: Path, suffixes: List[str], recursive: bool = True - ) -> list: - """Helper function for getting all files in a directory that have a matching suffix. - - Args: - dir_path: path to the directory to glob from. - suffixes: list of suffixes to include in the glob (items should begin with a period). - recursive: a flag indicating whether a glob is recursive (nested) or not. - - Returns: - List of files in `dir_path` that have one of the suffixes specified in `suffixes`. - """ - all_files_in_dir = dir_path.glob("**/*" if recursive else "*") - return list(filter(lambda f: f.is_file() and f.suffix in suffixes, all_files_in_dir)) - - def _from_dir(self, dir_path: Path, recursive: bool) -> List[dict]: - """Read all rule files in a directory. - - All rules from files for the same directory are loaded into a single - group. The generated name of this group includes juju topology. - By default, only the top directory is scanned; for nested scanning, pass `recursive=True`. - - Args: - dir_path: directory containing *.rule files (alert rules without groups). - recursive: flag indicating whether to scan for rule files recursively. - - Returns: - a list of dictionaries representing prometheus alert rule groups, each dictionary - representing an alert group (structure determined by `yaml.safe_load`). - """ - alert_groups = [] # type: List[dict] - - # Gather all alerts into a list of groups - for file_path in self._multi_suffix_glob(dir_path, [".rule", ".rules"], recursive): - alert_groups_from_file = self._from_file(dir_path, file_path) - if alert_groups_from_file: - logger.debug("Reading alert rule from %s", file_path) - alert_groups.extend(alert_groups_from_file) - - return alert_groups - - def add_path(self, path: str, *, recursive: bool = False) -> None: - """Add rules from a dir path. - - All rules from files are aggregated into a data structure representing a single rule file. - All group names are augmented with juju topology. - - Args: - path: either a rules file or a dir of rules files. - recursive: whether to read files recursively or not (no impact if `path` is a file). - - Returns: - True if path was added else False. - """ - path = Path(path) # type: Path - if path.is_dir(): - self.alert_groups.extend(self._from_dir(path, recursive)) - elif path.is_file(): - self.alert_groups.extend(self._from_file(path.parent, path)) - else: - logger.warning("path does not exist: %s", path) - - def as_dict(self) -> dict: - """Return standard alert rules file in dict representation. - - Returns: - a dictionary containing a single list of alert rule groups. - The list of alert rule groups is provided as value of the - "groups" dictionary key. 
- """ - return {"groups": self.alert_groups} if self.alert_groups else {} - - -class TargetsChangedEvent(EventBase): - """Event emitted when Prometheus scrape targets change.""" - - def __init__(self, handle, relation_id): - super().__init__(handle) - self.relation_id = relation_id - - def snapshot(self): - """Save scrape target relation information.""" - return {"relation_id": self.relation_id} - - def restore(self, snapshot): - """Restore scrape target relation information.""" - self.relation_id = snapshot["relation_id"] - - -class MonitoringEvents(ObjectEvents): - """Event descriptor for events raised by `MetricsEndpointConsumer`.""" - - targets_changed = EventSource(TargetsChangedEvent) - - -class MetricsEndpointConsumer(Object): - """A Prometheus based Monitoring service.""" - - on = MonitoringEvents() - - def __init__(self, charm: CharmBase, relation_name: str = DEFAULT_RELATION_NAME): - """A Prometheus based Monitoring service. - - Args: - charm: a `CharmBase` instance that manages this - instance of the Prometheus service. - relation_name: an optional string name of the relation between `charm` - and the Prometheus charmed service. The default is "metrics-endpoint". - It is strongly advised not to change the default, so that people - deploying your charm will have a consistent experience with all - other charms that consume metrics endpoints. - - Raises: - RelationNotFoundError: If there is no relation in the charm's metadata.yaml - with the same name as provided via `relation_name` argument. - RelationInterfaceMismatchError: The relation with the same name as provided - via `relation_name` argument does not have the `prometheus_scrape` relation - interface. - RelationRoleMismatchError: If the relation with the same name as provided - via `relation_name` argument does not have the `RelationRole.requires` - role. - """ - _validate_relation_by_interface_and_direction( - charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.requires - ) - - super().__init__(charm, relation_name) - self._charm = charm - self._relation_name = relation_name - self._transformer = PromqlTransformer(self._charm) - events = self._charm.on[relation_name] - self.framework.observe(events.relation_changed, self._on_metrics_provider_relation_changed) - self.framework.observe( - events.relation_departed, self._on_metrics_provider_relation_departed - ) - - def _on_metrics_provider_relation_changed(self, event): - """Handle changes with related metrics providers. - - Anytime there are changes in relations between Prometheus - and metrics provider charms the Prometheus charm is informed, - through a `TargetsChangedEvent` event. The Prometheus charm can - then choose to update its scrape configuration. - - Args: - event: a `CharmEvent` in response to which the Prometheus - charm must update its scrape configuration. - """ - rel_id = event.relation.id - - self.on.targets_changed.emit(relation_id=rel_id) - - def _on_metrics_provider_relation_departed(self, event): - """Update job config when a metrics provider departs. - - When a metrics provider departs the Prometheus charm is informed - through a `TargetsChangedEvent` event so that it can update its - scrape configuration to ensure that the departed metrics provider - is removed from the list of scrape jobs and - - Args: - event: a `CharmEvent` that indicates a metrics provider - unit has departed. - """ - rel_id = event.relation.id - self.on.targets_changed.emit(relation_id=rel_id) - - def jobs(self) -> list: - """Fetch the list of scrape jobs. 
- - Returns: - A list consisting of all the static scrape configurations - for each related `MetricsEndpointProvider` that has specified - its scrape targets. - """ - scrape_jobs = [] - - for relation in self._charm.model.relations[self._relation_name]: - static_scrape_jobs = self._static_scrape_config(relation) - if static_scrape_jobs: - scrape_jobs.extend(static_scrape_jobs) - - return scrape_jobs - - def alerts(self) -> dict: - """Fetch alerts for all relations. - - A Prometheus alert rules file consists of a list of "groups". Each - group consists of a list of alerts (`rules`) that are sequentially - executed. This method returns all the alert rules provided by each - related metrics provider charm. These rules may be used to generate a - separate alert rules file for each relation since the returned list - of alert groups are indexed by that relations Juju topology identifier. - The Juju topology identifier string includes substrings that identify - alert rule related metadata such as the Juju model, model UUID and the - application name from where the alert rule originates. Since this - topology identifier is globally unique, it may be used for instance as - the name for the file into which the list of alert rule groups are - written. For each relation, the structure of data returned is a dictionary - representation of a standard prometheus rules file: - - {"groups": [{"name": ...}, ...]} - - per official prometheus documentation - https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ - - The value of the `groups` key is such that it may be used to generate - a Prometheus alert rules file directly using `yaml.dump` but the - `groups` key itself must be included as this is required by Prometheus. - - For example the list of alert rule groups returned by this method may - be written into files consumed by Prometheus as follows - - ``` - for topology_identifier, alert_rule_groups in self.metrics_consumer.alerts().items(): - filename = "juju_" + topology_identifier + ".rules" - path = os.path.join(PROMETHEUS_RULES_DIR, filename) - rules = yaml.dump(alert_rule_groups) - container.push(path, rules, make_dirs=True) - ``` - - Returns: - A dictionary mapping the Juju topology identifier of the source charm to - its list of alert rule groups. - """ - alerts = {} # type: Dict[str, dict] # mapping b/w juju identifiers and alert rule files - for relation in self._charm.model.relations[self._relation_name]: - if not relation.units: - continue - - alert_rules = json.loads(relation.data[relation.app].get("alert_rules", "{}")) - if not alert_rules: - continue - - identifier = None - try: - scrape_metadata = json.loads(relation.data[relation.app]["scrape_metadata"]) - identifier = ProviderTopology.from_relation_data(scrape_metadata).identifier - alerts[identifier] = self._transformer.apply_label_matchers(alert_rules) - - except KeyError as e: - logger.debug( - "Relation %s has no 'scrape_metadata': %s", - relation.id, - e, - ) - identifier = self._get_identifier_by_alert_rules(alert_rules) - - if not identifier: - logger.error( - "Alert rules were found but no usable group or identifier was present" - ) - continue - alerts[identifier] = alert_rules - - return alerts - - def _get_identifier_by_alert_rules(self, rules: dict) -> Union[str, None]: - """Determine an appropriate dict key for alert rules. - - The key is used as the filename when writing alerts to disk, so the structure - and uniqueness is important. 
- - Args: - rules: a dict of alert rules - """ - if "groups" not in rules: - logger.warning("No alert groups were found in relation data") - return None - - # Construct an ID based on what's in the alert rules if they have labels - for group in rules["groups"]: - try: - labels = group["rules"][0]["labels"] - identifier = "{}_{}_{}".format( - labels["juju_model"], - labels["juju_model_uuid"], - labels["juju_application"], - ) - return identifier - except KeyError: - logger.debug("Alert rules were found but no usable labels were present") - continue - - logger.warning( - "No labeled alert rules were found, and no 'scrape_metadata' " - "was available. Using the alert group name as filename." - ) - try: - for group in rules["groups"]: - return group["name"] - except KeyError: - logger.debug("No group name was found to use as identifier") - - return None - - def _static_scrape_config(self, relation) -> list: - """Generate the static scrape configuration for a single relation. - - If the relation data includes `scrape_metadata` then the value - of this key is used to annotate the scrape jobs with Juju - Topology labels before returning them. - - Args: - relation: an `ops.model.Relation` object whose static - scrape configuration is required. - - Returns: - A list (possibly empty) of scrape jobs. Each job is a - valid Prometheus scrape configuration for that job, - represented as a Python dictionary. - """ - if not relation.units: - return [] - - scrape_jobs = json.loads(relation.data[relation.app].get("scrape_jobs", "[]")) - - if not scrape_jobs: - return [] - - scrape_metadata = json.loads(relation.data[relation.app].get("scrape_metadata", "{}")) - - if not scrape_metadata: - return scrape_jobs - - job_name_prefix = ProviderTopology.from_relation_data(scrape_metadata).scrape_identifier - - hosts = self._relation_hosts(relation) - - labeled_job_configs = [] - for job in scrape_jobs: - config = self._labeled_static_job_config( - _sanitize_scrape_configuration(job), - job_name_prefix, - hosts, - scrape_metadata, - ) - labeled_job_configs.append(config) - - return labeled_job_configs - - def _relation_hosts(self, relation) -> dict: - """Fetch unit names and address of all metrics provider units for a single relation. - - Args: - relation: An `ops.model.Relation` object for which the unit name to - address mapping is required. - - Returns: - A dictionary that maps unit names to unit addresses for - the specified relation. - """ - hosts = {} - for unit in relation.units: - # TODO deprecate and remove unit.name - unit_name = relation.data[unit].get("prometheus_scrape_unit_name") or unit.name - # TODO deprecate and remove "prometheus_scrape_host" - unit_address = relation.data[unit].get( - "prometheus_scrape_unit_address" - ) or relation.data[unit].get("prometheus_scrape_host") - if unit_name and unit_address: - hosts.update({unit_name: unit_address}) - - return hosts - - def _labeled_static_job_config(self, job, job_name_prefix, hosts, scrape_metadata) -> dict: - """Construct labeled job configuration for a single job. - - Args: - - job: a dictionary representing the job configuration as obtained from - `MetricsEndpointProvider` over relation data. - job_name_prefix: a string that may either be used as the - job name if the job has no associated name or used as a prefix for - the job if it does have a job name. - hosts: a dictionary mapping host names to host address for - all units of the relation for which this job configuration - must be constructed. 
- scrape_metadata: scrape configuration metadata obtained - from `MetricsEndpointProvider` from the same relation for - which this job configuration is being constructed. - - Returns: - A dictionary representing a Prometheus job configuration - for a single job. - """ - name = job.get("job_name") - job_name = "{}_{}".format(job_name_prefix, name) if name else job_name_prefix - - labeled_job = job.copy() - labeled_job["job_name"] = job_name - - static_configs = job.get("static_configs") - labeled_job["static_configs"] = [] - - # relabel instance labels so that instance identifiers are globally unique - # stable over unit recreation - instance_relabel_config = { - "source_labels": ["juju_model", "juju_model_uuid", "juju_application"], - "separator": "_", - "target_label": "instance", - "regex": "(.*)", - } - - # label all static configs in the Prometheus job - # labeling inserts Juju topology information and - # sets a relable config for instance labels - for static_config in static_configs: - labels = static_config.get("labels", {}) if static_configs else {} - all_targets = static_config.get("targets", []) - - # split all targets into those which will have unit labels - # and those which will not - ports = [] - unitless_targets = [] - for target in all_targets: - host, port = target.split(":") - if host.strip() == "*": - ports.append(port.strip()) - else: - unitless_targets.append(target) - - # label scrape targets that do not have unit labels - if unitless_targets: - unitless_config = self._labeled_unitless_config( - unitless_targets, labels, scrape_metadata - ) - labeled_job["static_configs"].append(unitless_config) - - # label scrape targets that do have unit labels - for host_name, host_address in hosts.items(): - static_config = self._labeled_unit_config( - host_name, host_address, ports, labels, scrape_metadata - ) - labeled_job["static_configs"].append(static_config) - if "juju_unit" not in instance_relabel_config["source_labels"]: - instance_relabel_config["source_labels"].append("juju_unit") # type: ignore - - # ensure topology relabeling of instance label is last in order of relabelings - relabel_configs = job.get("relabel_configs", []) - relabel_configs.append(instance_relabel_config) - labeled_job["relabel_configs"] = relabel_configs - - return labeled_job - - def _set_juju_labels(self, labels, scrape_metadata) -> dict: - """Create a copy of metric labels with Juju topology information. - - Args: - labels: a dictionary containing Prometheus metric labels. - scrape_metadata: scrape related metadata provided by - `MetricsEndpointProvider`. - - Returns: - a copy of the `labels` dictionary augmented with Juju - topology information with the exception of unit name. - """ - juju_labels = labels.copy() # deep copy not needed - juju_labels.update( - ProviderTopology.from_relation_data(scrape_metadata).as_promql_label_dict() - ) - - return juju_labels - - def _labeled_unitless_config(self, targets, labels, scrape_metadata) -> dict: - """Static scrape configuration for fully qualified host addresses. - - Fully qualified hosts are those scrape targets for which the - address are specified by the `MetricsEndpointProvider` as part - of the scrape job specification set in application relation data. - The address specified need not belong to any unit of the - `MetricsEndpointProvider` charm. As a result there is no reliable - way to determine the name (Juju topology unit name) for such a - target. - - Args: - targets: a list of addresses of fully qualified hosts. 
- labels: labels specified by `MetricsEndpointProvider` clients - which are associated with `targets`. - scrape_metadata: scrape related metadata provided by `MetricsEndpointProvider`. - - Returns: - A dictionary containing the static scrape configuration - for a list of fully qualified hosts. - """ - juju_labels = self._set_juju_labels(labels, scrape_metadata) - unitless_config = {"targets": targets, "labels": juju_labels} - return unitless_config - - def _labeled_unit_config( - self, unit_name, host_address, ports, labels, scrape_metadata - ) -> dict: - """Static scrape configuration for a wildcard host. - - Wildcard hosts are those scrape targets whose name (Juju unit - name) and address (unit IP address) is set into unit relation - data by the `MetricsEndpointProvider` charm, which sets this - data for ALL its units. - - Args: - unit_name: a string representing the unit name of the wildcard host. - host_address: a string representing the address of the wildcard host. - ports: list of ports on which this wildcard host exposes its metrics. - labels: a dictionary of labels provided by - `MetricsEndpointProvider` intended to be associated with - this wildcard host. - scrape_metadata: scrape related metadata provided by `MetricsEndpointProvider`. - - Returns: - A dictionary containing the static scrape configuration - for a single wildcard host. - """ - juju_labels = self._set_juju_labels(labels, scrape_metadata) - - juju_labels["juju_unit"] = unit_name - - static_config = {"labels": juju_labels} - - if ports: - targets = [] - for port in ports: - targets.append("{}:{}".format(host_address, port)) - static_config["targets"] = targets # type: ignore - else: - static_config["targets"] = [host_address] # type: ignore - - return static_config - - -def _resolve_dir_against_charm_path(charm: CharmBase, *path_elements: str) -> str: - """Resolve the provided path items against the directory of the main file. - - Look up the directory of the `main.py` file being executed. This is normally - going to be the charm.py file of the charm including this library. Then, resolve - the provided path elements and, if the result path exists and is a directory, - return its absolute path; otherwise, raise en exception. - - Raises: - InvalidAlertRulePathError, if the path does not exist or is not a directory. - """ - charm_dir = Path(str(charm.charm_dir)) - if not charm_dir.exists() or not charm_dir.is_dir(): - # Operator Framework does not currently expose a robust - # way to determine the top level charm source directory - # that is consistent across deployed charms and unit tests - # Hence for unit tests the current working directory is used - # TODO: updated this logic when the following ticket is resolved - # https://github.com/canonical/operator/issues/643 - charm_dir = Path(os.getcwd()) - - alerts_dir_path = charm_dir.absolute().joinpath(*path_elements) - - if not alerts_dir_path.exists(): - raise InvalidAlertRulePathError(alerts_dir_path, "directory does not exist") - if not alerts_dir_path.is_dir(): - raise InvalidAlertRulePathError(alerts_dir_path, "is not a directory") - - return str(alerts_dir_path) - - -class MetricsEndpointProvider(Object): - """A metrics endpoint for Prometheus.""" - - def __init__( - self, - charm, - relation_name: str = DEFAULT_RELATION_NAME, - jobs=None, - alert_rules_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, - ): - """Construct a metrics provider for a Prometheus charm. 
- - If your charm exposes a Prometheus metrics endpoint, the - `MetricsEndpointProvider` object enables your charm to easily - communicate how to reach that metrics endpoint. - - By default, a charm instantiating this object has the metrics - endpoints of each of its units scraped by the related Prometheus - charms. The scraped metrics are automatically tagged by the - Prometheus charms with Juju topology data via the - `juju_model_name`, `juju_model_uuid`, `juju_application_name` - and `juju_unit` labels. To support such tagging `MetricsEndpointProvider` - automatically forwards scrape metadata to a `MetricsEndpointConsumer` - (Prometheus charm). - - Scrape targets provided by `MetricsEndpointProvider` can be - customized when instantiating this object. For example in the - case of a charm exposing the metrics endpoint for each of its - units on port 8080 and the `/metrics` path, the - `MetricsEndpointProvider` can be instantiated as follows: - - self.metrics_endpoint_provider = MetricsEndpointProvider( - self, - jobs=[{ - "static_configs": [{"targets": ["*:8080"]}], - }]) - - The notation `*:` means "scrape each unit of this charm on port - ``. - - In case the metrics endpoints are not on the standard `/metrics` path, - a custom path can be specified as follows: - - self.metrics_endpoint_provider = MetricsEndpointProvider( - self, - jobs=[{ - "metrics_path": "/my/strange/metrics/path", - "static_configs": [{"targets": ["*:8080"]}], - }]) - - Note how the `jobs` argument is a list: this allows you to expose multiple - combinations of paths "metrics_path" and "static_configs" in case your charm - exposes multiple endpoints, which could happen, for example, when you have - multiple workload containers, with applications in each needing to be scraped. - The structure of the objects in the `jobs` list is one-to-one with the - `scrape_config` configuration item of Prometheus' own configuration (see - https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config - ), but with only a subset of the fields allowed. The permitted fields are - listed in `ALLOWED_KEYS` object in this charm library module. - - It is also possible to specify alert rules. By default, this library will look - into the `/prometheus_alert_rules`, which in a standard charm - layouts resolves to `src/prometheus_alert_rules`. Each alert rule goes into a - separate `*.rule` file. If the syntax of a rule is invalid, - the `MetricsEndpointProvider` logs an error and does not load the particular - rule. - - To avoid false positives and negatives in the evaluation of alert rules, - all ingested alert rule expressions are automatically qualified using Juju - Topology filters. This ensures that alert rules provided by your charm, trigger - alerts based only on data scrapped from your charm. For example an alert rule - such as the following - - alert: UnitUnavailable - expr: up < 1 - for: 0m - - will be automatically transformed into something along the lines of the following - - alert: UnitUnavailable - expr: up{juju_model=, juju_model_uuid=, juju_application=} < 1 - for: 0m - - Args: - charm: a `CharmBase` object that manages this - `MetricsEndpointProvider` object. Typically this is - `self` in the instantiating class. - relation_name: an optional string name of the relation between `charm` - and the Prometheus charmed service. The default is "metrics-endpoint". 
- It is strongly advised not to change the default, so that people - deploying your charm will have a consistent experience with all - other charms that provide metrics endpoints. - jobs: an optional list of dictionaries where each - dictionary represents the Prometheus scrape - configuration for a single job. When not provided, a - default scrape configuration is provided for the - `/metrics` endpoint polling all units of the charm on port `80` - using the `MetricsEndpointProvider` object. - alert_rules_path: an optional path for the location of alert rules - files. Defaults to "./prometheus_alert_rules", - resolved relative to the directory hosting the charm entry file. - The alert rules are automatically updated on charm upgrade. - - Raises: - RelationNotFoundError: If there is no relation in the charm's metadata.yaml - with the same name as provided via `relation_name` argument. - RelationInterfaceMismatchError: The relation with the same name as provided - via `relation_name` argument does not have the `prometheus_scrape` relation - interface. - RelationRoleMismatchError: If the relation with the same name as provided - via `relation_name` argument does not have the `RelationRole.provides` - role. - """ - _validate_relation_by_interface_and_direction( - charm, relation_name, RELATION_INTERFACE_NAME, RelationRole.provides - ) - - try: - alert_rules_path = _resolve_dir_against_charm_path(charm, alert_rules_path) - except InvalidAlertRulePathError as e: - logger.warning( - "Invalid Prometheus alert rules folder at %s: %s", - e.alert_rules_absolute_path, - e.message, - ) - - super().__init__(charm, relation_name) - self.topology = ProviderTopology.from_charm(charm) - - self._charm = charm - self._alert_rules_path = alert_rules_path - self._relation_name = relation_name - # sanitize job configurations to the supported subset of parameters - jobs = [] if jobs is None else jobs - self._jobs = [_sanitize_scrape_configuration(job) for job in jobs] - - events = self._charm.on[self._relation_name] - self.framework.observe(events.relation_joined, self._set_scrape_job_spec) - self.framework.observe(events.relation_changed, self._set_scrape_job_spec) - - # dirty fix: set the ip address when the containers start, as a workaround - # for not being able to lookup the pod ip - for container_name in charm.unit.containers: - self.framework.observe( - charm.on[container_name].pebble_ready, - self._set_unit_ip, - ) - - self.framework.observe(self._charm.on.upgrade_charm, self._set_scrape_job_spec) - - def _set_scrape_job_spec(self, event): - """Ensure scrape target information is made available to prometheus. - - When a metrics provider charm is related to a prometheus charm, the - metrics provider sets specification and metadata related to its own - scrape configuration. This information is set using Juju application - data. In addition each of the consumer units also sets its own - host address in Juju unit relation data. - """ - self._set_unit_ip(event) - - if not self._charm.unit.is_leader(): - return - - alert_rules = AlertRules(topology=self.topology) - alert_rules.add_path(self._alert_rules_path, recursive=True) - alert_rules_as_dict = alert_rules.as_dict() - - for relation in self._charm.model.relations[self._relation_name]: - relation.data[self._charm.app]["scrape_metadata"] = json.dumps(self._scrape_metadata) - relation.data[self._charm.app]["scrape_jobs"] = json.dumps(self._scrape_jobs) - - if alert_rules_as_dict: - # Update relation data with the string representation of the rule file. 
- # Juju topology is already included in the "scrape_metadata" field above. - # The consumer side of the relation uses this information to name the rules file - # that is written to the filesystem. - relation.data[self._charm.app]["alert_rules"] = json.dumps(alert_rules_as_dict) - - def _set_unit_ip(self, _): - """Set unit host address. - - Each time a metrics provider charm container is restarted it updates its own - host address in the unit relation data for the prometheus charm. - - The only argument specified is an event and it ignored. this is for expediency - to be able to use this method as an event handler, although no access to the - event is actually needed. - """ - for relation in self._charm.model.relations[self._relation_name]: - relation.data[self._charm.unit]["prometheus_scrape_unit_address"] = str( - self._charm.model.get_binding(relation).network.bind_address - ) - relation.data[self._charm.unit]["prometheus_scrape_unit_name"] = str( - self._charm.model.unit.name - ) - - @property - def _scrape_jobs(self) -> list: - """Fetch list of scrape jobs. - - Returns: - A list of dictionaries, where each dictionary specifies a - single scrape job for Prometheus. - """ - return self._jobs if self._jobs else [DEFAULT_JOB] - - @property - def _scrape_metadata(self) -> dict: - """Generate scrape metadata. - - Returns: - Scrape configuration metadata for this metrics provider charm. - """ - return self.topology.as_dict() - - -class PrometheusRulesProvider(Object): - """Forward rules to Prometheus. - - This object may be used to forward rules to Prometheus. At present it only supports - forwarding alert rules. This is unlike :class:`MetricsEndpointProvider`, which - is used for forwarding both scrape targets and associated alert rules. This object - is typically used when there is a desire to forward rules that apply globally (across - all deployed charms and units) rather than to a single charm. All rule files are - forwarded using the same 'prometheus_scrape' interface that is also used by - `MetricsEndpointProvider`. - - Args: - charm: A charm instance that `provides` a relation with the `prometheus_scrape` interface. - relation_name: Name of the relation in `metadata.yaml` that - has the `prometheus_scrape` interface. - dir_path: Root directory for the collection of rule files. - recursive: Whether or not to scan for rule files recursively. 
- """ - - def __init__( - self, - charm: CharmBase, - relation_name: str = DEFAULT_RELATION_NAME, - dir_path: str = DEFAULT_ALERT_RULES_RELATIVE_PATH, - recursive=True, - ): - super().__init__(charm, relation_name) - self._charm = charm - self._relation_name = relation_name - self.topology = ProviderTopology.from_charm(charm) - self._recursive = recursive - - try: - dir_path = _resolve_dir_against_charm_path(charm, dir_path) - except InvalidAlertRulePathError as e: - logger.warning( - "Invalid Prometheus alert rules folder at %s: %s", - e.alert_rules_absolute_path, - e.message, - ) - self.dir_path = dir_path - - events = self._charm.on[self._relation_name] - event_sources = [ - events.relation_joined, - events.relation_changed, - self._charm.on.leader_elected, - self._charm.on.upgrade_charm, - ] - - for event_source in event_sources: - self.framework.observe(event_source, self._update_relation_data) - - def _reinitialize_alert_rules(self): - """Reloads alert rules and updates all relations.""" - self._update_relation_data(None) - - def _update_relation_data(self, _): - """Update application relation data with alert rules for all relations.""" - if not self._charm.unit.is_leader(): - return - - alert_rules = AlertRules() - alert_rules.add_path(self.dir_path, recursive=self._recursive) - alert_rules_as_dict = alert_rules.as_dict() - - logger.info("Updating relation data with rule files from disk") - for relation in self._charm.model.relations[self._relation_name]: - relation.data[self._charm.app]["alert_rules"] = json.dumps( - alert_rules_as_dict, - sort_keys=True, # sort, to prevent unnecessary relation_changed events - ) - - -class MetricsEndpointAggregator(Object): - """Aggregate metrics from multiple scrape targets. - - `MetricsEndpointAggregator` collects scrape target information from one - or more related charms and forwards this to a `MetricsEndpointConsumer` - charm, which may be in a different Juju model. However it is - essential that `MetricsEndpointAggregator` itself resides in the same - model as its scrape targets, as this is currently the only way to - ensure in Juju that the `MetricsEndpointAggregator` will be able to - determine the model name and uuid of the scrape targets. - - `MetricsEndpointAggregator` should be used in place of - `MetricsEndpointProvider` in the following two use cases: - - 1. Integrating one or more scrape targets that do not support the - `prometheus_scrape` interface. - - 2. Integrating one or more scrape targets through cross model - relations. Although the [Scrape Config Operator](https://charmhub.io/cos-configuration-k8s) - may also be used for the purpose of supporting cross model - relations. - - Using `MetricsEndpointAggregator` to build a Prometheus charm client - only requires instantiating it. Instantiating - `MetricsEndpointAggregator` is similar to `MetricsEndpointProvider` except - that it requires specifying the names of three relations: the - relation with scrape targets, the relation for alert rules, and - that with the Prometheus charms. For example - - ```python - self._aggregator = MetricsEndpointAggregator( - self, - { - "prometheus": "monitoring", - "scrape_target": "prometheus-target", - "alert_rules": "prometheus-rules" - } - ) - ``` - - `MetricsEndpointAggregator` assumes that each unit of a scrape target - sets in its unit-level relation data two entries with keys - "hostname" and "port". 
If it is required to integrate with charms - that do not honor these assumptions, it is always possible to - derive from `MetricsEndpointAggregator` overriding the `_get_targets()` - method, which is responsible for aggregating the unit name, host - address ("hostname") and port of the scrape target. - - `MetricsEndpointAggregator` also assumes that each unit of a - scrape target sets in its unit-level relation data a key named - "groups". The value of this key is expected to be the string - representation of list of Prometheus Alert rules in YAML format. - An example of a single such alert rule is - - ```yaml - - alert: HighRequestLatency - expr: job:request_latency_seconds:mean5m{job="myjob"} > 0.5 - for: 10m - labels: - severity: page - annotations: - summary: High request latency - ``` - - Once again if it is required to integrate with charms that do not - honour these assumptions about alert rules then an object derived - from `MetricsEndpointAggregator` may be used by overriding the - `_get_alert_rules()` method. - - `MetricsEndpointAggregator` ensures that Prometheus scrape job - specifications and alert rules are annotated with Juju topology - information, just like `MetricsEndpointProvider` and - `MetricsEndpointConsumer` do. - - By default `MetricsEndpointAggregator` ensures that Prometheus - "instance" labels refer to Juju topology. This ensures that - instance labels are stable over unit recreation. While it is not - advisable to change this option, if required it can be done by - setting the "relabel_instance" keyword argument to `False` when - constructing an aggregator object. - """ - - def __init__(self, charm, relation_names, relabel_instance=True): - """Construct a `MetricsEndpointAggregator`. - - Args: - charm: a `CharmBase` object that manages this - `MetricsEndpointAggregator` object. Typically this is - `self` in the instantiating class. - relation_names: a dictionary with three keys. The value - of the "scrape_target" and "alert_rules" keys are - the relation names over which scrape job and alert rule - information is gathered by this `MetricsEndpointAggregator`. - And the value of the "prometheus" key is the name of - the relation with a `MetricsEndpointConsumer` such as - the Prometheus charm. - relabel_instance: A boolean flag indicating if Prometheus - scrape job "instance" labels must refer to Juju Topology. 
- """ - super().__init__(charm, relation_names["prometheus"]) - - self._charm = charm - self._target_relation = relation_names["scrape_target"] - self._prometheus_relation = relation_names["prometheus"] - self._alert_rules_relation = relation_names["alert_rules"] - self._relabel_instance = relabel_instance - - # manage Prometheus charm relation events - prometheus_events = self._charm.on[self._prometheus_relation] - self.framework.observe(prometheus_events.relation_joined, self._set_prometheus_data) - - # manage list of Prometheus scrape jobs from related scrape targets - target_events = self._charm.on[self._target_relation] - self.framework.observe(target_events.relation_changed, self._update_prometheus_jobs) - self.framework.observe(target_events.relation_departed, self._remove_prometheus_jobs) - - # manage alert rules for Prometheus from related scrape targets - alert_rule_events = self._charm.on[self._alert_rules_relation] - self.framework.observe(alert_rule_events.relation_changed, self._update_alert_rules) - self.framework.observe(alert_rule_events.relation_departed, self._remove_alert_rules) - - def _set_prometheus_data(self, event): - """Ensure every new Prometheus instances is updated. - - Any time a new Prometheus unit joins the relation with - `MetricsEndpointAggregator`, that Prometheus unit is provided - with the complete set of existing scrape jobs and alert rules. - """ - jobs = [] # list of scrape jobs, one per relation - for relation in self.model.relations[self._target_relation]: - targets = self._get_targets(relation) - if targets: - jobs.append(self._static_scrape_job(targets, relation.app.name)) - - groups = [] # list of alert rule groups, one group per relation - for relation in self.model.relations[self._alert_rules_relation]: - unit_rules = self._get_alert_rules(relation) - if unit_rules: - appname = relation.app.name - rules = self._label_alert_rules(unit_rules, appname) - group = {"name": self._group_name(appname), "rules": rules} - groups.append(group) - - event.relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) - event.relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups}) - - def _set_target_job_data(self, targets: dict, app_name: str, **kwargs) -> None: - """Update scrape jobs in response to scrape target changes. - - When there is any change in relation data with any scrape - target, the Prometheus scrape job, for that specific target is - updated. Additionally, if this method is called manually, do the - sameself. - - Args: - targets: a `dict` containing target information - app_name: a `str` identifying the application - """ - # new scrape job for the relation that has changed - updated_job = self._static_scrape_job(targets, app_name, **kwargs) - - for relation in self.model.relations[self._prometheus_relation]: - jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) - # list of scrape jobs that have not changed - jobs = [job for job in jobs if updated_job["job_name"] != job["job_name"]] - jobs.append(updated_job) - relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) - - def _update_prometheus_jobs(self, event): - """Update scrape jobs in response to scrape target changes. - - When there is any change in relation data with any scrape - target, the Prometheus scrape job, for that specific target is - updated. 
- """ - targets = self._get_targets(event.relation) - if not targets: - return - - # new scrape job for the relation that has changed - updated_job = self._static_scrape_job(targets, event.relation.app.name) - - for relation in self.model.relations[self._prometheus_relation]: - jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) - # list of scrape jobs that have not changed - jobs = [job for job in jobs if updated_job["job_name"] != job["job_name"]] - jobs.append(updated_job) - relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) - - def _remove_prometheus_jobs(self, event): - """Remove scrape jobs when a target departs. - - Any time a scrape target departs, any Prometheus scrape job - associated with that specific scrape target is removed. - """ - job_name = self._job_name(event.relation.app.name) - unit_name = event.unit.name - - for relation in self.model.relations[self._prometheus_relation]: - jobs = json.loads(relation.data[self._charm.app].get("scrape_jobs", "[]")) - if not jobs: - continue - - changed_job = [j for j in jobs if j.get("job_name") == job_name] - if not changed_job: - continue - changed_job = changed_job[0] - - # list of scrape jobs that have not changed - jobs = [job for job in jobs if job.get("job_name") != job_name] - - # list of scrape jobs for units of the same application that still exist - configs_kept = [ - config - for config in changed_job["static_configs"] # type: ignore - if config.get("labels", {}).get("juju_unit") != unit_name - ] - - if configs_kept: - changed_job["static_configs"] = configs_kept # type: ignore - jobs.append(changed_job) - - relation.data[self._charm.app]["scrape_jobs"] = json.dumps(jobs) - - def _update_alert_rules(self, event): - """Update alert rules in response to scrape target changes. - - When there is any change in alert rule relation data for any - scrape target, the list of alert rules for that specific - target is updated. - """ - unit_rules = self._get_alert_rules(event.relation) - if not unit_rules: - return - - appname = event.relation.app.name - rules = self._label_alert_rules(unit_rules, appname) - # the alert rule group that has changed - updated_group = {"name": self._group_name(appname), "rules": rules} - - for relation in self.model.relations[self._prometheus_relation]: - alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}")) - groups = alert_rules.get("groups", []) - # list of alert rule groups that have not changed - groups = [group for group in groups if updated_group["name"] != group["name"]] - groups.append(updated_group) - relation.data[self._charm.app]["alert_rules"] = json.dumps({"groups": groups}) - - def _remove_alert_rules(self, event): - """Remove alert rules for departed targets. - - Any time a scrape target departs any alert rules associated - with that specific scrape target is removed. 
- """ - group_name = self._group_name(event.relation.app.name) - unit_name = event.unit.name - - for relation in self.model.relations[self._prometheus_relation]: - alert_rules = json.loads(relation.data[self._charm.app].get("alert_rules", "{}")) - if not alert_rules: - continue - - groups = alert_rules.get("groups", []) - if not groups: - continue - - changed_group = [group for group in groups if group["name"] == group_name] - if not changed_group: - continue - changed_group = changed_group[0] - - # list of alert rule groups that have not changed - groups = [group for group in groups if group["name"] != group_name] - - # list of alert rules not associated with departing unit - rules_kept = [ - rule - for rule in changed_group.get("rules") # type: ignore - if rule.get("labels").get("juju_unit") != unit_name - ] - - if rules_kept: - changed_group["rules"] = rules_kept # type: ignore - groups.append(changed_group) - - relation.data[self._charm.app]["alert_rules"] = ( - json.dumps({"groups": groups}) if groups else "{}" - ) - - def _get_targets(self, relation) -> dict: - """Fetch scrape targets for a relation. - - Scrape target information is returned for each unit in the - relation. This information contains the unit name, network - hostname (or address) for that unit, and port on which an - metrics endpoint is exposed in that unit. - - Args: - relation: an `ops.model.Relation` object for which scrape - targets are required. - - Returns: - a dictionary whose keys are names of the units in the - relation. There values associated with each key is itself - a dictionary of the form - ``` - {"hostname": hostname, "port": port} - ``` - """ - targets = {} - for unit in relation.units: - port = relation.data[unit].get("port", 80) - hostname = relation.data[unit].get("hostname") - if hostname: - targets.update({unit.name: {"hostname": hostname, "port": port}}) - - return targets - - def _get_alert_rules(self, relation) -> dict: - """Fetch alert rules for a relation. - - Each unit of the related scrape target may have its own - associated alert rules. Alert rules for all units are returned - indexed by unit name. - - Args: - relation: an `ops.model.Relation` object for which alert - rules are required. - - Returns: - a dictionary whose keys are names of the units in the - relation. There values associated with each key is a list - of alert rules. Each rule is in dictionary format. The - structure "rule dictionary" corresponds to single - Prometheus alert rule. - """ - rules = {} - for unit in relation.units: - unit_rules = yaml.safe_load(relation.data[unit].get("groups", "")) - if unit_rules: - rules.update({unit.name: unit_rules}) - - return rules - - def _job_name(self, appname) -> str: - """Construct a scrape job name. - - Each relation has its own unique scrape job name. All units in - the relation are scraped as part of the same scrape job. - - Args: - appname: string name of a related application. - - Returns: - a string Prometheus scrape job name for the application. - """ - return "juju_{}_{}_{}_prometheus_scrape".format( - self.model.name, self.model.uuid[:7], appname - ) - - def _group_name(self, appname) -> str: - """Construct name for an alert rule group. - - Each unit in a relation may define its own alert rules. All - rules, for all units in a relation are grouped together and - given a single alert rule group name. - - Args: - appname: string name of a related application. - - Returns: - a string Prometheus alert rules group name for the application. 
- """ - return "juju_{}_{}_{}_alert_rules".format(self.model.name, self.model.uuid[:7], appname) - - def _label_alert_rules(self, unit_rules, appname) -> list: - """Apply juju topology labels to alert rules. - - Args: - unit_rules: a list of alert rules, where each rule is in - dictionary format. - appname: a string name of the application to which the - alert rules belong. - - Returns: - a list of alert rules with Juju topology labels. - """ - labeled_rules = [] - for unit_name, rules in unit_rules.items(): - for rule in rules: - rule["labels"].update( - AggregatorTopology.create( - self.model.name, self.model.uuid, appname, unit_name - ).as_promql_label_dict() - ) - labeled_rules.append(rule) - - return labeled_rules - - def _static_scrape_job(self, targets, application_name, **kwargs) -> dict: - """Construct a static scrape job for an application. - - Args: - targets: a dictionary providing hostname and port for all - scrape target. The keys of this dictionary are unit - names. Values corresponding to these keys are - themselves a dictionary with keys "hostname" and - "port". - application_name: a string name of the application for - which this static scrape job is being constructed. - - Returns: - A dictionary corresponding to a Prometheus static scrape - job configuration for one application. The returned - dictionary may be transformed into YAML and appended to - the list of any existing list of Prometheus static configs. - """ - juju_model = self.model.name - juju_model_uuid = self.model.uuid - job = { - "job_name": self._job_name(application_name), - "static_configs": [ - { - "targets": ["{}:{}".format(target["hostname"], target["port"])], - "labels": { - "juju_model": juju_model, - "juju_model_uuid": juju_model_uuid, - "juju_application": application_name, - "juju_unit": unit_name, - "host": target["hostname"], - }, - } - for unit_name, target in targets.items() - ], - "relabel_configs": self._relabel_configs + kwargs.get("relabel_configs", []), - } - job.update(kwargs.get("updates", {})) - - return job - - @property - def _relabel_configs(self) -> list: - """Create Juju topology relabeling configuration. - - Using Juju topology for instance labels ensures that these - labels are stable across unit recreation. - - Returns: - a list of Prometheus relabling configurations. Each item in - this list is one relabel configuration. 
- """ - return ( - [ - { - "source_labels": [ - "juju_model", - "juju_model_uuid", - "juju_application", - "juju_unit", - ], - "separator": "_", - "target_label": "instance", - "regex": "(.*)", - } - ] - if self._relabel_instance - else [] - ) - - -class PromqlTransformer: - """Uses promql-transform to inject label matchers into alert rule expressions.""" - - _path = None - _disabled = False - - @property - def path(self): - """Lazy lookup of the path of promql-transform.""" - if self._disabled: - return None - if not self._path: - self._path = self._get_transformer_path() - if not self._path: - logger.debug("Skipping injection of juju topology as label matchers") - self._disabled = True - return self._path - - def __init__(self, charm): - self._charm = charm - - def apply_label_matchers(self, rules): - """Will apply label matchers to the expression of all alerts in all supplied groups.""" - if not self.path: - return rules - for group in rules["groups"]: - rules_in_group = group.get("rules", []) - for rule in rules_in_group: - topology = {} - # if the user for some reason has provided juju_unit, we'll need to honor it - # in most cases, however, this will be empty - for label in [ - "juju_model", - "juju_model_uuid", - "juju_application", - "juju_charm", - "juju_unit", - ]: - if label in rule["labels"]: - topology[label] = rule["labels"][label] - - rule["expr"] = self._apply_label_matcher(rule["expr"], topology) - return rules - - def _apply_label_matcher(self, expression, topology): - if not topology: - return expression - if not self.path: - logger.debug( - "`promql-transform` unavailable. leaving expression unchanged: %s", expression - ) - return expression - args = [str(self.path)] - args.extend( - ["--label-matcher={}={}".format(key, value) for key, value in topology.items()] - ) - - args.extend(["{}".format(expression)]) - # noinspection PyBroadException - try: - return self._exec(args) - except Exception as e: - logger.debug('Applying the expression failed: "{}", falling back to the original', e) - return expression - - def _get_transformer_path(self) -> Optional[Path]: - arch = platform.processor() - arch = "amd64" if arch == "x86_64" else arch - res = "promql-transform-{}".format(arch) - try: - path = self._charm.model.resources.fetch(res) - os.chmod(path, 0o777) - return path - except NotImplementedError: - logger.debug("System lacks support for chmod") - except (NameError, ModelError): - logger.debug('No resource available for the platform "{}"'.format(arch)) - return None - - def _exec(self, cmd): - result = subprocess.run(cmd, check=False, stdout=subprocess.PIPE) - output = result.stdout.decode("utf-8").strip() - return output diff --git a/lib/charms/zookeeper/v0/client.py b/lib/charms/zookeeper/v0/client.py new file mode 100644 index 00000000..173a8a56 --- /dev/null +++ b/lib/charms/zookeeper/v0/client.py @@ -0,0 +1,490 @@ +#!/usr/bin/env python3 +# Copyright 2022 Canonical Ltd. +# See LICENSE file for licensing details. + +"""ZooKeeperManager and ZooKeeperClient classes + +`ZooKeeperManager` provides an interface for performing actions that requires +a connection to the current ZK quorum leader, e.g updating zNodes, ACLs and quorum members. +On `__init__`, it loops through all passed hosts, attempts a `ZooKeeperClient` connection, and +checks leadership of each unit, storing the current quorum leader host as an attribute. 
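+If no quorum leader can be reached, construction fails with a `QuorumLeaderNotFoundError`.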
+
+In most cases, custom `Exception`s raised by `ZooKeeperManager` should trigger an `event.defer()`,
+as they indicate that the servers are not ready to have actions performed upon them just yet.
+
+`ZooKeeperClient` serves as a handler for managing a ZooKeeper client connection to a
+single unit. Its methods contain common 4lw commands, and functionality to read/write
+to specific zNodes.
+It is not expected to use this class directly from charm code,
+but to instead use the `ZooKeeperManager` class to perform its actions on the ZK servers.
+
+
+Instances of `ZooKeeperManager` are to be created by methods in either the `Charm` itself,
+or from another library.
+
+Example usage for `ZooKeeperManager`:
+
+```python
+
+def update_cluster(new_members: List[str], event: EventBase) -> None:
+
+    try:
+        zk = ZooKeeperManager(
+            hosts=["10.141.73.20", "10.141.73.21"],
+            client_port=2181,
+            username="super",
+            password="password"
+        )
+
+        current_quorum_members = zk.server_members
+
+        servers_to_remove = list(current_quorum_members - set(new_members))
+        zk.remove_members(servers_to_remove)
+
+        servers_to_add = sorted(set(new_members) - current_quorum_members)
+        zk.add_members(servers_to_add)
+
+    except (
+        MembersSyncingError,
+        MemberNotReadyError,
+        QuorumLeaderNotFoundError,
+    ) as e:
+        logger.info(str(e))
+        event.defer()
+        return
+```
+"""
+
+import logging
+import re
+from typing import Any, Dict, Iterable, List, Set, Tuple
+
+from kazoo.client import ACL, KazooClient
+from kazoo.handlers.threading import KazooTimeoutError
+from tenacity import RetryError, retry
+from tenacity.retry import retry_if_not_result
+from tenacity.stop import stop_after_attempt
+from tenacity.wait import wait_fixed
+
+# The unique Charmhub library identifier, never change it
+LIBID = "4dc4430e6e5d492699391f57bd697fce"
+
+# Increment this major API version when introducing breaking changes
+LIBAPI = 0
+
+# Increment this PATCH version before using `charmcraft publish-lib` or reset
+# to 0 if you are raising the major API version
+LIBPATCH = 2
+
+
+logger = logging.getLogger(__name__)
+
+# Kazoo logs are unbearably chatty
+logging.getLogger("kazoo.client").disabled = True
+
+
+class MembersSyncingError(Exception):
+    """Generic exception for when quorum members are syncing data."""
+
+    pass
+
+
+class MemberNotReadyError(Exception):
+    """Generic exception for when a zk unit can't be connected to or is not broadcasting."""
+
+    pass
+
+
+class QuorumLeaderNotFoundError(Exception):
+    """Generic exception for when there are no zk leaders in the app."""
+
+    pass
+
+
+class ZooKeeperManager:
+    """Handler for performing ZK commands."""
+
+    def __init__(
+        self,
+        hosts: List[str],
+        username: str,
+        password: str,
+        client_port: int = 2181,
+    ):
+        self.hosts = hosts
+        self.username = username
+        self.password = password
+        self.client_port = client_port
+        self.leader = ""
+
+        try:
+            self.leader = self.get_leader()
+        except RetryError:
+            raise QuorumLeaderNotFoundError("quorum leader not found")
+
+    @retry(
+        wait=wait_fixed(3),
+        stop=stop_after_attempt(2),
+        retry=retry_if_not_result(lambda result: True if result else False),
+    )
+    def get_leader(self) -> str:
+        """Attempts to find the current ZK quorum leader.
+
+        In the case when there is a leadership election, this may fail.
+        When this happens, we attempt 1 retry after 3 seconds.
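+        Leadership is checked by issuing the `srvr` 4lw command against each host and
+        looking for `Mode: leader` in the response; hosts that time out (for example,
+        dead units still present in relation data) are skipped.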
+ + Returns: + String of the host for the quorum leader + + Raises: + tenacity.RetryError: if the leader can't be found during the retry conditions + """ + leader = None + for host in self.hosts: + try: + with ZooKeeperClient( + host=host, + client_port=self.client_port, + username=self.username, + password=self.password, + ) as zk: + response = zk.srvr + if response.get("Mode") == "leader": + leader = host + break + except KazooTimeoutError: # in the case of having a dead unit in relation data + logger.debug(f"TIMEOUT - {host}") + continue + + return leader or "" + + @property + def server_members(self) -> Set[str]: + """The current members within the ZooKeeper quorum. + + Returns: + A set of ZK member strings + e.g {"server.1=10.141.78.207:2888:3888:participant;0.0.0.0:2181"} + """ + with ZooKeeperClient( + host=self.leader, + client_port=self.client_port, + username=self.username, + password=self.password, + ) as zk: + members, _ = zk.config + + return set(members) + + @property + def config_version(self) -> int: + """The current config version for ZooKeeper. + + Returns: + The zookeeper config version decoded from base16 + """ + with ZooKeeperClient( + host=self.leader, + client_port=self.client_port, + username=self.username, + password=self.password, + ) as zk: + _, version = zk.config + + return version + + @property + def members_syncing(self) -> bool: + """Flag to check if any quorum members are currently syncing data. + + Returns: + True if any members are syncing. Otherwise False. + """ + with ZooKeeperClient( + host=self.leader, + client_port=self.client_port, + username=self.username, + password=self.password, + ) as zk: + result = zk.mntr + if ( + result.get("zk_peer_state", "") == "leading - broadcast" + and result["zk_pending_syncs"] == "0" + ): + return False + return True + + def add_members(self, members: Iterable[str]) -> None: + """Adds new members to the members' dynamic config. + + Raises: + MembersSyncingError: if any members are busy syncing data + MemberNotReadyError: if any members are not yet broadcasting + """ + if self.members_syncing: + raise MembersSyncingError("Unable to add members - some members are syncing") + + for member in members: + host = member.split("=")[1].split(":")[0] + + try: + # individual connections to each server + with ZooKeeperClient( + host=host, + client_port=self.client_port, + username=self.username, + password=self.password, + ) as zk: + if not zk.is_ready: + raise MemberNotReadyError(f"Server is not ready: {host}") + except KazooTimeoutError as e: # for when units are departing + logger.debug(str(e)) + continue + + # specific connection to leader + with ZooKeeperClient( + host=self.leader, + client_port=self.client_port, + username=self.username, + password=self.password, + ) as zk: + zk.client.reconfig( + joining=member, leaving=None, new_members=None, from_config=self.config_version + ) + + def remove_members(self, members: Iterable[str]): + """Removes members from the members' dynamic config. 
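+        Members are expected in the same string format as returned by `server_members`,
+        e.g "server.1=10.141.78.207:2888:3888:participant;0.0.0.0:2181". The numeric
+        server id is parsed from each member string and removed from the quorum via a
+        `reconfig` call on the current quorum leader.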
+ + Raises: + MembersSyncingError: if any members are busy syncing data + """ + if self.members_syncing: + raise MembersSyncingError("Unable to remove members - some members are syncing") + + for member in members: + member_id = re.findall(r"server.([1-9]+)", member)[0] + with ZooKeeperClient( + host=self.leader, + client_port=self.client_port, + username=self.username, + password=self.password, + ) as zk: + zk.client.reconfig( + joining=None, + leaving=member_id, + new_members=None, + from_config=self.config_version, + ) + + def leader_znodes(self, path: str) -> Set[str]: + """Grabs all children zNodes for a path on the current quorum leader. + + Args: + path: the 'root' path to search from + + Returns: + Set of all nested child zNodes + """ + with ZooKeeperClient( + host=self.leader, + client_port=self.client_port, + username=self.username, + password=self.password, + ) as zk: + all_znode_children = zk.get_all_znode_children(path=path) + + return all_znode_children + + def create_znode_leader(self, path: str, acls: List[ACL]) -> None: + """Creates a new zNode on the current quorum leader with given ACLs. + + Args: + path: the zNode path to set + acls: the ACLs to be set on that path + """ + with ZooKeeperClient( + host=self.leader, + client_port=self.client_port, + username=self.username, + password=self.password, + ) as zk: + zk.create_znode(path=path, acls=acls) + + def set_acls_znode_leader(self, path: str, acls: List[ACL]) -> None: + """Updates ACLs for an existing zNode on the current quorum leader. + + Args: + path: the zNode path to update + acls: the new ACLs to be set on that path + """ + with ZooKeeperClient( + host=self.leader, + client_port=self.client_port, + username=self.username, + password=self.password, + ) as zk: + zk.set_acls(path=path, acls=acls) + + def delete_znode_leader(self, path: str) -> None: + """Deletes a zNode path from the current quorum leader. + + Args: + path: the zNode path to delete + """ + with ZooKeeperClient( + host=self.leader, + client_port=self.client_port, + username=self.username, + password=self.password, + ) as zk: + zk.delete_znode(path=path) + + +class ZooKeeperClient: + """Handler for ZooKeeper connections and running 4lw client commands.""" + + def __init__(self, host: str, client_port: int, username: str, password: str): + self.host = host + self.client_port = client_port + self.username = username + self.password = password + self.client = KazooClient( + hosts=f"{host}:{client_port}", + timeout=1.0, + sasl_options={"mechanism": "DIGEST-MD5", "username": username, "password": password}, + ) + self.client.start() + + def __enter__(self): + return self + + def __exit__(self, object_type, value, traceback): + self.client.stop() + + def _run_4lw_command(self, command: str): + return self.client.command(command.encode()) + + @property + def config(self) -> Tuple[List[str], int]: + """Retrieves the dynamic config for a ZooKeeper service. + + Returns: + Tuple of the decoded config list, and decoded config version + """ + response = self.client.get("/zookeeper/config") + if response: + result = str(response[0].decode("utf-8")).splitlines() + version = int(result.pop(-1).split("=")[1], base=16) + else: + raise + + return result, version + + @property + def srvr(self) -> Dict[str, Any]: + """Retrieves attributes returned from the 'srvr' 4lw command. 
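+        For example, the parsed response resembles (values illustrative only):
+        {"Zookeeper version": "3.6.3", "Mode": "leader", "Node count": "5"}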
+ + Returns: + Mapping of field and setting returned from `srvr` + """ + response = self._run_4lw_command("srvr") + + result = {} + for item in response.splitlines(): + k = re.split(": ", item)[0] + v = re.split(": ", item)[1] + result[k] = v + + return result + + @property + def mntr(self) -> Dict[str, Any]: + """Retrieves attributes returned from the 'mntr' 4lw command. + + Returns: + Mapping of field and setting returned from `mntr` + """ + response = self._run_4lw_command("mntr") + + result = {} + for item in response.splitlines(): + if re.search("=|\\t", item): + k = re.split("=|\\t", item)[0] + v = re.split("=|\\t", item)[1] + result[k] = v + else: + result[item] = "" + + return result + + @property + def is_ready(self) -> bool: + """Flag to confirm connected ZooKeeper server is connected and broadcasting. + + Returns: + True if server is broadcasting. Otherwise False. + """ + if self.client.connected: + return "broadcast" in self.mntr.get("zk_peer_state", "") + return False + + def get_all_znode_children(self, path: str) -> Set[str]: + """Recursively gets all children for a given parent znode path. + + Args: + path: the desired parent znode path to recurse + + Returns: + Set of all nested children znode paths for the given parent + """ + children = self.client.get_children(path) or [] + + result = set() + for child in children: + if path + child != "/zookeeper": + result.update(self.get_all_znode_children(path.rstrip("/") + "/" + child)) + if path != "/": + result.add(path) + + return result + + def delete_znode(self, path: str) -> None: + """Drop znode and all it's children from ZK tree. + + Args: + path: the desired znode path to delete + """ + if not self.client.exists(path): + return + self.client.delete(path, recursive=True) + + def create_znode(self, path: str, acls: List[ACL]) -> None: + """Create new znode. + + Args: + path: the desired znode path to create + acls: the acls for the new znode + """ + self.client.create(path, acl=acls, makepath=True) + + def get_acls(self, path: str) -> List[ACL]: + """Gets acls for a desired znode path. + + Args: + path: the desired znode path + + Returns: + List of the acls set for the given znode + """ + acl_list = self.client.get_acls(path) + + return acl_list if acl_list else [] + + def set_acls(self, path: str, acls: List[ACL]) -> None: + """Sets acls for a desired znode path. + + Args: + path: the desired znode path + acls: the acls to set to the given znode + """ + self.client.set_acls(path, acls) diff --git a/lib/charms/zookeeper_k8s/v0/zookeeper.py b/lib/charms/zookeeper_k8s/v0/zookeeper.py deleted file mode 100644 index 23a5f30c..00000000 --- a/lib/charms/zookeeper_k8s/v0/zookeeper.py +++ /dev/null @@ -1,221 +0,0 @@ -#!/usr/bin/env python3 -# Copyright 2022 Canonical Ltd. -# See LICENSE file for licensing details. -"""ZooKeeper Library. - -This [library](https://juju.is/docs/sdk/libraries) implements both sides of the -`zookeeper` [interface](https://juju.is/docs/sdk/relations). - -The *provider* side of this interface is implemented by the -[zookeeper-k8s Charmed Operator](https://charmhub.io/zookeeper-k8s). - -Any Charmed Operator that *requires* a ZooKeeper database for providing its -service should implement the *requirer* side of this interface. -[kafka-k8s](https://charmhub.io/kafka-k8s) is an example. 
- -In a nutshell using this library to implement a Charmed Operator *requiring* a -ZooKeeper database (and talking to it as a ZooKeeper client) would look like - -``` -$ charmcraft fetch-lib charms.zookeeper_k8s.v0.zookeeper -``` - -`metadata.yaml`: - -``` -requires: - zookeeper: - interface: zookeeper -``` - -`src/charm.py`: - -``` -from charms.zookeeper_k8s.v0.zookeeper import ZooKeeperEvents, ZooKeeperRequires -from ops.charm import CharmBase - - -class MyCharm(CharmBase): - - on = ZooKeeperEvents() - - def __init__(self, *args): - super().__init__(*args) - self.zookeeper = ZooKeeperRequires(self) - self.framework.observe( - self.on.zookeeper_clients_changed, - self._on_zookeeper_clients_changed, - ) - self.framework.observe( - self.on.zookeeper_clients_broken, - self._on_zookeeper_clients_broken, - ) - - def _on_zookeeper_clients_changed(self, event): - # Get zookeeper client addresses - client_addresses: str = self.zookeeper.hosts - # client_addresses => "zk-0:2181,zk-1:2181" - - def _on_zookeeper_clients_broken(self, event): - # Stop service - # ... - self.unit.status = BlockedStatus("need zookeeper relation") -``` - -You can file bugs -[here](https://github.com/canonical/zookeeper-k8s-operator/issues)! -""" - -import logging -from typing import Optional - -from ops.charm import CharmBase, CharmEvents -from ops.framework import EventBase, EventSource, Object -from ops.model import Relation - -# The unique Charmhub library identifier, never change it -LIBID = "0d1db716e5cf45aa9177f4df6ad969ff" - -# Increment this major API version when introducing breaking changes -LIBAPI = 0 - -# Increment this PATCH version before using `charmcraft publish-lib` or reset -# to 0 if you are raising the major API version -LIBPATCH = 13 - - -logger = logging.getLogger(__name__) - -# Relation app data keys -HOSTS_APP_KEY = "hosts" - - -class _ClientsChangedEvent(EventBase): - """Event emitted whenever there is a change in the zookeeper clients.""" - - -class _ClientsBrokenEvent(EventBase): - """Event emitted when the zookeeper clients are not available anymore.""" - - -class ZooKeeperEvents(CharmEvents): - """ZooKeeper events. - - This class defines the events that ZooKeeper can emit. - - Events: - zookeeper_clients_changed (_ClientsBrokenEvent) - """ - - zookeeper_clients_changed = EventSource(_ClientsChangedEvent) - zookeeper_clients_broken = EventSource(_ClientsBrokenEvent) - - -class ZooKeeperRequires(Object): - """ZooKeeper requires relation.""" - - def __init__(self, charm: CharmBase, endpoint_name: str = "zookeeper") -> None: - """Constructor. - - Args: - charm (CharmBase): The charm that implements the relation. - endpoint_name (str): Endpoint name of the relation. - """ - super().__init__(charm, endpoint_name) - self.charm = charm - self._endpoint_name = endpoint_name - - # Observe relation events - event_observe_mapping = { - charm.on[self._endpoint_name].relation_changed: self._on_relation_changed, - charm.on[self._endpoint_name].relation_broken: self._on_relation_broken, - } - for event, observer in event_observe_mapping.items(): - self.framework.observe(event, observer) - - @property - def hosts(self) -> Optional[str]: - """Get zookeeper hosts. - - Returns: - Optional[str]: Comma-listed zookeeper client hosts. 
- """ - hosts = None - relation: Relation = self.framework.model.get_relation(self._endpoint_name) - if relation and relation.app and relation.data and relation.app in relation.data: - hosts = relation.data[relation.app].get(HOSTS_APP_KEY) - return hosts - - def _on_relation_changed(self, _): - self.charm.on.zookeeper_clients_changed.emit() - - def _on_relation_broken(self, _): - self.charm.on.zookeeper_clients_broken.emit() - - -class ZooKeeperProvides(Object): - """ZooKeeper provides relation. - - Example: - class ZooKeeperK8sCharm(CharmBase): - on = ZooKeeperClusterEvents() - - def __init__(self, *args): - super().__init__(*args) - self.cluster = ZooKeeperCluster(self) - self.zookeeper = ZooKeeperProvides(self) - self.framework.observe( - self.on.zookeeper_relation_changed, - self._update_hosts - ) - self.framework.observe(self.on.servers_changed, self._on_servers_changed) - - def _update_hosts(self, _=None): - if self.unit.is_leader(): - self.zookeeper.update_hosts(self.cluster.client_addresses) - - def _on_servers_changed(self, event): - # Reload the service... - self._update_hosts() - """ - - def __init__(self, charm: CharmBase, endpoint_name: str = "zookeeper") -> None: - """Constructor. - - Args: - charm (CharmBase): The charm that implements the relation. - endpoint_name (str): Endpoint name of the relation. - """ - super().__init__(charm, endpoint_name) - self._endpoint_name = endpoint_name - - def update_hosts(self, client_addresses: str, relation_id: Optional[int] = None) -> None: - """Update hosts in the zookeeper relation. - - This method will cause a relation-changed event in the requirer units - of the relation. - - Args: - client_addresses (str): Comma-listed addresses of zookeeper clients. - relation_id (Optional[int]): Id of the relation. If set, it will be used to update - the relation data of the specified relation. If not set, - the data for all the relations will be updated. - """ - relation: Relation - if relation_id: - relation = self.model.get_relation(self._endpoint_name, relation_id) - self._update_hosts_in_relation(relation, client_addresses) - relation.data[self.model.app][HOSTS_APP_KEY] = client_addresses - else: - for relation in self.model.relations[self._endpoint_name]: - self._update_hosts_in_relation(relation, client_addresses) - - def _update_hosts_in_relation(self, relation: Relation, hosts: str) -> None: - """Update hosts relation data if needed. - - Args: - relation (Relation): Relation to be updated. - hosts (str): String with the zookeeper hosts. - """ - if relation.data[self.model.app].get(HOSTS_APP_KEY) != hosts: - relation.data[self.model.app][HOSTS_APP_KEY] = hosts diff --git a/metadata.yaml b/metadata.yaml index 1c80a61c..564a3a74 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -4,14 +4,12 @@ name: kafka-k8s display-name: Kafka K8s summary: | - Kafka is a distributed system consisting of servers and clients that - communicate via a high-performance TCP network protocol. + "" description: | - Apache Kafka is an open-source distributed event streaming platform used by - thousands of companies for high-performance data pipelines, streaming analytics, - data integration, and mission-critical applications. 
+ "" maintainers: - - David Garcia + - Marc Oppenheimer + containers: kafka: resource: kafka-image @@ -19,27 +17,17 @@ containers: resources: kafka-image: type: oci-image - description: OCI image for Kafka K8s - upstream-source: confluentinc/cp-kafka:7.0.1 - jmx-prometheus-jar: - type: file - filename: jmx_prometheus_javaagent.jar - description: JMX Prometheus Java Agent (JAR) + description: OCI Image for Apache Kafka + upstream-source: ubuntu/kafka:latest + +peers: + cluster: + interface: cluster requires: zookeeper: interface: zookeeper - limit: 1 provides: kafka: interface: kafka - metrics-endpoint: - interface: prometheus_scrape - grafana-dashboard: - interface: grafana_dashboard - -storage: - data: - type: filesystem - location: /var/lib/kafka diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index e6007771..00000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,2 +0,0 @@ -# Copyright 2022 Canonical Ltd. -# See LICENSE file for licensing details. \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index d6a73ecc..ea181d88 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ -# Copyright 2022 Canonical Ltd. -# See LICENSE file for licensing details. -ops >= 1.2.0 -lightkube -lightkube-models +ops >= 1.5.0 +kazoo >= 2.8.0 +tenacity >= 8.0.1 +pure-sasl >= 0.6.2 diff --git a/src/charm.py b/src/charm.py index f7e5db14..286d2b0f 100755 --- a/src/charm.py +++ b/src/charm.py @@ -2,316 +2,207 @@ # Copyright 2022 Canonical Ltd. # See LICENSE file for licensing details. -"""Kafka K8s charm module.""" +"""Charmed Machine Operator for Apache Kafka.""" import logging -from typing import Any, Dict - -from charms.grafana_k8s.v0.grafana_dashboard import GrafanaDashboardProvider -from charms.kafka_k8s.v0.kafka import KafkaProvides -from charms.observability_libs.v1.kubernetes_service_patch import KubernetesServicePatch -from charms.prometheus_k8s.v0.prometheus_scrape import MetricsEndpointProvider -from charms.zookeeper_k8s.v0.zookeeper import ZooKeeperEvents, ZooKeeperRequires -from lightkube.models.core_v1 import ServicePort -from ops.charm import CharmBase, RelationJoinedEvent -from ops.framework import StoredState -from ops.main import main -from ops.model import ( - ActiveStatus, - BlockedStatus, - Container, - MaintenanceStatus, - ModelError, - StatusBase, - WaitingStatus, -) -from ops.pebble import PathError, ServiceStatus - -logger = logging.getLogger(__name__) - -KAFKA_PORT = 9092 - - -def _convert_key_to_confluent_syntax(key: str) -> str: - new_key = key.replace("_", "___").replace("-", "__").replace(".", "_") - new_key = "".join([f"_{char}" if char.isupper() else char for char in new_key]) - return f"KAFKA_{new_key.upper()}" +import secrets +import string +from typing import List +from ops.charm import CharmBase, RelationEvent, RelationJoinedEvent +from ops.framework import EventBase +from ops.main import main +from ops.model import ActiveStatus, BlockedStatus, Container, Relation, WaitingStatus +from ops.pebble import ExecError, Layer -class CharmError(Exception): - """Charm Error Exception.""" +from config import KafkaConfig +from connection_check import broker_active, zookeeper_connected +from literals import CHARM_KEY, PEER, ZOOKEEPER_REL_NAME +from provider import KafkaProvider - def __init__(self, message: str, status: StatusBase = BlockedStatus) -> None: - self.message = message - self.status = status +logger = logging.getLogger(__name__) class KafkaK8sCharm(CharmBase): - """Kafka K8s Charm operator.""" - - on = 
ZooKeeperEvents() - _stored = StoredState() + """Charmed Operator for Kafka K8s.""" def __init__(self, *args): super().__init__(*args) - self.kafka = KafkaProvides(self) - self.zookeeper = ZooKeeperRequires(self) - - # Observe charm events - event_observe_mapping = { - self.on.kafka_pebble_ready: self._on_config_changed, - self.on.config_changed: self._on_config_changed, - self.on.update_status: self._on_update_status, - self.on.zookeeper_clients_changed: self._on_config_changed, - self.on.zookeeper_clients_broken: self._on_zookeeper_clients_broken, - self.on.kafka_relation_joined: self._on_kafka_relation_joined, - } - for event, observer in event_observe_mapping.items(): - self.framework.observe(event, observer) - - # Stored State - self._stored.set_default(kafka_started=False) - - # Patch K8s service port - port = ServicePort(KAFKA_PORT, name=f"{self.app.name}") - self.service_patcher = KubernetesServicePatch(self, [port]) - - # Prometheus and Grafana integration - self.metrics_endpoint = MetricsEndpointProvider( - self, jobs=[{"static_configs": [{"targets": ["*:1234"]}]}] + self.name = CHARM_KEY + self.kafka_config = KafkaConfig(self) + self.client_relations = KafkaProvider(self) + + self.framework.observe(getattr(self.on, "kafka_pebble_ready"), self._on_kafka_pebble_ready) + self.framework.observe(getattr(self.on, "leader_elected"), self._on_leader_elected) + self.framework.observe( + self.on[ZOOKEEPER_REL_NAME].relation_joined, self._on_zookeeper_joined + ) + self.framework.observe( + self.on[ZOOKEEPER_REL_NAME].relation_changed, self._on_kafka_pebble_ready + ) + self.framework.observe( + self.on[ZOOKEEPER_REL_NAME].relation_departed, self._on_zookeeper_broken + ) + self.framework.observe( + self.on[ZOOKEEPER_REL_NAME].relation_broken, self._on_zookeeper_broken ) - self.grafana_dashboard_provider = GrafanaDashboardProvider(self) - - # --------------------------------------------------------------------------- - # Properties - # --------------------------------------------------------------------------- @property - def kafka_properties(self) -> Dict[str, Any]: - """Get Kafka environment variables. - - This function uses the configuration kafka-properties to generate the - environment variables needed to configure Kafka and in the format expected - by the container. - - Returns: - Dictionary with the environment variables needed for Kafka container. 
- """ - envs = {} - for kafka_property in self.config["kafka-properties"].splitlines(): - if "=" not in kafka_property: - continue - key, value = kafka_property.strip().split("=") - key = _convert_key_to_confluent_syntax(key) - envs[key] = value - return envs - - # --------------------------------------------------------------------------- - # Handlers for Charm Events - # --------------------------------------------------------------------------- - - def _on_config_changed(self, _) -> None: - """Handler for the config-changed event.""" - try: - self._validate_config() - self._check_relations() - container: Container = self.unit.get_container("kafka") - self._check_container_ready(container) - # Add Pebble layer with the kafka service - self._patch_entrypoint(container) - self._setup_metrics(container) - container.add_layer("kafka", self._get_kafka_layer(), combine=True) - container.replan() - - # Update kafka information - if not self._stored.kafka_started and self.unit.is_leader(): - self.kafka.set_host_info(self.app.name, KAFKA_PORT) - self._stored.kafka_started = True - - # Update charm status - self._on_update_status() - except CharmError as e: - logger.debug(e.message) - self.unit.status = e.status(e.message) - - def _on_update_status(self, _=None) -> None: - """Handler for the update-status event.""" - try: - self._check_relations() - container: Container = self.unit.get_container("kafka") - self._check_container_ready(container) - self._check_service_configured(container) - self._check_service_active(container) - self.unit.status = ActiveStatus() - except CharmError as e: - logger.debug(e.message) - self.unit.status = e.status(e.message) - - def _on_zookeeper_clients_broken(self, _) -> None: - """Handler for the zookeeper-clients-broken event.""" - # Check Pebble has started in the container - container: Container = self.unit.get_container("kafka") - if ( - container.can_connect() - and "kafka" in container.get_plan().services - and container.get_service("kafka").current == ServiceStatus.ACTIVE - ): - logger.debug("Stopping kafka service") - container.stop("kafka") - - # Update charm status - self.unit.status = BlockedStatus("need zookeeper relation") - - def _on_kafka_relation_joined(self, event: RelationJoinedEvent) -> None: - """Handler for the kafka-relation-joined event.""" - if self._stored.kafka_started and self.unit.is_leader(): - self.kafka.set_host_info(self.app.name, KAFKA_PORT, event.relation) - - # --------------------------------------------------------------------------- - # Validation and configuration - # --------------------------------------------------------------------------- - - def _validate_config(self) -> None: - """Validate charm configuration. - - Raises: - CharmError: if charm configuration is invalid. - """ - pass + def container(self) -> Container: + """Grabs the current Kafka container.""" + return self.unit.get_container(CHARM_KEY) - def _check_relations(self) -> None: - """Check required relations. + @property + def _kafka_layer(self) -> Layer: + """Returns a Pebble configuration layer for Kafka.""" + layer_config = { + "summary": "kafka layer", + "description": "Pebble config layer for kafka", + "services": { + CHARM_KEY: { + "override": "replace", + "summary": "kafka", + "command": self.kafka_config.kafka_command, + "startup": "enabled", + "environment": {"KAFKA_OPTS": self.kafka_config.extra_args}, + } + }, + } + return Layer(layer_config) - Raises: - CharmError: if relations are missing. 
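`_kafka_layer` only describes the service; it still has to be added to the workload container and replanned before Kafka actually runs, which the `kafka_pebble_ready` handler further down does. A minimal sketch of that Pebble pattern in isolation, assuming `CHARM_KEY` resolves to the `kafka` container and service name declared in `metadata.yaml`.

```python
# Sketch only: applying a layer like `_kafka_layer` to the workload container.
# `charm` is assumed to be a KafkaK8sCharm instance.
from ops.pebble import ServiceStatus


def start_kafka(charm) -> bool:
    container = charm.unit.get_container("kafka")
    if not container.can_connect():
        return False  # Pebble not up yet; the caller should defer the event

    # combine=True merges with any previously added "kafka" layer, keeping this idempotent
    container.add_layer("kafka", charm._kafka_layer, combine=True)
    # starts 'startup: enabled' services and restarts any whose config changed
    container.replan()

    return container.get_service("kafka").current == ServiceStatus.ACTIVE
```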
- """ - if not self.zookeeper.hosts: - raise CharmError("need zookeeper relation") + @property + def peer_relation(self) -> Relation: + """The Kafka peer relation.""" + return self.model.get_relation(PEER) - def _check_container_ready(self, container: Container) -> None: - """Check Pebble has started in the container. + def run_bin_command(self, bin_keyword: str, bin_args: List[str], extra_args: str) -> str: + """Runs kafka bin command with desired args. Args: - container (Container): Container to be checked. + bin_keyword: the kafka shell script to run + e.g `configs`, `topics` etc + bin_args: the shell command args + extra_args (optional): the desired `KAFKA_OPTS` env var values for the command - Raises: - CharmError: if container is not ready. + Returns: + String of kafka bin command output """ - if not container.can_connect(): - raise CharmError("waiting for pebble to start", MaintenanceStatus) + environment = {"KAFKA_OPTS": extra_args} + command = [f"/opt/kafka/bin/kafka-{bin_keyword}.sh"] + bin_args - def _check_service_configured(self, container: Container) -> None: - """Check if kafka service has been successfully configured. - - Args: - container (Container): Container to be checked. + try: + process = self.container.exec(command=command, environment=environment) + output, _ = process.wait_output() + logger.debug(f"{output=}") + return output + except (ExecError) as e: + logger.debug(f"cmd failed:\ncommand={e.command}\nstdout={e.stdout}\nstderr={e.stderr}") + raise e + + def _on_kafka_pebble_ready(self, event: EventBase) -> None: + """Handler for `kafka_pebble_ready` event.""" + if not self.container.can_connect(): + event.defer() + return - Raises: - CharmError: if kafka service has not been configured. - """ - if "kafka" not in container.get_plan().services: - raise CharmError("kafka service not configured yet", WaitingStatus) + if not zookeeper_connected(charm=self): + self.unit.status = WaitingStatus("waiting for zookeeper relation") + return - def _check_service_active(self, container: Container) -> None: - """Check if the kafka service is running. + # required settings given zookeeper connection config has been created + self.kafka_config.set_server_properties() + self.kafka_config.set_jaas_config() - Raises: - CharmError: if kafka service is not running. - """ - if container.get_service("kafka").current != ServiceStatus.ACTIVE: - raise CharmError("kafka service is not running") - - def _patch_entrypoint(self, container: Container) -> None: - """Patch entrypoint. + # do not start units until SCRAM users have been added to ZooKeeper for server-server auth + if self.unit.is_leader() and self.kafka_config.sync_password: + try: + self.add_user_to_zookeeper( + username="sync", password=self.kafka_config.sync_password + ) + self.peer_relation.data[self.app].update({"broker-creds": "added"}) + except ExecError: + # command to add users fails sometimes for unknown reasons. Retry seems to fix it. + event.defer() + return + + # for non-leader units + if not self.peer_relation.data[self.app].get("broker-creds", None): + logger.debug("broker-creds not yet added to zookeeper") + event.defer() + return - This function pushes what will be the main entrypoint for the kafka service. - The pushed entrypoint is a wrapper to the default one, that unsets the environment - variables that Kubernetes autommatically creates that conflict with the expected - environment variables by the container. 
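`run_bin_command` is a thin wrapper around `Container.exec` for the stock Kafka CLI scripts shipped in the image. A hypothetical read-only call through the same wrapper, for example from an action or debug handler; the `--describe` invocation and the surrounding function are illustrative, and only the `configs` keyword is actually used by this charm (below, for SCRAM users).

```python
# Hypothetical sketch: a read-only kafka-configs.sh call through run_bin_command.
# `charm` is assumed to be a KafkaK8sCharm instance with a live zookeeper relation.
import logging

logger = logging.getLogger(__name__)


def describe_sync_user(charm) -> None:
    output = charm.run_bin_command(
        bin_keyword="configs",
        bin_args=[
            f"--zookeeper={charm.kafka_config.zookeeper_config['connect']}",
            "--describe",
            "--entity-type=users",
            "--entity-name=sync",
        ],
        extra_args=charm.kafka_config.extra_args,  # JAAS config for SASL auth to ZooKeeper
    )
    for line in output.splitlines():
        logger.info(line)
```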
+ # start kafka service + self.container.add_layer(CHARM_KEY, self._kafka_layer, combine=True) + self.container.replan() - Args: - container (Container): Container where the entrypoint will be pushed. - """ - if self._file_exists(container, "/entrypoint"): + # start_snap_service can fail silently, confirm with ZK if kafka is actually connected + if broker_active( + unit=self.unit, + zookeeper_config=self.kafka_config.zookeeper_config, + ): + logger.info(f'Broker {self.unit.name.split("/")[1]} connected') + self.unit.status = ActiveStatus() + else: + self.unit.status = BlockedStatus("kafka unit not connected to ZooKeeper") return - with open("templates/entrypoint", "r") as f: - container.push( - "/entrypoint", - f.read(), - permissions=0o777, + + def _on_leader_elected(self, _) -> None: + """Handler for `leader_elected` event, ensuring sync_passwords gets set.""" + sync_password = self.kafka_config.sync_password + if not sync_password: + self.peer_relation.data[self.app].update( + { + "sync_password": "".join( + [secrets.choice(string.ascii_letters + string.digits) for _ in range(32)] + ) + } ) - def _setup_metrics(self, container: Container) -> None: - """Setup metrics. + def _on_zookeeper_joined(self, event: RelationJoinedEvent) -> None: + """Handler for `zookeeper_relation_joined` event, ensuring chroot gets set.""" + if self.unit.is_leader(): + event.relation.data[self.app].update({"chroot": "/" + self.app.name}) - Args: - container (Container): Container where the the metrics will be setup. - """ - if self.config.get("metrics"): - container.make_dir("/opt/prometheus", make_parents=True, permissions=0o555) - with open("templates/kafka_broker.yml", "r") as f: - container.push("/opt/prometheus/kafka_broker.yml", f) - try: - resource_path = self.model.resources.fetch("jmx-prometheus-jar") - with open(resource_path, "rb") as f: - container.push("/opt/prometheus/jmx_prometheus_javaagent.jar", f) - except ModelError: - raise CharmError("Missing 'jmx-prometheus-jar' resource") + def _on_zookeeper_broken(self, _: RelationEvent) -> None: + """Handler for `zookeeper_relation_departed/broken` events.""" + logger.info("stopping kafka service") + self.container.stop(CHARM_KEY) + self.unit.status = BlockedStatus("missing required zookeeper relation") - def _get_kafka_layer(self) -> Dict[str, Any]: - """Get Kafka layer for Pebble. + def add_user_to_zookeeper(self, username: str, password: str) -> None: + """Adds user credentials to ZooKeeper for authorising clients and brokers. - Returns: - Dict[str, Any]: Pebble layer. + Raises: + ops.pebble.ExecError: If the command failed """ - env_variables = { - "CHARM_NAME": self.app.name.upper().replace("-", "_"), - "PATH": "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", - "LANG": "C.UTF-8", - "CUB_CLASSPATH": '"/usr/share/java/cp-base-new/*"', - "container": "oci", - "COMPONENT": "kafka", - "KAFKA_ZOOKEEPER_CONNECT": self.zookeeper.hosts, - **self.kafka_properties, - } - if self.config.get("metrics"): - env_variables[ - "KAFKA_OPTS" - ] = "-javaagent:/opt/prometheus/jmx_prometheus_javaagent.jar=1234:/opt/prometheus/kafka_broker.yml" - - return { - "summary": "kafka layer", - "description": "pebble config layer for kafka", - "services": { - "kafka": { - "override": "replace", - "summary": "kafka service", - "command": "/entrypoint", - "startup": "enabled", - "environment": env_variables, - } - }, - } - - def _file_exists(self, container: Container, path: str) -> bool: - """Check if a file exists in the container. 
+ command = [ + f"--zookeeper={self.kafka_config.zookeeper_config['connect']}", + "--alter", + "--entity-type=users", + f"--entity-name={username}", + f"--add-config=SCRAM-SHA-512=[password={password}]", + ] + self.run_bin_command( + bin_keyword="configs", bin_args=command, extra_args=self.kafka_config.extra_args + ) - Args: - path (str): Path of the file to be checked. + def delete_user_from_zookeeper(self, username: str) -> None: + """Deletes user credentials from ZooKeeper for authorising clients and brokers. - Returns: - bool: True if the file exists, else False. + Raises: + ops.pebble.ExecError: If the command failed """ - file_exists = None - try: - _ = container.pull(path) - file_exists = True - except PathError: - file_exists = False - exist_str = "exists" if file_exists else 'doesn"t exist' - logger.debug(f"File {path} {exist_str}.") - return file_exists + command = [ + f"--zookeeper={self.kafka_config.zookeeper_config['connect']}", + "--alter", + "--entity-type=users", + f"--entity-name={username}", + "--delete-config=SCRAM-SHA-512", + ] + self.run_bin_command( + bin_keyword="configs", bin_args=command, extra_args=self.kafka_config.extra_args + ) -if __name__ == "__main__": # pragma: no cover +if __name__ == "__main__": main(KafkaK8sCharm) diff --git a/src/config.py b/src/config.py new file mode 100644 index 00000000..a2060b97 --- /dev/null +++ b/src/config.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +# Copyright 2022 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Manager for handling Kafka configuration.""" + +import logging +from typing import Dict, List, Optional + +from ops.charm import CharmBase + +from literals import CHARM_KEY, PEER, ZOOKEEPER_REL_NAME + +logger = logging.getLogger(__name__) + +DEFAULT_CONFIG_OPTIONS = """ +clientPort=2181 +listeners=SASL_PLAINTEXT://:9092 +sasl.enabled.mechanisms=SCRAM-SHA-512 +sasl.mechanism.inter.broker.protocol=SCRAM-SHA-512 +security.inter.broker.protocol=SASL_PLAINTEXT +authorizer.class.name=kafka.security.authorizer.AclAuthorizer +allow.everyone.if.no.acl.found=false +super.users=User:sync +listener.name.sasl_plaintext.sasl.enabled.mechanisms=SCRAM-SHA-512 +""" + + +class KafkaConfig: + """Manager for handling Kafka configuration.""" + + def __init__(self, charm: CharmBase): + self.charm = charm + self.container = self.charm.unit.get_container(CHARM_KEY) + self.default_config_path = f"{self.charm.config['data-dir']}/config" + self.properties_filepath = f"{self.default_config_path}/server.properties" + self.jaas_filepath = f"{self.default_config_path}/kafka-jaas.cfg" + + @property + def sync_password(self) -> Optional[str]: + """Returns charm-set sync_password for server-server auth between brokers.""" + return self.charm.model.get_relation(PEER).data[self.charm.app].get("sync_password", None) + + @property + def zookeeper_config(self) -> Dict[str, str]: + """Checks the zookeeper relations for data necessary to connect to ZooKeeper. 
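The two helpers above manage SCRAM-SHA-512 credentials directly in ZooKeeper. A purely hypothetical sketch of how a client-relation lifecycle could drive them; the handler names and the `relation-<id>` username scheme are illustrative and not taken from this charm's actual `KafkaProvider`.

```python
# Hypothetical sketch: wiring the SCRAM helpers to a client relation lifecycle.
# Handler names and username scheme are illustrative only.
import secrets
import string


def _gen_password(length: int = 32) -> str:
    return "".join(secrets.choice(string.ascii_letters + string.digits) for _ in range(length))


def on_client_relation_joined(charm, event):
    username = f"relation-{event.relation.id}"
    password = _gen_password()
    charm.add_user_to_zookeeper(username=username, password=password)
    # share the credentials with the requirer via app relation data
    event.relation.data[charm.app].update({"username": username, "password": password})


def on_client_relation_broken(charm, event):
    charm.delete_user_from_zookeeper(username=f"relation-{event.relation.id}")
```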
+ + Returns: + Dict with zookeeper username, password, endpoints, chroot and uris + """ + zookeeper_config = {} + for relation in self.charm.model.relations[ZOOKEEPER_REL_NAME]: + zk_keys = ["username", "password", "endpoints", "chroot", "uris"] + missing_config = any( + relation.data[relation.app].get(key, None) is None for key in zk_keys + ) + + if missing_config: + continue + + zookeeper_config.update(relation.data[relation.app]) + break + + if zookeeper_config: + zookeeper_config["connect"] = ( + zookeeper_config["uris"].replace(zookeeper_config["chroot"], "") + + zookeeper_config["chroot"] + ) + return zookeeper_config + + @property + def extra_args(self) -> str: + """Collection of Java config arguments for SASL auth. + + Returns: + String of command argument to be prepended to start-up command + """ + extra_args = f"-Djava.security.auth.login.config={self.jaas_filepath}" + + return extra_args + + @property + def kafka_command(self) -> str: + """The run command for starting the Kafka service. + + Returns: + String of startup command and expected config filepath + """ + entrypoint = "/opt/kafka/bin/kafka-server-start.sh" + return f"{entrypoint} {self.properties_filepath}" + + @property + def default_replication_properties(self) -> List[str]: + """Builds replication-related properties based on the expected app size. + + Returns: + List of properties to be set + """ + replication_factor = min([3, self.charm.app.planned_units()]) + min_isr = max([1, replication_factor - 1]) + + return [ + f"default.replication.factor={replication_factor}", + f"num.partitions={replication_factor}", + f"transaction.state.log.replication.factor={replication_factor}", + f"offsets.topic.replication.factor={replication_factor}", + f"min.insync.replicas={min_isr}", + f"transaction.state.log.min.isr={min_isr}", + ] + + @property + def auth_properties(self) -> List[str]: + """Builds properties necessary for inter-broker authorization through ZooKeeper. + + Returns: + List of properties to be set + """ + broker_id = self.charm.unit.name.split("/")[1] + host = f"{self.charm.app.name}-{broker_id}.{self.charm.app.name}-endpoints" + + return [ + f"broker.id={broker_id}", + f"advertised.listeners=SASL_PLAINTEXT://{host}:9092", + f'zookeeper.connect={self.zookeeper_config["connect"]}', + f'listener.name.sasl_plaintext.scram-sha-512.sasl.jaas.config=org.apache.kafka.common.security.scram.ScramLoginModule required username="sync" password="{self.sync_password}";', + ] + + def push(self, content: str, path: str) -> None: + """Simple wrapper for writing a file and contents to a container. 
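The replication defaults scale with the planned application size: three or more planned units pin `default.replication.factor=3` with `min.insync.replicas=2`, while a single unit degrades to 1/1; separately, the `connect` string is built by stripping the chroot from every entry in `uris` and re-appending it once at the end. A small self-checking sketch of the replication arithmetic, using illustrative unit counts:

```python
# Sketch only: the arithmetic behind default_replication_properties.
def replication_defaults(planned_units: int) -> dict:
    replication_factor = min(3, planned_units)
    min_isr = max(1, replication_factor - 1)
    return {"replication.factor": replication_factor, "min.insync.replicas": min_isr}


assert replication_defaults(1) == {"replication.factor": 1, "min.insync.replicas": 1}
assert replication_defaults(3) == {"replication.factor": 3, "min.insync.replicas": 2}
assert replication_defaults(5) == {"replication.factor": 3, "min.insync.replicas": 2}
```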
+ + Args: + content: the text content to write to a file path + path: the full path of the desired file + """ + self.container.push(path, content, make_dirs=True) + + def set_jaas_config(self) -> None: + """Sets the Kafka JAAS config using zookeeper relation data.""" + jaas_config = f""" + Client {{ + org.apache.zookeeper.server.auth.DigestLoginModule required + username="{self.zookeeper_config['username']}" + password="{self.zookeeper_config['password']}"; + }}; + """ + self.push(content=jaas_config, path=self.jaas_filepath) + + def set_server_properties(self) -> None: + """Sets all kafka config properties to the server.properties path.""" + server_properties = ( + [ + f"data.dir={self.charm.config['data-dir']}", + f"log.dir={self.charm.config['log-dir']}", + f"offsets.retention.minutes={self.charm.config['offsets-retention-minutes']}", + f"log.retention.hours={self.charm.config['log-retention-hours']}", + f"auto.create.topics={self.charm.config['auto-create-topics']}", + ] + + self.default_replication_properties + + self.auth_properties + + DEFAULT_CONFIG_OPTIONS.split("\n") + ) + + self.push(content="\n".join(server_properties), path=self.properties_filepath) diff --git a/src/connection_check.py b/src/connection_check.py new file mode 100644 index 00000000..42edbbbb --- /dev/null +++ b/src/connection_check.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# Copyright 2022 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Collection of helper methods for checking active connections between ZK and Kafka.""" + +import logging +from typing import Dict + +from charms.zookeeper.v0.client import ZooKeeperManager +from kazoo.exceptions import AuthFailedError, NoNodeError +from ops.charm import CharmBase +from ops.model import Unit +from tenacity import retry +from tenacity.retry import retry_if_not_result +from tenacity.stop import stop_after_attempt +from tenacity.wait import wait_fixed + +logger = logging.getLogger(__name__) + + +def zookeeper_connected(charm: CharmBase) -> bool: + """Flag for if required zookeeper config exists in the relation data. + + Returns: + True if config exits i.e successful relation. False otherwise + """ + if not getattr(charm, "kafka_config").zookeeper_config: + return False + + return True + + +@retry( + # retry to give ZK time to update its broker zNodes before failing + wait=wait_fixed(5), + stop=stop_after_attempt(3), + retry_error_callback=(lambda state: state.outcome.result()), + retry=retry_if_not_result(lambda result: True if result else False), +) +def broker_active(unit: Unit, zookeeper_config: Dict[str, str]) -> bool: + """Checks ZooKeeper for client connections, checks for specific broker id. + + Args: + unit: the `Unit` to check connection of + data: the relation data provided by ZooKeeper + + Returns: + True if broker id is recognised as active by ZooKeeper. Otherwise False. 
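The `@retry` decorator gives ZooKeeper a short grace period to register the broker zNode: up to three attempts, five seconds apart, retried while the result is falsy, and because `retry_error_callback` returns the last outcome, callers still receive a plain `False` on timeout instead of a `RetryError`. A standalone sketch of the same tenacity pattern with a stubbed check; the function name and stub are illustrative.

```python
# Sketch only: the same tenacity pattern with a stubbed, always-False check.
from tenacity import retry
from tenacity.retry import retry_if_not_result
from tenacity.stop import stop_after_attempt
from tenacity.wait import wait_fixed


@retry(
    wait=wait_fixed(5),
    stop=stop_after_attempt(3),
    retry_error_callback=(lambda state: state.outcome.result()),  # return last result, don't raise
    retry=retry_if_not_result(lambda result: bool(result)),  # retry while the result is falsy
)
def znode_registered() -> bool:
    # stand-in for the real ZooKeeper lookup performed in broker_active()
    return False


# runs 3 times, ~5s apart, then returns False instead of raising RetryError
assert znode_registered() is False
```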
+ """ + broker_id = unit.name.split("/")[1] + chroot = zookeeper_config.get("chroot", "") + hosts = zookeeper_config.get("endpoints", "").split(",") + username = zookeeper_config.get("username", "") + password = zookeeper_config.get("password", "") + + zk = ZooKeeperManager(hosts=hosts, username=username, password=password) + path = f"{chroot}/brokers/ids/" + + try: + brokers = zk.leader_znodes(path=path) + # auth might not be ready with ZK after relation yet + except (NoNodeError, AuthFailedError) as e: + logger.debug(str(e)) + return False + + return f"{chroot}/brokers/ids/{broker_id}" in brokers diff --git a/src/grafana_dashboards/kafka_dashboard.tmpl b/src/grafana_dashboards/kafka_dashboard.tmpl deleted file mode 100644 index 3697c2bd..00000000 --- a/src/grafana_dashboards/kafka_dashboard.tmpl +++ /dev/null @@ -1,5962 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "${prometheusds}", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "target": { - "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" - }, - "type": "dashboard" - } - ] - }, - "description": "Kafka resource usage and throughput", - "editable": true, - "gnetId": 721, - "graphTooltip": 0, - "id": 6, - "iteration": 1637943038134, - "links": [], - "panels": [ - { - "collapsed": false, - "datasource": "${prometheusds}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 42, - "panels": [], - "title": "Healthcheck", - "type": "row" - }, - { - "cacheTimeout": null, - "datasource": "${prometheusds}", - "description": "Number of active controllers in the cluster.", - "fieldConfig": { - "defaults": { - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "#299c46", - "value": null - }, - { - "color": "#e5ac0e", - "value": 2 - }, - { - "color": "#bf1b00" - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 0, - "y": 1 - }, - "id": 12, - "interval": null, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "vertical", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "value" - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_controller_kafkacontroller_activecontrollercount > 0", - "format": "time_series", - "instant": true, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "title": "Active Controllers", - "type": "stat" - }, - { - "cacheTimeout": null, - "datasource": "${prometheusds}", - "description": "Number of Brokers Online", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "#d44a3a", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 0 - }, - { - "color": "#299c46", - "value": 2 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 4, - "y": 1 - }, - "id": 14, - "interval": null, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": 
"horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.1.3", - "repeat": null, - "repeatDirection": "h", - "targets": [ - { - "expr": "count(kafka_server_replicamanager_leadercount)", - "format": "time_series", - "instant": true, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "title": "Brokers Online", - "type": "stat" - }, - { - "cacheTimeout": null, - "datasource": "${prometheusds}", - "description": "Partitions that are online", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "#d44a3a", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 0 - }, - { - "color": "#299c46", - "value": 0 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 8, - "y": 1 - }, - "id": 18, - "interval": null, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(kafka_server_replicamanager_partitioncount)", - "format": "time_series", - "instant": true, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "title": "Online Partitions", - "type": "stat" - }, - { - "cacheTimeout": null, - "datasource": "${prometheusds}", - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "#299c46", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 2 - }, - { - "color": "#d44a3a" - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 12, - "y": 1 - }, - "id": 33, - "interval": null, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(kafka_controller_kafkacontroller_preferredreplicaimbalancecount)", - "format": "time_series", - "instant": true, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "title": "Preferred Replica Imbalance", - "type": "stat" - }, - { - "datasource": "${prometheusds}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "Bytes/s", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": 
{ - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "Bps" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 1 - }, - "id": 84, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(rate(kafka_server_brokertopicmetrics_bytesinpersec[5m]))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Bytes in", - "metric": "kafka_server_brokertopicmetrics_bytesinpersec", - "refId": "A", - "step": 4 - }, - { - "expr": "sum(rate(kafka_server_brokertopicmetrics_bytesoutpersec[5m]))", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "Bytes out", - "metric": "kafka_server_brokertopicmetrics_bytesinpersec", - "refId": "B", - "step": 4 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Broker network throughput", - "type": "timeseries" - }, - { - "cacheTimeout": null, - "datasource": "${prometheusds}", - "description": "Number of under-replicated partitions (| ISR | < | all replicas |).", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "#508642", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 1 - }, - { - "color": "#bf1b00", - "value": 5 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 0, - "y": 5 - }, - "id": 20, - "interval": null, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(kafka_server_replicamanager_underreplicatedpartitions)", - "format": "time_series", - "hide": false, - "instant": true, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "title": "Under Replicated Partitions", - "type": "stat" - }, - { - "cacheTimeout": null, - "datasource": "${prometheusds}", - "description": "Number of partitions under min insync replicas.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "#508642", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 1 - }, - { - "color": "#bf1b00", - "value": 5 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 4, - "y": 5 - }, - "id": 32, - "interval": null, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.1.3", - 
"targets": [ - { - "expr": "sum(kafka_cluster_partition_underminisr)", - "format": "time_series", - "hide": false, - "instant": true, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A" - } - ], - "title": "Under Min ISR Partitions", - "type": "stat" - }, - { - "cacheTimeout": null, - "datasource": "${prometheusds}", - "description": "Number of partitions that dont have an active leader and are hence not writable or readable.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "#508642", - "value": null - }, - { - "color": "#ef843c", - "value": 1 - }, - { - "color": "#bf1b00", - "value": 1 - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 8, - "y": 5 - }, - "id": 22, - "interval": null, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(kafka_controller_kafkacontroller_offlinepartitionscount)", - "format": "time_series", - "instant": true, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "title": "Offline Partitions Count", - "type": "stat" - }, - { - "cacheTimeout": null, - "datasource": "${prometheusds}", - "description": "Unclean leader election rate", - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "#299c46", - "value": null - }, - { - "color": "rgba(237, 129, 40, 0.89)", - "value": 2 - }, - { - "color": "#d44a3a" - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 12, - "y": 5 - }, - "id": 16, - "interval": null, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "none", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(kafka_controller_controllerstats_uncleanleaderelectionspersec)", - "format": "time_series", - "instant": true, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "title": "Unclean Leader Election Rate", - "type": "stat" - }, - { - "collapsed": false, - "datasource": "${prometheusds}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 9 - }, - "id": 31, - "panels": [], - "title": "Request rate", - "type": "row" - }, - { - "cacheTimeout": null, - "datasource": "${prometheusds}", - "description": "Produce request rate.", - "fieldConfig": { - "defaults": { - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 0, - "y": 10 - }, - "id": 93, - "interval": 
null, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(rate(kafka_network_requestmetrics_requestspersec[5m]))", - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "All Request Per Sec", - "type": "stat" - }, - { - "cacheTimeout": null, - "datasource": "${prometheusds}", - "description": "Produce request rate.", - "fieldConfig": { - "defaults": { - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 4, - "y": 10 - }, - "id": 35, - "interval": null, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(rate(kafka_network_requestmetrics_requestspersec[5m]))", - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Produce Request Per Sec", - "type": "stat" - }, - { - "cacheTimeout": null, - "datasource": "${prometheusds}", - "description": "Fetch request rate.", - "fieldConfig": { - "defaults": { - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 8, - "y": 10 - }, - "id": 37, - "interval": null, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(rate(kafka_network_requestmetrics_requestspersec[5m]))", - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Consumer Fetch Request Per Sec", - "type": "stat" - }, - { - "cacheTimeout": null, - "datasource": "${prometheusds}", - "description": "Fetch request rate.", - "fieldConfig": { - "defaults": { - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 12, - "y": 10 - }, - "id": 94, - "interval": null, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - 
"pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(rate(kafka_network_requestmetrics_requestspersec[5m]))", - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Broker Fetch Request Per Sec", - "type": "stat" - }, - { - "cacheTimeout": null, - "datasource": "${prometheusds}", - "description": "Offset Commit request rate.", - "fieldConfig": { - "defaults": { - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 16, - "y": 10 - }, - "id": 38, - "interval": null, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(rate(kafka_network_requestmetrics_requestspersec[5m]))", - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Offset Commit Request Per Sec", - "type": "stat" - }, - { - "cacheTimeout": null, - "datasource": "${prometheusds}", - "description": "Metadata request rate.", - "fieldConfig": { - "defaults": { - "mappings": [ - { - "options": { - "match": "null", - "result": { - "text": "N/A" - } - }, - "type": "special" - } - ], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "none" - }, - "overrides": [] - }, - "gridPos": { - "h": 4, - "w": 4, - "x": 20, - "y": 10 - }, - "id": 36, - "interval": null, - "links": [], - "maxDataPoints": 100, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "horizontal", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "text": {}, - "textMode": "auto" - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(rate(kafka_network_requestmetrics_requestspersec[5m]))", - "interval": "", - "legendFormat": "", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Metadata Request Per Sec", - "type": "stat" - }, - { - "collapsed": false, - "datasource": "${prometheusds}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 14 - }, - "id": 40, - "panels": [], - "title": "System", - "type": "row" - }, - { - "datasource": "${prometheusds}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "Cores", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": 
"localhost:7071" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#629E51", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 15 - }, - "id": 27, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "irate(process_cpu_seconds_total[5m])*100", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "metric": "process_cpu_secondspersec", - "refId": "A", - "step": 4 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Usage", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "Memory", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "localhost:7071" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#BA43A9", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 15 - }, - "id": 2, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum without(area)(jvm_memory_bytes_used)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "metric": "jvm_memory_bytes_used", - "refId": "A", - "step": 4 - }, - { - "expr": "jvm_memory_bytes_max", - "interval": "", - "legendFormat": "{{instance}}", - "refId": "B" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "JVM Memory Used", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "% time in GC", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "localhost:7071" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "#890F02", - "mode": 
"fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 15 - }, - "id": 3, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum without(gc)(rate(jvm_gc_collection_seconds_sum[5m]))", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "metric": "jvm_gc_collection_seconds_sum", - "refId": "A", - "step": 4 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Time spent in GC", - "type": "timeseries" - }, - { - "collapsed": true, - "datasource": "${prometheusds}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 23 - }, - "id": 29, - "panels": [ - { - "datasource": "${prometheusds}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "Messages/s", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "cps" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 24 - }, - "id": 4, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "exemplar": true, - "expr": "sum without(instance,topic)(rate(kafka_server_brokertopicmetrics_messagesinpersec[5m]))", - "interval": "", - "intervalFactor": 2, - "legendFormat": "bytes/sec", - "metric": "kafka_server_brokertopicmetrics_messagesinpersec", - "refId": "A", - "step": 4 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Messages In", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "Bytes/s", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "Bps" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 24 - }, - "id": 5, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "exemplar": true, - "expr": "sum 
without(instance,topic)(rate(kafka_server_brokertopicmetrics_bytesinpersec[5m]))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "bytes/sec", - "metric": "kafka_server_brokertopicmetrics_bytesinpersec", - "refId": "A", - "step": 4 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Bytes In", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "Bytes/s", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "Bps" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 24 - }, - "id": 6, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "exemplar": true, - "expr": "sum without(instance,topic)(rate(kafka_server_brokertopicmetrics_bytesoutpersec[5m]))", - "interval": "", - "intervalFactor": 2, - "legendFormat": "bytes/sec", - "metric": "kafka_server_brokertopicmetrics_bytesinpersec", - "refId": "A", - "step": 4 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Bytes Out", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "Messages/s", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "cps" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 32 - }, - "id": 10, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum without(topic)(rate(kafka_server_brokertopicmetrics_messagesinpersec[5m]))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "metric": "kafka_server_brokertopicmetrics_messagesinpersec", - "refId": "A", - "step": 4 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Messages In Per Broker", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "fieldConfig": { - 
"defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "Bytes/s", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 2, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "Bps" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 32 - }, - "id": 7, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum without(topic)(rate(kafka_server_brokertopicmetrics_bytesinpersec[5m]))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{instance}}", - "metric": "kafka_server_brokertopicmetrics_bytesinpersec", - "refId": "A", - "step": 4 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Bytes In Per Broker", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "binBps" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 32 - }, - "id": 9, - "links": [], - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum without(topic)(rate(kafka_server_brokertopicmetrics_bytesoutpersec[5m]))", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Bytes Out Per Broker", - "type": "timeseries" - } - ], - "title": "Throughput In/Out", - "type": "row" - }, - { - "collapsed": true, - "datasource": "${prometheusds}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 24 - }, - "id": 44, - "panels": [ - { - "datasource": "${prometheusds}", - "description": "Average fraction of time the network processor threads are idle. 
Values are between 0 (all resources are used) and 100 (all resources are available)\n", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 25 - }, - "id": 24, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "1-kafka_network_socketserver_networkprocessoravgidlepercent", - "interval": "", - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Network Processor Avg Usage Percent", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "Average fraction of time the request handler threads are idle. Values are between 0 (all resources are used) and 100 (all resources are available).\n", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percentunit" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 25 - }, - "id": 25, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "1 - kafka_server_kafkarequesthandlerpool_requesthandleravgidlepercent_total", - "interval": "", - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Request Handler Avg Percent", - "type": "timeseries" - } - ], - "title": "Thread utilization", - "type": "row" - }, - { - "collapsed": true, - "datasource": "${prometheusds}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 25 - }, - "id": 86, - "panels": [ - { - "datasource": "${prometheusds}", - "description": "Latency in milliseconds for ZooKeeper requests from broker.\n", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - 
"drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 26 - }, - "id": 88, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_server_zookeeperclientmetrics_zookeeperrequestlatencyms", - "hide": false, - "legendFormat": "{{instance}} - {{quantile}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Zookeeper Request Latency", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 26 - }, - "id": 92, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_server_sessionexpirelistener_zookeepersyncconnectspersec", - "hide": false, - "instant": false, - "interval": "", - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Zookeeper connections per sec", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - 
"gridPos": { - "h": 9, - "w": 8, - "x": 0, - "y": 35 - }, - "id": 89, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_server_sessionexpirelistener_zookeeperexpirespersec", - "hide": false, - "interval": "", - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Zookeeper expired connections per sec", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 8, - "y": 35 - }, - "id": 90, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_server_sessionexpirelistener_zookeeperdisconnectspersec", - "hide": false, - "interval": "", - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Zookeeper disconnect per sec", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 16, - "y": 35 - }, - "id": 91, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_server_sessionexpirelistener_zookeeperauthfailurespersec", - "hide": false, - "interval": "", - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Zookeeper auth failures per sec", - "type": "timeseries" - } - ], - "title": "Zookeeper", - "type": "row" - }, - { - "collapsed": 
true, - "datasource": "${prometheusds}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 26 - }, - "id": 82, - "panels": [ - { - "datasource": "${prometheusds}", - "description": ": The number of in-sync replicas (ISRs) for a particular partition should remain fairly static, the only exceptions are when you are expanding your broker cluster or removing partitions. In order to maintain high availability, a healthy Kafka cluster requires a minimum number of ISRs for failover. A replica could be removed from the ISR pool for a couple of reasons: it is too far behind the leaders offset (user-configurable by setting the replica.lag.max.messages configuration parameter), or it has not contacted the leader for some time (configurable with the replica.socket.timeout.ms parameter). No matter the reason, an increase in IsrShrinksPerSec without a corresponding increase in IsrExpandsPerSec shortly thereafter is cause for concern and requires user intervention.The Kafka documentation provides a wealth of information on the user-configurable parameters for brokers.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ops" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 27 - }, - "id": 80, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "rate(kafka_server_replicamanager_isrshrinkspersec[5m])", - "interval": "", - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "IsrShrinks per Sec", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": ": The number of in-sync replicas (ISRs) for a particular partition should remain fairly static, the only exceptions are when you are expanding your broker cluster or removing partitions. In order to maintain high availability, a healthy Kafka cluster requires a minimum number of ISRs for failover. A replica could be removed from the ISR pool for a couple of reasons: it is too far behind the leaders offset (user-configurable by setting the replica.lag.max.messages configuration parameter), or it has not contacted the leader for some time (configurable with the replica.socket.timeout.ms parameter). 
No matter the reason, an increase in IsrShrinksPerSec without a corresponding increase in IsrExpandsPerSec shortly thereafter is cause for concern and requires user intervention.The Kafka documentation provides a wealth of information on the user-configurable parameters for brokers.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ops" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 27 - }, - "id": 83, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "rate(kafka_server_replicamanager_isrexpandspersec[5m])", - "hide": false, - "interval": "", - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "IsrExpands per Sec", - "type": "timeseries" - } - ], - "title": "Isr Shrinks / Expands", - "type": "row" - }, - { - "collapsed": true, - "datasource": "${prometheusds}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 27 - }, - "id": 53, - "panels": [ - { - "datasource": "${prometheusds}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 28 - }, - "id": 55, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(kafka_log_log_size) by (topic)", - "legendFormat": "{{topic}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Log size per Topic", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - 
}, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "bytes" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 28 - }, - "id": 56, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(kafka_log_log_size) by (instance)", - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Log size per Broker", - "type": "timeseries" - } - ], - "title": "Logs size", - "type": "row" - }, - { - "collapsed": true, - "datasource": "${prometheusds}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 28 - }, - "id": 58, - "panels": [ - { - "datasource": "${prometheusds}", - "description": "A high value can imply there aren't enough IO threads or the CPU is a bottleneck, or the request queue isn't large enough. The request queue size should match the number of connections.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 29 - }, - "id": 60, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_network_requestmetrics_requestqueuetimems", - "hide": false, - "legendFormat": "{{instance}} - {{quantile}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Producer - RequestQueueTimeMs", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "In most cases, a high value can imply slow local storage or the storage is a bottleneck. One should also investigate LogFlushRateAndTimeMs to know how long page flushes are taking, which will also indicate a slow disk. 
In the case of FetchFollower requests, time spent in LocalTimeMs can be the result of a ZooKeeper write to change the ISR.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 29 - }, - "id": 61, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_network_requestmetrics_localtimems", - "hide": false, - "legendFormat": "{{instance}} - {{quantile}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Producer - LocalTimeMs", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "A high value can imply a slow network connection. For fetch request, if the remote time is high, it could be that there is not enough data to give in a fetch response. This can happen when the consumer or replica is caught up and there is no new incoming data. If this is the case, remote time will be close to the max wait time, which is normal. 
Max wait time is configured via replica.fetch.wait.max.ms and fetch.max.wait.ms.\n", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 8, - "x": 0, - "y": 38 - }, - "id": 62, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_network_requestmetrics_remotetimems", - "hide": false, - "legendFormat": "{{instance}} - {{quantile}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Producer - RemoteTimeMs", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "A high value can imply there aren't enough network threads or the network can't dequeue responses quickly enough, causing back pressure in the response queue.\n", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 8, - "x": 8, - "y": 38 - }, - "id": 63, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_network_requestmetrics_responsequeuetimems", - "hide": false, - "legendFormat": "{{instance}} - {{quantile}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Producer - ResponseQueueTimeMs", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "A high value can imply the zero-copy from disk to the network is slow, or the network is the bottleneck because the network can't dequeue responses of the TCP socket as quickly as they're being created. 
If the network buffer gets full, Kafka will block.\n", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 8, - "x": 16, - "y": 38 - }, - "id": 64, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_network_requestmetrics_responsesendtimems", - "hide": false, - "legendFormat": "{{instance}} - {{quantile}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Producer - ResponseSendTimeMs", - "type": "timeseries" - } - ], - "title": "Producer Performance", - "type": "row" - }, - { - "collapsed": true, - "datasource": "${prometheusds}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 29 - }, - "id": 68, - "panels": [ - { - "datasource": "${prometheusds}", - "description": "A high value can imply there aren't enough IO threads or the CPU is a bottleneck, or the request queue isn't large enough. The request queue size should match the number of connections.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 30 - }, - "id": 69, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_network_requestmetrics_requestqueuetimems", - "hide": false, - "legendFormat": "{{instance}} - {{quantile}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Consumer - RequestQueueTimeMs", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "In most cases, a high value can imply slow local storage or the storage is a bottleneck. One should also investigate LogFlushRateAndTimeMs to know how long page flushes are taking, which will also indicate a slow disk. 
In the case of FetchFollower requests, time spent in LocalTimeMs can be the result of a ZooKeeper write to change the ISR.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 30 - }, - "id": 70, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_network_requestmetrics_localtimems", - "hide": false, - "legendFormat": "{{instance}} - {{quantile}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Consumer - LocalTimeMs", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "A high value can imply a slow network connection. For fetch request, if the remote time is high, it could be that there is not enough data to give in a fetch response. This can happen when the consumer or replica is caught up and there is no new incoming data. If this is the case, remote time will be close to the max wait time, which is normal. 
Max wait time is configured via replica.fetch.wait.max.ms and fetch.max.wait.ms.\n", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 0, - "y": 39 - }, - "id": 71, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_network_requestmetrics_remotetimems", - "hide": false, - "legendFormat": "{{instance}} - {{quantile}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Consumer - RemoteTimeMs", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "A high value can imply there aren't enough network threads or the network can't dequeue responses quickly enough, causing back pressure in the response queue.\n", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 8, - "y": 39 - }, - "id": 72, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_network_requestmetrics_responsequeuetimems", - "hide": false, - "legendFormat": "{{instance}} - {{quantile}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Consumer - ResponseQueueTimeMs", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "A high value can imply the zero-copy from disk to the network is slow, or the network is the bottleneck because the network can't dequeue responses of the TCP socket as quickly as they're being created. 
If the network buffer gets full, Kafka will block.\n", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 16, - "y": 39 - }, - "id": 73, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_network_requestmetrics_responsesendtimems", - "hide": false, - "legendFormat": "{{instance}} - {{quantile}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Consumer - ResponseSendTimeMs", - "type": "timeseries" - } - ], - "title": "Consumer Performance", - "type": "row" - }, - { - "collapsed": true, - "datasource": "${prometheusds}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 30 - }, - "id": 66, - "panels": [ - { - "datasource": "${prometheusds}", - "description": "A high value can imply there aren't enough IO threads or the CPU is a bottleneck, or the request queue isn't large enough. The request queue size should match the number of connections.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 31 - }, - "id": 74, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_network_requestmetrics_requestqueuetimems", - "hide": false, - "legendFormat": "{{instance}} - {{quantile}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "FetchFollower - RequestQueueTimeMs", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "In most cases, a high value can imply slow local storage or the storage is a bottleneck. One should also investigate LogFlushRateAndTimeMs to know how long page flushes are taking, which will also indicate a slow disk. 
In the case of FetchFollower requests, time spent in LocalTimeMs can be the result of a ZooKeeper write to change the ISR.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 31 - }, - "id": 75, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_network_requestmetrics_localtimems", - "hide": false, - "legendFormat": "{{instance}} - {{quantile}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "FetchFollower - LocalTimeMs", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "A high value can imply a slow network connection. For fetch request, if the remote time is high, it could be that there is not enough data to give in a fetch response. This can happen when the consumer or replica is caught up and there is no new incoming data. If this is the case, remote time will be close to the max wait time, which is normal. 
Max wait time is configured via replica.fetch.wait.max.ms and fetch.max.wait.ms.\n", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 0, - "y": 40 - }, - "id": 76, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_network_requestmetrics_remotetimems", - "hide": false, - "legendFormat": "{{instance}} - {{quantile}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "FetchFollower - RemoteTimeMs", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "A high value can imply there aren't enough network threads or the network can't dequeue responses quickly enough, causing back pressure in the response queue.\n", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 8, - "y": 40 - }, - "id": 77, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_network_requestmetrics_responsequeuetimems", - "hide": false, - "legendFormat": "{{instance}} - {{quantile}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "FetchFollower - ResponseQueueTimeMs", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "A high value can imply the zero-copy from disk to the network is slow, or the network is the bottleneck because the network can't dequeue responses of the TCP socket as quickly as they're being created. 
If the network buffer gets full, Kafka will block.\n", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ms" - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 16, - "y": 40 - }, - "id": 78, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_network_requestmetrics_responsesendtimems", - "hide": false, - "legendFormat": "{{instance}} - {{quantile}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "FetchFollower - ResponseSendTimeMs", - "type": "timeseries" - } - ], - "title": "Fetch Follower Performance", - "type": "row" - }, - { - "collapsed": true, - "datasource": "${prometheusds}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 31 - }, - "id": 102, - "panels": [ - { - "datasource": "${prometheusds}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 32 - }, - "id": 98, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(kafka_server_socketservermetrics_connection_count) by (listener)", - "interval": "", - "legendFormat": "{{listener}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Connections count per listener", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - 
"spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 32 - }, - "id": 100, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(kafka_server_socketservermetrics_connection_count) by (instance)", - "interval": "", - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Connections count per broker", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 40 - }, - "id": 104, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(kafka_server_socketservermetrics_connection_creation_rate) by (listener)", - "interval": "", - "legendFormat": "{{listener}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Connections creation rate per listener", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 40 - }, - "id": 106, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": 
"sum(kafka_server_socketservermetrics_connection_creation_rate) by (instance)", - "interval": "", - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Connections creation rate per instance", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 48 - }, - "id": 108, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(kafka_server_socketservermetrics_connection_close_rate) by (listener)", - "interval": "", - "legendFormat": "{{listener}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Connections close rate per listener", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 48 - }, - "id": 110, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(kafka_server_socketservermetrics_connection_close_rate) by (instance)", - "interval": "", - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Connections close rate per instance", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "Tracks the amount of time Acceptor is blocked from accepting connections. 
See KIP-402 for more details.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 56 - }, - "id": 112, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_network_acceptor_acceptorblockedpercent", - "interval": "", - "legendFormat": "{{instance}} - {{listener}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Acceptor Blocked Percentage", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 56 - }, - "id": 114, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(kafka_server_socketservermetrics_connections) by (client_software_name, client_software_version)", - "interval": "", - "legendFormat": "{{client_software_name}} {{client_software_version}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Connections per client version", - "type": "timeseries" - } - ], - "title": "Connections", - "type": "row" - }, - { - "collapsed": true, - "datasource": "${prometheusds}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 32 - }, - "id": 120, - "panels": [ - { - "datasource": "${prometheusds}", - "description": "Number of consumer groups per group coordinator", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, 
- "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 33 - }, - "id": 116, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "kafka_coordinator_group_groupmetadatamanager_numgroups", - "instant": false, - "interval": "", - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Consumer groups number per coordinator", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "Number of consumer group per state", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 33 - }, - "id": 118, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "table", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(kafka_coordinator_group_groupmetadatamanager_numgroupsstable)", - "instant": false, - "interval": "", - "legendFormat": "stable", - "refId": "A" - }, - { - "expr": "sum(kafka_coordinator_group_groupmetadatamanager_numgroupspreparingrebalance)", - "interval": "", - "legendFormat": "preparing-rebalance", - "refId": "B" - }, - { - "expr": "sum(kafka_coordinator_group_groupmetadatamanager_numgroupsdead)", - "interval": "", - "legendFormat": "dead", - "refId": "C" - }, - { - "expr": "sum(kafka_coordinator_group_groupmetadatamanager_numgroupscompletingrebalance)", - "interval": "", - "legendFormat": "completing-rebalance", - "refId": "D" - }, - { - "expr": "sum(kafka_coordinator_group_groupmetadatamanager_numgroupsempty)", - "interval": "", - "legendFormat": "empty", - "refId": "E" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Nb consumer groups per state", - "type": "timeseries" - } - ], - "title": "Group Coordinator", - "type": "row" - }, - { - "collapsed": true, - "datasource": "${prometheusds}", - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 33 - }, - "id": 46, - "panels": [ - { - "datasource": "${prometheusds}", - "description": "The number of messages produced converted to match the log.message.format.version.", - "fieldConfig": { - "defaults": { - "color": { - "mode": 
"palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ops" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 34 - }, - "id": 48, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(kafka_server_brokertopicmetrics_producemessageconversionspersec)", - "hide": false, - "interval": "", - "legendFormat": "{{topic}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Number of produced message conversion", - "type": "timeseries" - }, - { - "datasource": "${prometheusds}", - "description": "The number of messages consumed converted at consumer to match the log.message.format.version.", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": true, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "links": [], - "mappings": [], - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "ops" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 34 - }, - "id": 51, - "options": { - "legend": { - "calcs": [ - "mean", - "lastNotNull", - "max" - ], - "displayMode": "list", - "placement": "bottom" - }, - "tooltip": { - "mode": "single" - } - }, - "pluginVersion": "8.1.3", - "targets": [ - { - "expr": "sum(kafka_server_brokertopicmetrics_fetchmessageconversionspersec)", - "hide": false, - "interval": "", - "legendFormat": "{{topic}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Number of consumed message conversion", - "type": "timeseries" - }, - { - "cacheTimeout": null, - "datasource": "${prometheusds}", - "description": "Number of connection per client version", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - } - }, - "decimals": 0, - "mappings": [], - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 34 - }, - "id": 96, - "interval": null, - "links": [], - "options": { - "legend": { - "calcs": [], - "displayMode": "table", - "placement": "bottom", - "values": [ - "value" - ] - }, - "pieType": "pie", - "reduceOptions": { - 
"calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "tooltip": { - "mode": "single" - } - }, - "targets": [ - { - "expr": "sum(kafka_server_socketservermetrics_connections) by (client_software_name, client_software_version) ", - "interval": "", - "legendFormat": "{{client_software_name}} - {{client_software_version}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Client version repartition", - "type": "piechart" - } - ], - "title": "Message Conversion", - "type": "row" - } - ], - "refresh": "1m", - "schemaVersion": 30, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": null, - "current": { - "selected": false, - "text": "dev", - "value": "dev" - }, - "datasource": "${prometheusds}", - "definition": "label_values(env)", - "description": null, - "error": null, - "hide": 0, - "includeAll": false, - "label": "Environment", - "multi": false, - "name": "env", - "options": [], - "query": { - "query": "label_values(env)", - "refId": "Prometheus-env-Variable-Query" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": null, - "current": { - "selected": true, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, - "datasource": "${prometheusds}", - "definition": "label_values(kafka_server_kafkaserver_brokerstate{env=\"${env}\"}, instance)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": "Instance", - "multi": true, - "name": "instance", - "options": [], - "query": { - "query": "label_values(kafka_server_kafkaserver_brokerstate{env=\"${env}\"}, instance)", - "refId": "Prometheus-instance-Variable-Query" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allValue": "", - "current": { - "selected": false, - "text": [ - "0.95" - ], - "value": [ - "0.95" - ] - }, - "datasource": "${prometheusds}", - "definition": "label_values(quantile)", - "description": null, - "error": null, - "hide": 0, - "includeAll": true, - "label": "Percentile", - "multi": true, - "name": "percentile", - "options": [], - "query": { - "query": "label_values(quantile)", - "refId": "Prometheus-percentile-Variable-Query" - }, - "refresh": 1, - "regex": "", - "skipUrlSync": false, - "sort": 0, - "tagValuesQuery": "", - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-12h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Kafka", - "version": 1 -} \ No newline at end of file diff --git a/src/literals.py b/src/literals.py new file mode 100644 index 00000000..30a56936 --- /dev/null +++ b/src/literals.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python3 +# Copyright 2022 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Literals used by the Kafka K8s charm.""" + +CHARM_KEY = "kafka" +PEER = "cluster" +ZOOKEEPER_REL_NAME = "zookeeper" +REL_NAME = "kafka" diff --git a/src/provider.py b/src/provider.py new file mode 100644 index 00000000..1906a825 --- /dev/null +++ b/src/provider.py @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +# Copyright 2022 Canonical Ltd. 
+# See LICENSE file for licensing details. + +"""KafkaProvider class and methods.""" + +import logging +import secrets +import string +from typing import Dict + +from ops.charm import RelationBrokenEvent, RelationJoinedEvent +from ops.framework import Object +from ops.model import Relation + +from literals import PEER, REL_NAME + +logger = logging.getLogger(__name__) + + +class KafkaProvider(Object): + """Implements the provider-side logic for client applications relating to Kafka.""" + + def __init__(self, charm) -> None: + super().__init__(charm, "client") + + self.charm = charm + + self.framework.observe( + self.charm.on[REL_NAME].relation_joined, self._on_client_relation_joined + ) + self.framework.observe( + self.charm.on[REL_NAME].relation_broken, self._on_client_relation_broken + ) + + @property + def app_relation(self) -> Relation: + """The Kafka cluster's peer relation.""" + return self.charm.model.get_relation(PEER) + + def relation_config(self, relation: Relation) -> Dict[str, str]: + """Builds necessary relation data for a given relation. + + Args: + relation: the client relation to build data for + + Returns: + Dict of `username`, `password` and `endpoints` data for the related app + """ + username = f"relation-{relation.id}" + password = self.app_relation.data[self.charm.app].get(username, self.generate_password()) + units = set([self.charm.unit] + list(self.app_relation.units)) + endpoints = [ + f"{self.charm.app.name}-{unit.name.split('/')[1]}.{self.charm.app.name}-endpoints" + for unit in units + ] + + return {"username": username, "password": password, "endpoints": ",".join(endpoints)} + + def _on_client_relation_joined(self, event: RelationJoinedEvent) -> None: + """Handler for `relation_joined` events.""" + if not self.charm.unit.is_leader(): + return + + relation_config = self.relation_config(relation=event.relation) + + self.add_user(username=relation_config["username"], password=relation_config["password"]) + event.relation.data[self.charm.app].update(relation_config) + + def _on_client_relation_broken(self, event: RelationBrokenEvent) -> None: + """Handler for `relation_broken` events.""" + if not self.charm.unit.is_leader(): + return + + relation_config = self.relation_config(relation=event.relation) + + self.delete_user(username=relation_config["username"]) + + def add_user(self, username: str, password: str) -> None: + """Adds/updates users' SCRAM credentials to ZooKeeper. + + Args: + username: the user's username + password: the user's password + + Raises: + ops.pebble.ExecError: if the command failed + """ + self.charm.add_user_to_zookeeper(username=username, password=password) + self.app_relation.data[self.charm.app].update({username: password}) + + def delete_user(self, username: str) -> None: + """Deletes users' SCRAM credentials from ZooKeeper. + + Args: + username: the user's username + + Raises: + ops.pebble.ExecError: if the command failed + """ + self.charm.delete_user_from_zookeeper(username=username) + self.app_relation.data[self.charm.app].update({username: ""}) + + @staticmethod + def generate_password() -> str: + """Creates a randomized string for use as app passwords. + + Returns: + String of 32 randomized letter+digit characters + """ + return "".join([secrets.choice(string.ascii_letters + string.digits) for _ in range(32)]) diff --git a/templates/entrypoint b/templates/entrypoint deleted file mode 100644 index b44471da..00000000 --- a/templates/entrypoint +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2022 Canonical Ltd.
-# See LICENSE file for licensing details. -set -x -. /etc/confluent/docker/bash-config - -unset `env | grep -E ^${CHARM_NAME}_SERVICE_ | cut -d "=" -f 1` -unset `env | grep -E ^${CHARM_NAME}_PORT | cut -d "=" -f 1` - -/etc/confluent/docker/run \ No newline at end of file diff --git a/templates/kafka_broker.yml b/templates/kafka_broker.yml deleted file mode 100644 index 099d0da2..00000000 --- a/templates/kafka_broker.yml +++ /dev/null @@ -1,192 +0,0 @@ ---- -startDelaySeconds: 120 -lowercaseOutputName: true -lowercaseOutputLabelNames: true -blacklistObjectNames: - - "kafka.consumer:type=*,id=*" - - "kafka.consumer:type=*,client-id=*" - - "kafka.consumer:type=*,client-id=*,node-id=*" - - "kafka.producer:type=*,id=*" - - "kafka.producer:type=*,client-id=*" - - "kafka.producer:type=*,client-id=*,node-id=*" - - "kafka.*:type=kafka-metrics-count,*" - # This will ignore the admin client metrics from Kafka Brokers and will blacklist certain metrics - # that do not make sense for ingestion. - # "kafka.admin.client:type=*, node-id=*, client-id=*" - # "kafka.admin.client:type=*, client-id=*" - # "kafka.admin.client:type=*, id=*" - - "kafka.admin.client:*" - - "kafka.server:type=*,cipher=*,protocol=*,listener=*,networkProcessor=*" - - "kafka.server:type=*" -rules: - # This is by far the biggest contributor to the number of sheer metrics being produced. - # Always keep it on the top for the case of probability when so many metrics will hit the first condition and exit. - # "kafka.cluster:type=*, name=*, topic=*, partition=*" - # "kafka.log:type=*,name=*, topic=*, partition=*" - - pattern: kafka.(\w+)<>Value - name: kafka_$1_$2_$3 - type: GAUGE - labels: - topic: "$4" - partition: "$5" - # "kafka.server:type=*,name=*, client-id=*, topic=*, partition=*" - - pattern: kafka.server<>Value - name: kafka_server_$1_$2 - type: GAUGE - labels: - clientId: "$3" - topic: "$4" - partition: "$5" - - pattern: kafka.server<>Value - name: kafka_server_$1_$2 - type: GAUGE - labels: - clientId: "$3" - broker: "$4:$5" - # "kafka.network:type=*, name=*, request=*, error=*" - # "kafka.network:type=*, name=*, request=*, version=*" - - pattern: kafka.(\w+)<>(Count|Value) - name: kafka_$1_$2_$3 - labels: - "$4": "$5" - "$6": "$7" - - pattern: kafka.(\w+)<>(\d+)thPercentile - name: kafka_$1_$2_$3 - type: GAUGE - labels: - "$4": "$5" - "$6": "$7" - quantile: "0.$8" - # "kafka.rest:type=*, topic=*, partition=*, client-id=*" - # "kafka.rest:type=*, cipher=*, protocol=*, client-id=*" - - pattern: kafka.(\w+)<>Value - name: kafka_$1_$2 - labels: - "$3": "$4" - "$5": "$6" - "$7": "$8" - # Count and Value - # "kafka.server:type=*, name=*, topic=*" - # "kafka.server:type=*, name=*, clientId=*" - # "kafka.server:type=*, name=*, delayedOperation=*" - # "kafka.server:type=*, name=*, fetcherType=*" - # "kafka.network:type=*, name=*, networkProcessor=*" - # "kafka.network:type=*, name=*, processor=*" - # "kafka.network:type=*, name=*, request=*" - # "kafka.network:type=*, name=*, listener=*" - # "kafka.log:type=*, name=*, logDirectory=*" - # "kafka.log:type=*, name=*, op=*" - # "kafka.rest:type=*, node-id=*, client-id=*" - - pattern: kafka.(\w+)<>(Count|Value) - name: kafka_$1_$2_$3 - labels: - "$4": "$5" - # "kafka.consumer:type=*, topic=*, client-id=*" - # "kafka.producer:type=*, topic=*, client-id=*" - # "kafka.rest:type=*, topic=*, client-id=*" - # "kafka.server:type=*, broker-id=*, fetcher-id=*" - # "kafka.server:type=*, listener=*, networkProcessor=*" - - pattern: kafka.(\w+)<>(Count|Value) - name: kafka_$1_$2 - labels: - "$3": "$4" 
- "$5": "$6" - # "kafka.network:type=*, name=*" - # "kafka.server:type=*, name=*" - # "kafka.controller:type=*, name=*" - # "kafka.databalancer:type=*, name=*" - # "kafka.log:type=*, name=*" - # "kafka.utils:type=*, name=*" - - pattern: kafka.(\w+)<>(Count|Value) - name: kafka_$1_$2_$3 - # "kafka.producer:type=*, client-id=*" - # "kafka.producer:type=*, id=*" - # "kafka.rest:type=*, client-id=*" - # "kafka.rest:type=*, http-status-code=*" - # "kafka.server:type=*, BrokerId=*" - # "kafka.server:type=*, listener=*" - # "kafka.server:type=*, id=*" - - pattern: kafka.(\w+)<>Value - name: kafka_$1_$2 - labels: - "$3": "$4" - - - pattern: kafka.server<>OneMinuteRate - name: kafka_server_kafkarequesthandlerpool_requesthandleravgidlepercent_total - type: GAUGE - # "kafka.server:type=*, listener=*, networkProcessor=*, clientSoftwareName=*, clientSoftwareVersion=*" - - pattern: kafka.server<>connections - name: kafka_server_socketservermetrics_connections - type: GAUGE - labels: - client_software_name: "$1" - client_software_version: "$2" - listener: "$3" - network_processor: "$4" - - pattern: "kafka.server<>(.+):" - name: kafka_server_socketservermetrics_$3 - type: GAUGE - labels: - listener: "$1" - network_processor: "$2" - # "kafka.coordinator.group:type=*, name=*" - # "kafka.coordinator.transaction:type=*, name=*" - - pattern: kafka.coordinator.(\w+)<>(Count|Value) - name: kafka_coordinator_$1_$2_$3 - # Percentile - - pattern: kafka.(\w+)<>(\d+)thPercentile - name: kafka_$1_$2_$3 - type: GAUGE - labels: - "$4": "$5" - quantile: "0.$6" - - pattern: kafka.(\w+)<>(\d+)thPercentile - name: kafka_$1_$2_$3 - type: GAUGE - labels: - quantile: "0.$4" - # Additional Rules for Confluent Server Metrics - # 'confluent.metadata:type=*, name=*, topic=*, partition=*' - - pattern: confluent.(\w+)<>Value - name: confluent_$1_$2 - type: GAUGE - labels: - "$3": "$4" - "$5": "$6" - "$7": "$8" - # 'confluent.metadata.service:type=*, node-id=*, client-id=*' - - pattern: confluent.(.+)<>Value - name: confluent_$1_$2 - type: GAUGE - labels: - "$3": "$4" - "$5": "$6" - # 'confluent.metadata.service:type=*, client-id=*' - # 'confluent.metadata.service:type=*, id=*' - # 'confluent.metadata:type=*, name=*' - # 'confluent.license:type=*, name=*' - - pattern: confluent.(.+)<>Value - name: confluent_$1_$2 - type: GAUGE - labels: - "$3": "$4" - - # Quotas - - pattern : 'kafka.server<>(.+):' - name: kafka_server_$1_$4 - type: GAUGE - labels: - user: "$2" - client-id: "$3" - - - pattern : 'kafka.server<>(.+):' - name: kafka_server_$1_$3 - type: GAUGE - labels: - user: "$2" - - - pattern : 'kafka.server<>(.+):' - name: kafka_server_$1_$3 - type: GAUGE - labels: - client-id: "$2" diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..6f5ac12c --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python3 +# Copyright 2022 Canonical Ltd. +# See LICENSE file for licensing details. 
+import ops.testing + +ops.testing.SIMULATE_CAN_CONNECT = True diff --git a/tests/fixtures/valid_server.properties b/tests/fixtures/valid_server.properties new file mode 100644 index 00000000..dccf7741 --- /dev/null +++ b/tests/fixtures/valid_server.properties @@ -0,0 +1,6 @@ +broker.id=1 +clientPort=2181 +broker.id.generation.enable=true +listeners=PLAINTEXT://:9092 +advertised.listeners=PLAINTEXT://:9092 +log.dirs=/var/lib/kafka/data diff --git a/tests/integration/app-charm/charmcraft.yaml b/tests/integration/app-charm/charmcraft.yaml new file mode 100644 index 00000000..e109b8b2 --- /dev/null +++ b/tests/integration/app-charm/charmcraft.yaml @@ -0,0 +1,11 @@ +# Copyright 2022 Canonical Ltd. +# See LICENSE file for licensing details. + +type: charm +bases: + - build-on: + - name: "ubuntu" + channel: "20.04" + run-on: + - name: "ubuntu" + channel: "20.04" diff --git a/tests/integration/app-charm/metadata.yaml b/tests/integration/app-charm/metadata.yaml new file mode 100644 index 00000000..de52389d --- /dev/null +++ b/tests/integration/app-charm/metadata.yaml @@ -0,0 +1,17 @@ +# Copyright 2022 Canonical Ltd. +# See LICENSE file for licensing details. + +name: application +description: | + Dummy charm used in integration tests for Kafka. +summary: | + Dummy charm application meant to be used + only for testing of the libs in this repository. + +peers: + cluster: + interface: cluster + +requires: + kafka: + interface: kafka diff --git a/tests/integration/app-charm/requirements.txt b/tests/integration/app-charm/requirements.txt new file mode 100644 index 00000000..56f5f642 --- /dev/null +++ b/tests/integration/app-charm/requirements.txt @@ -0,0 +1 @@ +ops >= 1.5.0 diff --git a/tests/integration/app-charm/src/charm.py b/tests/integration/app-charm/src/charm.py new file mode 100755 index 00000000..a30759a2 --- /dev/null +++ b/tests/integration/app-charm/src/charm.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +# Copyright 2022 Canonical Ltd. +# See LICENSE file for licensing details. + +"""Application charm that connects to database charms. + +This charm is meant to be used only for testing +of the libraries in this repository. +""" + +import logging + +from ops.charm import CharmBase, RelationEvent +from ops.main import main +from ops.model import ActiveStatus + +logger = logging.getLogger(__name__) + + +CHARM_KEY = "app" +PEER = "cluster" +REL_NAME = "kafka" + + +class ApplicationCharm(CharmBase): + """Application charm that connects to database charms.""" + + def __init__(self, *args): + super().__init__(*args) + self.name = CHARM_KEY + + self.framework.observe(getattr(self.on, "start"), self._on_start) + self.framework.observe(self.on[REL_NAME].relation_changed, self._log) + self.framework.observe(self.on[REL_NAME].relation_broken, self._log) + self.framework.observe(self.on[REL_NAME].relation_joined, self._set_data) + + @property + def relation(self): + return self.model.get_relation(REL_NAME) + + def _on_start(self, _) -> None: + self.unit.status = ActiveStatus() + + def _set_data(self, _) -> None: + return + + def _log(self, event: RelationEvent): + return + + +if __name__ == "__main__": + main(ApplicationCharm) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py new file mode 100644 index 00000000..fe63d776 --- /dev/null +++ b/tests/integration/helpers.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +# Copyright 2022 Canonical Ltd. +# See LICENSE file for licensing details. 
+import re +from pathlib import Path +from subprocess import PIPE, check_output +from typing import Any, List, Tuple + +import yaml + +METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) +APP_NAME = METADATA["name"] + + +def check_user(model_full_name: str, username: str, zookeeper_uri: str) -> None: + result = check_output( + f"JUJU_MODEL={model_full_name} juju ssh kafka-k8s/0 'kafka.configs --zookeeper {zookeeper_uri} --describe --entity-type users --entity-name {username}'", + stderr=PIPE, + shell=True, + universal_newlines=True, + ) + + assert "SCRAM-SHA-512" in result + + +def show_unit(unit_name: str, model_full_name: str) -> Any: + result = check_output( + f"JUJU_MODEL={model_full_name} juju show-unit {unit_name}", + stderr=PIPE, + shell=True, + universal_newlines=True, + ) + + return yaml.safe_load(result) + + +def get_zookeeper_connection(unit_name: str, model_full_name: str) -> Tuple[List[str], str]: + result = show_unit(unit_name=unit_name, model_full_name=model_full_name) + + relations_info = result[unit_name]["relation-info"] + + usernames = [] + zookeeper_uri = "" + for info in relations_info: + if info["endpoint"] == "cluster": + for key in info["application-data"].keys(): + if re.match(r"(relation\-[\d]+)", key): + usernames.append(key) + if info["endpoint"] == "zookeeper": + zookeeper_uri = info["application-data"]["uris"] + + if zookeeper_uri and usernames: + return usernames, zookeeper_uri + else: + raise Exception("config not found") diff --git a/tests/integration/test_charm.py b/tests/integration/test_charm.py index 03d4b7bb..b37fa48f 100644 --- a/tests/integration/test_charm.py +++ b/tests/integration/test_charm.py @@ -2,42 +2,37 @@ # Copyright 2022 Canonical Ltd. # See LICENSE file for licensing details. - +import asyncio import logging -from pathlib import Path import pytest -import yaml from pytest_operator.plugin import OpsTest logger = logging.getLogger(__name__) -METADATA = yaml.safe_load(Path("./metadata.yaml").read_text()) +APP_NAME = "kafka" +ZK = "zookeeper-k8s" @pytest.mark.abort_on_fail async def test_build_and_deploy(ops_test: OpsTest): - """Build the charm kafka-k8s and deploy it together with related charms. - - Assert on the unit status before any relations/configurations take place. 
- """ - await ops_test.model.set_config({"update-status-hook-interval": "10s"}) - await ops_test.model.deploy("zookeeper-k8s", channel="edge", application_name="zookeeper-k8s") - - charm = await ops_test.build_charm(".") - resources = {"kafka-image": METADATA["resources"]["kafka-image"]["upstream-source"]} - await ops_test.model.deploy( - charm, resources=resources, application_name="kafka-k8s", config={"metrics": False} + kafka_charm = await ops_test.build_charm(".") + await asyncio.gather( + ops_test.model.deploy("zookeeper-k8s", channel="edge", application_name=ZK, num_units=1), + ops_test.model.deploy(kafka_charm, application_name=APP_NAME, num_units=1), ) - await ops_test.model.add_relation("kafka-k8s:zookeeper", "zookeeper-k8s:zookeeper") - await ops_test.model.wait_for_idle( - apps=["kafka-k8s", "zookeeper-k8s"], status="active", timeout=1000 - ) - assert ops_test.model.applications["kafka-k8s"].units[0].workload_status == "active" + await ops_test.model.wait_for_idle(apps=[APP_NAME, ZK]) + assert ops_test.model.applications[APP_NAME].status == "waiting" + assert ops_test.model.applications[ZK].status == "active" + + await ops_test.model.add_relation(APP_NAME, ZK) + await ops_test.model.wait_for_idle(apps=[APP_NAME, ZK]) + assert ops_test.model.applications[APP_NAME].status == "active" + assert ops_test.model.applications[ZK].status == "active" - logger.debug("Setting update-status-hook-interval to 60m") - await ops_test.model.set_config({"update-status-hook-interval": "60m"}) - # Scale kafka - await ops_test.model.applications["kafka-k8s"].scale(3) - await ops_test.model.wait_for_idle(apps=["kafka-k8s"], status="active", timeout=1000) +@pytest.mark.abort_on_fail +async def test_blocks_without_zookeeper(ops_test: OpsTest): + await asyncio.gather(ops_test.model.applications[ZK].remove()) + await ops_test.model.wait_for_idle(apps=[APP_NAME]) + assert ops_test.model.applications[ZK].status == "blocked" diff --git a/tests/integration/test_kafka_provider.py b/tests/integration/test_kafka_provider.py new file mode 100644 index 00000000..9c93aae8 --- /dev/null +++ b/tests/integration/test_kafka_provider.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +# Copyright 2022 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +import asyncio +import logging + +import pytest +from pytest_operator.plugin import OpsTest + +from tests.integration.helpers import check_user, get_zookeeper_connection + +logger = logging.getLogger(__name__) + +APP_NAME = "kafka" +ZK = "zookeeper" +DUMMY_NAME_1 = "app" +DUMMY_NAME_2 = "appii" + + +@pytest.fixture(scope="module") +def usernames(): + return set() + + +@pytest.mark.abort_on_fail +async def test_deploy_charms_relate_active(ops_test: OpsTest, usernames): + zk_charm = await ops_test.build_charm(".") + app_charm = await ops_test.build_charm("tests/integration/app-charm") + + await asyncio.gather( + ops_test.model.deploy( + "zookeeper", channel="edge", application_name="zookeeper", num_units=1 + ), + ops_test.model.deploy(zk_charm, application_name=APP_NAME, num_units=1), + ops_test.model.deploy(app_charm, application_name=DUMMY_NAME_1, num_units=1), + ) + await ops_test.model.wait_for_idle(apps=[APP_NAME, DUMMY_NAME_1, ZK]) + await ops_test.model.add_relation(APP_NAME, ZK) + await ops_test.model.wait_for_idle(apps=[APP_NAME, ZK]) + await ops_test.model.add_relation(APP_NAME, DUMMY_NAME_1) + await ops_test.model.wait_for_idle(apps=[APP_NAME, DUMMY_NAME_1]) + assert ops_test.model.applications[APP_NAME].status == "active" + assert ops_test.model.applications[DUMMY_NAME_1].status == "active" + + # implicitly tests setting of kafka app data + returned_usernames, zookeeper_uri = get_zookeeper_connection( + unit_name="kafka/0", model_full_name=ops_test.model_full_name + ) + usernames.update(returned_usernames) + + for username in usernames: + check_user( + username=username, + zookeeper_uri=zookeeper_uri, + model_full_name=ops_test.model_full_name, + ) + + +@pytest.mark.abort_on_fail +async def test_deploy_multiple_charms_relate_active(ops_test: OpsTest, usernames): + appii_charm = await ops_test.build_charm("tests/integration/app-charm") + await ops_test.model.deploy(appii_charm, application_name=DUMMY_NAME_2, num_units=1), + await ops_test.model.wait_for_idle(apps=[DUMMY_NAME_2]) + await ops_test.model.add_relation(APP_NAME, DUMMY_NAME_2) + await ops_test.model.wait_for_idle(apps=[APP_NAME, DUMMY_NAME_2]) + assert ops_test.model.applications[APP_NAME].status == "active" + assert ops_test.model.applications[DUMMY_NAME_1].status == "active" + assert ops_test.model.applications[DUMMY_NAME_2].status == "active" + + returned_usernames, zookeeper_uri = get_zookeeper_connection( + unit_name="kafka/0", model_full_name=ops_test.model_full_name + ) + usernames.update(returned_usernames) + + for username in usernames: + check_user( + username=username, + zookeeper_uri=zookeeper_uri, + model_full_name=ops_test.model_full_name, + ) + + +@pytest.mark.abort_on_fail +async def test_remove_application_removes_user(ops_test: OpsTest, usernames): + await ops_test.model.applications[DUMMY_NAME_1].remove() + await ops_test.model.wait_for_idle(apps=[APP_NAME]) + assert ops_test.model.applications[APP_NAME].status == "active" + + _, zookeeper_uri = get_zookeeper_connection( + unit_name="kafka/0", model_full_name=ops_test.model_full_name + ) + + # checks that past usernames no longer exist in ZooKeeper + with pytest.raises(AssertionError): + for username in usernames: + check_user( + username=username, + zookeeper_uri=zookeeper_uri, + model_full_name=ops_test.model_full_name, + ) diff --git a/tests/unit/test_charm.py b/tests/unit/test_charm.py deleted file mode 100644 index 21c7043e..00000000 --- a/tests/unit/test_charm.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright 2022 Canonical Ltd. 
-# See LICENSE file for licensing details. - -import pytest -from ops.model import ActiveStatus, BlockedStatus, MaintenanceStatus, WaitingStatus -from ops.testing import Harness -from pytest_mock import MockerFixture - -from charm import CharmError, KafkaK8sCharm - - -@pytest.fixture -def harness(mocker: MockerFixture): - mocker.patch("charm.KubernetesServicePatch", lambda x, y: None) - kafka_harness = Harness(KafkaK8sCharm) - kafka_harness.begin() - kafka_harness.charm.unit.get_container("kafka").push("/entrypoint", "") - yield kafka_harness - kafka_harness.cleanup() - - -def test_on_config_changed(mocker: MockerFixture, harness: Harness): - # test config validation - _validate_config_original = harness.charm._validate_config - harness.charm._validate_config = mocker.Mock() - harness.charm._validate_config.side_effect = [CharmError("invalid configuration")] - harness.charm.on.config_changed.emit() - assert harness.charm.unit.status == BlockedStatus("invalid configuration") - harness.charm._validate_config = _validate_config_original - # zookeeper not ready - harness.charm.on.config_changed.emit() - assert harness.charm.unit.status == BlockedStatus("need zookeeper relation") - # zookeeker ready - relation_id = harness.add_relation("zookeeper", "zookeeper") - harness.add_relation_unit(relation_id, "zookeeper/0") - harness.update_relation_data(relation_id, "zookeeper", {"hosts": "zk-1"}) - # test pebble not ready - harness.charm.unit.get_container("kafka").can_connect = mocker.Mock() - harness.charm.unit.get_container("kafka").can_connect.side_effect = [False] - harness.charm.on.config_changed.emit() - assert harness.charm.unit.status == MaintenanceStatus("waiting for pebble to start") - harness.charm.unit.get_container("kafka").can_connect.side_effect = None - # test pebble ready - not jmx resource - harness.charm.on.kafka_pebble_ready.emit("kafka") - assert harness.charm.unit.status == BlockedStatus("Missing 'jmx-prometheus-jar' resource") - # jmx resource added - mocker.patch("charm.KafkaK8sCharm._setup_metrics") - kafka_properties = """clientPort=2181 - broker.id.generation.enable=true - - # comment - invalid-line - """ - harness.update_config({"kafka-properties": kafka_properties}) - assert harness.charm.unit.status == ActiveStatus() - assert harness.charm.kafka_properties == { - "KAFKA_CLIENT_PORT": "2181", - "KAFKA_BROKER_ID_GENERATION_ENABLE": "true", - } - - -def test_on_update_status(mocker: MockerFixture, harness: Harness): - mocker.patch("charm.KafkaK8sCharm._setup_metrics") - # ZooKeeper not ready - harness.charm.on.update_status.emit() - assert harness.charm.unit.status == BlockedStatus("need zookeeper relation") - # ZooKeeper ready - mocker.patch( - "charms.zookeeper_k8s.v0.zookeeper.ZooKeeperRequires.hosts", - return_value="zk-1", - new_callable=mocker.PropertyMock, - ) - # test service not ready - harness.charm.on.update_status.emit() - assert harness.charm.unit.status == WaitingStatus("kafka service not configured yet") - # test service ready - harness.charm.on.kafka_pebble_ready.emit("kafka") - assert harness.charm.unit.status == ActiveStatus() - # test service not running - harness.charm.unit.get_container("kafka").stop("kafka") - harness.charm.on.update_status.emit() - assert harness.charm.unit.status == BlockedStatus("kafka service is not running") - - -def test_on_zookeeper_clients_broken(harness: Harness): - harness.charm.on.kafka_pebble_ready.emit("kafka") - harness.charm.on.zookeeper_clients_broken.emit() - assert harness.charm.unit.status == BlockedStatus("need 
zookeeper relation") - - -def test_kafka_relation(mocker: MockerFixture, harness: Harness): - test_on_config_changed(mocker, harness) - harness.set_leader(True) - relation_id = harness.add_relation("kafka", "kafka-client") - harness.add_relation_unit(relation_id, "kafka-client/0") - relation_data = harness.get_relation_data(relation_id, harness.charm.app.name) - assert relation_data == {"host": "kafka-k8s", "port": "9092"} diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py new file mode 100644 index 00000000..3e71e6ae --- /dev/null +++ b/tests/unit/test_config.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +# Copyright 2022 Canonical Ltd. +# See LICENSE file for licensing details. + +import unittest + +import ops.testing +from ops.testing import Harness + +from charm import KafkaK8sCharm + +ops.testing.SIMULATE_CAN_CONNECT = True + + +class TestKafkaConfig(unittest.TestCase): + def setUp(self): + self.harness = Harness(KafkaK8sCharm) + self.addCleanup(self.harness.cleanup) + self.harness.begin_with_initial_hooks() + self.relation_id = self.harness.add_relation("zookeeper", "kafka-k8s") + + def test_zookeeper_config_succeeds_fails_config(self): + self.harness.update_relation_data( + self.relation_id, + self.harness.charm.app.name, + { + "chroot": "/kafka", + "username": "moria", + "endpoints": "1.1.1.1,2.2.2.2", + "uris": "1.1.1.1:2181,2.2.2.2:2181/kafka", + }, + ) + self.assertDictEqual(self.harness.charm.kafka_config.zookeeper_config, {}) + + def test_zookeeper_config_succeeds_valid_config(self): + self.harness.update_relation_data( + self.relation_id, + self.harness.charm.app.name, + { + "chroot": "/kafka", + "username": "moria", + "password": "mellon", + "endpoints": "1.1.1.1,2.2.2.2", + "uris": "1.1.1.1:2181/kafka,2.2.2.2:2181/kafka", + }, + ) + self.assertIn("connect", self.harness.charm.kafka_config.zookeeper_config.keys()) + self.assertEqual( + self.harness.charm.kafka_config.zookeeper_config["connect"], + "1.1.1.1:2181,2.2.2.2:2181/kafka", + ) diff --git a/tests/unit/test_provider.py b/tests/unit/test_provider.py new file mode 100644 index 00000000..81621a27 --- /dev/null +++ b/tests/unit/test_provider.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# Copyright 2022 Canonical Ltd. +# See LICENSE file for licensing details. 
+ +import logging +import unittest +from collections import namedtuple + +import ops.testing +from ops.charm import CharmBase +from ops.testing import Harness + +from provider import KafkaProvider + +ops.testing.SIMULATE_CAN_CONNECT = True + +logger = logging.getLogger(__name__) + +METADATA = """ + name: kafka + peers: + cluster: + interface: cluster + provides: + kafka: + interface: kafka +""" + +CustomRelation = namedtuple("Relation", ["id"]) + + +class DummyKafkaCharm(CharmBase): + def __init__(self, *args): + super().__init__(*args) + self.client_relation = KafkaProvider(self) + + +class TestProvider(unittest.TestCase): + def setUp(self): + self.harness = Harness(DummyKafkaCharm, meta=METADATA) + self.addCleanup(self.harness.cleanup) + self.harness.begin_with_initial_hooks() + + @property + def provider(self): + return self.harness.charm.client_relation + + def test_relation_config_new_relation_no_password(self): + self.harness.set_leader(True) + relation_id = self.harness.add_relation("kafka", "client_app") + + config = self.harness.charm.client_relation.relation_config( + relation=self.harness.charm.model.get_relation( + relation_name="kafka", relation_id=relation_id + ) + ) + + self.assertEqual(sorted(["endpoints", "password", "username"]), sorted(config.keys())) + self.assertEqual(sorted(config["endpoints"].split(",")), ["kafka-0.kafka-endpoints"]) + self.assertEqual(len(config["password"]), 32) + + def test_relation_config_existing_relation_password(self): + self.harness.set_leader(True) + relation_id = self.harness.add_relation("kafka", "client_app") + self.harness.update_relation_data( + self.harness.charm.model.get_relation("cluster").id, + "kafka", + {"relation-1": "keepitsecret"}, + ) + + config = self.harness.charm.client_relation.relation_config( + relation=self.harness.charm.model.get_relation( + relation_name="kafka", relation_id=relation_id + ) + ) + + self.assertEqual(config["password"], "keepitsecret") diff --git a/tox.ini b/tox.ini index b1e139b0..99417567 100644 --- a/tox.ini +++ b/tox.ini @@ -7,9 +7,10 @@ skip_missing_interpreters = True envlist = lint, unit [vars] +application = kafka-k8s src_path = {toxinidir}/src/ tst_path = {toxinidir}/tests/ -lib_path = {toxinidir}/lib/charms/kafka_k8s +lib_path = {toxinidir}/lib/charms/zookeeper all_path = {[vars]src_path} {[vars]tst_path} [testenv] @@ -22,6 +23,16 @@ passenv = CHARM_BUILD_DIR MODEL_SETTINGS +[testenv:refresh] +description = Short refresh script for charm-dev +commands = + /bin/bash -ec "juju destroy-model {[vars]application} --force --destroy-storage --no-wait" + /bin/bash -ec "juju add-model {[vars]application}" + /bin/bash -ec "juju deploy zookeeper-k8s --channel edge -n 3" + /bin/bash -ec "charmcraft pack" + /bin/bash -ec "juju deploy ./*.charm -n 3" + /bin/bash -ec "juju relate {[vars]application} zookeeper-k8s" + [testenv:fmt] description = Apply coding style standards to code deps = @@ -35,7 +46,7 @@ commands = description = Check code against coding style standards deps = black - flake8 + flake8 == 4.0.1 flake8-docstrings flake8-copyright flake8-builtins @@ -89,7 +100,10 @@ deps = pytest juju pytest-operator + kazoo + tenacity + pure-sasl + kafka-python -r{toxinidir}/requirements.txt - -r{toxinidir}/requirements-dev.txt commands = pytest -v --tb native --ignore={[vars]tst_path}unit --log-cli-level=INFO -s {posargs}
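For reference, the new `KafkaProvider` added in this changeset publishes `username`, `password`, and `endpoints` in the Kafka application databag whenever a client relates over the `kafka` interface (see `relation_config()` and `_on_client_relation_joined()` in `src/provider.py` above). The dummy charm in `tests/integration/app-charm` deliberately leaves its `_set_data` and `_log` handlers as no-ops, so the sketch below shows, purely as an illustration, how a real client charm might consume that data. It assumes a charm whose `metadata.yaml` requires the `kafka` interface (as the test app-charm does); the charm name `ExampleKafkaClient` and its logging choices are hypothetical and not part of this diff.

```python
# Illustrative requirer-side sketch only; not part of this changeset.
import logging

from ops.charm import CharmBase, RelationChangedEvent
from ops.main import main

logger = logging.getLogger(__name__)


class ExampleKafkaClient(CharmBase):
    """Hypothetical client charm consuming the `kafka` relation."""

    def __init__(self, *args):
        super().__init__(*args)
        # KafkaProvider writes its data on relation-joined, so watch relation-changed.
        self.framework.observe(self.on["kafka"].relation_changed, self._on_kafka_changed)

    def _on_kafka_changed(self, event: RelationChangedEvent) -> None:
        # Read the application databag published by the Kafka provider side.
        data = event.relation.data[event.app]
        username = data.get("username")
        password = data.get("password")
        endpoints = data.get("endpoints", "")

        if not (username and password and endpoints):
            # Credentials not published yet; retry on a later event.
            event.defer()
            return

        # `endpoints` is a comma-separated list built from the Kafka unit hostnames.
        logger.info("Kafka bootstrap servers for %s: %s", username, endpoints.split(","))


if __name__ == "__main__":
    main(ExampleKafkaClient)
```

The credentials themselves are created on the provider side (`generate_password()` and the ZooKeeper SCRAM helpers), so a client only ever reads them from the relation data; it never writes them.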