From ee8b16282e9f1a857e97b5402c1b46e8913af9d6 Mon Sep 17 00:00:00 2001 From: Julien Clarysse Date: Thu, 5 Dec 2024 16:32:45 +0100 Subject: [PATCH] add: MirrorMaker troubleshooting section and first guide Create a new menu section dedicated to troubleshooting MirrorMaker, along with first guide "Why are some topics or partitions not replicated" [DOC-1212] --- .../troubleshooting/topic-not-replicated.md | 56 +++++++++++++++++++ sidebars.ts | 7 +++ 2 files changed, 63 insertions(+) create mode 100644 docs/products/kafka/kafka-mirrormaker/troubleshooting/topic-not-replicated.md diff --git a/docs/products/kafka/kafka-mirrormaker/troubleshooting/topic-not-replicated.md b/docs/products/kafka/kafka-mirrormaker/troubleshooting/topic-not-replicated.md new file mode 100644 index 00000000..03846794 --- /dev/null +++ b/docs/products/kafka/kafka-mirrormaker/troubleshooting/topic-not-replicated.md @@ -0,0 +1,56 @@ +--- +title: Why are some topics or partitions not replicated +--- + +Apache Kafka® MirrorMaker 2 stores its configs, states and offsets to ensure resilient message replication across different Kafka clusters. +There are a number of factors that may prevent target topics (or their partitions) from progressing as you would expect. + +The following guideline assumes that the user has previously [set up a MirrorMaker replication flow](/docs/products/kafka/kafka-mirrormaker/howto/setup-replication-flow) +and has assessed an issue with the replication of some topics or partitions. + +:::note +There are different ways of performing this issue assessment, including basic monitoring +and [offset sync status analysis](/docs/products/kafka/kafka-mirrormaker/howto/log-analysis-offset-sync-tool).
+::: + +## RecordTooLargeException is observed in the logs + +This can be fixed by increasing the value of broker config message_max_bytes at target, as well as integration config [producer_max_request_size](https://registry.terraform.io/providers/aiven/aiven/latest/docs/resources/service_integration#producer_max_request_size). Following to this change, it is necessary that the workers restart their task. + +## Configuration issue + +The following two worker parameters may significantly impact how topic partitions are replicated: + +- [kafka_mirrormaker.offset_lag_max](/docs/products/kafka/kafka-mirrormaker/reference/advanced-params#kafka_mirrormaker_offset_lag_max) (100 by default) defines how out-of-sync a remote partition can be before it is re-synced. Given that this parameter is global to all replication topics, you need to find a compromise between: + - a too low value that may put more load on your MirrorMaker workers and Kafka brokers due to high-throughput topics + - a too high value that may prevent the replication of low-throughput topics from progressing + +- [kafka_mirrormaker.tasks_max_per_cpu](/docs/products/kafka/kafka-mirrormaker/reference/advanced-params#kafka_mirrormaker_tasks_max_per_cpu) (1 by default) influences the maximum number of MirrorMaker tasks (of each type) per service CPU. For example in a typical cluster of 3 nodes of 2 CPU, the MirrorMaker `tasks.max` is automatically set to 6 per default, allowing for 6 different tasks to execute at the same time. The optimal and best performance case is one Kafka consumer per partition. If MirrorMaker has to process more partitions than replication tasks available, then the tasks will get assigned more than one: + - a too low value may introduce unexpected replication lag on some partitions + - a too high value result in the creation of idle tasks + +## Offset issue + +By default, MirrorMaker is storing its offsets at target cluster in internal topic `mm2-offsets..internal`. 
By design, it checks whether it has already stored an offset for the replication topic; if so, it always continues mirroring from there, to avoid duplicate consumption.

This behaviour might confuse users as to why topics are not replicated from their earliest offset, for example:
- When a replication topic had already been replicated some time ago, and was then removed from the replication flow.
- When a replication topic had been deleted and re-created at source.

In both cases, the recommendation is to avoid these scenarios in production. Should they occur and require mitigation, the solution is to manually trigger a MirrorMaker offset reset. There are two different options:

#### You can afford to reset offsets for the entire replication flow

1. Disable the replication flow.
1. Delete the internal offsets topic.
1. Re-enable the replication flow.
1. The internal offsets topic is automatically re-created.
1. MirrorMaker replicates from the earliest offset.

#### You need to reset offsets related to a single topic

1. Following the steps defined in [Configure properties for Apache Kafka® toolbox](/docs/products/kafka/howto/kafka-tools-config-file), create a configuration file that can be used to access the Kafka cluster that hosts the internal offsets topic.
1. Disable the replication flow.
1. Produce a delete record (also known as a tombstone) to the offset storage for the replication topic to reset. This can be achieved using [this script](https://gist.github.com/C0urante/30dba7b9dce567f33df0526d68765860). Note that the `-o` option does not need to be specified if the default offsets storage topic location applies.
1. Re-enable the replication flow.
1. MirrorMaker replicates from the earliest offset.
diff --git a/sidebars.ts b/sidebars.ts index 2bbdfbf7..0470db47 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -1063,6 +1063,13 @@ const sidebars: SidebarsConfig = { 'products/kafka/kafka-mirrormaker/howto/log-analysis-offset-sync-tool', ], }, + { + type: 'category', + label: 'Troubleshooting', + items: [ + 'products/kafka/kafka-mirrormaker/troubleshooting/topic-not-replicated', + ], + }, { type: 'category', label: 'Reference',