From 90fde03e3ef63d061f1a166dc79e1a796641f86f Mon Sep 17 00:00:00 2001 From: Xin Liao Date: Wed, 7 Aug 2024 10:26:19 +0800 Subject: [PATCH] [Opt](heartbeat) improve the tolerance of the cluster to occasional heartbeat failures (#38896) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To improve the tolerance of the cluster to occasional heartbeat failures, change max_backend_heartbeat_failure_tolerance_count from 1 to 3. --- .../src/main/java/org/apache/doris/common/Config.java | 4 ++-- .../test/java/org/apache/doris/clone/TabletHealthTest.java | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index 21a0694e4096e3..57f89bc8006927 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -1923,12 +1923,12 @@ public class Config extends ConfigBase { /** * Maximum backend heartbeat failure tolerance count. - * Default is 1, which means if 1 heart failed, the backend will be marked as dead. + * Default is 3, which means if 3 heartbeats failed, the backend will be marked as dead. * A larger value can improve the tolerance of the cluster to occasional heartbeat failures. * For example, when running regression tests, this value can be increased. */ @ConfField(mutable = true, masterOnly = true) - public static long max_backend_heartbeat_failure_tolerance_count = 1; + public static long max_backend_heartbeat_failure_tolerance_count = 3; /** * Abort transaction time after lost heartbeat. 
diff --git a/fe/fe-core/src/test/java/org/apache/doris/clone/TabletHealthTest.java b/fe/fe-core/src/test/java/org/apache/doris/clone/TabletHealthTest.java index b22925e5d89270..7949352eeb363d 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/clone/TabletHealthTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/clone/TabletHealthTest.java @@ -61,6 +61,7 @@ protected void beforeCreatingConnectContext() throws Exception { Config.colocate_group_relocate_delay_second = -1000; // be dead will imm relocate Config.tablet_schedule_interval_ms = 7200_000L; //disable schedule Config.tablet_checker_interval_ms = 7200_000L; //disable checker + Config.max_backend_heartbeat_failure_tolerance_count = 1; } @Override