Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

health_check: add stats counters to monitor health check behavior #37409

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions changelogs/current.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,10 @@ new_features:
change: |
Added :ref:`attribute <arch_overview_attributes>` ``upstream.cx_pool_ready_duration``
to get the duration from when the upstream request was created to when the upstream connection pool is ready.
- area: health_check
change: |
Added new health check filter stats including total requests, successful/failed checks, cached responses, and
cluster health status counters. These stats help track health check behavior and cluster health state.

deprecated:
- area: rbac
Expand Down
20 changes: 20 additions & 0 deletions docs/root/configuration/http/http_filters/health_check_filter.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,23 @@ Health check
<operations_admin_interface_healthcheck_fail>` admin endpoint has been called. (The
:ref:`/healthcheck/ok <operations_admin_interface_healthcheck_ok>` admin endpoint reverses this
behavior).

Statistics
----------

The health check filter outputs statistics in the ``http.<stat_prefix>.health_check.`` namespace. The
:ref:`stat prefix <envoy_v3_api_field_extensions.filters.network.http_connection_manager.v3.HttpConnectionManager.stat_prefix>`
comes from the owning HTTP connection manager.

.. csv-table::
:header: Name, Type, Description
:widths: 1, 1, 2

request_total, Counter, Total number of requests processed by this health check filter (including responses served from the cache)
failed, Counter, Total number of health checks that failed (including failures due to cluster status and responses served from the cache)
ok, Counter, Total number of health checks that passed (including responses served from the cache)
cached_response, Counter, Total number of requests that were responded to with cached health check status
failed_cluster_not_found, Counter, Total number of failed health checks due to referenced cluster not being found
failed_cluster_empty, Counter, Total number of failed health checks due to empty cluster membership when checking cluster health
failed_cluster_unhealthy, Counter, Total number of failed health checks due to cluster falling below minimum healthy percentage threshold
degraded, Counter, Total number of health check responses that reported degraded status
9 changes: 6 additions & 3 deletions source/extensions/filters/http/health_check/config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@ namespace HealthCheck {

Http::FilterFactoryCb HealthCheckFilterConfig::createFilterFactoryFromProtoTyped(
const envoy::extensions::filters::http::health_check::v3::HealthCheck& proto_config,
const std::string&, Server::Configuration::FactoryContext& context) {
const std::string& stats_prefix, Server::Configuration::FactoryContext& context) {
ASSERT(proto_config.has_pass_through_mode());

auto stats = std::make_shared<HealthCheckFilterStats>(
HealthCheckFilterStats::generateStats(stats_prefix, context.scope()));
const bool pass_through_mode = proto_config.pass_through_mode().value();
const int64_t cache_time_ms = PROTOBUF_GET_MS_OR_DEFAULT(proto_config, cache_time, 0);

Expand Down Expand Up @@ -48,10 +50,11 @@ Http::FilterFactoryCb HealthCheckFilterConfig::createFilterFactoryFromProtoTyped
}

return [&context, pass_through_mode, cache_manager, header_match_data,
cluster_min_healthy_percentages](Http::FilterChainFactoryCallbacks& callbacks) -> void {
cluster_min_healthy_percentages,
stats](Http::FilterChainFactoryCallbacks& callbacks) -> void {
callbacks.addStreamFilter(std::make_shared<HealthCheckFilter>(
context.serverFactoryContext(), pass_through_mode, cache_manager, header_match_data,
cluster_min_healthy_percentages));
cluster_min_healthy_percentages, stats));
};
}

Expand Down
15 changes: 14 additions & 1 deletion source/extensions/filters/http/health_check/health_check.cc
Original file line number Diff line number Diff line change
Expand Up @@ -119,16 +119,19 @@ void HealthCheckFilter::onComplete() {
Http::Code final_status = Http::Code::OK;
const std::string* details = &RcDetails::get().HealthCheckOk;
bool degraded = false;
stats_->request_total_.inc();
if (context_.healthCheckFailed()) {
callbacks_->streamInfo().setResponseFlag(StreamInfo::CoreResponseFlag::FailedLocalHealthCheck);
final_status = Http::Code::ServiceUnavailable;
details = &RcDetails::get().HealthCheckFailed;
stats_->failed_.inc();
} else {
if (cache_manager_) {
const auto status_and_degraded = cache_manager_->getCachedResponse();
final_status = status_and_degraded.first;
details = &RcDetails::get().HealthCheckCached;
degraded = status_and_degraded.second;
stats_->cached_response_.inc();
} else if (cluster_min_healthy_percentages_ != nullptr &&
!cluster_min_healthy_percentages_->empty()) {
// Check the status of the specified upstream cluster(s) to determine the right response.
agrawroh marked this conversation as resolved.
Show resolved Hide resolved
Expand All @@ -142,9 +145,10 @@ void HealthCheckFilter::onComplete() {
// If the cluster does not exist at all, consider the service unhealthy.
final_status = Http::Code::ServiceUnavailable;
details = &RcDetails::get().HealthCheckNoCluster;

stats_->failed_cluster_not_found_.inc();
break;
}

const auto& endpoint_stats = cluster->info()->endpointStats();
const uint64_t membership_total = endpoint_stats.membership_total_.value();
if (membership_total == 0) {
Expand All @@ -155,6 +159,7 @@ void HealthCheckFilter::onComplete() {
} else {
final_status = Http::Code::ServiceUnavailable;
details = &RcDetails::get().HealthCheckClusterEmpty;
stats_->failed_cluster_empty_.inc();
break;
}
}
Expand All @@ -165,6 +170,7 @@ void HealthCheckFilter::onComplete() {
membership_total * min_healthy_percentage) {
final_status = Http::Code::ServiceUnavailable;
details = &RcDetails::get().HealthCheckClusterUnhealthy;
stats_->failed_cluster_unhealthy_.inc();
break;
}
}
Expand All @@ -173,9 +179,16 @@ void HealthCheckFilter::onComplete() {
if (!Http::CodeUtility::is2xx(enumToInt(final_status))) {
callbacks_->streamInfo().setResponseFlag(
StreamInfo::CoreResponseFlag::FailedLocalHealthCheck);
stats_->failed_.inc();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thinking out loud: should the failed_/ok_ also be incremented for the cached response?
Either way, the doc above (about the new statistics) should reflect this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it does make sense to have fail/ok include the cached response as well. I have clarified this in the doc. Let me know if you think otherwise.

} else {
stats_->ok_.inc();
}
}

if (degraded) {
stats_->degraded_.inc();
}

callbacks_->sendLocalReply(
final_status, "",
[degraded](auto& headers) {
Expand Down
40 changes: 35 additions & 5 deletions source/extensions/filters/http/health_check/health_check.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,44 @@
#include "envoy/http/codes.h"
#include "envoy/http/filter.h"
#include "envoy/server/filter_config.h"
#include "envoy/stats/stats.h"
#include "envoy/stats/stats_macros.h"

#include "source/common/common/assert.h"
#include "source/common/http/header_utility.h"

namespace Envoy {
namespace Extensions {
namespace HttpFilters {
namespace HealthCheck {

/**
* All health check filter stats. Each COUNTER(name) entry names one counter. The list is
* expanded once with GENERATE_COUNTER_STRUCT to declare the members of HealthCheckFilterStats
* and once with POOL_COUNTER_PREFIX to initialize them, so a counter only needs to be added
* here to appear in both places.
* @see stats_macros.h
*/
#define ALL_HEALTH_CHECK_FILTER_STATS(COUNTER) \
COUNTER(request_total) \
COUNTER(failed) \
COUNTER(ok) \
COUNTER(cached_response) \
COUNTER(failed_cluster_not_found) \
COUNTER(failed_cluster_empty) \
COUNTER(failed_cluster_unhealthy) \
COUNTER(degraded)

/**
 * Counters emitted by the health check filter. The members are generated by expanding
 * ALL_HEALTH_CHECK_FILTER_STATS, so the set of counters is defined in a single place.
 * @see stats_macros.h
 */
struct HealthCheckFilterStats {
  ALL_HEALTH_CHECK_FILTER_STATS(GENERATE_COUNTER_STRUCT)

  /**
   * Builds a stats struct whose counters are named under "<prefix>health_check.".
   * @param prefix caller-supplied stat prefix; "health_check." is appended to it verbatim.
   * @param scope stats scope handed to POOL_COUNTER_PREFIX for every counter in the list.
   * @return a HealthCheckFilterStats with every listed counter initialized.
   */
  static HealthCheckFilterStats generateStats(const std::string& prefix, Stats::Scope& scope) {
    const std::string health_check_prefix = prefix + "health_check.";
    return HealthCheckFilterStats{
        ALL_HEALTH_CHECK_FILTER_STATS(POOL_COUNTER_PREFIX(scope, health_check_prefix))};
  }
};

using HealthCheckFilterStatsSharedPtr = std::shared_ptr<HealthCheckFilterStats>;

/**
* Shared cache manager used by all instances of a health check filter configuration as well as
* all threads. This sets up a timer that will invalidate the cached response code and allow some
Expand Down Expand Up @@ -48,13 +78,11 @@ class HealthCheckCacheManager {
};

using HealthCheckCacheManagerSharedPtr = std::shared_ptr<HealthCheckCacheManager>;

using HeaderDataVectorSharedPtr = std::shared_ptr<std::vector<Http::HeaderUtility::HeaderDataPtr>>;
using ClusterMinHealthyPercentages = std::map<std::string, double>;
using ClusterMinHealthyPercentagesConstSharedPtr =
std::shared_ptr<const ClusterMinHealthyPercentages>;

using HeaderDataVectorSharedPtr = std::shared_ptr<std::vector<Http::HeaderUtility::HeaderDataPtr>>;

/**
* Health check responder filter.
*/
Expand All @@ -63,10 +91,11 @@ class HealthCheckFilter : public Http::StreamFilter {
HealthCheckFilter(Server::Configuration::ServerFactoryContext& context, bool pass_through_mode,
HealthCheckCacheManagerSharedPtr cache_manager,
HeaderDataVectorSharedPtr header_match_data,
ClusterMinHealthyPercentagesConstSharedPtr cluster_min_healthy_percentages)
ClusterMinHealthyPercentagesConstSharedPtr cluster_min_healthy_percentages,
HealthCheckFilterStatsSharedPtr stats)
: context_(context), pass_through_mode_(pass_through_mode), cache_manager_(cache_manager),
header_match_data_(std::move(header_match_data)),
cluster_min_healthy_percentages_(cluster_min_healthy_percentages) {}
cluster_min_healthy_percentages_(cluster_min_healthy_percentages), stats_(stats) {}

// Http::StreamFilterBase
void onDestroy() override {}
Expand Down Expand Up @@ -108,6 +137,7 @@ class HealthCheckFilter : public Http::StreamFilter {
HealthCheckCacheManagerSharedPtr cache_manager_;
const HeaderDataVectorSharedPtr header_match_data_;
ClusterMinHealthyPercentagesConstSharedPtr cluster_min_healthy_percentages_;
const HealthCheckFilterStatsSharedPtr stats_;
};

} // namespace HealthCheck
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,37 @@ TEST_P(HealthCheckIntegrationTest, HealthCheckWithBufferFilter) {
EXPECT_EQ("200", request("http", "GET", "/healthcheck", response));
}

// Verifies that the request_total / ok / failed counters track health check requests as the
// server is toggled between failing and passing through the admin endpoints.
TEST_P(HealthCheckIntegrationTest, HealthCheckStats) {
  DISABLE_IF_ADMIN_DISABLED;
  initialize();

  // Checks the current values of the three counters this test follows.
  const auto expect_counters = [this](uint64_t total, uint64_t ok, uint64_t failed) {
    EXPECT_EQ(total, test_server_->counter("http.config_test.health_check.request_total")->value());
    EXPECT_EQ(ok, test_server_->counter("http.config_test.health_check.ok")->value());
    EXPECT_EQ(failed, test_server_->counter("http.config_test.health_check.failed")->value());
  };

  // Nothing has been requested yet, so every counter starts at zero.
  expect_counters(0, 0, 0);

  // A passing health check bumps the request and ok counters only.
  BufferingStreamDecoderPtr response;
  EXPECT_EQ("200", request("http", "GET", "/healthcheck", response));
  expect_counters(1, 1, 0);

  // After forcing failure through the admin endpoint, the next check is counted as failed.
  EXPECT_EQ("200", request("admin", "POST", "/healthcheck/fail", response));
  EXPECT_EQ("503", request("http", "GET", "/healthcheck", response));
  expect_counters(2, 1, 1);

  // Once health is restored, the following check is counted as ok again.
  EXPECT_EQ("200", request("admin", "POST", "/healthcheck/ok", response));
  EXPECT_EQ("200", request("http", "GET", "/healthcheck", response));
  expect_counters(3, 2, 1);
}

INSTANTIATE_TEST_SUITE_P(Protocols, HealthCheckIntegrationTest,
testing::ValuesIn(HttpProtocolIntegrationTest::getProtocolTestParams(
{Http::CodecType::HTTP1, Http::CodecType::HTTP2},
Expand Down
Loading