From e6e06ff3d56bc8a6a8ae6ddd7f7e4ab99976984b Mon Sep 17 00:00:00 2001 From: Hongkun Xu Date: Sun, 30 Jun 2024 12:56:14 +0800 Subject: [PATCH] [fix](cgroup memory) Correct cgroup mem info cache (#36966) ## Proposed changes After upgrading to Doris 2.1.3, we noticed that the "sys available memory" reported in be.INFO continuously decreases until it falls below the warning water mark, leading to persistent garbage collection (GC) even though the actual memory usage is very low. In addition, the cache value in the cgroup mem info is always 0. I traced this to an error in how available memory is calculated from cgroup memory: 1. The memory information for a cgroup is stored in the file "memory.stat" rather than "memory.meminfo" (in fact, the "memory.meminfo" file does not exist). The files under the cgroup path are shown in screenshot 1 below. 2. The content of "memory.stat" is also shown in screenshot 1 below. image image This change consists of two steps: 1. Changed the name of the cgroup mem info file from "memory.meminfo" to "memory.stat". 2. Changed how the cache value is extracted from the cgroup mem info: the key is no longer truncated by one character, and the value is read from the "cache" entry instead of "Cached". 
Co-authored-by: Xinyi Zou --- be/src/util/cgroup_util.cpp | 2 +- be/src/util/mem_info.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/be/src/util/cgroup_util.cpp b/be/src/util/cgroup_util.cpp index a2c3e294e66c19..9ad78696a6f12c 100644 --- a/be/src/util/cgroup_util.cpp +++ b/be/src/util/cgroup_util.cpp @@ -184,7 +184,7 @@ Status CGroupUtil::find_cgroup_mem_info(std::string* file_path) { } string cgroup_path; RETURN_IF_ERROR(find_abs_cgroup_path("memory", &cgroup_path)); - *file_path = cgroup_path + "/memory.meminfo"; + *file_path = cgroup_path + "/memory.stat"; return Status::OK(); } diff --git a/be/src/util/mem_info.cpp b/be/src/util/mem_info.cpp index 72a47fa076aa7d..a3d391a00b0ab7 100644 --- a/be/src/util/mem_info.cpp +++ b/be/src/util/mem_info.cpp @@ -423,7 +423,7 @@ void MemInfo::refresh_proc_meminfo() { if (fields.size() < 2) { continue; } - std::string key = fields[0].substr(0, fields[0].size() - 1); + std::string key = fields[0].substr(0, fields[0].size()); StringParser::ParseResult result; auto mem_value = StringParser::string_to_int(fields[1].data(), @@ -449,19 +449,19 @@ void MemInfo::refresh_proc_meminfo() { // https://serverfault.com/questions/902009/the-memory-usage-reported-in-cgroup-differs-from-the-free-command // memory.usage_in_bytes ~= free.used + free.(buff/cache) - (buff) // so, memory.usage_in_bytes - memory.meminfo["Cached"] - _s_cgroup_mem_usage = cgroup_mem_usage - _s_cgroup_mem_info_bytes["Cached"]; + _s_cgroup_mem_usage = cgroup_mem_usage - _s_cgroup_mem_info_bytes["cache"]; // wait 10s, 100 * 100ms, avoid too frequently. 
_s_cgroup_mem_refresh_wait_times = -100; LOG(INFO) << "Refresh cgroup memory win, refresh again after 10s, cgroup mem limit: " << _s_cgroup_mem_limit << ", cgroup mem usage: " << _s_cgroup_mem_usage - << ", cgroup mem info cached: " << _s_cgroup_mem_info_bytes["Cached"]; + << ", cgroup mem info cached: " << _s_cgroup_mem_info_bytes["cache"]; } else { // find cgroup failed, wait 300s, 1000 * 100ms. _s_cgroup_mem_refresh_wait_times = -3000; LOG(INFO) << "Refresh cgroup memory failed, refresh again after 300s, cgroup mem limit: " << _s_cgroup_mem_limit << ", cgroup mem usage: " << _s_cgroup_mem_usage - << ", cgroup mem info cached: " << _s_cgroup_mem_info_bytes["Cached"]; + << ", cgroup mem info cached: " << _s_cgroup_mem_info_bytes["cache"]; } } else { if (config::enable_use_cgroup_memory_info) {