From 2e8c8778d0371abf18a6720d23174922a3436a29 Mon Sep 17 00:00:00 2001 From: Xinyi Zou Date: Mon, 12 Aug 2024 22:08:30 +0800 Subject: [PATCH] 1 --- be/src/agent/cgroup_cpu_ctl.cpp | 38 +++++ be/src/agent/cgroup_cpu_ctl.h | 6 + be/src/common/cgroup_memory_ctl.cpp | 183 +++++++++++++++++++++++ be/src/common/cgroup_memory_ctl.h | 36 +++++ be/src/util/cgroup_util.cpp | 215 ++++++++++++++-------------- be/src/util/cgroup_util.h | 62 +++++--- be/src/util/mem_info.cpp | 57 ++------ be/test/util/cgroup_util_test.cpp | 17 +-- 8 files changed, 435 insertions(+), 179 deletions(-) create mode 100644 be/src/common/cgroup_memory_ctl.cpp create mode 100644 be/src/common/cgroup_memory_ctl.h diff --git a/be/src/agent/cgroup_cpu_ctl.cpp b/be/src/agent/cgroup_cpu_ctl.cpp index e1bdd1c7207ec8..3f6135928dd765 100644 --- a/be/src/agent/cgroup_cpu_ctl.cpp +++ b/be/src/agent/cgroup_cpu_ctl.cpp @@ -20,8 +20,10 @@ #include #include +#include #include +#include "util/cgroup_util.h" #include "util/defer_op.h" namespace doris { @@ -240,4 +242,40 @@ Status CgroupV1CpuCtl::delete_unused_cgroup_path(std::set& used_wg_ids return Status::OK(); } +Status CgroupV1CpuCtl::find_cgroup_cpu_limit(float* cpu_count) { + if (!CGroupUtil::cgroupsv1_enable()) { + return Status::InvalidArgument("cgroup is not enabled!"); + } + int64_t quota; + int64_t period; + std::string cgroup_path; + if (!CGroupUtil::find_abs_cgroupv1_path("cpu", &cgroup_path).ok()) { + RETURN_IF_ERROR(CGroupUtil::find_abs_cgroupv1_path("cpuacct", &cgroup_path)); + } + std::filesystem::path cfs_quota_filename = cgroup_path + "/cpu.cfs_quota_us"; + RETURN_IF_ERROR(CGroupUtil::read_int_line_from_cgroup_file(cfs_quota_filename, "a)); + if (quota <= 0) { + *cpu_count = -1; + return Status::OK(); + } + std::filesystem::path cfs_period_filename = cgroup_path + "/cpu.cfs_period_us"; + RETURN_IF_ERROR(CGroupUtil::read_int_line_from_cgroup_file(cfs_period_filename, &period)); + if (quota <= period) { + return Status::InvalidArgument("quota <= period"); + } + *cpu_count = float(quota) / float(period); + if (*cpu_count >= FLT_MAX) { + return Status::InvalidArgument("unknown"); + } + return Status::OK(); +} + +std::string CgroupV1CpuCtl::debug_string() { + float cpu_limit; + auto cpu_limit_st = find_cgroup_cpu_limit(&cpu_limit); + return fmt::format("Process CGroup Memory Info: memory limit: {}, memory usage: {}", + cpu_limit_st.ok() ? (cpu_limit > 0 ? std::to_string(cpu_limit) : "unlimited") + : cpu_limit_st.to_string()); +} + } // namespace doris diff --git a/be/src/agent/cgroup_cpu_ctl.h b/be/src/agent/cgroup_cpu_ctl.h index b5f8d2d5d80e67..b02fd9ed53b394 100644 --- a/be/src/agent/cgroup_cpu_ctl.h +++ b/be/src/agent/cgroup_cpu_ctl.h @@ -104,6 +104,12 @@ class CgroupV1CpuCtl : public CgroupCpuCtl { Status delete_unused_cgroup_path(std::set& used_wg_ids) override; + // Determines the CGroup cpu cores limit from the current processes' cgroup. + static Status find_cgroup_cpu_limit(float* cpu_count); + + // Returns a human-readable string with information about CGroups. + static std::string debug_string(); + private: std::string _cgroup_v1_cpu_query_path; std::string _cgroup_v1_cpu_tg_path; // workload group path diff --git a/be/src/common/cgroup_memory_ctl.cpp b/be/src/common/cgroup_memory_ctl.cpp new file mode 100644 index 00000000000000..43506e68660f9c --- /dev/null +++ b/be/src/common/cgroup_memory_ctl.cpp @@ -0,0 +1,183 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/CgroupsMemoryUsageObserver.cpp +// and modified by Doris + +#include "common/cgroup_memory_ctl.h" + +#include +#include +#include +#include + +#include "common/exception.h" +#include "common/status.h" +#include "util/cgroup_util.h" + +namespace doris { + +// Is the memory controller of cgroups v2 enabled on the system? +// Assumes that cgroupsv2_enable() is enabled. +bool cgroupsv2_memory_controller_enabled() { +#if defined(OS_LINUX) + assert(cgroupsv2_enable()); + // According to https://docs.kernel.org/admin-guide/cgroup-v2.html, file "cgroup.controllers" defines which controllers are available + // for the current + child cgroups. The set of available controllers can be restricted from level to level using file + // "cgroups.subtree_control". It is therefore sufficient to check the bottom-most nested "cgroup.controllers" file. + std::string cgroup = CGroupUtil::cgroupv2_of_process(); + auto cgroup_dir = cgroup.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup); + std::ifstream controllers_file(cgroup_dir / "cgroup.controllers"); + if (!controllers_file.is_open()) { + return false; + } + std::string controllers; + std::getline(controllers_file, controllers); + return controllers.find("memory") != std::string::npos; +#else + return false; +#endif +} + +struct CgroupsV1Reader : CGroupUtil::ICgroupsReader { + explicit CgroupsV1Reader(std::filesystem::path mount_file_dir) + : _mount_file_dir(std::move(mount_file_dir)) {} + + uint64_t read_memory_limit() override { + int64_t value; + auto st = CGroupUtil::read_int_line_from_cgroup_file( + (_mount_file_dir / "memory.limit_in_bytes"), &value); + if (st.ok()) { + throw doris::Exception(doris::ErrorCode::END_OF_FILE, + "Cannot read cgroupv1 memory.limit_in_bytes"); + } + return value; + } + + uint64_t read_memory_usage() override { + std::unordered_map metrics_map; + CGroupUtil::read_int_metric_from_cgroup_file((_mount_file_dir / "memory.stat"), + metrics_map); + return metrics_map["rss"]; + } + +private: + std::filesystem::path _mount_file_dir; +}; + +struct CgroupsV2Reader : CGroupUtil::ICgroupsReader { + explicit CgroupsV2Reader(std::filesystem::path mount_file_dir) + : _mount_file_dir(std::move(mount_file_dir)) {} + + uint64_t read_memory_limit() override { + // int64_t value; + // auto st = CGroupUtil::read_int_line_from_cgroup_file((_mount_file_dir / "memory.limit_in_bytes"), &value); + // if (st.ok()) { + // throw doris::Exception(doris::ErrorCode::END_OF_FILE, "Cannot read cgroupv1 memory.limit_in_bytes"); + // } + // return value; + return 0; + } + + uint64_t read_memory_usage() override { + int64_t mem_usage = 0; + // memory.current contains a single number + // the reason why we subtract it described here: https://github.com/ClickHouse/ClickHouse/issues/64652#issuecomment-2149630667 + auto st = CGroupUtil::read_int_line_from_cgroup_file((_mount_file_dir / "memory.current"), + &mem_usage); + if (st.ok()) { + throw doris::Exception(doris::ErrorCode::END_OF_FILE, + "Cannot read cgroupv2 memory.current"); + } + std::unordered_map metrics_map; + CGroupUtil::read_int_metric_from_cgroup_file((_mount_file_dir / "memory.stat"), + metrics_map); + mem_usage -= metrics_map["inactive_file"]; + if (mem_usage < 0) { + throw doris::Exception(doris::ErrorCode::END_OF_FILE, "Negative memory usage"); + } + return mem_usage; + } + +private: + std::filesystem::path _mount_file_dir; +}; + +std::pair get_cgroups_path() { + if (CGroupUtil::cgroupsv2_enable() && cgroupsv2_memory_controller_enabled()) { + auto v2_path = CGroupUtil::get_cgroupsv2_path("memory.stat"); + if (v2_path.has_value()) { + return {*v2_path, CGroupUtil::CgroupsVersion::V2}; + } + } + + std::string cgroup_path; + auto st = CGroupUtil::find_abs_cgroupv1_path("memory", &cgroup_path); + if (st.ok()) { + return {cgroup_path, CGroupUtil::CgroupsVersion::V1}; + } + + throw doris::Exception(doris::ErrorCode::END_OF_FILE, + "Cannot find cgroups v1 or v2 current memory file"); +} + +std::shared_ptr get_cgroups_reader() { + const auto [cgroup_path, version] = get_cgroups_path(); + + if (version == CGroupUtil::CgroupsVersion::V2) { + return std::make_shared(cgroup_path); + } else { + return std::make_shared(cgroup_path); + } +} + +Status CGroupMemoryCtl::find_cgroup_mem_limit(int64_t* bytes) { + try { + *bytes = get_cgroups_reader()->read_memory_limit(); + return Status::OK(); + } catch (const doris::Exception& e) { + return Status::IOError(e.to_string()); + } +} + +Status CGroupMemoryCtl::find_cgroup_mem_usage(int64_t* bytes) { + try { + *bytes = get_cgroups_reader()->read_memory_usage(); + return Status::OK(); + } catch (const doris::Exception& e) { + return Status::IOError(e.to_string()); + } +} + +std::string CGroupMemoryCtl::debug_string() { + const auto [cgroup_path, version] = get_cgroups_path(); + + int64_t mem_limit; + auto mem_limit_st = find_cgroup_mem_limit(&mem_limit); + + int64_t mem_usage; + auto mem_usage_st = find_cgroup_mem_usage(&mem_usage); + + return fmt::format( + "Process CGroup Memory Info (cgroups path: {}, cgroup version: {}): memory limit: {}, " + "memory usage: {}", + cgroup_path, (version == CGroupUtil::CgroupsVersion::V1) ? "v1" : "v2", + mem_limit_st.ok() ? std::to_string(mem_limit) : mem_limit_st.to_string(), + mem_usage_st.ok() ? std::to_string(mem_usage) : mem_usage_st.to_string()); +} + +} // namespace doris diff --git a/be/src/common/cgroup_memory_ctl.h b/be/src/common/cgroup_memory_ctl.h new file mode 100644 index 00000000000000..46c2354bc6e727 --- /dev/null +++ b/be/src/common/cgroup_memory_ctl.h @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "common/status.h" + +namespace doris { + +class CGroupMemoryCtl { +public: + // Determines the CGroup memory limit from the current processes' cgroup. + // If the limit is more than INT64_MAX, INT64_MAX is returned (since that is + // effectively unlimited anyway). Does not take into account memory limits + // set on any ancestor CGroups. + static Status find_cgroup_mem_limit(int64_t* bytes); + + // https://serverfault.com/questions/902009/the-memory-usage-reported-in-cgroup-differs-from-the-free-command + static Status find_cgroup_mem_usage(int64_t* bytes); + + // Returns a human-readable string with information about CGroups. + static std::string debug_string(); +}; +} // namespace doris diff --git a/be/src/util/cgroup_util.cpp b/be/src/util/cgroup_util.cpp index 9ad78696a6f12c..90ca83df28e248 100644 --- a/be/src/util/cgroup_util.cpp +++ b/be/src/util/cgroup_util.cpp @@ -18,10 +18,7 @@ #include "util/cgroup_util.h" #include -#include #include -#include -#include #include #include @@ -40,7 +37,25 @@ using std::pair; namespace doris { -Status CGroupUtil::find_global_cgroup(const string& subsystem, string* path) { +bool CGroupUtil::cgroupsv1_enable() { + bool exists = true; + Status st = io::global_local_filesystem()->exists("/proc/cgroups", &exists); + return st.ok() && exists; +} + +bool CGroupUtil::cgroupsv2_enable() { +#if defined(OS_LINUX) + // This file exists iff the host has cgroups v2 enabled. + auto controllers_file = default_cgroups_mount / "cgroup.controllers"; + bool exists = true; + Status st = io::global_local_filesystem()->exists(controllers_file, &exists); + return st.ok() && exists; +#else + return false; +#endif +} + +Status CGroupUtil::find_global_cgroupv1(const string& subsystem, string* path) { std::ifstream proc_cgroups("/proc/self/cgroup", std::ios::in); string line; while (true) { @@ -82,24 +97,7 @@ static Status unescape_path(const string& escaped, string* unescaped) { return Status::OK(); } -static Status read_cgroup_value(const string& limit_file_path, int64_t* val) { - std::ifstream limit_file(limit_file_path, std::ios::in); - string line; - getline(limit_file, line); - if (limit_file.fail() || limit_file.bad()) { - return Status::IOError("Error reading {}: {}", limit_file_path, get_str_err_msg()); - } - StringParser::ParseResult pr; - // Parse into an int64_t If it overflows, returning the max value of int64_t is ok because that - // is effectively unlimited. - *val = StringParser::string_to_int(line.c_str(), line.size(), &pr); - if ((pr != StringParser::PARSE_SUCCESS && pr != StringParser::PARSE_OVERFLOW)) { - return Status::InvalidArgument("Failed to parse {} as int64: '{}'", limit_file_path, line); - } - return Status::OK(); -} - -Status CGroupUtil::find_cgroup_mounts(const string& subsystem, pair* result) { +Status CGroupUtil::find_cgroupv1_mounts(const string& subsystem, pair* result) { std::ifstream mountinfo("/proc/self/mountinfo", std::ios::in); string line; while (true) { @@ -118,14 +116,18 @@ Status CGroupUtil::find_cgroup_mounts(const string& subsystem, pair fields = Split(line, " ", SkipWhitespace()); if (fields.size() < 7) { return Status::InvalidArgument( "Could not parse line from /proc/self/mountinfo - had {} > 7 tokens: '{}'", fields.size(), line); } - if (fields[fields.size() - 3] != "cgroup") continue; + if (fields[fields.size() - 3] != "cgroup") { + continue; + } // This is a cgroup mount. Check if it's the mount we're looking for. std::vector cgroup_opts = Split(fields[fields.size() - 1], ",", SkipWhitespace()); auto it = std::find(cgroup_opts.begin(), cgroup_opts.end(), subsystem); @@ -138,16 +140,21 @@ Status CGroupUtil::find_cgroup_mounts(const string& subsystem, pair paths; - RETURN_IF_ERROR(find_cgroup_mounts(subsystem, &paths)); + RETURN_IF_ERROR(find_cgroupv1_mounts(subsystem, &paths)); const string& mount_path = paths.first; const string& system_path = paths.second; if (path->compare(0, system_path.size(), system_path) != 0) { @@ -158,98 +165,96 @@ Status CGroupUtil::find_abs_cgroup_path(const string& subsystem, string* path) { return Status::OK(); } -Status CGroupUtil::find_cgroup_mem_limit(int64_t* bytes) { - if (!enable()) { - return Status::InvalidArgument("cgroup is not enabled!"); +std::string CGroupUtil::cgroupv2_of_process() { +#if defined(OS_LINUX) + assert(cgroupsv2_enable()); + // All PIDs assigned to a cgroup are in /sys/fs/cgroups/{cgroup_name}/cgroup.procs + // A simpler way to get the membership is: + std::ifstream cgroup_name_file("/proc/self/cgroup"); + if (!cgroup_name_file.is_open()) { + return ""; + } + // With cgroups v2, there will be a *single* line with prefix "0::/" + // (see https://docs.kernel.org/admin-guide/cgroup-v2.html) + std::string cgroup; + std::getline(cgroup_name_file, cgroup); + static const std::string v2_prefix = "0::/"; + if (!cgroup.starts_with(v2_prefix)) { + return ""; } - string cgroup_path; - RETURN_IF_ERROR(find_abs_cgroup_path("memory", &cgroup_path)); - string limit_file_path = cgroup_path + "/memory.limit_in_bytes"; - return read_cgroup_value(limit_file_path, bytes); + cgroup = cgroup.substr(v2_prefix.length()); + return cgroup; +#else + return ""; +#endif } -Status CGroupUtil::find_cgroup_mem_usage(int64_t* bytes) { - if (!enable()) { - return Status::InvalidArgument("cgroup is not enabled!"); +std::optional CGroupUtil::get_cgroupsv2_path(const std::string& subsystem) { + if (!CGroupUtil::cgroupsv2_enable()) { + return {}; } - string cgroup_path; - RETURN_IF_ERROR(find_abs_cgroup_path("memory", &cgroup_path)); - string usage_file_path = cgroup_path + "/memory.usage_in_bytes"; - return read_cgroup_value(usage_file_path, bytes); -} -Status CGroupUtil::find_cgroup_mem_info(std::string* file_path) { - if (!enable()) { - return Status::InvalidArgument("cgroup is not enabled!"); + std::string cgroup = CGroupUtil::cgroupv2_of_process(); + auto current_cgroup = cgroup.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup); + + // Return the bottom-most nested current memory file. If there is no such file at the current + // level, try again at the parent level as memory settings are inherited. + while (current_cgroup != default_cgroups_mount.parent_path()) { + if (std::filesystem::exists(current_cgroup / subsystem)) { + return {current_cgroup}; + } + current_cgroup = current_cgroup.parent_path(); } - string cgroup_path; - RETURN_IF_ERROR(find_abs_cgroup_path("memory", &cgroup_path)); - *file_path = cgroup_path + "/memory.stat"; - return Status::OK(); + return {}; } -Status CGroupUtil::find_cgroup_cpu_limit(float* cpu_count) { - if (!enable()) { - return Status::InvalidArgument("cgroup is not enabled!"); - } - int64_t quota; - int64_t period; - string cgroup_path; - if (!find_abs_cgroup_path("cpu", &cgroup_path).ok()) { - RETURN_IF_ERROR(find_abs_cgroup_path("cpuacct", &cgroup_path)); - } - string cfs_quota_filename = cgroup_path + "/cpu.cfs_quota_us"; - RETURN_IF_ERROR(read_cgroup_value(cfs_quota_filename, "a)); - if (quota <= 0) { - *cpu_count = -1; - return Status::OK(); - } - string cfs_period_filename = cgroup_path + "/cpu.cfs_period_us"; - RETURN_IF_ERROR(read_cgroup_value(cfs_period_filename, &period)); - if (quota <= period) { - return Status::InvalidArgument("quota <= period"); +Status CGroupUtil::read_int_line_from_cgroup_file(const std::filesystem::path& file_path, + int64_t* val) { + std::ifstream file_stream(file_path, std::ios::in); + string line; + getline(file_stream, line); + if (file_stream.fail() || file_stream.bad()) { + return Status::IOError("Error reading {}: {}", file_path.string(), get_str_err_msg()); } - *cpu_count = float(quota) / float(period); - if (*cpu_count >= FLT_MAX) { - return Status::InvalidArgument("unknown"); + StringParser::ParseResult pr; + // Parse into an int64_t If it overflows, returning the max value of int64_t is ok because that + // is effectively unlimited. + *val = StringParser::string_to_int(line.c_str(), line.size(), &pr); + if ((pr != StringParser::PARSE_SUCCESS && pr != StringParser::PARSE_OVERFLOW)) { + return Status::InvalidArgument("Failed to parse {} as int64: '{}'", file_path.string(), + line); } return Status::OK(); } -std::string CGroupUtil::debug_string() { - if (!enable()) { - return std::string("cgroup is not enabled!"); - } - string mem_limit_str; - int64_t mem_limit; - Status status = find_cgroup_mem_limit(&mem_limit); - if (status.ok()) { - mem_limit_str = strings::Substitute("$0", mem_limit); - } else { - mem_limit_str = status.to_string(); - } - string cpu_limit_str; - float cpu_limit; - status = find_cgroup_cpu_limit(&cpu_limit); - if (status.ok()) { - if (cpu_limit > 0) { - std::stringstream stream; - stream << std::fixed << std::setprecision(1) << cpu_limit; - cpu_limit_str = stream.str(); - } else { - cpu_limit_str = "unlimited"; +void CGroupUtil::read_int_metric_from_cgroup_file( + const std::filesystem::path& file_path, + std::unordered_map& metrics_map) { + std::ifstream cgroup_file(file_path, std::ios::in); + std::string line; + while (cgroup_file.good() && !cgroup_file.eof()) { + getline(cgroup_file, line); + std::vector fields = strings::Split(line, " ", strings::SkipWhitespace()); + if (fields.size() < 2) { + continue; } - } else { - cpu_limit_str = status.to_string(); - } - return strings::Substitute("Process CGroup Info: memory.limit_in_bytes=$0, cpu cfs limits: $1", - mem_limit_str, cpu_limit_str); -} + std::string key = fields[0].substr(0, fields[0].size()); -bool CGroupUtil::enable() { - bool exists = true; - Status st = io::global_local_filesystem()->exists("/proc/cgroups", &exists); - return st.ok() && exists; + StringParser::ParseResult result; + auto value = + StringParser::string_to_int(fields[1].data(), fields[1].size(), &result); + + if (result == StringParser::PARSE_SUCCESS) { + if (fields.size() == 2) { + metrics_map[key] = value; + } else if (fields[2] == "kB") { + metrics_map[key] = value * 1024L; + } + } + } + if (cgroup_file.is_open()) { + cgroup_file.close(); + } } } // namespace doris diff --git a/be/src/util/cgroup_util.h b/be/src/util/cgroup_util.h index 2152720ccdd1da..3ef48ec8a476a7 100644 --- a/be/src/util/cgroup_util.h +++ b/be/src/util/cgroup_util.h @@ -18,50 +18,74 @@ #pragma once #include +#include #include #include #include "common/status.h" namespace doris { + +#if defined(OS_LINUX) +// I think it is possible to mount the cgroups hierarchy somewhere else (e.g. when in containers). +// /sys/fs/cgroup was still symlinked to the actual mount in the cases that I have seen. +static inline const std::filesystem::path default_cgroups_mount = "/sys/fs/cgroup"; +#endif + class CGroupUtil { public: - // Determines the CGroup memory limit from the current processes' cgroup. - // If the limit is more than INT64_MAX, INT64_MAX is returned (since that is - // effectively unlimited anyway). Does not take into account memory limits - // set on any ancestor CGroups. - static Status find_cgroup_mem_limit(int64_t* bytes); + enum class CgroupsVersion : uint8_t { V1, V2 }; - // memory.usage_in_bytes ~= free.used + free.(buff/cache) - (buff) - // https://serverfault.com/questions/902009/the-memory-usage-reported-in-cgroup-differs-from-the-free-command - static Status find_cgroup_mem_usage(int64_t* bytes); - static Status find_cgroup_mem_info(std::string* file_path); + // Inherited by cgroup v1 and v2 + struct ICgroupsReader { + virtual ~ICgroupsReader() = default; - // Determines the CGroup cpu cores limit from the current processes' cgroup. - static Status find_cgroup_cpu_limit(float* cpu_count); + virtual uint64_t read_memory_limit() = 0; - // Returns a human-readable string with information about CGroups. - static std::string debug_string(); + virtual uint64_t read_memory_usage() = 0; + }; // detect if cgroup is enabled - static bool enable(); + static bool cgroupsv1_enable(); + static bool cgroupsv2_enable(); -private: // return the global cgroup path of subsystem like 12:memory:/user.slice -> user.slice - static Status find_global_cgroup(const std::string& subsystem, std::string* path); + static Status find_global_cgroupv1(const std::string& subsystem, std::string* path); // Returns the absolute path to the CGroup from inside the container. // E.g. if this process belongs to // /sys/fs/cgroup/memory/kubepods/burstable/pod-, which is mounted at // /sys/fs/cgroup/memory inside the container, this function returns // "/sys/fs/cgroup/memory". - static Status find_abs_cgroup_path(const std::string& subsystem, std::string* path); + static Status find_abs_cgroupv1_path(const std::string& subsystem, std::string* path); // Figures out the mapping of the cgroup root from the container's point of view to // the full path relative to the system-wide cgroups outside of the container. // E.g. /sys/fs/cgroup/memory/kubepods/burstable/pod- may be mounted at // /sys/fs/cgroup/memory inside the container. In that case this function would return // ("/sys/fs/cgroup/memory", "kubepods/burstable/pod-"). - static Status find_cgroup_mounts(const std::string& subsystem, - std::pair* result); + static Status find_cgroupv1_mounts(const std::string& subsystem, + std::pair* result); + + // Which cgroup does the process belong to? + // Returns an empty string if the cgroup cannot be determined. + // Assumes that cgroupsV2Enabled() is enabled. + static std::string cgroupv2_of_process(); + + // Caveats: + // - All of the logic in this file assumes that the current process is the only process in the + // containing cgroup (or more precisely: the only process with significant memory consumption). + // If this is not the case, then other processe's memory consumption may affect the internal + // memory tracker ... + // - Cgroups v1 and v2 allow nested cgroup hierarchies. As v1 is deprecated for over half a + // decade and will go away at some point, hierarchical detection is only implemented for v2. + // - I did not test what happens if a host has v1 and v2 simultaneously enabled. I believe such + // systems existed only for a short transition period. + static std::optional get_cgroupsv2_path(const std::string& subsystem); + + static Status read_int_line_from_cgroup_file(const std::filesystem::path& file_path, + int64_t* val); + static void read_int_metric_from_cgroup_file( + const std::filesystem::path& file_path, + std::unordered_map& metrics_map); }; } // namespace doris diff --git a/be/src/util/mem_info.cpp b/be/src/util/mem_info.cpp index d0703c985ea884..1c48f09c2ae2ba 100644 --- a/be/src/util/mem_info.cpp +++ b/be/src/util/mem_info.cpp @@ -20,6 +20,8 @@ #include "mem_info.h" +#include "gutil/strings/split.h" + #ifdef __APPLE__ #include #endif @@ -34,11 +36,10 @@ #include #include #include -#include +#include "common/cgroup_memory_ctl.h" #include "common/config.h" #include "common/status.h" -#include "gutil/strings/split.h" #include "runtime/memory/global_memory_arbitrator.h" #include "util/cgroup_util.h" #include "util/parse_util.h" @@ -75,7 +76,6 @@ std::atomic MemInfo::_s_virtual_memory_used = 0; int64_t MemInfo::_s_cgroup_mem_limit = std::numeric_limits::max(); int64_t MemInfo::_s_cgroup_mem_usage = std::numeric_limits::min(); -static std::unordered_map _s_cgroup_mem_info_bytes; bool MemInfo::_s_cgroup_mem_refresh_state = false; int64_t MemInfo::_s_cgroup_mem_refresh_wait_times = 0; @@ -179,7 +179,7 @@ void MemInfo::refresh_proc_meminfo() { if (result == StringParser::PARSE_SUCCESS) { if (fields.size() == 2) { _mem_info_bytes[key] = mem_value; - } else if (fields[2].compare("kB") == 0) { + } else if (fields[2] == "kB") { _mem_info_bytes[key] = mem_value * 1024L; } } @@ -194,65 +194,28 @@ void MemInfo::refresh_proc_meminfo() { int64_t cgroup_mem_usage = -1; std::string cgroup_mem_info_file_path; _s_cgroup_mem_refresh_state = true; - Status status = CGroupUtil::find_cgroup_mem_limit(&cgroup_mem_limit); + Status status = CGroupMemoryCtl::find_cgroup_mem_limit(&cgroup_mem_limit); if (!status.ok() || cgroup_mem_limit <= 0) { _s_cgroup_mem_refresh_state = false; } - status = CGroupUtil::find_cgroup_mem_usage(&cgroup_mem_usage); + status = CGroupMemoryCtl::find_cgroup_mem_usage(&cgroup_mem_usage); if (!status.ok() || cgroup_mem_usage <= 0) { _s_cgroup_mem_refresh_state = false; } - status = CGroupUtil::find_cgroup_mem_info(&cgroup_mem_info_file_path); - if (status.ok()) { - std::ifstream cgroup_meminfo(cgroup_mem_info_file_path, std::ios::in); - std::string line; - - while (cgroup_meminfo.good() && !cgroup_meminfo.eof()) { - getline(cgroup_meminfo, line); - std::vector fields = - strings::Split(line, " ", strings::SkipWhitespace()); - if (fields.size() < 2) { - continue; - } - std::string key = fields[0].substr(0, fields[0].size()); - - StringParser::ParseResult result; - auto mem_value = StringParser::string_to_int(fields[1].data(), - fields[1].size(), &result); - - if (result == StringParser::PARSE_SUCCESS) { - if (fields.size() == 2) { - _s_cgroup_mem_info_bytes[key] = mem_value; - } else if (fields[2] == "kB") { - _s_cgroup_mem_info_bytes[key] = mem_value * 1024L; - } - } - } - if (cgroup_meminfo.is_open()) { - cgroup_meminfo.close(); - } - } else { - _s_cgroup_mem_refresh_state = false; - } if (_s_cgroup_mem_refresh_state) { _s_cgroup_mem_limit = cgroup_mem_limit; - // https://serverfault.com/questions/902009/the-memory-usage-reported-in-cgroup-differs-from-the-free-command - // memory.usage_in_bytes ~= free.used + free.(buff/cache) - (buff) - // so, memory.usage_in_bytes - memory.meminfo["Cached"] - _s_cgroup_mem_usage = cgroup_mem_usage - _s_cgroup_mem_info_bytes["cache"]; + _s_cgroup_mem_usage = cgroup_mem_usage; // wait 10s, 100 * 100ms, avoid too frequently. _s_cgroup_mem_refresh_wait_times = -100; LOG(INFO) << "Refresh cgroup memory win, refresh again after 10s, cgroup mem limit: " - << _s_cgroup_mem_limit << ", cgroup mem usage: " << _s_cgroup_mem_usage - << ", cgroup mem info cached: " << _s_cgroup_mem_info_bytes["cache"]; + << _s_cgroup_mem_limit << ", cgroup mem usage: " << _s_cgroup_mem_usage; } else { // find cgroup failed, wait 300s, 1000 * 100ms. _s_cgroup_mem_refresh_wait_times = -3000; LOG(INFO) << "Refresh cgroup memory failed, refresh again after 300s, cgroup mem limit: " - << _s_cgroup_mem_limit << ", cgroup mem usage: " << _s_cgroup_mem_usage - << ", cgroup mem info cached: " << _s_cgroup_mem_info_bytes["cache"]; + << _s_cgroup_mem_limit << ", cgroup mem usage: " << _s_cgroup_mem_usage; } } else { if (config::enable_use_cgroup_memory_info) { @@ -435,7 +398,7 @@ std::string MemInfo::debug_string() { stream << "Physical Memory: " << PrettyPrinter::print(_s_physical_mem, TUnit::BYTES) << std::endl; stream << "Memory Limt: " << PrettyPrinter::print(_s_mem_limit, TUnit::BYTES) << std::endl; - stream << "CGroup Info: " << doris::CGroupUtil::debug_string() << std::endl; + stream << "CGroup Info: " << doris::CGroupMemoryCtl::debug_string() << std::endl; return stream.str(); } diff --git a/be/test/util/cgroup_util_test.cpp b/be/test/util/cgroup_util_test.cpp index 1553e5f8163188..d7ed3bb8ce0c50 100644 --- a/be/test/util/cgroup_util_test.cpp +++ b/be/test/util/cgroup_util_test.cpp @@ -23,6 +23,7 @@ #include +#include "common/cgroup_memory_ctl.h" #include "gtest/gtest_pred_impl.h" namespace doris { @@ -30,16 +31,16 @@ namespace doris { class CGroupUtilTest : public ::testing::Test { protected: CGroupUtilTest() {} - virtual ~CGroupUtilTest() {} + ~CGroupUtilTest() override = default; }; + TEST_F(CGroupUtilTest, memlimit) { - int64_t bytes; - float cpu_counts; - CGroupUtil cgroup_util; - LOG(INFO) << cgroup_util.debug_string(); - Status status1 = cgroup_util.find_cgroup_mem_limit(&bytes); - Status status2 = cgroup_util.find_cgroup_cpu_limit(&cpu_counts); - if (cgroup_util.enable()) { + LOG(INFO) << CGroupMemoryCtl::debug_string(); + int64_t mem_limit; + int64_t mem_usage; + auto status1 = CGroupMemoryCtl::find_cgroup_mem_limit(&mem_limit); + auto status2 = CGroupMemoryCtl::find_cgroup_mem_usage(&mem_usage); + if (CGroupUtil::cgroupsv1_enable() || CGroupUtil::cgroupsv2_enable()) { std::ifstream file("/proc/self/cgroup"); if (file.peek() == std::ifstream::traits_type::eof()) { EXPECT_FALSE(status1.ok());