From 7ab8752705115055cb32019cea920527e8006a4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavol=20=C5=BD=C3=A1=C4=8Dik?= Date: Tue, 17 Sep 2024 16:17:50 +0200 Subject: [PATCH 1/3] vm: Add support for dirty_(bytes|ratio) sysctl parameters And for dirty_background_(bytes|ratio). These parameters interact with each other; it is therefore not sufficient to configure them via the sysctl plugin. For more details, see https://docs.kernel.org/admin-guide/sysctl/vm.html#dirty-bytes. Resolves: RHEL-58820 --- tuned/plugins/plugin_vm.py | 96 ++++++++++++++++++++++++++++++++++++++ tuned/utils/commands.py | 3 ++ 2 files changed, 99 insertions(+) diff --git a/tuned/plugins/plugin_vm.py b/tuned/plugins/plugin_vm.py index b8b7e918..a2f3ae75 100644 --- a/tuned/plugins/plugin_vm.py +++ b/tuned/plugins/plugin_vm.py @@ -38,11 +38,26 @@ def _get_config_options(self): "transparent_hugepages" : None, "transparent_hugepage" : None, "transparent_hugepage.defrag" : None, + "dirty_bytes" : None, + "dirty_ratio" : None, + "dirty_background_bytes" : None, + "dirty_background_ratio" : None } + @staticmethod + def _check_conflicting_dirty_options(instance, first, second): + if instance.options[first] is not None and instance.options[second] is not None: + log.error("Conflicting options '%s' and '%s', this may cause undefined behavior." 
% (first, second)) + + @staticmethod + def _proc_sys_vm_option_path(option): + return os.path.join("/proc/sys/vm", option) + def _instance_init(self, instance): instance._has_static_tuning = True instance._has_dynamic_tuning = False + self._check_conflicting_dirty_options(instance, "dirty_bytes", "dirty_ratio") + self._check_conflicting_dirty_options(instance, "dirty_background_bytes", "dirty_background_ratio") def _instance_cleanup(self, instance): pass @@ -117,3 +132,84 @@ def _get_transparent_hugepage_defrag(self): return cmd.get_active_option(cmd.read_file(sys_file)) else: return None + + def _check_twice_pagesize(self, option, int_value): + min_bytes = 2 * int(cmd.getconf("PAGESIZE")) + if int_value < min_bytes: + log.error("The value of '%s' must be at least twice the page size (%s)." % (option, min_bytes)) + return False + return True + + def _check_positive(self, option, int_value): + if int_value <= 0: + log.error("The value of '%s' must be positive." % option) + return False + return True + + def _check_ratio(self, option, int_value): + if not 0 <= int_value <= 100: + log.error("The value of '%s' must be between 0 and 100." 
% option) + return False + return True + + @command_custom("dirty_bytes") + def _dirty_bytes(self, enabling, value, verify, ignore_missing): + return self._dirty_option("dirty_bytes", "dirty_ratio", self._check_twice_pagesize, enabling, value, verify) + + @command_custom("dirty_ratio") + def _dirty_ratio(self, enabling, value, verify, ignore_missing): + return self._dirty_option("dirty_ratio", "dirty_bytes", self._check_ratio, enabling, value, verify) + + @command_custom("dirty_background_bytes") + def _dirty_background_bytes(self, enabling, value, verify, ignore_missing): + return self._dirty_option("dirty_background_bytes", "dirty_background_ratio", self._check_positive, enabling, value, verify) + + @command_custom("dirty_background_ratio") + def _dirty_background_ratio(self, enabling, value, verify, ignore_missing): + return self._dirty_option("dirty_background_ratio", "dirty_background_bytes", self._check_ratio, enabling, value, verify) + + def _dirty_option(self, option, counterpart, check_fun, enabling, value, verify): + option_path = self._proc_sys_vm_option_path(option) + counterpart_path = self._proc_sys_vm_option_path(counterpart) + option_key = self._storage_key(command_name=option) + counterpart_key = self._storage_key(command_name=counterpart) + if not os.path.isfile(option_path): + log.warning("Option '%s' is not supported on the current hardware." % option) + current_value = cmd.read_file(option_path).strip() + if verify: + return current_value == value + if enabling: + try: + int_value = int(value) + except ValueError: + log.error("The value of '%s' must be an integer." % option) + if not check_fun(option, int_value): + return None + if current_value == value: + log.info("Not setting option '%s' to '%s', already set." % (option, value)) + return value + # Backup: if the option (e.g. dirty_bytes) is currently 0, + # its counterpart (dirty_ratio) is the active one, so we + # back up that one instead. 
+ if int(current_value) == 0: + current_counterpart_value = cmd.read_file(counterpart_path).strip() + self._storage.set(counterpart_key, current_counterpart_value) + else: + self._storage.set(option_key, current_value) + log.info("Setting option '%s' to '%s'." % (option, value)) + cmd.write_to_file(option_path, value) + return value + # Rollback is analogous to the backup: if there is no backed up + # value for this option, it means that its counterpart was active + # and we have to restore that one. + old_value = self._storage.get(option_key) + old_counterpart_value = self._storage.get(counterpart_key) + if old_value is not None: + log.info("Setting option '%s' to '%s'" % (option, old_value)) + cmd.write_to_file(option_path, old_value) + elif old_counterpart_value is not None: + log.info("Setting option '%s' to '%s'" % (counterpart, old_counterpart_value)) + cmd.write_to_file(counterpart_path, old_counterpart_value) + else: + log.info("Not restoring '%s', previous value is the same or unknown." 
% option) + return None diff --git a/tuned/utils/commands.py b/tuned/utils/commands.py index c4f7c936..10d589a9 100644 --- a/tuned/utils/commands.py +++ b/tuned/utils/commands.py @@ -552,3 +552,6 @@ def tr(self, text, source_chars, dest_chars): # Checks if name contains only valid characters and has valid length or is empty string or None def is_valid_name(self, name): return not name or (all(c in consts.NAMES_ALLOWED_CHARS for c in name) and len(name) <= consts.NAMES_MAX_LENGTH) + + def getconf(self, variable): + return check_output(["getconf", variable]).decode().strip() From 155f2323a41b866217ee76b52b5648588d09f583 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavol=20=C5=BD=C3=A1=C4=8Dik?= Date: Tue, 17 Sep 2024 16:24:47 +0200 Subject: [PATCH 2/3] Adjust profiles to set dirty_(bytes|ratio) via the vm plugin --- profiles/accelerator-performance/tuned.conf | 7 ++--- profiles/latency-performance/tuned.conf | 7 ++--- profiles/mssql/tuned.conf | 4 +-- profiles/oracle/tuned.conf | 4 +-- profiles/postgresql/tuned.conf | 12 ++++----- profiles/sap-hana/tuned.conf | 4 +-- profiles/spectrumscale-ece/tuned.conf | 6 +++-- profiles/spindown-disk/tuned.conf | 4 ++- profiles/throughput-performance/tuned.conf | 27 ++++++++++--------- profiles/virtual-guest/tuned.conf | 5 ++-- profiles/virtual-host/tuned.conf | 4 +-- .../post-vars/tuned.conf | 4 +-- .../post/tuned.conf | 4 +-- .../post2/tuned.conf | 4 +-- 14 files changed, 51 insertions(+), 45 deletions(-) diff --git a/profiles/accelerator-performance/tuned.conf b/profiles/accelerator-performance/tuned.conf index a0f3da45..01f1a780 100644 --- a/profiles/accelerator-performance/tuned.conf +++ b/profiles/accelerator-performance/tuned.conf @@ -17,7 +17,7 @@ platform_profile=performance [disk] readahead=>4096 -[sysctl] +[vm] # If a workload mostly uses anonymous memory and it hits this limit, the entire # working set is buffered for I/O, and any more write buffering would require # swapping, so it's time to throttle writes until I/O can 
catch up. Workloads @@ -25,12 +25,13 @@ readahead=>4096 # # The generator of dirty data starts writeback at this percentage (system default # is 20%) -vm.dirty_ratio = 40 +dirty_ratio = 40 # Start background writeback (via writeback threads) at this percentage (system # default is 10%) -vm.dirty_background_ratio = 10 +dirty_background_ratio = 10 +[sysctl] # PID allocation wrap value. When the kernel's next PID value # reaches this value, it wraps back to a minimum PID value. # PIDs of value pid_max or larger are not allocated. diff --git a/profiles/latency-performance/tuned.conf b/profiles/latency-performance/tuned.conf index c7806023..1e044f54 100644 --- a/profiles/latency-performance/tuned.conf +++ b/profiles/latency-performance/tuned.conf @@ -14,7 +14,7 @@ min_perf_pct=100 [acpi] platform_profile=performance -[sysctl] +[vm] # If a workload mostly uses anonymous memory and it hits this limit, the entire # working set is buffered for I/O, and any more write buffering would require # swapping, so it's time to throttle writes until I/O can catch up. Workloads @@ -22,12 +22,13 @@ platform_profile=performance # # The generator of dirty data starts writeback at this percentage (system default # is 20%) -vm.dirty_ratio=10 +dirty_ratio=10 # Start background writeback (via writeback threads) at this percentage (system # default is 10%) -vm.dirty_background_ratio=3 +dirty_background_ratio=3 +[sysctl] # The swappiness parameter controls the tendency of the kernel to move # processes out of physical memory and onto the swap disk. 
# 0 tells the kernel to avoid swapping processes out of physical memory diff --git a/profiles/mssql/tuned.conf b/profiles/mssql/tuned.conf index a1c97863..cf3e0f94 100644 --- a/profiles/mssql/tuned.conf +++ b/profiles/mssql/tuned.conf @@ -12,11 +12,11 @@ force_latency=5 [vm] # For multi-instance SQL deployments use 'madvise' instead of 'always' transparent_hugepages=always +dirty_background_ratio=3 +dirty_ratio=80 [sysctl] vm.swappiness=1 -vm.dirty_background_ratio=3 -vm.dirty_ratio=80 vm.dirty_expire_centisecs=500 vm.dirty_writeback_centisecs=100 vm.max_map_count=1600000 diff --git a/profiles/oracle/tuned.conf b/profiles/oracle/tuned.conf index f84cb2f9..a1673ff7 100644 --- a/profiles/oracle/tuned.conf +++ b/profiles/oracle/tuned.conf @@ -8,8 +8,6 @@ include=throughput-performance [sysctl] vm.swappiness = 10 -vm.dirty_background_ratio = 3 -vm.dirty_ratio = 40 vm.dirty_expire_centisecs = 500 vm.dirty_writeback_centisecs = 100 kernel.shmmax = 4398046511104 @@ -27,4 +25,6 @@ kernel.panic_on_oops = 1 kernel.numa_balancing = 0 [vm] +dirty_background_ratio = 3 +dirty_ratio = 40 transparent_hugepages=never diff --git a/profiles/postgresql/tuned.conf b/profiles/postgresql/tuned.conf index 88da8e44..8165fd8b 100644 --- a/profiles/postgresql/tuned.conf +++ b/profiles/postgresql/tuned.conf @@ -16,22 +16,20 @@ force_latency=1 [vm] transparent_hugepages=never -[sysctl] - # The dirty_background_ratio and dirty_ratio controls percentage of memory # that file system cache have to fill with dirty data before kernel will # will start to flush data to disks. The default values are 10% and 20% # accordingly. On a systems with a big amount of memory this values can # be tens of gigabytes and produce IO spikes when PostgreSQL server writes -# checkpoints. +# checkpoints. The values can alternatively be set in absolute number of bytes +# via dirty_background_bytes and dirty_bytes. 
# # Keep this values reasonable small - about size of RAID controller write-back # cache size (typcal 512MB - 2GB). -vm.dirty_background_ratio = 0 -vm.dirty_ratio = 0 -vm.dirty_background_bytes = 67108864 -vm.dirty_bytes = 536870912 +dirty_background_bytes = 67108864 +dirty_bytes = 536870912 +[sysctl] # The swappiness parameter controls the tendency of the kernel to move # processes out of physical memory and onto the swap disk. # 0 tells the kernel to avoid swapping processes out of physical memory diff --git a/profiles/sap-hana/tuned.conf b/profiles/sap-hana/tuned.conf index aeecf53c..e369ddf7 100644 --- a/profiles/sap-hana/tuned.conf +++ b/profiles/sap-hana/tuned.conf @@ -13,10 +13,10 @@ min_perf_pct=100 [vm] transparent_hugepages=never +dirty_ratio = 40 +dirty_background_ratio = 10 [sysctl] kernel.sem = 32000 1024000000 500 32000 kernel.numa_balancing = 0 -vm.dirty_ratio = 40 -vm.dirty_background_ratio = 10 vm.swappiness = 10 diff --git a/profiles/spectrumscale-ece/tuned.conf b/profiles/spectrumscale-ece/tuned.conf index c111e31e..3b3bebab 100644 --- a/profiles/spectrumscale-ece/tuned.conf +++ b/profiles/spectrumscale-ece/tuned.conf @@ -11,10 +11,12 @@ governor=performance energy_perf_bias=performance min_perf_pct=100 +[vm] +dirty_ratio = 40 +dirty_background_ratio = 10 + [sysctl] kernel.numa_balancing = 1 -vm.dirty_ratio = 40 -vm.dirty_background_ratio = 10 vm.swappiness=10 net.ipv4.tcp_window_scaling = 1 net.ipv4.tcp_timestamps = 1 diff --git a/profiles/spindown-disk/tuned.conf b/profiles/spindown-disk/tuned.conf index 6f126846..5252bb73 100644 --- a/profiles/spindown-disk/tuned.conf +++ b/profiles/spindown-disk/tuned.conf @@ -26,10 +26,12 @@ spindown=6 [scsi_host] alpm=medium_power +[vm] +dirty_ratio=60 + [sysctl] vm.dirty_writeback_centisecs=6000 vm.dirty_expire_centisecs=9000 -vm.dirty_ratio=60 vm.laptop_mode=5 vm.swappiness=30 diff --git a/profiles/throughput-performance/tuned.conf b/profiles/throughput-performance/tuned.conf index b5e266d7..b55989c5 
100644 --- a/profiles/throughput-performance/tuned.conf +++ b/profiles/throughput-performance/tuned.conf @@ -17,6 +17,20 @@ energy_performance_preference=performance [acpi] platform_profile=performance +[vm] +# If a workload mostly uses anonymous memory and it hits this limit, the entire +# working set is buffered for I/O, and any more write buffering would require +# swapping, so it's time to throttle writes until I/O can catch up. Workloads +# that mostly use file mappings may be able to use even higher values. +# +# The generator of dirty data starts writeback at this percentage (system default +# is 20%) +dirty_ratio = 40 + +# Start background writeback (via writeback threads) at this percentage (system +# default is 10%) +dirty_background_ratio = 10 + # Marvell ThunderX [vm.thunderx] type=vm @@ -31,19 +45,6 @@ transparent_hugepages=never readahead=>4096 [sysctl] -# If a workload mostly uses anonymous memory and it hits this limit, the entire -# working set is buffered for I/O, and any more write buffering would require -# swapping, so it's time to throttle writes until I/O can catch up. Workloads -# that mostly use file mappings may be able to use even higher values. -# -# The generator of dirty data starts writeback at this percentage (system default -# is 20%) -vm.dirty_ratio = 40 - -# Start background writeback (via writeback threads) at this percentage (system -# default is 10%) -vm.dirty_background_ratio = 10 - # PID allocation wrap value. When the kernel's next PID value # reaches this value, it wraps back to a minimum PID value. # PIDs of value pid_max or larger are not allocated. 
diff --git a/profiles/virtual-guest/tuned.conf b/profiles/virtual-guest/tuned.conf index 28b85c97..bc4c0399 100644 --- a/profiles/virtual-guest/tuned.conf +++ b/profiles/virtual-guest/tuned.conf @@ -6,7 +6,7 @@ summary=Optimize for running inside a virtual guest include=throughput-performance -[sysctl] +[vm] # If a workload mostly uses anonymous memory and it hits this limit, the entire # working set is buffered for I/O, and any more write buffering would require # swapping, so it's time to throttle writes until I/O can catch up. Workloads @@ -14,8 +14,9 @@ include=throughput-performance # # The generator of dirty data starts writeback at this percentage (system default # is 20%) -vm.dirty_ratio = 30 +dirty_ratio = 30 +[sysctl] # Filesystem I/O is usually much more efficient than swapping, so try to keep # swapping low. It's usually safe to go even lower than this on systems with # server-grade storage. diff --git a/profiles/virtual-host/tuned.conf b/profiles/virtual-host/tuned.conf index 5301d9ff..ba58cbb7 100644 --- a/profiles/virtual-host/tuned.conf +++ b/profiles/virtual-host/tuned.conf @@ -6,10 +6,10 @@ summary=Optimize for running KVM guests include=throughput-performance -[sysctl] +[vm] # Start background writeback (via writeback threads) at this percentage (system # default is 10%) -vm.dirty_background_ratio = 5 +dirty_background_ratio = 5 [cpu] # Setting C3 state sleep mode/power savings diff --git a/tests/beakerlib/bz1798183-RFE-support-post-loaded-profile/post-vars/tuned.conf b/tests/beakerlib/bz1798183-RFE-support-post-loaded-profile/post-vars/tuned.conf index 0bef6122..a93de044 100644 --- a/tests/beakerlib/bz1798183-RFE-support-post-loaded-profile/post-vars/tuned.conf +++ b/tests/beakerlib/bz1798183-RFE-support-post-loaded-profile/post-vars/tuned.conf @@ -1,5 +1,5 @@ [main] summary=Post-loaded profile that uses variables from the regular active profile -[sysctl] -vm.dirty_ratio=${foo} +[vm] +dirty_ratio=${foo} diff --git 
a/tests/beakerlib/bz1798183-RFE-support-post-loaded-profile/post/tuned.conf b/tests/beakerlib/bz1798183-RFE-support-post-loaded-profile/post/tuned.conf index 7844ca82..de0629f7 100644 --- a/tests/beakerlib/bz1798183-RFE-support-post-loaded-profile/post/tuned.conf +++ b/tests/beakerlib/bz1798183-RFE-support-post-loaded-profile/post/tuned.conf @@ -1,5 +1,5 @@ [main] summary=Post-loaded profile -[sysctl] -vm.dirty_ratio=8 +[vm] +dirty_ratio=8 diff --git a/tests/beakerlib/bz1798183-RFE-support-post-loaded-profile/post2/tuned.conf b/tests/beakerlib/bz1798183-RFE-support-post-loaded-profile/post2/tuned.conf index 994db42d..c8fedd9e 100644 --- a/tests/beakerlib/bz1798183-RFE-support-post-loaded-profile/post2/tuned.conf +++ b/tests/beakerlib/bz1798183-RFE-support-post-loaded-profile/post2/tuned.conf @@ -1,5 +1,5 @@ [main] summary=Second version of the post-loaded profile -[sysctl] -vm.dirty_ratio=7 +[vm] +dirty_ratio=7 From 52768f78e4fcc9d657ce981a4790360444f089ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavol=20=C5=BD=C3=A1=C4=8Dik?= Date: Wed, 18 Sep 2024 08:26:36 +0200 Subject: [PATCH 3/3] Document new vm plugin options --- tuned/plugins/plugin_vm.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tuned/plugins/plugin_vm.py b/tuned/plugins/plugin_vm.py index a2f3ae75..a671e171 100644 --- a/tuned/plugins/plugin_vm.py +++ b/tuned/plugins/plugin_vm.py @@ -13,9 +13,15 @@ class VMPlugin(base.Plugin): """ - Enables or disables transparent huge pages depending on value of the - [option]`transparent_hugepages` option. The option can have one of three - possible values `always`, `madvise` and `never`. + Tunes selected sysctl options in `/proc/sys/vm`, currently + [option]`dirty_ratio`, [option]`dirty_background_ratio`, + [option]`dirty_bytes`, and [option]`dirty_background_bytes`. + See https://docs.kernel.org/admin-guide/sysctl/vm.html for detailed + documentation of these options. 
+ + Additionally enables or disables transparent huge pages depending on + the value of the [option]`transparent_hugepages` option. The option + can have one of three possible values: `always`, `madvise` and `never`. .Disable transparent hugepages ====