diff --git a/10-oomd-defaults.conf b/10-oomd-defaults.conf new file mode 100644 index 0000000..3660cd2 --- /dev/null +++ b/10-oomd-defaults.conf @@ -0,0 +1,2 @@ +[OOM] +DefaultMemoryPressureDurationSec=10s diff --git a/10-oomd-root-slice-defaults.conf b/10-oomd-root-slice-defaults.conf new file mode 100644 index 0000000..49958e8 --- /dev/null +++ b/10-oomd-root-slice-defaults.conf @@ -0,0 +1,2 @@ +[Slice] +ManagedOOMSwap=kill diff --git a/10-oomd-user-service-defaults.conf b/10-oomd-user-service-defaults.conf new file mode 100644 index 0000000..d78f327 --- /dev/null +++ b/10-oomd-user-service-defaults.conf @@ -0,0 +1,3 @@ +[Service] +ManagedOOMMemoryPressure=kill +ManagedOOMMemoryPressureLimit=4% diff --git a/17829.patch b/17829.patch new file mode 100644 index 0000000..176b969 --- /dev/null +++ b/17829.patch @@ -0,0 +1,60 @@ +From 14d044da23d6f2fa03066aedcc2600a479c1f731 Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Wed, 2 Dec 2020 14:41:38 -0800 +Subject: [PATCH] test: fix TEST-56-OOMD thresholds for linux 5.9 changes + +Fixes #17533 + +The memory pressure values of the units in TEST-56-OOMD seemed to be a +lot lower after updating to linux 5.9. This is likely due to a fix from +https://github.com/torvalds/linux/commit/e22c6ed90aa91abc08f107344428ebb8c2629e98. + +To account for this, I lowered memory.high on testbloat.service to +throttle it even more. This was enough to generate the 50%+ value to trigger +oomd for the test, but as an extra precaution I also lowered the oomd +threshold to 1% so it's certain to try and kill testbloat.service. 
+--- + test/units/testsuite-56-testbloat.service | 6 +++--- + test/units/testsuite-56-workload.slice | 2 +- + test/units/testsuite-56.sh | 2 +- + 3 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/test/units/testsuite-56-testbloat.service b/test/units/testsuite-56-testbloat.service +index 40cf5a9f36f..6163aae1dba 100644 +--- a/test/units/testsuite-56-testbloat.service ++++ b/test/units/testsuite-56-testbloat.service +@@ -2,8 +2,8 @@ + Description=Create a lot of memory pressure + + [Service] +-# A very small memory.high will cause the script (trying to use a lot of memory) +-# to throttle and be put under heavy pressure +-MemoryHigh=2M ++# A VERY small memory.high will cause the script (trying to use a lot of memory) ++# to throttle and be put under heavy pressure. ++MemoryHigh=1M + Slice=testsuite-56-workload.slice + ExecStart=/usr/lib/systemd/tests/testdata/units/testsuite-56-slowgrowth.sh +diff --git a/test/units/testsuite-56-workload.slice b/test/units/testsuite-56-workload.slice +index 3d542ec2bae..45b04914c63 100644 +--- a/test/units/testsuite-56-workload.slice ++++ b/test/units/testsuite-56-workload.slice +@@ -7,4 +7,4 @@ MemoryAccounting=true + IOAccounting=true + TasksAccounting=true + ManagedOOMMemoryPressure=kill +-ManagedOOMMemoryPressureLimitPercent=50% ++ManagedOOMMemoryPressureLimitPercent=1% +diff --git a/test/units/testsuite-56.sh b/test/units/testsuite-56.sh +index 37d62d943c0..1846248855b 100755 +--- a/test/units/testsuite-56.sh ++++ b/test/units/testsuite-56.sh +@@ -19,7 +19,7 @@ systemctl start testsuite-56-testchill.service + + # Verify systemd-oomd is monitoring the expected units + oomctl | grep "/testsuite-56-workload.slice" +-oomctl | grep "50%" ++oomctl | grep "1%" + + # systemd-oomd watches for elevated pressure for 30 seconds before acting. + # It can take time to build up pressure so either wait 5 minutes or for the service to fail. 
diff --git a/18361.patch b/18361.patch new file mode 100644 index 0000000..282b7f3 --- /dev/null +++ b/18361.patch @@ -0,0 +1,403 @@ +From c20aa7b17166b9f331da33ad9288f9ede75c72db Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Sun, 24 Jan 2021 00:16:19 -0800 +Subject: [PATCH 1/4] oom: make memory pressure duration configurable through + oomd.conf + +--- + man/oomd.conf.xml | 12 +++++++++++- + src/oom/oomd-manager.c | 13 +++++++++---- + src/oom/oomd-manager.h | 5 +++-- + src/oom/oomd-util.h | 1 + + src/oom/oomd.c | 4 +++- + src/oom/oomd.conf | 1 + + test/units/testsuite-56.sh | 3 +++ + 7 files changed, 31 insertions(+), 8 deletions(-) + +diff --git a/man/oomd.conf.xml b/man/oomd.conf.xml +index 35a0686bc50..bb5da87c548 100644 +--- a/man/oomd.conf.xml ++++ b/man/oomd.conf.xml +@@ -65,13 +65,23 @@ + will take action. A unit can override this value with ManagedOOMMemoryPressureLimitPercent=. + The memory pressure for this property represents the fraction of time in a 10 second window in which all tasks + in the cgroup were delayed. For each monitored cgroup, if the memory pressure on that cgroup exceeds the +- limit set for more than 30 seconds, systemd-oomd will act on eligible descendant cgroups, ++ limit set for longer than the duration set by DefaultMemoryPressureDurationSec=, ++ systemd-oomd will act on eligible descendant cgroups, + starting from the ones with the most reclaim activity to the least reclaim activity. Which cgroups are + monitored and what action gets taken depends on what the unit has configured for + ManagedOOMMemoryPressure=. Takes a percentage value between 0% and 100%, inclusive. + Defaults to 60%. + + ++ ++ DefaultMemoryPressureDurationSec= ++ ++ Sets the amount of time a unit's cgroup needs to have exceeded memory pressure limits before ++ systemd-oomd will take action. Memory pressure limits are defined by ++ DefaultMemoryPressureLimitPercent= and ManagedOOMMemoryPressureLimitPercent=. 
++ Defaults to 30 seconds when this property is unset or set to 0. ++ ++ + + + +diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c +index fec96519e01..e8ed6a52739 100644 +--- a/src/oom/oomd-manager.c ++++ b/src/oom/oomd-manager.c +@@ -306,7 +306,7 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo + m->post_action_delay_start = 0; + } + +- r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, PRESSURE_DURATION_USEC, &targets); ++ r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets); + if (r == -ENOMEM) + return log_error_errno(r, "Failed to check if memory pressure exceeded limits"); + else if (r == 1) { +@@ -325,7 +325,7 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo + + SET_FOREACH(t, targets) { + log_notice("Memory pressure for %s is greater than %lu for more than %"PRIu64" seconds and there was reclaim activity", +- t->path, LOAD_INT(t->mem_pressure_limit), PRESSURE_DURATION_USEC / USEC_PER_SEC); ++ t->path, LOAD_INT(t->mem_pressure_limit), m->default_mem_pressure_duration_usec / USEC_PER_SEC); + + r = oomd_kill_by_pgscan(candidates, t->path, m->dry_run); + if (r == -ENOMEM) +@@ -471,7 +471,7 @@ static int manager_connect_bus(Manager *m) { + return 0; + } + +-int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit) { ++int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit, usec_t mem_pressure_usec) { + unsigned long l; + int r; + +@@ -487,6 +487,8 @@ int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressur + if (r < 0) + return r; + ++ m->default_mem_pressure_duration_usec = mem_pressure_usec ?: DEFAULT_MEM_PRESSURE_DURATION_USEC; ++ + r = manager_connect_bus(m); + if (r < 0) + return r; +@@ -505,6 +507,7 @@ int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressur + int 
manager_get_dump_string(Manager *m, char **ret) { + _cleanup_free_ char *dump = NULL; + _cleanup_fclose_ FILE *f = NULL; ++ char buf[FORMAT_TIMESPAN_MAX]; + OomdCGroupContext *c; + size_t size; + char *key; +@@ -521,10 +524,12 @@ int manager_get_dump_string(Manager *m, char **ret) { + "Dry Run: %s\n" + "Swap Used Limit: %u%%\n" + "Default Memory Pressure Limit: %lu%%\n" ++ "Default Memory Pressure Duration: %s\n" + "System Context:\n", + yes_no(m->dry_run), + m->swap_used_limit, +- LOAD_INT(m->default_mem_pressure_limit)); ++ LOAD_INT(m->default_mem_pressure_limit), ++ format_timespan(buf, sizeof(buf), m->default_mem_pressure_duration_usec, USEC_PER_SEC)); + oomd_dump_system_context(&m->system_context, f, "\t"); + + fprintf(f, "Swap Monitored CGroups:\n"); +diff --git a/src/oom/oomd-manager.h b/src/oom/oomd-manager.h +index 3f3eb5aa4b6..ede9903e5a6 100644 +--- a/src/oom/oomd-manager.h ++++ b/src/oom/oomd-manager.h +@@ -16,7 +16,7 @@ + * percentage of time all tasks were delayed (i.e. unproductive). + * Generally 60 or higher might be acceptable for something like system.slice with no memory.high set; processes in + * system.slice are assumed to be less latency sensitive. */ +-#define PRESSURE_DURATION_USEC (30 * USEC_PER_SEC) ++#define DEFAULT_MEM_PRESSURE_DURATION_USEC (30 * USEC_PER_SEC) + #define DEFAULT_MEM_PRESSURE_LIMIT 60 + #define DEFAULT_SWAP_USED_LIMIT 90 + +@@ -33,6 +33,7 @@ struct Manager { + bool dry_run; + unsigned swap_used_limit; + loadavg_t default_mem_pressure_limit; ++ usec_t default_mem_pressure_duration_usec; + + /* k: cgroup paths -> v: OomdCGroupContext + * Used to detect when to take action. 
*/ +@@ -53,7 +54,7 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); + + int manager_new(Manager **ret); + +-int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit); ++int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit, usec_t mem_pressure_usec); + + int manager_get_dump_string(Manager *m, char **ret); + +diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h +index 0834cbf09d7..d7a9890e7a2 100644 +--- a/src/oom/oomd-util.h ++++ b/src/oom/oomd-util.h +@@ -31,6 +31,7 @@ struct OomdCGroupContext { + + /* These are only used by oomd_pressure_above for acting on high memory pressure. */ + loadavg_t mem_pressure_limit; ++ usec_t mem_pressure_duration_usec; + usec_t last_hit_mem_pressure_limit; + }; + +diff --git a/src/oom/oomd.c b/src/oom/oomd.c +index 8cf776ec0f5..1b0f8ff6c40 100644 +--- a/src/oom/oomd.c ++++ b/src/oom/oomd.c +@@ -19,11 +19,13 @@ + static bool arg_dry_run = false; + static int arg_swap_used_limit = -1; + static int arg_mem_pressure_limit = -1; ++static usec_t arg_mem_pressure_usec = 0; + + static int parse_config(void) { + static const ConfigTableItem items[] = { + { "OOM", "SwapUsedLimitPercent", config_parse_percent, 0, &arg_swap_used_limit }, + { "OOM", "DefaultMemoryPressureLimitPercent", config_parse_percent, 0, &arg_mem_pressure_limit }, ++ { "OOM", "DefaultMemoryPressureDurationSec", config_parse_sec, 0, &arg_mem_pressure_usec }, + {} + }; + +@@ -160,7 +162,7 @@ static int run(int argc, char *argv[]) { + if (r < 0) + return log_error_errno(r, "Failed to create manager: %m"); + +- r = manager_start(m, arg_dry_run, arg_swap_used_limit, arg_mem_pressure_limit); ++ r = manager_start(m, arg_dry_run, arg_swap_used_limit, arg_mem_pressure_limit, arg_mem_pressure_usec); + if (r < 0) + return log_error_errno(r, "Failed to start up daemon: %m"); + +diff --git a/src/oom/oomd.conf b/src/oom/oomd.conf +index 8ac97169610..766cb1717f7 100644 +--- a/src/oom/oomd.conf ++++ 
b/src/oom/oomd.conf +@@ -14,3 +14,4 @@ + [OOM] + #SwapUsedLimitPercent=90% + #DefaultMemoryPressureLimitPercent=60% ++#DefaultMemoryPressureDurationSec=30s +diff --git a/test/units/testsuite-56.sh b/test/units/testsuite-56.sh +index 1846248855b..6e7941a57fc 100755 +--- a/test/units/testsuite-56.sh ++++ b/test/units/testsuite-56.sh +@@ -14,12 +14,15 @@ if [[ "$cgroup_type" != *"cgroup2"* ]] && [[ "$cgroup_type" != *"0x63677270"* ]] + fi + [[ -e /skipped ]] && exit 0 || true + ++echo "DefaultMemoryPressureDurationSec=5s" >> /etc/systemd/oomd.conf ++ + systemctl start testsuite-56-testbloat.service + systemctl start testsuite-56-testchill.service + + # Verify systemd-oomd is monitoring the expected units + oomctl | grep "/testsuite-56-workload.slice" + oomctl | grep "1%" ++oomctl | grep "Default Memory Pressure Duration: 5s" + + # systemd-oomd watches for elevated pressure for 30 seconds before acting. + # It can take time to build up pressure so either wait 5 minutes or for the service to fail. + +From 408a3bbd76326793ea5d1cf4e0a9444a4c252d86 Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Sat, 23 Jan 2021 22:10:42 -0800 +Subject: [PATCH 2/4] oom: make swap a soft requirement + +--- + man/systemd-oomd.service.xml | 4 ++-- + src/oom/oomd-manager.c | 8 ++++++-- + src/oom/oomd.c | 6 ++---- + src/oom/test-oomd-util.c | 11 +++++++++++ + 4 files changed, 21 insertions(+), 8 deletions(-) + +diff --git a/man/systemd-oomd.service.xml b/man/systemd-oomd.service.xml +index 9cb9c6076a9..ebd2467ee23 100644 +--- a/man/systemd-oomd.service.xml ++++ b/man/systemd-oomd.service.xml +@@ -56,8 +56,8 @@ + + You will need a kernel compiled with PSI support. This is available in Linux 4.20 and above. + +- The system must also have swap enabled for systemd-oomd to function correctly. With swap +- enabled, the system spends enough time swapping pages to let systemd-oomd react. ++ It is highly recommended for the system to have swap enabled for systemd-oomd to function ++ optimally. 
With swap enabled, the system spends enough time swapping pages to let systemd-oomd react. + Without swap, the system enters a livelocked state much more quickly and may prevent systemd-oomd + from responding in a reasonable amount of time. See + "In defence of swap: common misconceptions" +diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c +index e8ed6a52739..814fda51f31 100644 +--- a/src/oom/oomd-manager.c ++++ b/src/oom/oomd-manager.c +@@ -6,6 +6,7 @@ + #include "cgroup-util.h" + #include "fd-util.h" + #include "fileio.h" ++#include "memory-util.h" + #include "oomd-manager-bus.h" + #include "oomd-manager.h" + #include "path-util.h" +@@ -294,9 +295,12 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo + return log_error_errno(r, "Failed to update monitored memory pressure cgroup contexts"); + + r = oomd_system_context_acquire("/proc/swaps", &m->system_context); +- /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM */ +- if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts))) ++ /* If there aren't units depending on swap actions, the only error we exit on is ENOMEM. ++ * Allow ENOENT in the event that swap is disabled on the system. 
*/ ++ if (r == -ENOMEM || (r < 0 && r != -ENOENT && !hashmap_isempty(m->monitored_swap_cgroup_contexts))) + return log_error_errno(r, "Failed to acquire system context"); ++ else if (r == -ENOENT) ++ zero(m->system_context); + + /* If we're still recovering from a kill, don't try to kill again yet */ + if (m->post_action_delay_start > 0) { +diff --git a/src/oom/oomd.c b/src/oom/oomd.c +index 1b0f8ff6c40..1fbcf41492d 100644 +--- a/src/oom/oomd.c ++++ b/src/oom/oomd.c +@@ -142,10 +142,8 @@ static int run(int argc, char *argv[]) { + return log_error_errno(r, "Failed to get SwapTotal from /proc/meminfo: %m"); + + r = safe_atollu(swap, &s); +- if (r < 0) +- return log_error_errno(r, "Failed to parse SwapTotal from /proc/meminfo: %s: %m", swap); +- if (s == 0) +- return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Requires swap to operate"); ++ if (r < 0 || s == 0) ++ log_warning("Swap is currently not detected; memory pressure usage will be degraded"); + + if (!is_pressure_supported()) + return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Pressure Stall Information (PSI) is not supported"); +diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c +index 8143408902b..54fe2a03d14 100644 +--- a/src/oom/test-oomd-util.c ++++ b/src/oom/test-oomd-util.c +@@ -159,6 +159,11 @@ static void test_oomd_system_context_acquire(void) { + assert_se(ctx.swap_total == 0); + assert_se(ctx.swap_used == 0); + ++ assert_se(write_string_file(path, "Filename Type Size Used Priority", WRITE_STRING_FILE_CREATE) == 0); ++ assert_se(oomd_system_context_acquire(path, &ctx) == 0); ++ assert_se(ctx.swap_total == 0); ++ assert_se(ctx.swap_used == 0); ++ + assert_se(write_string_file(path, "Filename Type Size Used Priority\n" + "/swapvol/swapfile file 18971644 0 -3\n" + "/dev/vda2 partition 1999868 993780 -2", WRITE_STRING_FILE_CREATE) == 0); +@@ -268,6 +273,12 @@ static void test_oomd_swap_free_below(void) { + .swap_used = 3310136 * 1024U, + }; + assert_se(oomd_swap_free_below(&ctx, 20) == 
false); ++ ++ ctx = (OomdSystemContext) { ++ .swap_total = 0, ++ .swap_used = 0, ++ }; ++ assert_se(oomd_swap_free_below(&ctx, 20) == false); + } + + static void test_oomd_sort_cgroups(void) { + +From 924c89e9fe95d47b6ad94544bfdd5f087646daea Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Sun, 24 Jan 2021 01:22:51 -0800 +Subject: [PATCH 3/4] oom: fix reclaim activity detection + +This should have been checking for any reclaim activity within a larger interval +of time rather than within the past second. On systems with swap this +doesn't seem to have mattered too much as reclaim would always increase when +memory pressure was elevated. But testing in the no swap case having +this larger interval made a difference between oomd killing or not. +--- + src/oom/oomd-manager.c | 7 +++++-- + src/oom/oomd-manager.h | 2 ++ + 2 files changed, 7 insertions(+), 2 deletions(-) + +diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c +index 814fda51f31..3efa629002e 100644 +--- a/src/oom/oomd-manager.c ++++ b/src/oom/oomd-manager.c +@@ -302,6 +302,9 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo + else if (r == -ENOENT) + zero(m->system_context); + ++ if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts)) ++ m->last_reclaim_at = usec_now; ++ + /* If we're still recovering from a kill, don't try to kill again yet */ + if (m->post_action_delay_start > 0) { + if (m->post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now) +@@ -314,12 +317,12 @@ static int monitor_cgroup_contexts_handler(sd_event_source *s, uint64_t usec, vo + if (r == -ENOMEM) + return log_error_errno(r, "Failed to check if memory pressure exceeded limits"); + else if (r == 1) { +- /* Check if there was reclaim activity in the last interval. The concern is the following case: ++ /* Check if there was reclaim activity in the given interval. 
The concern is the following case: + * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending + * cgroup. Even after this, well-behaved processes will fault in recently resident pages and + * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need + * to kill something (it won't help anyways). */ +- if (oomd_memory_reclaim(m->monitored_mem_pressure_cgroup_contexts)) { ++ if ((usec_now - m->last_reclaim_at) <= RECLAIM_DURATION_USEC) { + _cleanup_hashmap_free_ Hashmap *candidates = NULL; + OomdCGroupContext *t; + +diff --git a/src/oom/oomd-manager.h b/src/oom/oomd-manager.h +index ede9903e5a6..ee17abced26 100644 +--- a/src/oom/oomd-manager.h ++++ b/src/oom/oomd-manager.h +@@ -20,6 +20,7 @@ + #define DEFAULT_MEM_PRESSURE_LIMIT 60 + #define DEFAULT_SWAP_USED_LIMIT 90 + ++#define RECLAIM_DURATION_USEC (30 * USEC_PER_SEC) + #define POST_ACTION_DELAY_USEC (15 * USEC_PER_SEC) + + typedef struct Manager Manager; +@@ -42,6 +43,7 @@ struct Manager { + + OomdSystemContext system_context; + ++ usec_t last_reclaim_at; + usec_t post_action_delay_start; + + sd_event_source *cgroup_context_event_source; + +From 2e744a2cd89fc0ea67cf78cfba617b5105a26215 Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Sun, 24 Jan 2021 01:34:23 -0800 +Subject: [PATCH 4/4] oom: update extended test to remove swap gating + +--- + test/units/testsuite-56.sh | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/test/units/testsuite-56.sh b/test/units/testsuite-56.sh +index 6e7941a57fc..4dc9d8c7a86 100755 +--- a/test/units/testsuite-56.sh ++++ b/test/units/testsuite-56.sh +@@ -6,7 +6,6 @@ systemd-analyze log-level debug + systemd-analyze log-target console + + # Loose checks to ensure the environment has the necessary features for systemd-oomd +-[[ "$( awk '/SwapTotal/ { print $2 }' /proc/meminfo )" != "0" ]] || echo "no swap" >> /skipped + [[ -e /proc/pressure ]] || echo "no PSI" >> /skipped + cgroup_type=$(stat 
-fc %T /sys/fs/cgroup/) + if [[ "$cgroup_type" != *"cgroup2"* ]] && [[ "$cgroup_type" != *"0x63677270"* ]]; then +@@ -16,8 +15,8 @@ fi + + echo "DefaultMemoryPressureDurationSec=5s" >> /etc/systemd/oomd.conf + +-systemctl start testsuite-56-testbloat.service + systemctl start testsuite-56-testchill.service ++systemctl start testsuite-56-testbloat.service + + # Verify systemd-oomd is monitoring the expected units + oomctl | grep "/testsuite-56-workload.slice" diff --git a/18401.patch b/18401.patch new file mode 100644 index 0000000..c42ae7e --- /dev/null +++ b/18401.patch @@ -0,0 +1,1201 @@ +From 2ccd5198faa8ca65001f90c551924e86bf737a85 Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Mon, 25 Jan 2021 23:56:23 -0800 +Subject: [PATCH 1/7] oom: shorten xattr name + +--- + src/core/cgroup.c | 2 +- + src/oom/oomd-util.c | 4 ++-- + src/oom/test-oomd-util.c | 2 +- + 3 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/src/core/cgroup.c b/src/core/cgroup.c +index c9cf7fb16c6..70282a7abda 100644 +--- a/src/core/cgroup.c ++++ b/src/core/cgroup.c +@@ -2746,7 +2746,7 @@ int unit_check_oomd_kill(Unit *u) { + else if (r == 0) + return 0; + +- r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "user.systemd_oomd_kill", &value); ++ r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, "user.oomd_kill", &value); + if (r < 0 && r != -ENODATA) + return r; + +diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c +index fcccddb92ea..80b9583440c 100644 +--- a/src/oom/oomd-util.c ++++ b/src/oom/oomd-util.c +@@ -201,9 +201,9 @@ int oomd_cgroup_kill(const char *path, bool recurse, bool dry_run) { + if (r < 0) + return r; + +- r = increment_oomd_xattr(path, "user.systemd_oomd_kill", set_size(pids_killed)); ++ r = increment_oomd_xattr(path, "user.oomd_kill", set_size(pids_killed)); + if (r < 0) +- log_debug_errno(r, "Failed to set user.systemd_oomd_kill on kill: %m"); ++ log_debug_errno(r, "Failed to set user.oomd_kill on kill: %m"); + + return 
set_size(pids_killed) != 0; + } +diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c +index 54fe2a03d14..3dec4f0ff06 100644 +--- a/src/oom/test-oomd-util.c ++++ b/src/oom/test-oomd-util.c +@@ -79,7 +79,7 @@ static void test_oomd_cgroup_kill(void) { + sleep(2); + assert_se(cg_is_empty(SYSTEMD_CGROUP_CONTROLLER, cgroup) == true); + +- assert_se(cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.systemd_oomd_kill", &v) >= 0); ++ assert_se(cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_kill", &v) >= 0); + assert_se(memcmp(v, i == 0 ? "2" : "4", 2) == 0); + } + } + +From d38916b398127e005d0cf131092a99317661ec3c Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Fri, 5 Feb 2021 03:00:11 -0800 +Subject: [PATCH 2/7] oom: wrap reply.path with empty_to_root + +--- + src/oom/oomd-manager.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c +index 338935b3ec6..825fe38e189 100644 +--- a/src/oom/oomd-manager.c ++++ b/src/oom/oomd-manager.c +@@ -93,7 +93,7 @@ static int process_managed_oom_reply( + m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts; + + if (reply.mode == MANAGED_OOM_AUTO) { +- (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, reply.path)); ++ (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, empty_to_root(reply.path))); + continue; + } + +@@ -109,7 +109,7 @@ static int process_managed_oom_reply( + } + } + +- ret = oomd_insert_cgroup_context(NULL, monitor_hm, reply.path); ++ ret = oomd_insert_cgroup_context(NULL, monitor_hm, empty_to_root(reply.path)); + if (ret == -ENOMEM) { + r = ret; + goto finish; +@@ -117,7 +117,7 @@ static int process_managed_oom_reply( + + /* Always update the limit in case it was changed. For non-memory pressure detection the value is + * ignored so always updating it here is not a problem. 
*/ +- ctx = hashmap_get(monitor_hm, reply.path); ++ ctx = hashmap_get(monitor_hm, empty_to_root(reply.path)); + if (ctx) + ctx->mem_pressure_limit = limit; + } + +From a695da238e7a6bd6eb440facc784aa6fca6c3d90 Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Wed, 27 Jan 2021 23:43:13 -0800 +Subject: [PATCH 3/7] oom: sort by pgscan and memory usage + +If 2 candidates have the same pgscan, prioritize the one with the larger +memory usage. +--- + src/oom/oomd-util.c | 2 +- + src/oom/oomd-util.h | 5 ++++- + src/oom/test-oomd-util.c | 24 ++++++++++++++---------- + 3 files changed, 19 insertions(+), 12 deletions(-) + +diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c +index 80b9583440c..8f138d64c6c 100644 +--- a/src/oom/oomd-util.c ++++ b/src/oom/oomd-util.c +@@ -214,7 +214,7 @@ int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run) { + + assert(h); + +- r = oomd_sort_cgroup_contexts(h, compare_pgscan, prefix, &sorted); ++ r = oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, prefix, &sorted); + if (r < 0) + return r; + +diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h +index d7a9890e7a2..f0648c5dcdd 100644 +--- a/src/oom/oomd-util.h ++++ b/src/oom/oomd-util.h +@@ -61,10 +61,13 @@ bool oomd_memory_reclaim(Hashmap *h); + /* Returns true if the amount of swap free is below the percentage of swap specified by `threshold_percent`. 
*/ + bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent); + +-static inline int compare_pgscan(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) { ++static inline int compare_pgscan_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) { + assert(c1); + assert(c2); + ++ if ((*c2)->pgscan == (*c1)->pgscan) ++ return CMP((*c2)->current_memory_usage, (*c1)->current_memory_usage); ++ + return CMP((*c2)->pgscan, (*c1)->pgscan); + } + +diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c +index 3dec4f0ff06..a1fe78806a1 100644 +--- a/src/oom/test-oomd-util.c ++++ b/src/oom/test-oomd-util.c +@@ -292,16 +292,20 @@ static void test_oomd_sort_cgroups(void) { + OomdCGroupContext ctx[4] = { + { .path = paths[0], + .swap_usage = 20, +- .pgscan = 60 }, ++ .pgscan = 60, ++ .current_memory_usage = 10 }, + { .path = paths[1], + .swap_usage = 60, +- .pgscan = 40 }, ++ .pgscan = 40, ++ .current_memory_usage = 20 }, + { .path = paths[2], + .swap_usage = 40, +- .pgscan = 20 }, ++ .pgscan = 40, ++ .current_memory_usage = 40 }, + { .path = paths[3], + .swap_usage = 10, +- .pgscan = 80 }, ++ .pgscan = 80, ++ .current_memory_usage = 10 }, + }; + + assert_se(h = hashmap_new(&string_hash_ops)); +@@ -318,16 +322,16 @@ static void test_oomd_sort_cgroups(void) { + assert_se(sorted_cgroups[3] == &ctx[3]); + sorted_cgroups = mfree(sorted_cgroups); + +- assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan, NULL, &sorted_cgroups) == 4); ++ assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, NULL, &sorted_cgroups) == 4); + assert_se(sorted_cgroups[0] == &ctx[3]); + assert_se(sorted_cgroups[1] == &ctx[0]); +- assert_se(sorted_cgroups[2] == &ctx[1]); +- assert_se(sorted_cgroups[3] == &ctx[2]); ++ assert_se(sorted_cgroups[2] == &ctx[2]); ++ assert_se(sorted_cgroups[3] == &ctx[1]); + sorted_cgroups = mfree(sorted_cgroups); + +- assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan, 
"/herp.slice/derp.scope", &sorted_cgroups) == 2); +- assert_se(sorted_cgroups[0] == &ctx[1]); +- assert_se(sorted_cgroups[1] == &ctx[2]); ++ assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, "/herp.slice/derp.scope", &sorted_cgroups) == 2); ++ assert_se(sorted_cgroups[0] == &ctx[2]); ++ assert_se(sorted_cgroups[1] == &ctx[1]); + assert_se(sorted_cgroups[2] == 0); + assert_se(sorted_cgroups[3] == 0); + sorted_cgroups = mfree(sorted_cgroups); + +From c73a2c3a6788a2a28899f29579fdd68816f60d59 Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Thu, 28 Jan 2021 15:47:26 -0800 +Subject: [PATCH 4/7] oom: skip over cgroups with no memory usage + +--- + src/oom/oomd-util.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c +index 8f138d64c6c..fa8b8b70b19 100644 +--- a/src/oom/oomd-util.c ++++ b/src/oom/oomd-util.c +@@ -219,7 +219,8 @@ int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run) { + return r; + + for (int i = 0; i < r; i++) { +- if (sorted[i]->pgscan == 0) ++ /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure */ ++ if (sorted[i]->pgscan == 0 && sorted[i]->current_memory_usage == 0) + break; + + r = oomd_cgroup_kill(sorted[i]->path, true, dry_run); + +From 63d6d9160523a2c1a71e96ff4125a1440d827b32 Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Tue, 26 Jan 2021 00:57:36 -0800 +Subject: [PATCH 5/7] oom: implement avoid/omit xattr support + +There may be situations where a cgroup should be protected from killing +or deprioritized as a candidate. In FB oomd xattrs are used to bias oomd +away from supervisor cgroups and towards worker cgroups in container +tasks. On desktops this can be used to protect important units with +unpredictable resource consumption. + +The patch allows systemd-oomd to understand 2 xattrs: +"user.oomd_avoid" and "user.oomd_omit". If systemd-oomd sees these +xattrs set to 1 on a candidate cgroup (i.e. 
while attempting to kill something) +AND the cgroup is owned by root:root, it will either deprioritize the cgroup as +a candidate (avoid) or remove it completely as a candidate (omit). + +Usage is restricted to root:root cgroups to prevent situations where an +unprivileged user can set their own cgroups lower in the kill priority than +another user's (and prevent them from omitting their units from +systemd-oomd killing). +--- + src/basic/cgroup-util.c | 22 +++++++++ + src/basic/cgroup-util.h | 1 + + src/oom/oomd-util.c | 35 ++++++++++++--- + src/oom/oomd-util.h | 11 +++++ + src/oom/test-oomd-util.c | 54 +++++++++++++++++++++-- + test/test-functions | 1 + + test/units/testsuite-56-testmunch.service | 7 +++ + test/units/testsuite-56.sh | 31 +++++++++++-- + 8 files changed, 149 insertions(+), 13 deletions(-) + create mode 100644 test/units/testsuite-56-testmunch.service + +diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c +index b567822b7ef..45dc1142048 100644 +--- a/src/basic/cgroup-util.c ++++ b/src/basic/cgroup-util.c +@@ -1703,6 +1703,28 @@ int cg_get_attribute_as_bool(const char *controller, const char *path, const cha + return 0; + } + ++ ++int cg_get_owner(const char *controller, const char *path, uid_t *ret_uid, gid_t *ret_gid) { ++ _cleanup_free_ char *f = NULL; ++ struct stat stats; ++ int r; ++ ++ assert(ret_uid); ++ assert(ret_gid); ++ ++ r = cg_get_path(controller, path, NULL, &f); ++ if (r < 0) ++ return r; ++ ++ r = stat(f, &stats); ++ if (r < 0) ++ return -errno; ++ ++ *ret_uid = stats.st_uid; ++ *ret_gid = stats.st_gid; ++ return 0; ++} ++ + int cg_get_keyed_attribute_full( + const char *controller, + const char *path, +diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h +index bdc0d0d086c..63bd25f703e 100644 +--- a/src/basic/cgroup-util.h ++++ b/src/basic/cgroup-util.h +@@ -212,6 +212,7 @@ int cg_get_attribute_as_uint64(const char *controller, const char *path, const c + int cg_get_attribute_as_bool(const char *controller, 
const char *path, const char *attribute, bool *ret); + + int cg_set_access(const char *controller, const char *path, uid_t uid, gid_t gid); ++int cg_get_owner(const char *controller, const char *path, uid_t *ret_uid, gid_t *ret_gid); + + int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags); + int cg_get_xattr(const char *controller, const char *path, const char *name, void *value, size_t size); +diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c +index fa8b8b70b19..db6383bf436 100644 +--- a/src/oom/oomd-util.c ++++ b/src/oom/oomd-util.c +@@ -159,7 +159,8 @@ int oomd_sort_cgroup_contexts(Hashmap *h, oomd_compare_t compare_func, const cha + return -ENOMEM; + + HASHMAP_FOREACH(item, h) { +- if (item->path && prefix && !path_startswith(item->path, prefix)) ++ /* Skip over cgroups that are not valid candidates or are explicitly marked for omission */ ++ if ((item->path && prefix && !path_startswith(item->path, prefix)) || item->omit) + continue; + + sorted[k++] = item; +@@ -219,9 +220,10 @@ int oomd_kill_by_pgscan(Hashmap *h, const char *prefix, bool dry_run) { + return r; + + for (int i = 0; i < r; i++) { +- /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure */ ++ /* Skip cgroups with no reclaim and memory usage; it won't alleviate pressure. */ ++ /* Don't break since there might be "avoid" cgroups at the end. */ + if (sorted[i]->pgscan == 0 && sorted[i]->current_memory_usage == 0) +- break; ++ continue; + + r = oomd_cgroup_kill(sorted[i]->path, true, dry_run); + if (r > 0 || r == -ENOMEM) +@@ -244,8 +246,10 @@ int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run) { + /* Try to kill cgroups with non-zero swap usage until we either succeed in + * killing or we get to a cgroup with no swap usage. */ + for (int i = 0; i < r; i++) { ++ /* Skip over cgroups with no resource usage. Don't break since there might be "avoid" ++ * cgroups at the end. 
*/ + if (sorted[i]->swap_usage == 0) +- break; ++ continue; + + r = oomd_cgroup_kill(sorted[i]->path, true, dry_run); + if (r > 0 || r == -ENOMEM) +@@ -257,8 +261,10 @@ int oomd_kill_by_swap_usage(Hashmap *h, bool dry_run) { + + int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) { + _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL; +- _cleanup_free_ char *p = NULL, *val = NULL; ++ _cleanup_free_ char *p = NULL, *val = NULL, *avoid_val = NULL, *omit_val = NULL; + bool is_root; ++ uid_t uid; ++ gid_t gid; + int r; + + assert(path); +@@ -278,6 +284,25 @@ int oomd_cgroup_context_acquire(const char *path, OomdCGroupContext **ret) { + if (r < 0) + return log_debug_errno(r, "Error parsing memory pressure from %s: %m", p); + ++ r = cg_get_owner(SYSTEMD_CGROUP_CONTROLLER, path, &uid, &gid); ++ if (r < 0) ++ log_debug_errno(r, "Failed to get owner/group from %s: %m", path); ++ else if (uid == 0 && gid == 0) { ++ /* Ignore most errors when reading the xattr since it is usually unset and cgroup xattrs are only used ++ * as an optional feature of systemd-oomd (and the system might not even support them). */ ++ r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_avoid", &avoid_val); ++ if (r >= 0 && streq(avoid_val, "1")) ++ ctx->avoid = true; ++ else if (r == -ENOMEM) ++ return r; ++ ++ r = cg_get_xattr_malloc(SYSTEMD_CGROUP_CONTROLLER, path, "user.oomd_omit", &omit_val); ++ if (r >= 0 && streq(omit_val, "1")) ++ ctx->omit = true; ++ else if (r == -ENOMEM) ++ return r; ++ } ++ + if (is_root) { + r = procfs_memory_get_used(&ctx->current_memory_usage); + if (r < 0) +diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h +index f0648c5dcdd..ab6a8da1ef6 100644 +--- a/src/oom/oomd-util.h ++++ b/src/oom/oomd-util.h +@@ -29,6 +29,9 @@ struct OomdCGroupContext { + uint64_t last_pgscan; + uint64_t pgscan; + ++ bool avoid; ++ bool omit; ++ + /* These are only used by oomd_pressure_above for acting on high memory pressure. 
*/ + loadavg_t mem_pressure_limit; + usec_t mem_pressure_duration_usec; +@@ -61,10 +64,15 @@ bool oomd_memory_reclaim(Hashmap *h); + /* Returns true if the amount of swap free is below the percentage of swap specified by `threshold_percent`. */ + bool oomd_swap_free_below(const OomdSystemContext *ctx, uint64_t threshold_percent); + ++/* The compare functions will sort from largest to smallest, putting all the contexts with "avoid" at the end ++ * (after the smallest values). */ + static inline int compare_pgscan_and_memory_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) { + assert(c1); + assert(c2); + ++ if ((*c1)->avoid != (*c2)->avoid) ++ return CMP((*c1)->avoid, (*c2)->avoid); ++ + if ((*c2)->pgscan == (*c1)->pgscan) + return CMP((*c2)->current_memory_usage, (*c1)->current_memory_usage); + +@@ -75,6 +83,9 @@ static inline int compare_swap_usage(OomdCGroupContext * const *c1, OomdCGroupCo + assert(c1); + assert(c2); + ++ if ((*c1)->avoid != (*c2)->avoid) ++ return CMP((*c1)->avoid, (*c2)->avoid); ++ + return CMP((*c2)->swap_usage, (*c1)->swap_usage); + } + +diff --git a/src/oom/test-oomd-util.c b/src/oom/test-oomd-util.c +index a1fe78806a1..193edee0eba 100644 +--- a/src/oom/test-oomd-util.c ++++ b/src/oom/test-oomd-util.c +@@ -89,6 +89,8 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) { + _cleanup_(oomd_cgroup_context_freep) OomdCGroupContext *ctx = NULL; + _cleanup_free_ char *cgroup = NULL; + OomdCGroupContext *c1, *c2; ++ bool test_xattrs; ++ int r; + + if (geteuid() != 0) + return (void) log_tests_skipped("not root"); +@@ -101,6 +103,16 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) { + + assert_se(cg_pid_get_path(NULL, 0, &cgroup) >= 0); + ++ /* If we don't have permissions to set xattrs we're likely in a userns or missing capabilities ++ * so skip the xattr portions of the test. 
*/ ++ r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_test", "1", 1, 0); ++ test_xattrs = !ERRNO_IS_PRIVILEGE(r) && !ERRNO_IS_NOT_SUPPORTED(r); ++ ++ if (test_xattrs) { ++ assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_omit", "1", 1, 0) >= 0); ++ assert_se(cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup, "user.oomd_avoid", "1", 1, 0) >= 0); ++ } ++ + assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0); + + assert_se(streq(ctx->path, cgroup)); +@@ -110,12 +122,21 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) { + assert_se(ctx->swap_usage == 0); + assert_se(ctx->last_pgscan == 0); + assert_se(ctx->pgscan == 0); ++ if (test_xattrs) { ++ assert_se(ctx->omit == true); ++ assert_se(ctx->avoid == true); ++ } else { ++ assert_se(ctx->omit == false); ++ assert_se(ctx->avoid == false); ++ } + ctx = oomd_cgroup_context_free(ctx); + + /* Test the root cgroup */ + assert_se(oomd_cgroup_context_acquire("", &ctx) == 0); + assert_se(streq(ctx->path, "/")); + assert_se(ctx->current_memory_usage > 0); ++ assert_se(ctx->omit == false); ++ assert_se(ctx->avoid == false); + + /* Test hashmap inserts */ + assert_se(h1 = hashmap_new(&oomd_cgroup_ctx_hash_ops)); +@@ -137,6 +158,15 @@ static void test_oomd_cgroup_context_acquire_and_insert(void) { + assert_se(c2->last_pgscan == 5555); + assert_se(c2->mem_pressure_limit == 6789); + assert_se(c2->last_hit_mem_pressure_limit == 42); ++ ++ /* Assert that avoid/omit are not set if the cgroup is not owned by root */ ++ if (test_xattrs) { ++ ctx = oomd_cgroup_context_free(ctx); ++ assert_se(cg_set_access(SYSTEMD_CGROUP_CONTROLLER, cgroup, 65534, 65534) >= 0); ++ assert_se(oomd_cgroup_context_acquire(cgroup, &ctx) == 0); ++ assert_se(ctx->omit == false); ++ assert_se(ctx->avoid == false); ++ } + } + + static void test_oomd_system_context_acquire(void) { +@@ -287,9 +317,11 @@ static void test_oomd_sort_cgroups(void) { + char **paths = STRV_MAKE("/herp.slice", + "/herp.slice/derp.scope", + 
"/herp.slice/derp.scope/sheep.service", +- "/zupa.slice"); ++ "/zupa.slice", ++ "/omitted.slice", ++ "/avoid.slice"); + +- OomdCGroupContext ctx[4] = { ++ OomdCGroupContext ctx[6] = { + { .path = paths[0], + .swap_usage = 20, + .pgscan = 60, +@@ -306,6 +338,14 @@ static void test_oomd_sort_cgroups(void) { + .swap_usage = 10, + .pgscan = 80, + .current_memory_usage = 10 }, ++ { .path = paths[4], ++ .swap_usage = 90, ++ .pgscan = 100, ++ .omit = true }, ++ { .path = paths[5], ++ .swap_usage = 99, ++ .pgscan = 200, ++ .avoid = true }, + }; + + assert_se(h = hashmap_new(&string_hash_ops)); +@@ -314,19 +354,23 @@ static void test_oomd_sort_cgroups(void) { + assert_se(hashmap_put(h, "/herp.slice/derp.scope", &ctx[1]) >= 0); + assert_se(hashmap_put(h, "/herp.slice/derp.scope/sheep.service", &ctx[2]) >= 0); + assert_se(hashmap_put(h, "/zupa.slice", &ctx[3]) >= 0); ++ assert_se(hashmap_put(h, "/omitted.slice", &ctx[4]) >= 0); ++ assert_se(hashmap_put(h, "/avoid.slice", &ctx[5]) >= 0); + +- assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 4); ++ assert_se(oomd_sort_cgroup_contexts(h, compare_swap_usage, NULL, &sorted_cgroups) == 5); + assert_se(sorted_cgroups[0] == &ctx[1]); + assert_se(sorted_cgroups[1] == &ctx[2]); + assert_se(sorted_cgroups[2] == &ctx[0]); + assert_se(sorted_cgroups[3] == &ctx[3]); ++ assert_se(sorted_cgroups[4] == &ctx[5]); + sorted_cgroups = mfree(sorted_cgroups); + +- assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, NULL, &sorted_cgroups) == 4); ++ assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, NULL, &sorted_cgroups) == 5); + assert_se(sorted_cgroups[0] == &ctx[3]); + assert_se(sorted_cgroups[1] == &ctx[0]); + assert_se(sorted_cgroups[2] == &ctx[2]); + assert_se(sorted_cgroups[3] == &ctx[1]); ++ assert_se(sorted_cgroups[4] == &ctx[5]); + sorted_cgroups = mfree(sorted_cgroups); + + assert_se(oomd_sort_cgroup_contexts(h, compare_pgscan_and_memory_usage, 
"/herp.slice/derp.scope", &sorted_cgroups) == 2); +@@ -334,6 +378,8 @@ static void test_oomd_sort_cgroups(void) { + assert_se(sorted_cgroups[1] == &ctx[1]); + assert_se(sorted_cgroups[2] == 0); + assert_se(sorted_cgroups[3] == 0); ++ assert_se(sorted_cgroups[4] == 0); ++ assert_se(sorted_cgroups[5] == 0); + sorted_cgroups = mfree(sorted_cgroups); + } + +diff --git a/test/test-functions b/test/test-functions +index df6022982c2..6996cd74752 100644 +--- a/test/test-functions ++++ b/test/test-functions +@@ -124,6 +124,7 @@ BASICTOOLS=( + rmdir + sed + seq ++ setfattr + setfont + setsid + sfdisk +diff --git a/test/units/testsuite-56-testmunch.service b/test/units/testsuite-56-testmunch.service +new file mode 100644 +index 00000000000..b4b925a7af0 +--- /dev/null ++++ b/test/units/testsuite-56-testmunch.service +@@ -0,0 +1,7 @@ ++[Unit] ++Description=Create some memory pressure ++ ++[Service] ++MemoryHigh=2M ++Slice=testsuite-56-workload.slice ++ExecStart=/usr/lib/systemd/tests/testdata/units/testsuite-56-slowgrowth.sh +diff --git a/test/units/testsuite-56.sh b/test/units/testsuite-56.sh +index 8b01fe37ed4..88c185b8869 100755 +--- a/test/units/testsuite-56.sh ++++ b/test/units/testsuite-56.sh +@@ -23,20 +23,43 @@ oomctl | grep "/testsuite-56-workload.slice" + oomctl | grep "1.00%" + oomctl | grep "Default Memory Pressure Duration: 5s" + +-# systemd-oomd watches for elevated pressure for 30 seconds before acting. +-# It can take time to build up pressure so either wait 5 minutes or for the service to fail. +-timeout=$(date -ud "5 minutes" +%s) ++# systemd-oomd watches for elevated pressure for 5 seconds before acting. ++# It can take time to build up pressure so either wait 2 minutes or for the service to fail. ++timeout=$(date -ud "2 minutes" +%s) + while [[ $(date -u +%s) -le $timeout ]]; do + if ! 
systemctl status testsuite-56-testbloat.service; then + break + fi +- sleep 15 ++ sleep 5 + done + + # testbloat should be killed and testchill should be fine + if systemctl status testsuite-56-testbloat.service; then exit 42; fi + if ! systemctl status testsuite-56-testchill.service; then exit 24; fi + ++# only run this portion of the test if we can set xattrs ++if setfattr -n user.xattr_test -v 1 /sys/fs/cgroup/; then ++ sleep 120 # wait for systemd-oomd kill cool down and elevated memory pressure to come down ++ ++ systemctl start testsuite-56-testchill.service ++ systemctl start testsuite-56-testmunch.service ++ systemctl start testsuite-56-testbloat.service ++ setfattr -n user.oomd_avoid -v 1 /sys/fs/cgroup/testsuite.slice/testsuite-56.slice/testsuite-56-workload.slice/testsuite-56-testbloat.service ++ ++ timeout=$(date -ud "2 minutes" +%s) ++ while [[ $(date -u +%s) -le $timeout ]]; do ++ if ! systemctl status testsuite-56-testmunch.service; then ++ break ++ fi ++ sleep 5 ++ done ++ ++ # testmunch should be killed since testbloat had the avoid xattr on it ++ if ! systemctl status testsuite-56-testbloat.service; then exit 25; fi ++ if systemctl status testsuite-56-testmunch.service; then exit 43; fi ++ if ! 
systemctl status testsuite-56-testchill.service; then exit 24; fi ++fi ++ + systemd-analyze log-level info + + echo OK > /testok + +From d87ecfecdb6fb77097f843888e2a05945b6b396b Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Thu, 28 Jan 2021 02:31:44 -0800 +Subject: [PATCH 6/7] oom: add unit file settings for oomd avoid/omit xattrs + +--- + docs/TRANSIENT-SETTINGS.md | 1 + + src/core/cgroup.c | 58 ++++++++++++++++++--- + src/core/cgroup.h | 15 ++++++ + src/core/dbus-cgroup.c | 22 ++++++++ + src/core/execute.c | 4 ++ + src/core/load-fragment-gperf.gperf.m4 | 1 + + src/core/load-fragment.c | 1 + + src/core/load-fragment.h | 1 + + src/shared/bus-unit-util.c | 3 +- + src/test/test-tables.c | 1 + + test/fuzz/fuzz-unit-file/directives.service | 4 ++ + test/units/testsuite-56.sh | 8 ++- + 12 files changed, 109 insertions(+), 10 deletions(-) + +diff --git a/docs/TRANSIENT-SETTINGS.md b/docs/TRANSIENT-SETTINGS.md +index 50370602543..9f69a3162a0 100644 +--- a/docs/TRANSIENT-SETTINGS.md ++++ b/docs/TRANSIENT-SETTINGS.md +@@ -273,6 +273,7 @@ All cgroup/resource control settings are available for transient units + ✓ ManagedOOMSwap= + ✓ ManagedOOMMemoryPressure= + ✓ ManagedOOMMemoryPressureLimit= ++✓ ManagedOOMPreference= + ``` + + ## Process Killing Settings +diff --git a/src/core/cgroup.c b/src/core/cgroup.c +index 70282a7abda..833b434b555 100644 +--- a/src/core/cgroup.c ++++ b/src/core/cgroup.c +@@ -131,6 +131,7 @@ void cgroup_context_init(CGroupContext *c) { + + .moom_swap = MANAGED_OOM_AUTO, + .moom_mem_pressure = MANAGED_OOM_AUTO, ++ .moom_preference = MANAGED_OOM_PREFERENCE_NONE, + }; + } + +@@ -417,7 +418,8 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { + "%sDelegate: %s\n" + "%sManagedOOMSwap: %s\n" + "%sManagedOOMMemoryPressure: %s\n" +- "%sManagedOOMMemoryPressureLimit: %" PRIu32 ".%02" PRIu32 "%%\n", ++ "%sManagedOOMMemoryPressureLimit: %" PRIu32 ".%02" PRIu32 "%%\n" ++ "%sManagedOOMPreference: %s%%\n", + prefix, yes_no(c->cpu_accounting), + 
prefix, yes_no(c->io_accounting), + prefix, yes_no(c->blockio_accounting), +@@ -450,7 +452,8 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { + prefix, yes_no(c->delegate), + prefix, managed_oom_mode_to_string(c->moom_swap), + prefix, managed_oom_mode_to_string(c->moom_mem_pressure), +- prefix, c->moom_mem_pressure_limit_permyriad / 100, c->moom_mem_pressure_limit_permyriad % 100); ++ prefix, c->moom_mem_pressure_limit_permyriad / 100, c->moom_mem_pressure_limit_permyriad % 100, ++ prefix, managed_oom_preference_to_string(c->moom_preference)); + + if (c->delegate) { + _cleanup_free_ char *t = NULL; +@@ -600,6 +603,35 @@ int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode) + UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_low); + UNIT_DEFINE_ANCESTOR_MEMORY_LOOKUP(memory_min); + ++void cgroup_oomd_xattr_apply(Unit *u, const char *cgroup_path) { ++ CGroupContext *c; ++ int r; ++ ++ assert(u); ++ ++ c = unit_get_cgroup_context(u); ++ if (!c) ++ return; ++ ++ r = cg_remove_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_avoid"); ++ if (r != -ENODATA) ++ log_unit_debug_errno(u, r, "Failed to remove oomd_avoid flag on control group %s, ignoring: %m", cgroup_path); ++ ++ r = cg_remove_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_omit"); ++ if (r != -ENODATA) ++ log_unit_debug_errno(u, r, "Failed to remove oomd_omit flag on control group %s, ignoring: %m", cgroup_path); ++ ++ if (c->moom_preference == MANAGED_OOM_PREFERENCE_AVOID) { ++ r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_avoid", "1", 1, 0); ++ if (r < 0) ++ log_unit_debug_errno(u, r, "Failed to set oomd_avoid flag on control group %s, ignoring: %m", cgroup_path); ++ } else if (c->moom_preference == MANAGED_OOM_PREFERENCE_OMIT) { ++ r = cg_set_xattr(SYSTEMD_CGROUP_CONTROLLER, cgroup_path, "user.oomd_omit", "1", 1, 0); ++ if (r < 0) ++ log_unit_debug_errno(u, r, "Failed to set oomd_omit flag on control group %s, ignoring: %m", 
cgroup_path); ++ } ++} ++ + static void cgroup_xattr_apply(Unit *u) { + char ids[SD_ID128_STRING_MAX]; + int r; +@@ -630,6 +662,8 @@ static void cgroup_xattr_apply(Unit *u) { + if (r != -ENODATA) + log_unit_debug_errno(u, r, "Failed to remove delegate flag on control group %s, ignoring: %m", u->cgroup_path); + } ++ ++ cgroup_oomd_xattr_apply(u, u->cgroup_path); + } + + static int lookup_block_device(const char *p, dev_t *ret) { +@@ -3737,12 +3771,6 @@ int unit_cgroup_freezer_action(Unit *u, FreezerAction action) { + return 1; + } + +-static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = { +- [CGROUP_DEVICE_POLICY_AUTO] = "auto", +- [CGROUP_DEVICE_POLICY_CLOSED] = "closed", +- [CGROUP_DEVICE_POLICY_STRICT] = "strict", +-}; +- + int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) { + _cleanup_free_ char *v = NULL; + int r; +@@ -3771,6 +3799,12 @@ int unit_get_cpuset(Unit *u, CPUSet *cpus, const char *name) { + return parse_cpu_set_full(v, cpus, false, NULL, NULL, 0, NULL); + } + ++static const char* const cgroup_device_policy_table[_CGROUP_DEVICE_POLICY_MAX] = { ++ [CGROUP_DEVICE_POLICY_AUTO] = "auto", ++ [CGROUP_DEVICE_POLICY_CLOSED] = "closed", ++ [CGROUP_DEVICE_POLICY_STRICT] = "strict", ++}; ++ + DEFINE_STRING_TABLE_LOOKUP(cgroup_device_policy, CGroupDevicePolicy); + + static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = { +@@ -3779,3 +3813,11 @@ static const char* const freezer_action_table[_FREEZER_ACTION_MAX] = { + }; + + DEFINE_STRING_TABLE_LOOKUP(freezer_action, FreezerAction); ++ ++static const char* const managed_oom_preference_table[_MANAGED_OOM_PREFERENCE_MAX] = { ++ [MANAGED_OOM_PREFERENCE_NONE] = "none", ++ [MANAGED_OOM_PREFERENCE_AVOID] = "avoid", ++ [MANAGED_OOM_PREFERENCE_OMIT] = "omit", ++}; ++ ++DEFINE_STRING_TABLE_LOOKUP(managed_oom_preference, ManagedOOMPreference); +diff --git a/src/core/cgroup.h b/src/core/cgroup.h +index 9fbfabbb7e3..7d9ab4ae6b8 100644 +--- a/src/core/cgroup.h ++++ 
b/src/core/cgroup.h +@@ -94,6 +94,15 @@ struct CGroupBlockIODeviceBandwidth { + uint64_t wbps; + }; + ++typedef enum ManagedOOMPreference { ++ MANAGED_OOM_PREFERENCE_NONE, ++ MANAGED_OOM_PREFERENCE_AVOID, ++ MANAGED_OOM_PREFERENCE_OMIT, ++ ++ _MANAGED_OOM_PREFERENCE_MAX, ++ _MANAGED_OOM_PREFERENCE_INVALID = -1 ++} ManagedOOMPreference; ++ + struct CGroupContext { + bool cpu_accounting; + bool io_accounting; +@@ -164,6 +173,7 @@ struct CGroupContext { + ManagedOOMMode moom_swap; + ManagedOOMMode moom_mem_pressure; + uint32_t moom_mem_pressure_limit_permyriad; ++ ManagedOOMPreference moom_preference; + }; + + /* Used when querying IP accounting data */ +@@ -204,6 +214,8 @@ void cgroup_context_free_blockio_device_bandwidth(CGroupContext *c, CGroupBlockI + + int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode); + ++void cgroup_oomd_xattr_apply(Unit *u, const char *cgroup_path); ++ + CGroupMask unit_get_own_mask(Unit *u); + CGroupMask unit_get_delegate_mask(Unit *u); + CGroupMask unit_get_members_mask(Unit *u); +@@ -294,3 +306,6 @@ int unit_cgroup_freezer_action(Unit *u, FreezerAction action); + + const char* freezer_action_to_string(FreezerAction a) _const_; + FreezerAction freezer_action_from_string(const char *s) _pure_; ++ ++const char* managed_oom_preference_to_string(ManagedOOMPreference a) _const_; ++ManagedOOMPreference managed_oom_preference_from_string(const char *s) _pure_; +diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c +index 6f309feb236..0b2d945283e 100644 +--- a/src/core/dbus-cgroup.c ++++ b/src/core/dbus-cgroup.c +@@ -21,6 +21,7 @@ BUS_DEFINE_PROPERTY_GET(bus_property_get_tasks_max, "t", TasksMax, tasks_max_res + + static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_cgroup_device_policy, cgroup_device_policy, CGroupDevicePolicy); + static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_mode, managed_oom_mode, ManagedOOMMode); ++static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_managed_oom_preference, 
managed_oom_preference, ManagedOOMPreference); + + static int property_get_cgroup_mask( + sd_bus *bus, +@@ -395,6 +396,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = { + SD_BUS_PROPERTY("ManagedOOMSwap", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_swap), 0), + SD_BUS_PROPERTY("ManagedOOMMemoryPressure", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_mem_pressure), 0), + SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimitPermyriad", "u", NULL, offsetof(CGroupContext, moom_mem_pressure_limit_permyriad), 0), ++ SD_BUS_PROPERTY("ManagedOOMPreference", "s", property_get_managed_oom_preference, offsetof(CGroupContext, moom_preference), 0), + SD_BUS_VTABLE_END + }; + +@@ -1720,6 +1722,26 @@ int bus_cgroup_set_property( + return 1; + } + ++ if (streq(name, "ManagedOOMPreference")) { ++ ManagedOOMPreference p; ++ const char *pref; ++ ++ r = sd_bus_message_read(message, "s", &pref); ++ if (r < 0) ++ return r; ++ ++ p = managed_oom_preference_from_string(pref); ++ if (p < 0) ++ return -EINVAL; ++ ++ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { ++ c->moom_preference = p; ++ unit_write_settingf(u, flags, name, "ManagedOOMPreference=%s", pref); ++ } ++ ++ return 1; ++ } ++ + if (streq(name, "DisableControllers") || (u->transient && u->load_state == UNIT_STUB)) + return bus_cgroup_set_transient_property(u, c, name, message, flags, error); + +diff --git a/src/core/execute.c b/src/core/execute.c +index b7d78f2197e..0368582884c 100644 +--- a/src/core/execute.c ++++ b/src/core/execute.c +@@ -4701,6 +4701,10 @@ int exec_spawn(Unit *unit, + r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path); + if (r < 0) + return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path); ++ ++ /* Normally we would not propagate the oomd xattrs to children but since we created this ++ * sub-cgroup internally we should do it.
*/ ++ cgroup_oomd_xattr_apply(unit, subcgroup_path); + } + } + +diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 +index 81f4561a572..dbcbe645934 100644 +--- a/src/core/load-fragment-gperf.gperf.m4 ++++ b/src/core/load-fragment-gperf.gperf.m4 +@@ -230,6 +230,7 @@ $1.IPEgressFilterPath, config_parse_ip_filter_bpf_progs, + $1.ManagedOOMSwap, config_parse_managed_oom_mode, 0, offsetof($1, cgroup_context.moom_swap) + $1.ManagedOOMMemoryPressure, config_parse_managed_oom_mode, 0, offsetof($1, cgroup_context.moom_mem_pressure) + $1.ManagedOOMMemoryPressureLimit, config_parse_managed_oom_mem_pressure_limit, 0, offsetof($1, cgroup_context.moom_mem_pressure_limit_permyriad) ++$1.ManagedOOMPreference, config_parse_managed_oom_preference, 0, offsetof($1, cgroup_context.moom_preference) + $1.NetClass, config_parse_warn_compat, DISABLED_LEGACY, 0' + )m4_dnl + Unit.Description, config_parse_unit_string_printf, 0, offsetof(Unit, description) +diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c +index 06b71aaf157..c6b017556f9 100644 +--- a/src/core/load-fragment.c ++++ b/src/core/load-fragment.c +@@ -133,6 +133,7 @@ DEFINE_CONFIG_PARSE_ENUM(config_parse_service_restart, service_restart, ServiceR + DEFINE_CONFIG_PARSE_ENUM(config_parse_service_timeout_failure_mode, service_timeout_failure_mode, ServiceTimeoutFailureMode, "Failed to parse timeout failure mode"); + DEFINE_CONFIG_PARSE_ENUM(config_parse_socket_bind, socket_address_bind_ipv6_only_or_bool, SocketAddressBindIPv6Only, "Failed to parse bind IPv6 only value"); + DEFINE_CONFIG_PARSE_ENUM(config_parse_oom_policy, oom_policy, OOMPolicy, "Failed to parse OOM policy"); ++DEFINE_CONFIG_PARSE_ENUM(config_parse_managed_oom_preference, managed_oom_preference, ManagedOOMPreference, "Failed to parse ManagedOOMPreference="); + DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_ip_tos, ip_tos, int, -1, "Failed to parse IP TOS value"); + 
DEFINE_CONFIG_PARSE_PTR(config_parse_blockio_weight, cg_blkio_weight_parse, uint64_t, "Invalid block IO weight"); + DEFINE_CONFIG_PARSE_PTR(config_parse_cg_weight, cg_weight_parse, uint64_t, "Invalid weight"); +diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h +index 6b2175cd2af..e4a5cb79869 100644 +--- a/src/core/load-fragment.h ++++ b/src/core/load-fragment.h +@@ -78,6 +78,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_tasks_max); + CONFIG_PARSER_PROTOTYPE(config_parse_delegate); + CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mode); + CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_mem_pressure_limit); ++CONFIG_PARSER_PROTOTYPE(config_parse_managed_oom_preference); + CONFIG_PARSER_PROTOTYPE(config_parse_device_policy); + CONFIG_PARSER_PROTOTYPE(config_parse_device_allow); + CONFIG_PARSER_PROTOTYPE(config_parse_io_device_latency); +diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c +index 84f57d94d23..5bbaa07dd1c 100644 +--- a/src/shared/bus-unit-util.c ++++ b/src/shared/bus-unit-util.c +@@ -435,7 +435,8 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons + if (STR_IN_SET(field, "DevicePolicy", + "Slice", + "ManagedOOMSwap", +- "ManagedOOMMemoryPressure")) ++ "ManagedOOMMemoryPressure", ++ "ManagedOOMPreference")) + return bus_append_string(m, field, eq); + + if (STR_IN_SET(field, "ManagedOOMMemoryPressureLimit")) { +diff --git a/src/test/test-tables.c b/src/test/test-tables.c +index 641cadec858..cc93bbbc749 100644 +--- a/src/test/test-tables.c ++++ b/src/test/test-tables.c +@@ -73,6 +73,7 @@ int main(int argc, char **argv) { + test_table(log_target, LOG_TARGET); + test_table(mac_address_policy, MAC_ADDRESS_POLICY); + test_table(managed_oom_mode, MANAGED_OOM_MODE); ++ test_table(managed_oom_preference, MANAGED_OOM_PREFERENCE); + test_table(manager_state, MANAGER_STATE); + test_table(manager_timestamp, MANAGER_TIMESTAMP); + test_table(mount_exec_command, MOUNT_EXEC_COMMAND); +diff --git 
a/test/fuzz/fuzz-unit-file/directives.service b/test/fuzz/fuzz-unit-file/directives.service +index 15fa556dd64..0c7ded6786a 100644 +--- a/test/fuzz/fuzz-unit-file/directives.service ++++ b/test/fuzz/fuzz-unit-file/directives.service +@@ -138,6 +138,10 @@ MakeDirectory= + Mark= + MaxConnections= + MaxConnectionsPerSource= ++ManagedOOMSwap= ++ManagedOOMMemoryPressure= ++ManagedOOMMemoryPressureLimitPercent= ++ManagedOOMPreference= + MemoryAccounting= + MemoryHigh= + MemoryLimit= +diff --git a/test/units/testsuite-56.sh b/test/units/testsuite-56.sh +index 88c185b8869..1884f814689 100755 +--- a/test/units/testsuite-56.sh ++++ b/test/units/testsuite-56.sh +@@ -13,6 +13,8 @@ if [[ "$cgroup_type" != *"cgroup2"* ]] && [[ "$cgroup_type" != *"0x63677270"* ]] + fi + [[ -e /skipped ]] && exit 0 || true + ++rm -rf /etc/systemd/system/testsuite-56-testbloat.service.d ++ + echo "DefaultMemoryPressureDurationSec=5s" >> /etc/systemd/oomd.conf + + systemctl start testsuite-56-testchill.service +@@ -41,10 +43,14 @@ if ! 
systemctl status testsuite-56-testchill.service; then exit 24; fi + if setfattr -n user.xattr_test -v 1 /sys/fs/cgroup/; then + sleep 120 # wait for systemd-oomd kill cool down and elevated memory pressure to come down + ++ mkdir -p /etc/systemd/system/testsuite-56-testbloat.service.d/ ++ echo "[Service]" > /etc/systemd/system/testsuite-56-testbloat.service.d/override.conf ++ echo "ManagedOOMPreference=avoid" >> /etc/systemd/system/testsuite-56-testbloat.service.d/override.conf ++ ++ systemctl daemon-reload + systemctl start testsuite-56-testchill.service + systemctl start testsuite-56-testmunch.service + systemctl start testsuite-56-testbloat.service +- setfattr -n user.oomd_avoid -v 1 /sys/fs/cgroup/testsuite.slice/testsuite-56.slice/testsuite-56-workload.slice/testsuite-56-testbloat.service + + timeout=$(date -ud "2 minutes" +%s) + while [[ $(date -u +%s) -le $timeout ]]; do + +From 32d695eccfeef00023992cdf20bf39f9d0288c67 Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Thu, 28 Jan 2021 17:35:17 -0800 +Subject: [PATCH 7/7] man: document ManagedOOMPreference= + +--- + man/org.freedesktop.systemd1.xml | 36 ++++++++++++++++++++++++++++++++ + man/systemd.resource-control.xml | 32 ++++++++++++++++++++++++++++ + 2 files changed, 68 insertions(+) + +diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml +index 7543a617b78..1d419ac495e 100644 +--- a/man/org.freedesktop.systemd1.xml ++++ b/man/org.freedesktop.systemd1.xml +@@ -2450,6 +2450,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; ++ @org.freedesktop.DBus.Property.EmitsChangedSignal("false") ++ readonly s ManagedOOMPreference = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly as Environment = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ 
-2974,6 +2976,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + ++ ++ + + + +@@ -3538,6 +3542,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + ++ ++ + + + +@@ -4204,6 +4210,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; ++ @org.freedesktop.DBus.Property.EmitsChangedSignal("false") ++ readonly s ManagedOOMPreference = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly as Environment = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ -4756,6 +4764,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + ++ ++ + + + +@@ -5318,6 +5328,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + ++ ++ + + + +@@ -5897,6 +5909,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; ++ @org.freedesktop.DBus.Property.EmitsChangedSignal("false") ++ readonly s ManagedOOMPreference = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly as Environment = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ -6377,6 +6391,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + ++ ++ + + + +@@ -6857,6 +6873,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + ++ ++ + + + +@@ -7557,6 +7575,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; ++ @org.freedesktop.DBus.Property.EmitsChangedSignal("false") ++ readonly s ManagedOOMPreference = '...'; + 
@org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly as Environment = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ -8023,6 +8043,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + + ++ ++ + + + +@@ -8489,6 +8511,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + + ++ ++ + + + +@@ -9042,6 +9066,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; ++ @org.freedesktop.DBus.Property.EmitsChangedSignal("false") ++ readonly s ManagedOOMPreference = '...'; + }; + interface org.freedesktop.DBus.Peer { ... }; + interface org.freedesktop.DBus.Introspectable { ... }; +@@ -9178,6 +9204,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + ++ ++ + + + +@@ -9318,6 +9346,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + ++ ++ + + + +@@ -9477,6 +9507,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; ++ @org.freedesktop.DBus.Property.EmitsChangedSignal("false") ++ readonly s ManagedOOMPreference = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s KillMode = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ -9629,6 +9661,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + ++ ++ + + + +@@ -9795,6 +9829,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + ++ ++ + + + +diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml +index be9c35057db..13ff7e9a740 100644 +--- a/man/systemd.resource-control.xml ++++ b/man/systemd.resource-control.xml +@@ -913,6 +913,38 @@ DeviceAllow=/dev/loop-control + + + ++ ++ ++ 
ManagedOOMPreference=none|avoid|omit ++ ++ ++ Allows deprioritizing or omitting this unit's cgroup as a candidate when systemd-oomd ++ needs to act. Requires support for extended attributes (see ++ xattr(7)) ++ in order to use avoid or omit. Additionally, systemd-oomd ++ will ignore these extended attributes if the unit's cgroup is not owned by the root user and group. ++ ++ If this property is set to avoid, the service manager will set the ++ "user.oomd_avoid" extended attribute on the unit's cgroup to "1". If systemd-oomd sees ++ this extended attribute on a cgroup set to "1" when choosing between candidates, it will only select the ++ cgroup with "user.oomd_avoid" if there are no other viable candidates. ++ ++ If this property is set to omit, the service manager will set the "user.oomd_omit" ++ extended attribute on the unit's cgroup to "1". If systemd-oomd sees this extended ++ attribute on the cgroup set to "1", it will ignore the cgroup as a candidate and will not perform any actions ++ on the cgroup. ++ ++ It is recommended to use avoid and omit sparingly as it can adversely ++ affect systemd-oomd's kill behavior. Also note that these extended attributes are not ++ applied recursively to cgroups under this unit's cgroup. ++ ++ Defaults to none which means no extended attributes will be set and systemd-oomd will ++ sort this unit's cgroup as defined in ++ systemd-oomd.service(8) ++ and oomd.conf(5) (if this ++ unit's cgroup becomes a candidate). 
++ ++ + + + diff --git a/18444.patch b/18444.patch new file mode 100644 index 0000000..7b1b066 --- /dev/null +++ b/18444.patch @@ -0,0 +1,987 @@ +From a9b1927c15fce3c9945ac249d8e8ddc42028a057 Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Tue, 2 Feb 2021 01:47:08 -0800 +Subject: [PATCH 1/2] parse-util: add permyriad parsing + +--- + src/basic/parse-util.c | 137 ++++++++++++++++++++++++++----------- + src/basic/parse-util.h | 3 + + src/test/test-parse-util.c | 68 ++++++++++++++++++ + 3 files changed, 169 insertions(+), 39 deletions(-) + +diff --git a/src/basic/parse-util.c b/src/basic/parse-util.c +index 5d4dafe3a5..a0fb2c9d17 100644 +--- a/src/basic/parse-util.c ++++ b/src/basic/parse-util.c +@@ -671,11 +671,11 @@ int parse_fractional_part_u(const char **p, size_t digits, unsigned *res) { + return 0; + } + +-int parse_percent_unbounded(const char *p) { ++static int parse_parts_value_whole(const char *p, const char *symbol) { + const char *pc, *n; + int r, v; + +- pc = endswith(p, "%"); ++ pc = endswith(p, symbol); + if (!pc) + return -EINVAL; + +@@ -689,6 +689,74 @@ int parse_percent_unbounded(const char *p) { + return v; + } + ++static int parse_parts_value_with_tenths_place(const char *p, const char *symbol) { ++ const char *pc, *dot, *n; ++ int r, q, v; ++ ++ pc = endswith(p, symbol); ++ if (!pc) ++ return -EINVAL; ++ ++ dot = memchr(p, '.', pc - p); ++ if (dot) { ++ if (dot + 2 != pc) ++ return -EINVAL; ++ if (dot[1] < '0' || dot[1] > '9') ++ return -EINVAL; ++ q = dot[1] - '0'; ++ n = strndupa(p, dot - p); ++ } else { ++ q = 0; ++ n = strndupa(p, pc - p); ++ } ++ r = safe_atoi(n, &v); ++ if (r < 0) ++ return r; ++ if (v < 0) ++ return -ERANGE; ++ if (v > (INT_MAX - q) / 10) ++ return -ERANGE; ++ ++ v = v * 10 + q; ++ return v; ++} ++ ++static int parse_parts_value_with_hundredths_place(const char *p, const char *symbol) { ++ const char *pc, *dot, *n; ++ int r, q, v; ++ ++ pc = endswith(p, symbol); ++ if (!pc) ++ return -EINVAL; ++ ++ dot = memchr(p, '.', 
pc - p); ++ if (dot) { ++ if (dot + 3 != pc) ++ return -EINVAL; ++ if (dot[1] < '0' || dot[1] > '9' || dot[2] < '0' || dot[2] > '9') ++ return -EINVAL; ++ q = (dot[1] - '0') * 10 + (dot[2] - '0'); ++ n = strndupa(p, dot - p); ++ } else { ++ q = 0; ++ n = strndupa(p, pc - p); ++ } ++ r = safe_atoi(n, &v); ++ if (r < 0) ++ return r; ++ if (v < 0) ++ return -ERANGE; ++ if (v > (INT_MAX - q) / 100) ++ return -ERANGE; ++ ++ v = v * 100 + q; ++ return v; ++} ++ ++int parse_percent_unbounded(const char *p) { ++ return parse_parts_value_whole(p, "%"); ++} ++ + int parse_percent(const char *p) { + int v; + +@@ -700,46 +768,13 @@ int parse_percent(const char *p) { + } + + int parse_permille_unbounded(const char *p) { +- const char *pc, *pm, *dot, *n; +- int r, q, v; ++ const char *pm; + + pm = endswith(p, "‰"); +- if (pm) { +- n = strndupa(p, pm - p); +- r = safe_atoi(n, &v); +- if (r < 0) +- return r; +- if (v < 0) +- return -ERANGE; +- } else { +- pc = endswith(p, "%"); +- if (!pc) +- return -EINVAL; +- +- dot = memchr(p, '.', pc - p); +- if (dot) { +- if (dot + 2 != pc) +- return -EINVAL; +- if (dot[1] < '0' || dot[1] > '9') +- return -EINVAL; +- q = dot[1] - '0'; +- n = strndupa(p, dot - p); +- } else { +- q = 0; +- n = strndupa(p, pc - p); +- } +- r = safe_atoi(n, &v); +- if (r < 0) +- return r; +- if (v < 0) +- return -ERANGE; +- if (v > (INT_MAX - q) / 10) +- return -ERANGE; ++ if (pm) ++ return parse_parts_value_whole(p, "‰"); + +- v = v * 10 + q; +- } +- +- return v; ++ return parse_parts_value_with_tenths_place(p, "%"); + } + + int parse_permille(const char *p) { +@@ -752,6 +787,30 @@ int parse_permille(const char *p) { + return v; + } + ++int parse_permyriad_unbounded(const char *p) { ++ const char *pm; ++ ++ pm = endswith(p, "‱"); ++ if (pm) ++ return parse_parts_value_whole(p, "‱"); ++ ++ pm = endswith(p, "‰"); ++ if (pm) ++ return parse_parts_value_with_tenths_place(p, "‰"); ++ ++ return parse_parts_value_with_hundredths_place(p, "%"); ++} ++ ++int 
parse_permyriad(const char *p) { ++ int v; ++ ++ v = parse_permyriad_unbounded(p); ++ if (v > 10000) ++ return -ERANGE; ++ ++ return v; ++} ++ + int parse_nice(const char *p, int *ret) { + int n, r; + +diff --git a/src/basic/parse-util.h b/src/basic/parse-util.h +index 81478ed059..3e29291f26 100644 +--- a/src/basic/parse-util.h ++++ b/src/basic/parse-util.h +@@ -136,6 +136,9 @@ int parse_percent(const char *p); + int parse_permille_unbounded(const char *p); + int parse_permille(const char *p); + ++int parse_permyriad_unbounded(const char *p); ++int parse_permyriad(const char *p); ++ + int parse_nice(const char *p, int *ret); + + int parse_ip_port(const char *s, uint16_t *ret); +diff --git a/src/test/test-parse-util.c b/src/test/test-parse-util.c +index 1c969091ef..6e23efe134 100644 +--- a/src/test/test-parse-util.c ++++ b/src/test/test-parse-util.c +@@ -790,6 +790,72 @@ static void test_parse_permille_unbounded(void) { + assert_se(parse_permille_unbounded("429496729.6%") == -ERANGE); + } + ++static void test_parse_permyriad(void) { ++ assert_se(parse_permyriad("") == -EINVAL); ++ assert_se(parse_permyriad("foo") == -EINVAL); ++ assert_se(parse_permyriad("0") == -EINVAL); ++ assert_se(parse_permyriad("50") == -EINVAL); ++ assert_se(parse_permyriad("100") == -EINVAL); ++ assert_se(parse_permyriad("-1") == -EINVAL); ++ ++ assert_se(parse_permyriad("0‱") == 0); ++ assert_se(parse_permyriad("555‱") == 555); ++ assert_se(parse_permyriad("1000‱") == 1000); ++ assert_se(parse_permyriad("-7‱") == -ERANGE); ++ assert_se(parse_permyriad("10007‱") == -ERANGE); ++ assert_se(parse_permyriad("‱") == -EINVAL); ++ assert_se(parse_permyriad("‱‱") == -EINVAL); ++ assert_se(parse_permyriad("‱1") == -EINVAL); ++ assert_se(parse_permyriad("1‱‱") == -EINVAL); ++ assert_se(parse_permyriad("3.2‱") == -EINVAL); ++ ++ assert_se(parse_permyriad("0‰") == 0); ++ assert_se(parse_permyriad("555.5‰") == 5555); ++ assert_se(parse_permyriad("1000.0‰") == 10000); ++ assert_se(parse_permyriad("-7‰") 
== -ERANGE); ++ assert_se(parse_permyriad("1007‰") == -ERANGE); ++ assert_se(parse_permyriad("‰") == -EINVAL); ++ assert_se(parse_permyriad("‰‰") == -EINVAL); ++ assert_se(parse_permyriad("‰1") == -EINVAL); ++ assert_se(parse_permyriad("1‰‰") == -EINVAL); ++ assert_se(parse_permyriad("3.22‰") == -EINVAL); ++ ++ assert_se(parse_permyriad("0%") == 0); ++ assert_se(parse_permyriad("55%") == 5500); ++ assert_se(parse_permyriad("55.53%") == 5553); ++ assert_se(parse_permyriad("100%") == 10000); ++ assert_se(parse_permyriad("-7%") == -ERANGE); ++ assert_se(parse_permyriad("107%") == -ERANGE); ++ assert_se(parse_permyriad("%") == -EINVAL); ++ assert_se(parse_permyriad("%%") == -EINVAL); ++ assert_se(parse_permyriad("%1") == -EINVAL); ++ assert_se(parse_permyriad("1%%") == -EINVAL); ++ assert_se(parse_permyriad("3.212%") == -EINVAL); ++} ++ ++static void test_parse_permyriad_unbounded(void) { ++ assert_se(parse_permyriad_unbounded("1001‱") == 1001); ++ assert_se(parse_permyriad_unbounded("4000‱") == 4000); ++ assert_se(parse_permyriad_unbounded("2147483647‱") == 2147483647); ++ assert_se(parse_permyriad_unbounded("2147483648‱") == -ERANGE); ++ assert_se(parse_permyriad_unbounded("4294967295‱") == -ERANGE); ++ assert_se(parse_permyriad_unbounded("4294967296‱") == -ERANGE); ++ ++ assert_se(parse_permyriad_unbounded("101‰") == 1010); ++ assert_se(parse_permyriad_unbounded("400‰") == 4000); ++ assert_se(parse_permyriad_unbounded("214748364.7‰") == 2147483647); ++ assert_se(parse_permyriad_unbounded("214748364.8‰") == -ERANGE); ++ assert_se(parse_permyriad_unbounded("429496729.5‰") == -ERANGE); ++ assert_se(parse_permyriad_unbounded("429496729.6‰") == -ERANGE); ++ ++ assert_se(parse_permyriad_unbounded("99%") == 9900); ++ assert_se(parse_permyriad_unbounded("40%") == 4000); ++ assert_se(parse_permyriad_unbounded("21474836.47%") == 2147483647); ++ assert_se(parse_permyriad_unbounded("21474836.48%") == -ERANGE); ++ assert_se(parse_permyriad_unbounded("42949672.95%") == -ERANGE); 
++ assert_se(parse_permyriad_unbounded("42949672.96%") == -ERANGE); ++} ++ + static void test_parse_nice(void) { + int n; + +@@ -987,6 +1053,8 @@ int main(int argc, char *argv[]) { + test_parse_percent_unbounded(); + test_parse_permille(); + test_parse_permille_unbounded(); ++ test_parse_permyriad(); ++ test_parse_permyriad_unbounded(); + test_parse_nice(); + test_parse_dev(); + test_parse_errno(); +-- +2.29.2 + + +From 5fdc5d3384f81888704a0a19db3cb33bce2d8bdb Mon Sep 17 00:00:00 2001 +From: Anita Zhang +Date: Tue, 2 Feb 2021 14:16:03 -0800 +Subject: [PATCH 2/2] oom: rework *MemoryPressureLimit= properties to have + 1/10000 precision + +Requested in +https://github.com/systemd/systemd/pull/15206#discussion_r505506657, +preserve the full granularity for memory pressure limits (permyriad) +instead of capping out at percent. +--- + docs/TRANSIENT-SETTINGS.md | 2 +- + man/oomd.conf.xml | 6 ++--- + man/org.freedesktop.systemd1.xml | 36 +++++++++++++------------- + man/systemd.resource-control.xml | 2 +- + src/core/cgroup.c | 4 +-- + src/core/cgroup.h | 2 +- + src/core/core-varlink.c | 2 +- + src/core/dbus-cgroup.c | 16 +++++++++--- + src/core/dbus-util.c | 29 --------------------- + src/core/dbus-util.h | 1 - + src/core/load-fragment-gperf.gperf.m4 | 2 +- + src/core/load-fragment.c | 6 ++--- + src/oom/oomd-manager.c | 24 +++++++++++------ + src/oom/oomd-manager.h | 4 +-- + src/oom/oomd-util.c | 4 +-- + src/oom/oomd.c | 10 +++---- + src/oom/oomd.conf | 2 +- + src/shared/bus-get-properties.c | 17 ------------ + src/shared/bus-get-properties.h | 1 - + src/shared/bus-unit-util.c | 19 ++++++++++++-- + src/shared/conf-parser.c | 1 + + src/shared/conf-parser.h | 1 + + test/units/testsuite-56-workload.slice | 2 +- + test/units/testsuite-56.sh | 2 +- + 24 files changed, 91 insertions(+), 104 deletions(-) + +diff --git a/docs/TRANSIENT-SETTINGS.md b/docs/TRANSIENT-SETTINGS.md +index 50b9a42fa1..5037060254 100644 +--- a/docs/TRANSIENT-SETTINGS.md ++++ b/docs/TRANSIENT-SETTINGS.md 
+@@ -272,7 +272,7 @@ All cgroup/resource control settings are available for transient units + ✓ IPAddressDeny= + ✓ ManagedOOMSwap= + ✓ ManagedOOMMemoryPressure= +-✓ ManagedOOMMemoryPressureLimitPercent= ++✓ ManagedOOMMemoryPressureLimit= + ``` + + ## Process Killing Settings +diff --git a/man/oomd.conf.xml b/man/oomd.conf.xml +index bb5da87c54..2a12be8cad 100644 +--- a/man/oomd.conf.xml ++++ b/man/oomd.conf.xml +@@ -59,10 +59,10 @@ + + + +- DefaultMemoryPressureLimitPercent= ++ DefaultMemoryPressureLimit= + + Sets the limit for memory pressure on the unit's cgroup before systemd-oomd +- will take action. A unit can override this value with ManagedOOMMemoryPressureLimitPercent=. ++ will take action. A unit can override this value with ManagedOOMMemoryPressureLimit=. + The memory pressure for this property represents the fraction of time in a 10 second window in which all tasks + in the cgroup were delayed. For each monitored cgroup, if the memory pressure on that cgroup exceeds the + limit set for longer than the duration set by DefaultMemoryPressureDurationSec=, +@@ -78,7 +78,7 @@ + + Sets the amount of time a unit's cgroup needs to have exceeded memory pressure limits before + systemd-oomd will take action. Memory pressure limits are defined by +- DefaultMemoryPressureLimitPercent= and ManagedOOMMemoryPressureLimitPercent=. ++ DefaultMemoryPressureLimit= and ManagedOOMMemoryPressureLimit=. + Defaults to 30 seconds when this property is unset or set to 0. 
+ + +diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml +index 78fd0b3378..7809b65062 100644 +--- a/man/org.freedesktop.systemd1.xml ++++ b/man/org.freedesktop.systemd1.xml +@@ -2419,7 +2419,7 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") +- readonly s ManagedOOMMemoryPressureLimitPercent = '...'; ++ readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly as Environment = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ -2938,7 +2938,7 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + +- ++ + + + +@@ -3494,7 +3494,7 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { + + + +- ++ + + + +@@ -4146,7 +4146,7 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") +- readonly s ManagedOOMMemoryPressureLimitPercent = '...'; ++ readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly as Environment = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ -4693,7 +4693,7 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + +- ++ + + + +@@ -5251,7 +5251,7 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { + + + +- ++ + + + +@@ -5827,7 +5827,7 @@ node /org/freedesktop/systemd1/unit/home_2emount { + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") +- readonly s ManagedOOMMemoryPressureLimitPercent = '...'; ++ 
readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly as Environment = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ -6302,7 +6302,7 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + +- ++ + + + +@@ -6778,7 +6778,7 @@ node /org/freedesktop/systemd1/unit/home_2emount { + + + +- ++ + + + +@@ -7475,7 +7475,7 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") +- readonly s ManagedOOMMemoryPressureLimitPercent = '...'; ++ readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly as Environment = ['...', ...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ -7936,7 +7936,7 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + + +- ++ + + + +@@ -8398,7 +8398,7 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { + + + +- ++ + + + +@@ -8948,7 +8948,7 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") +- readonly s ManagedOOMMemoryPressureLimitPercent = '...'; ++ readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; + }; + interface org.freedesktop.DBus.Peer { ... }; + interface org.freedesktop.DBus.Introspectable { ... 
}; +@@ -9083,7 +9083,7 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + +- ++ + + + +@@ -9223,7 +9223,7 @@ node /org/freedesktop/systemd1/unit/system_2eslice { + + + +- ++ + + + +@@ -9383,7 +9383,7 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly s ManagedOOMMemoryPressure = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") +- readonly s ManagedOOMMemoryPressureLimitPercent = '...'; ++ readonly u ManagedOOMMemoryPressureLimitPermyriad = ...; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") + readonly s KillMode = '...'; + @org.freedesktop.DBus.Property.EmitsChangedSignal("const") +@@ -9534,7 +9534,7 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + +- ++ + + + +@@ -9700,7 +9700,7 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { + + + +- ++ + + + +diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml +index 26dedda3fd..4381c4e1b7 100644 +--- a/man/systemd.resource-control.xml ++++ b/man/systemd.resource-control.xml +@@ -901,7 +901,7 @@ DeviceAllow=/dev/loop-control + + + +- ManagedOOMMemoryPressureLimitPercent= ++ ManagedOOMMemoryPressureLimit= + + + Overrides the default memory pressure limit set by +diff --git a/src/core/cgroup.c b/src/core/cgroup.c +index 7dc6c20bb7..e2ed0e546e 100644 +--- a/src/core/cgroup.c ++++ b/src/core/cgroup.c +@@ -417,7 +417,7 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { + "%sDelegate: %s\n" + "%sManagedOOMSwap: %s\n" + "%sManagedOOMMemoryPressure: %s\n" +- "%sManagedOOMMemoryPressureLimitPercent: %d%%\n", ++ "%sManagedOOMMemoryPressureLimit: %" PRIu32 ".%02" PRIu32 "%%\n", + prefix, yes_no(c->cpu_accounting), + prefix, yes_no(c->io_accounting), + prefix, yes_no(c->blockio_accounting), +@@ -450,7 +450,7 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { + prefix, yes_no(c->delegate), + prefix, 
managed_oom_mode_to_string(c->moom_swap), + prefix, managed_oom_mode_to_string(c->moom_mem_pressure), +- prefix, c->moom_mem_pressure_limit); ++ prefix, c->moom_mem_pressure_limit_permyriad / 100, c->moom_mem_pressure_limit_permyriad % 100); + + if (c->delegate) { + _cleanup_free_ char *t = NULL; +diff --git a/src/core/cgroup.h b/src/core/cgroup.h +index 66f3a63b82..9fbfabbb7e 100644 +--- a/src/core/cgroup.h ++++ b/src/core/cgroup.h +@@ -163,7 +163,7 @@ struct CGroupContext { + /* Settings for systemd-oomd */ + ManagedOOMMode moom_swap; + ManagedOOMMode moom_mem_pressure; +- int moom_mem_pressure_limit; ++ uint32_t moom_mem_pressure_limit_permyriad; + }; + + /* Used when querying IP accounting data */ +diff --git a/src/core/core-varlink.c b/src/core/core-varlink.c +index dd6c11ab4d..17fb9bc83f 100644 +--- a/src/core/core-varlink.c ++++ b/src/core/core-varlink.c +@@ -83,7 +83,7 @@ static int build_managed_oom_json_array_element(Unit *u, const char *property, J + JSON_BUILD_PAIR("mode", JSON_BUILD_STRING(mode)), + JSON_BUILD_PAIR("path", JSON_BUILD_STRING(u->cgroup_path)), + JSON_BUILD_PAIR("property", JSON_BUILD_STRING(property)), +- JSON_BUILD_PAIR_CONDITION(use_limit, "limit", JSON_BUILD_UNSIGNED(c->moom_mem_pressure_limit)))); ++ JSON_BUILD_PAIR_CONDITION(use_limit, "limit", JSON_BUILD_UNSIGNED(c->moom_mem_pressure_limit_permyriad)))); + } + + int manager_varlink_send_managed_oom_update(Unit *u) { +diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c +index 37c581fb22..df35ec114d 100644 +--- a/src/core/dbus-cgroup.c ++++ b/src/core/dbus-cgroup.c +@@ -395,7 +395,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = { + SD_BUS_PROPERTY("DisableControllers", "as", property_get_cgroup_mask, offsetof(CGroupContext, disable_controllers), 0), + SD_BUS_PROPERTY("ManagedOOMSwap", "s", property_get_managed_oom_mode, offsetof(CGroupContext, moom_swap), 0), + SD_BUS_PROPERTY("ManagedOOMMemoryPressure", "s", property_get_managed_oom_mode, offsetof(CGroupContext, 
moom_mem_pressure), 0), +- SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimitPercent", "s", bus_property_get_percent, offsetof(CGroupContext, moom_mem_pressure_limit), 0), ++ SD_BUS_PROPERTY("ManagedOOMMemoryPressureLimitPermyriad", "u", NULL, offsetof(CGroupContext, moom_mem_pressure_limit_permyriad), 0), + SD_BUS_VTABLE_END + }; + +@@ -1697,14 +1697,24 @@ int bus_cgroup_set_property( + return 1; + } + +- if (streq(name, "ManagedOOMMemoryPressureLimitPercent")) { ++ if (streq(name, "ManagedOOMMemoryPressureLimitPermyriad")) { ++ uint32_t v; ++ + if (!UNIT_VTABLE(u)->can_set_managed_oom) + return sd_bus_error_setf(error, SD_BUS_ERROR_INVALID_ARGS, "Cannot set %s for this unit type", name); + +- r = bus_set_transient_percent(u, name, &c->moom_mem_pressure_limit, message, flags, error); ++ r = sd_bus_message_read(message, "u", &v); + if (r < 0) + return r; + ++ if (v > 10000) ++ return -ERANGE; ++ ++ if (!UNIT_WRITE_FLAGS_NOOP(flags)) { ++ c->moom_mem_pressure_limit_permyriad = v; ++ unit_write_settingf(u, flags, name, "ManagedOOMMemoryPressureLimit=%" PRIu32 ".%02" PRIu32 "%%", v / 100, v % 100); ++ } ++ + if (c->moom_mem_pressure == MANAGED_OOM_KILL) + (void) manager_varlink_send_managed_oom_update(u); + +diff --git a/src/core/dbus-util.c b/src/core/dbus-util.c +index d6223db305..eb03d30cf7 100644 +--- a/src/core/dbus-util.c ++++ b/src/core/dbus-util.c +@@ -91,35 +91,6 @@ int bus_set_transient_bool( + return 1; + } + +-int bus_set_transient_percent( +- Unit *u, +- const char *name, +- int *p, +- sd_bus_message *message, +- UnitWriteFlags flags, +- sd_bus_error *error) { +- +- const char *v; +- int r; +- +- assert(p); +- +- r = sd_bus_message_read(message, "s", &v); +- if (r < 0) +- return r; +- +- r = parse_percent(v); +- if (r < 0) +- return r; +- +- if (!UNIT_WRITE_FLAGS_NOOP(flags)) { +- *p = r; +- unit_write_settingf(u, flags, name, "%s=%d%%", name, r); +- } +- +- return 1; +-} +- + int bus_set_transient_usec_internal( + Unit *u, + const char *name, +diff --git 
a/src/core/dbus-util.h b/src/core/dbus-util.h +index 4e7c68e843..b68ec38ada 100644 +--- a/src/core/dbus-util.h ++++ b/src/core/dbus-util.h +@@ -240,7 +240,6 @@ int bus_set_transient_user_relaxed(Unit *u, const char *name, char **p, sd_bus_m + int bus_set_transient_path(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); + int bus_set_transient_string(Unit *u, const char *name, char **p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); + int bus_set_transient_bool(Unit *u, const char *name, bool *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); +-int bus_set_transient_percent(Unit *u, const char *name, int *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); + int bus_set_transient_usec_internal(Unit *u, const char *name, usec_t *p, bool fix_0, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error); + static inline int bus_set_transient_usec(Unit *u, const char *name, usec_t *p, sd_bus_message *message, UnitWriteFlags flags, sd_bus_error *error) { + return bus_set_transient_usec_internal(u, name, p, false, message, flags, error); +diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 +index 946862c398..db2a4e28a8 100644 +--- a/src/core/load-fragment-gperf.gperf.m4 ++++ b/src/core/load-fragment-gperf.gperf.m4 +@@ -226,7 +226,7 @@ $1.IPIngressFilterPath, config_parse_ip_filter_bpf_progs, + $1.IPEgressFilterPath, config_parse_ip_filter_bpf_progs, 0, offsetof($1, cgroup_context.ip_filters_egress) + $1.ManagedOOMSwap, config_parse_managed_oom_mode, 0, offsetof($1, cgroup_context.moom_swap) + $1.ManagedOOMMemoryPressure, config_parse_managed_oom_mode, 0, offsetof($1, cgroup_context.moom_mem_pressure) +-$1.ManagedOOMMemoryPressureLimitPercent, config_parse_managed_oom_mem_pressure_limit, 0, offsetof($1, cgroup_context.moom_mem_pressure_limit) ++$1.ManagedOOMMemoryPressureLimit, 
config_parse_managed_oom_mem_pressure_limit, 0, offsetof($1, cgroup_context.moom_mem_pressure_limit_permyriad) + $1.NetClass, config_parse_warn_compat, DISABLED_LEGACY, 0' + )m4_dnl + Unit.Description, config_parse_unit_string_printf, 0, offsetof(Unit, description) +diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c +index 4964249bf2..e0e9920e06 100644 +--- a/src/core/load-fragment.c ++++ b/src/core/load-fragment.c +@@ -3859,7 +3859,7 @@ int config_parse_managed_oom_mem_pressure_limit( + const char *rvalue, + void *data, + void *userdata) { +- int *limit = data; ++ uint32_t *limit = data; + UnitType t; + int r; + +@@ -3874,9 +3874,9 @@ int config_parse_managed_oom_mem_pressure_limit( + return 0; + } + +- r = parse_percent(rvalue); ++ r = parse_permyriad(rvalue); + if (r < 0) { +- log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse limit percent value, ignoring: %s", rvalue); ++ log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse memory pressure limit value, ignoring: %s", rvalue); + return 0; + } + +diff --git a/src/oom/oomd-manager.c b/src/oom/oomd-manager.c +index 3efa629002..338935b3ec 100644 +--- a/src/oom/oomd-manager.c ++++ b/src/oom/oomd-manager.c +@@ -100,10 +100,10 @@ static int process_managed_oom_reply( + limit = m->default_mem_pressure_limit; + + if (streq(reply.property, "ManagedOOMMemoryPressure")) { +- if (reply.limit > 100) ++ if (reply.limit > 10000) + continue; + else if (reply.limit != 0) { +- ret = store_loadavg_fixed_point((unsigned long) reply.limit, 0, &limit); ++ ret = store_loadavg_fixed_point((unsigned long) reply.limit / 100, (unsigned long) reply.limit % 100, &limit); + if (ret < 0) + continue; + } +@@ -478,8 +478,8 @@ static int manager_connect_bus(Manager *m) { + return 0; + } + +-int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit, usec_t mem_pressure_usec) { +- unsigned long l; ++int manager_start(Manager *m, bool dry_run, int swap_used_limit, int 
mem_pressure_limit_permyriad, usec_t mem_pressure_usec) { ++ unsigned long l, f; + int r; + + assert(m); +@@ -489,8 +489,16 @@ int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressur + m->swap_used_limit = swap_used_limit != -1 ? swap_used_limit : DEFAULT_SWAP_USED_LIMIT; + assert(m->swap_used_limit <= 100); + +- l = mem_pressure_limit != -1 ? mem_pressure_limit : DEFAULT_MEM_PRESSURE_LIMIT; +- r = store_loadavg_fixed_point(l, 0, &m->default_mem_pressure_limit); ++ if (mem_pressure_limit_permyriad != -1) { ++ assert(mem_pressure_limit_permyriad <= 10000); ++ ++ l = mem_pressure_limit_permyriad / 100; ++ f = mem_pressure_limit_permyriad % 100; ++ } else { ++ l = DEFAULT_MEM_PRESSURE_LIMIT_PERCENT; ++ f = 0; ++ } ++ r = store_loadavg_fixed_point(l, f, &m->default_mem_pressure_limit); + if (r < 0) + return r; + +@@ -530,12 +538,12 @@ int manager_get_dump_string(Manager *m, char **ret) { + fprintf(f, + "Dry Run: %s\n" + "Swap Used Limit: %u%%\n" +- "Default Memory Pressure Limit: %lu%%\n" ++ "Default Memory Pressure Limit: %lu.%02lu%%\n" + "Default Memory Pressure Duration: %s\n" + "System Context:\n", + yes_no(m->dry_run), + m->swap_used_limit, +- LOAD_INT(m->default_mem_pressure_limit), ++ LOAD_INT(m->default_mem_pressure_limit), LOAD_FRAC(m->default_mem_pressure_limit), + format_timespan(buf, sizeof(buf), m->default_mem_pressure_duration_usec, USEC_PER_SEC)); + oomd_dump_system_context(&m->system_context, f, "\t"); + +diff --git a/src/oom/oomd-manager.h b/src/oom/oomd-manager.h +index ee17abced2..521665e0a8 100644 +--- a/src/oom/oomd-manager.h ++++ b/src/oom/oomd-manager.h +@@ -17,7 +17,7 @@ + * Generally 60 or higher might be acceptable for something like system.slice with no memory.high set; processes in + * system.slice are assumed to be less latency sensitive. 
*/ + #define DEFAULT_MEM_PRESSURE_DURATION_USEC (30 * USEC_PER_SEC) +-#define DEFAULT_MEM_PRESSURE_LIMIT 60 ++#define DEFAULT_MEM_PRESSURE_LIMIT_PERCENT 60 + #define DEFAULT_SWAP_USED_LIMIT 90 + + #define RECLAIM_DURATION_USEC (30 * USEC_PER_SEC) +@@ -56,7 +56,7 @@ DEFINE_TRIVIAL_CLEANUP_FUNC(Manager*, manager_free); + + int manager_new(Manager **ret); + +-int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit, usec_t mem_pressure_usec); ++int manager_start(Manager *m, bool dry_run, int swap_used_limit, int mem_pressure_limit_permyriad, usec_t mem_pressure_usec); + + int manager_get_dump_string(Manager *m, char **ret); + +diff --git a/src/oom/oomd-util.c b/src/oom/oomd-util.c +index cec656f6fa..fcccddb92e 100644 +--- a/src/oom/oomd-util.c ++++ b/src/oom/oomd-util.c +@@ -415,11 +415,11 @@ void oomd_dump_memory_pressure_cgroup_context(const OomdCGroupContext *ctx, FILE + + fprintf(f, + "%sPath: %s\n" +- "%s\tMemory Pressure Limit: %lu%%\n" ++ "%s\tMemory Pressure Limit: %lu.%02lu%%\n" + "%s\tPressure: Avg10: %lu.%02lu Avg60: %lu.%02lu Avg300: %lu.%02lu Total: %s\n" + "%s\tCurrent Memory Usage: %s\n", + strempty(prefix), ctx->path, +- strempty(prefix), LOAD_INT(ctx->mem_pressure_limit), ++ strempty(prefix), LOAD_INT(ctx->mem_pressure_limit), LOAD_FRAC(ctx->mem_pressure_limit), + strempty(prefix), + LOAD_INT(ctx->memory_pressure.avg10), LOAD_FRAC(ctx->memory_pressure.avg10), + LOAD_INT(ctx->memory_pressure.avg60), LOAD_FRAC(ctx->memory_pressure.avg60), +diff --git a/src/oom/oomd.c b/src/oom/oomd.c +index 1fbcf41492..811d211b58 100644 +--- a/src/oom/oomd.c ++++ b/src/oom/oomd.c +@@ -18,14 +18,14 @@ + + static bool arg_dry_run = false; + static int arg_swap_used_limit = -1; +-static int arg_mem_pressure_limit = -1; ++static int arg_mem_pressure_limit_permyriad = -1; + static usec_t arg_mem_pressure_usec = 0; + + static int parse_config(void) { + static const ConfigTableItem items[] = { +- { "OOM", "SwapUsedLimitPercent", 
config_parse_percent, 0, &arg_swap_used_limit }, +- { "OOM", "DefaultMemoryPressureLimitPercent", config_parse_percent, 0, &arg_mem_pressure_limit }, +- { "OOM", "DefaultMemoryPressureDurationSec", config_parse_sec, 0, &arg_mem_pressure_usec }, ++ { "OOM", "SwapUsedLimitPercent", config_parse_percent, 0, &arg_swap_used_limit }, ++ { "OOM", "DefaultMemoryPressureLimit", config_parse_permyriad, 0, &arg_mem_pressure_limit_permyriad }, ++ { "OOM", "DefaultMemoryPressureDurationSec", config_parse_sec, 0, &arg_mem_pressure_usec }, + {} + }; + +@@ -160,7 +160,7 @@ static int run(int argc, char *argv[]) { + if (r < 0) + return log_error_errno(r, "Failed to create manager: %m"); + +- r = manager_start(m, arg_dry_run, arg_swap_used_limit, arg_mem_pressure_limit, arg_mem_pressure_usec); ++ r = manager_start(m, arg_dry_run, arg_swap_used_limit, arg_mem_pressure_limit_permyriad, arg_mem_pressure_usec); + if (r < 0) + return log_error_errno(r, "Failed to start up daemon: %m"); + +diff --git a/src/oom/oomd.conf b/src/oom/oomd.conf +index 766cb1717f..bd6a9391c6 100644 +--- a/src/oom/oomd.conf ++++ b/src/oom/oomd.conf +@@ -13,5 +13,5 @@ + + [OOM] + #SwapUsedLimitPercent=90% +-#DefaultMemoryPressureLimitPercent=60% ++#DefaultMemoryPressureLimit=60% + #DefaultMemoryPressureDurationSec=30s +diff --git a/src/shared/bus-get-properties.c b/src/shared/bus-get-properties.c +index 32f68d5e6a..a5ce7ef17f 100644 +--- a/src/shared/bus-get-properties.c ++++ b/src/shared/bus-get-properties.c +@@ -55,23 +55,6 @@ int bus_property_get_id128( + return sd_bus_message_append_array(reply, 'y', id->bytes, 16); + } + +-int bus_property_get_percent( +- sd_bus *bus, +- const char *path, +- const char *interface, +- const char *property, +- sd_bus_message *reply, +- void *userdata, +- sd_bus_error *error) { +- +- char pstr[DECIMAL_STR_MAX(int) + 2]; +- int p = *(int*) userdata; +- +- xsprintf(pstr, "%d%%", p); +- +- return sd_bus_message_append_basic(reply, 's', pstr); +-} +- + #if __SIZEOF_SIZE_T__ != 8 + 
int bus_property_get_size( + sd_bus *bus, +diff --git a/src/shared/bus-get-properties.h b/src/shared/bus-get-properties.h +index 9832c0d067..26f3e8588c 100644 +--- a/src/shared/bus-get-properties.h ++++ b/src/shared/bus-get-properties.h +@@ -8,7 +8,6 @@ + int bus_property_get_bool(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); + int bus_property_set_bool(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *value, void *userdata, sd_bus_error *error); + int bus_property_get_id128(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); +-int bus_property_get_percent(sd_bus *bus, const char *path, const char *interface, const char *property, sd_bus_message *reply, void *userdata, sd_bus_error *error); + + #define bus_property_get_usec ((sd_bus_property_get_t) NULL) + #define bus_property_set_usec ((sd_bus_property_set_t) NULL) +diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c +index 2bab2299fb..f96059c699 100644 +--- a/src/shared/bus-unit-util.c ++++ b/src/shared/bus-unit-util.c +@@ -435,10 +435,25 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons + if (STR_IN_SET(field, "DevicePolicy", + "Slice", + "ManagedOOMSwap", +- "ManagedOOMMemoryPressure", +- "ManagedOOMMemoryPressureLimitPercent")) ++ "ManagedOOMMemoryPressure")) + return bus_append_string(m, field, eq); + ++ if (STR_IN_SET(field, "ManagedOOMMemoryPressureLimit")) { ++ char *n; ++ ++ r = parse_permyriad(eq); ++ if (r < 0) ++ return log_error_errno(r, "Failed to parse %s value: %s", field, eq); ++ ++ n = strjoina(field, "Permyriad"); ++ ++ r = sd_bus_message_append(m, "(sv)", n, "u", (uint32_t) r); ++ if (r < 0) ++ return bus_log_create_error(r); ++ ++ return 1; ++ } ++ + if (STR_IN_SET(field, "CPUAccounting", + "MemoryAccounting", + "IOAccounting", 
+diff --git a/src/shared/conf-parser.c b/src/shared/conf-parser.c +index 35d301d9db..c8c253d603 100644 +--- a/src/shared/conf-parser.c ++++ b/src/shared/conf-parser.c +@@ -1245,3 +1245,4 @@ int config_parse_vlanprotocol(const char* unit, + } + + DEFINE_CONFIG_PARSE(config_parse_percent, parse_percent, "Failed to parse percent value"); ++DEFINE_CONFIG_PARSE(config_parse_permyriad, parse_permyriad, "Failed to parse permyriad value"); +diff --git a/src/shared/conf-parser.h b/src/shared/conf-parser.h +index f115cb23af..988d81e43a 100644 +--- a/src/shared/conf-parser.h ++++ b/src/shared/conf-parser.h +@@ -148,6 +148,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_mtu); + CONFIG_PARSER_PROTOTYPE(config_parse_rlimit); + CONFIG_PARSER_PROTOTYPE(config_parse_vlanprotocol); + CONFIG_PARSER_PROTOTYPE(config_parse_percent); ++CONFIG_PARSER_PROTOTYPE(config_parse_permyriad); + + typedef enum Disabled { + DISABLED_CONFIGURATION, +diff --git a/test/units/testsuite-56-workload.slice b/test/units/testsuite-56-workload.slice +index 45b04914c6..8c32b28094 100644 +--- a/test/units/testsuite-56-workload.slice ++++ b/test/units/testsuite-56-workload.slice +@@ -7,4 +7,4 @@ MemoryAccounting=true + IOAccounting=true + TasksAccounting=true + ManagedOOMMemoryPressure=kill +-ManagedOOMMemoryPressureLimitPercent=1% ++ManagedOOMMemoryPressureLimit=1% +diff --git a/test/units/testsuite-56.sh b/test/units/testsuite-56.sh +index 4dc9d8c7a8..8b01fe37ed 100755 +--- a/test/units/testsuite-56.sh ++++ b/test/units/testsuite-56.sh +@@ -20,7 +20,7 @@ systemctl start testsuite-56-testbloat.service + + # Verify systemd-oomd is monitoring the expected units + oomctl | grep "/testsuite-56-workload.slice" +-oomctl | grep "1%" ++oomctl | grep "1.00%" + oomctl | grep "Default Memory Pressure Duration: 5s" + + # systemd-oomd watches for elevated pressure for 30 seconds before acting. 
+-- +2.29.2 + diff --git a/95ca39f04efa278ac93881e6e364a6ae520b03e7.patch b/95ca39f04efa278ac93881e6e364a6ae520b03e7.patch new file mode 100644 index 0000000..478902a --- /dev/null +++ b/95ca39f04efa278ac93881e6e364a6ae520b03e7.patch @@ -0,0 +1,40 @@ +From 95ca39f04efa278ac93881e6e364a6ae520b03e7 Mon Sep 17 00:00:00 2001 +From: Yu Watanabe +Date: Fri, 27 Nov 2020 08:29:20 +0900 +Subject: [PATCH] oom: use CMP() macro + +--- + src/oom/oomd-util.h | 14 ++------------ + 1 file changed, 2 insertions(+), 12 deletions(-) + +diff --git a/src/oom/oomd-util.h b/src/oom/oomd-util.h +index 87ecda80fbc..0834cbf09d7 100644 +--- a/src/oom/oomd-util.h ++++ b/src/oom/oomd-util.h +@@ -64,24 +64,14 @@ static inline int compare_pgscan(OomdCGroupContext * const *c1, OomdCGroupContex + assert(c1); + assert(c2); + +- if ((*c1)->pgscan > (*c2)->pgscan) +- return -1; +- else if ((*c1)->pgscan < (*c2)->pgscan) +- return 1; +- else +- return 0; ++ return CMP((*c2)->pgscan, (*c1)->pgscan); + } + + static inline int compare_swap_usage(OomdCGroupContext * const *c1, OomdCGroupContext * const *c2) { + assert(c1); + assert(c2); + +- if ((*c1)->swap_usage > (*c2)->swap_usage) +- return -1; +- else if ((*c1)->swap_usage < (*c2)->swap_usage) +- return 1; +- else +- return 0; ++ return CMP((*c2)->swap_usage, (*c1)->swap_usage); + } + + /* Get an array of OomdCGroupContexts from `h`, qsorted from largest to smallest values according to `compare_func`. 
diff --git a/split-files.py b/split-files.py index ffa18f5..26e0551 100644 --- a/split-files.py +++ b/split-files.py @@ -22,6 +22,7 @@ o_rpm_macros = open('.file-list-rpm-macros', 'w') o_devel = open('.file-list-devel', 'w') o_container = open('.file-list-container', 'w') o_networkd = open('.file-list-networkd', 'w') +o_oomd_defaults = open('.file-list-oomd-defaults', 'w') o_remote = open('.file-list-remote', 'w') o_tests = open('.file-list-tests', 'w') o_standalone_tmpfiles = open('.file-list-standalone-tmpfiles', 'w') @@ -117,6 +118,8 @@ for file in files(buildroot): /modprobe.d ''', n, re.X): o = o_udev + elif re.search(r'10-oomd-.*defaults.conf|lib/systemd/oomd.conf.d', n, re.X): + o = o_oomd_defaults elif n.endswith('.standalone'): if 'tmpfiles' in n: o = o_standalone_tmpfiles diff --git a/systemd.spec b/systemd.spec index b8634e4..8dabb5a 100644 --- a/systemd.spec +++ b/systemd.spec @@ -21,7 +21,7 @@ Name: systemd Url: https://www.freedesktop.org/wiki/Software/systemd Version: 247.3 -Release: 1%{?dist} +Release: 2%{?dist} # For a breakdown of the licensing, see README License: LGPLv2+ and MIT and GPLv2+ Summary: System and Service Manager @@ -57,6 +57,10 @@ Source11: 20-grubby.install Source12: systemd-user Source13: libsystemd-shared.abignore +Source14: 10-oomd-defaults.conf +Source15: 10-oomd-root-slice-defaults.conf +Source16: 10-oomd-user-service-defaults.conf + Source21: macros.sysusers Source22: sysusers.attr Source23: sysusers.prov @@ -69,6 +73,12 @@ GIT_DIR=../../src/systemd/.git git diffab -M v233..master@{2017-06-15} -- hwdb/[ %endif # Backports of patches from upstream (0000–0499) +# systemd-oomd refinements for https://fedoraproject.org/wiki/Changes/EnableSystemdOomd +Patch0000: https://github.com/systemd/systemd/pull/17829.patch +Patch0001: https://github.com/systemd/systemd/pull/18361.patch +Patch0002: https://github.com/systemd/systemd/pull/18444.patch +Patch0003: 
https://github.com/systemd/systemd/pull/17732/commits/95ca39f04efa278ac93881e6e364a6ae520b03e7.patch +Patch0004: https://github.com/systemd/systemd/pull/18401.patch # Downstream-only patches (5000–9999) # https://bugzilla.redhat.com/show_bug.cgi?id=1738828 @@ -342,6 +352,15 @@ systemd-networkd is a system service that manages networks. It detects and configures network devices as they appear, as well as creating virtual network devices. +%package oomd-defaults +Summary: Configuration files for systemd-oomd +Requires: %{name}%{?_isa} = %{version}-%{release} +License: LGPLv2+ + +%description oomd-defaults +A set of drop-in files for systemd units to enable action from systemd-oomd, +a userspace out-of-memory (OOM) killer. + %package tests Summary: Internal unit tests for systemd Requires: %{name}%{?_isa} = %{version}-%{release} @@ -553,6 +572,11 @@ install -Dm0644 -t %{buildroot}%{_prefix}/lib/systemd/ %{SOURCE13} install -D -t %{buildroot}/usr/lib/systemd/ %{SOURCE3} +# systemd-oomd default configuration +install -Dm0644 -t %{buildroot}%{_prefix}/lib/systemd/oomd.conf.d/ %{SOURCE14} +install -Dm0644 -t %{buildroot}%{system_unit_dir}/-.slice.d/ %{SOURCE15} +install -Dm0644 -t %{buildroot}%{system_unit_dir}/user@.service.d/ %{SOURCE16} + sed -i 's|#!/usr/bin/env python3|#!%{__python3}|' %{buildroot}/usr/lib/systemd/tests/run-unit-tests.py install -m 0644 -D -t %{buildroot}%{_rpmconfigdir}/macros.d/ %{SOURCE21} @@ -667,6 +691,8 @@ chmod g+s /{run,var}/log/journal/{,${machine_id}} &>/dev/null || : # Apply ACL to the journal directory setfacl -Rnm g:wheel:rx,d:g:wheel:rx,g:adm:rx,d:g:adm:rx /var/log/journal/ &>/dev/null || : +%systemd_post systemd-oomd.service + [ $1 -eq 1 ] || exit 0 # We reset the enablement of all services upon initial installation @@ -727,6 +753,9 @@ if systemctl -q is-enabled systemd-resolved.service &>/dev/null; then systemctl start systemd-resolved.service &>/dev/null || : fi +%postun +%systemd_postun_with_restart systemd-oomd.service + %post libs 
%{?ldconfig} @@ -887,6 +916,8 @@ getent passwd systemd-network &>/dev/null || useradd -r -u 192 -l -g systemd-net %files networkd -f .file-list-networkd +%files oomd-defaults -f .file-list-oomd-defaults + %files tests -f .file-list-tests %files standalone-tmpfiles -f .file-list-standalone-tmpfiles @@ -894,6 +925,13 @@ getent passwd systemd-network &>/dev/null || useradd -r -u 192 -l -g systemd-net %files standalone-sysusers -f .file-list-standalone-sysusers %changelog +* Fri Feb 5 2021 Anita Zhang - 247.3-2 +- Changes for https://fedoraproject.org/wiki/Changes/EnableSystemdOomd. +- Backports consist primarily of PR #18361, #18444, and #18401 (plus some + additional ones to handle merge conflicts). +- Create systemd-oomd-defaults subpackage to install unit drop-ins that will + configure systemd-oomd to monitor and act. + * Tue Feb 2 2021 Zbigniew Jędrzejewski-Szmek - 247.3-1 - Minor stable release - Fixes #1895937, #1813219, #1903106.