97374c0
From 600c45e49c2060e077c06ab19078da89aa8e2e08 Mon Sep 17 00:00:00 2001
97374c0
From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= <edvin.torok@citrix.com>
97374c0
Date: Wed, 12 Oct 2022 19:13:07 +0100
97374c0
Subject: tools/ocaml: GC parameter tuning
97374c0
MIME-Version: 1.0
97374c0
Content-Type: text/plain; charset=UTF-8
97374c0
Content-Transfer-Encoding: 8bit
97374c0
97374c0
By default the OCaml garbage collector would return memory to the OS only
97374c0
after unused memory is 5x live memory.  Tweak this to 120% instead, which
97374c0
would match the major GC speed.
97374c0
97374c0
This is part of XSA-326.
97374c0
97374c0
Reported-by: Julien Grall <jgrall@amazon.com>
97374c0
Signed-off-by: Edwin Török <edvin.torok@citrix.com>
97374c0
Acked-by: Christian Lindig <christian.lindig@citrix.com>
97374c0
97374c0
diff --git a/tools/ocaml/xenstored/define.ml b/tools/ocaml/xenstored/define.ml
97374c0
index 6b06f808595b..ba63a8147e09 100644
97374c0
--- a/tools/ocaml/xenstored/define.ml
97374c0
+++ b/tools/ocaml/xenstored/define.ml
97374c0
@@ -25,6 +25,7 @@ let maxwatch = ref (100)
97374c0
 let maxtransaction = ref (10)
97374c0
 let maxrequests = ref (1024)   (* maximum requests per transaction *)
97374c0
 
97374c0
+let gc_max_overhead = ref 120 (* percent; assigned to Gc.max_overhead by tweak_gc, see the long comment in xenstored.ml *)
97374c0
 let conflict_burst_limit = ref 5.0
97374c0
 let conflict_max_history_seconds = ref 0.05
97374c0
 let conflict_rate_limit_is_aggregate = ref true
97374c0
diff --git a/tools/ocaml/xenstored/xenstored.ml b/tools/ocaml/xenstored/xenstored.ml
97374c0
index d44ae673c42a..3b57ad016dfb 100644
97374c0
--- a/tools/ocaml/xenstored/xenstored.ml
97374c0
+++ b/tools/ocaml/xenstored/xenstored.ml
97374c0
@@ -104,6 +104,7 @@ let parse_config filename =
97374c0
 		("quota-maxsize", Config.Set_int Quota.maxsize);
97374c0
 		("quota-maxrequests", Config.Set_int Define.maxrequests);
97374c0
 		("quota-path-max", Config.Set_int Define.path_max);
97374c0
+		("gc-max-overhead", Config.Set_int Define.gc_max_overhead);
97374c0
 		("test-eagain", Config.Set_bool Transaction.test_eagain);
97374c0
 		("persistent", Config.Set_bool Disk.enable);
97374c0
 		("xenstored-log-file", Config.String Logging.set_xenstored_log_destination);
97374c0
@@ -265,6 +266,67 @@ let to_file store cons fds file =
97374c0
 	        (fun () -> close_out channel)
97374c0
 end
97374c0
 
97374c0
+(*
97374c0
+	By default OCaml's GC only returns memory to the OS when it exceeds a
97374c0
+	configurable 'max overhead' setting.
97374c0
+	The default is 500%, that is 5/6th of the OCaml heap needs to be free
97374c0
+	and only 1/6th live for a compaction to be triggered that would
97374c0
+	release memory back to the OS.
97374c0
+	If the limit is not hit then the OCaml process can reuse that memory
97374c0
+	for its own purposes, but other processes won't be able to use it.
97374c0
+
97374c0
+	There is also a 'space overhead' setting that controls how much work
97374c0
+	each major GC slice does, and by default aims at having no more than
97374c0
+	80% or 120% (depending on version) garbage values compared to live
97374c0
+	values.
97374c0
+	This doesn't have as much relevance to memory returned to the OS as
97374c0
+	long as space_overhead <= max_overhead, because compaction is only
97374c0
+	triggered at the end of major GC cycles.
97374c0
+
97374c0
+	The defaults are too large once the program starts using ~100MiB of
97374c0
+	memory, at which point ~500MiB would be unavailable to other processes
97374c0
+	(which would be fine if this was the main process in this VM, but it is
97374c0
+	not).
97374c0
+
97374c0
+	Max overhead can also be set to 0, however this is for testing purposes
97374c0
+	only (setting it lower than 'space overhead' wouldn't help because the
97374c0
+	major GC wouldn't run fast enough, and compaction does have a
97374c0
+	performance cost: we can only compact contiguous regions, so memory has
97374c0
+	to be moved around).
97374c0
+
97374c0
+	Max overhead controls how often the heap is compacted, which is useful
97374c0
+	if there are bursts of activity followed by long idle periods,
97374c0
+	or if a domain quits, etc. Compaction returns memory to the OS.
97374c0
+
97374c0
+	wasted = live * space_overhead / 100
97374c0
+
97374c0
+	For globally overriding the GC settings one can use OCAMLRUNPARAM,
97374c0
+	however we provide a config file override to be consistent with other
97374c0
+	oxenstored settings.
97374c0
+
97374c0
+	One might want to dynamically adjust the overhead setting based on used
97374c0
+	memory, i.e. to use a fixed upper bound in bytes, not percentage. However
97374c0
+	measurements show that such adjustments increase GC overhead massively,
97374c0
+	while still not guaranteeing that memory is returned any more quickly
97374c0
+	than with a percentage based setting.
97374c0
+
97374c0
+	The allocation policy could also be tweaked, e.g. first fit would reduce
97374c0
+	fragmentation and thus memory usage, but the documentation warns that it
97374c0
+	can be sensibly slower, and indeed one of our own testcases can trigger
97374c0
+	such a corner case where it is multiple times slower, so it is best to keep
97374c0
+	the default allocation policy (next-fit/best-fit depending on version).
97374c0
+
97374c0
+	There are other tweaks that can be attempted in the future, e.g. setting
97374c0
+	'ulimit -v' to 75% of RAM, however getting the kernel to actually return
97374c0
+	NULL from allocations is difficult even with that setting, and without a
97374c0
+	NULL the emergency GC won't be triggered.
97374c0
+	Perhaps cgroup limits could help, but for now tweak only the safest setting.
97374c0
+*)
97374c0
+(* Override only Gc.max_overhead with !Define.gc_max_overhead (config key gc-max-overhead); all other GC parameters keep the values returned by Gc.get. *)
97374c0
+let tweak_gc () =
97374c0
+	Gc.set { (Gc.get ()) with Gc.max_overhead = !Define.gc_max_overhead }
97374c0
+
97374c0
+
97374c0
 let _ =
97374c0
 	let cf = do_argv in
97374c0
 	let pidfile =
97374c0
@@ -274,6 +336,8 @@ let _ =
97374c0
 			default_pidfile
97374c0
 		in
97374c0
 
97374c0
+	tweak_gc ();
97374c0
+
97374c0
 	(try
97374c0
 		Unixext.mkdir_rec (Filename.dirname pidfile) 0o755
97374c0
 	with _ ->