diff --git a/.gitignore b/.gitignore index f41c7f1..3d1112b 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,4 @@ lwip-1.3.0.tar.gz pciutils-2.2.9.tar.bz2 zlib-1.2.3.tar.gz polarssl-1.1.4-gpl.tgz -/xen-4.16.2.tar.gz +/xen-4.16.3.tar.gz diff --git a/sources b/sources index f6803eb..b3ba74d 100644 --- a/sources +++ b/sources @@ -4,4 +4,4 @@ SHA512 (newlib-1.16.0.tar.gz) = 40eb96bbc6736a16b6399e0cdb73e853d0d90b685c967e77 SHA512 (zlib-1.2.3.tar.gz) = 021b958fcd0d346c4ba761bcf0cc40f3522de6186cf5a0a6ea34a70504ce9622b1c2626fce40675bc8282cf5f5ade18473656abc38050f72f5d6480507a2106e SHA512 (polarssl-1.1.4-gpl.tgz) = 88da614e4d3f4409c4fd3bb3e44c7587ba051e3fed4e33d526069a67e8180212e1ea22da984656f50e290049f60ddca65383e5983c0f8884f648d71f698303ad SHA512 (pciutils-2.2.9.tar.bz2) = 2b3d98d027e46d8c08037366dde6f0781ca03c610ef2b380984639e4ef39899ed8d8b8e4cd9c9dc54df101279b95879bd66bfd4d04ad07fef41e847ea7ae32b5 -SHA512 (xen-4.16.2.tar.gz) = b6cd036c1073798dffa167ca14c954fbdfb4c0ef99662f7c435e7e5de687d1bde8856ff6bd030d0d2e661bd17ab631551f01b2cc728cad7e70b59aaa6e692783 +SHA512 (xen-4.16.3.tar.gz) = b8cbd6f95681de5f824ada2d3cbe0653a38514a18df0dafcf811fb255219c7abec96f46217bdb0c83e1119f685da9a6af7194eeaa94f1cc3c892702782133b4f diff --git a/xen.canonicalize.patch b/xen.canonicalize.patch index e339530..f3ae37d 100644 --- a/xen.canonicalize.patch +++ b/xen.canonicalize.patch @@ -13,8 +13,8 @@ if (get_strings(in, vec, ARRAY_SIZE(vec)) != ARRAY_SIZE(vec)) return EINVAL; -- node = canonicalize(conn, in, vec[0]); -+ node = xenstore_canonicalize(conn, in, vec[0]); +- node = canonicalize(conn, ctx, vec[0]); ++ node = xenstore_canonicalize(conn, ctx, vec[0]); if (!node) return ENOMEM; list_for_each_entry(watch, &conn->watches, list) { diff --git a/xen.spec b/xen.spec index 27ed461..f2f311e 100644 --- a/xen.spec +++ b/xen.spec @@ -54,8 +54,8 @@ Summary: Xen is a virtual machine monitor Name: xen -Version: 4.16.2 -Release: 4%{?dist} +Version: 4.16.3 +Release: 1%{?dist} License: GPLv2+ and 
LGPLv2+ and BSD URL: http://xen.org/ Source0: https://downloads.xenproject.org/release/xen/%{version}/xen-%{version}.tar.gz @@ -111,66 +111,6 @@ Patch43: xen.gcc11.fixes.patch Patch44: xsa376.patch Patch45: xen.gcc12.fixes.patch Patch46: xen.efi.build.patch -Patch47: xsa410-4.16-01.patch -Patch48: xsa410-4.16-02.patch -Patch49: xsa410-4.16-03.patch -Patch50: xsa410-4.16-04.patch -Patch51: xsa410-4.16-05.patch -Patch52: xsa410-4.16-06.patch -Patch53: xsa410-4.16-07.patch -Patch54: xsa410-4.16-08.patch -Patch55: xsa410-4.16-09.patch -Patch56: xsa410-4.16-10.patch -Patch57: xsa409-4.13-0001-libxl-docs-Use-arch-specific-default-paging-memory.patch -Patch58: xsa409-4.13-0002-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch -Patch59: xsa409-4.13-0003-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch -Patch60: xsa409-4.13-0004-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch -Patch61: xsa411.patch -Patch62: xsa412-4.16.patch -Patch63: xsa414.patch -Patch64: xsa415.patch -Patch65: xsa326-4.16-oxenstored-01.patch -Patch66: xsa326-4.16-oxenstored-02.patch -Patch67: xsa326-4.16-oxenstored-03.patch -Patch68: xsa326-4.16-oxenstored-04.patch -Patch69: xsa326-4.16-oxenstored-05.patch -Patch70: xsa326-4.16-oxenstored-06.patch -Patch71: xsa326-4.16-oxenstored-07.patch -Patch72: xsa326-4.16-oxenstored-08.patch -Patch73: xsa326-4.16-xenstored-01.patch -Patch74: xsa326-4.16-xenstored-02.patch -Patch75: xsa326-4.16-xenstored-03.patch -Patch76: xsa326-4.16-xenstored-04.patch -Patch77: xsa326-4.16-xenstored-05.patch -Patch78: xsa326-4.16-xenstored-06.patch -Patch79: xsa326-4.16-xenstored-07.patch -Patch80: xsa326-4.16-xenstored-08.patch -Patch81: xsa326-4.16-xenstored-09.patch -Patch82: xsa326-4.16-xenstored-10.patch -Patch83: xsa326-4.16-xenstored-11.patch -Patch84: xsa326-4.16-xenstored-12.patch -Patch85: xsa326-4.16-xenstored-13.patch -Patch86: xsa326-4.16-xenstored-14.patch -Patch87: xsa326-4.16-xenstored-15.patch -Patch88: xsa326-4.16-xenstored-16.patch 
-Patch89: xsa416-4.16.patch -Patch90: xsa417.patch -Patch91: xsa418-4.16-01.patch -Patch92: xsa418-4.16-02.patch -Patch93: xsa418-4.16-03.patch -Patch94: xsa418-4.16-04.patch -Patch95: xsa418-4.16-05.patch -Patch96: xsa418-4.16-06.patch -Patch97: xsa418-4.16-07.patch -Patch98: xsa419-oxenstored.patch -Patch99: xsa419-xenstored-01.patch -Patch100: xsa419-xenstored-02.patch -Patch101: xsa419-xenstored-03.patch -Patch102: xsa420.patch -Patch103: xsa421-01.patch -Patch104: xsa421-02.patch -Patch105: xsa422-4.16-1.patch -Patch106: xsa422-4.16-2.patch %if %build_qemutrad @@ -383,66 +323,6 @@ manage Xen virtual machines. %patch44 -p1 %patch45 -p1 %patch46 -p1 -%patch47 -p1 -%patch48 -p1 -%patch49 -p1 -%patch50 -p1 -%patch51 -p1 -%patch52 -p1 -%patch53 -p1 -%patch54 -p1 -%patch55 -p1 -%patch56 -p1 -%patch57 -p1 -%patch58 -p1 -%patch59 -p1 -%patch60 -p1 -%patch61 -p1 -%patch62 -p1 -%patch63 -p1 -%patch64 -p1 -%patch65 -p1 -%patch66 -p1 -%patch67 -p1 -%patch68 -p1 -%patch69 -p1 -%patch70 -p1 -%patch71 -p1 -%patch72 -p1 -%patch73 -p1 -%patch74 -p1 -%patch75 -p1 -%patch76 -p1 -%patch77 -p1 -%patch78 -p1 -%patch79 -p1 -%patch80 -p1 -%patch81 -p1 -%patch82 -p1 -%patch83 -p1 -%patch84 -p1 -%patch85 -p1 -%patch86 -p1 -%patch87 -p1 -%patch88 -p1 -%patch89 -p1 -%patch90 -p1 -%patch91 -p1 -%patch92 -p1 -%patch93 -p1 -%patch94 -p1 -%patch95 -p1 -%patch96 -p1 -%patch97 -p1 -%patch98 -p1 -%patch99 -p1 -%patch100 -p1 -%patch101 -p1 -%patch102 -p1 -%patch103 -p1 -%patch104 -p1 -%patch105 -p1 -%patch106 -p1 # qemu-xen-traditional patches pushd tools/qemu-xen-traditional @@ -1058,6 +938,10 @@ fi %endif %changelog +* Mon Dec 19 2022 Michael Young - 4.16.3-1 +- update to xen-4.16.3 + remove or adjust patches now included or superseded upstream + * Tue Nov 08 2022 Michael Young - 4.16.2-4 - x86: Multiple speculative security issues [XSA-422, CVE-2022-23824] diff --git a/xsa326-4.16-oxenstored-01.patch b/xsa326-4.16-oxenstored-01.patch deleted file mode 100644 index 848a5d0..0000000 --- 
a/xsa326-4.16-oxenstored-01.patch +++ /dev/null @@ -1,55 +0,0 @@ -From 8d6bb4ac40619877130533b11655829101b31d04 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= -Date: Wed, 12 Oct 2022 19:13:01 +0100 -Subject: tools/ocaml/xenstored: Synchronise defaults with oxenstore.conf.in -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -We currently have 2 different set of defaults in upstream Xen git tree: -* defined in the source code, only used if there is no config file -* defined in the oxenstored.conf.in upstream Xen - -An oxenstored.conf file is not mandatory, and if missing, maxrequests in -particular has an unsafe default. - -Resync the defaults from oxenstored.conf.in into the source code. - -This is part of XSA-326 / CVE-2022-42316. - -Reported-by: Julien Grall -Signed-off-by: Edwin Török -Acked-by: Christian Lindig - -diff --git a/tools/ocaml/xenstored/define.ml b/tools/ocaml/xenstored/define.ml -index ebe18b8e312c..6b06f808595b 100644 ---- a/tools/ocaml/xenstored/define.ml -+++ b/tools/ocaml/xenstored/define.ml -@@ -21,9 +21,9 @@ let xs_daemon_socket = Paths.xen_run_stored ^ "/socket" - - let default_config_dir = Paths.xen_config_dir - --let maxwatch = ref (50) --let maxtransaction = ref (20) --let maxrequests = ref (-1) (* maximum requests per transaction *) -+let maxwatch = ref (100) -+let maxtransaction = ref (10) -+let maxrequests = ref (1024) (* maximum requests per transaction *) - - let conflict_burst_limit = ref 5.0 - let conflict_max_history_seconds = ref 0.05 -diff --git a/tools/ocaml/xenstored/quota.ml b/tools/ocaml/xenstored/quota.ml -index abcac912805a..6e3d6401ae89 100644 ---- a/tools/ocaml/xenstored/quota.ml -+++ b/tools/ocaml/xenstored/quota.ml -@@ -20,8 +20,8 @@ exception Transaction_opened - - let warn fmt = Logging.warn "quota" fmt - let activate = ref true --let maxent = ref (10000) --let maxsize = ref (4096) -+let maxent = ref (1000) -+let maxsize = ref (2048) - - type t = { - 
maxent: int; (* max entities per domU *) diff --git a/xsa326-4.16-oxenstored-02.patch b/xsa326-4.16-oxenstored-02.patch deleted file mode 100644 index 7680efb..0000000 --- a/xsa326-4.16-oxenstored-02.patch +++ /dev/null @@ -1,94 +0,0 @@ -From 78d5af44ab13bb18c87b6ad75e505bd374379cb3 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= -Date: Thu, 28 Jul 2022 17:08:15 +0100 -Subject: tools/ocaml/xenstored: Check for maxrequests before performing - operations -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Previously we'd perform the operation, record the updated tree in the -transaction record, then try to insert a watchop path and the reply packet. - -If we exceeded max requests we would've returned EQUOTA, but still: -* have performed the operation on the transaction's tree -* have recorded the watchop, making this queue effectively unbounded - -It is better if we check whether we'd have room to store the operation before -performing the transaction, and raise EQUOTA there. Then the transaction -record won't grow. - -This is part of XSA-326 / CVE-2022-42317. 
- -Reported-by: Julien Grall -Signed-off-by: Edwin Török -Acked-by: Christian Lindig - -diff --git a/tools/ocaml/xenstored/process.ml b/tools/ocaml/xenstored/process.ml -index 27790d4a5c41..dd58e6979cf9 100644 ---- a/tools/ocaml/xenstored/process.ml -+++ b/tools/ocaml/xenstored/process.ml -@@ -389,6 +389,7 @@ let input_handle_error ~cons ~doms ~fct ~con ~t ~req = - let reply_error e = - Packet.Error e in - try -+ Transaction.check_quota_exn ~perm:(Connection.get_perm con) t; - fct con t doms cons req.Packet.data - with - | Define.Invalid_path -> reply_error "EINVAL" -@@ -681,9 +682,10 @@ let process_packet ~store ~cons ~doms ~con ~req = - in - - let response = try -+ Transaction.check_quota_exn ~perm:(Connection.get_perm con) t; - if tid <> Transaction.none then - (* Remember the request and response for this operation in case we need to replay the transaction *) -- Transaction.add_operation ~perm:(Connection.get_perm con) t req response; -+ Transaction.add_operation t req response; - response - with Quota.Limit_reached -> - Packet.Error "EQUOTA" -diff --git a/tools/ocaml/xenstored/transaction.ml b/tools/ocaml/xenstored/transaction.ml -index 17b1bdf2eaf9..294143e2335b 100644 ---- a/tools/ocaml/xenstored/transaction.ml -+++ b/tools/ocaml/xenstored/transaction.ml -@@ -85,6 +85,7 @@ type t = { - oldroot: Store.Node.t; - mutable paths: (Xenbus.Xb.Op.operation * Store.Path.t) list; - mutable operations: (Packet.request * Packet.response) list; -+ mutable quota_reached: bool; - mutable read_lowpath: Store.Path.t option; - mutable write_lowpath: Store.Path.t option; - } -@@ -127,6 +128,7 @@ let make ?(internal=false) id store = - oldroot = Store.get_root store; - paths = []; - operations = []; -+ quota_reached = false; - read_lowpath = None; - write_lowpath = None; - } in -@@ -143,13 +145,19 @@ let get_root t = Store.get_root t.store - - let is_read_only t = t.paths = [] - let add_wop t ty path = t.paths <- (ty, path) :: t.paths --let add_operation ~perm t request 
response = -+let get_operations t = List.rev t.operations -+ -+let check_quota_exn ~perm t = - if !Define.maxrequests >= 0 - && not (Perms.Connection.is_dom0 perm) -- && List.length t.operations >= !Define.maxrequests -- then raise Quota.Limit_reached; -+ && (t.quota_reached || List.length t.operations >= !Define.maxrequests) -+ then begin -+ t.quota_reached <- true; -+ raise Quota.Limit_reached; -+ end -+ -+let add_operation t request response = - t.operations <- (request, response) :: t.operations --let get_operations t = List.rev t.operations - let set_read_lowpath t path = t.read_lowpath <- get_lowest path t.read_lowpath - let set_write_lowpath t path = t.write_lowpath <- get_lowest path t.write_lowpath - diff --git a/xsa326-4.16-oxenstored-03.patch b/xsa326-4.16-oxenstored-03.patch deleted file mode 100644 index 0b42411..0000000 --- a/xsa326-4.16-oxenstored-03.patch +++ /dev/null @@ -1,119 +0,0 @@ -From 600c45e49c2060e077c06ab19078da89aa8e2e08 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= -Date: Wed, 12 Oct 2022 19:13:07 +0100 -Subject: tools/ocaml: GC parameter tuning -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -By default the OCaml garbage collector would return memory to the OS only -after unused memory is 5x live memory. Tweak this to 120% instead, which -would match the major GC speed. - -This is part of XSA-326. 
- -Reported-by: Julien Grall -Signed-off-by: Edwin Török -Acked-by: Christian Lindig - -diff --git a/tools/ocaml/xenstored/define.ml b/tools/ocaml/xenstored/define.ml -index 6b06f808595b..ba63a8147e09 100644 ---- a/tools/ocaml/xenstored/define.ml -+++ b/tools/ocaml/xenstored/define.ml -@@ -25,6 +25,7 @@ let maxwatch = ref (100) - let maxtransaction = ref (10) - let maxrequests = ref (1024) (* maximum requests per transaction *) - -+let gc_max_overhead = ref 120 (* 120% see comment in xenstored.ml *) - let conflict_burst_limit = ref 5.0 - let conflict_max_history_seconds = ref 0.05 - let conflict_rate_limit_is_aggregate = ref true -diff --git a/tools/ocaml/xenstored/xenstored.ml b/tools/ocaml/xenstored/xenstored.ml -index d44ae673c42a..3b57ad016dfb 100644 ---- a/tools/ocaml/xenstored/xenstored.ml -+++ b/tools/ocaml/xenstored/xenstored.ml -@@ -104,6 +104,7 @@ let parse_config filename = - ("quota-maxsize", Config.Set_int Quota.maxsize); - ("quota-maxrequests", Config.Set_int Define.maxrequests); - ("quota-path-max", Config.Set_int Define.path_max); -+ ("gc-max-overhead", Config.Set_int Define.gc_max_overhead); - ("test-eagain", Config.Set_bool Transaction.test_eagain); - ("persistent", Config.Set_bool Disk.enable); - ("xenstored-log-file", Config.String Logging.set_xenstored_log_destination); -@@ -265,6 +266,67 @@ let to_file store cons fds file = - (fun () -> close_out channel) - end - -+(* -+ By default OCaml's GC only returns memory to the OS when it exceeds a -+ configurable 'max overhead' setting. -+ The default is 500%, that is 5/6th of the OCaml heap needs to be free -+ and only 1/6th live for a compaction to be triggerred that would -+ release memory back to the OS. -+ If the limit is not hit then the OCaml process can reuse that memory -+ for its own purposes, but other processes won't be able to use it. 
-+ -+ There is also a 'space overhead' setting that controls how much work -+ each major GC slice does, and by default aims at having no more than -+ 80% or 120% (depending on version) garbage values compared to live -+ values. -+ This doesn't have as much relevance to memory returned to the OS as -+ long as space_overhead <= max_overhead, because compaction is only -+ triggerred at the end of major GC cycles. -+ -+ The defaults are too large once the program starts using ~100MiB of -+ memory, at which point ~500MiB would be unavailable to other processes -+ (which would be fine if this was the main process in this VM, but it is -+ not). -+ -+ Max overhead can also be set to 0, however this is for testing purposes -+ only (setting it lower than 'space overhead' wouldn't help because the -+ major GC wouldn't run fast enough, and compaction does have a -+ performance cost: we can only compact contiguous regions, so memory has -+ to be moved around). -+ -+ Max overhead controls how often the heap is compacted, which is useful -+ if there are burst of activity followed by long periods of idle state, -+ or if a domain quits, etc. Compaction returns memory to the OS. -+ -+ wasted = live * space_overhead / 100 -+ -+ For globally overriding the GC settings one can use OCAMLRUNPARAM, -+ however we provide a config file override to be consistent with other -+ oxenstored settings. -+ -+ One might want to dynamically adjust the overhead setting based on used -+ memory, i.e. to use a fixed upper bound in bytes, not percentage. However -+ measurements show that such adjustments increase GC overhead massively, -+ while still not guaranteeing that memory is returned any more quickly -+ than with a percentage based setting. -+ -+ The allocation policy could also be tweaked, e.g. 
first fit would reduce -+ fragmentation and thus memory usage, but the documentation warns that it -+ can be sensibly slower, and indeed one of our own testcases can trigger -+ such a corner case where it is multiple times slower, so it is best to keep -+ the default allocation policy (next-fit/best-fit depending on version). -+ -+ There are other tweaks that can be attempted in the future, e.g. setting -+ 'ulimit -v' to 75% of RAM, however getting the kernel to actually return -+ NULL from allocations is difficult even with that setting, and without a -+ NULL the emergency GC won't be triggerred. -+ Perhaps cgroup limits could help, but for now tweak the safest only. -+*) -+ -+let tweak_gc () = -+ Gc.set { (Gc.get ()) with Gc.max_overhead = !Define.gc_max_overhead } -+ -+ - let _ = - let cf = do_argv in - let pidfile = -@@ -274,6 +336,8 @@ let _ = - default_pidfile - in - -+ tweak_gc (); -+ - (try - Unixext.mkdir_rec (Filename.dirname pidfile) 0o755 - with _ -> diff --git a/xsa326-4.16-oxenstored-04.patch b/xsa326-4.16-oxenstored-04.patch deleted file mode 100644 index 0f2534b..0000000 --- a/xsa326-4.16-oxenstored-04.patch +++ /dev/null @@ -1,85 +0,0 @@ -From fd6d9cd3d20e496bdbf3e0a07354f65de0bcf4ae Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= -Date: Fri, 29 Jul 2022 18:53:29 +0100 -Subject: tools/ocaml/libs/xb: hide type of Xb.t -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Hiding the type will make it easier to change the implementation -in the future without breaking code that relies on it. - -No functional change. 
- -Signed-off-by: Edwin Török -Acked-by: Christian Lindig -(cherry picked from commit 7ade30a1451734d041363c750a65d322e25b47ba) - -Reported-by: Julien Grall -diff --git a/tools/ocaml/libs/xb/xb.ml b/tools/ocaml/libs/xb/xb.ml -index 104d319d7747..8404ddd8a682 100644 ---- a/tools/ocaml/libs/xb/xb.ml -+++ b/tools/ocaml/libs/xb/xb.ml -@@ -196,6 +196,9 @@ let peek_output con = Queue.peek con.pkt_out - let input_len con = Queue.length con.pkt_in - let has_in_packet con = Queue.length con.pkt_in > 0 - let get_in_packet con = Queue.pop con.pkt_in -+let has_partial_input con = match con.partial_in with -+ | HaveHdr _ -> true -+ | NoHdr (n, _) -> n < Partial.header_size () - let has_more_input con = - match con.backend with - | Fd _ -> false -diff --git a/tools/ocaml/libs/xb/xb.mli b/tools/ocaml/libs/xb/xb.mli -index 3a00da6cddc1..794e35bb343e 100644 ---- a/tools/ocaml/libs/xb/xb.mli -+++ b/tools/ocaml/libs/xb/xb.mli -@@ -66,13 +66,7 @@ type backend_mmap = { - type backend_fd = { fd : Unix.file_descr; } - type backend = Fd of backend_fd | Xenmmap of backend_mmap - type partial_buf = HaveHdr of Partial.pkt | NoHdr of int * bytes --type t = { -- backend : backend; -- pkt_in : Packet.t Queue.t; -- pkt_out : Packet.t Queue.t; -- mutable partial_in : partial_buf; -- mutable partial_out : string; --} -+type t - val init_partial_in : unit -> partial_buf - val reconnect : t -> unit - val queue : t -> Packet.t -> unit -@@ -97,6 +91,7 @@ val has_output : t -> bool - val peek_output : t -> Packet.t - val input_len : t -> int - val has_in_packet : t -> bool -+val has_partial_input : t -> bool - val get_in_packet : t -> Packet.t - val has_more_input : t -> bool - val is_selectable : t -> bool -diff --git a/tools/ocaml/xenstored/connection.ml b/tools/ocaml/xenstored/connection.ml -index 65f99ea6f28a..38b47363a173 100644 ---- a/tools/ocaml/xenstored/connection.ml -+++ b/tools/ocaml/xenstored/connection.ml -@@ -125,9 +125,7 @@ let get_perm con = - let set_target con target_domid = - 
con.perm <- Perms.Connection.set_target (get_perm con) ~perms:[Perms.READ; Perms.WRITE] target_domid - --let is_backend_mmap con = match con.xb.Xenbus.Xb.backend with -- | Xenbus.Xb.Xenmmap _ -> true -- | _ -> false -+let is_backend_mmap con = Xenbus.Xb.is_mmap con.xb - - let send_reply con tid rid ty data = - if (String.length data) > xenstore_payload_max && (is_backend_mmap con) then -@@ -280,9 +278,7 @@ let get_transaction con tid = - - let do_input con = Xenbus.Xb.input con.xb - let has_input con = Xenbus.Xb.has_in_packet con.xb --let has_partial_input con = match con.xb.Xenbus.Xb.partial_in with -- | HaveHdr _ -> true -- | NoHdr (n, _) -> n < Xenbus.Partial.header_size () -+let has_partial_input con = Xenbus.Xb.has_partial_input con.xb - let pop_in con = Xenbus.Xb.get_in_packet con.xb - let has_more_input con = Xenbus.Xb.has_more_input con.xb - diff --git a/xsa326-4.16-oxenstored-05.patch b/xsa326-4.16-oxenstored-05.patch deleted file mode 100644 index 8b253a9..0000000 --- a/xsa326-4.16-oxenstored-05.patch +++ /dev/null @@ -1,214 +0,0 @@ -From f13fe5903361953e4ccf8602b9c8df7e64568d55 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= -Date: Wed, 12 Oct 2022 19:13:02 +0100 -Subject: tools/ocaml: Change Xb.input to return Packet.t option -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The queue here would only ever hold at most one element. This will simplify -follow-up patches. - -This is part of XSA-326. 
- -Reported-by: Julien Grall -Signed-off-by: Edwin Török -Acked-by: Christian Lindig - -diff --git a/tools/ocaml/libs/xb/xb.ml b/tools/ocaml/libs/xb/xb.ml -index 8404ddd8a682..165fd4a1edf4 100644 ---- a/tools/ocaml/libs/xb/xb.ml -+++ b/tools/ocaml/libs/xb/xb.ml -@@ -45,7 +45,6 @@ type partial_buf = HaveHdr of Partial.pkt | NoHdr of int * bytes - type t = - { - backend: backend; -- pkt_in: Packet.t Queue.t; - pkt_out: Packet.t Queue.t; - mutable partial_in: partial_buf; - mutable partial_out: string; -@@ -62,7 +61,6 @@ let reconnect t = match t.backend with - Xs_ring.close backend.mmap; - backend.eventchn_notify (); - (* Clear our old connection state *) -- Queue.clear t.pkt_in; - Queue.clear t.pkt_out; - t.partial_in <- init_partial_in (); - t.partial_out <- "" -@@ -124,7 +122,6 @@ let output con = - - (* NB: can throw Reconnect *) - let input con = -- let newpacket = ref false in - let to_read = - match con.partial_in with - | HaveHdr partial_pkt -> Partial.to_complete partial_pkt -@@ -143,21 +140,19 @@ let input con = - if Partial.to_complete partial_pkt = 0 then ( - let pkt = Packet.of_partialpkt partial_pkt in - con.partial_in <- init_partial_in (); -- Queue.push pkt con.pkt_in; -- newpacket := true -- ) -+ Some pkt -+ ) else None - | NoHdr (i, buf) -> - (* we complete the partial header *) - if sz > 0 then - Bytes.blit b 0 buf (Partial.header_size () - i) sz; - con.partial_in <- if sz = i then -- HaveHdr (Partial.of_string (Bytes.to_string buf)) else NoHdr (i - sz, buf) -- ); -- !newpacket -+ HaveHdr (Partial.of_string (Bytes.to_string buf)) else NoHdr (i - sz, buf); -+ None -+ ) - - let newcon backend = { - backend = backend; -- pkt_in = Queue.create (); - pkt_out = Queue.create (); - partial_in = init_partial_in (); - partial_out = ""; -@@ -193,9 +188,6 @@ let has_output con = has_new_output con || has_old_output con - - let peek_output con = Queue.peek con.pkt_out - --let input_len con = Queue.length con.pkt_in --let has_in_packet con = Queue.length 
con.pkt_in > 0 --let get_in_packet con = Queue.pop con.pkt_in - let has_partial_input con = match con.partial_in with - | HaveHdr _ -> true - | NoHdr (n, _) -> n < Partial.header_size () -diff --git a/tools/ocaml/libs/xb/xb.mli b/tools/ocaml/libs/xb/xb.mli -index 794e35bb343e..91c682162cea 100644 ---- a/tools/ocaml/libs/xb/xb.mli -+++ b/tools/ocaml/libs/xb/xb.mli -@@ -77,7 +77,7 @@ val write_fd : backend_fd -> 'a -> string -> int -> int - val write_mmap : backend_mmap -> 'a -> string -> int -> int - val write : t -> string -> int -> int - val output : t -> bool --val input : t -> bool -+val input : t -> Packet.t option - val newcon : backend -> t - val open_fd : Unix.file_descr -> t - val open_mmap : Xenmmap.mmap_interface -> (unit -> unit) -> t -@@ -89,10 +89,7 @@ val has_new_output : t -> bool - val has_old_output : t -> bool - val has_output : t -> bool - val peek_output : t -> Packet.t --val input_len : t -> int --val has_in_packet : t -> bool - val has_partial_input : t -> bool --val get_in_packet : t -> Packet.t - val has_more_input : t -> bool - val is_selectable : t -> bool - val get_fd : t -> Unix.file_descr -diff --git a/tools/ocaml/libs/xs/xsraw.ml b/tools/ocaml/libs/xs/xsraw.ml -index d982fb24dbb1..451f8b38dbcc 100644 ---- a/tools/ocaml/libs/xs/xsraw.ml -+++ b/tools/ocaml/libs/xs/xsraw.ml -@@ -94,26 +94,18 @@ let pkt_send con = - done - - (* receive one packet - can sleep *) --let pkt_recv con = -- let workdone = ref false in -- while not !workdone -- do -- workdone := Xb.input con.xb -- done; -- Xb.get_in_packet con.xb -+let rec pkt_recv con = -+ match Xb.input con.xb with -+ | Some packet -> packet -+ | None -> pkt_recv con - - let pkt_recv_timeout con timeout = - let fd = Xb.get_fd con.xb in - let r, _, _ = Unix.select [ fd ] [] [] timeout in - if r = [] then - true, None -- else ( -- let workdone = Xb.input con.xb in -- if workdone then -- false, (Some (Xb.get_in_packet con.xb)) -- else -- false, None -- ) -+ else -+ false, Xb.input con.xb - - let 
queue_watchevent con data = - let ls = split_string ~limit:2 '\000' data in -diff --git a/tools/ocaml/xenstored/connection.ml b/tools/ocaml/xenstored/connection.ml -index 38b47363a173..cc20e047d2b9 100644 ---- a/tools/ocaml/xenstored/connection.ml -+++ b/tools/ocaml/xenstored/connection.ml -@@ -277,9 +277,7 @@ let get_transaction con tid = - Hashtbl.find con.transactions tid - - let do_input con = Xenbus.Xb.input con.xb --let has_input con = Xenbus.Xb.has_in_packet con.xb - let has_partial_input con = Xenbus.Xb.has_partial_input con.xb --let pop_in con = Xenbus.Xb.get_in_packet con.xb - let has_more_input con = Xenbus.Xb.has_more_input con.xb - - let has_output con = Xenbus.Xb.has_output con.xb -@@ -307,7 +305,7 @@ let is_bad con = match con.dom with None -> false | Some dom -> Domain.is_bad_do - Restrictions below can be relaxed once xenstored learns to dump more - of its live state in a safe way *) - let has_extra_connection_data con = -- let has_in = has_input con || has_partial_input con in -+ let has_in = has_partial_input con in - let has_out = has_output con in - let has_socket = con.dom = None in - let has_nondefault_perms = make_perm con.dom <> con.perm in -diff --git a/tools/ocaml/xenstored/process.ml b/tools/ocaml/xenstored/process.ml -index 6a3435c265d3..2d67456a2aa0 100644 ---- a/tools/ocaml/xenstored/process.ml -+++ b/tools/ocaml/xenstored/process.ml -@@ -195,10 +195,9 @@ let parse_live_update args = - | _ when Unix.gettimeofday () < t.deadline -> false - | l -> - warn "timeout reached: have to wait, migrate or shutdown %d domains:" (List.length l); -- let msgs = List.rev_map (fun con -> Printf.sprintf "%s: %d tx, in: %b, out: %b, perm: %s" -+ let msgs = List.rev_map (fun con -> Printf.sprintf "%s: %d tx, out: %b, perm: %s" - (Connection.get_domstr con) - (Connection.number_of_transactions con) -- (Connection.has_input con) - (Connection.has_output con) - (Connection.get_perm con |> Perms.Connection.to_string) - ) l in -@@ -705,16 +704,17 @@ let 
do_input store cons doms con = - info "%s requests a reconnect" (Connection.get_domstr con); - History.reconnect con; - info "%s reconnection complete" (Connection.get_domstr con); -- false -+ None - | Failure exp -> - error "caught exception %s" exp; - error "got a bad client %s" (sprintf "%-8s" (Connection.get_domstr con)); - Connection.mark_as_bad con; -- false -+ None - in - -- if newpacket then ( -- let packet = Connection.pop_in con in -+ match newpacket with -+ | None -> () -+ | Some packet -> - let tid, rid, ty, data = Xenbus.Xb.Packet.unpack packet in - let req = {Packet.tid=tid; Packet.rid=rid; Packet.ty=ty; Packet.data=data} in - -@@ -724,8 +724,7 @@ let do_input store cons doms con = - (Xenbus.Xb.Op.to_string ty) (sanitize_data data); *) - process_packet ~store ~cons ~doms ~con ~req; - write_access_log ~ty ~tid ~con:(Connection.get_domstr con) ~data; -- Connection.incr_ops con; -- ) -+ Connection.incr_ops con - - let do_output _store _cons _doms con = - if Connection.has_output con then ( diff --git a/xsa326-4.16-oxenstored-06.patch b/xsa326-4.16-oxenstored-06.patch deleted file mode 100644 index c8ebc34..0000000 --- a/xsa326-4.16-oxenstored-06.patch +++ /dev/null @@ -1,127 +0,0 @@ -From 2440a8b69a118fe14e73eb6cab4a050922866f1a Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= -Date: Wed, 12 Oct 2022 19:13:03 +0100 -Subject: tools/ocaml/xb: Add BoundedQueue -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Ensures we cannot store more than [capacity] elements in a [Queue]. Replacing -all Queue with this module will then ensure at compile time that all Queues -are correctly bound checked. - -Each element in the queue has a class with its own limits. This, in a -subsequent change, will ensure that command responses can proceed during a -flood of watch events. - -No functional change. - -This is part of XSA-326. 
- -Reported-by: Julien Grall -Signed-off-by: Edwin Török -Acked-by: Christian Lindig - -diff --git a/tools/ocaml/libs/xb/xb.ml b/tools/ocaml/libs/xb/xb.ml -index 165fd4a1edf4..4197a3888a68 100644 ---- a/tools/ocaml/libs/xb/xb.ml -+++ b/tools/ocaml/libs/xb/xb.ml -@@ -17,6 +17,98 @@ - module Op = struct include Op end - module Packet = struct include Packet end - -+module BoundedQueue : sig -+ type ('a, 'b) t -+ -+ (** [create ~capacity ~classify ~limit] creates a queue with maximum [capacity] elements. -+ This is burst capacity, each element is further classified according to [classify], -+ and each class can have its own [limit]. -+ [capacity] is enforced as an overall limit. -+ The [limit] can be dynamic, and can be smaller than the number of elements already queued of that class, -+ in which case those elements are considered to use "burst capacity". -+ *) -+ val create: capacity:int -> classify:('a -> 'b) -> limit:('b -> int) -> ('a, 'b) t -+ -+ (** [clear q] discards all elements from [q] *) -+ val clear: ('a, 'b) t -> unit -+ -+ (** [can_push q] when [length q < capacity]. *) -+ val can_push: ('a, 'b) t -> 'b -> bool -+ -+ (** [push e q] adds [e] at the end of queue [q] if [can_push q], or returns [None]. *) -+ val push: 'a -> ('a, 'b) t -> unit option -+ -+ (** [pop q] removes and returns first element in [q], or raises [Queue.Empty]. *) -+ val pop: ('a, 'b) t -> 'a -+ -+ (** [peek q] returns the first element in [q], or raises [Queue.Empty]. *) -+ val peek : ('a, 'b) t -> 'a -+ -+ (** [length q] returns the current number of elements in [q] *) -+ val length: ('a, 'b) t -> int -+ -+ (** [debug string_of_class q] prints queue usage statistics in an unspecified internal format. 
*) -+ val debug: ('b -> string) -> (_, 'b) t -> string -+end = struct -+ type ('a, 'b) t = -+ { q: 'a Queue.t -+ ; capacity: int -+ ; classify: 'a -> 'b -+ ; limit: 'b -> int -+ ; class_count: ('b, int) Hashtbl.t -+ } -+ -+ let create ~capacity ~classify ~limit = -+ { capacity; q = Queue.create (); classify; limit; class_count = Hashtbl.create 3 } -+ -+ let get_count t classification = try Hashtbl.find t.class_count classification with Not_found -> 0 -+ -+ let can_push_internal t classification class_count = -+ Queue.length t.q < t.capacity && class_count < t.limit classification -+ -+ let ok = Some () -+ -+ let push e t = -+ let classification = t.classify e in -+ let class_count = get_count t classification in -+ if can_push_internal t classification class_count then begin -+ Queue.push e t.q; -+ Hashtbl.replace t.class_count classification (class_count + 1); -+ ok -+ end -+ else -+ None -+ -+ let can_push t classification = -+ can_push_internal t classification @@ get_count t classification -+ -+ let clear t = -+ Queue.clear t.q; -+ Hashtbl.reset t.class_count -+ -+ let pop t = -+ let e = Queue.pop t.q in -+ let classification = t.classify e in -+ let () = match get_count t classification - 1 with -+ | 0 -> Hashtbl.remove t.class_count classification (* reduces memusage *) -+ | n -> Hashtbl.replace t.class_count classification n -+ in -+ e -+ -+ let peek t = Queue.peek t.q -+ let length t = Queue.length t.q -+ -+ let debug string_of_class t = -+ let b = Buffer.create 128 in -+ Printf.bprintf b "BoundedQueue capacity: %d, used: {" t.capacity; -+ Hashtbl.iter (fun packet_class count -> -+ Printf.bprintf b " %s: %d" (string_of_class packet_class) count -+ ) t.class_count; -+ Printf.bprintf b "}"; -+ Buffer.contents b -+end -+ -+ - exception End_of_file - exception Eagain - exception Noent diff --git a/xsa326-4.16-oxenstored-07.patch b/xsa326-4.16-oxenstored-07.patch deleted file mode 100644 index ef02467..0000000 --- a/xsa326-4.16-oxenstored-07.patch +++ /dev/null 
@@ -1,872 +0,0 @@ -From bc0f05e6f3a3c93c853ceffd1f6d2022dc30fb77 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= -Date: Wed, 12 Oct 2022 19:13:04 +0100 -Subject: tools/ocaml: Limit maximum in-flight requests / outstanding replies -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Introduce a limit on the number of outstanding reply packets in the xenbus -queue. This limits the number of in-flight requests: when the output queue is -full we'll stop processing inputs until the output queue has room again. - -To avoid a busy loop on the Unix socket we only add it to the watched input -file descriptor set if we'd be able to call `input` on it. Even though Dom0 -is trusted and exempt from quotas a flood of events might cause a backlog -where events are produced faster than daemons in Dom0 can consume them, which -could lead to an unbounded queue size and OOM. - -Therefore the xenbus queue limit must apply to all connections, Dom0 is not -exempt from it, although if everything works correctly it will eventually -catch up. - -This prevents a malicious guest from sending more commands while it has -outstanding watch events or command replies in its input ring. However if it -can cause the generation of watch events by other means (e.g. by Dom0, or -another cooperative guest) and stop reading its own ring then watch events -would've queued up without limit. - -The xenstore protocol doesn't have a back-pressure mechanism, and doesn't -allow dropping watch events. In fact, dropping watch events is known to break -some pieces of normal functionality. This leaves little choice to safely -implement the xenstore protocol without exposing the xenstore daemon to -out-of-memory attacks. 
- -Implement the fix as pipes with bounded buffers: -* Use a bounded buffer for watch events -* The watch structure will have a bounded receiving pipe of watch events -* The source will have an "overflow" pipe of pending watch events it couldn't - deliver - -Items are queued up on one end and are sent as far along the pipe as possible: - - source domain -> watch -> xenbus of target -> xenstore ring/socket of target - -If the pipe is "full" at any point then back-pressure is applied and we prevent -more items from being queued up. For the source domain this means that we'll -stop accepting new commands as long as its pipe buffer is not empty. - -Before we try to enqueue an item we first check whether it is possible to send -it further down the pipe, by attempting to recursively flush the pipes. This -ensures that we retain the order of events as much as possible. - -We might break causality of watch events if the target domain's queue is full -and we need to start using the watch's queue. This is a breaking change in -the xenstore protocol, but only for domains which are not processing their -incoming ring as expected. - -When a watch is deleted its entire pending queue is dropped (no code is needed -for that, because it is part of the 'watch' type). - -There is a cache of watches that have pending events that we attempt to flush -at every cycle if possible. - -Introduce 3 limits here: -* quota-maxwatchevents on watch event destination: when this is hit the - source will not be allowed to queue up more watch events. -* quota-maxoustanding which is the number of responses not read from the ring: - once exceeded, no more inputs are processed until all outstanding replies - are consumed by the client. -* overflow queue on the watch event source: all watches that cannot be stored - on destination are queued up here, a single command can trigger multiple - watches (e.g. due to recursion). 
- -The overflow queue currently doesn't have an upper bound, it is difficult to -accurately calculate one as it depends on whether you are Dom0 and how many -watches each path has registered and how many watch events you can trigger -with a single command (e.g. a commit). However these events were already -using memory, this just moves them elsewhere, and as long as we correctly -block a domain it shouldn't result in unbounded memory usage. - -Note that Dom0 is not excluded from these checks, it is important that Dom0 is -especially not excluded when it is the source, since there are many ways in -which a guest could trigger Dom0 to send it watch events. - -This should protect against malicious frontends as long as the backend follows -the PV xenstore protocol and only exposes paths needed by the frontend, and -changes those paths at most once as a reaction to guest events, or protocol -state. - -The queue limits are per watch, and per domain-pair, so even if one -communication channel would be "blocked", others would keep working, and the -domain itself won't get blocked as long as it doesn't overflow the queue of -watch events. - -Similarly a malicious backend could cause the frontend to get blocked, but -this watch queue protects the frontend as well as long as it follows the PV -protocol. (Although note that protection against malicious backends is only a -best effort at the moment) - -This is part of XSA-326 / CVE-2022-42318. 
- -Reported-by: Julien Grall -Signed-off-by: Edwin Török -Acked-by: Christian Lindig - -diff --git a/tools/ocaml/libs/xb/xb.ml b/tools/ocaml/libs/xb/xb.ml -index 4197a3888a68..b292ed7a874d 100644 ---- a/tools/ocaml/libs/xb/xb.ml -+++ b/tools/ocaml/libs/xb/xb.ml -@@ -134,14 +134,44 @@ type backend = Fd of backend_fd | Xenmmap of backend_mmap - - type partial_buf = HaveHdr of Partial.pkt | NoHdr of int * bytes - -+(* -+ separate capacity reservation for replies and watch events: -+ this allows a domain to keep working even when under a constant flood of -+ watch events -+*) -+type capacity = { maxoutstanding: int; maxwatchevents: int } -+ -+module Queue = BoundedQueue -+ -+type packet_class = -+ | CommandReply -+ | Watchevent -+ -+let string_of_packet_class = function -+ | CommandReply -> "command_reply" -+ | Watchevent -> "watch_event" -+ - type t = - { - backend: backend; -- pkt_out: Packet.t Queue.t; -+ pkt_out: (Packet.t, packet_class) Queue.t; - mutable partial_in: partial_buf; - mutable partial_out: string; -+ capacity: capacity - } - -+let to_read con = -+ match con.partial_in with -+ | HaveHdr partial_pkt -> Partial.to_complete partial_pkt -+ | NoHdr (i, _) -> i -+ -+let debug t = -+ Printf.sprintf "XenBus state: partial_in: %d needed, partial_out: %d bytes, pkt_out: %d packets, %s" -+ (to_read t) -+ (String.length t.partial_out) -+ (Queue.length t.pkt_out) -+ (BoundedQueue.debug string_of_packet_class t.pkt_out) -+ - let init_partial_in () = NoHdr - (Partial.header_size (), Bytes.make (Partial.header_size()) '\000') - -@@ -199,7 +229,8 @@ let output con = - let s = if String.length con.partial_out > 0 then - con.partial_out - else if Queue.length con.pkt_out > 0 then -- Packet.to_string (Queue.pop con.pkt_out) -+ let pkt = Queue.pop con.pkt_out in -+ Packet.to_string pkt - else - "" in - (* send data from s, and save the unsent data to partial_out *) -@@ -212,12 +243,15 @@ let output con = - (* after sending one packet, partial is empty *) - con.partial_out 
= "" - -+(* we can only process an input packet if we're guaranteed to have room -+ to store the response packet *) -+let can_input con = Queue.can_push con.pkt_out CommandReply -+ - (* NB: can throw Reconnect *) - let input con = -- let to_read = -- match con.partial_in with -- | HaveHdr partial_pkt -> Partial.to_complete partial_pkt -- | NoHdr (i, _) -> i in -+ if not (can_input con) then None -+ else -+ let to_read = to_read con in - - (* try to get more data from input stream *) - let b = Bytes.make to_read '\000' in -@@ -243,11 +277,22 @@ let input con = - None - ) - --let newcon backend = { -+let classify t = -+ match t.Packet.ty with -+ | Op.Watchevent -> Watchevent -+ | _ -> CommandReply -+ -+let newcon ~capacity backend = -+ let limit = function -+ | CommandReply -> capacity.maxoutstanding -+ | Watchevent -> capacity.maxwatchevents -+ in -+ { - backend = backend; -- pkt_out = Queue.create (); -+ pkt_out = Queue.create ~capacity:(capacity.maxoutstanding + capacity.maxwatchevents) ~classify ~limit; - partial_in = init_partial_in (); - partial_out = ""; -+ capacity = capacity; - } - - let open_fd fd = newcon (Fd { fd = fd; }) -diff --git a/tools/ocaml/libs/xb/xb.mli b/tools/ocaml/libs/xb/xb.mli -index 91c682162cea..71b2754ca788 100644 ---- a/tools/ocaml/libs/xb/xb.mli -+++ b/tools/ocaml/libs/xb/xb.mli -@@ -66,10 +66,11 @@ type backend_mmap = { - type backend_fd = { fd : Unix.file_descr; } - type backend = Fd of backend_fd | Xenmmap of backend_mmap - type partial_buf = HaveHdr of Partial.pkt | NoHdr of int * bytes -+type capacity = { maxoutstanding: int; maxwatchevents: int } - type t - val init_partial_in : unit -> partial_buf - val reconnect : t -> unit --val queue : t -> Packet.t -> unit -+val queue : t -> Packet.t -> unit option - val read_fd : backend_fd -> 'a -> bytes -> int -> int - val read_mmap : backend_mmap -> 'a -> bytes -> int -> int - val read : t -> bytes -> int -> int -@@ -78,13 +79,14 @@ val write_mmap : backend_mmap -> 'a -> string -> int -> 
int - val write : t -> string -> int -> int - val output : t -> bool - val input : t -> Packet.t option --val newcon : backend -> t --val open_fd : Unix.file_descr -> t --val open_mmap : Xenmmap.mmap_interface -> (unit -> unit) -> t -+val newcon : capacity:capacity -> backend -> t -+val open_fd : Unix.file_descr -> capacity:capacity -> t -+val open_mmap : Xenmmap.mmap_interface -> (unit -> unit) -> capacity:capacity -> t - val close : t -> unit - val is_fd : t -> bool - val is_mmap : t -> bool - val output_len : t -> int -+val can_input: t -> bool - val has_new_output : t -> bool - val has_old_output : t -> bool - val has_output : t -> bool -@@ -93,3 +95,4 @@ val has_partial_input : t -> bool - val has_more_input : t -> bool - val is_selectable : t -> bool - val get_fd : t -> Unix.file_descr -+val debug: t -> string -diff --git a/tools/ocaml/libs/xs/queueop.ml b/tools/ocaml/libs/xs/queueop.ml -index 9ff5bbd529ce..4e532cdaeacb 100644 ---- a/tools/ocaml/libs/xs/queueop.ml -+++ b/tools/ocaml/libs/xs/queueop.ml -@@ -16,9 +16,10 @@ - open Xenbus - - let data_concat ls = (String.concat "\000" ls) ^ "\000" -+let queue con pkt = let r = Xb.queue con pkt in assert (r <> None) - let queue_path ty (tid: int) (path: string) con = - let data = data_concat [ path; ] in -- Xb.queue con (Xb.Packet.create tid 0 ty data) -+ queue con (Xb.Packet.create tid 0 ty data) - - (* operations *) - let directory tid path con = queue_path Xb.Op.Directory tid path con -@@ -27,48 +28,48 @@ let read tid path con = queue_path Xb.Op.Read tid path con - let getperms tid path con = queue_path Xb.Op.Getperms tid path con - - let debug commands con = -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Debug (data_concat commands)) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Debug (data_concat commands)) - - let watch path data con = - let data = data_concat [ path; data; ] in -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Watch data) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Watch data) - - let unwatch path data con 
= - let data = data_concat [ path; data; ] in -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Unwatch data) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Unwatch data) - - let transaction_start con = -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Transaction_start (data_concat [])) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Transaction_start (data_concat [])) - - let transaction_end tid commit con = - let data = data_concat [ (if commit then "T" else "F"); ] in -- Xb.queue con (Xb.Packet.create tid 0 Xb.Op.Transaction_end data) -+ queue con (Xb.Packet.create tid 0 Xb.Op.Transaction_end data) - - let introduce domid mfn port con = - let data = data_concat [ Printf.sprintf "%u" domid; - Printf.sprintf "%nu" mfn; - string_of_int port; ] in -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Introduce data) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Introduce data) - - let release domid con = - let data = data_concat [ Printf.sprintf "%u" domid; ] in -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Release data) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Release data) - - let resume domid con = - let data = data_concat [ Printf.sprintf "%u" domid; ] in -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Resume data) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Resume data) - - let getdomainpath domid con = - let data = data_concat [ Printf.sprintf "%u" domid; ] in -- Xb.queue con (Xb.Packet.create 0 0 Xb.Op.Getdomainpath data) -+ queue con (Xb.Packet.create 0 0 Xb.Op.Getdomainpath data) - - let write tid path value con = - let data = path ^ "\000" ^ value (* no NULL at the end *) in -- Xb.queue con (Xb.Packet.create tid 0 Xb.Op.Write data) -+ queue con (Xb.Packet.create tid 0 Xb.Op.Write data) - - let mkdir tid path con = queue_path Xb.Op.Mkdir tid path con - let rm tid path con = queue_path Xb.Op.Rm tid path con - - let setperms tid path perms con = - let data = data_concat [ path; perms ] in -- Xb.queue con (Xb.Packet.create tid 0 Xb.Op.Setperms data) -+ queue con (Xb.Packet.create tid 0 Xb.Op.Setperms data) 
-diff --git a/tools/ocaml/libs/xs/xsraw.ml b/tools/ocaml/libs/xs/xsraw.ml -index 451f8b38dbcc..cbd17280600c 100644 ---- a/tools/ocaml/libs/xs/xsraw.ml -+++ b/tools/ocaml/libs/xs/xsraw.ml -@@ -36,8 +36,10 @@ type con = { - let close con = - Xb.close con.xb - -+let capacity = { Xb.maxoutstanding = 1; maxwatchevents = 0; } -+ - let open_fd fd = { -- xb = Xb.open_fd fd; -+ xb = Xb.open_fd ~capacity fd; - watchevents = Queue.create (); - } - -diff --git a/tools/ocaml/xenstored/connection.ml b/tools/ocaml/xenstored/connection.ml -index cc20e047d2b9..9624a5f9da2c 100644 ---- a/tools/ocaml/xenstored/connection.ml -+++ b/tools/ocaml/xenstored/connection.ml -@@ -20,12 +20,84 @@ open Stdext - - let xenstore_payload_max = 4096 (* xen/include/public/io/xs_wire.h *) - -+type 'a bounded_sender = 'a -> unit option -+(** a bounded sender accepts an ['a] item and returns: -+ None - if there is no room to accept the item -+ Some () - if it has successfully accepted/sent the item -+ *) -+ -+module BoundedPipe : sig -+ type 'a t -+ -+ (** [create ~capacity ~destination] creates a bounded pipe with a -+ local buffer holding at most [capacity] items. Once the buffer is -+ full it will not accept further items. items from the pipe are -+ flushed into [destination] as long as it accepts items. The -+ destination could be another pipe. -+ *) -+ val create: capacity:int -> destination:'a bounded_sender -> 'a t -+ -+ (** [is_empty t] returns whether the local buffer of [t] is empty. *) -+ val is_empty : _ t -> bool -+ -+ (** [length t] the number of items in the internal buffer *) -+ val length: _ t -> int -+ -+ (** [flush_pipe t] sends as many items from the local buffer as possible, -+ which could be none. *) -+ val flush_pipe: _ t -> unit -+ -+ (** [push t item] tries to [flush_pipe] and then push [item] -+ into the pipe if its [capacity] allows. 
-+ Returns [None] if there is no more room -+ *) -+ val push : 'a t -> 'a bounded_sender -+end = struct -+ (* items are enqueued in [q], and then flushed to [connect_to] *) -+ type 'a t = -+ { q: 'a Queue.t -+ ; destination: 'a bounded_sender -+ ; capacity: int -+ } -+ -+ let create ~capacity ~destination = -+ { q = Queue.create (); capacity; destination } -+ -+ let rec flush_pipe t = -+ if not Queue.(is_empty t.q) then -+ let item = Queue.peek t.q in -+ match t.destination item with -+ | None -> () (* no room *) -+ | Some () -> -+ (* successfully sent item to next stage *) -+ let _ = Queue.pop t.q in -+ (* continue trying to send more items *) -+ flush_pipe t -+ -+ let push t item = -+ (* first try to flush as many items from this pipe as possible to make room, -+ it is important to do this first to preserve the order of the items -+ *) -+ flush_pipe t; -+ if Queue.length t.q < t.capacity then begin -+ (* enqueue, instead of sending directly. -+ this ensures that [out] sees the items in the same order as we receive them -+ *) -+ Queue.push item t.q; -+ Some (flush_pipe t) -+ end else None -+ -+ let is_empty t = Queue.is_empty t.q -+ let length t = Queue.length t.q -+end -+ - type watch = { - con: t; - token: string; - path: string; - base: string; - is_relative: bool; -+ pending_watchevents: Xenbus.Xb.Packet.t BoundedPipe.t; - } - - and t = { -@@ -38,8 +110,36 @@ and t = { - anonid: int; - mutable stat_nb_ops: int; - mutable perm: Perms.Connection.t; -+ pending_source_watchevents: (watch * Xenbus.Xb.Packet.t) BoundedPipe.t - } - -+module Watch = struct -+ module T = struct -+ type t = watch -+ -+ let compare w1 w2 = -+ (* cannot compare watches from different connections *) -+ assert (w1.con == w2.con); -+ match String.compare w1.token w2.token with -+ | 0 -> String.compare w1.path w2.path -+ | n -> n -+ end -+ module Set = Set.Make(T) -+ -+ let flush_events t = -+ BoundedPipe.flush_pipe t.pending_watchevents; -+ not (BoundedPipe.is_empty t.pending_watchevents) -+ 
-+ let pending_watchevents t = -+ BoundedPipe.length t.pending_watchevents -+end -+ -+let source_flush_watchevents t = -+ BoundedPipe.flush_pipe t.pending_source_watchevents -+ -+let source_pending_watchevents t = -+ BoundedPipe.length t.pending_source_watchevents -+ - let mark_as_bad con = - match con.dom with - |None -> () -@@ -67,7 +167,8 @@ let watch_create ~con ~path ~token = { - token = token; - path = path; - base = get_path con; -- is_relative = path.[0] <> '/' && path.[0] <> '@' -+ is_relative = path.[0] <> '/' && path.[0] <> '@'; -+ pending_watchevents = BoundedPipe.create ~capacity:!Define.maxwatchevents ~destination:(Xenbus.Xb.queue con.xb) - } - - let get_con w = w.con -@@ -93,6 +194,9 @@ let make_perm dom = - Perms.Connection.create ~perms:[Perms.READ; Perms.WRITE] domid - - let create xbcon dom = -+ let destination (watch, pkt) = -+ BoundedPipe.push watch.pending_watchevents pkt -+ in - let id = - match dom with - | None -> let old = !anon_id_next in incr anon_id_next; old -@@ -109,6 +213,16 @@ let create xbcon dom = - anonid = id; - stat_nb_ops = 0; - perm = make_perm dom; -+ -+ (* the actual capacity will be lower, this is used as an overflow -+ buffer: anything that doesn't fit elsewhere gets put here, only -+ limited by the amount of watches that you can generate with a -+ single xenstore command (which is finite, although possibly very -+ large in theory for Dom0). Once the pipe here has any contents the -+ domain is blocked from sending more commands until it is empty -+ again though. 
-+ *) -+ pending_source_watchevents = BoundedPipe.create ~capacity:Sys.max_array_length ~destination - } - in - Logging.new_connection ~tid:Transaction.none ~con:(get_domstr con); -@@ -127,11 +241,17 @@ let set_target con target_domid = - - let is_backend_mmap con = Xenbus.Xb.is_mmap con.xb - --let send_reply con tid rid ty data = -+let packet_of con tid rid ty data = - if (String.length data) > xenstore_payload_max && (is_backend_mmap con) then -- Xenbus.Xb.queue con.xb (Xenbus.Xb.Packet.create tid rid Xenbus.Xb.Op.Error "E2BIG\000") -+ Xenbus.Xb.Packet.create tid rid Xenbus.Xb.Op.Error "E2BIG\000" - else -- Xenbus.Xb.queue con.xb (Xenbus.Xb.Packet.create tid rid ty data) -+ Xenbus.Xb.Packet.create tid rid ty data -+ -+let send_reply con tid rid ty data = -+ let result = Xenbus.Xb.queue con.xb (packet_of con tid rid ty data) in -+ (* should never happen: we only process an input packet when there is room for an output packet *) -+ (* and the limit for replies is different from the limit for watch events *) -+ assert (result <> None) - - let send_error con tid rid err = send_reply con tid rid Xenbus.Xb.Op.Error (err ^ "\000") - let send_ack con tid rid ty = send_reply con tid rid ty "OK\000" -@@ -181,11 +301,11 @@ let del_watch con path token = - apath, w - - let del_watches con = -- Hashtbl.clear con.watches; -+ Hashtbl.reset con.watches; - con.nb_watches <- 0 - - let del_transactions con = -- Hashtbl.clear con.transactions -+ Hashtbl.reset con.transactions - - let list_watches con = - let ll = Hashtbl.fold -@@ -208,21 +328,29 @@ let lookup_watch_perm path = function - let lookup_watch_perms oldroot root path = - lookup_watch_perm path oldroot @ lookup_watch_perm path (Some root) - --let fire_single_watch_unchecked watch = -+let fire_single_watch_unchecked source watch = - let data = Utils.join_by_null [watch.path; watch.token; ""] in -- send_reply watch.con Transaction.none 0 Xenbus.Xb.Op.Watchevent data -+ let pkt = packet_of watch.con Transaction.none 0 
Xenbus.Xb.Op.Watchevent data in -+ -+ match BoundedPipe.push source.pending_source_watchevents (watch, pkt) with -+ | Some () -> () (* packet queued *) -+ | None -> -+ (* a well behaved Dom0 shouldn't be able to trigger this, -+ if it happens it is likely a Dom0 bug causing runaway memory usage -+ *) -+ failwith "watch event overflow, cannot happen" - --let fire_single_watch (oldroot, root) watch = -+let fire_single_watch source (oldroot, root) watch = - let abspath = get_watch_path watch.con watch.path |> Store.Path.of_string in - let perms = lookup_watch_perms oldroot root abspath in - if Perms.can_fire_watch watch.con.perm perms then -- fire_single_watch_unchecked watch -+ fire_single_watch_unchecked source watch - else - let perms = perms |> List.map (Perms.Node.to_string ~sep:" ") |> String.concat ", " in - let con = get_domstr watch.con in - Logging.watch_not_fired ~con perms (Store.Path.to_string abspath) - --let fire_watch roots watch path = -+let fire_watch source roots watch path = - let new_path = - if watch.is_relative && path.[0] = '/' - then begin -@@ -232,7 +360,7 @@ let fire_watch roots watch path = - end else - path - in -- fire_single_watch roots { watch with path = new_path } -+ fire_single_watch source roots { watch with path = new_path } - - (* Search for a valid unused transaction id. 
*) - let rec valid_transaction_id con proposed_id = -@@ -280,6 +408,7 @@ let do_input con = Xenbus.Xb.input con.xb - let has_partial_input con = Xenbus.Xb.has_partial_input con.xb - let has_more_input con = Xenbus.Xb.has_more_input con.xb - -+let can_input con = Xenbus.Xb.can_input con.xb && BoundedPipe.is_empty con.pending_source_watchevents - let has_output con = Xenbus.Xb.has_output con.xb - let has_old_output con = Xenbus.Xb.has_old_output con.xb - let has_new_output con = Xenbus.Xb.has_new_output con.xb -@@ -323,7 +452,7 @@ let prevents_live_update con = not (is_bad con) - && (has_extra_connection_data con || has_transaction_data con) - - let has_more_work con = -- has_more_input con || not (has_old_output con) && has_new_output con -+ (has_more_input con && can_input con) || not (has_old_output con) && has_new_output con - - let incr_ops con = con.stat_nb_ops <- con.stat_nb_ops + 1 - -diff --git a/tools/ocaml/xenstored/connections.ml b/tools/ocaml/xenstored/connections.ml -index 3c7429fe7f61..7d68c583b43a 100644 ---- a/tools/ocaml/xenstored/connections.ml -+++ b/tools/ocaml/xenstored/connections.ml -@@ -22,22 +22,30 @@ type t = { - domains: (int, Connection.t) Hashtbl.t; - ports: (Xeneventchn.t, Connection.t) Hashtbl.t; - mutable watches: Connection.watch list Trie.t; -+ mutable has_pending_watchevents: Connection.Watch.Set.t - } - - let create () = { - anonymous = Hashtbl.create 37; - domains = Hashtbl.create 37; - ports = Hashtbl.create 37; -- watches = Trie.create () -+ watches = Trie.create (); -+ has_pending_watchevents = Connection.Watch.Set.empty; - } - -+let get_capacity () = -+ (* not multiplied by maxwatch on purpose: 2nd queue in watch itself! 
*) -+ { Xenbus.Xb.maxoutstanding = !Define.maxoutstanding; maxwatchevents = !Define.maxwatchevents } -+ - let add_anonymous cons fd = -- let xbcon = Xenbus.Xb.open_fd fd in -+ let capacity = get_capacity () in -+ let xbcon = Xenbus.Xb.open_fd fd ~capacity in - let con = Connection.create xbcon None in - Hashtbl.add cons.anonymous (Xenbus.Xb.get_fd xbcon) con - - let add_domain cons dom = -- let xbcon = Xenbus.Xb.open_mmap (Domain.get_interface dom) (fun () -> Domain.notify dom) in -+ let capacity = get_capacity () in -+ let xbcon = Xenbus.Xb.open_mmap ~capacity (Domain.get_interface dom) (fun () -> Domain.notify dom) in - let con = Connection.create xbcon (Some dom) in - Hashtbl.add cons.domains (Domain.get_id dom) con; - match Domain.get_port dom with -@@ -48,7 +56,9 @@ let select ?(only_if = (fun _ -> true)) cons = - Hashtbl.fold (fun _ con (ins, outs) -> - if (only_if con) then ( - let fd = Connection.get_fd con in -- (fd :: ins, if Connection.has_output con then fd :: outs else outs) -+ let in_fds = if Connection.can_input con then fd :: ins else ins in -+ let out_fds = if Connection.has_output con then fd :: outs else outs in -+ in_fds, out_fds - ) else (ins, outs) - ) - cons.anonymous ([], []) -@@ -67,10 +77,17 @@ let del_watches_of_con con watches = - | [] -> None - | ws -> Some ws - -+let del_watches cons con = -+ Connection.del_watches con; -+ cons.watches <- Trie.map (del_watches_of_con con) cons.watches; -+ cons.has_pending_watchevents <- -+ cons.has_pending_watchevents |> Connection.Watch.Set.filter @@ fun w -> -+ Connection.get_con w != con -+ - let del_anonymous cons con = - try - Hashtbl.remove cons.anonymous (Connection.get_fd con); -- cons.watches <- Trie.map (del_watches_of_con con) cons.watches; -+ del_watches cons con; - Connection.close con - with exn -> - debug "del anonymous %s" (Printexc.to_string exn) -@@ -85,7 +102,7 @@ let del_domain cons id = - | Some p -> Hashtbl.remove cons.ports p - | None -> ()) - | None -> ()); -- cons.watches <- 
Trie.map (del_watches_of_con con) cons.watches; -+ del_watches cons con; - Connection.close con - with exn -> - debug "del domain %u: %s" id (Printexc.to_string exn) -@@ -136,31 +153,33 @@ let del_watch cons con path token = - cons.watches <- Trie.set cons.watches key watches; - watch - --let del_watches cons con = -- Connection.del_watches con; -- cons.watches <- Trie.map (del_watches_of_con con) cons.watches -- - (* path is absolute *) --let fire_watches ?oldroot root cons path recurse = -+let fire_watches ?oldroot source root cons path recurse = - let key = key_of_path path in - let path = Store.Path.to_string path in - let roots = oldroot, root in - let fire_watch _ = function - | None -> () -- | Some watches -> List.iter (fun w -> Connection.fire_watch roots w path) watches -+ | Some watches -> List.iter (fun w -> Connection.fire_watch source roots w path) watches - in - let fire_rec _x = function - | None -> () - | Some watches -> -- List.iter (Connection.fire_single_watch roots) watches -+ List.iter (Connection.fire_single_watch source roots) watches - in - Trie.iter_path fire_watch cons.watches key; - if recurse then - Trie.iter fire_rec (Trie.sub cons.watches key) - -+let send_watchevents cons con = -+ cons.has_pending_watchevents <- -+ cons.has_pending_watchevents |> Connection.Watch.Set.filter Connection.Watch.flush_events; -+ Connection.source_flush_watchevents con -+ - let fire_spec_watches root cons specpath = -+ let source = find_domain cons 0 in - iter cons (fun con -> -- List.iter (Connection.fire_single_watch (None, root)) (Connection.get_watches con specpath)) -+ List.iter (Connection.fire_single_watch source (None, root)) (Connection.get_watches con specpath)) - - let set_target cons domain target_domain = - let con = find_domain cons domain in -@@ -197,6 +216,16 @@ let debug cons = - let domains = Hashtbl.fold (fun _ con accu -> Connection.debug con :: accu) cons.domains [] in - String.concat "" (domains @ anonymous) - -+let debug_watchevents 
cons con = -+ (* == (physical equality) -+ has to be used here because w.con.xb.backend might contain a [unit->unit] value causing regular -+ comparison to fail due to having a 'functional value' which cannot be compared. -+ *) -+ let s = cons.has_pending_watchevents |> Connection.Watch.Set.filter (fun w -> w.con == con) in -+ let pending = s |> Connection.Watch.Set.elements -+ |> List.map (fun w -> Connection.Watch.pending_watchevents w) |> List.fold_left (+) 0 in -+ Printf.sprintf "Watches with pending events: %d, pending events total: %d" (Connection.Watch.Set.cardinal s) pending -+ - let filter ~f cons = - let fold _ v acc = if f v then v :: acc else acc in - [] -diff --git a/tools/ocaml/xenstored/define.ml b/tools/ocaml/xenstored/define.ml -index ba63a8147e09..327b6d795ec7 100644 ---- a/tools/ocaml/xenstored/define.ml -+++ b/tools/ocaml/xenstored/define.ml -@@ -24,6 +24,13 @@ let default_config_dir = Paths.xen_config_dir - let maxwatch = ref (100) - let maxtransaction = ref (10) - let maxrequests = ref (1024) (* maximum requests per transaction *) -+let maxoutstanding = ref (1024) (* maximum outstanding requests, i.e. 
in-flight requests / domain *) -+let maxwatchevents = ref (1024) -+(* -+ maximum outstanding watch events per watch, -+ recommended >= maxoutstanding to avoid blocking backend transactions due to -+ malicious frontends -+ *) - - let gc_max_overhead = ref 120 (* 120% see comment in xenstored.ml *) - let conflict_burst_limit = ref 5.0 -diff --git a/tools/ocaml/xenstored/oxenstored.conf.in b/tools/ocaml/xenstored/oxenstored.conf.in -index 4ae48e42d47d..9d034e744b4b 100644 ---- a/tools/ocaml/xenstored/oxenstored.conf.in -+++ b/tools/ocaml/xenstored/oxenstored.conf.in -@@ -62,6 +62,8 @@ quota-maxwatch = 100 - quota-transaction = 10 - quota-maxrequests = 1024 - quota-path-max = 1024 -+quota-maxoutstanding = 1024 -+quota-maxwatchevents = 1024 - - # Activate filed base backend - persistent = false -diff --git a/tools/ocaml/xenstored/process.ml b/tools/ocaml/xenstored/process.ml -index 2d67456a2aa0..6dcedfda86e4 100644 ---- a/tools/ocaml/xenstored/process.ml -+++ b/tools/ocaml/xenstored/process.ml -@@ -57,7 +57,7 @@ let split_one_path data con = - | path :: "" :: [] -> Store.Path.create path (Connection.get_path con) - | _ -> raise Invalid_Cmd_Args - --let process_watch t cons = -+let process_watch source t cons = - let oldroot = t.Transaction.oldroot in - let newroot = Store.get_root t.store in - let ops = Transaction.get_paths t |> List.rev in -@@ -67,8 +67,9 @@ let process_watch t cons = - | Xenbus.Xb.Op.Rm -> true, None, oldroot - | Xenbus.Xb.Op.Setperms -> false, Some oldroot, newroot - | _ -> raise (Failure "huh ?") in -- Connections.fire_watches ?oldroot root cons (snd op) recurse in -- List.iter (fun op -> do_op_watch op cons) ops -+ Connections.fire_watches ?oldroot source root cons (snd op) recurse in -+ List.iter (fun op -> do_op_watch op cons) ops; -+ Connections.send_watchevents cons source - - let create_implicit_path t perm path = - let dirname = Store.Path.get_parent path in -@@ -234,6 +235,20 @@ let do_debug con t _domains cons data = - | "watches" :: _ -> 
- let watches = Connections.debug cons in - Some (watches ^ "\000") -+ | "xenbus" :: domid :: _ -> -+ let domid = int_of_string domid in -+ let con = Connections.find_domain cons domid in -+ let s = Printf.sprintf "xenbus: %s; overflow queue length: %d, can_input: %b, has_more_input: %b, has_old_output: %b, has_new_output: %b, has_more_work: %b. pending: %s" -+ (Xenbus.Xb.debug con.xb) -+ (Connection.source_pending_watchevents con) -+ (Connection.can_input con) -+ (Connection.has_more_input con) -+ (Connection.has_old_output con) -+ (Connection.has_new_output con) -+ (Connection.has_more_work con) -+ (Connections.debug_watchevents cons con) -+ in -+ Some s - | "mfn" :: domid :: _ -> - let domid = int_of_string domid in - let con = Connections.find_domain cons domid in -@@ -342,7 +357,7 @@ let reply_ack fct con t doms cons data = - fct con t doms cons data; - Packet.Ack (fun () -> - if Transaction.get_id t = Transaction.none then -- process_watch t cons -+ process_watch con t cons - ) - - let reply_data fct con t doms cons data = -@@ -501,7 +516,7 @@ let do_watch con t _domains cons data = - Packet.Ack (fun () -> - (* xenstore.txt says this watch is fired immediately, - implying even if path doesn't exist or is unreadable *) -- Connection.fire_single_watch_unchecked watch) -+ Connection.fire_single_watch_unchecked con watch) - - let do_unwatch con _t _domains cons data = - let (node, token) = -@@ -532,7 +547,7 @@ let do_transaction_end con t domains cons data = - if not success then - raise Transaction_again; - if commit then begin -- process_watch t cons; -+ process_watch con t cons; - match t.Transaction.ty with - | Transaction.No -> - () (* no need to record anything *) -@@ -699,7 +714,8 @@ let process_packet ~store ~cons ~doms ~con ~req = - let do_input store cons doms con = - let newpacket = - try -- Connection.do_input con -+ if Connection.can_input con then Connection.do_input con -+ else None - with Xenbus.Xb.Reconnect -> - info "%s requests a reconnect" 
(Connection.get_domstr con); - History.reconnect con; -@@ -727,6 +743,7 @@ let do_input store cons doms con = - Connection.incr_ops con - - let do_output _store _cons _doms con = -+ Connection.source_flush_watchevents con; - if Connection.has_output con then ( - if Connection.has_new_output con then ( - let packet = Connection.peek_output con in -diff --git a/tools/ocaml/xenstored/xenstored.ml b/tools/ocaml/xenstored/xenstored.ml -index 3b57ad016dfb..c799e20f1145 100644 ---- a/tools/ocaml/xenstored/xenstored.ml -+++ b/tools/ocaml/xenstored/xenstored.ml -@@ -103,6 +103,8 @@ let parse_config filename = - ("quota-maxentity", Config.Set_int Quota.maxent); - ("quota-maxsize", Config.Set_int Quota.maxsize); - ("quota-maxrequests", Config.Set_int Define.maxrequests); -+ ("quota-maxoutstanding", Config.Set_int Define.maxoutstanding); -+ ("quota-maxwatchevents", Config.Set_int Define.maxwatchevents); - ("quota-path-max", Config.Set_int Define.path_max); - ("gc-max-overhead", Config.Set_int Define.gc_max_overhead); - ("test-eagain", Config.Set_bool Transaction.test_eagain); diff --git a/xsa326-4.16-oxenstored-08.patch b/xsa326-4.16-oxenstored-08.patch deleted file mode 100644 index 8a47200..0000000 --- a/xsa326-4.16-oxenstored-08.patch +++ /dev/null @@ -1,49 +0,0 @@ -From 09aa10649f75a262028e9a9b7d859ef7efb23d54 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Thu, 29 Sep 2022 13:07:35 +0200 -Subject: SUPPORT.md: clarify support of untrusted driver domains with - oxenstored - -Add a support statement for the scope of support regarding different -Xenstore variants. Especially oxenstored does not (yet) have security -support of untrusted driver domains, as those might drive oxenstored -out of memory by creating lots of watch events for the guests they are -servicing. - -Add a statement regarding Live Update support of oxenstored. - -This is part of XSA-326. 
- -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Acked-by: George Dunlap -Acked-by: Julien Grall -Reviewed-by: Christian Lindig - -diff --git a/SUPPORT.md b/SUPPORT.md -index 85726102eab8..7d0cb34c8f6f 100644 ---- a/SUPPORT.md -+++ b/SUPPORT.md -@@ -179,13 +179,18 @@ Support for running qemu-xen device model in a linux stubdomain. - - Status: Tech Preview - --## Liveupdate of C xenstored daemon -+## Xenstore - -- Status: Tech Preview -+### C xenstored daemon - --## Liveupdate of OCaml xenstored daemon -+ Status: Supported -+ Status, Liveupdate: Tech Preview - -- Status: Tech Preview -+### OCaml xenstored daemon -+ -+ Status: Supported -+ Status, untrusted driver domains: Supported, not security supported -+ Status, Liveupdate: Not functional - - ## Toolstack/3rd party - diff --git a/xsa326-4.16-xenstored-01.patch b/xsa326-4.16-xenstored-01.patch deleted file mode 100644 index b4a4b0e..0000000 --- a/xsa326-4.16-xenstored-01.patch +++ /dev/null @@ -1,205 +0,0 @@ -From 5192f13a41661b1c1b9e0889d57c0f5b41925c39 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:07 +0200 -Subject: tools/xenstore: split up send_reply() - -Today send_reply() is used for both, normal request replies and watch -events. - -Split it up into send_reply() and send_event(). This will be used to -add some event specific handling. - -add_event() can be merged into send_event(), removing the need for an -intermediate memory allocation. - -This is part of XSA-326. 
- -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index e9c9695fd16e..249ad5ec6fb1 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -767,49 +767,32 @@ static void send_error(struct connection *conn, int error) - void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - const void *data, unsigned int len) - { -- struct buffered_data *bdata; -+ struct buffered_data *bdata = conn->in; -+ -+ assert(type != XS_WATCH_EVENT); - - if ( len > XENSTORE_PAYLOAD_MAX ) { - send_error(conn, E2BIG); - return; - } - -- /* Replies reuse the request buffer, events need a new one. */ -- if (type != XS_WATCH_EVENT) { -- bdata = conn->in; -- /* Drop asynchronous responses, e.g. errors for watch events. */ -- if (!bdata) -- return; -- bdata->inhdr = true; -- bdata->used = 0; -- conn->in = NULL; -- } else { -- /* Message is a child of the connection for auto-cleanup. */ -- bdata = new_buffer(conn); -+ if (!bdata) -+ return; -+ bdata->inhdr = true; -+ bdata->used = 0; - -- /* -- * Allocation failure here is unfortunate: we have no way to -- * tell anybody about it. -- */ -- if (!bdata) -- return; -- } - if (len <= DEFAULT_BUFFER_SIZE) - bdata->buffer = bdata->default_buffer; -- else -+ else { - bdata->buffer = talloc_array(bdata, char, len); -- if (!bdata->buffer) { -- if (type == XS_WATCH_EVENT) { -- /* Same as above: no way to tell someone. */ -- talloc_free(bdata); -+ if (!bdata->buffer) { -+ send_error(conn, ENOMEM); - return; - } -- /* re-establish request buffer for sending ENOMEM. */ -- conn->in = bdata; -- send_error(conn, ENOMEM); -- return; - } - -+ conn->in = NULL; -+ - /* Update relevant header fields and fill in the message body. 
*/ - bdata->hdr.msg.type = type; - bdata->hdr.msg.len = len; -@@ -817,8 +800,39 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - - /* Queue for later transmission. */ - list_add_tail(&bdata->list, &conn->out_list); -+} - -- return; -+/* -+ * Send a watch event. -+ * As this is not directly related to the current command, errors can't be -+ * reported. -+ */ -+void send_event(struct connection *conn, const char *path, const char *token) -+{ -+ struct buffered_data *bdata; -+ unsigned int len; -+ -+ len = strlen(path) + 1 + strlen(token) + 1; -+ /* Don't try to send over-long events. */ -+ if (len > XENSTORE_PAYLOAD_MAX) -+ return; -+ -+ bdata = new_buffer(conn); -+ if (!bdata) -+ return; -+ -+ bdata->buffer = talloc_array(bdata, char, len); -+ if (!bdata->buffer) { -+ talloc_free(bdata); -+ return; -+ } -+ strcpy(bdata->buffer, path); -+ strcpy(bdata->buffer + strlen(path) + 1, token); -+ bdata->hdr.msg.type = XS_WATCH_EVENT; -+ bdata->hdr.msg.len = len; -+ -+ /* Queue for later transmission. 
*/ -+ list_add_tail(&bdata->list, &conn->out_list); - } - - /* Some routines (write, mkdir, etc) just need a non-error return */ -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 0004fa848c83..9af9af4390bd 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -187,6 +187,7 @@ unsigned int get_string(const struct buffered_data *data, unsigned int offset); - - void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - const void *data, unsigned int len); -+void send_event(struct connection *conn, const char *path, const char *token); - - /* Some routines (write, mkdir, etc) just need a non-error return */ - void send_ack(struct connection *conn, enum xsd_sockmsg_type type); -diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c -index aca0a71bada1..99a2c266b28a 100644 ---- a/tools/xenstore/xenstored_watch.c -+++ b/tools/xenstore/xenstored_watch.c -@@ -86,35 +86,6 @@ static const char *get_watch_path(const struct watch *watch, const char *name) - } - - /* -- * Send a watch event. -- * Temporary memory allocations are done with ctx. -- */ --static void add_event(struct connection *conn, -- const void *ctx, -- struct watch *watch, -- const char *name) --{ -- /* Data to send (node\0token\0). */ -- unsigned int len; -- char *data; -- -- name = get_watch_path(watch, name); -- -- len = strlen(name) + 1 + strlen(watch->token) + 1; -- /* Don't try to send over-long events. */ -- if (len > XENSTORE_PAYLOAD_MAX) -- return; -- -- data = talloc_array(ctx, char, len); -- if (!data) -- return; -- strcpy(data, name); -- strcpy(data + strlen(name) + 1, watch->token); -- send_reply(conn, XS_WATCH_EVENT, data, len); -- talloc_free(data); --} -- --/* - * Check permissions of a specific watch to fire: - * Either the node itself or its parent have to be readable by the connection - * the watch has been setup for. 
In case a watch event is created due to -@@ -190,10 +161,14 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, - list_for_each_entry(watch, &i->watches, list) { - if (exact) { - if (streq(name, watch->node)) -- add_event(i, ctx, watch, name); -+ send_event(i, -+ get_watch_path(watch, name), -+ watch->token); - } else { - if (is_child(name, watch->node)) -- add_event(i, ctx, watch, name); -+ send_event(i, -+ get_watch_path(watch, name), -+ watch->token); - } - } - } -@@ -292,7 +267,7 @@ int do_watch(struct connection *conn, struct buffered_data *in) - send_ack(conn, XS_WATCH); - - /* We fire once up front: simplifies clients and restart. */ -- add_event(conn, in, watch, watch->node); -+ send_event(conn, get_watch_path(watch, watch->node), watch->token); - - return 0; - } diff --git a/xsa326-4.16-xenstored-02.patch b/xsa326-4.16-xenstored-02.patch deleted file mode 100644 index 540ab03..0000000 --- a/xsa326-4.16-xenstored-02.patch +++ /dev/null @@ -1,108 +0,0 @@ -From 0a4c86f8a8febd85610496470123adfc4fbc1c5d Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:07 +0200 -Subject: tools/xenstore: add helpers to free struct buffered_data - -Add two helpers for freeing struct buffered_data: free_buffered_data() -for freeing one instance and conn_free_buffered_data() for freeing all -instances for a connection. - -This is avoiding duplicated code and will help later when more actions -are needed when freeing a struct buffered_data. - -This is part of XSA-326. 
- -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 249ad5ec6fb1..527a1ebdeded 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -211,6 +211,21 @@ void reopen_log(void) - } - } - -+static void free_buffered_data(struct buffered_data *out, -+ struct connection *conn) -+{ -+ list_del(&out->list); -+ talloc_free(out); -+} -+ -+void conn_free_buffered_data(struct connection *conn) -+{ -+ struct buffered_data *out; -+ -+ while ((out = list_top(&conn->out_list, struct buffered_data, list))) -+ free_buffered_data(out, conn); -+} -+ - static bool write_messages(struct connection *conn) - { - int ret; -@@ -254,8 +269,7 @@ static bool write_messages(struct connection *conn) - - trace_io(conn, out, 1); - -- list_del(&out->list); -- talloc_free(out); -+ free_buffered_data(out, conn); - - return true; - } -@@ -1506,18 +1520,12 @@ static struct { - */ - void ignore_connection(struct connection *conn) - { -- struct buffered_data *out, *tmp; -- - trace("CONN %p ignored\n", conn); - - conn->is_ignored = true; - conn_delete_all_watches(conn); - conn_delete_all_transactions(conn); -- -- list_for_each_entry_safe(out, tmp, &conn->out_list, list) { -- list_del(&out->list); -- talloc_free(out); -- } -+ conn_free_buffered_data(conn); - - talloc_free(conn->in); - conn->in = NULL; -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 9af9af4390bd..e7ee87825c3b 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -276,6 +276,8 @@ int remember_string(struct hashtable *hash, const char *str); - - void set_tdb_key(const char *name, TDB_DATA *key); - -+void conn_free_buffered_data(struct connection *conn); -+ - const char *dump_state_global(FILE *fp); - const char *dump_state_buffered_data(FILE *fp, const struct connection *c, - struct xs_state_connection *sc); -diff 
--git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index d03c7d93a9e7..93c4c1edcdd1 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -411,15 +411,10 @@ static struct domain *find_domain_by_domid(unsigned int domid) - static void domain_conn_reset(struct domain *domain) - { - struct connection *conn = domain->conn; -- struct buffered_data *out; - - conn_delete_all_watches(conn); - conn_delete_all_transactions(conn); -- -- while ((out = list_top(&conn->out_list, struct buffered_data, list))) { -- list_del(&out->list); -- talloc_free(out); -- } -+ conn_free_buffered_data(conn); - - talloc_free(conn->in); - diff --git a/xsa326-4.16-xenstored-03.patch b/xsa326-4.16-xenstored-03.patch deleted file mode 100644 index a3a0d81..0000000 --- a/xsa326-4.16-xenstored-03.patch +++ /dev/null @@ -1,192 +0,0 @@ -From a6c4198242bf69bea1825492b7665b559023390c Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:07 +0200 -Subject: tools/xenstore: reduce number of watch events - -When removing a watched node outside of a transaction, two watch events -are being produced instead of just a single one. - -When finalizing a transaction watch events can be generated for each -node which is being modified, even if outside a transaction such -modifications might not have resulted in a watch event. - -This happens e.g.: - -- for nodes which are only modified due to added/removed child entries -- for nodes being removed or created implicitly (e.g. creation of a/b/c - is implicitly creating a/b, resulting in watch events for a, a/b and - a/b/c instead of a/b/c only) - -Avoid these additional watch events, in order to reduce the needed -memory inside Xenstore for queueing them. - -This is being achieved by adding event flags to struct accessed_node -specifying whether an event should be triggered, and whether it should -be an exact match of the modified path. 
Both flags can be set from -fire_watches() instead of implying them only. - -This is part of XSA-326. - -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 527a1ebdeded..bf2243873901 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1295,7 +1295,7 @@ static void delete_child(struct connection *conn, - } - - static int delete_node(struct connection *conn, const void *ctx, -- struct node *parent, struct node *node) -+ struct node *parent, struct node *node, bool watch_exact) - { - char *name; - -@@ -1307,7 +1307,7 @@ static int delete_node(struct connection *conn, const void *ctx, - node->children); - child = name ? read_node(conn, node, name) : NULL; - if (child) { -- if (delete_node(conn, ctx, node, child)) -+ if (delete_node(conn, ctx, node, child, true)) - return errno; - } else { - trace("delete_node: Error deleting child '%s/%s'!\n", -@@ -1319,7 +1319,12 @@ static int delete_node(struct connection *conn, const void *ctx, - talloc_free(name); - } - -- fire_watches(conn, ctx, node->name, node, true, NULL); -+ /* -+ * Fire the watches now, when we can still see the node permissions. -+ * This fine as we are single threaded and the next possible read will -+ * be handled only after the node has been really removed. -+ */ -+ fire_watches(conn, ctx, node->name, node, watch_exact, NULL); - delete_node_single(conn, node); - delete_child(conn, parent, basename(node->name)); - talloc_free(node); -@@ -1345,13 +1350,7 @@ static int _rm(struct connection *conn, const void *ctx, struct node *node, - return (errno == ENOMEM) ? ENOMEM : EINVAL; - node->parent = parent; - -- /* -- * Fire the watches now, when we can still see the node permissions. -- * This fine as we are single threaded and the next possible read will -- * be handled only after the node has been really removed. 
-- */ -- fire_watches(conn, ctx, name, node, false, NULL); -- return delete_node(conn, ctx, parent, node); -+ return delete_node(conn, ctx, parent, node, false); - } - - -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index faf6c930e42a..54432907fc76 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -130,6 +130,10 @@ struct accessed_node - - /* Transaction node in data base? */ - bool ta_node; -+ -+ /* Watch event flags. */ -+ bool fire_watch; -+ bool watch_exact; - }; - - struct changed_domain -@@ -324,6 +328,29 @@ int access_node(struct connection *conn, struct node *node, - } - - /* -+ * A watch event should be fired for a node modified inside a transaction. -+ * Set the corresponding information. A non-exact event is replacing an exact -+ * one, but not the other way round. -+ */ -+void queue_watches(struct connection *conn, const char *name, bool watch_exact) -+{ -+ struct accessed_node *i; -+ -+ i = find_accessed_node(conn->transaction, name); -+ if (!i) { -+ conn->transaction->fail = true; -+ return; -+ } -+ -+ if (!i->fire_watch) { -+ i->fire_watch = true; -+ i->watch_exact = watch_exact; -+ } else if (!watch_exact) { -+ i->watch_exact = false; -+ } -+} -+ -+/* - * Finalize transaction: - * Walk through accessed nodes and check generation against global data. - * If all entries match, read the transaction entries and write them without -@@ -377,15 +404,15 @@ static int finalize_transaction(struct connection *conn, - ret = tdb_store(tdb_ctx, key, data, - TDB_REPLACE); - talloc_free(data.dptr); -- if (ret) -- goto err; -- fire_watches(conn, trans, i->node, NULL, false, -- i->perms.p ? &i->perms : NULL); - } else { -- fire_watches(conn, trans, i->node, NULL, false, -+ ret = tdb_delete(tdb_ctx, key); -+ } -+ if (ret) -+ goto err; -+ if (i->fire_watch) { -+ fire_watches(conn, trans, i->node, NULL, -+ i->watch_exact, - i->perms.p ? 
&i->perms : NULL); -- if (tdb_delete(tdb_ctx, key)) -- goto err; - } - } - -diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h -index 14062730e3c9..0093cac807e3 100644 ---- a/tools/xenstore/xenstored_transaction.h -+++ b/tools/xenstore/xenstored_transaction.h -@@ -42,6 +42,9 @@ void transaction_entry_dec(struct transaction *trans, unsigned int domid); - int access_node(struct connection *conn, struct node *node, - enum node_access_type type, TDB_DATA *key); - -+/* Queue watches for a modified node. */ -+void queue_watches(struct connection *conn, const char *name, bool watch_exact); -+ - /* Prepend the transaction to name if appropriate. */ - int transaction_prepend(struct connection *conn, const char *name, - TDB_DATA *key); -diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c -index 99a2c266b28a..205d9d8ea116 100644 ---- a/tools/xenstore/xenstored_watch.c -+++ b/tools/xenstore/xenstored_watch.c -@@ -29,6 +29,7 @@ - #include "xenstore_lib.h" - #include "utils.h" - #include "xenstored_domain.h" -+#include "xenstored_transaction.h" - - extern int quota_nb_watch_per_domain; - -@@ -143,9 +144,11 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, - struct connection *i; - struct watch *watch; - -- /* During transactions, don't fire watches. */ -- if (conn && conn->transaction) -+ /* During transactions, don't fire watches, but queue them. */ -+ if (conn && conn->transaction) { -+ queue_watches(conn, name, exact); - return; -+ } - - /* Create an event for each watch. 
*/ - list_for_each_entry(i, &connections, list) { diff --git a/xsa326-4.16-xenstored-04.patch b/xsa326-4.16-xenstored-04.patch deleted file mode 100644 index facbba4..0000000 --- a/xsa326-4.16-xenstored-04.patch +++ /dev/null @@ -1,302 +0,0 @@ -From 2feed737530592688382c655680982e10951c1ec Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:07 +0200 -Subject: tools/xenstore: let unread watch events time out - -A future modification will limit the number of outstanding requests -for a domain, where "outstanding" means that the response of the -request or any resulting watch event hasn't been consumed yet. - -In order to avoid a malicious guest being capable to block other guests -by not reading watch events, add a timeout for watch events. In case a -watch event hasn't been consumed after this timeout, it is being -deleted. Set the default timeout to 20 seconds (a random value being -not too high). - -In order to support to specify other timeout values in future, use a -generic command line option for that purpose: - ---timeout|-w watch-event= - -This is part of XSA-326 / CVE-2022-42311. - -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index bf2243873901..45244c021cd3 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -108,6 +108,8 @@ int quota_max_transaction = 10; - int quota_nb_perms_per_node = 5; - int quota_max_path_len = XENSTORE_REL_PATH_MAX; - -+unsigned int timeout_watch_event_msec = 20000; -+ - void trace(const char *fmt, ...) 
- { - va_list arglist; -@@ -211,19 +213,92 @@ void reopen_log(void) - } - } - -+static uint64_t get_now_msec(void) -+{ -+ struct timespec now_ts; -+ -+ if (clock_gettime(CLOCK_MONOTONIC, &now_ts)) -+ barf_perror("Could not find time (clock_gettime failed)"); -+ -+ return now_ts.tv_sec * 1000 + now_ts.tv_nsec / 1000000; -+} -+ - static void free_buffered_data(struct buffered_data *out, - struct connection *conn) - { -+ struct buffered_data *req; -+ - list_del(&out->list); -+ -+ /* -+ * Update conn->timeout_msec with the next found timeout value in the -+ * queued pending requests. -+ */ -+ if (out->timeout_msec) { -+ conn->timeout_msec = 0; -+ list_for_each_entry(req, &conn->out_list, list) { -+ if (req->timeout_msec) { -+ conn->timeout_msec = req->timeout_msec; -+ break; -+ } -+ } -+ } -+ - talloc_free(out); - } - -+static void check_event_timeout(struct connection *conn, uint64_t msecs, -+ int *ptimeout) -+{ -+ uint64_t delta; -+ struct buffered_data *out, *tmp; -+ -+ if (!conn->timeout_msec) -+ return; -+ -+ delta = conn->timeout_msec - msecs; -+ if (conn->timeout_msec <= msecs) { -+ delta = 0; -+ list_for_each_entry_safe(out, tmp, &conn->out_list, list) { -+ /* -+ * Only look at buffers with timeout and no data -+ * already written to the ring. -+ */ -+ if (out->timeout_msec && out->inhdr && !out->used) { -+ if (out->timeout_msec > msecs) { -+ conn->timeout_msec = out->timeout_msec; -+ delta = conn->timeout_msec - msecs; -+ break; -+ } -+ -+ /* -+ * Free out without updating conn->timeout_msec, -+ * as the update is done in this loop already. 
-+ */ -+ out->timeout_msec = 0; -+ trace("watch event path %s for domain %u timed out\n", -+ out->buffer, conn->id); -+ free_buffered_data(out, conn); -+ } -+ } -+ if (!delta) { -+ conn->timeout_msec = 0; -+ return; -+ } -+ } -+ -+ if (*ptimeout == -1 || *ptimeout > delta) -+ *ptimeout = delta; -+} -+ - void conn_free_buffered_data(struct connection *conn) - { - struct buffered_data *out; - - while ((out = list_top(&conn->out_list, struct buffered_data, list))) - free_buffered_data(out, conn); -+ -+ conn->timeout_msec = 0; - } - - static bool write_messages(struct connection *conn) -@@ -411,6 +486,7 @@ static void initialize_fds(int *p_sock_pollfd_idx, int *ptimeout) - { - struct connection *conn; - struct wrl_timestampt now; -+ uint64_t msecs; - - if (fds) - memset(fds, 0, sizeof(struct pollfd) * current_array_size); -@@ -431,10 +507,12 @@ static void initialize_fds(int *p_sock_pollfd_idx, int *ptimeout) - - wrl_gettime_now(&now); - wrl_log_periodic(now); -+ msecs = get_now_msec(); - - list_for_each_entry(conn, &connections, list) { - if (conn->domain) { - wrl_check_timeout(conn->domain, now, ptimeout); -+ check_event_timeout(conn, msecs, ptimeout); - if (conn_can_read(conn) || - (conn_can_write(conn) && - !list_empty(&conn->out_list))) -@@ -794,6 +872,7 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - return; - bdata->inhdr = true; - bdata->used = 0; -+ bdata->timeout_msec = 0; - - if (len <= DEFAULT_BUFFER_SIZE) - bdata->buffer = bdata->default_buffer; -@@ -845,6 +924,12 @@ void send_event(struct connection *conn, const char *path, const char *token) - bdata->hdr.msg.type = XS_WATCH_EVENT; - bdata->hdr.msg.len = len; - -+ if (timeout_watch_event_msec && domain_is_unprivileged(conn)) { -+ bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec; -+ if (!conn->timeout_msec) -+ conn->timeout_msec = bdata->timeout_msec; -+ } -+ - /* Queue for later transmission. 
*/ - list_add_tail(&bdata->list, &conn->out_list); - } -@@ -2201,6 +2286,9 @@ static void usage(void) - " -t, --transaction limit the number of transaction allowed per domain,\n" - " -A, --perm-nb limit the number of permissions per node,\n" - " -M, --path-max limit the allowed Xenstore node path length,\n" -+" -w, --timeout = set the timeout in seconds for ,\n" -+" allowed timeout candidates are:\n" -+" watch-event: time a watch-event is kept pending\n" - " -R, --no-recovery to request that no recovery should be attempted when\n" - " the store is corrupted (debug only),\n" - " -I, --internal-db store database in memory, not on disk\n" -@@ -2223,6 +2311,7 @@ static struct option options[] = { - { "transaction", 1, NULL, 't' }, - { "perm-nb", 1, NULL, 'A' }, - { "path-max", 1, NULL, 'M' }, -+ { "timeout", 1, NULL, 'w' }, - { "no-recovery", 0, NULL, 'R' }, - { "internal-db", 0, NULL, 'I' }, - { "verbose", 0, NULL, 'V' }, -@@ -2236,6 +2325,39 @@ int dom0_domid = 0; - int dom0_event = 0; - int priv_domid = 0; - -+static int get_optval_int(const char *arg) -+{ -+ char *end; -+ long val; -+ -+ val = strtol(arg, &end, 10); -+ if (!*arg || *end || val < 0 || val > INT_MAX) -+ barf("invalid parameter value \"%s\"\n", arg); -+ -+ return val; -+} -+ -+static bool what_matches(const char *arg, const char *what) -+{ -+ unsigned int what_len = strlen(what); -+ -+ return !strncmp(arg, what, what_len) && arg[what_len] == '='; -+} -+ -+static void set_timeout(const char *arg) -+{ -+ const char *eq = strchr(arg, '='); -+ int val; -+ -+ if (!eq) -+ barf("quotas must be specified via =\n"); -+ val = get_optval_int(eq + 1); -+ if (what_matches(arg, "watch-event")) -+ timeout_watch_event_msec = val * 1000; -+ else -+ barf("unknown timeout \"%s\"\n", arg); -+} -+ - int main(int argc, char *argv[]) - { - int opt; -@@ -2250,7 +2372,7 @@ int main(int argc, char *argv[]) - orig_argc = argc; - orig_argv = argv; - -- while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:T:RVW:U", options, -+ 
while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:T:RVW:w:U", options, - NULL)) != -1) { - switch (opt) { - case 'D': -@@ -2300,6 +2422,9 @@ int main(int argc, char *argv[]) - quota_max_path_len = min(XENSTORE_REL_PATH_MAX, - quota_max_path_len); - break; -+ case 'w': -+ set_timeout(optarg); -+ break; - case 'e': - dom0_event = strtol(optarg, NULL, 10); - break; -@@ -2741,6 +2866,12 @@ static void add_buffered_data(struct buffered_data *bdata, - barf("error restoring buffered data"); - - memcpy(bdata->buffer, data, len); -+ if (bdata->hdr.msg.type == XS_WATCH_EVENT && timeout_watch_event_msec && -+ domain_is_unprivileged(conn)) { -+ bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec; -+ if (!conn->timeout_msec) -+ conn->timeout_msec = bdata->timeout_msec; -+ } - - /* Queue for later transmission. */ - list_add_tail(&bdata->list, &conn->out_list); -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index e7ee87825c3b..8a81fc693f01 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -27,6 +27,7 @@ - #include - #include - #include -+#include - #include - - #include "xenstore_lib.h" -@@ -67,6 +68,8 @@ struct buffered_data - char raw[sizeof(struct xsd_sockmsg)]; - } hdr; - -+ uint64_t timeout_msec; -+ - /* The actual data. */ - char *buffer; - char default_buffer[DEFAULT_BUFFER_SIZE]; -@@ -118,6 +121,7 @@ struct connection - - /* Buffered output data */ - struct list_head out_list; -+ uint64_t timeout_msec; - - /* Transaction context for current request (NULL if none). */ - struct transaction *transaction; -@@ -244,6 +248,8 @@ extern int dom0_event; - extern int priv_domid; - extern int quota_nb_entry_per_domain; - -+extern unsigned int timeout_watch_event_msec; -+ - /* Map the kernel's xenstore page. 
*/ - void *xenbus_map(void); - void unmap_xenbus(void *interface); diff --git a/xsa326-4.16-xenstored-05.patch b/xsa326-4.16-xenstored-05.patch deleted file mode 100644 index 77f9c25..0000000 --- a/xsa326-4.16-xenstored-05.patch +++ /dev/null @@ -1,443 +0,0 @@ -From 2eee122a45eb4a218596b103ce7f0759a824cf2e Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:08 +0200 -Subject: tools/xenstore: limit outstanding requests - -Add another quota for limiting the number of outstanding requests of a -guest. As the way to specify quotas on the command line is becoming -rather nasty, switch to a new scheme using [--quota|-Q] = -allowing to add more quotas in future easily. - -Set the default value to 20 (basically a random value not seeming to -be too high or too low). - -A request is said to be outstanding if any message generated by this -request (the direct response plus potential watch events) is not yet -completely stored into a ring buffer. The initial watch event sent as -a result of registering a watch is an exception. - -Note that across a live update the relation to buffered watch events -for other domains is lost. - -Use talloc_zero() for allocating the domain structure in order to have -all per-domain quota zeroed initially. - -This is part of XSA-326 / CVE-2022-42312. 
- -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Acked-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 45244c021cd3..488d540f3a32 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -107,6 +107,7 @@ int quota_max_entry_size = 2048; /* 2K */ - int quota_max_transaction = 10; - int quota_nb_perms_per_node = 5; - int quota_max_path_len = XENSTORE_REL_PATH_MAX; -+int quota_req_outstanding = 20; - - unsigned int timeout_watch_event_msec = 20000; - -@@ -223,12 +224,24 @@ static uint64_t get_now_msec(void) - return now_ts.tv_sec * 1000 + now_ts.tv_nsec / 1000000; - } - -+/* -+ * Remove a struct buffered_data from the list of outgoing data. -+ * A struct buffered_data related to a request having caused watch events to be -+ * sent is kept until all those events have been written out. -+ * Each watch event is referencing the related request via pend.req, while the -+ * number of watch events caused by a request is kept in pend.ref.event_cnt -+ * (those two cases are mutually exclusive, so the two fields can share memory -+ * via a union). -+ * The struct buffered_data is freed only if no related watch event is -+ * referencing it. The related return data can be freed right away. 
-+ */ - static void free_buffered_data(struct buffered_data *out, - struct connection *conn) - { - struct buffered_data *req; - - list_del(&out->list); -+ out->on_out_list = false; - - /* - * Update conn->timeout_msec with the next found timeout value in the -@@ -244,6 +257,30 @@ static void free_buffered_data(struct buffered_data *out, - } - } - -+ if (out->hdr.msg.type == XS_WATCH_EVENT) { -+ req = out->pend.req; -+ if (req) { -+ req->pend.ref.event_cnt--; -+ if (!req->pend.ref.event_cnt && !req->on_out_list) { -+ if (req->on_ref_list) { -+ domain_outstanding_domid_dec( -+ req->pend.ref.domid); -+ list_del(&req->list); -+ } -+ talloc_free(req); -+ } -+ } -+ } else if (out->pend.ref.event_cnt) { -+ /* Hang out off from conn. */ -+ talloc_steal(NULL, out); -+ if (out->buffer != out->default_buffer) -+ talloc_free(out->buffer); -+ list_add(&out->list, &conn->ref_list); -+ out->on_ref_list = true; -+ return; -+ } else -+ domain_outstanding_dec(conn); -+ - talloc_free(out); - } - -@@ -405,6 +442,7 @@ int delay_request(struct connection *conn, struct buffered_data *in, - static int destroy_conn(void *_conn) - { - struct connection *conn = _conn; -+ struct buffered_data *req; - - /* Flush outgoing if possible, but don't block. */ - if (!conn->domain) { -@@ -418,6 +456,11 @@ static int destroy_conn(void *_conn) - break; - close(conn->fd); - } -+ -+ conn_free_buffered_data(conn); -+ list_for_each_entry(req, &conn->ref_list, list) -+ req->on_ref_list = false; -+ - if (conn->target) - talloc_unlink(conn, conn->target); - list_del(&conn->list); -@@ -893,6 +936,8 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - - /* Queue for later transmission. */ - list_add_tail(&bdata->list, &conn->out_list); -+ bdata->on_out_list = true; -+ domain_outstanding_inc(conn); - } - - /* -@@ -900,7 +945,8 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - * As this is not directly related to the current command, errors can't be - * reported. 
- */ --void send_event(struct connection *conn, const char *path, const char *token) -+void send_event(struct buffered_data *req, struct connection *conn, -+ const char *path, const char *token) - { - struct buffered_data *bdata; - unsigned int len; -@@ -930,8 +976,13 @@ void send_event(struct connection *conn, const char *path, const char *token) - conn->timeout_msec = bdata->timeout_msec; - } - -+ bdata->pend.req = req; -+ if (req) -+ req->pend.ref.event_cnt++; -+ - /* Queue for later transmission. */ - list_add_tail(&bdata->list, &conn->out_list); -+ bdata->on_out_list = true; - } - - /* Some routines (write, mkdir, etc) just need a non-error return */ -@@ -1740,6 +1791,7 @@ static void handle_input(struct connection *conn) - return; - } - in = conn->in; -+ in->pend.ref.domid = conn->id; - - /* Not finished header yet? */ - if (in->inhdr) { -@@ -1808,6 +1860,7 @@ struct connection *new_connection(const struct interface_funcs *funcs) - new->is_stalled = false; - new->transaction_started = 0; - INIT_LIST_HEAD(&new->out_list); -+ INIT_LIST_HEAD(&new->ref_list); - INIT_LIST_HEAD(&new->watches); - INIT_LIST_HEAD(&new->transaction_list); - INIT_LIST_HEAD(&new->delayed); -@@ -2286,6 +2339,9 @@ static void usage(void) - " -t, --transaction limit the number of transaction allowed per domain,\n" - " -A, --perm-nb limit the number of permissions per node,\n" - " -M, --path-max limit the allowed Xenstore node path length,\n" -+" -Q, --quota = set the quota to the value , allowed\n" -+" quotas are:\n" -+" outstanding: number of outstanding requests\n" - " -w, --timeout = set the timeout in seconds for ,\n" - " allowed timeout candidates are:\n" - " watch-event: time a watch-event is kept pending\n" -@@ -2311,6 +2367,7 @@ static struct option options[] = { - { "transaction", 1, NULL, 't' }, - { "perm-nb", 1, NULL, 'A' }, - { "path-max", 1, NULL, 'M' }, -+ { "quota", 1, NULL, 'Q' }, - { "timeout", 1, NULL, 'w' }, - { "no-recovery", 0, NULL, 'R' }, - { "internal-db", 0, NULL, 
'I' }, -@@ -2358,6 +2415,20 @@ static void set_timeout(const char *arg) - barf("unknown timeout \"%s\"\n", arg); - } - -+static void set_quota(const char *arg) -+{ -+ const char *eq = strchr(arg, '='); -+ int val; -+ -+ if (!eq) -+ barf("quotas must be specified via =\n"); -+ val = get_optval_int(eq + 1); -+ if (what_matches(arg, "outstanding")) -+ quota_req_outstanding = val; -+ else -+ barf("unknown quota \"%s\"\n", arg); -+} -+ - int main(int argc, char *argv[]) - { - int opt; -@@ -2372,8 +2443,8 @@ int main(int argc, char *argv[]) - orig_argc = argc; - orig_argv = argv; - -- while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:T:RVW:w:U", options, -- NULL)) != -1) { -+ while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:Q:T:RVW:w:U", -+ options, NULL)) != -1) { - switch (opt) { - case 'D': - no_domain_init = true; -@@ -2422,6 +2493,9 @@ int main(int argc, char *argv[]) - quota_max_path_len = min(XENSTORE_REL_PATH_MAX, - quota_max_path_len); - break; -+ case 'Q': -+ set_quota(optarg); -+ break; - case 'w': - set_timeout(optarg); - break; -@@ -2875,6 +2949,14 @@ static void add_buffered_data(struct buffered_data *bdata, - - /* Queue for later transmission. */ - list_add_tail(&bdata->list, &conn->out_list); -+ bdata->on_out_list = true; -+ /* -+ * Watch events are never "outstanding", but the request causing them -+ * are instead kept "outstanding" until all watch events caused by that -+ * request have been delivered. -+ */ -+ if (bdata->hdr.msg.type != XS_WATCH_EVENT) -+ domain_outstanding_inc(conn); - } - - void read_state_buffered_data(const void *ctx, struct connection *conn, -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 8a81fc693f01..db09f463a657 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -56,6 +56,8 @@ struct xs_state_connection; - struct buffered_data - { - struct list_head list; -+ bool on_out_list; -+ bool on_ref_list; - - /* Are we still doing the header? 
*/ - bool inhdr; -@@ -63,6 +65,17 @@ struct buffered_data - /* How far are we? */ - unsigned int used; - -+ /* Outstanding request accounting. */ -+ union { -+ /* ref is being used for requests. */ -+ struct { -+ unsigned int event_cnt; /* # of outstanding events. */ -+ unsigned int domid; /* domid of request. */ -+ } ref; -+ /* req is being used for watch events. */ -+ struct buffered_data *req; /* request causing event. */ -+ } pend; -+ - union { - struct xsd_sockmsg msg; - char raw[sizeof(struct xsd_sockmsg)]; -@@ -123,6 +136,9 @@ struct connection - struct list_head out_list; - uint64_t timeout_msec; - -+ /* Referenced requests no longer pending. */ -+ struct list_head ref_list; -+ - /* Transaction context for current request (NULL if none). */ - struct transaction *transaction; - -@@ -191,7 +207,8 @@ unsigned int get_string(const struct buffered_data *data, unsigned int offset); - - void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - const void *data, unsigned int len); --void send_event(struct connection *conn, const char *path, const char *token); -+void send_event(struct buffered_data *req, struct connection *conn, -+ const char *path, const char *token); - - /* Some routines (write, mkdir, etc) just need a non-error return */ - void send_ack(struct connection *conn, enum xsd_sockmsg_type type); -@@ -247,6 +264,7 @@ extern int dom0_domid; - extern int dom0_event; - extern int priv_domid; - extern int quota_nb_entry_per_domain; -+extern int quota_req_outstanding; - - extern unsigned int timeout_watch_event_msec; - -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index 93c4c1edcdd1..850085a92c76 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -78,6 +78,9 @@ struct domain - /* number of watch for this domain */ - int nbwatch; - -+ /* Number of outstanding requests. 
*/ -+ int nboutstanding; -+ - /* write rate limit */ - wrl_creditt wrl_credit; /* [ -wrl_config_writecost, +_dburst ] */ - struct wrl_timestampt wrl_timestamp; -@@ -183,8 +186,12 @@ static bool domain_can_read(struct connection *conn) - { - struct xenstore_domain_interface *intf = conn->domain->interface; - -- if (domain_is_unprivileged(conn) && conn->domain->wrl_credit < 0) -- return false; -+ if (domain_is_unprivileged(conn)) { -+ if (conn->domain->wrl_credit < 0) -+ return false; -+ if (conn->domain->nboutstanding >= quota_req_outstanding) -+ return false; -+ } - - return (intf->req_cons != intf->req_prod); - } -@@ -331,7 +338,7 @@ static struct domain *alloc_domain(const void *context, unsigned int domid) - { - struct domain *domain; - -- domain = talloc(context, struct domain); -+ domain = talloc_zero(context, struct domain); - if (!domain) { - errno = ENOMEM; - return NULL; -@@ -392,9 +399,6 @@ static int new_domain(struct domain *domain, int port, bool restore) - domain->conn->domain = domain; - domain->conn->id = domain->domid; - -- domain->nbentry = 0; -- domain->nbwatch = 0; -- - return 0; - } - -@@ -938,6 +942,28 @@ int domain_watch(struct connection *conn) - : 0; - } - -+void domain_outstanding_inc(struct connection *conn) -+{ -+ if (!conn || !conn->domain) -+ return; -+ conn->domain->nboutstanding++; -+} -+ -+void domain_outstanding_dec(struct connection *conn) -+{ -+ if (!conn || !conn->domain) -+ return; -+ conn->domain->nboutstanding--; -+} -+ -+void domain_outstanding_domid_dec(unsigned int domid) -+{ -+ struct domain *d = find_domain_by_domid(domid); -+ -+ if (d) -+ d->nboutstanding--; -+} -+ - static wrl_creditt wrl_config_writecost = WRL_FACTOR; - static wrl_creditt wrl_config_rate = WRL_RATE * WRL_FACTOR; - static wrl_creditt wrl_config_dburst = WRL_DBURST * WRL_FACTOR; -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index 1e929b8f8c6f..4f51b005291a 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ 
b/tools/xenstore/xenstored_domain.h -@@ -64,6 +64,9 @@ int domain_entry(struct connection *conn); - void domain_watch_inc(struct connection *conn); - void domain_watch_dec(struct connection *conn); - int domain_watch(struct connection *conn); -+void domain_outstanding_inc(struct connection *conn); -+void domain_outstanding_dec(struct connection *conn); -+void domain_outstanding_domid_dec(unsigned int domid); - - /* Special node permission handling. */ - int set_perms_special(struct connection *conn, const char *name, -diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c -index 205d9d8ea116..0755ffa375ba 100644 ---- a/tools/xenstore/xenstored_watch.c -+++ b/tools/xenstore/xenstored_watch.c -@@ -142,6 +142,7 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, - struct node *node, bool exact, struct node_perms *perms) - { - struct connection *i; -+ struct buffered_data *req; - struct watch *watch; - - /* During transactions, don't fire watches, but queue them. */ -@@ -150,6 +151,8 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, - return; - } - -+ req = domain_is_unprivileged(conn) ? conn->in : NULL; -+ - /* Create an event for each watch. */ - list_for_each_entry(i, &connections, list) { - /* introduce/release domain watches */ -@@ -164,12 +167,12 @@ void fire_watches(struct connection *conn, const void *ctx, const char *name, - list_for_each_entry(watch, &i->watches, list) { - if (exact) { - if (streq(name, watch->node)) -- send_event(i, -+ send_event(req, i, - get_watch_path(watch, name), - watch->token); - } else { - if (is_child(name, watch->node)) -- send_event(i, -+ send_event(req, i, - get_watch_path(watch, name), - watch->token); - } -@@ -269,8 +272,12 @@ int do_watch(struct connection *conn, struct buffered_data *in) - trace_create(watch, "watch"); - send_ack(conn, XS_WATCH); - -- /* We fire once up front: simplifies clients and restart. 
*/ -- send_event(conn, get_watch_path(watch, watch->node), watch->token); -+ /* -+ * We fire once up front: simplifies clients and restart. -+ * This event will not be linked to the XS_WATCH request. -+ */ -+ send_event(NULL, conn, get_watch_path(watch, watch->node), -+ watch->token); - - return 0; - } diff --git a/xsa326-4.16-xenstored-06.patch b/xsa326-4.16-xenstored-06.patch deleted file mode 100644 index b2be7ce..0000000 --- a/xsa326-4.16-xenstored-06.patch +++ /dev/null @@ -1,85 +0,0 @@ -From c8057cb483abf2cd4060b39616423e19283fbd0a Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:08 +0200 -Subject: tools/xenstore: don't buffer multiple identical watch events - -A guest not reading its Xenstore response buffer fast enough might -pile up lots of Xenstore watch events buffered. Reduce the generated -load by dropping new events which already have an identical copy -pending. - -The special events "@..." are excluded from that handling as there are -known use cases where the handler is relying on each event to be sent -individually. - -This is part of XSA-326. 
- -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 488d540f3a32..f1fa97b8cf50 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -916,6 +916,7 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - bdata->inhdr = true; - bdata->used = 0; - bdata->timeout_msec = 0; -+ bdata->watch_event = false; - - if (len <= DEFAULT_BUFFER_SIZE) - bdata->buffer = bdata->default_buffer; -@@ -948,7 +949,7 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - void send_event(struct buffered_data *req, struct connection *conn, - const char *path, const char *token) - { -- struct buffered_data *bdata; -+ struct buffered_data *bdata, *bd; - unsigned int len; - - len = strlen(path) + 1 + strlen(token) + 1; -@@ -970,12 +971,29 @@ void send_event(struct buffered_data *req, struct connection *conn, - bdata->hdr.msg.type = XS_WATCH_EVENT; - bdata->hdr.msg.len = len; - -+ /* -+ * Check whether an identical event is pending already. -+ * Special events are excluded from that check. 
-+ */ -+ if (path[0] != '@') { -+ list_for_each_entry(bd, &conn->out_list, list) { -+ if (bd->watch_event && bd->hdr.msg.len == len && -+ !memcmp(bdata->buffer, bd->buffer, len)) { -+ trace("dropping duplicate watch %s %s for domain %u\n", -+ path, token, conn->id); -+ talloc_free(bdata); -+ return; -+ } -+ } -+ } -+ - if (timeout_watch_event_msec && domain_is_unprivileged(conn)) { - bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec; - if (!conn->timeout_msec) - conn->timeout_msec = bdata->timeout_msec; - } - -+ bdata->watch_event = true; - bdata->pend.req = req; - if (req) - req->pend.ref.event_cnt++; -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index db09f463a657..b9b50e81c7b4 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -62,6 +62,9 @@ struct buffered_data - /* Are we still doing the header? */ - bool inhdr; - -+ /* Is this a watch event? */ -+ bool watch_event; -+ - /* How far are we? */ - unsigned int used; - diff --git a/xsa326-4.16-xenstored-07.patch b/xsa326-4.16-xenstored-07.patch deleted file mode 100644 index dfcdc14..0000000 --- a/xsa326-4.16-xenstored-07.patch +++ /dev/null @@ -1,53 +0,0 @@ -From 5eac692b841633be3e85f0125c59fa02af103989 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:08 +0200 -Subject: tools/xenstore: fix connection->id usage - -Don't use conn->id for privilege checks, but domain_is_unprivileged(). - -This is part of XSA-326. 
- -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_control.c b/tools/xenstore/xenstored_control.c -index 7b4300ef7777..adb8d51b043b 100644 ---- a/tools/xenstore/xenstored_control.c -+++ b/tools/xenstore/xenstored_control.c -@@ -891,7 +891,7 @@ int do_control(struct connection *conn, struct buffered_data *in) - unsigned int cmd, num, off; - char **vec = NULL; - -- if (conn->id != 0) -+ if (domain_is_unprivileged(conn)) - return EACCES; - - off = get_string(in, 0); -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index b9b50e81c7b4..b1a70488b989 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -123,7 +123,7 @@ struct connection - /* The index of pollfd in global pollfd array */ - int pollfd_idx; - -- /* Who am I? 0 for socket connections. */ -+ /* Who am I? Domid of connection. */ - unsigned int id; - - /* Is this connection ignored? */ -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index 54432907fc76..ee1b09031a3b 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -477,7 +477,8 @@ int do_transaction_start(struct connection *conn, struct buffered_data *in) - if (conn->transaction) - return EBUSY; - -- if (conn->id && conn->transaction_started > quota_max_transaction) -+ if (domain_is_unprivileged(conn) && -+ conn->transaction_started > quota_max_transaction) - return ENOSPC; - - /* Attach transaction to input for autofree until it's complete */ diff --git a/xsa326-4.16-xenstored-08.patch b/xsa326-4.16-xenstored-08.patch deleted file mode 100644 index 5aa12c8..0000000 --- a/xsa326-4.16-xenstored-08.patch +++ /dev/null @@ -1,326 +0,0 @@ -From f9f3171441b5fcb3339cf612400794fc26cd2ec2 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:08 +0200 -Subject: tools/xenstore: simplify and fix per domain node 
accounting - -The accounting of nodes can be simplified now that each connection -holds the associated domid. - -Fix the node accounting to cover nodes created for a domain before it -has been introduced. This requires to react properly to an allocation -failure inside domain_entry_inc() by returning an error code. - -Especially in error paths the node accounting has to be fixed in some -cases. - -This is part of XSA-326 / CVE-2022-42313. - -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index f1fa97b8cf50..692d863fce35 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -638,7 +638,7 @@ struct node *read_node(struct connection *conn, const void *ctx, - - /* Permissions are struct xs_permissions. */ - node->perms.p = hdr->perms; -- if (domain_adjust_node_perms(node)) { -+ if (domain_adjust_node_perms(conn, node)) { - talloc_free(node); - return NULL; - } -@@ -660,7 +660,7 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - void *p; - struct xs_tdb_record_hdr *hdr; - -- if (domain_adjust_node_perms(node)) -+ if (domain_adjust_node_perms(conn, node)) - return errno; - - data.dsize = sizeof(*hdr) -@@ -1272,13 +1272,17 @@ static struct node *construct_node(struct connection *conn, const void *ctx, - return NULL; - } - --static int destroy_node(struct connection *conn, struct node *node) -+static void destroy_node_rm(struct node *node) - { - if (streq(node->name, "/")) - corrupt(NULL, "Destroying root node!"); - - tdb_delete(tdb_ctx, node->key); -+} - -+static int destroy_node(struct connection *conn, struct node *node) -+{ -+ destroy_node_rm(node); - domain_entry_dec(conn, node); - - /* -@@ -1328,8 +1332,12 @@ static struct node *create_node(struct connection *conn, const void *ctx, - goto err; - - /* Account for new node */ -- if (i->parent) -- domain_entry_inc(conn, i); -+ if 
(i->parent) { -+ if (domain_entry_inc(conn, i)) { -+ destroy_node_rm(i); -+ return NULL; -+ } -+ } - } - - return node; -@@ -1614,10 +1622,27 @@ static int do_set_perms(struct connection *conn, struct buffered_data *in) - old_perms = node->perms; - domain_entry_dec(conn, node); - node->perms = perms; -- domain_entry_inc(conn, node); -+ if (domain_entry_inc(conn, node)) { -+ node->perms = old_perms; -+ /* -+ * This should never fail because we had a reference on the -+ * domain before and Xenstored is single-threaded. -+ */ -+ domain_entry_inc(conn, node); -+ return ENOMEM; -+ } -+ -+ if (write_node(conn, node, false)) { -+ int saved_errno = errno; - -- if (write_node(conn, node, false)) -+ domain_entry_dec(conn, node); -+ node->perms = old_perms; -+ /* No failure possible as above. */ -+ domain_entry_inc(conn, node); -+ -+ errno = saved_errno; - return errno; -+ } - - fire_watches(conn, in, name, node, false, &old_perms); - send_ack(conn, XS_SET_PERMS); -@@ -3122,7 +3147,9 @@ void read_state_node(const void *ctx, const void *state) - set_tdb_key(name, &key); - if (write_node_raw(NULL, &key, node, true)) - barf("write node error restoring node"); -- domain_entry_inc(&conn, node); -+ -+ if (domain_entry_inc(&conn, node)) -+ barf("node accounting error restoring node"); - - talloc_free(node); - } -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index 850085a92c76..260952e09096 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -16,6 +16,7 @@ - along with this program; If not, see . - */ - -+#include - #include - #include - #include -@@ -363,6 +364,18 @@ static struct domain *find_or_alloc_domain(const void *ctx, unsigned int domid) - return domain ? 
: alloc_domain(ctx, domid); - } - -+static struct domain *find_or_alloc_existing_domain(unsigned int domid) -+{ -+ struct domain *domain; -+ xc_dominfo_t dominfo; -+ -+ domain = find_domain_struct(domid); -+ if (!domain && get_domain_info(domid, &dominfo)) -+ domain = alloc_domain(NULL, domid); -+ -+ return domain; -+} -+ - static int new_domain(struct domain *domain, int port, bool restore) - { - int rc; -@@ -782,30 +795,28 @@ void domain_deinit(void) - xenevtchn_unbind(xce_handle, virq_port); - } - --void domain_entry_inc(struct connection *conn, struct node *node) -+int domain_entry_inc(struct connection *conn, struct node *node) - { - struct domain *d; -+ unsigned int domid; - - if (!conn) -- return; -+ return 0; - -- if (node->perms.p && node->perms.p[0].id != conn->id) { -- if (conn->transaction) { -- transaction_entry_inc(conn->transaction, -- node->perms.p[0].id); -- } else { -- d = find_domain_by_domid(node->perms.p[0].id); -- if (d) -- d->nbentry++; -- } -- } else if (conn->domain) { -- if (conn->transaction) { -- transaction_entry_inc(conn->transaction, -- conn->domain->domid); -- } else { -- conn->domain->nbentry++; -- } -+ domid = node->perms.p ? node->perms.p[0].id : conn->id; -+ -+ if (conn->transaction) { -+ transaction_entry_inc(conn->transaction, domid); -+ } else { -+ d = (domid == conn->id && conn->domain) ? conn->domain -+ : find_or_alloc_existing_domain(domid); -+ if (d) -+ d->nbentry++; -+ else -+ return ENOMEM; - } -+ -+ return 0; - } - - /* -@@ -841,7 +852,7 @@ static int chk_domain_generation(unsigned int domid, uint64_t gen) - * Remove permissions for no longer existing domains in order to avoid a new - * domain with the same domid inheriting the permissions. 
- */ --int domain_adjust_node_perms(struct node *node) -+int domain_adjust_node_perms(struct connection *conn, struct node *node) - { - unsigned int i; - int ret; -@@ -851,8 +862,14 @@ int domain_adjust_node_perms(struct node *node) - return errno; - - /* If the owner doesn't exist any longer give it to priv domain. */ -- if (!ret) -+ if (!ret) { -+ /* -+ * In theory we'd need to update the number of dom0 nodes here, -+ * but we could be called for a read of the node. So better -+ * avoid the risk to overflow the node count of dom0. -+ */ - node->perms.p[0].id = priv_domid; -+ } - - for (i = 1; i < node->perms.num; i++) { - if (node->perms.p[i].perms & XS_PERM_IGNORE) -@@ -871,25 +888,25 @@ int domain_adjust_node_perms(struct node *node) - void domain_entry_dec(struct connection *conn, struct node *node) - { - struct domain *d; -+ unsigned int domid; - - if (!conn) - return; - -- if (node->perms.p && node->perms.p[0].id != conn->id) { -- if (conn->transaction) { -- transaction_entry_dec(conn->transaction, -- node->perms.p[0].id); -- } else { -- d = find_domain_by_domid(node->perms.p[0].id); -- if (d && d->nbentry) -- d->nbentry--; -- } -- } else if (conn->domain && conn->domain->nbentry) { -- if (conn->transaction) { -- transaction_entry_dec(conn->transaction, -- conn->domain->domid); -+ domid = node->perms.p ? node->perms.p[0].id : conn->id; -+ -+ if (conn->transaction) { -+ transaction_entry_dec(conn->transaction, domid); -+ } else { -+ d = (domid == conn->id && conn->domain) ? 
conn->domain -+ : find_domain_struct(domid); -+ if (d) { -+ d->nbentry--; - } else { -- conn->domain->nbentry--; -+ errno = ENOENT; -+ corrupt(conn, -+ "Node \"%s\" owned by non-existing domain %u\n", -+ node->name, domid); - } - } - } -@@ -899,13 +916,23 @@ int domain_entry_fix(unsigned int domid, int num, bool update) - struct domain *d; - int cnt; - -- d = find_domain_by_domid(domid); -- if (!d) -- return 0; -+ if (update) { -+ d = find_domain_struct(domid); -+ assert(d); -+ } else { -+ /* -+ * We are called first with update == false in order to catch -+ * any error. So do a possible allocation and check for error -+ * only in this case, as in the case of update == true nothing -+ * can go wrong anymore as the allocation already happened. -+ */ -+ d = find_or_alloc_existing_domain(domid); -+ if (!d) -+ return -1; -+ } - - cnt = d->nbentry + num; -- if (cnt < 0) -- cnt = 0; -+ assert(cnt >= 0); - - if (update) - d->nbentry = cnt; -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index 4f51b005291a..d6519904d831 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ b/tools/xenstore/xenstored_domain.h -@@ -54,10 +54,10 @@ const char *get_implicit_path(const struct connection *conn); - bool domain_is_unprivileged(struct connection *conn); - - /* Remove node permissions for no longer existing domains. 
*/ --int domain_adjust_node_perms(struct node *node); -+int domain_adjust_node_perms(struct connection *conn, struct node *node); - - /* Quota manipulation */ --void domain_entry_inc(struct connection *conn, struct node *); -+int domain_entry_inc(struct connection *conn, struct node *); - void domain_entry_dec(struct connection *conn, struct node *); - int domain_entry_fix(unsigned int domid, int num, bool update); - int domain_entry(struct connection *conn); -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index ee1b09031a3b..86caf6c398be 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -519,8 +519,12 @@ static int transaction_fix_domains(struct transaction *trans, bool update) - - list_for_each_entry(d, &trans->changed_domains, list) { - cnt = domain_entry_fix(d->domid, d->nbentry, update); -- if (!update && cnt >= quota_nb_entry_per_domain) -- return ENOSPC; -+ if (!update) { -+ if (cnt >= quota_nb_entry_per_domain) -+ return ENOSPC; -+ if (cnt < 0) -+ return ENOMEM; -+ } - } - - return 0; diff --git a/xsa326-4.16-xenstored-09.patch b/xsa326-4.16-xenstored-09.patch deleted file mode 100644 index e2c163c..0000000 --- a/xsa326-4.16-xenstored-09.patch +++ /dev/null @@ -1,244 +0,0 @@ -From 71aac6f7e89d5c101adb9e82eea7031e16d34e46 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:09 +0200 -Subject: tools/xenstore: limit max number of nodes accessed in a transaction - -Today a guest is free to access as many nodes in a single transaction -as it wants. This can lead to unbounded memory consumption in Xenstore -as there is the need to keep track of all nodes having been accessed -during a transaction. - -In oxenstored the number of requests in a transaction is being limited -via a quota maxrequests (default is 1024). As multiple accesses of a -node are not problematic in C Xenstore, limit the number of accessed -nodes. 
- -In order to let read_node() detect a quota error in case too many nodes -are being accessed, check the return value of access_node() and return -NULL in case an error has been seen. Introduce __must_check and add it -to the access_node() prototype. - -This is part of XSA-326 / CVE-2022-42314. - -Reported-by: Julien Grall -Suggested-by: Julien Grall -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/include/xen-tools/libs.h b/tools/include/xen-tools/libs.h -index a16e0c380709..bafc90e2f603 100644 ---- a/tools/include/xen-tools/libs.h -+++ b/tools/include/xen-tools/libs.h -@@ -63,4 +63,8 @@ - #define ROUNDUP(_x,_w) (((unsigned long)(_x)+(1UL<<(_w))-1) & ~((1UL<<(_w))-1)) - #endif - -+#ifndef __must_check -+#define __must_check __attribute__((__warn_unused_result__)) -+#endif -+ - #endif /* __XEN_TOOLS_LIBS__ */ -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 692d863fce35..f835aa1b2f1f 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -106,6 +106,7 @@ int quota_nb_watch_per_domain = 128; - int quota_max_entry_size = 2048; /* 2K */ - int quota_max_transaction = 10; - int quota_nb_perms_per_node = 5; -+int quota_trans_nodes = 1024; - int quota_max_path_len = XENSTORE_REL_PATH_MAX; - int quota_req_outstanding = 20; - -@@ -595,6 +596,7 @@ struct node *read_node(struct connection *conn, const void *ctx, - TDB_DATA key, data; - struct xs_tdb_record_hdr *hdr; - struct node *node; -+ int err; - - node = talloc(ctx, struct node); - if (!node) { -@@ -616,14 +618,13 @@ struct node *read_node(struct connection *conn, const void *ctx, - if (data.dptr == NULL) { - if (tdb_error(tdb_ctx) == TDB_ERR_NOEXIST) { - node->generation = NO_GENERATION; -- access_node(conn, node, NODE_ACCESS_READ, NULL); -- errno = ENOENT; -+ err = access_node(conn, node, NODE_ACCESS_READ, NULL); -+ errno = err ? 
: ENOENT; - } else { - log("TDB error on read: %s", tdb_errorstr(tdb_ctx)); - errno = EIO; - } -- talloc_free(node); -- return NULL; -+ goto error; - } - - node->parent = NULL; -@@ -638,19 +639,36 @@ struct node *read_node(struct connection *conn, const void *ctx, - - /* Permissions are struct xs_permissions. */ - node->perms.p = hdr->perms; -- if (domain_adjust_node_perms(conn, node)) { -- talloc_free(node); -- return NULL; -- } -+ if (domain_adjust_node_perms(conn, node)) -+ goto error; - - /* Data is binary blob (usually ascii, no nul). */ - node->data = node->perms.p + hdr->num_perms; - /* Children is strings, nul separated. */ - node->children = node->data + node->datalen; - -- access_node(conn, node, NODE_ACCESS_READ, NULL); -+ if (access_node(conn, node, NODE_ACCESS_READ, NULL)) -+ goto error; - - return node; -+ -+ error: -+ err = errno; -+ talloc_free(node); -+ errno = err; -+ return NULL; -+} -+ -+static bool read_node_can_propagate_errno(void) -+{ -+ /* -+ * 2 error cases for read_node() can always be propagated up: -+ * ENOMEM, because this has nothing to do with the node being in the -+ * data base or not, but is caused by a general lack of memory. -+ * ENOSPC, because this is related to hitting quota limits which need -+ * to be respected. -+ */ -+ return errno == ENOMEM || errno == ENOSPC; - } - - int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, -@@ -767,7 +785,7 @@ static int ask_parents(struct connection *conn, const void *ctx, - node = read_node(conn, ctx, name); - if (node) - break; -- if (errno == ENOMEM) -+ if (read_node_can_propagate_errno()) - return errno; - } while (!streq(name, "/")); - -@@ -829,7 +847,7 @@ static struct node *get_node(struct connection *conn, - } - } - /* Clean up errno if they weren't supposed to know. 
*/ -- if (!node && errno != ENOMEM) -+ if (!node && !read_node_can_propagate_errno()) - errno = errno_from_parents(conn, ctx, name, errno, perm); - return node; - } -@@ -1235,7 +1253,7 @@ static struct node *construct_node(struct connection *conn, const void *ctx, - - /* If parent doesn't exist, create it. */ - parent = read_node(conn, parentname, parentname); -- if (!parent) -+ if (!parent && errno == ENOENT) - parent = construct_node(conn, ctx, parentname); - if (!parent) - return NULL; -@@ -1509,7 +1527,7 @@ static int _rm(struct connection *conn, const void *ctx, struct node *node, - - parent = read_node(conn, ctx, parentname); - if (!parent) -- return (errno == ENOMEM) ? ENOMEM : EINVAL; -+ return read_node_can_propagate_errno() ? errno : EINVAL; - node->parent = parent; - - return delete_node(conn, ctx, parent, node, false); -@@ -1539,7 +1557,7 @@ static int do_rm(struct connection *conn, struct buffered_data *in) - return 0; - } - /* Restore errno, just in case. */ -- if (errno != ENOMEM) -+ if (!read_node_can_propagate_errno()) - errno = ENOENT; - } - return errno; -@@ -2384,6 +2402,8 @@ static void usage(void) - " -M, --path-max limit the allowed Xenstore node path length,\n" - " -Q, --quota = set the quota to the value , allowed\n" - " quotas are:\n" -+" transaction-nodes: number of accessed node per\n" -+" transaction\n" - " outstanding: number of outstanding requests\n" - " -w, --timeout = set the timeout in seconds for ,\n" - " allowed timeout candidates are:\n" -@@ -2468,6 +2488,8 @@ static void set_quota(const char *arg) - val = get_optval_int(eq + 1); - if (what_matches(arg, "outstanding")) - quota_req_outstanding = val; -+ else if (what_matches(arg, "transaction-nodes")) -+ quota_trans_nodes = val; - else - barf("unknown quota \"%s\"\n", arg); - } -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index b1a70488b989..245f9258235f 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ 
-268,6 +268,7 @@ extern int dom0_event; - extern int priv_domid; - extern int quota_nb_entry_per_domain; - extern int quota_req_outstanding; -+extern int quota_trans_nodes; - - extern unsigned int timeout_watch_event_msec; - -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index 86caf6c398be..7bd41eb475e3 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -156,6 +156,9 @@ struct transaction - /* Connection-local identifier for this transaction. */ - uint32_t id; - -+ /* Node counter. */ -+ unsigned int nodes; -+ - /* Generation when transaction started. */ - uint64_t generation; - -@@ -260,6 +263,11 @@ int access_node(struct connection *conn, struct node *node, - - i = find_accessed_node(trans, node->name); - if (!i) { -+ if (trans->nodes >= quota_trans_nodes && -+ domain_is_unprivileged(conn)) { -+ ret = ENOSPC; -+ goto err; -+ } - i = talloc_zero(trans, struct accessed_node); - if (!i) - goto nomem; -@@ -297,6 +305,7 @@ int access_node(struct connection *conn, struct node *node, - i->ta_node = true; - } - } -+ trans->nodes++; - list_add_tail(&i->list, &trans->accessed); - } - -diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h -index 0093cac807e3..e3cbd6b23095 100644 ---- a/tools/xenstore/xenstored_transaction.h -+++ b/tools/xenstore/xenstored_transaction.h -@@ -39,8 +39,8 @@ void transaction_entry_inc(struct transaction *trans, unsigned int domid); - void transaction_entry_dec(struct transaction *trans, unsigned int domid); - - /* This node was accessed. */ --int access_node(struct connection *conn, struct node *node, -- enum node_access_type type, TDB_DATA *key); -+int __must_check access_node(struct connection *conn, struct node *node, -+ enum node_access_type type, TDB_DATA *key); - - /* Queue watches for a modified node. 
*/ - void queue_watches(struct connection *conn, const char *name, bool watch_exact); diff --git a/xsa326-4.16-xenstored-10.patch b/xsa326-4.16-xenstored-10.patch deleted file mode 100644 index 30e6b50..0000000 --- a/xsa326-4.16-xenstored-10.patch +++ /dev/null @@ -1,88 +0,0 @@ -From 90013d6a735491a7b93a6832eb2a51e5633254f5 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:09 +0200 -Subject: tools/xenstore: move the call of setup_structure() to dom0 - introduction - -Setting up the basic structure when introducing dom0 has the advantage -to be able to add proper node memory accounting for the added nodes -later. - -This makes it possible to do proper node accounting, too. - -An additional requirement to make that work fine is to correct the -owner of the created nodes to be dom0_domid instead of domid 0. - -This is part of XSA-326. - -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Acked-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index f835aa1b2f1f..5171d34c947e 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -2039,7 +2039,8 @@ static int tdb_flags; - static void manual_node(const char *name, const char *child) - { - struct node *node; -- struct xs_permissions perms = { .id = 0, .perms = XS_PERM_NONE }; -+ struct xs_permissions perms = { .id = dom0_domid, -+ .perms = XS_PERM_NONE }; - - node = talloc_zero(NULL, struct node); - if (!node) -@@ -2078,7 +2079,7 @@ static void tdb_logger(TDB_CONTEXT *tdb, int level, const char * fmt, ...) 
- } - } - --static void setup_structure(bool live_update) -+void setup_structure(bool live_update) - { - char *tdbname; - -@@ -2101,6 +2102,7 @@ static void setup_structure(bool live_update) - manual_node("/", "tool"); - manual_node("/tool", "xenstored"); - manual_node("/tool/xenstored", NULL); -+ domain_entry_fix(dom0_domid, 3, true); - } - - check_store(); -@@ -2614,9 +2616,6 @@ int main(int argc, char *argv[]) - - init_pipe(reopen_log_pipe); - -- /* Setup the database */ -- setup_structure(live_update); -- - /* Listen to hypervisor. */ - if (!no_domain_init && !live_update) { - domain_init(-1); -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 245f9258235f..2c77ec7ee0f4 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -231,6 +231,7 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - struct node *read_node(struct connection *conn, const void *ctx, - const char *name); - -+void setup_structure(bool live_update); - struct connection *new_connection(const struct interface_funcs *funcs); - struct connection *get_connection_by_id(unsigned int conn_id); - void ignore_connection(struct connection *conn); -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index 260952e09096..f04b7aae8a32 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -470,6 +470,9 @@ static struct domain *introduce_domain(const void *ctx, - } - domain->interface = interface; - -+ if (is_master_domain) -+ setup_structure(restore); -+ - /* Now domain belongs to its connection. 
*/ - talloc_steal(domain->conn, domain); - diff --git a/xsa326-4.16-xenstored-11.patch b/xsa326-4.16-xenstored-11.patch deleted file mode 100644 index 49f1497..0000000 --- a/xsa326-4.16-xenstored-11.patch +++ /dev/null @@ -1,280 +0,0 @@ -From 6af17b8bf52b9dfdc6a5ecd3efbcea9fddd57d91 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:09 +0200 -Subject: tools/xenstore: add infrastructure to keep track of per domain memory - usage - -The amount of memory a domain can consume in Xenstore is limited by -various quota today, but even with sane quota a domain can still -consume rather large memory quantities. - -Add the infrastructure for keeping track of the amount of memory a -domain is consuming in Xenstore. Note that this is only the memory a -domain has direct control over, so any internal administration data -needed by Xenstore only is not being accounted for. - -There are two quotas defined: a soft quota which will result in a -warning issued via syslog() when it is exceeded, and a hard quota -resulting in a stop of accepting further requests or watch events as -long as the hard quota would be violated by accepting those. - -Setting any of those quotas to 0 will disable it. - -As default values use 2MB per domain for the soft limit (this basically -covers the allowed case to create 1000 nodes needing 2kB each), and -2.5MB for the hard limit. - -This is part of XSA-326. 
- -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 5171d34c947e..b2bf6740d430 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -109,6 +109,8 @@ int quota_nb_perms_per_node = 5; - int quota_trans_nodes = 1024; - int quota_max_path_len = XENSTORE_REL_PATH_MAX; - int quota_req_outstanding = 20; -+int quota_memory_per_domain_soft = 2 * 1024 * 1024; /* 2 MB */ -+int quota_memory_per_domain_hard = 2 * 1024 * 1024 + 512 * 1024; /* 2.5 MB */ - - unsigned int timeout_watch_event_msec = 20000; - -@@ -2406,7 +2408,14 @@ static void usage(void) - " quotas are:\n" - " transaction-nodes: number of accessed node per\n" - " transaction\n" -+" memory: total used memory per domain for nodes,\n" -+" transactions, watches and requests, above\n" -+" which Xenstore will stop talking to domain\n" - " outstanding: number of outstanding requests\n" -+" -q, --quota-soft = set a soft quota to the value ,\n" -+" causing a warning to be issued via syslog() if the\n" -+" limit is violated, allowed quotas are:\n" -+" memory: see above\n" - " -w, --timeout = set the timeout in seconds for ,\n" - " allowed timeout candidates are:\n" - " watch-event: time a watch-event is kept pending\n" -@@ -2433,6 +2442,7 @@ static struct option options[] = { - { "perm-nb", 1, NULL, 'A' }, - { "path-max", 1, NULL, 'M' }, - { "quota", 1, NULL, 'Q' }, -+ { "quota-soft", 1, NULL, 'q' }, - { "timeout", 1, NULL, 'w' }, - { "no-recovery", 0, NULL, 'R' }, - { "internal-db", 0, NULL, 'I' }, -@@ -2480,7 +2490,7 @@ static void set_timeout(const char *arg) - barf("unknown timeout \"%s\"\n", arg); - } - --static void set_quota(const char *arg) -+static void set_quota(const char *arg, bool soft) - { - const char *eq = strchr(arg, '='); - int val; -@@ -2488,11 +2498,16 @@ static void set_quota(const char *arg) - if (!eq) - barf("quotas must be specified via =\n"); - 
val = get_optval_int(eq + 1); -- if (what_matches(arg, "outstanding")) -+ if (what_matches(arg, "outstanding") && !soft) - quota_req_outstanding = val; -- else if (what_matches(arg, "transaction-nodes")) -+ else if (what_matches(arg, "transaction-nodes") && !soft) - quota_trans_nodes = val; -- else -+ else if (what_matches(arg, "memory")) { -+ if (soft) -+ quota_memory_per_domain_soft = val; -+ else -+ quota_memory_per_domain_hard = val; -+ } else - barf("unknown quota \"%s\"\n", arg); - } - -@@ -2510,7 +2525,7 @@ int main(int argc, char *argv[]) - orig_argc = argc; - orig_argv = argv; - -- while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:Q:T:RVW:w:U", -+ while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:Q:q:T:RVW:w:U", - options, NULL)) != -1) { - switch (opt) { - case 'D': -@@ -2561,7 +2576,10 @@ int main(int argc, char *argv[]) - quota_max_path_len); - break; - case 'Q': -- set_quota(optarg); -+ set_quota(optarg, false); -+ break; -+ case 'q': -+ set_quota(optarg, true); - break; - case 'w': - set_timeout(optarg); -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 2c77ec7ee0f4..373af18297bf 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -270,6 +270,8 @@ extern int priv_domid; - extern int quota_nb_entry_per_domain; - extern int quota_req_outstanding; - extern int quota_trans_nodes; -+extern int quota_memory_per_domain_soft; -+extern int quota_memory_per_domain_hard; - - extern unsigned int timeout_watch_event_msec; - -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index f04b7aae8a32..94fd561e9de4 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -76,6 +76,13 @@ struct domain - /* number of entry from this domain in the store */ - int nbentry; - -+ /* Amount of memory allocated for this domain. 
*/ -+ int memory; -+ bool soft_quota_reported; -+ bool hard_quota_reported; -+ time_t mem_last_msg; -+#define MEM_WARN_MINTIME_SEC 10 -+ - /* number of watch for this domain */ - int nbwatch; - -@@ -192,6 +199,9 @@ static bool domain_can_read(struct connection *conn) - return false; - if (conn->domain->nboutstanding >= quota_req_outstanding) - return false; -+ if (conn->domain->memory >= quota_memory_per_domain_hard && -+ quota_memory_per_domain_hard) -+ return false; - } - - return (intf->req_cons != intf->req_prod); -@@ -950,6 +960,89 @@ int domain_entry(struct connection *conn) - : 0; - } - -+static bool domain_chk_quota(struct domain *domain, int mem) -+{ -+ time_t now; -+ -+ if (!domain || !domid_is_unprivileged(domain->domid) || -+ (domain->conn && domain->conn->is_ignored)) -+ return false; -+ -+ now = time(NULL); -+ -+ if (mem >= quota_memory_per_domain_hard && -+ quota_memory_per_domain_hard) { -+ if (domain->hard_quota_reported) -+ return true; -+ syslog(LOG_ERR, "Domain %u exceeds hard memory quota, Xenstore interface to domain stalled\n", -+ domain->domid); -+ domain->mem_last_msg = now; -+ domain->hard_quota_reported = true; -+ return true; -+ } -+ -+ if (now - domain->mem_last_msg >= MEM_WARN_MINTIME_SEC) { -+ if (domain->hard_quota_reported) { -+ domain->mem_last_msg = now; -+ domain->hard_quota_reported = false; -+ syslog(LOG_INFO, "Domain %u below hard memory quota again\n", -+ domain->domid); -+ } -+ if (mem >= quota_memory_per_domain_soft && -+ quota_memory_per_domain_soft && -+ !domain->soft_quota_reported) { -+ domain->mem_last_msg = now; -+ domain->soft_quota_reported = true; -+ syslog(LOG_WARNING, "Domain %u exceeds soft memory quota\n", -+ domain->domid); -+ } -+ if (mem < quota_memory_per_domain_soft && -+ domain->soft_quota_reported) { -+ domain->mem_last_msg = now; -+ domain->soft_quota_reported = false; -+ syslog(LOG_INFO, "Domain %u below soft memory quota again\n", -+ domain->domid); -+ } -+ -+ } -+ -+ return false; -+} -+ -+int 
domain_memory_add(unsigned int domid, int mem, bool no_quota_check) -+{ -+ struct domain *domain; -+ -+ domain = find_domain_struct(domid); -+ if (domain) { -+ /* -+ * domain_chk_quota() will print warning and also store whether -+ * the soft/hard quota has been hit. So check no_quota_check -+ * *after*. -+ */ -+ if (domain_chk_quota(domain, domain->memory + mem) && -+ !no_quota_check) -+ return ENOMEM; -+ domain->memory += mem; -+ } else { -+ /* -+ * The domain the memory is to be accounted for should always -+ * exist, as accounting is done either for a domain related to -+ * the current connection, or for the domain owning a node -+ * (which is always existing, as the owner of the node is -+ * tested to exist and replaced by domid 0 if not). -+ * So not finding the related domain MUST be an error in the -+ * data base. -+ */ -+ errno = ENOENT; -+ corrupt(NULL, "Accounting called for non-existing domain %u\n", -+ domid); -+ return ENOENT; -+ } -+ -+ return 0; -+} -+ - void domain_watch_inc(struct connection *conn) - { - if (!conn || !conn->domain) -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index d6519904d831..633c9a0a0a1f 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ b/tools/xenstore/xenstored_domain.h -@@ -61,6 +61,26 @@ int domain_entry_inc(struct connection *conn, struct node *); - void domain_entry_dec(struct connection *conn, struct node *); - int domain_entry_fix(unsigned int domid, int num, bool update); - int domain_entry(struct connection *conn); -+int domain_memory_add(unsigned int domid, int mem, bool no_quota_check); -+ -+/* -+ * domain_memory_add_chk(): to be used when memory quota should be checked. -+ * Not to be used when specifying a negative mem value, as lowering the used -+ * memory should always be allowed. 
-+ */ -+static inline int domain_memory_add_chk(unsigned int domid, int mem) -+{ -+ return domain_memory_add(domid, mem, false); -+} -+/* -+ * domain_memory_add_nochk(): to be used when memory quota should not be -+ * checked, e.g. when lowering memory usage, or in an error case for undoing -+ * a previous memory adjustment. -+ */ -+static inline void domain_memory_add_nochk(unsigned int domid, int mem) -+{ -+ domain_memory_add(domid, mem, true); -+} - void domain_watch_inc(struct connection *conn); - void domain_watch_dec(struct connection *conn); - int domain_watch(struct connection *conn); diff --git a/xsa326-4.16-xenstored-12.patch b/xsa326-4.16-xenstored-12.patch deleted file mode 100644 index 517f2a0..0000000 --- a/xsa326-4.16-xenstored-12.patch +++ /dev/null @@ -1,76 +0,0 @@ -From ae7042f024af7584251f776a12d9bb24d13fecaf Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:09 +0200 -Subject: tools/xenstore: add memory accounting for responses - -Add the memory accounting for queued responses. - -In case adding a watch event for a guest is causing the hard memory -quota of that guest to be violated, the event is dropped. This will -ensure that it is impossible to drive another guest past its memory -quota by generating insane amounts of events for that guest. This is -especially important for protecting driver domains from that attack -vector. - -This is part of XSA-326 / CVE-2022-42315. 
- -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index b2bf6740d430..ecab6cfbbe15 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -260,6 +260,8 @@ static void free_buffered_data(struct buffered_data *out, - } - } - -+ domain_memory_add_nochk(conn->id, -out->hdr.msg.len - sizeof(out->hdr)); -+ - if (out->hdr.msg.type == XS_WATCH_EVENT) { - req = out->pend.req; - if (req) { -@@ -938,11 +940,14 @@ void send_reply(struct connection *conn, enum xsd_sockmsg_type type, - bdata->timeout_msec = 0; - bdata->watch_event = false; - -- if (len <= DEFAULT_BUFFER_SIZE) -+ if (len <= DEFAULT_BUFFER_SIZE) { - bdata->buffer = bdata->default_buffer; -- else { -+ /* Don't check quota, path might be used for returning error. */ -+ domain_memory_add_nochk(conn->id, len + sizeof(bdata->hdr)); -+ } else { - bdata->buffer = talloc_array(bdata, char, len); -- if (!bdata->buffer) { -+ if (!bdata->buffer || -+ domain_memory_add_chk(conn->id, len + sizeof(bdata->hdr))) { - send_error(conn, ENOMEM); - return; - } -@@ -1007,6 +1012,11 @@ void send_event(struct buffered_data *req, struct connection *conn, - } - } - -+ if (domain_memory_add_chk(conn->id, len + sizeof(bdata->hdr))) { -+ talloc_free(bdata); -+ return; -+ } -+ - if (timeout_watch_event_msec && domain_is_unprivileged(conn)) { - bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec; - if (!conn->timeout_msec) -@@ -3039,6 +3049,12 @@ static void add_buffered_data(struct buffered_data *bdata, - */ - if (bdata->hdr.msg.type != XS_WATCH_EVENT) - domain_outstanding_inc(conn); -+ /* -+ * We are restoring the state after Live-Update and the new quota may -+ * be smaller. So ignore it. The limit will be applied for any resource -+ * after the state has been fully restored. 
-+ */ -+ domain_memory_add_nochk(conn->id, len + sizeof(bdata->hdr)); - } - - void read_state_buffered_data(const void *ctx, struct connection *conn, diff --git a/xsa326-4.16-xenstored-13.patch b/xsa326-4.16-xenstored-13.patch deleted file mode 100644 index c37dd1e..0000000 --- a/xsa326-4.16-xenstored-13.patch +++ /dev/null @@ -1,89 +0,0 @@ -From 4628ae0a56b037dcdc8a3e42c543c5b9fd9990cf Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:10 +0200 -Subject: tools/xenstore: add memory accounting for watches - -Add the memory accounting for registered watches. - -When a socket connection is destroyed, the associated watches are -removed, too. In order to keep memory accounting correct the watches -must be removed explicitly via a call of conn_delete_all_watches() from -destroy_conn(). - -This is part of XSA-326 / CVE-2022-42315. - -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index ecab6cfbbe15..d86942f5aa77 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -463,6 +463,7 @@ static int destroy_conn(void *_conn) - } - - conn_free_buffered_data(conn); -+ conn_delete_all_watches(conn); - list_for_each_entry(req, &conn->ref_list, list) - req->on_ref_list = false; - -diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c -index 0755ffa375ba..fdf9b2d653a0 100644 ---- a/tools/xenstore/xenstored_watch.c -+++ b/tools/xenstore/xenstored_watch.c -@@ -211,7 +211,7 @@ static int check_watch_path(struct connection *conn, const void *ctx, - } - - static struct watch *add_watch(struct connection *conn, char *path, char *token, -- bool relative) -+ bool relative, bool no_quota_check) - { - struct watch *watch; - -@@ -222,6 +222,9 @@ static struct watch *add_watch(struct connection *conn, char *path, char *token, - watch->token = talloc_strdup(watch, token); - if (!watch->node || 
!watch->token) - goto nomem; -+ if (domain_memory_add(conn->id, strlen(path) + strlen(token), -+ no_quota_check)) -+ goto nomem; - - if (relative) - watch->relative_path = get_implicit_path(conn); -@@ -265,7 +268,7 @@ int do_watch(struct connection *conn, struct buffered_data *in) - if (domain_watch(conn) > quota_nb_watch_per_domain) - return E2BIG; - -- watch = add_watch(conn, vec[0], vec[1], relative); -+ watch = add_watch(conn, vec[0], vec[1], relative, false); - if (!watch) - return errno; - -@@ -296,6 +299,8 @@ int do_unwatch(struct connection *conn, struct buffered_data *in) - list_for_each_entry(watch, &conn->watches, list) { - if (streq(watch->node, node) && streq(watch->token, vec[1])) { - list_del(&watch->list); -+ domain_memory_add_nochk(conn->id, -strlen(watch->node) - -+ strlen(watch->token)); - talloc_free(watch); - domain_watch_dec(conn); - send_ack(conn, XS_UNWATCH); -@@ -311,6 +316,8 @@ void conn_delete_all_watches(struct connection *conn) - - while ((watch = list_top(&conn->watches, struct watch, list))) { - list_del(&watch->list); -+ domain_memory_add_nochk(conn->id, -strlen(watch->node) - -+ strlen(watch->token)); - talloc_free(watch); - domain_watch_dec(conn); - } -@@ -373,7 +380,7 @@ void read_state_watch(const void *ctx, const void *state) - if (!path) - barf("allocation error for read watch"); - -- if (!add_watch(conn, path, token, relative)) -+ if (!add_watch(conn, path, token, relative, true)) - barf("error adding watch"); - } - diff --git a/xsa326-4.16-xenstored-14.patch b/xsa326-4.16-xenstored-14.patch deleted file mode 100644 index df898c4..0000000 --- a/xsa326-4.16-xenstored-14.patch +++ /dev/null @@ -1,334 +0,0 @@ -From b8bd74e5e962955211ab0c5c1924ebf2bb526799 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:10 +0200 -Subject: tools/xenstore: add memory accounting for nodes - -Add the memory accounting for Xenstore nodes. 
In order to make this -not too complicated allow for some sloppiness when writing nodes. Any -hard quota violation will result in no further requests to be accepted. - -This is part of XSA-326 / CVE-2022-42315. - -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index d86942f5aa77..16504de42017 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -591,6 +591,117 @@ void set_tdb_key(const char *name, TDB_DATA *key) - key->dsize = strlen(name); - } - -+static void get_acc_data(TDB_DATA *key, struct node_account_data *acc) -+{ -+ TDB_DATA old_data; -+ struct xs_tdb_record_hdr *hdr; -+ -+ if (acc->memory < 0) { -+ old_data = tdb_fetch(tdb_ctx, *key); -+ /* No check for error, as the node might not exist. */ -+ if (old_data.dptr == NULL) { -+ acc->memory = 0; -+ } else { -+ hdr = (void *)old_data.dptr; -+ acc->memory = old_data.dsize; -+ acc->domid = hdr->perms[0].id; -+ } -+ talloc_free(old_data.dptr); -+ } -+} -+ -+/* -+ * Per-transaction nodes need to be accounted for the transaction owner. -+ * Those nodes are stored in the data base with the transaction generation -+ * count prepended (e.g. 123/local/domain/...). So testing for the node's -+ * key not to start with "/" is sufficient. -+ */ -+static unsigned int get_acc_domid(struct connection *conn, TDB_DATA *key, -+ unsigned int domid) -+{ -+ return (!conn || key->dptr[0] == '/') ? 
domid : conn->id; -+} -+ -+int do_tdb_write(struct connection *conn, TDB_DATA *key, TDB_DATA *data, -+ struct node_account_data *acc, bool no_quota_check) -+{ -+ struct xs_tdb_record_hdr *hdr = (void *)data->dptr; -+ struct node_account_data old_acc = {}; -+ unsigned int old_domid, new_domid; -+ int ret; -+ -+ if (!acc) -+ old_acc.memory = -1; -+ else -+ old_acc = *acc; -+ -+ get_acc_data(key, &old_acc); -+ old_domid = get_acc_domid(conn, key, old_acc.domid); -+ new_domid = get_acc_domid(conn, key, hdr->perms[0].id); -+ -+ /* -+ * Don't check for ENOENT, as we want to be able to switch orphaned -+ * nodes to new owners. -+ */ -+ if (old_acc.memory) -+ domain_memory_add_nochk(old_domid, -+ -old_acc.memory - key->dsize); -+ ret = domain_memory_add(new_domid, data->dsize + key->dsize, -+ no_quota_check); -+ if (ret) { -+ /* Error path, so no quota check. */ -+ if (old_acc.memory) -+ domain_memory_add_nochk(old_domid, -+ old_acc.memory + key->dsize); -+ return ret; -+ } -+ -+ /* TDB should set errno, but doesn't even set ecode AFAICT. */ -+ if (tdb_store(tdb_ctx, *key, *data, TDB_REPLACE) != 0) { -+ domain_memory_add_nochk(new_domid, -data->dsize - key->dsize); -+ /* Error path, so no quota check. */ -+ if (old_acc.memory) -+ domain_memory_add_nochk(old_domid, -+ old_acc.memory + key->dsize); -+ errno = EIO; -+ return errno; -+ } -+ -+ if (acc) { -+ /* Don't use new_domid, as it might be a transaction node. 
*/ -+ acc->domid = hdr->perms[0].id; -+ acc->memory = data->dsize; -+ } -+ -+ return 0; -+} -+ -+int do_tdb_delete(struct connection *conn, TDB_DATA *key, -+ struct node_account_data *acc) -+{ -+ struct node_account_data tmp_acc; -+ unsigned int domid; -+ -+ if (!acc) { -+ acc = &tmp_acc; -+ acc->memory = -1; -+ } -+ -+ get_acc_data(key, acc); -+ -+ if (tdb_delete(tdb_ctx, *key)) { -+ errno = EIO; -+ return errno; -+ } -+ -+ if (acc->memory) { -+ domid = get_acc_domid(conn, key, acc->domid); -+ domain_memory_add_nochk(domid, -acc->memory - key->dsize); -+ } -+ -+ return 0; -+} -+ - /* - * If it fails, returns NULL and sets errno. - * Temporary memory allocations will be done with ctx. -@@ -644,9 +755,15 @@ struct node *read_node(struct connection *conn, const void *ctx, - - /* Permissions are struct xs_permissions. */ - node->perms.p = hdr->perms; -+ node->acc.domid = node->perms.p[0].id; -+ node->acc.memory = data.dsize; - if (domain_adjust_node_perms(conn, node)) - goto error; - -+ /* If owner is gone reset currently accounted memory size. */ -+ if (node->acc.domid != node->perms.p[0].id) -+ node->acc.memory = 0; -+ - /* Data is binary blob (usually ascii, no nul). */ - node->data = node->perms.p + hdr->num_perms; - /* Children is strings, nul separated. */ -@@ -715,12 +832,9 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - p += node->datalen; - memcpy(p, node->children, node->childlen); - -- /* TDB should set errno, but doesn't even set ecode AFAICT. 
*/ -- if (tdb_store(tdb_ctx, *key, data, TDB_REPLACE) != 0) { -- corrupt(conn, "Write of %s failed", key->dptr); -- errno = EIO; -- return errno; -- } -+ if (do_tdb_write(conn, key, &data, &node->acc, no_quota_check)) -+ return EIO; -+ - return 0; - } - -@@ -1222,7 +1336,7 @@ static void delete_node_single(struct connection *conn, struct node *node) - if (access_node(conn, node, NODE_ACCESS_DELETE, &key)) - return; - -- if (tdb_delete(tdb_ctx, key) != 0) { -+ if (do_tdb_delete(conn, &key, &node->acc) != 0) { - corrupt(conn, "Could not delete '%s'", node->name); - return; - } -@@ -1295,6 +1409,7 @@ static struct node *construct_node(struct connection *conn, const void *ctx, - /* No children, no data */ - node->children = node->data = NULL; - node->childlen = node->datalen = 0; -+ node->acc.memory = 0; - node->parent = parent; - return node; - -@@ -1303,17 +1418,17 @@ static struct node *construct_node(struct connection *conn, const void *ctx, - return NULL; - } - --static void destroy_node_rm(struct node *node) -+static void destroy_node_rm(struct connection *conn, struct node *node) - { - if (streq(node->name, "/")) - corrupt(NULL, "Destroying root node!"); - -- tdb_delete(tdb_ctx, node->key); -+ do_tdb_delete(conn, &node->key, &node->acc); - } - - static int destroy_node(struct connection *conn, struct node *node) - { -- destroy_node_rm(node); -+ destroy_node_rm(conn, node); - domain_entry_dec(conn, node); - - /* -@@ -1365,7 +1480,7 @@ static struct node *create_node(struct connection *conn, const void *ctx, - /* Account for new node */ - if (i->parent) { - if (domain_entry_inc(conn, i)) { -- destroy_node_rm(i); -+ destroy_node_rm(conn, i); - return NULL; - } - } -@@ -2291,7 +2406,7 @@ static int clean_store_(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA val, - if (!hashtable_search(reachable, name)) { - log("clean_store: '%s' is orphaned!", name); - if (recovery) { -- tdb_delete(tdb, key); -+ do_tdb_delete(NULL, &key, NULL); - } - } - -@@ -3149,6 +3264,7 @@ void 
read_state_node(const void *ctx, const void *state) - if (!node) - barf("allocation error restoring node"); - -+ node->acc.memory = 0; - node->name = name; - node->generation = ++generation; - node->datalen = sn->data_len; -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 373af18297bf..da9ecce67f31 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -176,6 +176,11 @@ struct node_perms { - struct xs_permissions *p; - }; - -+struct node_account_data { -+ unsigned int domid; -+ int memory; /* -1 if unknown */ -+}; -+ - struct node { - const char *name; - /* Key used to update TDB */ -@@ -198,6 +203,9 @@ struct node { - /* Children, each nul-terminated. */ - unsigned int childlen; - char *children; -+ -+ /* Allocation information for node currently in store. */ -+ struct node_account_data acc; - }; - - /* Return the only argument in the input. */ -@@ -306,6 +314,10 @@ extern xengnttab_handle **xgt_handle; - int remember_string(struct hashtable *hash, const char *str); - - void set_tdb_key(const char *name, TDB_DATA *key); -+int do_tdb_write(struct connection *conn, TDB_DATA *key, TDB_DATA *data, -+ struct node_account_data *acc, bool no_quota_check); -+int do_tdb_delete(struct connection *conn, TDB_DATA *key, -+ struct node_account_data *acc); - - void conn_free_buffered_data(struct connection *conn); - -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index 7bd41eb475e3..ace9a11d77bb 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -153,6 +153,9 @@ struct transaction - /* List of all transactions active on this connection. */ - struct list_head list; - -+ /* Connection this transaction is associated with. */ -+ struct connection *conn; -+ - /* Connection-local identifier for this transaction. 
*/ - uint32_t id; - -@@ -286,6 +289,8 @@ int access_node(struct connection *conn, struct node *node, - - introduce = true; - i->ta_node = false; -+ /* acc.memory < 0 means "unknown, get size from TDB". */ -+ node->acc.memory = -1; - - /* - * Additional transaction-specific node for read type. We only -@@ -410,11 +415,11 @@ static int finalize_transaction(struct connection *conn, - goto err; - hdr = (void *)data.dptr; - hdr->generation = ++generation; -- ret = tdb_store(tdb_ctx, key, data, -- TDB_REPLACE); -+ ret = do_tdb_write(conn, &key, &data, NULL, -+ true); - talloc_free(data.dptr); - } else { -- ret = tdb_delete(tdb_ctx, key); -+ ret = do_tdb_delete(conn, &key, NULL); - } - if (ret) - goto err; -@@ -425,7 +430,7 @@ static int finalize_transaction(struct connection *conn, - } - } - -- if (i->ta_node && tdb_delete(tdb_ctx, ta_key)) -+ if (i->ta_node && do_tdb_delete(conn, &ta_key, NULL)) - goto err; - list_del(&i->list); - talloc_free(i); -@@ -453,7 +458,7 @@ static int destroy_transaction(void *_transaction) - i->node); - if (trans_name) { - set_tdb_key(trans_name, &key); -- tdb_delete(tdb_ctx, key); -+ do_tdb_delete(trans->conn, &key, NULL); - } - } - list_del(&i->list); -@@ -497,6 +502,7 @@ int do_transaction_start(struct connection *conn, struct buffered_data *in) - - INIT_LIST_HEAD(&trans->accessed); - INIT_LIST_HEAD(&trans->changed_domains); -+ trans->conn = conn; - trans->fail = false; - trans->generation = ++generation; - diff --git a/xsa326-4.16-xenstored-15.patch b/xsa326-4.16-xenstored-15.patch deleted file mode 100644 index 8a42a36..0000000 --- a/xsa326-4.16-xenstored-15.patch +++ /dev/null @@ -1,54 +0,0 @@ -From c55a1ea0a5ea7f6a3dc850cb015a49ba9ec571ab Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:10 +0200 -Subject: tools/xenstore: add exports for quota variables - -Some quota variables are not exported via header files. - -This is part of XSA-326. 
- -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Acked-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index da9ecce67f31..bfd3fc1e9df3 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -275,6 +275,11 @@ extern TDB_CONTEXT *tdb_ctx; - extern int dom0_domid; - extern int dom0_event; - extern int priv_domid; -+extern int quota_nb_watch_per_domain; -+extern int quota_max_transaction; -+extern int quota_max_entry_size; -+extern int quota_nb_perms_per_node; -+extern int quota_max_path_len; - extern int quota_nb_entry_per_domain; - extern int quota_req_outstanding; - extern int quota_trans_nodes; -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index ace9a11d77bb..28774813de83 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -175,7 +175,6 @@ struct transaction - bool fail; - }; - --extern int quota_max_transaction; - uint64_t generation; - - static struct accessed_node *find_accessed_node(struct transaction *trans, -diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c -index fdf9b2d653a0..85362bcce314 100644 ---- a/tools/xenstore/xenstored_watch.c -+++ b/tools/xenstore/xenstored_watch.c -@@ -31,8 +31,6 @@ - #include "xenstored_domain.h" - #include "xenstored_transaction.h" - --extern int quota_nb_watch_per_domain; -- - struct watch - { - /* Watches on this connection */ diff --git a/xsa326-4.16-xenstored-16.patch b/xsa326-4.16-xenstored-16.patch deleted file mode 100644 index c2a8953..0000000 --- a/xsa326-4.16-xenstored-16.patch +++ /dev/null @@ -1,238 +0,0 @@ -From 05cc2af50ba43431d6d50aff758e968833aab9c6 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:10 +0200 -Subject: tools/xenstore: add control command for setting and showing quota - -Add a xenstore-control command "quota" to: -- show current quota settings -- 
change quota settings -- show current quota related values of a domain - -Note that in the case the new quota is lower than existing one, -Xenstored may continue to handle requests from a domain exceeding the -new limit (depends on which one has been broken) and the amount of -resource used will not change. However the domain will not be able to -create more resource (associated to the quota) until it is back to below -the limit. - -This is part of XSA-326. - -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/docs/misc/xenstore.txt b/docs/misc/xenstore.txt -index 334dc8b6fdf5..a7d006519ae8 100644 ---- a/docs/misc/xenstore.txt -+++ b/docs/misc/xenstore.txt -@@ -366,6 +366,17 @@ CONTROL |[|] - print| - print to syslog (xenstore runs as daemon) or - to console (xenstore runs as stubdom) -+ quota|[set |] -+ without parameters: print the current quota settings -+ with "set ": set the quota to new value -+ (The admin should make sure all the domain usage is -+ below the quota. If it is not, then Xenstored may continue to -+ handle requests from the domain as long as the resource -+ violating the new quota setting isn't increased further) -+ with "": print quota related accounting data for -+ the domain -+ quota-soft|[set ] -+ like the "quota" command, but for soft-quota. 
- help - return list of supported commands for CONTROL - -diff --git a/tools/xenstore/xenstored_control.c b/tools/xenstore/xenstored_control.c -index adb8d51b043b..1031a81c3874 100644 ---- a/tools/xenstore/xenstored_control.c -+++ b/tools/xenstore/xenstored_control.c -@@ -196,6 +196,115 @@ static int do_control_log(void *ctx, struct connection *conn, - return 0; - } - -+struct quota { -+ const char *name; -+ int *quota; -+ const char *descr; -+}; -+ -+static const struct quota hard_quotas[] = { -+ { "nodes", "a_nb_entry_per_domain, "Nodes per domain" }, -+ { "watches", "a_nb_watch_per_domain, "Watches per domain" }, -+ { "transactions", "a_max_transaction, "Transactions per domain" }, -+ { "outstanding", "a_req_outstanding, -+ "Outstanding requests per domain" }, -+ { "transaction-nodes", "a_trans_nodes, -+ "Max. number of accessed nodes per transaction" }, -+ { "memory", "a_memory_per_domain_hard, -+ "Total Xenstore memory per domain (error level)" }, -+ { "node-size", "a_max_entry_size, "Max. size of a node" }, -+ { "path-max", "a_max_path_len, "Max. length of a node path" }, -+ { "permissions", "a_nb_perms_per_node, -+ "Max. 
number of permissions per node" }, -+ { NULL, NULL, NULL } -+}; -+ -+static const struct quota soft_quotas[] = { -+ { "memory", "a_memory_per_domain_soft, -+ "Total Xenstore memory per domain (warning level)" }, -+ { NULL, NULL, NULL } -+}; -+ -+static int quota_show_current(const void *ctx, struct connection *conn, -+ const struct quota *quotas) -+{ -+ char *resp; -+ unsigned int i; -+ -+ resp = talloc_strdup(ctx, "Quota settings:\n"); -+ if (!resp) -+ return ENOMEM; -+ -+ for (i = 0; quotas[i].quota; i++) { -+ resp = talloc_asprintf_append(resp, "%-17s: %8d %s\n", -+ quotas[i].name, *quotas[i].quota, -+ quotas[i].descr); -+ if (!resp) -+ return ENOMEM; -+ } -+ -+ send_reply(conn, XS_CONTROL, resp, strlen(resp) + 1); -+ -+ return 0; -+} -+ -+static int quota_set(const void *ctx, struct connection *conn, -+ char **vec, int num, const struct quota *quotas) -+{ -+ unsigned int i; -+ int val; -+ -+ if (num != 2) -+ return EINVAL; -+ -+ val = atoi(vec[1]); -+ if (val < 1) -+ return EINVAL; -+ -+ for (i = 0; quotas[i].quota; i++) { -+ if (!strcmp(vec[0], quotas[i].name)) { -+ *quotas[i].quota = val; -+ send_ack(conn, XS_CONTROL); -+ return 0; -+ } -+ } -+ -+ return EINVAL; -+} -+ -+static int quota_get(const void *ctx, struct connection *conn, -+ char **vec, int num) -+{ -+ if (num != 1) -+ return EINVAL; -+ -+ return domain_get_quota(ctx, conn, atoi(vec[0])); -+} -+ -+static int do_control_quota(void *ctx, struct connection *conn, -+ char **vec, int num) -+{ -+ if (num == 0) -+ return quota_show_current(ctx, conn, hard_quotas); -+ -+ if (!strcmp(vec[0], "set")) -+ return quota_set(ctx, conn, vec + 1, num - 1, hard_quotas); -+ -+ return quota_get(ctx, conn, vec, num); -+} -+ -+static int do_control_quota_s(void *ctx, struct connection *conn, -+ char **vec, int num) -+{ -+ if (num == 0) -+ return quota_show_current(ctx, conn, soft_quotas); -+ -+ if (!strcmp(vec[0], "set")) -+ return quota_set(ctx, conn, vec + 1, num - 1, soft_quotas); -+ -+ return EINVAL; -+} -+ - #ifdef 
__MINIOS__ - static int do_control_memreport(void *ctx, struct connection *conn, - char **vec, int num) -@@ -847,6 +956,8 @@ static struct cmd_s cmds[] = { - { "memreport", do_control_memreport, "[]" }, - #endif - { "print", do_control_print, "" }, -+ { "quota", do_control_quota, "[set |]" }, -+ { "quota-soft", do_control_quota_s, "[set ]" }, - { "help", do_control_help, "" }, - }; - -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index 94fd561e9de4..e7c6886ccf47 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -31,6 +31,7 @@ - #include "xenstored_domain.h" - #include "xenstored_transaction.h" - #include "xenstored_watch.h" -+#include "xenstored_control.h" - - #include - #include -@@ -345,6 +346,38 @@ static struct domain *find_domain_struct(unsigned int domid) - return NULL; - } - -+int domain_get_quota(const void *ctx, struct connection *conn, -+ unsigned int domid) -+{ -+ struct domain *d = find_domain_struct(domid); -+ char *resp; -+ int ta; -+ -+ if (!d) -+ return ENOENT; -+ -+ ta = d->conn ? 
d->conn->transaction_started : 0; -+ resp = talloc_asprintf(ctx, "Domain %u:\n", domid); -+ if (!resp) -+ return ENOMEM; -+ -+#define ent(t, e) \ -+ resp = talloc_asprintf_append(resp, "%-16s: %8d\n", #t, e); \ -+ if (!resp) return ENOMEM -+ -+ ent(nodes, d->nbentry); -+ ent(watches, d->nbwatch); -+ ent(transactions, ta); -+ ent(outstanding, d->nboutstanding); -+ ent(memory, d->memory); -+ -+#undef ent -+ -+ send_reply(conn, XS_CONTROL, resp, strlen(resp) + 1); -+ -+ return 0; -+} -+ - static struct domain *alloc_domain(const void *context, unsigned int domid) - { - struct domain *domain; -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index 633c9a0a0a1f..904faa923afb 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ b/tools/xenstore/xenstored_domain.h -@@ -87,6 +87,8 @@ int domain_watch(struct connection *conn); - void domain_outstanding_inc(struct connection *conn); - void domain_outstanding_dec(struct connection *conn); - void domain_outstanding_domid_dec(unsigned int domid); -+int domain_get_quota(const void *ctx, struct connection *conn, -+ unsigned int domid); - - /* Special node permission handling. */ - int set_perms_special(struct connection *conn, const char *name, diff --git a/xsa409-4.13-0001-libxl-docs-Use-arch-specific-default-paging-memory.patch b/xsa409-4.13-0001-libxl-docs-Use-arch-specific-default-paging-memory.patch deleted file mode 100644 index 9aca0a7..0000000 --- a/xsa409-4.13-0001-libxl-docs-Use-arch-specific-default-paging-memory.patch +++ /dev/null @@ -1,147 +0,0 @@ -From bd4a7db4001364fd03a80a2e73b81c46aaa44e9c Mon Sep 17 00:00:00 2001 -From: Henry Wang -Date: Mon, 22 Aug 2022 01:35:09 +0000 -Subject: [PATCH 1/4] libxl, docs: Use arch-specific default paging memory - -The default paging memory (descibed in `shadow_memory` entry in xl -config) in libxl is used to determine the memory pool size for xl -guests. 
Currently this size is only used for x86, and contains a part -of RAM to shadow the resident processes. Since on Arm there is no -shadow mode guests, so the part of RAM to shadow the resident processes -is not necessary. Therefore, this commit splits the function -`libxl_get_required_shadow_memory()` to arch specific helpers and -renamed the helper to `libxl__arch_get_required_paging_memory()`. - -On x86, this helper calls the original value from -`libxl_get_required_shadow_memory()` so no functional change intended. - -On Arm, this helper returns 1MB per vcpu plus 4KB per MiB of RAM -for the P2M map and additional 512KB. - -Also update the xl.cfg documentation to add Arm documentation -according to code changes and correct the comment style following Xen -coding style. - -This is part of CVE-2022-33747 / XSA-409. - -Suggested-by: Julien Grall -Signed-off-by: Henry Wang -Reviewed-by: Anthony PERARD ---- - docs/man/xl.cfg.5.pod.in | 5 +++++ - tools/libs/light/libxl_arch.h | 4 ++++ - tools/libs/light/libxl_arm.c | 14 ++++++++++++++ - tools/libs/light/libxl_utils.c | 9 ++------- - tools/libs/light/libxl_x86.c | 13 +++++++++++++ - 5 files changed, 38 insertions(+), 7 deletions(-) - -diff --git a/docs/man/xl.cfg.5.pod.in b/docs/man/xl.cfg.5.pod.in -index b98d1613987e..eda1e77ebd06 100644 ---- a/docs/man/xl.cfg.5.pod.in -+++ b/docs/man/xl.cfg.5.pod.in -@@ -1768,6 +1768,11 @@ are not using hardware assisted paging (i.e. you are using shadow - mode) and your guest workload consists of a very large number of - similar processes then increasing this value may improve performance. - -+On Arm, this field is used to determine the size of the guest P2M pages -+pool, and the default value is 1MB per vCPU plus 4KB per MB of RAM for -+the P2M map and additional 512KB for extended regions. Users should -+adjust this value if bigger P2M pool size is needed. 
-+ - =back - - =head3 Processor and Platform Features -diff --git a/tools/libs/light/libxl_arch.h b/tools/libs/light/libxl_arch.h -index 1522ecb97f72..5a060c2c3033 100644 ---- a/tools/libs/light/libxl_arch.h -+++ b/tools/libs/light/libxl_arch.h -@@ -90,6 +90,10 @@ void libxl__arch_update_domain_config(libxl__gc *gc, - libxl_domain_config *dst, - const libxl_domain_config *src); - -+_hidden -+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb, -+ unsigned int smp_cpus); -+ - #if defined(__i386__) || defined(__x86_64__) - - #define LAPIC_BASE_ADDRESS 0xfee00000 -diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c -index eef1de093914..73a95e83af24 100644 ---- a/tools/libs/light/libxl_arm.c -+++ b/tools/libs/light/libxl_arm.c -@@ -154,6 +154,20 @@ out: - return rc; - } - -+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb, -+ unsigned int smp_cpus) -+{ -+ /* -+ * 256 pages (1MB) per vcpu, -+ * plus 1 page per MiB of RAM for the P2M map, -+ * plus 1 page per MiB of extended region. This default value is 128 MiB -+ * which should be enough for domains that are not running backend. -+ * This is higher than the minimum that Xen would allocate if no value -+ * were given (but the Xen minimum is for safety, not performance). 
-+ */ -+ return 4 * (256 * smp_cpus + maxmem_kb / 1024 + 128); -+} -+ - static struct arch_info { - const char *guest_type; - const char *timer_compat; -diff --git a/tools/libs/light/libxl_utils.c b/tools/libs/light/libxl_utils.c -index 4699c4a0a36f..e276c0ee9cc3 100644 ---- a/tools/libs/light/libxl_utils.c -+++ b/tools/libs/light/libxl_utils.c -@@ -18,6 +18,7 @@ - #include - - #include "libxl_internal.h" -+#include "libxl_arch.h" - #include "_paths.h" - - #ifndef LIBXL_HAVE_NONCONST_LIBXL_BASENAME_RETURN_VALUE -@@ -39,13 +40,7 @@ char *libxl_basename(const char *name) - - unsigned long libxl_get_required_shadow_memory(unsigned long maxmem_kb, unsigned int smp_cpus) - { -- /* 256 pages (1MB) per vcpu, -- plus 1 page per MiB of RAM for the P2M map, -- plus 1 page per MiB of RAM to shadow the resident processes. -- This is higher than the minimum that Xen would allocate if no value -- were given (but the Xen minimum is for safety, not performance). -- */ -- return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024)); -+ return libxl__arch_get_required_paging_memory(maxmem_kb, smp_cpus); - } - - char *libxl_domid_to_name(libxl_ctx *ctx, uint32_t domid) -diff --git a/tools/libs/light/libxl_x86.c b/tools/libs/light/libxl_x86.c -index 1feadebb1852..51362893cf98 100644 ---- a/tools/libs/light/libxl_x86.c -+++ b/tools/libs/light/libxl_x86.c -@@ -882,6 +882,19 @@ void libxl__arch_update_domain_config(libxl__gc *gc, - libxl_defbool_val(src->b_info.arch_x86.msr_relaxed)); - } - -+unsigned long libxl__arch_get_required_paging_memory(unsigned long maxmem_kb, -+ unsigned int smp_cpus) -+{ -+ /* -+ * 256 pages (1MB) per vcpu, -+ * plus 1 page per MiB of RAM for the P2M map, -+ * plus 1 page per MiB of RAM to shadow the resident processes. -+ * This is higher than the minimum that Xen would allocate if no value -+ * were given (but the Xen minimum is for safety, not performance). 
-+ */ -+ return 4 * (256 * smp_cpus + 2 * (maxmem_kb / 1024)); -+} -+ - /* - * Local variables: - * mode: C --- -2.37.1 - diff --git a/xsa409-4.13-0002-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch b/xsa409-4.13-0002-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch deleted file mode 100644 index 8c03657..0000000 --- a/xsa409-4.13-0002-xen-arm-Construct-the-P2M-pages-pool-for-guests.patch +++ /dev/null @@ -1,187 +0,0 @@ -From 419a4bbc20cf7c5d7d9dedae59fb8049922e6a2c Mon Sep 17 00:00:00 2001 -From: Henry Wang -Date: Mon, 6 Jun 2022 06:17:28 +0000 -Subject: [PATCH 2/4] xen/arm: Construct the P2M pages pool for guests - -This commit constructs the p2m pages pool for guests from the -data structure and helper perspective. - -This is implemented by: - -- Adding a `struct paging_domain` which contains a freelist, a -counter variable and a spinlock to `struct arch_domain` to -indicate the free p2m pages and the number of p2m total pages in -the p2m pages pool. - -- Adding a helper `p2m_get_allocation` to get the p2m pool size. - -- Adding a helper `p2m_set_allocation` to set the p2m pages pool -size. This helper should be called before allocating memory for -a guest. - -- Adding a helper `p2m_teardown_allocation` to free the p2m pages -pool. This helper should be called during the xl domain destory. - -This is part of CVE-2022-33747 / XSA-409. 
- -Signed-off-by: Henry Wang -Reviewed-by: Stefano Stabellini ---- - xen/arch/arm/p2m.c | 88 ++++++++++++++++++++++++++++++++++++ - xen/include/asm-arm/domain.h | 10 ++++ - xen/include/asm-arm/p2m.h | 4 ++ - 3 files changed, 102 insertions(+) - -diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c -index 3bcd1e897e88..79f3d37f5230 100644 ---- a/xen/arch/arm/p2m.c -+++ b/xen/arch/arm/p2m.c -@@ -50,6 +50,92 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn) - return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48)); - } - -+/* Return the size of the pool, rounded up to the nearest MB */ -+unsigned int p2m_get_allocation(struct domain *d) -+{ -+ unsigned long nr_pages = ACCESS_ONCE(d->arch.paging.p2m_total_pages); -+ -+ return ROUNDUP(nr_pages, 1 << (20 - PAGE_SHIFT)) >> (20 - PAGE_SHIFT); -+} -+ -+/* -+ * Set the pool of pages to the required number of pages. -+ * Returns 0 for success, non-zero for failure. -+ * Call with d->arch.paging.lock held. -+ */ -+int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted) -+{ -+ struct page_info *pg; -+ -+ ASSERT(spin_is_locked(&d->arch.paging.lock)); -+ -+ for ( ; ; ) -+ { -+ if ( d->arch.paging.p2m_total_pages < pages ) -+ { -+ /* Need to allocate more memory from domheap */ -+ pg = alloc_domheap_page(NULL, 0); -+ if ( pg == NULL ) -+ { -+ printk(XENLOG_ERR "Failed to allocate P2M pages.\n"); -+ return -ENOMEM; -+ } -+ ACCESS_ONCE(d->arch.paging.p2m_total_pages) = -+ d->arch.paging.p2m_total_pages + 1; -+ page_list_add_tail(pg, &d->arch.paging.p2m_freelist); -+ } -+ else if ( d->arch.paging.p2m_total_pages > pages ) -+ { -+ /* Need to return memory to domheap */ -+ pg = page_list_remove_head(&d->arch.paging.p2m_freelist); -+ if( pg ) -+ { -+ ACCESS_ONCE(d->arch.paging.p2m_total_pages) = -+ d->arch.paging.p2m_total_pages - 1; -+ free_domheap_page(pg); -+ } -+ else -+ { -+ printk(XENLOG_ERR -+ "Failed to free P2M pages, P2M freelist is empty.\n"); -+ return -ENOMEM; -+ } -+ } -+ else -+ 
break; -+ -+ /* Check to see if we need to yield and try again */ -+ if ( preempted && general_preempt_check() ) -+ { -+ *preempted = true; -+ return -ERESTART; -+ } -+ } -+ -+ return 0; -+} -+ -+int p2m_teardown_allocation(struct domain *d) -+{ -+ int ret = 0; -+ bool preempted = false; -+ -+ spin_lock(&d->arch.paging.lock); -+ if ( d->arch.paging.p2m_total_pages != 0 ) -+ { -+ ret = p2m_set_allocation(d, 0, &preempted); -+ if ( preempted ) -+ { -+ spin_unlock(&d->arch.paging.lock); -+ return -ERESTART; -+ } -+ ASSERT(d->arch.paging.p2m_total_pages == 0); -+ } -+ spin_unlock(&d->arch.paging.lock); -+ -+ return ret; -+} -+ - /* Unlock the flush and do a P2M TLB flush if necessary */ - void p2m_write_unlock(struct p2m_domain *p2m) - { -@@ -1599,7 +1685,9 @@ int p2m_init(struct domain *d) - unsigned int cpu; - - rwlock_init(&p2m->lock); -+ spin_lock_init(&d->arch.paging.lock); - INIT_PAGE_LIST_HEAD(&p2m->pages); -+ INIT_PAGE_LIST_HEAD(&d->arch.paging.p2m_freelist); - - p2m->vmid = INVALID_VMID; - -diff --git a/xen/include/asm-arm/domain.h b/xen/include/asm-arm/domain.h -index 9b3647587a04..c90daa65afa7 100644 ---- a/xen/include/asm-arm/domain.h -+++ b/xen/include/asm-arm/domain.h -@@ -40,6 +40,14 @@ struct vtimer { - uint64_t cval; - }; - -+struct paging_domain { -+ spinlock_t lock; -+ /* Free P2M pages from the pre-allocated P2M pool */ -+ struct page_list_head p2m_freelist; -+ /* Number of pages from the pre-allocated P2M pool */ -+ unsigned long p2m_total_pages; -+}; -+ - struct arch_domain - { - #ifdef CONFIG_ARM_64 -@@ -51,6 +59,8 @@ struct arch_domain - - struct hvm_domain hvm; - -+ struct paging_domain paging; -+ - struct vmmio vmmio; - - /* Continuable domain_relinquish_resources(). 
*/ -diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h -index b3ba83283e11..c9598740bd02 100644 ---- a/xen/include/asm-arm/p2m.h -+++ b/xen/include/asm-arm/p2m.h -@@ -218,6 +218,10 @@ void p2m_restore_state(struct vcpu *n); - /* Print debugging/statistial info about a domain's p2m */ - void p2m_dump_info(struct domain *d); - -+unsigned int p2m_get_allocation(struct domain *d); -+int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted); -+int p2m_teardown_allocation(struct domain *d); -+ - static inline void p2m_write_lock(struct p2m_domain *p2m) - { - write_lock(&p2m->lock); --- -2.37.1 - diff --git a/xsa409-4.13-0003-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch b/xsa409-4.13-0003-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch deleted file mode 100644 index bb638e1..0000000 --- a/xsa409-4.13-0003-xen-arm-libxl-Implement-XEN_DOMCTL_shadow_op-for-Arm.patch +++ /dev/null @@ -1,106 +0,0 @@ -From 332a9979d4dd0b047aa16db201c50fcedbd56743 Mon Sep 17 00:00:00 2001 -From: Henry Wang -Date: Mon, 6 Jun 2022 06:17:29 +0000 -Subject: [PATCH 3/4] xen/arm, libxl: Implement XEN_DOMCTL_shadow_op for Arm - -This commit implements the `XEN_DOMCTL_shadow_op` support in Xen -for Arm. The p2m pages pool size for xl guests is supposed to be -determined by `XEN_DOMCTL_shadow_op`. Hence, this commit: - -- Introduces a function `p2m_domctl` and implements the subops -`XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` and -`XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION` of `XEN_DOMCTL_shadow_op`. - -- Adds the `XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION` support in libxl. - -Therefore enabling the setting of shadow memory pool size -when creating a guest from xl and getting shadow memory pool size -from Xen. - -Note that the `XEN_DOMCTL_shadow_op` added in this commit is only -a dummy op, and the functionality of setting/getting p2m memory pool -size for xl guests will be added in following commits. - -This is part of CVE-2022-33747 / XSA-409. 
- -Signed-off-by: Henry Wang -Reviewed-by: Stefano Stabellini ---- - tools/libs/light/libxl_arm.c | 12 ++++++++++++ - xen/arch/arm/domctl.c | 32 ++++++++++++++++++++++++++++++++ - 2 files changed, 44 insertions(+) - -diff --git a/tools/libs/light/libxl_arm.c b/tools/libs/light/libxl_arm.c -index 73a95e83af24..22a0c561bbc6 100644 ---- a/tools/libs/light/libxl_arm.c -+++ b/tools/libs/light/libxl_arm.c -@@ -131,6 +131,18 @@ int libxl__arch_domain_create(libxl__gc *gc, - libxl__domain_build_state *state, - uint32_t domid) - { -+ libxl_ctx *ctx = libxl__gc_owner(gc); -+ unsigned int shadow_mb = DIV_ROUNDUP(d_config->b_info.shadow_memkb, 1024); -+ -+ int r = xc_shadow_control(ctx->xch, domid, -+ XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION, -+ &shadow_mb, 0); -+ if (r) { -+ LOGED(ERROR, domid, -+ "Failed to set %u MiB shadow allocation", shadow_mb); -+ return ERROR_FAIL; -+ } -+ - return 0; - } - -diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c -index 1baf25c3d98b..9bf72e693019 100644 ---- a/xen/arch/arm/domctl.c -+++ b/xen/arch/arm/domctl.c -@@ -47,11 +47,43 @@ static int handle_vuart_init(struct domain *d, - return rc; - } - -+static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc, -+ XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) -+{ -+ if ( unlikely(d == current->domain) ) -+ { -+ printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n"); -+ return -EINVAL; -+ } -+ -+ if ( unlikely(d->is_dying) ) -+ { -+ printk(XENLOG_ERR "Tried to do a p2m domctl op on dying domain %u\n", -+ d->domain_id); -+ return -EINVAL; -+ } -+ -+ switch ( sc->op ) -+ { -+ case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION: -+ return 0; -+ case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION: -+ return 0; -+ default: -+ { -+ printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op); -+ return -EINVAL; -+ } -+ } -+} -+ - long arch_do_domctl(struct xen_domctl *domctl, struct domain *d, - XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) - { - switch ( domctl->cmd ) - { -+ case XEN_DOMCTL_shadow_op: -+ 
return p2m_domctl(d, &domctl->u.shadow_op, u_domctl); - case XEN_DOMCTL_cacheflush: - { - gfn_t s = _gfn(domctl->u.cacheflush.start_pfn); --- -2.37.1 - diff --git a/xsa409-4.13-0004-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch b/xsa409-4.13-0004-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch deleted file mode 100644 index 5366c8f..0000000 --- a/xsa409-4.13-0004-xen-arm-Allocate-and-free-P2M-pages-from-the-P2M-poo.patch +++ /dev/null @@ -1,287 +0,0 @@ -From 39664d9ee041f96e9c7ee131ed8ef72a4d19c9f8 Mon Sep 17 00:00:00 2001 -From: Henry Wang -Date: Mon, 6 Jun 2022 06:17:30 +0000 -Subject: [PATCH 4/4] xen/arm: Allocate and free P2M pages from the P2M pool - -This commit sets/tearsdown of p2m pages pool for non-privileged Arm -guests by calling `p2m_set_allocation` and `p2m_teardown_allocation`. - -- For dom0, P2M pages should come from heap directly instead of p2m -pool, so that the kernel may take advantage of the extended regions. - -- For xl guests, the setting of the p2m pool is called in -`XEN_DOMCTL_shadow_op` and the p2m pool is destroyed in -`domain_relinquish_resources`. Note that domctl->u.shadow_op.mb is -updated with the new size when setting the p2m pool. - -- For dom0less domUs, the setting of the p2m pool is called before -allocating memory during domain creation. Users can specify the p2m -pool size by `xen,domain-p2m-mem-mb` dts property. - -To actually allocate/free pages from the p2m pool, this commit adds -two helper functions namely `p2m_alloc_page` and `p2m_free_page` to -`struct p2m_domain`. By replacing the `alloc_domheap_page` and -`free_domheap_page` with these two helper functions, p2m pages can -be added/removed from the list of p2m pool rather than from the heap. - -Since page from `p2m_alloc_page` is cleaned, take the opportunity -to remove the redundant `clean_page` in `p2m_create_table`. - -This is part of CVE-2022-33747 / XSA-409. 
- -Signed-off-by: Henry Wang -Reviewed-by: Stefano Stabellini ---- - docs/misc/arm/device-tree/booting.txt | 8 ++++ - xen/arch/arm/domain.c | 6 +++ - xen/arch/arm/domain_build.c | 29 ++++++++++++++ - xen/arch/arm/domctl.c | 23 ++++++++++- - xen/arch/arm/p2m.c | 57 +++++++++++++++++++++++++-- - 5 files changed, 118 insertions(+), 5 deletions(-) - -diff --git a/docs/misc/arm/device-tree/booting.txt b/docs/misc/arm/device-tree/booting.txt -index 71895663a4de..d92ccc56ffe0 100644 ---- a/docs/misc/arm/device-tree/booting.txt -+++ b/docs/misc/arm/device-tree/booting.txt -@@ -182,6 +182,14 @@ with the following properties: - Both #address-cells and #size-cells need to be specified because - both sub-nodes (described shortly) have reg properties. - -+- xen,domain-p2m-mem-mb -+ -+ Optional. A 32-bit integer specifying the amount of megabytes of RAM -+ used for the domain P2M pool. This is in-sync with the shadow_memory -+ option in xl.cfg. Leaving this field empty in device tree will lead to -+ the default size of domain P2M pool, i.e. 1MB per guest vCPU plus 4KB -+ per MB of guest RAM plus 512KB for guest extended regions. -+ - Under the "xen,domain" compatible node, one or more sub-nodes are present - for the DomU kernel and ramdisk. 
- -diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c -index 2694c39127c5..a818f33a1afa 100644 ---- a/xen/arch/arm/domain.c -+++ b/xen/arch/arm/domain.c -@@ -997,6 +997,7 @@ enum { - PROG_page, - PROG_mapping, - PROG_p2m, -+ PROG_p2m_pool, - PROG_done, - }; - -@@ -1062,6 +1063,11 @@ int domain_relinquish_resources(struct domain *d) - if ( ret ) - return ret; - -+ PROGRESS(p2m_pool): -+ ret = p2m_teardown_allocation(d); -+ if( ret ) -+ return ret; -+ - PROGRESS(done): - break; - -diff --git a/xen/arch/arm/domain_build.c b/xen/arch/arm/domain_build.c -index d02bacbcd1ed..8aec3755ca5d 100644 ---- a/xen/arch/arm/domain_build.c -+++ b/xen/arch/arm/domain_build.c -@@ -2833,6 +2833,21 @@ static void __init find_gnttab_region(struct domain *d, - kinfo->gnttab_start, kinfo->gnttab_start + kinfo->gnttab_size); - } - -+static unsigned long __init domain_p2m_pages(unsigned long maxmem_kb, -+ unsigned int smp_cpus) -+{ -+ /* -+ * Keep in sync with libxl__get_required_paging_memory(). -+ * 256 pages (1MB) per vcpu, plus 1 page per MiB of RAM for the P2M map, -+ * plus 128 pages to cover extended regions. -+ */ -+ unsigned long memkb = 4 * (256 * smp_cpus + (maxmem_kb / 1024) + 128); -+ -+ BUILD_BUG_ON(PAGE_SIZE != SZ_4K); -+ -+ return DIV_ROUND_UP(memkb, 1024) << (20 - PAGE_SHIFT); -+} -+ - static int __init construct_domain(struct domain *d, struct kernel_info *kinfo) - { - unsigned int i; -@@ -2924,6 +2939,8 @@ static int __init construct_domU(struct domain *d, - struct kernel_info kinfo = {}; - int rc; - u64 mem; -+ u32 p2m_mem_mb; -+ unsigned long p2m_pages; - - rc = dt_property_read_u64(node, "memory", &mem); - if ( !rc ) -@@ -2933,6 +2950,18 @@ static int __init construct_domU(struct domain *d, - } - kinfo.unassigned_mem = (paddr_t)mem * SZ_1K; - -+ rc = dt_property_read_u32(node, "xen,domain-p2m-mem-mb", &p2m_mem_mb); -+ /* If xen,domain-p2m-mem-mb is not specified, use the default value. */ -+ p2m_pages = rc ? 
-+ p2m_mem_mb << (20 - PAGE_SHIFT) : -+ domain_p2m_pages(mem, d->max_vcpus); -+ -+ spin_lock(&d->arch.paging.lock); -+ rc = p2m_set_allocation(d, p2m_pages, NULL); -+ spin_unlock(&d->arch.paging.lock); -+ if ( rc != 0 ) -+ return rc; -+ - printk("*** LOADING DOMU cpus=%u memory=%"PRIx64"KB ***\n", d->max_vcpus, mem); - - kinfo.vpl011 = dt_property_read_bool(node, "vpl011"); -diff --git a/xen/arch/arm/domctl.c b/xen/arch/arm/domctl.c -index 9bf72e693019..c8fdeb124084 100644 ---- a/xen/arch/arm/domctl.c -+++ b/xen/arch/arm/domctl.c -@@ -50,6 +50,9 @@ static int handle_vuart_init(struct domain *d, - static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc, - XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) - { -+ long rc; -+ bool preempted = false; -+ - if ( unlikely(d == current->domain) ) - { - printk(XENLOG_ERR "Tried to do a p2m domctl op on itself.\n"); -@@ -66,9 +69,27 @@ static long p2m_domctl(struct domain *d, struct xen_domctl_shadow_op *sc, - switch ( sc->op ) - { - case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION: -- return 0; -+ { -+ /* Allow and handle preemption */ -+ spin_lock(&d->arch.paging.lock); -+ rc = p2m_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted); -+ spin_unlock(&d->arch.paging.lock); -+ -+ if ( preempted ) -+ /* Not finished. Set up to re-run the call. */ -+ rc = hypercall_create_continuation(__HYPERVISOR_domctl, "h", -+ u_domctl); -+ else -+ /* Finished. Return the new allocation. 
*/ -+ sc->mb = p2m_get_allocation(d); -+ -+ return rc; -+ } - case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION: -+ { -+ sc->mb = p2m_get_allocation(d); - return 0; -+ } - default: - { - printk(XENLOG_ERR "Bad p2m domctl op %u\n", sc->op); -diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c -index 79f3d37f5230..1bf9cbeb53cf 100644 ---- a/xen/arch/arm/p2m.c -+++ b/xen/arch/arm/p2m.c -@@ -50,6 +50,54 @@ static uint64_t generate_vttbr(uint16_t vmid, mfn_t root_mfn) - return (mfn_to_maddr(root_mfn) | ((uint64_t)vmid << 48)); - } - -+static struct page_info *p2m_alloc_page(struct domain *d) -+{ -+ struct page_info *pg; -+ -+ spin_lock(&d->arch.paging.lock); -+ /* -+ * For hardware domain, there should be no limit in the number of pages that -+ * can be allocated, so that the kernel may take advantage of the extended -+ * regions. Hence, allocate p2m pages for hardware domains from heap. -+ */ -+ if ( is_hardware_domain(d) ) -+ { -+ pg = alloc_domheap_page(NULL, 0); -+ if ( pg == NULL ) -+ { -+ printk(XENLOG_G_ERR "Failed to allocate P2M pages for hwdom.\n"); -+ spin_unlock(&d->arch.paging.lock); -+ return NULL; -+ } -+ } -+ else -+ { -+ pg = page_list_remove_head(&d->arch.paging.p2m_freelist); -+ if ( unlikely(!pg) ) -+ { -+ spin_unlock(&d->arch.paging.lock); -+ return NULL; -+ } -+ d->arch.paging.p2m_total_pages--; -+ } -+ spin_unlock(&d->arch.paging.lock); -+ -+ return pg; -+} -+ -+static void p2m_free_page(struct domain *d, struct page_info *pg) -+{ -+ spin_lock(&d->arch.paging.lock); -+ if ( is_hardware_domain(d) ) -+ free_domheap_page(pg); -+ else -+ { -+ d->arch.paging.p2m_total_pages++; -+ page_list_add_tail(pg, &d->arch.paging.p2m_freelist); -+ } -+ spin_unlock(&d->arch.paging.lock); -+} -+ - /* Return the size of the pool, rounded up to the nearest MB */ - unsigned int p2m_get_allocation(struct domain *d) - { -@@ -751,7 +799,7 @@ static int p2m_create_table(struct p2m_domain *p2m, lpae_t *entry) - - ASSERT(!p2m_is_valid(*entry)); - -- page = alloc_domheap_page(NULL, 
0); -+ page = p2m_alloc_page(p2m->domain); - if ( page == NULL ) - return -ENOMEM; - -@@ -878,7 +926,7 @@ static void p2m_free_entry(struct p2m_domain *p2m, - pg = mfn_to_page(mfn); - - page_list_del(pg, &p2m->pages); -- free_domheap_page(pg); -+ p2m_free_page(p2m->domain, pg); - } - - static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry, -@@ -902,7 +950,7 @@ static bool p2m_split_superpage(struct p2m_domain *p2m, lpae_t *entry, - ASSERT(level < target); - ASSERT(p2m_is_superpage(*entry, level)); - -- page = alloc_domheap_page(NULL, 0); -+ page = p2m_alloc_page(p2m->domain); - if ( !page ) - return false; - -@@ -1641,7 +1689,7 @@ int p2m_teardown(struct domain *d) - - while ( (pg = page_list_remove_head(&p2m->pages)) ) - { -- free_domheap_page(pg); -+ p2m_free_page(p2m->domain, pg); - count++; - /* Arbitrarily preempt every 512 iterations */ - if ( !(count % 512) && hypercall_preempt_check() ) -@@ -1665,6 +1713,7 @@ void p2m_final_teardown(struct domain *d) - return; - - ASSERT(page_list_empty(&p2m->pages)); -+ ASSERT(page_list_empty(&d->arch.paging.p2m_freelist)); - - if ( p2m->root ) - free_domheap_pages(p2m->root, P2M_ROOT_ORDER); --- -2.37.1 - diff --git a/xsa410-4.16-01.patch b/xsa410-4.16-01.patch deleted file mode 100644 index 8f352ee..0000000 --- a/xsa410-4.16-01.patch +++ /dev/null @@ -1,59 +0,0 @@ -From 4b4359122a414cc15156e13e3805988b71ff9da0 Mon Sep 17 00:00:00 2001 -From: Julien Grall -Date: Mon, 6 Jun 2022 06:17:25 +0000 -Subject: [PATCH 1/2] xen/arm: p2m: Prevent adding mapping when domain is dying - -During the domain destroy process, the domain will still be accessible -until it is fully destroyed. So does the P2M because we don't bail -out early if is_dying is non-zero. If a domain has permission to -modify the other domain's P2M (i.e. dom0, or a stubdomain), then -foreign mapping can be added past relinquish_p2m_mapping(). - -Therefore, we need to prevent mapping to be added when the domain -is dying. 
This commit prevents such adding of mapping by adding the -d->is_dying check to p2m_set_entry(). Also this commit enhances the -check in relinquish_p2m_mapping() to make sure that no mappings can -be added in the P2M after the P2M lock is released. - -This is part of CVE-2022-33746 / XSA-410. - -Signed-off-by: Julien Grall -Signed-off-by: Henry Wang -Tested-by: Henry Wang -Reviewed-by: Stefano Stabellini ---- - xen/arch/arm/p2m.c | 11 +++++++++++ - 1 file changed, 11 insertions(+) - -diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c -index fb71fa4c1c90..cbeff90f4371 100644 ---- a/xen/arch/arm/p2m.c -+++ b/xen/arch/arm/p2m.c -@@ -1093,6 +1093,15 @@ int p2m_set_entry(struct p2m_domain *p2m, - { - int rc = 0; - -+ /* -+ * Any reference taken by the P2M mappings (e.g. foreign mapping) will -+ * be dropped in relinquish_p2m_mapping(). As the P2M will still -+ * be accessible after, we need to prevent mapping to be added when the -+ * domain is dying. -+ */ -+ if ( unlikely(p2m->domain->is_dying) ) -+ return -ENOMEM; -+ - while ( nr ) - { - unsigned long mask; -@@ -1610,6 +1619,8 @@ int relinquish_p2m_mapping(struct domain *d) - unsigned int order; - gfn_t start, end; - -+ BUG_ON(!d->is_dying); -+ /* No mappings can be added in the P2M after the P2M lock is released. */ - p2m_write_lock(p2m); - - start = p2m->lowest_mapped_gfn; --- -2.37.1 - diff --git a/xsa410-4.16-02.patch b/xsa410-4.16-02.patch deleted file mode 100644 index 7599a47..0000000 --- a/xsa410-4.16-02.patch +++ /dev/null @@ -1,165 +0,0 @@ -From 0d5846490348fa09a0d0915d7c795685a016ce10 Mon Sep 17 00:00:00 2001 -From: Julien Grall -Date: Mon, 6 Jun 2022 06:17:26 +0000 -Subject: [PATCH 2/2] xen/arm: p2m: Handle preemption when freeing intermediate - page tables - -At the moment the P2M page tables will be freed when the domain structure -is freed without any preemption. 
As the P2M is quite large, iterating -through this may take more time than it is reasonable without intermediate -preemption (to run softirqs and perhaps scheduler). - -Split p2m_teardown() in two parts: one preemptible and called when -relinquishing the resources, the other one non-preemptible and called -when freeing the domain structure. - -As we are now freeing the P2M pages early, we also need to prevent -further allocation if someone call p2m_set_entry() past p2m_teardown() -(I wasn't able to prove this will never happen). This is done by -the checking domain->is_dying from previous patch in p2m_set_entry(). - -Similarly, we want to make sure that no-one can accessed the free -pages. Therefore the root is cleared before freeing pages. - -This is part of CVE-2022-33746 / XSA-410. - -Signed-off-by: Julien Grall -Signed-off-by: Henry Wang -Tested-by: Henry Wang -Reviewed-by: Stefano Stabellini ---- - xen/arch/arm/domain.c | 10 +++++++-- - xen/arch/arm/p2m.c | 47 ++++++++++++++++++++++++++++++++++++--- - xen/include/asm-arm/p2m.h | 13 +++++++++-- - 3 files changed, 63 insertions(+), 7 deletions(-) - -diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c -index 96e1b235501d..2694c39127c5 100644 ---- a/xen/arch/arm/domain.c -+++ b/xen/arch/arm/domain.c -@@ -789,10 +789,10 @@ fail: - void arch_domain_destroy(struct domain *d) - { - /* IOMMU page table is shared with P2M, always call -- * iommu_domain_destroy() before p2m_teardown(). -+ * iommu_domain_destroy() before p2m_final_teardown(). 
- */ - iommu_domain_destroy(d); -- p2m_teardown(d); -+ p2m_final_teardown(d); - domain_vgic_free(d); - domain_vuart_free(d); - free_xenheap_page(d->shared_info); -@@ -996,6 +996,7 @@ enum { - PROG_xen, - PROG_page, - PROG_mapping, -+ PROG_p2m, - PROG_done, - }; - -@@ -1056,6 +1057,11 @@ int domain_relinquish_resources(struct domain *d) - if ( ret ) - return ret; - -+ PROGRESS(p2m): -+ ret = p2m_teardown(d); -+ if ( ret ) -+ return ret; -+ - PROGRESS(done): - break; - -diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c -index cbeff90f4371..3bcd1e897e88 100644 ---- a/xen/arch/arm/p2m.c -+++ b/xen/arch/arm/p2m.c -@@ -1527,17 +1527,58 @@ static void p2m_free_vmid(struct domain *d) - spin_unlock(&vmid_alloc_lock); - } - --void p2m_teardown(struct domain *d) -+int p2m_teardown(struct domain *d) - { - struct p2m_domain *p2m = p2m_get_hostp2m(d); -+ unsigned long count = 0; - struct page_info *pg; -+ unsigned int i; -+ int rc = 0; -+ -+ p2m_write_lock(p2m); -+ -+ /* -+ * We are about to free the intermediate page-tables, so clear the -+ * root to prevent any walk to use them. -+ */ -+ for ( i = 0; i < P2M_ROOT_PAGES; i++ ) -+ clear_and_clean_page(p2m->root + i); -+ -+ /* -+ * The domain will not be scheduled anymore, so in theory we should -+ * not need to flush the TLBs. Do it for safety purpose. -+ * -+ * Note that all the devices have already been de-assigned. So we don't -+ * need to flush the IOMMU TLB here. 
-+ */ -+ p2m_force_tlb_flush_sync(p2m); -+ -+ while ( (pg = page_list_remove_head(&p2m->pages)) ) -+ { -+ free_domheap_page(pg); -+ count++; -+ /* Arbitrarily preempt every 512 iterations */ -+ if ( !(count % 512) && hypercall_preempt_check() ) -+ { -+ rc = -ERESTART; -+ break; -+ } -+ } -+ -+ p2m_write_unlock(p2m); -+ -+ return rc; -+} -+ -+void p2m_final_teardown(struct domain *d) -+{ -+ struct p2m_domain *p2m = p2m_get_hostp2m(d); - - /* p2m not actually initialized */ - if ( !p2m->domain ) - return; - -- while ( (pg = page_list_remove_head(&p2m->pages)) ) -- free_domheap_page(pg); -+ ASSERT(page_list_empty(&p2m->pages)); - - if ( p2m->root ) - free_domheap_pages(p2m->root, P2M_ROOT_ORDER); -diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h -index 8f11d9c97b5d..b3ba83283e11 100644 ---- a/xen/include/asm-arm/p2m.h -+++ b/xen/include/asm-arm/p2m.h -@@ -192,8 +192,17 @@ void setup_virt_paging(void); - /* Init the datastructures for later use by the p2m code */ - int p2m_init(struct domain *d); - --/* Return all the p2m resources to Xen. */ --void p2m_teardown(struct domain *d); -+/* -+ * The P2M resources are freed in two parts: -+ * - p2m_teardown() will be called when relinquish the resources. It -+ * will free large resources (e.g. intermediate page-tables) that -+ * requires preemption. -+ * - p2m_final_teardown() will be called when domain struct is been -+ * freed. This *cannot* be preempted and therefore one small -+ * resources should be freed here. 
-+ */ -+int p2m_teardown(struct domain *d); -+void p2m_final_teardown(struct domain *d); - - /* - * Remove mapping refcount on each mapping page in the p2m --- -2.37.1 - diff --git a/xsa410-4.16-03.patch b/xsa410-4.16-03.patch deleted file mode 100644 index 7411d90..0000000 --- a/xsa410-4.16-03.patch +++ /dev/null @@ -1,113 +0,0 @@ -From: Roger Pau Monné -Subject: x86/p2m: add option to skip root pagetable removal in p2m_teardown() - -Add a new parameter to p2m_teardown() in order to select whether the -root page table should also be freed. Note that all users are -adjusted to pass the parameter to remove the root page tables, so -behavior is not modified. - -No functional change intended. - -This is part of CVE-2022-33746 / XSA-410. - -Suggested-by: Julien Grall -Signed-off-by: Roger Pau Monné -Reviewed-by: Jan Beulich -Acked-by: Tim Deegan - ---- a/xen/include/asm-x86/p2m.h -+++ b/xen/include/asm-x86/p2m.h -@@ -574,7 +574,7 @@ int p2m_init(struct domain *d); - int p2m_alloc_table(struct p2m_domain *p2m); - - /* Return all the p2m resources to Xen. 
*/ --void p2m_teardown(struct p2m_domain *p2m); -+void p2m_teardown(struct p2m_domain *p2m, bool remove_root); - void p2m_final_teardown(struct domain *d); - - /* Add a page to a domain's p2m table */ ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -541,18 +541,18 @@ void hap_final_teardown(struct domain *d - } - - for ( i = 0; i < MAX_ALTP2M; i++ ) -- p2m_teardown(d->arch.altp2m_p2m[i]); -+ p2m_teardown(d->arch.altp2m_p2m[i], true); - } - - /* Destroy nestedp2m's first */ - for (i = 0; i < MAX_NESTEDP2M; i++) { -- p2m_teardown(d->arch.nested_p2m[i]); -+ p2m_teardown(d->arch.nested_p2m[i], true); - } - - if ( d->arch.paging.hap.total_pages != 0 ) - hap_teardown(d, NULL); - -- p2m_teardown(p2m_get_hostp2m(d)); -+ p2m_teardown(p2m_get_hostp2m(d), true); - /* Free any memory that the p2m teardown released */ - paging_lock(d); - hap_set_allocation(d, 0, NULL); ---- a/xen/arch/x86/mm/p2m.c -+++ b/xen/arch/x86/mm/p2m.c -@@ -749,11 +749,11 @@ int p2m_alloc_table(struct p2m_domain *p - * hvm fixme: when adding support for pvh non-hardware domains, this path must - * cleanup any foreign p2m types (release refcnts on them). - */ --void p2m_teardown(struct p2m_domain *p2m) -+void p2m_teardown(struct p2m_domain *p2m, bool remove_root) - /* Return all the p2m pages to Xen. 
- * We know we don't have any extra mappings to these pages */ - { -- struct page_info *pg; -+ struct page_info *pg, *root_pg = NULL; - struct domain *d; - - if (p2m == NULL) -@@ -763,10 +763,22 @@ void p2m_teardown(struct p2m_domain *p2m - - p2m_lock(p2m); - ASSERT(atomic_read(&d->shr_pages) == 0); -- p2m->phys_table = pagetable_null(); -+ -+ if ( remove_root ) -+ p2m->phys_table = pagetable_null(); -+ else if ( !pagetable_is_null(p2m->phys_table) ) -+ { -+ root_pg = pagetable_get_page(p2m->phys_table); -+ clear_domain_page(pagetable_get_mfn(p2m->phys_table)); -+ } - - while ( (pg = page_list_remove_head(&p2m->pages)) ) -- d->arch.paging.free_page(d, pg); -+ if ( pg != root_pg ) -+ d->arch.paging.free_page(d, pg); -+ -+ if ( root_pg ) -+ page_list_add(root_pg, &p2m->pages); -+ - p2m_unlock(p2m); - } - ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -2701,7 +2701,7 @@ int shadow_enable(struct domain *d, u32 - paging_unlock(d); - out_unlocked: - if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) ) -- p2m_teardown(p2m); -+ p2m_teardown(p2m, true); - if ( rv != 0 && pg != NULL ) - { - pg->count_info &= ~PGC_count_mask; -@@ -2866,7 +2866,7 @@ void shadow_final_teardown(struct domain - shadow_teardown(d, NULL); - - /* It is now safe to pull down the p2m map. */ -- p2m_teardown(p2m_get_hostp2m(d)); -+ p2m_teardown(p2m_get_hostp2m(d), true); - /* Free any shadow memory that the p2m teardown released */ - paging_lock(d); - shadow_set_allocation(d, 0, NULL); diff --git a/xsa410-4.16-04.patch b/xsa410-4.16-04.patch deleted file mode 100644 index 2170b8e..0000000 --- a/xsa410-4.16-04.patch +++ /dev/null @@ -1,62 +0,0 @@ -From: Jan Beulich -Subject: x86/HAP: adjust monitor table related error handling - -hap_make_monitor_table() will return INVALID_MFN if it encounters an -error condition, but hap_update_paging_modes() wasn’t handling this -value, resulting in an inappropriate value being stored in -monitor_table. 
This would subsequently misguide at least -hap_vcpu_teardown(). Avoid this by bailing early. - -Further, when a domain has/was already crashed or (perhaps less -important as there's no such path known to lead here) is already dying, -avoid calling domain_crash() on it again - that's at best confusing. - -This is part of CVE-2022-33746 / XSA-410. - -Signed-off-by: Jan Beulich -Reviewed-by: Roger Pau Monné - ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -39,6 +39,7 @@ - #include - #include - #include -+#include - - #include "private.h" - -@@ -405,8 +406,13 @@ static mfn_t hap_make_monitor_table(stru - return m4mfn; - - oom: -- printk(XENLOG_G_ERR "out of memory building monitor pagetable\n"); -- domain_crash(d); -+ if ( !d->is_dying && -+ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) -+ { -+ printk(XENLOG_G_ERR "%pd: out of memory building monitor pagetable\n", -+ d); -+ domain_crash(d); -+ } - return INVALID_MFN; - } - -@@ -766,6 +772,9 @@ static void hap_update_paging_modes(stru - if ( pagetable_is_null(v->arch.hvm.monitor_table) ) - { - mfn_t mmfn = hap_make_monitor_table(v); -+ -+ if ( mfn_eq(mmfn, INVALID_MFN) ) -+ goto unlock; - v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn); - make_cr3(v, mmfn); - hvm_update_host_cr3(v); -@@ -774,6 +783,7 @@ static void hap_update_paging_modes(stru - /* CR3 is effectively updated by a mode change. Flush ASIDs, etc. */ - hap_update_cr3(v, 0, false); - -+ unlock: - paging_unlock(d); - put_gfn(d, cr3_gfn); - } diff --git a/xsa410-4.16-05.patch b/xsa410-4.16-05.patch deleted file mode 100644 index dc626c7..0000000 --- a/xsa410-4.16-05.patch +++ /dev/null @@ -1,60 +0,0 @@ -From: Jan Beulich -Subject: x86/shadow: tolerate failure of sh_set_toplevel_shadow() - -Subsequently sh_set_toplevel_shadow() will be adjusted to install a -blank entry in case prealloc fails. There are, in fact, pre-existing -error paths which would put in place a blank entry. 
The 4- and 2-level -code in sh_update_cr3(), however, assume the top level entry to be -valid. - -Hence bail from the function in the unlikely event that it's not. Note -that 3-level logic works differently: In particular a guest is free to -supply a PDPTR pointing at 4 non-present (or otherwise deemed invalid) -entries. The guest will crash, but we already cope with that. - -Really mfn_valid() is likely wrong to use in sh_set_toplevel_shadow(), -and it should instead be !mfn_eq(gmfn, INVALID_MFN). Avoid such a change -in security context, but add a respective assertion. - -This is part of CVE-2022-33746 / XSA-410. - -Signed-off-by: Jan Beulich -Acked-by: Tim Deegan -Reviewed-by: Andrew Cooper - ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -2516,6 +2516,7 @@ void sh_set_toplevel_shadow(struct vcpu - /* Now figure out the new contents: is this a valid guest MFN? */ - if ( !mfn_valid(gmfn) ) - { -+ ASSERT(mfn_eq(gmfn, INVALID_MFN)); - new_entry = pagetable_null(); - goto install_new_entry; - } ---- a/xen/arch/x86/mm/shadow/multi.c -+++ b/xen/arch/x86/mm/shadow/multi.c -@@ -3312,6 +3312,11 @@ sh_update_cr3(struct vcpu *v, int do_loc - if ( sh_remove_write_access(d, gmfn, 4, 0) != 0 ) - guest_flush_tlb_mask(d, d->dirty_cpumask); - sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l4_shadow, sh_make_shadow); -+ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) ) -+ { -+ ASSERT(d->is_dying || d->is_shutting_down); -+ return; -+ } - if ( !shadow_mode_external(d) && !is_pv_32bit_domain(d) ) - { - mfn_t smfn = pagetable_get_mfn(v->arch.paging.shadow.shadow_table[0]); -@@ -3370,6 +3375,11 @@ sh_update_cr3(struct vcpu *v, int do_loc - if ( sh_remove_write_access(d, gmfn, 2, 0) != 0 ) - guest_flush_tlb_mask(d, d->dirty_cpumask); - sh_set_toplevel_shadow(v, 0, gmfn, SH_type_l2_shadow, sh_make_shadow); -+ if ( unlikely(pagetable_is_null(v->arch.paging.shadow.shadow_table[0])) ) -+ { -+ ASSERT(d->is_dying || 
d->is_shutting_down); -+ return; -+ } - #else - #error This should never happen - #endif diff --git a/xsa410-4.16-06.patch b/xsa410-4.16-06.patch deleted file mode 100644 index 8c89eb3..0000000 --- a/xsa410-4.16-06.patch +++ /dev/null @@ -1,255 +0,0 @@ -From: Roger Pau Monné -Subject: x86/shadow: tolerate failure in shadow_prealloc() - -Prevent _shadow_prealloc() from calling BUG() when unable to fulfill -the pre-allocation and instead return true/false. Modify -shadow_prealloc() to crash the domain on allocation failure (if the -domain is not already dying), as shadow cannot operate normally after -that. Modify callers to also gracefully handle {_,}shadow_prealloc() -failing to fulfill the request. - -Note this in turn requires adjusting the callers of -sh_make_monitor_table() also to handle it returning INVALID_MFN. -sh_update_paging_modes() is also modified to add additional error -paths in case of allocation failure, some of those will return with -null monitor page tables (and the domain likely crashed). This is no -different that current error paths, but the newly introduced ones are -more likely to trigger. - -The now added failure points in sh_update_paging_modes() also require -that on some error return paths the previous structures are cleared, -and thus monitor table is null. - -While there adjust the 'type' parameter type of shadow_prealloc() to -unsigned int rather than u32. - -This is part of CVE-2022-33746 / XSA-410. - -Signed-off-by: Roger Pau Monné -Signed-off-by: Jan Beulich -Acked-by: Tim Deegan - ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -36,6 +36,7 @@ - #include - #include - #include -+#include - #include "private.h" - - DEFINE_PER_CPU(uint32_t,trace_shadow_path_flags); -@@ -928,14 +929,15 @@ static inline void trace_shadow_prealloc - - /* Make sure there are at least count order-sized pages - * available in the shadow page pool. 
*/ --static void _shadow_prealloc(struct domain *d, unsigned int pages) -+static bool __must_check _shadow_prealloc(struct domain *d, unsigned int pages) - { - struct vcpu *v; - struct page_info *sp, *t; - mfn_t smfn; - int i; - -- if ( d->arch.paging.shadow.free_pages >= pages ) return; -+ if ( d->arch.paging.shadow.free_pages >= pages ) -+ return true; - - /* Shouldn't have enabled shadows if we've no vcpus. */ - ASSERT(d->vcpu && d->vcpu[0]); -@@ -951,7 +953,8 @@ static void _shadow_prealloc(struct doma - sh_unpin(d, smfn); - - /* See if that freed up enough space */ -- if ( d->arch.paging.shadow.free_pages >= pages ) return; -+ if ( d->arch.paging.shadow.free_pages >= pages ) -+ return true; - } - - /* Stage two: all shadow pages are in use in hierarchies that are -@@ -974,7 +977,7 @@ static void _shadow_prealloc(struct doma - if ( d->arch.paging.shadow.free_pages >= pages ) - { - guest_flush_tlb_mask(d, d->dirty_cpumask); -- return; -+ return true; - } - } - } -@@ -987,7 +990,12 @@ static void _shadow_prealloc(struct doma - d->arch.paging.shadow.total_pages, - d->arch.paging.shadow.free_pages, - d->arch.paging.shadow.p2m_pages); -- BUG(); -+ -+ ASSERT(d->is_dying); -+ -+ guest_flush_tlb_mask(d, d->dirty_cpumask); -+ -+ return false; - } - - /* Make sure there are at least count pages of the order according to -@@ -995,9 +1003,19 @@ static void _shadow_prealloc(struct doma - * This must be called before any calls to shadow_alloc(). Since this - * will free existing shadows to make room, it must be called early enough - * to avoid freeing shadows that the caller is currently working on. 
*/ --void shadow_prealloc(struct domain *d, u32 type, unsigned int count) -+bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count) - { -- return _shadow_prealloc(d, shadow_size(type) * count); -+ bool ret = _shadow_prealloc(d, shadow_size(type) * count); -+ -+ if ( !ret && !d->is_dying && -+ (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) -+ /* -+ * Failing to allocate memory required for shadow usage can only result in -+ * a domain crash, do it here rather that relying on every caller to do it. -+ */ -+ domain_crash(d); -+ -+ return ret; - } - - /* Deliberately free all the memory we can: this will tear down all of -@@ -1218,7 +1236,7 @@ void shadow_free(struct domain *d, mfn_t - static struct page_info * - shadow_alloc_p2m_page(struct domain *d) - { -- struct page_info *pg; -+ struct page_info *pg = NULL; - - /* This is called both from the p2m code (which never holds the - * paging lock) and the log-dirty code (which always does). */ -@@ -1236,16 +1254,18 @@ shadow_alloc_p2m_page(struct domain *d) - d->arch.paging.shadow.p2m_pages, - shadow_min_acceptable_pages(d)); - } -- paging_unlock(d); -- return NULL; -+ goto out; - } - -- shadow_prealloc(d, SH_type_p2m_table, 1); -+ if ( !shadow_prealloc(d, SH_type_p2m_table, 1) ) -+ goto out; -+ - pg = mfn_to_page(shadow_alloc(d, SH_type_p2m_table, 0)); - d->arch.paging.shadow.p2m_pages++; - d->arch.paging.shadow.total_pages--; - ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask)); - -+ out: - paging_unlock(d); - - return pg; -@@ -1336,7 +1356,9 @@ int shadow_set_allocation(struct domain - else if ( d->arch.paging.shadow.total_pages > pages ) - { - /* Need to return memory to domheap */ -- _shadow_prealloc(d, 1); -+ if ( !_shadow_prealloc(d, 1) ) -+ return -ENOMEM; -+ - sp = page_list_remove_head(&d->arch.paging.shadow.freelist); - ASSERT(sp); - /* -@@ -2334,12 +2356,13 @@ static void sh_update_paging_modes(struc - if ( mfn_eq(v->arch.paging.shadow.oos_snapshot[0], 
INVALID_MFN) ) - { - int i; -+ -+ if ( !shadow_prealloc(d, SH_type_oos_snapshot, SHADOW_OOS_PAGES) ) -+ return; -+ - for(i = 0; i < SHADOW_OOS_PAGES; i++) -- { -- shadow_prealloc(d, SH_type_oos_snapshot, 1); - v->arch.paging.shadow.oos_snapshot[i] = - shadow_alloc(d, SH_type_oos_snapshot, 0); -- } - } - #endif /* OOS */ - -@@ -2403,6 +2426,9 @@ static void sh_update_paging_modes(struc - mfn_t mmfn = sh_make_monitor_table( - v, v->arch.paging.mode->shadow.shadow_levels); - -+ if ( mfn_eq(mmfn, INVALID_MFN) ) -+ return; -+ - v->arch.hvm.monitor_table = pagetable_from_mfn(mmfn); - make_cr3(v, mmfn); - hvm_update_host_cr3(v); -@@ -2441,6 +2467,12 @@ static void sh_update_paging_modes(struc - v->arch.hvm.monitor_table = pagetable_null(); - new_mfn = sh_make_monitor_table( - v, v->arch.paging.mode->shadow.shadow_levels); -+ if ( mfn_eq(new_mfn, INVALID_MFN) ) -+ { -+ sh_destroy_monitor_table(v, old_mfn, -+ old_mode->shadow.shadow_levels); -+ return; -+ } - v->arch.hvm.monitor_table = pagetable_from_mfn(new_mfn); - SHADOW_PRINTK("new monitor table %"PRI_mfn "\n", - mfn_x(new_mfn)); -@@ -2526,7 +2558,12 @@ void sh_set_toplevel_shadow(struct vcpu - if ( !mfn_valid(smfn) ) - { - /* Make sure there's enough free shadow memory. */ -- shadow_prealloc(d, root_type, 1); -+ if ( !shadow_prealloc(d, root_type, 1) ) -+ { -+ new_entry = pagetable_null(); -+ goto install_new_entry; -+ } -+ - /* Shadow the page. 
*/ - smfn = make_shadow(v, gmfn, root_type); - } ---- a/xen/arch/x86/mm/shadow/hvm.c -+++ b/xen/arch/x86/mm/shadow/hvm.c -@@ -700,7 +700,9 @@ mfn_t sh_make_monitor_table(const struct - ASSERT(!pagetable_get_pfn(v->arch.hvm.monitor_table)); - - /* Guarantee we can get the memory we need */ -- shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS); -+ if ( !shadow_prealloc(d, SH_type_monitor_table, CONFIG_PAGING_LEVELS) ) -+ return INVALID_MFN; -+ - m4mfn = shadow_alloc(d, SH_type_monitor_table, 0); - mfn_to_page(m4mfn)->shadow_flags = 4; - ---- a/xen/arch/x86/mm/shadow/multi.c -+++ b/xen/arch/x86/mm/shadow/multi.c -@@ -2440,9 +2440,14 @@ static int sh_page_fault(struct vcpu *v, - * Preallocate shadow pages *before* removing writable accesses - * otherwhise an OOS L1 might be demoted and promoted again with - * writable mappings. */ -- shadow_prealloc(d, -- SH_type_l1_shadow, -- GUEST_PAGING_LEVELS < 4 ? 1 : GUEST_PAGING_LEVELS - 1); -+ if ( !shadow_prealloc(d, SH_type_l1_shadow, -+ GUEST_PAGING_LEVELS < 4 -+ ? 
1 : GUEST_PAGING_LEVELS - 1) ) -+ { -+ paging_unlock(d); -+ put_gfn(d, gfn_x(gfn)); -+ return 0; -+ } - - rc = gw_remove_write_accesses(v, va, &gw); - ---- a/xen/arch/x86/mm/shadow/private.h -+++ b/xen/arch/x86/mm/shadow/private.h -@@ -383,7 +383,8 @@ void shadow_promote(struct domain *d, mf - void shadow_demote(struct domain *d, mfn_t gmfn, u32 type); - - /* Shadow page allocation functions */ --void shadow_prealloc(struct domain *d, u32 shadow_type, unsigned int count); -+bool __must_check shadow_prealloc(struct domain *d, unsigned int shadow_type, -+ unsigned int count); - mfn_t shadow_alloc(struct domain *d, - u32 shadow_type, - unsigned long backpointer); diff --git a/xsa410-4.16-07.patch b/xsa410-4.16-07.patch deleted file mode 100644 index 9eea91c..0000000 --- a/xsa410-4.16-07.patch +++ /dev/null @@ -1,82 +0,0 @@ -From: Roger Pau Monné -Subject: x86/p2m: refuse new allocations for dying domains - -This will in particular prevent any attempts to add entries to the p2m, -once - in a subsequent change - non-root entries have been removed. - -This is part of CVE-2022-33746 / XSA-410. 
- -Signed-off-by: Roger Pau Monné -Signed-off-by: Jan Beulich -Acked-by: Tim Deegan - ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -245,6 +245,9 @@ static struct page_info *hap_alloc(struc - - ASSERT(paging_locked_by_me(d)); - -+ if ( unlikely(d->is_dying) ) -+ return NULL; -+ - pg = page_list_remove_head(&d->arch.paging.hap.freelist); - if ( unlikely(!pg) ) - return NULL; -@@ -281,7 +284,7 @@ static struct page_info *hap_alloc_p2m_p - d->arch.paging.hap.p2m_pages++; - ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask)); - } -- else if ( !d->arch.paging.p2m_alloc_failed ) -+ else if ( !d->arch.paging.p2m_alloc_failed && !d->is_dying ) - { - d->arch.paging.p2m_alloc_failed = 1; - dprintk(XENLOG_ERR, "d%i failed to allocate from HAP pool\n", ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -939,6 +939,10 @@ static bool __must_check _shadow_preallo - if ( d->arch.paging.shadow.free_pages >= pages ) - return true; - -+ if ( unlikely(d->is_dying) ) -+ /* No reclaim when the domain is dying, teardown will take care of it. */ -+ return false; -+ - /* Shouldn't have enabled shadows if we've no vcpus. */ - ASSERT(d->vcpu && d->vcpu[0]); - -@@ -991,7 +995,7 @@ static bool __must_check _shadow_preallo - d->arch.paging.shadow.free_pages, - d->arch.paging.shadow.p2m_pages); - -- ASSERT(d->is_dying); -+ ASSERT_UNREACHABLE(); - - guest_flush_tlb_mask(d, d->dirty_cpumask); - -@@ -1005,10 +1009,13 @@ static bool __must_check _shadow_preallo - * to avoid freeing shadows that the caller is currently working on. 
*/ - bool shadow_prealloc(struct domain *d, unsigned int type, unsigned int count) - { -- bool ret = _shadow_prealloc(d, shadow_size(type) * count); -+ bool ret; -+ -+ if ( unlikely(d->is_dying) ) -+ return false; - -- if ( !ret && !d->is_dying && -- (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) -+ ret = _shadow_prealloc(d, shadow_size(type) * count); -+ if ( !ret && (!d->is_shutting_down || d->shutdown_code != SHUTDOWN_crash) ) - /* - * Failing to allocate memory required for shadow usage can only result in - * a domain crash, do it here rather that relying on every caller to do it. -@@ -1238,6 +1245,9 @@ shadow_alloc_p2m_page(struct domain *d) - { - struct page_info *pg = NULL; - -+ if ( unlikely(d->is_dying) ) -+ return NULL; -+ - /* This is called both from the p2m code (which never holds the - * paging lock) and the log-dirty code (which always does). */ - paging_lock_recursive(d); diff --git a/xsa410-4.16-08.patch b/xsa410-4.16-08.patch deleted file mode 100644 index 92cdb49..0000000 --- a/xsa410-4.16-08.patch +++ /dev/null @@ -1,96 +0,0 @@ -From: Roger Pau Monné -Subject: x86/p2m: truly free paging pool memory for dying domains - -Modify {hap,shadow}_free to free the page immediately if the domain is -dying, so that pages don't accumulate in the pool when -{shadow,hap}_final_teardown() get called. This is to limit the amount of -work which needs to be done there (in a non-preemptable manner). - -Note the call to shadow_free() in shadow_free_p2m_page() is moved after -increasing total_pages, so that the decrease done in shadow_free() in -case the domain is dying doesn't underflow the counter, even if just for -a short interval. - -This is part of CVE-2022-33746 / XSA-410. 
- -Signed-off-by: Roger Pau Monné -Signed-off-by: Jan Beulich -Acked-by: Tim Deegan - ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -265,6 +265,18 @@ static void hap_free(struct domain *d, m - - ASSERT(paging_locked_by_me(d)); - -+ /* -+ * For dying domains, actually free the memory here. This way less work is -+ * left to hap_final_teardown(), which cannot easily have preemption checks -+ * added. -+ */ -+ if ( unlikely(d->is_dying) ) -+ { -+ free_domheap_page(pg); -+ d->arch.paging.hap.total_pages--; -+ return; -+ } -+ - d->arch.paging.hap.free_pages++; - page_list_add_tail(pg, &d->arch.paging.hap.freelist); - } ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -1187,6 +1187,7 @@ mfn_t shadow_alloc(struct domain *d, - void shadow_free(struct domain *d, mfn_t smfn) - { - struct page_info *next = NULL, *sp = mfn_to_page(smfn); -+ bool dying = ACCESS_ONCE(d->is_dying); - struct page_list_head *pin_list; - unsigned int pages; - u32 shadow_type; -@@ -1229,11 +1230,32 @@ void shadow_free(struct domain *d, mfn_t - * just before the allocator hands the page out again. */ - page_set_tlbflush_timestamp(sp); - perfc_decr(shadow_alloc_count); -- page_list_add_tail(sp, &d->arch.paging.shadow.freelist); -+ -+ /* -+ * For dying domains, actually free the memory here. This way less -+ * work is left to shadow_final_teardown(), which cannot easily have -+ * preemption checks added. -+ */ -+ if ( unlikely(dying) ) -+ { -+ /* -+ * The backpointer field (sh.back) used by shadow code aliases the -+ * domain owner field, unconditionally clear it here to avoid -+ * free_domheap_page() attempting to parse it. 
-+ */ -+ page_set_owner(sp, NULL); -+ free_domheap_page(sp); -+ } -+ else -+ page_list_add_tail(sp, &d->arch.paging.shadow.freelist); -+ - sp = next; - } - -- d->arch.paging.shadow.free_pages += pages; -+ if ( unlikely(dying) ) -+ d->arch.paging.shadow.total_pages -= pages; -+ else -+ d->arch.paging.shadow.free_pages += pages; - } - - /* Divert a page from the pool to be used by the p2m mapping. -@@ -1303,9 +1325,9 @@ shadow_free_p2m_page(struct domain *d, s - * paging lock) and the log-dirty code (which always does). */ - paging_lock_recursive(d); - -- shadow_free(d, page_to_mfn(pg)); - d->arch.paging.shadow.p2m_pages--; - d->arch.paging.shadow.total_pages++; -+ shadow_free(d, page_to_mfn(pg)); - - paging_unlock(d); - } diff --git a/xsa410-4.16-09.patch b/xsa410-4.16-09.patch deleted file mode 100644 index 60259e1..0000000 --- a/xsa410-4.16-09.patch +++ /dev/null @@ -1,159 +0,0 @@ -From: Roger Pau Monné -Subject: x86/p2m: free the paging memory pool preemptively - -The paging memory pool is currently freed in two different places: -from {shadow,hap}_teardown() via domain_relinquish_resources() and -from {shadow,hap}_final_teardown() via complete_domain_destroy(). -While the former does handle preemption, the later doesn't. - -Attempt to move as much p2m related freeing as possible to happen -before the call to {shadow,hap}_teardown(), so that most memory can be -freed in a preemptive way. In order to avoid causing issues to -existing callers leave the root p2m page tables set and free them in -{hap,shadow}_final_teardown(). Also modify {hap,shadow}_free to free -the page immediately if the domain is dying, so that pages don't -accumulate in the pool when {shadow,hap}_final_teardown() get called. - -Move altp2m_vcpu_disable_ve() to be done in hap_teardown(), as that's -the place where altp2m_active gets disabled now. - -This is part of CVE-2022-33746 / XSA-410. 
- -Signed-off-by: Roger Pau Monné -Signed-off-by: Jan Beulich -Acked-by: Tim Deegan - ---- a/xen/arch/x86/domain.c -+++ b/xen/arch/x86/domain.c -@@ -38,7 +38,6 @@ - #include - #include - #include --#include - #include - #include - #include -@@ -2381,12 +2380,6 @@ int domain_relinquish_resources(struct d - vpmu_destroy(v); - } - -- if ( altp2m_active(d) ) -- { -- for_each_vcpu ( d, v ) -- altp2m_vcpu_disable_ve(v); -- } -- - if ( is_pv_domain(d) ) - { - for_each_vcpu ( d, v ) ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -28,6 +28,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -546,24 +547,8 @@ void hap_final_teardown(struct domain *d - unsigned int i; - - if ( hvm_altp2m_supported() ) -- { -- d->arch.altp2m_active = 0; -- -- if ( d->arch.altp2m_eptp ) -- { -- free_xenheap_page(d->arch.altp2m_eptp); -- d->arch.altp2m_eptp = NULL; -- } -- -- if ( d->arch.altp2m_visible_eptp ) -- { -- free_xenheap_page(d->arch.altp2m_visible_eptp); -- d->arch.altp2m_visible_eptp = NULL; -- } -- - for ( i = 0; i < MAX_ALTP2M; i++ ) - p2m_teardown(d->arch.altp2m_p2m[i], true); -- } - - /* Destroy nestedp2m's first */ - for (i = 0; i < MAX_NESTEDP2M; i++) { -@@ -578,6 +563,8 @@ void hap_final_teardown(struct domain *d - paging_lock(d); - hap_set_allocation(d, 0, NULL); - ASSERT(d->arch.paging.hap.p2m_pages == 0); -+ ASSERT(d->arch.paging.hap.free_pages == 0); -+ ASSERT(d->arch.paging.hap.total_pages == 0); - paging_unlock(d); - } - -@@ -603,6 +590,7 @@ void hap_vcpu_teardown(struct vcpu *v) - void hap_teardown(struct domain *d, bool *preempted) - { - struct vcpu *v; -+ unsigned int i; - - ASSERT(d->is_dying); - ASSERT(d != current->domain); -@@ -611,6 +599,28 @@ void hap_teardown(struct domain *d, bool - for_each_vcpu ( d, v ) - hap_vcpu_teardown(v); - -+ /* Leave the root pt in case we get further attempts to modify the p2m. 
*/ -+ if ( hvm_altp2m_supported() ) -+ { -+ if ( altp2m_active(d) ) -+ for_each_vcpu ( d, v ) -+ altp2m_vcpu_disable_ve(v); -+ -+ d->arch.altp2m_active = 0; -+ -+ FREE_XENHEAP_PAGE(d->arch.altp2m_eptp); -+ FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp); -+ -+ for ( i = 0; i < MAX_ALTP2M; i++ ) -+ p2m_teardown(d->arch.altp2m_p2m[i], false); -+ } -+ -+ /* Destroy nestedp2m's after altp2m. */ -+ for ( i = 0; i < MAX_NESTEDP2M; i++ ) -+ p2m_teardown(d->arch.nested_p2m[i], false); -+ -+ p2m_teardown(p2m_get_hostp2m(d), false); -+ - paging_lock(d); /* Keep various asserts happy */ - - if ( d->arch.paging.hap.total_pages != 0 ) ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -2824,8 +2824,17 @@ void shadow_teardown(struct domain *d, b - for_each_vcpu ( d, v ) - shadow_vcpu_teardown(v); - -+ p2m_teardown(p2m_get_hostp2m(d), false); -+ - paging_lock(d); - -+ /* -+ * Reclaim all shadow memory so that shadow_set_allocation() doesn't find -+ * in-use pages, as _shadow_prealloc() will no longer try to reclaim pages -+ * because the domain is dying. -+ */ -+ shadow_blow_tables(d); -+ - #if (SHADOW_OPTIMIZATIONS & (SHOPT_VIRTUAL_TLB|SHOPT_OUT_OF_SYNC)) - /* Free the virtual-TLB array attached to each vcpu */ - for_each_vcpu(d, v) -@@ -2946,6 +2955,9 @@ void shadow_final_teardown(struct domain - d->arch.paging.shadow.total_pages, - d->arch.paging.shadow.free_pages, - d->arch.paging.shadow.p2m_pages); -+ ASSERT(!d->arch.paging.shadow.total_pages); -+ ASSERT(!d->arch.paging.shadow.free_pages); -+ ASSERT(!d->arch.paging.shadow.p2m_pages); - paging_unlock(d); - } - diff --git a/xsa410-4.16-10.patch b/xsa410-4.16-10.patch deleted file mode 100644 index 258b7ff..0000000 --- a/xsa410-4.16-10.patch +++ /dev/null @@ -1,171 +0,0 @@ -From: Julien Grall -Subject: xen/x86: p2m: Add preemption in p2m_teardown() - -The list p2m->pages contain all the pages used by the P2M. 
On large -instance this can be quite large and the time spent to call -d->arch.paging.free_page() will take more than 1ms for a 80GB guest -on a Xen running in nested environment on a c5.metal. - -By extrapolation, it would take > 100ms for a 8TB guest (what we -current security support). So add some preemption in p2m_teardown() -and propagate to the callers. Note there are 3 places where -the preemption is not enabled: - - hap_final_teardown()/shadow_final_teardown(): We are - preventing update the P2M once the domain is dying (so - no more pages could be allocated) and most of the P2M pages - will be freed in preemptive manneer when relinquishing the - resources. So this is fine to disable preemption. - - shadow_enable(): This is fine because it will undo the allocation - that may have been made by p2m_alloc_table() (so only the root - page table). - -The preemption is arbitrarily checked every 1024 iterations. - -Note that with the current approach, Xen doesn't keep track on whether -the alt/nested P2Ms have been cleared. So there are some redundant work. -However, this is not expected to incurr too much overhead (the P2M lock -shouldn't be contended during teardown). So this is optimization is -left outside of the security event. - -This is part of CVE-2022-33746 / XSA-410. - -Signed-off-by: Julien Grall -Signed-off-by: Jan Beulich - ---- a/xen/include/asm-x86/p2m.h -+++ b/xen/include/asm-x86/p2m.h -@@ -574,7 +574,7 @@ int p2m_init(struct domain *d); - int p2m_alloc_table(struct p2m_domain *p2m); - - /* Return all the p2m resources to Xen. 
*/ --void p2m_teardown(struct p2m_domain *p2m, bool remove_root); -+void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted); - void p2m_final_teardown(struct domain *d); - - /* Add a page to a domain's p2m table */ ---- a/xen/arch/x86/mm/hap/hap.c -+++ b/xen/arch/x86/mm/hap/hap.c -@@ -548,17 +548,17 @@ void hap_final_teardown(struct domain *d - - if ( hvm_altp2m_supported() ) - for ( i = 0; i < MAX_ALTP2M; i++ ) -- p2m_teardown(d->arch.altp2m_p2m[i], true); -+ p2m_teardown(d->arch.altp2m_p2m[i], true, NULL); - - /* Destroy nestedp2m's first */ - for (i = 0; i < MAX_NESTEDP2M; i++) { -- p2m_teardown(d->arch.nested_p2m[i], true); -+ p2m_teardown(d->arch.nested_p2m[i], true, NULL); - } - - if ( d->arch.paging.hap.total_pages != 0 ) - hap_teardown(d, NULL); - -- p2m_teardown(p2m_get_hostp2m(d), true); -+ p2m_teardown(p2m_get_hostp2m(d), true, NULL); - /* Free any memory that the p2m teardown released */ - paging_lock(d); - hap_set_allocation(d, 0, NULL); -@@ -612,14 +612,24 @@ void hap_teardown(struct domain *d, bool - FREE_XENHEAP_PAGE(d->arch.altp2m_visible_eptp); - - for ( i = 0; i < MAX_ALTP2M; i++ ) -- p2m_teardown(d->arch.altp2m_p2m[i], false); -+ { -+ p2m_teardown(d->arch.altp2m_p2m[i], false, preempted); -+ if ( preempted && *preempted ) -+ return; -+ } - } - - /* Destroy nestedp2m's after altp2m. 
*/ - for ( i = 0; i < MAX_NESTEDP2M; i++ ) -- p2m_teardown(d->arch.nested_p2m[i], false); -+ { -+ p2m_teardown(d->arch.nested_p2m[i], false, preempted); -+ if ( preempted && *preempted ) -+ return; -+ } - -- p2m_teardown(p2m_get_hostp2m(d), false); -+ p2m_teardown(p2m_get_hostp2m(d), false, preempted); -+ if ( preempted && *preempted ) -+ return; - - paging_lock(d); /* Keep various asserts happy */ - ---- a/xen/arch/x86/mm/p2m.c -+++ b/xen/arch/x86/mm/p2m.c -@@ -749,12 +749,13 @@ int p2m_alloc_table(struct p2m_domain *p - * hvm fixme: when adding support for pvh non-hardware domains, this path must - * cleanup any foreign p2m types (release refcnts on them). - */ --void p2m_teardown(struct p2m_domain *p2m, bool remove_root) -+void p2m_teardown(struct p2m_domain *p2m, bool remove_root, bool *preempted) - /* Return all the p2m pages to Xen. - * We know we don't have any extra mappings to these pages */ - { - struct page_info *pg, *root_pg = NULL; - struct domain *d; -+ unsigned int i = 0; - - if (p2m == NULL) - return; -@@ -773,8 +774,19 @@ void p2m_teardown(struct p2m_domain *p2m - } - - while ( (pg = page_list_remove_head(&p2m->pages)) ) -- if ( pg != root_pg ) -- d->arch.paging.free_page(d, pg); -+ { -+ if ( pg == root_pg ) -+ continue; -+ -+ d->arch.paging.free_page(d, pg); -+ -+ /* Arbitrarily check preemption every 1024 iterations */ -+ if ( preempted && !(++i % 1024) && general_preempt_check() ) -+ { -+ *preempted = true; -+ break; -+ } -+ } - - if ( root_pg ) - page_list_add(root_pg, &p2m->pages); ---- a/xen/arch/x86/mm/shadow/common.c -+++ b/xen/arch/x86/mm/shadow/common.c -@@ -2770,8 +2770,12 @@ int shadow_enable(struct domain *d, u32 - out_locked: - paging_unlock(d); - out_unlocked: -+ /* -+ * This is fine to ignore the preemption here because only the root -+ * will be allocated by p2m_alloc_table(). 
-+ */ - if ( rv != 0 && !pagetable_is_null(p2m_get_pagetable(p2m)) ) -- p2m_teardown(p2m, true); -+ p2m_teardown(p2m, true, NULL); - if ( rv != 0 && pg != NULL ) - { - pg->count_info &= ~PGC_count_mask; -@@ -2824,7 +2828,9 @@ void shadow_teardown(struct domain *d, b - for_each_vcpu ( d, v ) - shadow_vcpu_teardown(v); - -- p2m_teardown(p2m_get_hostp2m(d), false); -+ p2m_teardown(p2m_get_hostp2m(d), false, preempted); -+ if ( preempted && *preempted ) -+ return; - - paging_lock(d); - -@@ -2945,7 +2951,7 @@ void shadow_final_teardown(struct domain - shadow_teardown(d, NULL); - - /* It is now safe to pull down the p2m map. */ -- p2m_teardown(p2m_get_hostp2m(d), true); -+ p2m_teardown(p2m_get_hostp2m(d), true, NULL); - /* Free any shadow memory that the p2m teardown released */ - paging_lock(d); - shadow_set_allocation(d, 0, NULL); diff --git a/xsa411.patch b/xsa411.patch deleted file mode 100644 index 50dcae4..0000000 --- a/xsa411.patch +++ /dev/null @@ -1,55 +0,0 @@ -From: Jan Beulich -Subject: gnttab: correct locking on transitive grant copy error path - -While the comment next to the lock dropping in preparation of -recursively calling acquire_grant_for_copy() mistakenly talks about the -rd == td case (excluded a few lines further up), the same concerns apply -to the calling of release_grant_for_copy() on a subsequent error path. - -This is CVE-2022-33748 / XSA-411. - -Fixes: ad48fb963dbf ("gnttab: fix transitive grant handling") -Signed-off-by: Jan Beulich ---- -v2: Extend code comment. - ---- a/xen/common/grant_table.c -+++ b/xen/common/grant_table.c -@@ -2622,9 +2622,8 @@ acquire_grant_for_copy( - trans_domid); - - /* -- * acquire_grant_for_copy() could take the lock on the -- * remote table (if rd == td), so we have to drop the lock -- * here and reacquire. -+ * acquire_grant_for_copy() will take the lock on the remote table, -+ * so we have to drop the lock here and reacquire. 
- */ - active_entry_release(act); - grant_read_unlock(rgt); -@@ -2661,11 +2660,25 @@ acquire_grant_for_copy( - act->trans_gref != trans_gref || - !act->is_sub_page)) ) - { -+ /* -+ * Like above for acquire_grant_for_copy() we need to drop and then -+ * re-acquire the locks here to prevent lock order inversion issues. -+ * Unlike for acquire_grant_for_copy() we don't need to re-check -+ * anything, as release_grant_for_copy() doesn't depend on the grant -+ * table entry: It only updates internal state and the status flags. -+ */ -+ active_entry_release(act); -+ grant_read_unlock(rgt); -+ - release_grant_for_copy(td, trans_gref, readonly); - rcu_unlock_domain(td); -+ -+ grant_read_lock(rgt); -+ act = active_entry_acquire(rgt, gref); - reduce_status_for_pin(rd, act, status, readonly); - active_entry_release(act); - grant_read_unlock(rgt); -+ - put_page(*page); - *page = NULL; - return ERESTART; diff --git a/xsa412-4.16.patch b/xsa412-4.16.patch deleted file mode 100644 index f37fc21..0000000 --- a/xsa412-4.16.patch +++ /dev/null @@ -1,245 +0,0 @@ -From: Andrew Cooper -Subject: x86/vmx: Revert "VMX: use a single, global APIC access page" - -The claim "No accesses would ever go to this page." is false. A consequence -of how Intel's APIC Acceleration works, and Xen's choice to have per-domain -P2Ms (rather than per-vCPU P2Ms) means that the APIC page is fully read-write -to any vCPU which is not in xAPIC mode. - -This reverts commit 58850b9074d3e7affdf3bc94c84e417ecfa4d165. - -This is XSA-412 / CVE-2022-42327. 
- -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich - -diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c -index d429d76c18c9..3f4276531322 100644 ---- a/xen/arch/x86/hvm/vmx/vmx.c -+++ b/xen/arch/x86/hvm/vmx/vmx.c -@@ -66,7 +66,8 @@ boolean_param("force-ept", opt_force_ept); - static void vmx_ctxt_switch_from(struct vcpu *v); - static void vmx_ctxt_switch_to(struct vcpu *v); - --static int alloc_vlapic_mapping(void); -+static int vmx_alloc_vlapic_mapping(struct domain *d); -+static void vmx_free_vlapic_mapping(struct domain *d); - static void vmx_install_vlapic_mapping(struct vcpu *v); - static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr, - unsigned int flags); -@@ -77,8 +78,6 @@ static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content); - static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content); - static void vmx_invlpg(struct vcpu *v, unsigned long linear); - --static mfn_t __read_mostly apic_access_mfn = INVALID_MFN_INITIALIZER; -- - /* Values for domain's ->arch.hvm_domain.pi_ops.flags. 
*/ - #define PI_CSW_FROM (1u << 0) - #define PI_CSW_TO (1u << 1) -@@ -402,6 +401,7 @@ static int vmx_domain_initialise(struct domain *d) - .to = vmx_ctxt_switch_to, - .tail = vmx_do_resume, - }; -+ int rc; - - d->arch.ctxt_switch = &csw; - -@@ -411,15 +411,24 @@ static int vmx_domain_initialise(struct domain *d) - */ - d->arch.hvm.vmx.exec_sp = is_hardware_domain(d) || opt_ept_exec_sp; - -+ if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 ) -+ return rc; -+ - return 0; - } - -+static void vmx_domain_relinquish_resources(struct domain *d) -+{ -+ vmx_free_vlapic_mapping(d); -+} -+ - static void domain_creation_finished(struct domain *d) - { - gfn_t gfn = gaddr_to_gfn(APIC_DEFAULT_PHYS_BASE); -+ mfn_t apic_access_mfn = d->arch.hvm.vmx.apic_access_mfn; - bool ipat; - -- if ( !has_vlapic(d) || mfn_eq(apic_access_mfn, INVALID_MFN) ) -+ if ( mfn_eq(apic_access_mfn, _mfn(0)) ) - return; - - ASSERT(epte_get_entry_emt(d, gfn, apic_access_mfn, 0, &ipat, -@@ -2481,6 +2490,7 @@ static struct hvm_function_table __initdata vmx_function_table = { - .cpu_up_prepare = vmx_cpu_up_prepare, - .cpu_dead = vmx_cpu_dead, - .domain_initialise = vmx_domain_initialise, -+ .domain_relinquish_resources = vmx_domain_relinquish_resources, - .domain_creation_finished = domain_creation_finished, - .vcpu_initialise = vmx_vcpu_initialise, - .vcpu_destroy = vmx_vcpu_destroy, -@@ -2731,7 +2741,7 @@ const struct hvm_function_table * __init start_vmx(void) - { - set_in_cr4(X86_CR4_VMXE); - -- if ( vmx_vmcs_init() || alloc_vlapic_mapping() ) -+ if ( vmx_vmcs_init() ) - { - printk("VMX: failed to initialise.\n"); - return NULL; -@@ -3305,36 +3315,55 @@ static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content) - return X86EMUL_EXCEPTION; - } - --static int __init alloc_vlapic_mapping(void) -+static int vmx_alloc_vlapic_mapping(struct domain *d) - { - struct page_info *pg; - mfn_t mfn; - -- if ( !cpu_has_vmx_virtualize_apic_accesses ) -+ if ( !has_vlapic(d) || 
!cpu_has_vmx_virtualize_apic_accesses ) - return 0; - -- pg = alloc_domheap_page(NULL, 0); -+ pg = alloc_domheap_page(d, MEMF_no_refcount); - if ( !pg ) - return -ENOMEM; - -- /* -- * Signal to shadow code that this page cannot be refcounted. This also -- * makes epte_get_entry_emt() recognize this page as "special". -- */ -- page_suppress_refcounting(pg); -+ if ( !get_page_and_type(pg, d, PGT_writable_page) ) -+ { -+ /* -+ * The domain can't possibly know about this page yet, so failure -+ * here is a clear indication of something fishy going on. -+ */ -+ domain_crash(d); -+ return -ENODATA; -+ } - - mfn = page_to_mfn(pg); - clear_domain_page(mfn); -- apic_access_mfn = mfn; -+ d->arch.hvm.vmx.apic_access_mfn = mfn; - - return 0; - } - -+static void vmx_free_vlapic_mapping(struct domain *d) -+{ -+ mfn_t mfn = d->arch.hvm.vmx.apic_access_mfn; -+ -+ d->arch.hvm.vmx.apic_access_mfn = _mfn(0); -+ if ( !mfn_eq(mfn, _mfn(0)) ) -+ { -+ struct page_info *pg = mfn_to_page(mfn); -+ -+ put_page_alloc_ref(pg); -+ put_page_and_type(pg); -+ } -+} -+ - static void vmx_install_vlapic_mapping(struct vcpu *v) - { -+ mfn_t apic_access_mfn = v->domain->arch.hvm.vmx.apic_access_mfn; - paddr_t virt_page_ma, apic_page_ma; - -- if ( !has_vlapic(v->domain) || mfn_eq(apic_access_mfn, INVALID_MFN) ) -+ if ( mfn_eq(apic_access_mfn, _mfn(0)) ) - return; - - ASSERT(cpu_has_vmx_virtualize_apic_accesses); -diff --git a/xen/arch/x86/mm/shadow/set.c b/xen/arch/x86/mm/shadow/set.c -index 87e9c6eeb219..bd6c68b547c9 100644 ---- a/xen/arch/x86/mm/shadow/set.c -+++ b/xen/arch/x86/mm/shadow/set.c -@@ -101,14 +101,6 @@ shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d, p2m_type_t type) - owner = page_get_owner(pg); - } - -- /* -- * Check whether refcounting is suppressed on this page. For example, -- * VMX'es APIC access MFN is just a surrogate page. It doesn't actually -- * get accessed, and hence there's no need to refcount it. 
-- */ -- if ( pg && page_refcounting_suppressed(pg) ) -- return 0; -- - if ( owner == dom_io ) - owner = NULL; - -diff --git a/xen/arch/x86/mm/shadow/types.h b/xen/arch/x86/mm/shadow/types.h -index 6970e7d6ea4a..814a4018535a 100644 ---- a/xen/arch/x86/mm/shadow/types.h -+++ b/xen/arch/x86/mm/shadow/types.h -@@ -276,16 +276,9 @@ int shadow_set_l4e(struct domain *d, shadow_l4e_t *sl4e, - static void inline - shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d) - { -- mfn_t mfn = shadow_l1e_get_mfn(sl1e); -- - if ( !shadow_mode_refcounts(d) ) - return; - -- if ( mfn_valid(mfn) && -- /* See the respective comment in shadow_get_page_from_l1e(). */ -- page_refcounting_suppressed(mfn_to_page(mfn)) ) -- return; -- - put_page_from_l1e(sl1e, d); - } - -diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h -index 03c9ccf627ab..8073af323b96 100644 ---- a/xen/include/asm-x86/hvm/vmx/vmcs.h -+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h -@@ -58,6 +58,7 @@ struct ept_data { - #define _VMX_DOMAIN_PML_ENABLED 0 - #define VMX_DOMAIN_PML_ENABLED (1ul << _VMX_DOMAIN_PML_ENABLED) - struct vmx_domain { -+ mfn_t apic_access_mfn; - /* VMX_DOMAIN_* */ - unsigned int status; - -diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h -index 7bdf9c2290d8..e1bcea57a8f5 100644 ---- a/xen/include/asm-x86/mm.h -+++ b/xen/include/asm-x86/mm.h -@@ -83,7 +83,7 @@ - #define PGC_state_offlined PG_mask(2, 6) - #define PGC_state_free PG_mask(3, 6) - #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st) --/* Page is not reference counted (see below for caveats) */ -+/* Page is not reference counted */ - #define _PGC_extra PG_shift(7) - #define PGC_extra PG_mask(1, 7) - -@@ -375,24 +375,6 @@ void zap_ro_mpt(mfn_t mfn); - - bool is_iomem_page(mfn_t mfn); - --/* -- * Pages with no owner which may get passed to functions wanting to -- * refcount them can be marked PGC_extra to bypass this refcounting (which -- * would fail due to the lack 
of an owner). -- * -- * (For pages with owner PGC_extra has different meaning.) -- */ --static inline void page_suppress_refcounting(struct page_info *pg) --{ -- ASSERT(!page_get_owner(pg)); -- pg->count_info |= PGC_extra; --} -- --static inline bool page_refcounting_suppressed(const struct page_info *pg) --{ -- return !page_get_owner(pg) && (pg->count_info & PGC_extra); --} -- - struct platform_bad_page { - unsigned long mfn; - unsigned int order; diff --git a/xsa414.patch b/xsa414.patch deleted file mode 100644 index 27ab0c2..0000000 --- a/xsa414.patch +++ /dev/null @@ -1,112 +0,0 @@ -From: Julien Grall -Subject: tools/xenstore: create_node: Don't defer work to undo any changes on - failure - -XSA-115 extended destroy_node() to update the node accounting for the -connection. The implementation is assuming the connection is the parent -of the node, however all the nodes are allocated using a separate context -(see process_message()). This will result to crash (or corrupt) xenstored -as the pointer is wrongly used. - -In case of an error, any changes to the database or update to the -accounting will now be reverted in create_node() by calling directly -destroy_node(). This has the nice advantage to remove the loop to unset -the destructors in case of success. - -Take the opportunity to free the nodes right now as they are not -going to be reachable (the function returns NULL) and are just wasting -resources. - -This is XSA-414 / CVE-2022-42309. 
- -Reported-by: Julien Grall -Fixes: 0bfb2101f243 ("tools/xenstore: fix node accounting after failed node creation") -Signed-off-by: Julien Grall -Reviewed-by: Juergen Gross - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 8867f93431d4..c30d14cbf2ab 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1084,9 +1084,8 @@ static struct node *construct_node(struct connection *conn, const void *ctx, - return NULL; - } - --static int destroy_node(void *_node) -+static int destroy_node(struct connection *conn, struct node *node) - { -- struct node *node = _node; - TDB_DATA key; - - if (streq(node->name, "/")) -@@ -1095,7 +1094,7 @@ static int destroy_node(void *_node) - set_tdb_key(node->name, &key); - tdb_delete(tdb_ctx, key); - -- domain_entry_dec(talloc_parent(node), node); -+ domain_entry_dec(conn, node); - - return 0; - } -@@ -1104,7 +1103,8 @@ static struct node *create_node(struct connection *conn, const void *ctx, - const char *name, - void *data, unsigned int datalen) - { -- struct node *node, *i; -+ struct node *node, *i, *j; -+ int ret; - - node = construct_node(conn, ctx, name); - if (!node) -@@ -1126,23 +1126,40 @@ static struct node *create_node(struct connection *conn, const void *ctx, - /* i->parent is set for each new node, so check quota. */ - if (i->parent && - domain_entry(conn) >= quota_nb_entry_per_domain) { -- errno = ENOSPC; -- return NULL; -+ ret = ENOSPC; -+ goto err; - } -- if (write_node(conn, i, false)) -- return NULL; - -- /* Account for new node, set destructor for error case. 
*/ -- if (i->parent) { -+ ret = write_node(conn, i, false); -+ if (ret) -+ goto err; -+ -+ /* Account for new node */ -+ if (i->parent) - domain_entry_inc(conn, i); -- talloc_set_destructor(i, destroy_node); -- } - } - -- /* OK, now remove destructors so they stay around */ -- for (i = node; i->parent; i = i->parent) -- talloc_set_destructor(i, NULL); - return node; -+ -+err: -+ /* -+ * We failed to update TDB for some of the nodes. Undo any work that -+ * have already been done. -+ */ -+ for (j = node; j != i; j = j->parent) -+ destroy_node(conn, j); -+ -+ /* We don't need to keep the nodes around, so free them. */ -+ i = node; -+ while (i) { -+ j = i; -+ i = i->parent; -+ talloc_free(j); -+ } -+ -+ errno = ret; -+ -+ return NULL; - } - - /* path, data... */ diff --git a/xsa415.patch b/xsa415.patch deleted file mode 100644 index b6f6971..0000000 --- a/xsa415.patch +++ /dev/null @@ -1,134 +0,0 @@ -From: Julien Grall -Subject: tools/xenstore: Fail a transaction if it is not possible to create a - node - -Commit f2bebf72c4d5 "xenstore: rework of transaction handling" moved -out from copying the entire database everytime a new transaction is -opened to track the list of nodes changed. - -The content of all the nodes accessed during a transaction will be -temporarily stored in TDB using a different key. - -The function create_node() may write/update multiple nodes if the child -doesn't exist. In case of a failure, the function will revert any -changes (this include any update to TDB). Unfortunately, the function -which reverts the changes (i.e. destroy_node()) will not use the correct -key to delete any update or even request the transaction to fail. - -This means that if a client decide to go ahead with committing the -transaction, orphan nodes will be created because they were not linked -to an existing node (create_node() will write the nodes backwards). - -Once some nodes have been partially updated in a transaction, it is not -easily possible to undo any changes. 
So rather than continuing and hit -weird issue while committing, it is much saner to fail the transaction. - -This will have an impact on any client that decides to commit even if it -can't write a node. Although, it is not clear why a normal client would -want to do that... - -Lastly, update destroy_node() to use the correct key for deleting the -node. Rather than recreating it (this will allocate memory and -therefore fail), stash the key in the structure node. - -This is XSA-415 / CVE-2022-42310. - -Reported-by: Julien Grall -Signed-off-by: Julien Grall -Reviewed-by: Juergen Gross - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index c30d14cbf2ab..55b79e4c032e 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -562,15 +562,17 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - return 0; - } - -+/* -+ * Write the node. If the node is written, caller can find the key used in -+ * node->key. This can later be used if the change needs to be reverted. -+ */ - static int write_node(struct connection *conn, struct node *node, - bool no_quota_check) - { -- TDB_DATA key; -- -- if (access_node(conn, node, NODE_ACCESS_WRITE, &key)) -+ if (access_node(conn, node, NODE_ACCESS_WRITE, &node->key)) - return errno; - -- return write_node_raw(conn, &key, node, no_quota_check); -+ return write_node_raw(conn, &node->key, node, no_quota_check); - } - - unsigned int perm_for_conn(struct connection *conn, -@@ -1086,16 +1088,21 @@ static struct node *construct_node(struct connection *conn, const void *ctx, - - static int destroy_node(struct connection *conn, struct node *node) - { -- TDB_DATA key; -- - if (streq(node->name, "/")) - corrupt(NULL, "Destroying root node!"); - -- set_tdb_key(node->name, &key); -- tdb_delete(tdb_ctx, key); -+ tdb_delete(tdb_ctx, node->key); - - domain_entry_dec(conn, node); - -+ /* -+ * It is not possible to easily revert the changes in a transaction. 
-+ * So if the failure happens in a transaction, mark it as fail to -+ * prevent any commit. -+ */ -+ if ( conn->transaction ) -+ fail_transaction(conn->transaction); -+ - return 0; - } - -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index 742812a97469..7d0fe77e7989 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -155,6 +155,8 @@ struct node_perms { - - struct node { - const char *name; -+ /* Key used to update TDB */ -+ TDB_DATA key; - - /* Parent (optional) */ - struct node *parent; -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index cd07fb0f218b..faf6c930e42a 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -580,6 +580,11 @@ void transaction_entry_dec(struct transaction *trans, unsigned int domid) - list_add_tail(&d->list, &trans->changed_domains); - } - -+void fail_transaction(struct transaction *trans) -+{ -+ trans->fail = true; -+} -+ - void conn_delete_all_transactions(struct connection *conn) - { - struct transaction *trans; -diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h -index 43a162bea3f3..14062730e3c9 100644 ---- a/tools/xenstore/xenstored_transaction.h -+++ b/tools/xenstore/xenstored_transaction.h -@@ -46,6 +46,9 @@ int access_node(struct connection *conn, struct node *node, - int transaction_prepend(struct connection *conn, const char *name, - TDB_DATA *key); - -+/* Mark the transaction as failed. This will prevent it to be committed. 
*/ -+void fail_transaction(struct transaction *trans); -+ - void conn_delete_all_transactions(struct connection *conn); - int check_transactions(struct hashtable *hash); - diff --git a/xsa416-4.16.patch b/xsa416-4.16.patch deleted file mode 100644 index 0df409a..0000000 --- a/xsa416-4.16.patch +++ /dev/null @@ -1,704 +0,0 @@ -From 80d128b14482d2e9342184d2d9949367851c4d14 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:10 +0200 -Subject: tools/xenstore: don't use conn->in as context for temporary - allocations - -Using the struct buffered data pointer of the current processed request -for temporary data allocations has a major drawback: the used area (and -with that the temporary data) is freed only after the response of the -request has been written to the ring page or has been read via the -socket. This can happen much later in case a guest isn't reading its -responses fast enough. - -As the temporary data can be safely freed after creating the response, -add a temporary context for that purpose and use that for allocating -the temporary memory, as it was already the case before commit -cc0612464896 ("xenstore: add small default data buffer to internal -struct"). - -Some sub-functions need to gain the "const" attribute for the talloc -context. - -This is XSA-416 / CVE-2022-42319. 
- -Reported-by: Julien Grall -Fixes: cc0612464896 ("xenstore: add small default data buffer to internal struct") -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_control.c b/tools/xenstore/xenstored_control.c -index 1031a81c3874..d0350c6ad861 100644 ---- a/tools/xenstore/xenstored_control.c -+++ b/tools/xenstore/xenstored_control.c -@@ -155,7 +155,7 @@ bool lu_is_pending(void) - - struct cmd_s { - char *cmd; -- int (*func)(void *, struct connection *, char **, int); -+ int (*func)(const void *, struct connection *, char **, int); - char *pars; - /* - * max_pars can be used to limit the size of the parameter vector, -@@ -167,7 +167,7 @@ struct cmd_s { - unsigned int max_pars; - }; - --static int do_control_check(void *ctx, struct connection *conn, -+static int do_control_check(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num) -@@ -179,7 +179,7 @@ static int do_control_check(void *ctx, struct connection *conn, - return 0; - } - --static int do_control_log(void *ctx, struct connection *conn, -+static int do_control_log(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num != 1) -@@ -281,7 +281,7 @@ static int quota_get(const void *ctx, struct connection *conn, - return domain_get_quota(ctx, conn, atoi(vec[0])); - } - --static int do_control_quota(void *ctx, struct connection *conn, -+static int do_control_quota(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num == 0) -@@ -293,7 +293,7 @@ static int do_control_quota(void *ctx, struct connection *conn, - return quota_get(ctx, conn, vec, num); - } - --static int do_control_quota_s(void *ctx, struct connection *conn, -+static int do_control_quota_s(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num == 0) -@@ -306,7 +306,7 @@ static int do_control_quota_s(void *ctx, struct connection *conn, - } - - #ifdef __MINIOS__ --static int do_control_memreport(void *ctx, 
struct connection *conn, -+static int do_control_memreport(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num) -@@ -318,7 +318,7 @@ static int do_control_memreport(void *ctx, struct connection *conn, - return 0; - } - #else --static int do_control_logfile(void *ctx, struct connection *conn, -+static int do_control_logfile(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num != 1) -@@ -333,7 +333,7 @@ static int do_control_logfile(void *ctx, struct connection *conn, - return 0; - } - --static int do_control_memreport(void *ctx, struct connection *conn, -+static int do_control_memreport(const void *ctx, struct connection *conn, - char **vec, int num) - { - FILE *fp; -@@ -373,7 +373,7 @@ static int do_control_memreport(void *ctx, struct connection *conn, - } - #endif - --static int do_control_print(void *ctx, struct connection *conn, -+static int do_control_print(const void *ctx, struct connection *conn, - char **vec, int num) - { - if (num != 1) -@@ -875,7 +875,7 @@ static const char *lu_start(const void *ctx, struct connection *conn, - return NULL; - } - --static int do_control_lu(void *ctx, struct connection *conn, -+static int do_control_lu(const void *ctx, struct connection *conn, - char **vec, int num) - { - const char *ret = NULL; -@@ -922,7 +922,7 @@ static int do_control_lu(void *ctx, struct connection *conn, - } - #endif - --static int do_control_help(void *, struct connection *, char **, int); -+static int do_control_help(const void *, struct connection *, char **, int); - - static struct cmd_s cmds[] = { - { "check", do_control_check, "" }, -@@ -961,7 +961,7 @@ static struct cmd_s cmds[] = { - { "help", do_control_help, "" }, - }; - --static int do_control_help(void *ctx, struct connection *conn, -+static int do_control_help(const void *ctx, struct connection *conn, - char **vec, int num) - { - int cmd, len = 0; -@@ -997,7 +997,8 @@ static int do_control_help(void *ctx, struct connection *conn, - return 
0; - } - --int do_control(struct connection *conn, struct buffered_data *in) -+int do_control(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - unsigned int cmd, num, off; - char **vec = NULL; -@@ -1017,11 +1018,11 @@ int do_control(struct connection *conn, struct buffered_data *in) - num = xs_count_strings(in->buffer, in->used); - if (cmds[cmd].max_pars) - num = min(num, cmds[cmd].max_pars); -- vec = talloc_array(in, char *, num); -+ vec = talloc_array(ctx, char *, num); - if (!vec) - return ENOMEM; - if (get_strings(in, vec, num) < num) - return EIO; - -- return cmds[cmd].func(in, conn, vec + 1, num - 1); -+ return cmds[cmd].func(ctx, conn, vec + 1, num - 1); - } -diff --git a/tools/xenstore/xenstored_control.h b/tools/xenstore/xenstored_control.h -index 98b6fbcea2b1..a8cb76559ba1 100644 ---- a/tools/xenstore/xenstored_control.h -+++ b/tools/xenstore/xenstored_control.h -@@ -16,7 +16,8 @@ - along with this program; If not, see . - */ - --int do_control(struct connection *conn, struct buffered_data *in); -+int do_control(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - void lu_read_state(void); - - struct connection *lu_get_connection(void); -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 16504de42017..411cc0e44714 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1248,11 +1248,13 @@ static struct node *get_node_canonicalized(struct connection *conn, - return get_node(conn, ctx, *canonical_name, perm); - } - --static int send_directory(struct connection *conn, struct buffered_data *in) -+static int send_directory(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct node *node; - -- node = get_node_canonicalized(conn, in, onearg(in), NULL, XS_PERM_READ); -+ node = get_node_canonicalized(conn, ctx, onearg(in), NULL, -+ XS_PERM_READ); - if (!node) - return errno; - -@@ -1261,7 +1263,7 @@ static int 
send_directory(struct connection *conn, struct buffered_data *in) - return 0; - } - --static int send_directory_part(struct connection *conn, -+static int send_directory_part(const void *ctx, struct connection *conn, - struct buffered_data *in) - { - unsigned int off, len, maxlen, genlen; -@@ -1273,7 +1275,8 @@ static int send_directory_part(struct connection *conn, - return EINVAL; - - /* First arg is node name. */ -- node = get_node_canonicalized(conn, in, in->buffer, NULL, XS_PERM_READ); -+ node = get_node_canonicalized(conn, ctx, in->buffer, NULL, -+ XS_PERM_READ); - if (!node) - return errno; - -@@ -1300,7 +1303,7 @@ static int send_directory_part(struct connection *conn, - break; - } - -- data = talloc_array(in, char, genlen + len + 1); -+ data = talloc_array(ctx, char, genlen + len + 1); - if (!data) - return ENOMEM; - -@@ -1316,11 +1319,13 @@ static int send_directory_part(struct connection *conn, - return 0; - } - --static int do_read(struct connection *conn, struct buffered_data *in) -+static int do_read(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct node *node; - -- node = get_node_canonicalized(conn, in, onearg(in), NULL, XS_PERM_READ); -+ node = get_node_canonicalized(conn, ctx, onearg(in), NULL, -+ XS_PERM_READ); - if (!node) - return errno; - -@@ -1510,7 +1515,8 @@ static struct node *create_node(struct connection *conn, const void *ctx, - } - - /* path, data... 
*/ --static int do_write(struct connection *conn, struct buffered_data *in) -+static int do_write(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - unsigned int offset, datalen; - struct node *node; -@@ -1524,12 +1530,12 @@ static int do_write(struct connection *conn, struct buffered_data *in) - offset = strlen(vec[0]) + 1; - datalen = in->used - offset; - -- node = get_node_canonicalized(conn, in, vec[0], &name, XS_PERM_WRITE); -+ node = get_node_canonicalized(conn, ctx, vec[0], &name, XS_PERM_WRITE); - if (!node) { - /* No permissions, invalid input? */ - if (errno != ENOENT) - return errno; -- node = create_node(conn, in, name, in->buffer + offset, -+ node = create_node(conn, ctx, name, in->buffer + offset, - datalen); - if (!node) - return errno; -@@ -1540,18 +1546,19 @@ static int do_write(struct connection *conn, struct buffered_data *in) - return errno; - } - -- fire_watches(conn, in, name, node, false, NULL); -+ fire_watches(conn, ctx, name, node, false, NULL); - send_ack(conn, XS_WRITE); - - return 0; - } - --static int do_mkdir(struct connection *conn, struct buffered_data *in) -+static int do_mkdir(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct node *node; - char *name; - -- node = get_node_canonicalized(conn, in, onearg(in), &name, -+ node = get_node_canonicalized(conn, ctx, onearg(in), &name, - XS_PERM_WRITE); - - /* If it already exists, fine. 
*/ -@@ -1561,10 +1568,10 @@ static int do_mkdir(struct connection *conn, struct buffered_data *in) - return errno; - if (!name) - return ENOMEM; -- node = create_node(conn, in, name, NULL, 0); -+ node = create_node(conn, ctx, name, NULL, 0); - if (!node) - return errno; -- fire_watches(conn, in, name, node, false, NULL); -+ fire_watches(conn, ctx, name, node, false, NULL); - } - send_ack(conn, XS_MKDIR); - -@@ -1662,24 +1669,25 @@ static int _rm(struct connection *conn, const void *ctx, struct node *node, - } - - --static int do_rm(struct connection *conn, struct buffered_data *in) -+static int do_rm(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct node *node; - int ret; - char *name; - char *parentname; - -- node = get_node_canonicalized(conn, in, onearg(in), &name, -+ node = get_node_canonicalized(conn, ctx, onearg(in), &name, - XS_PERM_WRITE); - if (!node) { - /* Didn't exist already? Fine, if parent exists. */ - if (errno == ENOENT) { - if (!name) - return ENOMEM; -- parentname = get_parent(in, name); -+ parentname = get_parent(ctx, name); - if (!parentname) - return errno; -- node = read_node(conn, in, parentname); -+ node = read_node(conn, ctx, parentname); - if (node) { - send_ack(conn, XS_RM); - return 0; -@@ -1694,7 +1702,7 @@ static int do_rm(struct connection *conn, struct buffered_data *in) - if (streq(name, "/")) - return EINVAL; - -- ret = _rm(conn, in, node, name); -+ ret = _rm(conn, ctx, node, name); - if (ret) - return ret; - -@@ -1704,13 +1712,15 @@ static int do_rm(struct connection *conn, struct buffered_data *in) - } - - --static int do_get_perms(struct connection *conn, struct buffered_data *in) -+static int do_get_perms(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct node *node; - char *strings; - unsigned int len; - -- node = get_node_canonicalized(conn, in, onearg(in), NULL, XS_PERM_READ); -+ node = get_node_canonicalized(conn, ctx, onearg(in), NULL, -+ XS_PERM_READ); - 
if (!node) - return errno; - -@@ -1723,7 +1733,8 @@ static int do_get_perms(struct connection *conn, struct buffered_data *in) - return 0; - } - --static int do_set_perms(struct connection *conn, struct buffered_data *in) -+static int do_set_perms(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct node_perms perms, old_perms; - char *name, *permstr; -@@ -1740,7 +1751,7 @@ static int do_set_perms(struct connection *conn, struct buffered_data *in) - - permstr = in->buffer + strlen(in->buffer) + 1; - -- perms.p = talloc_array(in, struct xs_permissions, perms.num); -+ perms.p = talloc_array(ctx, struct xs_permissions, perms.num); - if (!perms.p) - return ENOMEM; - if (!xs_strings_to_perms(perms.p, perms.num, permstr)) -@@ -1755,7 +1766,7 @@ static int do_set_perms(struct connection *conn, struct buffered_data *in) - } - - /* We must own node to do this (tools can do this too). */ -- node = get_node_canonicalized(conn, in, in->buffer, &name, -+ node = get_node_canonicalized(conn, ctx, in->buffer, &name, - XS_PERM_WRITE | XS_PERM_OWNER); - if (!node) - return errno; -@@ -1790,7 +1801,7 @@ static int do_set_perms(struct connection *conn, struct buffered_data *in) - return errno; - } - -- fire_watches(conn, in, name, node, false, &old_perms); -+ fire_watches(conn, ctx, name, node, false, &old_perms); - send_ack(conn, XS_SET_PERMS); - - return 0; -@@ -1798,7 +1809,8 @@ static int do_set_perms(struct connection *conn, struct buffered_data *in) - - static struct { - const char *str; -- int (*func)(struct connection *conn, struct buffered_data *in); -+ int (*func)(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - unsigned int flags; - #define XS_FLAG_NOTID (1U << 0) /* Ignore transaction id. */ - #define XS_FLAG_PRIV (1U << 1) /* Privileged domain only. 
*/ -@@ -1874,6 +1886,7 @@ static void process_message(struct connection *conn, struct buffered_data *in) - struct transaction *trans; - enum xsd_sockmsg_type type = in->hdr.msg.type; - int ret; -+ void *ctx; - - /* At least send_error() and send_reply() expects conn->in == in */ - assert(conn->in == in); -@@ -1898,10 +1911,17 @@ static void process_message(struct connection *conn, struct buffered_data *in) - return; - } - -+ ctx = talloc_new(NULL); -+ if (!ctx) { -+ send_error(conn, ENOMEM); -+ return; -+ } -+ - assert(conn->transaction == NULL); - conn->transaction = trans; - -- ret = wire_funcs[type].func(conn, in); -+ ret = wire_funcs[type].func(ctx, conn, in); -+ talloc_free(ctx); - if (ret) - send_error(conn, ret); - -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index e7c6886ccf47..fb732d0a14c3 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -330,7 +330,7 @@ bool domain_is_unprivileged(struct connection *conn) - domid_is_unprivileged(conn->domain->domid); - } - --static char *talloc_domain_path(void *context, unsigned int domid) -+static char *talloc_domain_path(const void *context, unsigned int domid) - { - return talloc_asprintf(context, "/local/domain/%u", domid); - } -@@ -534,7 +534,8 @@ static struct domain *introduce_domain(const void *ctx, - } - - /* domid, gfn, evtchn, path */ --int do_introduce(struct connection *conn, struct buffered_data *in) -+int do_introduce(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct domain *domain; - char *vec[3]; -@@ -552,7 +553,7 @@ int do_introduce(struct connection *conn, struct buffered_data *in) - if (port <= 0) - return EINVAL; - -- domain = introduce_domain(in, domid, port, false); -+ domain = introduce_domain(ctx, domid, port, false); - if (!domain) - return errno; - -@@ -575,7 +576,8 @@ static struct domain *find_connected_domain(unsigned int domid) - return domain; - } - --int 
do_set_target(struct connection *conn, struct buffered_data *in) -+int do_set_target(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - char *vec[2]; - unsigned int domid, tdomid; -@@ -619,7 +621,8 @@ static struct domain *onearg_domain(struct connection *conn, - } - - /* domid */ --int do_release(struct connection *conn, struct buffered_data *in) -+int do_release(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct domain *domain; - -@@ -634,7 +637,8 @@ int do_release(struct connection *conn, struct buffered_data *in) - return 0; - } - --int do_resume(struct connection *conn, struct buffered_data *in) -+int do_resume(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct domain *domain; - -@@ -649,7 +653,8 @@ int do_resume(struct connection *conn, struct buffered_data *in) - return 0; - } - --int do_get_domain_path(struct connection *conn, struct buffered_data *in) -+int do_get_domain_path(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - char *path; - const char *domid_str = onearg(in); -@@ -657,18 +662,17 @@ int do_get_domain_path(struct connection *conn, struct buffered_data *in) - if (!domid_str) - return EINVAL; - -- path = talloc_domain_path(conn, atoi(domid_str)); -+ path = talloc_domain_path(ctx, atoi(domid_str)); - if (!path) - return errno; - - send_reply(conn, XS_GET_DOMAIN_PATH, path, strlen(path) + 1); - -- talloc_free(path); -- - return 0; - } - --int do_is_domain_introduced(struct connection *conn, struct buffered_data *in) -+int do_is_domain_introduced(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - int result; - unsigned int domid; -@@ -689,7 +693,8 @@ int do_is_domain_introduced(struct connection *conn, struct buffered_data *in) - } - - /* Allow guest to reset all watches */ --int do_reset_watches(struct connection *conn, struct buffered_data *in) -+int do_reset_watches(const void *ctx, struct connection 
*conn, -+ struct buffered_data *in) - { - conn_delete_all_watches(conn); - conn_delete_all_transactions(conn); -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index 904faa923afb..b9e152890149 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ b/tools/xenstore/xenstored_domain.h -@@ -24,25 +24,32 @@ void handle_event(void); - void check_domains(void); - - /* domid, mfn, eventchn, path */ --int do_introduce(struct connection *conn, struct buffered_data *in); -+int do_introduce(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* domid */ --int do_is_domain_introduced(struct connection *conn, struct buffered_data *in); -+int do_is_domain_introduced(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* domid */ --int do_release(struct connection *conn, struct buffered_data *in); -+int do_release(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* domid */ --int do_resume(struct connection *conn, struct buffered_data *in); -+int do_resume(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* domid, target */ --int do_set_target(struct connection *conn, struct buffered_data *in); -+int do_set_target(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* domid */ --int do_get_domain_path(struct connection *conn, struct buffered_data *in); -+int do_get_domain_path(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* Allow guest to reset all watches */ --int do_reset_watches(struct connection *conn, struct buffered_data *in); -+int do_reset_watches(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - void domain_init(int evtfd); - void dom0_init(void); -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index 28774813de83..3e3eb47326cc 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ 
b/tools/xenstore/xenstored_transaction.c -@@ -481,7 +481,8 @@ struct transaction *transaction_lookup(struct connection *conn, uint32_t id) - return ERR_PTR(-ENOENT); - } - --int do_transaction_start(struct connection *conn, struct buffered_data *in) -+int do_transaction_start(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct transaction *trans, *exists; - char id_str[20]; -@@ -494,8 +495,8 @@ int do_transaction_start(struct connection *conn, struct buffered_data *in) - conn->transaction_started > quota_max_transaction) - return ENOSPC; - -- /* Attach transaction to input for autofree until it's complete */ -- trans = talloc_zero(in, struct transaction); -+ /* Attach transaction to ctx for autofree until it's complete */ -+ trans = talloc_zero(ctx, struct transaction); - if (!trans) - return ENOMEM; - -@@ -544,7 +545,8 @@ static int transaction_fix_domains(struct transaction *trans, bool update) - return 0; - } - --int do_transaction_end(struct connection *conn, struct buffered_data *in) -+int do_transaction_end(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - const char *arg = onearg(in); - struct transaction *trans; -@@ -562,8 +564,8 @@ int do_transaction_end(struct connection *conn, struct buffered_data *in) - if (!conn->transaction_started) - conn->ta_start_time = 0; - -- /* Attach transaction to in for auto-cleanup */ -- talloc_steal(in, trans); -+ /* Attach transaction to ctx for auto-cleanup */ -+ talloc_steal(ctx, trans); - - if (streq(arg, "T")) { - if (trans->fail) -diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h -index e3cbd6b23095..39d7f81c5127 100644 ---- a/tools/xenstore/xenstored_transaction.h -+++ b/tools/xenstore/xenstored_transaction.h -@@ -29,8 +29,10 @@ struct transaction; - - extern uint64_t generation; - --int do_transaction_start(struct connection *conn, struct buffered_data *node); --int do_transaction_end(struct connection *conn, struct 
buffered_data *in); -+int do_transaction_start(const void *ctx, struct connection *conn, -+ struct buffered_data *node); -+int do_transaction_end(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - struct transaction *transaction_lookup(struct connection *conn, uint32_t id); - -diff --git a/tools/xenstore/xenstored_watch.c b/tools/xenstore/xenstored_watch.c -index 85362bcce314..316c08b7f754 100644 ---- a/tools/xenstore/xenstored_watch.c -+++ b/tools/xenstore/xenstored_watch.c -@@ -243,7 +243,7 @@ static struct watch *add_watch(struct connection *conn, char *path, char *token, - return NULL; - } - --int do_watch(struct connection *conn, struct buffered_data *in) -+int do_watch(const void *ctx, struct connection *conn, struct buffered_data *in) - { - struct watch *watch; - char *vec[2]; -@@ -252,7 +252,7 @@ int do_watch(struct connection *conn, struct buffered_data *in) - if (get_strings(in, vec, ARRAY_SIZE(vec)) != ARRAY_SIZE(vec)) - return EINVAL; - -- errno = check_watch_path(conn, in, &(vec[0]), &relative); -+ errno = check_watch_path(conn, ctx, &(vec[0]), &relative); - if (errno) - return errno; - -@@ -283,7 +283,8 @@ int do_watch(struct connection *conn, struct buffered_data *in) - return 0; - } - --int do_unwatch(struct connection *conn, struct buffered_data *in) -+int do_unwatch(const void *ctx, struct connection *conn, -+ struct buffered_data *in) - { - struct watch *watch; - char *node, *vec[2]; -@@ -291,7 +292,7 @@ int do_unwatch(struct connection *conn, struct buffered_data *in) - if (get_strings(in, vec, ARRAY_SIZE(vec)) != ARRAY_SIZE(vec)) - return EINVAL; - -- node = xenstore_canonicalize(conn, in, vec[0]); -+ node = xenstore_canonicalize(conn, ctx, vec[0]); - if (!node) - return ENOMEM; - list_for_each_entry(watch, &conn->watches, list) { -diff --git a/tools/xenstore/xenstored_watch.h b/tools/xenstore/xenstored_watch.h -index 0e693f0839cd..091890edca96 100644 ---- a/tools/xenstore/xenstored_watch.h -+++ 
b/tools/xenstore/xenstored_watch.h -@@ -21,8 +21,10 @@ - - #include "xenstored_core.h" - --int do_watch(struct connection *conn, struct buffered_data *in); --int do_unwatch(struct connection *conn, struct buffered_data *in); -+int do_watch(const void *ctx, struct connection *conn, -+ struct buffered_data *in); -+int do_unwatch(const void *ctx, struct connection *conn, -+ struct buffered_data *in); - - /* Fire all watches: !exact means all the children are affected (ie. rm). */ - void fire_watches(struct connection *conn, const void *tmp, const char *name, diff --git a/xsa417.patch b/xsa417.patch deleted file mode 100644 index 3f10bf7..0000000 --- a/xsa417.patch +++ /dev/null @@ -1,135 +0,0 @@ -From 67d5ecd609b8f12346eadb40e547cd7e01d825dc Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:10 +0200 -Subject: tools/xenstore: fix checking node permissions - -Today chk_domain_generation() is being used to check whether a node -permission entry is still valid or whether it is referring to a domain -no longer existing. This is done by comparing the node's and the -domain's generation count. - -In case no struct domain is existing for a checked domain, but the -domain itself is valid, chk_domain_generation() assumes it is being -called due to the first node created for a new domain and it will -return success. - -This might be wrong in case the checked permission is related to an -old domain, which has just been replaced with a new domain using the -same domid. - -Fix that by letting chk_domain_generation() fail in case a struct -domain isn't found. In order to cover the case of the first node for -a new domain try to allocate the needed struct domain explicitly when -processing the related SET_PERMS command. In case a referenced domain -isn't existing, flag the related permission to be ignored right away. - -This is XSA-417 / CVE-2022-42320. 
- -Reported-by: Juergen Gross -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 66bbeaf6bfb0..a0c176fa203e 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1753,6 +1753,11 @@ static int do_set_perms(const void *ctx, struct connection *conn, - if (!xs_strings_to_perms(perms.p, perms.num, permstr)) - return errno; - -+ if (domain_alloc_permrefs(&perms) < 0) -+ return ENOMEM; -+ if (perms.p[0].perms & XS_PERM_IGNORE) -+ return ENOENT; -+ - /* First arg is node name. */ - if (strstarts(in->buffer, "@")) { - if (set_perms_special(conn, in->buffer, &perms)) -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index b9ff4ded8360..98b401fdec30 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -907,7 +907,6 @@ int domain_entry_inc(struct connection *conn, struct node *node) - * count (used for testing whether a node permission is older than a domain). - * - * Return values: -- * -1: error - * 0: domain has higher generation count (it is younger than a node with the - * given count), or domain isn't existing any longer - * 1: domain is older than the node -@@ -915,20 +914,38 @@ int domain_entry_inc(struct connection *conn, struct node *node) - static int chk_domain_generation(unsigned int domid, uint64_t gen) - { - struct domain *d; -- xc_dominfo_t dominfo; - - if (!xc_handle && domid == 0) - return 1; - - d = find_domain_struct(domid); -- if (d) -- return (d->generation <= gen) ? 1 : 0; - -- if (!get_domain_info(domid, &dominfo)) -- return 0; -+ return (d && d->generation <= gen) ? 1 : 0; -+} - -- d = alloc_domain(NULL, domid); -- return d ? 1 : -1; -+/* -+ * Allocate all missing struct domain referenced by a permission set. -+ * Any permission entries for not existing domains will be marked to be -+ * ignored. 
-+ */ -+int domain_alloc_permrefs(struct node_perms *perms) -+{ -+ unsigned int i, domid; -+ struct domain *d; -+ xc_dominfo_t dominfo; -+ -+ for (i = 0; i < perms->num; i++) { -+ domid = perms->p[i].id; -+ d = find_domain_struct(domid); -+ if (!d) { -+ if (!get_domain_info(domid, &dominfo)) -+ perms->p[i].perms |= XS_PERM_IGNORE; -+ else if (!alloc_domain(NULL, domid)) -+ return ENOMEM; -+ } -+ } -+ -+ return 0; - } - - /* -@@ -941,8 +958,6 @@ int domain_adjust_node_perms(struct connection *conn, struct node *node) - int ret; - - ret = chk_domain_generation(node->perms.p[0].id, node->generation); -- if (ret < 0) -- return errno; - - /* If the owner doesn't exist any longer give it to priv domain. */ - if (!ret) { -@@ -959,8 +974,6 @@ int domain_adjust_node_perms(struct connection *conn, struct node *node) - continue; - ret = chk_domain_generation(node->perms.p[i].id, - node->generation); -- if (ret < 0) -- return errno; - if (!ret) - node->perms.p[i].perms |= XS_PERM_IGNORE; - } -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index 209442190911..7fe0a21d9e45 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ b/tools/xenstore/xenstored_domain.h -@@ -63,6 +63,7 @@ bool domain_is_unprivileged(struct connection *conn); - - /* Remove node permissions for no longer existing domains. 
*/ - int domain_adjust_node_perms(struct connection *conn, struct node *node); -+int domain_alloc_permrefs(struct node_perms *perms); - - /* Quota manipulation */ - int domain_entry_inc(struct connection *conn, struct node *); diff --git a/xsa418-4.16-01.patch b/xsa418-4.16-01.patch deleted file mode 100644 index c4a17b4..0000000 --- a/xsa418-4.16-01.patch +++ /dev/null @@ -1,119 +0,0 @@ -From d1e6dca486599ab914af7b38b3782b237d3d603b Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:11 +0200 -Subject: tools/xenstore: remove recursion from construct_node() - -In order to reduce stack usage due to recursion, switch -construct_node() to use a loop instead. - -This is part of XSA-418 / CVE-2022-42321. - -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index c676ee4e4e4f..3907c35643e9 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1377,45 +1377,69 @@ static int add_child(const void *ctx, struct node *parent, const char *name) - static struct node *construct_node(struct connection *conn, const void *ctx, - const char *name) - { -- struct node *parent, *node; -- char *parentname = get_parent(ctx, name); -+ const char **names = NULL; -+ unsigned int levels = 0; -+ struct node *node = NULL; -+ struct node *parent = NULL; -+ const char *parentname = talloc_strdup(ctx, name); - - if (!parentname) - return NULL; - -- /* If parent doesn't exist, create it. */ -- parent = read_node(conn, parentname, parentname); -- if (!parent && errno == ENOENT) -- parent = construct_node(conn, ctx, parentname); -- if (!parent) -- return NULL; -+ /* Walk the path up until an existing node is found. */ -+ while (!parent) { -+ names = talloc_realloc(ctx, names, const char *, levels + 1); -+ if (!names) -+ goto nomem; - -- /* Add child to parent. 
*/ -- if (add_child(ctx, parent, name)) -- goto nomem; -+ /* -+ * names[0] is the name of the node to construct initially, -+ * names[1] is its parent, and so on. -+ */ -+ names[levels] = parentname; -+ parentname = get_parent(ctx, parentname); -+ if (!parentname) -+ return NULL; - -- /* Allocate node */ -- node = talloc(ctx, struct node); -- if (!node) -- goto nomem; -- node->name = talloc_strdup(node, name); -- if (!node->name) -- goto nomem; -+ /* Try to read parent node until we found an existing one. */ -+ parent = read_node(conn, ctx, parentname); -+ if (!parent && (errno != ENOENT || !strcmp(parentname, "/"))) -+ return NULL; - -- /* Inherit permissions, except unprivileged domains own what they create */ -- node->perms.num = parent->perms.num; -- node->perms.p = talloc_memdup(node, parent->perms.p, -- node->perms.num * sizeof(*node->perms.p)); -- if (!node->perms.p) -- goto nomem; -- if (domain_is_unprivileged(conn)) -- node->perms.p[0].id = conn->id; -+ levels++; -+ } -+ -+ /* Walk the path down again constructing the missing nodes. */ -+ for (; levels > 0; levels--) { -+ /* Add child to parent. */ -+ if (add_child(ctx, parent, names[levels - 1])) -+ goto nomem; -+ -+ /* Allocate node */ -+ node = talloc(ctx, struct node); -+ if (!node) -+ goto nomem; -+ node->name = talloc_steal(node, names[levels - 1]); -+ -+ /* Inherit permissions, unpriv domains own what they create. 
*/ -+ node->perms.num = parent->perms.num; -+ node->perms.p = talloc_memdup(node, parent->perms.p, -+ node->perms.num * -+ sizeof(*node->perms.p)); -+ if (!node->perms.p) -+ goto nomem; -+ if (domain_is_unprivileged(conn)) -+ node->perms.p[0].id = conn->id; -+ -+ /* No children, no data */ -+ node->children = node->data = NULL; -+ node->childlen = node->datalen = 0; -+ node->acc.memory = 0; -+ node->parent = parent; -+ -+ parent = node; -+ } - -- /* No children, no data */ -- node->children = node->data = NULL; -- node->childlen = node->datalen = 0; -- node->acc.memory = 0; -- node->parent = parent; - return node; - - nomem: diff --git a/xsa418-4.16-02.patch b/xsa418-4.16-02.patch deleted file mode 100644 index 874bab2..0000000 --- a/xsa418-4.16-02.patch +++ /dev/null @@ -1,103 +0,0 @@ -From c13d85a2fe94bbf3cb8186b89324c5d1b4f9a61f Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:11 +0200 -Subject: tools/xenstore: don't let remove_child_entry() call corrupt() - -In case of write_node() returning an error, remove_child_entry() will -call corrupt() today. This could result in an endless recursion, as -remove_child_entry() is called by corrupt(), too: - -corrupt() - check_store() - check_store_() - remove_child_entry() - -Fix that by letting remove_child_entry() return an error instead and -let the caller decide what to do. - -This is part of XSA-418 / CVE-2022-42321. 
- -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 3907c35643e9..f433a45dc217 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1608,15 +1608,15 @@ static void memdel(void *mem, unsigned off, unsigned len, unsigned total) - memmove(mem + off, mem + off + len, total - off - len); - } - --static void remove_child_entry(struct connection *conn, struct node *node, -- size_t offset) -+static int remove_child_entry(struct connection *conn, struct node *node, -+ size_t offset) - { - size_t childlen = strlen(node->children + offset); - - memdel(node->children, offset, childlen + 1, node->childlen); - node->childlen -= childlen + 1; -- if (write_node(conn, node, true)) -- corrupt(conn, "Can't update parent node '%s'", node->name); -+ -+ return write_node(conn, node, true); - } - - static void delete_child(struct connection *conn, -@@ -1626,7 +1626,9 @@ static void delete_child(struct connection *conn, - - for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) { - if (streq(node->children+i, childname)) { -- remove_child_entry(conn, node, i); -+ if (remove_child_entry(conn, node, i)) -+ corrupt(conn, "Can't update parent node '%s'", -+ node->name); - return; - } - } -@@ -2325,6 +2327,17 @@ int remember_string(struct hashtable *hash, const char *str) - return hashtable_insert(hash, k, (void *)1); - } - -+static int rm_child_entry(struct node *node, size_t off, size_t len) -+{ -+ if (!recovery) -+ return off; -+ -+ if (remove_child_entry(NULL, node, off)) -+ log("check_store: child entry could not be removed from '%s'", -+ node->name); -+ -+ return off - len - 1; -+} - - /** - * A node has a children field that names the children of the node, separated -@@ -2377,12 +2390,7 @@ static int check_store_(const char *name, struct hashtable *reachable) - if (hashtable_search(children, childname)) { - 
log("check_store: '%s' is duplicated!", - childname); -- -- if (recovery) { -- remove_child_entry(NULL, node, -- i); -- i -= childlen + 1; -- } -+ i = rm_child_entry(node, i, childlen); - } - else { - if (!remember_string(children, -@@ -2399,11 +2407,7 @@ static int check_store_(const char *name, struct hashtable *reachable) - } else if (errno != ENOMEM) { - log("check_store: No child '%s' found!\n", - childname); -- -- if (recovery) { -- remove_child_entry(NULL, node, i); -- i -= childlen + 1; -- } -+ i = rm_child_entry(node, i, childlen); - } else { - log("check_store: ENOMEM"); - ret = ENOMEM; diff --git a/xsa418-4.16-03.patch b/xsa418-4.16-03.patch deleted file mode 100644 index 321ccf0..0000000 --- a/xsa418-4.16-03.patch +++ /dev/null @@ -1,243 +0,0 @@ -From aac9b51b6fbbbd16c910f69365345528c5bec106 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:11 +0200 -Subject: tools/xenstore: add generic treewalk function - -Add a generic function to walk the complete node tree. It will start -at "/" and descend recursively into each child, calling a function -specified by the caller. Depending on the return value of the user -specified function the walk will be aborted, continued, or the current -child will be skipped by not descending into its children. - -This is part of XSA-418 / CVE-2022-42321. 
- -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Acked-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index f433a45dc217..2cda3ee375ab 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1838,6 +1838,135 @@ static int do_set_perms(const void *ctx, struct connection *conn, - return 0; - } - -+static char *child_name(const void *ctx, const char *s1, const char *s2) -+{ -+ if (strcmp(s1, "/")) -+ return talloc_asprintf(ctx, "%s/%s", s1, s2); -+ return talloc_asprintf(ctx, "/%s", s2); -+} -+ -+static int rm_from_parent(struct connection *conn, struct node *parent, -+ const char *name) -+{ -+ size_t off; -+ -+ if (!parent) -+ return WALK_TREE_ERROR_STOP; -+ -+ for (off = parent->childoff - 1; off && parent->children[off - 1]; -+ off--); -+ if (remove_child_entry(conn, parent, off)) { -+ log("treewalk: child entry could not be removed from '%s'", -+ parent->name); -+ return WALK_TREE_ERROR_STOP; -+ } -+ parent->childoff = off; -+ -+ return WALK_TREE_OK; -+} -+ -+static int walk_call_func(const void *ctx, struct connection *conn, -+ struct node *node, struct node *parent, void *arg, -+ int (*func)(const void *ctx, struct connection *conn, -+ struct node *node, void *arg)) -+{ -+ int ret; -+ -+ if (!func) -+ return WALK_TREE_OK; -+ -+ ret = func(ctx, conn, node, arg); -+ if (ret == WALK_TREE_RM_CHILDENTRY && parent) -+ ret = rm_from_parent(conn, parent, node->name); -+ -+ return ret; -+} -+ -+int walk_node_tree(const void *ctx, struct connection *conn, const char *root, -+ struct walk_funcs *funcs, void *arg) -+{ -+ int ret = 0; -+ void *tmpctx; -+ char *name; -+ struct node *node = NULL; -+ struct node *parent = NULL; -+ -+ tmpctx = talloc_new(ctx); -+ if (!tmpctx) { -+ errno = ENOMEM; -+ return WALK_TREE_ERROR_STOP; -+ } -+ name = talloc_strdup(tmpctx, root); -+ if (!name) { -+ errno = ENOMEM; -+ talloc_free(tmpctx); -+ return WALK_TREE_ERROR_STOP; -+ } -+ -+ /* Continue 
the walk until an error is returned. */ -+ while (ret >= 0) { -+ /* node == NULL possible only for the initial loop iteration. */ -+ if (node) { -+ /* Go one step up if ret or if last child finished. */ -+ if (ret || node->childoff >= node->childlen) { -+ parent = node->parent; -+ /* Call function AFTER processing a node. */ -+ ret = walk_call_func(ctx, conn, node, parent, -+ arg, funcs->exit); -+ /* Last node, so exit loop. */ -+ if (!parent) -+ break; -+ talloc_free(node); -+ /* Continue with parent. */ -+ node = parent; -+ continue; -+ } -+ /* Get next child of current node. */ -+ name = child_name(tmpctx, node->name, -+ node->children + node->childoff); -+ if (!name) { -+ ret = WALK_TREE_ERROR_STOP; -+ break; -+ } -+ /* Point to next child. */ -+ node->childoff += strlen(node->children + -+ node->childoff) + 1; -+ /* Descent into children. */ -+ parent = node; -+ } -+ /* Read next node (root node or next child). */ -+ node = read_node(conn, tmpctx, name); -+ if (!node) { -+ /* Child not found - should not happen! */ -+ /* ENOENT case can be handled by supplied function. */ -+ if (errno == ENOENT && funcs->enoent) -+ ret = funcs->enoent(ctx, conn, parent, name, -+ arg); -+ else -+ ret = WALK_TREE_ERROR_STOP; -+ if (!parent) -+ break; -+ if (ret == WALK_TREE_RM_CHILDENTRY) -+ ret = rm_from_parent(conn, parent, name); -+ if (ret < 0) -+ break; -+ talloc_free(name); -+ node = parent; -+ continue; -+ } -+ talloc_free(name); -+ node->parent = parent; -+ node->childoff = 0; -+ /* Call function BEFORE processing a node. */ -+ ret = walk_call_func(ctx, conn, node, parent, arg, -+ funcs->enter); -+ } -+ -+ talloc_free(tmpctx); -+ -+ return ret < 0 ? 
ret : WALK_TREE_OK; -+} -+ - static struct { - const char *str; - int (*func)(const void *ctx, struct connection *conn, -@@ -2305,18 +2434,6 @@ static int keys_equal_fn(void *key1, void *key2) - return 0 == strcmp((char *)key1, (char *)key2); - } - -- --static char *child_name(const char *s1, const char *s2) --{ -- if (strcmp(s1, "/")) { -- return talloc_asprintf(NULL, "%s/%s", s1, s2); -- } -- else { -- return talloc_asprintf(NULL, "/%s", s2); -- } --} -- -- - int remember_string(struct hashtable *hash, const char *str) - { - char *k = malloc(strlen(str) + 1); -@@ -2376,7 +2493,7 @@ static int check_store_(const char *name, struct hashtable *reachable) - while (i < node->childlen && !ret) { - struct node *childnode; - size_t childlen = strlen(node->children + i); -- char * childname = child_name(node->name, -+ char * childname = child_name(NULL, node->name, - node->children + i); - - if (!childname) { -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index bfd3fc1e9df3..2d9942171d92 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -202,6 +202,7 @@ struct node { - - /* Children, each nul-terminated. */ - unsigned int childlen; -+ unsigned int childoff; /* Used by walk_node_tree() internally. */ - char *children; - - /* Allocation information for node currently in store. */ -@@ -338,6 +339,45 @@ void read_state_buffered_data(const void *ctx, struct connection *conn, - const struct xs_state_connection *sc); - void read_state_node(const void *ctx, const void *state); - -+/* -+ * Walk the node tree below root calling funcs->enter() and funcs->exit() for -+ * each node. funcs->enter() is being called when entering a node, so before -+ * any of the children of the node is processed. funcs->exit() is being -+ * called when leaving the node, so after all children have been processed. -+ * funcs->enoent() is being called when a node isn't existing. 
-+ * funcs->*() return values: -+ * < 0: tree walk is stopped, walk_node_tree() returns funcs->*() return value -+ * in case WALK_TREE_ERROR_STOP is returned, errno should be set -+ * WALK_TREE_OK: tree walk is continuing -+ * WALK_TREE_SKIP_CHILDREN: tree walk won't descend below current node, but -+ * walk continues -+ * WALK_TREE_RM_CHILDENTRY: Remove the child entry from its parent and write -+ * the modified parent node back to the data base, implies to not descend -+ * below the current node, but to continue the walk -+ * funcs->*() is allowed to modify the node it is called for in the data base. -+ * In case funcs->enter() is deleting the node, it must not return WALK_TREE_OK -+ * in order to avoid descending into no longer existing children. -+ */ -+/* Return values for funcs->*() and walk_node_tree(). */ -+#define WALK_TREE_SUCCESS_STOP -100 /* Stop walk early, no error. */ -+#define WALK_TREE_ERROR_STOP -1 /* Stop walk due to error. */ -+#define WALK_TREE_OK 0 /* No error. */ -+/* Return value for funcs->*() only. */ -+#define WALK_TREE_SKIP_CHILDREN 1 /* Don't recurse below current node. */ -+#define WALK_TREE_RM_CHILDENTRY 2 /* Remove child entry from parent. 
*/ -+ -+struct walk_funcs { -+ int (*enter)(const void *ctx, struct connection *conn, -+ struct node *node, void *arg); -+ int (*exit)(const void *ctx, struct connection *conn, -+ struct node *node, void *arg); -+ int (*enoent)(const void *ctx, struct connection *conn, -+ struct node *parent, char *name, void *arg); -+}; -+ -+int walk_node_tree(const void *ctx, struct connection *conn, const char *root, -+ struct walk_funcs *funcs, void *arg); -+ - #endif /* _XENSTORED_CORE_H */ - - /* diff --git a/xsa418-4.16-04.patch b/xsa418-4.16-04.patch deleted file mode 100644 index 95de88f..0000000 --- a/xsa418-4.16-04.patch +++ /dev/null @@ -1,108 +0,0 @@ -From bdc931fb5dcebbd8d0e44b5d8bd3fb9106ee8596 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:12 +0200 -Subject: tools/xenstore: simplify check_store() - -check_store() is using a hash table for storing all node names it has -found via walking the tree. Additionally it using another hash table -for all children of a node to detect duplicate child names. - -Simplify that by dropping the second hash table as the first one is -already holding all the needed information. - -This is part of XSA-418 / CVE-2022-42321. 
- -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 2cda3ee375ab..760f3c16c794 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -2477,50 +2477,34 @@ static int check_store_(const char *name, struct hashtable *reachable) - if (node) { - size_t i = 0; - -- struct hashtable * children = -- create_hashtable(16, hash_from_key_fn, keys_equal_fn); -- if (!children) { -- log("check_store create table: ENOMEM"); -- return ENOMEM; -- } -- - if (!remember_string(reachable, name)) { -- hashtable_destroy(children, 0); - log("check_store: ENOMEM"); - return ENOMEM; - } - - while (i < node->childlen && !ret) { -- struct node *childnode; -+ struct node *childnode = NULL; - size_t childlen = strlen(node->children + i); -- char * childname = child_name(NULL, node->name, -- node->children + i); -+ char *childname = child_name(NULL, node->name, -+ node->children + i); - - if (!childname) { - log("check_store: ENOMEM"); - ret = ENOMEM; - break; - } -+ -+ if (hashtable_search(reachable, childname)) { -+ log("check_store: '%s' is duplicated!", -+ childname); -+ i = rm_child_entry(node, i, childlen); -+ goto next; -+ } -+ - childnode = read_node(NULL, childname, childname); -- -+ - if (childnode) { -- if (hashtable_search(children, childname)) { -- log("check_store: '%s' is duplicated!", -- childname); -- i = rm_child_entry(node, i, childlen); -- } -- else { -- if (!remember_string(children, -- childname)) { -- log("check_store: ENOMEM"); -- talloc_free(childnode); -- talloc_free(childname); -- ret = ENOMEM; -- break; -- } -- ret = check_store_(childname, -- reachable); -- } -+ ret = check_store_(childname, reachable); - } else if (errno != ENOMEM) { - log("check_store: No child '%s' found!\n", - childname); -@@ -2530,19 +2514,18 @@ static int check_store_(const char *name, struct hashtable *reachable) - ret = ENOMEM; - } - -+ 
next: - talloc_free(childnode); - talloc_free(childname); - i += childlen + 1; - } - -- hashtable_destroy(children, 0 /* Don't free values (they are -- all (void *)1) */); - talloc_free(node); - } else if (errno != ENOMEM) { - /* Impossible, because no database should ever be without the - root, and otherwise, we've just checked in our caller - (which made a recursive call to get here). */ -- -+ - log("check_store: No child '%s' found: impossible!", name); - } else { - log("check_store: ENOMEM"); diff --git a/xsa418-4.16-05.patch b/xsa418-4.16-05.patch deleted file mode 100644 index fca551e..0000000 --- a/xsa418-4.16-05.patch +++ /dev/null @@ -1,164 +0,0 @@ -From 27817f0a7d6802be04e8f43a0900b02f881b28b2 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:12 +0200 -Subject: tools/xenstore: use treewalk for check_store() - -Instead of doing an open tree walk using call recursion, use -walk_node_tree() when checking the store for inconsistencies. - -This will reduce code size and avoid many nesting levels of function -calls which could potentially exhaust the stack. - -This is part of XSA-418 / CVE-2022-42321. - -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 760f3c16c794..efdd1888fd78 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -2444,18 +2444,6 @@ int remember_string(struct hashtable *hash, const char *str) - return hashtable_insert(hash, k, (void *)1); - } - --static int rm_child_entry(struct node *node, size_t off, size_t len) --{ -- if (!recovery) -- return off; -- -- if (remove_child_entry(NULL, node, off)) -- log("check_store: child entry could not be removed from '%s'", -- node->name); -- -- return off - len - 1; --} -- - /** - * A node has a children field that names the children of the node, separated - * by NULs. 
We check whether there are entries in there that are duplicated -@@ -2469,70 +2457,29 @@ static int rm_child_entry(struct node *node, size_t off, size_t len) - * As we go, we record each node in the given reachable hashtable. These - * entries will be used later in clean_store. - */ --static int check_store_(const char *name, struct hashtable *reachable) -+static int check_store_step(const void *ctx, struct connection *conn, -+ struct node *node, void *arg) - { -- struct node *node = read_node(NULL, name, name); -- int ret = 0; -- -- if (node) { -- size_t i = 0; -- -- if (!remember_string(reachable, name)) { -- log("check_store: ENOMEM"); -- return ENOMEM; -- } -- -- while (i < node->childlen && !ret) { -- struct node *childnode = NULL; -- size_t childlen = strlen(node->children + i); -- char *childname = child_name(NULL, node->name, -- node->children + i); -- -- if (!childname) { -- log("check_store: ENOMEM"); -- ret = ENOMEM; -- break; -- } -+ struct hashtable *reachable = arg; - -- if (hashtable_search(reachable, childname)) { -- log("check_store: '%s' is duplicated!", -- childname); -- i = rm_child_entry(node, i, childlen); -- goto next; -- } -- -- childnode = read_node(NULL, childname, childname); -- -- if (childnode) { -- ret = check_store_(childname, reachable); -- } else if (errno != ENOMEM) { -- log("check_store: No child '%s' found!\n", -- childname); -- i = rm_child_entry(node, i, childlen); -- } else { -- log("check_store: ENOMEM"); -- ret = ENOMEM; -- } -+ if (hashtable_search(reachable, (void *)node->name)) { -+ log("check_store: '%s' is duplicated!", node->name); -+ return recovery ? 
WALK_TREE_RM_CHILDENTRY -+ : WALK_TREE_SKIP_CHILDREN; -+ } - -- next: -- talloc_free(childnode); -- talloc_free(childname); -- i += childlen + 1; -- } -+ if (!remember_string(reachable, node->name)) -+ return WALK_TREE_ERROR_STOP; - -- talloc_free(node); -- } else if (errno != ENOMEM) { -- /* Impossible, because no database should ever be without the -- root, and otherwise, we've just checked in our caller -- (which made a recursive call to get here). */ -+ return WALK_TREE_OK; -+} - -- log("check_store: No child '%s' found: impossible!", name); -- } else { -- log("check_store: ENOMEM"); -- ret = ENOMEM; -- } -+static int check_store_enoent(const void *ctx, struct connection *conn, -+ struct node *parent, char *name, void *arg) -+{ -+ log("check_store: node '%s' not found", name); - -- return ret; -+ return recovery ? WALK_TREE_RM_CHILDENTRY : WALK_TREE_OK; - } - - -@@ -2581,24 +2528,28 @@ static void clean_store(struct hashtable *reachable) - - void check_store(void) - { -- char * root = talloc_strdup(NULL, "/"); -- struct hashtable * reachable = -- create_hashtable(16, hash_from_key_fn, keys_equal_fn); -- -+ struct hashtable *reachable; -+ struct walk_funcs walkfuncs = { -+ .enter = check_store_step, -+ .enoent = check_store_enoent, -+ }; -+ -+ reachable = create_hashtable(16, hash_from_key_fn, keys_equal_fn); - if (!reachable) { - log("check_store: ENOMEM"); - return; - } - - log("Checking store ..."); -- if (!check_store_(root, reachable) && -- !check_transactions(reachable)) -+ if (walk_node_tree(NULL, NULL, "/", &walkfuncs, reachable)) { -+ if (errno == ENOMEM) -+ log("check_store: ENOMEM"); -+ } else if (!check_transactions(reachable)) - clean_store(reachable); - log("Checking store complete."); - - hashtable_destroy(reachable, 0 /* Don't free values (they are all - (void *)1) */); -- talloc_free(root); - } - - diff --git a/xsa418-4.16-06.patch b/xsa418-4.16-06.patch deleted file mode 100644 index d46c057..0000000 --- a/xsa418-4.16-06.patch +++ /dev/null @@ 
-1,174 +0,0 @@ -From 6ea0ffbd88b11f23779d763501ec1370b590bb2a Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:12 +0200 -Subject: tools/xenstore: use treewalk for deleting nodes - -Instead of doing an open tree walk using call recursion, use -walk_node_tree() when deleting a sub-tree of nodes. - -This will reduce code size and avoid many nesting levels of function -calls which could potentially exhaust the stack. - -This is part of XSA-418 / CVE-2022-42321. - -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Acked-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index efdd1888fd78..58fb651542ec 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -1334,21 +1334,6 @@ static int do_read(const void *ctx, struct connection *conn, - return 0; - } - --static void delete_node_single(struct connection *conn, struct node *node) --{ -- TDB_DATA key; -- -- if (access_node(conn, node, NODE_ACCESS_DELETE, &key)) -- return; -- -- if (do_tdb_delete(conn, &key, &node->acc) != 0) { -- corrupt(conn, "Could not delete '%s'", node->name); -- return; -- } -- -- domain_entry_dec(conn, node); --} -- - /* Must not be / */ - static char *basename(const char *name) - { -@@ -1619,69 +1604,59 @@ static int remove_child_entry(struct connection *conn, struct node *node, - return write_node(conn, node, true); - } - --static void delete_child(struct connection *conn, -- struct node *node, const char *childname) -+static int delete_child(struct connection *conn, -+ struct node *node, const char *childname) - { - unsigned int i; - - for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) { - if (streq(node->children+i, childname)) { -- if (remove_child_entry(conn, node, i)) -- corrupt(conn, "Can't update parent node '%s'", -- node->name); -- return; -+ errno = remove_child_entry(conn, node, i) ? 
EIO : 0; -+ return errno; - } - } - corrupt(conn, "Can't find child '%s' in %s", childname, node->name); -+ -+ errno = EIO; -+ return errno; - } - --static int delete_node(struct connection *conn, const void *ctx, -- struct node *parent, struct node *node, bool watch_exact) -+static int delnode_sub(const void *ctx, struct connection *conn, -+ struct node *node, void *arg) - { -- char *name; -+ const char *root = arg; -+ bool watch_exact; -+ int ret; -+ TDB_DATA key; - -- /* Delete children. */ -- while (node->childlen) { -- struct node *child; -+ /* Any error here will probably be repeated for all following calls. */ -+ ret = access_node(conn, node, NODE_ACCESS_DELETE, &key); -+ if (ret > 0) -+ return WALK_TREE_SUCCESS_STOP; - -- name = talloc_asprintf(node, "%s/%s", node->name, -- node->children); -- child = name ? read_node(conn, node, name) : NULL; -- if (child) { -- if (delete_node(conn, ctx, node, child, true)) -- return errno; -- } else { -- trace("delete_node: Error deleting child '%s/%s'!\n", -- node->name, node->children); -- /* Quit deleting. */ -- errno = ENOMEM; -- return errno; -- } -- talloc_free(name); -- } -+ /* In case of error stop the walk. */ -+ if (!ret && do_tdb_delete(conn, &key, &node->acc)) -+ return WALK_TREE_SUCCESS_STOP; - - /* - * Fire the watches now, when we can still see the node permissions. - * This fine as we are single threaded and the next possible read will - * be handled only after the node has been really removed. 
-- */ -+ */ -+ watch_exact = strcmp(root, node->name); - fire_watches(conn, ctx, node->name, node, watch_exact, NULL); -- delete_node_single(conn, node); -- delete_child(conn, parent, basename(node->name)); -- talloc_free(node); - -- return 0; -+ domain_entry_dec(conn, node); -+ -+ return WALK_TREE_RM_CHILDENTRY; - } - --static int _rm(struct connection *conn, const void *ctx, struct node *node, -- const char *name) -+static int _rm(struct connection *conn, const void *ctx, const char *name) - { -- /* -- * Deleting node by node, so the result is always consistent even in -- * case of a failure. -- */ - struct node *parent; - char *parentname = get_parent(ctx, name); -+ struct walk_funcs walkfuncs = { .exit = delnode_sub }; -+ int ret; - - if (!parentname) - return errno; -@@ -1689,9 +1664,21 @@ static int _rm(struct connection *conn, const void *ctx, struct node *node, - parent = read_node(conn, ctx, parentname); - if (!parent) - return read_node_can_propagate_errno() ? errno : EINVAL; -- node->parent = parent; - -- return delete_node(conn, ctx, parent, node, false); -+ ret = walk_node_tree(ctx, conn, name, &walkfuncs, (void *)name); -+ if (ret < 0) { -+ if (ret == WALK_TREE_ERROR_STOP) { -+ corrupt(conn, "error when deleting sub-nodes of %s\n", -+ name); -+ errno = EIO; -+ } -+ return errno; -+ } -+ -+ if (delete_child(conn, parent, basename(name))) -+ return errno; -+ -+ return 0; - } - - -@@ -1728,7 +1715,7 @@ static int do_rm(const void *ctx, struct connection *conn, - if (streq(name, "/")) - return EINVAL; - -- ret = _rm(conn, ctx, node, name); -+ ret = _rm(conn, ctx, name); - if (ret) - return ret; - diff --git a/xsa418-4.16-07.patch b/xsa418-4.16-07.patch deleted file mode 100644 index 9600d8c..0000000 --- a/xsa418-4.16-07.patch +++ /dev/null @@ -1,163 +0,0 @@ -From 1ee281b18b52bec87335ea64ee74cc159e63d036 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:12 +0200 -Subject: tools/xenstore: use treewalk for creating node records - 
-Instead of doing an open tree walk using call recursion, use -walk_node_tree() when creating the node records during a live update. - -This will reduce code size and avoid many nesting levels of function -calls which could potentially exhaust the stack. - -This is part of XSA-418 / CVE-2022-42321. - -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 58fb651542ec..05d349778bb4 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -3120,101 +3120,76 @@ const char *dump_state_node_perms(FILE *fp, const struct xs_permissions *perms, - return NULL; - } - --static const char *dump_state_node_tree(FILE *fp, char *path, -- unsigned int path_max_len) -+struct dump_node_data { -+ FILE *fp; -+ const char *err; -+}; -+ -+static int dump_state_node_err(struct dump_node_data *data, const char *err) - { -- unsigned int pathlen, childlen, p = 0; -+ data->err = err; -+ return WALK_TREE_ERROR_STOP; -+} -+ -+static int dump_state_node(const void *ctx, struct connection *conn, -+ struct node *node, void *arg) -+{ -+ struct dump_node_data *data = arg; -+ FILE *fp = data->fp; -+ unsigned int pathlen; - struct xs_state_record_header head; - struct xs_state_node sn; -- TDB_DATA key, data; -- const struct xs_tdb_record_hdr *hdr; -- const char *child; - const char *ret; - -- pathlen = strlen(path) + 1; -- -- set_tdb_key(path, &key); -- data = tdb_fetch(tdb_ctx, key); -- if (data.dptr == NULL) -- return "Error reading node"; -- -- /* Clean up in case of failure. 
*/ -- talloc_steal(path, data.dptr); -- -- hdr = (void *)data.dptr; -+ pathlen = strlen(node->name) + 1; - - head.type = XS_STATE_TYPE_NODE; - head.length = sizeof(sn); - sn.conn_id = 0; - sn.ta_id = 0; - sn.ta_access = 0; -- sn.perm_n = hdr->num_perms; -+ sn.perm_n = node->perms.num; - sn.path_len = pathlen; -- sn.data_len = hdr->datalen; -- head.length += hdr->num_perms * sizeof(*sn.perms); -+ sn.data_len = node->datalen; -+ head.length += node->perms.num * sizeof(*sn.perms); - head.length += pathlen; -- head.length += hdr->datalen; -+ head.length += node->datalen; - head.length = ROUNDUP(head.length, 3); - - if (fwrite(&head, sizeof(head), 1, fp) != 1) -- return "Dump node state error"; -+ return dump_state_node_err(data, "Dump node head error"); - if (fwrite(&sn, sizeof(sn), 1, fp) != 1) -- return "Dump node state error"; -+ return dump_state_node_err(data, "Dump node state error"); - -- ret = dump_state_node_perms(fp, hdr->perms, hdr->num_perms); -+ ret = dump_state_node_perms(fp, node->perms.p, node->perms.num); - if (ret) -- return ret; -+ return dump_state_node_err(data, ret); -+ -+ if (fwrite(node->name, pathlen, 1, fp) != 1) -+ return dump_state_node_err(data, "Dump node path error"); - -- if (fwrite(path, pathlen, 1, fp) != 1) -- return "Dump node path error"; -- if (hdr->datalen && -- fwrite(hdr->perms + hdr->num_perms, hdr->datalen, 1, fp) != 1) -- return "Dump node data error"; -+ if (node->datalen && fwrite(node->data, node->datalen, 1, fp) != 1) -+ return dump_state_node_err(data, "Dump node data error"); - - ret = dump_state_align(fp); - if (ret) -- return ret; -+ return dump_state_node_err(data, ret); - -- child = (char *)(hdr->perms + hdr->num_perms) + hdr->datalen; -- -- /* -- * Use path for constructing children paths. -- * As we don't write out nodes without having written their parent -- * already we will never clobber a part of the path we'll need later. 
-- */ -- pathlen--; -- if (path[pathlen - 1] != '/') { -- path[pathlen] = '/'; -- pathlen++; -- } -- while (p < hdr->childlen) { -- childlen = strlen(child) + 1; -- if (pathlen + childlen > path_max_len) -- return "Dump node path length error"; -- strcpy(path + pathlen, child); -- ret = dump_state_node_tree(fp, path, path_max_len); -- if (ret) -- return ret; -- p += childlen; -- child += childlen; -- } -- -- talloc_free(data.dptr); -- -- return NULL; -+ return WALK_TREE_OK; - } - - const char *dump_state_nodes(FILE *fp, const void *ctx) - { -- char *path; -- -- path = talloc_size(ctx, XENSTORE_ABS_PATH_MAX + 1); -- if (!path) -- return "Path buffer allocation error"; -+ struct dump_node_data data = { -+ .fp = fp, -+ .err = "Dump node walk error" -+ }; -+ struct walk_funcs walkfuncs = { .enter = dump_state_node }; - -- strcpy(path, "/"); -+ if (walk_node_tree(ctx, NULL, "/", &walkfuncs, &data)) -+ return data.err; - -- return dump_state_node_tree(fp, path, XENSTORE_ABS_PATH_MAX + 1); -+ return NULL; - } - - void read_state_global(const void *ctx, const void *state) diff --git a/xsa419-oxenstored.patch b/xsa419-oxenstored.patch deleted file mode 100644 index 0ac365d..0000000 --- a/xsa419-oxenstored.patch +++ /dev/null @@ -1,85 +0,0 @@ -From 09228369a549427294febe351372d7227e624da1 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= -Date: Wed, 12 Oct 2022 19:13:06 +0100 -Subject: tools/ocaml/xenstored: Fix quota bypass on domain shutdown -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -XSA-322 fixed a domid reuse vulnerability by assigning Dom0 as the owner of -any nodes left after a domain is shutdown (e.g. outside its /local/domain/N -tree). - -However Dom0 has no quota on purpose, so this opened up another potential -attack vector. Avoid it by deleting these nodes instead of assigning them to -Dom0. - -This is part of XSA-419 / CVE-2022-42323. 
- -Reported-by: Juergen Gross -Fixes: c46eff921209 ("tools/ocaml/xenstored: clean up permissions for dead domains") -Signed-off-by: Edwin Török -Acked-by: Christian Lindig - -diff --git a/tools/ocaml/xenstored/perms.ml b/tools/ocaml/xenstored/perms.ml -index e8a16221f8fa..84f2503e8e29 100644 ---- a/tools/ocaml/xenstored/perms.ml -+++ b/tools/ocaml/xenstored/perms.ml -@@ -64,8 +64,7 @@ let get_owner perm = perm.owner - * *) - let remove_domid ~domid perm = - let acl = List.filter (fun (acl_domid, _) -> acl_domid <> domid) perm.acl in -- let owner = if perm.owner = domid then 0 else perm.owner in -- { perm with acl; owner } -+ if perm.owner = domid then None else Some { perm with acl; owner = perm.owner } - - let default0 = create 0 NONE [] - -diff --git a/tools/ocaml/xenstored/store.ml b/tools/ocaml/xenstored/store.ml -index 20e67b142746..70f0c83de404 100644 ---- a/tools/ocaml/xenstored/store.ml -+++ b/tools/ocaml/xenstored/store.ml -@@ -87,10 +87,21 @@ let check_owner node connection = - - let rec recurse fct node = fct node; SymbolMap.iter (fun _ -> recurse fct) node.children - --(** [recurse_map f tree] applies [f] on each node in the tree recursively *) --let recurse_map f = -+(** [recurse_filter_map f tree] applies [f] on each node in the tree recursively, -+ possibly removing some nodes. -+ Note that the nodes removed this way won't generate watch events. 
-+*) -+let recurse_filter_map f = -+ let invalid = -1 in -+ let is_valid _ node = node.perms.owner <> invalid in - let rec walk node = -- f { node with children = SymbolMap.map walk node.children } -+ (* Map.filter_map is Ocaml 4.11+ only *) -+ let node = -+ { node with children = -+ SymbolMap.map walk node.children |> SymbolMap.filter is_valid } in -+ match f node with -+ | Some keep -> keep -+ | None -> { node with perms = {node.perms with owner = invalid } } - in - walk - -@@ -444,11 +455,13 @@ let setperms store perm path nperms = - - let reset_permissions store domid = - Logging.info "store|node" "Cleaning up xenstore ACLs for domid %d" domid; -- store.root <- Node.recurse_map (fun node -> -- let perms = Perms.Node.remove_domid ~domid node.perms in -- if perms <> node.perms then -- Logging.debug "store|node" "Changed permissions for node %s" (Node.get_name node); -- { node with perms } -+ store.root <- Node.recurse_filter_map (fun node -> -+ match Perms.Node.remove_domid ~domid node.perms with -+ | None -> None -+ | Some perms -> -+ if perms <> node.perms then -+ Logging.debug "store|node" "Changed permissions for node %s" (Node.get_name node); -+ Some { node with perms } - ) store.root - - type ops = { diff --git a/xsa419-xenstored-01.patch b/xsa419-xenstored-01.patch deleted file mode 100644 index 3409790..0000000 --- a/xsa419-xenstored-01.patch +++ /dev/null @@ -1,289 +0,0 @@ -From e4250bf8b39ed73623c75b0f1436ac7c1d45aba7 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:12 +0200 -Subject: tools/xenstore: remove nodes owned by destroyed domain - -In case a domain is removed from Xenstore, remove all nodes owned by -it per default. - -This tackles the problem that nodes might be created by a domain -outside its home path in Xenstore, leading to Xenstore hogging more -and more memory. Domain quota don't work in this case if the guest is -rebooting in between. 
- -Since XSA-322 ownership of such stale nodes is transferred to dom0, -which is helping against unintended access, but not against OOM of -Xenstore. - -As a fallback for weird cases add a Xenstore start parameter for -keeping today's way to handle stale nodes, adding the risk of Xenstore -hitting an OOM situation. - -This is part of XSA-419 / CVE-2022-42322. - -Reported-by: Juergen Gross -Fixes: 496306324d8d ("tools/xenstore: revoke access rights for removed domains") -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index bdc14679adf5..13e48aaa731c 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -80,6 +80,7 @@ static bool verbose = false; - LIST_HEAD(connections); - int tracefd = -1; - static bool recovery = true; -+bool keep_orphans = false; - static int reopen_log_pipe[2]; - static int reopen_log_pipe0_pollfd_idx = -1; - char *tracefile = NULL; -@@ -753,7 +754,7 @@ struct node *read_node(struct connection *conn, const void *ctx, - node->perms.p = hdr->perms; - node->acc.domid = node->perms.p[0].id; - node->acc.memory = data.dsize; -- if (domain_adjust_node_perms(conn, node)) -+ if (domain_adjust_node_perms(node)) - goto error; - - /* If owner is gone reset currently accounted memory size. 
*/ -@@ -796,7 +797,7 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - void *p; - struct xs_tdb_record_hdr *hdr; - -- if (domain_adjust_node_perms(conn, node)) -+ if (domain_adjust_node_perms(node)) - return errno; - - data.dsize = sizeof(*hdr) -@@ -1647,7 +1648,7 @@ static int delnode_sub(const void *ctx, struct connection *conn, - return WALK_TREE_RM_CHILDENTRY; - } - --static int _rm(struct connection *conn, const void *ctx, const char *name) -+int rm_node(struct connection *conn, const void *ctx, const char *name) - { - struct node *parent; - char *parentname = get_parent(ctx, name); -@@ -1711,7 +1712,7 @@ static int do_rm(const void *ctx, struct connection *conn, - if (streq(name, "/")) - return EINVAL; - -- ret = _rm(conn, ctx, name); -+ ret = rm_node(conn, ctx, name); - if (ret) - return ret; - -@@ -2618,6 +2619,8 @@ static void usage(void) - " -R, --no-recovery to request that no recovery should be attempted when\n" - " the store is corrupted (debug only),\n" - " -I, --internal-db store database in memory, not on disk\n" -+" -K, --keep-orphans don't delete nodes owned by a domain when the\n" -+" domain is deleted (this is a security risk!)\n" - " -V, --verbose to request verbose execution.\n"); - } - -@@ -2642,6 +2645,7 @@ static struct option options[] = { - { "timeout", 1, NULL, 'w' }, - { "no-recovery", 0, NULL, 'R' }, - { "internal-db", 0, NULL, 'I' }, -+ { "keep-orphans", 0, NULL, 'K' }, - { "verbose", 0, NULL, 'V' }, - { "watch-nb", 1, NULL, 'W' }, - #ifndef NO_LIVE_UPDATE -@@ -2721,7 +2725,7 @@ int main(int argc, char *argv[]) - orig_argc = argc; - orig_argv = argv; - -- while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:A:M:Q:q:T:RVW:w:U", -+ while ((opt = getopt_long(argc, argv, "DE:F:HKNPS:t:A:M:Q:q:T:RVW:w:U", - options, NULL)) != -1) { - switch (opt) { - case 'D': -@@ -2757,6 +2761,9 @@ int main(int argc, char *argv[]) - case 'I': - tdb_flags = TDB_INTERNAL|TDB_NOLOCK; - break; -+ case 'K': -+ keep_orphans = 
true; -+ break; - case 'V': - verbose = true; - break; -diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h -index acb00ad96914..37006d508dbf 100644 ---- a/tools/xenstore/xenstored_core.h -+++ b/tools/xenstore/xenstored_core.h -@@ -240,6 +240,9 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - struct node *read_node(struct connection *conn, const void *ctx, - const char *name); - -+/* Remove a node and its children. */ -+int rm_node(struct connection *conn, const void *ctx, const char *name); -+ - void setup_structure(bool live_update); - struct connection *new_connection(const struct interface_funcs *funcs); - struct connection *get_connection_by_id(unsigned int conn_id); -@@ -284,6 +287,7 @@ extern int quota_req_outstanding; - extern int quota_trans_nodes; - extern int quota_memory_per_domain_soft; - extern int quota_memory_per_domain_hard; -+extern bool keep_orphans; - - extern unsigned int timeout_watch_event_msec; - -diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c -index 98b401fdec30..84b7817cd5e6 100644 ---- a/tools/xenstore/xenstored_domain.c -+++ b/tools/xenstore/xenstored_domain.c -@@ -227,10 +227,64 @@ static void unmap_interface(void *interface) - xengnttab_unmap(*xgt_handle, interface, 1); - } - -+static int domain_tree_remove_sub(const void *ctx, struct connection *conn, -+ struct node *node, void *arg) -+{ -+ struct domain *domain = arg; -+ TDB_DATA key; -+ int ret = WALK_TREE_OK; -+ -+ if (node->perms.p[0].id != domain->domid) -+ return WALK_TREE_OK; -+ -+ if (keep_orphans) { -+ set_tdb_key(node->name, &key); -+ domain->nbentry--; -+ node->perms.p[0].id = priv_domid; -+ node->acc.memory = 0; -+ domain_entry_inc(NULL, node); -+ if (write_node_raw(NULL, &key, node, true)) { -+ /* That's unfortunate. We only can try to continue. 
*/ -+ syslog(LOG_ERR, -+ "error when moving orphaned node %s to dom0\n", -+ node->name); -+ } else -+ trace("orphaned node %s moved to dom0\n", node->name); -+ } else { -+ if (rm_node(NULL, ctx, node->name)) { -+ /* That's unfortunate. We only can try to continue. */ -+ syslog(LOG_ERR, -+ "error when deleting orphaned node %s\n", -+ node->name); -+ } else -+ trace("orphaned node %s deleted\n", node->name); -+ -+ /* Skip children in all cases in order to avoid more errors. */ -+ ret = WALK_TREE_SKIP_CHILDREN; -+ } -+ -+ return domain->nbentry > 0 ? ret : WALK_TREE_SUCCESS_STOP; -+} -+ -+static void domain_tree_remove(struct domain *domain) -+{ -+ int ret; -+ struct walk_funcs walkfuncs = { .enter = domain_tree_remove_sub }; -+ -+ if (domain->nbentry > 0) { -+ ret = walk_node_tree(domain, NULL, "/", &walkfuncs, domain); -+ if (ret == WALK_TREE_ERROR_STOP) -+ syslog(LOG_ERR, -+ "error when looking for orphaned nodes\n"); -+ } -+} -+ - static int destroy_domain(void *_domain) - { - struct domain *domain = _domain; - -+ domain_tree_remove(domain); -+ - list_del(&domain->list); - - if (!domain->introduced) -@@ -883,15 +937,15 @@ int domain_entry_inc(struct connection *conn, struct node *node) - struct domain *d; - unsigned int domid; - -- if (!conn) -+ if (!node->perms.p) - return 0; - -- domid = node->perms.p ? node->perms.p[0].id : conn->id; -+ domid = node->perms.p[0].id; - -- if (conn->transaction) { -+ if (conn && conn->transaction) { - transaction_entry_inc(conn->transaction, domid); - } else { -- d = (domid == conn->id && conn->domain) ? conn->domain -+ d = (conn && domid == conn->id && conn->domain) ? conn->domain - : find_or_alloc_existing_domain(domid); - if (d) - d->nbentry++; -@@ -952,23 +1006,11 @@ int domain_alloc_permrefs(struct node_perms *perms) - * Remove permissions for no longer existing domains in order to avoid a new - * domain with the same domid inheriting the permissions. 
- */ --int domain_adjust_node_perms(struct connection *conn, struct node *node) -+int domain_adjust_node_perms(struct node *node) - { - unsigned int i; - int ret; - -- ret = chk_domain_generation(node->perms.p[0].id, node->generation); -- -- /* If the owner doesn't exist any longer give it to priv domain. */ -- if (!ret) { -- /* -- * In theory we'd need to update the number of dom0 nodes here, -- * but we could be called for a read of the node. So better -- * avoid the risk to overflow the node count of dom0. -- */ -- node->perms.p[0].id = priv_domid; -- } -- - for (i = 1; i < node->perms.num; i++) { - if (node->perms.p[i].perms & XS_PERM_IGNORE) - continue; -@@ -986,15 +1028,15 @@ void domain_entry_dec(struct connection *conn, struct node *node) - struct domain *d; - unsigned int domid; - -- if (!conn) -+ if (!node->perms.p) - return; - - domid = node->perms.p ? node->perms.p[0].id : conn->id; - -- if (conn->transaction) { -+ if (conn && conn->transaction) { - transaction_entry_dec(conn->transaction, domid); - } else { -- d = (domid == conn->id && conn->domain) ? conn->domain -+ d = (conn && domid == conn->id && conn->domain) ? conn->domain - : find_domain_struct(domid); - if (d) { - d->nbentry--; -@@ -1113,7 +1155,7 @@ int domain_memory_add(unsigned int domid, int mem, bool no_quota_check) - * exist, as accounting is done either for a domain related to - * the current connection, or for the domain owning a node - * (which is always existing, as the owner of the node is -- * tested to exist and replaced by domid 0 if not). -+ * tested to exist and deleted or replaced by domid 0 if not). - * So not finding the related domain MUST be an error in the - * data base. 
- */ -diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h -index 7fe0a21d9e45..b38c82991dc6 100644 ---- a/tools/xenstore/xenstored_domain.h -+++ b/tools/xenstore/xenstored_domain.h -@@ -62,7 +62,7 @@ const char *get_implicit_path(const struct connection *conn); - bool domain_is_unprivileged(struct connection *conn); - - /* Remove node permissions for no longer existing domains. */ --int domain_adjust_node_perms(struct connection *conn, struct node *node); -+int domain_adjust_node_perms(struct node *node); - int domain_alloc_permrefs(struct node_perms *perms); - - /* Quota manipulation */ diff --git a/xsa419-xenstored-02.patch b/xsa419-xenstored-02.patch deleted file mode 100644 index e5d46a2..0000000 --- a/xsa419-xenstored-02.patch +++ /dev/null @@ -1,93 +0,0 @@ -From 929da557efea6c7d2340467d9a7fdae7fda6d2b1 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:13 +0200 -Subject: tools/xenstore: make the internal memory data base the default - -Having a file backed data base has the only advantage of being capable -to dump the contents of it while Xenstore is running, and potentially -using less swap space in case the data base can't be kept in memory. - -It has the major disadvantage of a huge performance overhead: switching -to keep the data base in memory only speeds up live update of xenstored -with 120000 nodes from 20 minutes to 11 seconds. A complete tree walk -of this configuration will be reduced from 7 seconds to 280 msecs -(measured by "xenstore-control check"). - -So make the internal memory data base the default and enhance the -"--internal-db" command line parameter to take an optional parameter -allowing to switch the internal data base back to the file based one. - -This is part of XSA-419. 
- -Reported-by: Juergen Gross -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall - -diff --git a/tools/helpers/init-xenstore-domain.c b/tools/helpers/init-xenstore-domain.c -index 2d9ab6f1c583..04e351ca29a8 100644 ---- a/tools/helpers/init-xenstore-domain.c -+++ b/tools/helpers/init-xenstore-domain.c -@@ -222,9 +222,9 @@ static int build(xc_interface *xch) - } - - if ( param ) -- snprintf(cmdline, 512, "--event %d --internal-db %s", rv, param); -+ snprintf(cmdline, 512, "--event %d %s", rv, param); - else -- snprintf(cmdline, 512, "--event %d --internal-db", rv); -+ snprintf(cmdline, 512, "--event %d", rv); - - dom->guest_domid = domid; - dom->cmdline = xc_dom_strdup(dom, cmdline); -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 13e48aaa731c..36fb4a832834 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -2308,7 +2308,7 @@ static void accept_connection(int sock) - } - #endif - --static int tdb_flags; -+static int tdb_flags = TDB_INTERNAL | TDB_NOLOCK; - - /* We create initial nodes manually. 
*/ - static void manual_node(const char *name, const char *child) -@@ -2618,7 +2618,8 @@ static void usage(void) - " watch-event: time a watch-event is kept pending\n" - " -R, --no-recovery to request that no recovery should be attempted when\n" - " the store is corrupted (debug only),\n" --" -I, --internal-db store database in memory, not on disk\n" -+" -I, --internal-db [on|off] store database in memory, not on disk, default is\n" -+" memory, with \"--internal-db off\" it is on disk\n" - " -K, --keep-orphans don't delete nodes owned by a domain when the\n" - " domain is deleted (this is a security risk!)\n" - " -V, --verbose to request verbose execution.\n"); -@@ -2644,7 +2645,7 @@ static struct option options[] = { - { "quota-soft", 1, NULL, 'q' }, - { "timeout", 1, NULL, 'w' }, - { "no-recovery", 0, NULL, 'R' }, -- { "internal-db", 0, NULL, 'I' }, -+ { "internal-db", 2, NULL, 'I' }, - { "keep-orphans", 0, NULL, 'K' }, - { "verbose", 0, NULL, 'V' }, - { "watch-nb", 1, NULL, 'W' }, -@@ -2725,7 +2726,8 @@ int main(int argc, char *argv[]) - orig_argc = argc; - orig_argv = argv; - -- while ((opt = getopt_long(argc, argv, "DE:F:HKNPS:t:A:M:Q:q:T:RVW:w:U", -+ while ((opt = getopt_long(argc, argv, -+ "DE:F:HI::KNPS:t:A:M:Q:q:T:RVW:w:U", - options, NULL)) != -1) { - switch (opt) { - case 'D': -@@ -2759,7 +2761,8 @@ int main(int argc, char *argv[]) - tracefile = optarg; - break; - case 'I': -- tdb_flags = TDB_INTERNAL|TDB_NOLOCK; -+ if (optarg && !strcmp(optarg, "off")) -+ tdb_flags = 0; - break; - case 'K': - keep_orphans = true; diff --git a/xsa419-xenstored-03.patch b/xsa419-xenstored-03.patch deleted file mode 100644 index 38130c0..0000000 --- a/xsa419-xenstored-03.patch +++ /dev/null @@ -1,44 +0,0 @@ -From 54e63b7e7c42e4f975163809a01574e78552a6ab Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:13 +0200 -Subject: docs: enhance xenstore.txt with permissions description -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 
-Content-Transfer-Encoding: 8bit - -The permission scheme of Xenstore nodes is not really covered by -docs/misc/xenstore.txt, other than referring to the Xen wiki. - -Add a paragraph explaining the permissions of nodes, and especially -mentioning removal of nodes when a domain has been removed from -Xenstore. - -This is part of XSA-419. - -Reported-by: Juergen Gross -Signed-off-by: Juergen Gross -Reviewed-by: Edwin Török -Acked-by: Julien Grall - -diff --git a/docs/misc/xenstore.txt b/docs/misc/xenstore.txt -index 988ef89cba2d..44428ae3a755 100644 ---- a/docs/misc/xenstore.txt -+++ b/docs/misc/xenstore.txt -@@ -43,6 +43,17 @@ bytes are forbidden; clients specifying relative paths should keep - them to within 2048 bytes. (See XENSTORE_*_PATH_MAX in xs_wire.h.) - - -+Each node has one or multiple permission entries. Permissions are -+granted by domain-id, the first permission entry of each node specifies -+the owner of the node. Permissions of a node can be changed by the -+owner of the node, the owner can only be modified by the control -+domain (usually domain id 0). The owner always has the right to read -+and write the node, while other permissions can be setup to allow -+read and/or write access. When a domain is being removed from Xenstore -+nodes owned by that domain will be removed together with all of those -+nodes' children. 
-+ -+ - Communication with xenstore is via either sockets, or event channel - and shared memory, as specified in io/xs_wire.h: each message in - either direction is a header formatted as a struct xsd_sockmsg diff --git a/xsa420.patch b/xsa420.patch deleted file mode 100644 index 5d00dc2..0000000 --- a/xsa420.patch +++ /dev/null @@ -1,68 +0,0 @@ -From 210879456769ca211c6630f47399ca7a61a37f35 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= -Date: Wed, 12 Oct 2022 19:13:05 +0100 -Subject: tools/ocaml: Ensure packet size is never negative -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Integers in Ocaml have 63 or 31 bits of signed precision. - -On 64-bit builds of Ocaml, this is fine because a C uint32_t always fits -within a 63-bit signed integer. - -In 32-bit builds of Ocaml, this goes wrong. The C uint32_t is truncated -first (loses the top bit), then has a unsigned/signed mismatch. - -A "negative" value (i.e. a packet on the ring of between 1G and 2G in size) -will trigger an exception later in Bytes.make in xb.ml, and because the packet -is not removed from the ring, the exception re-triggers on every subsequent -query, creating a livelock. - -Fix both the source of the exception in Xb, and as defence in depth, mark the -domain as bad for any Invalid_argument exceptions to avoid the risk of -livelock. - -This is XSA-420 / CVE-2022-42324. 
- -Reported-by: Juergen Gross -Signed-off-by: Edwin Török -Acked-by: Christian Lindig - -diff --git a/tools/ocaml/libs/xb/partial.ml b/tools/ocaml/libs/xb/partial.ml -index b6e2a716e263..3aa8927eb7f0 100644 ---- a/tools/ocaml/libs/xb/partial.ml -+++ b/tools/ocaml/libs/xb/partial.ml -@@ -36,7 +36,7 @@ let of_string s = - This will leave the guest connection is a bad state and will - be hard to recover from without restarting the connection - (ie rebooting the guest) *) -- let dlen = min xenstore_payload_max dlen in -+ let dlen = max 0 (min xenstore_payload_max dlen) in - { - tid = tid; - rid = rid; -@@ -46,8 +46,8 @@ let of_string s = - } - - let append pkt s sz = -- if pkt.len > 4096 then failwith "Buffer.add: cannot grow buffer"; -- Buffer.add_string pkt.buf (String.sub s 0 sz) -+ if Buffer.length pkt.buf + sz > xenstore_payload_max then failwith "Buffer.add: cannot grow buffer"; -+ Buffer.add_substring pkt.buf s 0 sz - - let to_complete pkt = - pkt.len - (Buffer.length pkt.buf) -diff --git a/tools/ocaml/xenstored/process.ml b/tools/ocaml/xenstored/process.ml -index 5f439fe59f47..f3a71b24ad94 100644 ---- a/tools/ocaml/xenstored/process.ml -+++ b/tools/ocaml/xenstored/process.ml -@@ -722,7 +722,7 @@ let do_input store cons doms con = - History.reconnect con; - info "%s reconnection complete" (Connection.get_domstr con); - None -- | Failure exp -> -+ | Invalid_argument exp | Failure exp -> - error "caught exception %s" exp; - error "got a bad client %s" (sprintf "%-8s" (Connection.get_domstr con)); - Connection.mark_as_bad con; diff --git a/xsa421-01.patch b/xsa421-01.patch deleted file mode 100644 index 10960ba..0000000 --- a/xsa421-01.patch +++ /dev/null @@ -1,40 +0,0 @@ -From 9166869e7e6530befddfd8bb46ff37436a38efc1 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:13 +0200 -Subject: tools/xenstore: fix deleting node in transaction - -In case a node has been created in a transaction and it is later -deleted in the same transaction, the 
transaction will be terminated -with an error. - -As this error is encountered only when handling the deleted node at -transaction finalization, the transaction will have been performed -partially and without updating the accounting information. This will -enable a malicious guest to create arbitrary number of nodes. - -This is part of XSA-421 / CVE-2022-42325. - -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Tested-by: Julien Grall -Reviewed-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index 3e3eb47326cc..7ffe21bb5285 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -418,7 +418,13 @@ static int finalize_transaction(struct connection *conn, - true); - talloc_free(data.dptr); - } else { -- ret = do_tdb_delete(conn, &key, NULL); -+ /* -+ * A node having been created and later deleted -+ * in this transaction will have no generation -+ * information stored. -+ */ -+ ret = (i->generation == NO_GENERATION) -+ ? 0 : do_tdb_delete(conn, &key, NULL); - } - if (ret) - goto err; diff --git a/xsa421-02.patch b/xsa421-02.patch deleted file mode 100644 index 1a0b5cd..0000000 --- a/xsa421-02.patch +++ /dev/null @@ -1,401 +0,0 @@ -From 09fc22fea8a4689c5e563ba4a2fa959282071792 Mon Sep 17 00:00:00 2001 -From: Juergen Gross -Date: Tue, 13 Sep 2022 07:35:14 +0200 -Subject: tools/xenstore: harden transaction finalization against errors - -When finalizing a transaction, any error occurring after checking for -conflicts will result in the transaction being performed only -partially today. Additionally accounting data will not be updated at -the end of the transaction, which might result in further problems -later. 
- -Avoid those problems by multiple modifications: - -- free any transaction specific nodes which don't need to be committed - as they haven't been written during the transaction as soon as their - generation count has been verified, this will reduce the risk of - out-of-memory situations - -- store the transaction specific node name in struct accessed_node in - order to avoid the need to allocate additional memory for it when - finalizing the transaction - -- don't stop the transaction finalization when hitting an error - condition, but try to continue to handle all modified nodes - -- in case of a detected error do the accounting update as needed and - call the data base checking only after that - -- if writing a node in a transaction is failing (e.g. due to a failed - quota check), fail the transaction, as prior changes to struct - accessed_node can't easily be undone in that case - -This is part of XSA-421 / CVE-2022-42326. - -Reported-by: Julien Grall -Signed-off-by: Juergen Gross -Reviewed-by: Julien Grall -Tested-by: Julien Grall - -diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c -index 36fb4a832834..476d5c6d51bd 100644 ---- a/tools/xenstore/xenstored_core.c -+++ b/tools/xenstore/xenstored_core.c -@@ -723,8 +723,7 @@ struct node *read_node(struct connection *conn, const void *ctx, - return NULL; - } - -- if (transaction_prepend(conn, name, &key)) -- return NULL; -+ transaction_prepend(conn, name, &key); - - data = tdb_fetch(tdb_ctx, key); - -@@ -842,10 +841,21 @@ int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node, - static int write_node(struct connection *conn, struct node *node, - bool no_quota_check) - { -+ int ret; -+ - if (access_node(conn, node, NODE_ACCESS_WRITE, &node->key)) - return errno; - -- return write_node_raw(conn, &node->key, node, no_quota_check); -+ ret = write_node_raw(conn, &node->key, node, no_quota_check); -+ if (ret && conn && conn->transaction) { -+ /* -+ * Reverting 
access_node() is hard, so just fail the -+ * transaction. -+ */ -+ fail_transaction(conn->transaction); -+ } -+ -+ return ret; - } - - unsigned int perm_for_conn(struct connection *conn, -diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c -index 7ffe21bb5285..ac854197cadb 100644 ---- a/tools/xenstore/xenstored_transaction.c -+++ b/tools/xenstore/xenstored_transaction.c -@@ -114,7 +114,8 @@ struct accessed_node - struct list_head list; - - /* The name of the node. */ -- char *node; -+ char *trans_name; /* Transaction specific name. */ -+ char *node; /* Main data base name. */ - - /* Generation count (or NO_GENERATION) for conflict checking. */ - uint64_t generation; -@@ -199,25 +200,20 @@ static char *transaction_get_node_name(void *ctx, struct transaction *trans, - * Prepend the transaction to name if node has been modified in the current - * transaction. - */ --int transaction_prepend(struct connection *conn, const char *name, -- TDB_DATA *key) -+void transaction_prepend(struct connection *conn, const char *name, -+ TDB_DATA *key) - { -- char *tdb_name; -+ struct accessed_node *i; - -- if (!conn || !conn->transaction || -- !find_accessed_node(conn->transaction, name)) { -- set_tdb_key(name, key); -- return 0; -+ if (conn && conn->transaction) { -+ i = find_accessed_node(conn->transaction, name); -+ if (i) { -+ set_tdb_key(i->trans_name, key); -+ return; -+ } - } - -- tdb_name = transaction_get_node_name(conn->transaction, -- conn->transaction, name); -- if (!tdb_name) -- return errno; -- -- set_tdb_key(tdb_name, key); -- -- return 0; -+ set_tdb_key(name, key); - } - - /* -@@ -240,7 +236,6 @@ int access_node(struct connection *conn, struct node *node, - struct accessed_node *i = NULL; - struct transaction *trans; - TDB_DATA local_key; -- const char *trans_name = NULL; - int ret; - bool introduce = false; - -@@ -259,10 +254,6 @@ int access_node(struct connection *conn, struct node *node, - - trans = conn->transaction; - -- 
trans_name = transaction_get_node_name(node, trans, node->name); -- if (!trans_name) -- goto nomem; -- - i = find_accessed_node(trans, node->name); - if (!i) { - if (trans->nodes >= quota_trans_nodes && -@@ -273,9 +264,10 @@ int access_node(struct connection *conn, struct node *node, - i = talloc_zero(trans, struct accessed_node); - if (!i) - goto nomem; -- i->node = talloc_strdup(i, node->name); -- if (!i->node) -+ i->trans_name = transaction_get_node_name(i, trans, node->name); -+ if (!i->trans_name) - goto nomem; -+ i->node = strchr(i->trans_name, '/') + 1; - if (node->generation != NO_GENERATION && node->perms.num) { - i->perms.p = talloc_array(i, struct xs_permissions, - node->perms.num); -@@ -302,7 +294,7 @@ int access_node(struct connection *conn, struct node *node, - i->generation = node->generation; - i->check_gen = true; - if (node->generation != NO_GENERATION) { -- set_tdb_key(trans_name, &local_key); -+ set_tdb_key(i->trans_name, &local_key); - ret = write_node_raw(conn, &local_key, node, true); - if (ret) - goto err; -@@ -321,7 +313,7 @@ int access_node(struct connection *conn, struct node *node, - return -1; - - if (key) { -- set_tdb_key(trans_name, key); -+ set_tdb_key(i->trans_name, key); - if (type == NODE_ACCESS_WRITE) - i->ta_node = true; - if (type == NODE_ACCESS_DELETE) -@@ -333,7 +325,6 @@ int access_node(struct connection *conn, struct node *node, - nomem: - ret = ENOMEM; - err: -- talloc_free((void *)trans_name); - talloc_free(i); - trans->fail = true; - errno = ret; -@@ -371,100 +362,90 @@ void queue_watches(struct connection *conn, const char *name, bool watch_exact) - * base. 
- */ - static int finalize_transaction(struct connection *conn, -- struct transaction *trans) -+ struct transaction *trans, bool *is_corrupt) - { -- struct accessed_node *i; -+ struct accessed_node *i, *n; - TDB_DATA key, ta_key, data; - struct xs_tdb_record_hdr *hdr; - uint64_t gen; -- char *trans_name; -- int ret; - -- list_for_each_entry(i, &trans->accessed, list) { -- if (!i->check_gen) -- continue; -+ list_for_each_entry_safe(i, n, &trans->accessed, list) { -+ if (i->check_gen) { -+ set_tdb_key(i->node, &key); -+ data = tdb_fetch(tdb_ctx, key); -+ hdr = (void *)data.dptr; -+ if (!data.dptr) { -+ if (tdb_error(tdb_ctx) != TDB_ERR_NOEXIST) -+ return EIO; -+ gen = NO_GENERATION; -+ } else -+ gen = hdr->generation; -+ talloc_free(data.dptr); -+ if (i->generation != gen) -+ return EAGAIN; -+ } - -- set_tdb_key(i->node, &key); -- data = tdb_fetch(tdb_ctx, key); -- hdr = (void *)data.dptr; -- if (!data.dptr) { -- if (tdb_error(tdb_ctx) != TDB_ERR_NOEXIST) -- return EIO; -- gen = NO_GENERATION; -- } else -- gen = hdr->generation; -- talloc_free(data.dptr); -- if (i->generation != gen) -- return EAGAIN; -+ /* Entries for unmodified nodes can be removed early. */ -+ if (!i->modified) { -+ if (i->ta_node) { -+ set_tdb_key(i->trans_name, &ta_key); -+ if (do_tdb_delete(conn, &ta_key, NULL)) -+ return EIO; -+ } -+ list_del(&i->list); -+ talloc_free(i); -+ } - } - - while ((i = list_top(&trans->accessed, struct accessed_node, list))) { -- trans_name = transaction_get_node_name(i, trans, i->node); -- if (!trans_name) -- /* We are doomed: the transaction is only partial. 
*/ -- goto err; -- -- set_tdb_key(trans_name, &ta_key); -- -- if (i->modified) { -- set_tdb_key(i->node, &key); -- if (i->ta_node) { -- data = tdb_fetch(tdb_ctx, ta_key); -- if (!data.dptr) -- goto err; -+ set_tdb_key(i->node, &key); -+ if (i->ta_node) { -+ set_tdb_key(i->trans_name, &ta_key); -+ data = tdb_fetch(tdb_ctx, ta_key); -+ if (data.dptr) { - hdr = (void *)data.dptr; - hdr->generation = ++generation; -- ret = do_tdb_write(conn, &key, &data, NULL, -- true); -+ *is_corrupt |= do_tdb_write(conn, &key, &data, -+ NULL, true); - talloc_free(data.dptr); -+ if (do_tdb_delete(conn, &ta_key, NULL)) -+ *is_corrupt = true; - } else { -- /* -- * A node having been created and later deleted -- * in this transaction will have no generation -- * information stored. -- */ -- ret = (i->generation == NO_GENERATION) -- ? 0 : do_tdb_delete(conn, &key, NULL); -- } -- if (ret) -- goto err; -- if (i->fire_watch) { -- fire_watches(conn, trans, i->node, NULL, -- i->watch_exact, -- i->perms.p ? &i->perms : NULL); -+ *is_corrupt = true; - } -+ } else { -+ /* -+ * A node having been created and later deleted -+ * in this transaction will have no generation -+ * information stored. -+ */ -+ *is_corrupt |= (i->generation == NO_GENERATION) -+ ? false -+ : do_tdb_delete(conn, &key, NULL); - } -+ if (i->fire_watch) -+ fire_watches(conn, trans, i->node, NULL, i->watch_exact, -+ i->perms.p ? 
&i->perms : NULL); - -- if (i->ta_node && do_tdb_delete(conn, &ta_key, NULL)) -- goto err; - list_del(&i->list); - talloc_free(i); - } - - return 0; -- --err: -- corrupt(conn, "Partial transaction"); -- return EIO; - } - - static int destroy_transaction(void *_transaction) - { - struct transaction *trans = _transaction; - struct accessed_node *i; -- char *trans_name; - TDB_DATA key; - - wrl_ntransactions--; - trace_destroy(trans, "transaction"); - while ((i = list_top(&trans->accessed, struct accessed_node, list))) { - if (i->ta_node) { -- trans_name = transaction_get_node_name(i, trans, -- i->node); -- if (trans_name) { -- set_tdb_key(trans_name, &key); -- do_tdb_delete(trans->conn, &key, NULL); -- } -+ set_tdb_key(i->trans_name, &key); -+ do_tdb_delete(trans->conn, &key, NULL); - } - list_del(&i->list); - talloc_free(i); -@@ -556,6 +537,7 @@ int do_transaction_end(const void *ctx, struct connection *conn, - { - const char *arg = onearg(in); - struct transaction *trans; -+ bool is_corrupt = false; - int ret; - - if (!arg || (!streq(arg, "T") && !streq(arg, "F"))) -@@ -579,13 +561,17 @@ int do_transaction_end(const void *ctx, struct connection *conn, - ret = transaction_fix_domains(trans, false); - if (ret) - return ret; -- if (finalize_transaction(conn, trans)) -- return EAGAIN; -+ ret = finalize_transaction(conn, trans, &is_corrupt); -+ if (ret) -+ return ret; - - wrl_apply_debit_trans_commit(conn); - - /* fix domain entry for each changed domain */ - transaction_fix_domains(trans, true); -+ -+ if (is_corrupt) -+ corrupt(conn, "transaction inconsistency"); - } - send_ack(conn, XS_TRANSACTION_END); - -@@ -660,7 +646,7 @@ int check_transactions(struct hashtable *hash) - struct connection *conn; - struct transaction *trans; - struct accessed_node *i; -- char *tname, *tnode; -+ char *tname; - - list_for_each_entry(conn, &connections, list) { - list_for_each_entry(trans, &conn->transaction_list, list) { -@@ -672,11 +658,8 @@ int check_transactions(struct hashtable 
*hash) - list_for_each_entry(i, &trans->accessed, list) { - if (!i->ta_node) - continue; -- tnode = transaction_get_node_name(tname, trans, -- i->node); -- if (!tnode || !remember_string(hash, tnode)) -+ if (!remember_string(hash, i->trans_name)) - goto nomem; -- talloc_free(tnode); - } - - talloc_free(tname); -diff --git a/tools/xenstore/xenstored_transaction.h b/tools/xenstore/xenstored_transaction.h -index 39d7f81c5127..3417303f9427 100644 ---- a/tools/xenstore/xenstored_transaction.h -+++ b/tools/xenstore/xenstored_transaction.h -@@ -48,8 +48,8 @@ int __must_check access_node(struct connection *conn, struct node *node, - void queue_watches(struct connection *conn, const char *name, bool watch_exact); - - /* Prepend the transaction to name if appropriate. */ --int transaction_prepend(struct connection *conn, const char *name, -- TDB_DATA *key); -+void transaction_prepend(struct connection *conn, const char *name, -+ TDB_DATA *key); - - /* Mark the transaction as failed. This will prevent it to be committed. */ - void fail_transaction(struct transaction *trans); diff --git a/xsa422-4.16-1.patch b/xsa422-4.16-1.patch deleted file mode 100644 index 1d36873..0000000 --- a/xsa422-4.16-1.patch +++ /dev/null @@ -1,70 +0,0 @@ -From: Andrew Cooper -Subject: x86/spec-ctrl: Enumeration for IBPB_RET - -The IBPB_RET bit indicates that the CPU's implementation of MSR_PRED_CMD.IBPB -does flush the RSB/RAS too. - -This is part of XSA-422 / CVE-2022-23824. 
- -Signed-off-by: Andrew Cooper -Acked-by: Jan Beulich - -diff --git a/tools/libs/light/libxl_cpuid.c b/tools/libs/light/libxl_cpuid.c -index bf6fdee360a9..691d5c6b2a68 100644 ---- a/tools/libs/light/libxl_cpuid.c -+++ b/tools/libs/light/libxl_cpuid.c -@@ -289,6 +289,7 @@ int libxl_cpuid_parse_config(libxl_cpuid_policy_list *cpuid, const char* str) - {"ssb-no", 0x80000008, NA, CPUID_REG_EBX, 26, 1}, - {"psfd", 0x80000008, NA, CPUID_REG_EBX, 28, 1}, - {"btc-no", 0x80000008, NA, CPUID_REG_EBX, 29, 1}, -+ {"ibpb-ret", 0x80000008, NA, CPUID_REG_EBX, 30, 1}, - - {"nc", 0x80000008, NA, CPUID_REG_ECX, 0, 8}, - {"apicidsize", 0x80000008, NA, CPUID_REG_ECX, 12, 4}, -diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c -index fe22f5f5b68b..cd094427dd4c 100644 ---- a/tools/misc/xen-cpuid.c -+++ b/tools/misc/xen-cpuid.c -@@ -159,6 +159,7 @@ static const char *const str_e8b[32] = - [24] = "amd-ssbd", [25] = "virt-ssbd", - [26] = "ssb-no", - [28] = "psfd", [29] = "btc-no", -+ [30] = "ibpb-ret", - }; - - static const char *const str_7d0[32] = -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 0f4bad3d3abb..16a562d3a172 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -419,7 +419,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - * Hardware read-only information, stating immunity to certain issues, or - * suggestions of which mitigation to use. - */ -- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", -+ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", - (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", - (caps & ARCH_CAPS_IBRS_ALL) ? " IBRS_ALL" : "", - (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", -@@ -436,7 +436,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps) - (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "", - (e8b & cpufeat_mask(X86_FEATURE_IBRS_SAME_MODE)) ? 
" IBRS_SAME_MODE" : "", -- (e8b & cpufeat_mask(X86_FEATURE_BTC_NO)) ? " BTC_NO" : ""); -+ (e8b & cpufeat_mask(X86_FEATURE_BTC_NO)) ? " BTC_NO" : "", -+ (e8b & cpufeat_mask(X86_FEATURE_IBPB_RET)) ? " IBPB_RET" : ""); - - /* Hardware features which need driving to mitigate issues. */ - printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n", -diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h -index e7b8167800a2..e0731221404c 100644 ---- a/xen/include/public/arch-x86/cpufeatureset.h -+++ b/xen/include/public/arch-x86/cpufeatureset.h -@@ -267,6 +267,7 @@ XEN_CPUFEATURE(VIRT_SSBD, 8*32+25) /* MSR_VIRT_SPEC_CTRL.SSBD */ - XEN_CPUFEATURE(SSB_NO, 8*32+26) /*A Hardware not vulnerable to SSB */ - XEN_CPUFEATURE(PSFD, 8*32+28) /*S MSR_SPEC_CTRL.PSFD */ - XEN_CPUFEATURE(BTC_NO, 8*32+29) /*A Hardware not vulnerable to Branch Type Confusion */ -+XEN_CPUFEATURE(IBPB_RET, 8*32+30) /*A IBPB clears RSB/RAS too. */ - - /* Intel-defined CPU features, CPUID level 0x00000007:0.edx, word 9 */ - XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /*A AVX512 Neural Network Instructions */ diff --git a/xsa422-4.16-2.patch b/xsa422-4.16-2.patch deleted file mode 100644 index a1a2f8d..0000000 --- a/xsa422-4.16-2.patch +++ /dev/null @@ -1,100 +0,0 @@ -From: Andrew Cooper -Subject: x86/spec-ctrl: Mitigate IBPB not flushing the RSB/RAS - -Introduce spec_ctrl_new_guest_context() to encapsulate all logic pertaining to -using MSR_PRED_CMD for a new guest context, even if it only has one user -presently. - -Introduce X86_BUG_IBPB_NO_RET, and use it extend spec_ctrl_new_guest_context() -with a manual fixup for hardware which mis-implements IBPB. - -This is part of XSA-422 / CVE-2022-23824. 
- -Signed-off-by: Andrew Cooper -Acked-by: Jan Beulich - -diff --git a/xen/arch/x86/asm-macros.c b/xen/arch/x86/asm-macros.c -index 7e536b0d82f5..891d86c7655c 100644 ---- a/xen/arch/x86/asm-macros.c -+++ b/xen/arch/x86/asm-macros.c -@@ -1,2 +1,3 @@ - #include - #include -+#include -diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c -index 3fab2364be8d..3080cde62b5b 100644 ---- a/xen/arch/x86/domain.c -+++ b/xen/arch/x86/domain.c -@@ -2092,7 +2092,7 @@ void context_switch(struct vcpu *prev, struct vcpu *next) - */ - if ( *last_id != next_id ) - { -- wrmsrl(MSR_PRED_CMD, PRED_CMD_IBPB); -+ spec_ctrl_new_guest_context(); - *last_id = next_id; - } - } -diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c -index 16a562d3a172..90d86fe5cb47 100644 ---- a/xen/arch/x86/spec_ctrl.c -+++ b/xen/arch/x86/spec_ctrl.c -@@ -805,6 +805,14 @@ static void __init ibpb_calculations(void) - } - - /* -+ * AMD/Hygon CPUs to date (June 2022) don't flush the the RAS. Future -+ * CPUs are expected to enumerate IBPB_RET when this has been fixed. -+ * Until then, cover the difference with the software sequence. -+ */ -+ if ( boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_IBPB_RET) ) -+ setup_force_cpu_cap(X86_BUG_IBPB_NO_RET); -+ -+ /* - * IBPB-on-entry mitigations for Branch Type Confusion. - * - * IBPB && !BTC_NO selects all AMD/Hygon hardware, not known to be safe, -diff --git a/xen/include/asm-x86/cpufeatures.h b/xen/include/asm-x86/cpufeatures.h -index 672c9ee22ba2..ecc1bb09505a 100644 ---- a/xen/include/asm-x86/cpufeatures.h -+++ b/xen/include/asm-x86/cpufeatures.h -@@ -49,6 +49,7 @@ XEN_CPUFEATURE(IBPB_ENTRY_HVM, X86_SYNTH(29)) /* MSR_PRED_CMD used by Xen for - #define X86_BUG_FPU_PTRS X86_BUG( 0) /* (F)X{SAVE,RSTOR} doesn't save/restore FOP/FIP/FDP. */ - #define X86_BUG_NULL_SEG X86_BUG( 1) /* NULL-ing a selector preserves the base and limit. 
*/ - #define X86_BUG_CLFLUSH_MFENCE X86_BUG( 2) /* MFENCE needed to serialise CLFLUSH */ -+#define X86_BUG_IBPB_NO_RET X86_BUG( 3) /* IBPB doesn't flush the RSB/RAS */ - - /* Total number of capability words, inc synth and bug words. */ - #define NCAPINTS (FSCAPINTS + X86_NR_SYNTH + X86_NR_BUG) /* N 32-bit words worth of info */ -diff --git a/xen/include/asm-x86/spec_ctrl.h b/xen/include/asm-x86/spec_ctrl.h -index 9403b81dc7af..6a77c3937844 100644 ---- a/xen/include/asm-x86/spec_ctrl.h -+++ b/xen/include/asm-x86/spec_ctrl.h -@@ -65,6 +65,28 @@ - void init_speculation_mitigations(void); - void spec_ctrl_init_domain(struct domain *d); - -+/* -+ * Switch to a new guest prediction context. -+ * -+ * This flushes all indirect branch predictors (BTB, RSB/RAS), so guest code -+ * which has previously run on this CPU can't attack subsequent guest code. -+ * -+ * As this flushes the RSB/RAS, it destroys the predictions of the calling -+ * context. For best performace, arrange for this to be used when we're going -+ * to jump out of the current context, e.g. with reset_stack_and_jump(). -+ * -+ * For hardware which mis-implements IBPB, fix up by flushing the RSB/RAS -+ * manually. -+ */ -+static always_inline void spec_ctrl_new_guest_context(void) -+{ -+ wrmsrl(MSR_PRED_CMD, PRED_CMD_IBPB); -+ -+ /* (ab)use alternative_input() to specify clobbers. */ -+ alternative_input("", "DO_OVERWRITE_RSB", X86_BUG_IBPB_NO_RET, -+ : "rax", "rcx"); -+} -+ - extern int8_t opt_ibpb_ctxt_switch; - extern bool opt_ssbd; - extern int8_t opt_eager_fpu;