diff --git a/main/xen/APKBUILD b/main/xen/APKBUILD
index 643d1bcc6d1..8845418d5cb 100644
--- a/main/xen/APKBUILD
+++ b/main/xen/APKBUILD
@@ -2,7 +2,7 @@
 # Maintainer: Natanael Copa <ncopa@alpinelinux.org>
 pkgname=xen
 pkgver=4.18.0
-pkgrel=4
+pkgrel=5
 pkgdesc="Xen hypervisor"
 url="https://www.xenproject.org/"
 arch="x86_64 armv7 aarch64"
@@ -364,6 +364,9 @@ options="!strip"
 #     - CVE-2023-46839 XSA-449
 #   4.18.0-r4:
 #     - CVE-2023-46841 XSA-451
+#   4.18.0-r5:
+#     - CVE-2023-28746 XSA-452
+#     - CVE-2024-2193 XSA-453
 
 case "$CARCH" in
 x86*)
@@ -409,10 +412,7 @@ source="https://downloads.xenproject.org/release/xen/$pkgver/xen-$pkgver.tar.gz
 	https://xenbits.xen.org/xen-extfiles/zlib-$_ZLIB_VERSION.tar.gz
 	https://xenbits.xen.org/xen-extfiles/ipxe-git-$_IPXE_GIT_TAG.tar.gz
 
-	xsa447.patch
-	xsa449.patch
-	xsa450.patch
-	xsa451-4.18.patch
+	xen-stable-4.18-20240312.patch
 
 	mini-os-__divmoddi4.patch
 	qemu-xen_paths.patch
@@ -701,10 +701,7 @@ qemu_openrc() {
 
 sha512sums="
 4cc9fd155144045a173c5f8ecc45f149817f1034eec618cb6f8b0494ef2fb5b95c4c60cf0bf4bec4bef8a622c35b6a3cb7dedc38e6d95e726f1611c73ddb3273  xen-4.18.0.tar.gz
-459e490b33b95202167862a84eadb656a418b252ffa786db05640f025886bf1e2a5c59387d4b99ced552ae316eb64b6f9888a850bf6860a115e7f3eabed52d20  xsa447.patch
-ea185b6f7ca375b49351a4006f22e449312e0a8180c93db2bb1aca43658de5abc8d1a21c1b6eedf320dd51a5e1475ace1652eddaacee28d36cc83d5beb05a918  xsa449.patch
-901359c8fd08adc49961e1296e45fa98da6e090a82f8888fef6cccebf5b443e80cd905dff51e336e43c22bfac118481d65f8e4a9aa56ddd5c8e1775c6083e08d  xsa450.patch
-394fe51160f5ce79086d0f250c99daa3ecde1012ebdb5c6301f0033e79809e8b2061de7988f1a713c9674ac9b73d88df8be89e8cc668efb64c5b53039c574eef  xsa451-4.18.patch
+8df958195290a39b54493766e7555d71c68083d75edd13a2f77ad237d6b6fb52bce816b9e975c0c14024a01042e599415360dcf475f7d2e0c6bee8f9fd2ed6ef  xen-stable-4.18-20240312.patch
 2e0b0fd23e6f10742a5517981e5171c6e88b0a93c83da701b296f5c0861d72c19782daab589a7eac3f9032152a0fc7eff7f5362db8fccc4859564a9aa82329cf  gmp-4.3.2.tar.bz2
 c2bc9ffc8583aeae71cee9ddcc4418969768d4e3764d47307da54f93981c0109fb07d84b061b3a3628bd00ba4d14a54742bc04848110eb3ae8ca25dbfbaabadb  grub-0.97.tar.gz
 1465b58279af1647f909450e394fe002ca165f0ff4a0254bfa9fe0e64316f50facdde2729d79a4e632565b4500cf4d6c74192ac0dd3bc9fe09129bbd67ba089d  lwip-1.3.0.tar.gz
diff --git a/main/xen/xen-stable-4.18-20240312.patch b/main/xen/xen-stable-4.18-20240312.patch
new file mode 100644
index 00000000000..78d40c44459
--- /dev/null
+++ b/main/xen/xen-stable-4.18-20240312.patch
@@ -0,0 +1,8490 @@
+From 52be29df793f282822436c8c13e0948a01aee1ad Mon Sep 17 00:00:00 2001
+From: Tamas K Lengyel <tamas@tklengyel.com>
+Date: Thu, 23 Nov 2023 12:10:46 +0100
+Subject: [PATCH 01/70] x86/mem_sharing: add missing m2p entry when mapping
+ shared_info page
+
+When mapping in the shared_info page to a fork the m2p entry wasn't set
+resulting in the shared_info being reset even when the fork reset was called
+with only reset_state and not reset_memory. This results in an extra
+unnecessary TLB flush.
+
+Fixes: 1a0000ac775 ("mem_sharing: map shared_info page to same gfn during fork")
+Signed-off-by: Tamas K Lengyel <tamas@tklengyel.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: 23eb39acf011ef9bbe02ed4619c55f208fbcd39b
+master date: 2023-10-31 16:10:14 +0000
+---
+ xen/arch/x86/mm/mem_sharing.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
+index 94b6b782ef..142258f16a 100644
+--- a/xen/arch/x86/mm/mem_sharing.c
++++ b/xen/arch/x86/mm/mem_sharing.c
+@@ -1847,6 +1847,8 @@ static int copy_special_pages(struct domain *cd, struct domain *d)
+                                 p2m_ram_rw, p2m->default_access, -1);
+             if ( rc )
+                 return rc;
++
++            set_gpfn_from_mfn(mfn_x(new_mfn), gfn_x(old_gfn));
+         }
+     }
+ 
+-- 
+2.44.0
+
+
+From 880e06fdea401493a3f408deb0f411f7aeccee27 Mon Sep 17 00:00:00 2001
+From: David Woodhouse <dwmw@amazon.co.uk>
+Date: Thu, 23 Nov 2023 12:11:21 +0100
+Subject: [PATCH 02/70] x86/pv-shim: fix grant table operations for 32-bit
+ guests
+
+When switching to call the shim functions from the normal handlers, the
+compat_grant_table_op() function was omitted, leaving it calling the
+real grant table operations in !PV_SHIM_EXCLUSIVE builds. This leaves a
+32-bit shim guest failing to set up its real grant table with the parent
+hypervisor.
+
+Fixes: e7db635f4428 ("x86/pv-shim: Don't modify the hypercall table")
+Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: 93ec30bc545f15760039c23ee4b97b80c0b3b3b3
+master date: 2023-10-31 16:10:14 +0000
+---
+ xen/common/compat/grant_table.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/xen/common/compat/grant_table.c b/xen/common/compat/grant_table.c
+index e00bc24a34..af98eade17 100644
+--- a/xen/common/compat/grant_table.c
++++ b/xen/common/compat/grant_table.c
+@@ -63,6 +63,11 @@ int compat_grant_table_op(
+     unsigned int i, cmd_op;
+     XEN_GUEST_HANDLE_PARAM(void) cnt_uop;
+ 
++#ifdef CONFIG_PV_SHIM
++    if ( unlikely(pv_shim) )
++        return pv_shim_grant_table_op(cmd, uop, count);
++#endif
++
+     set_xen_guest_handle(cnt_uop, NULL);
+     cmd_op = cmd & GNTTABOP_CMD_MASK;
+     if ( cmd_op != GNTTABOP_cache_flush )
+-- 
+2.44.0
+
+
+From 9e8edd4c75564530a6fb98f5abba267edb906313 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Thu, 23 Nov 2023 12:12:18 +0100
+Subject: [PATCH 03/70] x86/x2apic: remove usage of ACPI_FADT_APIC_CLUSTER
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The ACPI FADT APIC_CLUSTER flag mandates that when the interrupt delivery is
+Logical mode APIC must be configured for Cluster destination model.  However in
+apic_x2apic_probe() such flag is incorrectly used to gate whether Physical mode
+can be used.
+
+Since Xen when in x2APIC mode only uses Logical mode together with Cluster
+model completely remove checking for ACPI_FADT_APIC_CLUSTER, as Xen always
+fulfills the requirement signaled by the flag.
+
+Fixes: eb40ae41b658 ('x86/Kconfig: add option for default x2APIC destination mode')
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: 26a449ce32cef33f2cb50602be19fcc0c4223ba9
+master date: 2023-11-02 10:50:26 +0100
+---
+ xen/arch/x86/genapic/x2apic.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/xen/arch/x86/genapic/x2apic.c b/xen/arch/x86/genapic/x2apic.c
+index ca1db27157..707deef98c 100644
+--- a/xen/arch/x86/genapic/x2apic.c
++++ b/xen/arch/x86/genapic/x2apic.c
+@@ -231,8 +231,7 @@ const struct genapic *__init apic_x2apic_probe(void)
+          */
+         x2apic_phys = iommu_intremap != iommu_intremap_full ||
+                       (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) ||
+-                      (IS_ENABLED(CONFIG_X2APIC_PHYSICAL) &&
+-                       !(acpi_gbl_FADT.flags & ACPI_FADT_APIC_CLUSTER));
++                      IS_ENABLED(CONFIG_X2APIC_PHYSICAL);
+     }
+     else if ( !x2apic_phys )
+         switch ( iommu_intremap )
+-- 
+2.44.0
+
+
+From fcb1016bbd476e17c72b1837ae2a3eaac517fa52 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Thu, 23 Nov 2023 12:12:47 +0100
+Subject: [PATCH 04/70] x86/i8259: do not assume interrupts always target CPU0
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Sporadically we have seen the following during AP bringup on AMD platforms
+only:
+
+microcode: CPU59 updated from revision 0x830107a to 0x830107a, date = 2023-05-17
+microcode: CPU60 updated from revision 0x830104d to 0x830107a, date = 2023-05-17
+CPU60: No irq handler for vector 27 (IRQ -2147483648)
+microcode: CPU61 updated from revision 0x830107a to 0x830107a, date = 2023-05-17
+
+This is similar to the issue raised on Linux commit 36e9e1eab777e, where they
+observed i8259 (active) vectors getting delivered to CPUs different than 0.
+
+On AMD or Hygon platforms adjust the target CPU mask of i8259 interrupt
+descriptors to contain all possible CPUs, so that APs will reserve the vector
+at startup if any legacy IRQ is still delivered through the i8259.  Note that
+if the IO-APIC takes over those interrupt descriptors the CPU mask will be
+reset.
+
+Spurious i8259 interrupt vectors however (IRQ7 and IRQ15) can be injected even
+when all i8259 pins are masked, and hence would need to be handled on all CPUs.
+
+Continue to reserve PIC vectors on CPU0 only, but do check for such spurious
+interrupts on all CPUs if the vendor is AMD or Hygon.  Note that once the
+vectors get used by devices detecting PIC spurious interrupts will no longer be
+possible, however the device driver should be able to cope with spurious
+interrupts.  Such PIC spurious interrupts occurring when the vector is in use
+by a local APIC routed source will lead to an extra EOI, which might
+unintentionally clear a different vector from ISR.  Note this is already the
+current behavior, so assume it's infrequent enough to not cause real issues.
+
+Finally, adjust the printed message to display the CPU where the spurious
+interrupt has been received, so it looks like:
+
+microcode: CPU1 updated from revision 0x830107a to 0x830107a, date = 2023-05-17
+cpu1: spurious 8259A interrupt: IRQ7
+microcode: CPU2 updated from revision 0x830104d to 0x830107a, date = 2023-05-17
+
+Amends: 3fba06ba9f8b ('x86/IRQ: re-use legacy vector ranges on APs')
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: 87f37449d586b4d407b75235bb0a171e018e25ec
+master date: 2023-11-02 10:50:59 +0100
+---
+ xen/arch/x86/i8259.c | 21 +++++++++++++++++++--
+ xen/arch/x86/irq.c   | 11 ++++++++++-
+ 2 files changed, 29 insertions(+), 3 deletions(-)
+
+diff --git a/xen/arch/x86/i8259.c b/xen/arch/x86/i8259.c
+index ed9f55abe5..e0fa1f96b4 100644
+--- a/xen/arch/x86/i8259.c
++++ b/xen/arch/x86/i8259.c
+@@ -222,7 +222,8 @@ static bool _mask_and_ack_8259A_irq(unsigned int irq)
+         is_real_irq = false;
+         /* Report spurious IRQ, once per IRQ line. */
+         if (!(spurious_irq_mask & irqmask)) {
+-            printk("spurious 8259A interrupt: IRQ%d.\n", irq);
++            printk("cpu%u: spurious 8259A interrupt: IRQ%u\n",
++                   smp_processor_id(), irq);
+             spurious_irq_mask |= irqmask;
+         }
+         /*
+@@ -349,7 +350,23 @@ void __init init_IRQ(void)
+             continue;
+         desc->handler = &i8259A_irq_type;
+         per_cpu(vector_irq, cpu)[LEGACY_VECTOR(irq)] = irq;
+-        cpumask_copy(desc->arch.cpu_mask, cpumask_of(cpu));
++
++        /*
++         * The interrupt affinity logic never targets interrupts to offline
++         * CPUs, hence it's safe to use cpumask_all here.
++         *
++         * Legacy PIC interrupts are only targeted to CPU0, but depending on
++         * the platform they can be distributed to any online CPU in hardware.
++         * Note this behavior has only been observed on AMD hardware. In order
++         * to cope install all active legacy vectors on all CPUs.
++         *
++         * IO-APIC will change the destination mask if/when taking ownership of
++         * the interrupt.
++         */
++        cpumask_copy(desc->arch.cpu_mask,
++                     (boot_cpu_data.x86_vendor &
++                      (X86_VENDOR_AMD | X86_VENDOR_HYGON) ? &cpumask_all
++                                                          : cpumask_of(cpu)));
+         desc->arch.vector = LEGACY_VECTOR(irq);
+     }
+     
+diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c
+index f42ad539dc..16d9fceba1 100644
+--- a/xen/arch/x86/irq.c
++++ b/xen/arch/x86/irq.c
+@@ -1920,7 +1920,16 @@ void do_IRQ(struct cpu_user_regs *regs)
+                 kind = "";
+             if ( !(vector >= FIRST_LEGACY_VECTOR &&
+                    vector <= LAST_LEGACY_VECTOR &&
+-                   !smp_processor_id() &&
++                   (!smp_processor_id() ||
++                    /*
++                     * For AMD/Hygon do spurious PIC interrupt
++                     * detection on all CPUs, as it has been observed
++                     * that during unknown circumstances spurious PIC
++                     * interrupts have been delivered to CPUs
++                     * different than the BSP.
++                     */
++                    (boot_cpu_data.x86_vendor & (X86_VENDOR_AMD |
++                                                 X86_VENDOR_HYGON))) &&
+                    bogus_8259A_irq(vector - FIRST_LEGACY_VECTOR)) )
+             {
+                 printk("CPU%u: No irq handler for vector %02x (IRQ %d%s)\n",
+-- 
+2.44.0
+
+
+From 40bfa9dd57f1efdd0f0dc974e80a438d9db90874 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Thu, 23 Nov 2023 12:13:31 +0100
+Subject: [PATCH 05/70] x86/spec-ctrl: Add SRSO whitepaper URL
+
+... now that it exists in public.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: 78a86b26868c12ae1cc3dd2a8bb9aa5eebaa41fd
+master date: 2023-11-07 17:47:34 +0000
+---
+ xen/arch/x86/spec_ctrl.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 6fd7d44ce4..a8d8af22f6 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -903,6 +903,9 @@ static bool __init should_use_eager_fpu(void)
+     }
+ }
+ 
++/*
++ * https://www.amd.com/content/dam/amd/en/documents/corporate/cr/speculative-return-stack-overflow-whitepaper.pdf
++ */
+ static void __init srso_calculations(bool hw_smt_enabled)
+ {
+     if ( !(boot_cpu_data.x86_vendor &
+-- 
+2.44.0
+
+
+From 3f9390fea5c51a6d64596d295902d28931eeca4c Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Thu, 23 Nov 2023 12:13:53 +0100
+Subject: [PATCH 06/70] xen/sched: fix sched_move_domain()
+
+When moving a domain out of a cpupool running with the credit2
+scheduler and having multiple run-queues, the following ASSERT() can
+be observed:
+
+(XEN) Xen call trace:
+(XEN)    [<ffff82d04023a700>] R credit2.c#csched2_unit_remove+0xe3/0xe7
+(XEN)    [<ffff82d040246adb>] S sched_move_domain+0x2f3/0x5b1
+(XEN)    [<ffff82d040234cf7>] S cpupool.c#cpupool_move_domain_locked+0x1d/0x3b
+(XEN)    [<ffff82d040236025>] S cpupool_move_domain+0x24/0x35
+(XEN)    [<ffff82d040206513>] S domain_kill+0xa5/0x116
+(XEN)    [<ffff82d040232b12>] S do_domctl+0xe5f/0x1951
+(XEN)    [<ffff82d0402276ba>] S timer.c#timer_lock+0x69/0x143
+(XEN)    [<ffff82d0402dc71b>] S pv_hypercall+0x44e/0x4a9
+(XEN)    [<ffff82d0402012b7>] S lstar_enter+0x137/0x140
+(XEN)
+(XEN)
+(XEN) ****************************************
+(XEN) Panic on CPU 1:
+(XEN) Assertion 'svc->rqd == c2rqd(sched_unit_master(unit))' failed at common/sched/credit2.c:1159
+(XEN) ****************************************
+
+This is happening as sched_move_domain() is setting a different cpu
+for a scheduling unit without telling the scheduler. When this unit is
+removed from the scheduler, the ASSERT() will trigger.
+
+In non-debug builds the result is usually a clobbered pointer, leading
+to another crash a short time later.
+
+Fix that by swapping the two involved actions (setting another cpu and
+removing the unit from the scheduler).
+
+Link: https://github.com/Dasharo/dasharo-issues/issues/488
+Fixes: 70fadc41635b ("xen/cpupool: support moving domain between cpupools with different granularity")
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Reviewed-by: George Dunlap <george.dunlap@cloud.com>
+master commit: 4709ec82917668c2df958ef91b4f21c049c76bee
+master date: 2023-11-20 10:49:29 +0100
+---
+ xen/common/sched/core.c | 12 +++++++-----
+ 1 file changed, 7 insertions(+), 5 deletions(-)
+
+diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
+index 12deefa745..eba0cea4bb 100644
+--- a/xen/common/sched/core.c
++++ b/xen/common/sched/core.c
+@@ -732,18 +732,20 @@ int sched_move_domain(struct domain *d, struct cpupool *c)
+     old_domdata = d->sched_priv;
+ 
+     /*
+-     * Temporarily move all units to same processor to make locking
+-     * easier when moving the new units to the new processors.
++     * Remove all units from the old scheduler, and temporarily move them to
++     * the same processor to make locking easier when moving the new units to
++     * new processors.
+      */
+     new_p = cpumask_first(d->cpupool->cpu_valid);
+     for_each_sched_unit ( d, unit )
+     {
+-        spinlock_t *lock = unit_schedule_lock_irq(unit);
++        spinlock_t *lock;
++
++        sched_remove_unit(old_ops, unit);
+ 
++        lock = unit_schedule_lock_irq(unit);
+         sched_set_res(unit, get_sched_res(new_p));
+         spin_unlock_irq(lock);
+-
+-        sched_remove_unit(old_ops, unit);
+     }
+ 
+     old_units = d->sched_unit_list;
+-- 
+2.44.0
+
+
+From 90a6d821757edf1202c527143b8a05b0d2a3dfaa Mon Sep 17 00:00:00 2001
+From: Frediano Ziglio <frediano.ziglio@cloud.com>
+Date: Wed, 6 Dec 2023 10:37:13 +0100
+Subject: [PATCH 07/70] x86/mem_sharing: Release domain if we are not able to
+ enable memory sharing
+
+In case it's not possible to enable memory sharing (mem_sharing_control
+fails) we just return the error code without releasing the domain
+acquired some lines above by rcu_lock_live_remote_domain_by_id().
+
+Fixes: 72f8d45d69b8 ("x86/mem_sharing: enable mem_sharing on first memop")
+Signed-off-by: Frediano Ziglio <frediano.ziglio@cloud.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Acked-by: Tamas K Lengyel <tamas@tklengyel.com>
+master commit: fbcec32d6d3ea0ac329301925b317478316209ed
+master date: 2023-11-27 12:06:13 +0000
+---
+ xen/arch/x86/mm/mem_sharing.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
+index 142258f16a..429d27ef85 100644
+--- a/xen/arch/x86/mm/mem_sharing.c
++++ b/xen/arch/x86/mm/mem_sharing.c
+@@ -2013,7 +2013,7 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
+ 
+     if ( !mem_sharing_enabled(d) &&
+          (rc = mem_sharing_control(d, true, 0)) )
+-        return rc;
++        goto out;
+ 
+     switch ( mso.op )
+     {
+-- 
+2.44.0
+
+
+From 480168fcb3135f0da6e7a6b3b754c78fabc24d4f Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Wed, 6 Dec 2023 10:38:03 +0100
+Subject: [PATCH 08/70] livepatch: do not use .livepatch.funcs section to store
+ internal state
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Currently the livepatch logic inside of Xen will use fields of struct
+livepatch_func in order to cache internal state of patched functions.  Note
+this is a field that is part of the payload, and is loaded as an ELF section
+(.livepatch.funcs), taking into account the SHF_* flags in the section
+header.
+
+The flags for the .livepatch.funcs section, as set by livepatch-build-tools,
+are SHF_ALLOC, which leads to its contents (the array of livepatch_func
+structures) being placed in read-only memory:
+
+Section Headers:
+  [Nr] Name              Type             Address           Offset
+       Size              EntSize          Flags  Link  Info  Align
+[...]
+  [ 4] .livepatch.funcs  PROGBITS         0000000000000000  00000080
+       0000000000000068  0000000000000000   A       0     0     8
+
+This previously went unnoticed, as all writes to the fields of livepatch_func
+happen in the critical region that had WP disabled in CR0.  After 8676092a0f16
+however WP is no longer toggled in CR0 for patch application, and only the
+hypervisor .text mappings are made write-accessible.  That leads to the
+following page fault when attempting to apply a livepatch:
+
+----[ Xen-4.19-unstable  x86_64  debug=y  Tainted:   C    ]----
+CPU:    4
+RIP:    e008:[<ffff82d040221e81>] common/livepatch.c#apply_payload+0x45/0x1e1
+[...]
+Xen call trace:
+   [<ffff82d040221e81>] R common/livepatch.c#apply_payload+0x45/0x1e1
+   [<ffff82d0402235b2>] F check_for_livepatch_work+0x385/0xaa5
+   [<ffff82d04032508f>] F arch/x86/domain.c#idle_loop+0x92/0xee
+
+Pagetable walk from ffff82d040625079:
+ L4[0x105] = 000000008c6c9063 ffffffffffffffff
+ L3[0x141] = 000000008c6c6063 ffffffffffffffff
+ L2[0x003] = 000000086a1e7063 ffffffffffffffff
+ L1[0x025] = 800000086ca5d121 ffffffffffffffff
+
+****************************************
+Panic on CPU 4:
+FATAL PAGE FAULT
+[error_code=0003]
+Faulting linear address: ffff82d040625079
+****************************************
+
+Fix this by moving the internal Xen function patching state out of
+livepatch_func into an area not allocated as part of the ELF payload.  While
+there also constify the array of livepatch_func structures in order to prevent
+further surprises.
+
+Note there's still one field (old_addr) that gets set during livepatch load.  I
+consider this fine since the field is read-only after load, and at the point
+the field gets set the underlying mapping hasn't been made read-only yet.
+
+Fixes: 8676092a0f16 ('x86/livepatch: Fix livepatch application when CET is active')
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Ross Lagerwall <ross.lagerwall@citrix.com>
+
+xen/livepatch: fix livepatch tests
+
+The current set of in-tree livepatch tests in xen/test/livepatch started
+failing after the constify of the payload funcs array, and the movement of the
+status data into a separate array.
+
+Fix the tests so they respect the constness of the funcs array and also make
+use of the new location of the per-func state data.
+
+Fixes: 82182ad7b46e ('livepatch: do not use .livepatch.funcs section to store internal state')
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Ross Lagerwall <ross.lagerwall@citrix.com>
+master commit: 82182ad7b46e0f7a3856bb12c7a9bf2e2a4570bc
+master date: 2023-11-27 15:16:01 +0100
+master commit: 902377b690f42ddf44ae91c4b0751d597f1cd694
+master date: 2023-11-29 10:46:42 +0000
+---
+ xen/arch/arm/arm32/livepatch.c                |  9 +++--
+ xen/arch/arm/arm64/livepatch.c                |  9 +++--
+ xen/arch/arm/livepatch.c                      |  9 +++--
+ xen/arch/x86/livepatch.c                      | 26 +++++++------
+ xen/common/livepatch.c                        | 25 ++++++++----
+ xen/include/public/sysctl.h                   |  5 +--
+ xen/include/xen/livepatch.h                   | 38 +++++++++++++------
+ xen/include/xen/livepatch_payload.h           |  3 +-
+ xen/test/livepatch/xen_action_hooks.c         | 12 +++---
+ xen/test/livepatch/xen_action_hooks_marker.c  | 20 ++++++----
+ xen/test/livepatch/xen_action_hooks_noapply.c | 22 ++++++-----
+ xen/test/livepatch/xen_action_hooks_nofunc.c  |  6 +--
+ .../livepatch/xen_action_hooks_norevert.c     | 24 +++++++-----
+ xen/test/livepatch/xen_prepost_hooks.c        |  8 ++--
+ xen/test/livepatch/xen_prepost_hooks_fail.c   |  2 +-
+ 15 files changed, 130 insertions(+), 88 deletions(-)
+
+diff --git a/xen/arch/arm/arm32/livepatch.c b/xen/arch/arm/arm32/livepatch.c
+index 3c50283b2a..80d2659b78 100644
+--- a/xen/arch/arm/arm32/livepatch.c
++++ b/xen/arch/arm/arm32/livepatch.c
+@@ -11,23 +11,24 @@
+ #include <asm/page.h>
+ #include <asm/livepatch.h>
+ 
+-void arch_livepatch_apply(struct livepatch_func *func)
++void arch_livepatch_apply(const struct livepatch_func *func,
++                          struct livepatch_fstate *state)
+ {
+     uint32_t insn;
+     uint32_t *new_ptr;
+     unsigned int i, len;
+ 
+-    BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE > sizeof(func->opaque));
++    BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE > sizeof(state->insn_buffer));
+     BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE != sizeof(insn));
+ 
+     ASSERT(vmap_of_xen_text);
+ 
+-    len = livepatch_insn_len(func);
++    len = livepatch_insn_len(func, state);
+     if ( !len )
+         return;
+ 
+     /* Save old ones. */
+-    memcpy(func->opaque, func->old_addr, len);
++    memcpy(state->insn_buffer, func->old_addr, len);
+ 
+     if ( func->new_addr )
+     {
+diff --git a/xen/arch/arm/arm64/livepatch.c b/xen/arch/arm/arm64/livepatch.c
+index 62d2ef373a..df2cebedde 100644
+--- a/xen/arch/arm/arm64/livepatch.c
++++ b/xen/arch/arm/arm64/livepatch.c
+@@ -15,23 +15,24 @@
+ #include <asm/insn.h>
+ #include <asm/livepatch.h>
+ 
+-void arch_livepatch_apply(struct livepatch_func *func)
++void arch_livepatch_apply(const struct livepatch_func *func,
++                          struct livepatch_fstate *state)
+ {
+     uint32_t insn;
+     uint32_t *new_ptr;
+     unsigned int i, len;
+ 
+-    BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE > sizeof(func->opaque));
++    BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE > sizeof(state->insn_buffer));
+     BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE != sizeof(insn));
+ 
+     ASSERT(vmap_of_xen_text);
+ 
+-    len = livepatch_insn_len(func);
++    len = livepatch_insn_len(func, state);
+     if ( !len )
+         return;
+ 
+     /* Save old ones. */
+-    memcpy(func->opaque, func->old_addr, len);
++    memcpy(state->insn_buffer, func->old_addr, len);
+ 
+     if ( func->new_addr )
+         insn = aarch64_insn_gen_branch_imm((unsigned long)func->old_addr,
+diff --git a/xen/arch/arm/livepatch.c b/xen/arch/arm/livepatch.c
+index d646379c8c..bbca1e5a5e 100644
+--- a/xen/arch/arm/livepatch.c
++++ b/xen/arch/arm/livepatch.c
+@@ -69,7 +69,7 @@ void arch_livepatch_revive(void)
+ int arch_livepatch_verify_func(const struct livepatch_func *func)
+ {
+     /* If NOPing only do up to maximum amount we can put in the ->opaque. */
+-    if ( !func->new_addr && (func->new_size > sizeof(func->opaque) ||
++    if ( !func->new_addr && (func->new_size > LIVEPATCH_OPAQUE_SIZE ||
+          func->new_size % ARCH_PATCH_INSN_SIZE) )
+         return -EOPNOTSUPP;
+ 
+@@ -79,15 +79,16 @@ int arch_livepatch_verify_func(const struct livepatch_func *func)
+     return 0;
+ }
+ 
+-void arch_livepatch_revert(const struct livepatch_func *func)
++void arch_livepatch_revert(const struct livepatch_func *func,
++                           struct livepatch_fstate *state)
+ {
+     uint32_t *new_ptr;
+     unsigned int len;
+ 
+     new_ptr = func->old_addr - (void *)_start + vmap_of_xen_text;
+ 
+-    len = livepatch_insn_len(func);
+-    memcpy(new_ptr, func->opaque, len);
++    len = livepatch_insn_len(func, state);
++    memcpy(new_ptr, state->insn_buffer, len);
+ 
+     clean_and_invalidate_dcache_va_range(new_ptr, len);
+ }
+diff --git a/xen/arch/x86/livepatch.c b/xen/arch/x86/livepatch.c
+index a54d991c5f..ee539f001b 100644
+--- a/xen/arch/x86/livepatch.c
++++ b/xen/arch/x86/livepatch.c
+@@ -95,7 +95,7 @@ int arch_livepatch_verify_func(const struct livepatch_func *func)
+     if ( !func->new_addr )
+     {
+         /* Only do up to maximum amount we can put in the ->opaque. */
+-        if ( func->new_size > sizeof(func->opaque) )
++        if ( func->new_size > LIVEPATCH_OPAQUE_SIZE )
+             return -EOPNOTSUPP;
+ 
+         if ( func->old_size < func->new_size )
+@@ -123,13 +123,14 @@ int arch_livepatch_verify_func(const struct livepatch_func *func)
+  * "noinline" to cause control flow change and thus invalidate I$ and
+  * cause refetch after modification.
+  */
+-void noinline arch_livepatch_apply(struct livepatch_func *func)
++void noinline arch_livepatch_apply(const struct livepatch_func *func,
++                                   struct livepatch_fstate *state)
+ {
+     uint8_t *old_ptr;
+-    uint8_t insn[sizeof(func->opaque)];
++    uint8_t insn[sizeof(state->insn_buffer)];
+     unsigned int len;
+ 
+-    func->patch_offset = 0;
++    state->patch_offset = 0;
+     old_ptr = func->old_addr;
+ 
+     /*
+@@ -141,14 +142,14 @@ void noinline arch_livepatch_apply(struct livepatch_func *func)
+      * ENDBR64 or similar instructions).
+      */
+     if ( is_endbr64(old_ptr) || is_endbr64_poison(func->old_addr) )
+-        func->patch_offset += ENDBR64_LEN;
++        state->patch_offset += ENDBR64_LEN;
+ 
+     /* This call must be done with ->patch_offset already set. */
+-    len = livepatch_insn_len(func);
++    len = livepatch_insn_len(func, state);
+     if ( !len )
+         return;
+ 
+-    memcpy(func->opaque, old_ptr + func->patch_offset, len);
++    memcpy(state->insn_buffer, old_ptr + state->patch_offset, len);
+     if ( func->new_addr )
+     {
+         int32_t val;
+@@ -156,7 +157,7 @@ void noinline arch_livepatch_apply(struct livepatch_func *func)
+         BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE != (1 + sizeof(val)));
+ 
+         insn[0] = 0xe9; /* Relative jump. */
+-        val = func->new_addr - (func->old_addr + func->patch_offset +
++        val = func->new_addr - (func->old_addr + state->patch_offset +
+                                 ARCH_PATCH_INSN_SIZE);
+ 
+         memcpy(&insn[1], &val, sizeof(val));
+@@ -164,17 +165,18 @@ void noinline arch_livepatch_apply(struct livepatch_func *func)
+     else
+         add_nops(insn, len);
+ 
+-    memcpy(old_ptr + func->patch_offset, insn, len);
++    memcpy(old_ptr + state->patch_offset, insn, len);
+ }
+ 
+ /*
+  * "noinline" to cause control flow change and thus invalidate I$ and
+  * cause refetch after modification.
+  */
+-void noinline arch_livepatch_revert(const struct livepatch_func *func)
++void noinline arch_livepatch_revert(const struct livepatch_func *func,
++                                    struct livepatch_fstate *state)
+ {
+-    memcpy(func->old_addr + func->patch_offset, func->opaque,
+-           livepatch_insn_len(func));
++    memcpy(func->old_addr + state->patch_offset, state->insn_buffer,
++           livepatch_insn_len(func, state));
+ }
+ 
+ /*
+diff --git a/xen/common/livepatch.c b/xen/common/livepatch.c
+index d89a904bd4..e635606c10 100644
+--- a/xen/common/livepatch.c
++++ b/xen/common/livepatch.c
+@@ -260,6 +260,9 @@ static void free_payload_data(struct payload *payload)
+     vfree((void *)payload->text_addr);
+ 
+     payload->pages = 0;
++
++    /* fstate gets allocated strictly after move_payload. */
++    XFREE(payload->fstate);
+ }
+ 
+ /*
+@@ -656,6 +659,7 @@ static int prepare_payload(struct payload *payload,
+ {
+     const struct livepatch_elf_sec *sec;
+     unsigned int i;
++    struct livepatch_func *funcs;
+     struct livepatch_func *f;
+     struct virtual_region *region;
+     const Elf_Note *n;
+@@ -666,14 +670,19 @@ static int prepare_payload(struct payload *payload,
+         if ( !section_ok(elf, sec, sizeof(*payload->funcs)) )
+             return -EINVAL;
+ 
+-        payload->funcs = sec->load_addr;
++        payload->funcs = funcs = sec->load_addr;
+         payload->nfuncs = sec->sec->sh_size / sizeof(*payload->funcs);
+ 
++        payload->fstate = xzalloc_array(typeof(*payload->fstate),
++                                        payload->nfuncs);
++        if ( !payload->fstate )
++            return -ENOMEM;
++
+         for ( i = 0; i < payload->nfuncs; i++ )
+         {
+             int rc;
+ 
+-            f = &(payload->funcs[i]);
++            f = &(funcs[i]);
+ 
+             if ( f->version != LIVEPATCH_PAYLOAD_VERSION )
+             {
+@@ -1361,7 +1370,7 @@ static int apply_payload(struct payload *data)
+     ASSERT(!local_irq_is_enabled());
+ 
+     for ( i = 0; i < data->nfuncs; i++ )
+-        common_livepatch_apply(&data->funcs[i]);
++        common_livepatch_apply(&data->funcs[i], &data->fstate[i]);
+ 
+     arch_livepatch_revive();
+ 
+@@ -1397,7 +1406,7 @@ static int revert_payload(struct payload *data)
+     }
+ 
+     for ( i = 0; i < data->nfuncs; i++ )
+-        common_livepatch_revert(&data->funcs[i]);
++        common_livepatch_revert(&data->funcs[i], &data->fstate[i]);
+ 
+     /*
+      * Since we are running with IRQs disabled and the hooks may call common
+@@ -1438,9 +1447,10 @@ static inline bool was_action_consistent(const struct payload *data, livepatch_f
+ 
+     for ( i = 0; i < data->nfuncs; i++ )
+     {
+-        struct livepatch_func *f = &(data->funcs[i]);
++        const struct livepatch_func *f = &(data->funcs[i]);
++        const struct livepatch_fstate *s = &(data->fstate[i]);
+ 
+-        if ( f->applied != expected_state )
++        if ( s->applied != expected_state )
+         {
+             printk(XENLOG_ERR LIVEPATCH "%s: Payload has a function: '%s' with inconsistent applied state.\n",
+                    data->name, f->name ?: "noname");
+@@ -2157,7 +2167,8 @@ static void cf_check livepatch_printall(unsigned char key)
+ 
+         for ( i = 0; i < data->nfuncs; i++ )
+         {
+-            struct livepatch_func *f = &(data->funcs[i]);
++            const struct livepatch_func *f = &(data->funcs[i]);
++
+             printk("    %s patch %p(%u) with %p (%u)\n",
+                    f->name, f->old_addr, f->old_size, f->new_addr, f->new_size);
+ 
+diff --git a/xen/include/public/sysctl.h b/xen/include/public/sysctl.h
+index f1eba78405..9b19679cae 100644
+--- a/xen/include/public/sysctl.h
++++ b/xen/include/public/sysctl.h
+@@ -991,10 +991,7 @@ struct livepatch_func {
+     uint32_t new_size;
+     uint32_t old_size;
+     uint8_t version;        /* MUST be LIVEPATCH_PAYLOAD_VERSION. */
+-    uint8_t opaque[LIVEPATCH_OPAQUE_SIZE];
+-    uint8_t applied;
+-    uint8_t patch_offset;
+-    uint8_t _pad[6];
++    uint8_t _pad[39];
+     livepatch_expectation_t expect;
+ };
+ typedef struct livepatch_func livepatch_func_t;
+diff --git a/xen/include/xen/livepatch.h b/xen/include/xen/livepatch.h
+index 9fdb29c382..537d3d58b6 100644
+--- a/xen/include/xen/livepatch.h
++++ b/xen/include/xen/livepatch.h
+@@ -13,6 +13,9 @@ struct xen_sysctl_livepatch_op;
+ 
+ #include <xen/elfstructs.h>
+ #include <xen/errno.h> /* For -ENOSYS or -EOVERFLOW */
++
++#include <public/sysctl.h> /* For LIVEPATCH_OPAQUE_SIZE */
++
+ #ifdef CONFIG_LIVEPATCH
+ 
+ /*
+@@ -51,6 +54,12 @@ struct livepatch_symbol {
+     bool_t new_symbol;
+ };
+ 
++struct livepatch_fstate {
++    unsigned int patch_offset;
++    enum livepatch_func_state applied;
++    uint8_t insn_buffer[LIVEPATCH_OPAQUE_SIZE];
++};
++
+ int livepatch_op(struct xen_sysctl_livepatch_op *);
+ void check_for_livepatch_work(void);
+ unsigned long livepatch_symbols_lookup_by_name(const char *symname);
+@@ -87,10 +96,11 @@ void arch_livepatch_init(void);
+ int arch_livepatch_verify_func(const struct livepatch_func *func);
+ 
+ static inline
+-unsigned int livepatch_insn_len(const struct livepatch_func *func)
++unsigned int livepatch_insn_len(const struct livepatch_func *func,
++                                const struct livepatch_fstate *state)
+ {
+     if ( !func->new_addr )
+-        return func->new_size - func->patch_offset;
++        return func->new_size - state->patch_offset;
+ 
+     return ARCH_PATCH_INSN_SIZE;
+ }
+@@ -117,39 +127,43 @@ int arch_livepatch_safety_check(void);
+ int arch_livepatch_quiesce(void);
+ void arch_livepatch_revive(void);
+ 
+-void arch_livepatch_apply(struct livepatch_func *func);
+-void arch_livepatch_revert(const struct livepatch_func *func);
++void arch_livepatch_apply(const struct livepatch_func *func,
++                          struct livepatch_fstate *state);
++void arch_livepatch_revert(const struct livepatch_func *func,
++                           struct livepatch_fstate *state);
+ void arch_livepatch_post_action(void);
+ 
+ void arch_livepatch_mask(void);
+ void arch_livepatch_unmask(void);
+ 
+-static inline void common_livepatch_apply(struct livepatch_func *func)
++static inline void common_livepatch_apply(const struct livepatch_func *func,
++                                          struct livepatch_fstate *state)
+ {
+     /* If the action has been already executed on this function, do nothing. */
+-    if ( func->applied == LIVEPATCH_FUNC_APPLIED )
++    if ( state->applied == LIVEPATCH_FUNC_APPLIED )
+     {
+         printk(XENLOG_WARNING LIVEPATCH "%s: %s has been already applied before\n",
+                 __func__, func->name);
+         return;
+     }
+ 
+-    arch_livepatch_apply(func);
+-    func->applied = LIVEPATCH_FUNC_APPLIED;
++    arch_livepatch_apply(func, state);
++    state->applied = LIVEPATCH_FUNC_APPLIED;
+ }
+ 
+-static inline void common_livepatch_revert(struct livepatch_func *func)
++static inline void common_livepatch_revert(const struct livepatch_func *func,
++                                           struct livepatch_fstate *state)
+ {
+     /* If the apply action hasn't been executed on this function, do nothing. */
+-    if ( !func->old_addr || func->applied == LIVEPATCH_FUNC_NOT_APPLIED )
++    if ( !func->old_addr || state->applied == LIVEPATCH_FUNC_NOT_APPLIED )
+     {
+         printk(XENLOG_WARNING LIVEPATCH "%s: %s has not been applied before\n",
+                 __func__, func->name);
+         return;
+     }
+ 
+-    arch_livepatch_revert(func);
+-    func->applied = LIVEPATCH_FUNC_NOT_APPLIED;
++    arch_livepatch_revert(func, state);
++    state->applied = LIVEPATCH_FUNC_NOT_APPLIED;
+ }
+ #else
+ 
+diff --git a/xen/include/xen/livepatch_payload.h b/xen/include/xen/livepatch_payload.h
+index 9f5f064205..b9cd4f2096 100644
+--- a/xen/include/xen/livepatch_payload.h
++++ b/xen/include/xen/livepatch_payload.h
+@@ -52,7 +52,8 @@ struct payload {
+     size_t ro_size;                      /* .. and its size (if any). */
+     unsigned int pages;                  /* Total pages for [text,rw,ro]_addr */
+     struct list_head applied_list;       /* Linked to 'applied_list'. */
+-    struct livepatch_func *funcs;        /* The array of functions to patch. */
++    const struct livepatch_func *funcs;  /* The array of functions to patch. */
++    struct livepatch_fstate *fstate;     /* State of patched functions. */
+     unsigned int nfuncs;                 /* Nr of functions to patch. */
+     const struct livepatch_symbol *symtab; /* All symbols. */
+     const char *strtab;                  /* Pointer to .strtab. */
+diff --git a/xen/test/livepatch/xen_action_hooks.c b/xen/test/livepatch/xen_action_hooks.c
+index 39b5313027..fa0b3ab35f 100644
+--- a/xen/test/livepatch/xen_action_hooks.c
++++ b/xen/test/livepatch/xen_action_hooks.c
+@@ -26,9 +26,10 @@ static int apply_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
++        struct livepatch_fstate *fstate = &payload->fstate[i];
+ 
+-        func->applied = LIVEPATCH_FUNC_APPLIED;
++        fstate->applied = LIVEPATCH_FUNC_APPLIED;
+         apply_cnt++;
+ 
+         printk(KERN_DEBUG "%s: applying: %s\n", __func__, func->name);
+@@ -47,9 +48,10 @@ static int revert_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
++        struct livepatch_fstate *fstate = &payload->fstate[i];
+ 
+-        func->applied = LIVEPATCH_FUNC_NOT_APPLIED;
++        fstate->applied = LIVEPATCH_FUNC_NOT_APPLIED;
+         revert_cnt++;
+ 
+         printk(KERN_DEBUG "%s: reverting: %s\n", __func__, func->name);
+@@ -68,7 +70,7 @@ static void post_revert_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
+ 
+         printk(KERN_DEBUG "%s: reverted: %s\n", __func__, func->name);
+     }
+diff --git a/xen/test/livepatch/xen_action_hooks_marker.c b/xen/test/livepatch/xen_action_hooks_marker.c
+index 4f807a577f..d2e22f70d1 100644
+--- a/xen/test/livepatch/xen_action_hooks_marker.c
++++ b/xen/test/livepatch/xen_action_hooks_marker.c
+@@ -23,9 +23,10 @@ static int pre_apply_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
++        struct livepatch_fstate *fstate = &payload->fstate[i];
+ 
+-        BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED);
++        BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED);
+         printk(KERN_DEBUG "%s: pre applied: %s\n", __func__, func->name);
+     }
+ 
+@@ -42,9 +43,10 @@ static void post_apply_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
++        struct livepatch_fstate *fstate = &payload->fstate[i];
+ 
+-        BUG_ON(func->applied != LIVEPATCH_FUNC_APPLIED);
++        BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED);
+         printk(KERN_DEBUG "%s: post applied: %s\n", __func__, func->name);
+     }
+ 
+@@ -59,9 +61,10 @@ static int pre_revert_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
++        struct livepatch_fstate *fstate = &payload->fstate[i];
+ 
+-        BUG_ON(func->applied != LIVEPATCH_FUNC_APPLIED);
++        BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED);
+         printk(KERN_DEBUG "%s: pre reverted: %s\n", __func__, func->name);
+     }
+ 
+@@ -78,9 +81,10 @@ static void post_revert_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
++        struct livepatch_fstate *fstate = &payload->fstate[i];
+ 
+-        BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED);
++        BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED);
+         printk(KERN_DEBUG "%s: post reverted: %s\n", __func__, func->name);
+     }
+ 
+diff --git a/xen/test/livepatch/xen_action_hooks_noapply.c b/xen/test/livepatch/xen_action_hooks_noapply.c
+index 4c55c156a6..646a5fd2f0 100644
+--- a/xen/test/livepatch/xen_action_hooks_noapply.c
++++ b/xen/test/livepatch/xen_action_hooks_noapply.c
+@@ -25,9 +25,10 @@ static int pre_apply_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
++        struct livepatch_fstate *fstate = &payload->fstate[i];
+ 
+-        BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED);
++        BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED);
+         printk(KERN_DEBUG "%s: pre applied: %s\n", __func__, func->name);
+     }
+ 
+@@ -44,7 +45,7 @@ static int apply_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
+ 
+         apply_cnt++;
+         printk(KERN_DEBUG "%s: applying: %s\n", __func__, func->name);
+@@ -63,10 +64,11 @@ static void post_apply_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
++        struct livepatch_fstate *fstate = &payload->fstate[i];
+ 
+         BUG_ON(apply_cnt != 1);
+-        BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED);
++        BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED);
+         printk(KERN_DEBUG "%s: post applied: %s\n", __func__, func->name);
+     }
+ 
+@@ -81,9 +83,10 @@ static int pre_revert_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
++        struct livepatch_fstate *fstate = &payload->fstate[i];
+ 
+-        BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED);
++        BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED);
+         printk(KERN_DEBUG "%s: pre reverted: %s\n", __func__, func->name);
+     }
+ 
+@@ -100,9 +103,10 @@ static void post_revert_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
++        struct livepatch_fstate *fstate = &payload->fstate[i];
+ 
+-        BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED);
++        BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED);
+         printk(KERN_DEBUG "%s: post reverted: %s\n", __func__, func->name);
+     }
+ 
+diff --git a/xen/test/livepatch/xen_action_hooks_nofunc.c b/xen/test/livepatch/xen_action_hooks_nofunc.c
+index 2b4e90436f..077c4c1738 100644
+--- a/xen/test/livepatch/xen_action_hooks_nofunc.c
++++ b/xen/test/livepatch/xen_action_hooks_nofunc.c
+@@ -23,7 +23,7 @@ static int apply_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
+ 
+         apply_cnt++;
+         printk(KERN_DEBUG "%s: applying: %s\n", __func__, func->name);
+@@ -42,7 +42,7 @@ static int revert_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
+ 
+         revert_cnt++;
+         printk(KERN_DEBUG "%s: reverting: %s\n", __func__, func->name);
+@@ -61,7 +61,7 @@ static void post_revert_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
+ 
+         printk(KERN_DEBUG "%s: reverted: %s\n", __func__, func->name);
+     }
+diff --git a/xen/test/livepatch/xen_action_hooks_norevert.c b/xen/test/livepatch/xen_action_hooks_norevert.c
+index ef77e72071..3e21ade6ab 100644
+--- a/xen/test/livepatch/xen_action_hooks_norevert.c
++++ b/xen/test/livepatch/xen_action_hooks_norevert.c
+@@ -25,9 +25,10 @@ static int pre_apply_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
++        struct livepatch_fstate *fstate = &payload->fstate[i];
+ 
+-        BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED);
++        BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED);
+         printk(KERN_DEBUG "%s: pre applied: %s\n", __func__, func->name);
+     }
+ 
+@@ -44,9 +45,10 @@ static void post_apply_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
++        struct livepatch_fstate *fstate = &payload->fstate[i];
+ 
+-        BUG_ON(func->applied != LIVEPATCH_FUNC_APPLIED);
++        BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED);
+         printk(KERN_DEBUG "%s: post applied: %s\n", __func__, func->name);
+     }
+ 
+@@ -61,9 +63,10 @@ static int pre_revert_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
++        struct livepatch_fstate *fstate = &payload->fstate[i];
+ 
+-        BUG_ON(func->applied != LIVEPATCH_FUNC_APPLIED);
++        BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED);
+         printk(KERN_DEBUG "%s: pre reverted: %s\n", __func__, func->name);
+     }
+ 
+@@ -80,7 +83,7 @@ static int revert_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
+ 
+         revert_cnt++;
+         printk(KERN_DEBUG "%s: reverting: %s\n", __func__, func->name);
+@@ -99,16 +102,17 @@ static void post_revert_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
++        struct livepatch_fstate *fstate = &payload->fstate[i];
+ 
+         BUG_ON(revert_cnt != 1);
+-        BUG_ON(func->applied != LIVEPATCH_FUNC_APPLIED);
++        BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED);
+ 
+         /* Outside of quiesce zone: MAY TRIGGER HOST CRASH/UNDEFINED BEHAVIOR */
+         arch_livepatch_quiesce();
+         common_livepatch_revert(payload);
+         arch_livepatch_revive();
+-        BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED);
++        BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED);
+ 
+         printk(KERN_DEBUG "%s: post reverted: %s\n", __func__, func->name);
+     }
+diff --git a/xen/test/livepatch/xen_prepost_hooks.c b/xen/test/livepatch/xen_prepost_hooks.c
+index 889377d6eb..17f5af6a19 100644
+--- a/xen/test/livepatch/xen_prepost_hooks.c
++++ b/xen/test/livepatch/xen_prepost_hooks.c
+@@ -30,7 +30,7 @@ static int pre_apply_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
+ 
+         pre_apply_cnt++;
+         printk(KERN_DEBUG "%s: applying: %s\n", __func__, func->name);
+@@ -49,7 +49,7 @@ static void post_apply_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
+ 
+         post_apply_cnt++;
+         printk(KERN_DEBUG "%s: applied: %s\n", __func__, func->name);
+@@ -66,7 +66,7 @@ static int pre_revert_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
+ 
+         pre_revert_cnt++;
+         printk(KERN_DEBUG "%s: reverting: %s\n", __func__, func->name);
+@@ -86,7 +86,7 @@ static void post_revert_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
+ 
+         post_revert_cnt++;
+         printk(KERN_DEBUG "%s: reverted: %s\n", __func__, func->name);
+diff --git a/xen/test/livepatch/xen_prepost_hooks_fail.c b/xen/test/livepatch/xen_prepost_hooks_fail.c
+index c6feb5d32d..52fd7f642e 100644
+--- a/xen/test/livepatch/xen_prepost_hooks_fail.c
++++ b/xen/test/livepatch/xen_prepost_hooks_fail.c
+@@ -24,7 +24,7 @@ static int pre_apply_hook(livepatch_payload_t *payload)
+ 
+     for (i = 0; i < payload->nfuncs; i++)
+     {
+-        struct livepatch_func *func = &payload->funcs[i];
++        const struct livepatch_func *func = &payload->funcs[i];
+ 
+         printk(KERN_DEBUG "%s: pre applying: %s\n", __func__, func->name);
+     }
+-- 
+2.44.0
+
+
+From 61d032e322b178a49983359b0dfd64a42c1f5fca Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo@cloud.com>
+Date: Wed, 6 Dec 2023 10:39:15 +0100
+Subject: [PATCH 09/70] xen/x86: In x2APIC mode, derive LDR from APIC ID
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Both Intel and AMD manuals agree that in x2APIC mode, the APIC LDR and ID
+registers are derivable from each other through a fixed formula.
+
+Xen uses that formula, but applies it to vCPU IDs (which are sequential)
+rather than x2APIC IDs (which are not, at the moment). As I understand it,
+this is an attempt to tightly pack vCPUs into clusters so each cluster has
+16 vCPUs rather than 8, but this is a spec violation.
+
+This patch fixes the implementation so we follow the x2APIC spec for new
+VMs, while preserving the behaviour (buggy or fixed) for migrated-in VMs.
+
+While touching that area, remove the existing printk statement in
+lapic_load_fixup() (as the checks it performed didn't make sense in x2APIC
+mode and wouldn't affect the outcome) and put another printk as an else
+branch so we get warnings trying to load nonsensical LDR values we don't
+know about.
+
+Fixes: f9e0cccf7b35 ("x86/HVM: fix ID handling of x2APIC emulation")
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo@cloud.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: 90309854fd2440fb08b4c808f47d7670ba0d250d
+master date: 2023-11-29 10:05:55 +0100
+---
+ xen/arch/x86/hvm/vlapic.c             | 64 +++++++++++++++++++--------
+ xen/arch/x86/include/asm/hvm/domain.h |  3 ++
+ 2 files changed, 48 insertions(+), 19 deletions(-)
+
+diff --git a/xen/arch/x86/hvm/vlapic.c b/xen/arch/x86/hvm/vlapic.c
+index c7ce82d064..ba569043ea 100644
+--- a/xen/arch/x86/hvm/vlapic.c
++++ b/xen/arch/x86/hvm/vlapic.c
+@@ -1061,13 +1061,26 @@ static const struct hvm_mmio_ops vlapic_mmio_ops = {
+     .write = vlapic_mmio_write,
+ };
+ 
++static uint32_t x2apic_ldr_from_id(uint32_t id)
++{
++    return ((id & ~0xf) << 12) | (1 << (id & 0xf));
++}
++
+ static void set_x2apic_id(struct vlapic *vlapic)
+ {
+-    u32 id = vlapic_vcpu(vlapic)->vcpu_id;
+-    u32 ldr = ((id & ~0xf) << 12) | (1 << (id & 0xf));
++    const struct vcpu *v = vlapic_vcpu(vlapic);
++    uint32_t apic_id = v->vcpu_id * 2;
++    uint32_t apic_ldr = x2apic_ldr_from_id(apic_id);
+ 
+-    vlapic_set_reg(vlapic, APIC_ID, id * 2);
+-    vlapic_set_reg(vlapic, APIC_LDR, ldr);
++    /*
++     * Workaround for migrated domains to derive LDRs as the source host
++     * would've.
++     */
++    if ( v->domain->arch.hvm.bug_x2apic_ldr_vcpu_id )
++        apic_ldr = x2apic_ldr_from_id(v->vcpu_id);
++
++    vlapic_set_reg(vlapic, APIC_ID, apic_id);
++    vlapic_set_reg(vlapic, APIC_LDR, apic_ldr);
+ }
+ 
+ int guest_wrmsr_apic_base(struct vcpu *v, uint64_t val)
+@@ -1498,27 +1511,40 @@ static int cf_check lapic_save_regs(struct vcpu *v, hvm_domain_context_t *h)
+  */
+ static void lapic_load_fixup(struct vlapic *vlapic)
+ {
+-    uint32_t id = vlapic->loaded.id;
++    const struct vcpu *v = vlapic_vcpu(vlapic);
++    uint32_t good_ldr = x2apic_ldr_from_id(vlapic->loaded.id);
+ 
+-    if ( vlapic_x2apic_mode(vlapic) && id && vlapic->loaded.ldr == 1 )
++    /* Skip fixups on xAPIC mode, or if the x2APIC LDR is already correct */
++    if ( !vlapic_x2apic_mode(vlapic) ||
++         (vlapic->loaded.ldr == good_ldr) )
++        return;
++
++    if ( vlapic->loaded.ldr == 1 )
+     {
+-        /*
+-         * This is optional: ID != 0 contradicts LDR == 1. It's being added
+-         * to aid in eventual debugging of issues arising from the fixup done
+-         * here, but can be dropped as soon as it is found to conflict with
+-         * other (future) changes.
+-         */
+-        if ( GET_xAPIC_ID(id) != vlapic_vcpu(vlapic)->vcpu_id * 2 ||
+-             id != SET_xAPIC_ID(GET_xAPIC_ID(id)) )
+-            printk(XENLOG_G_WARNING "%pv: bogus APIC ID %#x loaded\n",
+-                   vlapic_vcpu(vlapic), id);
++       /*
++        * Xen <= 4.4 may have a bug by which all the APICs configured in
++        * x2APIC mode got LDR = 1, which is inconsistent on every vCPU
++        * except for the one with ID = 0. We'll fix the bug now and assign
++        * an LDR value consistent with the APIC ID.
++        */
+         set_x2apic_id(vlapic);
+     }
+-    else /* Undo an eventual earlier fixup. */
++    else if ( vlapic->loaded.ldr == x2apic_ldr_from_id(v->vcpu_id) )
+     {
+-        vlapic_set_reg(vlapic, APIC_ID, id);
+-        vlapic_set_reg(vlapic, APIC_LDR, vlapic->loaded.ldr);
++        /*
++         * Migrations from Xen 4.4 to date (4.19 dev window, Nov 2023) may
++         * have LDR derived from the vCPU ID, not the APIC ID. We must preserve
++         * LDRs so new vCPUs use consistent derivations and existing guests,
++         * which may have already read the LDR at the source host, aren't
++         * surprised when interrupts stop working the way they did at the
++         * other end.
++         */
++        v->domain->arch.hvm.bug_x2apic_ldr_vcpu_id = true;
+     }
++    else
++        printk(XENLOG_G_WARNING
++               "%pv: bogus x2APIC record: ID %#x, LDR %#x, expected LDR %#x\n",
++               v, vlapic->loaded.id, vlapic->loaded.ldr, good_ldr);
+ }
+ 
+ static int cf_check lapic_load_hidden(struct domain *d, hvm_domain_context_t *h)
+diff --git a/xen/arch/x86/include/asm/hvm/domain.h b/xen/arch/x86/include/asm/hvm/domain.h
+index 6e53ce4449..dd9d837e84 100644
+--- a/xen/arch/x86/include/asm/hvm/domain.h
++++ b/xen/arch/x86/include/asm/hvm/domain.h
+@@ -106,6 +106,9 @@ struct hvm_domain {
+ 
+     bool                   is_s3_suspended;
+ 
++    /* Compatibility setting for a bug in x2APIC LDR */
++    bool bug_x2apic_ldr_vcpu_id;
++
+     /* hypervisor intercepted msix table */
+     struct list_head       msixtbl_list;
+ 
+-- 
+2.44.0
+
+
+From 3af9d1cbb602a9dcbab2e43fab74a881c2e05d81 Mon Sep 17 00:00:00 2001
+From: Alejandro Vallejo <alejandro.vallejo@cloud.com>
+Date: Wed, 6 Dec 2023 10:39:55 +0100
+Subject: [PATCH 10/70] tools/xg: Fix potential memory leak in cpu policy
+ getters/setters
+
+They allocate two different hypercall buffers, but leak the first
+allocation if the second one fails, because an early return bypasses
+cleanup.
+
+Remove the early exit and go through _post() instead. Invoking _post() is
+benign even if _pre() failed.
+
+Fixes: 6b85e427098c ('x86/sysctl: Implement XEN_SYSCTL_get_cpu_policy')
+Fixes: 60529dfeca14 ('x86/domctl: Implement XEN_DOMCTL_get_cpu_policy')
+Fixes: 14ba07e6f816 ('x86/domctl: Implement XEN_DOMCTL_set_cpumsr_policy')
+Signed-off-by: Alejandro Vallejo <alejandro.vallejo@cloud.com>
+Reviewed-by: Anthony PERARD <anthony.perard@citrix.com>
+master commit: 1571ff7a987b88b20598a6d49910457f3b2c59f1
+master date: 2023-12-01 10:53:07 +0100
+---
+ tools/libs/guest/xg_cpuid_x86.c | 86 +++++++++++++++------------------
+ 1 file changed, 39 insertions(+), 47 deletions(-)
+
+diff --git a/tools/libs/guest/xg_cpuid_x86.c b/tools/libs/guest/xg_cpuid_x86.c
+index f2b1e80901..3a74bb2b37 100644
+--- a/tools/libs/guest/xg_cpuid_x86.c
++++ b/tools/libs/guest/xg_cpuid_x86.c
+@@ -136,20 +136,20 @@ static int get_system_cpu_policy(xc_interface *xch, uint32_t index,
+     DECLARE_HYPERCALL_BOUNCE(msrs,
+                              *nr_msrs * sizeof(*msrs),
+                              XC_HYPERCALL_BUFFER_BOUNCE_OUT);
+-    int ret;
+-
+-    if ( xc_hypercall_bounce_pre(xch, leaves) ||
+-         xc_hypercall_bounce_pre(xch, msrs) )
+-        return -1;
++    int ret = -1;
+ 
+-    sysctl.cmd = XEN_SYSCTL_get_cpu_policy;
+-    sysctl.u.cpu_policy.index = index;
+-    sysctl.u.cpu_policy.nr_leaves = *nr_leaves;
+-    set_xen_guest_handle(sysctl.u.cpu_policy.leaves, leaves);
+-    sysctl.u.cpu_policy.nr_msrs = *nr_msrs;
+-    set_xen_guest_handle(sysctl.u.cpu_policy.msrs, msrs);
+-
+-    ret = do_sysctl(xch, &sysctl);
++    if ( !xc_hypercall_bounce_pre(xch, leaves) &&
++         !xc_hypercall_bounce_pre(xch, msrs) )
++    {
++        sysctl.cmd = XEN_SYSCTL_get_cpu_policy;
++        sysctl.u.cpu_policy.index = index;
++        sysctl.u.cpu_policy.nr_leaves = *nr_leaves;
++        set_xen_guest_handle(sysctl.u.cpu_policy.leaves, leaves);
++        sysctl.u.cpu_policy.nr_msrs = *nr_msrs;
++        set_xen_guest_handle(sysctl.u.cpu_policy.msrs, msrs);
++
++        ret = do_sysctl(xch, &sysctl);
++    }
+ 
+     xc_hypercall_bounce_post(xch, leaves);
+     xc_hypercall_bounce_post(xch, msrs);
+@@ -174,20 +174,20 @@ static int get_domain_cpu_policy(xc_interface *xch, uint32_t domid,
+     DECLARE_HYPERCALL_BOUNCE(msrs,
+                              *nr_msrs * sizeof(*msrs),
+                              XC_HYPERCALL_BUFFER_BOUNCE_OUT);
+-    int ret;
+-
+-    if ( xc_hypercall_bounce_pre(xch, leaves) ||
+-         xc_hypercall_bounce_pre(xch, msrs) )
+-        return -1;
+-
+-    domctl.cmd = XEN_DOMCTL_get_cpu_policy;
+-    domctl.domain = domid;
+-    domctl.u.cpu_policy.nr_leaves = *nr_leaves;
+-    set_xen_guest_handle(domctl.u.cpu_policy.leaves, leaves);
+-    domctl.u.cpu_policy.nr_msrs = *nr_msrs;
+-    set_xen_guest_handle(domctl.u.cpu_policy.msrs, msrs);
++    int ret = -1;
+ 
+-    ret = do_domctl(xch, &domctl);
++    if ( !xc_hypercall_bounce_pre(xch, leaves) &&
++         !xc_hypercall_bounce_pre(xch, msrs) )
++    {
++        domctl.cmd = XEN_DOMCTL_get_cpu_policy;
++        domctl.domain = domid;
++        domctl.u.cpu_policy.nr_leaves = *nr_leaves;
++        set_xen_guest_handle(domctl.u.cpu_policy.leaves, leaves);
++        domctl.u.cpu_policy.nr_msrs = *nr_msrs;
++        set_xen_guest_handle(domctl.u.cpu_policy.msrs, msrs);
++
++        ret = do_domctl(xch, &domctl);
++    }
+ 
+     xc_hypercall_bounce_post(xch, leaves);
+     xc_hypercall_bounce_post(xch, msrs);
+@@ -214,32 +214,24 @@ int xc_set_domain_cpu_policy(xc_interface *xch, uint32_t domid,
+     DECLARE_HYPERCALL_BOUNCE(msrs,
+                              nr_msrs * sizeof(*msrs),
+                              XC_HYPERCALL_BUFFER_BOUNCE_IN);
+-    int ret;
+-
+-    if ( err_leaf_p )
+-        *err_leaf_p = -1;
+-    if ( err_subleaf_p )
+-        *err_subleaf_p = -1;
+-    if ( err_msr_p )
+-        *err_msr_p = -1;
++    int ret = -1;
+ 
+-    if ( xc_hypercall_bounce_pre(xch, leaves) )
+-        return -1;
+-
+-    if ( xc_hypercall_bounce_pre(xch, msrs) )
+-        return -1;
+-
+-    domctl.cmd = XEN_DOMCTL_set_cpu_policy;
+-    domctl.domain = domid;
+-    domctl.u.cpu_policy.nr_leaves = nr_leaves;
+-    set_xen_guest_handle(domctl.u.cpu_policy.leaves, leaves);
+-    domctl.u.cpu_policy.nr_msrs = nr_msrs;
+-    set_xen_guest_handle(domctl.u.cpu_policy.msrs, msrs);
+     domctl.u.cpu_policy.err_leaf = -1;
+     domctl.u.cpu_policy.err_subleaf = -1;
+     domctl.u.cpu_policy.err_msr = -1;
+ 
+-    ret = do_domctl(xch, &domctl);
++    if ( !xc_hypercall_bounce_pre(xch, leaves) &&
++         !xc_hypercall_bounce_pre(xch, msrs) )
++    {
++        domctl.cmd = XEN_DOMCTL_set_cpu_policy;
++        domctl.domain = domid;
++        domctl.u.cpu_policy.nr_leaves = nr_leaves;
++        set_xen_guest_handle(domctl.u.cpu_policy.leaves, leaves);
++        domctl.u.cpu_policy.nr_msrs = nr_msrs;
++        set_xen_guest_handle(domctl.u.cpu_policy.msrs, msrs);
++
++        ret = do_domctl(xch, &domctl);
++    }
+ 
+     xc_hypercall_bounce_post(xch, leaves);
+     xc_hypercall_bounce_post(xch, msrs);
+-- 
+2.44.0
+
+
+From 18f900b77b3a85acadc2fe152ea354a02569acab Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Wed, 6 Dec 2023 10:40:19 +0100
+Subject: [PATCH 11/70] x86emul: avoid triggering event related assertions
+
+The assertion at the end of x86_emulate_wrapper() as well as the ones
+in x86_emul_{hw_exception,pagefault}() can trigger if we ignore
+X86EMUL_EXCEPTION coming back from certain hook functions. Squash
+exceptions when merely probing MSRs, plus on SWAPGS'es "best effort"
+error handling path.
+
+In adjust_bnd() add another assertion after the read_xcr(0, ...)
+invocation, paralleling the one in x86emul_get_fpu() - XCR0 reads should
+never fault when XSAVE is (implicitly) known to be available.
+
+Also update the respective comment in x86_emulate_wrapper().
+
+Fixes: 14a6be89ec04 ("x86emul: correct EFLAGS.TF handling")
+Fixes: cb2626c75813 ("x86emul: conditionally clear BNDn for branches")
+Fixes: 6eb43fcf8a0b ("x86emul: support SWAPGS")
+Reported-by: AFL
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: 787d11c5aaf4d3411d4658cff137cd49b0bd951b
+master date: 2023-12-05 09:57:05 +0100
+---
+ xen/arch/x86/x86_emulate/0f01.c        |  6 ++++--
+ xen/arch/x86/x86_emulate/0fae.c        |  3 +++
+ xen/arch/x86/x86_emulate/x86_emulate.c | 28 +++++++++++++++++++++-----
+ 3 files changed, 30 insertions(+), 7 deletions(-)
+
+diff --git a/xen/arch/x86/x86_emulate/0f01.c b/xen/arch/x86/x86_emulate/0f01.c
+index ba43fc394b..1ba99609d6 100644
+--- a/xen/arch/x86/x86_emulate/0f01.c
++++ b/xen/arch/x86/x86_emulate/0f01.c
+@@ -200,8 +200,10 @@ int x86emul_0f01(struct x86_emulate_state *s,
+         if ( (rc = ops->write_segment(x86_seg_gs, &sreg,
+                                       ctxt)) != X86EMUL_OKAY )
+         {
+-            /* Best effort unwind (i.e. no error checking). */
+-            ops->write_msr(MSR_SHADOW_GS_BASE, msr_val, ctxt);
++            /* Best effort unwind (i.e. no real error checking). */
++            if ( ops->write_msr(MSR_SHADOW_GS_BASE, msr_val,
++                                ctxt) == X86EMUL_EXCEPTION )
++                x86_emul_reset_event(ctxt);
+             goto done;
+         }
+         break;
+diff --git a/xen/arch/x86/x86_emulate/0fae.c b/xen/arch/x86/x86_emulate/0fae.c
+index 00840b1d07..ba77af58f2 100644
+--- a/xen/arch/x86/x86_emulate/0fae.c
++++ b/xen/arch/x86/x86_emulate/0fae.c
+@@ -55,7 +55,10 @@ int x86emul_0fae(struct x86_emulate_state *s,
+                     cr4 = X86_CR4_OSFXSR;
+                 if ( !ops->read_msr ||
+                      ops->read_msr(MSR_EFER, &msr_val, ctxt) != X86EMUL_OKAY )
++                {
++                    x86_emul_reset_event(ctxt);
+                     msr_val = 0;
++                }
+                 if ( !(cr4 & X86_CR4_OSFXSR) ||
+                      (mode_64bit() && mode_ring0() && (msr_val & EFER_FFXSE)) )
+                     s->op_bytes = offsetof(struct x86_fxsr, xmm[0]);
+diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c
+index 94caec1d14..cf780da501 100644
+--- a/xen/arch/x86/x86_emulate/x86_emulate.c
++++ b/xen/arch/x86/x86_emulate/x86_emulate.c
+@@ -1143,10 +1143,18 @@ static bool is_branch_step(struct x86_emulate_ctxt *ctxt,
+                            const struct x86_emulate_ops *ops)
+ {
+     uint64_t debugctl;
++    int rc = X86EMUL_UNHANDLEABLE;
+ 
+-    return ops->read_msr &&
+-           ops->read_msr(MSR_IA32_DEBUGCTLMSR, &debugctl, ctxt) == X86EMUL_OKAY &&
+-           (debugctl & IA32_DEBUGCTLMSR_BTF);
++    if ( !ops->read_msr ||
++         (rc = ops->read_msr(MSR_IA32_DEBUGCTLMSR, &debugctl,
++                             ctxt)) != X86EMUL_OKAY )
++    {
++        if ( rc == X86EMUL_EXCEPTION )
++            x86_emul_reset_event(ctxt);
++        debugctl = 0;
++    }
++
++    return debugctl & IA32_DEBUGCTLMSR_BTF;
+ }
+ 
+ static void adjust_bnd(struct x86_emulate_ctxt *ctxt,
+@@ -1160,13 +1168,21 @@ static void adjust_bnd(struct x86_emulate_ctxt *ctxt,
+ 
+     if ( !ops->read_xcr || ops->read_xcr(0, &xcr0, ctxt) != X86EMUL_OKAY ||
+          !(xcr0 & X86_XCR0_BNDREGS) || !(xcr0 & X86_XCR0_BNDCSR) )
++    {
++        ASSERT(!ctxt->event_pending);
+         return;
++    }
+ 
+     if ( !mode_ring0() )
+         bndcfg = read_bndcfgu();
+     else if ( !ops->read_msr ||
+-              ops->read_msr(MSR_IA32_BNDCFGS, &bndcfg, ctxt) != X86EMUL_OKAY )
++              (rc = ops->read_msr(MSR_IA32_BNDCFGS, &bndcfg,
++                                  ctxt)) != X86EMUL_OKAY )
++    {
++        if ( rc == X86EMUL_EXCEPTION )
++            x86_emul_reset_event(ctxt);
+         return;
++    }
+     if ( (bndcfg & IA32_BNDCFGS_ENABLE) && !(bndcfg & IA32_BNDCFGS_PRESERVE) )
+     {
+         /*
+@@ -8677,7 +8693,9 @@ int x86_emulate_wrapper(
+      * An event being pending should exactly match returning
+      * X86EMUL_EXCEPTION.  (If this trips, the chances are a codepath has
+      * called hvm_inject_hw_exception() rather than using
+-     * x86_emul_hw_exception().)
++     * x86_emul_hw_exception(), or the invocation of a hook has caused an
++     * exception to be raised, while the caller was only checking for
++     * success/failure.)
+      */
+     ASSERT(ctxt->event_pending == (rc == X86EMUL_EXCEPTION));
+ 
+-- 
+2.44.0
+
+
+From 5ac87c8afd2ae2b1a9fd46a9b80d9152d650fb26 Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Wed, 6 Dec 2023 10:40:54 +0100
+Subject: [PATCH 12/70] xen/sched: fix adding offline cpu to cpupool
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Trying to add an offline cpu to a cpupool can crash the hypervisor,
+as the probably non-existing percpu area of the cpu is accessed before
+the availability of the cpu is being tested. This can happen in case
+the cpupool's granularity is "core" or "socket".
+
+Fix that by testing the cpu to be online.
+
+Fixes: cb563d7665f2 ("xen/sched: support core scheduling for moving cpus to/from cpupools")
+Reported-by: René Winther Højgaard <renewin@proton.me>
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: 06e8d65d33896aa90f5b6d9b2bce7f11433b33c9
+master date: 2023-12-05 09:57:38 +0100
+---
+ xen/common/sched/cpupool.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/xen/common/sched/cpupool.c b/xen/common/sched/cpupool.c
+index 2e094b0cfa..ad8f608462 100644
+--- a/xen/common/sched/cpupool.c
++++ b/xen/common/sched/cpupool.c
+@@ -892,6 +892,8 @@ int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op)
+         if ( cpu >= nr_cpu_ids )
+             goto addcpu_out;
+         ret = -ENODEV;
++        if ( !cpu_online(cpu) )
++            goto addcpu_out;
+         cpus = sched_get_opt_cpumask(c->gran, cpu);
+         if ( !cpumask_subset(cpus, &cpupool_free_cpus) ||
+              cpumask_intersects(cpus, &cpupool_locked_cpus) )
+-- 
+2.44.0
+
+
+From 25b7f9ed0f8c7e138a2cecb113bd377c613153d7 Mon Sep 17 00:00:00 2001
+From: Stewart Hildebrand <stewart.hildebrand@amd.com>
+Date: Wed, 6 Dec 2023 10:41:19 +0100
+Subject: [PATCH 13/70] xen/domain: fix error path in domain_create()
+
+If rangeset_new() fails, err would not be set to an appropriate error
+code. Set it to -ENOMEM.
+
+Fixes: 580c458699e3 ("xen/domain: Call arch_domain_create() as early as possible in domain_create()")
+Signed-off-by: Stewart Hildebrand <stewart.hildebrand@amd.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: ff1178062094837d55ef342070e58316c43a54c9
+master date: 2023-12-05 10:00:51 +0100
+---
+ xen/common/domain.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/xen/common/domain.c b/xen/common/domain.c
+index 8f9ab01c0c..003f4ab125 100644
+--- a/xen/common/domain.c
++++ b/xen/common/domain.c
+@@ -703,6 +703,7 @@ struct domain *domain_create(domid_t domid,
+         watchdog_domain_init(d);
+         init_status |= INIT_watchdog;
+ 
++        err = -ENOMEM;
+         d->iomem_caps = rangeset_new(d, "I/O Memory", RANGESETF_prettyprint_hex);
+         d->irq_caps   = rangeset_new(d, "Interrupts", 0);
+         if ( !d->iomem_caps || !d->irq_caps )
+-- 
+2.44.0
+
+
+From a56d598e13db413f98e149f8e10cc13e8d4c1635 Mon Sep 17 00:00:00 2001
+From: Julien Grall <jgrall@amazon.com>
+Date: Tue, 12 Dec 2023 14:26:18 +0100
+Subject: [PATCH 14/70] Only compile the hypervisor with
+ -Wdeclaration-after-statement
+
+Right now, all tools and hypervisor will be complied with the option
+-Wdeclaration-after-statement. While most of the code in the hypervisor
+is controlled by us, for tools we may import external libraries.
+
+The build will fail if one of them are using the construct we are
+trying to prevent. This is the case when building against Python 3.12
+and Yocto:
+
+| In file included from /srv/storage/alex/yocto/build-virt/tmp/work/core2-64-poky-linux/xen-tools/4.17+stable/recipe-sysroot/usr/include/python3.12/Python.h:44,
+|                  from xen/lowlevel/xc/xc.c:8:
+| /srv/storage/alex/yocto/build-virt/tmp/work/core2-64-poky-linux/xen-tools/4.17+stable/recipe-sysroot/usr/include/python3.12/object.h: In function 'Py_SIZE':
+| /srv/storage/alex/yocto/build-virt/tmp/work/core2-64-poky-linux/xen-tools/4.17+stable/recipe-sysroot/usr/include/python3.12/object.h:233:5: error: ISO C90 forbids mixed declarations and code [-Werror=declaration-after-statement]
+|   233 |     PyVarObject *var_ob = _PyVarObject_CAST(ob);
+|       |     ^~~~~~~~~~~
+| In file included from /srv/storage/alex/yocto/build-virt/tmp/work/core2-64-poky-linux/xen-tools/4.17+stable/recipe-sysroot/usr/include/python3.12/Python.h:53:
+| /srv/storage/alex/yocto/build-virt/tmp/work/core2-64-poky-linux/xen-tools/4.17+stable/recipe-sysroot/usr/include/python3.12/cpython/longintrepr.h: In function '_PyLong_CompactValue':
+| /srv/storage/alex/yocto/build-virt/tmp/work/core2-64-poky-linux/xen-tools/4.17+stable/recipe-sysroot/usr/include/python3.12/cpython/longintrepr.h:121:5: error: ISO C90 forbids mixed declarations and code [-Werror=declaration-after-statement]
+|   121 |     Py_ssize_t sign = 1 - (op->long_value.lv_tag & _PyLong_SIGN_MASK);
+|       |     ^~~~~~~~~~
+| cc1: all warnings being treated as errors
+
+Looking at the tools directory, a fair few directory already add
+-Wno-declaration-after-statement to inhibit the default behavior.
+
+We have always build the hypervisor with the flag, so for now remove
+only the flag for anything but the hypervisor. We can decide at later
+time whether we want to relax.
+
+Also remove the -Wno-declaration-after-statement in some subdirectory
+as the flag is now unnecessary.
+
+Part of the commit message was take from Alexander's first proposal:
+
+Link: https://lore.kernel.org/xen-devel/20231128174729.3880113-1-alex@linutronix.de/
+Reported-by: Alexander Kanavin <alex@linutronix.de>
+Acked-by: Anthony PERARD <anthony.perard@citrix.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Tested-by: Jason Andryuk <jandryuk@gmail.com>
+Signed-off-by: Julien Grall <jgrall@amazon.com>
+
+xen/hypervisor: Don't use cc-option-add for -Wdeclaration-after-statement
+
+Per Andrew's comment in [1] all the compilers we support should
+recognize the flag.
+
+I forgot to address the comment while committing.
+
+[1] fcf00090-304a-49f7-8a61-a54347e90a3b@citrix.com
+
+Signed-off-by: Julien Grall <jgrall@amazon.com>
+master commit: 40be6307ec005539635e7b8fcef67e989dc441f6
+master date: 2023-12-06 19:12:40 +0000
+master commit: d4bfd3899886d0fbe259c20660dadb1e00170f2d
+master date: 2023-12-06 19:19:59 +0000
+---
+ Config.mk                   | 2 --
+ stubdom/Makefile            | 2 +-
+ stubdom/vtpmmgr/Makefile    | 2 +-
+ tools/libs/light/Makefile   | 3 +--
+ tools/libs/util/Makefile    | 3 +--
+ tools/tests/depriv/Makefile | 2 --
+ tools/xl/Makefile           | 3 +--
+ xen/Makefile                | 1 +
+ 8 files changed, 6 insertions(+), 12 deletions(-)
+
+diff --git a/Config.mk b/Config.mk
+index 29b0d1e12a..2a3e16d0bd 100644
+--- a/Config.mk
++++ b/Config.mk
+@@ -177,8 +177,6 @@ CFLAGS += -std=gnu99
+ 
+ CFLAGS += -Wall -Wstrict-prototypes
+ 
+-$(call cc-option-add,HOSTCFLAGS,HOSTCC,-Wdeclaration-after-statement)
+-$(call cc-option-add,CFLAGS,CC,-Wdeclaration-after-statement)
+ $(call cc-option-add,CFLAGS,CC,-Wno-unused-but-set-variable)
+ $(call cc-option-add,CFLAGS,CC,-Wno-unused-local-typedefs)
+ 
+diff --git a/stubdom/Makefile b/stubdom/Makefile
+index 0ddfce1ba2..888fa20d72 100644
+--- a/stubdom/Makefile
++++ b/stubdom/Makefile
+@@ -245,7 +245,7 @@ tpm_emulator-$(XEN_TARGET_ARCH): tpm_emulator-$(TPMEMU_VERSION).tar.gz
+ 	patch -d $@ -p1 < vtpm-command-duration.patch
+ 	patch -d $@ -p1 < vtpm-tpm_bn_t-addr.patch
+ 	mkdir $@/build
+-	cd $@/build; CC=${CC} $(CMAKE) .. -DCMAKE_C_FLAGS:STRING="-std=c99 -DTPM_NO_EXTERN $(TARGET_CPPFLAGS) $(TARGET_CFLAGS) -Wno-declaration-after-statement"
++	cd $@/build; CC=${CC} $(CMAKE) .. -DCMAKE_C_FLAGS:STRING="-std=c99 -DTPM_NO_EXTERN $(TARGET_CPPFLAGS) $(TARGET_CFLAGS)"
+ 	touch $@
+ 
+ TPMEMU_STAMPFILE=$(CROSS_ROOT)/$(GNU_TARGET_ARCH)-xen-elf/lib/libtpm.a
+diff --git a/stubdom/vtpmmgr/Makefile b/stubdom/vtpmmgr/Makefile
+index 6dae034a07..c29bb49838 100644
+--- a/stubdom/vtpmmgr/Makefile
++++ b/stubdom/vtpmmgr/Makefile
+@@ -17,7 +17,7 @@ OBJS += vtpm_disk.o disk_tpm.o disk_io.o disk_crypto.o disk_read.o disk_write.o
+ OBJS += mgmt_authority.o
+ 
+ CFLAGS+=-Werror -Iutil -Icrypto -Itcs
+-CFLAGS+=-Wno-declaration-after-statement -Wno-unused-label
++CFLAGS+=-Wno-unused-label
+ 
+ build: $(TARGET)
+ $(TARGET): $(OBJS)
+diff --git a/tools/libs/light/Makefile b/tools/libs/light/Makefile
+index ba4c1b7933..37e4d16709 100644
+--- a/tools/libs/light/Makefile
++++ b/tools/libs/light/Makefile
+@@ -38,8 +38,7 @@ vpath static_tables.c $(ACPI_PATH)/
+ 
+ OBJS-$(CONFIG_X86) += $(ACPI_OBJS)
+ 
+-CFLAGS += -Wno-format-zero-length -Wmissing-declarations \
+-	-Wno-declaration-after-statement -Wformat-nonliteral
++CFLAGS += -Wno-format-zero-length -Wmissing-declarations -Wformat-nonliteral
+ 
+ CFLAGS-$(CONFIG_X86) += -DCONFIG_PCI_SUPP_LEGACY_IRQ
+ 
+diff --git a/tools/libs/util/Makefile b/tools/libs/util/Makefile
+index c3b21875dc..936ec90a31 100644
+--- a/tools/libs/util/Makefile
++++ b/tools/libs/util/Makefile
+@@ -9,8 +9,7 @@ OBJS-y += libxlu_disk.o
+ OBJS-y += libxlu_vif.o
+ OBJS-y += libxlu_pci.o
+ 
+-CFLAGS += -Wno-format-zero-length -Wmissing-declarations \
+-	-Wno-declaration-after-statement -Wformat-nonliteral
++CFLAGS += -Wno-format-zero-length -Wmissing-declarations -Wformat-nonliteral
+ CFLAGS += $(CFLAGS_libxenctrl)
+ 
+ CFLAGS += $(PTHREAD_CFLAGS)
+diff --git a/tools/tests/depriv/Makefile b/tools/tests/depriv/Makefile
+index 7d9e3b01bb..5404a12f47 100644
+--- a/tools/tests/depriv/Makefile
++++ b/tools/tests/depriv/Makefile
+@@ -1,8 +1,6 @@
+ XEN_ROOT=$(CURDIR)/../../..
+ include $(XEN_ROOT)/tools/Rules.mk
+ 
+-CFLAGS += -Wno-declaration-after-statement
+-
+ CFLAGS += $(CFLAGS_xeninclude)
+ CFLAGS += $(CFLAGS_libxenctrl)
+ CFLAGS += $(CFLAGS_libxencall)
+diff --git a/tools/xl/Makefile b/tools/xl/Makefile
+index 5f7aa5f46c..d742e96a5b 100644
+--- a/tools/xl/Makefile
++++ b/tools/xl/Makefile
+@@ -5,8 +5,7 @@
+ XEN_ROOT = $(CURDIR)/../..
+ include $(XEN_ROOT)/tools/Rules.mk
+ 
+-CFLAGS += -Wno-format-zero-length -Wmissing-declarations \
+-	-Wno-declaration-after-statement -Wformat-nonliteral
++CFLAGS += -Wno-format-zero-length -Wmissing-declarations -Wformat-nonliteral
+ CFLAGS += -fPIC
+ 
+ CFLAGS += $(PTHREAD_CFLAGS)
+diff --git a/xen/Makefile b/xen/Makefile
+index e39290f638..a92709b43e 100644
+--- a/xen/Makefile
++++ b/xen/Makefile
+@@ -392,6 +392,7 @@ CFLAGS-$(CONFIG_CC_SPLIT_SECTIONS) += -ffunction-sections -fdata-sections
+ 
+ CFLAGS += -nostdinc -fno-builtin -fno-common
+ CFLAGS += -Werror -Wredundant-decls -Wno-pointer-arith
++CFLAGS += -Wdeclaration-after-statement
+ $(call cc-option-add,CFLAGS,CC,-Wvla)
+ CFLAGS += -pipe -D__XEN__ -include $(srctree)/include/xen/config.h
+ CFLAGS-$(CONFIG_DEBUG_INFO) += -g
+-- 
+2.44.0
+
+
+From 48eb9e91990b3fd42f8e847780f6cdb188245b4a Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Tue, 12 Dec 2023 14:26:35 +0100
+Subject: [PATCH 15/70] xen/sched: fix sched_move_domain()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Do cleanup in sched_move_domain() in a dedicated service function,
+which is called either in error case with newly allocated data, or in
+success case with the old data to be freed.
+
+This will at once fix some subtle bugs which sneaked in due to
+forgetting to overwrite some pointers in the error case.
+
+Fixes: 70fadc41635b ("xen/cpupool: support moving domain between cpupools with different granularity")
+Reported-by: René Winther Højgaard <renewin@proton.me>
+Initial-fix-by: Jan Beulich <jbeulich@suse.com>
+Initial-fix-by: George Dunlap <george.dunlap@cloud.com>
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: George Dunlap <george.dunlap@cloud.com>
+master commit: 23792cc0f22cff4e106d838b83aa9ae1cb6ffaf4
+master date: 2023-12-07 13:37:25 +0000
+---
+ xen/common/sched/core.c | 47 +++++++++++++++++++++++------------------
+ 1 file changed, 27 insertions(+), 20 deletions(-)
+
+diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
+index eba0cea4bb..901782bbb4 100644
+--- a/xen/common/sched/core.c
++++ b/xen/common/sched/core.c
+@@ -647,6 +647,24 @@ static void sched_move_irqs(const struct sched_unit *unit)
+         vcpu_move_irqs(v);
+ }
+ 
++static void sched_move_domain_cleanup(const struct scheduler *ops,
++                                      struct sched_unit *units,
++                                      void *domdata)
++{
++    struct sched_unit *unit, *old_unit;
++
++    for ( unit = units; unit; )
++    {
++        if ( unit->priv )
++            sched_free_udata(ops, unit->priv);
++        old_unit = unit;
++        unit = unit->next_in_list;
++        xfree(old_unit);
++    }
++
++    sched_free_domdata(ops, domdata);
++}
++
+ /*
+  * Move a domain from one cpupool to another.
+  *
+@@ -686,7 +704,6 @@ int sched_move_domain(struct domain *d, struct cpupool *c)
+     void *old_domdata;
+     unsigned int gran = cpupool_get_granularity(c);
+     unsigned int n_units = d->vcpu[0] ? DIV_ROUND_UP(d->max_vcpus, gran) : 0;
+-    int ret = 0;
+ 
+     for_each_vcpu ( d, v )
+     {
+@@ -699,8 +716,9 @@ int sched_move_domain(struct domain *d, struct cpupool *c)
+     domdata = sched_alloc_domdata(c->sched, d);
+     if ( IS_ERR(domdata) )
+     {
+-        ret = PTR_ERR(domdata);
+-        goto out;
++        rcu_read_unlock(&sched_res_rculock);
++
++        return PTR_ERR(domdata);
+     }
+ 
+     for ( unit_idx = 0; unit_idx < n_units; unit_idx++ )
+@@ -718,10 +736,10 @@ int sched_move_domain(struct domain *d, struct cpupool *c)
+ 
+         if ( !unit || !unit->priv )
+         {
+-            old_units = new_units;
+-            old_domdata = domdata;
+-            ret = -ENOMEM;
+-            goto out_free;
++            sched_move_domain_cleanup(c->sched, new_units, domdata);
++            rcu_read_unlock(&sched_res_rculock);
++
++            return -ENOMEM;
+         }
+ 
+         unit_ptr = &unit->next_in_list;
+@@ -808,22 +826,11 @@ int sched_move_domain(struct domain *d, struct cpupool *c)
+ 
+     domain_unpause(d);
+ 
+- out_free:
+-    for ( unit = old_units; unit; )
+-    {
+-        if ( unit->priv )
+-            sched_free_udata(c->sched, unit->priv);
+-        old_unit = unit;
+-        unit = unit->next_in_list;
+-        xfree(old_unit);
+-    }
+-
+-    sched_free_domdata(old_ops, old_domdata);
++    sched_move_domain_cleanup(old_ops, old_units, old_domdata);
+ 
+- out:
+     rcu_read_unlock(&sched_res_rculock);
+ 
+-    return ret;
++    return 0;
+ }
+ 
+ void sched_destroy_vcpu(struct vcpu *v)
+-- 
+2.44.0
+
+
+From a4f3f5a62c10a5adc898cf45261783209f5bc037 Mon Sep 17 00:00:00 2001
+From: Michal Orzel <michal.orzel@amd.com>
+Date: Tue, 12 Dec 2023 14:27:10 +0100
+Subject: [PATCH 16/70] xen/arm: page: Avoid pointer overflow on cache clean &
+ invalidate
+
+On Arm32, after cleaning and invalidating the last dcache line of the top
+domheap page i.e. VA = 0xfffff000 (as a result of flushing the page to
+RAM), we end up adding the value of a dcache line size to the pointer
+once again, which results in a pointer arithmetic overflow (with 64B line
+size, operation 0xffffffc0 + 0x40 overflows to 0x0). Such behavior is
+undefined and given the wide range of compiler versions we support, it is
+difficult to determine what could happen in such scenario.
+
+Modify clean_and_invalidate_dcache_va_range() as well as
+clean_dcache_va_range() and invalidate_dcache_va_range() due to similarity
+of handling to prevent pointer arithmetic overflow. Modify the loops to
+use an additional variable to store the index of the next cacheline.
+Add an assert to prevent passing a region that wraps around which is
+illegal and would end up in a page fault anyway (region 0-2MB is
+unmapped). Lastly, return early if size passed is 0.
+
+Note that on Arm64, we don't have this problem given that the max VA
+space we support is 48-bits.
+
+This is XSA-447 / CVE-2023-46837.
+
+Signed-off-by: Michal Orzel <michal.orzel@amd.com>
+Reviewed-by: Julien Grall <jgrall@amazon.com>
+master commit: 190b7f49af6487a9665da63d43adc9d9a5fbd01e
+master date: 2023-12-12 14:01:00 +0100
+---
+ xen/arch/arm/include/asm/page.h | 35 ++++++++++++++++++++++++++-------
+ 1 file changed, 28 insertions(+), 7 deletions(-)
+
+diff --git a/xen/arch/arm/include/asm/page.h b/xen/arch/arm/include/asm/page.h
+index aa0080e8d7..645331fc89 100644
+--- a/xen/arch/arm/include/asm/page.h
++++ b/xen/arch/arm/include/asm/page.h
+@@ -162,6 +162,13 @@ static inline size_t read_dcache_line_bytes(void)
+ static inline int invalidate_dcache_va_range(const void *p, unsigned long size)
+ {
+     size_t cacheline_mask = dcache_line_bytes - 1;
++    unsigned long idx = 0;
++
++    if ( !size )
++        return 0;
++
++    /* Passing a region that wraps around is illegal */
++    ASSERT(((uintptr_t)p + size - 1) >= (uintptr_t)p);
+ 
+     dsb(sy);           /* So the CPU issues all writes to the range */
+ 
+@@ -174,11 +181,11 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size)
+     }
+ 
+     for ( ; size >= dcache_line_bytes;
+-            p += dcache_line_bytes, size -= dcache_line_bytes )
+-        asm volatile (__invalidate_dcache_one(0) : : "r" (p));
++            idx += dcache_line_bytes, size -= dcache_line_bytes )
++        asm volatile (__invalidate_dcache_one(0) : : "r" (p + idx));
+ 
+     if ( size > 0 )
+-        asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p));
++        asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p + idx));
+ 
+     dsb(sy);           /* So we know the flushes happen before continuing */
+ 
+@@ -188,14 +195,21 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size)
+ static inline int clean_dcache_va_range(const void *p, unsigned long size)
+ {
+     size_t cacheline_mask = dcache_line_bytes - 1;
++    unsigned long idx = 0;
++
++    if ( !size )
++        return 0;
++
++    /* Passing a region that wraps around is illegal */
++    ASSERT(((uintptr_t)p + size - 1) >= (uintptr_t)p);
+ 
+     dsb(sy);           /* So the CPU issues all writes to the range */
+     size += (uintptr_t)p & cacheline_mask;
+     size = (size + cacheline_mask) & ~cacheline_mask;
+     p = (void *)((uintptr_t)p & ~cacheline_mask);
+     for ( ; size >= dcache_line_bytes;
+-            p += dcache_line_bytes, size -= dcache_line_bytes )
+-        asm volatile (__clean_dcache_one(0) : : "r" (p));
++            idx += dcache_line_bytes, size -= dcache_line_bytes )
++        asm volatile (__clean_dcache_one(0) : : "r" (p + idx));
+     dsb(sy);           /* So we know the flushes happen before continuing */
+     /* ARM callers assume that dcache_* functions cannot fail. */
+     return 0;
+@@ -205,14 +219,21 @@ static inline int clean_and_invalidate_dcache_va_range
+     (const void *p, unsigned long size)
+ {
+     size_t cacheline_mask = dcache_line_bytes - 1;
++    unsigned long idx = 0;
++
++    if ( !size )
++        return 0;
++
++    /* Passing a region that wraps around is illegal */
++    ASSERT(((uintptr_t)p + size - 1) >= (uintptr_t)p);
+ 
+     dsb(sy);         /* So the CPU issues all writes to the range */
+     size += (uintptr_t)p & cacheline_mask;
+     size = (size + cacheline_mask) & ~cacheline_mask;
+     p = (void *)((uintptr_t)p & ~cacheline_mask);
+     for ( ; size >= dcache_line_bytes;
+-            p += dcache_line_bytes, size -= dcache_line_bytes )
+-        asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p));
++            idx += dcache_line_bytes, size -= dcache_line_bytes )
++        asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p + idx));
+     dsb(sy);         /* So we know the flushes happen before continuing */
+     /* ARM callers assume that dcache_* functions cannot fail. */
+     return 0;
+-- 
+2.44.0
+
+
+From 1792d1723b7fb45a20b145d2de4d233913b22c09 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 12 Dec 2023 14:45:52 +0100
+Subject: [PATCH 17/70] x86/x2apic: introduce a mixed physical/cluster mode
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The current implementation of x2APIC requires to either use Cluster Logical or
+Physical mode for all interrupts.  However the selection of Physical vs Logical
+is not done at APIC setup, an APIC can be addressed both in Physical or Logical
+destination modes concurrently.
+
+Introduce a new x2APIC mode called Mixed, which uses Logical Cluster mode for
+IPIs, and Physical mode for external interrupts, thus attempting to use the
+best method for each interrupt type.
+
+Using Physical mode for external interrupts allows more vectors to be used, and
+interrupt balancing to be more accurate.
+
+Using Logical Cluster mode for IPIs allows fewer accesses to the ICR register
+when sending those, as multiple CPUs can be targeted with a single ICR register
+write.
+
+A simple test calling flush_tlb_all() 10000 times on a tight loop on AMD EPYC
+9754 with 512 CPUs gives the following figures in nano seconds:
+
+x mixed
++ phys
+* cluster
+    N           Min           Max        Median           Avg        Stddev
+x  25 3.5131328e+08 3.5716441e+08 3.5410987e+08 3.5432659e+08     1566737.4
++  12  1.231082e+09  1.238824e+09 1.2370528e+09 1.2357981e+09     2853892.9
+Difference at 95.0% confidence
+	8.81472e+08 +/- 1.46849e+06
+	248.774% +/- 0.96566%
+	(Student's t, pooled s = 2.05985e+06)
+*  11 3.5099276e+08 3.5561459e+08 3.5461234e+08 3.5415668e+08     1415071.9
+No difference proven at 95.0% confidence
+
+So Mixed has no difference when compared to Cluster mode, and Physical mode is
+248% slower when compared to either Mixed or Cluster modes with a 95%
+confidence.
+
+Note that Xen uses Cluster mode by default, and hence is already using the
+fastest way for IPI delivery at the cost of reducing the amount of vectors
+available system-wide.
+
+Make the newly introduced mode the default one.
+
+Note the printing of the APIC addressing mode done in connect_bsp_APIC() has
+been removed, as with the newly introduced mixed mode this would require more
+fine grained printing, or else would be incorrect.  The addressing mode can
+already be derived from the APIC driver in use, which is printed by different
+helpers.
+
+Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Henry Wang <Henry.Wang@arm.com>
+master commit: e3c409d59ac87ccdf97b8c7708c81efa8069cb31
+master date: 2023-11-07 09:59:48 +0000
+---
+ CHANGELOG.md                      |  7 +++
+ docs/misc/xen-command-line.pandoc | 12 ++++
+ xen/arch/x86/Kconfig              | 35 +++++++++--
+ xen/arch/x86/apic.c               |  6 +-
+ xen/arch/x86/genapic/x2apic.c     | 98 +++++++++++++++++++++++--------
+ 5 files changed, 123 insertions(+), 35 deletions(-)
+
+diff --git a/CHANGELOG.md b/CHANGELOG.md
+index 7fb4d366c3..5aa01dae5d 100644
+--- a/CHANGELOG.md
++++ b/CHANGELOG.md
+@@ -4,6 +4,13 @@ Notable changes to Xen will be documented in this file.
+ 
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
+ 
++## [4.18.1](https://xenbits.xen.org/gitweb/?p=xen.git;a=shortlog;h=RELEASE-4.18.1)
++
++### Added
++ - On x86:
++   - Introduce a new x2APIC driver that uses Cluster Logical addressing mode
++     for IPIs and Physical addressing mode for external interrupts.
++
+ ## [4.18.0](https://xenbits.xenproject.org/gitweb/?p=xen.git;a=shortlog;h=RELEASE-4.18.0) - 2023-11-16
+ 
+ ### Changed
+diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
+index 9a19a04157..8e65f8bd18 100644
+--- a/docs/misc/xen-command-line.pandoc
++++ b/docs/misc/xen-command-line.pandoc
+@@ -2804,6 +2804,15 @@ the watchdog.
+ 
+ Permit use of x2apic setup for SMP environments.
+ 
++### x2apic-mode (x86)
++> `= physical | cluster | mixed`
++
++> Default: `physical` if **FADT** mandates physical mode, otherwise set at
++>          build time by CONFIG_X2APIC_{PHYSICAL,LOGICAL,MIXED}.
++
++In the case that x2apic is in use, this option switches between modes to
++address APICs in the system as interrupt destinations.
++
+ ### x2apic_phys (x86)
+ > `= <boolean>`
+ 
+@@ -2814,6 +2823,9 @@ In the case that x2apic is in use, this option switches between physical and
+ clustered mode.  The default, given no hint from the **FADT**, is cluster
+ mode.
+ 
++**WARNING: `x2apic_phys` is deprecated and superseded by `x2apic-mode`.
++The latter takes precedence if both are set.**
++
+ ### xenheap_megabytes (arm32)
+ > `= <size>`
+ 
+diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig
+index eac77573bd..1acdffc51c 100644
+--- a/xen/arch/x86/Kconfig
++++ b/xen/arch/x86/Kconfig
+@@ -228,11 +228,18 @@ config XEN_ALIGN_2M
+ 
+ endchoice
+ 
+-config X2APIC_PHYSICAL
+-	bool "x2APIC Physical Destination mode"
++choice
++	prompt "x2APIC Driver default"
++	default X2APIC_MIXED
+ 	help
+-	  Use x2APIC Physical Destination mode by default when available.
++	  Select APIC addressing when x2APIC is enabled.
++
++	  The default mode is mixed which should provide the best aspects
++	  of both physical and cluster modes.
+ 
++config X2APIC_PHYSICAL
++	bool "Physical Destination mode"
++	help
+ 	  When using this mode APICs are addressed using the Physical
+ 	  Destination mode, which allows using all dynamic vectors on each
+ 	  CPU independently.
+@@ -242,9 +249,27 @@ config X2APIC_PHYSICAL
+ 	  destination inter processor interrupts (IPIs) slightly slower than
+ 	  Logical Destination mode.
+ 
+-	  The mode when this option is not selected is Logical Destination.
++config X2APIC_CLUSTER
++	bool "Cluster Destination mode"
++	help
++	  When using this mode APICs are addressed using the Cluster Logical
++	  Destination mode.
++
++	  Cluster Destination has the benefit of sending IPIs faster since
++	  multiple APICs can be targeted as destinations of a single IPI.
++	  However the vector space is shared between all CPUs on the cluster,
++	  and hence using this mode reduces the number of available vectors
++	  when compared to Physical mode.
+ 
+-	  If unsure, say N.
++config X2APIC_MIXED
++	bool "Mixed Destination mode"
++	help
++	  When using this mode APICs are addressed using the Cluster Logical
++	  Destination mode for IPIs and Physical mode for external interrupts.
++
++	  Should provide the best of both modes.
++
++endchoice
+ 
+ config GUEST
+ 	bool
+diff --git a/xen/arch/x86/apic.c b/xen/arch/x86/apic.c
+index f1264ce7ed..6acdd0ec14 100644
+--- a/xen/arch/x86/apic.c
++++ b/xen/arch/x86/apic.c
+@@ -229,11 +229,7 @@ void __init connect_bsp_APIC(void)
+         outb(0x01, 0x23);
+     }
+ 
+-    printk("Enabling APIC mode:  %s.  Using %d I/O APICs\n",
+-           !INT_DEST_MODE ? "Physical"
+-                          : init_apic_ldr == init_apic_ldr_flat ? "Flat"
+-                                                                : "Clustered",
+-           nr_ioapics);
++    printk("Enabling APIC mode.  Using %d I/O APICs\n", nr_ioapics);
+     enable_apic_mode();
+ }
+ 
+diff --git a/xen/arch/x86/genapic/x2apic.c b/xen/arch/x86/genapic/x2apic.c
+index 707deef98c..b88c7a96fe 100644
+--- a/xen/arch/x86/genapic/x2apic.c
++++ b/xen/arch/x86/genapic/x2apic.c
+@@ -180,6 +180,36 @@ static const struct genapic __initconstrel apic_x2apic_cluster = {
+     .send_IPI_self = send_IPI_self_x2apic
+ };
+ 
++/*
++ * Mixed x2APIC mode: use physical for external (device) interrupts, and
++ * cluster for inter processor interrupts.  Such mode has the benefits of not
++ * sharing the vector space with all CPUs on the cluster, while still allowing
++ * IPIs to be more efficiently delivered by not having to perform an ICR write
++ * for each target CPU.
++ */
++static const struct genapic __initconstrel apic_x2apic_mixed = {
++    APIC_INIT("x2apic_mixed", NULL),
++
++    /*
++     * The following fields are exclusively used by external interrupts and
++     * hence are set to use Physical destination mode handlers.
++     */
++    .int_delivery_mode = dest_Fixed,
++    .int_dest_mode = 0 /* physical delivery */,
++    .vector_allocation_cpumask = vector_allocation_cpumask_phys,
++    .cpu_mask_to_apicid = cpu_mask_to_apicid_phys,
++
++    /*
++     * The following fields are exclusively used by IPIs and hence are set to
++     * use Cluster Logical destination mode handlers.  Note that init_apic_ldr
++     * is not used by IPIs, but the per-CPU fields it initializes are only used
++     * by the IPI hooks.
++     */
++    .init_apic_ldr = init_apic_ldr_x2apic_cluster,
++    .send_IPI_mask = send_IPI_mask_x2apic_cluster,
++    .send_IPI_self = send_IPI_self_x2apic,
++};
++
+ static int cf_check update_clusterinfo(
+     struct notifier_block *nfb, unsigned long action, void *hcpu)
+ {
+@@ -220,38 +250,56 @@ static struct notifier_block x2apic_cpu_nfb = {
+ static int8_t __initdata x2apic_phys = -1;
+ boolean_param("x2apic_phys", x2apic_phys);
+ 
++enum {
++   unset, physical, cluster, mixed
++} static __initdata x2apic_mode = unset;
++
++static int __init cf_check parse_x2apic_mode(const char *s)
++{
++    if ( !cmdline_strcmp(s, "physical") )
++        x2apic_mode = physical;
++    else if ( !cmdline_strcmp(s, "cluster") )
++        x2apic_mode = cluster;
++    else if ( !cmdline_strcmp(s, "mixed") )
++        x2apic_mode = mixed;
++    else
++        return -EINVAL;
++
++    return 0;
++}
++custom_param("x2apic-mode", parse_x2apic_mode);
++
+ const struct genapic *__init apic_x2apic_probe(void)
+ {
+-    if ( x2apic_phys < 0 )
++    /* Honour the legacy cmdline setting if it's the only one provided. */
++    if ( x2apic_mode == unset && x2apic_phys >= 0 )
++        x2apic_mode = x2apic_phys ? physical : cluster;
++
++    if ( x2apic_mode == unset )
+     {
+-        /*
+-         * Force physical mode if there's no (full) interrupt remapping support:
+-         * The ID in clustered mode requires a 32 bit destination field due to
+-         * the usage of the high 16 bits to hold the cluster ID.
+-         */
+-        x2apic_phys = iommu_intremap != iommu_intremap_full ||
+-                      (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) ||
+-                      IS_ENABLED(CONFIG_X2APIC_PHYSICAL);
+-    }
+-    else if ( !x2apic_phys )
+-        switch ( iommu_intremap )
++        if ( acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL )
+         {
+-        case iommu_intremap_off:
+-        case iommu_intremap_restricted:
+-            printk("WARNING: x2APIC cluster mode is not supported %s interrupt remapping -"
+-                   " forcing phys mode\n",
+-                   iommu_intremap == iommu_intremap_off ? "without"
+-                                                        : "with restricted");
+-            x2apic_phys = true;
+-            break;
+-
+-        case iommu_intremap_full:
+-            break;
++            printk(XENLOG_INFO "ACPI FADT forcing x2APIC physical mode\n");
++            x2apic_mode = physical;
+         }
++        else
++            x2apic_mode = IS_ENABLED(CONFIG_X2APIC_MIXED) ? mixed
++                          : (IS_ENABLED(CONFIG_X2APIC_PHYSICAL) ? physical
++                                                                : cluster);
++    }
+ 
+-    if ( x2apic_phys )
++    if ( x2apic_mode == physical )
+         return &apic_x2apic_phys;
+ 
++    if ( x2apic_mode == cluster && iommu_intremap != iommu_intremap_full )
++    {
++        printk("WARNING: x2APIC cluster mode is not supported %s interrupt remapping -"
++               " forcing mixed mode\n",
++               iommu_intremap == iommu_intremap_off ? "without"
++                                                    : "with restricted");
++        x2apic_mode = mixed;
++    }
++
+     if ( !this_cpu(cluster_cpus) )
+     {
+         update_clusterinfo(NULL, CPU_UP_PREPARE,
+@@ -260,7 +308,7 @@ const struct genapic *__init apic_x2apic_probe(void)
+         register_cpu_notifier(&x2apic_cpu_nfb);
+     }
+ 
+-    return &apic_x2apic_cluster;
++    return x2apic_mode == cluster ? &apic_x2apic_cluster : &apic_x2apic_mixed;
+ }
+ 
+ void __init check_x2apic_preenabled(void)
+-- 
+2.44.0
+
+
+From 637da04812fba259a5d06591ec535345637a4407 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 30 Jan 2024 14:33:48 +0100
+Subject: [PATCH 18/70] pci: fail device assignment if phantom functions cannot
+ be assigned
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The current behavior is that no error is reported if (some) phantom functions
+fail to be assigned during device add or assignment, so the operation succeeds
+even if some phantom functions are not correctly setup.
+
+This can lead to devices possibly being successfully assigned to a domU while
+some of the device phantom functions are still assigned to dom0.  Even when the
+device is assigned domIO before being assigned to a domU phantom functions
+might fail to be assigned to domIO, and also fail to be assigned to the domU,
+leaving them assigned to dom0.
+
+Since the device can generate requests using the IDs of those phantom
+functions, given the scenario above a device in such state would be in control
+of a domU, but still capable of generating transactions that use a context ID
+targeting dom0 owned memory.
+
+Modify device assign in order to attempt to deassign the device if phantom
+functions failed to be assigned.
+
+Note that device addition is not modified in the same way, as in that case the
+device is assigned to a trusted domain, and hence partial assign can lead to
+device malfunction but not a security issue.
+
+This is XSA-449 / CVE-2023-46839
+
+Fixes: 4e9950dc1bd2 ('IOMMU: add phantom function support')
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: cb4ecb3cc17b02c2814bc817efd05f3f3ba33d1e
+master date: 2024-01-30 14:28:01 +0100
+---
+ xen/drivers/passthrough/pci.c | 27 +++++++++++++++++++++------
+ 1 file changed, 21 insertions(+), 6 deletions(-)
+
+diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
+index 04d00c7c37..e99837b6e1 100644
+--- a/xen/drivers/passthrough/pci.c
++++ b/xen/drivers/passthrough/pci.c
+@@ -1439,11 +1439,10 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
+ 
+     pdev->fault.count = 0;
+ 
+-    if ( (rc = iommu_call(hd->platform_ops, assign_device, d, devfn,
+-                          pci_to_dev(pdev), flag)) )
+-        goto done;
++    rc = iommu_call(hd->platform_ops, assign_device, d, devfn, pci_to_dev(pdev),
++                    flag);
+ 
+-    for ( ; pdev->phantom_stride; rc = 0 )
++    while ( pdev->phantom_stride && !rc )
+     {
+         devfn += pdev->phantom_stride;
+         if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
+@@ -1454,8 +1453,24 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
+ 
+  done:
+     if ( rc )
+-        printk(XENLOG_G_WARNING "%pd: assign (%pp) failed (%d)\n",
+-               d, &PCI_SBDF(seg, bus, devfn), rc);
++    {
++        printk(XENLOG_G_WARNING "%pd: assign %s(%pp) failed (%d)\n",
++               d, devfn != pdev->devfn ? "phantom function " : "",
++               &PCI_SBDF(seg, bus, devfn), rc);
++
++        if ( devfn != pdev->devfn && deassign_device(d, seg, bus, pdev->devfn) )
++        {
++            /*
++             * Device with phantom functions that failed to both assign and
++             * rollback.  Mark the device as broken and crash the target domain,
++             * as the state of the functions at this point is unknown and Xen
++             * has no way to assert consistent context assignment among them.
++             */
++            pdev->broken = true;
++            if ( !is_hardware_domain(d) && d != dom_io )
++                domain_crash(d);
++        }
++    }
+     /* The device is assigned to dom_io so mark it as quarantined */
+     else if ( d == dom_io )
+         pdev->quarantine = true;
+-- 
+2.44.0
+
+
+From c7ac596a575a05d6ff1e35c3ff98bc4d143712d2 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Tue, 30 Jan 2024 14:34:40 +0100
+Subject: [PATCH 19/70] VT-d: Fix "else" vs "#endif" misplacement
+
+In domain_pgd_maddr() the "#endif" is misplaced with respect to "else".  This
+generates incorrect logic when CONFIG_HVM is compiled out, as the "else" body
+is executed unconditionally.
+
+Rework the logic to use IS_ENABLED() instead of explicit #ifdef-ary, as it's
+clearer to follow.  This in turn involves adjusting p2m_get_pagetable() to
+compile when CONFIG_HVM is disabled.
+
+This is XSA-450 / CVE-2023-46840.
+
+Fixes: 033ff90aa9c1 ("x86/P2M: p2m_{alloc,free}_ptp() and p2m_alloc_table() are HVM-only")
+Reported-by: Teddy Astie <teddy.astie@vates.tech>
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: cc6ba68edf6dcd18c3865e7d7c0f1ed822796426
+master date: 2024-01-30 14:29:15 +0100
+---
+ xen/arch/x86/include/asm/p2m.h      | 9 ++++++++-
+ xen/drivers/passthrough/vtd/iommu.c | 4 +---
+ 2 files changed, 9 insertions(+), 4 deletions(-)
+
+diff --git a/xen/arch/x86/include/asm/p2m.h b/xen/arch/x86/include/asm/p2m.h
+index 40545f5fa8..1e0b0e2dcc 100644
+--- a/xen/arch/x86/include/asm/p2m.h
++++ b/xen/arch/x86/include/asm/p2m.h
+@@ -435,7 +435,14 @@ static inline bool_t p2m_is_altp2m(const struct p2m_domain *p2m)
+     return p2m->p2m_class == p2m_alternate;
+ }
+ 
+-#define p2m_get_pagetable(p2m)  ((p2m)->phys_table)
++#ifdef CONFIG_HVM
++static inline pagetable_t p2m_get_pagetable(const struct p2m_domain *p2m)
++{
++    return p2m->phys_table;
++}
++#else
++pagetable_t p2m_get_pagetable(const struct p2m_domain *p2m);
++#endif
+ 
+ /*
+  * Ensure any deferred p2m TLB flush has been completed on all VCPUs.
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
+index e13b7d99db..9ed616e211 100644
+--- a/xen/drivers/passthrough/vtd/iommu.c
++++ b/xen/drivers/passthrough/vtd/iommu.c
+@@ -438,15 +438,13 @@ static paddr_t domain_pgd_maddr(struct domain *d, paddr_t pgd_maddr,
+ 
+     if ( pgd_maddr )
+         /* nothing */;
+-#ifdef CONFIG_HVM
+-    else if ( iommu_use_hap_pt(d) )
++    else if ( IS_ENABLED(CONFIG_HVM) && iommu_use_hap_pt(d) )
+     {
+         pagetable_t pgt = p2m_get_pagetable(p2m_get_hostp2m(d));
+ 
+         pgd_maddr = pagetable_get_paddr(pgt);
+     }
+     else
+-#endif
+     {
+         if ( !hd->arch.vtd.pgd_maddr )
+         {
+-- 
+2.44.0
+
+
+From 62b3d7f8e45a7ec1597f0ed61a99d1f423b22315 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Thu, 1 Feb 2024 17:58:17 +0100
+Subject: [PATCH 20/70] x86/amd: Extend CPU erratum #1474 fix to more affected
+ models
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Erratum #1474 has now been extended to cover models from family 17h ranges
+00-2Fh, so the erratum now covers all the models released under Family
+17h (Zen, Zen+ and Zen2).
+
+Additionally extend the workaround to Family 18h (Hygon), since it's based on
+the Zen architecture and very likely affected.
+
+Rename all the zen2 related symbols to fam17, since the errata doesn't
+exclusively affect Zen2 anymore.
+
+Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: 23db507a01a4ec5259ec0ab43d296a41b1c326ba
+master date: 2023-12-21 12:19:40 +0000
+---
+ xen/arch/x86/cpu/amd.c | 27 ++++++++++++++-------------
+ 1 file changed, 14 insertions(+), 13 deletions(-)
+
+diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c
+index 0f305312ff..d43288ae97 100644
+--- a/xen/arch/x86/cpu/amd.c
++++ b/xen/arch/x86/cpu/amd.c
+@@ -54,7 +54,7 @@ bool __read_mostly amd_acpi_c1e_quirk;
+ bool __ro_after_init amd_legacy_ssbd;
+ bool __initdata amd_virt_spec_ctrl;
+ 
+-static bool __read_mostly zen2_c6_disabled;
++static bool __read_mostly fam17_c6_disabled;
+ 
+ static inline int rdmsr_amd_safe(unsigned int msr, unsigned int *lo,
+ 				 unsigned int *hi)
+@@ -978,24 +978,24 @@ void amd_check_zenbleed(void)
+ 		       val & chickenbit ? "chickenbit" : "microcode");
+ }
+ 
+-static void cf_check zen2_disable_c6(void *arg)
++static void cf_check fam17_disable_c6(void *arg)
+ {
+ 	/* Disable C6 by clearing the CCR{0,1,2}_CC6EN bits. */
+ 	const uint64_t mask = ~((1ul << 6) | (1ul << 14) | (1ul << 22));
+ 	uint64_t val;
+ 
+-	if (!zen2_c6_disabled) {
++	if (!fam17_c6_disabled) {
+ 		printk(XENLOG_WARNING
+     "Disabling C6 after 1000 days apparent uptime due to AMD errata 1474\n");
+-		zen2_c6_disabled = true;
++		fam17_c6_disabled = true;
+ 		/*
+ 		 * Prevent CPU hotplug so that started CPUs will either see
+-		 * zen2_c6_disabled set, or will be handled by
++		 * zen_c6_disabled set, or will be handled by
+ 		 * smp_call_function().
+ 		 */
+ 		while (!get_cpu_maps())
+ 			process_pending_softirqs();
+-		smp_call_function(zen2_disable_c6, NULL, 0);
++		smp_call_function(fam17_disable_c6, NULL, 0);
+ 		put_cpu_maps();
+ 	}
+ 
+@@ -1294,8 +1294,8 @@ static void cf_check init_amd(struct cpuinfo_x86 *c)
+ 	amd_check_zenbleed();
+ 	amd_check_erratum_1485();
+ 
+-	if (zen2_c6_disabled)
+-		zen2_disable_c6(NULL);
++	if (fam17_c6_disabled)
++		fam17_disable_c6(NULL);
+ 
+ 	check_syscfg_dram_mod_en();
+ 
+@@ -1307,7 +1307,7 @@ const struct cpu_dev amd_cpu_dev = {
+ 	.c_init		= init_amd,
+ };
+ 
+-static int __init cf_check zen2_c6_errata_check(void)
++static int __init cf_check amd_check_erratum_1474(void)
+ {
+ 	/*
+ 	 * Errata #1474: A Core May Hang After About 1044 Days
+@@ -1315,7 +1315,8 @@ static int __init cf_check zen2_c6_errata_check(void)
+ 	 */
+ 	s_time_t delta;
+ 
+-	if (cpu_has_hypervisor || boot_cpu_data.x86 != 0x17 || !is_zen2_uarch())
++	if (cpu_has_hypervisor ||
++	    (boot_cpu_data.x86 != 0x17 && boot_cpu_data.x86 != 0x18))
+ 		return 0;
+ 
+ 	/*
+@@ -1330,10 +1331,10 @@ static int __init cf_check zen2_c6_errata_check(void)
+ 	if (delta > 0) {
+ 		static struct timer errata_c6;
+ 
+-		init_timer(&errata_c6, zen2_disable_c6, NULL, 0);
++		init_timer(&errata_c6, fam17_disable_c6, NULL, 0);
+ 		set_timer(&errata_c6, NOW() + delta);
+ 	} else
+-		zen2_disable_c6(NULL);
++		fam17_disable_c6(NULL);
+ 
+ 	return 0;
+ }
+@@ -1341,4 +1342,4 @@ static int __init cf_check zen2_c6_errata_check(void)
+  * Must be executed after early_time_init() for tsc_ticks2ns() to have been
+  * calibrated.  That prevents us doing the check in init_amd().
+  */
+-presmp_initcall(zen2_c6_errata_check);
++presmp_initcall(amd_check_erratum_1474);
+-- 
+2.44.0
+
+
+From b26c30a408255454f8ceb4e49e3c4385aa32fbc3 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Thu, 1 Feb 2024 17:58:59 +0100
+Subject: [PATCH 21/70] CirrusCI: drop FreeBSD 12
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Went EOL by the end of December 2023, and the pkg repos have been shut down.
+
+Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: c2ce3466472e9c9eda79f5dc98eb701bc6fdba20
+master date: 2024-01-15 12:20:11 +0100
+---
+ .cirrus.yml | 6 ------
+ 1 file changed, 6 deletions(-)
+
+diff --git a/.cirrus.yml b/.cirrus.yml
+index 7e0beb200d..63f3afb104 100644
+--- a/.cirrus.yml
++++ b/.cirrus.yml
+@@ -14,12 +14,6 @@ freebsd_template: &FREEBSD_TEMPLATE
+     - ./configure --with-system-seabios=/usr/local/share/seabios/bios.bin
+     - gmake -j`sysctl -n hw.ncpu` clang=y
+ 
+-task:
+-  name: 'FreeBSD 12'
+-  freebsd_instance:
+-    image_family: freebsd-12-4
+-  << : *FREEBSD_TEMPLATE
+-
+ task:
+   name: 'FreeBSD 13'
+   freebsd_instance:
+-- 
+2.44.0
+
+
+From 6ccf064b0ce1d06449565129ab944b4fd9531b3a Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Thu, 1 Feb 2024 17:59:25 +0100
+Subject: [PATCH 22/70] x86/intel: ensure Global Performance Counter Control is
+ setup correctly
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+When Architectural Performance Monitoring is available, the PERF_GLOBAL_CTRL
+MSR contains per-counter enable bits that are ANDed with the enable bit in the
+counter EVNTSEL MSR in order for a PMC counter to be enabled.
+
+So far the watchdog code seems to have relied on the PERF_GLOBAL_CTRL enable
+bits being set by default, but at least on some Intel Sapphire and Emerald
+Rapids this is no longer the case, and Xen reports:
+
+Testing NMI watchdog on all CPUs: 0 40 stuck
+
+The first CPU on each package is started with PERF_GLOBAL_CTRL zeroed, so PMC0
+doesn't start counting when the enable bit in EVNTSEL0 is set, due to the
+relevant enable bit in PERF_GLOBAL_CTRL not being set.
+
+Check and adjust PERF_GLOBAL_CTRL during CPU initialization so that all the
+general-purpose PMCs are enabled.  Doing so brings the state of the package-BSP
+PERF_GLOBAL_CTRL in line with the rest of the CPUs on the system.
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+master commit: 6bdb965178bbb3fc50cd4418d4770a7789956e2c
+master date: 2024-01-17 10:40:52 +0100
+---
+ xen/arch/x86/cpu/intel.c | 23 ++++++++++++++++++++++-
+ 1 file changed, 22 insertions(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/cpu/intel.c b/xen/arch/x86/cpu/intel.c
+index a8ba3191e6..aef8e4506c 100644
+--- a/xen/arch/x86/cpu/intel.c
++++ b/xen/arch/x86/cpu/intel.c
+@@ -533,9 +533,30 @@ static void cf_check init_intel(struct cpuinfo_x86 *c)
+ 	init_intel_cacheinfo(c);
+ 	if (c->cpuid_level > 9) {
+ 		unsigned eax = cpuid_eax(10);
++		unsigned int cnt = (eax >> 8) & 0xff;
++
+ 		/* Check for version and the number of counters */
+-		if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
++		if ((eax & 0xff) && (cnt > 1) && (cnt <= 32)) {
++			uint64_t global_ctrl;
++			unsigned int cnt_mask = (1UL << cnt) - 1;
++
++			/*
++			 * On (some?) Sapphire/Emerald Rapids platforms each
++			 * package-BSP starts with all the enable bits for the
++			 * general-purpose PMCs cleared.  Adjust so counters
++			 * can be enabled from EVNTSEL.
++			 */
++			rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_ctrl);
++			if ((global_ctrl & cnt_mask) != cnt_mask) {
++				printk("CPU%u: invalid PERF_GLOBAL_CTRL: %#"
++				       PRIx64 " adjusting to %#" PRIx64 "\n",
++				       smp_processor_id(), global_ctrl,
++				       global_ctrl | cnt_mask);
++				wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
++				       global_ctrl | cnt_mask);
++			}
+ 			__set_bit(X86_FEATURE_ARCH_PERFMON, c->x86_capability);
++		}
+ 	}
+ 
+ 	if ( !cpu_has(c, X86_FEATURE_XTOPOLOGY) )
+-- 
+2.44.0
+
+
+From 4cc0f88c42f374c7a8e2d05e38777fa18619482e Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Thu, 1 Feb 2024 17:59:57 +0100
+Subject: [PATCH 23/70] x86/vmx: Fix IRQ handling for EXIT_REASON_INIT
+
+When receiving an INIT, a prior bugfix tried to ignore the INIT and continue
+onwards.
+
+Unfortunately it's not safe to return at that point in vmx_vmexit_handler().
+Just out of context in the first hunk is a local_irqs_enabled() which is
+depended-upon by the return-to-guest path, causing the following checklock
+failure in debug builds:
+
+  (XEN) Error: INIT received - ignoring
+  (XEN) CHECKLOCK FAILURE: prev irqsafe: 0, curr irqsafe 1
+  (XEN) Xen BUG at common/spinlock.c:132
+  (XEN) ----[ Xen-4.19-unstable  x86_64  debug=y  Tainted:     H  ]----
+  ...
+  (XEN) Xen call trace:
+  (XEN)    [<ffff82d040238e10>] R check_lock+0xcd/0xe1
+  (XEN)    [<ffff82d040238fe3>] F _spin_lock+0x1b/0x60
+  (XEN)    [<ffff82d0402ed6a8>] F pt_update_irq+0x32/0x3bb
+  (XEN)    [<ffff82d0402b9632>] F vmx_intr_assist+0x3b/0x51d
+  (XEN)    [<ffff82d040206447>] F vmx_asm_vmexit_handler+0xf7/0x210
+
+Luckily, this is benign in release builds.  Accidentally having IRQs disabled
+when trying to take an IRQs-on lock isn't a deadlock-vulnerable pattern.
+
+Drop the problematic early return.  In hindsight, it's wrong to skip other
+normal VMExit steps.
+
+Fixes: b1f11273d5a7 ("x86/vmx: Don't spuriously crash the domain when INIT is received")
+Reported-by: Reima ISHII <ishiir@g.ecc.u-tokyo.ac.jp>
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: d1f8883aebe00f6a9632d77ab0cd5c6d02c9cbe4
+master date: 2024-01-18 20:59:06 +0000
+---
+ xen/arch/x86/hvm/vmx/vmx.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
+index 1edc7f1e91..964891934b 100644
+--- a/xen/arch/x86/hvm/vmx/vmx.c
++++ b/xen/arch/x86/hvm/vmx/vmx.c
+@@ -4100,7 +4100,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
+ 
+     case EXIT_REASON_INIT:
+         printk(XENLOG_ERR "Error: INIT received - ignoring\n");
+-        return; /* Renter the guest without further processing */
++        break;
+     }
+ 
+     /* Now enable interrupts so it's safe to take locks. */
+@@ -4385,6 +4385,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
+         break;
+     }
+     case EXIT_REASON_EXTERNAL_INTERRUPT:
++    case EXIT_REASON_INIT:
+         /* Already handled above. */
+         break;
+     case EXIT_REASON_TRIPLE_FAULT:
+-- 
+2.44.0
+
+
+From 00550e808c10c67710ebb8867200eda1fbee332c Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Thu, 1 Feb 2024 18:00:32 +0100
+Subject: [PATCH 24/70] x86/vmx: Disallow the use of inactivity states
+
+Right now, vvmx will blindly copy L12's ACTIVITY_STATE into the L02 VMCS and
+enter the vCPU.  Luckily for us, nested-virt is explicitly unsupported for
+security bugs.
+
+The inactivity states are HLT, SHUTDOWN and WAIT-FOR-SIPI, and as noted by the
+SDM in Vol3 27.7 "Special Features of VM Entry":
+
+  If VM entry ends with the logical processor in an inactive activity state,
+  the VM entry generates any special bus cycle that is normally generated when
+  that activity state is entered from the active state.
+
+Also,
+
+  Some activity states unconditionally block certain events.
+
+I.e. A VMEntry with ACTIVITY=SHUTDOWN will initiate a platform reset, while a
+VMEntry with ACTIVITY=WAIT-FOR-SIPI will really block everything other than
+SIPIs.
+
+Both of these activity states are for the TXT ACM to use, not for regular
+hypervisors, and Xen doesn't support dropping the HLT intercept either.
+
+There are two paths in Xen which operate on ACTIVITY_STATE.
+
+1) The vmx_{get,set}_nonreg_state() helpers for VM-Fork.
+
+   As regular VMs can't use any inactivity states, this is just duplicating
+   the 0 from construct_vmcs().  Retain the ability to query activity_state,
+   but crash the domain on any attempt to set an inactivity state.
+
+2) Nested virt, because of ACTIVITY_STATE in vmcs_gstate_field[].
+
+   Explicitly hide the inactivity states in the guest's view of MSR_VMX_MISC,
+   and remove ACTIVITY_STATE from vmcs_gstate_field[].
+
+   In virtual_vmentry(), we should trigger a VMEntry failure for the use of
+   any inactivity states, but there's no support for that in the code at all
+   so leave a TODO for when we finally start working on nested-virt in
+   earnest.
+
+Reported-by: Reima Ishii <ishiir@g.ecc.u-tokyo.ac.jp>
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Tamas K Lengyel <tamas@tklengyel.com>
+master commit: 3643bb53a05b7c8fbac072c63bef1538f2a6d0d2
+master date: 2024-01-18 20:59:06 +0000
+---
+ xen/arch/x86/hvm/vmx/vmx.c              | 5 ++++-
+ xen/arch/x86/hvm/vmx/vvmx.c             | 9 +++++++--
+ xen/arch/x86/include/asm/hvm/vmx/vmcs.h | 1 +
+ 3 files changed, 12 insertions(+), 3 deletions(-)
+
+diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
+index 964891934b..28dece7c6b 100644
+--- a/xen/arch/x86/hvm/vmx/vmx.c
++++ b/xen/arch/x86/hvm/vmx/vmx.c
+@@ -1558,7 +1558,10 @@ static void cf_check vmx_set_nonreg_state(struct vcpu *v,
+ {
+     vmx_vmcs_enter(v);
+ 
+-    __vmwrite(GUEST_ACTIVITY_STATE, nrs->vmx.activity_state);
++    if ( nrs->vmx.activity_state )
++        domain_crash(v->domain, "Attempt to set %pv activity_state %#lx\n",
++                     v, nrs->vmx.activity_state);
++
+     __vmwrite(GUEST_INTERRUPTIBILITY_INFO, nrs->vmx.interruptibility_info);
+     __vmwrite(GUEST_PENDING_DBG_EXCEPTIONS, nrs->vmx.pending_dbg);
+ 
+diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c
+index 16b0ef82b6..fd0ae39166 100644
+--- a/xen/arch/x86/hvm/vmx/vvmx.c
++++ b/xen/arch/x86/hvm/vmx/vvmx.c
+@@ -899,7 +899,10 @@ static const u16 vmcs_gstate_field[] = {
+     GUEST_LDTR_AR_BYTES,
+     GUEST_TR_AR_BYTES,
+     GUEST_INTERRUPTIBILITY_INFO,
++    /*
++     * ACTIVITY_STATE is handled specially.
+     GUEST_ACTIVITY_STATE,
++     */
+     GUEST_SYSENTER_CS,
+     GUEST_PREEMPTION_TIMER,
+     /* natural */
+@@ -1200,6 +1203,8 @@ static void virtual_vmentry(struct cpu_user_regs *regs)
+     nvcpu->nv_vmentry_pending = 0;
+     nvcpu->nv_vmswitch_in_progress = 1;
+ 
++    /* TODO: Fail VMentry for GUEST_ACTIVITY_STATE != 0 */
++
+     /*
+      * EFER handling:
+      * hvm_set_efer won't work if CR0.PG = 1, so we change the value
+@@ -2316,8 +2321,8 @@ int nvmx_msr_read_intercept(unsigned int msr, u64 *msr_content)
+         data = hvm_cr4_guest_valid_bits(d);
+         break;
+     case MSR_IA32_VMX_MISC:
+-        /* Do not support CR3-target feature now */
+-        data = host_data & ~VMX_MISC_CR3_TARGET;
++        /* Do not support CR3-targets or activity states. */
++        data = host_data & ~(VMX_MISC_CR3_TARGET | VMX_MISC_ACTIVITY_MASK);
+         break;
+     case MSR_IA32_VMX_EPT_VPID_CAP:
+         data = nept_get_ept_vpid_cap();
+diff --git a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
+index d07fcb2bc9..8de9977eb3 100644
+--- a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
++++ b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h
+@@ -277,6 +277,7 @@ extern u32 vmx_secondary_exec_control;
+ #define VMX_VPID_INVVPID_SINGLE_CONTEXT_RETAINING_GLOBAL 0x80000000000ULL
+ extern u64 vmx_ept_vpid_cap;
+ 
++#define VMX_MISC_ACTIVITY_MASK                  0x000001c0
+ #define VMX_MISC_PROC_TRACE                     0x00004000
+ #define VMX_MISC_CR3_TARGET                     0x01ff0000
+ #define VMX_MISC_VMWRITE_ALL                    0x20000000
+-- 
+2.44.0
+
+
+From 579a622eb41cf4e1ae4d94100985a81eebda23b9 Mon Sep 17 00:00:00 2001
+From: Michal Orzel <michal.orzel@amd.com>
+Date: Thu, 1 Feb 2024 18:01:27 +0100
+Subject: [PATCH 25/70] lib{fdt,elf}: move lib{fdt,elf}-temp.o and their deps
+ to $(targets)
+
+At the moment, trying to run xencov read/reset (calling SYSCTL_coverage_op
+under the hood) results in a crash. This is due to a profiler trying to
+access data in the .init.* sections (libfdt for Arm and libelf for x86)
+that are stripped after boot. Normally, the build system compiles any
+*.init.o file without COV_FLAGS. However, these two libraries are
+handled differently as sections will be renamed to init after linking.
+
+To override COV_FLAGS to empty for these libraries, lib{fdt,elf}.o were
+added to nocov-y. This worked until e321576f4047 ("xen/build: start using
+if_changed") that added lib{fdt,elf}-temp.o and their deps to extra-y.
+This way, even though these objects appear as prerequisites of
+lib{fdt,elf}.o and the settings should propagate to them, make can also
+build them as a prerequisite of __build, in which case COV_FLAGS would
+still have the unwanted flags. Fix it by switching to $(targets) instead.
+
+Also, for libfdt, append libfdt.o to nocov-y only if CONFIG_OVERLAY_DTB
+is not set. Otherwise, there is no section renaming and we should be able
+to run the coverage.
+
+Fixes: e321576f4047 ("xen/build: start using if_changed")
+Signed-off-by: Michal Orzel <michal.orzel@amd.com>
+Reviewed-by: Anthony PERARD <anthony.perard@citrix.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+master commit: 79519fcfa0605bbf19d8c02b979af3a2c8afed68
+master date: 2024-01-23 12:02:44 +0100
+---
+ xen/common/libelf/Makefile | 2 +-
+ xen/common/libfdt/Makefile | 4 ++--
+ 2 files changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/xen/common/libelf/Makefile b/xen/common/libelf/Makefile
+index 8a4522e4e1..917d12b006 100644
+--- a/xen/common/libelf/Makefile
++++ b/xen/common/libelf/Makefile
+@@ -13,4 +13,4 @@ $(obj)/libelf.o: $(obj)/libelf-temp.o FORCE
+ $(obj)/libelf-temp.o: $(addprefix $(obj)/,$(libelf-objs)) FORCE
+ 	$(call if_changed,ld)
+ 
+-extra-y += libelf-temp.o $(libelf-objs)
++targets += libelf-temp.o $(libelf-objs)
+diff --git a/xen/common/libfdt/Makefile b/xen/common/libfdt/Makefile
+index d50487aa6e..6ce679f98f 100644
+--- a/xen/common/libfdt/Makefile
++++ b/xen/common/libfdt/Makefile
+@@ -5,10 +5,10 @@ SECTIONS := text data $(SPECIAL_DATA_SECTIONS)
+ # For CONFIG_OVERLAY_DTB, libfdt functionalities will be needed during runtime.
+ ifneq ($(CONFIG_OVERLAY_DTB),y)
+ OBJCOPYFLAGS := $(foreach s,$(SECTIONS),--rename-section .$(s)=.init.$(s))
++nocov-y += libfdt.o
+ endif
+ 
+ obj-y += libfdt.o
+-nocov-y += libfdt.o
+ 
+ CFLAGS-y += -I$(srctree)/include/xen/libfdt/
+ 
+@@ -18,4 +18,4 @@ $(obj)/libfdt.o: $(obj)/libfdt-temp.o FORCE
+ $(obj)/libfdt-temp.o: $(addprefix $(obj)/,$(LIBFDT_OBJS)) FORCE
+ 	$(call if_changed,ld)
+ 
+-extra-y += libfdt-temp.o $(LIBFDT_OBJS)
++targets += libfdt-temp.o $(LIBFDT_OBJS)
+-- 
+2.44.0
+
+
+From 295ab8060d95ed8c365077946c7faf8793099ef8 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Thu, 1 Feb 2024 18:01:52 +0100
+Subject: [PATCH 26/70] x86/p2m-pt: fix off by one in entry check assert
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The MMIO RO rangeset overlap check is bogus: the rangeset is inclusive so the
+passed end mfn should be the last mfn to be mapped (not last + 1).
+
+Fixes: 6fa1755644d0 ('amd/npt/shadow: replace assert that prevents creating 2M/1G MMIO entries')
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: George Dunlap <george.dunlap@cloud.com>
+master commit: 610775d0dd61c1bd2f4720c755986098e6a5bafd
+master date: 2024-01-25 16:09:04 +0100
+---
+ xen/arch/x86/mm/p2m-pt.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/mm/p2m-pt.c b/xen/arch/x86/mm/p2m-pt.c
+index b2b14746c1..88d3733891 100644
+--- a/xen/arch/x86/mm/p2m-pt.c
++++ b/xen/arch/x86/mm/p2m-pt.c
+@@ -552,7 +552,7 @@ static void check_entry(mfn_t mfn, p2m_type_t new, p2m_type_t old,
+     if ( new == p2m_mmio_direct )
+         ASSERT(!mfn_eq(mfn, INVALID_MFN) &&
+                !rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn),
+-                                        mfn_x(mfn) + (1UL << order)));
++                                        mfn_x(mfn) + (1UL << order) - 1));
+     else if ( p2m_allows_invalid_mfn(new) || new == p2m_invalid ||
+               new == p2m_mmio_dm )
+         ASSERT(mfn_valid(mfn) || mfn_eq(mfn, INVALID_MFN));
+-- 
+2.44.0
+
+
+From b1fdd7d0e47e0831ac7a99d0417385fc10d3068c Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Thu, 1 Feb 2024 18:02:24 +0100
+Subject: [PATCH 27/70] x86/ucode: Fix stability of the raw CPU Policy rescan
+
+Always run microcode_update_helper() on the BSP, so the updated Raw CPU
+policy doesn't get non-BSP topology details included.
+
+Have calculate_raw_cpu_policy() clear the instantaneous XSTATE sizes.  The
+value XCR0 | MSR_XSS had when we scanned the policy isn't terribly interesting
+to report.
+
+When CPUID Masking is active, it affects CPUID instructions issued by Xen
+too.  Transiently disable masking to get a clean scan.
+
+Fixes: 694d79ed5aac ("x86/ucode: Refresh raw CPU policy after microcode load")
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: cf7fe8b72deaa94157ddf97d4bb391480205e9c2
+master date: 2024-01-25 17:46:57 +0000
+---
+ xen/arch/x86/cpu-policy.c         |  7 +++++++
+ xen/arch/x86/cpu/microcode/core.c | 20 +++++++++++++++++---
+ 2 files changed, 24 insertions(+), 3 deletions(-)
+
+diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c
+index 81e574390f..bcb17b7ce3 100644
+--- a/xen/arch/x86/cpu-policy.c
++++ b/xen/arch/x86/cpu-policy.c
+@@ -353,6 +353,13 @@ void calculate_raw_cpu_policy(void)
+     /* Nothing good will come from Xen and libx86 disagreeing on vendor. */
+     ASSERT(p->x86_vendor == boot_cpu_data.x86_vendor);
+ 
++    /*
++     * Clear the truly dynamic fields.  These vary with the in-context XCR0
++     * and MSR_XSS, and aren't interesting fields in the raw policy.
++     */
++    p->xstate.raw[0].b = 0;
++    p->xstate.raw[1].b = 0;
++
+     /* 0x000000ce  MSR_INTEL_PLATFORM_INFO */
+     /* Was already added by probe_cpuid_faulting() */
+ }
+diff --git a/xen/arch/x86/cpu/microcode/core.c b/xen/arch/x86/cpu/microcode/core.c
+index 65ebeb50de..4e011cdc41 100644
+--- a/xen/arch/x86/cpu/microcode/core.c
++++ b/xen/arch/x86/cpu/microcode/core.c
+@@ -680,8 +680,18 @@ static long cf_check microcode_update_helper(void *data)
+         microcode_update_cache(patch);
+         spin_unlock(&microcode_mutex);
+ 
+-        /* Refresh the raw CPU policy, in case the features have changed. */
++        /*
++         * Refresh the raw CPU policy, in case the features have changed.
++         * Disable CPUID masking if in use, to avoid having current's
++         * cpu_policy affect the rescan.
++         */
++	if ( ctxt_switch_masking )
++            alternative_vcall(ctxt_switch_masking, NULL);
++
+         calculate_raw_cpu_policy();
++
++	if ( ctxt_switch_masking )
++            alternative_vcall(ctxt_switch_masking, current);
+     }
+     else
+         microcode_free_patch(patch);
+@@ -721,8 +731,12 @@ int microcode_update(XEN_GUEST_HANDLE(const_void) buf, unsigned long len)
+     }
+     buffer->len = len;
+ 
+-    return continue_hypercall_on_cpu(smp_processor_id(),
+-                                     microcode_update_helper, buffer);
++    /*
++     * Always queue microcode_update_helper() on CPU0.  Most of the logic
++     * won't care, but the update of the Raw CPU policy wants to (re)run on
++     * the BSP.
++     */
++    return continue_hypercall_on_cpu(0, microcode_update_helper, buffer);
+ }
+ 
+ static int __init cf_check microcode_init(void)
+-- 
+2.44.0
+
+
+From 184d723e7a5d1c021d297e14d19fe5344eac7a56 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Cyril=20R=C3=A9bert=20=28zithro=29?= <slack@rabbit.lu>
+Date: Tue, 27 Feb 2024 13:53:42 +0100
+Subject: [PATCH 28/70] tools/xentop: fix sorting bug for some columns
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Sort doesn't work on columns VBD_OO, VBD_RD, VBD_WR and VBD_RSECT.
+Fix by adjusting variables names in compare functions.
+Bug fix only. No functional change.
+
+Fixes: 91c3e3dc91d6 ("tools/xentop: Display '-' when stats are not available.")
+Signed-off-by: Cyril Rébert (zithro) <slack@rabbit.lu>
+Reviewed-by: Anthony PERARD <anthony.perard@citrix.com>
+master commit: 29f17d837421f13c0e0010802de1b2d51d2ded4a
+master date: 2024-02-05 17:58:23 +0000
+---
+ tools/xentop/xentop.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/tools/xentop/xentop.c b/tools/xentop/xentop.c
+index 950e8935c4..545bd5e96d 100644
+--- a/tools/xentop/xentop.c
++++ b/tools/xentop/xentop.c
+@@ -684,7 +684,7 @@ static int compare_vbd_oo(xenstat_domain *domain1, xenstat_domain *domain2)
+ 	unsigned long long dom1_vbd_oo = 0, dom2_vbd_oo = 0;
+ 
+ 	tot_vbd_reqs(domain1, FIELD_VBD_OO, &dom1_vbd_oo);
+-	tot_vbd_reqs(domain1, FIELD_VBD_OO, &dom2_vbd_oo);
++	tot_vbd_reqs(domain2, FIELD_VBD_OO, &dom2_vbd_oo);
+ 
+ 	return -compare(dom1_vbd_oo, dom2_vbd_oo);
+ }
+@@ -711,9 +711,9 @@ static int compare_vbd_rd(xenstat_domain *domain1, xenstat_domain *domain2)
+ 	unsigned long long dom1_vbd_rd = 0, dom2_vbd_rd = 0;
+ 
+ 	tot_vbd_reqs(domain1, FIELD_VBD_RD, &dom1_vbd_rd);
+-	tot_vbd_reqs(domain1, FIELD_VBD_RD, &dom2_vbd_rd);
++	tot_vbd_reqs(domain2, FIELD_VBD_RD, &dom2_vbd_rd);
+ 
+-	return -compare(dom1_vbd_rd, dom1_vbd_rd);
++	return -compare(dom1_vbd_rd, dom2_vbd_rd);
+ }
+ 
+ /* Prints number of total VBD READ requests statistic */
+@@ -738,7 +738,7 @@ static int compare_vbd_wr(xenstat_domain *domain1, xenstat_domain *domain2)
+ 	unsigned long long dom1_vbd_wr = 0, dom2_vbd_wr = 0;
+ 
+ 	tot_vbd_reqs(domain1, FIELD_VBD_WR, &dom1_vbd_wr);
+-	tot_vbd_reqs(domain1, FIELD_VBD_WR, &dom2_vbd_wr);
++	tot_vbd_reqs(domain2, FIELD_VBD_WR, &dom2_vbd_wr);
+ 
+ 	return -compare(dom1_vbd_wr, dom2_vbd_wr);
+ }
+@@ -765,7 +765,7 @@ static int compare_vbd_rsect(xenstat_domain *domain1, xenstat_domain *domain2)
+ 	unsigned long long dom1_vbd_rsect = 0, dom2_vbd_rsect = 0;
+ 
+ 	tot_vbd_reqs(domain1, FIELD_VBD_RSECT, &dom1_vbd_rsect);
+-	tot_vbd_reqs(domain1, FIELD_VBD_RSECT, &dom2_vbd_rsect);
++	tot_vbd_reqs(domain2, FIELD_VBD_RSECT, &dom2_vbd_rsect);
+ 
+ 	return -compare(dom1_vbd_rsect, dom2_vbd_rsect);
+ }
+-- 
+2.44.0
+
+
+From fa9950a527a70971bf9279be62d445cf9c83aedf Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 27 Feb 2024 13:54:04 +0100
+Subject: [PATCH 29/70] amd-vi: fix IVMD memory type checks
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The current code that parses the IVMD blocks is relaxed with regard to the
+restriction that such unity regions should always fall into memory ranges
+marked as reserved in the memory map.
+
+However the type checks for the IVMD addresses are inverted, and as a result
+IVMD ranges falling into RAM areas are accepted.  Note that having such ranges
+in the first place is a firmware bug, as IVMD should always fall into reserved
+ranges.
+
+Fixes: ed6c77ebf0c1 ('AMD/IOMMU: check / convert IVMD ranges for being / to be reserved')
+Reported-by: Ox <oxjo@proton.me>
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Tested-by: oxjo <oxjo@proton.me>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: 83afa313583019d9f159c122cecf867735d27ec5
+master date: 2024-02-06 11:56:13 +0100
+---
+ xen/drivers/passthrough/amd/iommu_acpi.c | 11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+diff --git a/xen/drivers/passthrough/amd/iommu_acpi.c b/xen/drivers/passthrough/amd/iommu_acpi.c
+index 699d33f429..96d8879e7b 100644
+--- a/xen/drivers/passthrough/amd/iommu_acpi.c
++++ b/xen/drivers/passthrough/amd/iommu_acpi.c
+@@ -426,9 +426,14 @@ static int __init parse_ivmd_block(const struct acpi_ivrs_memory *ivmd_block)
+                 return -EIO;
+             }
+ 
+-            /* Types which won't be handed out are considered good enough. */
+-            if ( !(type & (RAM_TYPE_RESERVED | RAM_TYPE_ACPI |
+-                           RAM_TYPE_UNUSABLE)) )
++            /*
++             * Types which aren't RAM are considered good enough.
++             * Note that a page being partially RESERVED, ACPI or UNUSABLE will
++             * force Xen into assuming the whole page as having that type in
++             * practice.
++             */
++            if ( type & (RAM_TYPE_RESERVED | RAM_TYPE_ACPI |
++                         RAM_TYPE_UNUSABLE) )
+                 continue;
+ 
+             AMD_IOMMU_ERROR("IVMD: page at %lx can't be converted\n", addr);
+-- 
+2.44.0
+
+
+From 16475909baa2bcfda3ebc07ced5e5cd0ca8172d6 Mon Sep 17 00:00:00 2001
+From: Jason Andryuk <jandryuk@gmail.com>
+Date: Tue, 27 Feb 2024 13:55:03 +0100
+Subject: [PATCH 30/70] block-common: Fix same_vm for no targets
+
+same_vm is broken when the two main domains do not have targets.  otvm
+and targetvm are both missing, which means they get set to -1 and then
+converted to empty strings:
+
+++10697+ local targetvm=-1
+++10697+ local otvm=-1
+++10697+ otvm=
+++10697+ othervm=/vm/cc97bc2f-3a91-43f7-8fbc-4cb92f90b4e4
+++10697+ targetvm=
+++10697+ local frontend_uuid=/vm/844dea4e-44f8-4e3e-8145-325132a31ca5
+
+The final comparison returns true since the two empty strings match:
+
+++10697+ '[' /vm/844dea4e-44f8-4e3e-8145-325132a31ca5 = /vm/cc97bc2f-3a91-43f7-8fbc-4cb92f90b4e4 -o '' = /vm/cc97bc2f-3a91-43f7-8fbc-4cb92f90b4e4 -o /vm/844dea4e-44f8-4e3e-8145-325132a31ca5 = '' -o '' = '' ']'
+
+Replace -1 with distinct strings indicating the lack of a value and
+remove the coalescing to empty strings.  The strings themselves will no
+longer match, and that is correct.
+
+++12364+ '[' /vm/844dea4e-44f8-4e3e-8145-325132a31ca5 = /vm/cc97bc2f-3a91-43f7-8fbc-4cb92f90b4e4 -o 'No target' = /vm/cc97bc2f-3a91-43f7-8fbc-4cb92f90b4e4 -o /vm/844dea4e-44f8-4e3e-8145-325132a31ca5 = 'No other target' -o 'No target' = 'No other target' ']'
+
+Signed-off-by: Jason Andryuk <jandryuk@gmail.com>
+Reviewed-by: Anthony PERARD <anthony.perard@citrix.com>
+master commit: e8f1bb803fdf44db708991593568a9e3e6b3d130
+master date: 2024-02-07 13:46:52 +0100
+---
+ tools/hotplug/Linux/block-common.sh | 8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+diff --git a/tools/hotplug/Linux/block-common.sh b/tools/hotplug/Linux/block-common.sh
+index f86a88c4eb..5c80237d99 100644
+--- a/tools/hotplug/Linux/block-common.sh
++++ b/tools/hotplug/Linux/block-common.sh
+@@ -112,14 +112,12 @@ same_vm()
+                   "$FRONTEND_UUID")
+   local target=$(xenstore_read_default  "/local/domain/$FRONTEND_ID/target"   \
+                  "-1")
+-  local targetvm=$(xenstore_read_default "/local/domain/$target/vm" "-1")
++  local targetvm=$(xenstore_read_default "/local/domain/$target/vm" "No Target")
+   local otarget=$(xenstore_read_default  "/local/domain/$otherdom/target"   \
+                  "-1")
+   local otvm=$(xenstore_read_default  "/local/domain/$otarget/vm"   \
+-                 "-1")
+-  otvm=${otvm%-1}
+-  othervm=${othervm%-1}
+-  targetvm=${targetvm%-1}
++                 "No Other Target")
++
+   local frontend_uuid=${FRONTEND_UUID%-1}
+   
+   [ "$frontend_uuid" = "$othervm" -o "$targetvm" = "$othervm" -o \
+-- 
+2.44.0
+
+
+From b51fd78aed865033413178f5953147effedc7ce0 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Petr=20Bene=C5=A1?= <w1benny@gmail.com>
+Date: Tue, 27 Feb 2024 13:55:25 +0100
+Subject: [PATCH 31/70] x86/hvm: Fix fast singlestep state persistence
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This patch addresses an issue where the fast singlestep setting would persist
+despite xc_domain_debug_control being called with XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF.
+Specifically, if fast singlestep was enabled in a VMI session and that session
+stopped before the MTF trap occurred, the fast singlestep setting remained
+active even though MTF itself was disabled.  This led to a situation where, upon
+starting a new VMI session, the first event to trigger an EPT violation would
+cause the corresponding EPT event callback to be skipped due to the lingering
+fast singlestep setting.
+
+The fix ensures that the fast singlestep setting is properly reset when
+disabling single step debugging operations.
+
+Signed-off-by: Petr Beneš <w1benny@gmail.com>
+Reviewed-by: Tamas K Lengyel <tamas@tklengyel.com>
+master commit: 897def94b56175ce569673a05909d2f223e1e749
+master date: 2024-02-12 09:37:58 +0100
+---
+ xen/arch/x86/hvm/hvm.c | 34 ++++++++++++++++++++++++----------
+ 1 file changed, 24 insertions(+), 10 deletions(-)
+
+diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
+index 482eebbabf..a70b351373 100644
+--- a/xen/arch/x86/hvm/hvm.c
++++ b/xen/arch/x86/hvm/hvm.c
+@@ -5167,26 +5167,40 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg)
+ 
+ int hvm_debug_op(struct vcpu *v, int32_t op)
+ {
+-    int rc;
++    int rc = 0;
+ 
+     switch ( op )
+     {
+         case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON:
+         case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF:
+-            rc = -EOPNOTSUPP;
+             if ( !cpu_has_monitor_trap_flag )
+-                break;
+-            rc = 0;
+-            vcpu_pause(v);
+-            v->arch.hvm.single_step =
+-                (op == XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON);
+-            vcpu_unpause(v); /* guest will latch new state */
++                return -EOPNOTSUPP;
+             break;
+         default:
+-            rc = -ENOSYS;
+-            break;
++            return -ENOSYS;
++    }
++
++    vcpu_pause(v);
++
++    switch ( op )
++    {
++    case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON:
++        v->arch.hvm.single_step = true;
++        break;
++
++    case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF:
++        v->arch.hvm.single_step = false;
++        v->arch.hvm.fast_single_step.enabled = false;
++        v->arch.hvm.fast_single_step.p2midx = 0;
++        break;
++
++    default: /* Excluded above */
++        ASSERT_UNREACHABLE();
++        return -ENOSYS;
+     }
+ 
++    vcpu_unpause(v); /* guest will latch new state */
++
+     return rc;
+ }
+ 
+-- 
+2.44.0
+
+
+From 59e6ad6597dc9930c966b20485a9d0b369ff71a5 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 27 Feb 2024 13:55:56 +0100
+Subject: [PATCH 32/70] x86/HVM: tidy state on hvmemul_map_linear_addr()'s
+ error path
+
+While in the vast majority of cases failure of the function will not
+be followed by re-invocation with the same emulation context, a few
+very specific insns - involving multiple independent writes, e.g. ENTER
+and PUSHA - exist where this can happen. Since failure of the function
+only signals to the caller that it ought to try an MMIO write instead,
+such failure also cannot be assumed to result in wholesale failure of
+emulation of the current insn. Instead we have to maintain internal
+state such that another invocation of the function with the same
+emulation context remains possible. To achieve that we need to reset MFN
+slots after putting page references on the error path.
+
+Note that all of this affects debugging code only, in causing an
+assertion to trigger (higher up in the function). There's otherwise no
+misbehavior - such a "leftover" slot would simply be overwritten by new
+contents in a release build.
+
+Also extend the related unmap() assertion, to further check for MFN 0.
+
+Fixes: 8cbd4fb0b7ea ("x86/hvm: implement hvmemul_write() using real mappings")
+Reported-by: Manuel Andreas <manuel.andreas@tum.de>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Paul Durrant <paul@xen.org>
+master commit: e72f951df407bc3be82faac64d8733a270036ba1
+master date: 2024-02-13 09:36:14 +0100
+---
+ xen/arch/x86/hvm/emulate.c | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
+index 254716c766..865aa08bbc 100644
+--- a/xen/arch/x86/hvm/emulate.c
++++ b/xen/arch/x86/hvm/emulate.c
+@@ -696,7 +696,12 @@ static void *hvmemul_map_linear_addr(
+  out:
+     /* Drop all held references. */
+     while ( mfn-- > hvmemul_ctxt->mfn )
++    {
+         put_page(mfn_to_page(*mfn));
++#ifndef NDEBUG /* Clean slot for a subsequent map()'s error checking. */
++        *mfn = _mfn(0);
++#endif
++    }
+ 
+     return err;
+ }
+@@ -718,7 +723,7 @@ static void hvmemul_unmap_linear_addr(
+ 
+     for ( i = 0; i < nr_frames; i++ )
+     {
+-        ASSERT(mfn_valid(*mfn));
++        ASSERT(mfn_x(*mfn) && mfn_valid(*mfn));
+         paging_mark_dirty(currd, *mfn);
+         put_page(mfn_to_page(*mfn));
+ 
+-- 
+2.44.0
+
+
+From 006764b871db75d5d025500a079ad246d1d418a1 Mon Sep 17 00:00:00 2001
+From: Anthony PERARD <anthony.perard@citrix.com>
+Date: Tue, 27 Feb 2024 13:56:25 +0100
+Subject: [PATCH 33/70] build: Replace `which` with `command -v`
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The `which` command is not standard, may not exist on the build host,
+or may not behave as expected by the build system. It is recommended
+to use `command -v` to find out if a command exists and to get its path,
+and it's part of a POSIX shell standard (at least, it seems to be
+mandatory since IEEE Std 1003.1-2008, but was optional before).
+
+Fixes: c8a8645f1efe ("xen/build: Automatically locate a suitable python interpreter")
+Fixes: 3b47bcdb6d38 ("xen/build: Use a distro version of figlet")
+Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
+Tested-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: f93629b18b528a5ab1b1092949c5420069c7226c
+master date: 2024-02-19 12:45:48 +0100
+---
+ xen/Makefile | 4 ++--
+ xen/build.mk | 2 +-
+ 2 files changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/xen/Makefile b/xen/Makefile
+index a92709b43e..59d368e4d8 100644
+--- a/xen/Makefile
++++ b/xen/Makefile
+@@ -25,8 +25,8 @@ export XEN_BUILD_HOST	:= $(shell hostname)
+ endif
+ 
+ # Best effort attempt to find a python interpreter, defaulting to Python 3 if
+-# available.  Fall back to just `python` if `which` is nowhere to be found.
+-PYTHON_INTERPRETER	:= $(word 1,$(shell which python3 python python2 2>/dev/null) python)
++# available.  Fall back to just `python`.
++PYTHON_INTERPRETER	:= $(word 1,$(shell command -v python3 || command -v python || command -v python2) python)
+ export PYTHON		?= $(PYTHON_INTERPRETER)
+ 
+ export CHECKPOLICY	?= checkpolicy
+diff --git a/xen/build.mk b/xen/build.mk
+index 26dd5a8e87..0f490ca71b 100644
+--- a/xen/build.mk
++++ b/xen/build.mk
+@@ -1,6 +1,6 @@
+ quiet_cmd_banner = BANNER  $@
+ define cmd_banner
+-    if which figlet >/dev/null 2>&1 ; then \
++    if command -v figlet >/dev/null 2>&1 ; then \
+ 	echo " Xen $(XEN_FULLVERSION)" | figlet -f $< > $@.tmp; \
+     else \
+ 	echo " Xen $(XEN_FULLVERSION)" > $@.tmp; \
+-- 
+2.44.0
+
+
+From 489c2b9ba173376e978c0ef3de416a2f09452e85 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?=
+ <marmarek@invisiblethingslab.com>
+Date: Tue, 27 Feb 2024 13:57:07 +0100
+Subject: [PATCH 34/70] libxl: Disable relocating memory for qemu-xen in
+ stubdomain too
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+According to comments (and experiments) qemu-xen cannot handle memory
+relocation done by hvmloader. The code was already disabled when running
+qemu-xen in dom0 (see libxl__spawn_local_dm()), but it was missed when
+adding qemu-xen support to stubdomain. Adjust libxl__spawn_stub_dm() to
+be consistent in this regard.
+
+Reported-by: Neowutran <xen@neowutran.ovh>
+Signed-off-by: Marek Marczykowski-Górecki <marmarek@invisiblethingslab.com>
+Reviewed-by: Jason Andryuk <jandryuk@gmail.com>
+Acked-by: Anthony PERARD <anthony.perard@citrix.com>
+master commit: 97883aa269f6745a6ded232be3a855abb1297e0d
+master date: 2024-02-22 11:48:22 +0100
+---
+ tools/libs/light/libxl_dm.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c
+index 14b593110f..ed620a9d8e 100644
+--- a/tools/libs/light/libxl_dm.c
++++ b/tools/libs/light/libxl_dm.c
+@@ -2432,6 +2432,16 @@ void libxl__spawn_stub_dm(libxl__egc *egc, libxl__stub_dm_spawn_state *sdss)
+                         "%s",
+                         libxl_bios_type_to_string(guest_config->b_info.u.hvm.bios));
+     }
++    /* Disable relocating memory to make the MMIO hole larger
++     * unless we're running qemu-traditional and vNUMA is not
++     * configured. */
++    libxl__xs_printf(gc, XBT_NULL,
++                     libxl__sprintf(gc, "%s/hvmloader/allow-memory-relocate",
++                                    libxl__xs_get_dompath(gc, guest_domid)),
++                     "%d",
++                     guest_config->b_info.device_model_version
++                        == LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL &&
++                     !libxl__vnuma_configured(&guest_config->b_info));
+     ret = xc_domain_set_target(ctx->xch, dm_domid, guest_domid);
+     if (ret<0) {
+         LOGED(ERROR, guest_domid, "setting target domain %d -> %d",
+-- 
+2.44.0
+
+
+From 5fda82641461a5234ab9bf0575423dfb8bfc5657 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 27 Feb 2024 13:57:31 +0100
+Subject: [PATCH 35/70] build: make sure build fails when running kconfig fails
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Because of using "-include", failure to (re)build auto.conf (with
+auto.conf.cmd produced as a secondary target) won't stop make from
+continuing the build. Arrange for it being possible to drop the - from
+Rules.mk, requiring that the include be skipped for tools-only targets.
+Note that relying on the inclusion in those cases wouldn't be correct
+anyway, as it might be a stale file (yet to be rebuilt) which would be
+included, while during initial build, the file would be absent
+altogether.
+
+Fixes: 8d4c17a90b0a ("xen/build: silence make warnings about missing auto.conf*")
+Reported-by: Roger Pau Monné <roger.pau@citrix.com>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Anthony PERARD <anthony.perard@citrix.com>
+master commit: d34e5fa2e8db19f23081f46a3e710bb122130691
+master date: 2024-02-22 11:52:47 +0100
+---
+ xen/Makefile | 1 +
+ xen/Rules.mk | 4 +++-
+ 2 files changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/xen/Makefile b/xen/Makefile
+index 59d368e4d8..fdf9fd3f22 100644
+--- a/xen/Makefile
++++ b/xen/Makefile
+@@ -374,6 +374,7 @@ $(KCONFIG_CONFIG): tools_fixdep
+ # This exploits the 'multi-target pattern rule' trick.
+ # The syncconfig should be executed only once to make all the targets.
+ include/config/%.conf include/config/%.conf.cmd: $(KCONFIG_CONFIG)
++	$(Q)rm -f include/config/auto.conf
+ 	$(Q)$(MAKE) $(build)=tools/kconfig syncconfig
+ 
+ ifeq ($(CONFIG_DEBUG),y)
+diff --git a/xen/Rules.mk b/xen/Rules.mk
+index 8af3dd7277..d759cccee3 100644
+--- a/xen/Rules.mk
++++ b/xen/Rules.mk
+@@ -15,7 +15,9 @@ srcdir := $(srctree)/$(src)
+ PHONY := __build
+ __build:
+ 
+--include $(objtree)/include/config/auto.conf
++ifneq ($(firstword $(subst /, ,$(obj))),tools)
++include $(objtree)/include/config/auto.conf
++endif
+ 
+ include $(XEN_ROOT)/Config.mk
+ include $(srctree)/scripts/Kbuild.include
+-- 
+2.44.0
+
+
+From a751d1321f6e1491d6ec2134d59eefa9f9752b86 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 27 Feb 2024 13:57:50 +0100
+Subject: [PATCH 36/70] x86emul: add missing EVEX.R' checks
+
+EVEX.R' is not ignored in 64-bit code when encoding a GPR or mask
+register. While for mask registers suitable checks are in place (there
+also covering EVEX.R), they were missing for the few cases where in
+EVEX-encoded instructions ModR/M.reg encodes a GPR. While for VPEXTRW
+the bit is replaced before an emulation stub is invoked, for
+VCVT{,T}{S,D,H}2{,U}SI this actually would have led to #UD from inside
+an emulation stub, in turn raising #UD to the guest, but accompanied by
+log messages indicating something's wrong in Xen nevertheless.
+
+Fixes: 001bd91ad864 ("x86emul: support AVX512{F,BW,DQ} extract insns")
+Fixes: baf4a376f550 ("x86emul: support AVX512F legacy-equivalent scalar int/FP conversion insns")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: cb319824bfa8d3c9ea0410cc71daaedc3e11aa2a
+master date: 2024-02-22 11:54:07 +0100
+---
+ xen/arch/x86/x86_emulate/x86_emulate.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c
+index cf780da501..d6b60f0539 100644
+--- a/xen/arch/x86/x86_emulate/x86_emulate.c
++++ b/xen/arch/x86/x86_emulate/x86_emulate.c
+@@ -3686,7 +3686,8 @@ x86_emulate(
+     CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2d): /* vcvts{s,d}2si xmm/mem,reg */
+     CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x78): /* vcvtts{s,d}2usi xmm/mem,reg */
+     CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x79): /* vcvts{s,d}2usi xmm/mem,reg */
+-        generate_exception_if((evex.reg != 0xf || !evex.RX || evex.opmsk ||
++        generate_exception_if((evex.reg != 0xf || !evex.RX || !evex.R ||
++                               evex.opmsk ||
+                                (ea.type != OP_REG && evex.brs)),
+                               X86_EXC_UD);
+         host_and_vcpu_must_have(avx512f);
+@@ -7295,7 +7296,7 @@ x86_emulate(
+         goto pextr;
+ 
+     case X86EMUL_OPC_EVEX_66(0x0f, 0xc5):   /* vpextrw $imm8,xmm,reg */
+-        generate_exception_if(ea.type != OP_REG, X86_EXC_UD);
++        generate_exception_if(ea.type != OP_REG || !evex.R, X86_EXC_UD);
+         /* Convert to alternative encoding: We want to use a memory operand. */
+         evex.opcx = ext_0f3a;
+         b = 0x15;
+-- 
+2.44.0
+
+
+From 33a0368d3beb82ddb0cf7ed398b047325bb7be1c Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 27 Feb 2024 13:58:21 +0100
+Subject: [PATCH 37/70] xen/livepatch: fix norevert test hook setup typo
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The test code has a typo in using LIVEPATCH_APPLY_HOOK() instead of
+LIVEPATCH_REVERT_HOOK().
+
+Fixes: 6047104c3ccc ('livepatch: Add per-function applied/reverted state tracking marker')
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Ross Lagerwall <ross.lagerwall@citrix.com>
+master commit: f0622dd4fd6ae6ddb523a45d89ed9b8f3a9a8f36
+master date: 2024-02-26 10:13:46 +0100
+---
+ xen/test/livepatch/xen_action_hooks_norevert.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xen/test/livepatch/xen_action_hooks_norevert.c b/xen/test/livepatch/xen_action_hooks_norevert.c
+index 3e21ade6ab..c173855192 100644
+--- a/xen/test/livepatch/xen_action_hooks_norevert.c
++++ b/xen/test/livepatch/xen_action_hooks_norevert.c
+@@ -120,7 +120,7 @@ static void post_revert_hook(livepatch_payload_t *payload)
+     printk(KERN_DEBUG "%s: Hook done.\n", __func__);
+ }
+ 
+-LIVEPATCH_APPLY_HOOK(revert_hook);
++LIVEPATCH_REVERT_HOOK(revert_hook);
+ 
+ LIVEPATCH_PREAPPLY_HOOK(pre_apply_hook);
+ LIVEPATCH_POSTAPPLY_HOOK(post_apply_hook);
+-- 
+2.44.0
+
+
+From f6e5ab5fa7257783fdbbaabf6010d8d97656c11f Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 27 Feb 2024 13:58:36 +0100
+Subject: [PATCH 38/70] xen/cmdline: fix printf format specifier in
+ no_config_param()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+'*' sets the width field, which is the minimum number of characters to output,
+but what we want in no_config_param() is the precision instead, which is '.*'
+as it imposes a maximum limit on the output.
+
+Fixes: 68d757df8dd2 ('x86/pv: Options to disable and/or compile out 32bit PV support')
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: ef101f525173cf51dc70f4c77862f6f10a8ddccf
+master date: 2024-02-26 10:17:40 +0100
+---
+ xen/include/xen/param.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xen/include/xen/param.h b/xen/include/xen/param.h
+index 93c3fe7cb7..e02e49635c 100644
+--- a/xen/include/xen/param.h
++++ b/xen/include/xen/param.h
+@@ -191,7 +191,7 @@ static inline void no_config_param(const char *cfg, const char *param,
+ {
+     int len = e ? ({ ASSERT(e >= s); e - s; }) : strlen(s);
+ 
+-    printk(XENLOG_INFO "CONFIG_%s disabled - ignoring '%s=%*s' setting\n",
++    printk(XENLOG_INFO "CONFIG_%s disabled - ignoring '%s=%.*s' setting\n",
+            cfg, param, len, s);
+ }
+ 
+-- 
+2.44.0
+
+
+From 19fd9ff9981732995b1028f9e7e406061b723651 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 27 Feb 2024 13:59:05 +0100
+Subject: [PATCH 39/70] x86/altcall: use a union as register type for function
+ parameters on clang
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The current code for alternative calls uses the caller parameter types as the
+types for the register variables that serve as function parameters:
+
+uint8_t foo;
+[...]
+alternative_call(myfunc, foo);
+
+Would expand roughly into:
+
+register uint8_t a1_ asm("rdi") = foo;
+register unsigned long a2_ asm("rsi");
+[...]
+asm volatile ("call *%c[addr](%%rip)"...);
+
+However with -O2 clang will generate incorrect code, given the following
+example:
+
+unsigned int func(uint8_t t)
+{
+    return t;
+}
+
+static void bar(uint8_t b)
+{
+    int ret_;
+    register uint8_t di asm("rdi") = b;
+    register unsigned long si asm("rsi");
+    register unsigned long dx asm("rdx");
+    register unsigned long cx asm("rcx");
+    register unsigned long r8 asm("r8");
+    register unsigned long r9 asm("r9");
+    register unsigned long r10 asm("r10");
+    register unsigned long r11 asm("r11");
+
+    asm volatile ( "call %c[addr]"
+                   : "+r" (di), "=r" (si), "=r" (dx),
+                     "=r" (cx), "=r" (r8), "=r" (r9),
+                     "=r" (r10), "=r" (r11), "=a" (ret_)
+                   : [addr] "i" (&(func)), "g" (func)
+                   : "memory" );
+}
+
+void foo(unsigned int a)
+{
+    bar(a);
+}
+
+Clang generates the following assembly code:
+
+func:                                   # @func
+        movl    %edi, %eax
+        retq
+foo:                                    # @foo
+        callq   func
+        retq
+
+Note the truncation of the unsigned int parameter 'a' of foo() to uint8_t when
+passed into bar() is lost.  clang doesn't zero extend the parameters in the
+callee when required, as the psABI mandates.
+
+The above can be worked around by using a union when defining the register
+variables, so that `di` becomes:
+
+register union {
+    uint8_t e;
+    unsigned long r;
+} di asm("rdi") = { .e = b };
+
+Which results in following code generated for `foo()`:
+
+foo:                                    # @foo
+        movzbl  %dil, %edi
+        callq   func
+        retq
+
+So the truncation is no longer lost.  Apply such workaround only when built
+with clang.
+
+Reported-by: Matthew Grooms <mgrooms@shrew.net>
+Link: https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=277200
+Link: https://github.com/llvm/llvm-project/issues/12579
+Link: https://github.com/llvm/llvm-project/issues/82598
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+master commit: 2ce562b2a413cbdb2e1128989ed1722290a27c4e
+master date: 2024-02-26 10:18:01 +0100
+---
+ xen/arch/x86/include/asm/alternative.h | 25 +++++++++++++++++++++++++
+ 1 file changed, 25 insertions(+)
+
+diff --git a/xen/arch/x86/include/asm/alternative.h b/xen/arch/x86/include/asm/alternative.h
+index a1cd6a9fe5..3c14db5078 100644
+--- a/xen/arch/x86/include/asm/alternative.h
++++ b/xen/arch/x86/include/asm/alternative.h
+@@ -167,9 +167,34 @@ extern void alternative_branches(void);
+ #define ALT_CALL_arg5 "r8"
+ #define ALT_CALL_arg6 "r9"
+ 
++#ifdef CONFIG_CC_IS_CLANG
++/*
++ * Use a union with an unsigned long in order to prevent clang from
++ * skipping a possible truncation of the value.  By using the union any
++ * truncation is carried before the call instruction, in turn covering
++ * for ABI-non-compliance in that the necessary clipping / extension of
++ * the value is supposed to be carried out in the callee.
++ *
++ * Note this behavior is not mandated by the standard, and hence could
++ * stop being a viable workaround, or worse, could cause a different set
++ * of code-generation issues in future clang versions.
++ *
++ * This has been reported upstream:
++ * https://github.com/llvm/llvm-project/issues/12579
++ * https://github.com/llvm/llvm-project/issues/82598
++ */
++#define ALT_CALL_ARG(arg, n)                                            \
++    register union {                                                    \
++        typeof(arg) e;                                                  \
++        unsigned long r;                                                \
++    } a ## n ## _ asm ( ALT_CALL_arg ## n ) = {                         \
++        .e = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })   \
++    }
++#else
+ #define ALT_CALL_ARG(arg, n) \
+     register typeof(arg) a ## n ## _ asm ( ALT_CALL_arg ## n ) = \
+         ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); })
++#endif
+ #define ALT_CALL_NO_ARG(n) \
+     register unsigned long a ## n ## _ asm ( ALT_CALL_arg ## n )
+ 
+-- 
+2.44.0
+
+
+From 4d47dca20dcfdca2340c8cda6f50dcdcafb1c054 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 27 Feb 2024 13:59:42 +0100
+Subject: [PATCH 40/70] x86/spec: fix BRANCH_HARDEN option to only be set when
+ build-enabled
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The current logic to handle the BRANCH_HARDEN option will report it as enabled
+even when build-time disabled. Fix this by only allowing the option to be set
+when support for it is built into Xen.
+
+Fixes: 2d6f36daa086 ('x86/nospec: Introduce CONFIG_SPECULATIVE_HARDEN_BRANCH')
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: 60e00f77a5cc671d30c5ef3318f5b8e9b74e4aa3
+master date: 2024-02-26 16:06:42 +0100
+---
+ xen/arch/x86/spec_ctrl.c | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index a8d8af22f6..01ba59cff7 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -50,7 +50,8 @@ static int8_t __initdata opt_psfd = -1;
+ int8_t __ro_after_init opt_ibpb_ctxt_switch = -1;
+ int8_t __read_mostly opt_eager_fpu = -1;
+ int8_t __read_mostly opt_l1d_flush = -1;
+-static bool __initdata opt_branch_harden = true;
++static bool __initdata opt_branch_harden =
++    IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH);
+ 
+ bool __initdata bsp_delay_spec_ctrl;
+ uint8_t __read_mostly default_xen_spec_ctrl;
+@@ -268,7 +269,16 @@ static int __init cf_check parse_spec_ctrl(const char *s)
+         else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 )
+             opt_l1d_flush = val;
+         else if ( (val = parse_boolean("branch-harden", s, ss)) >= 0 )
+-            opt_branch_harden = val;
++        {
++            if ( IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH) )
++                opt_branch_harden = val;
++            else
++            {
++                no_config_param("SPECULATIVE_HARDEN_BRANCH", "spec-ctrl", s,
++                                ss);
++                rc = -EINVAL;
++            }
++        }
+         else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 )
+             opt_srb_lock = val;
+         else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 )
+-- 
+2.44.0
+
+
+From 58bb8115104c9fca749ee4cfcd3579ac1ed644db Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 27 Feb 2024 14:00:22 +0100
+Subject: [PATCH 41/70] x86: account for shadow stack in exception-from-stub
+ recovery
+
+Dealing with exceptions raised from within emulation stubs involves
+discarding the return address (replaced by exception related information).
+Such discarding of course also requires removing the corresponding entry
+from the shadow stack.
+
+Also amend the comment in fixup_exception_return(), to further clarify
+why use of ptr[1] can't be an out-of-bounds access.
+
+While touching do_invalid_op() also add a missing fall-through
+annotation.
+
+This is CVE-2023-46841 / XSA-451.
+
+Fixes: 209fb9919b50 ("x86/extable: Adjust extable handling to be shadow stack compatible")
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: 91f5f7a9154919a765c3933521760acffeddbf28
+master date: 2024-02-27 13:49:22 +0100
+---
+ xen/arch/x86/extable.c             | 20 ++++++----
+ xen/arch/x86/include/asm/uaccess.h |  3 +-
+ xen/arch/x86/traps.c               | 62 +++++++++++++++++++++++++++---
+ 3 files changed, 71 insertions(+), 14 deletions(-)
+
+diff --git a/xen/arch/x86/extable.c b/xen/arch/x86/extable.c
+index 74b14246e9..8ffcd346d7 100644
+--- a/xen/arch/x86/extable.c
++++ b/xen/arch/x86/extable.c
+@@ -86,26 +86,29 @@ search_one_extable(const struct exception_table_entry *first,
+ }
+ 
+ unsigned long
+-search_exception_table(const struct cpu_user_regs *regs)
++search_exception_table(const struct cpu_user_regs *regs, unsigned long *stub_ra)
+ {
+     const struct virtual_region *region = find_text_region(regs->rip);
+     unsigned long stub = this_cpu(stubs.addr);
+ 
+     if ( region && region->ex )
++    {
++        *stub_ra = 0;
+         return search_one_extable(region->ex, region->ex_end, regs->rip);
++    }
+ 
+     if ( regs->rip >= stub + STUB_BUF_SIZE / 2 &&
+          regs->rip < stub + STUB_BUF_SIZE &&
+          regs->rsp > (unsigned long)regs &&
+          regs->rsp < (unsigned long)get_cpu_info() )
+     {
+-        unsigned long retptr = *(unsigned long *)regs->rsp;
++        unsigned long retaddr = *(unsigned long *)regs->rsp, fixup;
+ 
+-        region = find_text_region(retptr);
+-        retptr = region && region->ex
+-                 ? search_one_extable(region->ex, region->ex_end, retptr)
+-                 : 0;
+-        if ( retptr )
++        region = find_text_region(retaddr);
++        fixup = region && region->ex
++                ? search_one_extable(region->ex, region->ex_end, retaddr)
++                : 0;
++        if ( fixup )
+         {
+             /*
+              * Put trap number and error code on the stack (in place of the
+@@ -117,7 +120,8 @@ search_exception_table(const struct cpu_user_regs *regs)
+             };
+ 
+             *(unsigned long *)regs->rsp = token.raw;
+-            return retptr;
++            *stub_ra = retaddr;
++            return fixup;
+         }
+     }
+ 
+diff --git a/xen/arch/x86/include/asm/uaccess.h b/xen/arch/x86/include/asm/uaccess.h
+index 684fccd95c..74bb222c03 100644
+--- a/xen/arch/x86/include/asm/uaccess.h
++++ b/xen/arch/x86/include/asm/uaccess.h
+@@ -421,7 +421,8 @@ union stub_exception_token {
+     unsigned long raw;
+ };
+ 
+-extern unsigned long search_exception_table(const struct cpu_user_regs *regs);
++extern unsigned long search_exception_table(const struct cpu_user_regs *regs,
++                                            unsigned long *stub_ra);
+ extern void sort_exception_tables(void);
+ extern void sort_exception_table(struct exception_table_entry *start,
+                                  const struct exception_table_entry *stop);
+diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
+index e1356f696a..45e1b277ea 100644
+--- a/xen/arch/x86/traps.c
++++ b/xen/arch/x86/traps.c
+@@ -845,7 +845,7 @@ void do_unhandled_trap(struct cpu_user_regs *regs)
+ }
+ 
+ static void fixup_exception_return(struct cpu_user_regs *regs,
+-                                   unsigned long fixup)
++                                   unsigned long fixup, unsigned long stub_ra)
+ {
+     if ( IS_ENABLED(CONFIG_XEN_SHSTK) )
+     {
+@@ -862,7 +862,8 @@ static void fixup_exception_return(struct cpu_user_regs *regs,
+             /*
+              * Search for %rip.  The shstk currently looks like this:
+              *
+-             *   ...  [Likely pointed to by SSP]
++             *   tok  [Supervisor token, == &tok | BUSY, only with FRED inactive]
++             *   ...  [Pointed to by SSP for most exceptions, empty in IST cases]
+              *   %cs  [== regs->cs]
+              *   %rip [== regs->rip]
+              *   SSP  [Likely points to 3 slots higher, above %cs]
+@@ -880,7 +881,56 @@ static void fixup_exception_return(struct cpu_user_regs *regs,
+              */
+             if ( ptr[0] == regs->rip && ptr[1] == regs->cs )
+             {
++                unsigned long primary_shstk =
++                    (ssp & ~(STACK_SIZE - 1)) +
++                    (PRIMARY_SHSTK_SLOT + 1) * PAGE_SIZE - 8;
++
+                 wrss(fixup, ptr);
++
++                if ( !stub_ra )
++                    goto shstk_done;
++
++                /*
++                 * Stub recovery ought to happen only when the outer context
++                 * was on the main shadow stack.  We need to also "pop" the
++                 * stub's return address from the interrupted context's shadow
++                 * stack.  That is,
++                 * - if we're still on the main stack, we need to move the
++                 *   entire stack (up to and including the exception frame)
++                 *   up by one slot, incrementing the original SSP in the
++                 *   exception frame,
++                 * - if we're on an IST stack, we need to increment the
++                 *   original SSP.
++                 */
++                BUG_ON((ptr[-1] ^ primary_shstk) >> PAGE_SHIFT);
++
++                if ( (ssp ^ primary_shstk) >> PAGE_SHIFT )
++                {
++                    /*
++                     * We're on an IST stack.  First make sure the two return
++                     * addresses actually match.  Then increment the interrupted
++                     * context's SSP.
++                     */
++                    BUG_ON(stub_ra != *(unsigned long*)ptr[-1]);
++                    wrss(ptr[-1] + 8, &ptr[-1]);
++                    goto shstk_done;
++                }
++
++                /* Make sure the two return addresses actually match. */
++                BUG_ON(stub_ra != ptr[2]);
++
++                /* Move exception frame, updating SSP there. */
++                wrss(ptr[1], &ptr[2]); /* %cs */
++                wrss(ptr[0], &ptr[1]); /* %rip */
++                wrss(ptr[-1] + 8, &ptr[0]); /* SSP */
++
++                /* Move all newer entries. */
++                while ( --ptr != _p(ssp) )
++                    wrss(ptr[-1], &ptr[0]);
++
++                /* Finally account for our own stack having shifted up. */
++                asm volatile ( "incsspd %0" :: "r" (2) );
++
+                 goto shstk_done;
+             }
+         }
+@@ -901,7 +951,8 @@ static void fixup_exception_return(struct cpu_user_regs *regs,
+ 
+ static bool extable_fixup(struct cpu_user_regs *regs, bool print)
+ {
+-    unsigned long fixup = search_exception_table(regs);
++    unsigned long stub_ra = 0;
++    unsigned long fixup = search_exception_table(regs, &stub_ra);
+ 
+     if ( unlikely(fixup == 0) )
+         return false;
+@@ -915,7 +966,7 @@ static bool extable_fixup(struct cpu_user_regs *regs, bool print)
+                vector_name(regs->entry_vector), regs->error_code,
+                _p(regs->rip), _p(regs->rip), _p(fixup));
+ 
+-    fixup_exception_return(regs, fixup);
++    fixup_exception_return(regs, fixup, stub_ra);
+     this_cpu(last_extable_addr) = regs->rip;
+ 
+     return true;
+@@ -1183,7 +1234,8 @@ void do_invalid_op(struct cpu_user_regs *regs)
+     {
+     case BUGFRAME_run_fn:
+     case BUGFRAME_warn:
+-        fixup_exception_return(regs, (unsigned long)eip);
++        fixup_exception_return(regs, (unsigned long)eip, 0);
++        fallthrough;
+     case BUGFRAME_bug:
+     case BUGFRAME_assert:
+         return;
+-- 
+2.44.0
+
+
+From 498b3624d0ecc1267773e6482fd0b732e90c4511 Mon Sep 17 00:00:00 2001
+From: Michal Orzel <michal.orzel@amd.com>
+Date: Thu, 8 Feb 2024 11:43:39 +0100
+Subject: [PATCH 42/70] xen/arm: Fix UBSAN failure in start_xen()
+
+When running Xen on arm32, in a scenario where Xen is loaded at an address
+such as boot_phys_offset >= 2GB, UBSAN reports the following:
+
+(XEN) UBSAN: Undefined behaviour in arch/arm/setup.c:739:58
+(XEN) pointer operation underflowed 00200000 to 86800000
+(XEN) Xen WARN at common/ubsan/ubsan.c:172
+(XEN) ----[ Xen-4.19-unstable  arm32  debug=y ubsan=y  Not tainted ]----
+...
+(XEN) Xen call trace:
+(XEN)    [<0031b4c0>] ubsan.c#ubsan_epilogue+0x18/0xf0 (PC)
+(XEN)    [<0031d134>] __ubsan_handle_pointer_overflow+0xb8/0xd4 (LR)
+(XEN)    [<0031d134>] __ubsan_handle_pointer_overflow+0xb8/0xd4
+(XEN)    [<004d15a8>] start_xen+0xe0/0xbe0
+(XEN)    [<0020007c>] head.o#primary_switched+0x4/0x30
+
+The failure is reported for the following line:
+(paddr_t)(uintptr_t)(_start + boot_phys_offset)
+
+This occurs because the compiler treats (ptr + size) with size bigger than
+PTRDIFF_MAX as undefined behavior. To address this, switch to macro
+virt_to_maddr(), given the future plans to eliminate boot_phys_offset.
+
+Signed-off-by: Michal Orzel <michal.orzel@amd.com>
+Reviewed-by: Luca Fancellu <luca.fancellu@arm.com>
+Tested-by: Luca Fancellu <luca.fancellu@arm.com>
+Acked-by: Julien Grall <jgrall@amazon.com>
+(cherry picked from commit e11f5766503c0ff074b4e0f888bbfc931518a169)
+---
+ xen/arch/arm/setup.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c
+index db748839d3..2ccdde5277 100644
+--- a/xen/arch/arm/setup.c
++++ b/xen/arch/arm/setup.c
+@@ -1109,7 +1109,7 @@ void __init start_xen(unsigned long boot_phys_offset,
+ 
+     /* Register Xen's load address as a boot module. */
+     xen_bootmodule = add_boot_module(BOOTMOD_XEN,
+-                             (paddr_t)(uintptr_t)(_start + boot_phys_offset),
++                             virt_to_maddr(_start),
+                              (paddr_t)(uintptr_t)(_end - _start), false);
+     BUG_ON(!xen_bootmodule);
+ 
+-- 
+2.44.0
+
+
+From 3e383bb4137c6ca3058cd55cb867ecc2b7414499 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 5 Mar 2024 11:48:39 +0100
+Subject: [PATCH 43/70] x86/HVM: hide SVM/VMX when their enabling is prohibited
+ by firmware
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+... or we fail to enable the functionality on the BSP for other reasons.
+The only place where hardware announcing the feature is recorded is the
+raw CPU policy/featureset.
+
+Inspired by https://lore.kernel.org/all/20230921114940.957141-1-pbonzini@redhat.com/.
+
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 0b5f149338e35a795bf609ce584640b0977f9e6c
+master date: 2024-01-09 14:06:34 +0100
+---
+ xen/arch/x86/hvm/svm/svm.c  |  1 +
+ xen/arch/x86/hvm/vmx/vmcs.c | 17 +++++++++++++++++
+ 2 files changed, 18 insertions(+)
+
+diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
+index 24c417ca71..ff991c82cf 100644
+--- a/xen/arch/x86/hvm/svm/svm.c
++++ b/xen/arch/x86/hvm/svm/svm.c
+@@ -2543,6 +2543,7 @@ const struct hvm_function_table * __init start_svm(void)
+ 
+     if ( _svm_cpu_up(true) )
+     {
++        setup_clear_cpu_cap(X86_FEATURE_SVM);
+         printk("SVM: failed to initialise.\n");
+         return NULL;
+     }
+diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
+index 13719cc923..e382aa16c5 100644
+--- a/xen/arch/x86/hvm/vmx/vmcs.c
++++ b/xen/arch/x86/hvm/vmx/vmcs.c
+@@ -2165,6 +2165,23 @@ int __init vmx_vmcs_init(void)
+ 
+     if ( !ret )
+         register_keyhandler('v', vmcs_dump, "dump VT-x VMCSs", 1);
++    else
++    {
++        setup_clear_cpu_cap(X86_FEATURE_VMX);
++
++        /*
++         * _vmx_vcpu_up() may have made it past feature identification.
++         * Make sure all dependent features are off as well.
++         */
++        vmx_basic_msr              = 0;
++        vmx_pin_based_exec_control = 0;
++        vmx_cpu_based_exec_control = 0;
++        vmx_secondary_exec_control = 0;
++        vmx_vmexit_control         = 0;
++        vmx_vmentry_control        = 0;
++        vmx_ept_vpid_cap           = 0;
++        vmx_vmfunc                 = 0;
++    }
+ 
+     return ret;
+ }
+-- 
+2.44.0
+
+
+From 57f137053652d5a981ae21f3abe7becc507fe434 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Tue, 5 Mar 2024 11:49:22 +0100
+Subject: [PATCH 44/70] xen/sched: Fix UB shift in compat_set_timer_op()
+
+Tamas reported this UBSAN failure from fuzzing:
+
+  (XEN) ================================================================================
+  (XEN) UBSAN: Undefined behaviour in common/sched/compat.c:48:37
+  (XEN) left shift of negative value -2147425536
+  (XEN) ----[ Xen-4.19-unstable  x86_64  debug=y ubsan=y  Not tainted ]----
+  ...
+  (XEN) Xen call trace:
+  (XEN)    [<ffff82d040307c1c>] R ubsan.c#ubsan_epilogue+0xa/0xd9
+  (XEN)    [<ffff82d040308afb>] F __ubsan_handle_shift_out_of_bounds+0x11a/0x1c5
+  (XEN)    [<ffff82d040307758>] F compat_set_timer_op+0x41/0x43
+  (XEN)    [<ffff82d04040e4cc>] F hvm_do_multicall_call+0x77f/0xa75
+  (XEN)    [<ffff82d040519462>] F arch_do_multicall_call+0xec/0xf1
+  (XEN)    [<ffff82d040261567>] F do_multicall+0x1dc/0xde3
+  (XEN)    [<ffff82d04040d2b3>] F hvm_hypercall+0xa00/0x149a
+  (XEN)    [<ffff82d0403cd072>] F vmx_vmexit_handler+0x1596/0x279c
+  (XEN)    [<ffff82d0403d909b>] F vmx_asm_vmexit_handler+0xdb/0x200
+
+Left-shifting any negative value is strictly undefined behaviour in C, and
+the two parameters here come straight from the guest.
+
+The fuzzer happened to choose lo 0xf, hi 0x8000e300.
+
+Switch everything to be unsigned values, making the shift well defined.
+
+As GCC documents:
+
+  As an extension to the C language, GCC does not use the latitude given in
+  C99 and C11 only to treat certain aspects of signed '<<' as undefined.
+  However, -fsanitize=shift (and -fsanitize=undefined) will diagnose such
+  cases.
+
+this was deemed not to need an XSA.
+
+Note: The unsigned -> signed conversion for do_set_timer_op()'s s_time_t
+parameter is also well defined.  C makes it implementation defined, and GCC
+defines it as reduction modulo 2^N to be within range of the new type.
+
+Fixes: 2942f45e09fb ("Enable compatibility mode operation for HYPERVISOR_sched_op and HYPERVISOR_set_timer_op.")
+Reported-by: Tamas K Lengyel <tamas@tklengyel.com>
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: ae6d4fd876765e6d623eec67d14f5d0464be09cb
+master date: 2024-02-01 19:52:44 +0000
+---
+ xen/common/sched/compat.c    | 4 ++--
+ xen/include/hypercall-defs.c | 2 +-
+ 2 files changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/xen/common/sched/compat.c b/xen/common/sched/compat.c
+index d718e450d4..dd97593630 100644
+--- a/xen/common/sched/compat.c
++++ b/xen/common/sched/compat.c
+@@ -43,9 +43,9 @@ static int compat_poll(struct compat_sched_poll *compat)
+ 
+ #include "core.c"
+ 
+-int compat_set_timer_op(uint32_t lo, int32_t hi)
++int compat_set_timer_op(uint32_t lo, uint32_t hi)
+ {
+-    return do_set_timer_op(((s64)hi << 32) | lo);
++    return do_set_timer_op(((uint64_t)hi << 32) | lo);
+ }
+ 
+ #endif /* __COMMON_SCHED_COMPAT_C__ */
+diff --git a/xen/include/hypercall-defs.c b/xen/include/hypercall-defs.c
+index 6d361ddfce..47c093acc8 100644
+--- a/xen/include/hypercall-defs.c
++++ b/xen/include/hypercall-defs.c
+@@ -134,7 +134,7 @@ xenoprof_op(int op, void *arg)
+ 
+ #ifdef CONFIG_COMPAT
+ prefix: compat
+-set_timer_op(uint32_t lo, int32_t hi)
++set_timer_op(uint32_t lo, uint32_t hi)
+ multicall(multicall_entry_compat_t *call_list, uint32_t nr_calls)
+ memory_op(unsigned int cmd, void *arg)
+ #ifdef CONFIG_IOREQ_SERVER
+-- 
+2.44.0
+
+
+From b7f9168878155e2d29b9b4a3048b0a9a68ed82ed Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 5 Mar 2024 11:50:16 +0100
+Subject: [PATCH 45/70] x86/spec: print the built-in SPECULATIVE_HARDEN_*
+ options
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Just like it's done for INDIRECT_THUNK and SHADOW_PAGING.
+
+Reported-by: Jan Beulich <jbeulich@suse.com>
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: 6e9507f7d51fe49df8bc70f83e49ce06c92e4e54
+master date: 2024-02-27 14:57:52 +0100
+---
+ xen/arch/x86/spec_ctrl.c | 14 +++++++++++++-
+ 1 file changed, 13 insertions(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 01ba59cff7..04e508b622 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -476,13 +476,25 @@ static void __init print_details(enum ind_thunk thunk)
+            (e21a & cpufeat_mask(X86_FEATURE_SBPB))           ? " SBPB"           : "");
+ 
+     /* Compiled-in support which pertains to mitigations. */
+-    if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) )
++    if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) ||
++         IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_ARRAY) ||
++         IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH) ||
++         IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS) )
+         printk("  Compiled-in support:"
+ #ifdef CONFIG_INDIRECT_THUNK
+                " INDIRECT_THUNK"
+ #endif
+ #ifdef CONFIG_SHADOW_PAGING
+                " SHADOW_PAGING"
++#endif
++#ifdef CONFIG_SPECULATIVE_HARDEN_ARRAY
++               " HARDEN_ARRAY"
++#endif
++#ifdef CONFIG_SPECULATIVE_HARDEN_BRANCH
++               " HARDEN_BRANCH"
++#endif
++#ifdef CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS
++               " HARDEN_GUEST_ACCESS"
+ #endif
+                "\n");
+ 
+-- 
+2.44.0
+
+
+From 09b9db0413b1f31f27bece07b2bfa1723b89ace6 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 5 Mar 2024 11:50:53 +0100
+Subject: [PATCH 46/70] x86/spec: fix INDIRECT_THUNK option to only be set when
+ build-enabled
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Attempt to provide a more helpful error message when the user attempts to set
+the spec-ctrl=bti-thunk option but the support is build-time disabled.
+
+While there also adjust the command line documentation to mention
+CONFIG_INDIRECT_THUNK instead of INDIRECT_THUNK.
+
+Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: 8441fa806a3b778867867cd0159fa1722e90397e
+master date: 2024-02-27 14:58:20 +0100
+---
+ docs/misc/xen-command-line.pandoc | 10 +++++-----
+ xen/arch/x86/spec_ctrl.c          |  7 ++++++-
+ 2 files changed, 11 insertions(+), 6 deletions(-)
+
+diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
+index 8e65f8bd18..582d6741d1 100644
+--- a/docs/misc/xen-command-line.pandoc
++++ b/docs/misc/xen-command-line.pandoc
+@@ -2424,11 +2424,11 @@ guests to use.
+   performance reasons dom0 is unprotected by default.  If it is necessary to
+   protect dom0 too, boot with `spec-ctrl=ibpb-entry`.
+ 
+-If Xen was compiled with INDIRECT_THUNK support, `bti-thunk=` can be used to
+-select which of the thunks gets patched into the `__x86_indirect_thunk_%reg`
+-locations.  The default thunk is `retpoline` (generally preferred), with the
+-alternatives being `jmp` (a `jmp *%reg` gadget, minimal overhead), and
+-`lfence` (an `lfence; jmp *%reg` gadget).
++If Xen was compiled with `CONFIG_INDIRECT_THUNK` support, `bti-thunk=` can be
++used to select which of the thunks gets patched into the
++`__x86_indirect_thunk_%reg` locations.  The default thunk is `retpoline`
++(generally preferred), with the alternatives being `jmp` (a `jmp *%reg` gadget,
++minimal overhead), and `lfence` (an `lfence; jmp *%reg` gadget).
+ 
+ On hardware supporting IBRS (Indirect Branch Restricted Speculation), the
+ `ibrs=` option can be used to force or prevent Xen using the feature itself.
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 04e508b622..99ecfb3cba 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -241,7 +241,12 @@ static int __init cf_check parse_spec_ctrl(const char *s)
+         {
+             s += 10;
+ 
+-            if ( !cmdline_strcmp(s, "retpoline") )
++            if ( !IS_ENABLED(CONFIG_INDIRECT_THUNK) )
++            {
++                no_config_param("INDIRECT_THUNK", "spec-ctrl", s - 10, ss);
++                rc = -EINVAL;
++            }
++            else if ( !cmdline_strcmp(s, "retpoline") )
+                 opt_thunk = THUNK_RETPOLINE;
+             else if ( !cmdline_strcmp(s, "lfence") )
+                 opt_thunk = THUNK_LFENCE;
+-- 
+2.44.0
+
+
+From 7404c25efdc70091817479b80dbbd945e6ab4861 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 5 Mar 2024 11:51:56 +0100
+Subject: [PATCH 47/70] x86/spec: do not print thunk option selection if not
+ built-in
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Since the thunk built-in enable is printed as part of the "Compiled-in
+support:" line, avoid printing anything in "Xen settings:" if the thunk is
+disabled at build time.
+
+Note the BTI-Thunk option printing is also adjusted to print a colon in the
+same way the other options on the line do.
+
+Requested-by: Jan Beulich <jbeulich@suse.com>
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+master commit: 576528a2a742069af203e90c613c5c93e23c9755
+master date: 2024-02-27 14:58:40 +0100
+---
+ xen/arch/x86/spec_ctrl.c | 11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 99ecfb3cba..a965b6db28 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -504,11 +504,12 @@ static void __init print_details(enum ind_thunk thunk)
+                "\n");
+ 
+     /* Settings for Xen's protection, irrespective of guests. */
+-    printk("  Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n",
+-           thunk == THUNK_NONE      ? "N/A" :
+-           thunk == THUNK_RETPOLINE ? "RETPOLINE" :
+-           thunk == THUNK_LFENCE    ? "LFENCE" :
+-           thunk == THUNK_JMP       ? "JMP" : "?",
++    printk("  Xen settings: %s%sSPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n",
++           thunk != THUNK_NONE      ? "BTI-Thunk: " : "",
++           thunk == THUNK_NONE      ? "" :
++           thunk == THUNK_RETPOLINE ? "RETPOLINE, " :
++           thunk == THUNK_LFENCE    ? "LFENCE, " :
++           thunk == THUNK_JMP       ? "JMP, " : "?, ",
+            (!boot_cpu_has(X86_FEATURE_IBRSB) &&
+             !boot_cpu_has(X86_FEATURE_IBRS))         ? "No" :
+            (default_xen_spec_ctrl & SPEC_CTRL_IBRS)  ? "IBRS+" :  "IBRS-",
+-- 
+2.44.0
+
+
+From 5382a6a79cb544f2eecc47330b531802f8c52977 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 5 Mar 2024 11:52:57 +0100
+Subject: [PATCH 48/70] xen/livepatch: register livepatch regions when loaded
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Currently livepatch regions are registered as virtual regions only after the
+livepatch has been applied.
+
+This can lead to issues when using the pre-apply or post-revert hooks, as at
+that point the livepatch is not in the virtual regions list.  If a livepatch
+pre-apply hook contains a WARN() it would trigger a hypervisor crash, as the
+code to handle the bug frame won't be able to find the instruction pointer that
+triggered the #UD in any of the registered virtual regions, and hence crash.
+
+Fix this by adding the livepatch payloads as virtual regions as soon as loaded,
+and only remove them once the payload is unloaded.  This requires some changes
+to the virtual regions code, as the removal of the virtual regions is no longer
+done in stop machine context, and hence an RCU barrier is added in order to
+make sure there are no users of the virtual region after it's been removed from
+the list.
+
+Fixes: 8313c864fa95 ('livepatch: Implement pre-|post- apply|revert hooks')
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Ross Lagerwall <ross.lagerwall@citrix.com>
+master commit: a57b4074ab39bee78b6c116277f0a9963bd8e687
+master date: 2024-02-28 16:57:25 +0000
+---
+ xen/common/livepatch.c      |  4 ++--
+ xen/common/virtual_region.c | 44 ++++++++++++++-----------------------
+ 2 files changed, 19 insertions(+), 29 deletions(-)
+
+diff --git a/xen/common/livepatch.c b/xen/common/livepatch.c
+index e635606c10..e1964b841a 100644
+--- a/xen/common/livepatch.c
++++ b/xen/common/livepatch.c
+@@ -1071,6 +1071,7 @@ static int build_symbol_table(struct payload *payload,
+ static void free_payload(struct payload *data)
+ {
+     ASSERT(spin_is_locked(&payload_lock));
++    unregister_virtual_region(&data->region);
+     list_del(&data->list);
+     payload_cnt--;
+     payload_version++;
+@@ -1170,6 +1171,7 @@ static int livepatch_upload(struct xen_sysctl_livepatch_upload *upload)
+         INIT_LIST_HEAD(&data->list);
+         INIT_LIST_HEAD(&data->applied_list);
+ 
++        register_virtual_region(&data->region);
+         list_add_tail(&data->list, &payload_list);
+         payload_cnt++;
+         payload_version++;
+@@ -1386,7 +1388,6 @@ static inline void apply_payload_tail(struct payload *data)
+      * The applied_list is iterated by the trap code.
+      */
+     list_add_tail_rcu(&data->applied_list, &applied_list);
+-    register_virtual_region(&data->region);
+ 
+     data->state = LIVEPATCH_STATE_APPLIED;
+ }
+@@ -1432,7 +1433,6 @@ static inline void revert_payload_tail(struct payload *data)
+      * The applied_list is iterated by the trap code.
+      */
+     list_del_rcu(&data->applied_list);
+-    unregister_virtual_region(&data->region);
+ 
+     data->reverted = true;
+     data->state = LIVEPATCH_STATE_CHECKED;
+diff --git a/xen/common/virtual_region.c b/xen/common/virtual_region.c
+index 5f89703f51..9f12c30efe 100644
+--- a/xen/common/virtual_region.c
++++ b/xen/common/virtual_region.c
+@@ -23,14 +23,8 @@ static struct virtual_region core_init __initdata = {
+ };
+ 
+ /*
+- * RCU locking. Additions are done either at startup (when there is only
+- * one CPU) or when all CPUs are running without IRQs.
+- *
+- * Deletions are bit tricky. We do it when Live Patch (all CPUs running
+- * without IRQs) or during bootup (when clearing the init).
+- *
+- * Hence we use list_del_rcu (which sports an memory fence) and a spinlock
+- * on deletion.
++ * RCU locking. Modifications to the list must be done in exclusive mode, and
++ * hence need to hold the spinlock.
+  *
+  * All readers of virtual_region_list MUST use list_for_each_entry_rcu.
+  */
+@@ -58,41 +52,36 @@ const struct virtual_region *find_text_region(unsigned long addr)
+ 
+ void register_virtual_region(struct virtual_region *r)
+ {
+-    ASSERT(!local_irq_is_enabled());
++    unsigned long flags;
+ 
++    spin_lock_irqsave(&virtual_region_lock, flags);
+     list_add_tail_rcu(&r->list, &virtual_region_list);
++    spin_unlock_irqrestore(&virtual_region_lock, flags);
+ }
+ 
+-static void remove_virtual_region(struct virtual_region *r)
++/*
++ * Suggest inline so when !CONFIG_LIVEPATCH the function is not left
++ * unreachable after init code is removed.
++ */
++static void inline remove_virtual_region(struct virtual_region *r)
+ {
+     unsigned long flags;
+ 
+     spin_lock_irqsave(&virtual_region_lock, flags);
+     list_del_rcu(&r->list);
+     spin_unlock_irqrestore(&virtual_region_lock, flags);
+-    /*
+-     * We do not need to invoke call_rcu.
+-     *
+-     * This is due to the fact that on the deletion we have made sure
+-     * to use spinlocks (to guard against somebody else calling
+-     * unregister_virtual_region) and list_deletion spiced with
+-     * memory barrier.
+-     *
+-     * That protects us from corrupting the list as the readers all
+-     * use list_for_each_entry_rcu which is safe against concurrent
+-     * deletions.
+-     */
+ }
+ 
++#ifdef CONFIG_LIVEPATCH
+ void unregister_virtual_region(struct virtual_region *r)
+ {
+-    /* Expected to be called from Live Patch - which has IRQs disabled. */
+-    ASSERT(!local_irq_is_enabled());
+-
+     remove_virtual_region(r);
++
++    /* Assert that no CPU might be using the removed region. */
++    rcu_barrier();
+ }
+ 
+-#if defined(CONFIG_LIVEPATCH) && defined(CONFIG_X86)
++#ifdef CONFIG_X86
+ void relax_virtual_region_perms(void)
+ {
+     const struct virtual_region *region;
+@@ -116,7 +105,8 @@ void tighten_virtual_region_perms(void)
+                                  PAGE_HYPERVISOR_RX);
+     rcu_read_unlock(&rcu_virtual_region_lock);
+ }
+-#endif
++#endif /* CONFIG_X86 */
++#endif /* CONFIG_LIVEPATCH */
+ 
+ void __init unregister_init_virtual_region(void)
+ {
+-- 
+2.44.0
+
+
+From 50a8f74df76b7ce7c35ad97a539f505eb0a9baa6 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 5 Mar 2024 11:53:05 +0100
+Subject: [PATCH 49/70] xen/livepatch: search for symbols in all loaded
+ payloads
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+When checking if an address belongs to a patch, or when resolving a symbol,
+take into account all loaded livepatch payloads, even if not applied.
+
+This is required in order for the pre-apply and post-revert hooks to work
+properly, or else Xen won't detect the instruction pointer belonging to those
+hooks as being part of the currently active text.
+
+Move the RCU handling to be used for payload_list instead of applied_list, as
+now the calls from trap code will iterate over the payload_list.
+
+Fixes: 8313c864fa95 ('livepatch: Implement pre-|post- apply|revert hooks')
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Ross Lagerwall <ross.lagerwall@citrix.com>
+master commit: d2daa40fb3ddb8f83e238e57854bd878924cde90
+master date: 2024-02-28 16:57:25 +0000
+---
+ xen/common/livepatch.c | 49 +++++++++++++++---------------------------
+ 1 file changed, 17 insertions(+), 32 deletions(-)
+
+diff --git a/xen/common/livepatch.c b/xen/common/livepatch.c
+index e1964b841a..135c47e9b8 100644
+--- a/xen/common/livepatch.c
++++ b/xen/common/livepatch.c
+@@ -36,13 +36,14 @@
+  * caller in schedule_work.
+  */
+ static DEFINE_SPINLOCK(payload_lock);
+-static LIST_HEAD(payload_list);
+-
+ /*
+- * Patches which have been applied. Need RCU in case we crash (and then
+- * traps code would iterate via applied_list) when adding entries on the list.
++ * Need RCU in case we crash (and then traps code would iterate via
++ * payload_list) when adding entries on the list.
+  */
+-static DEFINE_RCU_READ_LOCK(rcu_applied_lock);
++static DEFINE_RCU_READ_LOCK(rcu_payload_lock);
++static LIST_HEAD(payload_list);
++
++/* Patches which have been applied. Only modified from stop machine context. */
+ static LIST_HEAD(applied_list);
+ 
+ static unsigned int payload_cnt;
+@@ -111,12 +112,8 @@ bool_t is_patch(const void *ptr)
+     const struct payload *data;
+     bool_t r = 0;
+ 
+-    /*
+-     * Only RCU locking since this list is only ever changed during apply
+-     * or revert context. And in case it dies there we need an safe list.
+-     */
+-    rcu_read_lock(&rcu_applied_lock);
+-    list_for_each_entry_rcu ( data, &applied_list, applied_list )
++    rcu_read_lock(&rcu_payload_lock);
++    list_for_each_entry_rcu ( data, &payload_list, list )
+     {
+         if ( (ptr >= data->rw_addr &&
+               ptr < (data->rw_addr + data->rw_size)) ||
+@@ -130,7 +127,7 @@ bool_t is_patch(const void *ptr)
+         }
+ 
+     }
+-    rcu_read_unlock(&rcu_applied_lock);
++    rcu_read_unlock(&rcu_payload_lock);
+ 
+     return r;
+ }
+@@ -166,12 +163,8 @@ static const char *cf_check livepatch_symbols_lookup(
+     const void *va = (const void *)addr;
+     const char *n = NULL;
+ 
+-    /*
+-     * Only RCU locking since this list is only ever changed during apply
+-     * or revert context. And in case it dies there we need an safe list.
+-     */
+-    rcu_read_lock(&rcu_applied_lock);
+-    list_for_each_entry_rcu ( data, &applied_list, applied_list )
++    rcu_read_lock(&rcu_payload_lock);
++    list_for_each_entry_rcu ( data, &payload_list, list )
+     {
+         if ( va < data->text_addr ||
+              va >= (data->text_addr + data->text_size) )
+@@ -200,7 +193,7 @@ static const char *cf_check livepatch_symbols_lookup(
+         n = data->symtab[best].name;
+         break;
+     }
+-    rcu_read_unlock(&rcu_applied_lock);
++    rcu_read_unlock(&rcu_payload_lock);
+ 
+     return n;
+ }
+@@ -1072,7 +1065,8 @@ static void free_payload(struct payload *data)
+ {
+     ASSERT(spin_is_locked(&payload_lock));
+     unregister_virtual_region(&data->region);
+-    list_del(&data->list);
++    list_del_rcu(&data->list);
++    rcu_barrier();
+     payload_cnt--;
+     payload_version++;
+     free_payload_data(data);
+@@ -1172,7 +1166,7 @@ static int livepatch_upload(struct xen_sysctl_livepatch_upload *upload)
+         INIT_LIST_HEAD(&data->applied_list);
+ 
+         register_virtual_region(&data->region);
+-        list_add_tail(&data->list, &payload_list);
++        list_add_tail_rcu(&data->list, &payload_list);
+         payload_cnt++;
+         payload_version++;
+     }
+@@ -1383,11 +1377,7 @@ static int apply_payload(struct payload *data)
+ 
+ static inline void apply_payload_tail(struct payload *data)
+ {
+-    /*
+-     * We need RCU variant (which has barriers) in case we crash here.
+-     * The applied_list is iterated by the trap code.
+-     */
+-    list_add_tail_rcu(&data->applied_list, &applied_list);
++    list_add_tail(&data->applied_list, &applied_list);
+ 
+     data->state = LIVEPATCH_STATE_APPLIED;
+ }
+@@ -1427,12 +1417,7 @@ static int revert_payload(struct payload *data)
+ 
+ static inline void revert_payload_tail(struct payload *data)
+ {
+-
+-    /*
+-     * We need RCU variant (which has barriers) in case we crash here.
+-     * The applied_list is iterated by the trap code.
+-     */
+-    list_del_rcu(&data->applied_list);
++    list_del(&data->applied_list);
+ 
+     data->reverted = true;
+     data->state = LIVEPATCH_STATE_CHECKED;
+-- 
+2.44.0
+
+
+From d81bfc7ff887426727504086fa363f91bf8c19f8 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 5 Mar 2024 11:53:13 +0100
+Subject: [PATCH 50/70] xen/livepatch: fix norevert test attempt to open-code
+ revert
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The purpose of the norevert test is to install a dummy handler that replaces
+the internal Xen revert code, and then perform the revert in the post-revert
+hook.  For that purpose the usage of the previous common_livepatch_revert() is
+not enough, as that just reverts specific functions, but not the whole state of
+the payload.
+
+Remove both common_livepatch_{apply,revert}() and instead expose
+revert_payload{,_tail}() in order to perform the patch revert from the
+post-revert hook.
+
+Fixes: 6047104c3ccc ('livepatch: Add per-function applied/reverted state tracking marker')
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Ross Lagerwall <ross.lagerwall@citrix.com>
+master commit: cdae267ce10d04d71d1687b5701ff2911a96b6dc
+master date: 2024-02-28 16:57:25 +0000
+---
+ xen/common/livepatch.c                        | 41 +++++++++++++++++--
+ xen/include/xen/livepatch.h                   | 32 ++-------------
+ .../livepatch/xen_action_hooks_norevert.c     | 22 +++-------
+ 3 files changed, 46 insertions(+), 49 deletions(-)
+
+diff --git a/xen/common/livepatch.c b/xen/common/livepatch.c
+index 135c47e9b8..0cc048fd83 100644
+--- a/xen/common/livepatch.c
++++ b/xen/common/livepatch.c
+@@ -1366,7 +1366,22 @@ static int apply_payload(struct payload *data)
+     ASSERT(!local_irq_is_enabled());
+ 
+     for ( i = 0; i < data->nfuncs; i++ )
+-        common_livepatch_apply(&data->funcs[i], &data->fstate[i]);
++    {
++        const struct livepatch_func *func = &data->funcs[i];
++        struct livepatch_fstate *state = &data->fstate[i];
++
++        /* If the action has been already executed on this function, do nothing. */
++        if ( state->applied == LIVEPATCH_FUNC_APPLIED )
++        {
++            printk(XENLOG_WARNING LIVEPATCH
++                   "%s: %s has been already applied before\n",
++                   __func__, func->name);
++            continue;
++        }
++
++        arch_livepatch_apply(func, state);
++        state->applied = LIVEPATCH_FUNC_APPLIED;
++    }
+ 
+     arch_livepatch_revive();
+ 
+@@ -1382,7 +1397,7 @@ static inline void apply_payload_tail(struct payload *data)
+     data->state = LIVEPATCH_STATE_APPLIED;
+ }
+ 
+-static int revert_payload(struct payload *data)
++int revert_payload(struct payload *data)
+ {
+     unsigned int i;
+     int rc;
+@@ -1397,7 +1412,25 @@ static int revert_payload(struct payload *data)
+     }
+ 
+     for ( i = 0; i < data->nfuncs; i++ )
+-        common_livepatch_revert(&data->funcs[i], &data->fstate[i]);
++    {
++        const struct livepatch_func *func = &data->funcs[i];
++        struct livepatch_fstate *state = &data->fstate[i];
++
++        /*
++         * If the apply action hasn't been executed on this function, do
++         * nothing.
++         */
++        if ( !func->old_addr || state->applied == LIVEPATCH_FUNC_NOT_APPLIED )
++        {
++            printk(XENLOG_WARNING LIVEPATCH
++                   "%s: %s has not been applied before\n",
++                   __func__, func->name);
++            continue;
++        }
++
++        arch_livepatch_revert(func, state);
++        state->applied = LIVEPATCH_FUNC_NOT_APPLIED;
++    }
+ 
+     /*
+      * Since we are running with IRQs disabled and the hooks may call common
+@@ -1415,7 +1448,7 @@ static int revert_payload(struct payload *data)
+     return 0;
+ }
+ 
+-static inline void revert_payload_tail(struct payload *data)
++void revert_payload_tail(struct payload *data)
+ {
+     list_del(&data->applied_list);
+ 
+diff --git a/xen/include/xen/livepatch.h b/xen/include/xen/livepatch.h
+index 537d3d58b6..c9ee58fd37 100644
+--- a/xen/include/xen/livepatch.h
++++ b/xen/include/xen/livepatch.h
+@@ -136,35 +136,11 @@ void arch_livepatch_post_action(void);
+ void arch_livepatch_mask(void);
+ void arch_livepatch_unmask(void);
+ 
+-static inline void common_livepatch_apply(const struct livepatch_func *func,
+-                                          struct livepatch_fstate *state)
+-{
+-    /* If the action has been already executed on this function, do nothing. */
+-    if ( state->applied == LIVEPATCH_FUNC_APPLIED )
+-    {
+-        printk(XENLOG_WARNING LIVEPATCH "%s: %s has been already applied before\n",
+-                __func__, func->name);
+-        return;
+-    }
+-
+-    arch_livepatch_apply(func, state);
+-    state->applied = LIVEPATCH_FUNC_APPLIED;
+-}
++/* Only for testing purposes. */
++struct payload;
++int revert_payload(struct payload *data);
++void revert_payload_tail(struct payload *data);
+ 
+-static inline void common_livepatch_revert(const struct livepatch_func *func,
+-                                           struct livepatch_fstate *state)
+-{
+-    /* If the apply action hasn't been executed on this function, do nothing. */
+-    if ( !func->old_addr || state->applied == LIVEPATCH_FUNC_NOT_APPLIED )
+-    {
+-        printk(XENLOG_WARNING LIVEPATCH "%s: %s has not been applied before\n",
+-                __func__, func->name);
+-        return;
+-    }
+-
+-    arch_livepatch_revert(func, state);
+-    state->applied = LIVEPATCH_FUNC_NOT_APPLIED;
+-}
+ #else
+ 
+ /*
+diff --git a/xen/test/livepatch/xen_action_hooks_norevert.c b/xen/test/livepatch/xen_action_hooks_norevert.c
+index c173855192..c5fbab1746 100644
+--- a/xen/test/livepatch/xen_action_hooks_norevert.c
++++ b/xen/test/livepatch/xen_action_hooks_norevert.c
+@@ -96,26 +96,14 @@ static int revert_hook(livepatch_payload_t *payload)
+ 
+ static void post_revert_hook(livepatch_payload_t *payload)
+ {
+-    int i;
++    unsigned long flags;
+ 
+     printk(KERN_DEBUG "%s: Hook starting.\n", __func__);
+ 
+-    for (i = 0; i < payload->nfuncs; i++)
+-    {
+-        const struct livepatch_func *func = &payload->funcs[i];
+-        struct livepatch_fstate *fstate = &payload->fstate[i];
+-
+-        BUG_ON(revert_cnt != 1);
+-        BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED);
+-
+-        /* Outside of quiesce zone: MAY TRIGGER HOST CRASH/UNDEFINED BEHAVIOR */
+-        arch_livepatch_quiesce();
+-        common_livepatch_revert(payload);
+-        arch_livepatch_revive();
+-        BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED);
+-
+-        printk(KERN_DEBUG "%s: post reverted: %s\n", __func__, func->name);
+-    }
++    local_irq_save(flags);
++    BUG_ON(revert_payload(payload));
++    revert_payload_tail(payload);
++    local_irq_restore(flags);
+ 
+     printk(KERN_DEBUG "%s: Hook done.\n", __func__);
+ }
+-- 
+2.44.0
+
+
+From e9516b73e7d499684092c1d345818585403cf190 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 5 Mar 2024 11:53:22 +0100
+Subject: [PATCH 51/70] xen/livepatch: properly build the noapply and norevert
+ tests
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+It seems the build variables for those tests were copy-pasted from
+xen_action_hooks_marker-objs and not adjusted to use the correct source files.
+
+Fixes: 6047104c3ccc ('livepatch: Add per-function applied/reverted state tracking marker')
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Ross Lagerwall <ross.lagerwall@citrix.com>
+master commit: e579677095782c7dec792597ba8b037b7d716b32
+master date: 2024-02-28 16:57:25 +0000
+---
+ xen/test/livepatch/Makefile | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/xen/test/livepatch/Makefile b/xen/test/livepatch/Makefile
+index c258ab0b59..d987a8367f 100644
+--- a/xen/test/livepatch/Makefile
++++ b/xen/test/livepatch/Makefile
+@@ -118,12 +118,12 @@ xen_action_hooks_marker-objs := xen_action_hooks_marker.o xen_hello_world_func.o
+ $(obj)/xen_action_hooks_noapply.o: $(obj)/config.h
+ 
+ extra-y += xen_action_hooks_noapply.livepatch
+-xen_action_hooks_noapply-objs := xen_action_hooks_marker.o xen_hello_world_func.o note.o xen_note.o
++xen_action_hooks_noapply-objs := xen_action_hooks_noapply.o xen_hello_world_func.o note.o xen_note.o
+ 
+ $(obj)/xen_action_hooks_norevert.o: $(obj)/config.h
+ 
+ extra-y += xen_action_hooks_norevert.livepatch
+-xen_action_hooks_norevert-objs := xen_action_hooks_marker.o xen_hello_world_func.o note.o xen_note.o
++xen_action_hooks_norevert-objs := xen_action_hooks_norevert.o xen_hello_world_func.o note.o xen_note.o
+ 
+ EXPECT_BYTES_COUNT := 8
+ CODE_GET_EXPECT=$(shell $(OBJDUMP) -d --insn-width=1 $(1) | sed -n -e '/<'$(2)'>:$$/,/^$$/ p' | tail -n +2 | head -n $(EXPECT_BYTES_COUNT) | awk '{$$0=$$2; printf "%s", substr($$0,length-1)}' | sed 's/.\{2\}/0x&,/g' | sed 's/^/{/;s/,$$/}/g')
+-- 
+2.44.0
+
+
+From 267845a8389d5d34edb2b38a1972f32f51f70b4e Mon Sep 17 00:00:00 2001
+From: Jason Andryuk <jandryuk@gmail.com>
+Date: Tue, 5 Mar 2024 11:54:12 +0100
+Subject: [PATCH 52/70] libxl: Fix segfault in device_model_spawn_outcome
+
+libxl__spawn_qdisk_backend() explicitly sets guest_config to NULL when
+starting QEMU (the usual launch through libxl__spawn_local_dm() has a
+guest_config though).
+
+Bail early on a NULL guest_config/d_config.  This skips the QMP queries
+for chardevs and VNC, but this xenpv QEMU instance isn't expected to
+provide those - only qdisk (or 9pfs backends after an upcoming change).
+
+Signed-off-by: Jason Andryuk <jandryuk@gmail.com>
+Acked-by: Anthony PERARD <anthony.perard@citrix.com>
+master commit: d4f3d35f043f6ef29393166b0dd131c8102cf255
+master date: 2024-02-29 08:18:38 +0100
+---
+ tools/libs/light/libxl_dm.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c
+index ed620a9d8e..29b43ed20a 100644
+--- a/tools/libs/light/libxl_dm.c
++++ b/tools/libs/light/libxl_dm.c
+@@ -3172,8 +3172,8 @@ static void device_model_spawn_outcome(libxl__egc *egc,
+ 
+     /* Check if spawn failed */
+     if (rc) goto out;
+-
+-    if (d_config->b_info.device_model_version
++    /* d_config is NULL for xl devd/libxl__spawn_qemu_xenpv_backend(). */
++    if (d_config && d_config->b_info.device_model_version
+             == LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN) {
+         rc = libxl__ev_time_register_rel(ao, &dmss->timeout,
+                                          devise_model_postconfig_timeout,
+-- 
+2.44.0
+
+
+From 75221fb0f87e4d7278b0a540bc28a6d0b74afeba Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 5 Mar 2024 11:54:33 +0100
+Subject: [PATCH 53/70] x86/altcall: always use a temporary parameter stashing
+ variable
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The usage in ALT_CALL_ARG() on clang of:
+
+register union {
+    typeof(arg) e;
+    const unsigned long r;
+} ...
+
+When `arg` is the first argument to alternative_{,v}call() and
+const_vlapic_vcpu() is used results in clang 3.5.0 complaining with:
+
+arch/x86/hvm/vlapic.c:141:47: error: non-const static data member must be initialized out of line
+         alternative_call(hvm_funcs.test_pir, const_vlapic_vcpu(vlapic), vec) )
+
+Work around this by pulling `arg1` into a local variable, like it's done for
+further arguments (arg2, arg3...)
+
+Originally arg1 wasn't pulled into a variable because, for the a1_ register
+local variable, possible clobbering as a result of operators on other
+variables doesn't matter:
+
+https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
+
+Note clang version 3.8.1 seems to already be fixed and doesn't require the
+workaround, but since it's harmless do it uniformly everywhere.
+
+Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Fixes: 2ce562b2a413 ('x86/altcall: use a union as register type for function parameters on clang')
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+master commit: c20850540ad6a32f4fc17bde9b01c92b0df18bf0
+master date: 2024-02-29 08:21:49 +0100
+---
+ xen/arch/x86/include/asm/alternative.h | 36 +++++++++++++++++---------
+ 1 file changed, 24 insertions(+), 12 deletions(-)
+
+diff --git a/xen/arch/x86/include/asm/alternative.h b/xen/arch/x86/include/asm/alternative.h
+index 3c14db5078..0d3697f1de 100644
+--- a/xen/arch/x86/include/asm/alternative.h
++++ b/xen/arch/x86/include/asm/alternative.h
+@@ -253,21 +253,24 @@ extern void alternative_branches(void);
+ })
+ 
+ #define alternative_vcall1(func, arg) ({           \
+-    ALT_CALL_ARG(arg, 1);                          \
++    typeof(arg) v1_ = (arg);                       \
++    ALT_CALL_ARG(v1_, 1);                          \
+     ALT_CALL_NO_ARG2;                              \
+     (void)sizeof(func(arg));                       \
+     (void)alternative_callN(1, int, func);         \
+ })
+ 
+ #define alternative_call1(func, arg) ({            \
+-    ALT_CALL_ARG(arg, 1);                          \
++    typeof(arg) v1_ = (arg);                       \
++    ALT_CALL_ARG(v1_, 1);                          \
+     ALT_CALL_NO_ARG2;                              \
+     alternative_callN(1, typeof(func(arg)), func); \
+ })
+ 
+ #define alternative_vcall2(func, arg1, arg2) ({           \
++    typeof(arg1) v1_ = (arg1);                            \
+     typeof(arg2) v2_ = (arg2);                            \
+-    ALT_CALL_ARG(arg1, 1);                                \
++    ALT_CALL_ARG(v1_, 1);                                 \
+     ALT_CALL_ARG(v2_, 2);                                 \
+     ALT_CALL_NO_ARG3;                                     \
+     (void)sizeof(func(arg1, arg2));                       \
+@@ -275,17 +278,19 @@ extern void alternative_branches(void);
+ })
+ 
+ #define alternative_call2(func, arg1, arg2) ({            \
++    typeof(arg1) v1_ = (arg1);                            \
+     typeof(arg2) v2_ = (arg2);                            \
+-    ALT_CALL_ARG(arg1, 1);                                \
++    ALT_CALL_ARG(v1_, 1);                                 \
+     ALT_CALL_ARG(v2_, 2);                                 \
+     ALT_CALL_NO_ARG3;                                     \
+     alternative_callN(2, typeof(func(arg1, arg2)), func); \
+ })
+ 
+ #define alternative_vcall3(func, arg1, arg2, arg3) ({    \
++    typeof(arg1) v1_ = (arg1);                           \
+     typeof(arg2) v2_ = (arg2);                           \
+     typeof(arg3) v3_ = (arg3);                           \
+-    ALT_CALL_ARG(arg1, 1);                               \
++    ALT_CALL_ARG(v1_, 1);                                \
+     ALT_CALL_ARG(v2_, 2);                                \
+     ALT_CALL_ARG(v3_, 3);                                \
+     ALT_CALL_NO_ARG4;                                    \
+@@ -294,9 +299,10 @@ extern void alternative_branches(void);
+ })
+ 
+ #define alternative_call3(func, arg1, arg2, arg3) ({     \
++    typeof(arg1) v1_ = (arg1);                            \
+     typeof(arg2) v2_ = (arg2);                           \
+     typeof(arg3) v3_ = (arg3);                           \
+-    ALT_CALL_ARG(arg1, 1);                               \
++    ALT_CALL_ARG(v1_, 1);                                \
+     ALT_CALL_ARG(v2_, 2);                                \
+     ALT_CALL_ARG(v3_, 3);                                \
+     ALT_CALL_NO_ARG4;                                    \
+@@ -305,10 +311,11 @@ extern void alternative_branches(void);
+ })
+ 
+ #define alternative_vcall4(func, arg1, arg2, arg3, arg4) ({ \
++    typeof(arg1) v1_ = (arg1);                              \
+     typeof(arg2) v2_ = (arg2);                              \
+     typeof(arg3) v3_ = (arg3);                              \
+     typeof(arg4) v4_ = (arg4);                              \
+-    ALT_CALL_ARG(arg1, 1);                                  \
++    ALT_CALL_ARG(v1_, 1);                                   \
+     ALT_CALL_ARG(v2_, 2);                                   \
+     ALT_CALL_ARG(v3_, 3);                                   \
+     ALT_CALL_ARG(v4_, 4);                                   \
+@@ -318,10 +325,11 @@ extern void alternative_branches(void);
+ })
+ 
+ #define alternative_call4(func, arg1, arg2, arg3, arg4) ({  \
++    typeof(arg1) v1_ = (arg1);                              \
+     typeof(arg2) v2_ = (arg2);                              \
+     typeof(arg3) v3_ = (arg3);                              \
+     typeof(arg4) v4_ = (arg4);                              \
+-    ALT_CALL_ARG(arg1, 1);                                  \
++    ALT_CALL_ARG(v1_, 1);                                   \
+     ALT_CALL_ARG(v2_, 2);                                   \
+     ALT_CALL_ARG(v3_, 3);                                   \
+     ALT_CALL_ARG(v4_, 4);                                   \
+@@ -332,11 +340,12 @@ extern void alternative_branches(void);
+ })
+ 
+ #define alternative_vcall5(func, arg1, arg2, arg3, arg4, arg5) ({ \
++    typeof(arg1) v1_ = (arg1);                                    \
+     typeof(arg2) v2_ = (arg2);                                    \
+     typeof(arg3) v3_ = (arg3);                                    \
+     typeof(arg4) v4_ = (arg4);                                    \
+     typeof(arg5) v5_ = (arg5);                                    \
+-    ALT_CALL_ARG(arg1, 1);                                        \
++    ALT_CALL_ARG(v1_, 1);                                         \
+     ALT_CALL_ARG(v2_, 2);                                         \
+     ALT_CALL_ARG(v3_, 3);                                         \
+     ALT_CALL_ARG(v4_, 4);                                         \
+@@ -347,11 +356,12 @@ extern void alternative_branches(void);
+ })
+ 
+ #define alternative_call5(func, arg1, arg2, arg3, arg4, arg5) ({  \
++    typeof(arg1) v1_ = (arg1);                                    \
+     typeof(arg2) v2_ = (arg2);                                    \
+     typeof(arg3) v3_ = (arg3);                                    \
+     typeof(arg4) v4_ = (arg4);                                    \
+     typeof(arg5) v5_ = (arg5);                                    \
+-    ALT_CALL_ARG(arg1, 1);                                        \
++    ALT_CALL_ARG(v1_, 1);                                         \
+     ALT_CALL_ARG(v2_, 2);                                         \
+     ALT_CALL_ARG(v3_, 3);                                         \
+     ALT_CALL_ARG(v4_, 4);                                         \
+@@ -363,12 +373,13 @@ extern void alternative_branches(void);
+ })
+ 
+ #define alternative_vcall6(func, arg1, arg2, arg3, arg4, arg5, arg6) ({ \
++    typeof(arg1) v1_ = (arg1);                                          \
+     typeof(arg2) v2_ = (arg2);                                          \
+     typeof(arg3) v3_ = (arg3);                                          \
+     typeof(arg4) v4_ = (arg4);                                          \
+     typeof(arg5) v5_ = (arg5);                                          \
+     typeof(arg6) v6_ = (arg6);                                          \
+-    ALT_CALL_ARG(arg1, 1);                                              \
++    ALT_CALL_ARG(v1_, 1);                                               \
+     ALT_CALL_ARG(v2_, 2);                                               \
+     ALT_CALL_ARG(v3_, 3);                                               \
+     ALT_CALL_ARG(v4_, 4);                                               \
+@@ -379,12 +390,13 @@ extern void alternative_branches(void);
+ })
+ 
+ #define alternative_call6(func, arg1, arg2, arg3, arg4, arg5, arg6) ({  \
++    typeof(arg1) v1_ = (arg1);                                          \
+     typeof(arg2) v2_ = (arg2);                                          \
+     typeof(arg3) v3_ = (arg3);                                          \
+     typeof(arg4) v4_ = (arg4);                                          \
+     typeof(arg5) v5_ = (arg5);                                          \
+     typeof(arg6) v6_ = (arg6);                                          \
+-    ALT_CALL_ARG(arg1, 1);                                              \
++    ALT_CALL_ARG(v1_, 1);                                               \
+     ALT_CALL_ARG(v2_, 2);                                               \
+     ALT_CALL_ARG(v3_, 3);                                               \
+     ALT_CALL_ARG(v4_, 4);                                               \
+-- 
+2.44.0
+
+
+From fd7cb7a1d0433049d8fc59444d0e91b71728763e Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Tue, 5 Mar 2024 11:55:17 +0100
+Subject: [PATCH 54/70] x86/cpu-policy: Allow for levelling of VERW side
+ effects
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+MD_CLEAR and FB_CLEAR need OR-ing across a migrate pool.  Allow this, by
+having them unconditionally set in max, with the host values reflected in
+default.  Annotate the bits as having special properties.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: de17162cafd27f2865a3102a2ec0f386a02ed03d
+master date: 2024-03-01 20:14:19 +0000
+---
+ xen/arch/x86/cpu-policy.c                   | 24 +++++++++++++++++++++
+ xen/arch/x86/include/asm/cpufeature.h       |  1 +
+ xen/include/public/arch-x86/cpufeatureset.h |  4 ++--
+ 3 files changed, 27 insertions(+), 2 deletions(-)
+
+diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c
+index bcb17b7ce3..c7c5e99b7b 100644
+--- a/xen/arch/x86/cpu-policy.c
++++ b/xen/arch/x86/cpu-policy.c
+@@ -442,6 +442,16 @@ static void __init guest_common_max_feature_adjustments(uint32_t *fs)
+         __set_bit(X86_FEATURE_RSBA, fs);
+         __set_bit(X86_FEATURE_RRSBA, fs);
+ 
++        /*
++         * These bits indicate that the VERW instruction may have gained
++         * scrubbing side effects.  With pooling, they mean "you might migrate
++         * somewhere where scrubbing is necessary", and may need exposing on
++         * unaffected hardware.  This is fine, because the VERW instruction
++         * has been around since the 286.
++         */
++        __set_bit(X86_FEATURE_MD_CLEAR, fs);
++        __set_bit(X86_FEATURE_FB_CLEAR, fs);
++
+         /*
+          * The Gather Data Sampling microcode mitigation (August 2023) has an
+          * adverse performance impact on the CLWB instruction on SKX/CLX/CPX.
+@@ -476,6 +486,20 @@ static void __init guest_common_default_feature_adjustments(uint32_t *fs)
+              cpu_has_rdrand && !is_forced_cpu_cap(X86_FEATURE_RDRAND) )
+             __clear_bit(X86_FEATURE_RDRAND, fs);
+ 
++        /*
++         * These bits indicate that the VERW instruction may have gained
++         * scrubbing side effects.  The max policy has them set for migration
++         * reasons, so reset the default policy back to the host values in
++         * case we're unaffected.
++         */
++        __clear_bit(X86_FEATURE_MD_CLEAR, fs);
++        if ( cpu_has_md_clear )
++            __set_bit(X86_FEATURE_MD_CLEAR, fs);
++
++        __clear_bit(X86_FEATURE_FB_CLEAR, fs);
++        if ( cpu_has_fb_clear )
++            __set_bit(X86_FEATURE_FB_CLEAR, fs);
++
+         /*
+          * The Gather Data Sampling microcode mitigation (August 2023) has an
+          * adverse performance impact on the CLWB instruction on SKX/CLX/CPX.
+diff --git a/xen/arch/x86/include/asm/cpufeature.h b/xen/arch/x86/include/asm/cpufeature.h
+index 06e1dd7f33..76ef2aeb1d 100644
+--- a/xen/arch/x86/include/asm/cpufeature.h
++++ b/xen/arch/x86/include/asm/cpufeature.h
+@@ -177,6 +177,7 @@ static inline bool boot_cpu_has(unsigned int feat)
+ #define cpu_has_avx512_4fmaps   boot_cpu_has(X86_FEATURE_AVX512_4FMAPS)
+ #define cpu_has_avx512_vp2intersect boot_cpu_has(X86_FEATURE_AVX512_VP2INTERSECT)
+ #define cpu_has_srbds_ctrl      boot_cpu_has(X86_FEATURE_SRBDS_CTRL)
++#define cpu_has_md_clear        boot_cpu_has(X86_FEATURE_MD_CLEAR)
+ #define cpu_has_rtm_always_abort boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)
+ #define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)
+ #define cpu_has_serialize       boot_cpu_has(X86_FEATURE_SERIALIZE)
+diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h
+index 6b6ce2745c..337aaa9c77 100644
+--- a/xen/include/public/arch-x86/cpufeatureset.h
++++ b/xen/include/public/arch-x86/cpufeatureset.h
+@@ -262,7 +262,7 @@ XEN_CPUFEATURE(AVX512_4FMAPS, 9*32+ 3) /*A  AVX512 Multiply Accumulation Single
+ XEN_CPUFEATURE(FSRM,          9*32+ 4) /*A  Fast Short REP MOVS */
+ XEN_CPUFEATURE(AVX512_VP2INTERSECT, 9*32+8) /*a  VP2INTERSECT{D,Q} insns */
+ XEN_CPUFEATURE(SRBDS_CTRL,    9*32+ 9) /*   MSR_MCU_OPT_CTRL and RNGDS_MITG_DIS. */
+-XEN_CPUFEATURE(MD_CLEAR,      9*32+10) /*A  VERW clears microarchitectural buffers */
++XEN_CPUFEATURE(MD_CLEAR,      9*32+10) /*!A VERW clears microarchitectural buffers */
+ XEN_CPUFEATURE(RTM_ALWAYS_ABORT, 9*32+11) /*! June 2021 TSX defeaturing in microcode. */
+ XEN_CPUFEATURE(TSX_FORCE_ABORT, 9*32+13) /* MSR_TSX_FORCE_ABORT.RTM_ABORT */
+ XEN_CPUFEATURE(SERIALIZE,     9*32+14) /*A  SERIALIZE insn */
+@@ -329,7 +329,7 @@ XEN_CPUFEATURE(DOITM,              16*32+12) /*   Data Operand Invariant Timing
+ XEN_CPUFEATURE(SBDR_SSDP_NO,       16*32+13) /*A  No Shared Buffer Data Read or Sideband Stale Data Propagation */
+ XEN_CPUFEATURE(FBSDP_NO,           16*32+14) /*A  No Fill Buffer Stale Data Propagation */
+ XEN_CPUFEATURE(PSDP_NO,            16*32+15) /*A  No Primary Stale Data Propagation */
+-XEN_CPUFEATURE(FB_CLEAR,           16*32+17) /*A  Fill Buffers cleared by VERW */
++XEN_CPUFEATURE(FB_CLEAR,           16*32+17) /*!A Fill Buffers cleared by VERW */
+ XEN_CPUFEATURE(FB_CLEAR_CTRL,      16*32+18) /*   MSR_OPT_CPU_CTRL.FB_CLEAR_DIS */
+ XEN_CPUFEATURE(RRSBA,              16*32+19) /*!  Restricted RSB Alternative */
+ XEN_CPUFEATURE(BHI_NO,             16*32+20) /*A  No Branch History Injection  */
+-- 
+2.44.0
+
+
+From 4c84fa6cb66fe66f2c5dad65208c497558ab7d17 Mon Sep 17 00:00:00 2001
+From: Jan Beulich <jbeulich@suse.com>
+Date: Tue, 12 Mar 2024 12:06:57 +0100
+Subject: [PATCH 55/70] hvmloader/PCI: skip huge BARs in certain calculations
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+BARs of size 2Gb and up can't possibly fit below 4Gb: Both the bottom of
+the lower 2Gb range and the top of the higher 2Gb range have special
+purpose. Don't even have them influence whether to (perhaps) relocate
+low RAM.
+
+Reported-by: Neowutran <xen@neowutran.ovh>
+Signed-off-by: Jan Beulich <jbeulich@suse.com>
+Acked-by: Roger Pau Monné <roger.pau@citrix.com>
+master commit: 57acad12a09ffa490e870ebe17596aad858f0191
+master date: 2024-03-06 10:19:29 +0100
+---
+ tools/firmware/hvmloader/pci.c | 28 ++++++++++++++++++++--------
+ 1 file changed, 20 insertions(+), 8 deletions(-)
+
+diff --git a/tools/firmware/hvmloader/pci.c b/tools/firmware/hvmloader/pci.c
+index 257a6feb61..c3c61ca060 100644
+--- a/tools/firmware/hvmloader/pci.c
++++ b/tools/firmware/hvmloader/pci.c
+@@ -33,6 +33,13 @@ uint32_t pci_mem_start = HVM_BELOW_4G_MMIO_START;
+ const uint32_t pci_mem_end = RESERVED_MEMBASE;
+ uint64_t pci_hi_mem_start = 0, pci_hi_mem_end = 0;
+ 
++/*
++ * BARs larger than this value are put in 64-bit space unconditionally.  That
++ * is, such BARs also don't play into the determination of how big the lowmem
++ * MMIO hole needs to be.
++ */
++#define BAR_RELOC_THRESH GB(1)
++
+ enum virtual_vga virtual_vga = VGA_none;
+ unsigned long igd_opregion_pgbase = 0;
+ 
+@@ -286,9 +293,11 @@ void pci_setup(void)
+             bars[i].bar_reg = bar_reg;
+             bars[i].bar_sz  = bar_sz;
+ 
+-            if ( ((bar_data & PCI_BASE_ADDRESS_SPACE) ==
+-                  PCI_BASE_ADDRESS_SPACE_MEMORY) ||
+-                 (bar_reg == PCI_ROM_ADDRESS) )
++            if ( is_64bar && bar_sz > BAR_RELOC_THRESH )
++                bar64_relocate = 1;
++            else if ( ((bar_data & PCI_BASE_ADDRESS_SPACE) ==
++                       PCI_BASE_ADDRESS_SPACE_MEMORY) ||
++                      (bar_reg == PCI_ROM_ADDRESS) )
+                 mmio_total += bar_sz;
+ 
+             nr_bars++;
+@@ -367,7 +376,7 @@ void pci_setup(void)
+             pci_mem_start = hvm_info->low_mem_pgend << PAGE_SHIFT;
+     }
+ 
+-    if ( mmio_total > (pci_mem_end - pci_mem_start) )
++    if ( mmio_total > (pci_mem_end - pci_mem_start) || bar64_relocate )
+     {
+         printf("Low MMIO hole not large enough for all devices,"
+                " relocating some BARs to 64-bit\n");
+@@ -430,7 +439,8 @@ void pci_setup(void)
+ 
+         /*
+          * Relocate to high memory if the total amount of MMIO needed
+-         * is more than the low MMIO available.  Because devices are
++         * is more than the low MMIO available or BARs bigger than
++         * BAR_RELOC_THRESH are present.  Because devices are
+          * processed in order of bar_sz, this will preferentially
+          * relocate larger devices to high memory first.
+          *
+@@ -446,8 +456,9 @@ void pci_setup(void)
+          *   the code here assumes it to be.)
+          * Should either of those two conditions change, this code will break.
+          */
+-        using_64bar = bars[i].is_64bar && bar64_relocate
+-            && (mmio_total > (mem_resource.max - mem_resource.base));
++        using_64bar = bars[i].is_64bar && bar64_relocate &&
++            (mmio_total > (mem_resource.max - mem_resource.base) ||
++             bar_sz > BAR_RELOC_THRESH);
+         bar_data = pci_readl(devfn, bar_reg);
+ 
+         if ( (bar_data & PCI_BASE_ADDRESS_SPACE) ==
+@@ -467,7 +478,8 @@ void pci_setup(void)
+                 resource = &mem_resource;
+                 bar_data &= ~PCI_BASE_ADDRESS_MEM_MASK;
+             }
+-            mmio_total -= bar_sz;
++            if ( bar_sz <= BAR_RELOC_THRESH )
++                mmio_total -= bar_sz;
+         }
+         else
+         {
+-- 
+2.44.0
+
+
+From a96d2d4355d85fc82abd0a3799978db04ee8cff3 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 12 Mar 2024 12:07:07 +0100
+Subject: [PATCH 56/70] x86/mm: fix detection of last L1 entry in
+ modify_xen_mappings_lite()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+The current logic to detect when to switch to the next L1 table is incorrectly
+using l2_table_offset() in order to notice when the last entry on the current
+L1 table has been reached.
+
+It should instead use l1_table_offset() to check whether the index has wrapped
+to point to the first entry, and so the next L1 table should be used.
+
+Fixes: 8676092a0f16 ('x86/livepatch: Fix livepatch application when CET is active')
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
+master commit: 7c81558208de7858251b62f168a449be84305595
+master date: 2024-03-11 11:09:42 +0000
+---
+ xen/arch/x86/mm.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 39544bd9f9..ab0acbfea6 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -5947,7 +5947,7 @@ void init_or_livepatch modify_xen_mappings_lite(
+ 
+                 v += 1UL << L1_PAGETABLE_SHIFT;
+ 
+-                if ( l2_table_offset(v) == 0 )
++                if ( l1_table_offset(v) == 0 )
+                     break;
+             }
+ 
+-- 
+2.44.0
+
+
+From fe1869a569bab56e44c35d1522ee064bab6286da Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Sat, 27 Jan 2024 17:52:09 +0000
+Subject: [PATCH 57/70] x86/entry: Introduce EFRAME_* constants
+
+restore_all_guest() does a lot of manipulation of the stack after popping the
+GPRs, and uses raw %rsp displacements to do so.  Also, almost all entrypaths
+use raw %rsp displacements prior to pushing GPRs.
+
+Provide better mnemonics, to aid readability and reduce the chance of errors
+when editing.
+
+No functional change.  The resulting binary is identical.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit 37541208f119a9c552c6c6c3246ea61be0d44035)
+---
+ xen/arch/x86/x86_64/asm-offsets.c  | 17 ++++++++
+ xen/arch/x86/x86_64/compat/entry.S |  2 +-
+ xen/arch/x86/x86_64/entry.S        | 70 +++++++++++++++---------------
+ 3 files changed, 53 insertions(+), 36 deletions(-)
+
+diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
+index 57b73a4e62..2fc4d9130a 100644
+--- a/xen/arch/x86/x86_64/asm-offsets.c
++++ b/xen/arch/x86/x86_64/asm-offsets.c
+@@ -51,6 +51,23 @@ void __dummy__(void)
+     OFFSET(UREGS_kernel_sizeof, struct cpu_user_regs, es);
+     BLANK();
+ 
++    /*
++     * EFRAME_* is for the entry/exit logic where %rsp is pointing at
++     * UREGS_error_code and GPRs are still/already guest values.
++     */
++#define OFFSET_EF(sym, mem)                                             \
++    DEFINE(sym, offsetof(struct cpu_user_regs, mem) -                   \
++                offsetof(struct cpu_user_regs, error_code))
++
++    OFFSET_EF(EFRAME_entry_vector,    entry_vector);
++    OFFSET_EF(EFRAME_rip,             rip);
++    OFFSET_EF(EFRAME_cs,              cs);
++    OFFSET_EF(EFRAME_eflags,          eflags);
++    OFFSET_EF(EFRAME_rsp,             rsp);
++    BLANK();
++
++#undef OFFSET_EF
++
+     OFFSET(VCPU_processor, struct vcpu, processor);
+     OFFSET(VCPU_domain, struct vcpu, domain);
+     OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info_area.map);
+diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S
+index fcc3a721f1..cb473f08ee 100644
+--- a/xen/arch/x86/x86_64/compat/entry.S
++++ b/xen/arch/x86/x86_64/compat/entry.S
+@@ -15,7 +15,7 @@ ENTRY(entry_int82)
+         ENDBR64
+         ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP
+         pushq $0
+-        movl  $HYPERCALL_VECTOR, 4(%rsp)
++        movl  $HYPERCALL_VECTOR, EFRAME_entry_vector(%rsp)
+         SAVE_ALL compat=1 /* DPL1 gate, restricted to 32bit PV guests only. */
+ 
+         SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */
+diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
+index 9a7b129aa7..968da9d727 100644
+--- a/xen/arch/x86/x86_64/entry.S
++++ b/xen/arch/x86/x86_64/entry.S
+@@ -190,15 +190,15 @@ restore_all_guest:
+         SPEC_CTRL_EXIT_TO_PV    /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */
+ 
+         RESTORE_ALL
+-        testw $TRAP_syscall,4(%rsp)
++        testw $TRAP_syscall, EFRAME_entry_vector(%rsp)
+         jz    iret_exit_to_guest
+ 
+-        movq  24(%rsp),%r11           # RFLAGS
++        mov   EFRAME_eflags(%rsp), %r11
+         andq  $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), %r11
+         orq   $X86_EFLAGS_IF,%r11
+ 
+         /* Don't use SYSRET path if the return address is not canonical. */
+-        movq  8(%rsp),%rcx
++        mov   EFRAME_rip(%rsp), %rcx
+         sarq  $47,%rcx
+         incl  %ecx
+         cmpl  $1,%ecx
+@@ -213,20 +213,20 @@ restore_all_guest:
+         ALTERNATIVE "", rag_clrssbsy, X86_FEATURE_XEN_SHSTK
+ #endif
+ 
+-        movq  8(%rsp), %rcx           # RIP
+-        cmpw  $FLAT_USER_CS32,16(%rsp)# CS
+-        movq  32(%rsp),%rsp           # RSP
++        mov   EFRAME_rip(%rsp), %rcx
++        cmpw  $FLAT_USER_CS32, EFRAME_cs(%rsp)
++        mov   EFRAME_rsp(%rsp), %rsp
+         je    1f
+         sysretq
+ 1:      sysretl
+ 
+         ALIGN
+ .Lrestore_rcx_iret_exit_to_guest:
+-        movq  8(%rsp), %rcx           # RIP
++        mov   EFRAME_rip(%rsp), %rcx
+ /* No special register assumptions. */
+ iret_exit_to_guest:
+-        andl  $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), 24(%rsp)
+-        orl   $X86_EFLAGS_IF,24(%rsp)
++        andl  $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), EFRAME_eflags(%rsp)
++        orl   $X86_EFLAGS_IF, EFRAME_eflags(%rsp)
+         addq  $8,%rsp
+ .Lft0:  iretq
+         _ASM_PRE_EXTABLE(.Lft0, handle_exception)
+@@ -257,7 +257,7 @@ ENTRY(lstar_enter)
+         pushq $FLAT_KERNEL_CS64
+         pushq %rcx
+         pushq $0
+-        movl  $TRAP_syscall, 4(%rsp)
++        movl  $TRAP_syscall, EFRAME_entry_vector(%rsp)
+         SAVE_ALL
+ 
+         SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */
+@@ -294,7 +294,7 @@ ENTRY(cstar_enter)
+         pushq $FLAT_USER_CS32
+         pushq %rcx
+         pushq $0
+-        movl  $TRAP_syscall, 4(%rsp)
++        movl  $TRAP_syscall, EFRAME_entry_vector(%rsp)
+         SAVE_ALL
+ 
+         SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */
+@@ -335,7 +335,7 @@ GLOBAL(sysenter_eflags_saved)
+         pushq $3 /* ring 3 null cs */
+         pushq $0 /* null rip */
+         pushq $0
+-        movl  $TRAP_syscall, 4(%rsp)
++        movl  $TRAP_syscall, EFRAME_entry_vector(%rsp)
+         SAVE_ALL
+ 
+         SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */
+@@ -389,7 +389,7 @@ ENTRY(int80_direct_trap)
+         ENDBR64
+         ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP
+         pushq $0
+-        movl  $0x80, 4(%rsp)
++        movl  $0x80, EFRAME_entry_vector(%rsp)
+         SAVE_ALL
+ 
+         SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */
+@@ -649,7 +649,7 @@ ret_from_intr:
+         .section .init.text, "ax", @progbits
+ ENTRY(early_page_fault)
+         ENDBR64
+-        movl  $X86_EXC_PF, 4(%rsp)
++        movl  $X86_EXC_PF, EFRAME_entry_vector(%rsp)
+         SAVE_ALL
+         movq  %rsp, %rdi
+         call  do_early_page_fault
+@@ -716,7 +716,7 @@ ENTRY(common_interrupt)
+ 
+ ENTRY(entry_PF)
+         ENDBR64
+-        movl  $X86_EXC_PF, 4(%rsp)
++        movl  $X86_EXC_PF, EFRAME_entry_vector(%rsp)
+ /* No special register assumptions. */
+ GLOBAL(handle_exception)
+         ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP
+@@ -890,90 +890,90 @@ FATAL_exception_with_ints_disabled:
+ ENTRY(entry_DE)
+         ENDBR64
+         pushq $0
+-        movl  $X86_EXC_DE, 4(%rsp)
++        movl  $X86_EXC_DE, EFRAME_entry_vector(%rsp)
+         jmp   handle_exception
+ 
+ ENTRY(entry_MF)
+         ENDBR64
+         pushq $0
+-        movl  $X86_EXC_MF, 4(%rsp)
++        movl  $X86_EXC_MF, EFRAME_entry_vector(%rsp)
+         jmp   handle_exception
+ 
+ ENTRY(entry_XM)
+         ENDBR64
+         pushq $0
+-        movl  $X86_EXC_XM, 4(%rsp)
++        movl  $X86_EXC_XM, EFRAME_entry_vector(%rsp)
+         jmp   handle_exception
+ 
+ ENTRY(entry_NM)
+         ENDBR64
+         pushq $0
+-        movl  $X86_EXC_NM, 4(%rsp)
++        movl  $X86_EXC_NM, EFRAME_entry_vector(%rsp)
+         jmp   handle_exception
+ 
+ ENTRY(entry_DB)
+         ENDBR64
+         pushq $0
+-        movl  $X86_EXC_DB, 4(%rsp)
++        movl  $X86_EXC_DB, EFRAME_entry_vector(%rsp)
+         jmp   handle_ist_exception
+ 
+ ENTRY(entry_BP)
+         ENDBR64
+         pushq $0
+-        movl  $X86_EXC_BP, 4(%rsp)
++        movl  $X86_EXC_BP, EFRAME_entry_vector(%rsp)
+         jmp   handle_exception
+ 
+ ENTRY(entry_OF)
+         ENDBR64
+         pushq $0
+-        movl  $X86_EXC_OF, 4(%rsp)
++        movl  $X86_EXC_OF, EFRAME_entry_vector(%rsp)
+         jmp   handle_exception
+ 
+ ENTRY(entry_BR)
+         ENDBR64
+         pushq $0
+-        movl  $X86_EXC_BR, 4(%rsp)
++        movl  $X86_EXC_BR, EFRAME_entry_vector(%rsp)
+         jmp   handle_exception
+ 
+ ENTRY(entry_UD)
+         ENDBR64
+         pushq $0
+-        movl  $X86_EXC_UD, 4(%rsp)
++        movl  $X86_EXC_UD, EFRAME_entry_vector(%rsp)
+         jmp   handle_exception
+ 
+ ENTRY(entry_TS)
+         ENDBR64
+-        movl  $X86_EXC_TS, 4(%rsp)
++        movl  $X86_EXC_TS, EFRAME_entry_vector(%rsp)
+         jmp   handle_exception
+ 
+ ENTRY(entry_NP)
+         ENDBR64
+-        movl  $X86_EXC_NP, 4(%rsp)
++        movl  $X86_EXC_NP, EFRAME_entry_vector(%rsp)
+         jmp   handle_exception
+ 
+ ENTRY(entry_SS)
+         ENDBR64
+-        movl  $X86_EXC_SS, 4(%rsp)
++        movl  $X86_EXC_SS, EFRAME_entry_vector(%rsp)
+         jmp   handle_exception
+ 
+ ENTRY(entry_GP)
+         ENDBR64
+-        movl  $X86_EXC_GP, 4(%rsp)
++        movl  $X86_EXC_GP, EFRAME_entry_vector(%rsp)
+         jmp   handle_exception
+ 
+ ENTRY(entry_AC)
+         ENDBR64
+-        movl  $X86_EXC_AC, 4(%rsp)
++        movl  $X86_EXC_AC, EFRAME_entry_vector(%rsp)
+         jmp   handle_exception
+ 
+ ENTRY(entry_CP)
+         ENDBR64
+-        movl  $X86_EXC_CP, 4(%rsp)
++        movl  $X86_EXC_CP, EFRAME_entry_vector(%rsp)
+         jmp   handle_exception
+ 
+ ENTRY(entry_DF)
+         ENDBR64
+-        movl  $X86_EXC_DF, 4(%rsp)
++        movl  $X86_EXC_DF, EFRAME_entry_vector(%rsp)
+         /* Set AC to reduce chance of further SMAP faults */
+         ALTERNATIVE "", stac, X86_FEATURE_XEN_SMAP
+         SAVE_ALL
+@@ -998,7 +998,7 @@ ENTRY(entry_DF)
+ ENTRY(entry_NMI)
+         ENDBR64
+         pushq $0
+-        movl  $X86_EXC_NMI, 4(%rsp)
++        movl  $X86_EXC_NMI, EFRAME_entry_vector(%rsp)
+ handle_ist_exception:
+         ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP
+         SAVE_ALL
+@@ -1130,7 +1130,7 @@ handle_ist_exception:
+ ENTRY(entry_MC)
+         ENDBR64
+         pushq $0
+-        movl  $X86_EXC_MC, 4(%rsp)
++        movl  $X86_EXC_MC, EFRAME_entry_vector(%rsp)
+         jmp   handle_ist_exception
+ 
+ /* No op trap handler.  Required for kexec crash path. */
+@@ -1167,7 +1167,7 @@ autogen_stubs: /* Automatically generated stubs. */
+ 1:
+         ENDBR64
+         pushq $0
+-        movb  $vec,4(%rsp)
++        movb  $vec, EFRAME_entry_vector(%rsp)
+         jmp   common_interrupt
+ 
+         entrypoint 1b
+@@ -1181,7 +1181,7 @@ autogen_stubs: /* Automatically generated stubs. */
+         test  $8,%spl        /* 64bit exception frames are 16 byte aligned, but the word */
+         jz    2f             /* size is 8 bytes.  Check whether the processor gave us an */
+         pushq $0             /* error code, and insert an empty one if not.              */
+-2:      movb  $vec,4(%rsp)
++2:      movb  $vec, EFRAME_entry_vector(%rsp)
+         jmp   handle_exception
+ 
+         entrypoint 1b
+-- 
+2.44.0
+
+
+From b91c253e81db915f685b29e6947144ab9905388d Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Tue, 27 Feb 2024 16:07:39 +0000
+Subject: [PATCH 58/70] x86: Resync intel-family.h from Linux
+
+From v6.8-rc6
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit 195e75371b13c4f7ecdf7b5c50aed0d02f2d7ce8)
+---
+ xen/arch/x86/include/asm/intel-family.h | 38 ++++++++++++++++++++++---
+ 1 file changed, 34 insertions(+), 4 deletions(-)
+
+diff --git a/xen/arch/x86/include/asm/intel-family.h b/xen/arch/x86/include/asm/intel-family.h
+index ffc49151be..b65e9c46b9 100644
+--- a/xen/arch/x86/include/asm/intel-family.h
++++ b/xen/arch/x86/include/asm/intel-family.h
+@@ -26,6 +26,9 @@
+  *		_G	- parts with extra graphics on
+  *		_X	- regular server parts
+  *		_D	- micro server parts
++ *		_N,_P	- other mobile parts
++ *		_H	- premium mobile parts
++ *		_S	- other client parts
+  *
+  *		Historical OPTDIFFs:
+  *
+@@ -37,6 +40,9 @@
+  * their own names :-(
+  */
+ 
++/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */
++#define INTEL_FAM6_ANY			X86_MODEL_ANY
++
+ #define INTEL_FAM6_CORE_YONAH		0x0E
+ 
+ #define INTEL_FAM6_CORE2_MEROM		0x0F
+@@ -93,8 +99,6 @@
+ #define INTEL_FAM6_ICELAKE_L		0x7E	/* Sunny Cove */
+ #define INTEL_FAM6_ICELAKE_NNPI		0x9D	/* Sunny Cove */
+ 
+-#define INTEL_FAM6_LAKEFIELD		0x8A	/* Sunny Cove / Tremont */
+-
+ #define INTEL_FAM6_ROCKETLAKE		0xA7	/* Cypress Cove */
+ 
+ #define INTEL_FAM6_TIGERLAKE_L		0x8C	/* Willow Cove */
+@@ -102,12 +106,31 @@
+ 
+ #define INTEL_FAM6_SAPPHIRERAPIDS_X	0x8F	/* Golden Cove */
+ 
++#define INTEL_FAM6_EMERALDRAPIDS_X	0xCF
++
++#define INTEL_FAM6_GRANITERAPIDS_X	0xAD
++#define INTEL_FAM6_GRANITERAPIDS_D	0xAE
++
++/* "Hybrid" Processors (P-Core/E-Core) */
++
++#define INTEL_FAM6_LAKEFIELD		0x8A	/* Sunny Cove / Tremont */
++
+ #define INTEL_FAM6_ALDERLAKE		0x97	/* Golden Cove / Gracemont */
+ #define INTEL_FAM6_ALDERLAKE_L		0x9A	/* Golden Cove / Gracemont */
+ 
+-#define INTEL_FAM6_RAPTORLAKE		0xB7
++#define INTEL_FAM6_RAPTORLAKE		0xB7	/* Raptor Cove / Enhanced Gracemont */
++#define INTEL_FAM6_RAPTORLAKE_P		0xBA
++#define INTEL_FAM6_RAPTORLAKE_S		0xBF
++
++#define INTEL_FAM6_METEORLAKE		0xAC
++#define INTEL_FAM6_METEORLAKE_L		0xAA
++
++#define INTEL_FAM6_ARROWLAKE_H		0xC5
++#define INTEL_FAM6_ARROWLAKE		0xC6
++
++#define INTEL_FAM6_LUNARLAKE_M		0xBD
+ 
+-/* "Small Core" Processors (Atom) */
++/* "Small Core" Processors (Atom/E-Core) */
+ 
+ #define INTEL_FAM6_ATOM_BONNELL		0x1C /* Diamondville, Pineview */
+ #define INTEL_FAM6_ATOM_BONNELL_MID	0x26 /* Silverthorne, Lincroft */
+@@ -134,6 +157,13 @@
+ #define INTEL_FAM6_ATOM_TREMONT		0x96 /* Elkhart Lake */
+ #define INTEL_FAM6_ATOM_TREMONT_L	0x9C /* Jasper Lake */
+ 
++#define INTEL_FAM6_ATOM_GRACEMONT	0xBE /* Alderlake N */
++
++#define INTEL_FAM6_ATOM_CRESTMONT_X	0xAF /* Sierra Forest */
++#define INTEL_FAM6_ATOM_CRESTMONT	0xB6 /* Grand Ridge */
++
++#define INTEL_FAM6_ATOM_DARKMONT_X	0xDD /* Clearwater Forest */
++
+ /* Xeon Phi */
+ 
+ #define INTEL_FAM6_XEON_PHI_KNL		0x57 /* Knights Landing */
+-- 
+2.44.0
+
+
+From 9f89ec65fbe49c3be32a456091097d7ef017d268 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Fri, 23 Jun 2023 11:32:00 +0100
+Subject: [PATCH 59/70] x86/vmx: Perform VERW flushing later in the VMExit path
+
+Broken out of the following patch because this change is subtle enough on its
+own.  See it for the rationale of why we're moving VERW.
+
+As for how, extend the trick already used to hold one condition in
+flags (RESUME vs LAUNCH) through the POPing of GPRs.
+
+Move the MOV CR earlier.  Intel specify flags to be undefined across it.
+
+Encode the two conditions we want using SF and PF.  See the code comment for
+exactly how.
+
+Leave a comment to explain the lack of any content around
+SPEC_CTRL_EXIT_TO_VMX, but leave the block in place.  Sod's law says if we
+delete it, we'll need to reintroduce it.
+
+This is part of XSA-452 / CVE-2023-28746.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit 475fa20b7384464210f42bad7195f87bd6f1c63f)
+---
+ xen/arch/x86/hvm/vmx/entry.S             | 36 +++++++++++++++++++++---
+ xen/arch/x86/include/asm/asm_defns.h     |  8 ++++++
+ xen/arch/x86/include/asm/spec_ctrl_asm.h |  7 +++++
+ xen/arch/x86/x86_64/asm-offsets.c        |  1 +
+ 4 files changed, 48 insertions(+), 4 deletions(-)
+
+diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S
+index e3f60d5a82..1bead826ca 100644
+--- a/xen/arch/x86/hvm/vmx/entry.S
++++ b/xen/arch/x86/hvm/vmx/entry.S
+@@ -87,17 +87,39 @@ UNLIKELY_END(realmode)
+ 
+         /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
+         /* SPEC_CTRL_EXIT_TO_VMX   Req: %rsp=regs/cpuinfo              Clob:    */
+-        DO_SPEC_CTRL_COND_VERW
++        /*
++         * All speculation safety work happens to be elsewhere.  VERW is after
++         * popping the GPRs, while restoring the guest MSR_SPEC_CTRL is left
++         * to the MSR load list.
++         */
+ 
+         mov  VCPU_hvm_guest_cr2(%rbx),%rax
++        mov  %rax, %cr2
++
++        /*
++         * We need to perform two conditional actions (VERW, and Resume vs
++         * Launch) after popping GPRs.  With some cunning, we can encode both
++         * of these in eflags together.
++         *
++         * Parity is only calculated over the bottom byte of the answer, while
++         * Sign is simply the top bit.
++         *
++         * Therefore, the final OR instruction ends up producing:
++         *   SF = VCPU_vmx_launched
++         *   PF = !SCF_verw
++         */
++        BUILD_BUG_ON(SCF_verw & ~0xff)
++        movzbl VCPU_vmx_launched(%rbx), %ecx
++        shl  $31, %ecx
++        movzbl CPUINFO_spec_ctrl_flags(%rsp), %eax
++        and  $SCF_verw, %eax
++        or   %eax, %ecx
+ 
+         pop  %r15
+         pop  %r14
+         pop  %r13
+         pop  %r12
+         pop  %rbp
+-        mov  %rax,%cr2
+-        cmpb $0,VCPU_vmx_launched(%rbx)
+         pop  %rbx
+         pop  %r11
+         pop  %r10
+@@ -108,7 +130,13 @@ UNLIKELY_END(realmode)
+         pop  %rdx
+         pop  %rsi
+         pop  %rdi
+-        je   .Lvmx_launch
++
++        jpe  .L_skip_verw
++        /* VERW clobbers ZF, but preserves all others, including SF. */
++        verw STK_REL(CPUINFO_verw_sel, CPUINFO_error_code)(%rsp)
++.L_skip_verw:
++
++        jns  .Lvmx_launch
+ 
+ /*.Lvmx_resume:*/
+         VMRESUME
+diff --git a/xen/arch/x86/include/asm/asm_defns.h b/xen/arch/x86/include/asm/asm_defns.h
+index baaaccb26e..56ae26e542 100644
+--- a/xen/arch/x86/include/asm/asm_defns.h
++++ b/xen/arch/x86/include/asm/asm_defns.h
+@@ -81,6 +81,14 @@ register unsigned long current_stack_pointer asm("rsp");
+ 
+ #ifdef __ASSEMBLY__
+ 
++.macro BUILD_BUG_ON condstr, cond:vararg
++        .if \cond
++        .error "Condition \"\condstr\" not satisfied"
++        .endif
++.endm
++/* preprocessor macro to make error message more user friendly */
++#define BUILD_BUG_ON(cond) BUILD_BUG_ON #cond, cond
++
+ #ifdef HAVE_AS_QUOTED_SYM
+ #define SUBSECTION_LBL(tag)                        \
+         .ifndef .L.tag;                            \
+diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h
+index 6cb7c1b949..525745a066 100644
+--- a/xen/arch/x86/include/asm/spec_ctrl_asm.h
++++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h
+@@ -152,6 +152,13 @@
+ #endif
+ .endm
+ 
++/*
++ * Helper to improve the readibility of stack dispacements with %rsp in
++ * unusual positions.  Both @field and @top_of_stack should be constants from
++ * the same object.  @top_of_stack should be where %rsp is currently pointing.
++ */
++#define STK_REL(field, top_of_stk) ((field) - (top_of_stk))
++
+ .macro DO_SPEC_CTRL_COND_VERW
+ /*
+  * Requires %rsp=cpuinfo
+diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
+index 2fc4d9130a..0d33678898 100644
+--- a/xen/arch/x86/x86_64/asm-offsets.c
++++ b/xen/arch/x86/x86_64/asm-offsets.c
+@@ -135,6 +135,7 @@ void __dummy__(void)
+ #endif
+ 
+     OFFSET(CPUINFO_guest_cpu_user_regs, struct cpu_info, guest_cpu_user_regs);
++    OFFSET(CPUINFO_error_code, struct cpu_info, guest_cpu_user_regs.error_code);
+     OFFSET(CPUINFO_verw_sel, struct cpu_info, verw_sel);
+     OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu);
+     OFFSET(CPUINFO_per_cpu_offset, struct cpu_info, per_cpu_offset);
+-- 
+2.44.0
+
+
+From 95dd34fdbea5408872d5c244fe268222a4f145d0 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Sat, 27 Jan 2024 18:20:56 +0000
+Subject: [PATCH 60/70] x86/spec-ctrl: Perform VERW flushing later in exit
+ paths
+
+On parts vulnerable to RFDS, VERW's side effects are extended to scrub all
+non-architectural entries in various Physical Register Files.  To remove all
+of Xen's values, the VERW must be after popping the GPRs.
+
+Rework SPEC_CTRL_COND_VERW to default to a CPUINFO_error_code %rsp position,
+but with overrides for other contexts.  Identify that it clobbers eflags; this
+is particularly relevant for the SYSRET path.
+
+For the IST exit return to Xen, have the main SPEC_CTRL_EXIT_TO_XEN put a
+shadow copy of spec_ctrl_flags, as GPRs can't be used at the point we want to
+issue the VERW.
+
+This is part of XSA-452 / CVE-2023-28746.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit 0a666cf2cd99df6faf3eebc81a1fc286e4eca4c7)
+---
+ xen/arch/x86/include/asm/spec_ctrl_asm.h | 36 ++++++++++++++++--------
+ xen/arch/x86/x86_64/asm-offsets.c        | 13 +++++++--
+ xen/arch/x86/x86_64/compat/entry.S       |  6 ++++
+ xen/arch/x86/x86_64/entry.S              | 21 +++++++++++++-
+ 4 files changed, 61 insertions(+), 15 deletions(-)
+
+diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h
+index 525745a066..13acebc75d 100644
+--- a/xen/arch/x86/include/asm/spec_ctrl_asm.h
++++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h
+@@ -159,16 +159,23 @@
+  */
+ #define STK_REL(field, top_of_stk) ((field) - (top_of_stk))
+ 
+-.macro DO_SPEC_CTRL_COND_VERW
++.macro SPEC_CTRL_COND_VERW \
++    scf=STK_REL(CPUINFO_spec_ctrl_flags, CPUINFO_error_code), \
++    sel=STK_REL(CPUINFO_verw_sel,        CPUINFO_error_code)
+ /*
+- * Requires %rsp=cpuinfo
++ * Requires \scf and \sel as %rsp-relative expressions
++ * Clobbers eflags
++ *
++ * VERW needs to run after guest GPRs have been restored, where only %rsp is
++ * good to use.  Default to expecting %rsp pointing at CPUINFO_error_code.
++ * Contexts where this is not true must provide an alternative \scf and \sel.
+  *
+  * Issue a VERW for its flushing side effect, if indicated.  This is a Spectre
+  * v1 gadget, but the IRET/VMEntry is serialising.
+  */
+-    testb $SCF_verw, CPUINFO_spec_ctrl_flags(%rsp)
++    testb $SCF_verw, \scf(%rsp)
+     jz .L\@_verw_skip
+-    verw CPUINFO_verw_sel(%rsp)
++    verw \sel(%rsp)
+ .L\@_verw_skip:
+ .endm
+ 
+@@ -286,8 +293,6 @@
+  */
+     ALTERNATIVE "", DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV
+ 
+-    DO_SPEC_CTRL_COND_VERW
+-
+     ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV
+ .endm
+ 
+@@ -367,7 +372,7 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise):
+  */
+ .macro SPEC_CTRL_EXIT_TO_XEN
+ /*
+- * Requires %r12=ist_exit, %r14=stack_end
++ * Requires %r12=ist_exit, %r14=stack_end, %rsp=regs
+  * Clobbers %rax, %rbx, %rcx, %rdx
+  */
+     movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %ebx
+@@ -395,11 +400,18 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise):
+     test %r12, %r12
+     jz .L\@_skip_ist_exit
+ 
+-    /* Logically DO_SPEC_CTRL_COND_VERW but without the %rsp=cpuinfo dependency */
+-    testb $SCF_verw, %bl
+-    jz .L\@_skip_verw
+-    verw STACK_CPUINFO_FIELD(verw_sel)(%r14)
+-.L\@_skip_verw:
++    /*
++     * Stash SCF and verw_sel above eflags in the case of an IST_exit.  The
++     * VERW logic needs to run after guest GPRs have been restored; i.e. where
++     * we cannot use %r12 or %r14 for the purposes they have here.
++     *
++     * When the CPU pushed this exception frame, it zero-extended eflags.
++     * Therefore it is safe for the VERW logic to look at the stashed SCF
++     * outside of the ist_exit condition.  Also, this stashing won't influence
++     * any other restore_all_guest() paths.
++     */
++    or $(__HYPERVISOR_DS32 << 16), %ebx
++    mov %ebx, UREGS_eflags + 4(%rsp) /* EFRAME_shadow_scf/sel */
+ 
+     ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV
+ 
+diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c
+index 0d33678898..85c7d0c989 100644
+--- a/xen/arch/x86/x86_64/asm-offsets.c
++++ b/xen/arch/x86/x86_64/asm-offsets.c
+@@ -55,14 +55,22 @@ void __dummy__(void)
+      * EFRAME_* is for the entry/exit logic where %rsp is pointing at
+      * UREGS_error_code and GPRs are still/already guest values.
+      */
+-#define OFFSET_EF(sym, mem)                                             \
++#define OFFSET_EF(sym, mem, ...)                                        \
+     DEFINE(sym, offsetof(struct cpu_user_regs, mem) -                   \
+-                offsetof(struct cpu_user_regs, error_code))
++                offsetof(struct cpu_user_regs, error_code) __VA_ARGS__)
+ 
+     OFFSET_EF(EFRAME_entry_vector,    entry_vector);
+     OFFSET_EF(EFRAME_rip,             rip);
+     OFFSET_EF(EFRAME_cs,              cs);
+     OFFSET_EF(EFRAME_eflags,          eflags);
++
++    /*
++     * These aren't real fields.  They're spare space, used by the IST
++     * exit-to-xen path.
++     */
++    OFFSET_EF(EFRAME_shadow_scf,      eflags, +4);
++    OFFSET_EF(EFRAME_shadow_sel,      eflags, +6);
++
+     OFFSET_EF(EFRAME_rsp,             rsp);
+     BLANK();
+ 
+@@ -136,6 +144,7 @@ void __dummy__(void)
+ 
+     OFFSET(CPUINFO_guest_cpu_user_regs, struct cpu_info, guest_cpu_user_regs);
+     OFFSET(CPUINFO_error_code, struct cpu_info, guest_cpu_user_regs.error_code);
++    OFFSET(CPUINFO_rip, struct cpu_info, guest_cpu_user_regs.rip);
+     OFFSET(CPUINFO_verw_sel, struct cpu_info, verw_sel);
+     OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu);
+     OFFSET(CPUINFO_per_cpu_offset, struct cpu_info, per_cpu_offset);
+diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S
+index cb473f08ee..3bbe3a79a5 100644
+--- a/xen/arch/x86/x86_64/compat/entry.S
++++ b/xen/arch/x86/x86_64/compat/entry.S
+@@ -161,6 +161,12 @@ ENTRY(compat_restore_all_guest)
+         SPEC_CTRL_EXIT_TO_PV    /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */
+ 
+         RESTORE_ALL adj=8 compat=1
++
++        /* Account for ev/ec having already been popped off the stack. */
++        SPEC_CTRL_COND_VERW \
++            scf=STK_REL(CPUINFO_spec_ctrl_flags, CPUINFO_rip), \
++            sel=STK_REL(CPUINFO_verw_sel,        CPUINFO_rip)
++
+ .Lft0:  iretq
+         _ASM_PRE_EXTABLE(.Lft0, handle_exception)
+ 
+diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S
+index 968da9d727..2c7512130f 100644
+--- a/xen/arch/x86/x86_64/entry.S
++++ b/xen/arch/x86/x86_64/entry.S
+@@ -214,6 +214,9 @@ restore_all_guest:
+ #endif
+ 
+         mov   EFRAME_rip(%rsp), %rcx
++
++        SPEC_CTRL_COND_VERW     /* Req: %rsp=eframe                    Clob: efl */
++
+         cmpw  $FLAT_USER_CS32, EFRAME_cs(%rsp)
+         mov   EFRAME_rsp(%rsp), %rsp
+         je    1f
+@@ -227,6 +230,9 @@ restore_all_guest:
+ iret_exit_to_guest:
+         andl  $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), EFRAME_eflags(%rsp)
+         orl   $X86_EFLAGS_IF, EFRAME_eflags(%rsp)
++
++        SPEC_CTRL_COND_VERW     /* Req: %rsp=eframe                    Clob: efl */
++
+         addq  $8,%rsp
+ .Lft0:  iretq
+         _ASM_PRE_EXTABLE(.Lft0, handle_exception)
+@@ -679,9 +685,22 @@ UNLIKELY_START(ne, exit_cr3)
+ UNLIKELY_END(exit_cr3)
+ 
+         /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */
+-        SPEC_CTRL_EXIT_TO_XEN     /* Req: %r12=ist_exit %r14=end, Clob: abcd */
++        SPEC_CTRL_EXIT_TO_XEN /* Req: %r12=ist_exit %r14=end %rsp=regs, Clob: abcd */
+ 
+         RESTORE_ALL adj=8
++
++        /*
++         * When the CPU pushed this exception frame, it zero-extended eflags.
++         * For an IST exit, SPEC_CTRL_EXIT_TO_XEN stashed shadow copies of
++         * spec_ctrl_flags and verw_sel above eflags, as we can't use any GPRs,
++         * and we're at a random place on the stack, not in a CPUINFO block.
++         *
++         * Account for ev/ec having already been popped off the stack.
++         */
++        SPEC_CTRL_COND_VERW \
++            scf=STK_REL(EFRAME_shadow_scf, EFRAME_rip), \
++            sel=STK_REL(EFRAME_shadow_sel, EFRAME_rip)
++
+         iretq
+ 
+ ENTRY(common_interrupt)
+-- 
+2.44.0
+
+
+From b7205fc1cbad0c633e92d2d019a02a507467507b Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Mon, 12 Feb 2024 17:50:43 +0000
+Subject: [PATCH 61/70] x86/spec-ctrl: Rename VERW related options
+
+VERW is going to be used for a 3rd purpose, and the existing nomenclature
+didn't survive the Stale MMIO issues terribly well.
+
+Rename the command line option from `md-clear=` to `verw=`.  This is more
+consistent with other options which tend to be named based on what they're
+doing, not which feature enumeration they use behind the scenes.  Retain
+`md-clear=` as a deprecated alias.
+
+Rename opt_md_clear_{pv,hvm} and opt_fb_clear_mmio to opt_verw_{pv,hvm,mmio},
+which has a side effect of making spec_ctrl_init_domain() rather clearer to
+follow.
+
+No functional change.
+
+This is part of XSA-452 / CVE-2023-28746.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit f7603ca252e4226739eb3129a5290ee3da3f8ea4)
+---
+ docs/misc/xen-command-line.pandoc | 15 ++++----
+ xen/arch/x86/spec_ctrl.c          | 62 ++++++++++++++++---------------
+ 2 files changed, 40 insertions(+), 37 deletions(-)
+
+diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
+index 582d6741d1..fbf1683924 100644
+--- a/docs/misc/xen-command-line.pandoc
++++ b/docs/misc/xen-command-line.pandoc
+@@ -2370,7 +2370,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`).
+ 
+ ### spec-ctrl (x86)
+ > `= List of [ <bool>, xen=<bool>, {pv,hvm}=<bool>,
+->              {msr-sc,rsb,md-clear,ibpb-entry}=<bool>|{pv,hvm}=<bool>,
++>              {msr-sc,rsb,verw,ibpb-entry}=<bool>|{pv,hvm}=<bool>,
+ >              bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd,
+ >              eager-fpu,l1d-flush,branch-harden,srb-lock,
+ >              unpriv-mmio,gds-mit,div-scrub}=<bool> ]`
+@@ -2395,7 +2395,7 @@ in place for guests to use.
+ 
+ Use of a positive boolean value for either of these options is invalid.
+ 
+-The `pv=`, `hvm=`, `msr-sc=`, `rsb=`, `md-clear=` and `ibpb-entry=` options
++The `pv=`, `hvm=`, `msr-sc=`, `rsb=`, `verw=` and `ibpb-entry=` options
+ offer fine grained control over the primitives by Xen.  These impact Xen's
+ ability to protect itself, and/or Xen's ability to virtualise support for
+ guests to use.
+@@ -2412,11 +2412,12 @@ guests to use.
+   guests and if disabled, guests will be unable to use IBRS/STIBP/SSBD/etc.
+ * `rsb=` offers control over whether to overwrite the Return Stack Buffer /
+   Return Address Stack on entry to Xen and on idle.
+-* `md-clear=` offers control over whether to use VERW to flush
+-  microarchitectural buffers on idle and exit from Xen.  *Note: For
+-  compatibility with development versions of this fix, `mds=` is also accepted
+-  on Xen 4.12 and earlier as an alias.  Consult vendor documentation in
+-  preference to here.*
++* `verw=` offers control over whether to use VERW for its scrubbing side
++  effects at appropriate privilege transitions.  The exact side effects are
++  microarchitecture and microcode specific.  *Note: `md-clear=` is accepted as
++  a deprecated alias.  For compatibility with development versions of XSA-297,
++  `mds=` is also accepted on Xen 4.12 and earlier as an alias.  Consult vendor
++  documentation in preference to here.*
+ * `ibpb-entry=` offers control over whether IBPB (Indirect Branch Prediction
+   Barrier) is used on entry to Xen.  This is used by default on hardware
+   vulnerable to Branch Type Confusion, and hardware vulnerable to Speculative
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index a965b6db28..c42d8cdc22 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -25,8 +25,8 @@ static bool __initdata opt_msr_sc_pv = true;
+ static bool __initdata opt_msr_sc_hvm = true;
+ static int8_t __initdata opt_rsb_pv = -1;
+ static bool __initdata opt_rsb_hvm = true;
+-static int8_t __ro_after_init opt_md_clear_pv = -1;
+-static int8_t __ro_after_init opt_md_clear_hvm = -1;
++static int8_t __ro_after_init opt_verw_pv = -1;
++static int8_t __ro_after_init opt_verw_hvm = -1;
+ 
+ static int8_t __ro_after_init opt_ibpb_entry_pv = -1;
+ static int8_t __ro_after_init opt_ibpb_entry_hvm = -1;
+@@ -66,7 +66,7 @@ static bool __initdata cpu_has_bug_mds; /* Any other M{LP,SB,FB}DS combination.
+ 
+ static int8_t __initdata opt_srb_lock = -1;
+ static bool __initdata opt_unpriv_mmio;
+-static bool __ro_after_init opt_fb_clear_mmio;
++static bool __ro_after_init opt_verw_mmio;
+ static int8_t __initdata opt_gds_mit = -1;
+ static int8_t __initdata opt_div_scrub = -1;
+ 
+@@ -108,8 +108,8 @@ static int __init cf_check parse_spec_ctrl(const char *s)
+         disable_common:
+             opt_rsb_pv = false;
+             opt_rsb_hvm = false;
+-            opt_md_clear_pv = 0;
+-            opt_md_clear_hvm = 0;
++            opt_verw_pv = 0;
++            opt_verw_hvm = 0;
+             opt_ibpb_entry_pv = 0;
+             opt_ibpb_entry_hvm = 0;
+             opt_ibpb_entry_dom0 = false;
+@@ -140,14 +140,14 @@ static int __init cf_check parse_spec_ctrl(const char *s)
+         {
+             opt_msr_sc_pv = val;
+             opt_rsb_pv = val;
+-            opt_md_clear_pv = val;
++            opt_verw_pv = val;
+             opt_ibpb_entry_pv = val;
+         }
+         else if ( (val = parse_boolean("hvm", s, ss)) >= 0 )
+         {
+             opt_msr_sc_hvm = val;
+             opt_rsb_hvm = val;
+-            opt_md_clear_hvm = val;
++            opt_verw_hvm = val;
+             opt_ibpb_entry_hvm = val;
+         }
+         else if ( (val = parse_boolean("msr-sc", s, ss)) != -1 )
+@@ -192,21 +192,22 @@ static int __init cf_check parse_spec_ctrl(const char *s)
+                 break;
+             }
+         }
+-        else if ( (val = parse_boolean("md-clear", s, ss)) != -1 )
++        else if ( (val = parse_boolean("verw", s, ss)) != -1 ||
++                  (val = parse_boolean("md-clear", s, ss)) != -1 )
+         {
+             switch ( val )
+             {
+             case 0:
+             case 1:
+-                opt_md_clear_pv = opt_md_clear_hvm = val;
++                opt_verw_pv = opt_verw_hvm = val;
+                 break;
+ 
+             case -2:
+-                s += strlen("md-clear=");
++                s += (*s == 'v') ? strlen("verw=") : strlen("md-clear=");
+                 if ( (val = parse_boolean("pv", s, ss)) >= 0 )
+-                    opt_md_clear_pv = val;
++                    opt_verw_pv = val;
+                 else if ( (val = parse_boolean("hvm", s, ss)) >= 0 )
+-                    opt_md_clear_hvm = val;
++                    opt_verw_hvm = val;
+                 else
+             default:
+                     rc = -EINVAL;
+@@ -528,8 +529,8 @@ static void __init print_details(enum ind_thunk thunk)
+            opt_srb_lock                              ? " SRB_LOCK+" : " SRB_LOCK-",
+            opt_ibpb_ctxt_switch                      ? " IBPB-ctxt" : "",
+            opt_l1d_flush                             ? " L1D_FLUSH" : "",
+-           opt_md_clear_pv || opt_md_clear_hvm ||
+-           opt_fb_clear_mmio                         ? " VERW"  : "",
++           opt_verw_pv || opt_verw_hvm ||
++           opt_verw_mmio                             ? " VERW"  : "",
+            opt_div_scrub                             ? " DIV" : "",
+            opt_branch_harden                         ? " BRANCH_HARDEN" : "");
+ 
+@@ -550,13 +551,13 @@ static void __init print_details(enum ind_thunk thunk)
+             boot_cpu_has(X86_FEATURE_SC_RSB_HVM) ||
+             boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) ||
+             amd_virt_spec_ctrl ||
+-            opt_eager_fpu || opt_md_clear_hvm)       ? ""               : " None",
++            opt_eager_fpu || opt_verw_hvm)           ? ""               : " None",
+            boot_cpu_has(X86_FEATURE_SC_MSR_HVM)      ? " MSR_SPEC_CTRL" : "",
+            (boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ||
+             amd_virt_spec_ctrl)                      ? " MSR_VIRT_SPEC_CTRL" : "",
+            boot_cpu_has(X86_FEATURE_SC_RSB_HVM)      ? " RSB"           : "",
+            opt_eager_fpu                             ? " EAGER_FPU"     : "",
+-           opt_md_clear_hvm                          ? " MD_CLEAR"      : "",
++           opt_verw_hvm                              ? " VERW"          : "",
+            boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM)  ? " IBPB-entry"    : "");
+ 
+ #endif
+@@ -565,11 +566,11 @@ static void __init print_details(enum ind_thunk thunk)
+            (boot_cpu_has(X86_FEATURE_SC_MSR_PV) ||
+             boot_cpu_has(X86_FEATURE_SC_RSB_PV) ||
+             boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ||
+-            opt_eager_fpu || opt_md_clear_pv)        ? ""               : " None",
++            opt_eager_fpu || opt_verw_pv)            ? ""               : " None",
+            boot_cpu_has(X86_FEATURE_SC_MSR_PV)       ? " MSR_SPEC_CTRL" : "",
+            boot_cpu_has(X86_FEATURE_SC_RSB_PV)       ? " RSB"           : "",
+            opt_eager_fpu                             ? " EAGER_FPU"     : "",
+-           opt_md_clear_pv                           ? " MD_CLEAR"      : "",
++           opt_verw_pv                               ? " VERW"          : "",
+            boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV)   ? " IBPB-entry"    : "");
+ 
+     printk("  XPTI (64-bit PV only): Dom0 %s, DomU %s (with%s PCID)\n",
+@@ -1502,8 +1503,8 @@ void spec_ctrl_init_domain(struct domain *d)
+ {
+     bool pv = is_pv_domain(d);
+ 
+-    bool verw = ((pv ? opt_md_clear_pv : opt_md_clear_hvm) ||
+-                 (opt_fb_clear_mmio && is_iommu_enabled(d)));
++    bool verw = ((pv ? opt_verw_pv : opt_verw_hvm) ||
++                 (opt_verw_mmio && is_iommu_enabled(d)));
+ 
+     bool ibpb = ((pv ? opt_ibpb_entry_pv : opt_ibpb_entry_hvm) &&
+                  (d->domain_id != 0 || opt_ibpb_entry_dom0));
+@@ -1866,19 +1867,20 @@ void __init init_speculation_mitigations(void)
+      * the return-to-guest path.
+      */
+     if ( opt_unpriv_mmio )
+-        opt_fb_clear_mmio = cpu_has_fb_clear;
++        opt_verw_mmio = cpu_has_fb_clear;
+ 
+     /*
+      * By default, enable PV and HVM mitigations on MDS-vulnerable hardware.
+      * This will only be a token effort for MLPDS/MFBDS when HT is enabled,
+      * but it is somewhat better than nothing.
+      */
+-    if ( opt_md_clear_pv == -1 )
+-        opt_md_clear_pv = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) &&
+-                           boot_cpu_has(X86_FEATURE_MD_CLEAR));
+-    if ( opt_md_clear_hvm == -1 )
+-        opt_md_clear_hvm = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) &&
+-                            boot_cpu_has(X86_FEATURE_MD_CLEAR));
++    if ( opt_verw_pv == -1 )
++        opt_verw_pv = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) &&
++                       cpu_has_md_clear);
++
++    if ( opt_verw_hvm == -1 )
++        opt_verw_hvm = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) &&
++                        cpu_has_md_clear);
+ 
+     /*
+      * Enable MDS/MMIO defences as applicable.  The Idle blocks need using if
+@@ -1891,12 +1893,12 @@ void __init init_speculation_mitigations(void)
+      * MDS mitigations.  L1D_FLUSH is not safe for MMIO mitigations.)
+      *
+      * After calculating the appropriate idle setting, simplify
+-     * opt_md_clear_hvm to mean just "should we VERW on the way into HVM
++     * opt_verw_hvm to mean just "should we VERW on the way into HVM
+      * guests", so spec_ctrl_init_domain() can calculate suitable settings.
+      */
+-    if ( opt_md_clear_pv || opt_md_clear_hvm || opt_fb_clear_mmio )
++    if ( opt_verw_pv || opt_verw_hvm || opt_verw_mmio )
+         setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE);
+-    opt_md_clear_hvm &= !cpu_has_skip_l1dfl && !opt_l1d_flush;
++    opt_verw_hvm &= !cpu_has_skip_l1dfl && !opt_l1d_flush;
+ 
+     /*
+      * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT
+-- 
+2.44.0
+
+
+From fb85a8fc91f8cfd61d7c7f9742502b223d4024b5 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Tue, 5 Mar 2024 19:33:37 +0000
+Subject: [PATCH 62/70] x86/spec-ctrl: VERW-handling adjustments
+
+... before we add yet more complexity to this logic.  Mostly expanded
+comments, but with three minor changes.
+
+1) Introduce cpu_has_useful_md_clear to simplify later logic in this patch and
+   future ones.
+
+2) We only ever need SC_VERW_IDLE when SMT is active.  If SMT isn't active,
+   then there's no re-partition of pipeline resources based on thread-idleness
+   to worry about.
+
+3) The logic to adjust HVM VERW based on L1D_FLUSH is unmaintainable and, as
+   it turns out, wrong.  SKIP_L1DFL is just a hint bit, whereas opt_l1d_flush
+   is the relevant decision of whether to use L1D_FLUSH based on
+   susceptibility and user preference.
+
+   Rewrite the logic so it can be followed, and incorporate the fact that when
+   FB_CLEAR is visible, L1D_FLUSH isn't a safe substitution.
+
+This is part of XSA-452 / CVE-2023-28746.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit 1eb91a8a06230b4b64228c9a380194f8cfe6c5e2)
+---
+ xen/arch/x86/spec_ctrl.c | 99 +++++++++++++++++++++++++++++-----------
+ 1 file changed, 73 insertions(+), 26 deletions(-)
+
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index c42d8cdc22..a4afcd8570 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -1519,7 +1519,7 @@ void __init init_speculation_mitigations(void)
+ {
+     enum ind_thunk thunk = THUNK_DEFAULT;
+     bool has_spec_ctrl, ibrs = false, hw_smt_enabled;
+-    bool cpu_has_bug_taa, retpoline_safe;
++    bool cpu_has_bug_taa, cpu_has_useful_md_clear, retpoline_safe;
+ 
+     hw_smt_enabled = check_smt_enabled();
+ 
+@@ -1855,50 +1855,97 @@ void __init init_speculation_mitigations(void)
+             "enabled.  Please assess your configuration and choose an\n"
+             "explicit 'smt=<bool>' setting.  See XSA-273.\n");
+ 
++    /*
++     * A brief summary of VERW-related changes.
++     *
++     * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/intel-analysis-microarchitectural-data-sampling.html
++     * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/processor-mmio-stale-data-vulnerabilities.html
++     *
++     * Relevant ucodes:
++     *
++     * - May 2019, for MDS.  Introduces the MD_CLEAR CPUID bit and VERW side
++     *   effects to scrub Store/Load/Fill buffers as applicable.  MD_CLEAR
++     *   exists architecturally, even when the side effects have been removed.
++     *
++     *   Use VERW to scrub on return-to-guest.  Parts with L1D_FLUSH to
++     *   mitigate L1TF have the same side effect, so no need to do both.
++     *
++     *   Various Atoms suffer from Store-buffer sampling only.  Store buffers
++     *   are statically partitioned between non-idle threads, so scrubbing is
++     *   wanted when going idle too.
++     *
++     *   Load ports and Fill buffers are competitively shared between threads.
++     *   SMT must be disabled for VERW scrubbing to be fully effective.
++     *
++     * - November 2019, for TAA.  Extended VERW side effects to TSX-enabled
++     *   MDS_NO parts.
++     *
++     * - February 2022, for Client TSX de-feature.  Removed VERW side effects
++     *   from Client CPUs only.
++     *
++     * - May 2022, for MMIO Stale Data.  (Re)introduced Fill Buffer scrubbing
++     *   on all MMIO-affected parts which didn't already have it for MDS
++     *   reasons, enumerating FB_CLEAR on those parts only.
++     *
++     *   If FB_CLEAR is enumerated, L1D_FLUSH does not have the same scrubbing
++     *   side effects as VERW and cannot be used in its place.
++     */
+     mds_calculations();
+ 
+     /*
+-     * Parts which enumerate FB_CLEAR are those which are post-MDS_NO and have
+-     * reintroduced the VERW fill buffer flushing side effect because of a
+-     * susceptibility to FBSDP.
++     * Parts which enumerate FB_CLEAR are those with now-updated microcode
++     * which weren't susceptible to the original MFBDS (and therefore didn't
++     * have Fill Buffer scrubbing side effects to begin with, or were Client
++     * MDS_NO non-TAA_NO parts where the scrubbing was removed), but have had
++     * the scrubbing reintroduced because of a susceptibility to FBSDP.
+      *
+      * If unprivileged guests have (or will have) MMIO mappings, we can
+      * mitigate cross-domain leakage of fill buffer data by issuing VERW on
+-     * the return-to-guest path.
++     * the return-to-guest path.  This is only a token effort if SMT is
++     * active.
+      */
+     if ( opt_unpriv_mmio )
+         opt_verw_mmio = cpu_has_fb_clear;
+ 
+     /*
+-     * By default, enable PV and HVM mitigations on MDS-vulnerable hardware.
+-     * This will only be a token effort for MLPDS/MFBDS when HT is enabled,
+-     * but it is somewhat better than nothing.
++     * MD_CLEAR is enumerated architecturally forevermore, even after the
++     * scrubbing side effects have been removed.  Create ourselves an version
++     * which expressed whether we think MD_CLEAR is having any useful side
++     * effect.
++     */
++    cpu_has_useful_md_clear = (cpu_has_md_clear &&
++                               (cpu_has_bug_mds || cpu_has_bug_msbds_only));
++
++    /*
++     * By default, use VERW scrubbing on applicable hardware, if we think it's
++     * going to have an effect.  This will only be a token effort for
++     * MLPDS/MFBDS when SMT is enabled.
+      */
+     if ( opt_verw_pv == -1 )
+-        opt_verw_pv = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) &&
+-                       cpu_has_md_clear);
++        opt_verw_pv = cpu_has_useful_md_clear;
+ 
+     if ( opt_verw_hvm == -1 )
+-        opt_verw_hvm = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) &&
+-                        cpu_has_md_clear);
++        opt_verw_hvm = cpu_has_useful_md_clear;
+ 
+     /*
+-     * Enable MDS/MMIO defences as applicable.  The Idle blocks need using if
+-     * either the PV or HVM MDS defences are used, or if we may give MMIO
+-     * access to untrusted guests.
+-     *
+-     * HVM is more complicated.  The MD_CLEAR microcode extends L1D_FLUSH with
+-     * equivalent semantics to avoid needing to perform both flushes on the
+-     * HVM path.  Therefore, we don't need VERW in addition to L1D_FLUSH (for
+-     * MDS mitigations.  L1D_FLUSH is not safe for MMIO mitigations.)
+-     *
+-     * After calculating the appropriate idle setting, simplify
+-     * opt_verw_hvm to mean just "should we VERW on the way into HVM
+-     * guests", so spec_ctrl_init_domain() can calculate suitable settings.
++     * If SMT is active, and we're protecting against MDS or MMIO stale data,
++     * we need to scrub before going idle as well as on return to guest.
++     * Various pipeline resources are repartitioned amongst non-idle threads.
+      */
+-    if ( opt_verw_pv || opt_verw_hvm || opt_verw_mmio )
++    if ( ((cpu_has_useful_md_clear && (opt_verw_pv || opt_verw_hvm)) ||
++          opt_verw_mmio) && hw_smt_enabled )
+         setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE);
+-    opt_verw_hvm &= !cpu_has_skip_l1dfl && !opt_l1d_flush;
++
++    /*
++     * After calculating the appropriate idle setting, simplify opt_verw_hvm
++     * to mean just "should we VERW on the way into HVM guests", so
++     * spec_ctrl_init_domain() can calculate suitable settings.
++     *
++     * It is only safe to use L1D_FLUSH in place of VERW when MD_CLEAR is the
++     * only *_CLEAR we can see.
++     */
++    if ( opt_l1d_flush && cpu_has_md_clear && !cpu_has_fb_clear )
++        opt_verw_hvm = false;
+ 
+     /*
+      * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT
+-- 
+2.44.0
+
+
+From 908cbd1893e80eb52b92b2c70c2bfd9ffdf6f77b Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Thu, 22 Jun 2023 23:32:19 +0100
+Subject: [PATCH 63/70] x86/spec-ctrl: Mitigation Register File Data Sampling
+
+RFDS affects Atom cores, also branded E-cores, between the Goldmont and
+Gracemont microarchitectures.  This includes Alder Lake and Raptor Lake hybrid
+client systems which have a mix of Gracemont and other types of cores.
+
+Two new bits have been defined; RFDS_CLEAR to indicate VERW has more side
+effects, and RFDS_NO to indicate that the system is unaffected.  Plenty of
+unaffected CPUs won't be getting RFDS_NO retrofitted in microcode, so we
+synthesise it.  Alder Lake and Raptor Lake Xeon-E's are unaffected due to
+their platform configuration, and we must use the Hybrid CPUID bit to
+distinguish them from their non-Xeon counterparts.
+
+Like MD_CLEAR and FB_CLEAR, RFDS_CLEAR needs OR-ing across a resource pool, so
+set it in the max policies and reflect the host setting in default.
+
+This is part of XSA-452 / CVE-2023-28746.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit fb5b6f6744713410c74cfc12b7176c108e3c9a31)
+---
+ tools/misc/xen-cpuid.c                      |   5 +-
+ xen/arch/x86/cpu-policy.c                   |   5 +
+ xen/arch/x86/include/asm/cpufeature.h       |   3 +
+ xen/arch/x86/include/asm/msr-index.h        |   2 +
+ xen/arch/x86/spec_ctrl.c                    | 100 +++++++++++++++++++-
+ xen/include/public/arch-x86/cpufeatureset.h |   3 +
+ 6 files changed, 111 insertions(+), 7 deletions(-)
+
+diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c
+index 7370f1b56e..52e451a806 100644
+--- a/tools/misc/xen-cpuid.c
++++ b/tools/misc/xen-cpuid.c
+@@ -172,7 +172,7 @@ static const char *const str_7d0[32] =
+     [ 8] = "avx512-vp2intersect", [ 9] = "srbds-ctrl",
+     [10] = "md-clear",            [11] = "rtm-always-abort",
+     /* 12 */                [13] = "tsx-force-abort",
+-    [14] = "serialize",
++    [14] = "serialize",     [15] = "hybrid",
+     [16] = "tsxldtrk",
+     [18] = "pconfig",
+     [20] = "cet-ibt",
+@@ -245,7 +245,8 @@ static const char *const str_m10Al[32] =
+     [20] = "bhi-no",              [21] = "xapic-status",
+     /* 22 */                      [23] = "ovrclk-status",
+     [24] = "pbrsb-no",            [25] = "gds-ctrl",
+-    [26] = "gds-no",
++    [26] = "gds-no",              [27] = "rfds-no",
++    [28] = "rfds-clear",
+ };
+ 
+ static const char *const str_m10Ah[32] =
+diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c
+index c7c5e99b7b..12e621b97d 100644
+--- a/xen/arch/x86/cpu-policy.c
++++ b/xen/arch/x86/cpu-policy.c
+@@ -451,6 +451,7 @@ static void __init guest_common_max_feature_adjustments(uint32_t *fs)
+          */
+         __set_bit(X86_FEATURE_MD_CLEAR, fs);
+         __set_bit(X86_FEATURE_FB_CLEAR, fs);
++        __set_bit(X86_FEATURE_RFDS_CLEAR, fs);
+ 
+         /*
+          * The Gather Data Sampling microcode mitigation (August 2023) has an
+@@ -500,6 +501,10 @@ static void __init guest_common_default_feature_adjustments(uint32_t *fs)
+         if ( cpu_has_fb_clear )
+             __set_bit(X86_FEATURE_FB_CLEAR, fs);
+ 
++        __clear_bit(X86_FEATURE_RFDS_CLEAR, fs);
++        if ( cpu_has_rfds_clear )
++            __set_bit(X86_FEATURE_RFDS_CLEAR, fs);
++
+         /*
+          * The Gather Data Sampling microcode mitigation (August 2023) has an
+          * adverse performance impact on the CLWB instruction on SKX/CLX/CPX.
+diff --git a/xen/arch/x86/include/asm/cpufeature.h b/xen/arch/x86/include/asm/cpufeature.h
+index 76ef2aeb1d..3c57f55de0 100644
+--- a/xen/arch/x86/include/asm/cpufeature.h
++++ b/xen/arch/x86/include/asm/cpufeature.h
+@@ -181,6 +181,7 @@ static inline bool boot_cpu_has(unsigned int feat)
+ #define cpu_has_rtm_always_abort boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT)
+ #define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)
+ #define cpu_has_serialize       boot_cpu_has(X86_FEATURE_SERIALIZE)
++#define cpu_has_hybrid          boot_cpu_has(X86_FEATURE_HYBRID)
+ #define cpu_has_avx512_fp16     boot_cpu_has(X86_FEATURE_AVX512_FP16)
+ #define cpu_has_arch_caps       boot_cpu_has(X86_FEATURE_ARCH_CAPS)
+ 
+@@ -208,6 +209,8 @@ static inline bool boot_cpu_has(unsigned int feat)
+ #define cpu_has_rrsba           boot_cpu_has(X86_FEATURE_RRSBA)
+ #define cpu_has_gds_ctrl        boot_cpu_has(X86_FEATURE_GDS_CTRL)
+ #define cpu_has_gds_no          boot_cpu_has(X86_FEATURE_GDS_NO)
++#define cpu_has_rfds_no         boot_cpu_has(X86_FEATURE_RFDS_NO)
++#define cpu_has_rfds_clear      boot_cpu_has(X86_FEATURE_RFDS_CLEAR)
+ 
+ /* Synthesized. */
+ #define cpu_has_arch_perfmon    boot_cpu_has(X86_FEATURE_ARCH_PERFMON)
+diff --git a/xen/arch/x86/include/asm/msr-index.h b/xen/arch/x86/include/asm/msr-index.h
+index 82a81bd0a2..85ef28a612 100644
+--- a/xen/arch/x86/include/asm/msr-index.h
++++ b/xen/arch/x86/include/asm/msr-index.h
+@@ -89,6 +89,8 @@
+ #define  ARCH_CAPS_PBRSB_NO                 (_AC(1, ULL) << 24)
+ #define  ARCH_CAPS_GDS_CTRL                 (_AC(1, ULL) << 25)
+ #define  ARCH_CAPS_GDS_NO                   (_AC(1, ULL) << 26)
++#define  ARCH_CAPS_RFDS_NO                  (_AC(1, ULL) << 27)
++#define  ARCH_CAPS_RFDS_CLEAR               (_AC(1, ULL) << 28)
+ 
+ #define MSR_FLUSH_CMD                       0x0000010b
+ #define  FLUSH_CMD_L1D                      (_AC(1, ULL) <<  0)
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index a4afcd8570..8165379fed 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -12,6 +12,7 @@
+ 
+ #include <asm/amd.h>
+ #include <asm/hvm/svm/svm.h>
++#include <asm/intel-family.h>
+ #include <asm/microcode.h>
+ #include <asm/msr.h>
+ #include <asm/pv/domain.h>
+@@ -435,7 +436,7 @@ static void __init print_details(enum ind_thunk thunk)
+      * Hardware read-only information, stating immunity to certain issues, or
+      * suggestions of which mitigation to use.
+      */
+-    printk("  Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
++    printk("  Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
+            (caps & ARCH_CAPS_RDCL_NO)                        ? " RDCL_NO"        : "",
+            (caps & ARCH_CAPS_EIBRS)                          ? " EIBRS"          : "",
+            (caps & ARCH_CAPS_RSBA)                           ? " RSBA"           : "",
+@@ -451,6 +452,7 @@ static void __init print_details(enum ind_thunk thunk)
+            (caps & ARCH_CAPS_FB_CLEAR)                       ? " FB_CLEAR"       : "",
+            (caps & ARCH_CAPS_PBRSB_NO)                       ? " PBRSB_NO"       : "",
+            (caps & ARCH_CAPS_GDS_NO)                         ? " GDS_NO"         : "",
++           (caps & ARCH_CAPS_RFDS_NO)                        ? " RFDS_NO"        : "",
+            (e8b  & cpufeat_mask(X86_FEATURE_IBRS_ALWAYS))    ? " IBRS_ALWAYS"    : "",
+            (e8b  & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS))   ? " STIBP_ALWAYS"   : "",
+            (e8b  & cpufeat_mask(X86_FEATURE_IBRS_FAST))      ? " IBRS_FAST"      : "",
+@@ -461,7 +463,7 @@ static void __init print_details(enum ind_thunk thunk)
+            (e21a & cpufeat_mask(X86_FEATURE_SRSO_NO))        ? " SRSO_NO"        : "");
+ 
+     /* Hardware features which need driving to mitigate issues. */
+-    printk("  Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
++    printk("  Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
+            (e8b  & cpufeat_mask(X86_FEATURE_IBPB)) ||
+            (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB))          ? " IBPB"           : "",
+            (e8b  & cpufeat_mask(X86_FEATURE_IBRS)) ||
+@@ -479,6 +481,7 @@ static void __init print_details(enum ind_thunk thunk)
+            (caps & ARCH_CAPS_TSX_CTRL)                       ? " TSX_CTRL"       : "",
+            (caps & ARCH_CAPS_FB_CLEAR_CTRL)                  ? " FB_CLEAR_CTRL"  : "",
+            (caps & ARCH_CAPS_GDS_CTRL)                       ? " GDS_CTRL"       : "",
++           (caps & ARCH_CAPS_RFDS_CLEAR)                     ? " RFDS_CLEAR"     : "",
+            (e21a & cpufeat_mask(X86_FEATURE_SBPB))           ? " SBPB"           : "");
+ 
+     /* Compiled-in support which pertains to mitigations. */
+@@ -1347,6 +1350,83 @@ static __init void mds_calculations(void)
+     }
+ }
+ 
++/*
++ * Register File Data Sampling affects Atom cores from the Goldmont to
++ * Gracemont microarchitectures.  The March 2024 microcode adds RFDS_NO to
++ * some but not all unaffected parts, and RFDS_CLEAR to affected parts still
++ * in support.
++ *
++ * Alder Lake and Raptor Lake client CPUs have a mix of P cores
++ * (Golden/Raptor Cove, not vulnerable) and E cores (Gracemont,
++ * vulnerable), and both enumerate RFDS_CLEAR.
++ *
++ * Both exist in a Xeon SKU, which has the E cores (Gracemont) disabled by
++ * platform configuration, and enumerate RFDS_NO.
++ *
++ * With older parts, or with out-of-date microcode, synthesise RFDS_NO when
++ * safe to do so.
++ *
++ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/register-file-data-sampling.html
++ */
++static void __init rfds_calculations(void)
++{
++    /* RFDS is only known to affect Intel Family 6 processors at this time. */
++    if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
++         boot_cpu_data.x86 != 6 )
++        return;
++
++    /*
++     * If RFDS_NO or RFDS_CLEAR are visible, we've either got suitable
++     * microcode, or an RFDS-aware hypervisor is levelling us in a pool.
++     */
++    if ( cpu_has_rfds_no || cpu_has_rfds_clear )
++        return;
++
++    /* If we're virtualised, don't attempt to synthesise RFDS_NO. */
++    if ( cpu_has_hypervisor )
++        return;
++
++    /*
++     * Not all CPUs are expected to get a microcode update enumerating one of
++     * RFDS_{NO,CLEAR}, or we might have out-of-date microcode.
++     */
++    switch ( boot_cpu_data.x86_model )
++    {
++    case INTEL_FAM6_ALDERLAKE:
++    case INTEL_FAM6_RAPTORLAKE:
++        /*
++         * Alder Lake and Raptor Lake might be a client SKU (with the
++         * Gracemont cores active, and therefore vulnerable) or might be a
++         * server SKU (with the Gracemont cores disabled, and therefore not
++         * vulnerable).
++         *
++         * See if the CPU identifies as hybrid to distinguish the two cases.
++         */
++        if ( !cpu_has_hybrid )
++            break;
++        fallthrough;
++    case INTEL_FAM6_ALDERLAKE_L:
++    case INTEL_FAM6_RAPTORLAKE_P:
++    case INTEL_FAM6_RAPTORLAKE_S:
++
++    case INTEL_FAM6_ATOM_GOLDMONT:      /* Apollo Lake */
++    case INTEL_FAM6_ATOM_GOLDMONT_D:    /* Denverton */
++    case INTEL_FAM6_ATOM_GOLDMONT_PLUS: /* Gemini Lake */
++    case INTEL_FAM6_ATOM_TREMONT_D:     /* Snow Ridge / Parker Ridge */
++    case INTEL_FAM6_ATOM_TREMONT:       /* Elkhart Lake */
++    case INTEL_FAM6_ATOM_TREMONT_L:     /* Jasper Lake */
++    case INTEL_FAM6_ATOM_GRACEMONT:     /* Alder Lake N */
++        return;
++    }
++
++    /*
++     * We appear to be on an unaffected CPU which didn't enumerate RFDS_NO,
++     * perhaps because of its age or because of out-of-date microcode.
++     * Synthesise it.
++     */
++    setup_force_cpu_cap(X86_FEATURE_RFDS_NO);
++}
++
+ static bool __init cpu_has_gds(void)
+ {
+     /*
+@@ -1860,6 +1940,7 @@ void __init init_speculation_mitigations(void)
+      *
+      * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/intel-analysis-microarchitectural-data-sampling.html
+      * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/processor-mmio-stale-data-vulnerabilities.html
++     * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/register-file-data-sampling.html
+      *
+      * Relevant ucodes:
+      *
+@@ -1889,8 +1970,12 @@ void __init init_speculation_mitigations(void)
+      *
+      *   If FB_CLEAR is enumerated, L1D_FLUSH does not have the same scrubbing
+      *   side effects as VERW and cannot be used in its place.
++     *
++     * - March 2024, for RFDS.  Enumerate RFDS_CLEAR to mean that VERW now
++     *   scrubs non-architectural entries from certain register files.
+      */
+     mds_calculations();
++    rfds_calculations();
+ 
+     /*
+      * Parts which enumerate FB_CLEAR are those with now-updated microcode
+@@ -1922,15 +2007,19 @@ void __init init_speculation_mitigations(void)
+      * MLPDS/MFBDS when SMT is enabled.
+      */
+     if ( opt_verw_pv == -1 )
+-        opt_verw_pv = cpu_has_useful_md_clear;
++        opt_verw_pv = cpu_has_useful_md_clear || cpu_has_rfds_clear;
+ 
+     if ( opt_verw_hvm == -1 )
+-        opt_verw_hvm = cpu_has_useful_md_clear;
++        opt_verw_hvm = cpu_has_useful_md_clear || cpu_has_rfds_clear;
+ 
+     /*
+      * If SMT is active, and we're protecting against MDS or MMIO stale data,
+      * we need to scrub before going idle as well as on return to guest.
+      * Various pipeline resources are repartitioned amongst non-idle threads.
++     *
++     * We don't need to scrub on idle for RFDS.  There are no affected cores
++     * which support SMT, despite there being affected cores in hybrid systems
++     * which have SMT elsewhere in the platform.
+      */
+     if ( ((cpu_has_useful_md_clear && (opt_verw_pv || opt_verw_hvm)) ||
+           opt_verw_mmio) && hw_smt_enabled )
+@@ -1944,7 +2033,8 @@ void __init init_speculation_mitigations(void)
+      * It is only safe to use L1D_FLUSH in place of VERW when MD_CLEAR is the
+      * only *_CLEAR we can see.
+      */
+-    if ( opt_l1d_flush && cpu_has_md_clear && !cpu_has_fb_clear )
++    if ( opt_l1d_flush && cpu_has_md_clear && !cpu_has_fb_clear &&
++         !cpu_has_rfds_clear )
+         opt_verw_hvm = false;
+ 
+     /*
+diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h
+index 337aaa9c77..8e17ef670f 100644
+--- a/xen/include/public/arch-x86/cpufeatureset.h
++++ b/xen/include/public/arch-x86/cpufeatureset.h
+@@ -266,6 +266,7 @@ XEN_CPUFEATURE(MD_CLEAR,      9*32+10) /*!A VERW clears microarchitectural buffe
+ XEN_CPUFEATURE(RTM_ALWAYS_ABORT, 9*32+11) /*! June 2021 TSX defeaturing in microcode. */
+ XEN_CPUFEATURE(TSX_FORCE_ABORT, 9*32+13) /* MSR_TSX_FORCE_ABORT.RTM_ABORT */
+ XEN_CPUFEATURE(SERIALIZE,     9*32+14) /*A  SERIALIZE insn */
++XEN_CPUFEATURE(HYBRID,        9*32+15) /*   Heterogeneous platform */
+ XEN_CPUFEATURE(TSXLDTRK,      9*32+16) /*a  TSX load tracking suspend/resume insns */
+ XEN_CPUFEATURE(CET_IBT,       9*32+20) /*   CET - Indirect Branch Tracking */
+ XEN_CPUFEATURE(AVX512_FP16,   9*32+23) /*A  AVX512 FP16 instructions */
+@@ -338,6 +339,8 @@ XEN_CPUFEATURE(OVRCLK_STATUS,      16*32+23) /*   MSR_OVERCLOCKING_STATUS */
+ XEN_CPUFEATURE(PBRSB_NO,           16*32+24) /*A  No Post-Barrier RSB predictions */
+ XEN_CPUFEATURE(GDS_CTRL,           16*32+25) /*   MCU_OPT_CTRL.GDS_MIT_{DIS,LOCK} */
+ XEN_CPUFEATURE(GDS_NO,             16*32+26) /*A  No Gather Data Sampling */
++XEN_CPUFEATURE(RFDS_NO,            16*32+27) /*A  No Register File Data Sampling */
++XEN_CPUFEATURE(RFDS_CLEAR,         16*32+28) /*!A Register File(s) cleared by VERW */
+ 
+ /* Intel-defined CPU features, MSR_ARCH_CAPS 0x10a.edx, word 17 */
+ 
+-- 
+2.44.0
+
+
+From bdda600406e5f5c35bcb17b2f9458e2138d7ad46 Mon Sep 17 00:00:00 2001
+From: Andrew Cooper <andrew.cooper3@citrix.com>
+Date: Fri, 2 Feb 2024 00:39:42 +0000
+Subject: [PATCH 64/70] xen: Swap order of actions in the FREE*() macros
+
+Wherever possible, it is a good idea to NULL out the visible reference to an
+object prior to freeing it.  The FREE*() macros already collect together both
+parts, making it easy to adjust.
+
+This has a marginal code generation improvement, as some of the calls to the
+free() function can be tailcall optimised.
+
+No functional change.
+
+Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
+Acked-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit c4f427ec879e7c0df6d44d02561e8bee838a293e)
+---
+ xen/include/xen/mm.h      | 3 ++-
+ xen/include/xen/xmalloc.h | 7 ++++---
+ 2 files changed, 6 insertions(+), 4 deletions(-)
+
+diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h
+index 8b9618609f..8bc5f4249d 100644
+--- a/xen/include/xen/mm.h
++++ b/xen/include/xen/mm.h
+@@ -91,8 +91,9 @@ bool scrub_free_pages(void);
+ 
+ /* Free an allocation, and zero the pointer to it. */
+ #define FREE_XENHEAP_PAGES(p, o) do { \
+-    free_xenheap_pages(p, o);         \
++    void *_ptr_ = (p);                \
+     (p) = NULL;                       \
++    free_xenheap_pages(_ptr_, o);     \
+ } while ( false )
+ #define FREE_XENHEAP_PAGE(p) FREE_XENHEAP_PAGES(p, 0)
+ 
+diff --git a/xen/include/xen/xmalloc.h b/xen/include/xen/xmalloc.h
+index 16979a117c..d857298011 100644
+--- a/xen/include/xen/xmalloc.h
++++ b/xen/include/xen/xmalloc.h
+@@ -66,9 +66,10 @@
+ extern void xfree(void *);
+ 
+ /* Free an allocation, and zero the pointer to it. */
+-#define XFREE(p) do { \
+-    xfree(p);         \
+-    (p) = NULL;       \
++#define XFREE(p) do {                       \
++    void *_ptr_ = (p);                      \
++    (p) = NULL;                             \
++    xfree(_ptr_);                           \
+ } while ( false )
+ 
+ /* Underlying functions */
+-- 
+2.44.0
+
+
+From 1932973ac9a8c28197ebb24749c73c18cf23f5f1 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 13 Feb 2024 13:08:05 +0100
+Subject: [PATCH 65/70] x86/spinlock: introduce support for blocking
+ speculation into critical regions
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Introduce a new Kconfig option to block speculation into lock protected
+critical regions.  The Kconfig option is enabled by default, but the mitigation
+won't be engaged unless it's explicitly enabled in the command line using
+`spec-ctrl=lock-harden`.
+
+Convert the spinlock acquire macros into always-inline functions, and introduce
+a speculation barrier after the lock has been taken.  Note the speculation
+barrier is not placed inside the implementation of the spin lock functions, as
+to prevent speculation from falling through the call to the lock functions
+resulting in the barrier also being skipped.
+
+trylock variants are protected using a construct akin to the existing
+evaluate_nospec().
+
+This patch only implements the speculation barrier for x86.
+
+Note spin locks are the only locking primitive taken care of in this change,
+further locking primitives will be adjusted by separate changes.
+
+This is part of XSA-453 / CVE-2024-2193
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit 7ef0084418e188d05f338c3e028fbbe8b6924afa)
+---
+ docs/misc/xen-command-line.pandoc      |  7 ++++-
+ xen/arch/x86/include/asm/cpufeatures.h |  2 +-
+ xen/arch/x86/include/asm/nospec.h      | 26 ++++++++++++++++++
+ xen/arch/x86/spec_ctrl.c               | 26 +++++++++++++++---
+ xen/common/Kconfig                     | 17 ++++++++++++
+ xen/include/xen/nospec.h               | 15 +++++++++++
+ xen/include/xen/spinlock.h             | 37 +++++++++++++++++++++-----
+ 7 files changed, 119 insertions(+), 11 deletions(-)
+
+diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc
+index fbf1683924..3f9f916718 100644
+--- a/docs/misc/xen-command-line.pandoc
++++ b/docs/misc/xen-command-line.pandoc
+@@ -2373,7 +2373,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`).
+ >              {msr-sc,rsb,verw,ibpb-entry}=<bool>|{pv,hvm}=<bool>,
+ >              bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd,
+ >              eager-fpu,l1d-flush,branch-harden,srb-lock,
+->              unpriv-mmio,gds-mit,div-scrub}=<bool> ]`
++>              unpriv-mmio,gds-mit,div-scrub,lock-harden}=<bool> ]`
+ 
+ Controls for speculative execution sidechannel mitigations.  By default, Xen
+ will pick the most appropriate mitigations based on compiled in support,
+@@ -2500,6 +2500,11 @@ On all hardware, the `div-scrub=` option can be used to force or prevent Xen
+ from mitigating the DIV-leakage vulnerability.  By default, Xen will mitigate
+ DIV-leakage on hardware believed to be vulnerable.
+ 
++If Xen is compiled with `CONFIG_SPECULATIVE_HARDEN_LOCK`, the `lock-harden=`
++boolean can be used to force or prevent Xen from using speculation barriers to
++protect lock critical regions.  This mitigation won't be engaged by default,
++and needs to be explicitly enabled on the command line.
++
+ ### sync_console
+ > `= <boolean>`
+ 
+diff --git a/xen/arch/x86/include/asm/cpufeatures.h b/xen/arch/x86/include/asm/cpufeatures.h
+index c3aad21c3b..7e8221fd85 100644
+--- a/xen/arch/x86/include/asm/cpufeatures.h
++++ b/xen/arch/x86/include/asm/cpufeatures.h
+@@ -24,7 +24,7 @@ XEN_CPUFEATURE(APERFMPERF,        X86_SYNTH( 8)) /* APERFMPERF */
+ XEN_CPUFEATURE(MFENCE_RDTSC,      X86_SYNTH( 9)) /* MFENCE synchronizes RDTSC */
+ XEN_CPUFEATURE(XEN_SMEP,          X86_SYNTH(10)) /* SMEP gets used by Xen itself */
+ XEN_CPUFEATURE(XEN_SMAP,          X86_SYNTH(11)) /* SMAP gets used by Xen itself */
+-/* Bit 12 unused. */
++XEN_CPUFEATURE(SC_NO_LOCK_HARDEN, X86_SYNTH(12)) /* (Disable) Lock critical region hardening */
+ XEN_CPUFEATURE(IND_THUNK_LFENCE,  X86_SYNTH(13)) /* Use IND_THUNK_LFENCE */
+ XEN_CPUFEATURE(IND_THUNK_JMP,     X86_SYNTH(14)) /* Use IND_THUNK_JMP */
+ XEN_CPUFEATURE(SC_NO_BRANCH_HARDEN, X86_SYNTH(15)) /* (Disable) Conditional branch hardening */
+diff --git a/xen/arch/x86/include/asm/nospec.h b/xen/arch/x86/include/asm/nospec.h
+index 7150e76b87..0725839e19 100644
+--- a/xen/arch/x86/include/asm/nospec.h
++++ b/xen/arch/x86/include/asm/nospec.h
+@@ -38,6 +38,32 @@ static always_inline void block_speculation(void)
+     barrier_nospec_true();
+ }
+ 
++static always_inline void arch_block_lock_speculation(void)
++{
++    alternative("lfence", "", X86_FEATURE_SC_NO_LOCK_HARDEN);
++}
++
++/* Allow to insert a read memory barrier into conditionals */
++static always_inline bool barrier_lock_true(void)
++{
++    alternative("lfence #nospec-true", "", X86_FEATURE_SC_NO_LOCK_HARDEN);
++    return true;
++}
++
++static always_inline bool barrier_lock_false(void)
++{
++    alternative("lfence #nospec-false", "", X86_FEATURE_SC_NO_LOCK_HARDEN);
++    return false;
++}
++
++static always_inline bool arch_lock_evaluate_nospec(bool condition)
++{
++    if ( condition )
++        return barrier_lock_true();
++    else
++        return barrier_lock_false();
++}
++
+ #endif /* _ASM_X86_NOSPEC_H */
+ 
+ /*
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
+index 8165379fed..5dfc4ed69e 100644
+--- a/xen/arch/x86/spec_ctrl.c
++++ b/xen/arch/x86/spec_ctrl.c
+@@ -53,6 +53,7 @@ int8_t __read_mostly opt_eager_fpu = -1;
+ int8_t __read_mostly opt_l1d_flush = -1;
+ static bool __initdata opt_branch_harden =
+     IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH);
++static bool __initdata opt_lock_harden;
+ 
+ bool __initdata bsp_delay_spec_ctrl;
+ uint8_t __read_mostly default_xen_spec_ctrl;
+@@ -121,6 +122,7 @@ static int __init cf_check parse_spec_ctrl(const char *s)
+             opt_ssbd = false;
+             opt_l1d_flush = 0;
+             opt_branch_harden = false;
++            opt_lock_harden = false;
+             opt_srb_lock = 0;
+             opt_unpriv_mmio = false;
+             opt_gds_mit = 0;
+@@ -286,6 +288,16 @@ static int __init cf_check parse_spec_ctrl(const char *s)
+                 rc = -EINVAL;
+             }
+         }
++        else if ( (val = parse_boolean("lock-harden", s, ss)) >= 0 )
++        {
++            if ( IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_LOCK) )
++                opt_lock_harden = val;
++            else
++            {
++                no_config_param("SPECULATIVE_HARDEN_LOCK", "spec-ctrl", s, ss);
++                rc = -EINVAL;
++            }
++        }
+         else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 )
+             opt_srb_lock = val;
+         else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 )
+@@ -488,7 +500,8 @@ static void __init print_details(enum ind_thunk thunk)
+     if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) ||
+          IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_ARRAY) ||
+          IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH) ||
+-         IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS) )
++         IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS) ||
++         IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_LOCK) )
+         printk("  Compiled-in support:"
+ #ifdef CONFIG_INDIRECT_THUNK
+                " INDIRECT_THUNK"
+@@ -504,11 +517,14 @@ static void __init print_details(enum ind_thunk thunk)
+ #endif
+ #ifdef CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS
+                " HARDEN_GUEST_ACCESS"
++#endif
++#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK
++               " HARDEN_LOCK"
+ #endif
+                "\n");
+ 
+     /* Settings for Xen's protection, irrespective of guests. */
+-    printk("  Xen settings: %s%sSPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n",
++    printk("  Xen settings: %s%sSPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s%s\n",
+            thunk != THUNK_NONE      ? "BTI-Thunk: " : "",
+            thunk == THUNK_NONE      ? "" :
+            thunk == THUNK_RETPOLINE ? "RETPOLINE, " :
+@@ -535,7 +551,8 @@ static void __init print_details(enum ind_thunk thunk)
+            opt_verw_pv || opt_verw_hvm ||
+            opt_verw_mmio                             ? " VERW"  : "",
+            opt_div_scrub                             ? " DIV" : "",
+-           opt_branch_harden                         ? " BRANCH_HARDEN" : "");
++           opt_branch_harden                         ? " BRANCH_HARDEN" : "",
++           opt_lock_harden                           ? " LOCK_HARDEN" : "");
+ 
+     /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */
+     if ( cpu_has_bug_l1tf || opt_pv_l1tf_hwdom || opt_pv_l1tf_domu )
+@@ -1918,6 +1935,9 @@ void __init init_speculation_mitigations(void)
+     if ( !opt_branch_harden )
+         setup_force_cpu_cap(X86_FEATURE_SC_NO_BRANCH_HARDEN);
+ 
++    if ( !opt_lock_harden )
++        setup_force_cpu_cap(X86_FEATURE_SC_NO_LOCK_HARDEN);
++
+     /*
+      * We do not disable HT by default on affected hardware.
+      *
+diff --git a/xen/common/Kconfig b/xen/common/Kconfig
+index 4d6fe05164..3361a6d892 100644
+--- a/xen/common/Kconfig
++++ b/xen/common/Kconfig
+@@ -188,6 +188,23 @@ config SPECULATIVE_HARDEN_GUEST_ACCESS
+ 
+ 	  If unsure, say Y.
+ 
++config SPECULATIVE_HARDEN_LOCK
++	bool "Speculative lock context hardening"
++	default y
++	depends on X86
++	help
++	  Contemporary processors may use speculative execution as a
++	  performance optimisation, but this can potentially be abused by an
++	  attacker to leak data via speculative sidechannels.
++
++	  One source of data leakage is via speculative accesses to lock
++	  critical regions.
++
++	  This option is disabled by default at run time, and needs to be
++	  enabled on the command line.
++
++	  If unsure, say Y.
++
+ endmenu
+ 
+ config DIT_DEFAULT
+diff --git a/xen/include/xen/nospec.h b/xen/include/xen/nospec.h
+index 76255bc46e..4552846403 100644
+--- a/xen/include/xen/nospec.h
++++ b/xen/include/xen/nospec.h
+@@ -70,6 +70,21 @@ static inline unsigned long array_index_mask_nospec(unsigned long index,
+ #define array_access_nospec(array, index)                               \
+     (array)[array_index_nospec(index, ARRAY_SIZE(array))]
+ 
++static always_inline void block_lock_speculation(void)
++{
++#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK
++    arch_block_lock_speculation();
++#endif
++}
++
++static always_inline bool lock_evaluate_nospec(bool condition)
++{
++#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK
++    return arch_lock_evaluate_nospec(condition);
++#endif
++    return condition;
++}
++
+ #endif /* XEN_NOSPEC_H */
+ 
+ /*
+diff --git a/xen/include/xen/spinlock.h b/xen/include/xen/spinlock.h
+index e7a1c1aa89..28fce5615e 100644
+--- a/xen/include/xen/spinlock.h
++++ b/xen/include/xen/spinlock.h
+@@ -1,6 +1,7 @@
+ #ifndef __SPINLOCK_H__
+ #define __SPINLOCK_H__
+ 
++#include <xen/nospec.h>
+ #include <xen/time.h>
+ #include <xen/types.h>
+ 
+@@ -195,13 +196,30 @@ int _spin_trylock_recursive(spinlock_t *lock);
+ void _spin_lock_recursive(spinlock_t *lock);
+ void _spin_unlock_recursive(spinlock_t *lock);
+ 
+-#define spin_lock(l)                  _spin_lock(l)
+-#define spin_lock_cb(l, c, d)         _spin_lock_cb(l, c, d)
+-#define spin_lock_irq(l)              _spin_lock_irq(l)
++static always_inline void spin_lock(spinlock_t *l)
++{
++    _spin_lock(l);
++    block_lock_speculation();
++}
++
++static always_inline void spin_lock_cb(spinlock_t *l, void (*c)(void *data),
++                                       void *d)
++{
++    _spin_lock_cb(l, c, d);
++    block_lock_speculation();
++}
++
++static always_inline void spin_lock_irq(spinlock_t *l)
++{
++    _spin_lock_irq(l);
++    block_lock_speculation();
++}
++
+ #define spin_lock_irqsave(l, f)                                 \
+     ({                                                          \
+         BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long));       \
+         ((f) = _spin_lock_irqsave(l));                          \
++        block_lock_speculation();                               \
+     })
+ 
+ #define spin_unlock(l)                _spin_unlock(l)
+@@ -209,7 +227,7 @@ void _spin_unlock_recursive(spinlock_t *lock);
+ #define spin_unlock_irqrestore(l, f)  _spin_unlock_irqrestore(l, f)
+ 
+ #define spin_is_locked(l)             _spin_is_locked(l)
+-#define spin_trylock(l)               _spin_trylock(l)
++#define spin_trylock(l)               lock_evaluate_nospec(_spin_trylock(l))
+ 
+ #define spin_trylock_irqsave(lock, flags)       \
+ ({                                              \
+@@ -230,8 +248,15 @@ void _spin_unlock_recursive(spinlock_t *lock);
+  * are any critical regions that cannot form part of such a set, they can use
+  * standard spin_[un]lock().
+  */
+-#define spin_trylock_recursive(l)     _spin_trylock_recursive(l)
+-#define spin_lock_recursive(l)        _spin_lock_recursive(l)
++#define spin_trylock_recursive(l) \
++    lock_evaluate_nospec(_spin_trylock_recursive(l))
++
++static always_inline void spin_lock_recursive(spinlock_t *l)
++{
++    _spin_lock_recursive(l);
++    block_lock_speculation();
++}
++
+ #define spin_unlock_recursive(l)      _spin_unlock_recursive(l)
+ 
+ #endif /* __SPINLOCK_H__ */
+-- 
+2.44.0
+
+
+From e7f0f11c888757e62940ded87b4ab5ebc992764f Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 13 Feb 2024 16:08:52 +0100
+Subject: [PATCH 66/70] rwlock: introduce support for blocking speculation into
+ critical regions
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Introduce inline wrappers as required and add direct calls to
+block_lock_speculation() in order to prevent speculation into the rwlock
+protected critical regions.
+
+Note the rwlock primitives are adjusted to use the non speculation safe variants
+of the spinlock handlers, as a speculation barrier is added in the rwlock
+calling wrappers.
+
+trylock variants are protected by using lock_evaluate_nospec().
+
+This is part of XSA-453 / CVE-2024-2193
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit a1fb15f61692b1fa9945fc51f55471ace49cdd59)
+---
+ xen/common/rwlock.c      | 14 +++++++++++---
+ xen/include/xen/rwlock.h | 34 ++++++++++++++++++++++++++++------
+ 2 files changed, 39 insertions(+), 9 deletions(-)
+
+diff --git a/xen/common/rwlock.c b/xen/common/rwlock.c
+index 18224a4bb5..290602936d 100644
+--- a/xen/common/rwlock.c
++++ b/xen/common/rwlock.c
+@@ -34,8 +34,11 @@ void queue_read_lock_slowpath(rwlock_t *lock)
+ 
+     /*
+      * Put the reader into the wait queue.
++     *
++     * Use the speculation unsafe helper, as it's the caller's responsibility
++     * to issue a speculation barrier if required.
+      */
+-    spin_lock(&lock->lock);
++    _spin_lock(&lock->lock);
+ 
+     /*
+      * At the head of the wait queue now, wait until the writer state
+@@ -66,8 +69,13 @@ void queue_write_lock_slowpath(rwlock_t *lock)
+ {
+     u32 cnts;
+ 
+-    /* Put the writer into the wait queue. */
+-    spin_lock(&lock->lock);
++    /*
++     * Put the writer into the wait queue.
++     *
++     * Use the speculation unsafe helper, as it's the caller's responsibility
++     * to issue a speculation barrier if required.
++     */
++    _spin_lock(&lock->lock);
+ 
+     /* Try to acquire the lock directly if no reader is present. */
+     if ( !atomic_read(&lock->cnts) &&
+diff --git a/xen/include/xen/rwlock.h b/xen/include/xen/rwlock.h
+index e0d2b41c5c..9a0d3ec238 100644
+--- a/xen/include/xen/rwlock.h
++++ b/xen/include/xen/rwlock.h
+@@ -259,27 +259,49 @@ static inline int _rw_is_write_locked(const rwlock_t *lock)
+     return (atomic_read(&lock->cnts) & _QW_WMASK) == _QW_LOCKED;
+ }
+ 
+-#define read_lock(l)                  _read_lock(l)
+-#define read_lock_irq(l)              _read_lock_irq(l)
++static always_inline void read_lock(rwlock_t *l)
++{
++    _read_lock(l);
++    block_lock_speculation();
++}
++
++static always_inline void read_lock_irq(rwlock_t *l)
++{
++    _read_lock_irq(l);
++    block_lock_speculation();
++}
++
+ #define read_lock_irqsave(l, f)                                 \
+     ({                                                          \
+         BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long));       \
+         ((f) = _read_lock_irqsave(l));                          \
++        block_lock_speculation();                               \
+     })
+ 
+ #define read_unlock(l)                _read_unlock(l)
+ #define read_unlock_irq(l)            _read_unlock_irq(l)
+ #define read_unlock_irqrestore(l, f)  _read_unlock_irqrestore(l, f)
+-#define read_trylock(l)               _read_trylock(l)
++#define read_trylock(l)               lock_evaluate_nospec(_read_trylock(l))
++
++static always_inline void write_lock(rwlock_t *l)
++{
++    _write_lock(l);
++    block_lock_speculation();
++}
++
++static always_inline void write_lock_irq(rwlock_t *l)
++{
++    _write_lock_irq(l);
++    block_lock_speculation();
++}
+ 
+-#define write_lock(l)                 _write_lock(l)
+-#define write_lock_irq(l)             _write_lock_irq(l)
+ #define write_lock_irqsave(l, f)                                \
+     ({                                                          \
+         BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long));       \
+         ((f) = _write_lock_irqsave(l));                         \
++        block_lock_speculation();                               \
+     })
+-#define write_trylock(l)              _write_trylock(l)
++#define write_trylock(l)              lock_evaluate_nospec(_write_trylock(l))
+ 
+ #define write_unlock(l)               _write_unlock(l)
+ #define write_unlock_irq(l)           _write_unlock_irq(l)
+-- 
+2.44.0
+
+
+From 5a13c81542a163718d7cb9b150b0282b7855efde Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Tue, 13 Feb 2024 17:57:38 +0100
+Subject: [PATCH 67/70] percpu-rwlock: introduce support for blocking
+ speculation into critical regions
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Add direct calls to block_lock_speculation() where required in order to prevent
+speculation into the lock protected critical regions.  Also convert
+_percpu_read_lock() from inline to always_inline.
+
+Note that _percpu_write_lock() has been modified to use the non speculation
+safe variants of the locking primitives, as a speculation barrier is added
+unconditionally by the calling wrapper.
+
+This is part of XSA-453 / CVE-2024-2193
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit f218daf6d3a3b847736d37c6a6b76031a0d08441)
+---
+ xen/common/rwlock.c      |  6 +++++-
+ xen/include/xen/rwlock.h | 14 ++++++++++----
+ 2 files changed, 15 insertions(+), 5 deletions(-)
+
+diff --git a/xen/common/rwlock.c b/xen/common/rwlock.c
+index 290602936d..f5a249bcc2 100644
+--- a/xen/common/rwlock.c
++++ b/xen/common/rwlock.c
+@@ -129,8 +129,12 @@ void _percpu_write_lock(percpu_rwlock_t **per_cpudata,
+     /*
+      * First take the write lock to protect against other writers or slow
+      * path readers.
++     *
++     * Note we use the speculation unsafe variant of write_lock(), as the
++     * calling wrapper already adds a speculation barrier after the lock has
++     * been taken.
+      */
+-    write_lock(&percpu_rwlock->rwlock);
++    _write_lock(&percpu_rwlock->rwlock);
+ 
+     /* Now set the global variable so that readers start using read_lock. */
+     percpu_rwlock->writer_activating = 1;
+diff --git a/xen/include/xen/rwlock.h b/xen/include/xen/rwlock.h
+index 9a0d3ec238..9e35ee2edf 100644
+--- a/xen/include/xen/rwlock.h
++++ b/xen/include/xen/rwlock.h
+@@ -338,8 +338,8 @@ static inline void _percpu_rwlock_owner_check(percpu_rwlock_t **per_cpudata,
+ #define percpu_rwlock_resource_init(l, owner) \
+     (*(l) = (percpu_rwlock_t)PERCPU_RW_LOCK_UNLOCKED(&get_per_cpu_var(owner)))
+ 
+-static inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata,
+-                                         percpu_rwlock_t *percpu_rwlock)
++static always_inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata,
++                                            percpu_rwlock_t *percpu_rwlock)
+ {
+     /* Validate the correct per_cpudata variable has been provided. */
+     _percpu_rwlock_owner_check(per_cpudata, percpu_rwlock);
+@@ -374,6 +374,8 @@ static inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata,
+     }
+     else
+     {
++        /* Other branch already has a speculation barrier in read_lock(). */
++        block_lock_speculation();
+         /* All other paths have implicit check_lock() calls via read_lock(). */
+         check_lock(&percpu_rwlock->rwlock.lock.debug, false);
+     }
+@@ -430,8 +432,12 @@ static inline void _percpu_write_unlock(percpu_rwlock_t **per_cpudata,
+     _percpu_read_lock(&get_per_cpu_var(percpu), lock)
+ #define percpu_read_unlock(percpu, lock) \
+     _percpu_read_unlock(&get_per_cpu_var(percpu), lock)
+-#define percpu_write_lock(percpu, lock) \
+-    _percpu_write_lock(&get_per_cpu_var(percpu), lock)
++
++#define percpu_write_lock(percpu, lock)                 \
++({                                                      \
++    _percpu_write_lock(&get_per_cpu_var(percpu), lock); \
++    block_lock_speculation();                           \
++})
+ #define percpu_write_unlock(percpu, lock) \
+     _percpu_write_unlock(&get_per_cpu_var(percpu), lock)
+ 
+-- 
+2.44.0
+
+
+From 9de8a52b0e09a2491736abbd4a865a06ac2ced7a Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Mon, 4 Mar 2024 14:29:36 +0100
+Subject: [PATCH 68/70] locking: attempt to ensure lock wrappers are always
+ inline
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This is done in order to prevent the locking speculation barriers from being
+inside of `call`ed functions that could be speculatively bypassed.
+
+While there also add an extra locking barrier to _mm_write_lock() in the branch
+taken when the lock is already held.
+
+Note some functions are switched to use the unsafe variants (without speculation
+barrier) of the locking primitives, but a speculation barrier is always added
+to the exposed public lock wrapping helper.  That's the case with
+sched_spin_lock_double() or pcidevs_lock() for example.
+
+This is part of XSA-453 / CVE-2024-2193
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit 197ecd838a2aaf959a469df3696d4559c4f8b762)
+---
+ xen/arch/x86/hvm/vpt.c         | 10 +++++++---
+ xen/arch/x86/include/asm/irq.h |  1 +
+ xen/arch/x86/mm/mm-locks.h     | 28 +++++++++++++++-------------
+ xen/arch/x86/mm/p2m-pod.c      |  2 +-
+ xen/common/event_channel.c     |  5 +++--
+ xen/common/grant_table.c       |  6 +++---
+ xen/common/sched/core.c        | 19 ++++++++++++-------
+ xen/common/sched/private.h     | 26 ++++++++++++++++++++++++--
+ xen/common/timer.c             |  8 +++++---
+ xen/drivers/passthrough/pci.c  |  5 +++--
+ xen/include/xen/event.h        |  4 ++--
+ xen/include/xen/pci.h          |  8 ++++++--
+ 12 files changed, 82 insertions(+), 40 deletions(-)
+
+diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c
+index 8f53e88d67..e1d6845a28 100644
+--- a/xen/arch/x86/hvm/vpt.c
++++ b/xen/arch/x86/hvm/vpt.c
+@@ -150,7 +150,7 @@ static int pt_irq_masked(struct periodic_time *pt)
+  * pt->vcpu field, because another thread holding the pt_migrate lock
+  * may already be spinning waiting for your vcpu lock.
+  */
+-static void pt_vcpu_lock(struct vcpu *v)
++static always_inline void pt_vcpu_lock(struct vcpu *v)
+ {
+     spin_lock(&v->arch.hvm.tm_lock);
+ }
+@@ -169,9 +169,13 @@ static void pt_vcpu_unlock(struct vcpu *v)
+  * need to take an additional lock that protects against pt->vcpu
+  * changing.
+  */
+-static void pt_lock(struct periodic_time *pt)
++static always_inline void pt_lock(struct periodic_time *pt)
+ {
+-    read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
++    /*
++     * Use the speculation unsafe variant for the first lock, as the following
++     * lock taking helper already includes a speculation barrier.
++     */
++    _read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate);
+     spin_lock(&pt->vcpu->arch.hvm.tm_lock);
+ }
+ 
+diff --git a/xen/arch/x86/include/asm/irq.h b/xen/arch/x86/include/asm/irq.h
+index a87af47ece..465ab39bb0 100644
+--- a/xen/arch/x86/include/asm/irq.h
++++ b/xen/arch/x86/include/asm/irq.h
+@@ -174,6 +174,7 @@ void cf_check irq_complete_move(struct irq_desc *desc);
+ 
+ extern struct irq_desc *irq_desc;
+ 
++/* Not speculation safe, only used for AP bringup. */
+ void lock_vector_lock(void);
+ void unlock_vector_lock(void);
+ 
+diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h
+index 5a3f96fbaa..5ec080c02f 100644
+--- a/xen/arch/x86/mm/mm-locks.h
++++ b/xen/arch/x86/mm/mm-locks.h
+@@ -74,8 +74,8 @@ static inline void _set_lock_level(int l)
+     this_cpu(mm_lock_level) = l;
+ }
+ 
+-static inline void _mm_lock(const struct domain *d, mm_lock_t *l,
+-                            const char *func, int level, int rec)
++static always_inline void _mm_lock(const struct domain *d, mm_lock_t *l,
++                                   const char *func, int level, int rec)
+ {
+     if ( !((mm_locked_by_me(l)) && rec) )
+         _check_lock_level(d, level);
+@@ -125,8 +125,8 @@ static inline int mm_write_locked_by_me(mm_rwlock_t *l)
+     return (l->locker == get_processor_id());
+ }
+ 
+-static inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l,
+-                                  const char *func, int level)
++static always_inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l,
++                                         const char *func, int level)
+ {
+     if ( !mm_write_locked_by_me(l) )
+     {
+@@ -137,6 +137,8 @@ static inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l,
+         l->unlock_level = _get_lock_level();
+         _set_lock_level(_lock_level(d, level));
+     }
++    else
++        block_speculation();
+     l->recurse_count++;
+ }
+ 
+@@ -150,8 +152,8 @@ static inline void mm_write_unlock(mm_rwlock_t *l)
+     percpu_write_unlock(p2m_percpu_rwlock, &l->lock);
+ }
+ 
+-static inline void _mm_read_lock(const struct domain *d, mm_rwlock_t *l,
+-                                 int level)
++static always_inline void _mm_read_lock(const struct domain *d, mm_rwlock_t *l,
++                                        int level)
+ {
+     _check_lock_level(d, level);
+     percpu_read_lock(p2m_percpu_rwlock, &l->lock);
+@@ -166,15 +168,15 @@ static inline void mm_read_unlock(mm_rwlock_t *l)
+ 
+ /* This wrapper uses the line number to express the locking order below */
+ #define declare_mm_lock(name)                                                 \
+-    static inline void mm_lock_##name(const struct domain *d, mm_lock_t *l,   \
+-                                      const char *func, int rec)              \
++    static always_inline void mm_lock_##name(                                 \
++        const struct domain *d, mm_lock_t *l, const char *func, int rec)      \
+     { _mm_lock(d, l, func, MM_LOCK_ORDER_##name, rec); }
+ #define declare_mm_rwlock(name)                                               \
+-    static inline void mm_write_lock_##name(const struct domain *d,           \
+-                                            mm_rwlock_t *l, const char *func) \
++    static always_inline void mm_write_lock_##name(                           \
++        const struct domain *d, mm_rwlock_t *l, const char *func)             \
+     { _mm_write_lock(d, l, func, MM_LOCK_ORDER_##name); }                     \
+-    static inline void mm_read_lock_##name(const struct domain *d,            \
+-                                           mm_rwlock_t *l)                    \
++    static always_inline void mm_read_lock_##name(const struct domain *d,     \
++                                                  mm_rwlock_t *l)             \
+     { _mm_read_lock(d, l, MM_LOCK_ORDER_##name); }
+ /* These capture the name of the calling function */
+ #define mm_lock(name, d, l) mm_lock_##name(d, l, __func__, 0)
+@@ -309,7 +311,7 @@ declare_mm_lock(altp2mlist)
+ #define MM_LOCK_ORDER_altp2m                 40
+ declare_mm_rwlock(altp2m);
+ 
+-static inline void p2m_lock(struct p2m_domain *p)
++static always_inline void p2m_lock(struct p2m_domain *p)
+ {
+     if ( p2m_is_altp2m(p) )
+         mm_write_lock(altp2m, p->domain, &p->lock);
+diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c
+index 9969eb45fa..9be67b63ce 100644
+--- a/xen/arch/x86/mm/p2m-pod.c
++++ b/xen/arch/x86/mm/p2m-pod.c
+@@ -24,7 +24,7 @@
+ #define superpage_aligned(_x)  (((_x)&(SUPERPAGE_PAGES-1))==0)
+ 
+ /* Enforce lock ordering when grabbing the "external" page_alloc lock */
+-static inline void lock_page_alloc(struct p2m_domain *p2m)
++static always_inline void lock_page_alloc(struct p2m_domain *p2m)
+ {
+     page_alloc_mm_pre_lock(p2m->domain);
+     spin_lock(&(p2m->domain->page_alloc_lock));
+diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c
+index a7a004a084..66f924a7b0 100644
+--- a/xen/common/event_channel.c
++++ b/xen/common/event_channel.c
+@@ -45,7 +45,7 @@
+  * just assume the event channel is free or unbound at the moment when the
+  * evtchn_read_trylock() returns false.
+  */
+-static inline void evtchn_write_lock(struct evtchn *evtchn)
++static always_inline void evtchn_write_lock(struct evtchn *evtchn)
+ {
+     write_lock(&evtchn->lock);
+ 
+@@ -351,7 +351,8 @@ int evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc, evtchn_port_t port)
+     return rc;
+ }
+ 
+-static void double_evtchn_lock(struct evtchn *lchn, struct evtchn *rchn)
++static always_inline void double_evtchn_lock(struct evtchn *lchn,
++                                             struct evtchn *rchn)
+ {
+     ASSERT(lchn != rchn);
+ 
+diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
+index 89b7811c51..934924cbda 100644
+--- a/xen/common/grant_table.c
++++ b/xen/common/grant_table.c
+@@ -403,7 +403,7 @@ static inline void act_set_gfn(struct active_grant_entry *act, gfn_t gfn)
+ 
+ static DEFINE_PERCPU_RWLOCK_GLOBAL(grant_rwlock);
+ 
+-static inline void grant_read_lock(struct grant_table *gt)
++static always_inline void grant_read_lock(struct grant_table *gt)
+ {
+     percpu_read_lock(grant_rwlock, &gt->lock);
+ }
+@@ -413,7 +413,7 @@ static inline void grant_read_unlock(struct grant_table *gt)
+     percpu_read_unlock(grant_rwlock, &gt->lock);
+ }
+ 
+-static inline void grant_write_lock(struct grant_table *gt)
++static always_inline void grant_write_lock(struct grant_table *gt)
+ {
+     percpu_write_lock(grant_rwlock, &gt->lock);
+ }
+@@ -450,7 +450,7 @@ nr_active_grant_frames(struct grant_table *gt)
+     return num_act_frames_from_sha_frames(nr_grant_frames(gt));
+ }
+ 
+-static inline struct active_grant_entry *
++static always_inline struct active_grant_entry *
+ active_entry_acquire(struct grant_table *t, grant_ref_t e)
+ {
+     struct active_grant_entry *act;
+diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c
+index 901782bbb4..34ad39b9ad 100644
+--- a/xen/common/sched/core.c
++++ b/xen/common/sched/core.c
+@@ -348,23 +348,28 @@ uint64_t get_cpu_idle_time(unsigned int cpu)
+  * This avoids dead- or live-locks when this code is running on both
+  * cpus at the same time.
+  */
+-static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2,
+-                                   unsigned long *flags)
++static always_inline void sched_spin_lock_double(
++    spinlock_t *lock1, spinlock_t *lock2, unsigned long *flags)
+ {
++    /*
++     * In order to avoid extra overhead, use the locking primitives without the
++     * speculation barrier, and introduce a single barrier here.
++     */
+     if ( lock1 == lock2 )
+     {
+-        spin_lock_irqsave(lock1, *flags);
++        *flags = _spin_lock_irqsave(lock1);
+     }
+     else if ( lock1 < lock2 )
+     {
+-        spin_lock_irqsave(lock1, *flags);
+-        spin_lock(lock2);
++        *flags = _spin_lock_irqsave(lock1);
++        _spin_lock(lock2);
+     }
+     else
+     {
+-        spin_lock_irqsave(lock2, *flags);
+-        spin_lock(lock1);
++        *flags = _spin_lock_irqsave(lock2);
++        _spin_lock(lock1);
+     }
++    block_lock_speculation();
+ }
+ 
+ static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2,
+diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h
+index c516976c37..3b97f15767 100644
+--- a/xen/common/sched/private.h
++++ b/xen/common/sched/private.h
+@@ -207,8 +207,24 @@ DECLARE_PER_CPU(cpumask_t, cpumask_scratch);
+ #define cpumask_scratch        (&this_cpu(cpumask_scratch))
+ #define cpumask_scratch_cpu(c) (&per_cpu(cpumask_scratch, c))
+ 
++/*
++ * Deal with _spin_lock_irqsave() returning the flags value instead of storing
++ * it in a passed parameter.
++ */
++#define _sched_spinlock0(lock, irq) _spin_lock##irq(lock)
++#define _sched_spinlock1(lock, irq, arg) ({ \
++    BUILD_BUG_ON(sizeof(arg) != sizeof(unsigned long)); \
++    (arg) = _spin_lock##irq(lock); \
++})
++
++#define _sched_spinlock__(nr) _sched_spinlock ## nr
++#define _sched_spinlock_(nr)  _sched_spinlock__(nr)
++#define _sched_spinlock(lock, irq, args...) \
++    _sched_spinlock_(count_args(args))(lock, irq, ## args)
++
+ #define sched_lock(kind, param, cpu, irq, arg...) \
+-static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \
++static always_inline spinlock_t \
++*kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \
+ { \
+     for ( ; ; ) \
+     { \
+@@ -220,10 +236,16 @@ static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \
+          * \
+          * It may also be the case that v->processor may change but the \
+          * lock may be the same; this will succeed in that case. \
++         * \
++         * Use the speculation unsafe locking helper, there's a speculation \
++         * barrier before returning to the caller. \
+          */ \
+-        spin_lock##irq(lock, ## arg); \
++        _sched_spinlock(lock, irq, ## arg); \
+         if ( likely(lock == get_sched_res(cpu)->schedule_lock) ) \
++        { \
++            block_lock_speculation(); \
+             return lock; \
++        } \
+         spin_unlock##irq(lock, ## arg); \
+     } \
+ }
+diff --git a/xen/common/timer.c b/xen/common/timer.c
+index 0fddfa7487..38eb5fd20d 100644
+--- a/xen/common/timer.c
++++ b/xen/common/timer.c
+@@ -239,7 +239,7 @@ static inline void deactivate_timer(struct timer *timer)
+     list_add(&timer->inactive, &per_cpu(timers, timer->cpu).inactive);
+ }
+ 
+-static inline bool_t timer_lock(struct timer *timer)
++static inline bool_t timer_lock_unsafe(struct timer *timer)
+ {
+     unsigned int cpu;
+ 
+@@ -253,7 +253,8 @@ static inline bool_t timer_lock(struct timer *timer)
+             rcu_read_unlock(&timer_cpu_read_lock);
+             return 0;
+         }
+-        spin_lock(&per_cpu(timers, cpu).lock);
++        /* Use the speculation unsafe variant, the wrapper has the barrier. */
++        _spin_lock(&per_cpu(timers, cpu).lock);
+         if ( likely(timer->cpu == cpu) )
+             break;
+         spin_unlock(&per_cpu(timers, cpu).lock);
+@@ -266,8 +267,9 @@ static inline bool_t timer_lock(struct timer *timer)
+ #define timer_lock_irqsave(t, flags) ({         \
+     bool_t __x;                                 \
+     local_irq_save(flags);                      \
+-    if ( !(__x = timer_lock(t)) )               \
++    if ( !(__x = timer_lock_unsafe(t)) )        \
+         local_irq_restore(flags);               \
++    block_lock_speculation();                   \
+     __x;                                        \
+ })
+ 
+diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
+index e99837b6e1..2a1e7ee89a 100644
+--- a/xen/drivers/passthrough/pci.c
++++ b/xen/drivers/passthrough/pci.c
+@@ -52,9 +52,10 @@ struct pci_seg {
+ 
+ static spinlock_t _pcidevs_lock = SPIN_LOCK_UNLOCKED;
+ 
+-void pcidevs_lock(void)
++/* Do not use, as it has no speculation barrier, use pcidevs_lock() instead. */
++void pcidevs_lock_unsafe(void)
+ {
+-    spin_lock_recursive(&_pcidevs_lock);
++    _spin_lock_recursive(&_pcidevs_lock);
+ }
+ 
+ void pcidevs_unlock(void)
+diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h
+index 8e509e0784..f1472ea1eb 100644
+--- a/xen/include/xen/event.h
++++ b/xen/include/xen/event.h
+@@ -114,12 +114,12 @@ void notify_via_xen_event_channel(struct domain *ld, int lport);
+ #define bucket_from_port(d, p) \
+     ((group_from_port(d, p))[((p) % EVTCHNS_PER_GROUP) / EVTCHNS_PER_BUCKET])
+ 
+-static inline void evtchn_read_lock(struct evtchn *evtchn)
++static always_inline void evtchn_read_lock(struct evtchn *evtchn)
+ {
+     read_lock(&evtchn->lock);
+ }
+ 
+-static inline bool evtchn_read_trylock(struct evtchn *evtchn)
++static always_inline bool evtchn_read_trylock(struct evtchn *evtchn)
+ {
+     return read_trylock(&evtchn->lock);
+ }
+diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
+index 251b8761a8..a71bed36be 100644
+--- a/xen/include/xen/pci.h
++++ b/xen/include/xen/pci.h
+@@ -155,8 +155,12 @@ struct pci_dev {
+  * devices, it also sync the access to the msi capability that is not
+  * interrupt handling related (the mask bit register).
+  */
+-
+-void pcidevs_lock(void);
++void pcidevs_lock_unsafe(void);
++static always_inline void pcidevs_lock(void)
++{
++    pcidevs_lock_unsafe();
++    block_lock_speculation();
++}
+ void pcidevs_unlock(void);
+ bool __must_check pcidevs_locked(void);
+ 
+-- 
+2.44.0
+
+
+From e107a8ece71ec4e1bb0092d5beea6cb16a96f7ae Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Mon, 4 Mar 2024 18:08:48 +0100
+Subject: [PATCH 69/70] x86/mm: add speculation barriers to open coded locks
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Add a speculation barrier to the clearly identified open-coded lock taking
+functions.
+
+Note that the memory sharing page_lock() replacement (_page_lock()) is left
+as-is, as the code is experimental and not security supported.
+
+This is part of XSA-453 / CVE-2024-2193
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit 42a572a38e22a97d86a4b648a22597628d5b42e4)
+---
+ xen/arch/x86/include/asm/mm.h | 4 +++-
+ xen/arch/x86/mm.c             | 6 ++++--
+ 2 files changed, 7 insertions(+), 3 deletions(-)
+
+diff --git a/xen/arch/x86/include/asm/mm.h b/xen/arch/x86/include/asm/mm.h
+index 05dfe35502..d1b1fee99b 100644
+--- a/xen/arch/x86/include/asm/mm.h
++++ b/xen/arch/x86/include/asm/mm.h
+@@ -399,7 +399,9 @@ const struct platform_bad_page *get_platform_badpages(unsigned int *array_size);
+  * The use of PGT_locked in mem_sharing does not collide, since mem_sharing is
+  * only supported for hvm guests, which do not have PV PTEs updated.
+  */
+-int page_lock(struct page_info *page);
++int page_lock_unsafe(struct page_info *page);
++#define page_lock(pg)   lock_evaluate_nospec(page_lock_unsafe(pg))
++
+ void page_unlock(struct page_info *page);
+ 
+ void put_page_type(struct page_info *page);
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index ab0acbfea6..000fd0fb55 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -2017,7 +2017,7 @@ static inline bool current_locked_page_ne_check(struct page_info *page) {
+ #define current_locked_page_ne_check(x) true
+ #endif
+ 
+-int page_lock(struct page_info *page)
++int page_lock_unsafe(struct page_info *page)
+ {
+     unsigned long x, nx;
+ 
+@@ -2078,7 +2078,7 @@ void page_unlock(struct page_info *page)
+  * l3t_lock(), so to avoid deadlock we must avoid grabbing them in
+  * reverse order.
+  */
+-static void l3t_lock(struct page_info *page)
++static always_inline void l3t_lock(struct page_info *page)
+ {
+     unsigned long x, nx;
+ 
+@@ -2087,6 +2087,8 @@ static void l3t_lock(struct page_info *page)
+             cpu_relax();
+         nx = x | PGT_locked;
+     } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
++
++    block_lock_speculation();
+ }
+ 
+ static void l3t_unlock(struct page_info *page)
+-- 
+2.44.0
+
+
+From 4da8ca9cb9cfdb92c9dd09d5270ae16a3b2dbc89 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= <roger.pau@citrix.com>
+Date: Mon, 4 Mar 2024 16:24:21 +0100
+Subject: [PATCH 70/70] x86: protect conditional lock taking from speculative
+ execution
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Conditionally taken locks that use the pattern:
+
+if ( lock )
+    spin_lock(...);
+
+need an else branch in order to issue a speculation barrier in the else case,
+just like it's done in case the lock needs to be acquired.
+
+eval_nospec() could be used on the condition itself, but that would result in a
+double barrier on the branch where the lock is taken.
+
+Introduce a new pair of helpers, {gfn,spin}_lock_if() that can be used to
+conditionally take a lock in a speculation safe way.
+
+This is part of XSA-453 / CVE-2024-2193
+
+Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
+Reviewed-by: Jan Beulich <jbeulich@suse.com>
+(cherry picked from commit 03cf7ca23e0e876075954c558485b267b7d02406)
+---
+ xen/arch/x86/mm.c          | 35 +++++++++++++----------------------
+ xen/arch/x86/mm/mm-locks.h |  9 +++++++++
+ xen/arch/x86/mm/p2m.c      |  5 ++---
+ xen/include/xen/spinlock.h |  8 ++++++++
+ 4 files changed, 32 insertions(+), 25 deletions(-)
+
+diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
+index 000fd0fb55..45bfbc2522 100644
+--- a/xen/arch/x86/mm.c
++++ b/xen/arch/x86/mm.c
+@@ -5007,8 +5007,7 @@ static l3_pgentry_t *virt_to_xen_l3e(unsigned long v)
+         if ( !l3t )
+             return NULL;
+         UNMAP_DOMAIN_PAGE(l3t);
+-        if ( locking )
+-            spin_lock(&map_pgdir_lock);
++        spin_lock_if(locking, &map_pgdir_lock);
+         if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
+         {
+             l4_pgentry_t l4e = l4e_from_mfn(l3mfn, __PAGE_HYPERVISOR);
+@@ -5045,8 +5044,7 @@ static l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
+             return NULL;
+         }
+         UNMAP_DOMAIN_PAGE(l2t);
+-        if ( locking )
+-            spin_lock(&map_pgdir_lock);
++        spin_lock_if(locking, &map_pgdir_lock);
+         if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
+         {
+             l3e_write(pl3e, l3e_from_mfn(l2mfn, __PAGE_HYPERVISOR));
+@@ -5084,8 +5082,7 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v)
+             return NULL;
+         }
+         UNMAP_DOMAIN_PAGE(l1t);
+-        if ( locking )
+-            spin_lock(&map_pgdir_lock);
++        spin_lock_if(locking, &map_pgdir_lock);
+         if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
+         {
+             l2e_write(pl2e, l2e_from_mfn(l1mfn, __PAGE_HYPERVISOR));
+@@ -5116,6 +5113,8 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v)
+     do {                      \
+         if ( locking )        \
+             l3t_lock(page);   \
++        else                            \
++            block_lock_speculation();   \
+     } while ( false )
+ 
+ #define L3T_UNLOCK(page)                           \
+@@ -5331,8 +5330,7 @@ int map_pages_to_xen(
+             if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
+                 flush_flags |= FLUSH_TLB_GLOBAL;
+ 
+-            if ( locking )
+-                spin_lock(&map_pgdir_lock);
++            spin_lock_if(locking, &map_pgdir_lock);
+             if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) &&
+                  (l3e_get_flags(*pl3e) & _PAGE_PSE) )
+             {
+@@ -5436,8 +5434,7 @@ int map_pages_to_xen(
+                 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
+                     flush_flags |= FLUSH_TLB_GLOBAL;
+ 
+-                if ( locking )
+-                    spin_lock(&map_pgdir_lock);
++                spin_lock_if(locking, &map_pgdir_lock);
+                 if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) &&
+                      (l2e_get_flags(*pl2e) & _PAGE_PSE) )
+                 {
+@@ -5478,8 +5475,7 @@ int map_pages_to_xen(
+                 unsigned long base_mfn;
+                 const l1_pgentry_t *l1t;
+ 
+-                if ( locking )
+-                    spin_lock(&map_pgdir_lock);
++                spin_lock_if(locking, &map_pgdir_lock);
+ 
+                 ol2e = *pl2e;
+                 /*
+@@ -5533,8 +5529,7 @@ int map_pages_to_xen(
+             unsigned long base_mfn;
+             const l2_pgentry_t *l2t;
+ 
+-            if ( locking )
+-                spin_lock(&map_pgdir_lock);
++            spin_lock_if(locking, &map_pgdir_lock);
+ 
+             ol3e = *pl3e;
+             /*
+@@ -5678,8 +5673,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+                                        l3e_get_flags(*pl3e)));
+             UNMAP_DOMAIN_PAGE(l2t);
+ 
+-            if ( locking )
+-                spin_lock(&map_pgdir_lock);
++            spin_lock_if(locking, &map_pgdir_lock);
+             if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) &&
+                  (l3e_get_flags(*pl3e) & _PAGE_PSE) )
+             {
+@@ -5738,8 +5732,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+                                            l2e_get_flags(*pl2e) & ~_PAGE_PSE));
+                 UNMAP_DOMAIN_PAGE(l1t);
+ 
+-                if ( locking )
+-                    spin_lock(&map_pgdir_lock);
++                spin_lock_if(locking, &map_pgdir_lock);
+                 if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) &&
+                      (l2e_get_flags(*pl2e) & _PAGE_PSE) )
+                 {
+@@ -5783,8 +5776,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+              */
+             if ( (nf & _PAGE_PRESENT) || ((v != e) && (l1_table_offset(v) != 0)) )
+                 continue;
+-            if ( locking )
+-                spin_lock(&map_pgdir_lock);
++            spin_lock_if(locking, &map_pgdir_lock);
+ 
+             /*
+              * L2E may be already cleared, or set to a superpage, by
+@@ -5831,8 +5823,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
+         if ( (nf & _PAGE_PRESENT) ||
+              ((v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0)) )
+             continue;
+-        if ( locking )
+-            spin_lock(&map_pgdir_lock);
++        spin_lock_if(locking, &map_pgdir_lock);
+ 
+         /*
+          * L3E may be already cleared, or set to a superpage, by
+diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h
+index 5ec080c02f..b4960fb90e 100644
+--- a/xen/arch/x86/mm/mm-locks.h
++++ b/xen/arch/x86/mm/mm-locks.h
+@@ -335,6 +335,15 @@ static inline void p2m_unlock(struct p2m_domain *p)
+ #define p2m_locked_by_me(p)   mm_write_locked_by_me(&(p)->lock)
+ #define gfn_locked_by_me(p,g) p2m_locked_by_me(p)
+ 
++static always_inline void gfn_lock_if(bool condition, struct p2m_domain *p2m,
++                                      gfn_t gfn, unsigned int order)
++{
++    if ( condition )
++        gfn_lock(p2m, gfn, order);
++    else
++        block_lock_speculation();
++}
++
+ /* PoD lock (per-p2m-table)
+  *
+  * Protects private PoD data structs: entry and cache
+diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
+index 0983bd71d9..22ab1d606e 100644
+--- a/xen/arch/x86/mm/p2m.c
++++ b/xen/arch/x86/mm/p2m.c
+@@ -280,9 +280,8 @@ mfn_t p2m_get_gfn_type_access(struct p2m_domain *p2m, gfn_t gfn,
+     if ( q & P2M_UNSHARE )
+         q |= P2M_ALLOC;
+ 
+-    if ( locked )
+-        /* Grab the lock here, don't release until put_gfn */
+-        gfn_lock(p2m, gfn, 0);
++    /* Grab the lock here, don't release until put_gfn */
++    gfn_lock_if(locked, p2m, gfn, 0);
+ 
+     mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL);
+ 
+diff --git a/xen/include/xen/spinlock.h b/xen/include/xen/spinlock.h
+index 28fce5615e..c830df3430 100644
+--- a/xen/include/xen/spinlock.h
++++ b/xen/include/xen/spinlock.h
+@@ -222,6 +222,14 @@ static always_inline void spin_lock_irq(spinlock_t *l)
+         block_lock_speculation();                               \
+     })
+ 
++/* Conditionally take a spinlock in a speculation safe way. */
++static always_inline void spin_lock_if(bool condition, spinlock_t *l)
++{
++    if ( condition )
++        _spin_lock(l);
++    block_lock_speculation();
++}
++
+ #define spin_unlock(l)                _spin_unlock(l)
+ #define spin_unlock_irq(l)            _spin_unlock_irq(l)
+ #define spin_unlock_irqrestore(l, f)  _spin_unlock_irqrestore(l, f)
+-- 
+2.44.0
+
diff --git a/main/xen/xsa447.patch b/main/xen/xsa447.patch
deleted file mode 100644
index 2e26396b0ee..00000000000
--- a/main/xen/xsa447.patch
+++ /dev/null
@@ -1,117 +0,0 @@
-From 084c7312fa6c1d4a7fa343efa1d7d73693dafff4 Mon Sep 17 00:00:00 2001
-From: Michal Orzel <michal.orzel@amd.com>
-Date: Thu, 23 Nov 2023 15:53:02 +0100
-Subject: [PATCH] xen/arm: page: Avoid pointer overflow on cache clean &
- invalidate
-
-On Arm32, after cleaning and invalidating the last dcache line of the top
-domheap page i.e. VA = 0xfffff000 (as a result of flushing the page to
-RAM), we end up adding the value of a dcache line size to the pointer
-once again, which results in a pointer arithmetic overflow (with 64B line
-size, operation 0xffffffc0 + 0x40 overflows to 0x0). Such behavior is
-undefined and given the wide range of compiler versions we support, it is
-difficult to determine what could happen in such scenario.
-
-Modify clean_and_invalidate_dcache_va_range() as well as
-clean_dcache_va_range() and invalidate_dcache_va_range() due to similarity
-of handling to prevent pointer arithmetic overflow. Modify the loops to
-use an additional variable to store the index of the next cacheline.
-Add an assert to prevent passing a region that wraps around which is
-illegal and would end up in a page fault anyway (region 0-2MB is
-unmapped). Lastly, return early if size passed is 0.
-
-Note that on Arm64, we don't have this problem given that the max VA
-space we support is 48-bits.
-
-This is XSA-447 / CVE-2023-46837.
-
-Signed-off-by: Michal Orzel <michal.orzel@amd.com>
-Reviewed-by: Julien Grall <jgrall@amazon.com>
----
- xen/arch/arm/include/asm/page.h | 35 ++++++++++++++++++++++++++-------
- 1 file changed, 28 insertions(+), 7 deletions(-)
-
-diff --git a/xen/arch/arm/include/asm/page.h b/xen/arch/arm/include/asm/page.h
-index ebaf5964f114..69f817d1e68a 100644
---- a/xen/arch/arm/include/asm/page.h
-+++ b/xen/arch/arm/include/asm/page.h
-@@ -162,6 +162,13 @@ static inline size_t read_dcache_line_bytes(void)
- static inline int invalidate_dcache_va_range(const void *p, unsigned long size)
- {
-     size_t cacheline_mask = dcache_line_bytes - 1;
-+    unsigned long idx = 0;
-+
-+    if ( !size )
-+        return 0;
-+
-+    /* Passing a region that wraps around is illegal */
-+    ASSERT(((uintptr_t)p + size - 1) >= (uintptr_t)p);
- 
-     dsb(sy);           /* So the CPU issues all writes to the range */
- 
-@@ -174,11 +181,11 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size)
-     }
- 
-     for ( ; size >= dcache_line_bytes;
--            p += dcache_line_bytes, size -= dcache_line_bytes )
--        asm volatile (__invalidate_dcache_one(0) : : "r" (p));
-+            idx += dcache_line_bytes, size -= dcache_line_bytes )
-+        asm volatile (__invalidate_dcache_one(0) : : "r" (p + idx));
- 
-     if ( size > 0 )
--        asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p));
-+        asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p + idx));
- 
-     dsb(sy);           /* So we know the flushes happen before continuing */
- 
-@@ -188,14 +195,21 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size)
- static inline int clean_dcache_va_range(const void *p, unsigned long size)
- {
-     size_t cacheline_mask = dcache_line_bytes - 1;
-+    unsigned long idx = 0;
-+
-+    if ( !size )
-+        return 0;
-+
-+    /* Passing a region that wraps around is illegal */
-+    ASSERT(((uintptr_t)p + size - 1) >= (uintptr_t)p);
- 
-     dsb(sy);           /* So the CPU issues all writes to the range */
-     size += (uintptr_t)p & cacheline_mask;
-     size = (size + cacheline_mask) & ~cacheline_mask;
-     p = (void *)((uintptr_t)p & ~cacheline_mask);
-     for ( ; size >= dcache_line_bytes;
--            p += dcache_line_bytes, size -= dcache_line_bytes )
--        asm volatile (__clean_dcache_one(0) : : "r" (p));
-+            idx += dcache_line_bytes, size -= dcache_line_bytes )
-+        asm volatile (__clean_dcache_one(0) : : "r" (p + idx));
-     dsb(sy);           /* So we know the flushes happen before continuing */
-     /* ARM callers assume that dcache_* functions cannot fail. */
-     return 0;
-@@ -205,14 +219,21 @@ static inline int clean_and_invalidate_dcache_va_range
-     (const void *p, unsigned long size)
- {
-     size_t cacheline_mask = dcache_line_bytes - 1;
-+    unsigned long idx = 0;
-+
-+    if ( !size )
-+        return 0;
-+
-+    /* Passing a region that wraps around is illegal */
-+    ASSERT(((uintptr_t)p + size - 1) >= (uintptr_t)p);
- 
-     dsb(sy);         /* So the CPU issues all writes to the range */
-     size += (uintptr_t)p & cacheline_mask;
-     size = (size + cacheline_mask) & ~cacheline_mask;
-     p = (void *)((uintptr_t)p & ~cacheline_mask);
-     for ( ; size >= dcache_line_bytes;
--            p += dcache_line_bytes, size -= dcache_line_bytes )
--        asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p));
-+            idx += dcache_line_bytes, size -= dcache_line_bytes )
-+        asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p + idx));
-     dsb(sy);         /* So we know the flushes happen before continuing */
-     /* ARM callers assume that dcache_* functions cannot fail. */
-     return 0;
--- 
-2.40.1
-
diff --git a/main/xen/xsa449.patch b/main/xen/xsa449.patch
deleted file mode 100644
index 80aeac29161..00000000000
--- a/main/xen/xsa449.patch
+++ /dev/null
@@ -1,89 +0,0 @@
-From d8b92b21b224126860978e4c604302f3c1e3bf75 Mon Sep 17 00:00:00 2001
-From: Roger Pau Monne <roger.pau@citrix.com>
-Date: Wed, 13 Dec 2023 15:51:59 +0100
-Subject: [PATCH] pci: fail device assignment if phantom functions cannot be
- assigned
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-The current behavior is that no error is reported if (some) phantom functions
-fail to be assigned during device add or assignment, so the operation succeeds
-even if some phantom functions are not correctly setup.
-
-This can lead to devices possibly being successfully assigned to a domU while
-some of the device phantom functions are still assigned to dom0.  Even when the
-device is assigned domIO before being assigned to a domU phantom functions
-might fail to be assigned to domIO, and also fail to be assigned to the domU,
-leaving them assigned to dom0.
-
-Since the device can generate requests using the IDs of those phantom
-functions, given the scenario above a device in such state would be in control
-of a domU, but still capable of generating transactions that use a context ID
-targeting dom0 owned memory.
-
-Modify device assign in order to attempt to deassign the device if phantom
-functions failed to be assigned.
-
-Note that device addition is not modified in the same way, as in that case the
-device is assigned to a trusted domain, and hence partial assign can lead to
-device malfunction but not a security issue.
-
-This is XSA-449 / CVE-2023-46839
-
-Fixes: 4e9950dc1bd2 ('IOMMU: add phantom function support')
-Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
----
- xen/drivers/passthrough/pci.c | 27 +++++++++++++++++++++------
- 1 file changed, 21 insertions(+), 6 deletions(-)
-
-diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
-index 1439d1ef2b26..47c0eee7bdcc 100644
---- a/xen/drivers/passthrough/pci.c
-+++ b/xen/drivers/passthrough/pci.c
-@@ -1488,11 +1488,10 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
- 
-     pdev->fault.count = 0;
- 
--    if ( (rc = iommu_call(hd->platform_ops, assign_device, d, devfn,
--                          pci_to_dev(pdev), flag)) )
--        goto done;
-+    rc = iommu_call(hd->platform_ops, assign_device, d, devfn, pci_to_dev(pdev),
-+                    flag);
- 
--    for ( ; pdev->phantom_stride; rc = 0 )
-+    while ( pdev->phantom_stride && !rc )
-     {
-         devfn += pdev->phantom_stride;
-         if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
-@@ -1503,8 +1502,24 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
- 
-  done:
-     if ( rc )
--        printk(XENLOG_G_WARNING "%pd: assign (%pp) failed (%d)\n",
--               d, &PCI_SBDF(seg, bus, devfn), rc);
-+    {
-+        printk(XENLOG_G_WARNING "%pd: assign %s(%pp) failed (%d)\n",
-+               d, devfn != pdev->devfn ? "phantom function " : "",
-+               &PCI_SBDF(seg, bus, devfn), rc);
-+
-+        if ( devfn != pdev->devfn && deassign_device(d, seg, bus, pdev->devfn) )
-+        {
-+            /*
-+             * Device with phantom functions that failed to both assign and
-+             * rollback.  Mark the device as broken and crash the target domain,
-+             * as the state of the functions at this point is unknown and Xen
-+             * has no way to assert consistent context assignment among them.
-+             */
-+            pdev->broken = true;
-+            if ( !is_hardware_domain(d) && d != dom_io )
-+                domain_crash(d);
-+        }
-+    }
-     /* The device is assigned to dom_io so mark it as quarantined */
-     else if ( d == dom_io )
-         pdev->quarantine = true;
--- 
-2.43.0
-
diff --git a/main/xen/xsa450.patch b/main/xen/xsa450.patch
deleted file mode 100644
index e94933be0b8..00000000000
--- a/main/xen/xsa450.patch
+++ /dev/null
@@ -1,59 +0,0 @@
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Subject: VT-d: Fix "else" vs "#endif" misplacement
-
-In domain_pgd_maddr() the "#endif" is misplaced with respect to "else".  This
-generates incorrect logic when CONFIG_HVM is compiled out, as the "else" body
-is executed unconditionally.
-
-Rework the logic to use IS_ENABLED() instead of explicit #ifdef-ary, as it's
-clearer to follow.  This in turn involves adjusting p2m_get_pagetable() to
-compile when CONFIG_HVM is disabled.
-
-This is XSA-450 / CVE-2023-46840.
-
-Reported-by: Reported-by: Teddy Astie <teddy.astie@vates.tech>
-Fixes: 033ff90aa9c1 ("x86/P2M: p2m_{alloc,free}_ptp() and p2m_alloc_table() are HVM-only")
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-
-diff --git a/xen/arch/x86/include/asm/p2m.h b/xen/arch/x86/include/asm/p2m.h
-index 32f3f394b05a..6ada585eaac2 100644
---- a/xen/arch/x86/include/asm/p2m.h
-+++ b/xen/arch/x86/include/asm/p2m.h
-@@ -435,7 +435,14 @@ static inline bool p2m_is_altp2m(const struct p2m_domain *p2m)
-     return p2m->p2m_class == p2m_alternate;
- }
- 
--#define p2m_get_pagetable(p2m)  ((p2m)->phys_table)
-+#ifdef CONFIG_HVM
-+static inline pagetable_t p2m_get_pagetable(const struct p2m_domain *p2m)
-+{
-+    return p2m->phys_table;
-+}
-+#else
-+pagetable_t p2m_get_pagetable(const struct p2m_domain *p2m);
-+#endif
- 
- /*
-  * Ensure any deferred p2m TLB flush has been completed on all VCPUs.
-diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
-index 99b642f12ef9..4244855032ee 100644
---- a/xen/drivers/passthrough/vtd/iommu.c
-+++ b/xen/drivers/passthrough/vtd/iommu.c
-@@ -438,15 +438,13 @@ static paddr_t domain_pgd_maddr(struct domain *d, paddr_t pgd_maddr,
- 
-     if ( pgd_maddr )
-         /* nothing */;
--#ifdef CONFIG_HVM
--    else if ( iommu_use_hap_pt(d) )
-+    else if ( IS_ENABLED(CONFIG_HVM) && iommu_use_hap_pt(d) )
-     {
-         pagetable_t pgt = p2m_get_pagetable(p2m_get_hostp2m(d));
- 
-         pgd_maddr = pagetable_get_paddr(pgt);
-     }
-     else
--#endif
-     {
-         if ( !hd->arch.vtd.pgd_maddr )
-         {
diff --git a/main/xen/xsa451-4.18.patch b/main/xen/xsa451-4.18.patch
deleted file mode 100644
index 721f3f34df6..00000000000
--- a/main/xen/xsa451-4.18.patch
+++ /dev/null
@@ -1,188 +0,0 @@
-From: Jan Beulich <jbeulich@suse.com>
-Subject: x86: account for shadow stack in exception-from-stub recovery
-
-Dealing with exceptions raised from within emulation stubs involves
-discarding return address (replaced by exception related information).
-Such discarding of course also requires removing the corresponding entry
-from the shadow stack.
-
-Also amend the comment in fixup_exception_return(), to further clarify
-why use of ptr[1] can't be an out-of-bounds access.
-
-While touching do_invalid_op() also add a missing fall-through
-annotation.
-
-This is CVE-2023-46841 / XSA-451.
-
-Fixes: 209fb9919b50 ("x86/extable: Adjust extable handling to be shadow stack compatible")
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
-
---- a/xen/arch/x86/extable.c
-+++ b/xen/arch/x86/extable.c
-@@ -86,26 +86,29 @@ search_one_extable(const struct exceptio
- }
- 
- unsigned long
--search_exception_table(const struct cpu_user_regs *regs)
-+search_exception_table(const struct cpu_user_regs *regs, unsigned long *stub_ra)
- {
-     const struct virtual_region *region = find_text_region(regs->rip);
-     unsigned long stub = this_cpu(stubs.addr);
- 
-     if ( region && region->ex )
-+    {
-+        *stub_ra = 0;
-         return search_one_extable(region->ex, region->ex_end, regs->rip);
-+    }
- 
-     if ( regs->rip >= stub + STUB_BUF_SIZE / 2 &&
-          regs->rip < stub + STUB_BUF_SIZE &&
-          regs->rsp > (unsigned long)regs &&
-          regs->rsp < (unsigned long)get_cpu_info() )
-     {
--        unsigned long retptr = *(unsigned long *)regs->rsp;
-+        unsigned long retaddr = *(unsigned long *)regs->rsp, fixup;
- 
--        region = find_text_region(retptr);
--        retptr = region && region->ex
--                 ? search_one_extable(region->ex, region->ex_end, retptr)
--                 : 0;
--        if ( retptr )
-+        region = find_text_region(retaddr);
-+        fixup = region && region->ex
-+                ? search_one_extable(region->ex, region->ex_end, retaddr)
-+                : 0;
-+        if ( fixup )
-         {
-             /*
-              * Put trap number and error code on the stack (in place of the
-@@ -117,7 +120,8 @@ search_exception_table(const struct cpu_
-             };
- 
-             *(unsigned long *)regs->rsp = token.raw;
--            return retptr;
-+            *stub_ra = retaddr;
-+            return fixup;
-         }
-     }
- 
---- a/xen/arch/x86/include/asm/uaccess.h
-+++ b/xen/arch/x86/include/asm/uaccess.h
-@@ -421,7 +421,8 @@ union stub_exception_token {
-     unsigned long raw;
- };
- 
--extern unsigned long search_exception_table(const struct cpu_user_regs *regs);
-+extern unsigned long search_exception_table(const struct cpu_user_regs *regs,
-+                                            unsigned long *stub_ra);
- extern void sort_exception_tables(void);
- extern void sort_exception_table(struct exception_table_entry *start,
-                                  const struct exception_table_entry *stop);
---- a/xen/arch/x86/traps.c
-+++ b/xen/arch/x86/traps.c
-@@ -845,7 +845,7 @@ void do_unhandled_trap(struct cpu_user_r
- }
- 
- static void fixup_exception_return(struct cpu_user_regs *regs,
--                                   unsigned long fixup)
-+                                   unsigned long fixup, unsigned long stub_ra)
- {
-     if ( IS_ENABLED(CONFIG_XEN_SHSTK) )
-     {
-@@ -862,7 +862,8 @@ static void fixup_exception_return(struc
-             /*
-              * Search for %rip.  The shstk currently looks like this:
-              *
--             *   ...  [Likely pointed to by SSP]
-+             *   tok  [Supervisor token, == &tok | BUSY, only with FRED inactive]
-+             *   ...  [Pointed to by SSP for most exceptions, empty in IST cases]
-              *   %cs  [== regs->cs]
-              *   %rip [== regs->rip]
-              *   SSP  [Likely points to 3 slots higher, above %cs]
-@@ -880,7 +881,56 @@ static void fixup_exception_return(struc
-              */
-             if ( ptr[0] == regs->rip && ptr[1] == regs->cs )
-             {
-+                unsigned long primary_shstk =
-+                    (ssp & ~(STACK_SIZE - 1)) +
-+                    (PRIMARY_SHSTK_SLOT + 1) * PAGE_SIZE - 8;
-+
-                 wrss(fixup, ptr);
-+
-+                if ( !stub_ra )
-+                    goto shstk_done;
-+
-+                /*
-+                 * Stub recovery ought to happen only when the outer context
-+                 * was on the main shadow stack.  We need to also "pop" the
-+                 * stub's return address from the interrupted context's shadow
-+                 * stack.  That is,
-+                 * - if we're still on the main stack, we need to move the
-+                 *   entire stack (up to and including the exception frame)
-+                 *   up by one slot, incrementing the original SSP in the
-+                 *   exception frame,
-+                 * - if we're on an IST stack, we need to increment the
-+                 *   original SSP.
-+                 */
-+                BUG_ON((ptr[-1] ^ primary_shstk) >> PAGE_SHIFT);
-+
-+                if ( (ssp ^ primary_shstk) >> PAGE_SHIFT )
-+                {
-+                    /*
-+                     * We're on an IST stack.  First make sure the two return
-+                     * addresses actually match.  Then increment the interrupted
-+                     * context's SSP.
-+                     */
-+                    BUG_ON(stub_ra != *(unsigned long*)ptr[-1]);
-+                    wrss(ptr[-1] + 8, &ptr[-1]);
-+                    goto shstk_done;
-+                }
-+
-+                /* Make sure the two return addresses actually match. */
-+                BUG_ON(stub_ra != ptr[2]);
-+
-+                /* Move exception frame, updating SSP there. */
-+                wrss(ptr[1], &ptr[2]); /* %cs */
-+                wrss(ptr[0], &ptr[1]); /* %rip */
-+                wrss(ptr[-1] + 8, &ptr[0]); /* SSP */
-+
-+                /* Move all newer entries. */
-+                while ( --ptr != _p(ssp) )
-+                    wrss(ptr[-1], &ptr[0]);
-+
-+                /* Finally account for our own stack having shifted up. */
-+                asm volatile ( "incsspd %0" :: "r" (2) );
-+
-                 goto shstk_done;
-             }
-         }
-@@ -901,7 +951,8 @@ static void fixup_exception_return(struc
- 
- static bool extable_fixup(struct cpu_user_regs *regs, bool print)
- {
--    unsigned long fixup = search_exception_table(regs);
-+    unsigned long stub_ra = 0;
-+    unsigned long fixup = search_exception_table(regs, &stub_ra);
- 
-     if ( unlikely(fixup == 0) )
-         return false;
-@@ -915,7 +966,7 @@ static bool extable_fixup(struct cpu_use
-                vector_name(regs->entry_vector), regs->error_code,
-                _p(regs->rip), _p(regs->rip), _p(fixup));
- 
--    fixup_exception_return(regs, fixup);
-+    fixup_exception_return(regs, fixup, stub_ra);
-     this_cpu(last_extable_addr) = regs->rip;
- 
-     return true;
-@@ -1183,7 +1234,8 @@ void do_invalid_op(struct cpu_user_regs
-     {
-     case BUGFRAME_run_fn:
-     case BUGFRAME_warn:
--        fixup_exception_return(regs, (unsigned long)eip);
-+        fixup_exception_return(regs, (unsigned long)eip, 0);
-+        fallthrough;
-     case BUGFRAME_bug:
-     case BUGFRAME_assert:
-         return;