diff --git a/main/xen/APKBUILD b/main/xen/APKBUILD index 643d1bcc6d1..8845418d5cb 100644 --- a/main/xen/APKBUILD +++ b/main/xen/APKBUILD @@ -2,7 +2,7 @@ # Maintainer: Natanael Copa pkgname=xen pkgver=4.18.0 -pkgrel=4 +pkgrel=5 pkgdesc="Xen hypervisor" url="https://www.xenproject.org/" arch="x86_64 armv7 aarch64" @@ -364,6 +364,9 @@ options="!strip" # - CVE-2023-46839 XSA-449 # 4.18.0-r4: # - CVE-2023-46841 XSA-451 +# 4.18.0-r5: +# - CVE-2023-28746 XSA-452 +# - CVE-2024-2193 XSA-453 case "$CARCH" in x86*) @@ -409,10 +412,7 @@ source="https://downloads.xenproject.org/release/xen/$pkgver/xen-$pkgver.tar.gz https://xenbits.xen.org/xen-extfiles/zlib-$_ZLIB_VERSION.tar.gz https://xenbits.xen.org/xen-extfiles/ipxe-git-$_IPXE_GIT_TAG.tar.gz - xsa447.patch - xsa449.patch - xsa450.patch - xsa451-4.18.patch + xen-stable-4.18-20240312.patch mini-os-__divmoddi4.patch qemu-xen_paths.patch @@ -701,10 +701,7 @@ qemu_openrc() { sha512sums=" 4cc9fd155144045a173c5f8ecc45f149817f1034eec618cb6f8b0494ef2fb5b95c4c60cf0bf4bec4bef8a622c35b6a3cb7dedc38e6d95e726f1611c73ddb3273 xen-4.18.0.tar.gz -459e490b33b95202167862a84eadb656a418b252ffa786db05640f025886bf1e2a5c59387d4b99ced552ae316eb64b6f9888a850bf6860a115e7f3eabed52d20 xsa447.patch -ea185b6f7ca375b49351a4006f22e449312e0a8180c93db2bb1aca43658de5abc8d1a21c1b6eedf320dd51a5e1475ace1652eddaacee28d36cc83d5beb05a918 xsa449.patch -901359c8fd08adc49961e1296e45fa98da6e090a82f8888fef6cccebf5b443e80cd905dff51e336e43c22bfac118481d65f8e4a9aa56ddd5c8e1775c6083e08d xsa450.patch -394fe51160f5ce79086d0f250c99daa3ecde1012ebdb5c6301f0033e79809e8b2061de7988f1a713c9674ac9b73d88df8be89e8cc668efb64c5b53039c574eef xsa451-4.18.patch +8df958195290a39b54493766e7555d71c68083d75edd13a2f77ad237d6b6fb52bce816b9e975c0c14024a01042e599415360dcf475f7d2e0c6bee8f9fd2ed6ef xen-stable-4.18-20240312.patch 2e0b0fd23e6f10742a5517981e5171c6e88b0a93c83da701b296f5c0861d72c19782daab589a7eac3f9032152a0fc7eff7f5362db8fccc4859564a9aa82329cf gmp-4.3.2.tar.bz2 
c2bc9ffc8583aeae71cee9ddcc4418969768d4e3764d47307da54f93981c0109fb07d84b061b3a3628bd00ba4d14a54742bc04848110eb3ae8ca25dbfbaabadb grub-0.97.tar.gz 1465b58279af1647f909450e394fe002ca165f0ff4a0254bfa9fe0e64316f50facdde2729d79a4e632565b4500cf4d6c74192ac0dd3bc9fe09129bbd67ba089d lwip-1.3.0.tar.gz diff --git a/main/xen/xen-stable-4.18-20240312.patch b/main/xen/xen-stable-4.18-20240312.patch new file mode 100644 index 00000000000..78d40c44459 --- /dev/null +++ b/main/xen/xen-stable-4.18-20240312.patch @@ -0,0 +1,8490 @@ +From 52be29df793f282822436c8c13e0948a01aee1ad Mon Sep 17 00:00:00 2001 +From: Tamas K Lengyel +Date: Thu, 23 Nov 2023 12:10:46 +0100 +Subject: [PATCH 01/70] x86/mem_sharing: add missing m2p entry when mapping + shared_info page + +When mapping in the shared_info page to a fork the m2p entry wasn't set +resulting in the shared_info being reset even when the fork reset was called +with only reset_state and not reset_memory. This results in an extra +unnecessary TLB flush. + +Fixes: 1a0000ac775 ("mem_sharing: map shared_info page to same gfn during fork") +Signed-off-by: Tamas K Lengyel +Acked-by: Andrew Cooper +master commit: 23eb39acf011ef9bbe02ed4619c55f208fbcd39b +master date: 2023-10-31 16:10:14 +0000 +--- + xen/arch/x86/mm/mem_sharing.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c +index 94b6b782ef..142258f16a 100644 +--- a/xen/arch/x86/mm/mem_sharing.c ++++ b/xen/arch/x86/mm/mem_sharing.c +@@ -1847,6 +1847,8 @@ static int copy_special_pages(struct domain *cd, struct domain *d) + p2m_ram_rw, p2m->default_access, -1); + if ( rc ) + return rc; ++ ++ set_gpfn_from_mfn(mfn_x(new_mfn), gfn_x(old_gfn)); + } + } + +-- +2.44.0 + + +From 880e06fdea401493a3f408deb0f411f7aeccee27 Mon Sep 17 00:00:00 2001 +From: David Woodhouse +Date: Thu, 23 Nov 2023 12:11:21 +0100 +Subject: [PATCH 02/70] x86/pv-shim: fix grant table operations for 32-bit + guests + +When switching to call the shim 
functions from the normal handlers, the +compat_grant_table_op() function was omitted, leaving it calling the +real grant table operations in !PV_SHIM_EXCLUSIVE builds. This leaves a +32-bit shim guest failing to set up its real grant table with the parent +hypervisor. + +Fixes: e7db635f4428 ("x86/pv-shim: Don't modify the hypercall table") +Signed-off-by: David Woodhouse +Reviewed-by: Andrew Cooper +master commit: 93ec30bc545f15760039c23ee4b97b80c0b3b3b3 +master date: 2023-10-31 16:10:14 +0000 +--- + xen/common/compat/grant_table.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/xen/common/compat/grant_table.c b/xen/common/compat/grant_table.c +index e00bc24a34..af98eade17 100644 +--- a/xen/common/compat/grant_table.c ++++ b/xen/common/compat/grant_table.c +@@ -63,6 +63,11 @@ int compat_grant_table_op( + unsigned int i, cmd_op; + XEN_GUEST_HANDLE_PARAM(void) cnt_uop; + ++#ifdef CONFIG_PV_SHIM ++ if ( unlikely(pv_shim) ) ++ return pv_shim_grant_table_op(cmd, uop, count); ++#endif ++ + set_xen_guest_handle(cnt_uop, NULL); + cmd_op = cmd & GNTTABOP_CMD_MASK; + if ( cmd_op != GNTTABOP_cache_flush ) +-- +2.44.0 + + +From 9e8edd4c75564530a6fb98f5abba267edb906313 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Thu, 23 Nov 2023 12:12:18 +0100 +Subject: [PATCH 03/70] x86/x2apic: remove usage of ACPI_FADT_APIC_CLUSTER +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The ACPI FADT APIC_CLUSTER flag mandates that when the interrupt delivery is +Logical mode APIC must be configured for Cluster destination model. However in +apic_x2apic_probe() such flag is incorrectly used to gate whether Physical mode +can be used. + +Since Xen when in x2APIC mode only uses Logical mode together with Cluster +model completely remove checking for ACPI_FADT_APIC_CLUSTER, as Xen always +fulfills the requirement signaled by the flag. 
+ +Fixes: eb40ae41b658 ('x86/Kconfig: add option for default x2APIC destination mode') +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +master commit: 26a449ce32cef33f2cb50602be19fcc0c4223ba9 +master date: 2023-11-02 10:50:26 +0100 +--- + xen/arch/x86/genapic/x2apic.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/xen/arch/x86/genapic/x2apic.c b/xen/arch/x86/genapic/x2apic.c +index ca1db27157..707deef98c 100644 +--- a/xen/arch/x86/genapic/x2apic.c ++++ b/xen/arch/x86/genapic/x2apic.c +@@ -231,8 +231,7 @@ const struct genapic *__init apic_x2apic_probe(void) + */ + x2apic_phys = iommu_intremap != iommu_intremap_full || + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) || +- (IS_ENABLED(CONFIG_X2APIC_PHYSICAL) && +- !(acpi_gbl_FADT.flags & ACPI_FADT_APIC_CLUSTER)); ++ IS_ENABLED(CONFIG_X2APIC_PHYSICAL); + } + else if ( !x2apic_phys ) + switch ( iommu_intremap ) +-- +2.44.0 + + +From fcb1016bbd476e17c72b1837ae2a3eaac517fa52 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Thu, 23 Nov 2023 12:12:47 +0100 +Subject: [PATCH 04/70] x86/i8259: do not assume interrupts always target CPU0 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Sporadically we have seen the following during AP bringup on AMD platforms +only: + +microcode: CPU59 updated from revision 0x830107a to 0x830107a, date = 2023-05-17 +microcode: CPU60 updated from revision 0x830104d to 0x830107a, date = 2023-05-17 +CPU60: No irq handler for vector 27 (IRQ -2147483648) +microcode: CPU61 updated from revision 0x830107a to 0x830107a, date = 2023-05-17 + +This is similar to the issue raised on Linux commit 36e9e1eab777e, where they +observed i8259 (active) vectors getting delivered to CPUs different than 0. 
+ +On AMD or Hygon platforms adjust the target CPU mask of i8259 interrupt +descriptors to contain all possible CPUs, so that APs will reserve the vector +at startup if any legacy IRQ is still delivered through the i8259. Note that +if the IO-APIC takes over those interrupt descriptors the CPU mask will be +reset. + +Spurious i8259 interrupt vectors however (IRQ7 and IRQ15) can be injected even +when all i8259 pins are masked, and hence would need to be handled on all CPUs. + +Continue to reserve PIC vectors on CPU0 only, but do check for such spurious +interrupts on all CPUs if the vendor is AMD or Hygon. Note that once the +vectors get used by devices detecting PIC spurious interrupts will no longer be +possible, however the device driver should be able to cope with spurious +interrupts. Such PIC spurious interrupts occurring when the vector is in use +by a local APIC routed source will lead to an extra EOI, which might +unintentionally clear a different vector from ISR. Note this is already the +current behavior, so assume it's infrequent enough to not cause real issues. 
+ +Finally, adjust the printed message to display the CPU where the spurious +interrupt has been received, so it looks like: + +microcode: CPU1 updated from revision 0x830107a to 0x830107a, date = 2023-05-17 +cpu1: spurious 8259A interrupt: IRQ7 +microcode: CPU2 updated from revision 0x830104d to 0x830107a, date = 2023-05-17 + +Amends: 3fba06ba9f8b ('x86/IRQ: re-use legacy vector ranges on APs') +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +master commit: 87f37449d586b4d407b75235bb0a171e018e25ec +master date: 2023-11-02 10:50:59 +0100 +--- + xen/arch/x86/i8259.c | 21 +++++++++++++++++++-- + xen/arch/x86/irq.c | 11 ++++++++++- + 2 files changed, 29 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/i8259.c b/xen/arch/x86/i8259.c +index ed9f55abe5..e0fa1f96b4 100644 +--- a/xen/arch/x86/i8259.c ++++ b/xen/arch/x86/i8259.c +@@ -222,7 +222,8 @@ static bool _mask_and_ack_8259A_irq(unsigned int irq) + is_real_irq = false; + /* Report spurious IRQ, once per IRQ line. */ + if (!(spurious_irq_mask & irqmask)) { +- printk("spurious 8259A interrupt: IRQ%d.\n", irq); ++ printk("cpu%u: spurious 8259A interrupt: IRQ%u\n", ++ smp_processor_id(), irq); + spurious_irq_mask |= irqmask; + } + /* +@@ -349,7 +350,23 @@ void __init init_IRQ(void) + continue; + desc->handler = &i8259A_irq_type; + per_cpu(vector_irq, cpu)[LEGACY_VECTOR(irq)] = irq; +- cpumask_copy(desc->arch.cpu_mask, cpumask_of(cpu)); ++ ++ /* ++ * The interrupt affinity logic never targets interrupts to offline ++ * CPUs, hence it's safe to use cpumask_all here. ++ * ++ * Legacy PIC interrupts are only targeted to CPU0, but depending on ++ * the platform they can be distributed to any online CPU in hardware. ++ * Note this behavior has only been observed on AMD hardware. In order ++ * to cope install all active legacy vectors on all CPUs. ++ * ++ * IO-APIC will change the destination mask if/when taking ownership of ++ * the interrupt. 
++ */ ++ cpumask_copy(desc->arch.cpu_mask, ++ (boot_cpu_data.x86_vendor & ++ (X86_VENDOR_AMD | X86_VENDOR_HYGON) ? &cpumask_all ++ : cpumask_of(cpu))); + desc->arch.vector = LEGACY_VECTOR(irq); + } + +diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c +index f42ad539dc..16d9fceba1 100644 +--- a/xen/arch/x86/irq.c ++++ b/xen/arch/x86/irq.c +@@ -1920,7 +1920,16 @@ void do_IRQ(struct cpu_user_regs *regs) + kind = ""; + if ( !(vector >= FIRST_LEGACY_VECTOR && + vector <= LAST_LEGACY_VECTOR && +- !smp_processor_id() && ++ (!smp_processor_id() || ++ /* ++ * For AMD/Hygon do spurious PIC interrupt ++ * detection on all CPUs, as it has been observed ++ * that during unknown circumstances spurious PIC ++ * interrupts have been delivered to CPUs ++ * different than the BSP. ++ */ ++ (boot_cpu_data.x86_vendor & (X86_VENDOR_AMD | ++ X86_VENDOR_HYGON))) && + bogus_8259A_irq(vector - FIRST_LEGACY_VECTOR)) ) + { + printk("CPU%u: No irq handler for vector %02x (IRQ %d%s)\n", +-- +2.44.0 + + +From 40bfa9dd57f1efdd0f0dc974e80a438d9db90874 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Thu, 23 Nov 2023 12:13:31 +0100 +Subject: [PATCH 05/70] x86/spec-ctrl: Add SRSO whitepaper URL + +... now that it exists in public. 
+ +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +master commit: 78a86b26868c12ae1cc3dd2a8bb9aa5eebaa41fd +master date: 2023-11-07 17:47:34 +0000 +--- + xen/arch/x86/spec_ctrl.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 6fd7d44ce4..a8d8af22f6 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -903,6 +903,9 @@ static bool __init should_use_eager_fpu(void) + } + } + ++/* ++ * https://www.amd.com/content/dam/amd/en/documents/corporate/cr/speculative-return-stack-overflow-whitepaper.pdf ++ */ + static void __init srso_calculations(bool hw_smt_enabled) + { + if ( !(boot_cpu_data.x86_vendor & +-- +2.44.0 + + +From 3f9390fea5c51a6d64596d295902d28931eeca4c Mon Sep 17 00:00:00 2001 +From: Juergen Gross +Date: Thu, 23 Nov 2023 12:13:53 +0100 +Subject: [PATCH 06/70] xen/sched: fix sched_move_domain() + +When moving a domain out of a cpupool running with the credit2 +scheduler and having multiple run-queues, the following ASSERT() can +be observed: + +(XEN) Xen call trace: +(XEN) [] R credit2.c#csched2_unit_remove+0xe3/0xe7 +(XEN) [] S sched_move_domain+0x2f3/0x5b1 +(XEN) [] S cpupool.c#cpupool_move_domain_locked+0x1d/0x3b +(XEN) [] S cpupool_move_domain+0x24/0x35 +(XEN) [] S domain_kill+0xa5/0x116 +(XEN) [] S do_domctl+0xe5f/0x1951 +(XEN) [] S timer.c#timer_lock+0x69/0x143 +(XEN) [] S pv_hypercall+0x44e/0x4a9 +(XEN) [] S lstar_enter+0x137/0x140 +(XEN) +(XEN) +(XEN) **************************************** +(XEN) Panic on CPU 1: +(XEN) Assertion 'svc->rqd == c2rqd(sched_unit_master(unit))' failed at common/sched/credit2.c:1159 +(XEN) **************************************** + +This is happening as sched_move_domain() is setting a different cpu +for a scheduling unit without telling the scheduler. When this unit is +removed from the scheduler, the ASSERT() will trigger. 
+ +In non-debug builds the result is usually a clobbered pointer, leading +to another crash a short time later. + +Fix that by swapping the two involved actions (setting another cpu and +removing the unit from the scheduler). + +Link: https://github.com/Dasharo/dasharo-issues/issues/488 +Fixes: 70fadc41635b ("xen/cpupool: support moving domain between cpupools with different granularity") +Signed-off-by: Juergen Gross +Reviewed-by: George Dunlap +master commit: 4709ec82917668c2df958ef91b4f21c049c76bee +master date: 2023-11-20 10:49:29 +0100 +--- + xen/common/sched/core.c | 12 +++++++----- + 1 file changed, 7 insertions(+), 5 deletions(-) + +diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c +index 12deefa745..eba0cea4bb 100644 +--- a/xen/common/sched/core.c ++++ b/xen/common/sched/core.c +@@ -732,18 +732,20 @@ int sched_move_domain(struct domain *d, struct cpupool *c) + old_domdata = d->sched_priv; + + /* +- * Temporarily move all units to same processor to make locking +- * easier when moving the new units to the new processors. ++ * Remove all units from the old scheduler, and temporarily move them to ++ * the same processor to make locking easier when moving the new units to ++ * new processors. 
+ */ + new_p = cpumask_first(d->cpupool->cpu_valid); + for_each_sched_unit ( d, unit ) + { +- spinlock_t *lock = unit_schedule_lock_irq(unit); ++ spinlock_t *lock; ++ ++ sched_remove_unit(old_ops, unit); + ++ lock = unit_schedule_lock_irq(unit); + sched_set_res(unit, get_sched_res(new_p)); + spin_unlock_irq(lock); +- +- sched_remove_unit(old_ops, unit); + } + + old_units = d->sched_unit_list; +-- +2.44.0 + + +From 90a6d821757edf1202c527143b8a05b0d2a3dfaa Mon Sep 17 00:00:00 2001 +From: Frediano Ziglio +Date: Wed, 6 Dec 2023 10:37:13 +0100 +Subject: [PATCH 07/70] x86/mem_sharing: Release domain if we are not able to + enable memory sharing + +In case it's not possible to enable memory sharing (mem_sharing_control +fails) we just return the error code without releasing the domain +acquired some lines above by rcu_lock_live_remote_domain_by_id(). + +Fixes: 72f8d45d69b8 ("x86/mem_sharing: enable mem_sharing on first memop") +Signed-off-by: Frediano Ziglio +Reviewed-by: Andrew Cooper +Acked-by: Tamas K Lengyel +master commit: fbcec32d6d3ea0ac329301925b317478316209ed +master date: 2023-11-27 12:06:13 +0000 +--- + xen/arch/x86/mm/mem_sharing.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c +index 142258f16a..429d27ef85 100644 +--- a/xen/arch/x86/mm/mem_sharing.c ++++ b/xen/arch/x86/mm/mem_sharing.c +@@ -2013,7 +2013,7 @@ int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg) + + if ( !mem_sharing_enabled(d) && + (rc = mem_sharing_control(d, true, 0)) ) +- return rc; ++ goto out; + + switch ( mso.op ) + { +-- +2.44.0 + + +From 480168fcb3135f0da6e7a6b3b754c78fabc24d4f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Wed, 6 Dec 2023 10:38:03 +0100 +Subject: [PATCH 08/70] livepatch: do not use .livepatch.funcs section to store + internal state +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Currently 
the livepatch logic inside of Xen will use fields of struct +livepatch_func in order to cache internal state of patched functions. Note +this is a field that is part of the payload, and is loaded as an ELF section +(.livepatch.funcs), taking into account the SHF_* flags in the section +header. + +The flags for the .livepatch.funcs section, as set by livepatch-build-tools, +are SHF_ALLOC, which leads to its contents (the array of livepatch_func +structures) being placed in read-only memory: + +Section Headers: + [Nr] Name Type Address Offset + Size EntSize Flags Link Info Align +[...] + [ 4] .livepatch.funcs PROGBITS 0000000000000000 00000080 + 0000000000000068 0000000000000000 A 0 0 8 + +This previously went unnoticed, as all writes to the fields of livepatch_func +happen in the critical region that had WP disabled in CR0. After 8676092a0f16 +however WP is no longer toggled in CR0 for patch application, and only the +hypervisor .text mappings are made write-accessible. That leads to the +following page fault when attempting to apply a livepatch: + +----[ Xen-4.19-unstable x86_64 debug=y Tainted: C ]---- +CPU: 4 +RIP: e008:[] common/livepatch.c#apply_payload+0x45/0x1e1 +[...] +Xen call trace: + [] R common/livepatch.c#apply_payload+0x45/0x1e1 + [] F check_for_livepatch_work+0x385/0xaa5 + [] F arch/x86/domain.c#idle_loop+0x92/0xee + +Pagetable walk from ffff82d040625079: + L4[0x105] = 000000008c6c9063 ffffffffffffffff + L3[0x141] = 000000008c6c6063 ffffffffffffffff + L2[0x003] = 000000086a1e7063 ffffffffffffffff + L1[0x025] = 800000086ca5d121 ffffffffffffffff + +**************************************** +Panic on CPU 4: +FATAL PAGE FAULT +[error_code=0003] +Faulting linear address: ffff82d040625079 +**************************************** + +Fix this by moving the internal Xen function patching state out of +livepatch_func into an area not allocated as part of the ELF payload. 
While +there also constify the array of livepatch_func structures in order to prevent +further surprises. + +Note there's still one field (old_addr) that gets set during livepatch load. I +consider this fine since the field is read-only after load, and at the point +the field gets set the underlying mapping hasn't been made read-only yet. + +Fixes: 8676092a0f16 ('x86/livepatch: Fix livepatch application when CET is active') +Signed-off-by: Roger Pau Monné +Reviewed-by: Ross Lagerwall + +xen/livepatch: fix livepatch tests + +The current set of in-tree livepatch tests in xen/test/livepatch started +failing after the constify of the payload funcs array, and the movement of the +status data into a separate array. + +Fix the tests so they respect the constness of the funcs array and also make +use of the new location of the per-func state data. + +Fixes: 82182ad7b46e ('livepatch: do not use .livepatch.funcs section to store internal state') +Signed-off-by: Roger Pau Monné +Acked-by: Andrew Cooper +Reviewed-by: Ross Lagerwall +master commit: 82182ad7b46e0f7a3856bb12c7a9bf2e2a4570bc +master date: 2023-11-27 15:16:01 +0100 +master commit: 902377b690f42ddf44ae91c4b0751d597f1cd694 +master date: 2023-11-29 10:46:42 +0000 +--- + xen/arch/arm/arm32/livepatch.c | 9 +++-- + xen/arch/arm/arm64/livepatch.c | 9 +++-- + xen/arch/arm/livepatch.c | 9 +++-- + xen/arch/x86/livepatch.c | 26 +++++++------ + xen/common/livepatch.c | 25 ++++++++---- + xen/include/public/sysctl.h | 5 +-- + xen/include/xen/livepatch.h | 38 +++++++++++++------ + xen/include/xen/livepatch_payload.h | 3 +- + xen/test/livepatch/xen_action_hooks.c | 12 +++--- + xen/test/livepatch/xen_action_hooks_marker.c | 20 ++++++---- + xen/test/livepatch/xen_action_hooks_noapply.c | 22 ++++++----- + xen/test/livepatch/xen_action_hooks_nofunc.c | 6 +-- + .../livepatch/xen_action_hooks_norevert.c | 24 +++++++----- + xen/test/livepatch/xen_prepost_hooks.c | 8 ++-- + xen/test/livepatch/xen_prepost_hooks_fail.c | 2 +- + 15 files 
changed, 130 insertions(+), 88 deletions(-) + +diff --git a/xen/arch/arm/arm32/livepatch.c b/xen/arch/arm/arm32/livepatch.c +index 3c50283b2a..80d2659b78 100644 +--- a/xen/arch/arm/arm32/livepatch.c ++++ b/xen/arch/arm/arm32/livepatch.c +@@ -11,23 +11,24 @@ + #include + #include + +-void arch_livepatch_apply(struct livepatch_func *func) ++void arch_livepatch_apply(const struct livepatch_func *func, ++ struct livepatch_fstate *state) + { + uint32_t insn; + uint32_t *new_ptr; + unsigned int i, len; + +- BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE > sizeof(func->opaque)); ++ BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE > sizeof(state->insn_buffer)); + BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE != sizeof(insn)); + + ASSERT(vmap_of_xen_text); + +- len = livepatch_insn_len(func); ++ len = livepatch_insn_len(func, state); + if ( !len ) + return; + + /* Save old ones. */ +- memcpy(func->opaque, func->old_addr, len); ++ memcpy(state->insn_buffer, func->old_addr, len); + + if ( func->new_addr ) + { +diff --git a/xen/arch/arm/arm64/livepatch.c b/xen/arch/arm/arm64/livepatch.c +index 62d2ef373a..df2cebedde 100644 +--- a/xen/arch/arm/arm64/livepatch.c ++++ b/xen/arch/arm/arm64/livepatch.c +@@ -15,23 +15,24 @@ + #include + #include + +-void arch_livepatch_apply(struct livepatch_func *func) ++void arch_livepatch_apply(const struct livepatch_func *func, ++ struct livepatch_fstate *state) + { + uint32_t insn; + uint32_t *new_ptr; + unsigned int i, len; + +- BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE > sizeof(func->opaque)); ++ BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE > sizeof(state->insn_buffer)); + BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE != sizeof(insn)); + + ASSERT(vmap_of_xen_text); + +- len = livepatch_insn_len(func); ++ len = livepatch_insn_len(func, state); + if ( !len ) + return; + + /* Save old ones. 
*/ +- memcpy(func->opaque, func->old_addr, len); ++ memcpy(state->insn_buffer, func->old_addr, len); + + if ( func->new_addr ) + insn = aarch64_insn_gen_branch_imm((unsigned long)func->old_addr, +diff --git a/xen/arch/arm/livepatch.c b/xen/arch/arm/livepatch.c +index d646379c8c..bbca1e5a5e 100644 +--- a/xen/arch/arm/livepatch.c ++++ b/xen/arch/arm/livepatch.c +@@ -69,7 +69,7 @@ void arch_livepatch_revive(void) + int arch_livepatch_verify_func(const struct livepatch_func *func) + { + /* If NOPing only do up to maximum amount we can put in the ->opaque. */ +- if ( !func->new_addr && (func->new_size > sizeof(func->opaque) || ++ if ( !func->new_addr && (func->new_size > LIVEPATCH_OPAQUE_SIZE || + func->new_size % ARCH_PATCH_INSN_SIZE) ) + return -EOPNOTSUPP; + +@@ -79,15 +79,16 @@ int arch_livepatch_verify_func(const struct livepatch_func *func) + return 0; + } + +-void arch_livepatch_revert(const struct livepatch_func *func) ++void arch_livepatch_revert(const struct livepatch_func *func, ++ struct livepatch_fstate *state) + { + uint32_t *new_ptr; + unsigned int len; + + new_ptr = func->old_addr - (void *)_start + vmap_of_xen_text; + +- len = livepatch_insn_len(func); +- memcpy(new_ptr, func->opaque, len); ++ len = livepatch_insn_len(func, state); ++ memcpy(new_ptr, state->insn_buffer, len); + + clean_and_invalidate_dcache_va_range(new_ptr, len); + } +diff --git a/xen/arch/x86/livepatch.c b/xen/arch/x86/livepatch.c +index a54d991c5f..ee539f001b 100644 +--- a/xen/arch/x86/livepatch.c ++++ b/xen/arch/x86/livepatch.c +@@ -95,7 +95,7 @@ int arch_livepatch_verify_func(const struct livepatch_func *func) + if ( !func->new_addr ) + { + /* Only do up to maximum amount we can put in the ->opaque. 
*/ +- if ( func->new_size > sizeof(func->opaque) ) ++ if ( func->new_size > LIVEPATCH_OPAQUE_SIZE ) + return -EOPNOTSUPP; + + if ( func->old_size < func->new_size ) +@@ -123,13 +123,14 @@ int arch_livepatch_verify_func(const struct livepatch_func *func) + * "noinline" to cause control flow change and thus invalidate I$ and + * cause refetch after modification. + */ +-void noinline arch_livepatch_apply(struct livepatch_func *func) ++void noinline arch_livepatch_apply(const struct livepatch_func *func, ++ struct livepatch_fstate *state) + { + uint8_t *old_ptr; +- uint8_t insn[sizeof(func->opaque)]; ++ uint8_t insn[sizeof(state->insn_buffer)]; + unsigned int len; + +- func->patch_offset = 0; ++ state->patch_offset = 0; + old_ptr = func->old_addr; + + /* +@@ -141,14 +142,14 @@ void noinline arch_livepatch_apply(struct livepatch_func *func) + * ENDBR64 or similar instructions). + */ + if ( is_endbr64(old_ptr) || is_endbr64_poison(func->old_addr) ) +- func->patch_offset += ENDBR64_LEN; ++ state->patch_offset += ENDBR64_LEN; + + /* This call must be done with ->patch_offset already set. */ +- len = livepatch_insn_len(func); ++ len = livepatch_insn_len(func, state); + if ( !len ) + return; + +- memcpy(func->opaque, old_ptr + func->patch_offset, len); ++ memcpy(state->insn_buffer, old_ptr + state->patch_offset, len); + if ( func->new_addr ) + { + int32_t val; +@@ -156,7 +157,7 @@ void noinline arch_livepatch_apply(struct livepatch_func *func) + BUILD_BUG_ON(ARCH_PATCH_INSN_SIZE != (1 + sizeof(val))); + + insn[0] = 0xe9; /* Relative jump. 
*/ +- val = func->new_addr - (func->old_addr + func->patch_offset + ++ val = func->new_addr - (func->old_addr + state->patch_offset + + ARCH_PATCH_INSN_SIZE); + + memcpy(&insn[1], &val, sizeof(val)); +@@ -164,17 +165,18 @@ void noinline arch_livepatch_apply(struct livepatch_func *func) + else + add_nops(insn, len); + +- memcpy(old_ptr + func->patch_offset, insn, len); ++ memcpy(old_ptr + state->patch_offset, insn, len); + } + + /* + * "noinline" to cause control flow change and thus invalidate I$ and + * cause refetch after modification. + */ +-void noinline arch_livepatch_revert(const struct livepatch_func *func) ++void noinline arch_livepatch_revert(const struct livepatch_func *func, ++ struct livepatch_fstate *state) + { +- memcpy(func->old_addr + func->patch_offset, func->opaque, +- livepatch_insn_len(func)); ++ memcpy(func->old_addr + state->patch_offset, state->insn_buffer, ++ livepatch_insn_len(func, state)); + } + + /* +diff --git a/xen/common/livepatch.c b/xen/common/livepatch.c +index d89a904bd4..e635606c10 100644 +--- a/xen/common/livepatch.c ++++ b/xen/common/livepatch.c +@@ -260,6 +260,9 @@ static void free_payload_data(struct payload *payload) + vfree((void *)payload->text_addr); + + payload->pages = 0; ++ ++ /* fstate gets allocated strictly after move_payload. 
*/ ++ XFREE(payload->fstate); + } + + /* +@@ -656,6 +659,7 @@ static int prepare_payload(struct payload *payload, + { + const struct livepatch_elf_sec *sec; + unsigned int i; ++ struct livepatch_func *funcs; + struct livepatch_func *f; + struct virtual_region *region; + const Elf_Note *n; +@@ -666,14 +670,19 @@ static int prepare_payload(struct payload *payload, + if ( !section_ok(elf, sec, sizeof(*payload->funcs)) ) + return -EINVAL; + +- payload->funcs = sec->load_addr; ++ payload->funcs = funcs = sec->load_addr; + payload->nfuncs = sec->sec->sh_size / sizeof(*payload->funcs); + ++ payload->fstate = xzalloc_array(typeof(*payload->fstate), ++ payload->nfuncs); ++ if ( !payload->fstate ) ++ return -ENOMEM; ++ + for ( i = 0; i < payload->nfuncs; i++ ) + { + int rc; + +- f = &(payload->funcs[i]); ++ f = &(funcs[i]); + + if ( f->version != LIVEPATCH_PAYLOAD_VERSION ) + { +@@ -1361,7 +1370,7 @@ static int apply_payload(struct payload *data) + ASSERT(!local_irq_is_enabled()); + + for ( i = 0; i < data->nfuncs; i++ ) +- common_livepatch_apply(&data->funcs[i]); ++ common_livepatch_apply(&data->funcs[i], &data->fstate[i]); + + arch_livepatch_revive(); + +@@ -1397,7 +1406,7 @@ static int revert_payload(struct payload *data) + } + + for ( i = 0; i < data->nfuncs; i++ ) +- common_livepatch_revert(&data->funcs[i]); ++ common_livepatch_revert(&data->funcs[i], &data->fstate[i]); + + /* + * Since we are running with IRQs disabled and the hooks may call common +@@ -1438,9 +1447,10 @@ static inline bool was_action_consistent(const struct payload *data, livepatch_f + + for ( i = 0; i < data->nfuncs; i++ ) + { +- struct livepatch_func *f = &(data->funcs[i]); ++ const struct livepatch_func *f = &(data->funcs[i]); ++ const struct livepatch_fstate *s = &(data->fstate[i]); + +- if ( f->applied != expected_state ) ++ if ( s->applied != expected_state ) + { + printk(XENLOG_ERR LIVEPATCH "%s: Payload has a function: '%s' with inconsistent applied state.\n", + data->name, f->name ?: 
"noname"); +@@ -2157,7 +2167,8 @@ static void cf_check livepatch_printall(unsigned char key) + + for ( i = 0; i < data->nfuncs; i++ ) + { +- struct livepatch_func *f = &(data->funcs[i]); ++ const struct livepatch_func *f = &(data->funcs[i]); ++ + printk(" %s patch %p(%u) with %p (%u)\n", + f->name, f->old_addr, f->old_size, f->new_addr, f->new_size); + +diff --git a/xen/include/public/sysctl.h b/xen/include/public/sysctl.h +index f1eba78405..9b19679cae 100644 +--- a/xen/include/public/sysctl.h ++++ b/xen/include/public/sysctl.h +@@ -991,10 +991,7 @@ struct livepatch_func { + uint32_t new_size; + uint32_t old_size; + uint8_t version; /* MUST be LIVEPATCH_PAYLOAD_VERSION. */ +- uint8_t opaque[LIVEPATCH_OPAQUE_SIZE]; +- uint8_t applied; +- uint8_t patch_offset; +- uint8_t _pad[6]; ++ uint8_t _pad[39]; + livepatch_expectation_t expect; + }; + typedef struct livepatch_func livepatch_func_t; +diff --git a/xen/include/xen/livepatch.h b/xen/include/xen/livepatch.h +index 9fdb29c382..537d3d58b6 100644 +--- a/xen/include/xen/livepatch.h ++++ b/xen/include/xen/livepatch.h +@@ -13,6 +13,9 @@ struct xen_sysctl_livepatch_op; + + #include + #include /* For -ENOSYS or -EOVERFLOW */ ++ ++#include /* For LIVEPATCH_OPAQUE_SIZE */ ++ + #ifdef CONFIG_LIVEPATCH + + /* +@@ -51,6 +54,12 @@ struct livepatch_symbol { + bool_t new_symbol; + }; + ++struct livepatch_fstate { ++ unsigned int patch_offset; ++ enum livepatch_func_state applied; ++ uint8_t insn_buffer[LIVEPATCH_OPAQUE_SIZE]; ++}; ++ + int livepatch_op(struct xen_sysctl_livepatch_op *); + void check_for_livepatch_work(void); + unsigned long livepatch_symbols_lookup_by_name(const char *symname); +@@ -87,10 +96,11 @@ void arch_livepatch_init(void); + int arch_livepatch_verify_func(const struct livepatch_func *func); + + static inline +-unsigned int livepatch_insn_len(const struct livepatch_func *func) ++unsigned int livepatch_insn_len(const struct livepatch_func *func, ++ const struct livepatch_fstate *state) + { + if ( 
!func->new_addr ) +- return func->new_size - func->patch_offset; ++ return func->new_size - state->patch_offset; + + return ARCH_PATCH_INSN_SIZE; + } +@@ -117,39 +127,43 @@ int arch_livepatch_safety_check(void); + int arch_livepatch_quiesce(void); + void arch_livepatch_revive(void); + +-void arch_livepatch_apply(struct livepatch_func *func); +-void arch_livepatch_revert(const struct livepatch_func *func); ++void arch_livepatch_apply(const struct livepatch_func *func, ++ struct livepatch_fstate *state); ++void arch_livepatch_revert(const struct livepatch_func *func, ++ struct livepatch_fstate *state); + void arch_livepatch_post_action(void); + + void arch_livepatch_mask(void); + void arch_livepatch_unmask(void); + +-static inline void common_livepatch_apply(struct livepatch_func *func) ++static inline void common_livepatch_apply(const struct livepatch_func *func, ++ struct livepatch_fstate *state) + { + /* If the action has been already executed on this function, do nothing. */ +- if ( func->applied == LIVEPATCH_FUNC_APPLIED ) ++ if ( state->applied == LIVEPATCH_FUNC_APPLIED ) + { + printk(XENLOG_WARNING LIVEPATCH "%s: %s has been already applied before\n", + __func__, func->name); + return; + } + +- arch_livepatch_apply(func); +- func->applied = LIVEPATCH_FUNC_APPLIED; ++ arch_livepatch_apply(func, state); ++ state->applied = LIVEPATCH_FUNC_APPLIED; + } + +-static inline void common_livepatch_revert(struct livepatch_func *func) ++static inline void common_livepatch_revert(const struct livepatch_func *func, ++ struct livepatch_fstate *state) + { + /* If the apply action hasn't been executed on this function, do nothing. 
*/ +- if ( !func->old_addr || func->applied == LIVEPATCH_FUNC_NOT_APPLIED ) ++ if ( !func->old_addr || state->applied == LIVEPATCH_FUNC_NOT_APPLIED ) + { + printk(XENLOG_WARNING LIVEPATCH "%s: %s has not been applied before\n", + __func__, func->name); + return; + } + +- arch_livepatch_revert(func); +- func->applied = LIVEPATCH_FUNC_NOT_APPLIED; ++ arch_livepatch_revert(func, state); ++ state->applied = LIVEPATCH_FUNC_NOT_APPLIED; + } + #else + +diff --git a/xen/include/xen/livepatch_payload.h b/xen/include/xen/livepatch_payload.h +index 9f5f064205..b9cd4f2096 100644 +--- a/xen/include/xen/livepatch_payload.h ++++ b/xen/include/xen/livepatch_payload.h +@@ -52,7 +52,8 @@ struct payload { + size_t ro_size; /* .. and its size (if any). */ + unsigned int pages; /* Total pages for [text,rw,ro]_addr */ + struct list_head applied_list; /* Linked to 'applied_list'. */ +- struct livepatch_func *funcs; /* The array of functions to patch. */ ++ const struct livepatch_func *funcs; /* The array of functions to patch. */ ++ struct livepatch_fstate *fstate; /* State of patched functions. */ + unsigned int nfuncs; /* Nr of functions to patch. */ + const struct livepatch_symbol *symtab; /* All symbols. */ + const char *strtab; /* Pointer to .strtab. 
*/ +diff --git a/xen/test/livepatch/xen_action_hooks.c b/xen/test/livepatch/xen_action_hooks.c +index 39b5313027..fa0b3ab35f 100644 +--- a/xen/test/livepatch/xen_action_hooks.c ++++ b/xen/test/livepatch/xen_action_hooks.c +@@ -26,9 +26,10 @@ static int apply_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; ++ struct livepatch_fstate *fstate = &payload->fstate[i]; + +- func->applied = LIVEPATCH_FUNC_APPLIED; ++ fstate->applied = LIVEPATCH_FUNC_APPLIED; + apply_cnt++; + + printk(KERN_DEBUG "%s: applying: %s\n", __func__, func->name); +@@ -47,9 +48,10 @@ static int revert_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; ++ struct livepatch_fstate *fstate = &payload->fstate[i]; + +- func->applied = LIVEPATCH_FUNC_NOT_APPLIED; ++ fstate->applied = LIVEPATCH_FUNC_NOT_APPLIED; + revert_cnt++; + + printk(KERN_DEBUG "%s: reverting: %s\n", __func__, func->name); +@@ -68,7 +70,7 @@ static void post_revert_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; + + printk(KERN_DEBUG "%s: reverted: %s\n", __func__, func->name); + } +diff --git a/xen/test/livepatch/xen_action_hooks_marker.c b/xen/test/livepatch/xen_action_hooks_marker.c +index 4f807a577f..d2e22f70d1 100644 +--- a/xen/test/livepatch/xen_action_hooks_marker.c ++++ b/xen/test/livepatch/xen_action_hooks_marker.c +@@ -23,9 +23,10 @@ static int pre_apply_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; ++ struct livepatch_fstate *fstate = &payload->fstate[i]; 
+ +- BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED); ++ BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED); + printk(KERN_DEBUG "%s: pre applied: %s\n", __func__, func->name); + } + +@@ -42,9 +43,10 @@ static void post_apply_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; ++ struct livepatch_fstate *fstate = &payload->fstate[i]; + +- BUG_ON(func->applied != LIVEPATCH_FUNC_APPLIED); ++ BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED); + printk(KERN_DEBUG "%s: post applied: %s\n", __func__, func->name); + } + +@@ -59,9 +61,10 @@ static int pre_revert_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; ++ struct livepatch_fstate *fstate = &payload->fstate[i]; + +- BUG_ON(func->applied != LIVEPATCH_FUNC_APPLIED); ++ BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED); + printk(KERN_DEBUG "%s: pre reverted: %s\n", __func__, func->name); + } + +@@ -78,9 +81,10 @@ static void post_revert_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; ++ struct livepatch_fstate *fstate = &payload->fstate[i]; + +- BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED); ++ BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED); + printk(KERN_DEBUG "%s: post reverted: %s\n", __func__, func->name); + } + +diff --git a/xen/test/livepatch/xen_action_hooks_noapply.c b/xen/test/livepatch/xen_action_hooks_noapply.c +index 4c55c156a6..646a5fd2f0 100644 +--- a/xen/test/livepatch/xen_action_hooks_noapply.c ++++ b/xen/test/livepatch/xen_action_hooks_noapply.c +@@ -25,9 +25,10 @@ static int pre_apply_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct 
livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; ++ struct livepatch_fstate *fstate = &payload->fstate[i]; + +- BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED); ++ BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED); + printk(KERN_DEBUG "%s: pre applied: %s\n", __func__, func->name); + } + +@@ -44,7 +45,7 @@ static int apply_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; + + apply_cnt++; + printk(KERN_DEBUG "%s: applying: %s\n", __func__, func->name); +@@ -63,10 +64,11 @@ static void post_apply_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; ++ struct livepatch_fstate *fstate = &payload->fstate[i]; + + BUG_ON(apply_cnt != 1); +- BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED); ++ BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED); + printk(KERN_DEBUG "%s: post applied: %s\n", __func__, func->name); + } + +@@ -81,9 +83,10 @@ static int pre_revert_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; ++ struct livepatch_fstate *fstate = &payload->fstate[i]; + +- BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED); ++ BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED); + printk(KERN_DEBUG "%s: pre reverted: %s\n", __func__, func->name); + } + +@@ -100,9 +103,10 @@ static void post_revert_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; ++ struct livepatch_fstate *fstate = &payload->fstate[i]; + +- BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED); ++ 
BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED); + printk(KERN_DEBUG "%s: post reverted: %s\n", __func__, func->name); + } + +diff --git a/xen/test/livepatch/xen_action_hooks_nofunc.c b/xen/test/livepatch/xen_action_hooks_nofunc.c +index 2b4e90436f..077c4c1738 100644 +--- a/xen/test/livepatch/xen_action_hooks_nofunc.c ++++ b/xen/test/livepatch/xen_action_hooks_nofunc.c +@@ -23,7 +23,7 @@ static int apply_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; + + apply_cnt++; + printk(KERN_DEBUG "%s: applying: %s\n", __func__, func->name); +@@ -42,7 +42,7 @@ static int revert_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; + + revert_cnt++; + printk(KERN_DEBUG "%s: reverting: %s\n", __func__, func->name); +@@ -61,7 +61,7 @@ static void post_revert_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; + + printk(KERN_DEBUG "%s: reverted: %s\n", __func__, func->name); + } +diff --git a/xen/test/livepatch/xen_action_hooks_norevert.c b/xen/test/livepatch/xen_action_hooks_norevert.c +index ef77e72071..3e21ade6ab 100644 +--- a/xen/test/livepatch/xen_action_hooks_norevert.c ++++ b/xen/test/livepatch/xen_action_hooks_norevert.c +@@ -25,9 +25,10 @@ static int pre_apply_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; ++ struct livepatch_fstate *fstate = &payload->fstate[i]; + +- BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED); ++ BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED); + printk(KERN_DEBUG "%s: pre applied: 
%s\n", __func__, func->name); + } + +@@ -44,9 +45,10 @@ static void post_apply_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; ++ struct livepatch_fstate *fstate = &payload->fstate[i]; + +- BUG_ON(func->applied != LIVEPATCH_FUNC_APPLIED); ++ BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED); + printk(KERN_DEBUG "%s: post applied: %s\n", __func__, func->name); + } + +@@ -61,9 +63,10 @@ static int pre_revert_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; ++ struct livepatch_fstate *fstate = &payload->fstate[i]; + +- BUG_ON(func->applied != LIVEPATCH_FUNC_APPLIED); ++ BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED); + printk(KERN_DEBUG "%s: pre reverted: %s\n", __func__, func->name); + } + +@@ -80,7 +83,7 @@ static int revert_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; + + revert_cnt++; + printk(KERN_DEBUG "%s: reverting: %s\n", __func__, func->name); +@@ -99,16 +102,17 @@ static void post_revert_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; ++ struct livepatch_fstate *fstate = &payload->fstate[i]; + + BUG_ON(revert_cnt != 1); +- BUG_ON(func->applied != LIVEPATCH_FUNC_APPLIED); ++ BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED); + + /* Outside of quiesce zone: MAY TRIGGER HOST CRASH/UNDEFINED BEHAVIOR */ + arch_livepatch_quiesce(); + common_livepatch_revert(payload); + arch_livepatch_revive(); +- BUG_ON(func->applied == LIVEPATCH_FUNC_APPLIED); ++ BUG_ON(fstate->applied == 
LIVEPATCH_FUNC_APPLIED); + + printk(KERN_DEBUG "%s: post reverted: %s\n", __func__, func->name); + } +diff --git a/xen/test/livepatch/xen_prepost_hooks.c b/xen/test/livepatch/xen_prepost_hooks.c +index 889377d6eb..17f5af6a19 100644 +--- a/xen/test/livepatch/xen_prepost_hooks.c ++++ b/xen/test/livepatch/xen_prepost_hooks.c +@@ -30,7 +30,7 @@ static int pre_apply_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; + + pre_apply_cnt++; + printk(KERN_DEBUG "%s: applying: %s\n", __func__, func->name); +@@ -49,7 +49,7 @@ static void post_apply_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; + + post_apply_cnt++; + printk(KERN_DEBUG "%s: applied: %s\n", __func__, func->name); +@@ -66,7 +66,7 @@ static int pre_revert_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; + + pre_revert_cnt++; + printk(KERN_DEBUG "%s: reverting: %s\n", __func__, func->name); +@@ -86,7 +86,7 @@ static void post_revert_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func *func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; + + post_revert_cnt++; + printk(KERN_DEBUG "%s: reverted: %s\n", __func__, func->name); +diff --git a/xen/test/livepatch/xen_prepost_hooks_fail.c b/xen/test/livepatch/xen_prepost_hooks_fail.c +index c6feb5d32d..52fd7f642e 100644 +--- a/xen/test/livepatch/xen_prepost_hooks_fail.c ++++ b/xen/test/livepatch/xen_prepost_hooks_fail.c +@@ -24,7 +24,7 @@ static int pre_apply_hook(livepatch_payload_t *payload) + + for (i = 0; i < payload->nfuncs; i++) + { +- struct livepatch_func 
*func = &payload->funcs[i]; ++ const struct livepatch_func *func = &payload->funcs[i]; + + printk(KERN_DEBUG "%s: pre applying: %s\n", __func__, func->name); + } +-- +2.44.0 + + +From 61d032e322b178a49983359b0dfd64a42c1f5fca Mon Sep 17 00:00:00 2001 +From: Alejandro Vallejo +Date: Wed, 6 Dec 2023 10:39:15 +0100 +Subject: [PATCH 09/70] xen/x86: In x2APIC mode, derive LDR from APIC ID +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Both Intel and AMD manuals agree that in x2APIC mode, the APIC LDR and ID +registers are derivable from each other through a fixed formula. + +Xen uses that formula, but applies it to vCPU IDs (which are sequential) +rather than x2APIC IDs (which are not, at the moment). As I understand it, +this is an attempt to tightly pack vCPUs into clusters so each cluster has +16 vCPUs rather than 8, but this is a spec violation. + +This patch fixes the implementation so we follow the x2APIC spec for new +VMs, while preserving the behaviour (buggy or fixed) for migrated-in VMs. + +While touching that area, remove the existing printk statement in +vlapic_load_fixup() (as the checks it performed didn't make sense in x2APIC +mode and wouldn't affect the outcome) and put another printk as an else +branch so we get warnings trying to load nonsensical LDR values we don't +know about. 
+ +Fixes: f9e0cccf7b35 ("x86/HVM: fix ID handling of x2APIC emulation") +Signed-off-by: Alejandro Vallejo +Reviewed-by: Roger Pau Monné +Reviewed-by: Andrew Cooper +master commit: 90309854fd2440fb08b4c808f47d7670ba0d250d +master date: 2023-11-29 10:05:55 +0100 +--- + xen/arch/x86/hvm/vlapic.c | 64 +++++++++++++++++++-------- + xen/arch/x86/include/asm/hvm/domain.h | 3 ++ + 2 files changed, 48 insertions(+), 19 deletions(-) + +diff --git a/xen/arch/x86/hvm/vlapic.c b/xen/arch/x86/hvm/vlapic.c +index c7ce82d064..ba569043ea 100644 +--- a/xen/arch/x86/hvm/vlapic.c ++++ b/xen/arch/x86/hvm/vlapic.c +@@ -1061,13 +1061,26 @@ static const struct hvm_mmio_ops vlapic_mmio_ops = { + .write = vlapic_mmio_write, + }; + ++static uint32_t x2apic_ldr_from_id(uint32_t id) ++{ ++ return ((id & ~0xf) << 12) | (1 << (id & 0xf)); ++} ++ + static void set_x2apic_id(struct vlapic *vlapic) + { +- u32 id = vlapic_vcpu(vlapic)->vcpu_id; +- u32 ldr = ((id & ~0xf) << 12) | (1 << (id & 0xf)); ++ const struct vcpu *v = vlapic_vcpu(vlapic); ++ uint32_t apic_id = v->vcpu_id * 2; ++ uint32_t apic_ldr = x2apic_ldr_from_id(apic_id); + +- vlapic_set_reg(vlapic, APIC_ID, id * 2); +- vlapic_set_reg(vlapic, APIC_LDR, ldr); ++ /* ++ * Workaround for migrated domains to derive LDRs as the source host ++ * would've. 
++ */ ++ if ( v->domain->arch.hvm.bug_x2apic_ldr_vcpu_id ) ++ apic_ldr = x2apic_ldr_from_id(v->vcpu_id); ++ ++ vlapic_set_reg(vlapic, APIC_ID, apic_id); ++ vlapic_set_reg(vlapic, APIC_LDR, apic_ldr); + } + + int guest_wrmsr_apic_base(struct vcpu *v, uint64_t val) +@@ -1498,27 +1511,40 @@ static int cf_check lapic_save_regs(struct vcpu *v, hvm_domain_context_t *h) + */ + static void lapic_load_fixup(struct vlapic *vlapic) + { +- uint32_t id = vlapic->loaded.id; ++ const struct vcpu *v = vlapic_vcpu(vlapic); ++ uint32_t good_ldr = x2apic_ldr_from_id(vlapic->loaded.id); + +- if ( vlapic_x2apic_mode(vlapic) && id && vlapic->loaded.ldr == 1 ) ++ /* Skip fixups on xAPIC mode, or if the x2APIC LDR is already correct */ ++ if ( !vlapic_x2apic_mode(vlapic) || ++ (vlapic->loaded.ldr == good_ldr) ) ++ return; ++ ++ if ( vlapic->loaded.ldr == 1 ) + { +- /* +- * This is optional: ID != 0 contradicts LDR == 1. It's being added +- * to aid in eventual debugging of issues arising from the fixup done +- * here, but can be dropped as soon as it is found to conflict with +- * other (future) changes. +- */ +- if ( GET_xAPIC_ID(id) != vlapic_vcpu(vlapic)->vcpu_id * 2 || +- id != SET_xAPIC_ID(GET_xAPIC_ID(id)) ) +- printk(XENLOG_G_WARNING "%pv: bogus APIC ID %#x loaded\n", +- vlapic_vcpu(vlapic), id); ++ /* ++ * Xen <= 4.4 may have a bug by which all the APICs configured in ++ * x2APIC mode got LDR = 1, which is inconsistent on every vCPU ++ * except for the one with ID = 0. We'll fix the bug now and assign ++ * an LDR value consistent with the APIC ID. ++ */ + set_x2apic_id(vlapic); + } +- else /* Undo an eventual earlier fixup. */ ++ else if ( vlapic->loaded.ldr == x2apic_ldr_from_id(v->vcpu_id) ) + { +- vlapic_set_reg(vlapic, APIC_ID, id); +- vlapic_set_reg(vlapic, APIC_LDR, vlapic->loaded.ldr); ++ /* ++ * Migrations from Xen 4.4 to date (4.19 dev window, Nov 2023) may ++ * have LDR drived from the vCPU ID, not the APIC ID. 
We must preserve ++ * LDRs so new vCPUs use consistent derivations and existing guests, ++ * which may have already read the LDR at the source host, aren't ++ * surprised when interrupts stop working the way they did at the ++ * other end. ++ */ ++ v->domain->arch.hvm.bug_x2apic_ldr_vcpu_id = true; + } ++ else ++ printk(XENLOG_G_WARNING ++ "%pv: bogus x2APIC record: ID %#x, LDR %#x, expected LDR %#x\n", ++ v, vlapic->loaded.id, vlapic->loaded.ldr, good_ldr); + } + + static int cf_check lapic_load_hidden(struct domain *d, hvm_domain_context_t *h) +diff --git a/xen/arch/x86/include/asm/hvm/domain.h b/xen/arch/x86/include/asm/hvm/domain.h +index 6e53ce4449..dd9d837e84 100644 +--- a/xen/arch/x86/include/asm/hvm/domain.h ++++ b/xen/arch/x86/include/asm/hvm/domain.h +@@ -106,6 +106,9 @@ struct hvm_domain { + + bool is_s3_suspended; + ++ /* Compatibility setting for a bug in x2APIC LDR */ ++ bool bug_x2apic_ldr_vcpu_id; ++ + /* hypervisor intercepted msix table */ + struct list_head msixtbl_list; + +-- +2.44.0 + + +From 3af9d1cbb602a9dcbab2e43fab74a881c2e05d81 Mon Sep 17 00:00:00 2001 +From: Alejandro Vallejo +Date: Wed, 6 Dec 2023 10:39:55 +0100 +Subject: [PATCH 10/70] tools/xg: Fix potential memory leak in cpu policy + getters/setters + +They allocate two different hypercall buffers, but leak the first +allocation if the second one failed due to an early return that bypasses +cleanup. + +Remove the early exit and go through _post() instead. Invoking _post() is +benign even if _pre() failed. 
+ +Fixes: 6b85e427098c ('x86/sysctl: Implement XEN_SYSCTL_get_cpu_policy') +Fixes: 60529dfeca14 ('x86/domctl: Implement XEN_DOMCTL_get_cpu_policy') +Fixes: 14ba07e6f816 ('x86/domctl: Implement XEN_DOMCTL_set_cpumsr_policy') +Signed-off-by: Alejandro Vallejo +Reviewed-by: Anthony PERARD +master commit: 1571ff7a987b88b20598a6d49910457f3b2c59f1 +master date: 2023-12-01 10:53:07 +0100 +--- + tools/libs/guest/xg_cpuid_x86.c | 86 +++++++++++++++------------------ + 1 file changed, 39 insertions(+), 47 deletions(-) + +diff --git a/tools/libs/guest/xg_cpuid_x86.c b/tools/libs/guest/xg_cpuid_x86.c +index f2b1e80901..3a74bb2b37 100644 +--- a/tools/libs/guest/xg_cpuid_x86.c ++++ b/tools/libs/guest/xg_cpuid_x86.c +@@ -136,20 +136,20 @@ static int get_system_cpu_policy(xc_interface *xch, uint32_t index, + DECLARE_HYPERCALL_BOUNCE(msrs, + *nr_msrs * sizeof(*msrs), + XC_HYPERCALL_BUFFER_BOUNCE_OUT); +- int ret; +- +- if ( xc_hypercall_bounce_pre(xch, leaves) || +- xc_hypercall_bounce_pre(xch, msrs) ) +- return -1; ++ int ret = -1; + +- sysctl.cmd = XEN_SYSCTL_get_cpu_policy; +- sysctl.u.cpu_policy.index = index; +- sysctl.u.cpu_policy.nr_leaves = *nr_leaves; +- set_xen_guest_handle(sysctl.u.cpu_policy.leaves, leaves); +- sysctl.u.cpu_policy.nr_msrs = *nr_msrs; +- set_xen_guest_handle(sysctl.u.cpu_policy.msrs, msrs); +- +- ret = do_sysctl(xch, &sysctl); ++ if ( !xc_hypercall_bounce_pre(xch, leaves) && ++ !xc_hypercall_bounce_pre(xch, msrs) ) ++ { ++ sysctl.cmd = XEN_SYSCTL_get_cpu_policy; ++ sysctl.u.cpu_policy.index = index; ++ sysctl.u.cpu_policy.nr_leaves = *nr_leaves; ++ set_xen_guest_handle(sysctl.u.cpu_policy.leaves, leaves); ++ sysctl.u.cpu_policy.nr_msrs = *nr_msrs; ++ set_xen_guest_handle(sysctl.u.cpu_policy.msrs, msrs); ++ ++ ret = do_sysctl(xch, &sysctl); ++ } + + xc_hypercall_bounce_post(xch, leaves); + xc_hypercall_bounce_post(xch, msrs); +@@ -174,20 +174,20 @@ static int get_domain_cpu_policy(xc_interface *xch, uint32_t domid, + DECLARE_HYPERCALL_BOUNCE(msrs, + 
*nr_msrs * sizeof(*msrs), + XC_HYPERCALL_BUFFER_BOUNCE_OUT); +- int ret; +- +- if ( xc_hypercall_bounce_pre(xch, leaves) || +- xc_hypercall_bounce_pre(xch, msrs) ) +- return -1; +- +- domctl.cmd = XEN_DOMCTL_get_cpu_policy; +- domctl.domain = domid; +- domctl.u.cpu_policy.nr_leaves = *nr_leaves; +- set_xen_guest_handle(domctl.u.cpu_policy.leaves, leaves); +- domctl.u.cpu_policy.nr_msrs = *nr_msrs; +- set_xen_guest_handle(domctl.u.cpu_policy.msrs, msrs); ++ int ret = -1; + +- ret = do_domctl(xch, &domctl); ++ if ( !xc_hypercall_bounce_pre(xch, leaves) && ++ !xc_hypercall_bounce_pre(xch, msrs) ) ++ { ++ domctl.cmd = XEN_DOMCTL_get_cpu_policy; ++ domctl.domain = domid; ++ domctl.u.cpu_policy.nr_leaves = *nr_leaves; ++ set_xen_guest_handle(domctl.u.cpu_policy.leaves, leaves); ++ domctl.u.cpu_policy.nr_msrs = *nr_msrs; ++ set_xen_guest_handle(domctl.u.cpu_policy.msrs, msrs); ++ ++ ret = do_domctl(xch, &domctl); ++ } + + xc_hypercall_bounce_post(xch, leaves); + xc_hypercall_bounce_post(xch, msrs); +@@ -214,32 +214,24 @@ int xc_set_domain_cpu_policy(xc_interface *xch, uint32_t domid, + DECLARE_HYPERCALL_BOUNCE(msrs, + nr_msrs * sizeof(*msrs), + XC_HYPERCALL_BUFFER_BOUNCE_IN); +- int ret; +- +- if ( err_leaf_p ) +- *err_leaf_p = -1; +- if ( err_subleaf_p ) +- *err_subleaf_p = -1; +- if ( err_msr_p ) +- *err_msr_p = -1; ++ int ret = -1; + +- if ( xc_hypercall_bounce_pre(xch, leaves) ) +- return -1; +- +- if ( xc_hypercall_bounce_pre(xch, msrs) ) +- return -1; +- +- domctl.cmd = XEN_DOMCTL_set_cpu_policy; +- domctl.domain = domid; +- domctl.u.cpu_policy.nr_leaves = nr_leaves; +- set_xen_guest_handle(domctl.u.cpu_policy.leaves, leaves); +- domctl.u.cpu_policy.nr_msrs = nr_msrs; +- set_xen_guest_handle(domctl.u.cpu_policy.msrs, msrs); + domctl.u.cpu_policy.err_leaf = -1; + domctl.u.cpu_policy.err_subleaf = -1; + domctl.u.cpu_policy.err_msr = -1; + +- ret = do_domctl(xch, &domctl); ++ if ( !xc_hypercall_bounce_pre(xch, leaves) && ++ !xc_hypercall_bounce_pre(xch, msrs) ) ++ { ++ 
domctl.cmd = XEN_DOMCTL_set_cpu_policy; ++ domctl.domain = domid; ++ domctl.u.cpu_policy.nr_leaves = nr_leaves; ++ set_xen_guest_handle(domctl.u.cpu_policy.leaves, leaves); ++ domctl.u.cpu_policy.nr_msrs = nr_msrs; ++ set_xen_guest_handle(domctl.u.cpu_policy.msrs, msrs); ++ ++ ret = do_domctl(xch, &domctl); ++ } + + xc_hypercall_bounce_post(xch, leaves); + xc_hypercall_bounce_post(xch, msrs); +-- +2.44.0 + + +From 18f900b77b3a85acadc2fe152ea354a02569acab Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Wed, 6 Dec 2023 10:40:19 +0100 +Subject: [PATCH 11/70] x86emul: avoid triggering event related assertions + +The assertion at the end of x86_emulate_wrapper() as well as the ones +in x86_emul_{hw_exception,pagefault}() can trigger if we ignore +X86EMUL_EXCEPTION coming back from certain hook functions. Squash +exceptions when merely probing MSRs, plus on SWAPGS'es "best effort" +error handling path. + +In adjust_bnd() add another assertion after the read_xcr(0, ...) +invocation, paralleling the one in x86emul_get_fpu() - XCR0 reads should +never fault when XSAVE is (implicitly) known to be available. + +Also update the respective comment in x86_emulate_wrapper(). 
+ +Fixes: 14a6be89ec04 ("x86emul: correct EFLAGS.TF handling") +Fixes: cb2626c75813 ("x86emul: conditionally clear BNDn for branches") +Fixes: 6eb43fcf8a0b ("x86emul: support SWAPGS") +Reported-by: AFL +Signed-off-by: Jan Beulich +Acked-by: Andrew Cooper +master commit: 787d11c5aaf4d3411d4658cff137cd49b0bd951b +master date: 2023-12-05 09:57:05 +0100 +--- + xen/arch/x86/x86_emulate/0f01.c | 6 ++++-- + xen/arch/x86/x86_emulate/0fae.c | 3 +++ + xen/arch/x86/x86_emulate/x86_emulate.c | 28 +++++++++++++++++++++----- + 3 files changed, 30 insertions(+), 7 deletions(-) + +diff --git a/xen/arch/x86/x86_emulate/0f01.c b/xen/arch/x86/x86_emulate/0f01.c +index ba43fc394b..1ba99609d6 100644 +--- a/xen/arch/x86/x86_emulate/0f01.c ++++ b/xen/arch/x86/x86_emulate/0f01.c +@@ -200,8 +200,10 @@ int x86emul_0f01(struct x86_emulate_state *s, + if ( (rc = ops->write_segment(x86_seg_gs, &sreg, + ctxt)) != X86EMUL_OKAY ) + { +- /* Best effort unwind (i.e. no error checking). */ +- ops->write_msr(MSR_SHADOW_GS_BASE, msr_val, ctxt); ++ /* Best effort unwind (i.e. no real error checking). 
*/ ++ if ( ops->write_msr(MSR_SHADOW_GS_BASE, msr_val, ++ ctxt) == X86EMUL_EXCEPTION ) ++ x86_emul_reset_event(ctxt); + goto done; + } + break; +diff --git a/xen/arch/x86/x86_emulate/0fae.c b/xen/arch/x86/x86_emulate/0fae.c +index 00840b1d07..ba77af58f2 100644 +--- a/xen/arch/x86/x86_emulate/0fae.c ++++ b/xen/arch/x86/x86_emulate/0fae.c +@@ -55,7 +55,10 @@ int x86emul_0fae(struct x86_emulate_state *s, + cr4 = X86_CR4_OSFXSR; + if ( !ops->read_msr || + ops->read_msr(MSR_EFER, &msr_val, ctxt) != X86EMUL_OKAY ) ++ { ++ x86_emul_reset_event(ctxt); + msr_val = 0; ++ } + if ( !(cr4 & X86_CR4_OSFXSR) || + (mode_64bit() && mode_ring0() && (msr_val & EFER_FFXSE)) ) + s->op_bytes = offsetof(struct x86_fxsr, xmm[0]); +diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c +index 94caec1d14..cf780da501 100644 +--- a/xen/arch/x86/x86_emulate/x86_emulate.c ++++ b/xen/arch/x86/x86_emulate/x86_emulate.c +@@ -1143,10 +1143,18 @@ static bool is_branch_step(struct x86_emulate_ctxt *ctxt, + const struct x86_emulate_ops *ops) + { + uint64_t debugctl; ++ int rc = X86EMUL_UNHANDLEABLE; + +- return ops->read_msr && +- ops->read_msr(MSR_IA32_DEBUGCTLMSR, &debugctl, ctxt) == X86EMUL_OKAY && +- (debugctl & IA32_DEBUGCTLMSR_BTF); ++ if ( !ops->read_msr || ++ (rc = ops->read_msr(MSR_IA32_DEBUGCTLMSR, &debugctl, ++ ctxt)) != X86EMUL_OKAY ) ++ { ++ if ( rc == X86EMUL_EXCEPTION ) ++ x86_emul_reset_event(ctxt); ++ debugctl = 0; ++ } ++ ++ return debugctl & IA32_DEBUGCTLMSR_BTF; + } + + static void adjust_bnd(struct x86_emulate_ctxt *ctxt, +@@ -1160,13 +1168,21 @@ static void adjust_bnd(struct x86_emulate_ctxt *ctxt, + + if ( !ops->read_xcr || ops->read_xcr(0, &xcr0, ctxt) != X86EMUL_OKAY || + !(xcr0 & X86_XCR0_BNDREGS) || !(xcr0 & X86_XCR0_BNDCSR) ) ++ { ++ ASSERT(!ctxt->event_pending); + return; ++ } + + if ( !mode_ring0() ) + bndcfg = read_bndcfgu(); + else if ( !ops->read_msr || +- ops->read_msr(MSR_IA32_BNDCFGS, &bndcfg, ctxt) != X86EMUL_OKAY ) ++ (rc = 
ops->read_msr(MSR_IA32_BNDCFGS, &bndcfg, ++ ctxt)) != X86EMUL_OKAY ) ++ { ++ if ( rc == X86EMUL_EXCEPTION ) ++ x86_emul_reset_event(ctxt); + return; ++ } + if ( (bndcfg & IA32_BNDCFGS_ENABLE) && !(bndcfg & IA32_BNDCFGS_PRESERVE) ) + { + /* +@@ -8677,7 +8693,9 @@ int x86_emulate_wrapper( + * An event being pending should exactly match returning + * X86EMUL_EXCEPTION. (If this trips, the chances are a codepath has + * called hvm_inject_hw_exception() rather than using +- * x86_emul_hw_exception().) ++ * x86_emul_hw_exception(), or the invocation of a hook has caused an ++ * exception to be raised, while the caller was only checking for ++ * success/failure.) + */ + ASSERT(ctxt->event_pending == (rc == X86EMUL_EXCEPTION)); + +-- +2.44.0 + + +From 5ac87c8afd2ae2b1a9fd46a9b80d9152d650fb26 Mon Sep 17 00:00:00 2001 +From: Juergen Gross +Date: Wed, 6 Dec 2023 10:40:54 +0100 +Subject: [PATCH 12/70] xen/sched: fix adding offline cpu to cpupool +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Trying to add an offline cpu to a cpupool can crash the hypervisor, +as the probably non-existing percpu area of the cpu is accessed before +the availability of the cpu is being tested. This can happen in case +the cpupool's granularity is "core" or "socket". + +Fix that by testing the cpu to be online. 
+ +Fixes: cb563d7665f2 ("xen/sched: support core scheduling for moving cpus to/from cpupools") +Reported-by: René Winther Højgaard +Signed-off-by: Juergen Gross +Reviewed-by: Jan Beulich +master commit: 06e8d65d33896aa90f5b6d9b2bce7f11433b33c9 +master date: 2023-12-05 09:57:38 +0100 +--- + xen/common/sched/cpupool.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/xen/common/sched/cpupool.c b/xen/common/sched/cpupool.c +index 2e094b0cfa..ad8f608462 100644 +--- a/xen/common/sched/cpupool.c ++++ b/xen/common/sched/cpupool.c +@@ -892,6 +892,8 @@ int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op) + if ( cpu >= nr_cpu_ids ) + goto addcpu_out; + ret = -ENODEV; ++ if ( !cpu_online(cpu) ) ++ goto addcpu_out; + cpus = sched_get_opt_cpumask(c->gran, cpu); + if ( !cpumask_subset(cpus, &cpupool_free_cpus) || + cpumask_intersects(cpus, &cpupool_locked_cpus) ) +-- +2.44.0 + + +From 25b7f9ed0f8c7e138a2cecb113bd377c613153d7 Mon Sep 17 00:00:00 2001 +From: Stewart Hildebrand +Date: Wed, 6 Dec 2023 10:41:19 +0100 +Subject: [PATCH 13/70] xen/domain: fix error path in domain_create() + +If rangeset_new() fails, err would not be set to an appropriate error +code. Set it to -ENOMEM. 
+ +Fixes: 580c458699e3 ("xen/domain: Call arch_domain_create() as early as possible in domain_create()") +Signed-off-by: Stewart Hildebrand +Reviewed-by: Jan Beulich +master commit: ff1178062094837d55ef342070e58316c43a54c9 +master date: 2023-12-05 10:00:51 +0100 +--- + xen/common/domain.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/xen/common/domain.c b/xen/common/domain.c +index 8f9ab01c0c..003f4ab125 100644 +--- a/xen/common/domain.c ++++ b/xen/common/domain.c +@@ -703,6 +703,7 @@ struct domain *domain_create(domid_t domid, + watchdog_domain_init(d); + init_status |= INIT_watchdog; + ++ err = -ENOMEM; + d->iomem_caps = rangeset_new(d, "I/O Memory", RANGESETF_prettyprint_hex); + d->irq_caps = rangeset_new(d, "Interrupts", 0); + if ( !d->iomem_caps || !d->irq_caps ) +-- +2.44.0 + + +From a56d598e13db413f98e149f8e10cc13e8d4c1635 Mon Sep 17 00:00:00 2001 +From: Julien Grall +Date: Tue, 12 Dec 2023 14:26:18 +0100 +Subject: [PATCH 14/70] Only compile the hypervisor with + -Wdeclaration-after-statement + +Right now, all tools and hypervisor will be complied with the option +-Wdeclaration-after-statement. While most of the code in the hypervisor +is controlled by us, for tools we may import external libraries. + +The build will fail if one of them are using the construct we are +trying to prevent. 
This is the case when building against Python 3.12 +and Yocto: + +| In file included from /srv/storage/alex/yocto/build-virt/tmp/work/core2-64-poky-linux/xen-tools/4.17+stable/recipe-sysroot/usr/include/python3.12/Python.h:44, +| from xen/lowlevel/xc/xc.c:8: +| /srv/storage/alex/yocto/build-virt/tmp/work/core2-64-poky-linux/xen-tools/4.17+stable/recipe-sysroot/usr/include/python3.12/object.h: In function 'Py_SIZE': +| /srv/storage/alex/yocto/build-virt/tmp/work/core2-64-poky-linux/xen-tools/4.17+stable/recipe-sysroot/usr/include/python3.12/object.h:233:5: error: ISO C90 forbids mixed declarations and code [-Werror=declaration-after-statement] +| 233 | PyVarObject *var_ob = _PyVarObject_CAST(ob); +| | ^~~~~~~~~~~ +| In file included from /srv/storage/alex/yocto/build-virt/tmp/work/core2-64-poky-linux/xen-tools/4.17+stable/recipe-sysroot/usr/include/python3.12/Python.h:53: +| /srv/storage/alex/yocto/build-virt/tmp/work/core2-64-poky-linux/xen-tools/4.17+stable/recipe-sysroot/usr/include/python3.12/cpython/longintrepr.h: In function '_PyLong_CompactValue': +| /srv/storage/alex/yocto/build-virt/tmp/work/core2-64-poky-linux/xen-tools/4.17+stable/recipe-sysroot/usr/include/python3.12/cpython/longintrepr.h:121:5: error: ISO C90 forbids mixed declarations and code [-Werror=declaration-after-statement] +| 121 | Py_ssize_t sign = 1 - (op->long_value.lv_tag & _PyLong_SIGN_MASK); +| | ^~~~~~~~~~ +| cc1: all warnings being treated as errors + +Looking at the tools directory, a fair few directory already add +-Wno-declaration-after-statement to inhibit the default behavior. + +We have always build the hypervisor with the flag, so for now remove +only the flag for anything but the hypervisor. We can decide at later +time whether we want to relax. + +Also remove the -Wno-declaration-after-statement in some subdirectory +as the flag is now unnecessary. 
+ +Part of the commit message was take from Alexander's first proposal: + +Link: https://lore.kernel.org/xen-devel/20231128174729.3880113-1-alex@linutronix.de/ +Reported-by: Alexander Kanavin +Acked-by: Anthony PERARD +Acked-by: Andrew Cooper +Tested-by: Jason Andryuk +Signed-off-by: Julien Grall + +xen/hypervisor: Don't use cc-option-add for -Wdeclaration-after-statement + +Per Andrew's comment in [1] all the compilers we support should +recognize the flag. + +I forgot to address the comment while committing. + +[1] fcf00090-304a-49f7-8a61-a54347e90a3b@citrix.com + +Signed-off-by: Julien Grall +master commit: 40be6307ec005539635e7b8fcef67e989dc441f6 +master date: 2023-12-06 19:12:40 +0000 +master commit: d4bfd3899886d0fbe259c20660dadb1e00170f2d +master date: 2023-12-06 19:19:59 +0000 +--- + Config.mk | 2 -- + stubdom/Makefile | 2 +- + stubdom/vtpmmgr/Makefile | 2 +- + tools/libs/light/Makefile | 3 +-- + tools/libs/util/Makefile | 3 +-- + tools/tests/depriv/Makefile | 2 -- + tools/xl/Makefile | 3 +-- + xen/Makefile | 1 + + 8 files changed, 6 insertions(+), 12 deletions(-) + +diff --git a/Config.mk b/Config.mk +index 29b0d1e12a..2a3e16d0bd 100644 +--- a/Config.mk ++++ b/Config.mk +@@ -177,8 +177,6 @@ CFLAGS += -std=gnu99 + + CFLAGS += -Wall -Wstrict-prototypes + +-$(call cc-option-add,HOSTCFLAGS,HOSTCC,-Wdeclaration-after-statement) +-$(call cc-option-add,CFLAGS,CC,-Wdeclaration-after-statement) + $(call cc-option-add,CFLAGS,CC,-Wno-unused-but-set-variable) + $(call cc-option-add,CFLAGS,CC,-Wno-unused-local-typedefs) + +diff --git a/stubdom/Makefile b/stubdom/Makefile +index 0ddfce1ba2..888fa20d72 100644 +--- a/stubdom/Makefile ++++ b/stubdom/Makefile +@@ -245,7 +245,7 @@ tpm_emulator-$(XEN_TARGET_ARCH): tpm_emulator-$(TPMEMU_VERSION).tar.gz + patch -d $@ -p1 < vtpm-command-duration.patch + patch -d $@ -p1 < vtpm-tpm_bn_t-addr.patch + mkdir $@/build +- cd $@/build; CC=${CC} $(CMAKE) .. 
-DCMAKE_C_FLAGS:STRING="-std=c99 -DTPM_NO_EXTERN $(TARGET_CPPFLAGS) $(TARGET_CFLAGS) -Wno-declaration-after-statement" ++ cd $@/build; CC=${CC} $(CMAKE) .. -DCMAKE_C_FLAGS:STRING="-std=c99 -DTPM_NO_EXTERN $(TARGET_CPPFLAGS) $(TARGET_CFLAGS)" + touch $@ + + TPMEMU_STAMPFILE=$(CROSS_ROOT)/$(GNU_TARGET_ARCH)-xen-elf/lib/libtpm.a +diff --git a/stubdom/vtpmmgr/Makefile b/stubdom/vtpmmgr/Makefile +index 6dae034a07..c29bb49838 100644 +--- a/stubdom/vtpmmgr/Makefile ++++ b/stubdom/vtpmmgr/Makefile +@@ -17,7 +17,7 @@ OBJS += vtpm_disk.o disk_tpm.o disk_io.o disk_crypto.o disk_read.o disk_write.o + OBJS += mgmt_authority.o + + CFLAGS+=-Werror -Iutil -Icrypto -Itcs +-CFLAGS+=-Wno-declaration-after-statement -Wno-unused-label ++CFLAGS+=-Wno-unused-label + + build: $(TARGET) + $(TARGET): $(OBJS) +diff --git a/tools/libs/light/Makefile b/tools/libs/light/Makefile +index ba4c1b7933..37e4d16709 100644 +--- a/tools/libs/light/Makefile ++++ b/tools/libs/light/Makefile +@@ -38,8 +38,7 @@ vpath static_tables.c $(ACPI_PATH)/ + + OBJS-$(CONFIG_X86) += $(ACPI_OBJS) + +-CFLAGS += -Wno-format-zero-length -Wmissing-declarations \ +- -Wno-declaration-after-statement -Wformat-nonliteral ++CFLAGS += -Wno-format-zero-length -Wmissing-declarations -Wformat-nonliteral + + CFLAGS-$(CONFIG_X86) += -DCONFIG_PCI_SUPP_LEGACY_IRQ + +diff --git a/tools/libs/util/Makefile b/tools/libs/util/Makefile +index c3b21875dc..936ec90a31 100644 +--- a/tools/libs/util/Makefile ++++ b/tools/libs/util/Makefile +@@ -9,8 +9,7 @@ OBJS-y += libxlu_disk.o + OBJS-y += libxlu_vif.o + OBJS-y += libxlu_pci.o + +-CFLAGS += -Wno-format-zero-length -Wmissing-declarations \ +- -Wno-declaration-after-statement -Wformat-nonliteral ++CFLAGS += -Wno-format-zero-length -Wmissing-declarations -Wformat-nonliteral + CFLAGS += $(CFLAGS_libxenctrl) + + CFLAGS += $(PTHREAD_CFLAGS) +diff --git a/tools/tests/depriv/Makefile b/tools/tests/depriv/Makefile +index 7d9e3b01bb..5404a12f47 100644 +--- a/tools/tests/depriv/Makefile ++++ 
b/tools/tests/depriv/Makefile +@@ -1,8 +1,6 @@ + XEN_ROOT=$(CURDIR)/../../.. + include $(XEN_ROOT)/tools/Rules.mk + +-CFLAGS += -Wno-declaration-after-statement +- + CFLAGS += $(CFLAGS_xeninclude) + CFLAGS += $(CFLAGS_libxenctrl) + CFLAGS += $(CFLAGS_libxencall) +diff --git a/tools/xl/Makefile b/tools/xl/Makefile +index 5f7aa5f46c..d742e96a5b 100644 +--- a/tools/xl/Makefile ++++ b/tools/xl/Makefile +@@ -5,8 +5,7 @@ + XEN_ROOT = $(CURDIR)/../.. + include $(XEN_ROOT)/tools/Rules.mk + +-CFLAGS += -Wno-format-zero-length -Wmissing-declarations \ +- -Wno-declaration-after-statement -Wformat-nonliteral ++CFLAGS += -Wno-format-zero-length -Wmissing-declarations -Wformat-nonliteral + CFLAGS += -fPIC + + CFLAGS += $(PTHREAD_CFLAGS) +diff --git a/xen/Makefile b/xen/Makefile +index e39290f638..a92709b43e 100644 +--- a/xen/Makefile ++++ b/xen/Makefile +@@ -392,6 +392,7 @@ CFLAGS-$(CONFIG_CC_SPLIT_SECTIONS) += -ffunction-sections -fdata-sections + + CFLAGS += -nostdinc -fno-builtin -fno-common + CFLAGS += -Werror -Wredundant-decls -Wno-pointer-arith ++CFLAGS += -Wdeclaration-after-statement + $(call cc-option-add,CFLAGS,CC,-Wvla) + CFLAGS += -pipe -D__XEN__ -include $(srctree)/include/xen/config.h + CFLAGS-$(CONFIG_DEBUG_INFO) += -g +-- +2.44.0 + + +From 48eb9e91990b3fd42f8e847780f6cdb188245b4a Mon Sep 17 00:00:00 2001 +From: Juergen Gross +Date: Tue, 12 Dec 2023 14:26:35 +0100 +Subject: [PATCH 15/70] xen/sched: fix sched_move_domain() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Do cleanup in sched_move_domain() in a dedicated service function, +which is called either in error case with newly allocated data, or in +success case with the old data to be freed. + +This will at once fix some subtle bugs which sneaked in due to +forgetting to overwrite some pointers in the error case. 
+ +Fixes: 70fadc41635b ("xen/cpupool: support moving domain between cpupools with different granularity") +Reported-by: René Winther Højgaard +Initial-fix-by: Jan Beulich +Initial-fix-by: George Dunlap +Signed-off-by: Juergen Gross +Reviewed-by: Jan Beulich +Acked-by: George Dunlap +master commit: 23792cc0f22cff4e106d838b83aa9ae1cb6ffaf4 +master date: 2023-12-07 13:37:25 +0000 +--- + xen/common/sched/core.c | 47 +++++++++++++++++++++++------------------ + 1 file changed, 27 insertions(+), 20 deletions(-) + +diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c +index eba0cea4bb..901782bbb4 100644 +--- a/xen/common/sched/core.c ++++ b/xen/common/sched/core.c +@@ -647,6 +647,24 @@ static void sched_move_irqs(const struct sched_unit *unit) + vcpu_move_irqs(v); + } + ++static void sched_move_domain_cleanup(const struct scheduler *ops, ++ struct sched_unit *units, ++ void *domdata) ++{ ++ struct sched_unit *unit, *old_unit; ++ ++ for ( unit = units; unit; ) ++ { ++ if ( unit->priv ) ++ sched_free_udata(ops, unit->priv); ++ old_unit = unit; ++ unit = unit->next_in_list; ++ xfree(old_unit); ++ } ++ ++ sched_free_domdata(ops, domdata); ++} ++ + /* + * Move a domain from one cpupool to another. + * +@@ -686,7 +704,6 @@ int sched_move_domain(struct domain *d, struct cpupool *c) + void *old_domdata; + unsigned int gran = cpupool_get_granularity(c); + unsigned int n_units = d->vcpu[0] ? 
DIV_ROUND_UP(d->max_vcpus, gran) : 0; +- int ret = 0; + + for_each_vcpu ( d, v ) + { +@@ -699,8 +716,9 @@ int sched_move_domain(struct domain *d, struct cpupool *c) + domdata = sched_alloc_domdata(c->sched, d); + if ( IS_ERR(domdata) ) + { +- ret = PTR_ERR(domdata); +- goto out; ++ rcu_read_unlock(&sched_res_rculock); ++ ++ return PTR_ERR(domdata); + } + + for ( unit_idx = 0; unit_idx < n_units; unit_idx++ ) +@@ -718,10 +736,10 @@ int sched_move_domain(struct domain *d, struct cpupool *c) + + if ( !unit || !unit->priv ) + { +- old_units = new_units; +- old_domdata = domdata; +- ret = -ENOMEM; +- goto out_free; ++ sched_move_domain_cleanup(c->sched, new_units, domdata); ++ rcu_read_unlock(&sched_res_rculock); ++ ++ return -ENOMEM; + } + + unit_ptr = &unit->next_in_list; +@@ -808,22 +826,11 @@ int sched_move_domain(struct domain *d, struct cpupool *c) + + domain_unpause(d); + +- out_free: +- for ( unit = old_units; unit; ) +- { +- if ( unit->priv ) +- sched_free_udata(c->sched, unit->priv); +- old_unit = unit; +- unit = unit->next_in_list; +- xfree(old_unit); +- } +- +- sched_free_domdata(old_ops, old_domdata); ++ sched_move_domain_cleanup(old_ops, old_units, old_domdata); + +- out: + rcu_read_unlock(&sched_res_rculock); + +- return ret; ++ return 0; + } + + void sched_destroy_vcpu(struct vcpu *v) +-- +2.44.0 + + +From a4f3f5a62c10a5adc898cf45261783209f5bc037 Mon Sep 17 00:00:00 2001 +From: Michal Orzel +Date: Tue, 12 Dec 2023 14:27:10 +0100 +Subject: [PATCH 16/70] xen/arm: page: Avoid pointer overflow on cache clean & + invalidate + +On Arm32, after cleaning and invalidating the last dcache line of the top +domheap page i.e. VA = 0xfffff000 (as a result of flushing the page to +RAM), we end up adding the value of a dcache line size to the pointer +once again, which results in a pointer arithmetic overflow (with 64B line +size, operation 0xffffffc0 + 0x40 overflows to 0x0). 
Such behavior is +undefined and given the wide range of compiler versions we support, it is +difficult to determine what could happen in such scenario. + +Modify clean_and_invalidate_dcache_va_range() as well as +clean_dcache_va_range() and invalidate_dcache_va_range() due to similarity +of handling to prevent pointer arithmetic overflow. Modify the loops to +use an additional variable to store the index of the next cacheline. +Add an assert to prevent passing a region that wraps around which is +illegal and would end up in a page fault anyway (region 0-2MB is +unmapped). Lastly, return early if size passed is 0. + +Note that on Arm64, we don't have this problem given that the max VA +space we support is 48-bits. + +This is XSA-447 / CVE-2023-46837. + +Signed-off-by: Michal Orzel +Reviewed-by: Julien Grall +master commit: 190b7f49af6487a9665da63d43adc9d9a5fbd01e +master date: 2023-12-12 14:01:00 +0100 +--- + xen/arch/arm/include/asm/page.h | 35 ++++++++++++++++++++++++++------- + 1 file changed, 28 insertions(+), 7 deletions(-) + +diff --git a/xen/arch/arm/include/asm/page.h b/xen/arch/arm/include/asm/page.h +index aa0080e8d7..645331fc89 100644 +--- a/xen/arch/arm/include/asm/page.h ++++ b/xen/arch/arm/include/asm/page.h +@@ -162,6 +162,13 @@ static inline size_t read_dcache_line_bytes(void) + static inline int invalidate_dcache_va_range(const void *p, unsigned long size) + { + size_t cacheline_mask = dcache_line_bytes - 1; ++ unsigned long idx = 0; ++ ++ if ( !size ) ++ return 0; ++ ++ /* Passing a region that wraps around is illegal */ ++ ASSERT(((uintptr_t)p + size - 1) >= (uintptr_t)p); + + dsb(sy); /* So the CPU issues all writes to the range */ + +@@ -174,11 +181,11 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size) + } + + for ( ; size >= dcache_line_bytes; +- p += dcache_line_bytes, size -= dcache_line_bytes ) +- asm volatile (__invalidate_dcache_one(0) : : "r" (p)); ++ idx += dcache_line_bytes, size -= dcache_line_bytes ) ++ 
asm volatile (__invalidate_dcache_one(0) : : "r" (p + idx)); + + if ( size > 0 ) +- asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p)); ++ asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p + idx)); + + dsb(sy); /* So we know the flushes happen before continuing */ + +@@ -188,14 +195,21 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size) + static inline int clean_dcache_va_range(const void *p, unsigned long size) + { + size_t cacheline_mask = dcache_line_bytes - 1; ++ unsigned long idx = 0; ++ ++ if ( !size ) ++ return 0; ++ ++ /* Passing a region that wraps around is illegal */ ++ ASSERT(((uintptr_t)p + size - 1) >= (uintptr_t)p); + + dsb(sy); /* So the CPU issues all writes to the range */ + size += (uintptr_t)p & cacheline_mask; + size = (size + cacheline_mask) & ~cacheline_mask; + p = (void *)((uintptr_t)p & ~cacheline_mask); + for ( ; size >= dcache_line_bytes; +- p += dcache_line_bytes, size -= dcache_line_bytes ) +- asm volatile (__clean_dcache_one(0) : : "r" (p)); ++ idx += dcache_line_bytes, size -= dcache_line_bytes ) ++ asm volatile (__clean_dcache_one(0) : : "r" (p + idx)); + dsb(sy); /* So we know the flushes happen before continuing */ + /* ARM callers assume that dcache_* functions cannot fail. 
*/ + return 0; +@@ -205,14 +219,21 @@ static inline int clean_and_invalidate_dcache_va_range + (const void *p, unsigned long size) + { + size_t cacheline_mask = dcache_line_bytes - 1; ++ unsigned long idx = 0; ++ ++ if ( !size ) ++ return 0; ++ ++ /* Passing a region that wraps around is illegal */ ++ ASSERT(((uintptr_t)p + size - 1) >= (uintptr_t)p); + + dsb(sy); /* So the CPU issues all writes to the range */ + size += (uintptr_t)p & cacheline_mask; + size = (size + cacheline_mask) & ~cacheline_mask; + p = (void *)((uintptr_t)p & ~cacheline_mask); + for ( ; size >= dcache_line_bytes; +- p += dcache_line_bytes, size -= dcache_line_bytes ) +- asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p)); ++ idx += dcache_line_bytes, size -= dcache_line_bytes ) ++ asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p + idx)); + dsb(sy); /* So we know the flushes happen before continuing */ + /* ARM callers assume that dcache_* functions cannot fail. */ + return 0; +-- +2.44.0 + + +From 1792d1723b7fb45a20b145d2de4d233913b22c09 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 12 Dec 2023 14:45:52 +0100 +Subject: [PATCH 17/70] x86/x2apic: introduce a mixed physical/cluster mode +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The current implementation of x2APIC requires to either use Cluster Logical or +Physical mode for all interrupts. However the selection of Physical vs Logical +is not done at APIC setup, an APIC can be addressed both in Physical or Logical +destination modes concurrently. + +Introduce a new x2APIC mode called Mixed, which uses Logical Cluster mode for +IPIs, and Physical mode for external interrupts, thus attempting to use the +best method for each interrupt type. + +Using Physical mode for external interrupts allows more vectors to be used, and +interrupt balancing to be more accurate. 
+ +Using Logical Cluster mode for IPIs allows fewer accesses to the ICR register +when sending those, as multiple CPUs can be targeted with a single ICR register +write. + +A simple test calling flush_tlb_all() 10000 times on a tight loop on AMD EPYC +9754 with 512 CPUs gives the following figures in nano seconds: + +x mixed ++ phys +* cluster + N Min Max Median Avg Stddev +x 25 3.5131328e+08 3.5716441e+08 3.5410987e+08 3.5432659e+08 1566737.4 ++ 12 1.231082e+09 1.238824e+09 1.2370528e+09 1.2357981e+09 2853892.9 +Difference at 95.0% confidence + 8.81472e+08 +/- 1.46849e+06 + 248.774% +/- 0.96566% + (Student's t, pooled s = 2.05985e+06) +* 11 3.5099276e+08 3.5561459e+08 3.5461234e+08 3.5415668e+08 1415071.9 +No difference proven at 95.0% confidence + +So Mixed has no difference when compared to Cluster mode, and Physical mode is +248% slower when compared to either Mixed or Cluster modes with a 95% +confidence. + +Note that Xen uses Cluster mode by default, and hence is already using the +fastest way for IPI delivery at the cost of reducing the amount of vectors +available system-wide. + +Make the newly introduced mode the default one. + +Note the printing of the APIC addressing mode done in connect_bsp_APIC() has +been removed, as with the newly introduced mixed mode this would require more +fine grained printing, or else would be incorrect. The addressing mode can +already be derived from the APIC driver in use, which is printed by different +helpers. 
+ +Suggested-by: Andrew Cooper +Signed-off-by: Roger Pau Monné +Reviewed-by: Andrew Cooper +Reviewed-by: Jan Beulich +Acked-by: Henry Wang +master commit: e3c409d59ac87ccdf97b8c7708c81efa8069cb31 +master date: 2023-11-07 09:59:48 +0000 +--- + CHANGELOG.md | 7 +++ + docs/misc/xen-command-line.pandoc | 12 ++++ + xen/arch/x86/Kconfig | 35 +++++++++-- + xen/arch/x86/apic.c | 6 +- + xen/arch/x86/genapic/x2apic.c | 98 +++++++++++++++++++++++-------- + 5 files changed, 123 insertions(+), 35 deletions(-) + +diff --git a/CHANGELOG.md b/CHANGELOG.md +index 7fb4d366c3..5aa01dae5d 100644 +--- a/CHANGELOG.md ++++ b/CHANGELOG.md +@@ -4,6 +4,13 @@ Notable changes to Xen will be documented in this file. + + The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) + ++## [4.18.1](https://xenbits.xen.org/gitweb/?p=xen.git;a=shortlog;h=RELEASE-4.18.1) ++ ++### Added ++ - On x86: ++ - Introduce a new x2APIC driver that uses Cluster Logical addressing mode ++ for IPIs and Physical addressing mode for external interrupts. ++ + ## [4.18.0](https://xenbits.xenproject.org/gitweb/?p=xen.git;a=shortlog;h=RELEASE-4.18.0) - 2023-11-16 + + ### Changed +diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc +index 9a19a04157..8e65f8bd18 100644 +--- a/docs/misc/xen-command-line.pandoc ++++ b/docs/misc/xen-command-line.pandoc +@@ -2804,6 +2804,15 @@ the watchdog. + + Permit use of x2apic setup for SMP environments. + ++### x2apic-mode (x86) ++> `= physical | cluster | mixed` ++ ++> Default: `physical` if **FADT** mandates physical mode, otherwise set at ++> build time by CONFIG_X2APIC_{PHYSICAL,LOGICAL,MIXED}. ++ ++In the case that x2apic is in use, this option switches between modes to ++address APICs in the system as interrupt destinations. ++ + ### x2apic_phys (x86) + > `= ` + +@@ -2814,6 +2823,9 @@ In the case that x2apic is in use, this option switches between physical and + clustered mode. 
The default, given no hint from the **FADT**, is cluster + mode. + ++**WARNING: `x2apic_phys` is deprecated and superseded by `x2apic-mode`. ++The latter takes precedence if both are set.** ++ + ### xenheap_megabytes (arm32) + > `= ` + +diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig +index eac77573bd..1acdffc51c 100644 +--- a/xen/arch/x86/Kconfig ++++ b/xen/arch/x86/Kconfig +@@ -228,11 +228,18 @@ config XEN_ALIGN_2M + + endchoice + +-config X2APIC_PHYSICAL +- bool "x2APIC Physical Destination mode" ++choice ++ prompt "x2APIC Driver default" ++ default X2APIC_MIXED + help +- Use x2APIC Physical Destination mode by default when available. ++ Select APIC addressing when x2APIC is enabled. ++ ++ The default mode is mixed which should provide the best aspects ++ of both physical and cluster modes. + ++config X2APIC_PHYSICAL ++ bool "Physical Destination mode" ++ help + When using this mode APICs are addressed using the Physical + Destination mode, which allows using all dynamic vectors on each + CPU independently. +@@ -242,9 +249,27 @@ config X2APIC_PHYSICAL + destination inter processor interrupts (IPIs) slightly slower than + Logical Destination mode. + +- The mode when this option is not selected is Logical Destination. ++config X2APIC_CLUSTER ++ bool "Cluster Destination mode" ++ help ++ When using this mode APICs are addressed using the Cluster Logical ++ Destination mode. ++ ++ Cluster Destination has the benefit of sending IPIs faster since ++ multiple APICs can be targeted as destinations of a single IPI. ++ However the vector space is shared between all CPUs on the cluster, ++ and hence using this mode reduces the number of available vectors ++ when compared to Physical mode. + +- If unsure, say N. ++config X2APIC_MIXED ++ bool "Mixed Destination mode" ++ help ++ When using this mode APICs are addressed using the Cluster Logical ++ Destination mode for IPIs and Physical mode for external interrupts. ++ ++ Should provide the best of both modes. 
++ ++endchoice + + config GUEST + bool +diff --git a/xen/arch/x86/apic.c b/xen/arch/x86/apic.c +index f1264ce7ed..6acdd0ec14 100644 +--- a/xen/arch/x86/apic.c ++++ b/xen/arch/x86/apic.c +@@ -229,11 +229,7 @@ void __init connect_bsp_APIC(void) + outb(0x01, 0x23); + } + +- printk("Enabling APIC mode: %s. Using %d I/O APICs\n", +- !INT_DEST_MODE ? "Physical" +- : init_apic_ldr == init_apic_ldr_flat ? "Flat" +- : "Clustered", +- nr_ioapics); ++ printk("Enabling APIC mode. Using %d I/O APICs\n", nr_ioapics); + enable_apic_mode(); + } + +diff --git a/xen/arch/x86/genapic/x2apic.c b/xen/arch/x86/genapic/x2apic.c +index 707deef98c..b88c7a96fe 100644 +--- a/xen/arch/x86/genapic/x2apic.c ++++ b/xen/arch/x86/genapic/x2apic.c +@@ -180,6 +180,36 @@ static const struct genapic __initconstrel apic_x2apic_cluster = { + .send_IPI_self = send_IPI_self_x2apic + }; + ++/* ++ * Mixed x2APIC mode: use physical for external (device) interrupts, and ++ * cluster for inter processor interrupts. Such mode has the benefits of not ++ * sharing the vector space with all CPUs on the cluster, while still allowing ++ * IPIs to be more efficiently delivered by not having to perform an ICR write ++ * for each target CPU. ++ */ ++static const struct genapic __initconstrel apic_x2apic_mixed = { ++ APIC_INIT("x2apic_mixed", NULL), ++ ++ /* ++ * The following fields are exclusively used by external interrupts and ++ * hence are set to use Physical destination mode handlers. ++ */ ++ .int_delivery_mode = dest_Fixed, ++ .int_dest_mode = 0 /* physical delivery */, ++ .vector_allocation_cpumask = vector_allocation_cpumask_phys, ++ .cpu_mask_to_apicid = cpu_mask_to_apicid_phys, ++ ++ /* ++ * The following fields are exclusively used by IPIs and hence are set to ++ * use Cluster Logical destination mode handlers. Note that init_apic_ldr ++ * is not used by IPIs, but the per-CPU fields it initializes are only used ++ * by the IPI hooks. 
++ */ ++ .init_apic_ldr = init_apic_ldr_x2apic_cluster, ++ .send_IPI_mask = send_IPI_mask_x2apic_cluster, ++ .send_IPI_self = send_IPI_self_x2apic, ++}; ++ + static int cf_check update_clusterinfo( + struct notifier_block *nfb, unsigned long action, void *hcpu) + { +@@ -220,38 +250,56 @@ static struct notifier_block x2apic_cpu_nfb = { + static int8_t __initdata x2apic_phys = -1; + boolean_param("x2apic_phys", x2apic_phys); + ++enum { ++ unset, physical, cluster, mixed ++} static __initdata x2apic_mode = unset; ++ ++static int __init cf_check parse_x2apic_mode(const char *s) ++{ ++ if ( !cmdline_strcmp(s, "physical") ) ++ x2apic_mode = physical; ++ else if ( !cmdline_strcmp(s, "cluster") ) ++ x2apic_mode = cluster; ++ else if ( !cmdline_strcmp(s, "mixed") ) ++ x2apic_mode = mixed; ++ else ++ return -EINVAL; ++ ++ return 0; ++} ++custom_param("x2apic-mode", parse_x2apic_mode); ++ + const struct genapic *__init apic_x2apic_probe(void) + { +- if ( x2apic_phys < 0 ) ++ /* Honour the legacy cmdline setting if it's the only one provided. */ ++ if ( x2apic_mode == unset && x2apic_phys >= 0 ) ++ x2apic_mode = x2apic_phys ? physical : cluster; ++ ++ if ( x2apic_mode == unset ) + { +- /* +- * Force physical mode if there's no (full) interrupt remapping support: +- * The ID in clustered mode requires a 32 bit destination field due to +- * the usage of the high 16 bits to hold the cluster ID. +- */ +- x2apic_phys = iommu_intremap != iommu_intremap_full || +- (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) || +- IS_ENABLED(CONFIG_X2APIC_PHYSICAL); +- } +- else if ( !x2apic_phys ) +- switch ( iommu_intremap ) ++ if ( acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL ) + { +- case iommu_intremap_off: +- case iommu_intremap_restricted: +- printk("WARNING: x2APIC cluster mode is not supported %s interrupt remapping -" +- " forcing phys mode\n", +- iommu_intremap == iommu_intremap_off ? 
"without" +- : "with restricted"); +- x2apic_phys = true; +- break; +- +- case iommu_intremap_full: +- break; ++ printk(XENLOG_INFO "ACPI FADT forcing x2APIC physical mode\n"); ++ x2apic_mode = physical; + } ++ else ++ x2apic_mode = IS_ENABLED(CONFIG_X2APIC_MIXED) ? mixed ++ : (IS_ENABLED(CONFIG_X2APIC_PHYSICAL) ? physical ++ : cluster); ++ } + +- if ( x2apic_phys ) ++ if ( x2apic_mode == physical ) + return &apic_x2apic_phys; + ++ if ( x2apic_mode == cluster && iommu_intremap != iommu_intremap_full ) ++ { ++ printk("WARNING: x2APIC cluster mode is not supported %s interrupt remapping -" ++ " forcing mixed mode\n", ++ iommu_intremap == iommu_intremap_off ? "without" ++ : "with restricted"); ++ x2apic_mode = mixed; ++ } ++ + if ( !this_cpu(cluster_cpus) ) + { + update_clusterinfo(NULL, CPU_UP_PREPARE, +@@ -260,7 +308,7 @@ const struct genapic *__init apic_x2apic_probe(void) + register_cpu_notifier(&x2apic_cpu_nfb); + } + +- return &apic_x2apic_cluster; ++ return x2apic_mode == cluster ? &apic_x2apic_cluster : &apic_x2apic_mixed; + } + + void __init check_x2apic_preenabled(void) +-- +2.44.0 + + +From 637da04812fba259a5d06591ec535345637a4407 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 30 Jan 2024 14:33:48 +0100 +Subject: [PATCH 18/70] pci: fail device assignment if phantom functions cannot + be assigned +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The current behavior is that no error is reported if (some) phantom functions +fail to be assigned during device add or assignment, so the operation succeeds +even if some phantom functions are not correctly setup. + +This can lead to devices possibly being successfully assigned to a domU while +some of the device phantom functions are still assigned to dom0. 
Even when the +device is assigned domIO before being assigned to a domU phantom functions +might fail to be assigned to domIO, and also fail to be assigned to the domU, +leaving them assigned to dom0. + +Since the device can generate requests using the IDs of those phantom +functions, given the scenario above a device in such state would be in control +of a domU, but still capable of generating transactions that use a context ID +targeting dom0 owned memory. + +Modify device assign in order to attempt to deassign the device if phantom +functions failed to be assigned. + +Note that device addition is not modified in the same way, as in that case the +device is assigned to a trusted domain, and hence partial assign can lead to +device malfunction but not a security issue. + +This is XSA-449 / CVE-2023-46839 + +Fixes: 4e9950dc1bd2 ('IOMMU: add phantom function support') +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +master commit: cb4ecb3cc17b02c2814bc817efd05f3f3ba33d1e +master date: 2024-01-30 14:28:01 +0100 +--- + xen/drivers/passthrough/pci.c | 27 +++++++++++++++++++++------ + 1 file changed, 21 insertions(+), 6 deletions(-) + +diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c +index 04d00c7c37..e99837b6e1 100644 +--- a/xen/drivers/passthrough/pci.c ++++ b/xen/drivers/passthrough/pci.c +@@ -1439,11 +1439,10 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) + + pdev->fault.count = 0; + +- if ( (rc = iommu_call(hd->platform_ops, assign_device, d, devfn, +- pci_to_dev(pdev), flag)) ) +- goto done; ++ rc = iommu_call(hd->platform_ops, assign_device, d, devfn, pci_to_dev(pdev), ++ flag); + +- for ( ; pdev->phantom_stride; rc = 0 ) ++ while ( pdev->phantom_stride && !rc ) + { + devfn += pdev->phantom_stride; + if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) +@@ -1454,8 +1453,24 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) + + done: + if ( rc ) +- 
printk(XENLOG_G_WARNING "%pd: assign (%pp) failed (%d)\n", +- d, &PCI_SBDF(seg, bus, devfn), rc); ++ { ++ printk(XENLOG_G_WARNING "%pd: assign %s(%pp) failed (%d)\n", ++ d, devfn != pdev->devfn ? "phantom function " : "", ++ &PCI_SBDF(seg, bus, devfn), rc); ++ ++ if ( devfn != pdev->devfn && deassign_device(d, seg, bus, pdev->devfn) ) ++ { ++ /* ++ * Device with phantom functions that failed to both assign and ++ * rollback. Mark the device as broken and crash the target domain, ++ * as the state of the functions at this point is unknown and Xen ++ * has no way to assert consistent context assignment among them. ++ */ ++ pdev->broken = true; ++ if ( !is_hardware_domain(d) && d != dom_io ) ++ domain_crash(d); ++ } ++ } + /* The device is assigned to dom_io so mark it as quarantined */ + else if ( d == dom_io ) + pdev->quarantine = true; +-- +2.44.0 + + +From c7ac596a575a05d6ff1e35c3ff98bc4d143712d2 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 30 Jan 2024 14:34:40 +0100 +Subject: [PATCH 19/70] VT-d: Fix "else" vs "#endif" misplacement + +In domain_pgd_maddr() the "#endif" is misplaced with respect to "else". This +generates incorrect logic when CONFIG_HVM is compiled out, as the "else" body +is executed unconditionally. + +Rework the logic to use IS_ENABLED() instead of explicit #ifdef-ary, as it's +clearer to follow. This in turn involves adjusting p2m_get_pagetable() to +compile when CONFIG_HVM is disabled. + +This is XSA-450 / CVE-2023-46840. 
+ +Fixes: 033ff90aa9c1 ("x86/P2M: p2m_{alloc,free}_ptp() and p2m_alloc_table() are HVM-only") +Reported-by: Teddy Astie +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +master commit: cc6ba68edf6dcd18c3865e7d7c0f1ed822796426 +master date: 2024-01-30 14:29:15 +0100 +--- + xen/arch/x86/include/asm/p2m.h | 9 ++++++++- + xen/drivers/passthrough/vtd/iommu.c | 4 +--- + 2 files changed, 9 insertions(+), 4 deletions(-) + +diff --git a/xen/arch/x86/include/asm/p2m.h b/xen/arch/x86/include/asm/p2m.h +index 40545f5fa8..1e0b0e2dcc 100644 +--- a/xen/arch/x86/include/asm/p2m.h ++++ b/xen/arch/x86/include/asm/p2m.h +@@ -435,7 +435,14 @@ static inline bool_t p2m_is_altp2m(const struct p2m_domain *p2m) + return p2m->p2m_class == p2m_alternate; + } + +-#define p2m_get_pagetable(p2m) ((p2m)->phys_table) ++#ifdef CONFIG_HVM ++static inline pagetable_t p2m_get_pagetable(const struct p2m_domain *p2m) ++{ ++ return p2m->phys_table; ++} ++#else ++pagetable_t p2m_get_pagetable(const struct p2m_domain *p2m); ++#endif + + /* + * Ensure any deferred p2m TLB flush has been completed on all VCPUs. 
+diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c +index e13b7d99db..9ed616e211 100644 +--- a/xen/drivers/passthrough/vtd/iommu.c ++++ b/xen/drivers/passthrough/vtd/iommu.c +@@ -438,15 +438,13 @@ static paddr_t domain_pgd_maddr(struct domain *d, paddr_t pgd_maddr, + + if ( pgd_maddr ) + /* nothing */; +-#ifdef CONFIG_HVM +- else if ( iommu_use_hap_pt(d) ) ++ else if ( IS_ENABLED(CONFIG_HVM) && iommu_use_hap_pt(d) ) + { + pagetable_t pgt = p2m_get_pagetable(p2m_get_hostp2m(d)); + + pgd_maddr = pagetable_get_paddr(pgt); + } + else +-#endif + { + if ( !hd->arch.vtd.pgd_maddr ) + { +-- +2.44.0 + + +From 62b3d7f8e45a7ec1597f0ed61a99d1f423b22315 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Thu, 1 Feb 2024 17:58:17 +0100 +Subject: [PATCH 20/70] x86/amd: Extend CPU erratum #1474 fix to more affected + models +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Erratum #1474 has now been extended to cover models from family 17h ranges +00-2Fh, so the errata now covers all the models released under Family +17h (Zen, Zen+ and Zen2). + +Additionally extend the workaround to Family 18h (Hygon), since it's based on +the Zen architecture and very likely affected. + +Rename all the zen2 related symbols to fam17, since the errata doesn't +exclusively affect Zen2 anymore. 
+ +Reported-by: Andrew Cooper +Signed-off-by: Roger Pau Monné +Reviewed-by: Andrew Cooper +master commit: 23db507a01a4ec5259ec0ab43d296a41b1c326ba +master date: 2023-12-21 12:19:40 +0000 +--- + xen/arch/x86/cpu/amd.c | 27 ++++++++++++++------------- + 1 file changed, 14 insertions(+), 13 deletions(-) + +diff --git a/xen/arch/x86/cpu/amd.c b/xen/arch/x86/cpu/amd.c +index 0f305312ff..d43288ae97 100644 +--- a/xen/arch/x86/cpu/amd.c ++++ b/xen/arch/x86/cpu/amd.c +@@ -54,7 +54,7 @@ bool __read_mostly amd_acpi_c1e_quirk; + bool __ro_after_init amd_legacy_ssbd; + bool __initdata amd_virt_spec_ctrl; + +-static bool __read_mostly zen2_c6_disabled; ++static bool __read_mostly fam17_c6_disabled; + + static inline int rdmsr_amd_safe(unsigned int msr, unsigned int *lo, + unsigned int *hi) +@@ -978,24 +978,24 @@ void amd_check_zenbleed(void) + val & chickenbit ? "chickenbit" : "microcode"); + } + +-static void cf_check zen2_disable_c6(void *arg) ++static void cf_check fam17_disable_c6(void *arg) + { + /* Disable C6 by clearing the CCR{0,1,2}_CC6EN bits. */ + const uint64_t mask = ~((1ul << 6) | (1ul << 14) | (1ul << 22)); + uint64_t val; + +- if (!zen2_c6_disabled) { ++ if (!fam17_c6_disabled) { + printk(XENLOG_WARNING + "Disabling C6 after 1000 days apparent uptime due to AMD errata 1474\n"); +- zen2_c6_disabled = true; ++ fam17_c6_disabled = true; + /* + * Prevent CPU hotplug so that started CPUs will either see +- * zen2_c6_disabled set, or will be handled by ++ * zen_c6_disabled set, or will be handled by + * smp_call_function(). 
+ */ + while (!get_cpu_maps()) + process_pending_softirqs(); +- smp_call_function(zen2_disable_c6, NULL, 0); ++ smp_call_function(fam17_disable_c6, NULL, 0); + put_cpu_maps(); + } + +@@ -1294,8 +1294,8 @@ static void cf_check init_amd(struct cpuinfo_x86 *c) + amd_check_zenbleed(); + amd_check_erratum_1485(); + +- if (zen2_c6_disabled) +- zen2_disable_c6(NULL); ++ if (fam17_c6_disabled) ++ fam17_disable_c6(NULL); + + check_syscfg_dram_mod_en(); + +@@ -1307,7 +1307,7 @@ const struct cpu_dev amd_cpu_dev = { + .c_init = init_amd, + }; + +-static int __init cf_check zen2_c6_errata_check(void) ++static int __init cf_check amd_check_erratum_1474(void) + { + /* + * Errata #1474: A Core May Hang After About 1044 Days +@@ -1315,7 +1315,8 @@ static int __init cf_check zen2_c6_errata_check(void) + */ + s_time_t delta; + +- if (cpu_has_hypervisor || boot_cpu_data.x86 != 0x17 || !is_zen2_uarch()) ++ if (cpu_has_hypervisor || ++ (boot_cpu_data.x86 != 0x17 && boot_cpu_data.x86 != 0x18)) + return 0; + + /* +@@ -1330,10 +1331,10 @@ static int __init cf_check zen2_c6_errata_check(void) + if (delta > 0) { + static struct timer errata_c6; + +- init_timer(&errata_c6, zen2_disable_c6, NULL, 0); ++ init_timer(&errata_c6, fam17_disable_c6, NULL, 0); + set_timer(&errata_c6, NOW() + delta); + } else +- zen2_disable_c6(NULL); ++ fam17_disable_c6(NULL); + + return 0; + } +@@ -1341,4 +1342,4 @@ static int __init cf_check zen2_c6_errata_check(void) + * Must be executed after early_time_init() for tsc_ticks2ns() to have been + * calibrated. That prevents us doing the check in init_amd(). 
+ */ +-presmp_initcall(zen2_c6_errata_check); ++presmp_initcall(amd_check_erratum_1474); +-- +2.44.0 + + +From b26c30a408255454f8ceb4e49e3c4385aa32fbc3 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Thu, 1 Feb 2024 17:58:59 +0100 +Subject: [PATCH 21/70] CirrusCI: drop FreeBSD 12 +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Went EOL by the end of December 2023, and the pkg repos have been shut down. + +Reported-by: Andrew Cooper +Signed-off-by: Roger Pau Monné +Acked-by: Andrew Cooper +master commit: c2ce3466472e9c9eda79f5dc98eb701bc6fdba20 +master date: 2024-01-15 12:20:11 +0100 +--- + .cirrus.yml | 6 ------ + 1 file changed, 6 deletions(-) + +diff --git a/.cirrus.yml b/.cirrus.yml +index 7e0beb200d..63f3afb104 100644 +--- a/.cirrus.yml ++++ b/.cirrus.yml +@@ -14,12 +14,6 @@ freebsd_template: &FREEBSD_TEMPLATE + - ./configure --with-system-seabios=/usr/local/share/seabios/bios.bin + - gmake -j`sysctl -n hw.ncpu` clang=y + +-task: +- name: 'FreeBSD 12' +- freebsd_instance: +- image_family: freebsd-12-4 +- << : *FREEBSD_TEMPLATE +- + task: + name: 'FreeBSD 13' + freebsd_instance: +-- +2.44.0 + + +From 6ccf064b0ce1d06449565129ab944b4fd9531b3a Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Thu, 1 Feb 2024 17:59:25 +0100 +Subject: [PATCH 22/70] x86/intel: ensure Global Performance Counter Control is + setup correctly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +When Architectural Performance Monitoring is available, the PERF_GLOBAL_CTRL +MSR contains per-counter enable bits that is ANDed with the enable bit in the +counter EVNTSEL MSR in order for a PMC counter to be enabled. 
+ +So far the watchdog code seems to have relied on the PERF_GLOBAL_CTRL enable +bits being set by default, but at least on some Intel Sapphire and Emerald +Rapids this is no longer the case, and Xen reports: + +Testing NMI watchdog on all CPUs: 0 40 stuck + +The first CPU on each package is started with PERF_GLOBAL_CTRL zeroed, so PMC0 +doesn't start counting when the enable bit in EVNTSEL0 is set, due to the +relevant enable bit in PERF_GLOBAL_CTRL not being set. + +Check and adjust PERF_GLOBAL_CTRL during CPU initialization so that all the +general-purpose PMCs are enabled. Doing so brings the state of the package-BSP +PERF_GLOBAL_CTRL in line with the rest of the CPUs on the system. + +Signed-off-by: Roger Pau Monné +Acked-by: Jan Beulich +master commit: 6bdb965178bbb3fc50cd4418d4770a7789956e2c +master date: 2024-01-17 10:40:52 +0100 +--- + xen/arch/x86/cpu/intel.c | 23 ++++++++++++++++++++++- + 1 file changed, 22 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/cpu/intel.c b/xen/arch/x86/cpu/intel.c +index a8ba3191e6..aef8e4506c 100644 +--- a/xen/arch/x86/cpu/intel.c ++++ b/xen/arch/x86/cpu/intel.c +@@ -533,9 +533,30 @@ static void cf_check init_intel(struct cpuinfo_x86 *c) + init_intel_cacheinfo(c); + if (c->cpuid_level > 9) { + unsigned eax = cpuid_eax(10); ++ unsigned int cnt = (eax >> 8) & 0xff; ++ + /* Check for version and the number of counters */ +- if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) ++ if ((eax & 0xff) && (cnt > 1) && (cnt <= 32)) { ++ uint64_t global_ctrl; ++ unsigned int cnt_mask = (1UL << cnt) - 1; ++ ++ /* ++ * On (some?) Sapphire/Emerald Rapids platforms each ++ * package-BSP starts with all the enable bits for the ++ * general-purpose PMCs cleared. Adjust so counters ++ * can be enabled from EVNTSEL. 
++ */ ++ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_ctrl); ++ if ((global_ctrl & cnt_mask) != cnt_mask) { ++ printk("CPU%u: invalid PERF_GLOBAL_CTRL: %#" ++ PRIx64 " adjusting to %#" PRIx64 "\n", ++ smp_processor_id(), global_ctrl, ++ global_ctrl | cnt_mask); ++ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ++ global_ctrl | cnt_mask); ++ } + __set_bit(X86_FEATURE_ARCH_PERFMON, c->x86_capability); ++ } + } + + if ( !cpu_has(c, X86_FEATURE_XTOPOLOGY) ) +-- +2.44.0 + + +From 4cc0f88c42f374c7a8e2d05e38777fa18619482e Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Thu, 1 Feb 2024 17:59:57 +0100 +Subject: [PATCH 23/70] x86/vmx: Fix IRQ handling for EXIT_REASON_INIT + +When receiving an INIT, a prior bugfix tried to ignore the INIT and continue +onwards. + +Unfortunately it's not safe to return at that point in vmx_vmexit_handler(). +Just out of context in the first hunk is a local_irqs_enabled() which is +depended-upon by the return-to-guest path, causing the following checklock +failure in debug builds: + + (XEN) Error: INIT received - ignoring + (XEN) CHECKLOCK FAILURE: prev irqsafe: 0, curr irqsafe 1 + (XEN) Xen BUG at common/spinlock.c:132 + (XEN) ----[ Xen-4.19-unstable x86_64 debug=y Tainted: H ]---- + ... + (XEN) Xen call trace: + (XEN) [] R check_lock+0xcd/0xe1 + (XEN) [] F _spin_lock+0x1b/0x60 + (XEN) [] F pt_update_irq+0x32/0x3bb + (XEN) [] F vmx_intr_assist+0x3b/0x51d + (XEN) [] F vmx_asm_vmexit_handler+0xf7/0x210 + +Luckily, this is benign in release builds. Accidentally having IRQs disabled +when trying to take an IRQs-on lock isn't a deadlock-vulnerable pattern. + +Drop the problematic early return. In hindsight, it's wrong to skip other +normal VMExit steps. 
+ +Fixes: b1f11273d5a7 ("x86/vmx: Don't spuriously crash the domain when INIT is received") +Reported-by: Reima ISHII +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +master commit: d1f8883aebe00f6a9632d77ab0cd5c6d02c9cbe4 +master date: 2024-01-18 20:59:06 +0000 +--- + xen/arch/x86/hvm/vmx/vmx.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index 1edc7f1e91..964891934b 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -4100,7 +4100,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) + + case EXIT_REASON_INIT: + printk(XENLOG_ERR "Error: INIT received - ignoring\n"); +- return; /* Renter the guest without further processing */ ++ break; + } + + /* Now enable interrupts so it's safe to take locks. */ +@@ -4385,6 +4385,7 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) + break; + } + case EXIT_REASON_EXTERNAL_INTERRUPT: ++ case EXIT_REASON_INIT: + /* Already handled above. */ + break; + case EXIT_REASON_TRIPLE_FAULT: +-- +2.44.0 + + +From 00550e808c10c67710ebb8867200eda1fbee332c Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Thu, 1 Feb 2024 18:00:32 +0100 +Subject: [PATCH 24/70] x86/vmx: Disallow the use of inactivity states + +Right now, vvmx will blindly copy L12's ACTIVITY_STATE into the L02 VMCS and +enter the vCPU. Luckily for us, nested-virt is explicitly unsupported for +security bugs. + +The inactivity states are HLT, SHUTDOWN and WAIT-FOR-SIPI, and as noted by the +SDM in Vol3 27.7 "Special Features of VM Entry": + + If VM entry ends with the logical processor in an inactive activity state, + the VM entry generates any special bus cycle that is normally generated when + that activity state is entered from the active state. + +Also, + + Some activity states unconditionally block certain events. + +I.e. 
A VMEntry with ACTIVITY=SHUTDOWN will initiate a platform reset, while a +VMEntry with ACTIVITY=WAIT-FOR-SIPI will really block everything other than +SIPIs. + +Both of these activity states are for the TXT ACM to use, not for regular +hypervisors, and Xen doesn't support dropping the HLT intercept either. + +There are two paths in Xen which operate on ACTIVITY_STATE. + +1) The vmx_{get,set}_nonreg_state() helpers for VM-Fork. + + As regular VMs can't use any inactivity states, this is just duplicating + the 0 from construct_vmcs(). Retain the ability to query activity_state, + but crash the domain on any attempt to set an inactivity state. + +2) Nested virt, because of ACTIVITY_STATE in vmcs_gstate_field[]. + + Explicitly hide the inactivity states in the guest's view of MSR_VMX_MISC, + and remove ACTIVITY_STATE from vmcs_gstate_field[]. + + In virtual_vmentry(), we should trigger a VMEntry failure for the use of + any inactivity states, but there's no support for that in the code at all + so leave a TODO for when we finally start working on nested-virt in + earnest. 
+ +Reported-by: Reima Ishii +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +Reviewed-by: Tamas K Lengyel +master commit: 3643bb53a05b7c8fbac072c63bef1538f2a6d0d2 +master date: 2024-01-18 20:59:06 +0000 +--- + xen/arch/x86/hvm/vmx/vmx.c | 5 ++++- + xen/arch/x86/hvm/vmx/vvmx.c | 9 +++++++-- + xen/arch/x86/include/asm/hvm/vmx/vmcs.h | 1 + + 3 files changed, 12 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c +index 964891934b..28dece7c6b 100644 +--- a/xen/arch/x86/hvm/vmx/vmx.c ++++ b/xen/arch/x86/hvm/vmx/vmx.c +@@ -1558,7 +1558,10 @@ static void cf_check vmx_set_nonreg_state(struct vcpu *v, + { + vmx_vmcs_enter(v); + +- __vmwrite(GUEST_ACTIVITY_STATE, nrs->vmx.activity_state); ++ if ( nrs->vmx.activity_state ) ++ domain_crash(v->domain, "Attempt to set %pv activity_state %#lx\n", ++ v, nrs->vmx.activity_state); ++ + __vmwrite(GUEST_INTERRUPTIBILITY_INFO, nrs->vmx.interruptibility_info); + __vmwrite(GUEST_PENDING_DBG_EXCEPTIONS, nrs->vmx.pending_dbg); + +diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c +index 16b0ef82b6..fd0ae39166 100644 +--- a/xen/arch/x86/hvm/vmx/vvmx.c ++++ b/xen/arch/x86/hvm/vmx/vvmx.c +@@ -899,7 +899,10 @@ static const u16 vmcs_gstate_field[] = { + GUEST_LDTR_AR_BYTES, + GUEST_TR_AR_BYTES, + GUEST_INTERRUPTIBILITY_INFO, ++ /* ++ * ACTIVITY_STATE is handled specially. 
+ GUEST_ACTIVITY_STATE, ++ */ + GUEST_SYSENTER_CS, + GUEST_PREEMPTION_TIMER, + /* natural */ +@@ -1200,6 +1203,8 @@ static void virtual_vmentry(struct cpu_user_regs *regs) + nvcpu->nv_vmentry_pending = 0; + nvcpu->nv_vmswitch_in_progress = 1; + ++ /* TODO: Fail VMentry for GUEST_ACTIVITY_STATE != 0 */ ++ + /* + * EFER handling: + * hvm_set_efer won't work if CR0.PG = 1, so we change the value +@@ -2316,8 +2321,8 @@ int nvmx_msr_read_intercept(unsigned int msr, u64 *msr_content) + data = hvm_cr4_guest_valid_bits(d); + break; + case MSR_IA32_VMX_MISC: +- /* Do not support CR3-target feature now */ +- data = host_data & ~VMX_MISC_CR3_TARGET; ++ /* Do not support CR3-targets or activity states. */ ++ data = host_data & ~(VMX_MISC_CR3_TARGET | VMX_MISC_ACTIVITY_MASK); + break; + case MSR_IA32_VMX_EPT_VPID_CAP: + data = nept_get_ept_vpid_cap(); +diff --git a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h +index d07fcb2bc9..8de9977eb3 100644 +--- a/xen/arch/x86/include/asm/hvm/vmx/vmcs.h ++++ b/xen/arch/x86/include/asm/hvm/vmx/vmcs.h +@@ -277,6 +277,7 @@ extern u32 vmx_secondary_exec_control; + #define VMX_VPID_INVVPID_SINGLE_CONTEXT_RETAINING_GLOBAL 0x80000000000ULL + extern u64 vmx_ept_vpid_cap; + ++#define VMX_MISC_ACTIVITY_MASK 0x000001c0 + #define VMX_MISC_PROC_TRACE 0x00004000 + #define VMX_MISC_CR3_TARGET 0x01ff0000 + #define VMX_MISC_VMWRITE_ALL 0x20000000 +-- +2.44.0 + + +From 579a622eb41cf4e1ae4d94100985a81eebda23b9 Mon Sep 17 00:00:00 2001 +From: Michal Orzel +Date: Thu, 1 Feb 2024 18:01:27 +0100 +Subject: [PATCH 25/70] lib{fdt,elf}: move lib{fdt,elf}-temp.o and their deps + to $(targets) + +At the moment, trying to run xencov read/reset (calling SYSCTL_coverage_op +under the hood) results in a crash. This is due to a profiler trying to +access data in the .init.* sections (libfdt for Arm and libelf for x86) +that are stripped after boot. Normally, the build system compiles any +*.init.o file without COV_FLAGS. 
However, these two libraries are +handled differently as sections will be renamed to init after linking. + +To override COV_FLAGS to empty for these libraries, lib{fdt,elf}.o were +added to nocov-y. This worked until e321576f4047 ("xen/build: start using +if_changed") that added lib{fdt,elf}-temp.o and their deps to extra-y. +This way, even though these objects appear as prerequisites of +lib{fdt,elf}.o and the settings should propagate to them, make can also +build them as a prerequisite of __build, in which case COV_FLAGS would +still have the unwanted flags. Fix it by switching to $(targets) instead. + +Also, for libfdt, append libfdt.o to nocov-y only if CONFIG_OVERLAY_DTB +is not set. Otherwise, there is no section renaming and we should be able +to run the coverage. + +Fixes: e321576f4047 ("xen/build: start using if_changed") +Signed-off-by: Michal Orzel +Reviewed-by: Anthony PERARD +Acked-by: Jan Beulich +master commit: 79519fcfa0605bbf19d8c02b979af3a2c8afed68 +master date: 2024-01-23 12:02:44 +0100 +--- + xen/common/libelf/Makefile | 2 +- + xen/common/libfdt/Makefile | 4 ++-- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/xen/common/libelf/Makefile b/xen/common/libelf/Makefile +index 8a4522e4e1..917d12b006 100644 +--- a/xen/common/libelf/Makefile ++++ b/xen/common/libelf/Makefile +@@ -13,4 +13,4 @@ $(obj)/libelf.o: $(obj)/libelf-temp.o FORCE + $(obj)/libelf-temp.o: $(addprefix $(obj)/,$(libelf-objs)) FORCE + $(call if_changed,ld) + +-extra-y += libelf-temp.o $(libelf-objs) ++targets += libelf-temp.o $(libelf-objs) +diff --git a/xen/common/libfdt/Makefile b/xen/common/libfdt/Makefile +index d50487aa6e..6ce679f98f 100644 +--- a/xen/common/libfdt/Makefile ++++ b/xen/common/libfdt/Makefile +@@ -5,10 +5,10 @@ SECTIONS := text data $(SPECIAL_DATA_SECTIONS) + # For CONFIG_OVERLAY_DTB, libfdt functionalities will be needed during runtime. 
+ ifneq ($(CONFIG_OVERLAY_DTB),y) + OBJCOPYFLAGS := $(foreach s,$(SECTIONS),--rename-section .$(s)=.init.$(s)) ++nocov-y += libfdt.o + endif + + obj-y += libfdt.o +-nocov-y += libfdt.o + + CFLAGS-y += -I$(srctree)/include/xen/libfdt/ + +@@ -18,4 +18,4 @@ $(obj)/libfdt.o: $(obj)/libfdt-temp.o FORCE + $(obj)/libfdt-temp.o: $(addprefix $(obj)/,$(LIBFDT_OBJS)) FORCE + $(call if_changed,ld) + +-extra-y += libfdt-temp.o $(LIBFDT_OBJS) ++targets += libfdt-temp.o $(LIBFDT_OBJS) +-- +2.44.0 + + +From 295ab8060d95ed8c365077946c7faf8793099ef8 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Thu, 1 Feb 2024 18:01:52 +0100 +Subject: [PATCH 26/70] x86/p2m-pt: fix off by one in entry check assert +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The MMIO RO rangeset overlap check is bogus: the rangeset is inclusive so the +passed end mfn should be the last mfn to be mapped (not last + 1). + +Fixes: 6fa1755644d0 ('amd/npt/shadow: replace assert that prevents creating 2M/1G MMIO entries') +Signed-off-by: Roger Pau Monné +Reviewed-by: George Dunlap +master commit: 610775d0dd61c1bd2f4720c755986098e6a5bafd +master date: 2024-01-25 16:09:04 +0100 +--- + xen/arch/x86/mm/p2m-pt.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/arch/x86/mm/p2m-pt.c b/xen/arch/x86/mm/p2m-pt.c +index b2b14746c1..88d3733891 100644 +--- a/xen/arch/x86/mm/p2m-pt.c ++++ b/xen/arch/x86/mm/p2m-pt.c +@@ -552,7 +552,7 @@ static void check_entry(mfn_t mfn, p2m_type_t new, p2m_type_t old, + if ( new == p2m_mmio_direct ) + ASSERT(!mfn_eq(mfn, INVALID_MFN) && + !rangeset_overlaps_range(mmio_ro_ranges, mfn_x(mfn), +- mfn_x(mfn) + (1UL << order))); ++ mfn_x(mfn) + (1UL << order) - 1)); + else if ( p2m_allows_invalid_mfn(new) || new == p2m_invalid || + new == p2m_mmio_dm ) + ASSERT(mfn_valid(mfn) || mfn_eq(mfn, INVALID_MFN)); +-- +2.44.0 + + +From b1fdd7d0e47e0831ac7a99d0417385fc10d3068c Mon Sep 17 00:00:00 2001 +From: 
Andrew Cooper +Date: Thu, 1 Feb 2024 18:02:24 +0100 +Subject: [PATCH 27/70] x86/ucode: Fix stability of the raw CPU Policy rescan + +Always run microcode_update_helper() on the BSP, so the updated Raw CPU +policy doesn't get non-BSP topology details included. + +Have calculate_raw_cpu_policy() clear the instantaneous XSTATE sizes. The +value XCR0 | MSR_XSS had when we scanned the policy isn't terribly interesting +to report. + +When CPUID Masking is active, it affects CPUID instructions issued by Xen +too. Transiently disable masking to get a clean scan. + +Fixes: 694d79ed5aac ("x86/ucode: Refresh raw CPU policy after microcode load") +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +master commit: cf7fe8b72deaa94157ddf97d4bb391480205e9c2 +master date: 2024-01-25 17:46:57 +0000 +--- + xen/arch/x86/cpu-policy.c | 7 +++++++ + xen/arch/x86/cpu/microcode/core.c | 20 +++++++++++++++++--- + 2 files changed, 24 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c +index 81e574390f..bcb17b7ce3 100644 +--- a/xen/arch/x86/cpu-policy.c ++++ b/xen/arch/x86/cpu-policy.c +@@ -353,6 +353,13 @@ void calculate_raw_cpu_policy(void) + /* Nothing good will come from Xen and libx86 disagreeing on vendor. */ + ASSERT(p->x86_vendor == boot_cpu_data.x86_vendor); + ++ /* ++ * Clear the truly dynamic fields. These vary with the in-context XCR0 ++ * and MSR_XSS, and aren't interesting fields in the raw policy.
++ */ ++ p->xstate.raw[0].b = 0; ++ p->xstate.raw[1].b = 0; ++ + /* 0x000000ce MSR_INTEL_PLATFORM_INFO */ + /* Was already added by probe_cpuid_faulting() */ + } +diff --git a/xen/arch/x86/cpu/microcode/core.c b/xen/arch/x86/cpu/microcode/core.c +index 65ebeb50de..4e011cdc41 100644 +--- a/xen/arch/x86/cpu/microcode/core.c ++++ b/xen/arch/x86/cpu/microcode/core.c +@@ -680,8 +680,18 @@ static long cf_check microcode_update_helper(void *data) + microcode_update_cache(patch); + spin_unlock(&microcode_mutex); + +- /* Refresh the raw CPU policy, in case the features have changed. */ ++ /* ++ * Refresh the raw CPU policy, in case the features have changed. ++ * Disable CPUID masking if in use, to avoid having current's ++ * cpu_policy affect the rescan. ++ */ ++ if ( ctxt_switch_masking ) ++ alternative_vcall(ctxt_switch_masking, NULL); ++ + calculate_raw_cpu_policy(); ++ ++ if ( ctxt_switch_masking ) ++ alternative_vcall(ctxt_switch_masking, current); + } + else + microcode_free_patch(patch); +@@ -721,8 +731,12 @@ int microcode_update(XEN_GUEST_HANDLE(const_void) buf, unsigned long len) + } + buffer->len = len; + +- return continue_hypercall_on_cpu(smp_processor_id(), +- microcode_update_helper, buffer); ++ /* ++ * Always queue microcode_update_helper() on CPU0. Most of the logic ++ * won't care, but the update of the Raw CPU policy wants to (re)run on ++ * the BSP. ++ */ ++ return continue_hypercall_on_cpu(0, microcode_update_helper, buffer); + } + + static int __init cf_check microcode_init(void) +-- +2.44.0 + + +From 184d723e7a5d1c021d297e14d19fe5344eac7a56 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Cyril=20R=C3=A9bert=20=28zithro=29?= +Date: Tue, 27 Feb 2024 13:53:42 +0100 +Subject: [PATCH 28/70] tools/xentop: fix sorting bug for some columns +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Sort doesn't work on columns VBD_OO, VBD_RD, VBD_WR and VBD_RSECT. +Fix by adjusting variables names in compare functions. +Bug fix only.
No functional change. + +Fixes: 91c3e3dc91d6 ("tools/xentop: Display '-' when stats are not available.") +Signed-off-by: Cyril Rébert (zithro) +Reviewed-by: Anthony PERARD +master commit: 29f17d837421f13c0e0010802de1b2d51d2ded4a +master date: 2024-02-05 17:58:23 +0000 +--- + tools/xentop/xentop.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/tools/xentop/xentop.c b/tools/xentop/xentop.c +index 950e8935c4..545bd5e96d 100644 +--- a/tools/xentop/xentop.c ++++ b/tools/xentop/xentop.c +@@ -684,7 +684,7 @@ static int compare_vbd_oo(xenstat_domain *domain1, xenstat_domain *domain2) + unsigned long long dom1_vbd_oo = 0, dom2_vbd_oo = 0; + + tot_vbd_reqs(domain1, FIELD_VBD_OO, &dom1_vbd_oo); +- tot_vbd_reqs(domain1, FIELD_VBD_OO, &dom2_vbd_oo); ++ tot_vbd_reqs(domain2, FIELD_VBD_OO, &dom2_vbd_oo); + + return -compare(dom1_vbd_oo, dom2_vbd_oo); + } +@@ -711,9 +711,9 @@ static int compare_vbd_rd(xenstat_domain *domain1, xenstat_domain *domain2) + unsigned long long dom1_vbd_rd = 0, dom2_vbd_rd = 0; + + tot_vbd_reqs(domain1, FIELD_VBD_RD, &dom1_vbd_rd); +- tot_vbd_reqs(domain1, FIELD_VBD_RD, &dom2_vbd_rd); ++ tot_vbd_reqs(domain2, FIELD_VBD_RD, &dom2_vbd_rd); + +- return -compare(dom1_vbd_rd, dom1_vbd_rd); ++ return -compare(dom1_vbd_rd, dom2_vbd_rd); + } + + /* Prints number of total VBD READ requests statistic */ +@@ -738,7 +738,7 @@ static int compare_vbd_wr(xenstat_domain *domain1, xenstat_domain *domain2) + unsigned long long dom1_vbd_wr = 0, dom2_vbd_wr = 0; + + tot_vbd_reqs(domain1, FIELD_VBD_WR, &dom1_vbd_wr); +- tot_vbd_reqs(domain1, FIELD_VBD_WR, &dom2_vbd_wr); ++ tot_vbd_reqs(domain2, FIELD_VBD_WR, &dom2_vbd_wr); + + return -compare(dom1_vbd_wr, dom2_vbd_wr); + } +@@ -765,7 +765,7 @@ static int compare_vbd_rsect(xenstat_domain *domain1, xenstat_domain *domain2) + unsigned long long dom1_vbd_rsect = 0, dom2_vbd_rsect = 0; + + tot_vbd_reqs(domain1, FIELD_VBD_RSECT, &dom1_vbd_rsect); +- tot_vbd_reqs(domain1, FIELD_VBD_RSECT, 
&dom2_vbd_rsect); ++ tot_vbd_reqs(domain2, FIELD_VBD_RSECT, &dom2_vbd_rsect); + + return -compare(dom1_vbd_rsect, dom2_vbd_rsect); + } +-- +2.44.0 + + +From fa9950a527a70971bf9279be62d445cf9c83aedf Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 27 Feb 2024 13:54:04 +0100 +Subject: [PATCH 29/70] amd-vi: fix IVMD memory type checks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The current code that parses the IVMD blocks is relaxed with regard to the +restriction that such unity regions should always fall into memory ranges +marked as reserved in the memory map. + +However the type checks for the IVMD addresses are inverted, and as a result +IVMD ranges falling into RAM areas are accepted. Note that having such ranges +in the first place is a firmware bug, as IVMD should always fall into reserved +ranges. + +Fixes: ed6c77ebf0c1 ('AMD/IOMMU: check / convert IVMD ranges for being / to be reserved') +Reported-by: Ox +Signed-off-by: Roger Pau Monné +Tested-by: oxjo +Reviewed-by: Jan Beulich +master commit: 83afa313583019d9f159c122cecf867735d27ec5 +master date: 2024-02-06 11:56:13 +0100 +--- + xen/drivers/passthrough/amd/iommu_acpi.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +diff --git a/xen/drivers/passthrough/amd/iommu_acpi.c b/xen/drivers/passthrough/amd/iommu_acpi.c +index 699d33f429..96d8879e7b 100644 +--- a/xen/drivers/passthrough/amd/iommu_acpi.c ++++ b/xen/drivers/passthrough/amd/iommu_acpi.c +@@ -426,9 +426,14 @@ static int __init parse_ivmd_block(const struct acpi_ivrs_memory *ivmd_block) + return -EIO; + } + +- /* Types which won't be handed out are considered good enough. */ +- if ( !(type & (RAM_TYPE_RESERVED | RAM_TYPE_ACPI | +- RAM_TYPE_UNUSABLE)) ) ++ /* ++ * Types which aren't RAM are considered good enough. 
++ * Note that a page being partially RESERVED, ACPI or UNUSABLE will ++ * force Xen into assuming the whole page as having that type in ++ * practice. ++ */ ++ if ( type & (RAM_TYPE_RESERVED | RAM_TYPE_ACPI | ++ RAM_TYPE_UNUSABLE) ) + continue; + + AMD_IOMMU_ERROR("IVMD: page at %lx can't be converted\n", addr); +-- +2.44.0 + + +From 16475909baa2bcfda3ebc07ced5e5cd0ca8172d6 Mon Sep 17 00:00:00 2001 +From: Jason Andryuk +Date: Tue, 27 Feb 2024 13:55:03 +0100 +Subject: [PATCH 30/70] block-common: Fix same_vm for no targets + +same_vm is broken when the two main domains do not have targets. otvm +and targetvm are both missing, which means they get set to -1 and then +converted to empty strings: + +++10697+ local targetvm=-1 +++10697+ local otvm=-1 +++10697+ otvm= +++10697+ othervm=/vm/cc97bc2f-3a91-43f7-8fbc-4cb92f90b4e4 +++10697+ targetvm= +++10697+ local frontend_uuid=/vm/844dea4e-44f8-4e3e-8145-325132a31ca5 + +The final comparison returns true since the two empty strings match: + +++10697+ '[' /vm/844dea4e-44f8-4e3e-8145-325132a31ca5 = /vm/cc97bc2f-3a91-43f7-8fbc-4cb92f90b4e4 -o '' = /vm/cc97bc2f-3a91-43f7-8fbc-4cb92f90b4e4 -o /vm/844dea4e-44f8-4e3e-8145-325132a31ca5 = '' -o '' = '' ']' + +Replace -1 with distinct strings indicating the lack of a value and +remove the coalescing to empty strings. The strings themselves will no +longer match, and that is correct.
+ +++12364+ '[' /vm/844dea4e-44f8-4e3e-8145-325132a31ca5 = /vm/cc97bc2f-3a91-43f7-8fbc-4cb92f90b4e4 -o 'No target' = /vm/cc97bc2f-3a91-43f7-8fbc-4cb92f90b4e4 -o /vm/844dea4e-44f8-4e3e-8145-325132a31ca5 = 'No other target' -o 'No target' = 'No other target' ']' + +Signed-off-by: Jason Andryuk +Reviewed-by: Anthony PERARD +master commit: e8f1bb803fdf44db708991593568a9e3e6b3d130 +master date: 2024-02-07 13:46:52 +0100 +--- + tools/hotplug/Linux/block-common.sh | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +diff --git a/tools/hotplug/Linux/block-common.sh b/tools/hotplug/Linux/block-common.sh +index f86a88c4eb..5c80237d99 100644 +--- a/tools/hotplug/Linux/block-common.sh ++++ b/tools/hotplug/Linux/block-common.sh +@@ -112,14 +112,12 @@ same_vm() + "$FRONTEND_UUID") + local target=$(xenstore_read_default "/local/domain/$FRONTEND_ID/target" \ + "-1") +- local targetvm=$(xenstore_read_default "/local/domain/$target/vm" "-1") ++ local targetvm=$(xenstore_read_default "/local/domain/$target/vm" "No Target") + local otarget=$(xenstore_read_default "/local/domain/$otherdom/target" \ + "-1") + local otvm=$(xenstore_read_default "/local/domain/$otarget/vm" \ +- "-1") +- otvm=${otvm%-1} +- othervm=${othervm%-1} +- targetvm=${targetvm%-1} ++ "No Other Target") ++ + local frontend_uuid=${FRONTEND_UUID%-1} + + [ "$frontend_uuid" = "$othervm" -o "$targetvm" = "$othervm" -o \ +-- +2.44.0 + + +From b51fd78aed865033413178f5953147effedc7ce0 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Petr=20Bene=C5=A1?= +Date: Tue, 27 Feb 2024 13:55:25 +0100 +Subject: [PATCH 31/70] x86/hvm: Fix fast singlestep state persistence +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This patch addresses an issue where the fast singlestep setting would persist +despite xc_domain_debug_control being called with XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF. 
+Specifically, if fast singlestep was enabled in a VMI session and that session +stopped before the MTF trap occurred, the fast singlestep setting remained +active even though MTF itself was disabled. This led to a situation where, upon +starting a new VMI session, the first event to trigger an EPT violation would +cause the corresponding EPT event callback to be skipped due to the lingering +fast singlestep setting. + +The fix ensures that the fast singlestep setting is properly reset when +disabling single step debugging operations. + +Signed-off-by: Petr Beneš +Reviewed-by: Tamas K Lengyel +master commit: 897def94b56175ce569673a05909d2f223e1e749 +master date: 2024-02-12 09:37:58 +0100 +--- + xen/arch/x86/hvm/hvm.c | 34 ++++++++++++++++++++++++---------- + 1 file changed, 24 insertions(+), 10 deletions(-) + +diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c +index 482eebbabf..a70b351373 100644 +--- a/xen/arch/x86/hvm/hvm.c ++++ b/xen/arch/x86/hvm/hvm.c +@@ -5167,26 +5167,40 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg) + + int hvm_debug_op(struct vcpu *v, int32_t op) + { +- int rc; ++ int rc = 0; + + switch ( op ) + { + case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON: + case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF: +- rc = -EOPNOTSUPP; + if ( !cpu_has_monitor_trap_flag ) +- break; +- rc = 0; +- vcpu_pause(v); +- v->arch.hvm.single_step = +- (op == XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON); +- vcpu_unpause(v); /* guest will latch new state */ ++ return -EOPNOTSUPP; + break; + default: +- rc = -ENOSYS; +- break; ++ return -ENOSYS; ++ } ++ ++ vcpu_pause(v); ++ ++ switch ( op ) ++ { ++ case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON: ++ v->arch.hvm.single_step = true; ++ break; ++ ++ case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF: ++ v->arch.hvm.single_step = false; ++ v->arch.hvm.fast_single_step.enabled = false; ++ v->arch.hvm.fast_single_step.p2midx = 0; ++ break; ++ ++ default: /* Excluded above */ ++ ASSERT_UNREACHABLE(); ++ return -ENOSYS; + } + ++ 
vcpu_unpause(v); /* guest will latch new state */ ++ + return rc; + } + +-- +2.44.0 + + +From 59e6ad6597dc9930c966b20485a9d0b369ff71a5 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Tue, 27 Feb 2024 13:55:56 +0100 +Subject: [PATCH 32/70] x86/HVM: tidy state on hvmemul_map_linear_addr()'s + error path + +While in the vast majority of cases failure of the function will not +be followed by re-invocation with the same emulation context, a few +very specific insns - involving multiple independent writes, e.g. ENTER +and PUSHA - exist where this can happen. Since failure of the function +only signals to the caller that it ought to try an MMIO write instead, +such failure also cannot be assumed to result in wholesale failure of +emulation of the current insn. Instead we have to maintain internal +state such that another invocation of the function with the same +emulation context remains possible. To achieve that we need to reset MFN +slots after putting page references on the error path. + +Note that all of this affects debugging code only, in causing an +assertion to trigger (higher up in the function). There's otherwise no +misbehavior - such a "leftover" slot would simply be overwritten by new +contents in a release build. + +Also extend the related unmap() assertion, to further check for MFN 0. + +Fixes: 8cbd4fb0b7ea ("x86/hvm: implement hvmemul_write() using real mappings") +Reported-by: Manuel Andreas +Signed-off-by: Jan Beulich +Acked-by: Paul Durrant +master commit: e72f951df407bc3be82faac64d8733a270036ba1 +master date: 2024-02-13 09:36:14 +0100 +--- + xen/arch/x86/hvm/emulate.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c +index 254716c766..865aa08bbc 100644 +--- a/xen/arch/x86/hvm/emulate.c ++++ b/xen/arch/x86/hvm/emulate.c +@@ -696,7 +696,12 @@ static void *hvmemul_map_linear_addr( + out: + /* Drop all held references. 
*/ + while ( mfn-- > hvmemul_ctxt->mfn ) ++ { + put_page(mfn_to_page(*mfn)); ++#ifndef NDEBUG /* Clean slot for a subsequent map()'s error checking. */ ++ *mfn = _mfn(0); ++#endif ++ } + + return err; + } +@@ -718,7 +723,7 @@ static void hvmemul_unmap_linear_addr( + + for ( i = 0; i < nr_frames; i++ ) + { +- ASSERT(mfn_valid(*mfn)); ++ ASSERT(mfn_x(*mfn) && mfn_valid(*mfn)); + paging_mark_dirty(currd, *mfn); + put_page(mfn_to_page(*mfn)); + +-- +2.44.0 + + +From 006764b871db75d5d025500a079ad246d1d418a1 Mon Sep 17 00:00:00 2001 +From: Anthony PERARD +Date: Tue, 27 Feb 2024 13:56:25 +0100 +Subject: [PATCH 33/70] build: Replace `which` with `command -v` +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The `which` command is not standard, may not exist on the build host, +or may not behave as expected by the build system. It is recommended +to use `command -v` to find out if a command exist and have its path, +and it's part of a POSIX shell standard (at least, it seems to be +mandatory since IEEE Std 1003.1-2008, but was optional before). + +Fixes: c8a8645f1efe ("xen/build: Automatically locate a suitable python interpreter") +Fixes: 3b47bcdb6d38 ("xen/build: Use a distro version of figlet") +Signed-off-by: Anthony PERARD +Tested-by: Marek Marczykowski-Górecki +Acked-by: Andrew Cooper +Reviewed-by: Jan Beulich +master commit: f93629b18b528a5ab1b1092949c5420069c7226c +master date: 2024-02-19 12:45:48 +0100 +--- + xen/Makefile | 4 ++-- + xen/build.mk | 2 +- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/xen/Makefile b/xen/Makefile +index a92709b43e..59d368e4d8 100644 +--- a/xen/Makefile ++++ b/xen/Makefile +@@ -25,8 +25,8 @@ export XEN_BUILD_HOST := $(shell hostname) + endif + + # Best effort attempt to find a python interpreter, defaulting to Python 3 if +-# available. Fall back to just `python` if `which` is nowhere to be found. 
+-PYTHON_INTERPRETER := $(word 1,$(shell which python3 python python2 2>/dev/null) python) ++# available. Fall back to just `python`. ++PYTHON_INTERPRETER := $(word 1,$(shell command -v python3 || command -v python || command -v python2) python) + export PYTHON ?= $(PYTHON_INTERPRETER) + + export CHECKPOLICY ?= checkpolicy +diff --git a/xen/build.mk b/xen/build.mk +index 26dd5a8e87..0f490ca71b 100644 +--- a/xen/build.mk ++++ b/xen/build.mk +@@ -1,6 +1,6 @@ + quiet_cmd_banner = BANNER $@ + define cmd_banner +- if which figlet >/dev/null 2>&1 ; then \ ++ if command -v figlet >/dev/null 2>&1 ; then \ + echo " Xen $(XEN_FULLVERSION)" | figlet -f $< > $@.tmp; \ + else \ + echo " Xen $(XEN_FULLVERSION)" > $@.tmp; \ +-- +2.44.0 + + +From 489c2b9ba173376e978c0ef3de416a2f09452e85 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Marek=20Marczykowski-G=C3=B3recki?= + +Date: Tue, 27 Feb 2024 13:57:07 +0100 +Subject: [PATCH 34/70] libxl: Disable relocating memory for qemu-xen in + stubdomain too +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +According to comments (and experiments) qemu-xen cannot handle memory +relocation done by hvmloader. The code was already disabled when running +qemu-xen in dom0 (see libxl__spawn_local_dm()), but it was missed when +adding qemu-xen support to stubdomain. Adjust libxl__spawn_stub_dm() to +be consistent in this regard. 
+ +Reported-by: Neowutran +Signed-off-by: Marek Marczykowski-Górecki +Reviewed-by: Jason Andryuk +Acked-by: Anthony PERARD +master commit: 97883aa269f6745a6ded232be3a855abb1297e0d +master date: 2024-02-22 11:48:22 +0100 +--- + tools/libs/light/libxl_dm.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c +index 14b593110f..ed620a9d8e 100644 +--- a/tools/libs/light/libxl_dm.c ++++ b/tools/libs/light/libxl_dm.c +@@ -2432,6 +2432,16 @@ void libxl__spawn_stub_dm(libxl__egc *egc, libxl__stub_dm_spawn_state *sdss) + "%s", + libxl_bios_type_to_string(guest_config->b_info.u.hvm.bios)); + } ++ /* Disable relocating memory to make the MMIO hole larger ++ * unless we're running qemu-traditional and vNUMA is not ++ * configured. */ ++ libxl__xs_printf(gc, XBT_NULL, ++ libxl__sprintf(gc, "%s/hvmloader/allow-memory-relocate", ++ libxl__xs_get_dompath(gc, guest_domid)), ++ "%d", ++ guest_config->b_info.device_model_version ++ == LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL && ++ !libxl__vnuma_configured(&guest_config->b_info)); + ret = xc_domain_set_target(ctx->xch, dm_domid, guest_domid); + if (ret<0) { + LOGED(ERROR, guest_domid, "setting target domain %d -> %d", +-- +2.44.0 + + +From 5fda82641461a5234ab9bf0575423dfb8bfc5657 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Tue, 27 Feb 2024 13:57:31 +0100 +Subject: [PATCH 35/70] build: make sure build fails when running kconfig fails +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Because of using "-include", failure to (re)build auto.conf (with +auto.conf.cmd produced as a secondary target) won't stop make from +continuing the build. Arrange for it being possible to drop the - from +Rules.mk, requiring that the include be skipped for tools-only targets. 
+Note that relying on the inclusion in those cases wouldn't be correct +anyway, as it might be a stale file (yet to be rebuilt) which would be +included, while during initial build, the file would be absent +altogether. + +Fixes: 8d4c17a90b0a ("xen/build: silence make warnings about missing auto.conf*") +Reported-by: Roger Pau Monné +Signed-off-by: Jan Beulich +Reviewed-by: Anthony PERARD +master commit: d34e5fa2e8db19f23081f46a3e710bb122130691 +master date: 2024-02-22 11:52:47 +0100 +--- + xen/Makefile | 1 + + xen/Rules.mk | 4 +++- + 2 files changed, 4 insertions(+), 1 deletion(-) + +diff --git a/xen/Makefile b/xen/Makefile +index 59d368e4d8..fdf9fd3f22 100644 +--- a/xen/Makefile ++++ b/xen/Makefile +@@ -374,6 +374,7 @@ $(KCONFIG_CONFIG): tools_fixdep + # This exploits the 'multi-target pattern rule' trick. + # The syncconfig should be executed only once to make all the targets. + include/config/%.conf include/config/%.conf.cmd: $(KCONFIG_CONFIG) ++ $(Q)rm -f include/config/auto.conf + $(Q)$(MAKE) $(build)=tools/kconfig syncconfig + + ifeq ($(CONFIG_DEBUG),y) +diff --git a/xen/Rules.mk b/xen/Rules.mk +index 8af3dd7277..d759cccee3 100644 +--- a/xen/Rules.mk ++++ b/xen/Rules.mk +@@ -15,7 +15,9 @@ srcdir := $(srctree)/$(src) + PHONY := __build + __build: + +--include $(objtree)/include/config/auto.conf ++ifneq ($(firstword $(subst /, ,$(obj))),tools) ++include $(objtree)/include/config/auto.conf ++endif + + include $(XEN_ROOT)/Config.mk + include $(srctree)/scripts/Kbuild.include +-- +2.44.0 + + +From a751d1321f6e1491d6ec2134d59eefa9f9752b86 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Tue, 27 Feb 2024 13:57:50 +0100 +Subject: [PATCH 36/70] x86emul: add missing EVEX.R' checks + +EVEX.R' is not ignored in 64-bit code when encoding a GPR or mask +register. While for mask registers suitable checks are in place (there +also covering EVEX.R), they were missing for the few cases where in +EVEX-encoded instructions ModR/M.reg encodes a GPR. 
While for VPEXTRW +the bit is replaced before an emulation stub is invoked, for +VCVT{,T}{S,D,H}2{,U}SI this actually would have led to #UD from inside +an emulation stub, in turn raising #UD to the guest, but accompanied by +log messages indicating something's wrong in Xen nevertheless. + +Fixes: 001bd91ad864 ("x86emul: support AVX512{F,BW,DQ} extract insns") +Fixes: baf4a376f550 ("x86emul: support AVX512F legacy-equivalent scalar int/FP conversion insns") +Signed-off-by: Jan Beulich +Acked-by: Andrew Cooper +master commit: cb319824bfa8d3c9ea0410cc71daaedc3e11aa2a +master date: 2024-02-22 11:54:07 +0100 +--- + xen/arch/x86/x86_emulate/x86_emulate.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c +index cf780da501..d6b60f0539 100644 +--- a/xen/arch/x86/x86_emulate/x86_emulate.c ++++ b/xen/arch/x86/x86_emulate/x86_emulate.c +@@ -3686,7 +3686,8 @@ x86_emulate( + CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x2d): /* vcvts{s,d}2si xmm/mem,reg */ + CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x78): /* vcvtts{s,d}2usi xmm/mem,reg */ + CASE_SIMD_SCALAR_FP(_EVEX, 0x0f, 0x79): /* vcvts{s,d}2usi xmm/mem,reg */ +- generate_exception_if((evex.reg != 0xf || !evex.RX || evex.opmsk || ++ generate_exception_if((evex.reg != 0xf || !evex.RX || !evex.R || ++ evex.opmsk || + (ea.type != OP_REG && evex.brs)), + X86_EXC_UD); + host_and_vcpu_must_have(avx512f); +@@ -7295,7 +7296,7 @@ x86_emulate( + goto pextr; + + case X86EMUL_OPC_EVEX_66(0x0f, 0xc5): /* vpextrw $imm8,xmm,reg */ +- generate_exception_if(ea.type != OP_REG, X86_EXC_UD); ++ generate_exception_if(ea.type != OP_REG || !evex.R, X86_EXC_UD); + /* Convert to alternative encoding: We want to use a memory operand. 
*/ + evex.opcx = ext_0f3a; + b = 0x15; +-- +2.44.0 + + +From 33a0368d3beb82ddb0cf7ed398b047325bb7be1c Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 27 Feb 2024 13:58:21 +0100 +Subject: [PATCH 37/70] xen/livepatch: fix norevert test hook setup typo +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The test code has a typo in using LIVEPATCH_APPLY_HOOK() instead of +LIVEPATCH_REVERT_HOOK(). + +Fixes: 6047104c3ccc ('livepatch: Add per-function applied/reverted state tracking marker') +Signed-off-by: Roger Pau Monné +Reviewed-by: Ross Lagerwall +master commit: f0622dd4fd6ae6ddb523a45d89ed9b8f3a9a8f36 +master date: 2024-02-26 10:13:46 +0100 +--- + xen/test/livepatch/xen_action_hooks_norevert.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/test/livepatch/xen_action_hooks_norevert.c b/xen/test/livepatch/xen_action_hooks_norevert.c +index 3e21ade6ab..c173855192 100644 +--- a/xen/test/livepatch/xen_action_hooks_norevert.c ++++ b/xen/test/livepatch/xen_action_hooks_norevert.c +@@ -120,7 +120,7 @@ static void post_revert_hook(livepatch_payload_t *payload) + printk(KERN_DEBUG "%s: Hook done.\n", __func__); + } + +-LIVEPATCH_APPLY_HOOK(revert_hook); ++LIVEPATCH_REVERT_HOOK(revert_hook); + + LIVEPATCH_PREAPPLY_HOOK(pre_apply_hook); + LIVEPATCH_POSTAPPLY_HOOK(post_apply_hook); +-- +2.44.0 + + +From f6e5ab5fa7257783fdbbaabf6010d8d97656c11f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 27 Feb 2024 13:58:36 +0100 +Subject: [PATCH 38/70] xen/cmdline: fix printf format specifier in + no_config_param() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +'*' sets the width field, which is the minimum number of characters to output, +but what we want in no_config_param() is the precision instead, which is '.*' +as it imposes a maximum limit on the output. 
+ +Fixes: 68d757df8dd2 ('x86/pv: Options to disable and/or compile out 32bit PV support') +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +master commit: ef101f525173cf51dc70f4c77862f6f10a8ddccf +master date: 2024-02-26 10:17:40 +0100 +--- + xen/include/xen/param.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/include/xen/param.h b/xen/include/xen/param.h +index 93c3fe7cb7..e02e49635c 100644 +--- a/xen/include/xen/param.h ++++ b/xen/include/xen/param.h +@@ -191,7 +191,7 @@ static inline void no_config_param(const char *cfg, const char *param, + { + int len = e ? ({ ASSERT(e >= s); e - s; }) : strlen(s); + +- printk(XENLOG_INFO "CONFIG_%s disabled - ignoring '%s=%*s' setting\n", ++ printk(XENLOG_INFO "CONFIG_%s disabled - ignoring '%s=%.*s' setting\n", + cfg, param, len, s); + } + +-- +2.44.0 + + +From 19fd9ff9981732995b1028f9e7e406061b723651 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 27 Feb 2024 13:59:05 +0100 +Subject: [PATCH 39/70] x86/altcall: use a union as register type for function + parameters on clang +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The current code for alternative calls uses the caller parameter types as the +types for the register variables that serve as function parameters: + +uint8_t foo; +[...] +alternative_call(myfunc, foo); + +Would expand roughly into: + +register uint8_t a1_ asm("rdi") = foo; +register unsigned long a2_ asm("rsi"); +[...] 
+asm volatile ("call *%c[addr](%%rip)"...); + +However with -O2 clang will generate incorrect code, given the following +example: + +unsigned int func(uint8_t t) +{ + return t; +} + +static void bar(uint8_t b) +{ + int ret_; + register uint8_t di asm("rdi") = b; + register unsigned long si asm("rsi"); + register unsigned long dx asm("rdx"); + register unsigned long cx asm("rcx"); + register unsigned long r8 asm("r8"); + register unsigned long r9 asm("r9"); + register unsigned long r10 asm("r10"); + register unsigned long r11 asm("r11"); + + asm volatile ( "call %c[addr]" + : "+r" (di), "=r" (si), "=r" (dx), + "=r" (cx), "=r" (r8), "=r" (r9), + "=r" (r10), "=r" (r11), "=a" (ret_) + : [addr] "i" (&(func)), "g" (func) + : "memory" ); +} + +void foo(unsigned int a) +{ + bar(a); +} + +Clang generates the following assembly code: + +func: # @func + movl %edi, %eax + retq +foo: # @foo + callq func + retq + +Note the truncation of the unsigned int parameter 'a' of foo() to uint8_t when +passed into bar() is lost. clang doesn't zero extend the parameters in the +callee when required, as the psABI mandates. + +The above can be worked around by using a union when defining the register +variables, so that `di` becomes: + +register union { + uint8_t e; + unsigned long r; +} di asm("rdi") = { .e = b }; + +Which results in the following code generated for `foo()`: + +foo: # @foo + movzbl %dil, %edi + callq func + retq + +So the truncation is no longer lost. Apply such workaround only when built +with clang. 
+ +Reported-by: Matthew Grooms +Link: https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=277200 +Link: https://github.com/llvm/llvm-project/issues/12579 +Link: https://github.com/llvm/llvm-project/issues/82598 +Signed-off-by: Roger Pau Monné +Acked-by: Jan Beulich +master commit: 2ce562b2a413cbdb2e1128989ed1722290a27c4e +master date: 2024-02-26 10:18:01 +0100 +--- + xen/arch/x86/include/asm/alternative.h | 25 +++++++++++++++++++++++++ + 1 file changed, 25 insertions(+) + +diff --git a/xen/arch/x86/include/asm/alternative.h b/xen/arch/x86/include/asm/alternative.h +index a1cd6a9fe5..3c14db5078 100644 +--- a/xen/arch/x86/include/asm/alternative.h ++++ b/xen/arch/x86/include/asm/alternative.h +@@ -167,9 +167,34 @@ extern void alternative_branches(void); + #define ALT_CALL_arg5 "r8" + #define ALT_CALL_arg6 "r9" + ++#ifdef CONFIG_CC_IS_CLANG ++/* ++ * Use a union with an unsigned long in order to prevent clang from ++ * skipping a possible truncation of the value. By using the union any ++ * truncation is carried before the call instruction, in turn covering ++ * for ABI-non-compliance in that the necessary clipping / extension of ++ * the value is supposed to be carried out in the callee. ++ * ++ * Note this behavior is not mandated by the standard, and hence could ++ * stop being a viable workaround, or worse, could cause a different set ++ * of code-generation issues in future clang versions. 
++ * ++ * This has been reported upstream: ++ * https://github.com/llvm/llvm-project/issues/12579 ++ * https://github.com/llvm/llvm-project/issues/82598 ++ */ ++#define ALT_CALL_ARG(arg, n) \ ++ register union { \ ++ typeof(arg) e; \ ++ unsigned long r; \ ++ } a ## n ## _ asm ( ALT_CALL_arg ## n ) = { \ ++ .e = ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); }) \ ++ } ++#else + #define ALT_CALL_ARG(arg, n) \ + register typeof(arg) a ## n ## _ asm ( ALT_CALL_arg ## n ) = \ + ({ BUILD_BUG_ON(sizeof(arg) > sizeof(void *)); (arg); }) ++#endif + #define ALT_CALL_NO_ARG(n) \ + register unsigned long a ## n ## _ asm ( ALT_CALL_arg ## n ) + +-- +2.44.0 + + +From 4d47dca20dcfdca2340c8cda6f50dcdcafb1c054 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 27 Feb 2024 13:59:42 +0100 +Subject: [PATCH 40/70] x86/spec: fix BRANCH_HARDEN option to only be set when + build-enabled +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The current logic to handle the BRANCH_HARDEN option will report it as enabled +even when build-time disabled. Fix this by only allowing the option to be set +when support for it is built into Xen. 
+ +Fixes: 2d6f36daa086 ('x86/nospec: Introduce CONFIG_SPECULATIVE_HARDEN_BRANCH') +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +master commit: 60e00f77a5cc671d30c5ef3318f5b8e9b74e4aa3 +master date: 2024-02-26 16:06:42 +0100 +--- + xen/arch/x86/spec_ctrl.c | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index a8d8af22f6..01ba59cff7 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -50,7 +50,8 @@ static int8_t __initdata opt_psfd = -1; + int8_t __ro_after_init opt_ibpb_ctxt_switch = -1; + int8_t __read_mostly opt_eager_fpu = -1; + int8_t __read_mostly opt_l1d_flush = -1; +-static bool __initdata opt_branch_harden = true; ++static bool __initdata opt_branch_harden = ++ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH); + + bool __initdata bsp_delay_spec_ctrl; + uint8_t __read_mostly default_xen_spec_ctrl; +@@ -268,7 +269,16 @@ static int __init cf_check parse_spec_ctrl(const char *s) + else if ( (val = parse_boolean("l1d-flush", s, ss)) >= 0 ) + opt_l1d_flush = val; + else if ( (val = parse_boolean("branch-harden", s, ss)) >= 0 ) +- opt_branch_harden = val; ++ { ++ if ( IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH) ) ++ opt_branch_harden = val; ++ else ++ { ++ no_config_param("SPECULATIVE_HARDEN_BRANCH", "spec-ctrl", s, ++ ss); ++ rc = -EINVAL; ++ } ++ } + else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 ) + opt_srb_lock = val; + else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 ) +-- +2.44.0 + + +From 58bb8115104c9fca749ee4cfcd3579ac1ed644db Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Tue, 27 Feb 2024 14:00:22 +0100 +Subject: [PATCH 41/70] x86: account for shadow stack in exception-from-stub + recovery + +Dealing with exceptions raised from within emulation stubs involves +discarding return address (replaced by exception related information). 
+Such discarding of course also requires removing the corresponding entry +from the shadow stack. + +Also amend the comment in fixup_exception_return(), to further clarify +why use of ptr[1] can't be an out-of-bounds access. + +While touching do_invalid_op() also add a missing fall-through +annotation. + +This is CVE-2023-46841 / XSA-451. + +Fixes: 209fb9919b50 ("x86/extable: Adjust extable handling to be shadow stack compatible") +Signed-off-by: Jan Beulich +Reviewed-by: Andrew Cooper +master commit: 91f5f7a9154919a765c3933521760acffeddbf28 +master date: 2024-02-27 13:49:22 +0100 +--- + xen/arch/x86/extable.c | 20 ++++++---- + xen/arch/x86/include/asm/uaccess.h | 3 +- + xen/arch/x86/traps.c | 62 +++++++++++++++++++++++++++--- + 3 files changed, 71 insertions(+), 14 deletions(-) + +diff --git a/xen/arch/x86/extable.c b/xen/arch/x86/extable.c +index 74b14246e9..8ffcd346d7 100644 +--- a/xen/arch/x86/extable.c ++++ b/xen/arch/x86/extable.c +@@ -86,26 +86,29 @@ search_one_extable(const struct exception_table_entry *first, + } + + unsigned long +-search_exception_table(const struct cpu_user_regs *regs) ++search_exception_table(const struct cpu_user_regs *regs, unsigned long *stub_ra) + { + const struct virtual_region *region = find_text_region(regs->rip); + unsigned long stub = this_cpu(stubs.addr); + + if ( region && region->ex ) ++ { ++ *stub_ra = 0; + return search_one_extable(region->ex, region->ex_end, regs->rip); ++ } + + if ( regs->rip >= stub + STUB_BUF_SIZE / 2 && + regs->rip < stub + STUB_BUF_SIZE && + regs->rsp > (unsigned long)regs && + regs->rsp < (unsigned long)get_cpu_info() ) + { +- unsigned long retptr = *(unsigned long *)regs->rsp; ++ unsigned long retaddr = *(unsigned long *)regs->rsp, fixup; + +- region = find_text_region(retptr); +- retptr = region && region->ex +- ? search_one_extable(region->ex, region->ex_end, retptr) +- : 0; +- if ( retptr ) ++ region = find_text_region(retaddr); ++ fixup = region && region->ex ++ ? 
search_one_extable(region->ex, region->ex_end, retaddr) ++ : 0; ++ if ( fixup ) + { + /* + * Put trap number and error code on the stack (in place of the +@@ -117,7 +120,8 @@ search_exception_table(const struct cpu_user_regs *regs) + }; + + *(unsigned long *)regs->rsp = token.raw; +- return retptr; ++ *stub_ra = retaddr; ++ return fixup; + } + } + +diff --git a/xen/arch/x86/include/asm/uaccess.h b/xen/arch/x86/include/asm/uaccess.h +index 684fccd95c..74bb222c03 100644 +--- a/xen/arch/x86/include/asm/uaccess.h ++++ b/xen/arch/x86/include/asm/uaccess.h +@@ -421,7 +421,8 @@ union stub_exception_token { + unsigned long raw; + }; + +-extern unsigned long search_exception_table(const struct cpu_user_regs *regs); ++extern unsigned long search_exception_table(const struct cpu_user_regs *regs, ++ unsigned long *stub_ra); + extern void sort_exception_tables(void); + extern void sort_exception_table(struct exception_table_entry *start, + const struct exception_table_entry *stop); +diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c +index e1356f696a..45e1b277ea 100644 +--- a/xen/arch/x86/traps.c ++++ b/xen/arch/x86/traps.c +@@ -845,7 +845,7 @@ void do_unhandled_trap(struct cpu_user_regs *regs) + } + + static void fixup_exception_return(struct cpu_user_regs *regs, +- unsigned long fixup) ++ unsigned long fixup, unsigned long stub_ra) + { + if ( IS_ENABLED(CONFIG_XEN_SHSTK) ) + { +@@ -862,7 +862,8 @@ static void fixup_exception_return(struct cpu_user_regs *regs, + /* + * Search for %rip. The shstk currently looks like this: + * +- * ... [Likely pointed to by SSP] ++ * tok [Supervisor token, == &tok | BUSY, only with FRED inactive] ++ * ... 
[Pointed to by SSP for most exceptions, empty in IST cases] + * %cs [== regs->cs] + * %rip [== regs->rip] + * SSP [Likely points to 3 slots higher, above %cs] +@@ -880,7 +881,56 @@ static void fixup_exception_return(struct cpu_user_regs *regs, + */ + if ( ptr[0] == regs->rip && ptr[1] == regs->cs ) + { ++ unsigned long primary_shstk = ++ (ssp & ~(STACK_SIZE - 1)) + ++ (PRIMARY_SHSTK_SLOT + 1) * PAGE_SIZE - 8; ++ + wrss(fixup, ptr); ++ ++ if ( !stub_ra ) ++ goto shstk_done; ++ ++ /* ++ * Stub recovery ought to happen only when the outer context ++ * was on the main shadow stack. We need to also "pop" the ++ * stub's return address from the interrupted context's shadow ++ * stack. That is, ++ * - if we're still on the main stack, we need to move the ++ * entire stack (up to and including the exception frame) ++ * up by one slot, incrementing the original SSP in the ++ * exception frame, ++ * - if we're on an IST stack, we need to increment the ++ * original SSP. ++ */ ++ BUG_ON((ptr[-1] ^ primary_shstk) >> PAGE_SHIFT); ++ ++ if ( (ssp ^ primary_shstk) >> PAGE_SHIFT ) ++ { ++ /* ++ * We're on an IST stack. First make sure the two return ++ * addresses actually match. Then increment the interrupted ++ * context's SSP. ++ */ ++ BUG_ON(stub_ra != *(unsigned long*)ptr[-1]); ++ wrss(ptr[-1] + 8, &ptr[-1]); ++ goto shstk_done; ++ } ++ ++ /* Make sure the two return addresses actually match. */ ++ BUG_ON(stub_ra != ptr[2]); ++ ++ /* Move exception frame, updating SSP there. */ ++ wrss(ptr[1], &ptr[2]); /* %cs */ ++ wrss(ptr[0], &ptr[1]); /* %rip */ ++ wrss(ptr[-1] + 8, &ptr[0]); /* SSP */ ++ ++ /* Move all newer entries. */ ++ while ( --ptr != _p(ssp) ) ++ wrss(ptr[-1], &ptr[0]); ++ ++ /* Finally account for our own stack having shifted up. 
*/ ++ asm volatile ( "incsspd %0" :: "r" (2) ); ++ + goto shstk_done; + } + } +@@ -901,7 +951,8 @@ static void fixup_exception_return(struct cpu_user_regs *regs, + + static bool extable_fixup(struct cpu_user_regs *regs, bool print) + { +- unsigned long fixup = search_exception_table(regs); ++ unsigned long stub_ra = 0; ++ unsigned long fixup = search_exception_table(regs, &stub_ra); + + if ( unlikely(fixup == 0) ) + return false; +@@ -915,7 +966,7 @@ static bool extable_fixup(struct cpu_user_regs *regs, bool print) + vector_name(regs->entry_vector), regs->error_code, + _p(regs->rip), _p(regs->rip), _p(fixup)); + +- fixup_exception_return(regs, fixup); ++ fixup_exception_return(regs, fixup, stub_ra); + this_cpu(last_extable_addr) = regs->rip; + + return true; +@@ -1183,7 +1234,8 @@ void do_invalid_op(struct cpu_user_regs *regs) + { + case BUGFRAME_run_fn: + case BUGFRAME_warn: +- fixup_exception_return(regs, (unsigned long)eip); ++ fixup_exception_return(regs, (unsigned long)eip, 0); ++ fallthrough; + case BUGFRAME_bug: + case BUGFRAME_assert: + return; +-- +2.44.0 + + +From 498b3624d0ecc1267773e6482fd0b732e90c4511 Mon Sep 17 00:00:00 2001 +From: Michal Orzel +Date: Thu, 8 Feb 2024 11:43:39 +0100 +Subject: [PATCH 42/70] xen/arm: Fix UBSAN failure in start_xen() + +When running Xen on arm32, in scenario where Xen is loaded at an address +such as boot_phys_offset >= 2GB, UBSAN reports the following: + +(XEN) UBSAN: Undefined behaviour in arch/arm/setup.c:739:58 +(XEN) pointer operation underflowed 00200000 to 86800000 +(XEN) Xen WARN at common/ubsan/ubsan.c:172 +(XEN) ----[ Xen-4.19-unstable arm32 debug=y ubsan=y Not tainted ]---- +... 
+(XEN) Xen call trace: +(XEN) [<0031b4c0>] ubsan.c#ubsan_epilogue+0x18/0xf0 (PC) +(XEN) [<0031d134>] __ubsan_handle_pointer_overflow+0xb8/0xd4 (LR) +(XEN) [<0031d134>] __ubsan_handle_pointer_overflow+0xb8/0xd4 +(XEN) [<004d15a8>] start_xen+0xe0/0xbe0 +(XEN) [<0020007c>] head.o#primary_switched+0x4/0x30 + +The failure is reported for the following line: +(paddr_t)(uintptr_t)(_start + boot_phys_offset) + +This occurs because the compiler treats (ptr + size) with size bigger than +PTRDIFF_MAX as undefined behavior. To address this, switch to macro +virt_to_maddr(), given the future plans to eliminate boot_phys_offset. + +Signed-off-by: Michal Orzel +Reviewed-by: Luca Fancellu +Tested-by: Luca Fancellu +Acked-by: Julien Grall +(cherry picked from commit e11f5766503c0ff074b4e0f888bbfc931518a169) +--- + xen/arch/arm/setup.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/arch/arm/setup.c b/xen/arch/arm/setup.c +index db748839d3..2ccdde5277 100644 +--- a/xen/arch/arm/setup.c ++++ b/xen/arch/arm/setup.c +@@ -1109,7 +1109,7 @@ void __init start_xen(unsigned long boot_phys_offset, + + /* Register Xen's load address as a boot module. */ + xen_bootmodule = add_boot_module(BOOTMOD_XEN, +- (paddr_t)(uintptr_t)(_start + boot_phys_offset), ++ virt_to_maddr(_start), + (paddr_t)(uintptr_t)(_end - _start), false); + BUG_ON(!xen_bootmodule); + +-- +2.44.0 + + +From 3e383bb4137c6ca3058cd55cb867ecc2b7414499 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Tue, 5 Mar 2024 11:48:39 +0100 +Subject: [PATCH 43/70] x86/HVM: hide SVM/VMX when their enabling is prohibited + by firmware +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +... or we fail to enable the functionality on the BSP for other reasons. +The only place where hardware announcing the feature is recorded is the +raw CPU policy/featureset. + +Inspired by https://lore.kernel.org/all/20230921114940.957141-1-pbonzini@redhat.com/. 
+ +Signed-off-by: Jan Beulich +Acked-by: Roger Pau Monné +master commit: 0b5f149338e35a795bf609ce584640b0977f9e6c +master date: 2024-01-09 14:06:34 +0100 +--- + xen/arch/x86/hvm/svm/svm.c | 1 + + xen/arch/x86/hvm/vmx/vmcs.c | 17 +++++++++++++++++ + 2 files changed, 18 insertions(+) + +diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c +index 24c417ca71..ff991c82cf 100644 +--- a/xen/arch/x86/hvm/svm/svm.c ++++ b/xen/arch/x86/hvm/svm/svm.c +@@ -2543,6 +2543,7 @@ const struct hvm_function_table * __init start_svm(void) + + if ( _svm_cpu_up(true) ) + { ++ setup_clear_cpu_cap(X86_FEATURE_SVM); + printk("SVM: failed to initialise.\n"); + return NULL; + } +diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c +index 13719cc923..e382aa16c5 100644 +--- a/xen/arch/x86/hvm/vmx/vmcs.c ++++ b/xen/arch/x86/hvm/vmx/vmcs.c +@@ -2165,6 +2165,23 @@ int __init vmx_vmcs_init(void) + + if ( !ret ) + register_keyhandler('v', vmcs_dump, "dump VT-x VMCSs", 1); ++ else ++ { ++ setup_clear_cpu_cap(X86_FEATURE_VMX); ++ ++ /* ++ * _vmx_vcpu_up() may have made it past feature identification. ++ * Make sure all dependent features are off as well. ++ */ ++ vmx_basic_msr = 0; ++ vmx_pin_based_exec_control = 0; ++ vmx_cpu_based_exec_control = 0; ++ vmx_secondary_exec_control = 0; ++ vmx_vmexit_control = 0; ++ vmx_vmentry_control = 0; ++ vmx_ept_vpid_cap = 0; ++ vmx_vmfunc = 0; ++ } + + return ret; + } +-- +2.44.0 + + +From 57f137053652d5a981ae21f3abe7becc507fe434 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 5 Mar 2024 11:49:22 +0100 +Subject: [PATCH 44/70] xen/sched: Fix UB shift in compat_set_timer_op() + +Tamas reported this UBSAN failure from fuzzing: + + (XEN) ================================================================================ + (XEN) UBSAN: Undefined behaviour in common/sched/compat.c:48:37 + (XEN) left shift of negative value -2147425536 + (XEN) ----[ Xen-4.19-unstable x86_64 debug=y ubsan=y Not tainted ]---- + ... 
+ (XEN) Xen call trace: + (XEN) [] R ubsan.c#ubsan_epilogue+0xa/0xd9 + (XEN) [] F __ubsan_handle_shift_out_of_bounds+0x11a/0x1c5 + (XEN) [] F compat_set_timer_op+0x41/0x43 + (XEN) [] F hvm_do_multicall_call+0x77f/0xa75 + (XEN) [] F arch_do_multicall_call+0xec/0xf1 + (XEN) [] F do_multicall+0x1dc/0xde3 + (XEN) [] F hvm_hypercall+0xa00/0x149a + (XEN) [] F vmx_vmexit_handler+0x1596/0x279c + (XEN) [] F vmx_asm_vmexit_handler+0xdb/0x200 + +Left-shifting any negative value is strictly undefined behaviour in C, and +the two parameters here come straight from the guest. + +The fuzzer happened to choose lo 0xf, hi 0x8000e300. + +Switch everything to be unsigned values, making the shift well defined. + +As GCC documents: + + As an extension to the C language, GCC does not use the latitude given in + C99 and C11 only to treat certain aspects of signed '<<' as undefined. + However, -fsanitize=shift (and -fsanitize=undefined) will diagnose such + cases. + +this was deemed not to need an XSA. + +Note: The unsigned -> signed conversion for do_set_timer_op()'s s_time_t +parameter is also well defined. C makes it implementation defined, and GCC +defines it as reduction modulo 2^N to be within range of the new type. 
+ +Fixes: 2942f45e09fb ("Enable compatibility mode operation for HYPERVISOR_sched_op and HYPERVISOR_set_timer_op.") +Reported-by: Tamas K Lengyel +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +master commit: ae6d4fd876765e6d623eec67d14f5d0464be09cb +master date: 2024-02-01 19:52:44 +0000 +--- + xen/common/sched/compat.c | 4 ++-- + xen/include/hypercall-defs.c | 2 +- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/xen/common/sched/compat.c b/xen/common/sched/compat.c +index d718e450d4..dd97593630 100644 +--- a/xen/common/sched/compat.c ++++ b/xen/common/sched/compat.c +@@ -43,9 +43,9 @@ static int compat_poll(struct compat_sched_poll *compat) + + #include "core.c" + +-int compat_set_timer_op(uint32_t lo, int32_t hi) ++int compat_set_timer_op(uint32_t lo, uint32_t hi) + { +- return do_set_timer_op(((s64)hi << 32) | lo); ++ return do_set_timer_op(((uint64_t)hi << 32) | lo); + } + + #endif /* __COMMON_SCHED_COMPAT_C__ */ +diff --git a/xen/include/hypercall-defs.c b/xen/include/hypercall-defs.c +index 6d361ddfce..47c093acc8 100644 +--- a/xen/include/hypercall-defs.c ++++ b/xen/include/hypercall-defs.c +@@ -134,7 +134,7 @@ xenoprof_op(int op, void *arg) + + #ifdef CONFIG_COMPAT + prefix: compat +-set_timer_op(uint32_t lo, int32_t hi) ++set_timer_op(uint32_t lo, uint32_t hi) + multicall(multicall_entry_compat_t *call_list, uint32_t nr_calls) + memory_op(unsigned int cmd, void *arg) + #ifdef CONFIG_IOREQ_SERVER +-- +2.44.0 + + +From b7f9168878155e2d29b9b4a3048b0a9a68ed82ed Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 5 Mar 2024 11:50:16 +0100 +Subject: [PATCH 45/70] x86/spec: print the built-in SPECULATIVE_HARDEN_* + options +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Just like it's done for INDIRECT_THUNK and SHADOW_PAGING. 
+ +Reported-by: Jan Beulich +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +master commit: 6e9507f7d51fe49df8bc70f83e49ce06c92e4e54 +master date: 2024-02-27 14:57:52 +0100 +--- + xen/arch/x86/spec_ctrl.c | 14 +++++++++++++- + 1 file changed, 13 insertions(+), 1 deletion(-) + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 01ba59cff7..04e508b622 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -476,13 +476,25 @@ static void __init print_details(enum ind_thunk thunk) + (e21a & cpufeat_mask(X86_FEATURE_SBPB)) ? " SBPB" : ""); + + /* Compiled-in support which pertains to mitigations. */ +- if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) ) ++ if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) || ++ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_ARRAY) || ++ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH) || ++ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS) ) + printk(" Compiled-in support:" + #ifdef CONFIG_INDIRECT_THUNK + " INDIRECT_THUNK" + #endif + #ifdef CONFIG_SHADOW_PAGING + " SHADOW_PAGING" ++#endif ++#ifdef CONFIG_SPECULATIVE_HARDEN_ARRAY ++ " HARDEN_ARRAY" ++#endif ++#ifdef CONFIG_SPECULATIVE_HARDEN_BRANCH ++ " HARDEN_BRANCH" ++#endif ++#ifdef CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS ++ " HARDEN_GUEST_ACCESS" + #endif + "\n"); + +-- +2.44.0 + + +From 09b9db0413b1f31f27bece07b2bfa1723b89ace6 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 5 Mar 2024 11:50:53 +0100 +Subject: [PATCH 46/70] x86/spec: fix INDIRECT_THUNK option to only be set when + build-enabled +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Attempt to provide a more helpful error message when the user attempts to set +spec-ctrl=bti-thunk option but the support is build-time disabled. + +While there also adjust the command line documentation to mention +CONFIG_INDIRECT_THUNK instead of INDIRECT_THUNK. 
+ +Reported-by: Andrew Cooper +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +master commit: 8441fa806a3b778867867cd0159fa1722e90397e +master date: 2024-02-27 14:58:20 +0100 +--- + docs/misc/xen-command-line.pandoc | 10 +++++----- + xen/arch/x86/spec_ctrl.c | 7 ++++++- + 2 files changed, 11 insertions(+), 6 deletions(-) + +diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc +index 8e65f8bd18..582d6741d1 100644 +--- a/docs/misc/xen-command-line.pandoc ++++ b/docs/misc/xen-command-line.pandoc +@@ -2424,11 +2424,11 @@ guests to use. + performance reasons dom0 is unprotected by default. If it is necessary to + protect dom0 too, boot with `spec-ctrl=ibpb-entry`. + +-If Xen was compiled with INDIRECT_THUNK support, `bti-thunk=` can be used to +-select which of the thunks gets patched into the `__x86_indirect_thunk_%reg` +-locations. The default thunk is `retpoline` (generally preferred), with the +-alternatives being `jmp` (a `jmp *%reg` gadget, minimal overhead), and +-`lfence` (an `lfence; jmp *%reg` gadget). ++If Xen was compiled with `CONFIG_INDIRECT_THUNK` support, `bti-thunk=` can be ++used to select which of the thunks gets patched into the ++`__x86_indirect_thunk_%reg` locations. The default thunk is `retpoline` ++(generally preferred), with the alternatives being `jmp` (a `jmp *%reg` gadget, ++minimal overhead), and `lfence` (an `lfence; jmp *%reg` gadget). + + On hardware supporting IBRS (Indirect Branch Restricted Speculation), the + `ibrs=` option can be used to force or prevent Xen using the feature itself. 
+diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 04e508b622..99ecfb3cba 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -241,7 +241,12 @@ static int __init cf_check parse_spec_ctrl(const char *s) + { + s += 10; + +- if ( !cmdline_strcmp(s, "retpoline") ) ++ if ( !IS_ENABLED(CONFIG_INDIRECT_THUNK) ) ++ { ++ no_config_param("INDIRECT_THUNK", "spec-ctrl", s - 10, ss); ++ rc = -EINVAL; ++ } ++ else if ( !cmdline_strcmp(s, "retpoline") ) + opt_thunk = THUNK_RETPOLINE; + else if ( !cmdline_strcmp(s, "lfence") ) + opt_thunk = THUNK_LFENCE; +-- +2.44.0 + + +From 7404c25efdc70091817479b80dbbd945e6ab4861 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 5 Mar 2024 11:51:56 +0100 +Subject: [PATCH 47/70] x86/spec: do not print thunk option selection if not + built-in +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Since the thunk built-in enable is printed as part of the "Compiled-in +support:" line, avoid printing anything in "Xen settings:" if the thunk is +disabled at build time. + +Note the BTI-Thunk option printing is also adjusted to print a colon in the +same way the other options on the line do. + +Requested-by: Jan Beulich +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +master commit: 576528a2a742069af203e90c613c5c93e23c9755 +master date: 2024-02-27 14:58:40 +0100 +--- + xen/arch/x86/spec_ctrl.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 99ecfb3cba..a965b6db28 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -504,11 +504,12 @@ static void __init print_details(enum ind_thunk thunk) + "\n"); + + /* Settings for Xen's protection, irrespective of guests. */ +- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n", +- thunk == THUNK_NONE ? "N/A" : +- thunk == THUNK_RETPOLINE ? 
"RETPOLINE" : +- thunk == THUNK_LFENCE ? "LFENCE" : +- thunk == THUNK_JMP ? "JMP" : "?", ++ printk(" Xen settings: %s%sSPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n", ++ thunk != THUNK_NONE ? "BTI-Thunk: " : "", ++ thunk == THUNK_NONE ? "" : ++ thunk == THUNK_RETPOLINE ? "RETPOLINE, " : ++ thunk == THUNK_LFENCE ? "LFENCE, " : ++ thunk == THUNK_JMP ? "JMP, " : "?, ", + (!boot_cpu_has(X86_FEATURE_IBRSB) && + !boot_cpu_has(X86_FEATURE_IBRS)) ? "No" : + (default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-", +-- +2.44.0 + + +From 5382a6a79cb544f2eecc47330b531802f8c52977 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 5 Mar 2024 11:52:57 +0100 +Subject: [PATCH 48/70] xen/livepatch: register livepatch regions when loaded +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Currently livepatch regions are registered as virtual regions only after the +livepatch has been applied. + +This can lead to issues when using the pre-apply or post-revert hooks, as at +that point the livepatch is not in the virtual regions list. If a livepatch +pre-apply hook contains a WARN() it would trigger an hypervisor crash, as the +code to handle the bug frame won't be able to find the instruction pointer that +triggered the #UD in any of the registered virtual regions, and hence crash. + +Fix this by adding the livepatch payloads as virtual regions as soon as loaded, +and only remove them once the payload is unloaded. This requires some changes +to the virtual regions code, as the removal of the virtual regions is no longer +done in stop machine context, and hence an RCU barrier is added in order to +make sure there are no users of the virtual region after it's been removed from +the list. 
+ +Fixes: 8313c864fa95 ('livepatch: Implement pre-|post- apply|revert hooks') +Signed-off-by: Roger Pau Monné +Reviewed-by: Ross Lagerwall +master commit: a57b4074ab39bee78b6c116277f0a9963bd8e687 +master date: 2024-02-28 16:57:25 +0000 +--- + xen/common/livepatch.c | 4 ++-- + xen/common/virtual_region.c | 44 ++++++++++++++----------------------- + 2 files changed, 19 insertions(+), 29 deletions(-) + +diff --git a/xen/common/livepatch.c b/xen/common/livepatch.c +index e635606c10..e1964b841a 100644 +--- a/xen/common/livepatch.c ++++ b/xen/common/livepatch.c +@@ -1071,6 +1071,7 @@ static int build_symbol_table(struct payload *payload, + static void free_payload(struct payload *data) + { + ASSERT(spin_is_locked(&payload_lock)); ++ unregister_virtual_region(&data->region); + list_del(&data->list); + payload_cnt--; + payload_version++; +@@ -1170,6 +1171,7 @@ static int livepatch_upload(struct xen_sysctl_livepatch_upload *upload) + INIT_LIST_HEAD(&data->list); + INIT_LIST_HEAD(&data->applied_list); + ++ register_virtual_region(&data->region); + list_add_tail(&data->list, &payload_list); + payload_cnt++; + payload_version++; +@@ -1386,7 +1388,6 @@ static inline void apply_payload_tail(struct payload *data) + * The applied_list is iterated by the trap code. + */ + list_add_tail_rcu(&data->applied_list, &applied_list); +- register_virtual_region(&data->region); + + data->state = LIVEPATCH_STATE_APPLIED; + } +@@ -1432,7 +1433,6 @@ static inline void revert_payload_tail(struct payload *data) + * The applied_list is iterated by the trap code. + */ + list_del_rcu(&data->applied_list); +- unregister_virtual_region(&data->region); + + data->reverted = true; + data->state = LIVEPATCH_STATE_CHECKED; +diff --git a/xen/common/virtual_region.c b/xen/common/virtual_region.c +index 5f89703f51..9f12c30efe 100644 +--- a/xen/common/virtual_region.c ++++ b/xen/common/virtual_region.c +@@ -23,14 +23,8 @@ static struct virtual_region core_init __initdata = { + }; + + /* +- * RCU locking. 
Additions are done either at startup (when there is only +- * one CPU) or when all CPUs are running without IRQs. +- * +- * Deletions are bit tricky. We do it when Live Patch (all CPUs running +- * without IRQs) or during bootup (when clearing the init). +- * +- * Hence we use list_del_rcu (which sports an memory fence) and a spinlock +- * on deletion. ++ * RCU locking. Modifications to the list must be done in exclusive mode, and ++ * hence need to hold the spinlock. + * + * All readers of virtual_region_list MUST use list_for_each_entry_rcu. + */ +@@ -58,41 +52,36 @@ const struct virtual_region *find_text_region(unsigned long addr) + + void register_virtual_region(struct virtual_region *r) + { +- ASSERT(!local_irq_is_enabled()); ++ unsigned long flags; + ++ spin_lock_irqsave(&virtual_region_lock, flags); + list_add_tail_rcu(&r->list, &virtual_region_list); ++ spin_unlock_irqrestore(&virtual_region_lock, flags); + } + +-static void remove_virtual_region(struct virtual_region *r) ++/* ++ * Suggest inline so when !CONFIG_LIVEPATCH the function is not left ++ * unreachable after init code is removed. ++ */ ++static void inline remove_virtual_region(struct virtual_region *r) + { + unsigned long flags; + + spin_lock_irqsave(&virtual_region_lock, flags); + list_del_rcu(&r->list); + spin_unlock_irqrestore(&virtual_region_lock, flags); +- /* +- * We do not need to invoke call_rcu. +- * +- * This is due to the fact that on the deletion we have made sure +- * to use spinlocks (to guard against somebody else calling +- * unregister_virtual_region) and list_deletion spiced with +- * memory barrier. +- * +- * That protects us from corrupting the list as the readers all +- * use list_for_each_entry_rcu which is safe against concurrent +- * deletions. +- */ + } + ++#ifdef CONFIG_LIVEPATCH + void unregister_virtual_region(struct virtual_region *r) + { +- /* Expected to be called from Live Patch - which has IRQs disabled. 
*/ +- ASSERT(!local_irq_is_enabled()); +- + remove_virtual_region(r); ++ ++ /* Assert that no CPU might be using the removed region. */ ++ rcu_barrier(); + } + +-#if defined(CONFIG_LIVEPATCH) && defined(CONFIG_X86) ++#ifdef CONFIG_X86 + void relax_virtual_region_perms(void) + { + const struct virtual_region *region; +@@ -116,7 +105,8 @@ void tighten_virtual_region_perms(void) + PAGE_HYPERVISOR_RX); + rcu_read_unlock(&rcu_virtual_region_lock); + } +-#endif ++#endif /* CONFIG_X86 */ ++#endif /* CONFIG_LIVEPATCH */ + + void __init unregister_init_virtual_region(void) + { +-- +2.44.0 + + +From 50a8f74df76b7ce7c35ad97a539f505eb0a9baa6 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 5 Mar 2024 11:53:05 +0100 +Subject: [PATCH 49/70] xen/livepatch: search for symbols in all loaded + payloads +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +When checking if an address belongs to a patch, or when resolving a symbol, +take into account all loaded livepatch payloads, even if not applied. + +This is required in order for the pre-apply and post-revert hooks to work +properly, or else Xen won't detect the instruction pointer belonging to those +hooks as being part of the currently active text. + +Move the RCU handling to be used for payload_list instead of applied_list, as +now the calls from trap code will iterate over the payload_list. 
+ +Fixes: 8313c864fa95 ('livepatch: Implement pre-|post- apply|revert hooks') +Signed-off-by: Roger Pau Monné +Reviewed-by: Ross Lagerwall +master commit: d2daa40fb3ddb8f83e238e57854bd878924cde90 +master date: 2024-02-28 16:57:25 +0000 +--- + xen/common/livepatch.c | 49 +++++++++++++++--------------------------- + 1 file changed, 17 insertions(+), 32 deletions(-) + +diff --git a/xen/common/livepatch.c b/xen/common/livepatch.c +index e1964b841a..135c47e9b8 100644 +--- a/xen/common/livepatch.c ++++ b/xen/common/livepatch.c +@@ -36,13 +36,14 @@ + * caller in schedule_work. + */ + static DEFINE_SPINLOCK(payload_lock); +-static LIST_HEAD(payload_list); +- + /* +- * Patches which have been applied. Need RCU in case we crash (and then +- * traps code would iterate via applied_list) when adding entries on the list. ++ * Need RCU in case we crash (and then traps code would iterate via ++ * payload_list) when adding entries on the list. + */ +-static DEFINE_RCU_READ_LOCK(rcu_applied_lock); ++static DEFINE_RCU_READ_LOCK(rcu_payload_lock); ++static LIST_HEAD(payload_list); ++ ++/* Patches which have been applied. Only modified from stop machine context. */ + static LIST_HEAD(applied_list); + + static unsigned int payload_cnt; +@@ -111,12 +112,8 @@ bool_t is_patch(const void *ptr) + const struct payload *data; + bool_t r = 0; + +- /* +- * Only RCU locking since this list is only ever changed during apply +- * or revert context. And in case it dies there we need an safe list. 
+- */ +- rcu_read_lock(&rcu_applied_lock); +- list_for_each_entry_rcu ( data, &applied_list, applied_list ) ++ rcu_read_lock(&rcu_payload_lock); ++ list_for_each_entry_rcu ( data, &payload_list, list ) + { + if ( (ptr >= data->rw_addr && + ptr < (data->rw_addr + data->rw_size)) || +@@ -130,7 +127,7 @@ bool_t is_patch(const void *ptr) + } + + } +- rcu_read_unlock(&rcu_applied_lock); ++ rcu_read_unlock(&rcu_payload_lock); + + return r; + } +@@ -166,12 +163,8 @@ static const char *cf_check livepatch_symbols_lookup( + const void *va = (const void *)addr; + const char *n = NULL; + +- /* +- * Only RCU locking since this list is only ever changed during apply +- * or revert context. And in case it dies there we need an safe list. +- */ +- rcu_read_lock(&rcu_applied_lock); +- list_for_each_entry_rcu ( data, &applied_list, applied_list ) ++ rcu_read_lock(&rcu_payload_lock); ++ list_for_each_entry_rcu ( data, &payload_list, list ) + { + if ( va < data->text_addr || + va >= (data->text_addr + data->text_size) ) +@@ -200,7 +193,7 @@ static const char *cf_check livepatch_symbols_lookup( + n = data->symtab[best].name; + break; + } +- rcu_read_unlock(&rcu_applied_lock); ++ rcu_read_unlock(&rcu_payload_lock); + + return n; + } +@@ -1072,7 +1065,8 @@ static void free_payload(struct payload *data) + { + ASSERT(spin_is_locked(&payload_lock)); + unregister_virtual_region(&data->region); +- list_del(&data->list); ++ list_del_rcu(&data->list); ++ rcu_barrier(); + payload_cnt--; + payload_version++; + free_payload_data(data); +@@ -1172,7 +1166,7 @@ static int livepatch_upload(struct xen_sysctl_livepatch_upload *upload) + INIT_LIST_HEAD(&data->applied_list); + + register_virtual_region(&data->region); +- list_add_tail(&data->list, &payload_list); ++ list_add_tail_rcu(&data->list, &payload_list); + payload_cnt++; + payload_version++; + } +@@ -1383,11 +1377,7 @@ static int apply_payload(struct payload *data) + + static inline void apply_payload_tail(struct payload *data) + { +- /* +- * We 
need RCU variant (which has barriers) in case we crash here. +- * The applied_list is iterated by the trap code. +- */ +- list_add_tail_rcu(&data->applied_list, &applied_list); ++ list_add_tail(&data->applied_list, &applied_list); + + data->state = LIVEPATCH_STATE_APPLIED; + } +@@ -1427,12 +1417,7 @@ static int revert_payload(struct payload *data) + + static inline void revert_payload_tail(struct payload *data) + { +- +- /* +- * We need RCU variant (which has barriers) in case we crash here. +- * The applied_list is iterated by the trap code. +- */ +- list_del_rcu(&data->applied_list); ++ list_del(&data->applied_list); + + data->reverted = true; + data->state = LIVEPATCH_STATE_CHECKED; +-- +2.44.0 + + +From d81bfc7ff887426727504086fa363f91bf8c19f8 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 5 Mar 2024 11:53:13 +0100 +Subject: [PATCH 50/70] xen/livepatch: fix norevert test attempt to open-code + revert +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The purpose of the norevert test is to install a dummy handler that replaces +the internal Xen revert code, and then perform the revert in the post-revert +hook. For that purpose the usage of the previous common_livepatch_revert() is +not enough, as that just reverts specific functions, but not the whole state of +the payload. + +Remove both common_livepatch_{apply,revert}() and instead expose +revert_payload{,_tail}() in order to perform the patch revert from the +post-revert hook. 
+ +Fixes: 6047104c3ccc ('livepatch: Add per-function applied/reverted state tracking marker') +Signed-off-by: Roger Pau Monné +Reviewed-by: Ross Lagerwall +master commit: cdae267ce10d04d71d1687b5701ff2911a96b6dc +master date: 2024-02-28 16:57:25 +0000 +--- + xen/common/livepatch.c | 41 +++++++++++++++++-- + xen/include/xen/livepatch.h | 32 ++------------- + .../livepatch/xen_action_hooks_norevert.c | 22 +++------- + 3 files changed, 46 insertions(+), 49 deletions(-) + +diff --git a/xen/common/livepatch.c b/xen/common/livepatch.c +index 135c47e9b8..0cc048fd83 100644 +--- a/xen/common/livepatch.c ++++ b/xen/common/livepatch.c +@@ -1366,7 +1366,22 @@ static int apply_payload(struct payload *data) + ASSERT(!local_irq_is_enabled()); + + for ( i = 0; i < data->nfuncs; i++ ) +- common_livepatch_apply(&data->funcs[i], &data->fstate[i]); ++ { ++ const struct livepatch_func *func = &data->funcs[i]; ++ struct livepatch_fstate *state = &data->fstate[i]; ++ ++ /* If the action has been already executed on this function, do nothing. */ ++ if ( state->applied == LIVEPATCH_FUNC_APPLIED ) ++ { ++ printk(XENLOG_WARNING LIVEPATCH ++ "%s: %s has been already applied before\n", ++ __func__, func->name); ++ continue; ++ } ++ ++ arch_livepatch_apply(func, state); ++ state->applied = LIVEPATCH_FUNC_APPLIED; ++ } + + arch_livepatch_revive(); + +@@ -1382,7 +1397,7 @@ static inline void apply_payload_tail(struct payload *data) + data->state = LIVEPATCH_STATE_APPLIED; + } + +-static int revert_payload(struct payload *data) ++int revert_payload(struct payload *data) + { + unsigned int i; + int rc; +@@ -1397,7 +1412,25 @@ static int revert_payload(struct payload *data) + } + + for ( i = 0; i < data->nfuncs; i++ ) +- common_livepatch_revert(&data->funcs[i], &data->fstate[i]); ++ { ++ const struct livepatch_func *func = &data->funcs[i]; ++ struct livepatch_fstate *state = &data->fstate[i]; ++ ++ /* ++ * If the apply action hasn't been executed on this function, do ++ * nothing. 
++ */ ++ if ( !func->old_addr || state->applied == LIVEPATCH_FUNC_NOT_APPLIED ) ++ { ++ printk(XENLOG_WARNING LIVEPATCH ++ "%s: %s has not been applied before\n", ++ __func__, func->name); ++ continue; ++ } ++ ++ arch_livepatch_revert(func, state); ++ state->applied = LIVEPATCH_FUNC_NOT_APPLIED; ++ } + + /* + * Since we are running with IRQs disabled and the hooks may call common +@@ -1415,7 +1448,7 @@ static int revert_payload(struct payload *data) + return 0; + } + +-static inline void revert_payload_tail(struct payload *data) ++void revert_payload_tail(struct payload *data) + { + list_del(&data->applied_list); + +diff --git a/xen/include/xen/livepatch.h b/xen/include/xen/livepatch.h +index 537d3d58b6..c9ee58fd37 100644 +--- a/xen/include/xen/livepatch.h ++++ b/xen/include/xen/livepatch.h +@@ -136,35 +136,11 @@ void arch_livepatch_post_action(void); + void arch_livepatch_mask(void); + void arch_livepatch_unmask(void); + +-static inline void common_livepatch_apply(const struct livepatch_func *func, +- struct livepatch_fstate *state) +-{ +- /* If the action has been already executed on this function, do nothing. */ +- if ( state->applied == LIVEPATCH_FUNC_APPLIED ) +- { +- printk(XENLOG_WARNING LIVEPATCH "%s: %s has been already applied before\n", +- __func__, func->name); +- return; +- } +- +- arch_livepatch_apply(func, state); +- state->applied = LIVEPATCH_FUNC_APPLIED; +-} ++/* Only for testing purposes. */ ++struct payload; ++int revert_payload(struct payload *data); ++void revert_payload_tail(struct payload *data); + +-static inline void common_livepatch_revert(const struct livepatch_func *func, +- struct livepatch_fstate *state) +-{ +- /* If the apply action hasn't been executed on this function, do nothing. 
*/ +- if ( !func->old_addr || state->applied == LIVEPATCH_FUNC_NOT_APPLIED ) +- { +- printk(XENLOG_WARNING LIVEPATCH "%s: %s has not been applied before\n", +- __func__, func->name); +- return; +- } +- +- arch_livepatch_revert(func, state); +- state->applied = LIVEPATCH_FUNC_NOT_APPLIED; +-} + #else + + /* +diff --git a/xen/test/livepatch/xen_action_hooks_norevert.c b/xen/test/livepatch/xen_action_hooks_norevert.c +index c173855192..c5fbab1746 100644 +--- a/xen/test/livepatch/xen_action_hooks_norevert.c ++++ b/xen/test/livepatch/xen_action_hooks_norevert.c +@@ -96,26 +96,14 @@ static int revert_hook(livepatch_payload_t *payload) + + static void post_revert_hook(livepatch_payload_t *payload) + { +- int i; ++ unsigned long flags; + + printk(KERN_DEBUG "%s: Hook starting.\n", __func__); + +- for (i = 0; i < payload->nfuncs; i++) +- { +- const struct livepatch_func *func = &payload->funcs[i]; +- struct livepatch_fstate *fstate = &payload->fstate[i]; +- +- BUG_ON(revert_cnt != 1); +- BUG_ON(fstate->applied != LIVEPATCH_FUNC_APPLIED); +- +- /* Outside of quiesce zone: MAY TRIGGER HOST CRASH/UNDEFINED BEHAVIOR */ +- arch_livepatch_quiesce(); +- common_livepatch_revert(payload); +- arch_livepatch_revive(); +- BUG_ON(fstate->applied == LIVEPATCH_FUNC_APPLIED); +- +- printk(KERN_DEBUG "%s: post reverted: %s\n", __func__, func->name); +- } ++ local_irq_save(flags); ++ BUG_ON(revert_payload(payload)); ++ revert_payload_tail(payload); ++ local_irq_restore(flags); + + printk(KERN_DEBUG "%s: Hook done.\n", __func__); + } +-- +2.44.0 + + +From e9516b73e7d499684092c1d345818585403cf190 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 5 Mar 2024 11:53:22 +0100 +Subject: [PATCH 51/70] xen/livepatch: properly build the noapply and norevert + tests +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +It seems the build variables for those tests where copy-pasted from +xen_action_hooks_marker-objs and not adjusted 
to use the correct source files. + +Fixes: 6047104c3ccc ('livepatch: Add per-function applied/reverted state tracking marker') +Signed-off-by: Roger Pau Monné +Reviewed-by: Ross Lagerwall +master commit: e579677095782c7dec792597ba8b037b7d716b32 +master date: 2024-02-28 16:57:25 +0000 +--- + xen/test/livepatch/Makefile | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/xen/test/livepatch/Makefile b/xen/test/livepatch/Makefile +index c258ab0b59..d987a8367f 100644 +--- a/xen/test/livepatch/Makefile ++++ b/xen/test/livepatch/Makefile +@@ -118,12 +118,12 @@ xen_action_hooks_marker-objs := xen_action_hooks_marker.o xen_hello_world_func.o + $(obj)/xen_action_hooks_noapply.o: $(obj)/config.h + + extra-y += xen_action_hooks_noapply.livepatch +-xen_action_hooks_noapply-objs := xen_action_hooks_marker.o xen_hello_world_func.o note.o xen_note.o ++xen_action_hooks_noapply-objs := xen_action_hooks_noapply.o xen_hello_world_func.o note.o xen_note.o + + $(obj)/xen_action_hooks_norevert.o: $(obj)/config.h + + extra-y += xen_action_hooks_norevert.livepatch +-xen_action_hooks_norevert-objs := xen_action_hooks_marker.o xen_hello_world_func.o note.o xen_note.o ++xen_action_hooks_norevert-objs := xen_action_hooks_norevert.o xen_hello_world_func.o note.o xen_note.o + + EXPECT_BYTES_COUNT := 8 + CODE_GET_EXPECT=$(shell $(OBJDUMP) -d --insn-width=1 $(1) | sed -n -e '/<'$(2)'>:$$/,/^$$/ p' | tail -n +2 | head -n $(EXPECT_BYTES_COUNT) | awk '{$$0=$$2; printf "%s", substr($$0,length-1)}' | sed 's/.\{2\}/0x&,/g' | sed 's/^/{/;s/,$$/}/g') +-- +2.44.0 + + +From 267845a8389d5d34edb2b38a1972f32f51f70b4e Mon Sep 17 00:00:00 2001 +From: Jason Andryuk +Date: Tue, 5 Mar 2024 11:54:12 +0100 +Subject: [PATCH 52/70] libxl: Fix segfault in device_model_spawn_outcome + +libxl__spawn_qdisk_backend() explicitly sets guest_config to NULL when +starting QEMU (the usual launch through libxl__spawn_local_dm() has a +guest_config though). + +Bail early on a NULL guest_config/d_config. 
This skips the QMP queries +for chardevs and VNC, but this xenpv QEMU instance isn't expected to +provide those - only qdisk (or 9pfs backends after an upcoming change). + +Signed-off-by: Jason Andryuk +Acked-by: Anthony PERARD +master commit: d4f3d35f043f6ef29393166b0dd131c8102cf255 +master date: 2024-02-29 08:18:38 +0100 +--- + tools/libs/light/libxl_dm.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/tools/libs/light/libxl_dm.c b/tools/libs/light/libxl_dm.c +index ed620a9d8e..29b43ed20a 100644 +--- a/tools/libs/light/libxl_dm.c ++++ b/tools/libs/light/libxl_dm.c +@@ -3172,8 +3172,8 @@ static void device_model_spawn_outcome(libxl__egc *egc, + + /* Check if spawn failed */ + if (rc) goto out; +- +- if (d_config->b_info.device_model_version ++ /* d_config is NULL for xl devd/libxl__spawn_qemu_xenpv_backend(). */ ++ if (d_config && d_config->b_info.device_model_version + == LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN) { + rc = libxl__ev_time_register_rel(ao, &dmss->timeout, + devise_model_postconfig_timeout, +-- +2.44.0 + + +From 75221fb0f87e4d7278b0a540bc28a6d0b74afeba Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 5 Mar 2024 11:54:33 +0100 +Subject: [PATCH 53/70] x86/altcall: always use a temporary parameter stashing + variable +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The usage in ALT_CALL_ARG() on clang of: + +register union { + typeof(arg) e; + const unsigned long r; +} ... + +When `arg` is the first argument to alternative_{,v}call() and +const_vlapic_vcpu() is used results in clang 3.5.0 complaining with: + +arch/x86/hvm/vlapic.c:141:47: error: non-const static data member must be initialized out of line + alternative_call(hvm_funcs.test_pir, const_vlapic_vcpu(vlapic), vec) ) + +Workaround this by pulling `arg1` into a local variable, like it's done for +further arguments (arg2, arg3...) 
+ +Originally arg1 wasn't pulled into a variable because for the a1_ register +local variable the possible clobbering as a result of operators on other +variables don't matter: + +https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables + +Note clang version 3.8.1 seems to already be fixed and don't require the +workaround, but since it's harmless do it uniformly everywhere. + +Reported-by: Andrew Cooper +Fixes: 2ce562b2a413 ('x86/altcall: use a union as register type for function parameters on clang') +Signed-off-by: Roger Pau Monné +Acked-by: Jan Beulich +master commit: c20850540ad6a32f4fc17bde9b01c92b0df18bf0 +master date: 2024-02-29 08:21:49 +0100 +--- + xen/arch/x86/include/asm/alternative.h | 36 +++++++++++++++++--------- + 1 file changed, 24 insertions(+), 12 deletions(-) + +diff --git a/xen/arch/x86/include/asm/alternative.h b/xen/arch/x86/include/asm/alternative.h +index 3c14db5078..0d3697f1de 100644 +--- a/xen/arch/x86/include/asm/alternative.h ++++ b/xen/arch/x86/include/asm/alternative.h +@@ -253,21 +253,24 @@ extern void alternative_branches(void); + }) + + #define alternative_vcall1(func, arg) ({ \ +- ALT_CALL_ARG(arg, 1); \ ++ typeof(arg) v1_ = (arg); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_NO_ARG2; \ + (void)sizeof(func(arg)); \ + (void)alternative_callN(1, int, func); \ + }) + + #define alternative_call1(func, arg) ({ \ +- ALT_CALL_ARG(arg, 1); \ ++ typeof(arg) v1_ = (arg); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_NO_ARG2; \ + alternative_callN(1, typeof(func(arg)), func); \ + }) + + #define alternative_vcall2(func, arg1, arg2) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ +- ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + ALT_CALL_NO_ARG3; \ + (void)sizeof(func(arg1, arg2)); \ +@@ -275,17 +278,19 @@ extern void alternative_branches(void); + }) + + #define alternative_call2(func, arg1, arg2) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ +- 
ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + ALT_CALL_NO_ARG3; \ + alternative_callN(2, typeof(func(arg1, arg2)), func); \ + }) + + #define alternative_vcall3(func, arg1, arg2, arg3) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ + typeof(arg3) v3_ = (arg3); \ +- ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + ALT_CALL_ARG(v3_, 3); \ + ALT_CALL_NO_ARG4; \ +@@ -294,9 +299,10 @@ extern void alternative_branches(void); + }) + + #define alternative_call3(func, arg1, arg2, arg3) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ + typeof(arg3) v3_ = (arg3); \ +- ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + ALT_CALL_ARG(v3_, 3); \ + ALT_CALL_NO_ARG4; \ +@@ -305,10 +311,11 @@ extern void alternative_branches(void); + }) + + #define alternative_vcall4(func, arg1, arg2, arg3, arg4) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ + typeof(arg3) v3_ = (arg3); \ + typeof(arg4) v4_ = (arg4); \ +- ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + ALT_CALL_ARG(v3_, 3); \ + ALT_CALL_ARG(v4_, 4); \ +@@ -318,10 +325,11 @@ extern void alternative_branches(void); + }) + + #define alternative_call4(func, arg1, arg2, arg3, arg4) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ + typeof(arg3) v3_ = (arg3); \ + typeof(arg4) v4_ = (arg4); \ +- ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + ALT_CALL_ARG(v3_, 3); \ + ALT_CALL_ARG(v4_, 4); \ +@@ -332,11 +340,12 @@ extern void alternative_branches(void); + }) + + #define alternative_vcall5(func, arg1, arg2, arg3, arg4, arg5) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ + typeof(arg3) v3_ = (arg3); \ + typeof(arg4) v4_ = (arg4); \ + typeof(arg5) v5_ = (arg5); \ +- ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + ALT_CALL_ARG(v3_, 3); \ + 
ALT_CALL_ARG(v4_, 4); \ +@@ -347,11 +356,12 @@ extern void alternative_branches(void); + }) + + #define alternative_call5(func, arg1, arg2, arg3, arg4, arg5) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ + typeof(arg3) v3_ = (arg3); \ + typeof(arg4) v4_ = (arg4); \ + typeof(arg5) v5_ = (arg5); \ +- ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + ALT_CALL_ARG(v3_, 3); \ + ALT_CALL_ARG(v4_, 4); \ +@@ -363,12 +373,13 @@ extern void alternative_branches(void); + }) + + #define alternative_vcall6(func, arg1, arg2, arg3, arg4, arg5, arg6) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ + typeof(arg3) v3_ = (arg3); \ + typeof(arg4) v4_ = (arg4); \ + typeof(arg5) v5_ = (arg5); \ + typeof(arg6) v6_ = (arg6); \ +- ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + ALT_CALL_ARG(v3_, 3); \ + ALT_CALL_ARG(v4_, 4); \ +@@ -379,12 +390,13 @@ extern void alternative_branches(void); + }) + + #define alternative_call6(func, arg1, arg2, arg3, arg4, arg5, arg6) ({ \ ++ typeof(arg1) v1_ = (arg1); \ + typeof(arg2) v2_ = (arg2); \ + typeof(arg3) v3_ = (arg3); \ + typeof(arg4) v4_ = (arg4); \ + typeof(arg5) v5_ = (arg5); \ + typeof(arg6) v6_ = (arg6); \ +- ALT_CALL_ARG(arg1, 1); \ ++ ALT_CALL_ARG(v1_, 1); \ + ALT_CALL_ARG(v2_, 2); \ + ALT_CALL_ARG(v3_, 3); \ + ALT_CALL_ARG(v4_, 4); \ +-- +2.44.0 + + +From fd7cb7a1d0433049d8fc59444d0e91b71728763e Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 5 Mar 2024 11:55:17 +0100 +Subject: [PATCH 54/70] x86/cpu-policy: Allow for levelling of VERW side + effects +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +MD_CLEAR and FB_CLEAR need OR-ing across a migrate pool. Allow this, by +having them unconditinally set in max, with the host values reflected in +default. Annotate the bits as having special properies. 
+ +Signed-off-by: Andrew Cooper +Reviewed-by: Roger Pau Monné +master commit: de17162cafd27f2865a3102a2ec0f386a02ed03d +master date: 2024-03-01 20:14:19 +0000 +--- + xen/arch/x86/cpu-policy.c | 24 +++++++++++++++++++++ + xen/arch/x86/include/asm/cpufeature.h | 1 + + xen/include/public/arch-x86/cpufeatureset.h | 4 ++-- + 3 files changed, 27 insertions(+), 2 deletions(-) + +diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c +index bcb17b7ce3..c7c5e99b7b 100644 +--- a/xen/arch/x86/cpu-policy.c ++++ b/xen/arch/x86/cpu-policy.c +@@ -442,6 +442,16 @@ static void __init guest_common_max_feature_adjustments(uint32_t *fs) + __set_bit(X86_FEATURE_RSBA, fs); + __set_bit(X86_FEATURE_RRSBA, fs); + ++ /* ++ * These bits indicate that the VERW instruction may have gained ++ * scrubbing side effects. With pooling, they mean "you might migrate ++ * somewhere where scrubbing is necessary", and may need exposing on ++ * unaffected hardware. This is fine, because the VERW instruction ++ * has been around since the 286. ++ */ ++ __set_bit(X86_FEATURE_MD_CLEAR, fs); ++ __set_bit(X86_FEATURE_FB_CLEAR, fs); ++ + /* + * The Gather Data Sampling microcode mitigation (August 2023) has an + * adverse performance impact on the CLWB instruction on SKX/CLX/CPX. +@@ -476,6 +486,20 @@ static void __init guest_common_default_feature_adjustments(uint32_t *fs) + cpu_has_rdrand && !is_forced_cpu_cap(X86_FEATURE_RDRAND) ) + __clear_bit(X86_FEATURE_RDRAND, fs); + ++ /* ++ * These bits indicate that the VERW instruction may have gained ++ * scrubbing side effects. The max policy has them set for migration ++ * reasons, so reset the default policy back to the host values in ++ * case we're unaffected. 
++ */ ++ __clear_bit(X86_FEATURE_MD_CLEAR, fs); ++ if ( cpu_has_md_clear ) ++ __set_bit(X86_FEATURE_MD_CLEAR, fs); ++ ++ __clear_bit(X86_FEATURE_FB_CLEAR, fs); ++ if ( cpu_has_fb_clear ) ++ __set_bit(X86_FEATURE_FB_CLEAR, fs); ++ + /* + * The Gather Data Sampling microcode mitigation (August 2023) has an + * adverse performance impact on the CLWB instruction on SKX/CLX/CPX. +diff --git a/xen/arch/x86/include/asm/cpufeature.h b/xen/arch/x86/include/asm/cpufeature.h +index 06e1dd7f33..76ef2aeb1d 100644 +--- a/xen/arch/x86/include/asm/cpufeature.h ++++ b/xen/arch/x86/include/asm/cpufeature.h +@@ -177,6 +177,7 @@ static inline bool boot_cpu_has(unsigned int feat) + #define cpu_has_avx512_4fmaps boot_cpu_has(X86_FEATURE_AVX512_4FMAPS) + #define cpu_has_avx512_vp2intersect boot_cpu_has(X86_FEATURE_AVX512_VP2INTERSECT) + #define cpu_has_srbds_ctrl boot_cpu_has(X86_FEATURE_SRBDS_CTRL) ++#define cpu_has_md_clear boot_cpu_has(X86_FEATURE_MD_CLEAR) + #define cpu_has_rtm_always_abort boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) + #define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT) + #define cpu_has_serialize boot_cpu_has(X86_FEATURE_SERIALIZE) +diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h +index 6b6ce2745c..337aaa9c77 100644 +--- a/xen/include/public/arch-x86/cpufeatureset.h ++++ b/xen/include/public/arch-x86/cpufeatureset.h +@@ -262,7 +262,7 @@ XEN_CPUFEATURE(AVX512_4FMAPS, 9*32+ 3) /*A AVX512 Multiply Accumulation Single + XEN_CPUFEATURE(FSRM, 9*32+ 4) /*A Fast Short REP MOVS */ + XEN_CPUFEATURE(AVX512_VP2INTERSECT, 9*32+8) /*a VP2INTERSECT{D,Q} insns */ + XEN_CPUFEATURE(SRBDS_CTRL, 9*32+ 9) /* MSR_MCU_OPT_CTRL and RNGDS_MITG_DIS. */ +-XEN_CPUFEATURE(MD_CLEAR, 9*32+10) /*A VERW clears microarchitectural buffers */ ++XEN_CPUFEATURE(MD_CLEAR, 9*32+10) /*!A VERW clears microarchitectural buffers */ + XEN_CPUFEATURE(RTM_ALWAYS_ABORT, 9*32+11) /*! June 2021 TSX defeaturing in microcode. 
*/ + XEN_CPUFEATURE(TSX_FORCE_ABORT, 9*32+13) /* MSR_TSX_FORCE_ABORT.RTM_ABORT */ + XEN_CPUFEATURE(SERIALIZE, 9*32+14) /*A SERIALIZE insn */ +@@ -329,7 +329,7 @@ XEN_CPUFEATURE(DOITM, 16*32+12) /* Data Operand Invariant Timing + XEN_CPUFEATURE(SBDR_SSDP_NO, 16*32+13) /*A No Shared Buffer Data Read or Sideband Stale Data Propagation */ + XEN_CPUFEATURE(FBSDP_NO, 16*32+14) /*A No Fill Buffer Stale Data Propagation */ + XEN_CPUFEATURE(PSDP_NO, 16*32+15) /*A No Primary Stale Data Propagation */ +-XEN_CPUFEATURE(FB_CLEAR, 16*32+17) /*A Fill Buffers cleared by VERW */ ++XEN_CPUFEATURE(FB_CLEAR, 16*32+17) /*!A Fill Buffers cleared by VERW */ + XEN_CPUFEATURE(FB_CLEAR_CTRL, 16*32+18) /* MSR_OPT_CPU_CTRL.FB_CLEAR_DIS */ + XEN_CPUFEATURE(RRSBA, 16*32+19) /*! Restricted RSB Alternative */ + XEN_CPUFEATURE(BHI_NO, 16*32+20) /*A No Branch History Injection */ +-- +2.44.0 + + +From 4c84fa6cb66fe66f2c5dad65208c497558ab7d17 Mon Sep 17 00:00:00 2001 +From: Jan Beulich +Date: Tue, 12 Mar 2024 12:06:57 +0100 +Subject: [PATCH 55/70] hvmloader/PCI: skip huge BARs in certain calculations +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +BARs of size 2Gb and up can't possibly fit below 4Gb: Both the bottom of +the lower 2Gb range and the top of the higher 2Gb range have special +purpose. Don't even have them influence whether to (perhaps) relocate +low RAM. 
+ +Reported-by: Neowutran +Signed-off-by: Jan Beulich +Acked-by: Roger Pau Monné +master commit: 57acad12a09ffa490e870ebe17596aad858f0191 +master date: 2024-03-06 10:19:29 +0100 +--- + tools/firmware/hvmloader/pci.c | 28 ++++++++++++++++++++-------- + 1 file changed, 20 insertions(+), 8 deletions(-) + +diff --git a/tools/firmware/hvmloader/pci.c b/tools/firmware/hvmloader/pci.c +index 257a6feb61..c3c61ca060 100644 +--- a/tools/firmware/hvmloader/pci.c ++++ b/tools/firmware/hvmloader/pci.c +@@ -33,6 +33,13 @@ uint32_t pci_mem_start = HVM_BELOW_4G_MMIO_START; + const uint32_t pci_mem_end = RESERVED_MEMBASE; + uint64_t pci_hi_mem_start = 0, pci_hi_mem_end = 0; + ++/* ++ * BARs larger than this value are put in 64-bit space unconditionally. That ++ * is, such BARs also don't play into the determination of how big the lowmem ++ * MMIO hole needs to be. ++ */ ++#define BAR_RELOC_THRESH GB(1) ++ + enum virtual_vga virtual_vga = VGA_none; + unsigned long igd_opregion_pgbase = 0; + +@@ -286,9 +293,11 @@ void pci_setup(void) + bars[i].bar_reg = bar_reg; + bars[i].bar_sz = bar_sz; + +- if ( ((bar_data & PCI_BASE_ADDRESS_SPACE) == +- PCI_BASE_ADDRESS_SPACE_MEMORY) || +- (bar_reg == PCI_ROM_ADDRESS) ) ++ if ( is_64bar && bar_sz > BAR_RELOC_THRESH ) ++ bar64_relocate = 1; ++ else if ( ((bar_data & PCI_BASE_ADDRESS_SPACE) == ++ PCI_BASE_ADDRESS_SPACE_MEMORY) || ++ (bar_reg == PCI_ROM_ADDRESS) ) + mmio_total += bar_sz; + + nr_bars++; +@@ -367,7 +376,7 @@ void pci_setup(void) + pci_mem_start = hvm_info->low_mem_pgend << PAGE_SHIFT; + } + +- if ( mmio_total > (pci_mem_end - pci_mem_start) ) ++ if ( mmio_total > (pci_mem_end - pci_mem_start) || bar64_relocate ) + { + printf("Low MMIO hole not large enough for all devices," + " relocating some BARs to 64-bit\n"); +@@ -430,7 +439,8 @@ void pci_setup(void) + + /* + * Relocate to high memory if the total amount of MMIO needed +- * is more than the low MMIO available. 
Because devices are ++ * is more than the low MMIO available or BARs bigger than ++ * BAR_RELOC_THRESH are present. Because devices are + * processed in order of bar_sz, this will preferentially + * relocate larger devices to high memory first. + * +@@ -446,8 +456,9 @@ void pci_setup(void) + * the code here assumes it to be.) + * Should either of those two conditions change, this code will break. + */ +- using_64bar = bars[i].is_64bar && bar64_relocate +- && (mmio_total > (mem_resource.max - mem_resource.base)); ++ using_64bar = bars[i].is_64bar && bar64_relocate && ++ (mmio_total > (mem_resource.max - mem_resource.base) || ++ bar_sz > BAR_RELOC_THRESH); + bar_data = pci_readl(devfn, bar_reg); + + if ( (bar_data & PCI_BASE_ADDRESS_SPACE) == +@@ -467,7 +478,8 @@ void pci_setup(void) + resource = &mem_resource; + bar_data &= ~PCI_BASE_ADDRESS_MEM_MASK; + } +- mmio_total -= bar_sz; ++ if ( bar_sz <= BAR_RELOC_THRESH ) ++ mmio_total -= bar_sz; + } + else + { +-- +2.44.0 + + +From a96d2d4355d85fc82abd0a3799978db04ee8cff3 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 12 Mar 2024 12:07:07 +0100 +Subject: [PATCH 56/70] x86/mm: fix detection of last L1 entry in + modify_xen_mappings_lite() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The current logic to detect when to switch to the next L1 table is incorrectly +using l2_table_offset() in order to notice when the last entry on the current +L1 table has been reached. + +It should instead use l1_table_offset() to check whether the index has wrapped +to point to the first entry, and so the next L1 table should be used. 
+ +Fixes: 8676092a0f16 ('x86/livepatch: Fix livepatch application when CET is active') +Signed-off-by: Roger Pau Monné +Reviewed-by: Andrew Cooper +master commit: 7c81558208de7858251b62f168a449be84305595 +master date: 2024-03-11 11:09:42 +0000 +--- + xen/arch/x86/mm.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 39544bd9f9..ab0acbfea6 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -5947,7 +5947,7 @@ void init_or_livepatch modify_xen_mappings_lite( + + v += 1UL << L1_PAGETABLE_SHIFT; + +- if ( l2_table_offset(v) == 0 ) ++ if ( l1_table_offset(v) == 0 ) + break; + } + +-- +2.44.0 + + +From fe1869a569bab56e44c35d1522ee064bab6286da Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Sat, 27 Jan 2024 17:52:09 +0000 +Subject: [PATCH 57/70] x86/entry: Introduce EFRAME_* constants + +restore_all_guest() does a lot of manipulation of the stack after popping the +GPRs, and uses raw %rsp displacements to do so. Also, almost all entrypaths +use raw %rsp displacements prior to pushing GPRs. + +Provide better mnemonics, to aid readability and reduce the chance of errors +when editing. + +No functional change. The resulting binary is identical. 
+ +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +(cherry picked from commit 37541208f119a9c552c6c6c3246ea61be0d44035) +--- + xen/arch/x86/x86_64/asm-offsets.c | 17 ++++++++ + xen/arch/x86/x86_64/compat/entry.S | 2 +- + xen/arch/x86/x86_64/entry.S | 70 +++++++++++++++--------------- + 3 files changed, 53 insertions(+), 36 deletions(-) + +diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c +index 57b73a4e62..2fc4d9130a 100644 +--- a/xen/arch/x86/x86_64/asm-offsets.c ++++ b/xen/arch/x86/x86_64/asm-offsets.c +@@ -51,6 +51,23 @@ void __dummy__(void) + OFFSET(UREGS_kernel_sizeof, struct cpu_user_regs, es); + BLANK(); + ++ /* ++ * EFRAME_* is for the entry/exit logic where %rsp is pointing at ++ * UREGS_error_code and GPRs are still/already guest values. ++ */ ++#define OFFSET_EF(sym, mem) \ ++ DEFINE(sym, offsetof(struct cpu_user_regs, mem) - \ ++ offsetof(struct cpu_user_regs, error_code)) ++ ++ OFFSET_EF(EFRAME_entry_vector, entry_vector); ++ OFFSET_EF(EFRAME_rip, rip); ++ OFFSET_EF(EFRAME_cs, cs); ++ OFFSET_EF(EFRAME_eflags, eflags); ++ OFFSET_EF(EFRAME_rsp, rsp); ++ BLANK(); ++ ++#undef OFFSET_EF ++ + OFFSET(VCPU_processor, struct vcpu, processor); + OFFSET(VCPU_domain, struct vcpu, domain); + OFFSET(VCPU_vcpu_info, struct vcpu, vcpu_info_area.map); +diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S +index fcc3a721f1..cb473f08ee 100644 +--- a/xen/arch/x86/x86_64/compat/entry.S ++++ b/xen/arch/x86/x86_64/compat/entry.S +@@ -15,7 +15,7 @@ ENTRY(entry_int82) + ENDBR64 + ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP + pushq $0 +- movl $HYPERCALL_VECTOR, 4(%rsp) ++ movl $HYPERCALL_VECTOR, EFRAME_entry_vector(%rsp) + SAVE_ALL compat=1 /* DPL1 gate, restricted to 32bit PV guests only. 
*/ + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ +diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S +index 9a7b129aa7..968da9d727 100644 +--- a/xen/arch/x86/x86_64/entry.S ++++ b/xen/arch/x86/x86_64/entry.S +@@ -190,15 +190,15 @@ restore_all_guest: + SPEC_CTRL_EXIT_TO_PV /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ + + RESTORE_ALL +- testw $TRAP_syscall,4(%rsp) ++ testw $TRAP_syscall, EFRAME_entry_vector(%rsp) + jz iret_exit_to_guest + +- movq 24(%rsp),%r11 # RFLAGS ++ mov EFRAME_eflags(%rsp), %r11 + andq $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), %r11 + orq $X86_EFLAGS_IF,%r11 + + /* Don't use SYSRET path if the return address is not canonical. */ +- movq 8(%rsp),%rcx ++ mov EFRAME_rip(%rsp), %rcx + sarq $47,%rcx + incl %ecx + cmpl $1,%ecx +@@ -213,20 +213,20 @@ restore_all_guest: + ALTERNATIVE "", rag_clrssbsy, X86_FEATURE_XEN_SHSTK + #endif + +- movq 8(%rsp), %rcx # RIP +- cmpw $FLAT_USER_CS32,16(%rsp)# CS +- movq 32(%rsp),%rsp # RSP ++ mov EFRAME_rip(%rsp), %rcx ++ cmpw $FLAT_USER_CS32, EFRAME_cs(%rsp) ++ mov EFRAME_rsp(%rsp), %rsp + je 1f + sysretq + 1: sysretl + + ALIGN + .Lrestore_rcx_iret_exit_to_guest: +- movq 8(%rsp), %rcx # RIP ++ mov EFRAME_rip(%rsp), %rcx + /* No special register assumptions. 
*/ + iret_exit_to_guest: +- andl $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), 24(%rsp) +- orl $X86_EFLAGS_IF,24(%rsp) ++ andl $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), EFRAME_eflags(%rsp) ++ orl $X86_EFLAGS_IF, EFRAME_eflags(%rsp) + addq $8,%rsp + .Lft0: iretq + _ASM_PRE_EXTABLE(.Lft0, handle_exception) +@@ -257,7 +257,7 @@ ENTRY(lstar_enter) + pushq $FLAT_KERNEL_CS64 + pushq %rcx + pushq $0 +- movl $TRAP_syscall, 4(%rsp) ++ movl $TRAP_syscall, EFRAME_entry_vector(%rsp) + SAVE_ALL + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ +@@ -294,7 +294,7 @@ ENTRY(cstar_enter) + pushq $FLAT_USER_CS32 + pushq %rcx + pushq $0 +- movl $TRAP_syscall, 4(%rsp) ++ movl $TRAP_syscall, EFRAME_entry_vector(%rsp) + SAVE_ALL + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ +@@ -335,7 +335,7 @@ GLOBAL(sysenter_eflags_saved) + pushq $3 /* ring 3 null cs */ + pushq $0 /* null rip */ + pushq $0 +- movl $TRAP_syscall, 4(%rsp) ++ movl $TRAP_syscall, EFRAME_entry_vector(%rsp) + SAVE_ALL + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ +@@ -389,7 +389,7 @@ ENTRY(int80_direct_trap) + ENDBR64 + ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP + pushq $0 +- movl $0x80, 4(%rsp) ++ movl $0x80, EFRAME_entry_vector(%rsp) + SAVE_ALL + + SPEC_CTRL_ENTRY_FROM_PV /* Req: %rsp=regs/cpuinfo, %rdx=0, Clob: acd */ +@@ -649,7 +649,7 @@ ret_from_intr: + .section .init.text, "ax", @progbits + ENTRY(early_page_fault) + ENDBR64 +- movl $X86_EXC_PF, 4(%rsp) ++ movl $X86_EXC_PF, EFRAME_entry_vector(%rsp) + SAVE_ALL + movq %rsp, %rdi + call do_early_page_fault +@@ -716,7 +716,7 @@ ENTRY(common_interrupt) + + ENTRY(entry_PF) + ENDBR64 +- movl $X86_EXC_PF, 4(%rsp) ++ movl $X86_EXC_PF, EFRAME_entry_vector(%rsp) + /* No special register assumptions. 
*/ + GLOBAL(handle_exception) + ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP +@@ -890,90 +890,90 @@ FATAL_exception_with_ints_disabled: + ENTRY(entry_DE) + ENDBR64 + pushq $0 +- movl $X86_EXC_DE, 4(%rsp) ++ movl $X86_EXC_DE, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(entry_MF) + ENDBR64 + pushq $0 +- movl $X86_EXC_MF, 4(%rsp) ++ movl $X86_EXC_MF, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(entry_XM) + ENDBR64 + pushq $0 +- movl $X86_EXC_XM, 4(%rsp) ++ movl $X86_EXC_XM, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(entry_NM) + ENDBR64 + pushq $0 +- movl $X86_EXC_NM, 4(%rsp) ++ movl $X86_EXC_NM, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(entry_DB) + ENDBR64 + pushq $0 +- movl $X86_EXC_DB, 4(%rsp) ++ movl $X86_EXC_DB, EFRAME_entry_vector(%rsp) + jmp handle_ist_exception + + ENTRY(entry_BP) + ENDBR64 + pushq $0 +- movl $X86_EXC_BP, 4(%rsp) ++ movl $X86_EXC_BP, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(entry_OF) + ENDBR64 + pushq $0 +- movl $X86_EXC_OF, 4(%rsp) ++ movl $X86_EXC_OF, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(entry_BR) + ENDBR64 + pushq $0 +- movl $X86_EXC_BR, 4(%rsp) ++ movl $X86_EXC_BR, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(entry_UD) + ENDBR64 + pushq $0 +- movl $X86_EXC_UD, 4(%rsp) ++ movl $X86_EXC_UD, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(entry_TS) + ENDBR64 +- movl $X86_EXC_TS, 4(%rsp) ++ movl $X86_EXC_TS, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(entry_NP) + ENDBR64 +- movl $X86_EXC_NP, 4(%rsp) ++ movl $X86_EXC_NP, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(entry_SS) + ENDBR64 +- movl $X86_EXC_SS, 4(%rsp) ++ movl $X86_EXC_SS, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(entry_GP) + ENDBR64 +- movl $X86_EXC_GP, 4(%rsp) ++ movl $X86_EXC_GP, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(entry_AC) + ENDBR64 +- movl $X86_EXC_AC, 4(%rsp) ++ movl $X86_EXC_AC, 
EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(entry_CP) + ENDBR64 +- movl $X86_EXC_CP, 4(%rsp) ++ movl $X86_EXC_CP, EFRAME_entry_vector(%rsp) + jmp handle_exception + + ENTRY(entry_DF) + ENDBR64 +- movl $X86_EXC_DF, 4(%rsp) ++ movl $X86_EXC_DF, EFRAME_entry_vector(%rsp) + /* Set AC to reduce chance of further SMAP faults */ + ALTERNATIVE "", stac, X86_FEATURE_XEN_SMAP + SAVE_ALL +@@ -998,7 +998,7 @@ ENTRY(entry_DF) + ENTRY(entry_NMI) + ENDBR64 + pushq $0 +- movl $X86_EXC_NMI, 4(%rsp) ++ movl $X86_EXC_NMI, EFRAME_entry_vector(%rsp) + handle_ist_exception: + ALTERNATIVE "", clac, X86_FEATURE_XEN_SMAP + SAVE_ALL +@@ -1130,7 +1130,7 @@ handle_ist_exception: + ENTRY(entry_MC) + ENDBR64 + pushq $0 +- movl $X86_EXC_MC, 4(%rsp) ++ movl $X86_EXC_MC, EFRAME_entry_vector(%rsp) + jmp handle_ist_exception + + /* No op trap handler. Required for kexec crash path. */ +@@ -1167,7 +1167,7 @@ autogen_stubs: /* Automatically generated stubs. */ + 1: + ENDBR64 + pushq $0 +- movb $vec,4(%rsp) ++ movb $vec, EFRAME_entry_vector(%rsp) + jmp common_interrupt + + entrypoint 1b +@@ -1181,7 +1181,7 @@ autogen_stubs: /* Automatically generated stubs. */ + test $8,%spl /* 64bit exception frames are 16 byte aligned, but the word */ + jz 2f /* size is 8 bytes. Check whether the processor gave us an */ + pushq $0 /* error code, and insert an empty one if not. 
*/ +-2: movb $vec,4(%rsp) ++2: movb $vec, EFRAME_entry_vector(%rsp) + jmp handle_exception + + entrypoint 1b +-- +2.44.0 + + +From b91c253e81db915f685b29e6947144ab9905388d Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 27 Feb 2024 16:07:39 +0000 +Subject: [PATCH 58/70] x86: Resync intel-family.h from Linux + +From v6.8-rc6 + +Signed-off-by: Andrew Cooper +Acked-by: Jan Beulich +(cherry picked from commit 195e75371b13c4f7ecdf7b5c50aed0d02f2d7ce8) +--- + xen/arch/x86/include/asm/intel-family.h | 38 ++++++++++++++++++++++--- + 1 file changed, 34 insertions(+), 4 deletions(-) + +diff --git a/xen/arch/x86/include/asm/intel-family.h b/xen/arch/x86/include/asm/intel-family.h +index ffc49151be..b65e9c46b9 100644 +--- a/xen/arch/x86/include/asm/intel-family.h ++++ b/xen/arch/x86/include/asm/intel-family.h +@@ -26,6 +26,9 @@ + * _G - parts with extra graphics on + * _X - regular server parts + * _D - micro server parts ++ * _N,_P - other mobile parts ++ * _H - premium mobile parts ++ * _S - other client parts + * + * Historical OPTDIFFs: + * +@@ -37,6 +40,9 @@ + * their own names :-( + */ + ++/* Wildcard match for FAM6 so X86_MATCH_INTEL_FAM6_MODEL(ANY) works */ ++#define INTEL_FAM6_ANY X86_MODEL_ANY ++ + #define INTEL_FAM6_CORE_YONAH 0x0E + + #define INTEL_FAM6_CORE2_MEROM 0x0F +@@ -93,8 +99,6 @@ + #define INTEL_FAM6_ICELAKE_L 0x7E /* Sunny Cove */ + #define INTEL_FAM6_ICELAKE_NNPI 0x9D /* Sunny Cove */ + +-#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */ +- + #define INTEL_FAM6_ROCKETLAKE 0xA7 /* Cypress Cove */ + + #define INTEL_FAM6_TIGERLAKE_L 0x8C /* Willow Cove */ +@@ -102,12 +106,31 @@ + + #define INTEL_FAM6_SAPPHIRERAPIDS_X 0x8F /* Golden Cove */ + ++#define INTEL_FAM6_EMERALDRAPIDS_X 0xCF ++ ++#define INTEL_FAM6_GRANITERAPIDS_X 0xAD ++#define INTEL_FAM6_GRANITERAPIDS_D 0xAE ++ ++/* "Hybrid" Processors (P-Core/E-Core) */ ++ ++#define INTEL_FAM6_LAKEFIELD 0x8A /* Sunny Cove / Tremont */ ++ + #define INTEL_FAM6_ALDERLAKE 0x97 /* Golden Cove / 
Gracemont */ + #define INTEL_FAM6_ALDERLAKE_L 0x9A /* Golden Cove / Gracemont */ + +-#define INTEL_FAM6_RAPTORLAKE 0xB7 ++#define INTEL_FAM6_RAPTORLAKE 0xB7 /* Raptor Cove / Enhanced Gracemont */ ++#define INTEL_FAM6_RAPTORLAKE_P 0xBA ++#define INTEL_FAM6_RAPTORLAKE_S 0xBF ++ ++#define INTEL_FAM6_METEORLAKE 0xAC ++#define INTEL_FAM6_METEORLAKE_L 0xAA ++ ++#define INTEL_FAM6_ARROWLAKE_H 0xC5 ++#define INTEL_FAM6_ARROWLAKE 0xC6 ++ ++#define INTEL_FAM6_LUNARLAKE_M 0xBD + +-/* "Small Core" Processors (Atom) */ ++/* "Small Core" Processors (Atom/E-Core) */ + + #define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */ + #define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */ +@@ -134,6 +157,13 @@ + #define INTEL_FAM6_ATOM_TREMONT 0x96 /* Elkhart Lake */ + #define INTEL_FAM6_ATOM_TREMONT_L 0x9C /* Jasper Lake */ + ++#define INTEL_FAM6_ATOM_GRACEMONT 0xBE /* Alderlake N */ ++ ++#define INTEL_FAM6_ATOM_CRESTMONT_X 0xAF /* Sierra Forest */ ++#define INTEL_FAM6_ATOM_CRESTMONT 0xB6 /* Grand Ridge */ ++ ++#define INTEL_FAM6_ATOM_DARKMONT_X 0xDD /* Clearwater Forest */ ++ + /* Xeon Phi */ + + #define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ +-- +2.44.0 + + +From 9f89ec65fbe49c3be32a456091097d7ef017d268 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Fri, 23 Jun 2023 11:32:00 +0100 +Subject: [PATCH 59/70] x86/vmx: Perform VERW flushing later in the VMExit path + +Broken out of the following patch because this change is subtle enough on its +own. See it for the rational of why we're moving VERW. + +As for how, extend the trick already used to hold one condition in +flags (RESUME vs LAUNCH) through the POPing of GPRs. + +Move the MOV CR earlier. Intel specify flags to be undefined across it. + +Encode the two conditions we want using SF and PF. See the code comment for +exactly how. + +Leave a comment to explain the lack of any content around +SPEC_CTRL_EXIT_TO_VMX, but leave the block in place. 
Sods law says if we +delete it, we'll need to reintroduce it. + +This is part of XSA-452 / CVE-2023-28746. + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +(cherry picked from commit 475fa20b7384464210f42bad7195f87bd6f1c63f) +--- + xen/arch/x86/hvm/vmx/entry.S | 36 +++++++++++++++++++++--- + xen/arch/x86/include/asm/asm_defns.h | 8 ++++++ + xen/arch/x86/include/asm/spec_ctrl_asm.h | 7 +++++ + xen/arch/x86/x86_64/asm-offsets.c | 1 + + 4 files changed, 48 insertions(+), 4 deletions(-) + +diff --git a/xen/arch/x86/hvm/vmx/entry.S b/xen/arch/x86/hvm/vmx/entry.S +index e3f60d5a82..1bead826ca 100644 +--- a/xen/arch/x86/hvm/vmx/entry.S ++++ b/xen/arch/x86/hvm/vmx/entry.S +@@ -87,17 +87,39 @@ UNLIKELY_END(realmode) + + /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ + /* SPEC_CTRL_EXIT_TO_VMX Req: %rsp=regs/cpuinfo Clob: */ +- DO_SPEC_CTRL_COND_VERW ++ /* ++ * All speculation safety work happens to be elsewhere. VERW is after ++ * popping the GPRs, while restoring the guest MSR_SPEC_CTRL is left ++ * to the MSR load list. ++ */ + + mov VCPU_hvm_guest_cr2(%rbx),%rax ++ mov %rax, %cr2 ++ ++ /* ++ * We need to perform two conditional actions (VERW, and Resume vs ++ * Launch) after popping GPRs. With some cunning, we can encode both ++ * of these in eflags together. ++ * ++ * Parity is only calculated over the bottom byte of the answer, while ++ * Sign is simply the top bit. 
++ * ++ * Therefore, the final OR instruction ends up producing: ++ * SF = VCPU_vmx_launched ++ * PF = !SCF_verw ++ */ ++ BUILD_BUG_ON(SCF_verw & ~0xff) ++ movzbl VCPU_vmx_launched(%rbx), %ecx ++ shl $31, %ecx ++ movzbl CPUINFO_spec_ctrl_flags(%rsp), %eax ++ and $SCF_verw, %eax ++ or %eax, %ecx + + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp +- mov %rax,%cr2 +- cmpb $0,VCPU_vmx_launched(%rbx) + pop %rbx + pop %r11 + pop %r10 +@@ -108,7 +130,13 @@ UNLIKELY_END(realmode) + pop %rdx + pop %rsi + pop %rdi +- je .Lvmx_launch ++ ++ jpe .L_skip_verw ++ /* VERW clobbers ZF, but preserves all others, including SF. */ ++ verw STK_REL(CPUINFO_verw_sel, CPUINFO_error_code)(%rsp) ++.L_skip_verw: ++ ++ jns .Lvmx_launch + + /*.Lvmx_resume:*/ + VMRESUME +diff --git a/xen/arch/x86/include/asm/asm_defns.h b/xen/arch/x86/include/asm/asm_defns.h +index baaaccb26e..56ae26e542 100644 +--- a/xen/arch/x86/include/asm/asm_defns.h ++++ b/xen/arch/x86/include/asm/asm_defns.h +@@ -81,6 +81,14 @@ register unsigned long current_stack_pointer asm("rsp"); + + #ifdef __ASSEMBLY__ + ++.macro BUILD_BUG_ON condstr, cond:vararg ++ .if \cond ++ .error "Condition \"\condstr\" not satisfied" ++ .endif ++.endm ++/* preprocessor macro to make error message more user friendly */ ++#define BUILD_BUG_ON(cond) BUILD_BUG_ON #cond, cond ++ + #ifdef HAVE_AS_QUOTED_SYM + #define SUBSECTION_LBL(tag) \ + .ifndef .L.tag; \ +diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h +index 6cb7c1b949..525745a066 100644 +--- a/xen/arch/x86/include/asm/spec_ctrl_asm.h ++++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h +@@ -152,6 +152,13 @@ + #endif + .endm + ++/* ++ * Helper to improve the readibility of stack dispacements with %rsp in ++ * unusual positions. Both @field and @top_of_stack should be constants from ++ * the same object. @top_of_stack should be where %rsp is currently pointing. 
++ */ ++#define STK_REL(field, top_of_stk) ((field) - (top_of_stk)) ++ + .macro DO_SPEC_CTRL_COND_VERW + /* + * Requires %rsp=cpuinfo +diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c +index 2fc4d9130a..0d33678898 100644 +--- a/xen/arch/x86/x86_64/asm-offsets.c ++++ b/xen/arch/x86/x86_64/asm-offsets.c +@@ -135,6 +135,7 @@ void __dummy__(void) + #endif + + OFFSET(CPUINFO_guest_cpu_user_regs, struct cpu_info, guest_cpu_user_regs); ++ OFFSET(CPUINFO_error_code, struct cpu_info, guest_cpu_user_regs.error_code); + OFFSET(CPUINFO_verw_sel, struct cpu_info, verw_sel); + OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu); + OFFSET(CPUINFO_per_cpu_offset, struct cpu_info, per_cpu_offset); +-- +2.44.0 + + +From 95dd34fdbea5408872d5c244fe268222a4f145d0 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Sat, 27 Jan 2024 18:20:56 +0000 +Subject: [PATCH 60/70] x86/spec-ctrl: Perform VERW flushing later in exit + paths + +On parts vulnerable to RFDS, VERW's side effects are extended to scrub all +non-architectural entries in various Physical Register Files. To remove all +of Xen's values, the VERW must be after popping the GPRs. + +Rework SPEC_CTRL_COND_VERW to default to an CPUINFO_error_code %rsp position, +but with overrides for other contexts. Identify that it clobbers eflags; this +is particularly relevant for the SYSRET path. + +For the IST exit return to Xen, have the main SPEC_CTRL_EXIT_TO_XEN put a +shadow copy of spec_ctrl_flags, as GPRs can't be used at the point we want to +issue the VERW. + +This is part of XSA-452 / CVE-2023-28746. 
+ +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +(cherry picked from commit 0a666cf2cd99df6faf3eebc81a1fc286e4eca4c7) +--- + xen/arch/x86/include/asm/spec_ctrl_asm.h | 36 ++++++++++++++++-------- + xen/arch/x86/x86_64/asm-offsets.c | 13 +++++++-- + xen/arch/x86/x86_64/compat/entry.S | 6 ++++ + xen/arch/x86/x86_64/entry.S | 21 +++++++++++++- + 4 files changed, 61 insertions(+), 15 deletions(-) + +diff --git a/xen/arch/x86/include/asm/spec_ctrl_asm.h b/xen/arch/x86/include/asm/spec_ctrl_asm.h +index 525745a066..13acebc75d 100644 +--- a/xen/arch/x86/include/asm/spec_ctrl_asm.h ++++ b/xen/arch/x86/include/asm/spec_ctrl_asm.h +@@ -159,16 +159,23 @@ + */ + #define STK_REL(field, top_of_stk) ((field) - (top_of_stk)) + +-.macro DO_SPEC_CTRL_COND_VERW ++.macro SPEC_CTRL_COND_VERW \ ++ scf=STK_REL(CPUINFO_spec_ctrl_flags, CPUINFO_error_code), \ ++ sel=STK_REL(CPUINFO_verw_sel, CPUINFO_error_code) + /* +- * Requires %rsp=cpuinfo ++ * Requires \scf and \sel as %rsp-relative expressions ++ * Clobbers eflags ++ * ++ * VERW needs to run after guest GPRs have been restored, where only %rsp is ++ * good to use. Default to expecting %rsp pointing at CPUINFO_error_code. ++ * Contexts where this is not true must provide an alternative \scf and \sel. + * + * Issue a VERW for its flushing side effect, if indicated. This is a Spectre + * v1 gadget, but the IRET/VMEntry is serialising. 
+ */ +- testb $SCF_verw, CPUINFO_spec_ctrl_flags(%rsp) ++ testb $SCF_verw, \scf(%rsp) + jz .L\@_verw_skip +- verw CPUINFO_verw_sel(%rsp) ++ verw \sel(%rsp) + .L\@_verw_skip: + .endm + +@@ -286,8 +293,6 @@ + */ + ALTERNATIVE "", DO_SPEC_CTRL_EXIT_TO_GUEST, X86_FEATURE_SC_MSR_PV + +- DO_SPEC_CTRL_COND_VERW +- + ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV + .endm + +@@ -367,7 +372,7 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): + */ + .macro SPEC_CTRL_EXIT_TO_XEN + /* +- * Requires %r12=ist_exit, %r14=stack_end ++ * Requires %r12=ist_exit, %r14=stack_end, %rsp=regs + * Clobbers %rax, %rbx, %rcx, %rdx + */ + movzbl STACK_CPUINFO_FIELD(spec_ctrl_flags)(%r14), %ebx +@@ -395,11 +400,18 @@ UNLIKELY_DISPATCH_LABEL(\@_serialise): + test %r12, %r12 + jz .L\@_skip_ist_exit + +- /* Logically DO_SPEC_CTRL_COND_VERW but without the %rsp=cpuinfo dependency */ +- testb $SCF_verw, %bl +- jz .L\@_skip_verw +- verw STACK_CPUINFO_FIELD(verw_sel)(%r14) +-.L\@_skip_verw: ++ /* ++ * Stash SCF and verw_sel above eflags in the case of an IST_exit. The ++ * VERW logic needs to run after guest GPRs have been restored; i.e. where ++ * we cannot use %r12 or %r14 for the purposes they have here. ++ * ++ * When the CPU pushed this exception frame, it zero-extended eflags. ++ * Therefore it is safe for the VERW logic to look at the stashed SCF ++ * outside of the ist_exit condition. Also, this stashing won't influence ++ * any other restore_all_guest() paths. ++ */ ++ or $(__HYPERVISOR_DS32 << 16), %ebx ++ mov %ebx, UREGS_eflags + 4(%rsp) /* EFRAME_shadow_scf/sel */ + + ALTERNATIVE "", DO_SPEC_CTRL_DIV, X86_FEATURE_SC_DIV + +diff --git a/xen/arch/x86/x86_64/asm-offsets.c b/xen/arch/x86/x86_64/asm-offsets.c +index 0d33678898..85c7d0c989 100644 +--- a/xen/arch/x86/x86_64/asm-offsets.c ++++ b/xen/arch/x86/x86_64/asm-offsets.c +@@ -55,14 +55,22 @@ void __dummy__(void) + * EFRAME_* is for the entry/exit logic where %rsp is pointing at + * UREGS_error_code and GPRs are still/already guest values. 
+ */ +-#define OFFSET_EF(sym, mem) \ ++#define OFFSET_EF(sym, mem, ...) \ + DEFINE(sym, offsetof(struct cpu_user_regs, mem) - \ +- offsetof(struct cpu_user_regs, error_code)) ++ offsetof(struct cpu_user_regs, error_code) __VA_ARGS__) + + OFFSET_EF(EFRAME_entry_vector, entry_vector); + OFFSET_EF(EFRAME_rip, rip); + OFFSET_EF(EFRAME_cs, cs); + OFFSET_EF(EFRAME_eflags, eflags); ++ ++ /* ++ * These aren't real fields. They're spare space, used by the IST ++ * exit-to-xen path. ++ */ ++ OFFSET_EF(EFRAME_shadow_scf, eflags, +4); ++ OFFSET_EF(EFRAME_shadow_sel, eflags, +6); ++ + OFFSET_EF(EFRAME_rsp, rsp); + BLANK(); + +@@ -136,6 +144,7 @@ void __dummy__(void) + + OFFSET(CPUINFO_guest_cpu_user_regs, struct cpu_info, guest_cpu_user_regs); + OFFSET(CPUINFO_error_code, struct cpu_info, guest_cpu_user_regs.error_code); ++ OFFSET(CPUINFO_rip, struct cpu_info, guest_cpu_user_regs.rip); + OFFSET(CPUINFO_verw_sel, struct cpu_info, verw_sel); + OFFSET(CPUINFO_current_vcpu, struct cpu_info, current_vcpu); + OFFSET(CPUINFO_per_cpu_offset, struct cpu_info, per_cpu_offset); +diff --git a/xen/arch/x86/x86_64/compat/entry.S b/xen/arch/x86/x86_64/compat/entry.S +index cb473f08ee..3bbe3a79a5 100644 +--- a/xen/arch/x86/x86_64/compat/entry.S ++++ b/xen/arch/x86/x86_64/compat/entry.S +@@ -161,6 +161,12 @@ ENTRY(compat_restore_all_guest) + SPEC_CTRL_EXIT_TO_PV /* Req: a=spec_ctrl %rsp=regs/cpuinfo, Clob: cd */ + + RESTORE_ALL adj=8 compat=1 ++ ++ /* Account for ev/ec having already been popped off the stack. 
*/ ++ SPEC_CTRL_COND_VERW \ ++ scf=STK_REL(CPUINFO_spec_ctrl_flags, CPUINFO_rip), \ ++ sel=STK_REL(CPUINFO_verw_sel, CPUINFO_rip) ++ + .Lft0: iretq + _ASM_PRE_EXTABLE(.Lft0, handle_exception) + +diff --git a/xen/arch/x86/x86_64/entry.S b/xen/arch/x86/x86_64/entry.S +index 968da9d727..2c7512130f 100644 +--- a/xen/arch/x86/x86_64/entry.S ++++ b/xen/arch/x86/x86_64/entry.S +@@ -214,6 +214,9 @@ restore_all_guest: + #endif + + mov EFRAME_rip(%rsp), %rcx ++ ++ SPEC_CTRL_COND_VERW /* Req: %rsp=eframe Clob: efl */ ++ + cmpw $FLAT_USER_CS32, EFRAME_cs(%rsp) + mov EFRAME_rsp(%rsp), %rsp + je 1f +@@ -227,6 +230,9 @@ restore_all_guest: + iret_exit_to_guest: + andl $~(X86_EFLAGS_IOPL | X86_EFLAGS_VM), EFRAME_eflags(%rsp) + orl $X86_EFLAGS_IF, EFRAME_eflags(%rsp) ++ ++ SPEC_CTRL_COND_VERW /* Req: %rsp=eframe Clob: efl */ ++ + addq $8,%rsp + .Lft0: iretq + _ASM_PRE_EXTABLE(.Lft0, handle_exception) +@@ -679,9 +685,22 @@ UNLIKELY_START(ne, exit_cr3) + UNLIKELY_END(exit_cr3) + + /* WARNING! `ret`, `call *`, `jmp *` not safe beyond this point. */ +- SPEC_CTRL_EXIT_TO_XEN /* Req: %r12=ist_exit %r14=end, Clob: abcd */ ++ SPEC_CTRL_EXIT_TO_XEN /* Req: %r12=ist_exit %r14=end %rsp=regs, Clob: abcd */ + + RESTORE_ALL adj=8 ++ ++ /* ++ * When the CPU pushed this exception frame, it zero-extended eflags. ++ * For an IST exit, SPEC_CTRL_EXIT_TO_XEN stashed shadow copies of ++ * spec_ctrl_flags and ver_sel above eflags, as we can't use any GPRs, ++ * and we're at a random place on the stack, not in a CPUFINFO block. ++ * ++ * Account for ev/ec having already been popped off the stack. 
++ */ ++ SPEC_CTRL_COND_VERW \ ++ scf=STK_REL(EFRAME_shadow_scf, EFRAME_rip), \ ++ sel=STK_REL(EFRAME_shadow_sel, EFRAME_rip) ++ + iretq + + ENTRY(common_interrupt) +-- +2.44.0 + + +From b7205fc1cbad0c633e92d2d019a02a507467507b Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Mon, 12 Feb 2024 17:50:43 +0000 +Subject: [PATCH 61/70] x86/spec-ctrl: Rename VERW related options + +VERW is going to be used for a 3rd purpose, and the existing nomenclature +didn't survive the Stale MMIO issues terribly well. + +Rename the command line option from `md-clear=` to `verw=`. This is more +consistent with other options which tend to be named based on what they're +doing, not which feature enumeration they use behind the scenes. Retain +`md-clear=` as a deprecated alias. + +Rename opt_md_clear_{pv,hvm} and opt_fb_clear_mmio to opt_verw_{pv,hvm,mmio}, +which has a side effect of making spec_ctrl_init_domain() rather clearer to +follow. + +No functional change. + +This is part of XSA-452 / CVE-2023-28746. + +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +(cherry picked from commit f7603ca252e4226739eb3129a5290ee3da3f8ea4) +--- + docs/misc/xen-command-line.pandoc | 15 ++++---- + xen/arch/x86/spec_ctrl.c | 62 ++++++++++++++++--------------- + 2 files changed, 40 insertions(+), 37 deletions(-) + +diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc +index 582d6741d1..fbf1683924 100644 +--- a/docs/misc/xen-command-line.pandoc ++++ b/docs/misc/xen-command-line.pandoc +@@ -2370,7 +2370,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). + + ### spec-ctrl (x86) + > `= List of [ , xen=, {pv,hvm}=, +-> {msr-sc,rsb,md-clear,ibpb-entry}=|{pv,hvm}=, ++> {msr-sc,rsb,verw,ibpb-entry}=|{pv,hvm}=, + > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, + > eager-fpu,l1d-flush,branch-harden,srb-lock, + > unpriv-mmio,gds-mit,div-scrub}= ]` +@@ -2395,7 +2395,7 @@ in place for guests to use. 
+ + Use of a positive boolean value for either of these options is invalid. + +-The `pv=`, `hvm=`, `msr-sc=`, `rsb=`, `md-clear=` and `ibpb-entry=` options ++The `pv=`, `hvm=`, `msr-sc=`, `rsb=`, `verw=` and `ibpb-entry=` options + offer fine grained control over the primitives by Xen. These impact Xen's + ability to protect itself, and/or Xen's ability to virtualise support for + guests to use. +@@ -2412,11 +2412,12 @@ guests to use. + guests and if disabled, guests will be unable to use IBRS/STIBP/SSBD/etc. + * `rsb=` offers control over whether to overwrite the Return Stack Buffer / + Return Address Stack on entry to Xen and on idle. +-* `md-clear=` offers control over whether to use VERW to flush +- microarchitectural buffers on idle and exit from Xen. *Note: For +- compatibility with development versions of this fix, `mds=` is also accepted +- on Xen 4.12 and earlier as an alias. Consult vendor documentation in +- preference to here.* ++* `verw=` offers control over whether to use VERW for its scrubbing side ++ effects at appropriate privilege transitions. The exact side effects are ++ microarchitecture and microcode specific. *Note: `md-clear=` is accepted as ++ a deprecated alias. For compatibility with development versions of XSA-297, ++ `mds=` is also accepted on Xen 4.12 and earlier as an alias. Consult vendor ++ documentation in preference to here.* + * `ibpb-entry=` offers control over whether IBPB (Indirect Branch Prediction + Barrier) is used on entry to Xen. 
This is used by default on hardware + vulnerable to Branch Type Confusion, and hardware vulnerable to Speculative +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index a965b6db28..c42d8cdc22 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -25,8 +25,8 @@ static bool __initdata opt_msr_sc_pv = true; + static bool __initdata opt_msr_sc_hvm = true; + static int8_t __initdata opt_rsb_pv = -1; + static bool __initdata opt_rsb_hvm = true; +-static int8_t __ro_after_init opt_md_clear_pv = -1; +-static int8_t __ro_after_init opt_md_clear_hvm = -1; ++static int8_t __ro_after_init opt_verw_pv = -1; ++static int8_t __ro_after_init opt_verw_hvm = -1; + + static int8_t __ro_after_init opt_ibpb_entry_pv = -1; + static int8_t __ro_after_init opt_ibpb_entry_hvm = -1; +@@ -66,7 +66,7 @@ static bool __initdata cpu_has_bug_mds; /* Any other M{LP,SB,FB}DS combination. + + static int8_t __initdata opt_srb_lock = -1; + static bool __initdata opt_unpriv_mmio; +-static bool __ro_after_init opt_fb_clear_mmio; ++static bool __ro_after_init opt_verw_mmio; + static int8_t __initdata opt_gds_mit = -1; + static int8_t __initdata opt_div_scrub = -1; + +@@ -108,8 +108,8 @@ static int __init cf_check parse_spec_ctrl(const char *s) + disable_common: + opt_rsb_pv = false; + opt_rsb_hvm = false; +- opt_md_clear_pv = 0; +- opt_md_clear_hvm = 0; ++ opt_verw_pv = 0; ++ opt_verw_hvm = 0; + opt_ibpb_entry_pv = 0; + opt_ibpb_entry_hvm = 0; + opt_ibpb_entry_dom0 = false; +@@ -140,14 +140,14 @@ static int __init cf_check parse_spec_ctrl(const char *s) + { + opt_msr_sc_pv = val; + opt_rsb_pv = val; +- opt_md_clear_pv = val; ++ opt_verw_pv = val; + opt_ibpb_entry_pv = val; + } + else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) + { + opt_msr_sc_hvm = val; + opt_rsb_hvm = val; +- opt_md_clear_hvm = val; ++ opt_verw_hvm = val; + opt_ibpb_entry_hvm = val; + } + else if ( (val = parse_boolean("msr-sc", s, ss)) != -1 ) +@@ -192,21 +192,22 @@ static int __init cf_check 
parse_spec_ctrl(const char *s) + break; + } + } +- else if ( (val = parse_boolean("md-clear", s, ss)) != -1 ) ++ else if ( (val = parse_boolean("verw", s, ss)) != -1 || ++ (val = parse_boolean("md-clear", s, ss)) != -1 ) + { + switch ( val ) + { + case 0: + case 1: +- opt_md_clear_pv = opt_md_clear_hvm = val; ++ opt_verw_pv = opt_verw_hvm = val; + break; + + case -2: +- s += strlen("md-clear="); ++ s += (*s == 'v') ? strlen("verw=") : strlen("md-clear="); + if ( (val = parse_boolean("pv", s, ss)) >= 0 ) +- opt_md_clear_pv = val; ++ opt_verw_pv = val; + else if ( (val = parse_boolean("hvm", s, ss)) >= 0 ) +- opt_md_clear_hvm = val; ++ opt_verw_hvm = val; + else + default: + rc = -EINVAL; +@@ -528,8 +529,8 @@ static void __init print_details(enum ind_thunk thunk) + opt_srb_lock ? " SRB_LOCK+" : " SRB_LOCK-", + opt_ibpb_ctxt_switch ? " IBPB-ctxt" : "", + opt_l1d_flush ? " L1D_FLUSH" : "", +- opt_md_clear_pv || opt_md_clear_hvm || +- opt_fb_clear_mmio ? " VERW" : "", ++ opt_verw_pv || opt_verw_hvm || ++ opt_verw_mmio ? " VERW" : "", + opt_div_scrub ? " DIV" : "", + opt_branch_harden ? " BRANCH_HARDEN" : ""); + +@@ -550,13 +551,13 @@ static void __init print_details(enum ind_thunk thunk) + boot_cpu_has(X86_FEATURE_SC_RSB_HVM) || + boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) || + amd_virt_spec_ctrl || +- opt_eager_fpu || opt_md_clear_hvm) ? "" : " None", ++ opt_eager_fpu || opt_verw_hvm) ? "" : " None", + boot_cpu_has(X86_FEATURE_SC_MSR_HVM) ? " MSR_SPEC_CTRL" : "", + (boot_cpu_has(X86_FEATURE_SC_MSR_HVM) || + amd_virt_spec_ctrl) ? " MSR_VIRT_SPEC_CTRL" : "", + boot_cpu_has(X86_FEATURE_SC_RSB_HVM) ? " RSB" : "", + opt_eager_fpu ? " EAGER_FPU" : "", +- opt_md_clear_hvm ? " MD_CLEAR" : "", ++ opt_verw_hvm ? " VERW" : "", + boot_cpu_has(X86_FEATURE_IBPB_ENTRY_HVM) ? 
" IBPB-entry" : ""); + + #endif +@@ -565,11 +566,11 @@ static void __init print_details(enum ind_thunk thunk) + (boot_cpu_has(X86_FEATURE_SC_MSR_PV) || + boot_cpu_has(X86_FEATURE_SC_RSB_PV) || + boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) || +- opt_eager_fpu || opt_md_clear_pv) ? "" : " None", ++ opt_eager_fpu || opt_verw_pv) ? "" : " None", + boot_cpu_has(X86_FEATURE_SC_MSR_PV) ? " MSR_SPEC_CTRL" : "", + boot_cpu_has(X86_FEATURE_SC_RSB_PV) ? " RSB" : "", + opt_eager_fpu ? " EAGER_FPU" : "", +- opt_md_clear_pv ? " MD_CLEAR" : "", ++ opt_verw_pv ? " VERW" : "", + boot_cpu_has(X86_FEATURE_IBPB_ENTRY_PV) ? " IBPB-entry" : ""); + + printk(" XPTI (64-bit PV only): Dom0 %s, DomU %s (with%s PCID)\n", +@@ -1502,8 +1503,8 @@ void spec_ctrl_init_domain(struct domain *d) + { + bool pv = is_pv_domain(d); + +- bool verw = ((pv ? opt_md_clear_pv : opt_md_clear_hvm) || +- (opt_fb_clear_mmio && is_iommu_enabled(d))); ++ bool verw = ((pv ? opt_verw_pv : opt_verw_hvm) || ++ (opt_verw_mmio && is_iommu_enabled(d))); + + bool ibpb = ((pv ? opt_ibpb_entry_pv : opt_ibpb_entry_hvm) && + (d->domain_id != 0 || opt_ibpb_entry_dom0)); +@@ -1866,19 +1867,20 @@ void __init init_speculation_mitigations(void) + * the return-to-guest path. + */ + if ( opt_unpriv_mmio ) +- opt_fb_clear_mmio = cpu_has_fb_clear; ++ opt_verw_mmio = cpu_has_fb_clear; + + /* + * By default, enable PV and HVM mitigations on MDS-vulnerable hardware. + * This will only be a token effort for MLPDS/MFBDS when HT is enabled, + * but it is somewhat better than nothing. 
+ */ +- if ( opt_md_clear_pv == -1 ) +- opt_md_clear_pv = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && +- boot_cpu_has(X86_FEATURE_MD_CLEAR)); +- if ( opt_md_clear_hvm == -1 ) +- opt_md_clear_hvm = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && +- boot_cpu_has(X86_FEATURE_MD_CLEAR)); ++ if ( opt_verw_pv == -1 ) ++ opt_verw_pv = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && ++ cpu_has_md_clear); ++ ++ if ( opt_verw_hvm == -1 ) ++ opt_verw_hvm = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && ++ cpu_has_md_clear); + + /* + * Enable MDS/MMIO defences as applicable. The Idle blocks need using if +@@ -1891,12 +1893,12 @@ void __init init_speculation_mitigations(void) + * MDS mitigations. L1D_FLUSH is not safe for MMIO mitigations.) + * + * After calculating the appropriate idle setting, simplify +- * opt_md_clear_hvm to mean just "should we VERW on the way into HVM ++ * opt_verw_hvm to mean just "should we VERW on the way into HVM + * guests", so spec_ctrl_init_domain() can calculate suitable settings. + */ +- if ( opt_md_clear_pv || opt_md_clear_hvm || opt_fb_clear_mmio ) ++ if ( opt_verw_pv || opt_verw_hvm || opt_verw_mmio ) + setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE); +- opt_md_clear_hvm &= !cpu_has_skip_l1dfl && !opt_l1d_flush; ++ opt_verw_hvm &= !cpu_has_skip_l1dfl && !opt_l1d_flush; + + /* + * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT +-- +2.44.0 + + +From fb85a8fc91f8cfd61d7c7f9742502b223d4024b5 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Tue, 5 Mar 2024 19:33:37 +0000 +Subject: [PATCH 62/70] x86/spec-ctrl: VERW-handling adjustments + +... before we add yet more complexity to this logic. Mostly expanded +comments, but with three minor changes. + +1) Introduce cpu_has_useful_md_clear to simplify later logic in this patch and + future ones. + +2) We only ever need SC_VERW_IDLE when SMT is active. If SMT isn't active, + then there's no re-partition of pipeline resources based on thread-idleness + to worry about. 
+ +3) The logic to adjust HVM VERW based on L1D_FLUSH is unmaintainable and, as + it turns out, wrong. SKIP_L1DFL is just a hint bit, whereas opt_l1d_flush + is the relevant decision of whether to use L1D_FLUSH based on + susceptibility and user preference. + + Rewrite the logic so it can be followed, and incorporate the fact that when + FB_CLEAR is visible, L1D_FLUSH isn't a safe substitution. + +This is part of XSA-452 / CVE-2023-28746. + +Signed-off-by: Andrew Cooper +Acked-by: Jan Beulich +(cherry picked from commit 1eb91a8a06230b4b64228c9a380194f8cfe6c5e2) +--- + xen/arch/x86/spec_ctrl.c | 99 +++++++++++++++++++++++++++++----------- + 1 file changed, 73 insertions(+), 26 deletions(-) + +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index c42d8cdc22..a4afcd8570 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -1519,7 +1519,7 @@ void __init init_speculation_mitigations(void) + { + enum ind_thunk thunk = THUNK_DEFAULT; + bool has_spec_ctrl, ibrs = false, hw_smt_enabled; +- bool cpu_has_bug_taa, retpoline_safe; ++ bool cpu_has_bug_taa, cpu_has_useful_md_clear, retpoline_safe; + + hw_smt_enabled = check_smt_enabled(); + +@@ -1855,50 +1855,97 @@ void __init init_speculation_mitigations(void) + "enabled. Please assess your configuration and choose an\n" + "explicit 'smt=' setting. See XSA-273.\n"); + ++ /* ++ * A brief summary of VERW-related changes. ++ * ++ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/intel-analysis-microarchitectural-data-sampling.html ++ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/processor-mmio-stale-data-vulnerabilities.html ++ * ++ * Relevant ucodes: ++ * ++ * - May 2019, for MDS. Introduces the MD_CLEAR CPUID bit and VERW side ++ * effects to scrub Store/Load/Fill buffers as applicable. 
MD_CLEAR ++ * exists architecturally, even when the side effects have been removed. ++ * ++ * Use VERW to scrub on return-to-guest. Parts with L1D_FLUSH to ++ * mitigate L1TF have the same side effect, so no need to do both. ++ * ++ * Various Atoms suffer from Store-buffer sampling only. Store buffers ++ * are statically partitioned between non-idle threads, so scrubbing is ++ * wanted when going idle too. ++ * ++ * Load ports and Fill buffers are competitively shared between threads. ++ * SMT must be disabled for VERW scrubbing to be fully effective. ++ * ++ * - November 2019, for TAA. Extended VERW side effects to TSX-enabled ++ * MDS_NO parts. ++ * ++ * - February 2022, for Client TSX de-feature. Removed VERW side effects ++ * from Client CPUs only. ++ * ++ * - May 2022, for MMIO Stale Data. (Re)introduced Fill Buffer scrubbing ++ * on all MMIO-affected parts which didn't already have it for MDS ++ * reasons, enumerating FB_CLEAR on those parts only. ++ * ++ * If FB_CLEAR is enumerated, L1D_FLUSH does not have the same scrubbing ++ * side effects as VERW and cannot be used in its place. ++ */ + mds_calculations(); + + /* +- * Parts which enumerate FB_CLEAR are those which are post-MDS_NO and have +- * reintroduced the VERW fill buffer flushing side effect because of a +- * susceptibility to FBSDP. ++ * Parts which enumerate FB_CLEAR are those with now-updated microcode ++ * which weren't susceptible to the original MFBDS (and therefore didn't ++ * have Fill Buffer scrubbing side effects to begin with, or were Client ++ * MDS_NO non-TAA_NO parts where the scrubbing was removed), but have had ++ * the scrubbing reintroduced because of a susceptibility to FBSDP. + * + * If unprivileged guests have (or will have) MMIO mappings, we can + * mitigate cross-domain leakage of fill buffer data by issuing VERW on +- * the return-to-guest path. ++ * the return-to-guest path. This is only a token effort if SMT is ++ * active. 
+ */ + if ( opt_unpriv_mmio ) + opt_verw_mmio = cpu_has_fb_clear; + + /* +- * By default, enable PV and HVM mitigations on MDS-vulnerable hardware. +- * This will only be a token effort for MLPDS/MFBDS when HT is enabled, +- * but it is somewhat better than nothing. ++ * MD_CLEAR is enumerated architecturally forevermore, even after the ++ * scrubbing side effects have been removed. Create ourselves a version ++ * which expresses whether we think MD_CLEAR is having any useful side ++ * effect. ++ */ ++ cpu_has_useful_md_clear = (cpu_has_md_clear && ++ (cpu_has_bug_mds || cpu_has_bug_msbds_only)); ++ ++ /* ++ * By default, use VERW scrubbing on applicable hardware, if we think it's ++ * going to have an effect. This will only be a token effort for ++ * MLPDS/MFBDS when SMT is enabled. + */ + if ( opt_verw_pv == -1 ) +- opt_verw_pv = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && +- cpu_has_md_clear); ++ opt_verw_pv = cpu_has_useful_md_clear; + + if ( opt_verw_hvm == -1 ) +- opt_verw_hvm = ((cpu_has_bug_mds || cpu_has_bug_msbds_only) && +- cpu_has_md_clear); ++ opt_verw_hvm = cpu_has_useful_md_clear; + + /* +- * Enable MDS/MMIO defences as applicable. The Idle blocks need using if +- * either the PV or HVM MDS defences are used, or if we may give MMIO +- * access to untrusted guests. +- * +- * HVM is more complicated. The MD_CLEAR microcode extends L1D_FLUSH with +- * equivalent semantics to avoid needing to perform both flushes on the +- * HVM path. Therefore, we don't need VERW in addition to L1D_FLUSH (for +- * MDS mitigations. L1D_FLUSH is not safe for MMIO mitigations.) +- * +- * After calculating the appropriate idle setting, simplify +- * opt_verw_hvm to mean just "should we VERW on the way into HVM +- * guests", so spec_ctrl_init_domain() can calculate suitable settings. ++ * If SMT is active, and we're protecting against MDS or MMIO stale data, ++ * we need to scrub before going idle as well as on return to guest.
++ * Various pipeline resources are repartitioned amongst non-idle threads. + */ +- if ( opt_verw_pv || opt_verw_hvm || opt_verw_mmio ) ++ if ( ((cpu_has_useful_md_clear && (opt_verw_pv || opt_verw_hvm)) || ++ opt_verw_mmio) && hw_smt_enabled ) + setup_force_cpu_cap(X86_FEATURE_SC_VERW_IDLE); +- opt_verw_hvm &= !cpu_has_skip_l1dfl && !opt_l1d_flush; ++ ++ /* ++ * After calculating the appropriate idle setting, simplify opt_verw_hvm ++ * to mean just "should we VERW on the way into HVM guests", so ++ * spec_ctrl_init_domain() can calculate suitable settings. ++ * ++ * It is only safe to use L1D_FLUSH in place of VERW when MD_CLEAR is the ++ * only *_CLEAR we can see. ++ */ ++ if ( opt_l1d_flush && cpu_has_md_clear && !cpu_has_fb_clear ) ++ opt_verw_hvm = false; + + /* + * Warn the user if they are on MLPDS/MFBDS-vulnerable hardware with HT +-- +2.44.0 + + +From 908cbd1893e80eb52b92b2c70c2bfd9ffdf6f77b Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Thu, 22 Jun 2023 23:32:19 +0100 +Subject: [PATCH 63/70] x86/spec-ctrl: Mitigation Register File Data Sampling + +RFDS affects Atom cores, also branded E-cores, between the Goldmont and +Gracemont microarchitectures. This includes Alder Lake and Raptor Lake hybrid +client systems which have a mix of Gracemont and other types of cores. + +Two new bits have been defined; RFDS_CLEAR to indicate VERW has more side +effects, and RFDS_NO to indicate that the system is unaffected. Plenty of +unaffected CPUs won't be getting RFDS_NO retrofitted in microcode, so we +synthesise it. Alder Lake and Raptor Lake Xeon-E's are unaffected due to +their platform configuration, and we must use the Hybrid CPUID bit to +distinguish them from their non-Xeon counterparts. + +Like MD_CLEAR and FB_CLEAR, RFDS_CLEAR needs OR-ing across a resource pool, so +set it in the max policies and reflect the host setting in default. + +This is part of XSA-452 / CVE-2023-28746.
+ +Signed-off-by: Andrew Cooper +Reviewed-by: Jan Beulich +(cherry picked from commit fb5b6f6744713410c74cfc12b7176c108e3c9a31) +--- + tools/misc/xen-cpuid.c | 5 +- + xen/arch/x86/cpu-policy.c | 5 + + xen/arch/x86/include/asm/cpufeature.h | 3 + + xen/arch/x86/include/asm/msr-index.h | 2 + + xen/arch/x86/spec_ctrl.c | 100 +++++++++++++++++++- + xen/include/public/arch-x86/cpufeatureset.h | 3 + + 6 files changed, 111 insertions(+), 7 deletions(-) + +diff --git a/tools/misc/xen-cpuid.c b/tools/misc/xen-cpuid.c +index 7370f1b56e..52e451a806 100644 +--- a/tools/misc/xen-cpuid.c ++++ b/tools/misc/xen-cpuid.c +@@ -172,7 +172,7 @@ static const char *const str_7d0[32] = + [ 8] = "avx512-vp2intersect", [ 9] = "srbds-ctrl", + [10] = "md-clear", [11] = "rtm-always-abort", + /* 12 */ [13] = "tsx-force-abort", +- [14] = "serialize", ++ [14] = "serialize", [15] = "hybrid", + [16] = "tsxldtrk", + [18] = "pconfig", + [20] = "cet-ibt", +@@ -245,7 +245,8 @@ static const char *const str_m10Al[32] = + [20] = "bhi-no", [21] = "xapic-status", + /* 22 */ [23] = "ovrclk-status", + [24] = "pbrsb-no", [25] = "gds-ctrl", +- [26] = "gds-no", ++ [26] = "gds-no", [27] = "rfds-no", ++ [28] = "rfds-clear", + }; + + static const char *const str_m10Ah[32] = +diff --git a/xen/arch/x86/cpu-policy.c b/xen/arch/x86/cpu-policy.c +index c7c5e99b7b..12e621b97d 100644 +--- a/xen/arch/x86/cpu-policy.c ++++ b/xen/arch/x86/cpu-policy.c +@@ -451,6 +451,7 @@ static void __init guest_common_max_feature_adjustments(uint32_t *fs) + */ + __set_bit(X86_FEATURE_MD_CLEAR, fs); + __set_bit(X86_FEATURE_FB_CLEAR, fs); ++ __set_bit(X86_FEATURE_RFDS_CLEAR, fs); + + /* + * The Gather Data Sampling microcode mitigation (August 2023) has an +@@ -500,6 +501,10 @@ static void __init guest_common_default_feature_adjustments(uint32_t *fs) + if ( cpu_has_fb_clear ) + __set_bit(X86_FEATURE_FB_CLEAR, fs); + ++ __clear_bit(X86_FEATURE_RFDS_CLEAR, fs); ++ if ( cpu_has_rfds_clear ) ++ __set_bit(X86_FEATURE_RFDS_CLEAR, fs); ++ + /* + * 
The Gather Data Sampling microcode mitigation (August 2023) has an + * adverse performance impact on the CLWB instruction on SKX/CLX/CPX. +diff --git a/xen/arch/x86/include/asm/cpufeature.h b/xen/arch/x86/include/asm/cpufeature.h +index 76ef2aeb1d..3c57f55de0 100644 +--- a/xen/arch/x86/include/asm/cpufeature.h ++++ b/xen/arch/x86/include/asm/cpufeature.h +@@ -181,6 +181,7 @@ static inline bool boot_cpu_has(unsigned int feat) + #define cpu_has_rtm_always_abort boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) + #define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT) + #define cpu_has_serialize boot_cpu_has(X86_FEATURE_SERIALIZE) ++#define cpu_has_hybrid boot_cpu_has(X86_FEATURE_HYBRID) + #define cpu_has_avx512_fp16 boot_cpu_has(X86_FEATURE_AVX512_FP16) + #define cpu_has_arch_caps boot_cpu_has(X86_FEATURE_ARCH_CAPS) + +@@ -208,6 +209,8 @@ static inline bool boot_cpu_has(unsigned int feat) + #define cpu_has_rrsba boot_cpu_has(X86_FEATURE_RRSBA) + #define cpu_has_gds_ctrl boot_cpu_has(X86_FEATURE_GDS_CTRL) + #define cpu_has_gds_no boot_cpu_has(X86_FEATURE_GDS_NO) ++#define cpu_has_rfds_no boot_cpu_has(X86_FEATURE_RFDS_NO) ++#define cpu_has_rfds_clear boot_cpu_has(X86_FEATURE_RFDS_CLEAR) + + /* Synthesized. 
*/ + #define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) +diff --git a/xen/arch/x86/include/asm/msr-index.h b/xen/arch/x86/include/asm/msr-index.h +index 82a81bd0a2..85ef28a612 100644 +--- a/xen/arch/x86/include/asm/msr-index.h ++++ b/xen/arch/x86/include/asm/msr-index.h +@@ -89,6 +89,8 @@ + #define ARCH_CAPS_PBRSB_NO (_AC(1, ULL) << 24) + #define ARCH_CAPS_GDS_CTRL (_AC(1, ULL) << 25) + #define ARCH_CAPS_GDS_NO (_AC(1, ULL) << 26) ++#define ARCH_CAPS_RFDS_NO (_AC(1, ULL) << 27) ++#define ARCH_CAPS_RFDS_CLEAR (_AC(1, ULL) << 28) + + #define MSR_FLUSH_CMD 0x0000010b + #define FLUSH_CMD_L1D (_AC(1, ULL) << 0) +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index a4afcd8570..8165379fed 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -12,6 +12,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -435,7 +436,7 @@ static void __init print_details(enum ind_thunk thunk) + * Hardware read-only information, stating immunity to certain issues, or + * suggestions of which mitigation to use. + */ +- printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", ++ printk(" Hardware hints:%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + (caps & ARCH_CAPS_RDCL_NO) ? " RDCL_NO" : "", + (caps & ARCH_CAPS_EIBRS) ? " EIBRS" : "", + (caps & ARCH_CAPS_RSBA) ? " RSBA" : "", +@@ -451,6 +452,7 @@ static void __init print_details(enum ind_thunk thunk) + (caps & ARCH_CAPS_FB_CLEAR) ? " FB_CLEAR" : "", + (caps & ARCH_CAPS_PBRSB_NO) ? " PBRSB_NO" : "", + (caps & ARCH_CAPS_GDS_NO) ? " GDS_NO" : "", ++ (caps & ARCH_CAPS_RFDS_NO) ? " RFDS_NO" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBRS_ALWAYS)) ? " IBRS_ALWAYS" : "", + (e8b & cpufeat_mask(X86_FEATURE_STIBP_ALWAYS)) ? " STIBP_ALWAYS" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBRS_FAST)) ? " IBRS_FAST" : "", +@@ -461,7 +463,7 @@ static void __init print_details(enum ind_thunk thunk) + (e21a & cpufeat_mask(X86_FEATURE_SRSO_NO)) ? 
" SRSO_NO" : ""); + + /* Hardware features which need driving to mitigate issues. */ +- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s\n", ++ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + (e8b & cpufeat_mask(X86_FEATURE_IBPB)) || + (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBPB" : "", + (e8b & cpufeat_mask(X86_FEATURE_IBRS)) || +@@ -479,6 +481,7 @@ static void __init print_details(enum ind_thunk thunk) + (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "", + (caps & ARCH_CAPS_FB_CLEAR_CTRL) ? " FB_CLEAR_CTRL" : "", + (caps & ARCH_CAPS_GDS_CTRL) ? " GDS_CTRL" : "", ++ (caps & ARCH_CAPS_RFDS_CLEAR) ? " RFDS_CLEAR" : "", + (e21a & cpufeat_mask(X86_FEATURE_SBPB)) ? " SBPB" : ""); + + /* Compiled-in support which pertains to mitigations. */ +@@ -1347,6 +1350,83 @@ static __init void mds_calculations(void) + } + } + ++/* ++ * Register File Data Sampling affects Atom cores from the Goldmont to ++ * Gracemont microarchitectures. The March 2024 microcode adds RFDS_NO to ++ * some but not all unaffected parts, and RFDS_CLEAR to affected parts still ++ * in support. ++ * ++ * Alder Lake and Raptor Lake client CPUs have a mix of P cores ++ * (Golden/Raptor Cove, not vulnerable) and E cores (Gracemont, ++ * vulnerable), and both enumerate RFDS_CLEAR. ++ * ++ * Both exist in a Xeon SKU, which has the E cores (Gracemont) disabled by ++ * platform configuration, and enumerate RFDS_NO. ++ * ++ * With older parts, or with out-of-date microcode, synthesise RFDS_NO when ++ * safe to do so. ++ * ++ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/register-file-data-sampling.html ++ */ ++static void __init rfds_calculations(void) ++{ ++ /* RFDS is only known to affect Intel Family 6 processors at this time. 
*/ ++ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || ++ boot_cpu_data.x86 != 6 ) ++ return; ++ ++ /* ++ * If RFDS_NO or RFDS_CLEAR are visible, we've either got suitable ++ * microcode, or an RFDS-aware hypervisor is levelling us in a pool. ++ */ ++ if ( cpu_has_rfds_no || cpu_has_rfds_clear ) ++ return; ++ ++ /* If we're virtualised, don't attempt to synthesise RFDS_NO. */ ++ if ( cpu_has_hypervisor ) ++ return; ++ ++ /* ++ * Not all CPUs are expected to get a microcode update enumerating one of ++ * RFDS_{NO,CLEAR}, or we might have out-of-date microcode. ++ */ ++ switch ( boot_cpu_data.x86_model ) ++ { ++ case INTEL_FAM6_ALDERLAKE: ++ case INTEL_FAM6_RAPTORLAKE: ++ /* ++ * Alder Lake and Raptor Lake might be a client SKU (with the ++ * Gracemont cores active, and therefore vulnerable) or might be a ++ * server SKU (with the Gracemont cores disabled, and therefore not ++ * vulnerable). ++ * ++ * See if the CPU identifies as hybrid to distinguish the two cases. ++ */ ++ if ( !cpu_has_hybrid ) ++ break; ++ fallthrough; ++ case INTEL_FAM6_ALDERLAKE_L: ++ case INTEL_FAM6_RAPTORLAKE_P: ++ case INTEL_FAM6_RAPTORLAKE_S: ++ ++ case INTEL_FAM6_ATOM_GOLDMONT: /* Apollo Lake */ ++ case INTEL_FAM6_ATOM_GOLDMONT_D: /* Denverton */ ++ case INTEL_FAM6_ATOM_GOLDMONT_PLUS: /* Gemini Lake */ ++ case INTEL_FAM6_ATOM_TREMONT_D: /* Snow Ridge / Parker Ridge */ ++ case INTEL_FAM6_ATOM_TREMONT: /* Elkhart Lake */ ++ case INTEL_FAM6_ATOM_TREMONT_L: /* Jasper Lake */ ++ case INTEL_FAM6_ATOM_GRACEMONT: /* Alder Lake N */ ++ return; ++ } ++ ++ /* ++ * We appear to be on an unaffected CPU which didn't enumerate RFDS_NO, ++ * perhaps because of its age or because of out-of-date microcode. ++ * Synthesise it.
++ */ ++ setup_force_cpu_cap(X86_FEATURE_RFDS_NO); ++} ++ + static bool __init cpu_has_gds(void) + { + /* +@@ -1860,6 +1940,7 @@ void __init init_speculation_mitigations(void) + * + * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/intel-analysis-microarchitectural-data-sampling.html + * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/technical-documentation/processor-mmio-stale-data-vulnerabilities.html ++ * https://www.intel.com/content/www/us/en/developer/articles/technical/software-security-guidance/advisory-guidance/register-file-data-sampling.html + * + * Relevant ucodes: + * +@@ -1889,8 +1970,12 @@ void __init init_speculation_mitigations(void) + * + * If FB_CLEAR is enumerated, L1D_FLUSH does not have the same scrubbing + * side effects as VERW and cannot be used in its place. ++ * ++ * - March 2024, for RFDS. Enumerate RFDS_CLEAR to mean that VERW now ++ * scrubs non-architectural entries from certain register files. + */ + mds_calculations(); ++ rfds_calculations(); + + /* + * Parts which enumerate FB_CLEAR are those with now-updated microcode +@@ -1922,15 +2007,19 @@ void __init init_speculation_mitigations(void) + * MLPDS/MFBDS when SMT is enabled. + */ + if ( opt_verw_pv == -1 ) +- opt_verw_pv = cpu_has_useful_md_clear; ++ opt_verw_pv = cpu_has_useful_md_clear || cpu_has_rfds_clear; + + if ( opt_verw_hvm == -1 ) +- opt_verw_hvm = cpu_has_useful_md_clear; ++ opt_verw_hvm = cpu_has_useful_md_clear || cpu_has_rfds_clear; + + /* + * If SMT is active, and we're protecting against MDS or MMIO stale data, + * we need to scrub before going idle as well as on return to guest. + * Various pipeline resources are repartitioned amongst non-idle threads. ++ * ++ * We don't need to scrub on idle for RFDS.
There are no affected cores ++ * which support SMT, despite there being affected cores in hybrid systems ++ * which have SMT elsewhere in the platform. + */ + if ( ((cpu_has_useful_md_clear && (opt_verw_pv || opt_verw_hvm)) || + opt_verw_mmio) && hw_smt_enabled ) +@@ -1944,7 +2033,8 @@ void __init init_speculation_mitigations(void) + * It is only safe to use L1D_FLUSH in place of VERW when MD_CLEAR is the + * only *_CLEAR we can see. + */ +- if ( opt_l1d_flush && cpu_has_md_clear && !cpu_has_fb_clear ) ++ if ( opt_l1d_flush && cpu_has_md_clear && !cpu_has_fb_clear && ++ !cpu_has_rfds_clear ) + opt_verw_hvm = false; + + /* +diff --git a/xen/include/public/arch-x86/cpufeatureset.h b/xen/include/public/arch-x86/cpufeatureset.h +index 337aaa9c77..8e17ef670f 100644 +--- a/xen/include/public/arch-x86/cpufeatureset.h ++++ b/xen/include/public/arch-x86/cpufeatureset.h +@@ -266,6 +266,7 @@ XEN_CPUFEATURE(MD_CLEAR, 9*32+10) /*!A VERW clears microarchitectural buffe + XEN_CPUFEATURE(RTM_ALWAYS_ABORT, 9*32+11) /*! June 2021 TSX defeaturing in microcode. 
*/ + XEN_CPUFEATURE(TSX_FORCE_ABORT, 9*32+13) /* MSR_TSX_FORCE_ABORT.RTM_ABORT */ + XEN_CPUFEATURE(SERIALIZE, 9*32+14) /*A SERIALIZE insn */ ++XEN_CPUFEATURE(HYBRID, 9*32+15) /* Heterogeneous platform */ + XEN_CPUFEATURE(TSXLDTRK, 9*32+16) /*a TSX load tracking suspend/resume insns */ + XEN_CPUFEATURE(CET_IBT, 9*32+20) /* CET - Indirect Branch Tracking */ + XEN_CPUFEATURE(AVX512_FP16, 9*32+23) /*A AVX512 FP16 instructions */ +@@ -338,6 +339,8 @@ XEN_CPUFEATURE(OVRCLK_STATUS, 16*32+23) /* MSR_OVERCLOCKING_STATUS */ + XEN_CPUFEATURE(PBRSB_NO, 16*32+24) /*A No Post-Barrier RSB predictions */ + XEN_CPUFEATURE(GDS_CTRL, 16*32+25) /* MCU_OPT_CTRL.GDS_MIT_{DIS,LOCK} */ + XEN_CPUFEATURE(GDS_NO, 16*32+26) /*A No Gather Data Sampling */ ++XEN_CPUFEATURE(RFDS_NO, 16*32+27) /*A No Register File Data Sampling */ ++XEN_CPUFEATURE(RFDS_CLEAR, 16*32+28) /*!A Register File(s) cleared by VERW */ + + /* Intel-defined CPU features, MSR_ARCH_CAPS 0x10a.edx, word 17 */ + +-- +2.44.0 + + +From bdda600406e5f5c35bcb17b2f9458e2138d7ad46 Mon Sep 17 00:00:00 2001 +From: Andrew Cooper +Date: Fri, 2 Feb 2024 00:39:42 +0000 +Subject: [PATCH 64/70] xen: Swap order of actions in the FREE*() macros + +Wherever possible, it is a good idea to NULL out the visible reference to an +object prior to freeing it. The FREE*() macros already collect together both +parts, making it easy to adjust. + +This has a marginal code generation improvement, as some of the calls to the +free() function can be tailcall optimised. + +No functional change. 
+ +Signed-off-by: Andrew Cooper +Acked-by: Jan Beulich +(cherry picked from commit c4f427ec879e7c0df6d44d02561e8bee838a293e) +--- + xen/include/xen/mm.h | 3 ++- + xen/include/xen/xmalloc.h | 7 ++++--- + 2 files changed, 6 insertions(+), 4 deletions(-) + +diff --git a/xen/include/xen/mm.h b/xen/include/xen/mm.h +index 8b9618609f..8bc5f4249d 100644 +--- a/xen/include/xen/mm.h ++++ b/xen/include/xen/mm.h +@@ -91,8 +91,9 @@ bool scrub_free_pages(void); + + /* Free an allocation, and zero the pointer to it. */ + #define FREE_XENHEAP_PAGES(p, o) do { \ +- free_xenheap_pages(p, o); \ ++ void *_ptr_ = (p); \ + (p) = NULL; \ ++ free_xenheap_pages(_ptr_, o); \ + } while ( false ) + #define FREE_XENHEAP_PAGE(p) FREE_XENHEAP_PAGES(p, 0) + +diff --git a/xen/include/xen/xmalloc.h b/xen/include/xen/xmalloc.h +index 16979a117c..d857298011 100644 +--- a/xen/include/xen/xmalloc.h ++++ b/xen/include/xen/xmalloc.h +@@ -66,9 +66,10 @@ + extern void xfree(void *); + + /* Free an allocation, and zero the pointer to it. */ +-#define XFREE(p) do { \ +- xfree(p); \ +- (p) = NULL; \ ++#define XFREE(p) do { \ ++ void *_ptr_ = (p); \ ++ (p) = NULL; \ ++ xfree(_ptr_); \ + } while ( false ) + + /* Underlying functions */ +-- +2.44.0 + + +From 1932973ac9a8c28197ebb24749c73c18cf23f5f1 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 13 Feb 2024 13:08:05 +0100 +Subject: [PATCH 65/70] x86/spinlock: introduce support for blocking + speculation into critical regions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Introduce a new Kconfig option to block speculation into lock protected +critical regions. The Kconfig option is enabled by default, but the mitigation +won't be engaged unless it's explicitly enabled in the command line using +`spec-ctrl=lock-harden`. + +Convert the spinlock acquire macros into always-inline functions, and introduce +a speculation barrier after the lock has been taken. 
Note the speculation +barrier is not placed inside the implementation of the spin lock functions, so +as to prevent speculation from falling through the call to the lock functions +resulting in the barrier also being skipped. + +trylock variants are protected using a construct akin to the existing +evaluate_nospec(). + +This patch only implements the speculation barrier for x86. + +Note spin locks are the only locking primitive taken care of in this change, +further locking primitives will be adjusted by separate changes. + +This is part of XSA-453 / CVE-2024-2193 + +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +(cherry picked from commit 7ef0084418e188d05f338c3e028fbbe8b6924afa) +--- + docs/misc/xen-command-line.pandoc | 7 ++++- + xen/arch/x86/include/asm/cpufeatures.h | 2 +- + xen/arch/x86/include/asm/nospec.h | 26 ++++++++++++++++++ + xen/arch/x86/spec_ctrl.c | 26 +++++++++++++++--- + xen/common/Kconfig | 17 ++++++++++++ + xen/include/xen/nospec.h | 15 +++++++++++ + xen/include/xen/spinlock.h | 37 +++++++++++++++++++++----- + 7 files changed, 119 insertions(+), 11 deletions(-) + +diff --git a/docs/misc/xen-command-line.pandoc b/docs/misc/xen-command-line.pandoc +index fbf1683924..3f9f916718 100644 +--- a/docs/misc/xen-command-line.pandoc ++++ b/docs/misc/xen-command-line.pandoc +@@ -2373,7 +2373,7 @@ By default SSBD will be mitigated at runtime (i.e `ssbd=runtime`). + > {msr-sc,rsb,verw,ibpb-entry}=|{pv,hvm}=, + > bti-thunk=retpoline|lfence|jmp, {ibrs,ibpb,ssbd,psfd, + > eager-fpu,l1d-flush,branch-harden,srb-lock, +-> unpriv-mmio,gds-mit,div-scrub}= ]` ++> unpriv-mmio,gds-mit,div-scrub,lock-harden}= ]` + + Controls for speculative execution sidechannel mitigations. By default, Xen + will pick the most appropriate mitigations based on compiled in support, +@@ -2500,6 +2500,11 @@ On all hardware, the `div-scrub=` option can be used to force or prevent Xen + from mitigating the DIV-leakage vulnerability.
By default, Xen will mitigate + DIV-leakage on hardware believed to be vulnerable. + ++If Xen is compiled with `CONFIG_SPECULATIVE_HARDEN_LOCK`, the `lock-harden=` ++boolean can be used to force or prevent Xen from using speculation barriers to ++protect lock critical regions. This mitigation won't be engaged by default, ++and needs to be explicitly enabled on the command line. ++ + ### sync_console + > `= ` + +diff --git a/xen/arch/x86/include/asm/cpufeatures.h b/xen/arch/x86/include/asm/cpufeatures.h +index c3aad21c3b..7e8221fd85 100644 +--- a/xen/arch/x86/include/asm/cpufeatures.h ++++ b/xen/arch/x86/include/asm/cpufeatures.h +@@ -24,7 +24,7 @@ XEN_CPUFEATURE(APERFMPERF, X86_SYNTH( 8)) /* APERFMPERF */ + XEN_CPUFEATURE(MFENCE_RDTSC, X86_SYNTH( 9)) /* MFENCE synchronizes RDTSC */ + XEN_CPUFEATURE(XEN_SMEP, X86_SYNTH(10)) /* SMEP gets used by Xen itself */ + XEN_CPUFEATURE(XEN_SMAP, X86_SYNTH(11)) /* SMAP gets used by Xen itself */ +-/* Bit 12 unused. */ ++XEN_CPUFEATURE(SC_NO_LOCK_HARDEN, X86_SYNTH(12)) /* (Disable) Lock critical region hardening */ + XEN_CPUFEATURE(IND_THUNK_LFENCE, X86_SYNTH(13)) /* Use IND_THUNK_LFENCE */ + XEN_CPUFEATURE(IND_THUNK_JMP, X86_SYNTH(14)) /* Use IND_THUNK_JMP */ + XEN_CPUFEATURE(SC_NO_BRANCH_HARDEN, X86_SYNTH(15)) /* (Disable) Conditional branch hardening */ +diff --git a/xen/arch/x86/include/asm/nospec.h b/xen/arch/x86/include/asm/nospec.h +index 7150e76b87..0725839e19 100644 +--- a/xen/arch/x86/include/asm/nospec.h ++++ b/xen/arch/x86/include/asm/nospec.h +@@ -38,6 +38,32 @@ static always_inline void block_speculation(void) + barrier_nospec_true(); + } + ++static always_inline void arch_block_lock_speculation(void) ++{ ++ alternative("lfence", "", X86_FEATURE_SC_NO_LOCK_HARDEN); ++} ++ ++/* Allow to insert a read memory barrier into conditionals */ ++static always_inline bool barrier_lock_true(void) ++{ ++ alternative("lfence #nospec-true", "", X86_FEATURE_SC_NO_LOCK_HARDEN); ++ return true; ++} ++ ++static always_inline bool 
barrier_lock_false(void) ++{ ++ alternative("lfence #nospec-false", "", X86_FEATURE_SC_NO_LOCK_HARDEN); ++ return false; ++} ++ ++static always_inline bool arch_lock_evaluate_nospec(bool condition) ++{ ++ if ( condition ) ++ return barrier_lock_true(); ++ else ++ return barrier_lock_false(); ++} ++ + #endif /* _ASM_X86_NOSPEC_H */ + + /* +diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c +index 8165379fed..5dfc4ed69e 100644 +--- a/xen/arch/x86/spec_ctrl.c ++++ b/xen/arch/x86/spec_ctrl.c +@@ -53,6 +53,7 @@ int8_t __read_mostly opt_eager_fpu = -1; + int8_t __read_mostly opt_l1d_flush = -1; + static bool __initdata opt_branch_harden = + IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH); ++static bool __initdata opt_lock_harden; + + bool __initdata bsp_delay_spec_ctrl; + uint8_t __read_mostly default_xen_spec_ctrl; +@@ -121,6 +122,7 @@ static int __init cf_check parse_spec_ctrl(const char *s) + opt_ssbd = false; + opt_l1d_flush = 0; + opt_branch_harden = false; ++ opt_lock_harden = false; + opt_srb_lock = 0; + opt_unpriv_mmio = false; + opt_gds_mit = 0; +@@ -286,6 +288,16 @@ static int __init cf_check parse_spec_ctrl(const char *s) + rc = -EINVAL; + } + } ++ else if ( (val = parse_boolean("lock-harden", s, ss)) >= 0 ) ++ { ++ if ( IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_LOCK) ) ++ opt_lock_harden = val; ++ else ++ { ++ no_config_param("SPECULATIVE_HARDEN_LOCK", "spec-ctrl", s, ss); ++ rc = -EINVAL; ++ } ++ } + else if ( (val = parse_boolean("srb-lock", s, ss)) >= 0 ) + opt_srb_lock = val; + else if ( (val = parse_boolean("unpriv-mmio", s, ss)) >= 0 ) +@@ -488,7 +500,8 @@ static void __init print_details(enum ind_thunk thunk) + if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) || + IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_ARRAY) || + IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_BRANCH) || +- IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS) ) ++ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS) || ++ IS_ENABLED(CONFIG_SPECULATIVE_HARDEN_LOCK) ) + 
printk(" Compiled-in support:" + #ifdef CONFIG_INDIRECT_THUNK + " INDIRECT_THUNK" +@@ -504,11 +517,14 @@ static void __init print_details(enum ind_thunk thunk) + #endif + #ifdef CONFIG_SPECULATIVE_HARDEN_GUEST_ACCESS + " HARDEN_GUEST_ACCESS" ++#endif ++#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK ++ " HARDEN_LOCK" + #endif + "\n"); + + /* Settings for Xen's protection, irrespective of guests. */ +- printk(" Xen settings: %s%sSPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s\n", ++ printk(" Xen settings: %s%sSPEC_CTRL: %s%s%s%s%s, Other:%s%s%s%s%s%s%s\n", + thunk != THUNK_NONE ? "BTI-Thunk: " : "", + thunk == THUNK_NONE ? "" : + thunk == THUNK_RETPOLINE ? "RETPOLINE, " : +@@ -535,7 +551,8 @@ static void __init print_details(enum ind_thunk thunk) + opt_verw_pv || opt_verw_hvm || + opt_verw_mmio ? " VERW" : "", + opt_div_scrub ? " DIV" : "", +- opt_branch_harden ? " BRANCH_HARDEN" : ""); ++ opt_branch_harden ? " BRANCH_HARDEN" : "", ++ opt_lock_harden ? " LOCK_HARDEN" : ""); + + /* L1TF diagnostics, printed if vulnerable or PV shadowing is in use. */ + if ( cpu_has_bug_l1tf || opt_pv_l1tf_hwdom || opt_pv_l1tf_domu ) +@@ -1918,6 +1935,9 @@ void __init init_speculation_mitigations(void) + if ( !opt_branch_harden ) + setup_force_cpu_cap(X86_FEATURE_SC_NO_BRANCH_HARDEN); + ++ if ( !opt_lock_harden ) ++ setup_force_cpu_cap(X86_FEATURE_SC_NO_LOCK_HARDEN); ++ + /* + * We do not disable HT by default on affected hardware. + * +diff --git a/xen/common/Kconfig b/xen/common/Kconfig +index 4d6fe05164..3361a6d892 100644 +--- a/xen/common/Kconfig ++++ b/xen/common/Kconfig +@@ -188,6 +188,23 @@ config SPECULATIVE_HARDEN_GUEST_ACCESS + + If unsure, say Y. + ++config SPECULATIVE_HARDEN_LOCK ++ bool "Speculative lock context hardening" ++ default y ++ depends on X86 ++ help ++ Contemporary processors may use speculative execution as a ++ performance optimisation, but this can potentially be abused by an ++ attacker to leak data via speculative sidechannels. 
++ ++ One source of data leakage is via speculative accesses to lock ++ critical regions. ++ ++ This option is disabled by default at run time, and needs to be ++ enabled on the command line. ++ ++ If unsure, say Y. ++ + endmenu + + config DIT_DEFAULT +diff --git a/xen/include/xen/nospec.h b/xen/include/xen/nospec.h +index 76255bc46e..4552846403 100644 +--- a/xen/include/xen/nospec.h ++++ b/xen/include/xen/nospec.h +@@ -70,6 +70,21 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, + #define array_access_nospec(array, index) \ + (array)[array_index_nospec(index, ARRAY_SIZE(array))] + ++static always_inline void block_lock_speculation(void) ++{ ++#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK ++ arch_block_lock_speculation(); ++#endif ++} ++ ++static always_inline bool lock_evaluate_nospec(bool condition) ++{ ++#ifdef CONFIG_SPECULATIVE_HARDEN_LOCK ++ return arch_lock_evaluate_nospec(condition); ++#endif ++ return condition; ++} ++ + #endif /* XEN_NOSPEC_H */ + + /* +diff --git a/xen/include/xen/spinlock.h b/xen/include/xen/spinlock.h +index e7a1c1aa89..28fce5615e 100644 +--- a/xen/include/xen/spinlock.h ++++ b/xen/include/xen/spinlock.h +@@ -1,6 +1,7 @@ + #ifndef __SPINLOCK_H__ + #define __SPINLOCK_H__ + ++#include + #include + #include + +@@ -195,13 +196,30 @@ int _spin_trylock_recursive(spinlock_t *lock); + void _spin_lock_recursive(spinlock_t *lock); + void _spin_unlock_recursive(spinlock_t *lock); + +-#define spin_lock(l) _spin_lock(l) +-#define spin_lock_cb(l, c, d) _spin_lock_cb(l, c, d) +-#define spin_lock_irq(l) _spin_lock_irq(l) ++static always_inline void spin_lock(spinlock_t *l) ++{ ++ _spin_lock(l); ++ block_lock_speculation(); ++} ++ ++static always_inline void spin_lock_cb(spinlock_t *l, void (*c)(void *data), ++ void *d) ++{ ++ _spin_lock_cb(l, c, d); ++ block_lock_speculation(); ++} ++ ++static always_inline void spin_lock_irq(spinlock_t *l) ++{ ++ _spin_lock_irq(l); ++ block_lock_speculation(); ++} ++ + #define spin_lock_irqsave(l, 
f) \ + ({ \ + BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \ + ((f) = _spin_lock_irqsave(l)); \ ++ block_lock_speculation(); \ + }) + + #define spin_unlock(l) _spin_unlock(l) +@@ -209,7 +227,7 @@ void _spin_unlock_recursive(spinlock_t *lock); + #define spin_unlock_irqrestore(l, f) _spin_unlock_irqrestore(l, f) + + #define spin_is_locked(l) _spin_is_locked(l) +-#define spin_trylock(l) _spin_trylock(l) ++#define spin_trylock(l) lock_evaluate_nospec(_spin_trylock(l)) + + #define spin_trylock_irqsave(lock, flags) \ + ({ \ +@@ -230,8 +248,15 @@ void _spin_unlock_recursive(spinlock_t *lock); + * are any critical regions that cannot form part of such a set, they can use + * standard spin_[un]lock(). + */ +-#define spin_trylock_recursive(l) _spin_trylock_recursive(l) +-#define spin_lock_recursive(l) _spin_lock_recursive(l) ++#define spin_trylock_recursive(l) \ ++ lock_evaluate_nospec(_spin_trylock_recursive(l)) ++ ++static always_inline void spin_lock_recursive(spinlock_t *l) ++{ ++ _spin_lock_recursive(l); ++ block_lock_speculation(); ++} ++ + #define spin_unlock_recursive(l) _spin_unlock_recursive(l) + + #endif /* __SPINLOCK_H__ */ +-- +2.44.0 + + +From e7f0f11c888757e62940ded87b4ab5ebc992764f Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 13 Feb 2024 16:08:52 +0100 +Subject: [PATCH 66/70] rwlock: introduce support for blocking speculation into + critical regions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Introduce inline wrappers as required and add direct calls to +block_lock_speculation() in order to prevent speculation into the rwlock +protected critical regions. + +Note the rwlock primitives are adjusted to use the non speculation safe variants +of the spinlock handlers, as a speculation barrier is added in the rwlock +calling wrappers. + +trylock variants are protected by using lock_evaluate_nospec(). 
+ +This is part of XSA-453 / CVE-2024-2193 + +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +(cherry picked from commit a1fb15f61692b1fa9945fc51f55471ace49cdd59) +--- + xen/common/rwlock.c | 14 +++++++++++--- + xen/include/xen/rwlock.h | 34 ++++++++++++++++++++++++++++------ + 2 files changed, 39 insertions(+), 9 deletions(-) + +diff --git a/xen/common/rwlock.c b/xen/common/rwlock.c +index 18224a4bb5..290602936d 100644 +--- a/xen/common/rwlock.c ++++ b/xen/common/rwlock.c +@@ -34,8 +34,11 @@ void queue_read_lock_slowpath(rwlock_t *lock) + + /* + * Put the reader into the wait queue. ++ * ++ * Use the speculation unsafe helper, as it's the caller responsibility to ++ * issue a speculation barrier if required. + */ +- spin_lock(&lock->lock); ++ _spin_lock(&lock->lock); + + /* + * At the head of the wait queue now, wait until the writer state +@@ -66,8 +69,13 @@ void queue_write_lock_slowpath(rwlock_t *lock) + { + u32 cnts; + +- /* Put the writer into the wait queue. */ +- spin_lock(&lock->lock); ++ /* ++ * Put the writer into the wait queue. ++ * ++ * Use the speculation unsafe helper, as it's the caller responsibility to ++ * issue a speculation barrier if required. ++ */ ++ _spin_lock(&lock->lock); + + /* Try to acquire the lock directly if no reader is present. 
*/ + if ( !atomic_read(&lock->cnts) && +diff --git a/xen/include/xen/rwlock.h b/xen/include/xen/rwlock.h +index e0d2b41c5c..9a0d3ec238 100644 +--- a/xen/include/xen/rwlock.h ++++ b/xen/include/xen/rwlock.h +@@ -259,27 +259,49 @@ static inline int _rw_is_write_locked(const rwlock_t *lock) + return (atomic_read(&lock->cnts) & _QW_WMASK) == _QW_LOCKED; + } + +-#define read_lock(l) _read_lock(l) +-#define read_lock_irq(l) _read_lock_irq(l) ++static always_inline void read_lock(rwlock_t *l) ++{ ++ _read_lock(l); ++ block_lock_speculation(); ++} ++ ++static always_inline void read_lock_irq(rwlock_t *l) ++{ ++ _read_lock_irq(l); ++ block_lock_speculation(); ++} ++ + #define read_lock_irqsave(l, f) \ + ({ \ + BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \ + ((f) = _read_lock_irqsave(l)); \ ++ block_lock_speculation(); \ + }) + + #define read_unlock(l) _read_unlock(l) + #define read_unlock_irq(l) _read_unlock_irq(l) + #define read_unlock_irqrestore(l, f) _read_unlock_irqrestore(l, f) +-#define read_trylock(l) _read_trylock(l) ++#define read_trylock(l) lock_evaluate_nospec(_read_trylock(l)) ++ ++static always_inline void write_lock(rwlock_t *l) ++{ ++ _write_lock(l); ++ block_lock_speculation(); ++} ++ ++static always_inline void write_lock_irq(rwlock_t *l) ++{ ++ _write_lock_irq(l); ++ block_lock_speculation(); ++} + +-#define write_lock(l) _write_lock(l) +-#define write_lock_irq(l) _write_lock_irq(l) + #define write_lock_irqsave(l, f) \ + ({ \ + BUILD_BUG_ON(sizeof(f) != sizeof(unsigned long)); \ + ((f) = _write_lock_irqsave(l)); \ ++ block_lock_speculation(); \ + }) +-#define write_trylock(l) _write_trylock(l) ++#define write_trylock(l) lock_evaluate_nospec(_write_trylock(l)) + + #define write_unlock(l) _write_unlock(l) + #define write_unlock_irq(l) _write_unlock_irq(l) +-- +2.44.0 + + +From 5a13c81542a163718d7cb9b150b0282b7855efde Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Tue, 13 Feb 2024 17:57:38 +0100 +Subject: [PATCH 67/70] 
percpu-rwlock: introduce support for blocking + speculation into critical regions +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Add direct calls to block_lock_speculation() where required in order to prevent +speculation into the lock protected critical regions. Also convert +_percpu_read_lock() from inline to always_inline. + +Note that _percpu_write_lock() has been modified the use the non speculation +safe of the locking primites, as a speculation is added unconditionally by the +calling wrapper. + +This is part of XSA-453 / CVE-2024-2193 + +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +(cherry picked from commit f218daf6d3a3b847736d37c6a6b76031a0d08441) +--- + xen/common/rwlock.c | 6 +++++- + xen/include/xen/rwlock.h | 14 ++++++++++---- + 2 files changed, 15 insertions(+), 5 deletions(-) + +diff --git a/xen/common/rwlock.c b/xen/common/rwlock.c +index 290602936d..f5a249bcc2 100644 +--- a/xen/common/rwlock.c ++++ b/xen/common/rwlock.c +@@ -129,8 +129,12 @@ void _percpu_write_lock(percpu_rwlock_t **per_cpudata, + /* + * First take the write lock to protect against other writers or slow + * path readers. ++ * ++ * Note we use the speculation unsafe variant of write_lock(), as the ++ * calling wrapper already adds a speculation barrier after the lock has ++ * been taken. + */ +- write_lock(&percpu_rwlock->rwlock); ++ _write_lock(&percpu_rwlock->rwlock); + + /* Now set the global variable so that readers start using read_lock. 
*/ + percpu_rwlock->writer_activating = 1; +diff --git a/xen/include/xen/rwlock.h b/xen/include/xen/rwlock.h +index 9a0d3ec238..9e35ee2edf 100644 +--- a/xen/include/xen/rwlock.h ++++ b/xen/include/xen/rwlock.h +@@ -338,8 +338,8 @@ static inline void _percpu_rwlock_owner_check(percpu_rwlock_t **per_cpudata, + #define percpu_rwlock_resource_init(l, owner) \ + (*(l) = (percpu_rwlock_t)PERCPU_RW_LOCK_UNLOCKED(&get_per_cpu_var(owner))) + +-static inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata, +- percpu_rwlock_t *percpu_rwlock) ++static always_inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata, ++ percpu_rwlock_t *percpu_rwlock) + { + /* Validate the correct per_cpudata variable has been provided. */ + _percpu_rwlock_owner_check(per_cpudata, percpu_rwlock); +@@ -374,6 +374,8 @@ static inline void _percpu_read_lock(percpu_rwlock_t **per_cpudata, + } + else + { ++ /* Other branch already has a speculation barrier in read_lock(). */ ++ block_lock_speculation(); + /* All other paths have implicit check_lock() calls via read_lock(). 
*/ + check_lock(&percpu_rwlock->rwlock.lock.debug, false); + } +@@ -430,8 +432,12 @@ static inline void _percpu_write_unlock(percpu_rwlock_t **per_cpudata, + _percpu_read_lock(&get_per_cpu_var(percpu), lock) + #define percpu_read_unlock(percpu, lock) \ + _percpu_read_unlock(&get_per_cpu_var(percpu), lock) +-#define percpu_write_lock(percpu, lock) \ +- _percpu_write_lock(&get_per_cpu_var(percpu), lock) ++ ++#define percpu_write_lock(percpu, lock) \ ++({ \ ++ _percpu_write_lock(&get_per_cpu_var(percpu), lock); \ ++ block_lock_speculation(); \ ++}) + #define percpu_write_unlock(percpu, lock) \ + _percpu_write_unlock(&get_per_cpu_var(percpu), lock) + +-- +2.44.0 + + +From 9de8a52b0e09a2491736abbd4a865a06ac2ced7a Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Mon, 4 Mar 2024 14:29:36 +0100 +Subject: [PATCH 68/70] locking: attempt to ensure lock wrappers are always + inline +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +In order to prevent the locking speculation barriers from being inside of +`call`ed functions that could be speculatively bypassed. + +While there also add an extra locking barrier to _mm_write_lock() in the branch +taken when the lock is already held. + +Note some functions are switched to use the unsafe variants (without speculation +barrier) of the locking primitives, but a speculation barrier is always added +to the exposed public lock wrapping helper. That's the case with +sched_spin_lock_double() or pcidevs_lock() for example. 
+ +This is part of XSA-453 / CVE-2024-2193 + +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +(cherry picked from commit 197ecd838a2aaf959a469df3696d4559c4f8b762) +--- + xen/arch/x86/hvm/vpt.c | 10 +++++++--- + xen/arch/x86/include/asm/irq.h | 1 + + xen/arch/x86/mm/mm-locks.h | 28 +++++++++++++++------------- + xen/arch/x86/mm/p2m-pod.c | 2 +- + xen/common/event_channel.c | 5 +++-- + xen/common/grant_table.c | 6 +++--- + xen/common/sched/core.c | 19 ++++++++++++------- + xen/common/sched/private.h | 26 ++++++++++++++++++++++++-- + xen/common/timer.c | 8 +++++--- + xen/drivers/passthrough/pci.c | 5 +++-- + xen/include/xen/event.h | 4 ++-- + xen/include/xen/pci.h | 8 ++++++-- + 12 files changed, 82 insertions(+), 40 deletions(-) + +diff --git a/xen/arch/x86/hvm/vpt.c b/xen/arch/x86/hvm/vpt.c +index 8f53e88d67..e1d6845a28 100644 +--- a/xen/arch/x86/hvm/vpt.c ++++ b/xen/arch/x86/hvm/vpt.c +@@ -150,7 +150,7 @@ static int pt_irq_masked(struct periodic_time *pt) + * pt->vcpu field, because another thread holding the pt_migrate lock + * may already be spinning waiting for your vcpu lock. + */ +-static void pt_vcpu_lock(struct vcpu *v) ++static always_inline void pt_vcpu_lock(struct vcpu *v) + { + spin_lock(&v->arch.hvm.tm_lock); + } +@@ -169,9 +169,13 @@ static void pt_vcpu_unlock(struct vcpu *v) + * need to take an additional lock that protects against pt->vcpu + * changing. + */ +-static void pt_lock(struct periodic_time *pt) ++static always_inline void pt_lock(struct periodic_time *pt) + { +- read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); ++ /* ++ * Use the speculation unsafe variant for the first lock, as the following ++ * lock taking helper already includes a speculation barrier. 
++ */ ++ _read_lock(&pt->vcpu->domain->arch.hvm.pl_time->pt_migrate); + spin_lock(&pt->vcpu->arch.hvm.tm_lock); + } + +diff --git a/xen/arch/x86/include/asm/irq.h b/xen/arch/x86/include/asm/irq.h +index a87af47ece..465ab39bb0 100644 +--- a/xen/arch/x86/include/asm/irq.h ++++ b/xen/arch/x86/include/asm/irq.h +@@ -174,6 +174,7 @@ void cf_check irq_complete_move(struct irq_desc *desc); + + extern struct irq_desc *irq_desc; + ++/* Not speculation safe, only used for AP bringup. */ + void lock_vector_lock(void); + void unlock_vector_lock(void); + +diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h +index 5a3f96fbaa..5ec080c02f 100644 +--- a/xen/arch/x86/mm/mm-locks.h ++++ b/xen/arch/x86/mm/mm-locks.h +@@ -74,8 +74,8 @@ static inline void _set_lock_level(int l) + this_cpu(mm_lock_level) = l; + } + +-static inline void _mm_lock(const struct domain *d, mm_lock_t *l, +- const char *func, int level, int rec) ++static always_inline void _mm_lock(const struct domain *d, mm_lock_t *l, ++ const char *func, int level, int rec) + { + if ( !((mm_locked_by_me(l)) && rec) ) + _check_lock_level(d, level); +@@ -125,8 +125,8 @@ static inline int mm_write_locked_by_me(mm_rwlock_t *l) + return (l->locker == get_processor_id()); + } + +-static inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l, +- const char *func, int level) ++static always_inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l, ++ const char *func, int level) + { + if ( !mm_write_locked_by_me(l) ) + { +@@ -137,6 +137,8 @@ static inline void _mm_write_lock(const struct domain *d, mm_rwlock_t *l, + l->unlock_level = _get_lock_level(); + _set_lock_level(_lock_level(d, level)); + } ++ else ++ block_speculation(); + l->recurse_count++; + } + +@@ -150,8 +152,8 @@ static inline void mm_write_unlock(mm_rwlock_t *l) + percpu_write_unlock(p2m_percpu_rwlock, &l->lock); + } + +-static inline void _mm_read_lock(const struct domain *d, mm_rwlock_t *l, +- int level) ++static always_inline 
void _mm_read_lock(const struct domain *d, mm_rwlock_t *l, ++ int level) + { + _check_lock_level(d, level); + percpu_read_lock(p2m_percpu_rwlock, &l->lock); +@@ -166,15 +168,15 @@ static inline void mm_read_unlock(mm_rwlock_t *l) + + /* This wrapper uses the line number to express the locking order below */ + #define declare_mm_lock(name) \ +- static inline void mm_lock_##name(const struct domain *d, mm_lock_t *l, \ +- const char *func, int rec) \ ++ static always_inline void mm_lock_##name( \ ++ const struct domain *d, mm_lock_t *l, const char *func, int rec) \ + { _mm_lock(d, l, func, MM_LOCK_ORDER_##name, rec); } + #define declare_mm_rwlock(name) \ +- static inline void mm_write_lock_##name(const struct domain *d, \ +- mm_rwlock_t *l, const char *func) \ ++ static always_inline void mm_write_lock_##name( \ ++ const struct domain *d, mm_rwlock_t *l, const char *func) \ + { _mm_write_lock(d, l, func, MM_LOCK_ORDER_##name); } \ +- static inline void mm_read_lock_##name(const struct domain *d, \ +- mm_rwlock_t *l) \ ++ static always_inline void mm_read_lock_##name(const struct domain *d, \ ++ mm_rwlock_t *l) \ + { _mm_read_lock(d, l, MM_LOCK_ORDER_##name); } + /* These capture the name of the calling function */ + #define mm_lock(name, d, l) mm_lock_##name(d, l, __func__, 0) +@@ -309,7 +311,7 @@ declare_mm_lock(altp2mlist) + #define MM_LOCK_ORDER_altp2m 40 + declare_mm_rwlock(altp2m); + +-static inline void p2m_lock(struct p2m_domain *p) ++static always_inline void p2m_lock(struct p2m_domain *p) + { + if ( p2m_is_altp2m(p) ) + mm_write_lock(altp2m, p->domain, &p->lock); +diff --git a/xen/arch/x86/mm/p2m-pod.c b/xen/arch/x86/mm/p2m-pod.c +index 9969eb45fa..9be67b63ce 100644 +--- a/xen/arch/x86/mm/p2m-pod.c ++++ b/xen/arch/x86/mm/p2m-pod.c +@@ -24,7 +24,7 @@ + #define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0) + + /* Enforce lock ordering when grabbing the "external" page_alloc lock */ +-static inline void lock_page_alloc(struct p2m_domain *p2m) ++static 
always_inline void lock_page_alloc(struct p2m_domain *p2m) + { + page_alloc_mm_pre_lock(p2m->domain); + spin_lock(&(p2m->domain->page_alloc_lock)); +diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c +index a7a004a084..66f924a7b0 100644 +--- a/xen/common/event_channel.c ++++ b/xen/common/event_channel.c +@@ -45,7 +45,7 @@ + * just assume the event channel is free or unbound at the moment when the + * evtchn_read_trylock() returns false. + */ +-static inline void evtchn_write_lock(struct evtchn *evtchn) ++static always_inline void evtchn_write_lock(struct evtchn *evtchn) + { + write_lock(&evtchn->lock); + +@@ -351,7 +351,8 @@ int evtchn_alloc_unbound(evtchn_alloc_unbound_t *alloc, evtchn_port_t port) + return rc; + } + +-static void double_evtchn_lock(struct evtchn *lchn, struct evtchn *rchn) ++static always_inline void double_evtchn_lock(struct evtchn *lchn, ++ struct evtchn *rchn) + { + ASSERT(lchn != rchn); + +diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c +index 89b7811c51..934924cbda 100644 +--- a/xen/common/grant_table.c ++++ b/xen/common/grant_table.c +@@ -403,7 +403,7 @@ static inline void act_set_gfn(struct active_grant_entry *act, gfn_t gfn) + + static DEFINE_PERCPU_RWLOCK_GLOBAL(grant_rwlock); + +-static inline void grant_read_lock(struct grant_table *gt) ++static always_inline void grant_read_lock(struct grant_table *gt) + { + percpu_read_lock(grant_rwlock, >->lock); + } +@@ -413,7 +413,7 @@ static inline void grant_read_unlock(struct grant_table *gt) + percpu_read_unlock(grant_rwlock, >->lock); + } + +-static inline void grant_write_lock(struct grant_table *gt) ++static always_inline void grant_write_lock(struct grant_table *gt) + { + percpu_write_lock(grant_rwlock, >->lock); + } +@@ -450,7 +450,7 @@ nr_active_grant_frames(struct grant_table *gt) + return num_act_frames_from_sha_frames(nr_grant_frames(gt)); + } + +-static inline struct active_grant_entry * ++static always_inline struct active_grant_entry * + 
active_entry_acquire(struct grant_table *t, grant_ref_t e) + { + struct active_grant_entry *act; +diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c +index 901782bbb4..34ad39b9ad 100644 +--- a/xen/common/sched/core.c ++++ b/xen/common/sched/core.c +@@ -348,23 +348,28 @@ uint64_t get_cpu_idle_time(unsigned int cpu) + * This avoids dead- or live-locks when this code is running on both + * cpus at the same time. + */ +-static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2, +- unsigned long *flags) ++static always_inline void sched_spin_lock_double( ++ spinlock_t *lock1, spinlock_t *lock2, unsigned long *flags) + { ++ /* ++ * In order to avoid extra overhead, use the locking primitives without the ++ * speculation barrier, and introduce a single barrier here. ++ */ + if ( lock1 == lock2 ) + { +- spin_lock_irqsave(lock1, *flags); ++ *flags = _spin_lock_irqsave(lock1); + } + else if ( lock1 < lock2 ) + { +- spin_lock_irqsave(lock1, *flags); +- spin_lock(lock2); ++ *flags = _spin_lock_irqsave(lock1); ++ _spin_lock(lock2); + } + else + { +- spin_lock_irqsave(lock2, *flags); +- spin_lock(lock1); ++ *flags = _spin_lock_irqsave(lock2); ++ _spin_lock(lock1); + } ++ block_lock_speculation(); + } + + static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2, +diff --git a/xen/common/sched/private.h b/xen/common/sched/private.h +index c516976c37..3b97f15767 100644 +--- a/xen/common/sched/private.h ++++ b/xen/common/sched/private.h +@@ -207,8 +207,24 @@ DECLARE_PER_CPU(cpumask_t, cpumask_scratch); + #define cpumask_scratch (&this_cpu(cpumask_scratch)) + #define cpumask_scratch_cpu(c) (&per_cpu(cpumask_scratch, c)) + ++/* ++ * Deal with _spin_lock_irqsave() returning the flags value instead of storing ++ * it in a passed parameter. 
++ */ ++#define _sched_spinlock0(lock, irq) _spin_lock##irq(lock) ++#define _sched_spinlock1(lock, irq, arg) ({ \ ++ BUILD_BUG_ON(sizeof(arg) != sizeof(unsigned long)); \ ++ (arg) = _spin_lock##irq(lock); \ ++}) ++ ++#define _sched_spinlock__(nr) _sched_spinlock ## nr ++#define _sched_spinlock_(nr) _sched_spinlock__(nr) ++#define _sched_spinlock(lock, irq, args...) \ ++ _sched_spinlock_(count_args(args))(lock, irq, ## args) ++ + #define sched_lock(kind, param, cpu, irq, arg...) \ +-static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \ ++static always_inline spinlock_t \ ++*kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \ + { \ + for ( ; ; ) \ + { \ +@@ -220,10 +236,16 @@ static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \ + * \ + * It may also be the case that v->processor may change but the \ + * lock may be the same; this will succeed in that case. \ ++ * \ ++ * Use the speculation unsafe locking helper, there's a speculation \ ++ * barrier before returning to the caller. \ + */ \ +- spin_lock##irq(lock, ## arg); \ ++ _sched_spinlock(lock, irq, ## arg); \ + if ( likely(lock == get_sched_res(cpu)->schedule_lock) ) \ ++ { \ ++ block_lock_speculation(); \ + return lock; \ ++ } \ + spin_unlock##irq(lock, ## arg); \ + } \ + } +diff --git a/xen/common/timer.c b/xen/common/timer.c +index 0fddfa7487..38eb5fd20d 100644 +--- a/xen/common/timer.c ++++ b/xen/common/timer.c +@@ -239,7 +239,7 @@ static inline void deactivate_timer(struct timer *timer) + list_add(&timer->inactive, &per_cpu(timers, timer->cpu).inactive); + } + +-static inline bool_t timer_lock(struct timer *timer) ++static inline bool_t timer_lock_unsafe(struct timer *timer) + { + unsigned int cpu; + +@@ -253,7 +253,8 @@ static inline bool_t timer_lock(struct timer *timer) + rcu_read_unlock(&timer_cpu_read_lock); + return 0; + } +- spin_lock(&per_cpu(timers, cpu).lock); ++ /* Use the speculation unsafe variant, the wrapper has the barrier. 
*/ ++ _spin_lock(&per_cpu(timers, cpu).lock); + if ( likely(timer->cpu == cpu) ) + break; + spin_unlock(&per_cpu(timers, cpu).lock); +@@ -266,8 +267,9 @@ static inline bool_t timer_lock(struct timer *timer) + #define timer_lock_irqsave(t, flags) ({ \ + bool_t __x; \ + local_irq_save(flags); \ +- if ( !(__x = timer_lock(t)) ) \ ++ if ( !(__x = timer_lock_unsafe(t)) ) \ + local_irq_restore(flags); \ ++ block_lock_speculation(); \ + __x; \ + }) + +diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c +index e99837b6e1..2a1e7ee89a 100644 +--- a/xen/drivers/passthrough/pci.c ++++ b/xen/drivers/passthrough/pci.c +@@ -52,9 +52,10 @@ struct pci_seg { + + static spinlock_t _pcidevs_lock = SPIN_LOCK_UNLOCKED; + +-void pcidevs_lock(void) ++/* Do not use, as it has no speculation barrier, use pcidevs_lock() instead. */ ++void pcidevs_lock_unsafe(void) + { +- spin_lock_recursive(&_pcidevs_lock); ++ _spin_lock_recursive(&_pcidevs_lock); + } + + void pcidevs_unlock(void) +diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h +index 8e509e0784..f1472ea1eb 100644 +--- a/xen/include/xen/event.h ++++ b/xen/include/xen/event.h +@@ -114,12 +114,12 @@ void notify_via_xen_event_channel(struct domain *ld, int lport); + #define bucket_from_port(d, p) \ + ((group_from_port(d, p))[((p) % EVTCHNS_PER_GROUP) / EVTCHNS_PER_BUCKET]) + +-static inline void evtchn_read_lock(struct evtchn *evtchn) ++static always_inline void evtchn_read_lock(struct evtchn *evtchn) + { + read_lock(&evtchn->lock); + } + +-static inline bool evtchn_read_trylock(struct evtchn *evtchn) ++static always_inline bool evtchn_read_trylock(struct evtchn *evtchn) + { + return read_trylock(&evtchn->lock); + } +diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h +index 251b8761a8..a71bed36be 100644 +--- a/xen/include/xen/pci.h ++++ b/xen/include/xen/pci.h +@@ -155,8 +155,12 @@ struct pci_dev { + * devices, it also sync the access to the msi capability that is not + * interrupt handling related 
(the mask bit register). + */ +- +-void pcidevs_lock(void); ++void pcidevs_lock_unsafe(void); ++static always_inline void pcidevs_lock(void) ++{ ++ pcidevs_lock_unsafe(); ++ block_lock_speculation(); ++} + void pcidevs_unlock(void); + bool __must_check pcidevs_locked(void); + +-- +2.44.0 + + +From e107a8ece71ec4e1bb0092d5beea6cb16a96f7ae Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Mon, 4 Mar 2024 18:08:48 +0100 +Subject: [PATCH 69/70] x86/mm: add speculation barriers to open coded locks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Add a speculation barrier to the clearly identified open-coded lock taking +functions. + +Note that the memory sharing page_lock() replacement (_page_lock()) is left +as-is, as the code is experimental and not security supported. + +This is part of XSA-453 / CVE-2024-2193 + +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +(cherry picked from commit 42a572a38e22a97d86a4b648a22597628d5b42e4) +--- + xen/arch/x86/include/asm/mm.h | 4 +++- + xen/arch/x86/mm.c | 6 ++++-- + 2 files changed, 7 insertions(+), 3 deletions(-) + +diff --git a/xen/arch/x86/include/asm/mm.h b/xen/arch/x86/include/asm/mm.h +index 05dfe35502..d1b1fee99b 100644 +--- a/xen/arch/x86/include/asm/mm.h ++++ b/xen/arch/x86/include/asm/mm.h +@@ -399,7 +399,9 @@ const struct platform_bad_page *get_platform_badpages(unsigned int *array_size); + * The use of PGT_locked in mem_sharing does not collide, since mem_sharing is + * only supported for hvm guests, which do not have PV PTEs updated. 
+ */ +-int page_lock(struct page_info *page); ++int page_lock_unsafe(struct page_info *page); ++#define page_lock(pg) lock_evaluate_nospec(page_lock_unsafe(pg)) ++ + void page_unlock(struct page_info *page); + + void put_page_type(struct page_info *page); +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index ab0acbfea6..000fd0fb55 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -2017,7 +2017,7 @@ static inline bool current_locked_page_ne_check(struct page_info *page) { + #define current_locked_page_ne_check(x) true + #endif + +-int page_lock(struct page_info *page) ++int page_lock_unsafe(struct page_info *page) + { + unsigned long x, nx; + +@@ -2078,7 +2078,7 @@ void page_unlock(struct page_info *page) + * l3t_lock(), so to avoid deadlock we must avoid grabbing them in + * reverse order. + */ +-static void l3t_lock(struct page_info *page) ++static always_inline void l3t_lock(struct page_info *page) + { + unsigned long x, nx; + +@@ -2087,6 +2087,8 @@ static void l3t_lock(struct page_info *page) + cpu_relax(); + nx = x | PGT_locked; + } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x ); ++ ++ block_lock_speculation(); + } + + static void l3t_unlock(struct page_info *page) +-- +2.44.0 + + +From 4da8ca9cb9cfdb92c9dd09d5270ae16a3b2dbc89 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Roger=20Pau=20Monn=C3=A9?= +Date: Mon, 4 Mar 2024 16:24:21 +0100 +Subject: [PATCH 70/70] x86: protect conditional lock taking from speculative + execution +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Conditionally taken locks that use the pattern: + +if ( lock ) + spin_lock(...); + +Need an else branch in order to issue an speculation barrier in the else case, +just like it's done in case the lock needs to be acquired. + +eval_nospec() could be used on the condition itself, but that would result in a +double barrier on the branch where the lock is taken. 
+ +Introduce a new pair of helpers, {gfn,spin}_lock_if() that can be used to +conditionally take a lock in a speculation safe way. + +This is part of XSA-453 / CVE-2024-2193 + +Signed-off-by: Roger Pau Monné +Reviewed-by: Jan Beulich +(cherry picked from commit 03cf7ca23e0e876075954c558485b267b7d02406) +--- + xen/arch/x86/mm.c | 35 +++++++++++++---------------------- + xen/arch/x86/mm/mm-locks.h | 9 +++++++++ + xen/arch/x86/mm/p2m.c | 5 ++--- + xen/include/xen/spinlock.h | 8 ++++++++ + 4 files changed, 32 insertions(+), 25 deletions(-) + +diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c +index 000fd0fb55..45bfbc2522 100644 +--- a/xen/arch/x86/mm.c ++++ b/xen/arch/x86/mm.c +@@ -5007,8 +5007,7 @@ static l3_pgentry_t *virt_to_xen_l3e(unsigned long v) + if ( !l3t ) + return NULL; + UNMAP_DOMAIN_PAGE(l3t); +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) ) + { + l4_pgentry_t l4e = l4e_from_mfn(l3mfn, __PAGE_HYPERVISOR); +@@ -5045,8 +5044,7 @@ static l2_pgentry_t *virt_to_xen_l2e(unsigned long v) + return NULL; + } + UNMAP_DOMAIN_PAGE(l2t); +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ) + { + l3e_write(pl3e, l3e_from_mfn(l2mfn, __PAGE_HYPERVISOR)); +@@ -5084,8 +5082,7 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v) + return NULL; + } + UNMAP_DOMAIN_PAGE(l1t); +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ) + { + l2e_write(pl2e, l2e_from_mfn(l1mfn, __PAGE_HYPERVISOR)); +@@ -5116,6 +5113,8 @@ l1_pgentry_t *virt_to_xen_l1e(unsigned long v) + do { \ + if ( locking ) \ + l3t_lock(page); \ ++ else \ ++ block_lock_speculation(); \ + } while ( false ) + + #define L3T_UNLOCK(page) \ +@@ -5331,8 +5330,7 @@ int map_pages_to_xen( + if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL ) + flush_flags |= FLUSH_TLB_GLOBAL; 
+ +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) && + (l3e_get_flags(*pl3e) & _PAGE_PSE) ) + { +@@ -5436,8 +5434,7 @@ int map_pages_to_xen( + if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL ) + flush_flags |= FLUSH_TLB_GLOBAL; + +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) && + (l2e_get_flags(*pl2e) & _PAGE_PSE) ) + { +@@ -5478,8 +5475,7 @@ int map_pages_to_xen( + unsigned long base_mfn; + const l1_pgentry_t *l1t; + +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + + ol2e = *pl2e; + /* +@@ -5533,8 +5529,7 @@ int map_pages_to_xen( + unsigned long base_mfn; + const l2_pgentry_t *l2t; + +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + + ol3e = *pl3e; + /* +@@ -5678,8 +5673,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) + l3e_get_flags(*pl3e))); + UNMAP_DOMAIN_PAGE(l2t); + +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) && + (l3e_get_flags(*pl3e) & _PAGE_PSE) ) + { +@@ -5738,8 +5732,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) + l2e_get_flags(*pl2e) & ~_PAGE_PSE)); + UNMAP_DOMAIN_PAGE(l1t); + +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) && + (l2e_get_flags(*pl2e) & _PAGE_PSE) ) + { +@@ -5783,8 +5776,7 @@ int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) + */ + if ( (nf & _PAGE_PRESENT) || ((v != e) && (l1_table_offset(v) != 0)) ) + continue; +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + + /* + * L2E may be already cleared, or set to a superpage, by +@@ -5831,8 +5823,7 @@ int 
modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf) + if ( (nf & _PAGE_PRESENT) || + ((v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0)) ) + continue; +- if ( locking ) +- spin_lock(&map_pgdir_lock); ++ spin_lock_if(locking, &map_pgdir_lock); + + /* + * L3E may be already cleared, or set to a superpage, by +diff --git a/xen/arch/x86/mm/mm-locks.h b/xen/arch/x86/mm/mm-locks.h +index 5ec080c02f..b4960fb90e 100644 +--- a/xen/arch/x86/mm/mm-locks.h ++++ b/xen/arch/x86/mm/mm-locks.h +@@ -335,6 +335,15 @@ static inline void p2m_unlock(struct p2m_domain *p) + #define p2m_locked_by_me(p) mm_write_locked_by_me(&(p)->lock) + #define gfn_locked_by_me(p,g) p2m_locked_by_me(p) + ++static always_inline void gfn_lock_if(bool condition, struct p2m_domain *p2m, ++ gfn_t gfn, unsigned int order) ++{ ++ if ( condition ) ++ gfn_lock(p2m, gfn, order); ++ else ++ block_lock_speculation(); ++} ++ + /* PoD lock (per-p2m-table) + * + * Protects private PoD data structs: entry and cache +diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c +index 0983bd71d9..22ab1d606e 100644 +--- a/xen/arch/x86/mm/p2m.c ++++ b/xen/arch/x86/mm/p2m.c +@@ -280,9 +280,8 @@ mfn_t p2m_get_gfn_type_access(struct p2m_domain *p2m, gfn_t gfn, + if ( q & P2M_UNSHARE ) + q |= P2M_ALLOC; + +- if ( locked ) +- /* Grab the lock here, don't release until put_gfn */ +- gfn_lock(p2m, gfn, 0); ++ /* Grab the lock here, don't release until put_gfn */ ++ gfn_lock_if(locked, p2m, gfn, 0); + + mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order, NULL); + +diff --git a/xen/include/xen/spinlock.h b/xen/include/xen/spinlock.h +index 28fce5615e..c830df3430 100644 +--- a/xen/include/xen/spinlock.h ++++ b/xen/include/xen/spinlock.h +@@ -222,6 +222,14 @@ static always_inline void spin_lock_irq(spinlock_t *l) + block_lock_speculation(); \ + }) + ++/* Conditionally take a spinlock in a speculation safe way. 
*/ ++static always_inline void spin_lock_if(bool condition, spinlock_t *l) ++{ ++ if ( condition ) ++ _spin_lock(l); ++ block_lock_speculation(); ++} ++ + #define spin_unlock(l) _spin_unlock(l) + #define spin_unlock_irq(l) _spin_unlock_irq(l) + #define spin_unlock_irqrestore(l, f) _spin_unlock_irqrestore(l, f) +-- +2.44.0 + diff --git a/main/xen/xsa447.patch b/main/xen/xsa447.patch deleted file mode 100644 index 2e26396b0ee..00000000000 --- a/main/xen/xsa447.patch +++ /dev/null @@ -1,117 +0,0 @@ -From 084c7312fa6c1d4a7fa343efa1d7d73693dafff4 Mon Sep 17 00:00:00 2001 -From: Michal Orzel -Date: Thu, 23 Nov 2023 15:53:02 +0100 -Subject: [PATCH] xen/arm: page: Avoid pointer overflow on cache clean & - invalidate - -On Arm32, after cleaning and invalidating the last dcache line of the top -domheap page i.e. VA = 0xfffff000 (as a result of flushing the page to -RAM), we end up adding the value of a dcache line size to the pointer -once again, which results in a pointer arithmetic overflow (with 64B line -size, operation 0xffffffc0 + 0x40 overflows to 0x0). Such behavior is -undefined and given the wide range of compiler versions we support, it is -difficult to determine what could happen in such scenario. - -Modify clean_and_invalidate_dcache_va_range() as well as -clean_dcache_va_range() and invalidate_dcache_va_range() due to similarity -of handling to prevent pointer arithmetic overflow. Modify the loops to -use an additional variable to store the index of the next cacheline. -Add an assert to prevent passing a region that wraps around which is -illegal and would end up in a page fault anyway (region 0-2MB is -unmapped). Lastly, return early if size passed is 0. - -Note that on Arm64, we don't have this problem given that the max VA -space we support is 48-bits. - -This is XSA-447 / CVE-2023-46837. 
- -Signed-off-by: Michal Orzel -Reviewed-by: Julien Grall ---- - xen/arch/arm/include/asm/page.h | 35 ++++++++++++++++++++++++++------- - 1 file changed, 28 insertions(+), 7 deletions(-) - -diff --git a/xen/arch/arm/include/asm/page.h b/xen/arch/arm/include/asm/page.h -index ebaf5964f114..69f817d1e68a 100644 ---- a/xen/arch/arm/include/asm/page.h -+++ b/xen/arch/arm/include/asm/page.h -@@ -162,6 +162,13 @@ static inline size_t read_dcache_line_bytes(void) - static inline int invalidate_dcache_va_range(const void *p, unsigned long size) - { - size_t cacheline_mask = dcache_line_bytes - 1; -+ unsigned long idx = 0; -+ -+ if ( !size ) -+ return 0; -+ -+ /* Passing a region that wraps around is illegal */ -+ ASSERT(((uintptr_t)p + size - 1) >= (uintptr_t)p); - - dsb(sy); /* So the CPU issues all writes to the range */ - -@@ -174,11 +181,11 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size) - } - - for ( ; size >= dcache_line_bytes; -- p += dcache_line_bytes, size -= dcache_line_bytes ) -- asm volatile (__invalidate_dcache_one(0) : : "r" (p)); -+ idx += dcache_line_bytes, size -= dcache_line_bytes ) -+ asm volatile (__invalidate_dcache_one(0) : : "r" (p + idx)); - - if ( size > 0 ) -- asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p)); -+ asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p + idx)); - - dsb(sy); /* So we know the flushes happen before continuing */ - -@@ -188,14 +195,21 @@ static inline int invalidate_dcache_va_range(const void *p, unsigned long size) - static inline int clean_dcache_va_range(const void *p, unsigned long size) - { - size_t cacheline_mask = dcache_line_bytes - 1; -+ unsigned long idx = 0; -+ -+ if ( !size ) -+ return 0; -+ -+ /* Passing a region that wraps around is illegal */ -+ ASSERT(((uintptr_t)p + size - 1) >= (uintptr_t)p); - - dsb(sy); /* So the CPU issues all writes to the range */ - size += (uintptr_t)p & cacheline_mask; - size = (size + cacheline_mask) & ~cacheline_mask; - p 
= (void *)((uintptr_t)p & ~cacheline_mask); - for ( ; size >= dcache_line_bytes; -- p += dcache_line_bytes, size -= dcache_line_bytes ) -- asm volatile (__clean_dcache_one(0) : : "r" (p)); -+ idx += dcache_line_bytes, size -= dcache_line_bytes ) -+ asm volatile (__clean_dcache_one(0) : : "r" (p + idx)); - dsb(sy); /* So we know the flushes happen before continuing */ - /* ARM callers assume that dcache_* functions cannot fail. */ - return 0; -@@ -205,14 +219,21 @@ static inline int clean_and_invalidate_dcache_va_range - (const void *p, unsigned long size) - { - size_t cacheline_mask = dcache_line_bytes - 1; -+ unsigned long idx = 0; -+ -+ if ( !size ) -+ return 0; -+ -+ /* Passing a region that wraps around is illegal */ -+ ASSERT(((uintptr_t)p + size - 1) >= (uintptr_t)p); - - dsb(sy); /* So the CPU issues all writes to the range */ - size += (uintptr_t)p & cacheline_mask; - size = (size + cacheline_mask) & ~cacheline_mask; - p = (void *)((uintptr_t)p & ~cacheline_mask); - for ( ; size >= dcache_line_bytes; -- p += dcache_line_bytes, size -= dcache_line_bytes ) -- asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p)); -+ idx += dcache_line_bytes, size -= dcache_line_bytes ) -+ asm volatile (__clean_and_invalidate_dcache_one(0) : : "r" (p + idx)); - dsb(sy); /* So we know the flushes happen before continuing */ - /* ARM callers assume that dcache_* functions cannot fail. 
*/ - return 0; --- -2.40.1 - diff --git a/main/xen/xsa449.patch b/main/xen/xsa449.patch deleted file mode 100644 index 80aeac29161..00000000000 --- a/main/xen/xsa449.patch +++ /dev/null @@ -1,89 +0,0 @@ -From d8b92b21b224126860978e4c604302f3c1e3bf75 Mon Sep 17 00:00:00 2001 -From: Roger Pau Monne -Date: Wed, 13 Dec 2023 15:51:59 +0100 -Subject: [PATCH] pci: fail device assignment if phantom functions cannot be - assigned -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -The current behavior is that no error is reported if (some) phantom functions -fail to be assigned during device add or assignment, so the operation succeeds -even if some phantom functions are not correctly setup. - -This can lead to devices possibly being successfully assigned to a domU while -some of the device phantom functions are still assigned to dom0. Even when the -device is assigned domIO before being assigned to a domU phantom functions -might fail to be assigned to domIO, and also fail to be assigned to the domU, -leaving them assigned to dom0. - -Since the device can generate requests using the IDs of those phantom -functions, given the scenario above a device in such state would be in control -of a domU, but still capable of generating transactions that use a context ID -targeting dom0 owned memory. - -Modify device assign in order to attempt to deassign the device if phantom -functions failed to be assigned. - -Note that device addition is not modified in the same way, as in that case the -device is assigned to a trusted domain, and hence partial assign can lead to -device malfunction but not a security issue. 
- -This is XSA-449 / CVE-2023-46839 - -Fixes: 4e9950dc1bd2 ('IOMMU: add phantom function support') -Signed-off-by: Roger Pau Monné -Reviewed-by: Jan Beulich ---- - xen/drivers/passthrough/pci.c | 27 +++++++++++++++++++++------ - 1 file changed, 21 insertions(+), 6 deletions(-) - -diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c -index 1439d1ef2b26..47c0eee7bdcc 100644 ---- a/xen/drivers/passthrough/pci.c -+++ b/xen/drivers/passthrough/pci.c -@@ -1488,11 +1488,10 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) - - pdev->fault.count = 0; - -- if ( (rc = iommu_call(hd->platform_ops, assign_device, d, devfn, -- pci_to_dev(pdev), flag)) ) -- goto done; -+ rc = iommu_call(hd->platform_ops, assign_device, d, devfn, pci_to_dev(pdev), -+ flag); - -- for ( ; pdev->phantom_stride; rc = 0 ) -+ while ( pdev->phantom_stride && !rc ) - { - devfn += pdev->phantom_stride; - if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) ) -@@ -1503,8 +1502,24 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag) - - done: - if ( rc ) -- printk(XENLOG_G_WARNING "%pd: assign (%pp) failed (%d)\n", -- d, &PCI_SBDF(seg, bus, devfn), rc); -+ { -+ printk(XENLOG_G_WARNING "%pd: assign %s(%pp) failed (%d)\n", -+ d, devfn != pdev->devfn ? "phantom function " : "", -+ &PCI_SBDF(seg, bus, devfn), rc); -+ -+ if ( devfn != pdev->devfn && deassign_device(d, seg, bus, pdev->devfn) ) -+ { -+ /* -+ * Device with phantom functions that failed to both assign and -+ * rollback. Mark the device as broken and crash the target domain, -+ * as the state of the functions at this point is unknown and Xen -+ * has no way to assert consistent context assignment among them. 
-+ */ -+ pdev->broken = true; -+ if ( !is_hardware_domain(d) && d != dom_io ) -+ domain_crash(d); -+ } -+ } - /* The device is assigned to dom_io so mark it as quarantined */ - else if ( d == dom_io ) - pdev->quarantine = true; --- -2.43.0 - diff --git a/main/xen/xsa450.patch b/main/xen/xsa450.patch deleted file mode 100644 index e94933be0b8..00000000000 --- a/main/xen/xsa450.patch +++ /dev/null @@ -1,59 +0,0 @@ -From: Andrew Cooper -Subject: VT-d: Fix "else" vs "#endif" misplacement - -In domain_pgd_maddr() the "#endif" is misplaced with respect to "else". This -generates incorrect logic when CONFIG_HVM is compiled out, as the "else" body -is executed unconditionally. - -Rework the logic to use IS_ENABLED() instead of explicit #ifdef-ary, as it's -clearer to follow. This in turn involves adjusting p2m_get_pagetable() to -compile when CONFIG_HVM is disabled. - -This is XSA-450 / CVE-2023-46840. - -Reported-by: Reported-by: Teddy Astie -Fixes: 033ff90aa9c1 ("x86/P2M: p2m_{alloc,free}_ptp() and p2m_alloc_table() are HVM-only") -Signed-off-by: Andrew Cooper -Reviewed-by: Jan Beulich - -diff --git a/xen/arch/x86/include/asm/p2m.h b/xen/arch/x86/include/asm/p2m.h -index 32f3f394b05a..6ada585eaac2 100644 ---- a/xen/arch/x86/include/asm/p2m.h -+++ b/xen/arch/x86/include/asm/p2m.h -@@ -435,7 +435,14 @@ static inline bool p2m_is_altp2m(const struct p2m_domain *p2m) - return p2m->p2m_class == p2m_alternate; - } - --#define p2m_get_pagetable(p2m) ((p2m)->phys_table) -+#ifdef CONFIG_HVM -+static inline pagetable_t p2m_get_pagetable(const struct p2m_domain *p2m) -+{ -+ return p2m->phys_table; -+} -+#else -+pagetable_t p2m_get_pagetable(const struct p2m_domain *p2m); -+#endif - - /* - * Ensure any deferred p2m TLB flush has been completed on all VCPUs. 
-diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c -index 99b642f12ef9..4244855032ee 100644 ---- a/xen/drivers/passthrough/vtd/iommu.c -+++ b/xen/drivers/passthrough/vtd/iommu.c -@@ -438,15 +438,13 @@ static paddr_t domain_pgd_maddr(struct domain *d, paddr_t pgd_maddr, - - if ( pgd_maddr ) - /* nothing */; --#ifdef CONFIG_HVM -- else if ( iommu_use_hap_pt(d) ) -+ else if ( IS_ENABLED(CONFIG_HVM) && iommu_use_hap_pt(d) ) - { - pagetable_t pgt = p2m_get_pagetable(p2m_get_hostp2m(d)); - - pgd_maddr = pagetable_get_paddr(pgt); - } - else --#endif - { - if ( !hd->arch.vtd.pgd_maddr ) - { diff --git a/main/xen/xsa451-4.18.patch b/main/xen/xsa451-4.18.patch deleted file mode 100644 index 721f3f34df6..00000000000 --- a/main/xen/xsa451-4.18.patch +++ /dev/null @@ -1,188 +0,0 @@ -From: Jan Beulich -Subject: x86: account for shadow stack in exception-from-stub recovery - -Dealing with exceptions raised from within emulation stubs involves -discarding return address (replaced by exception related information). -Such discarding of course also requires removing the corresponding entry -from the shadow stack. - -Also amend the comment in fixup_exception_return(), to further clarify -why use of ptr[1] can't be an out-of-bounds access. - -While touching do_invalid_op() also add a missing fall-through -annotation. - -This is CVE-2023-46841 / XSA-451. 
- -Fixes: 209fb9919b50 ("x86/extable: Adjust extable handling to be shadow stack compatible") -Signed-off-by: Jan Beulich -Reviewed-by: Andrew Cooper - ---- a/xen/arch/x86/extable.c -+++ b/xen/arch/x86/extable.c -@@ -86,26 +86,29 @@ search_one_extable(const struct exceptio - } - - unsigned long --search_exception_table(const struct cpu_user_regs *regs) -+search_exception_table(const struct cpu_user_regs *regs, unsigned long *stub_ra) - { - const struct virtual_region *region = find_text_region(regs->rip); - unsigned long stub = this_cpu(stubs.addr); - - if ( region && region->ex ) -+ { -+ *stub_ra = 0; - return search_one_extable(region->ex, region->ex_end, regs->rip); -+ } - - if ( regs->rip >= stub + STUB_BUF_SIZE / 2 && - regs->rip < stub + STUB_BUF_SIZE && - regs->rsp > (unsigned long)regs && - regs->rsp < (unsigned long)get_cpu_info() ) - { -- unsigned long retptr = *(unsigned long *)regs->rsp; -+ unsigned long retaddr = *(unsigned long *)regs->rsp, fixup; - -- region = find_text_region(retptr); -- retptr = region && region->ex -- ? search_one_extable(region->ex, region->ex_end, retptr) -- : 0; -- if ( retptr ) -+ region = find_text_region(retaddr); -+ fixup = region && region->ex -+ ? 
search_one_extable(region->ex, region->ex_end, retaddr) -+ : 0; -+ if ( fixup ) - { - /* - * Put trap number and error code on the stack (in place of the -@@ -117,7 +120,8 @@ search_exception_table(const struct cpu_ - }; - - *(unsigned long *)regs->rsp = token.raw; -- return retptr; -+ *stub_ra = retaddr; -+ return fixup; - } - } - ---- a/xen/arch/x86/include/asm/uaccess.h -+++ b/xen/arch/x86/include/asm/uaccess.h -@@ -421,7 +421,8 @@ union stub_exception_token { - unsigned long raw; - }; - --extern unsigned long search_exception_table(const struct cpu_user_regs *regs); -+extern unsigned long search_exception_table(const struct cpu_user_regs *regs, -+ unsigned long *stub_ra); - extern void sort_exception_tables(void); - extern void sort_exception_table(struct exception_table_entry *start, - const struct exception_table_entry *stop); ---- a/xen/arch/x86/traps.c -+++ b/xen/arch/x86/traps.c -@@ -845,7 +845,7 @@ void do_unhandled_trap(struct cpu_user_r - } - - static void fixup_exception_return(struct cpu_user_regs *regs, -- unsigned long fixup) -+ unsigned long fixup, unsigned long stub_ra) - { - if ( IS_ENABLED(CONFIG_XEN_SHSTK) ) - { -@@ -862,7 +862,8 @@ static void fixup_exception_return(struc - /* - * Search for %rip. The shstk currently looks like this: - * -- * ... [Likely pointed to by SSP] -+ * tok [Supervisor token, == &tok | BUSY, only with FRED inactive] -+ * ... [Pointed to by SSP for most exceptions, empty in IST cases] - * %cs [== regs->cs] - * %rip [== regs->rip] - * SSP [Likely points to 3 slots higher, above %cs] -@@ -880,7 +881,56 @@ static void fixup_exception_return(struc - */ - if ( ptr[0] == regs->rip && ptr[1] == regs->cs ) - { -+ unsigned long primary_shstk = -+ (ssp & ~(STACK_SIZE - 1)) + -+ (PRIMARY_SHSTK_SLOT + 1) * PAGE_SIZE - 8; -+ - wrss(fixup, ptr); -+ -+ if ( !stub_ra ) -+ goto shstk_done; -+ -+ /* -+ * Stub recovery ought to happen only when the outer context -+ * was on the main shadow stack. 
We need to also "pop" the -+ * stub's return address from the interrupted context's shadow -+ * stack. That is, -+ * - if we're still on the main stack, we need to move the -+ * entire stack (up to and including the exception frame) -+ * up by one slot, incrementing the original SSP in the -+ * exception frame, -+ * - if we're on an IST stack, we need to increment the -+ * original SSP. -+ */ -+ BUG_ON((ptr[-1] ^ primary_shstk) >> PAGE_SHIFT); -+ -+ if ( (ssp ^ primary_shstk) >> PAGE_SHIFT ) -+ { -+ /* -+ * We're on an IST stack. First make sure the two return -+ * addresses actually match. Then increment the interrupted -+ * context's SSP. -+ */ -+ BUG_ON(stub_ra != *(unsigned long*)ptr[-1]); -+ wrss(ptr[-1] + 8, &ptr[-1]); -+ goto shstk_done; -+ } -+ -+ /* Make sure the two return addresses actually match. */ -+ BUG_ON(stub_ra != ptr[2]); -+ -+ /* Move exception frame, updating SSP there. */ -+ wrss(ptr[1], &ptr[2]); /* %cs */ -+ wrss(ptr[0], &ptr[1]); /* %rip */ -+ wrss(ptr[-1] + 8, &ptr[0]); /* SSP */ -+ -+ /* Move all newer entries. */ -+ while ( --ptr != _p(ssp) ) -+ wrss(ptr[-1], &ptr[0]); -+ -+ /* Finally account for our own stack having shifted up. 
*/ -+ asm volatile ( "incsspd %0" :: "r" (2) ); -+ - goto shstk_done; - } - } -@@ -901,7 +951,8 @@ static void fixup_exception_return(struc - - static bool extable_fixup(struct cpu_user_regs *regs, bool print) - { -- unsigned long fixup = search_exception_table(regs); -+ unsigned long stub_ra = 0; -+ unsigned long fixup = search_exception_table(regs, &stub_ra); - - if ( unlikely(fixup == 0) ) - return false; -@@ -915,7 +966,7 @@ static bool extable_fixup(struct cpu_use - vector_name(regs->entry_vector), regs->error_code, - _p(regs->rip), _p(regs->rip), _p(fixup)); - -- fixup_exception_return(regs, fixup); -+ fixup_exception_return(regs, fixup, stub_ra); - this_cpu(last_extable_addr) = regs->rip; - - return true; -@@ -1183,7 +1234,8 @@ void do_invalid_op(struct cpu_user_regs - { - case BUGFRAME_run_fn: - case BUGFRAME_warn: -- fixup_exception_return(regs, (unsigned long)eip); -+ fixup_exception_return(regs, (unsigned long)eip, 0); -+ fallthrough; - case BUGFRAME_bug: - case BUGFRAME_assert: - return;