mirror of
https://github.com/armbian/build.git
synced 2025-08-12 06:06:58 +02:00
* Attached cubox-i to stock kernel. Tested. Sadly ZRAM has some issues, so it must remain disabled
* more imx6 updates for cubox/udoo
* Enable HDMI on Cubox-i

Signed-off-by: Igor Pecovnik <igor.pecovnik@gmail.com>
diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 069e8d52c991..cadb7a9a5218 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -357,6 +357,7 @@ What: /sys/devices/system/cpu/vulnerabilities
/sys/devices/system/cpu/vulnerabilities/spectre_v2
/sys/devices/system/cpu/vulnerabilities/spec_store_bypass
/sys/devices/system/cpu/vulnerabilities/l1tf
+ /sys/devices/system/cpu/vulnerabilities/mds
Date: January 2018
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
Description: Information about CPU vulnerabilities
@@ -369,8 +370,7 @@ Description: Information about CPU vulnerabilities
"Vulnerable" CPU is affected and no mitigation in effect
"Mitigation: $M" CPU is affected and mitigation $M is in effect

- Details about the l1tf file can be found in
- Documentation/admin-guide/l1tf.rst
+ See also: Documentation/hw-vuln/index.rst

What: /sys/devices/system/cpu/smt
/sys/devices/system/cpu/smt/active
diff --git a/Documentation/hw-vuln/index.rst b/Documentation/hw-vuln/index.rst
new file mode 100644
index 000000000000..ffc064c1ec68
--- /dev/null
+++ b/Documentation/hw-vuln/index.rst
@@ -0,0 +1,13 @@
+========================
+Hardware vulnerabilities
+========================
+
+This section describes CPU vulnerabilities and provides an overview of the
+possible mitigations along with guidance for selecting mitigations if they
+are configurable at compile, boot or run time.
+
+.. toctree::
+ :maxdepth: 1
+
+ l1tf
+ mds
diff --git a/Documentation/hw-vuln/l1tf.rst b/Documentation/hw-vuln/l1tf.rst
new file mode 100644
index 000000000000..31653a9f0e1b
--- /dev/null
+++ b/Documentation/hw-vuln/l1tf.rst
@@ -0,0 +1,615 @@
+L1TF - L1 Terminal Fault
|
|
+========================
|
|
+
|
|
+L1 Terminal Fault is a hardware vulnerability which allows unprivileged
|
|
+speculative access to data which is available in the Level 1 Data Cache
|
|
+when the page table entry controlling the virtual address, which is used
|
|
+for the access, has the Present bit cleared or other reserved bits set.
|
|
+
|
|
+Affected processors
|
|
+-------------------
|
|
+
|
|
+This vulnerability affects a wide range of Intel processors. The
|
|
+vulnerability is not present on:
|
|
+
|
|
+ - Processors from AMD, Centaur and other non Intel vendors
|
|
+
|
|
+ - Older processor models, where the CPU family is < 6
|
|
+
|
|
+ - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft,
|
|
+ Penwell, Pineview, Silvermont, Airmont, Merrifield)
|
|
+
|
|
+ - The Intel XEON PHI family
|
|
+
|
|
+ - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the
|
|
+ IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected
|
|
+ by the Meltdown vulnerability either. These CPUs should become
|
|
+ available by end of 2018.
|
|
+
|
|
+Whether a processor is affected or not can be read out from the L1TF
|
|
+vulnerability file in sysfs. See :ref:`l1tf_sys_info`.
|
|
+
|
|
+Related CVEs
|
|
+------------
|
|
+
|
|
+The following CVE entries are related to the L1TF vulnerability:
|
|
+
|
|
+ ============= ================= ==============================
|
|
+ CVE-2018-3615 L1 Terminal Fault SGX related aspects
|
|
+ CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects
|
|
+ CVE-2018-3646 L1 Terminal Fault Virtualization related aspects
|
|
+ ============= ================= ==============================
|
|
+
|
|
+Problem
|
|
+-------
|
|
+
|
|
+If an instruction accesses a virtual address for which the relevant page
|
|
+table entry (PTE) has the Present bit cleared or other reserved bits set,
|
|
+then speculative execution ignores the invalid PTE and loads the referenced
|
|
+data if it is present in the Level 1 Data Cache, as if the page referenced
|
|
+by the address bits in the PTE was still present and accessible.
|
|
+
|
|
+While this is a purely speculative mechanism and the instruction will raise
|
|
+a page fault when it is retired eventually, the pure act of loading the
|
|
+data and making it available to other speculative instructions opens up the
|
|
+opportunity for side channel attacks to unprivileged malicious code,
|
|
+similar to the Meltdown attack.
|
|
+
|
|
+While Meltdown breaks the user space to kernel space protection, L1TF
|
|
+allows to attack any physical memory address in the system and the attack
|
|
+works across all protection domains. It allows an attack of SGX and also
|
|
+works from inside virtual machines because the speculation bypasses the
|
|
+extended page table (EPT) protection mechanism.
|
|
+
|
|
+
|
|
+Attack scenarios
|
|
+----------------
|
|
+
|
|
+1. Malicious user space
|
|
+^^^^^^^^^^^^^^^^^^^^^^^
|
|
+
|
|
+ Operating Systems store arbitrary information in the address bits of a
|
|
+ PTE which is marked non present. This allows a malicious user space
|
|
+ application to attack the physical memory to which these PTEs resolve.
|
|
+ In some cases user-space can maliciously influence the information
|
|
+ encoded in the address bits of the PTE, thus making attacks more
|
|
+ deterministic and more practical.
|
|
+
|
|
+ The Linux kernel contains a mitigation for this attack vector, PTE
|
|
+ inversion, which is permanently enabled and has no performance
|
|
+ impact. The kernel ensures that the address bits of PTEs, which are not
|
|
+ marked present, never point to cacheable physical memory space.
|
|
+
|
|
+ A system with an up to date kernel is protected against attacks from
|
|
+ malicious user space applications.
|
|
+
|
|
+2. Malicious guest in a virtual machine
|
|
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
+
|
|
+ The fact that L1TF breaks all domain protections allows malicious guest
|
|
+ OSes, which can control the PTEs directly, and malicious guest user
|
|
+ space applications, which run on an unprotected guest kernel lacking the
|
|
+ PTE inversion mitigation for L1TF, to attack physical host memory.
|
|
+
|
|
+ A special aspect of L1TF in the context of virtualization is symmetric
|
|
+ multi threading (SMT). The Intel implementation of SMT is called
|
|
+ HyperThreading. The fact that Hyperthreads on the affected processors
|
|
+ share the L1 Data Cache (L1D) is important for this. As the flaw allows
|
|
+ only to attack data which is present in L1D, a malicious guest running
|
|
+ on one Hyperthread can attack the data which is brought into the L1D by
|
|
+ the context which runs on the sibling Hyperthread of the same physical
|
|
+ core. This context can be host OS, host user space or a different guest.
|
|
+
|
|
+ If the processor does not support Extended Page Tables, the attack is
|
|
+ only possible, when the hypervisor does not sanitize the content of the
|
|
+ effective (shadow) page tables.
|
|
+
|
|
+ While solutions exist to mitigate these attack vectors fully, these
|
|
+ mitigations are not enabled by default in the Linux kernel because they
|
|
+ can affect performance significantly. The kernel provides several
|
|
+ mechanisms which can be utilized to address the problem depending on the
|
|
+ deployment scenario. The mitigations, their protection scope and impact
|
|
+ are described in the next sections.
|
|
+
|
|
+ The default mitigations and the rationale for choosing them are explained
|
|
+ at the end of this document. See :ref:`default_mitigations`.
|
|
+
|
|
+.. _l1tf_sys_info:
|
|
+
|
|
+L1TF system information
|
|
+-----------------------
|
|
+
|
|
+The Linux kernel provides a sysfs interface to enumerate the current L1TF
|
|
+status of the system: whether the system is vulnerable, and which
|
|
+mitigations are active. The relevant sysfs file is:
|
|
+
|
|
+/sys/devices/system/cpu/vulnerabilities/l1tf
|
|
+
|
|
+The possible values in this file are:
|
|
+
|
|
+ =========================== ===============================
|
|
+ 'Not affected' The processor is not vulnerable
|
|
+ 'Mitigation: PTE Inversion' The host protection is active
|
|
+ =========================== ===============================
|
|
+
|
|
+If KVM/VMX is enabled and the processor is vulnerable then the following
|
|
+information is appended to the 'Mitigation: PTE Inversion' part:
|
|
+
|
|
+ - SMT status:
|
|
+
|
|
+ ===================== ================
|
|
+ 'VMX: SMT vulnerable' SMT is enabled
|
|
+ 'VMX: SMT disabled' SMT is disabled
|
|
+ ===================== ================
|
|
+
|
|
+ - L1D Flush mode:
|
|
+
|
|
+ ================================ ====================================
|
|
+ 'L1D vulnerable' L1D flushing is disabled
|
|
+
|
|
+ 'L1D conditional cache flushes' L1D flush is conditionally enabled
|
|
+
|
|
+ 'L1D cache flushes' L1D flush is unconditionally enabled
|
|
+ ================================ ====================================
|
|
+
|
|
+The resulting grade of protection is discussed in the following sections.
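The sysfs file above is plain text and can be consumed by monitoring tools directly; a minimal sketch in C, assuming only the path documented above:

    /* Print the L1TF status string the kernel exposes in sysfs. */
    #include <stdio.h>

    int main(void)
    {
        char line[256];
        FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/l1tf", "r");

        if (!f) {
            perror("l1tf");    /* file is absent on kernels without the reporting */
            return 1;
        }
        if (fgets(line, sizeof(line), f))
            printf("L1TF: %s", line);
        fclose(f);
        return 0;
    }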
|
|
+
|
|
+
|
|
+Host mitigation mechanism
|
|
+-------------------------
|
|
+
|
|
+The kernel is unconditionally protected against L1TF attacks from malicious
|
|
+user space running on the host.
|
|
+
|
|
+
|
|
+Guest mitigation mechanisms
|
|
+---------------------------
|
|
+
|
|
+.. _l1d_flush:
|
|
+
|
|
+1. L1D flush on VMENTER
|
|
+^^^^^^^^^^^^^^^^^^^^^^^
|
|
+
|
|
+ To make sure that a guest cannot attack data which is present in the L1D
|
|
+ the hypervisor flushes the L1D before entering the guest.
|
|
+
|
|
+ Flushing the L1D evicts not only the data which should not be accessed
|
|
+ by a potentially malicious guest, it also flushes the guest
|
|
+ data. Flushing the L1D has a performance impact as the processor has to
|
|
+ bring the flushed guest data back into the L1D. Depending on the
|
|
+ frequency of VMEXIT/VMENTER and the type of computations in the guest
|
|
+ performance degradation in the range of 1% to 50% has been observed. For
|
|
+ scenarios where guest VMEXIT/VMENTER are rare the performance impact is
|
|
+ minimal. Virtio and mechanisms like posted interrupts are designed to
|
|
+ confine the VMEXITs to a bare minimum, but specific configurations and
|
|
+ application scenarios might still suffer from a high VMEXIT rate.
|
|
+
|
|
+ The kernel provides two L1D flush modes:
|
|
+ - conditional ('cond')
|
|
+ - unconditional ('always')
|
|
+
|
|
+ The conditional mode avoids L1D flushing after VMEXITs which execute
|
|
+ only audited code paths before the corresponding VMENTER. These code
|
|
+ paths have been verified that they cannot expose secrets or other
|
|
+ interesting data to an attacker, but they can leak information about the
|
|
+ address space layout of the hypervisor.
|
|
+
|
|
+ Unconditional mode flushes L1D on all VMENTER invocations and provides
|
|
+ maximum protection. It has a higher overhead than the conditional
|
|
+ mode. The overhead cannot be quantified correctly as it depends on the
|
|
+ workload scenario and the resulting number of VMEXITs.
|
|
+
|
|
+ The general recommendation is to enable L1D flush on VMENTER. The kernel
|
|
+ defaults to conditional mode on affected processors.
|
|
+
|
|
+ **Note**, that L1D flush does not prevent the SMT problem because the
|
|
+ sibling thread will also bring back its data into the L1D which makes it
|
|
+ attackable again.
|
|
+
|
|
+ L1D flush can be controlled by the administrator via the kernel command
|
|
+ line and sysfs control files. See :ref:`mitigation_control_command_line`
|
|
+ and :ref:`mitigation_control_kvm`.
|
|
+
|
|
+.. _guest_confinement:
|
|
+
|
|
+2. Guest VCPU confinement to dedicated physical cores
|
|
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
+
|
|
+ To address the SMT problem, it is possible to make a guest or a group of
|
|
+ guests affine to one or more physical cores. The proper mechanism for
|
|
+ that is to utilize exclusive cpusets to ensure that no other guest or
|
|
+ host tasks can run on these cores.
|
|
+
|
|
+ If only a single guest or related guests run on sibling SMT threads on
|
|
+ the same physical core then they can only attack their own memory and
|
|
+ restricted parts of the host memory.
|
|
+
|
|
+ Host memory is attackable, when one of the sibling SMT threads runs in
|
|
+ host OS (hypervisor) context and the other in guest context. The amount
|
|
+ of valuable information from the host OS context depends on the context
|
|
+ which the host OS executes, i.e. interrupts, soft interrupts and kernel
|
|
+ threads. The amount of valuable data from these contexts cannot be
|
|
+ declared as non-interesting for an attacker without deep inspection of
|
|
+ the code.
|
|
+
|
|
+ **Note**, that assigning guests to a fixed set of physical cores affects
|
|
+ the ability of the scheduler to do load balancing and might have
|
|
+ negative effects on CPU utilization depending on the hosting
|
|
+ scenario. Disabling SMT might be a viable alternative for particular
|
|
+ scenarios.
|
|
+
|
|
+ For further information about confining guests to a single or to a group
|
|
+ of cores consult the cpusets documentation:
|
|
+
|
|
+ https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
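A minimal sketch of such a confinement using the cgroup-v1 cpuset interface; the mount point, the cpuset name, the CPU/node numbers and the way the guest thread PIDs are obtained are illustrative assumptions, not part of the documentation above:

    /* Create an exclusive cpuset and move the given PIDs (e.g. the QEMU
     * VCPU threads of one guest) onto CPUs 2-3, assumed here to be the two
     * SMT siblings of a single physical core. Requires root. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/stat.h>
    #include <sys/types.h>

    static void write_str(const char *path, const char *val)
    {
        FILE *f = fopen(path, "w");

        if (!f || fputs(val, f) == EOF || fclose(f) == EOF) {
            perror(path);
            exit(1);
        }
    }

    int main(int argc, char **argv)
    {
        mkdir("/sys/fs/cgroup/cpuset/guest0", 0755);
        write_str("/sys/fs/cgroup/cpuset/guest0/cpuset.cpus", "2-3");
        write_str("/sys/fs/cgroup/cpuset/guest0/cpuset.mems", "0");
        write_str("/sys/fs/cgroup/cpuset/guest0/cpuset.cpu_exclusive", "1");

        for (int i = 1; i < argc; i++)
            write_str("/sys/fs/cgroup/cpuset/guest0/tasks", argv[i]);
        return 0;
    }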
|
|
+
|
|
+.. _interrupt_isolation:
|
|
+
|
|
+3. Interrupt affinity
|
|
+^^^^^^^^^^^^^^^^^^^^^
|
|
+
|
|
+ Interrupts can be made affine to logical CPUs. This is not universally
|
|
+ true because there are types of interrupts which are truly per CPU
|
|
+ interrupts, e.g. the local timer interrupt. Aside from that, multi queue
|
|
+ devices affine their interrupts to single CPUs or groups of CPUs per
|
|
+ queue without allowing the administrator to control the affinities.
|
|
+
|
|
+ Moving the interrupts, which can be affinity controlled, away from CPUs
|
|
+ which run untrusted guests, reduces the attack vector space.
|
|
+
|
|
+ Whether the interrupts which are affine to CPUs that run untrusted
+ guests provide interesting data for an attacker depends on the system
|
|
+ configuration and the scenarios which run on the system. While for some
|
|
+ of the interrupts it can be assumed that they won't expose interesting
|
|
+ information beyond exposing hints about the host OS memory layout, there
|
|
+ is no way to make general assumptions.
|
|
+
|
|
+ Interrupt affinity can be controlled by the administrator via the
|
|
+ /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is
|
|
+ available at:
|
|
+
|
|
+ https://www.kernel.org/doc/Documentation/IRQ-affinity.txt
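A sketch of moving one controllable interrupt off the CPUs that run untrusted guests, using the smp_affinity_list variant of the interface above; the IRQ number and the housekeeping CPU list below are illustrative defaults:

    /* Usage: irq-pin [IRQ] [CPULIST], e.g. "irq-pin 24 0-1". Requires root. */
    #include <stdio.h>

    int main(int argc, char **argv)
    {
        const char *irq  = argc > 1 ? argv[1] : "24";   /* interrupt number */
        const char *cpus = argc > 2 ? argv[2] : "0-1";  /* CPUs without guests */
        char path[64];
        FILE *f;

        snprintf(path, sizeof(path), "/proc/irq/%s/smp_affinity_list", irq);
        f = fopen(path, "w");
        if (!f || fputs(cpus, f) == EOF || fclose(f) == EOF) {
            perror(path);   /* fails for per-CPU or non-steerable interrupts */
            return 1;
        }
        return 0;
    }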
|
|
+
|
|
+.. _smt_control:
|
|
+
|
|
+4. SMT control
|
|
+^^^^^^^^^^^^^^
|
|
+
|
|
+ To prevent the SMT issues of L1TF it might be necessary to disable SMT
|
|
+ completely. Disabling SMT can have a significant performance impact, but
|
|
+ the impact depends on the hosting scenario and the type of workloads.
|
|
+ The impact of disabling SMT needs also to be weighted against the impact
|
|
+ of other mitigation solutions like confining guests to dedicated cores.
|
|
+
|
|
+ The kernel provides a sysfs interface to retrieve the status of SMT and
|
|
+ to control it. It also provides a kernel command line interface to
|
|
+ control SMT.
|
|
+
|
|
+ The kernel command line interface consists of the following options:
|
|
+
|
|
+ =========== ==========================================================
|
|
+ nosmt Affects the bring up of the secondary CPUs during boot. The
|
|
+ kernel tries to bring all present CPUs online during the
|
|
+ boot process. "nosmt" makes sure that from each physical
|
|
+ core only one - the so called primary (hyper) thread is
|
|
+ activated. Due to a design flaw of Intel processors related
|
|
+ to Machine Check Exceptions the non primary siblings have
|
|
+ to be brought up at least partially and are then shut down
|
|
+ again. "nosmt" can be undone via the sysfs interface.
|
|
+
|
|
+ nosmt=force Has the same effect as "nosmt" but it does not allow to
|
|
+ undo the SMT disable via the sysfs interface.
|
|
+ =========== ==========================================================
|
|
+
|
|
+ The sysfs interface provides two files:
|
|
+
|
|
+ - /sys/devices/system/cpu/smt/control
|
|
+ - /sys/devices/system/cpu/smt/active
|
|
+
|
|
+ /sys/devices/system/cpu/smt/control:
|
|
+
|
|
+ This file allows to read out the SMT control state and provides the
|
|
+ ability to disable or (re)enable SMT. The possible states are:
|
|
+
|
|
+ ============== ===================================================
|
|
+ on SMT is supported by the CPU and enabled. All
|
|
+ logical CPUs can be onlined and offlined without
|
|
+ restrictions.
|
|
+
|
|
+ off SMT is supported by the CPU and disabled. Only
|
|
+ the so called primary SMT threads can be onlined
|
|
+ and offlined without restrictions. An attempt to
|
|
+ online a non-primary sibling is rejected
|
|
+
|
|
+ forceoff Same as 'off' but the state cannot be controlled.
|
|
+ Attempts to write to the control file are rejected.
|
|
+
|
|
+ notsupported The processor does not support SMT. It's therefore
|
|
+ not affected by the SMT implications of L1TF.
|
|
+ Attempts to write to the control file are rejected.
|
|
+ ============== ===================================================
|
|
+
|
|
+ The possible states which can be written into this file to control SMT
|
|
+ state are:
|
|
+
|
|
+ - on
|
|
+ - off
|
|
+ - forceoff
|
|
+
|
|
+ /sys/devices/system/cpu/smt/active:
|
|
+
|
|
+ This file reports whether SMT is enabled and active, i.e. if on any
|
|
+ physical core two or more sibling threads are online.
|
|
+
|
|
+ SMT control is also possible at boot time via the l1tf kernel command
|
|
+ line parameter in combination with L1D flush control. See
|
|
+ :ref:`mitigation_control_command_line`.
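A sketch of driving these files from a management tool: write "off" to the control file, then read back the active file to confirm that no sibling threads remain online (requires root; the paths are exactly the ones documented above):

    #include <stdio.h>

    int main(void)
    {
        char state[16] = "";
        FILE *f = fopen("/sys/devices/system/cpu/smt/control", "w");

        if (!f || fputs("off", f) == EOF || fclose(f) == EOF) {
            perror("smt/control");   /* e.g. state is "forceoff" or "notsupported" */
            return 1;
        }

        f = fopen("/sys/devices/system/cpu/smt/active", "r");
        if (f) {
            if (fgets(state, sizeof(state), f))
                printf("SMT active: %s", state);
            fclose(f);
        }
        return 0;
    }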
|
|
+
|
|
+5. Disabling EPT
|
|
+^^^^^^^^^^^^^^^^
|
|
+
|
|
+ Disabling EPT for virtual machines provides full mitigation for L1TF even
|
|
+ with SMT enabled, because the effective page tables for guests are
|
|
+ managed and sanitized by the hypervisor. Though disabling EPT has a
|
|
+ significant performance impact especially when the Meltdown mitigation
|
|
+ KPTI is enabled.
|
|
+
|
|
+ EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
|
|
+
|
|
+There is ongoing research and development for new mitigation mechanisms to
|
|
+address the performance impact of disabling SMT or EPT.
|
|
+
|
|
+.. _mitigation_control_command_line:
|
|
+
|
|
+Mitigation control on the kernel command line
|
|
+---------------------------------------------
|
|
+
|
|
+The kernel command line allows to control the L1TF mitigations at boot
|
|
+time with the option "l1tf=". The valid arguments for this option are:
|
|
+
|
|
+ ============ =============================================================
|
|
+ full Provides all available mitigations for the L1TF
|
|
+ vulnerability. Disables SMT and enables all mitigations in
|
|
+ the hypervisors, i.e. unconditional L1D flushing
|
|
+
|
|
+ SMT control and L1D flush control via the sysfs interface
|
|
+ is still possible after boot. Hypervisors will issue a
|
|
+ warning when the first VM is started in a potentially
|
|
+ insecure configuration, i.e. SMT enabled or L1D flush
|
|
+ disabled.
|
|
+
|
|
+ full,force Same as 'full', but disables SMT and L1D flush runtime
|
|
+ control. Implies the 'nosmt=force' command line option.
|
|
+ (i.e. sysfs control of SMT is disabled.)
|
|
+
|
|
+ flush Leaves SMT enabled and enables the default hypervisor
|
|
+ mitigation, i.e. conditional L1D flushing
|
|
+
|
|
+ SMT control and L1D flush control via the sysfs interface
|
|
+ is still possible after boot. Hypervisors will issue a
|
|
+ warning when the first VM is started in a potentially
|
|
+ insecure configuration, i.e. SMT enabled or L1D flush
|
|
+ disabled.
|
|
+
|
|
+ flush,nosmt Disables SMT and enables the default hypervisor mitigation,
|
|
+ i.e. conditional L1D flushing.
|
|
+
|
|
+ SMT control and L1D flush control via the sysfs interface
|
|
+ is still possible after boot. Hypervisors will issue a
|
|
+ warning when the first VM is started in a potentially
|
|
+ insecure configuration, i.e. SMT enabled or L1D flush
|
|
+ disabled.
|
|
+
|
|
+ flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is
|
|
+ started in a potentially insecure configuration.
|
|
+
|
|
+ off Disables hypervisor mitigations and doesn't emit any
|
|
+ warnings.
|
|
+ It also drops the swap size and available RAM limit restrictions
|
|
+ on both hypervisor and bare metal.
|
|
+
|
|
+ ============ =============================================================
|
|
+
|
|
+The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
|
|
+
|
|
+
|
|
+.. _mitigation_control_kvm:
|
|
+
|
|
+Mitigation control for KVM - module parameter
|
|
+-------------------------------------------------------------
|
|
+
|
|
+The KVM hypervisor mitigation mechanism, flushing the L1D cache when
|
|
+entering a guest, can be controlled with a module parameter.
|
|
+
|
|
+The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the
|
|
+following arguments:
|
|
+
|
|
+ ============ ==============================================================
|
|
+ always L1D cache flush on every VMENTER.
|
|
+
|
|
+ cond Flush L1D on VMENTER only when the code between VMEXIT and
|
|
+ VMENTER can leak host memory which is considered
|
|
+ interesting for an attacker. This still can leak host memory
|
|
+ which allows e.g. to determine the host's address space layout.
|
|
+
|
|
+ never Disables the mitigation
|
|
+ ============ ==============================================================
|
|
+
|
|
+The parameter can be provided on the kernel command line, as a module
|
|
+parameter when loading the modules and at runtime modified via the sysfs
|
|
+file:
|
|
+
|
|
+/sys/module/kvm_intel/parameters/vmentry_l1d_flush
|
|
+
|
|
+The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
|
|
+line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
|
|
+module parameter is ignored and writes to the sysfs file are rejected.
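A sketch of querying the effective flush mode at runtime through the sysfs file above; the file only exists once the kvm_intel module is loaded:

    #include <stdio.h>

    int main(void)
    {
        char mode[64];
        FILE *f = fopen("/sys/module/kvm_intel/parameters/vmentry_l1d_flush", "r");

        if (!f) {
            perror("vmentry_l1d_flush");   /* kvm_intel not loaded? */
            return 1;
        }
        if (fgets(mode, sizeof(mode), f))
            printf("KVM L1D flush mode: %s", mode);   /* always, cond or never */
        fclose(f);
        return 0;
    }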
|
|
+
|
|
+.. _mitigation_selection:
|
|
+
|
|
+Mitigation selection guide
|
|
+--------------------------
|
|
+
|
|
+1. No virtualization in use
|
|
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
+
|
|
+ The system is protected by the kernel unconditionally and no further
|
|
+ action is required.
|
|
+
|
|
+2. Virtualization with trusted guests
|
|
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
+
|
|
+ If the guest comes from a trusted source and the guest OS kernel is
|
|
+ guaranteed to have the L1TF mitigations in place the system is fully
|
|
+ protected against L1TF and no further action is required.
|
|
+
|
|
+ To avoid the overhead of the default L1D flushing on VMENTER the
|
|
+ administrator can disable the flushing via the kernel command line and
|
|
+ sysfs control files. See :ref:`mitigation_control_command_line` and
|
|
+ :ref:`mitigation_control_kvm`.
|
|
+
|
|
+
|
|
+3. Virtualization with untrusted guests
|
|
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
+
|
|
+3.1. SMT not supported or disabled
|
|
+""""""""""""""""""""""""""""""""""
|
|
+
|
|
+ If SMT is not supported by the processor or disabled in the BIOS or by
|
|
+ the kernel, it's only required to enforce L1D flushing on VMENTER.
|
|
+
|
|
+ Conditional L1D flushing is the default behaviour and can be tuned. See
|
|
+ :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
|
|
+
|
|
+3.2. EPT not supported or disabled
|
|
+""""""""""""""""""""""""""""""""""
|
|
+
|
|
+ If EPT is not supported by the processor or disabled in the hypervisor,
|
|
+ the system is fully protected. SMT can stay enabled and L1D flushing on
|
|
+ VMENTER is not required.
|
|
+
|
|
+ EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
|
|
+
|
|
+3.3. SMT and EPT supported and active
|
|
+"""""""""""""""""""""""""""""""""""""
|
|
+
|
|
+ If SMT and EPT are supported and active then various degrees of
|
|
+ mitigations can be employed:
|
|
+
|
|
+ - L1D flushing on VMENTER:
|
|
+
|
|
+ L1D flushing on VMENTER is the minimal protection requirement, but it
|
|
+ is only potent in combination with other mitigation methods.
|
|
+
|
|
+ Conditional L1D flushing is the default behaviour and can be tuned. See
|
|
+ :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
|
|
+
|
|
+ - Guest confinement:
|
|
+
|
|
+ Confinement of guests to a single or a group of physical cores which
|
|
+ are not running any other processes, can reduce the attack surface
|
|
+ significantly, but interrupts, soft interrupts and kernel threads can
|
|
+ still expose valuable data to a potential attacker. See
|
|
+ :ref:`guest_confinement`.
|
|
+
|
|
+ - Interrupt isolation:
|
|
+
|
|
+ Isolating the guest CPUs from interrupts can reduce the attack surface
|
|
+ further, but still allows a malicious guest to explore a limited amount
|
|
+ of host physical memory. This can at least be used to gain knowledge
|
|
+ about the host address space layout. The interrupts which have a fixed
|
|
+ affinity to the CPUs which run the untrusted guests can depending on
|
|
+ the scenario still trigger soft interrupts and schedule kernel threads
|
|
+ which might expose valuable information. See
|
|
+ :ref:`interrupt_isolation`.
|
|
+
|
|
+The above three mitigation methods combined can provide protection to a
|
|
+certain degree, but the risk of the remaining attack surface has to be
|
|
+carefully analyzed. For full protection the following methods are
|
|
+available:
|
|
+
|
|
+ - Disabling SMT:
|
|
+
|
|
+ Disabling SMT and enforcing the L1D flushing provides the maximum
|
|
+ amount of protection. This mitigation is not depending on any of the
|
|
+ above mitigation methods.
|
|
+
|
|
+ SMT control and L1D flushing can be tuned by the command line
|
|
+ parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run
|
|
+ time with the matching sysfs control files. See :ref:`smt_control`,
|
|
+ :ref:`mitigation_control_command_line` and
|
|
+ :ref:`mitigation_control_kvm`.
|
|
+
|
|
+ - Disabling EPT:
|
|
+
|
|
+ Disabling EPT provides the maximum amount of protection as well. It is
|
|
+ not depending on any of the above mitigation methods. SMT can stay
|
|
+ enabled and L1D flushing is not required, but the performance impact is
|
|
+ significant.
|
|
+
|
|
+ EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
|
|
+ parameter.
|
|
+
|
|
+3.4. Nested virtual machines
|
|
+""""""""""""""""""""""""""""
|
|
+
|
|
+When nested virtualization is in use, three operating systems are involved:
|
|
+the bare metal hypervisor, the nested hypervisor and the nested virtual
|
|
+machine. VMENTER operations from the nested hypervisor into the nested
|
|
+guest will always be processed by the bare metal hypervisor. If KVM is the
|
|
+bare metal hypervisor it will:
|
|
+
|
|
+ - Flush the L1D cache on every switch from the nested hypervisor to the
|
|
+ nested virtual machine, so that the nested hypervisor's secrets are not
|
|
+ exposed to the nested virtual machine;
|
|
+
|
|
+ - Flush the L1D cache on every switch from the nested virtual machine to
|
|
+ the nested hypervisor; this is a complex operation, and flushing the L1D
|
|
+ cache avoids that the bare metal hypervisor's secrets are exposed to the
|
|
+ nested virtual machine;
|
|
+
|
|
+ - Instruct the nested hypervisor to not perform any L1D cache flush. This
|
|
+ is an optimization to avoid double L1D flushing.
|
|
+
|
|
+
|
|
+.. _default_mitigations:
|
|
+
|
|
+Default mitigations
|
|
+-------------------
|
|
+
|
|
+ The kernel default mitigations for vulnerable processors are:
|
|
+
|
|
+ - PTE inversion to protect against malicious user space. This is done
|
|
+ unconditionally and cannot be controlled. The swap storage is limited
|
|
+ to ~16TB.
|
|
+
|
|
+ - L1D conditional flushing on VMENTER when EPT is enabled for
|
|
+ a guest.
|
|
+
|
|
+ The kernel does not by default enforce the disabling of SMT, which leaves
|
|
+ SMT systems vulnerable when running untrusted guests with EPT enabled.
|
|
+
|
|
+ The rationale for this choice is:
|
|
+
|
|
+ - Force disabling SMT can break existing setups, especially with
|
|
+ unattended updates.
|
|
+
|
|
+ - If regular users run untrusted guests on their machine, then L1TF is
|
|
+ just an add on to other malware which might be embedded in an untrusted
|
|
+ guest, e.g. spam-bots or attacks on the local network.
|
|
+
|
|
+ There is no technical way to prevent a user from running untrusted code
|
|
+ on their machines blindly.
|
|
+
|
|
+ - It's technically extremely unlikely and from today's knowledge even
|
|
+ impossible that L1TF can be exploited via the most popular attack
|
|
+ mechanisms like JavaScript because these mechanisms have no way to
|
|
+ control PTEs. If this were possible and no other mitigation were
+ available, then the default might be different.
|
|
+
|
|
+ - The administrators of cloud and hosting setups have to carefully
|
|
+ analyze the risk for their scenarios and make the appropriate
|
|
+ mitigation choices, which might even vary across their deployed
|
|
+ machines and also result in other changes of their overall setup.
|
|
+ There is no way for the kernel to provide a sensible default for this
|
|
+ kind of scenarios.
|
|
diff --git a/Documentation/hw-vuln/mds.rst b/Documentation/hw-vuln/mds.rst
|
|
new file mode 100644
|
|
index 000000000000..daf6fdac49a3
|
|
--- /dev/null
|
|
+++ b/Documentation/hw-vuln/mds.rst
|
|
@@ -0,0 +1,308 @@
|
|
+MDS - Microarchitectural Data Sampling
|
|
+======================================
|
|
+
|
|
+Microarchitectural Data Sampling is a hardware vulnerability which allows
|
|
+unprivileged speculative access to data which is available in various CPU
|
|
+internal buffers.
|
|
+
|
|
+Affected processors
|
|
+-------------------
|
|
+
|
|
+This vulnerability affects a wide range of Intel processors. The
|
|
+vulnerability is not present on:
|
|
+
|
|
+ - Processors from AMD, Centaur and other non Intel vendors
|
|
+
|
|
+ - Older processor models, where the CPU family is < 6
|
|
+
|
|
+ - Some Atoms (Bonnell, Saltwell, Goldmont, GoldmontPlus)
|
|
+
|
|
+ - Intel processors which have the ARCH_CAP_MDS_NO bit set in the
|
|
+ IA32_ARCH_CAPABILITIES MSR.
|
|
+
|
|
+Whether a processor is affected or not can be read out from the MDS
|
|
+vulnerability file in sysfs. See :ref:`mds_sys_info`.
|
|
+
|
|
+Not all processors are affected by all variants of MDS, but the mitigation
|
|
+is identical for all of them so the kernel treats them as a single
|
|
+vulnerability.
|
|
+
|
|
+Related CVEs
|
|
+------------
|
|
+
|
|
+The following CVE entries are related to the MDS vulnerability:
|
|
+
|
|
+ ============== ===== ===================================================
|
|
+ CVE-2018-12126 MSBDS Microarchitectural Store Buffer Data Sampling
|
|
+ CVE-2018-12130 MFBDS Microarchitectural Fill Buffer Data Sampling
|
|
+ CVE-2018-12127 MLPDS Microarchitectural Load Port Data Sampling
|
|
+ CVE-2019-11091 MDSUM Microarchitectural Data Sampling Uncacheable Memory
|
|
+ ============== ===== ===================================================
|
|
+
|
|
+Problem
|
|
+-------
|
|
+
|
|
+When performing store, load, L1 refill operations, processors write data
|
|
+into temporary microarchitectural structures (buffers). The data in the
|
|
+buffer can be forwarded to load operations as an optimization.
|
|
+
|
|
+Under certain conditions, usually a fault/assist caused by a load
|
|
+operation, data unrelated to the load memory address can be speculatively
|
|
+forwarded from the buffers. Because the load operation causes a fault or
|
|
+assist and its result will be discarded, the forwarded data will not cause
|
|
+incorrect program execution or state changes. But a malicious operation
|
|
+may be able to forward this speculative data to a disclosure gadget which
|
|
+in turn allows inferring the value via a cache side channel attack.
|
|
+
|
|
+Because the buffers are potentially shared between Hyper-Threads, cross
|
|
+Hyper-Thread attacks are possible.
|
|
+
|
|
+Deeper technical information is available in the MDS specific x86
|
|
+architecture section: :ref:`Documentation/x86/mds.rst <mds>`.
|
|
+
|
|
+
|
|
+Attack scenarios
|
|
+----------------
|
|
+
|
|
+Attacks against the MDS vulnerabilities can be mounted from malicious non
|
|
+privileged user space applications running on hosts or guests. Malicious
|
|
+guest OSes can obviously mount attacks as well.
|
|
+
|
|
+Contrary to other speculation based vulnerabilities the MDS vulnerability
|
|
+does not allow the attacker to control the memory target address. As a
|
|
+consequence the attacks are purely sampling based, but as demonstrated with
|
|
+the TLBleed attack samples can be postprocessed successfully.
|
|
+
|
|
+Web-Browsers
|
|
+^^^^^^^^^^^^
|
|
+
|
|
+ It's unclear whether attacks through Web-Browsers are possible at
|
|
+ all. The exploitation through Java-Script is considered very unlikely,
|
|
+ but other widely used web technologies like Webassembly could possibly be
|
|
+ abused.
|
|
+
|
|
+
|
|
+.. _mds_sys_info:
|
|
+
|
|
+MDS system information
|
|
+-----------------------
|
|
+
|
|
+The Linux kernel provides a sysfs interface to enumerate the current MDS
|
|
+status of the system: whether the system is vulnerable, and which
|
|
+mitigations are active. The relevant sysfs file is:
|
|
+
|
|
+/sys/devices/system/cpu/vulnerabilities/mds
|
|
+
|
|
+The possible values in this file are:
|
|
+
|
|
+ .. list-table::
|
|
+
|
|
+ * - 'Not affected'
|
|
+ - The processor is not vulnerable
|
|
+ * - 'Vulnerable'
|
|
+ - The processor is vulnerable, but no mitigation enabled
|
|
+ * - 'Vulnerable: Clear CPU buffers attempted, no microcode'
|
|
+ - The processor is vulnerable but microcode is not updated.
|
|
+
|
|
+ The mitigation is enabled on a best effort basis. See :ref:`vmwerv`
|
|
+ * - 'Mitigation: Clear CPU buffers'
|
|
+ - The processor is vulnerable and the CPU buffer clearing mitigation is
|
|
+ enabled.
|
|
+
|
|
+If the processor is vulnerable then the following information is appended
|
|
+to the above information:
|
|
+
|
|
+ ======================== ============================================
|
|
+ 'SMT vulnerable' SMT is enabled
|
|
+ 'SMT mitigated' SMT is enabled and mitigated
|
|
+ 'SMT disabled' SMT is disabled
|
|
+ 'SMT Host state unknown' Kernel runs in a VM, Host SMT state unknown
|
|
+ ======================== ============================================
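A sketch of consuming this file from C and flagging the "SMT vulnerable" case called out in the table above:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char line[256] = "";
        FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/mds", "r");

        if (!f) {
            perror("mds");   /* kernel predates MDS reporting */
            return 1;
        }
        if (!fgets(line, sizeof(line), f)) {
            fclose(f);
            return 1;
        }
        fclose(f);

        printf("MDS: %s", line);
        if (strstr(line, "SMT vulnerable"))
            fprintf(stderr, "note: SMT is enabled, cross Hyper-Thread sampling remains possible\n");
        return 0;
    }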
|
|
+
|
|
+.. _vmwerv:
|
|
+
|
|
+Best effort mitigation mode
|
|
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
+
|
|
+ If the processor is vulnerable, but the availability of the microcode based
|
|
+ mitigation mechanism is not advertised via CPUID the kernel selects a best
|
|
+ effort mitigation mode. This mode invokes the mitigation instructions
|
|
+ without a guarantee that they clear the CPU buffers.
|
|
+
|
|
+ This is done to address virtualization scenarios where the host has the
|
|
+ microcode update applied, but the hypervisor is not yet updated to expose
|
|
+ the CPUID to the guest. If the host has updated microcode the protection
|
|
+ takes effect; otherwise a few CPU cycles are wasted pointlessly.
|
|
+
|
|
+ The state in the mds sysfs file reflects this situation accordingly.
|
|
+
|
|
+
|
|
+Mitigation mechanism
|
|
+-------------------------
|
|
+
|
|
+The kernel detects the affected CPUs and the presence of the microcode
|
|
+which is required.
|
|
+
|
|
+If a CPU is affected and the microcode is available, then the kernel
|
|
+enables the mitigation by default. The mitigation can be controlled at boot
|
|
+time via a kernel command line option. See
|
|
+:ref:`mds_mitigation_control_command_line`.
|
|
+
|
|
+.. _cpu_buffer_clear:
|
|
+
|
|
+CPU buffer clearing
|
|
+^^^^^^^^^^^^^^^^^^^
|
|
+
|
|
+ The mitigation for MDS clears the affected CPU buffers on return to user
|
|
+ space and when entering a guest.
|
|
+
|
|
+ If SMT is enabled it also clears the buffers on idle entry when the CPU
|
|
+ is only affected by MSBDS and not any other MDS variant, because the
|
|
+ other variants cannot be protected against cross Hyper-Thread attacks.
|
|
+
|
|
+ For CPUs which are only affected by MSBDS the user space, guest and idle
|
|
+ transition mitigations are sufficient and SMT is not affected.
|
|
+
|
|
+.. _virt_mechanism:
|
|
+
|
|
+Virtualization mitigation
|
|
+^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
+
|
|
+ The protection for host to guest transition depends on the L1TF
|
|
+ vulnerability of the CPU:
|
|
+
|
|
+ - CPU is affected by L1TF:
|
|
+
|
|
+ If the L1D flush mitigation is enabled and up to date microcode is
|
|
+ available, the L1D flush mitigation is automatically protecting the
|
|
+ guest transition.
|
|
+
|
|
+ If the L1D flush mitigation is disabled then the MDS mitigation is
|
|
+ invoked explicitly when the host MDS mitigation is enabled.
|
|
+
|
|
+ For details on L1TF and virtualization see:
|
|
+ :ref:`Documentation/hw-vuln/l1tf.rst <mitigation_control_kvm>`.
|
|
+
|
|
+ - CPU is not affected by L1TF:
|
|
+
|
|
+ CPU buffers are flushed before entering the guest when the host MDS
|
|
+ mitigation is enabled.
|
|
+
|
|
+ The resulting MDS protection matrix for the host to guest transition:
|
|
+
|
|
+ ============ ===== ============= ============ =================
|
|
+ L1TF MDS VMX-L1FLUSH Host MDS MDS-State
|
|
+
|
|
+ Don't care No Don't care N/A Not affected
|
|
+
|
|
+ Yes Yes Disabled Off Vulnerable
|
|
+
|
|
+ Yes Yes Disabled Full Mitigated
|
|
+
|
|
+ Yes Yes Enabled Don't care Mitigated
|
|
+
|
|
+ No Yes N/A Off Vulnerable
|
|
+
|
|
+ No Yes N/A Full Mitigated
|
|
+ ============ ===== ============= ============ =================
|
|
+
|
|
+ This only covers the host to guest transition, i.e. prevents leakage from
|
|
+ host to guest, but does not protect the guest internally. Guests need to
|
|
+ have their own protections.
|
|
+
|
|
+.. _xeon_phi:
|
|
+
|
|
+XEON PHI specific considerations
|
|
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
+
|
|
+ The XEON PHI processor family is affected by MSBDS which can be exploited
|
|
+ cross Hyper-Threads when entering idle states. Some XEON PHI variants allow
|
|
+ to use MWAIT in user space (Ring 3) which opens a potential attack vector
|
|
+ for malicious user space. The exposure can be disabled on the kernel
|
|
+ command line with the 'ring3mwait=disable' command line option.
|
|
+
|
|
+ XEON PHI is not affected by the other MDS variants and MSBDS is mitigated
|
|
+ before the CPU enters an idle state. As XEON PHI is not affected by L1TF
+ either, disabling SMT is not required for full protection.
|
|
+
|
|
+.. _mds_smt_control:
|
|
+
|
|
+SMT control
|
|
+^^^^^^^^^^^
|
|
+
|
|
+ All MDS variants except MSBDS can be attacked cross Hyper-Threads. That
|
|
+ means on CPUs which are affected by MFBDS or MLPDS it is necessary to
|
|
+ disable SMT for full protection. These are most of the affected CPUs; the
|
|
+ exception is XEON PHI, see :ref:`xeon_phi`.
|
|
+
|
|
+ Disabling SMT can have a significant performance impact, but the impact
|
|
+ depends on the type of workloads.
|
|
+
|
|
+ See the relevant chapter in the L1TF mitigation documentation for details:
|
|
+ :ref:`Documentation/hw-vuln/l1tf.rst <smt_control>`.
|
|
+
|
|
+
|
|
+.. _mds_mitigation_control_command_line:
|
|
+
|
|
+Mitigation control on the kernel command line
|
|
+---------------------------------------------
|
|
+
|
|
+The kernel command line allows to control the MDS mitigations at boot
|
|
+time with the option "mds=". The valid arguments for this option are:
|
|
+
|
|
+ ============ =============================================================
|
|
+ full If the CPU is vulnerable, enable all available mitigations
|
|
+ for the MDS vulnerability, CPU buffer clearing on exit to
|
|
+ userspace and when entering a VM. Idle transitions are
|
|
+ protected as well if SMT is enabled.
|
|
+
|
|
+ It does not automatically disable SMT.
|
|
+
|
|
+ full,nosmt The same as mds=full, with SMT disabled on vulnerable
|
|
+ CPUs. This is the complete mitigation.
|
|
+
|
|
+ off Disables MDS mitigations completely.
|
|
+
|
|
+ ============ =============================================================
|
|
+
|
|
+Not specifying this option is equivalent to "mds=full".
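A sketch of checking which variant the running kernel was actually booted with, by scanning /proc/cmdline for an explicit mds= option:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char cmdline[4096] = "";
        FILE *f = fopen("/proc/cmdline", "r");

        if (!f || !fgets(cmdline, sizeof(cmdline), f)) {
            perror("/proc/cmdline");
            return 1;
        }
        fclose(f);

        const char *opt = strstr(cmdline, "mds=");
        if (opt)
            printf("explicit option: %.*s\n", (int)strcspn(opt, " \n"), opt);
        else
            printf("no mds= option given, the default (mds=full) applies\n");
        return 0;
    }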
|
|
+
|
|
+
|
|
+Mitigation selection guide
|
|
+--------------------------
|
|
+
|
|
+1. Trusted userspace
|
|
+^^^^^^^^^^^^^^^^^^^^
|
|
+
|
|
+ If all userspace applications are from a trusted source and do not
|
|
+ execute untrusted code which is supplied externally, then the mitigation
|
|
+ can be disabled.
|
|
+
|
|
+
|
|
+2. Virtualization with trusted guests
|
|
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
+
|
|
+ The same considerations as above versus trusted user space apply.
|
|
+
|
|
+3. Virtualization with untrusted guests
|
|
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
+
|
|
+ The protection depends on the state of the L1TF mitigations.
|
|
+ See :ref:`virt_mechanism`.
|
|
+
|
|
+ If the MDS mitigation is enabled and SMT is disabled, guest to host and
|
|
+ guest to guest attacks are prevented.
|
|
+
|
|
+.. _mds_default_mitigations:
|
|
+
|
|
+Default mitigations
|
|
+-------------------
|
|
+
|
|
+ The kernel default mitigations for vulnerable processors are:
|
|
+
|
|
+ - Enable CPU buffer clearing
|
|
+
|
|
+ The kernel does not by default enforce the disabling of SMT, which leaves
|
|
+ SMT systems vulnerable when running untrusted code. The same rationale as
|
|
+ for L1TF applies.
|
|
+ See :ref:`Documentation/hw-vuln/l1tf.rst <default_mitigations>`.
|
|
diff --git a/Documentation/index.rst b/Documentation/index.rst
|
|
index 213399aac757..f95c58dbbbc3 100644
|
|
--- a/Documentation/index.rst
|
|
+++ b/Documentation/index.rst
|
|
@@ -12,7 +12,6 @@ Contents:
|
|
:maxdepth: 2
|
|
|
|
kernel-documentation
|
|
- l1tf
|
|
development-process/index
|
|
dev-tools/tools
|
|
driver-api/index
|
|
@@ -20,6 +19,24 @@ Contents:
|
|
gpu/index
|
|
80211/index
|
|
|
|
+This section describes CPU vulnerabilities and their mitigations.
|
|
+
|
|
+.. toctree::
|
|
+ :maxdepth: 1
|
|
+
|
|
+ hw-vuln/index
|
|
+
|
|
+Architecture-specific documentation
|
|
+-----------------------------------
|
|
+
|
|
+These books provide programming details about architecture-specific
|
|
+implementation.
|
|
+
|
|
+.. toctree::
|
|
+ :maxdepth: 2
|
|
+
|
|
+ x86/index
|
|
+
|
|
Indices and tables
|
|
==================
|
|
|
|
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
|
|
index a1472b48ee22..55a9bbbcf5e1 100644
|
|
--- a/Documentation/kernel-parameters.txt
|
|
+++ b/Documentation/kernel-parameters.txt
|
|
@@ -2076,10 +2076,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|
off
|
|
Disables hypervisor mitigations and doesn't
|
|
emit any warnings.
|
|
+ It also drops the swap size and available
|
|
+ RAM limit restriction on both hypervisor and
|
|
+ bare metal.
|
|
|
|
Default is 'flush'.
|
|
|
|
- For details see: Documentation/admin-guide/l1tf.rst
|
|
+ For details see: Documentation/hw-vuln/l1tf.rst
|
|
|
|
l2cr= [PPC]
|
|
|
|
@@ -2322,6 +2325,32 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|
Format: <first>,<last>
|
|
Specifies range of consoles to be captured by the MDA.
|
|
|
|
+ mds= [X86,INTEL]
|
|
+ Control mitigation for the Micro-architectural Data
|
|
+ Sampling (MDS) vulnerability.
|
|
+
|
|
+ Certain CPUs are vulnerable to an exploit against CPU
|
|
+ internal buffers which can forward information to a
|
|
+ disclosure gadget under certain conditions.
|
|
+
|
|
+ In vulnerable processors, the speculatively
|
|
+ forwarded data can be used in a cache side channel
|
|
+ attack, to access data to which the attacker does
|
|
+ not have direct access.
|
|
+
|
|
+ This parameter controls the MDS mitigation. The
|
|
+ options are:
|
|
+
|
|
+ full - Enable MDS mitigation on vulnerable CPUs
|
|
+ full,nosmt - Enable MDS mitigation and disable
|
|
+ SMT on vulnerable CPUs
|
|
+ off - Unconditionally disable MDS mitigation
|
|
+
|
|
+ Not specifying this option is equivalent to
|
|
+ mds=full.
|
|
+
|
|
+ For details see: Documentation/hw-vuln/mds.rst
|
|
+
|
|
mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
|
|
Amount of memory to be used when the kernel is not able
|
|
to see the whole system memory or for test.
|
|
@@ -2444,6 +2473,38 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|
in the "bleeding edge" mini2440 support kernel at
|
|
http://repo.or.cz/w/linux-2.6/mini2440.git
|
|
|
|
+ mitigations=
|
|
+ [X86] Control optional mitigations for CPU
|
|
+ vulnerabilities. This is a set of curated,
|
|
+ arch-independent options, each of which is an
|
|
+ aggregation of existing arch-specific options.
|
|
+
|
|
+ off
|
|
+ Disable all optional CPU mitigations. This
|
|
+ improves system performance, but it may also
|
|
+ expose users to several CPU vulnerabilities.
|
|
+ Equivalent to: nopti [X86]
|
|
+ nospectre_v2 [X86]
|
|
+ spectre_v2_user=off [X86]
|
|
+ spec_store_bypass_disable=off [X86]
|
|
+ l1tf=off [X86]
|
|
+ mds=off [X86]
|
|
+
|
|
+ auto (default)
|
|
+ Mitigate all CPU vulnerabilities, but leave SMT
|
|
+ enabled, even if it's vulnerable. This is for
|
|
+ users who don't want to be surprised by SMT
|
|
+ getting disabled across kernel upgrades, or who
|
|
+ have other ways of avoiding SMT-based attacks.
|
|
+ Equivalent to: (default behavior)
|
|
+
|
|
+ auto,nosmt
|
|
+ Mitigate all CPU vulnerabilities, disabling SMT
|
|
+ if needed. This is for users who always want to
|
|
+ be fully mitigated, even if it means losing SMT.
|
|
+ Equivalent to: l1tf=flush,nosmt [X86]
|
|
+ mds=full,nosmt [X86]
|
|
+
|
|
mminit_loglevel=
|
|
[KNL] When CONFIG_DEBUG_MEMORY_INIT is set, this
|
|
parameter allows control of the logging verbosity for
|
|
@@ -4030,9 +4091,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|
|
|
spectre_v2= [X86] Control mitigation of Spectre variant 2
|
|
(indirect branch speculation) vulnerability.
|
|
+ The default operation protects the kernel from
|
|
+ user space attacks.
|
|
|
|
- on - unconditionally enable
|
|
- off - unconditionally disable
|
|
+ on - unconditionally enable, implies
|
|
+ spectre_v2_user=on
|
|
+ off - unconditionally disable, implies
|
|
+ spectre_v2_user=off
|
|
auto - kernel detects whether your CPU model is
|
|
vulnerable
|
|
|
|
@@ -4042,6 +4107,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|
CONFIG_RETPOLINE configuration option, and the
|
|
compiler with which the kernel was built.
|
|
|
|
+ Selecting 'on' will also enable the mitigation
|
|
+ against user space to user space task attacks.
|
|
+
|
|
+ Selecting 'off' will disable both the kernel and
|
|
+ the user space protections.
|
|
+
|
|
Specific mitigations can also be selected manually:
|
|
|
|
retpoline - replace indirect branches
|
|
@@ -4051,6 +4122,48 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
|
|
Not specifying this option is equivalent to
|
|
spectre_v2=auto.
|
|
|
|
+ spectre_v2_user=
|
|
+ [X86] Control mitigation of Spectre variant 2
|
|
+ (indirect branch speculation) vulnerability between
|
|
+ user space tasks
|
|
+
|
|
+ on - Unconditionally enable mitigations. Is
|
|
+ enforced by spectre_v2=on
|
|
+
|
|
+ off - Unconditionally disable mitigations. Is
|
|
+ enforced by spectre_v2=off
|
|
+
|
|
+ prctl - Indirect branch speculation is enabled,
|
|
+ but mitigation can be enabled via prctl
|
|
+ per thread. The mitigation control state
|
|
+ is inherited on fork.
|
|
+
|
|
+ prctl,ibpb
|
|
+ - Like "prctl" above, but only STIBP is
|
|
+ controlled per thread. IBPB is issued
|
|
+ always when switching between different user
|
|
+ space processes.
|
|
+
|
|
+ seccomp
|
|
+ - Same as "prctl" above, but all seccomp
|
|
+ threads will enable the mitigation unless
|
|
+ they explicitly opt out.
|
|
+
|
|
+ seccomp,ibpb
|
|
+ - Like "seccomp" above, but only STIBP is
|
|
+ controlled per thread. IBPB is issued
|
|
+ always when switching between different
|
|
+ user space processes.
|
|
+
|
|
+ auto - Kernel selects the mitigation depending on
|
|
+ the available CPU features and vulnerability.
|
|
+
|
|
+ Default mitigation:
|
|
+ If CONFIG_SECCOMP=y then "seccomp", otherwise "prctl"
|
|
+
|
|
+ Not specifying this option is equivalent to
|
|
+ spectre_v2_user=auto.
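For the "prctl" and "seccomp" modes a task can request the mitigation for itself through the speculation control prctl; a minimal sketch (the fallback defines mirror the kernel ABI values and are only needed with older userspace headers):

    #include <stdio.h>
    #include <sys/prctl.h>

    #ifndef PR_SET_SPECULATION_CTRL
    #define PR_SET_SPECULATION_CTRL 53
    #endif
    #ifndef PR_SPEC_INDIRECT_BRANCH
    #define PR_SPEC_INDIRECT_BRANCH 1
    #endif
    #ifndef PR_SPEC_DISABLE
    #define PR_SPEC_DISABLE 4
    #endif

    int main(void)
    {
        /* "Disable speculation" here means turning the indirect branch
         * mitigation on for this task and its children. */
        if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
                  PR_SPEC_DISABLE, 0, 0)) {
            perror("PR_SET_SPECULATION_CTRL");  /* e.g. mode is not "prctl"/"seccomp" */
            return 1;
        }
        puts("indirect branch speculation disabled for this task");
        return 0;
    }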
|
|
+
|
|
spec_store_bypass_disable=
|
|
[HW] Control Speculative Store Bypass (SSB) Disable mitigation
|
|
(Speculative Store Bypass vulnerability)
|
|
diff --git a/Documentation/l1tf.rst b/Documentation/l1tf.rst
|
|
deleted file mode 100644
|
|
index bae52b845de0..000000000000
|
|
--- a/Documentation/l1tf.rst
|
|
+++ /dev/null
|
|
@@ -1,610 +0,0 @@
|
|
-L1TF - L1 Terminal Fault
|
|
-========================
|
|
-
|
|
-L1 Terminal Fault is a hardware vulnerability which allows unprivileged
|
|
-speculative access to data which is available in the Level 1 Data Cache
|
|
-when the page table entry controlling the virtual address, which is used
|
|
-for the access, has the Present bit cleared or other reserved bits set.
|
|
-
|
|
-Affected processors
|
|
--------------------
|
|
-
|
|
-This vulnerability affects a wide range of Intel processors. The
|
|
-vulnerability is not present on:
|
|
-
|
|
- - Processors from AMD, Centaur and other non Intel vendors
|
|
-
|
|
- - Older processor models, where the CPU family is < 6
|
|
-
|
|
- - A range of Intel ATOM processors (Cedarview, Cloverview, Lincroft,
|
|
- Penwell, Pineview, Silvermont, Airmont, Merrifield)
|
|
-
|
|
- - The Intel XEON PHI family
|
|
-
|
|
- - Intel processors which have the ARCH_CAP_RDCL_NO bit set in the
|
|
- IA32_ARCH_CAPABILITIES MSR. If the bit is set the CPU is not affected
|
|
- by the Meltdown vulnerability either. These CPUs should become
|
|
- available by end of 2018.
|
|
-
|
|
-Whether a processor is affected or not can be read out from the L1TF
|
|
-vulnerability file in sysfs. See :ref:`l1tf_sys_info`.
|
|
-
|
|
-Related CVEs
|
|
-------------
|
|
-
|
|
-The following CVE entries are related to the L1TF vulnerability:
|
|
-
|
|
- ============= ================= ==============================
|
|
- CVE-2018-3615 L1 Terminal Fault SGX related aspects
|
|
- CVE-2018-3620 L1 Terminal Fault OS, SMM related aspects
|
|
- CVE-2018-3646 L1 Terminal Fault Virtualization related aspects
|
|
- ============= ================= ==============================
|
|
-
|
|
-Problem
|
|
--------
|
|
-
|
|
-If an instruction accesses a virtual address for which the relevant page
|
|
-table entry (PTE) has the Present bit cleared or other reserved bits set,
|
|
-then speculative execution ignores the invalid PTE and loads the referenced
|
|
-data if it is present in the Level 1 Data Cache, as if the page referenced
|
|
-by the address bits in the PTE was still present and accessible.
|
|
-
|
|
-While this is a purely speculative mechanism and the instruction will raise
|
|
-a page fault when it is retired eventually, the pure act of loading the
|
|
-data and making it available to other speculative instructions opens up the
|
|
-opportunity for side channel attacks to unprivileged malicious code,
|
|
-similar to the Meltdown attack.
|
|
-
|
|
-While Meltdown breaks the user space to kernel space protection, L1TF
|
|
-allows to attack any physical memory address in the system and the attack
|
|
-works across all protection domains. It allows an attack of SGX and also
|
|
-works from inside virtual machines because the speculation bypasses the
|
|
-extended page table (EPT) protection mechanism.
|
|
-
|
|
-
|
|
-Attack scenarios
|
|
-----------------
|
|
-
|
|
-1. Malicious user space
|
|
-^^^^^^^^^^^^^^^^^^^^^^^
|
|
-
|
|
- Operating Systems store arbitrary information in the address bits of a
|
|
- PTE which is marked non present. This allows a malicious user space
|
|
- application to attack the physical memory to which these PTEs resolve.
|
|
- In some cases user-space can maliciously influence the information
|
|
- encoded in the address bits of the PTE, thus making attacks more
|
|
- deterministic and more practical.
|
|
-
|
|
- The Linux kernel contains a mitigation for this attack vector, PTE
|
|
- inversion, which is permanently enabled and has no performance
|
|
- impact. The kernel ensures that the address bits of PTEs, which are not
|
|
- marked present, never point to cacheable physical memory space.
|
|
-
|
|
- A system with an up to date kernel is protected against attacks from
|
|
- malicious user space applications.
|
|
-
|
|
-2. Malicious guest in a virtual machine
|
|
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
-
|
|
- The fact that L1TF breaks all domain protections allows malicious guest
|
|
- OSes, which can control the PTEs directly, and malicious guest user
|
|
- space applications, which run on an unprotected guest kernel lacking the
|
|
- PTE inversion mitigation for L1TF, to attack physical host memory.
|
|
-
|
|
- A special aspect of L1TF in the context of virtualization is symmetric
|
|
- multi threading (SMT). The Intel implementation of SMT is called
|
|
- HyperThreading. The fact that Hyperthreads on the affected processors
|
|
- share the L1 Data Cache (L1D) is important for this. As the flaw allows
|
|
- only to attack data which is present in L1D, a malicious guest running
|
|
- on one Hyperthread can attack the data which is brought into the L1D by
|
|
- the context which runs on the sibling Hyperthread of the same physical
|
|
- core. This context can be host OS, host user space or a different guest.
|
|
-
|
|
- If the processor does not support Extended Page Tables, the attack is
|
|
- only possible, when the hypervisor does not sanitize the content of the
|
|
- effective (shadow) page tables.
|
|
-
|
|
- While solutions exist to mitigate these attack vectors fully, these
|
|
- mitigations are not enabled by default in the Linux kernel because they
|
|
- can affect performance significantly. The kernel provides several
|
|
- mechanisms which can be utilized to address the problem depending on the
|
|
- deployment scenario. The mitigations, their protection scope and impact
|
|
- are described in the next sections.
|
|
-
|
|
- The default mitigations and the rationale for choosing them are explained
|
|
- at the end of this document. See :ref:`default_mitigations`.
|
|
-
|
|
-.. _l1tf_sys_info:
|
|
-
|
|
-L1TF system information
|
|
------------------------
|
|
-
|
|
-The Linux kernel provides a sysfs interface to enumerate the current L1TF
|
|
-status of the system: whether the system is vulnerable, and which
|
|
-mitigations are active. The relevant sysfs file is:
|
|
-
|
|
-/sys/devices/system/cpu/vulnerabilities/l1tf
|
|
-
|
|
-The possible values in this file are:
|
|
-
|
|
- =========================== ===============================
|
|
- 'Not affected' The processor is not vulnerable
|
|
- 'Mitigation: PTE Inversion' The host protection is active
|
|
- =========================== ===============================
|
|
-
|
|
-If KVM/VMX is enabled and the processor is vulnerable then the following
|
|
-information is appended to the 'Mitigation: PTE Inversion' part:
|
|
-
|
|
- - SMT status:
|
|
-
|
|
- ===================== ================
|
|
- 'VMX: SMT vulnerable' SMT is enabled
|
|
- 'VMX: SMT disabled' SMT is disabled
|
|
- ===================== ================
|
|
-
|
|
- - L1D Flush mode:
|
|
-
|
|
- ================================ ====================================
|
|
- 'L1D vulnerable' L1D flushing is disabled
|
|
-
|
|
- 'L1D conditional cache flushes' L1D flush is conditionally enabled
|
|
-
|
|
- 'L1D cache flushes' L1D flush is unconditionally enabled
|
|
- ================================ ====================================
|
|
-
|
|
-The resulting grade of protection is discussed in the following sections.
|
|
-
|
|
-
|
|
-Host mitigation mechanism
|
|
--------------------------
|
|
-
|
|
-The kernel is unconditionally protected against L1TF attacks from malicious
|
|
-user space running on the host.
|
|
-
|
|
-
|
|
-Guest mitigation mechanisms
|
|
----------------------------
|
|
-
|
|
-.. _l1d_flush:
|
|
-
|
|
-1. L1D flush on VMENTER
|
|
-^^^^^^^^^^^^^^^^^^^^^^^
|
|
-
|
|
- To make sure that a guest cannot attack data which is present in the L1D
|
|
- the hypervisor flushes the L1D before entering the guest.
|
|
-
|
|
- Flushing the L1D evicts not only the data which should not be accessed
|
|
- by a potentially malicious guest, it also flushes the guest
|
|
- data. Flushing the L1D has a performance impact as the processor has to
|
|
- bring the flushed guest data back into the L1D. Depending on the
|
|
- frequency of VMEXIT/VMENTER and the type of computations in the guest
|
|
- performance degradation in the range of 1% to 50% has been observed. For
|
|
- scenarios where guest VMEXIT/VMENTER are rare the performance impact is
|
|
- minimal. Virtio and mechanisms like posted interrupts are designed to
|
|
- confine the VMEXITs to a bare minimum, but specific configurations and
|
|
- application scenarios might still suffer from a high VMEXIT rate.
|
|
-
|
|
- The kernel provides two L1D flush modes:
|
|
- - conditional ('cond')
|
|
- - unconditional ('always')
|
|
-
|
|
- The conditional mode avoids L1D flushing after VMEXITs which execute
|
|
- only audited code paths before the corresponding VMENTER. These code
|
|
- paths have been verified that they cannot expose secrets or other
|
|
- interesting data to an attacker, but they can leak information about the
|
|
- address space layout of the hypervisor.
|
|
-
|
|
- Unconditional mode flushes L1D on all VMENTER invocations and provides
|
|
- maximum protection. It has a higher overhead than the conditional
|
|
- mode. The overhead cannot be quantified correctly as it depends on the
|
|
- workload scenario and the resulting number of VMEXITs.
|
|
-
|
|
- The general recommendation is to enable L1D flush on VMENTER. The kernel
|
|
- defaults to conditional mode on affected processors.
|
|
-
|
|
- **Note**, that L1D flush does not prevent the SMT problem because the
|
|
- sibling thread will also bring back its data into the L1D which makes it
|
|
- attackable again.
|
|
-
|
|
- L1D flush can be controlled by the administrator via the kernel command
|
|
- line and sysfs control files. See :ref:`mitigation_control_command_line`
|
|
- and :ref:`mitigation_control_kvm`.
|
|
-
|
|
-.. _guest_confinement:
|
|
-
|
|
-2. Guest VCPU confinement to dedicated physical cores
|
|
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
-
|
|
- To address the SMT problem, it is possible to make a guest or a group of
|
|
- guests affine to one or more physical cores. The proper mechanism for
|
|
- that is to utilize exclusive cpusets to ensure that no other guest or
|
|
- host tasks can run on these cores.
|
|
-
|
|
- If only a single guest or related guests run on sibling SMT threads on
|
|
- the same physical core then they can only attack their own memory and
|
|
- restricted parts of the host memory.
|
|
-
|
|
- Host memory is attackable, when one of the sibling SMT threads runs in
|
|
- host OS (hypervisor) context and the other in guest context. The amount
|
|
- of valuable information from the host OS context depends on the context
|
|
- which the host OS executes, i.e. interrupts, soft interrupts and kernel
|
|
- threads. The amount of valuable data from these contexts cannot be
|
|
- declared as non-interesting for an attacker without deep inspection of
|
|
- the code.
|
|
-
|
|
- **Note**, that assigning guests to a fixed set of physical cores affects
|
|
- the ability of the scheduler to do load balancing and might have
|
|
- negative effects on CPU utilization depending on the hosting
|
|
- scenario. Disabling SMT might be a viable alternative for particular
|
|
- scenarios.
|
|
-
|
|
- For further information about confining guests to a single or to a group
|
|
- of cores consult the cpusets documentation:
|
|
-
|
|
- https://www.kernel.org/doc/Documentation/cgroup-v1/cpusets.txt
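 A minimal sketch, assuming a cgroup-v1 cpuset hierarchy mounted at
 /sys/fs/cgroup/cpuset and illustrative core numbers, of how such an
 exclusive cpuset could be set up::

    #include <stdio.h>
    #include <sys/stat.h>
    #include <sys/types.h>

    /* Create an exclusive cpuset "guests" pinned to cores 2-3 (illustrative). */
    static void write_str(const char *path, const char *val)
    {
            FILE *f = fopen(path, "w");

            if (f) {
                    fputs(val, f);
                    fclose(f);
            }
    }

    int main(void)
    {
            mkdir("/sys/fs/cgroup/cpuset/guests", 0755);
            write_str("/sys/fs/cgroup/cpuset/guests/cpuset.cpus", "2-3");
            write_str("/sys/fs/cgroup/cpuset/guests/cpuset.mems", "0");
            write_str("/sys/fs/cgroup/cpuset/guests/cpuset.cpu_exclusive", "1");
            /* The guest VCPU thread PIDs would then be written to .../guests/tasks */
            return 0;
    }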
|
|
-
|
|
-.. _interrupt_isolation:
|
|
-
|
|
-3. Interrupt affinity
|
|
-^^^^^^^^^^^^^^^^^^^^^
|
|
-
|
|
- Interrupts can be made affine to logical CPUs. This is not universally
|
|
- true because there are types of interrupts which are truly per CPU
|
|
- interrupts, e.g. the local timer interrupt. Aside from that, multi queue
|
|
- devices affine their interrupts to single CPUs or groups of CPUs per
|
|
- queue without allowing the administrator to control the affinities.
|
|
-
|
|
- Moving the interrupts, which can be affinity controlled, away from CPUs
|
|
- which run untrusted guests, reduces the attack vector space.
|
|
-
|
|
- Whether the interrupts which are affine to CPUs running untrusted
- guests provide interesting data for an attacker depends on the system
- configuration and the scenarios which run on the system. While for some
- of the interrupts it can be assumed that they won't expose interesting
- information beyond hints about the host OS memory layout, there
- is no way to make general assumptions.
|
|
-
|
|
- Interrupt affinity can be controlled by the administrator via the
|
|
- /proc/irq/$NR/smp_affinity[_list] files. Limited documentation is
|
|
- available at:
|
|
-
|
|
- https://www.kernel.org/doc/Documentation/IRQ-affinity.txt
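 For illustration only, a C sketch that pins one IRQ to CPU 0 via the procfs
 interface named above; the IRQ number is an arbitrary assumption and the
 write requires root::

    #include <stdio.h>

    /* Pin IRQ 42 (illustrative number) to CPU 0, away from guest CPUs. */
    int main(void)
    {
            FILE *f = fopen("/proc/irq/42/smp_affinity_list", "w");

            if (!f) {
                    perror("smp_affinity_list");    /* per-CPU IRQs cannot be moved */
                    return 1;
            }
            fprintf(f, "0\n");
            fclose(f);
            return 0;
    }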
|
|
-
|
|
-.. _smt_control:
|
|
-
|
|
-4. SMT control
|
|
-^^^^^^^^^^^^^^
|
|
-
|
|
- To prevent the SMT issues of L1TF it might be necessary to disable SMT
|
|
- completely. Disabling SMT can have a significant performance impact, but
|
|
- the impact depends on the hosting scenario and the type of workloads.
|
|
- The impact of disabling SMT needs also to be weighted against the impact
|
|
- of other mitigation solutions like confining guests to dedicated cores.
|
|
-
|
|
- The kernel provides a sysfs interface to retrieve the status of SMT and
|
|
- to control it. It also provides a kernel command line interface to
|
|
- control SMT.
|
|
-
|
|
- The kernel command line interface consists of the following options:
|
|
-
|
|
- =========== ==========================================================
|
|
- nosmt Affects the bring up of the secondary CPUs during boot. The
|
|
- kernel tries to bring all present CPUs online during the
|
|
- boot process. "nosmt" makes sure that from each physical
|
|
- core only one - the so called primary (hyper) thread is
|
|
- activated. Due to a design flaw of Intel processors related
|
|
- to Machine Check Exceptions the non primary siblings have
|
|
- to be brought up at least partially and are then shut down
|
|
- again. "nosmt" can be undone via the sysfs interface.
|
|
-
|
|
- nosmt=force Has the same effect as "nosmt" but it does not allow to
|
|
- undo the SMT disable via the sysfs interface.
|
|
- =========== ==========================================================
|
|
-
|
|
- The sysfs interface provides two files:
|
|
-
|
|
- - /sys/devices/system/cpu/smt/control
|
|
- - /sys/devices/system/cpu/smt/active
|
|
-
|
|
- /sys/devices/system/cpu/smt/control:
|
|
-
|
|
- This file allows to read out the SMT control state and provides the
|
|
- ability to disable or (re)enable SMT. The possible states are:
|
|
-
|
|
- ============== ===================================================
|
|
- on SMT is supported by the CPU and enabled. All
|
|
- logical CPUs can be onlined and offlined without
|
|
- restrictions.
|
|
-
|
|
- off SMT is supported by the CPU and disabled. Only
|
|
- the so called primary SMT threads can be onlined
|
|
- and offlined without restrictions. An attempt to
|
|
- online a non-primary sibling is rejected
|
|
-
|
|
- forceoff Same as 'off' but the state cannot be controlled.
|
|
- Attempts to write to the control file are rejected.
|
|
-
|
|
- notsupported The processor does not support SMT. It's therefore
|
|
- not affected by the SMT implications of L1TF.
|
|
- Attempts to write to the control file are rejected.
|
|
- ============== ===================================================
|
|
-
|
|
- The possible states which can be written into this file to control SMT
|
|
- state are:
|
|
-
|
|
- - on
|
|
- - off
|
|
- - forceoff
|
|
-
|
|
- /sys/devices/system/cpu/smt/active:
|
|
-
|
|
- This file reports whether SMT is enabled and active, i.e. if on any
|
|
- physical core two or more sibling threads are online.
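 As a hedged illustration, a small C sketch that reads this file; the path
 is from the text above, the rest is assumed. Writing 'off' to the control
 file as root disables SMT at runtime unless the state is 'forceoff' or
 'notsupported'::

    #include <stdio.h>

    int main(void)
    {
            int active = -1;
            FILE *f = fopen("/sys/devices/system/cpu/smt/active", "r");

            if (!f) {
                    perror("smt/active");   /* kernel without SMT control */
                    return 1;
            }
            if (fscanf(f, "%d", &active) == 1)
                    printf("SMT is %s\n", active ? "enabled and active" : "not active");
            fclose(f);
            return 0;
    }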
|
|
-
|
|
- SMT control is also possible at boot time via the l1tf kernel command
|
|
- line parameter in combination with L1D flush control. See
|
|
- :ref:`mitigation_control_command_line`.
|
|
-
|
|
-5. Disabling EPT
|
|
-^^^^^^^^^^^^^^^^
|
|
-
|
|
- Disabling EPT for virtual machines provides full mitigation for L1TF even
|
|
- with SMT enabled, because the effective page tables for guests are
|
|
- managed and sanitized by the hypervisor. Though disabling EPT has a
|
|
- significant performance impact especially when the Meltdown mitigation
|
|
- KPTI is enabled.
|
|
-
|
|
- EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
|
|
-
|
|
-There is ongoing research and development for new mitigation mechanisms to
|
|
-address the performance impact of disabling SMT or EPT.
|
|
-
|
|
-.. _mitigation_control_command_line:
|
|
-
|
|
-Mitigation control on the kernel command line
|
|
----------------------------------------------
|
|
-
|
|
-The kernel command line allows to control the L1TF mitigations at boot
|
|
-time with the option "l1tf=". The valid arguments for this option are:
|
|
-
|
|
- ============ =============================================================
|
|
- full Provides all available mitigations for the L1TF
|
|
- vulnerability. Disables SMT and enables all mitigations in
|
|
- the hypervisors, i.e. unconditional L1D flushing
|
|
-
|
|
- SMT control and L1D flush control via the sysfs interface
|
|
- is still possible after boot. Hypervisors will issue a
|
|
- warning when the first VM is started in a potentially
|
|
- insecure configuration, i.e. SMT enabled or L1D flush
|
|
- disabled.
|
|
-
|
|
- full,force Same as 'full', but disables SMT and L1D flush runtime
|
|
- control. Implies the 'nosmt=force' command line option.
|
|
- (i.e. sysfs control of SMT is disabled.)
|
|
-
|
|
- flush Leaves SMT enabled and enables the default hypervisor
|
|
- mitigation, i.e. conditional L1D flushing
|
|
-
|
|
- SMT control and L1D flush control via the sysfs interface
|
|
- is still possible after boot. Hypervisors will issue a
|
|
- warning when the first VM is started in a potentially
|
|
- insecure configuration, i.e. SMT enabled or L1D flush
|
|
- disabled.
|
|
-
|
|
- flush,nosmt Disables SMT and enables the default hypervisor mitigation,
|
|
- i.e. conditional L1D flushing.
|
|
-
|
|
- SMT control and L1D flush control via the sysfs interface
|
|
- is still possible after boot. Hypervisors will issue a
|
|
- warning when the first VM is started in a potentially
|
|
- insecure configuration, i.e. SMT enabled or L1D flush
|
|
- disabled.
|
|
-
|
|
- flush,nowarn Same as 'flush', but hypervisors will not warn when a VM is
|
|
- started in a potentially insecure configuration.
|
|
-
|
|
- off Disables hypervisor mitigations and doesn't emit any
|
|
- warnings.
|
|
- ============ =============================================================
|
|
-
|
|
-The default is 'flush'. For details about L1D flushing see :ref:`l1d_flush`.
|
|
-
|
|
-
|
|
-.. _mitigation_control_kvm:
|
|
-
|
|
-Mitigation control for KVM - module parameter
|
|
--------------------------------------------------------------
|
|
-
|
|
-The KVM hypervisor mitigation mechanism, flushing the L1D cache when
|
|
-entering a guest, can be controlled with a module parameter.
|
|
-
|
|
-The option/parameter is "kvm-intel.vmentry_l1d_flush=". It takes the
|
|
-following arguments:
|
|
-
|
|
- ============ ==============================================================
|
|
- always L1D cache flush on every VMENTER.
|
|
-
|
|
- cond Flush L1D on VMENTER only when the code between VMEXIT and
|
|
- VMENTER can leak host memory which is considered
|
|
- interesting for an attacker. This still can leak host memory
|
|
- which allows e.g. to determine the hosts address space layout.
|
|
-
|
|
- never Disables the mitigation
|
|
- ============ ==============================================================
|
|
-
|
|
-The parameter can be provided on the kernel command line, as a module
|
|
-parameter when loading the modules and at runtime modified via the sysfs
|
|
-file:
|
|
-
|
|
-/sys/module/kvm_intel/parameters/vmentry_l1d_flush
|
|
-
|
|
-The default is 'cond'. If 'l1tf=full,force' is given on the kernel command
|
|
-line, then 'always' is enforced and the kvm-intel.vmentry_l1d_flush
|
|
-module parameter is ignored and writes to the sysfs file are rejected.
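 A minimal sketch, assuming root privileges and that runtime control has not
 been disabled via 'l1tf=full,force', which switches the flush mode to
 'always' through the sysfs file named above::

    #include <stdio.h>

    /* Switch KVM's L1D flush mode to 'always' at runtime (needs root). */
    int main(void)
    {
            FILE *f = fopen("/sys/module/kvm_intel/parameters/vmentry_l1d_flush", "w");

            if (!f) {
                    perror("vmentry_l1d_flush");    /* kvm_intel not loaded? */
                    return 1;
            }
            fputs("always\n", f);
            if (fclose(f))                          /* rejected under l1tf=full,force */
                    perror("write");
            return 0;
    }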
|
|
-
|
|
-
|
|
-Mitigation selection guide
|
|
---------------------------
|
|
-
|
|
-1. No virtualization in use
|
|
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
-
|
|
- The system is protected by the kernel unconditionally and no further
|
|
- action is required.
|
|
-
|
|
-2. Virtualization with trusted guests
|
|
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
-
|
|
- If the guest comes from a trusted source and the guest OS kernel is
|
|
- guaranteed to have the L1TF mitigations in place the system is fully
|
|
- protected against L1TF and no further action is required.
|
|
-
|
|
- To avoid the overhead of the default L1D flushing on VMENTER the
|
|
- administrator can disable the flushing via the kernel command line and
|
|
- sysfs control files. See :ref:`mitigation_control_command_line` and
|
|
- :ref:`mitigation_control_kvm`.
|
|
-
|
|
-
|
|
-3. Virtualization with untrusted guests
|
|
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
-
|
|
-3.1. SMT not supported or disabled
|
|
-""""""""""""""""""""""""""""""""""
|
|
-
|
|
- If SMT is not supported by the processor or disabled in the BIOS or by
|
|
- the kernel, it's only required to enforce L1D flushing on VMENTER.
|
|
-
|
|
- Conditional L1D flushing is the default behaviour and can be tuned. See
|
|
- :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
|
|
-
|
|
-3.2. EPT not supported or disabled
|
|
-""""""""""""""""""""""""""""""""""
|
|
-
|
|
- If EPT is not supported by the processor or disabled in the hypervisor,
|
|
- the system is fully protected. SMT can stay enabled and L1D flushing on
|
|
- VMENTER is not required.
|
|
-
|
|
- EPT can be disabled in the hypervisor via the 'kvm-intel.ept' parameter.
|
|
-
|
|
-3.3. SMT and EPT supported and active
|
|
-"""""""""""""""""""""""""""""""""""""
|
|
-
|
|
- If SMT and EPT are supported and active then various degrees of
|
|
- mitigations can be employed:
|
|
-
|
|
- - L1D flushing on VMENTER:
|
|
-
|
|
- L1D flushing on VMENTER is the minimal protection requirement, but it
|
|
- is only potent in combination with other mitigation methods.
|
|
-
|
|
- Conditional L1D flushing is the default behaviour and can be tuned. See
|
|
- :ref:`mitigation_control_command_line` and :ref:`mitigation_control_kvm`.
|
|
-
|
|
- - Guest confinement:
|
|
-
|
|
- Confinement of guests to a single or a group of physical cores which
|
|
- are not running any other processes, can reduce the attack surface
|
|
- significantly, but interrupts, soft interrupts and kernel threads can
|
|
- still expose valuable data to a potential attacker. See
|
|
- :ref:`guest_confinement`.
|
|
-
|
|
- - Interrupt isolation:
|
|
-
|
|
- Isolating the guest CPUs from interrupts can reduce the attack surface
|
|
- further, but still allows a malicious guest to explore a limited amount
|
|
- of host physical memory. This can at least be used to gain knowledge
|
|
- about the host address space layout. The interrupts which have a fixed
|
|
- affinity to the CPUs which run the untrusted guests can depending on
|
|
- the scenario still trigger soft interrupts and schedule kernel threads
|
|
- which might expose valuable information. See
|
|
- :ref:`interrupt_isolation`.
|
|
-
|
|
-The above three mitigation methods combined can provide protection to a
|
|
-certain degree, but the risk of the remaining attack surface has to be
|
|
-carefully analyzed. For full protection the following methods are
|
|
-available:
|
|
-
|
|
- - Disabling SMT:
|
|
-
|
|
- Disabling SMT and enforcing the L1D flushing provides the maximum
|
|
- amount of protection. This mitigation is not depending on any of the
|
|
- above mitigation methods.
|
|
-
|
|
- SMT control and L1D flushing can be tuned by the command line
|
|
- parameters 'nosmt', 'l1tf', 'kvm-intel.vmentry_l1d_flush' and at run
|
|
- time with the matching sysfs control files. See :ref:`smt_control`,
|
|
- :ref:`mitigation_control_command_line` and
|
|
- :ref:`mitigation_control_kvm`.
|
|
-
|
|
- - Disabling EPT:
|
|
-
|
|
- Disabling EPT provides the maximum amount of protection as well. It is
|
|
- not depending on any of the above mitigation methods. SMT can stay
|
|
- enabled and L1D flushing is not required, but the performance impact is
|
|
- significant.
|
|
-
|
|
- EPT can be disabled in the hypervisor via the 'kvm-intel.ept'
|
|
- parameter.
|
|
-
|
|
-3.4. Nested virtual machines
|
|
-""""""""""""""""""""""""""""
|
|
-
|
|
-When nested virtualization is in use, three operating systems are involved:
|
|
-the bare metal hypervisor, the nested hypervisor and the nested virtual
|
|
-machine. VMENTER operations from the nested hypervisor into the nested
|
|
-guest will always be processed by the bare metal hypervisor. If KVM is the
|
|
-bare metal hypervisor it will:
|
|
-
|
|
- - Flush the L1D cache on every switch from the nested hypervisor to the
|
|
- nested virtual machine, so that the nested hypervisor's secrets are not
|
|
- exposed to the nested virtual machine;
|
|
-
|
|
- - Flush the L1D cache on every switch from the nested virtual machine to
|
|
- the nested hypervisor; this is a complex operation, and flushing the L1D
|
|
- cache avoids that the bare metal hypervisor's secrets are exposed to the
|
|
- nested virtual machine;
|
|
-
|
|
- - Instruct the nested hypervisor to not perform any L1D cache flush. This
|
|
- is an optimization to avoid double L1D flushing.
|
|
-
|
|
-
|
|
-.. _default_mitigations:
|
|
-
|
|
-Default mitigations
|
|
--------------------
|
|
-
|
|
- The kernel default mitigations for vulnerable processors are:
|
|
-
|
|
- - PTE inversion to protect against malicious user space. This is done
|
|
- unconditionally and cannot be controlled.
|
|
-
|
|
- - L1D conditional flushing on VMENTER when EPT is enabled for
|
|
- a guest.
|
|
-
|
|
- The kernel does not by default enforce the disabling of SMT, which leaves
|
|
- SMT systems vulnerable when running untrusted guests with EPT enabled.
|
|
-
|
|
- The rationale for this choice is:
|
|
-
|
|
- - Force disabling SMT can break existing setups, especially with
|
|
- unattended updates.
|
|
-
|
|
- - If regular users run untrusted guests on their machine, then L1TF is
|
|
- just an add on to other malware which might be embedded in an untrusted
|
|
- guest, e.g. spam-bots or attacks on the local network.
|
|
-
|
|
- There is no technical way to prevent a user from running untrusted code
|
|
- on their machines blindly.
|
|
-
|
|
- - It's technically extremely unlikely and from today's knowledge even
|
|
- impossible that L1TF can be exploited via the most popular attack
|
|
- mechanisms like JavaScript because these mechanisms have no way to
|
|
- control PTEs. If this would be possible and not other mitigation would
|
|
- be possible, then the default might be different.
|
|
-
|
|
- - The administrators of cloud and hosting setups have to carefully
|
|
- analyze the risk for their scenarios and make the appropriate
|
|
- mitigation choices, which might even vary across their deployed
|
|
- machines and also result in other changes of their overall setup.
|
|
- There is no way for the kernel to provide a sensible default for this
|
|
- kind of scenarios.
|
|
diff --git a/Documentation/spec_ctrl.txt b/Documentation/spec_ctrl.txt
|
|
index 32f3d55c54b7..c4dbe6f7cdae 100644
|
|
--- a/Documentation/spec_ctrl.txt
|
|
+++ b/Documentation/spec_ctrl.txt
|
|
@@ -92,3 +92,12 @@ Speculation misfeature controls
|
|
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_ENABLE, 0, 0);
|
|
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0);
|
|
* prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_FORCE_DISABLE, 0, 0);
|
|
+
|
|
+- PR_SPEC_INDIRECT_BRANCH: Indirect Branch Speculation in User Processes
|
|
+ (Mitigate Spectre V2 style attacks against user processes)
|
|
+
|
|
+ Invocations:
|
|
+ * prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0);
|
|
+ * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_ENABLE, 0, 0);
|
|
+ * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_DISABLE, 0, 0);
|
|
+ * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, PR_SPEC_FORCE_DISABLE, 0, 0);
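 A minimal C sketch of these invocations, assuming a toolchain whose
 <sys/prctl.h> already carries the PR_SPEC_INDIRECT_BRANCH constants::

    #include <stdio.h>
    #include <sys/prctl.h>

    int main(void)
    {
            /* Query the current indirect branch speculation state of this task */
            int state = prctl(PR_GET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH, 0, 0, 0);

            printf("PR_SPEC_INDIRECT_BRANCH state: 0x%x\n", state);

            /* Disable indirect branch speculation for this task */
            if (prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
                      PR_SPEC_DISABLE, 0, 0))
                    perror("PR_SET_SPECULATION_CTRL");
            return 0;
    }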
|
|
diff --git a/Documentation/x86/conf.py b/Documentation/x86/conf.py
|
|
new file mode 100644
|
|
index 000000000000..33c5c3142e20
|
|
--- /dev/null
|
|
+++ b/Documentation/x86/conf.py
|
|
@@ -0,0 +1,10 @@
|
|
+# -*- coding: utf-8; mode: python -*-
|
|
+
|
|
+project = "X86 architecture specific documentation"
|
|
+
|
|
+tags.add("subproject")
|
|
+
|
|
+latex_documents = [
|
|
+ ('index', 'x86.tex', project,
|
|
+ 'The kernel development community', 'manual'),
|
|
+]
|
|
diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
|
|
new file mode 100644
|
|
index 000000000000..ef389dcf1b1d
|
|
--- /dev/null
|
|
+++ b/Documentation/x86/index.rst
|
|
@@ -0,0 +1,8 @@
|
|
+==========================
|
|
+x86 architecture specifics
|
|
+==========================
|
|
+
|
|
+.. toctree::
|
|
+ :maxdepth: 1
|
|
+
|
|
+ mds
|
|
diff --git a/Documentation/x86/mds.rst b/Documentation/x86/mds.rst
|
|
new file mode 100644
|
|
index 000000000000..534e9baa4e1d
|
|
--- /dev/null
|
|
+++ b/Documentation/x86/mds.rst
|
|
@@ -0,0 +1,225 @@
|
|
+Microarchitectural Data Sampling (MDS) mitigation
|
|
+=================================================
|
|
+
|
|
+.. _mds:
|
|
+
|
|
+Overview
|
|
+--------
|
|
+
|
|
+Microarchitectural Data Sampling (MDS) is a family of side channel attacks
|
|
+on internal buffers in Intel CPUs. The variants are:
|
|
+
|
|
+ - Microarchitectural Store Buffer Data Sampling (MSBDS) (CVE-2018-12126)
|
|
+ - Microarchitectural Fill Buffer Data Sampling (MFBDS) (CVE-2018-12130)
|
|
+ - Microarchitectural Load Port Data Sampling (MLPDS) (CVE-2018-12127)
|
|
+ - Microarchitectural Data Sampling Uncacheable Memory (MDSUM) (CVE-2019-11091)
|
|
+
|
|
+MSBDS leaks Store Buffer Entries which can be speculatively forwarded to a
|
|
+dependent load (store-to-load forwarding) as an optimization. The forward
|
|
+can also happen to a faulting or assisting load operation for a different
|
|
+memory address, which can be exploited under certain conditions. Store
|
|
+buffers are partitioned between Hyper-Threads so cross thread forwarding is
|
|
+not possible. But if a thread enters or exits a sleep state the store
|
|
+buffer is repartitioned which can expose data from one thread to the other.
|
|
+
|
|
+MFBDS leaks Fill Buffer Entries. Fill buffers are used internally to manage
|
|
+L1 miss situations and to hold data which is returned or sent in response
|
|
+to a memory or I/O operation. Fill buffers can forward data to a load
|
|
+operation and also write data to the cache. When the fill buffer is
|
|
+deallocated it can retain the stale data of the preceding operations which
|
|
+can then be forwarded to a faulting or assisting load operation, which can
|
|
+be exploited under certain conditions. Fill buffers are shared between
|
|
+Hyper-Threads so cross thread leakage is possible.
|
|
+
|
|
+MLPDS leaks Load Port Data. Load ports are used to perform load operations
|
|
+from memory or I/O. The received data is then forwarded to the register
|
|
+file or a subsequent operation. In some implementations the Load Port can
|
|
+contain stale data from a previous operation which can be forwarded to
|
|
+faulting or assisting loads under certain conditions, which again can be
|
|
+exploited eventually. Load ports are shared between Hyper-Threads so cross
|
|
+thread leakage is possible.
|
|
+
|
|
+MDSUM is a special case of MSBDS, MFBDS and MLPDS. An uncacheable load from
|
|
+memory that takes a fault or assist can leave data in a microarchitectural
|
|
+structure that may later be observed using one of the same methods used by
|
|
+MSBDS, MFBDS or MLPDS.
|
|
+
|
|
+Exposure assumptions
|
|
+--------------------
|
|
+
|
|
+It is assumed that attack code resides in user space or in a guest with one
|
|
+exception. The rationale behind this assumption is that the code construct
|
|
+needed for exploiting MDS requires:
|
|
+
|
|
+ - to control the load to trigger a fault or assist
|
|
+
|
|
+ - to have a disclosure gadget which exposes the speculatively accessed
|
|
+ data for consumption through a side channel.
|
|
+
|
|
+ - to control the pointer through which the disclosure gadget exposes the
|
|
+ data
|
|
+
|
|
+The existence of such a construct in the kernel cannot be excluded with
|
|
+100% certainty, but the complexity involved makes it extremely unlikely.
|
|
+
|
|
+There is one exception, which is untrusted BPF. The functionality of
|
|
+untrusted BPF is limited, but it needs to be thoroughly investigated
|
|
+whether it can be used to create such a construct.
|
|
+
|
|
+
|
|
+Mitigation strategy
|
|
+-------------------
|
|
+
|
|
+All variants have the same mitigation strategy at least for the single CPU
|
|
+thread case (SMT off): Force the CPU to clear the affected buffers.
|
|
+
|
|
+This is achieved by using the otherwise unused and obsolete VERW
|
|
+instruction in combination with a microcode update. The microcode clears
|
|
+the affected CPU buffers when the VERW instruction is executed.
|
|
+
|
|
+For virtualization there are two ways to achieve CPU buffer
+clearing: either via the modified VERW instruction or via the L1D Flush
+command. The latter is issued when L1TF mitigation is enabled so the extra
|
|
+VERW can be avoided. If the CPU is not affected by L1TF then VERW needs to
|
|
+be issued.
|
|
+
|
|
+If the VERW instruction with the supplied segment selector argument is
|
|
+executed on a CPU without the microcode update there is no side effect
|
|
+other than a small number of pointlessly wasted CPU cycles.
|
|
+
|
|
+This does not protect against cross Hyper-Thread attacks except for MSBDS
|
|
+which is only exploitable cross Hyper-thread when one of the Hyper-Threads
|
|
+enters a C-state.
|
|
+
|
|
+The kernel provides a function to invoke the buffer clearing:
|
|
+
|
|
+ mds_clear_cpu_buffers()
|
|
+
|
|
+The mitigation is invoked on kernel/userspace, hypervisor/guest and C-state
|
|
+(idle) transitions.
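 Further down in this patch the helper is added to
 arch/x86/include/asm/nospec-branch.h; condensed, its core looks like this::

    static inline void mds_clear_cpu_buffers(void)
    {
            static const u16 ds = __KERNEL_DS;

            /* Memory-operand VERW triggers the microcode based buffer clear */
            asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc");
    }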
|
|
+
|
|
+As a special quirk to address virtualization scenarios where the host has
|
|
+the microcode updated, but the hypervisor does not (yet) expose the
|
|
+MD_CLEAR CPUID bit to guests, the kernel issues the VERW instruction in the
|
|
+hope that it might actually clear the buffers. The state is reflected
|
|
+accordingly.
|
|
+
|
|
+According to current knowledge additional mitigations inside the kernel
|
|
+itself are not required because the necessary gadgets to expose the leaked
|
|
+data cannot be controlled in a way which allows exploitation from malicious
|
|
+user space or VM guests.
|
|
+
|
|
+Kernel internal mitigation modes
|
|
+--------------------------------
|
|
+
|
|
+ ======= ============================================================
+ off     Mitigation is disabled. Either the CPU is not affected or
+         mds=off is supplied on the kernel command line
+
+ full    Mitigation is enabled. CPU is affected and MD_CLEAR is
+         advertised in CPUID.
+
+ vmwerv  Mitigation is enabled. CPU is affected and MD_CLEAR is not
+         advertised in CPUID. That is mainly for virtualization
+         scenarios where the host has the updated microcode but the
+         hypervisor does not expose MD_CLEAR in CPUID. It's a best
+         effort approach without guarantee.
+ ======= ============================================================
|
|
+
|
|
+If the CPU is affected and mds=off is not supplied on the kernel command
|
|
+line then the kernel selects the appropriate mitigation mode depending on
|
|
+the availability of the MD_CLEAR CPUID bit.
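 A sketch of that selection logic; the enum mirrors the one this patch adds
 to asm/processor.h, while the helper name and parameters are made up for
 illustration::

    enum mds_mitigations {
            MDS_MITIGATION_OFF,
            MDS_MITIGATION_FULL,
            MDS_MITIGATION_VMWERV,
    };

    static enum mds_mitigations pick_mds_mode(int cpu_affected, int mds_off_cmdline,
                                              int has_md_clear)
    {
            if (!cpu_affected || mds_off_cmdline)
                    return MDS_MITIGATION_OFF;
            return has_md_clear ? MDS_MITIGATION_FULL : MDS_MITIGATION_VMWERV;
    }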
|
|
+
|
|
+Mitigation points
|
|
+-----------------
|
|
+
|
|
+1. Return to user space
|
|
+^^^^^^^^^^^^^^^^^^^^^^^
|
|
+
|
|
+ When transitioning from kernel to user space the CPU buffers are flushed
|
|
+ on affected CPUs when the mitigation is not disabled on the kernel
|
|
+ command line. The mitigation is enabled through the static key
|
|
+ mds_user_clear.
|
|
+
|
|
+ The mitigation is invoked in prepare_exit_to_usermode() which covers
|
|
+ most of the kernel to user space transitions. There are a few exceptions
|
|
+ which are not invoking prepare_exit_to_usermode() on return to user
|
|
+ space. These exceptions use the paranoid exit code.
|
|
+
|
|
+ - Non Maskable Interrupt (NMI):
|
|
+
|
|
+ Access to sensitive data like keys or credentials in the NMI context is
+ mostly theoretical: The CPU can do prefetching or execute a
+ misspeculated code path and thereby fetch data which might end up
|
|
+ leaking through a buffer.
|
|
+
|
|
+ But for mounting other attacks the kernel stack address of the task is
|
|
+ already valuable information. So in full mitigation mode, the NMI is
|
|
+ mitigated on the return from do_nmi() to provide almost complete
|
|
+ coverage.
|
|
+
|
|
+ - Double fault (#DF):
|
|
+
|
|
+ A double fault is usually fatal, but the ESPFIX workaround, which can
|
|
+ be triggered from user space through modify_ldt(2), is a recoverable
|
|
+ double fault. #DF uses the paranoid exit path, so explicit mitigation
|
|
+ in the double fault handler is required.
|
|
+
|
|
+ - Machine Check Exception (#MC):
|
|
+
|
|
+ Another corner case is a #MC which hits between the CPU buffer clear
|
|
+ invocation and the actual return to user. As this still is in kernel
|
|
+ space it takes the paranoid exit path which does not clear the CPU
|
|
+ buffers. So the #MC handler repopulates the buffers to some
|
|
+ extent. Machine checks are not reliably controllable and the window is
|
|
+ extremely small, so mitigation would just tick a checkbox that this
|
|
+ theoretical corner case is covered. To keep the amount of special
|
|
+ cases small, ignore #MC.
|
|
+
|
|
+ - Debug Exception (#DB):
|
|
+
|
|
+ This takes the paranoid exit path only when the INT1 breakpoint is in
|
|
+ kernel space. #DB on a user space address takes the regular exit path,
|
|
+ so no extra mitigation required.
|
|
+
|
|
+
|
|
+2. C-State transition
|
|
+^^^^^^^^^^^^^^^^^^^^^
|
|
+
|
|
+ When a CPU goes idle and enters a C-State the CPU buffers need to be
|
|
+ cleared on affected CPUs when SMT is active. This addresses the
|
|
+ repartitioning of the store buffer when one of the Hyper-Threads enters
|
|
+ a C-State.
|
|
+
|
|
+ When SMT is inactive, i.e. either the CPU does not support it or all
|
|
+ sibling threads are offline, CPU buffer clearing is not required.
|
|
+
|
|
+ The idle clearing is enabled on CPUs which are only affected by MSBDS
|
|
+ and not by any other MDS variant. The other MDS variants cannot be
|
|
+ protected against cross Hyper-Thread attacks because the Fill Buffer and
|
|
+ the Load Ports are shared. So on CPUs affected by other variants, the
|
|
+ idle clearing would be a window dressing exercise and is therefore not
|
|
+ activated.
|
|
+
|
|
+ The invocation is controlled by the static key mds_idle_clear which is
|
|
+ switched depending on the chosen mitigation mode and the SMT state of
|
|
+ the system.
|
|
+
|
|
+ The buffer clear is only invoked before entering the C-State to prevent
|
|
+ stale data of the idling CPU from spilling over to the Hyper-Thread
+ sibling after the store buffer is repartitioned and all entries become
+ available to the non-idle sibling.
|
|
+
|
|
+ When coming out of idle the store buffer is partitioned again so each
|
|
+ sibling has half of it available. The CPU returning from idle could then
+ be speculatively exposed to the contents of the sibling. The buffers are
|
|
+ flushed either on exit to user space or on VMENTER so malicious code
|
|
+ in user space or the guest cannot speculatively access them.
|
|
+
|
|
+ The mitigation is hooked into all variants of halt()/mwait(), but does
|
|
+ not cover the legacy ACPI IO-Port mechanism because the ACPI idle driver
|
|
+ has been superseded by the intel_idle driver around 2010 and is
|
|
+ preferred on all affected CPUs which are expected to gain the MD_CLEAR
|
|
+ functionality in microcode. Aside from that, the IO-Port mechanism is a
|
|
+ legacy interface which is only used on older systems which are either
|
|
+ not affected or do not receive microcode updates anymore.
|
|
diff --git a/Makefile b/Makefile
|
|
index e52b0579e176..92fe701e5582 100644
|
|
--- a/Makefile
|
|
+++ b/Makefile
|
|
@@ -1,6 +1,6 @@
|
|
VERSION = 4
|
|
PATCHLEVEL = 9
|
|
-SUBLEVEL = 175
|
|
+SUBLEVEL = 176
|
|
EXTRAVERSION =
|
|
NAME = Roaring Lionus
|
|
|
|
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
|
|
index 5a4591ff8407..e0055b4302d6 100644
|
|
--- a/arch/x86/Kconfig
|
|
+++ b/arch/x86/Kconfig
|
|
@@ -937,13 +937,7 @@ config NR_CPUS
|
|
approximately eight kilobytes to the kernel image.
|
|
|
|
config SCHED_SMT
|
|
- bool "SMT (Hyperthreading) scheduler support"
|
|
- depends on SMP
|
|
- ---help---
|
|
- SMT scheduler support improves the CPU scheduler's decision making
|
|
- when dealing with Intel Pentium 4 chips with HyperThreading at a
|
|
- cost of slightly increased overhead in some places. If unsure say
|
|
- N here.
|
|
+ def_bool y if SMP
|
|
|
|
config SCHED_MC
|
|
def_bool y
|
|
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
|
|
index b0cd306dc527..8841d016b4a4 100644
|
|
--- a/arch/x86/entry/common.c
|
|
+++ b/arch/x86/entry/common.c
|
|
@@ -28,6 +28,7 @@
|
|
#include <asm/vdso.h>
|
|
#include <asm/uaccess.h>
|
|
#include <asm/cpufeature.h>
|
|
+#include <asm/nospec-branch.h>
|
|
|
|
#define CREATE_TRACE_POINTS
|
|
#include <trace/events/syscalls.h>
|
|
@@ -206,6 +207,8 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
|
|
#endif
|
|
|
|
user_enter_irqoff();
|
|
+
|
|
+ mds_user_clear_cpu_buffers();
|
|
}
|
|
|
|
#define SYSCALL_EXIT_WORK_FLAGS \
|
|
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
|
|
index a30829052a00..cb8178a2783a 100644
|
|
--- a/arch/x86/events/intel/core.c
|
|
+++ b/arch/x86/events/intel/core.c
|
|
@@ -3750,11 +3750,11 @@ __init int intel_pmu_init(void)
|
|
pr_cont("Nehalem events, ");
|
|
break;
|
|
|
|
- case INTEL_FAM6_ATOM_PINEVIEW:
|
|
- case INTEL_FAM6_ATOM_LINCROFT:
|
|
- case INTEL_FAM6_ATOM_PENWELL:
|
|
- case INTEL_FAM6_ATOM_CLOVERVIEW:
|
|
- case INTEL_FAM6_ATOM_CEDARVIEW:
|
|
+ case INTEL_FAM6_ATOM_BONNELL:
|
|
+ case INTEL_FAM6_ATOM_BONNELL_MID:
|
|
+ case INTEL_FAM6_ATOM_SALTWELL:
|
|
+ case INTEL_FAM6_ATOM_SALTWELL_MID:
|
|
+ case INTEL_FAM6_ATOM_SALTWELL_TABLET:
|
|
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
|
|
sizeof(hw_cache_event_ids));
|
|
|
|
@@ -3766,9 +3766,11 @@ __init int intel_pmu_init(void)
|
|
pr_cont("Atom events, ");
|
|
break;
|
|
|
|
- case INTEL_FAM6_ATOM_SILVERMONT1:
|
|
- case INTEL_FAM6_ATOM_SILVERMONT2:
|
|
+ case INTEL_FAM6_ATOM_SILVERMONT:
|
|
+ case INTEL_FAM6_ATOM_SILVERMONT_X:
|
|
+ case INTEL_FAM6_ATOM_SILVERMONT_MID:
|
|
case INTEL_FAM6_ATOM_AIRMONT:
|
|
+ case INTEL_FAM6_ATOM_AIRMONT_MID:
|
|
memcpy(hw_cache_event_ids, slm_hw_cache_event_ids,
|
|
sizeof(hw_cache_event_ids));
|
|
memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs,
|
|
@@ -3785,7 +3787,7 @@ __init int intel_pmu_init(void)
|
|
break;
|
|
|
|
case INTEL_FAM6_ATOM_GOLDMONT:
|
|
- case INTEL_FAM6_ATOM_DENVERTON:
|
|
+ case INTEL_FAM6_ATOM_GOLDMONT_X:
|
|
memcpy(hw_cache_event_ids, glm_hw_cache_event_ids,
|
|
sizeof(hw_cache_event_ids));
|
|
memcpy(hw_cache_extra_regs, glm_hw_cache_extra_regs,
|
|
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
|
|
index 47d526c700a1..72d09340c24d 100644
|
|
--- a/arch/x86/events/intel/cstate.c
|
|
+++ b/arch/x86/events/intel/cstate.c
|
|
@@ -531,8 +531,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
|
|
|
|
X86_CSTATES_MODEL(INTEL_FAM6_HASWELL_ULT, hswult_cstates),
|
|
|
|
- X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT1, slm_cstates),
|
|
- X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT2, slm_cstates),
|
|
+ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT, slm_cstates),
|
|
+ X86_CSTATES_MODEL(INTEL_FAM6_ATOM_SILVERMONT_X, slm_cstates),
|
|
X86_CSTATES_MODEL(INTEL_FAM6_ATOM_AIRMONT, slm_cstates),
|
|
|
|
X86_CSTATES_MODEL(INTEL_FAM6_BROADWELL_CORE, snb_cstates),
|
|
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
|
|
index be0b1968d60a..68144a341903 100644
|
|
--- a/arch/x86/events/msr.c
|
|
+++ b/arch/x86/events/msr.c
|
|
@@ -61,8 +61,8 @@ static bool test_intel(int idx)
|
|
case INTEL_FAM6_BROADWELL_GT3E:
|
|
case INTEL_FAM6_BROADWELL_X:
|
|
|
|
- case INTEL_FAM6_ATOM_SILVERMONT1:
|
|
- case INTEL_FAM6_ATOM_SILVERMONT2:
|
|
+ case INTEL_FAM6_ATOM_SILVERMONT:
|
|
+ case INTEL_FAM6_ATOM_SILVERMONT_X:
|
|
case INTEL_FAM6_ATOM_AIRMONT:
|
|
if (idx == PERF_MSR_SMI)
|
|
return true;
|
|
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
|
|
index 98444b77fbe3..06de338be0d8 100644
|
|
--- a/arch/x86/include/asm/cpufeatures.h
|
|
+++ b/arch/x86/include/asm/cpufeatures.h
|
|
@@ -271,10 +271,12 @@
|
|
/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
|
|
#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */
|
|
#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */
|
|
-#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */
|
|
-#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */
|
|
-#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */
|
|
+#define X86_FEATURE_AMD_IBPB (13*32+12) /* "" Indirect Branch Prediction Barrier */
|
|
+#define X86_FEATURE_AMD_IBRS (13*32+14) /* "" Indirect Branch Restricted Speculation */
|
|
+#define X86_FEATURE_AMD_STIBP (13*32+15) /* "" Single Thread Indirect Branch Predictors */
|
|
+#define X86_FEATURE_AMD_SSBD (13*32+24) /* "" Speculative Store Bypass Disable */
|
|
#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
|
|
+#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
|
|
|
|
/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
|
|
#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */
|
|
@@ -315,6 +317,7 @@
|
|
#define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */
|
|
#define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
|
|
#define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */
|
|
+#define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
|
|
#define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */
|
|
#define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */
|
|
#define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */
|
|
@@ -352,5 +355,7 @@
|
|
#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
|
|
#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */
|
|
#define X86_BUG_L1TF X86_BUG(18) /* CPU is affected by L1 Terminal Fault */
|
|
+#define X86_BUG_MDS X86_BUG(19) /* CPU is affected by Microarchitectural data sampling */
|
|
+#define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSBDS variant of BUG_MDS */
|
|
|
|
#endif /* _ASM_X86_CPUFEATURES_H */
|
|
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
|
|
index 75b748a1deb8..ba7b6f736414 100644
|
|
--- a/arch/x86/include/asm/intel-family.h
|
|
+++ b/arch/x86/include/asm/intel-family.h
|
|
@@ -50,19 +50,23 @@
|
|
|
|
/* "Small Core" Processors (Atom) */
|
|
|
|
-#define INTEL_FAM6_ATOM_PINEVIEW 0x1C
|
|
-#define INTEL_FAM6_ATOM_LINCROFT 0x26
|
|
-#define INTEL_FAM6_ATOM_PENWELL 0x27
|
|
-#define INTEL_FAM6_ATOM_CLOVERVIEW 0x35
|
|
-#define INTEL_FAM6_ATOM_CEDARVIEW 0x36
|
|
-#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */
|
|
-#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */
|
|
-#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */
|
|
-#define INTEL_FAM6_ATOM_MERRIFIELD 0x4A /* Tangier */
|
|
-#define INTEL_FAM6_ATOM_MOOREFIELD 0x5A /* Anniedale */
|
|
-#define INTEL_FAM6_ATOM_GOLDMONT 0x5C
|
|
-#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */
|
|
-#define INTEL_FAM6_ATOM_GEMINI_LAKE 0x7A
|
|
+#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
|
|
+#define INTEL_FAM6_ATOM_BONNELL_MID 0x26 /* Silverthorne, Lincroft */
|
|
+
|
|
+#define INTEL_FAM6_ATOM_SALTWELL 0x36 /* Cedarview */
|
|
+#define INTEL_FAM6_ATOM_SALTWELL_MID 0x27 /* Penwell */
|
|
+#define INTEL_FAM6_ATOM_SALTWELL_TABLET 0x35 /* Cloverview */
|
|
+
|
|
+#define INTEL_FAM6_ATOM_SILVERMONT 0x37 /* Bay Trail, Valleyview */
|
|
+#define INTEL_FAM6_ATOM_SILVERMONT_X 0x4D /* Avaton, Rangely */
|
|
+#define INTEL_FAM6_ATOM_SILVERMONT_MID 0x4A /* Merrifield */
|
|
+
|
|
+#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* Cherry Trail, Braswell */
|
|
+#define INTEL_FAM6_ATOM_AIRMONT_MID 0x5A /* Moorefield */
|
|
+
|
|
+#define INTEL_FAM6_ATOM_GOLDMONT 0x5C /* Apollo Lake */
|
|
+#define INTEL_FAM6_ATOM_GOLDMONT_X 0x5F /* Denverton */
|
|
+#define INTEL_FAM6_ATOM_GOLDMONT_PLUS 0x7A /* Gemini Lake */
|
|
|
|
/* Xeon Phi */
|
|
|
|
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
|
|
index 508a062e6cf1..0c8f4281b151 100644
|
|
--- a/arch/x86/include/asm/irqflags.h
|
|
+++ b/arch/x86/include/asm/irqflags.h
|
|
@@ -5,6 +5,8 @@
|
|
|
|
#ifndef __ASSEMBLY__
|
|
|
|
+#include <asm/nospec-branch.h>
|
|
+
|
|
/* Provide __cpuidle; we can't safely include <linux/cpu.h> */
|
|
#define __cpuidle __attribute__((__section__(".cpuidle.text")))
|
|
|
|
@@ -53,11 +55,13 @@ static inline void native_irq_enable(void)
|
|
|
|
static inline __cpuidle void native_safe_halt(void)
|
|
{
|
|
+ mds_idle_clear_cpu_buffers();
|
|
asm volatile("sti; hlt": : :"memory");
|
|
}
|
|
|
|
static inline __cpuidle void native_halt(void)
|
|
{
|
|
+ mds_idle_clear_cpu_buffers();
|
|
asm volatile("hlt": : :"memory");
|
|
}
|
|
|
|
diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
|
|
index 5e69154c9f07..a61ec81b27db 100644
|
|
--- a/arch/x86/include/asm/microcode_intel.h
|
|
+++ b/arch/x86/include/asm/microcode_intel.h
|
|
@@ -52,6 +52,21 @@ struct extended_sigtable {
|
|
|
|
#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
|
|
|
|
+static inline u32 intel_get_microcode_revision(void)
|
|
+{
|
|
+ u32 rev, dummy;
|
|
+
|
|
+ native_wrmsrl(MSR_IA32_UCODE_REV, 0);
|
|
+
|
|
+ /* As documented in the SDM: Do a CPUID 1 here */
|
|
+ sync_core();
|
|
+
|
|
+ /* get the current revision from MSR 0x8B */
|
|
+ native_rdmsr(MSR_IA32_UCODE_REV, dummy, rev);
|
|
+
|
|
+ return rev;
|
|
+}
|
|
+
|
|
extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev);
|
|
extern int microcode_sanity_check(void *mc, int print_err);
|
|
extern int find_matching_signature(void *mc, unsigned int csig, int cpf);
|
|
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
|
|
index 9963e21ac443..38f94d07920d 100644
|
|
--- a/arch/x86/include/asm/msr-index.h
|
|
+++ b/arch/x86/include/asm/msr-index.h
|
|
@@ -1,6 +1,8 @@
|
|
#ifndef _ASM_X86_MSR_INDEX_H
|
|
#define _ASM_X86_MSR_INDEX_H
|
|
|
|
+#include <linux/bits.h>
|
|
+
|
|
/*
|
|
* CPU model specific register (MSR) numbers.
|
|
*
|
|
@@ -38,13 +40,14 @@
|
|
|
|
/* Intel MSRs. Some also available on other CPUs */
|
|
#define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */
|
|
-#define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */
|
|
-#define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */
|
|
+#define SPEC_CTRL_IBRS BIT(0) /* Indirect Branch Restricted Speculation */
|
|
+#define SPEC_CTRL_STIBP_SHIFT 1 /* Single Thread Indirect Branch Predictor (STIBP) bit */
|
|
+#define SPEC_CTRL_STIBP BIT(SPEC_CTRL_STIBP_SHIFT) /* STIBP mask */
|
|
#define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */
|
|
-#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
|
|
+#define SPEC_CTRL_SSBD BIT(SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */
|
|
|
|
#define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */
|
|
-#define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */
|
|
+#define PRED_CMD_IBPB BIT(0) /* Indirect Branch Prediction Barrier */
|
|
|
|
#define MSR_IA32_PERFCTR0 0x000000c1
|
|
#define MSR_IA32_PERFCTR1 0x000000c2
|
|
@@ -61,20 +64,25 @@
|
|
#define MSR_MTRRcap 0x000000fe
|
|
|
|
#define MSR_IA32_ARCH_CAPABILITIES 0x0000010a
|
|
-#define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */
|
|
-#define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */
|
|
-#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH (1 << 3) /* Skip L1D flush on vmentry */
|
|
-#define ARCH_CAP_SSB_NO (1 << 4) /*
|
|
- * Not susceptible to Speculative Store Bypass
|
|
- * attack, so no Speculative Store Bypass
|
|
- * control required.
|
|
- */
|
|
+#define ARCH_CAP_RDCL_NO BIT(0) /* Not susceptible to Meltdown */
|
|
+#define ARCH_CAP_IBRS_ALL BIT(1) /* Enhanced IBRS support */
|
|
+#define ARCH_CAP_SKIP_VMENTRY_L1DFLUSH BIT(3) /* Skip L1D flush on vmentry */
|
|
+#define ARCH_CAP_SSB_NO BIT(4) /*
|
|
+ * Not susceptible to Speculative Store Bypass
|
|
+ * attack, so no Speculative Store Bypass
|
|
+ * control required.
|
|
+ */
|
|
+#define ARCH_CAP_MDS_NO BIT(5) /*
|
|
+ * Not susceptible to
|
|
+ * Microarchitectural Data
|
|
+ * Sampling (MDS) vulnerabilities.
|
|
+ */
|
|
|
|
#define MSR_IA32_FLUSH_CMD 0x0000010b
|
|
-#define L1D_FLUSH (1 << 0) /*
|
|
- * Writeback and invalidate the
|
|
- * L1 data cache.
|
|
- */
|
|
+#define L1D_FLUSH BIT(0) /*
|
|
+ * Writeback and invalidate the
|
|
+ * L1 data cache.
|
|
+ */
|
|
|
|
#define MSR_IA32_BBL_CR_CTL 0x00000119
|
|
#define MSR_IA32_BBL_CR_CTL3 0x0000011e
|
|
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
|
|
index f37f2d8a2989..0b40cc442bda 100644
|
|
--- a/arch/x86/include/asm/mwait.h
|
|
+++ b/arch/x86/include/asm/mwait.h
|
|
@@ -4,6 +4,7 @@
|
|
#include <linux/sched.h>
|
|
|
|
#include <asm/cpufeature.h>
|
|
+#include <asm/nospec-branch.h>
|
|
|
|
#define MWAIT_SUBSTATE_MASK 0xf
|
|
#define MWAIT_CSTATE_MASK 0xf
|
|
@@ -38,6 +39,8 @@ static inline void __monitorx(const void *eax, unsigned long ecx,
|
|
|
|
static inline void __mwait(unsigned long eax, unsigned long ecx)
|
|
{
|
|
+ mds_idle_clear_cpu_buffers();
|
|
+
|
|
/* "mwait %eax, %ecx;" */
|
|
asm volatile(".byte 0x0f, 0x01, 0xc9;"
|
|
:: "a" (eax), "c" (ecx));
|
|
@@ -72,6 +75,8 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
|
|
static inline void __mwaitx(unsigned long eax, unsigned long ebx,
|
|
unsigned long ecx)
|
|
{
|
|
+ /* No MDS buffer clear as this is AMD/HYGON only */
|
|
+
|
|
/* "mwaitx %eax, %ebx, %ecx;" */
|
|
asm volatile(".byte 0x0f, 0x01, 0xfb;"
|
|
:: "a" (eax), "b" (ebx), "c" (ecx));
|
|
@@ -79,6 +84,8 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx,
|
|
|
|
static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
|
|
{
|
|
+ mds_idle_clear_cpu_buffers();
|
|
+
|
|
trace_hardirqs_on();
|
|
/* "mwait %eax, %ecx;" */
|
|
asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
|
|
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
|
|
index 1b4132161c1f..031a58e84e5b 100644
|
|
--- a/arch/x86/include/asm/nospec-branch.h
|
|
+++ b/arch/x86/include/asm/nospec-branch.h
|
|
@@ -3,6 +3,8 @@
|
|
#ifndef _ASM_X86_NOSPEC_BRANCH_H_
|
|
#define _ASM_X86_NOSPEC_BRANCH_H_
|
|
|
|
+#include <linux/static_key.h>
|
|
+
|
|
#include <asm/alternative.h>
|
|
#include <asm/alternative-asm.h>
|
|
#include <asm/cpufeatures.h>
|
|
@@ -214,10 +216,17 @@ enum spectre_v2_mitigation {
|
|
SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
|
|
SPECTRE_V2_RETPOLINE_GENERIC,
|
|
SPECTRE_V2_RETPOLINE_AMD,
|
|
- SPECTRE_V2_IBRS,
|
|
SPECTRE_V2_IBRS_ENHANCED,
|
|
};
|
|
|
|
+/* The indirect branch speculation control variants */
|
|
+enum spectre_v2_user_mitigation {
|
|
+ SPECTRE_V2_USER_NONE,
|
|
+ SPECTRE_V2_USER_STRICT,
|
|
+ SPECTRE_V2_USER_PRCTL,
|
|
+ SPECTRE_V2_USER_SECCOMP,
|
|
+};
|
|
+
|
|
/* The Speculative Store Bypass disable variants */
|
|
enum ssb_mitigation {
|
|
SPEC_STORE_BYPASS_NONE,
|
|
@@ -295,6 +304,60 @@ do { \
|
|
preempt_enable(); \
|
|
} while (0)
|
|
|
|
+DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
|
|
+DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
|
|
+DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
|
|
+
|
|
+DECLARE_STATIC_KEY_FALSE(mds_user_clear);
|
|
+DECLARE_STATIC_KEY_FALSE(mds_idle_clear);
|
|
+
|
|
+#include <asm/segment.h>
|
|
+
|
|
+/**
|
|
+ * mds_clear_cpu_buffers - Mitigation for MDS vulnerability
|
|
+ *
|
|
+ * This uses the otherwise unused and obsolete VERW instruction in
|
|
+ * combination with microcode which triggers a CPU buffer flush when the
|
|
+ * instruction is executed.
|
|
+ */
|
|
+static inline void mds_clear_cpu_buffers(void)
|
|
+{
|
|
+ static const u16 ds = __KERNEL_DS;
|
|
+
|
|
+ /*
|
|
+ * Has to be the memory-operand variant because only that
|
|
+ * guarantees the CPU buffer flush functionality according to
|
|
+ * documentation. The register-operand variant does not.
|
|
+ * Works with any segment selector, but a valid writable
|
|
+ * data segment is the fastest variant.
|
|
+ *
|
|
+ * "cc" clobber is required because VERW modifies ZF.
|
|
+ */
|
|
+ asm volatile("verw %[ds]" : : [ds] "m" (ds) : "cc");
|
|
+}
|
|
+
|
|
+/**
|
|
+ * mds_user_clear_cpu_buffers - Mitigation for MDS vulnerability
|
|
+ *
|
|
+ * Clear CPU buffers if the corresponding static key is enabled
|
|
+ */
|
|
+static inline void mds_user_clear_cpu_buffers(void)
|
|
+{
|
|
+ if (static_branch_likely(&mds_user_clear))
|
|
+ mds_clear_cpu_buffers();
|
|
+}
|
|
+
|
|
+/**
|
|
+ * mds_idle_clear_cpu_buffers - Mitigation for MDS vulnerability
|
|
+ *
|
|
+ * Clear CPU buffers if the corresponding static key is enabled
|
|
+ */
|
|
+static inline void mds_idle_clear_cpu_buffers(void)
|
|
+{
|
|
+ if (static_branch_likely(&mds_idle_clear))
|
|
+ mds_clear_cpu_buffers();
|
|
+}
|
|
+
|
|
#endif /* __ASSEMBLY__ */
|
|
|
|
/*
|
|
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
|
|
index 221a32ed1372..f12e61e2a86b 100644
|
|
--- a/arch/x86/include/asm/pgtable_64.h
|
|
+++ b/arch/x86/include/asm/pgtable_64.h
|
|
@@ -44,15 +44,15 @@ struct mm_struct;
|
|
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
|
|
|
|
|
|
-static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
|
|
- pte_t *ptep)
|
|
+static inline void native_set_pte(pte_t *ptep, pte_t pte)
|
|
{
|
|
- *ptep = native_make_pte(0);
|
|
+ WRITE_ONCE(*ptep, pte);
|
|
}
|
|
|
|
-static inline void native_set_pte(pte_t *ptep, pte_t pte)
|
|
+static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
|
|
+ pte_t *ptep)
|
|
{
|
|
- *ptep = pte;
|
|
+ native_set_pte(ptep, native_make_pte(0));
|
|
}
|
|
|
|
static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
|
|
@@ -62,7 +62,7 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
|
|
|
|
static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
|
|
{
|
|
- *pmdp = pmd;
|
|
+ WRITE_ONCE(*pmdp, pmd);
|
|
}
|
|
|
|
static inline void native_pmd_clear(pmd_t *pmd)
|
|
@@ -98,7 +98,7 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
|
|
|
|
static inline void native_set_pud(pud_t *pudp, pud_t pud)
|
|
{
|
|
- *pudp = pud;
|
|
+ WRITE_ONCE(*pudp, pud);
|
|
}
|
|
|
|
static inline void native_pud_clear(pud_t *pud)
|
|
@@ -131,7 +131,7 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
|
|
|
|
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
|
|
{
|
|
- *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
|
|
+ WRITE_ONCE(*pgdp, kaiser_set_shadow_pgd(pgdp, pgd));
|
|
}
|
|
|
|
static inline void native_pgd_clear(pgd_t *pgd)
|
|
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
|
|
index ee8c6290c421..155e49fc7010 100644
|
|
--- a/arch/x86/include/asm/processor.h
|
|
+++ b/arch/x86/include/asm/processor.h
|
|
@@ -874,4 +874,10 @@ enum l1tf_mitigations {
|
|
|
|
extern enum l1tf_mitigations l1tf_mitigation;
|
|
|
|
+enum mds_mitigations {
|
|
+ MDS_MITIGATION_OFF,
|
|
+ MDS_MITIGATION_FULL,
|
|
+ MDS_MITIGATION_VMWERV,
|
|
+};
|
|
+
|
|
#endif /* _ASM_X86_PROCESSOR_H */
|
|
diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h
|
|
index ae7c2c5cd7f0..5393babc0598 100644
|
|
--- a/arch/x86/include/asm/spec-ctrl.h
|
|
+++ b/arch/x86/include/asm/spec-ctrl.h
|
|
@@ -53,12 +53,24 @@ static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn)
|
|
return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
|
|
}
|
|
|
|
+static inline u64 stibp_tif_to_spec_ctrl(u64 tifn)
|
|
+{
|
|
+ BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT);
|
|
+ return (tifn & _TIF_SPEC_IB) >> (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
|
|
+}
|
|
+
|
|
static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl)
|
|
{
|
|
BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT);
|
|
return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT);
|
|
}
|
|
|
|
+static inline unsigned long stibp_spec_ctrl_to_tif(u64 spec_ctrl)
|
|
+{
|
|
+ BUILD_BUG_ON(TIF_SPEC_IB < SPEC_CTRL_STIBP_SHIFT);
|
|
+ return (spec_ctrl & SPEC_CTRL_STIBP) << (TIF_SPEC_IB - SPEC_CTRL_STIBP_SHIFT);
|
|
+}
|
|
+
|
|
static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn)
|
|
{
|
|
return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL;
|
|
@@ -70,11 +82,7 @@ extern void speculative_store_bypass_ht_init(void);
|
|
static inline void speculative_store_bypass_ht_init(void) { }
|
|
#endif
|
|
|
|
-extern void speculative_store_bypass_update(unsigned long tif);
|
|
-
|
|
-static inline void speculative_store_bypass_update_current(void)
|
|
-{
|
|
- speculative_store_bypass_update(current_thread_info()->flags);
|
|
-}
|
|
+extern void speculation_ctrl_update(unsigned long tif);
|
|
+extern void speculation_ctrl_update_current(void);
|
|
|
|
#endif
|
|
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
|
|
index 5cb436acd463..676e84f521ba 100644
|
|
--- a/arch/x86/include/asm/switch_to.h
|
|
+++ b/arch/x86/include/asm/switch_to.h
|
|
@@ -8,9 +8,6 @@ struct task_struct *__switch_to_asm(struct task_struct *prev,
|
|
|
|
__visible struct task_struct *__switch_to(struct task_struct *prev,
|
|
struct task_struct *next);
|
|
-struct tss_struct;
|
|
-void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
|
|
- struct tss_struct *tss);
|
|
|
|
/* This runs runs on the previous thread's stack. */
|
|
static inline void prepare_switch_to(struct task_struct *prev,
|
|
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
|
|
index 2d8788a59b4d..0438f7fbb383 100644
|
|
--- a/arch/x86/include/asm/thread_info.h
|
|
+++ b/arch/x86/include/asm/thread_info.h
|
|
@@ -83,10 +83,12 @@ struct thread_info {
|
|
#define TIF_SIGPENDING 2 /* signal pending */
|
|
#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
|
|
#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
|
|
-#define TIF_SSBD 5 /* Reduced data speculation */
|
|
+#define TIF_SSBD 5 /* Speculative store bypass disable */
|
|
#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
|
|
#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
|
|
#define TIF_SECCOMP 8 /* secure computing */
|
|
+#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */
|
|
+#define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */
|
|
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
|
|
#define TIF_UPROBE 12 /* breakpointed or singlestepping */
|
|
#define TIF_NOTSC 16 /* TSC is not accessible in userland */
|
|
@@ -111,6 +113,8 @@ struct thread_info {
|
|
#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
|
|
#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
|
|
#define _TIF_SECCOMP (1 << TIF_SECCOMP)
|
|
+#define _TIF_SPEC_IB (1 << TIF_SPEC_IB)
|
|
+#define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE)
|
|
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
|
|
#define _TIF_UPROBE (1 << TIF_UPROBE)
|
|
#define _TIF_NOTSC (1 << TIF_NOTSC)
|
|
@@ -140,8 +144,18 @@ struct thread_info {
|
|
_TIF_NOHZ)
|
|
|
|
/* flags to check in __switch_to() */
|
|
-#define _TIF_WORK_CTXSW \
|
|
- (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_SSBD)
|
|
+#define _TIF_WORK_CTXSW_BASE \
|
|
+ (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP| \
|
|
+ _TIF_SSBD | _TIF_SPEC_FORCE_UPDATE)
|
|
+
|
|
+/*
|
|
+ * Avoid calls to __switch_to_xtra() on UP as STIBP is not evaluated.
|
|
+ */
|
|
+#ifdef CONFIG_SMP
|
|
+# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE | _TIF_SPEC_IB)
|
|
+#else
|
|
+# define _TIF_WORK_CTXSW (_TIF_WORK_CTXSW_BASE)
|
|
+#endif
|
|
|
|
#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
|
|
#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
|
|
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
|
|
index 686a58d793e5..f5ca15622dc9 100644
|
|
--- a/arch/x86/include/asm/tlbflush.h
|
|
+++ b/arch/x86/include/asm/tlbflush.h
|
|
@@ -68,8 +68,12 @@ static inline void invpcid_flush_all_nonglobals(void)
|
|
struct tlb_state {
|
|
struct mm_struct *active_mm;
|
|
int state;
|
|
- /* last user mm's ctx id */
|
|
- u64 last_ctx_id;
|
|
+
|
|
+ /* Last user mm for optimizing IBPB */
|
|
+ union {
|
|
+ struct mm_struct *last_user_mm;
|
|
+ unsigned long last_user_mm_ibpb;
|
|
+ };
|
|
|
|
/*
|
|
* Access to this CR4 shadow and to H/W CR4 is protected by
|
|
diff --git a/arch/x86/include/uapi/asm/Kbuild b/arch/x86/include/uapi/asm/Kbuild
|
|
index 3dec769cadf7..1c532b3f18ea 100644
|
|
--- a/arch/x86/include/uapi/asm/Kbuild
|
|
+++ b/arch/x86/include/uapi/asm/Kbuild
|
|
@@ -27,7 +27,6 @@ header-y += ldt.h
|
|
header-y += mce.h
|
|
header-y += mman.h
|
|
header-y += msgbuf.h
|
|
-header-y += msr-index.h
|
|
header-y += msr.h
|
|
header-y += mtrr.h
|
|
header-y += param.h
|
|
diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h
|
|
index 69a6e07e3149..db7dae58745f 100644
|
|
--- a/arch/x86/include/uapi/asm/mce.h
|
|
+++ b/arch/x86/include/uapi/asm/mce.h
|
|
@@ -28,6 +28,8 @@ struct mce {
|
|
__u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */
|
|
__u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */
|
|
__u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */
|
|
+ __u64 ppin; /* Protected Processor Inventory Number */
|
|
+ __u32 microcode;/* Microcode revision */
|
|
};
|
|
|
|
#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
|
|
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
|
|
index 6221166e3fca..16970c39baea 100644
|
|
--- a/arch/x86/kernel/cpu/bugs.c
|
|
+++ b/arch/x86/kernel/cpu/bugs.c
|
|
@@ -13,6 +13,7 @@
|
|
#include <linux/module.h>
|
|
#include <linux/nospec.h>
|
|
#include <linux/prctl.h>
|
|
+#include <linux/sched/smt.h>
|
|
|
|
#include <asm/spec-ctrl.h>
|
|
#include <asm/cmdline.h>
|
|
@@ -24,6 +25,7 @@
|
|
#include <asm/vmx.h>
|
|
#include <asm/paravirt.h>
|
|
#include <asm/alternative.h>
|
|
+#include <asm/hypervisor.h>
|
|
#include <asm/pgtable.h>
|
|
#include <asm/cacheflush.h>
|
|
#include <asm/intel-family.h>
|
|
@@ -32,13 +34,12 @@
|
|
static void __init spectre_v2_select_mitigation(void);
|
|
static void __init ssb_select_mitigation(void);
|
|
static void __init l1tf_select_mitigation(void);
|
|
+static void __init mds_select_mitigation(void);
|
|
|
|
-/*
|
|
- * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any
|
|
- * writes to SPEC_CTRL contain whatever reserved bits have been set.
|
|
- */
|
|
-u64 __ro_after_init x86_spec_ctrl_base;
|
|
+/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
|
|
+u64 x86_spec_ctrl_base;
|
|
EXPORT_SYMBOL_GPL(x86_spec_ctrl_base);
|
|
+static DEFINE_MUTEX(spec_ctrl_mutex);
|
|
|
|
/*
|
|
* The vendor and possibly platform specific bits which can be modified in
|
|
@@ -53,6 +54,20 @@ static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS;
|
|
u64 __ro_after_init x86_amd_ls_cfg_base;
|
|
u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask;
|
|
|
|
+/* Control conditional STIBP in switch_to() */
|
|
+DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp);
|
|
+/* Control conditional IBPB in switch_mm() */
|
|
+DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
|
|
+/* Control unconditional IBPB in switch_mm() */
|
|
+DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
|
|
+
|
|
+/* Control MDS CPU buffer clear before returning to user space */
|
|
+DEFINE_STATIC_KEY_FALSE(mds_user_clear);
|
|
+EXPORT_SYMBOL_GPL(mds_user_clear);
|
|
+/* Control MDS CPU buffer clear before idling (halt, mwait) */
|
|
+DEFINE_STATIC_KEY_FALSE(mds_idle_clear);
|
|
+EXPORT_SYMBOL_GPL(mds_idle_clear);
|
|
+
|
|
void __init check_bugs(void)
|
|
{
|
|
identify_boot_cpu();
|
|
@@ -91,6 +106,10 @@ void __init check_bugs(void)
|
|
|
|
l1tf_select_mitigation();
|
|
|
|
+ mds_select_mitigation();
|
|
+
|
|
+ arch_smt_update();
|
|
+
|
|
#ifdef CONFIG_X86_32
|
|
/*
|
|
* Check whether we are able to run this kernel safely on SMP.
|
|
@@ -123,31 +142,6 @@ void __init check_bugs(void)
|
|
#endif
|
|
}
|
|
|
|
-/* The kernel command line selection */
|
|
-enum spectre_v2_mitigation_cmd {
|
|
- SPECTRE_V2_CMD_NONE,
|
|
- SPECTRE_V2_CMD_AUTO,
|
|
- SPECTRE_V2_CMD_FORCE,
|
|
- SPECTRE_V2_CMD_RETPOLINE,
|
|
- SPECTRE_V2_CMD_RETPOLINE_GENERIC,
|
|
- SPECTRE_V2_CMD_RETPOLINE_AMD,
|
|
-};
|
|
-
|
|
-static const char *spectre_v2_strings[] = {
|
|
- [SPECTRE_V2_NONE] = "Vulnerable",
|
|
- [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline",
|
|
- [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline",
|
|
- [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
|
|
- [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
|
|
- [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS",
|
|
-};
|
|
-
|
|
-#undef pr_fmt
|
|
-#define pr_fmt(fmt) "Spectre V2 : " fmt
|
|
-
|
|
-static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
|
|
- SPECTRE_V2_NONE;
|
|
-
|
|
void
|
|
x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
|
|
{
|
|
@@ -165,9 +159,14 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
|
|
guestval |= guest_spec_ctrl & x86_spec_ctrl_mask;
|
|
|
|
/* SSBD controlled in MSR_SPEC_CTRL */
|
|
- if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD))
|
|
+ if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
|
|
+ static_cpu_has(X86_FEATURE_AMD_SSBD))
|
|
hostval |= ssbd_tif_to_spec_ctrl(ti->flags);
|
|
|
|
+ /* Conditional STIBP enabled? */
|
|
+ if (static_branch_unlikely(&switch_to_cond_stibp))
|
|
+ hostval |= stibp_tif_to_spec_ctrl(ti->flags);
|
|
+
|
|
if (hostval != guestval) {
|
|
msrval = setguest ? guestval : hostval;
|
|
wrmsrl(MSR_IA32_SPEC_CTRL, msrval);
|
|
@@ -201,7 +200,7 @@ x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest)
|
|
tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) :
|
|
ssbd_spec_ctrl_to_tif(hostval);
|
|
|
|
- speculative_store_bypass_update(tif);
|
|
+ speculation_ctrl_update(tif);
|
|
}
|
|
}
|
|
EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl);
|
|
@@ -216,6 +215,70 @@ static void x86_amd_ssb_disable(void)
|
|
wrmsrl(MSR_AMD64_LS_CFG, msrval);
|
|
}
|
|
|
|
+#undef pr_fmt
|
|
+#define pr_fmt(fmt) "MDS: " fmt
|
|
+
|
|
+/* Default mitigation for MDS-affected CPUs */
|
|
+static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL;
|
|
+static bool mds_nosmt __ro_after_init = false;
|
|
+
|
|
+static const char * const mds_strings[] = {
|
|
+ [MDS_MITIGATION_OFF] = "Vulnerable",
|
|
+ [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers",
|
|
+ [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode",
|
|
+};
|
|
+
|
|
+static void __init mds_select_mitigation(void)
|
|
+{
|
|
+ if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) {
|
|
+ mds_mitigation = MDS_MITIGATION_OFF;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (mds_mitigation == MDS_MITIGATION_FULL) {
|
|
+ if (!boot_cpu_has(X86_FEATURE_MD_CLEAR))
|
|
+ mds_mitigation = MDS_MITIGATION_VMWERV;
|
|
+
|
|
+ static_branch_enable(&mds_user_clear);
|
|
+
|
|
+ if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) &&
|
|
+ (mds_nosmt || cpu_mitigations_auto_nosmt()))
|
|
+ cpu_smt_disable(false);
|
|
+ }
|
|
+
|
|
+ pr_info("%s\n", mds_strings[mds_mitigation]);
|
|
+}
|
|
+
|
|
+static int __init mds_cmdline(char *str)
|
|
+{
|
|
+ if (!boot_cpu_has_bug(X86_BUG_MDS))
|
|
+ return 0;
|
|
+
|
|
+ if (!str)
|
|
+ return -EINVAL;
|
|
+
|
|
+ if (!strcmp(str, "off"))
|
|
+ mds_mitigation = MDS_MITIGATION_OFF;
|
|
+ else if (!strcmp(str, "full"))
|
|
+ mds_mitigation = MDS_MITIGATION_FULL;
|
|
+ else if (!strcmp(str, "full,nosmt")) {
|
|
+ mds_mitigation = MDS_MITIGATION_FULL;
|
|
+ mds_nosmt = true;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+early_param("mds", mds_cmdline);
|
|
+
|
|
+#undef pr_fmt
|
|
+#define pr_fmt(fmt) "Spectre V2 : " fmt
|
|
+
|
|
+static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init =
|
|
+ SPECTRE_V2_NONE;
|
|
+
|
|
+static enum spectre_v2_user_mitigation spectre_v2_user __ro_after_init =
|
|
+ SPECTRE_V2_USER_NONE;
|
|
+
|
|
#ifdef RETPOLINE
|
|
static bool spectre_v2_bad_module;
|
|
|
|
@@ -237,67 +300,225 @@ static inline const char *spectre_v2_module_string(void)
|
|
static inline const char *spectre_v2_module_string(void) { return ""; }
|
|
#endif
|
|
|
|
-static void __init spec2_print_if_insecure(const char *reason)
|
|
+static inline bool match_option(const char *arg, int arglen, const char *opt)
|
|
{
|
|
- if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
|
|
- pr_info("%s selected on command line.\n", reason);
|
|
+ int len = strlen(opt);
|
|
+
|
|
+ return len == arglen && !strncmp(arg, opt, len);
|
|
}
|
|
|
|
-static void __init spec2_print_if_secure(const char *reason)
|
|
+/* The kernel command line selection for spectre v2 */
|
|
+enum spectre_v2_mitigation_cmd {
|
|
+ SPECTRE_V2_CMD_NONE,
|
|
+ SPECTRE_V2_CMD_AUTO,
|
|
+ SPECTRE_V2_CMD_FORCE,
|
|
+ SPECTRE_V2_CMD_RETPOLINE,
|
|
+ SPECTRE_V2_CMD_RETPOLINE_GENERIC,
|
|
+ SPECTRE_V2_CMD_RETPOLINE_AMD,
|
|
+};
|
|
+
|
|
+enum spectre_v2_user_cmd {
|
|
+ SPECTRE_V2_USER_CMD_NONE,
|
|
+ SPECTRE_V2_USER_CMD_AUTO,
|
|
+ SPECTRE_V2_USER_CMD_FORCE,
|
|
+ SPECTRE_V2_USER_CMD_PRCTL,
|
|
+ SPECTRE_V2_USER_CMD_PRCTL_IBPB,
|
|
+ SPECTRE_V2_USER_CMD_SECCOMP,
|
|
+ SPECTRE_V2_USER_CMD_SECCOMP_IBPB,
|
|
+};
|
|
+
|
|
+static const char * const spectre_v2_user_strings[] = {
|
|
+ [SPECTRE_V2_USER_NONE] = "User space: Vulnerable",
|
|
+ [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection",
|
|
+ [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl",
|
|
+ [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl",
|
|
+};
|
|
+
|
|
+static const struct {
|
|
+ const char *option;
|
|
+ enum spectre_v2_user_cmd cmd;
|
|
+ bool secure;
|
|
+} v2_user_options[] __initconst = {
|
|
+ { "auto", SPECTRE_V2_USER_CMD_AUTO, false },
|
|
+ { "off", SPECTRE_V2_USER_CMD_NONE, false },
|
|
+ { "on", SPECTRE_V2_USER_CMD_FORCE, true },
|
|
+ { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false },
|
|
+ { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false },
|
|
+ { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false },
|
|
+ { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false },
|
|
+};
|
|
+
|
|
+static void __init spec_v2_user_print_cond(const char *reason, bool secure)
|
|
{
|
|
- if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
|
|
- pr_info("%s selected on command line.\n", reason);
|
|
+ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure)
|
|
+ pr_info("spectre_v2_user=%s forced on command line.\n", reason);
|
|
}
|
|
|
|
-static inline bool retp_compiler(void)
|
|
+static enum spectre_v2_user_cmd __init
|
|
+spectre_v2_parse_user_cmdline(enum spectre_v2_mitigation_cmd v2_cmd)
|
|
{
|
|
- return __is_defined(RETPOLINE);
|
|
+ char arg[20];
|
|
+ int ret, i;
|
|
+
|
|
+ switch (v2_cmd) {
|
|
+ case SPECTRE_V2_CMD_NONE:
|
|
+ return SPECTRE_V2_USER_CMD_NONE;
|
|
+ case SPECTRE_V2_CMD_FORCE:
|
|
+ return SPECTRE_V2_USER_CMD_FORCE;
|
|
+ default:
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ ret = cmdline_find_option(boot_command_line, "spectre_v2_user",
|
|
+ arg, sizeof(arg));
|
|
+ if (ret < 0)
|
|
+ return SPECTRE_V2_USER_CMD_AUTO;
|
|
+
|
|
+ for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) {
|
|
+ if (match_option(arg, ret, v2_user_options[i].option)) {
|
|
+ spec_v2_user_print_cond(v2_user_options[i].option,
|
|
+ v2_user_options[i].secure);
|
|
+ return v2_user_options[i].cmd;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg);
|
|
+ return SPECTRE_V2_USER_CMD_AUTO;
|
|
}
|
|
|
|
-static inline bool match_option(const char *arg, int arglen, const char *opt)
|
|
+static void __init
|
|
+spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
|
|
{
|
|
- int len = strlen(opt);
|
|
+ enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE;
|
|
+ bool smt_possible = IS_ENABLED(CONFIG_SMP);
|
|
+ enum spectre_v2_user_cmd cmd;
|
|
|
|
- return len == arglen && !strncmp(arg, opt, len);
|
|
+ if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP))
|
|
+ return;
|
|
+
|
|
+ if (cpu_smt_control == CPU_SMT_FORCE_DISABLED ||
|
|
+ cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
|
|
+ smt_possible = false;
|
|
+
|
|
+ cmd = spectre_v2_parse_user_cmdline(v2_cmd);
|
|
+ switch (cmd) {
|
|
+ case SPECTRE_V2_USER_CMD_NONE:
|
|
+ goto set_mode;
|
|
+ case SPECTRE_V2_USER_CMD_FORCE:
|
|
+ mode = SPECTRE_V2_USER_STRICT;
|
|
+ break;
|
|
+ case SPECTRE_V2_USER_CMD_PRCTL:
|
|
+ case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
|
|
+ mode = SPECTRE_V2_USER_PRCTL;
|
|
+ break;
|
|
+ case SPECTRE_V2_USER_CMD_AUTO:
|
|
+ case SPECTRE_V2_USER_CMD_SECCOMP:
|
|
+ case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
|
|
+ if (IS_ENABLED(CONFIG_SECCOMP))
|
|
+ mode = SPECTRE_V2_USER_SECCOMP;
|
|
+ else
|
|
+ mode = SPECTRE_V2_USER_PRCTL;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ /* Initialize Indirect Branch Prediction Barrier */
|
|
+ if (boot_cpu_has(X86_FEATURE_IBPB)) {
|
|
+ setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
|
|
+
|
|
+ switch (cmd) {
|
|
+ case SPECTRE_V2_USER_CMD_FORCE:
|
|
+ case SPECTRE_V2_USER_CMD_PRCTL_IBPB:
|
|
+ case SPECTRE_V2_USER_CMD_SECCOMP_IBPB:
|
|
+ static_branch_enable(&switch_mm_always_ibpb);
|
|
+ break;
|
|
+ case SPECTRE_V2_USER_CMD_PRCTL:
|
|
+ case SPECTRE_V2_USER_CMD_AUTO:
|
|
+ case SPECTRE_V2_USER_CMD_SECCOMP:
|
|
+ static_branch_enable(&switch_mm_cond_ibpb);
|
|
+ break;
|
|
+ default:
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
|
|
+ static_key_enabled(&switch_mm_always_ibpb) ?
|
|
+ "always-on" : "conditional");
|
|
+ }
|
|
+
|
|
+ /* If enhanced IBRS is enabled, no STIBP is required */
|
|
+ if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
|
|
+ return;
|
|
+
|
|
+ /*
|
|
+ * If SMT is not possible or STIBP is not available, clear the STIBP
|
|
+ * mode.
|
|
+ */
|
|
+ if (!smt_possible || !boot_cpu_has(X86_FEATURE_STIBP))
|
|
+ mode = SPECTRE_V2_USER_NONE;
|
|
+set_mode:
|
|
+ spectre_v2_user = mode;
|
|
+ /* Only print the STIBP mode when SMT possible */
|
|
+ if (smt_possible)
|
|
+ pr_info("%s\n", spectre_v2_user_strings[mode]);
|
|
}
|
|
|
|
+static const char * const spectre_v2_strings[] = {
|
|
+ [SPECTRE_V2_NONE] = "Vulnerable",
|
|
+ [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline",
|
|
+ [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline",
|
|
+ [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline",
|
|
+ [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline",
|
|
+ [SPECTRE_V2_IBRS_ENHANCED] = "Mitigation: Enhanced IBRS",
|
|
+};
|
|
+
|
|
static const struct {
|
|
const char *option;
|
|
enum spectre_v2_mitigation_cmd cmd;
|
|
bool secure;
|
|
-} mitigation_options[] = {
|
|
- { "off", SPECTRE_V2_CMD_NONE, false },
|
|
- { "on", SPECTRE_V2_CMD_FORCE, true },
|
|
- { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
|
|
- { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
|
|
- { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
|
|
- { "auto", SPECTRE_V2_CMD_AUTO, false },
|
|
+} mitigation_options[] __initconst = {
|
|
+ { "off", SPECTRE_V2_CMD_NONE, false },
|
|
+ { "on", SPECTRE_V2_CMD_FORCE, true },
|
|
+ { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false },
|
|
+ { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false },
|
|
+ { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false },
|
|
+ { "auto", SPECTRE_V2_CMD_AUTO, false },
|
|
};
|
|
|
|
+static void __init spec_v2_print_cond(const char *reason, bool secure)
|
|
+{
|
|
+ if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure)
|
|
+ pr_info("%s selected on command line.\n", reason);
|
|
+}
|
|
+
|
|
+static inline bool retp_compiler(void)
|
|
+{
|
|
+ return __is_defined(RETPOLINE);
|
|
+}
|
|
+
|
|
static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
|
|
{
|
|
+ enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
|
|
char arg[20];
|
|
int ret, i;
|
|
- enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO;
|
|
|
|
- if (cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
|
|
+ if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") ||
|
|
+ cpu_mitigations_off())
|
|
return SPECTRE_V2_CMD_NONE;
|
|
- else {
|
|
- ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
|
|
- if (ret < 0)
|
|
- return SPECTRE_V2_CMD_AUTO;
|
|
|
|
- for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
|
|
- if (!match_option(arg, ret, mitigation_options[i].option))
|
|
- continue;
|
|
- cmd = mitigation_options[i].cmd;
|
|
- break;
|
|
- }
|
|
+ ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg));
|
|
+ if (ret < 0)
|
|
+ return SPECTRE_V2_CMD_AUTO;
|
|
|
|
- if (i >= ARRAY_SIZE(mitigation_options)) {
|
|
- pr_err("unknown option (%s). Switching to AUTO select\n", arg);
|
|
- return SPECTRE_V2_CMD_AUTO;
|
|
- }
|
|
+ for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) {
|
|
+ if (!match_option(arg, ret, mitigation_options[i].option))
|
|
+ continue;
|
|
+ cmd = mitigation_options[i].cmd;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (i >= ARRAY_SIZE(mitigation_options)) {
|
|
+ pr_err("unknown option (%s). Switching to AUTO select\n", arg);
|
|
+ return SPECTRE_V2_CMD_AUTO;
|
|
}
|
|
|
|
if ((cmd == SPECTRE_V2_CMD_RETPOLINE ||
|
|
@@ -314,11 +535,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
|
|
return SPECTRE_V2_CMD_AUTO;
|
|
}
|
|
|
|
- if (mitigation_options[i].secure)
|
|
- spec2_print_if_secure(mitigation_options[i].option);
|
|
- else
|
|
- spec2_print_if_insecure(mitigation_options[i].option);
|
|
-
|
|
+ spec_v2_print_cond(mitigation_options[i].option,
|
|
+ mitigation_options[i].secure);
|
|
return cmd;
|
|
}
|
|
|
|
@@ -400,12 +618,6 @@ specv2_set_mode:
|
|
setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
|
|
pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
|
|
|
|
- /* Initialize Indirect Branch Prediction Barrier if supported */
|
|
- if (boot_cpu_has(X86_FEATURE_IBPB)) {
|
|
- setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
|
|
- pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n");
|
|
- }
|
|
-
|
|
/*
|
|
* Retpoline means the kernel is safe because it has no indirect
|
|
* branches. Enhanced IBRS protects firmware too, so, enable restricted
|
|
@@ -421,6 +633,99 @@ specv2_set_mode:
|
|
setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW);
|
|
pr_info("Enabling Restricted Speculation for firmware calls\n");
|
|
}
|
|
+
|
|
+ /* Set up IBPB and STIBP depending on the general spectre V2 command */
|
|
+ spectre_v2_user_select_mitigation(cmd);
|
|
+}
|
|
+
|
|
+static void update_stibp_msr(void * __unused)
|
|
+{
|
|
+ wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
|
|
+}
|
|
+
|
|
+/* Update x86_spec_ctrl_base in case SMT state changed. */
|
|
+static void update_stibp_strict(void)
|
|
+{
|
|
+ u64 mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP;
|
|
+
|
|
+ if (sched_smt_active())
|
|
+ mask |= SPEC_CTRL_STIBP;
|
|
+
|
|
+ if (mask == x86_spec_ctrl_base)
|
|
+ return;
|
|
+
|
|
+ pr_info("Update user space SMT mitigation: STIBP %s\n",
|
|
+ mask & SPEC_CTRL_STIBP ? "always-on" : "off");
|
|
+ x86_spec_ctrl_base = mask;
|
|
+ on_each_cpu(update_stibp_msr, NULL, 1);
|
|
+}
|
|
+
|
|
+/* Update the static key controlling the evaluation of TIF_SPEC_IB */
|
|
+static void update_indir_branch_cond(void)
|
|
+{
|
|
+ if (sched_smt_active())
|
|
+ static_branch_enable(&switch_to_cond_stibp);
|
|
+ else
|
|
+ static_branch_disable(&switch_to_cond_stibp);
|
|
+}
|
|
+
|
|
+#undef pr_fmt
|
|
+#define pr_fmt(fmt) fmt
|
|
+
|
|
+/* Update the static key controlling the MDS CPU buffer clear in idle */
|
|
+static void update_mds_branch_idle(void)
|
|
+{
|
|
+ /*
|
|
+ * Enable the idle clearing if SMT is active on CPUs which are
|
|
+ * affected only by MSBDS and not any other MDS variant.
|
|
+ *
|
|
+ * The other variants cannot be mitigated when SMT is enabled, so
|
|
+ * clearing the buffers on idle just to prevent the Store Buffer
|
|
+ * repartitioning leak would be a window dressing exercise.
|
|
+ */
|
|
+ if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY))
|
|
+ return;
|
|
+
|
|
+ if (sched_smt_active())
|
|
+ static_branch_enable(&mds_idle_clear);
|
|
+ else
|
|
+ static_branch_disable(&mds_idle_clear);
|
|
+}
|
|
+
|
|
+#define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n"
|
|
+
|
|
+void arch_smt_update(void)
|
|
+{
|
|
+ /* Enhanced IBRS implies STIBP. No update required. */
|
|
+ if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
|
|
+ return;
|
|
+
|
|
+ mutex_lock(&spec_ctrl_mutex);
|
|
+
|
|
+ switch (spectre_v2_user) {
|
|
+ case SPECTRE_V2_USER_NONE:
|
|
+ break;
|
|
+ case SPECTRE_V2_USER_STRICT:
|
|
+ update_stibp_strict();
|
|
+ break;
|
|
+ case SPECTRE_V2_USER_PRCTL:
|
|
+ case SPECTRE_V2_USER_SECCOMP:
|
|
+ update_indir_branch_cond();
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ switch (mds_mitigation) {
|
|
+ case MDS_MITIGATION_FULL:
|
|
+ case MDS_MITIGATION_VMWERV:
|
|
+ if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY))
|
|
+ pr_warn_once(MDS_MSG_SMT);
|
|
+ update_mds_branch_idle();
|
|
+ break;
|
|
+ case MDS_MITIGATION_OFF:
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ mutex_unlock(&spec_ctrl_mutex);
|
|
}
|
|
|
|
#undef pr_fmt
|
|
@@ -437,7 +742,7 @@ enum ssb_mitigation_cmd {
|
|
SPEC_STORE_BYPASS_CMD_SECCOMP,
|
|
};
|
|
|
|
-static const char *ssb_strings[] = {
|
|
+static const char * const ssb_strings[] = {
|
|
[SPEC_STORE_BYPASS_NONE] = "Vulnerable",
|
|
[SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled",
|
|
[SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl",
|
|
@@ -447,7 +752,7 @@ static const char *ssb_strings[] = {
|
|
static const struct {
|
|
const char *option;
|
|
enum ssb_mitigation_cmd cmd;
|
|
-} ssb_mitigation_options[] = {
|
|
+} ssb_mitigation_options[] __initconst = {
|
|
{ "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */
|
|
{ "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */
|
|
{ "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */
|
|
@@ -461,7 +766,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void)
|
|
char arg[20];
|
|
int ret, i;
|
|
|
|
- if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) {
|
|
+ if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") ||
|
|
+ cpu_mitigations_off()) {
|
|
return SPEC_STORE_BYPASS_CMD_NONE;
|
|
} else {
|
|
ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable",
|
|
@@ -531,18 +837,16 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void)
|
|
if (mode == SPEC_STORE_BYPASS_DISABLE) {
|
|
setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE);
|
|
/*
|
|
- * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD uses
|
|
- * a completely different MSR and bit dependent on family.
|
|
+ * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may
|
|
+ * use a completely different MSR and bit dependent on family.
|
|
*/
|
|
- switch (boot_cpu_data.x86_vendor) {
|
|
- case X86_VENDOR_INTEL:
|
|
+ if (!static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) &&
|
|
+ !static_cpu_has(X86_FEATURE_AMD_SSBD)) {
|
|
+ x86_amd_ssb_disable();
|
|
+ } else {
|
|
x86_spec_ctrl_base |= SPEC_CTRL_SSBD;
|
|
x86_spec_ctrl_mask |= SPEC_CTRL_SSBD;
|
|
wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base);
|
|
- break;
|
|
- case X86_VENDOR_AMD:
|
|
- x86_amd_ssb_disable();
|
|
- break;
|
|
}
|
|
}
|
|
|
|
@@ -560,10 +864,25 @@ static void ssb_select_mitigation(void)
|
|
#undef pr_fmt
|
|
#define pr_fmt(fmt) "Speculation prctl: " fmt
|
|
|
|
-static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
|
|
+static void task_update_spec_tif(struct task_struct *tsk)
|
|
{
|
|
- bool update;
|
|
+ /* Force the update of the real TIF bits */
|
|
+ set_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE);
|
|
|
|
+ /*
|
|
+ * Immediately update the speculation control MSRs for the current
|
|
+ * task, but for a non-current task delay setting the CPU
|
|
+ * mitigation until it is scheduled next.
|
|
+ *
|
|
+ * This can only happen for SECCOMP mitigation. For PRCTL it's
|
|
+ * always the current task.
|
|
+ */
|
|
+ if (tsk == current)
|
|
+ speculation_ctrl_update_current();
|
|
+}
|
|
+
|
|
+static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
|
|
+{
|
|
if (ssb_mode != SPEC_STORE_BYPASS_PRCTL &&
|
|
ssb_mode != SPEC_STORE_BYPASS_SECCOMP)
|
|
return -ENXIO;
|
|
@@ -574,28 +893,56 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl)
|
|
if (task_spec_ssb_force_disable(task))
|
|
return -EPERM;
|
|
task_clear_spec_ssb_disable(task);
|
|
- update = test_and_clear_tsk_thread_flag(task, TIF_SSBD);
|
|
+ task_update_spec_tif(task);
|
|
break;
|
|
case PR_SPEC_DISABLE:
|
|
task_set_spec_ssb_disable(task);
|
|
- update = !test_and_set_tsk_thread_flag(task, TIF_SSBD);
|
|
+ task_update_spec_tif(task);
|
|
break;
|
|
case PR_SPEC_FORCE_DISABLE:
|
|
task_set_spec_ssb_disable(task);
|
|
task_set_spec_ssb_force_disable(task);
|
|
- update = !test_and_set_tsk_thread_flag(task, TIF_SSBD);
|
|
+ task_update_spec_tif(task);
|
|
break;
|
|
default:
|
|
return -ERANGE;
|
|
}
|
|
+ return 0;
|
|
+}
|
|
|
|
- /*
|
|
- * If being set on non-current task, delay setting the CPU
|
|
- * mitigation until it is next scheduled.
|
|
- */
|
|
- if (task == current && update)
|
|
- speculative_store_bypass_update_current();
|
|
-
|
|
+static int ib_prctl_set(struct task_struct *task, unsigned long ctrl)
|
|
+{
|
|
+ switch (ctrl) {
|
|
+ case PR_SPEC_ENABLE:
|
|
+ if (spectre_v2_user == SPECTRE_V2_USER_NONE)
|
|
+ return 0;
|
|
+ /*
|
|
+ * Indirect branch speculation is always disabled in strict
|
|
+ * mode.
|
|
+ */
|
|
+ if (spectre_v2_user == SPECTRE_V2_USER_STRICT)
|
|
+ return -EPERM;
|
|
+ task_clear_spec_ib_disable(task);
|
|
+ task_update_spec_tif(task);
|
|
+ break;
|
|
+ case PR_SPEC_DISABLE:
|
|
+ case PR_SPEC_FORCE_DISABLE:
|
|
+ /*
|
|
+ * Indirect branch speculation is always allowed when
|
|
+ * mitigation is force disabled.
|
|
+ */
|
|
+ if (spectre_v2_user == SPECTRE_V2_USER_NONE)
|
|
+ return -EPERM;
|
|
+ if (spectre_v2_user == SPECTRE_V2_USER_STRICT)
|
|
+ return 0;
|
|
+ task_set_spec_ib_disable(task);
|
|
+ if (ctrl == PR_SPEC_FORCE_DISABLE)
|
|
+ task_set_spec_ib_force_disable(task);
|
|
+ task_update_spec_tif(task);
|
|
+ break;
|
|
+ default:
|
|
+ return -ERANGE;
|
|
+ }
|
|
return 0;
|
|
}
|
|
|
|
@@ -605,6 +952,8 @@ int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
|
|
switch (which) {
|
|
case PR_SPEC_STORE_BYPASS:
|
|
return ssb_prctl_set(task, ctrl);
|
|
+ case PR_SPEC_INDIRECT_BRANCH:
|
|
+ return ib_prctl_set(task, ctrl);
|
|
default:
|
|
return -ENODEV;
|
|
}
|
|
@@ -615,6 +964,8 @@ void arch_seccomp_spec_mitigate(struct task_struct *task)
|
|
{
|
|
if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP)
|
|
ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE);
|
|
+ if (spectre_v2_user == SPECTRE_V2_USER_SECCOMP)
|
|
+ ib_prctl_set(task, PR_SPEC_FORCE_DISABLE);
|
|
}
|
|
#endif
|
|
|
|
@@ -637,11 +988,35 @@ static int ssb_prctl_get(struct task_struct *task)
|
|
}
|
|
}
|
|
|
|
+static int ib_prctl_get(struct task_struct *task)
|
|
+{
|
|
+ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
|
|
+ return PR_SPEC_NOT_AFFECTED;
|
|
+
|
|
+ switch (spectre_v2_user) {
|
|
+ case SPECTRE_V2_USER_NONE:
|
|
+ return PR_SPEC_ENABLE;
|
|
+ case SPECTRE_V2_USER_PRCTL:
|
|
+ case SPECTRE_V2_USER_SECCOMP:
|
|
+ if (task_spec_ib_force_disable(task))
|
|
+ return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;
|
|
+ if (task_spec_ib_disable(task))
|
|
+ return PR_SPEC_PRCTL | PR_SPEC_DISABLE;
|
|
+ return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
|
|
+ case SPECTRE_V2_USER_STRICT:
|
|
+ return PR_SPEC_DISABLE;
|
|
+ default:
|
|
+ return PR_SPEC_NOT_AFFECTED;
|
|
+ }
|
|
+}
|
|
+
|
|
int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
|
|
{
|
|
switch (which) {
|
|
case PR_SPEC_STORE_BYPASS:
|
|
return ssb_prctl_get(task);
|
|
+ case PR_SPEC_INDIRECT_BRANCH:
|
|
+ return ib_prctl_get(task);
|
|
default:
|
|
return -ENODEV;
|
|
}
|
|
@@ -713,6 +1088,11 @@ static void __init l1tf_select_mitigation(void)
|
|
if (!boot_cpu_has_bug(X86_BUG_L1TF))
|
|
return;
|
|
|
|
+ if (cpu_mitigations_off())
|
|
+ l1tf_mitigation = L1TF_MITIGATION_OFF;
|
|
+ else if (cpu_mitigations_auto_nosmt())
|
|
+ l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT;
|
|
+
|
|
override_cache_bits(&boot_cpu_data);
|
|
|
|
switch (l1tf_mitigation) {
|
|
@@ -735,12 +1115,13 @@ static void __init l1tf_select_mitigation(void)
|
|
#endif
|
|
|
|
half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT;
|
|
- if (e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) {
|
|
+ if (l1tf_mitigation != L1TF_MITIGATION_OFF &&
|
|
+ e820_any_mapped(half_pa, ULLONG_MAX - half_pa, E820_RAM)) {
|
|
pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n");
|
|
pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n",
|
|
half_pa);
|
|
pr_info("However, doing so will make a part of your RAM unusable.\n");
|
|
- pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html might help you decide.\n");
|
|
+ pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n");
|
|
return;
|
|
}
|
|
|
|
@@ -773,13 +1154,14 @@ static int __init l1tf_cmdline(char *str)
|
|
early_param("l1tf", l1tf_cmdline);
|
|
|
|
#undef pr_fmt
|
|
+#define pr_fmt(fmt) fmt
|
|
|
|
#ifdef CONFIG_SYSFS
|
|
|
|
#define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion"
|
|
|
|
#if IS_ENABLED(CONFIG_KVM_INTEL)
|
|
-static const char *l1tf_vmx_states[] = {
|
|
+static const char * const l1tf_vmx_states[] = {
|
|
[VMENTER_L1D_FLUSH_AUTO] = "auto",
|
|
[VMENTER_L1D_FLUSH_NEVER] = "vulnerable",
|
|
[VMENTER_L1D_FLUSH_COND] = "conditional cache flushes",
|
|
@@ -795,13 +1177,14 @@ static ssize_t l1tf_show_state(char *buf)
|
|
|
|
if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED ||
|
|
(l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER &&
|
|
- cpu_smt_control == CPU_SMT_ENABLED))
|
|
+ sched_smt_active())) {
|
|
return sprintf(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG,
|
|
l1tf_vmx_states[l1tf_vmx_mitigation]);
|
|
+ }
|
|
|
|
return sprintf(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG,
|
|
l1tf_vmx_states[l1tf_vmx_mitigation],
|
|
- cpu_smt_control == CPU_SMT_ENABLED ? "vulnerable" : "disabled");
|
|
+ sched_smt_active() ? "vulnerable" : "disabled");
|
|
}
|
|
#else
|
|
static ssize_t l1tf_show_state(char *buf)
|
|
@@ -810,6 +1193,55 @@ static ssize_t l1tf_show_state(char *buf)
|
|
}
|
|
#endif
|
|
|
|
+static ssize_t mds_show_state(char *buf)
|
|
+{
|
|
+#ifdef CONFIG_HYPERVISOR_GUEST
|
|
+ if (x86_hyper) {
|
|
+ return sprintf(buf, "%s; SMT Host state unknown\n",
|
|
+ mds_strings[mds_mitigation]);
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) {
|
|
+ return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
|
|
+ (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" :
|
|
+ sched_smt_active() ? "mitigated" : "disabled"));
|
|
+ }
|
|
+
|
|
+ return sprintf(buf, "%s; SMT %s\n", mds_strings[mds_mitigation],
|
|
+ sched_smt_active() ? "vulnerable" : "disabled");
|
|
+}
|
|
+
|
|
+static char *stibp_state(void)
|
|
+{
|
|
+ if (spectre_v2_enabled == SPECTRE_V2_IBRS_ENHANCED)
|
|
+ return "";
|
|
+
|
|
+ switch (spectre_v2_user) {
|
|
+ case SPECTRE_V2_USER_NONE:
|
|
+ return ", STIBP: disabled";
|
|
+ case SPECTRE_V2_USER_STRICT:
|
|
+ return ", STIBP: forced";
|
|
+ case SPECTRE_V2_USER_PRCTL:
|
|
+ case SPECTRE_V2_USER_SECCOMP:
|
|
+ if (static_key_enabled(&switch_to_cond_stibp))
|
|
+ return ", STIBP: conditional";
|
|
+ }
|
|
+ return "";
|
|
+}
|
|
+
|
|
+static char *ibpb_state(void)
|
|
+{
|
|
+ if (boot_cpu_has(X86_FEATURE_IBPB)) {
|
|
+ if (static_key_enabled(&switch_mm_always_ibpb))
|
|
+ return ", IBPB: always-on";
|
|
+ if (static_key_enabled(&switch_mm_cond_ibpb))
|
|
+ return ", IBPB: conditional";
|
|
+ return ", IBPB: disabled";
|
|
+ }
|
|
+ return "";
|
|
+}
|
|
+
|
|
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
|
|
char *buf, unsigned int bug)
|
|
{
|
|
@@ -827,9 +1259,11 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
|
|
return sprintf(buf, "Mitigation: __user pointer sanitization\n");
|
|
|
|
case X86_BUG_SPECTRE_V2:
|
|
- return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
|
|
- boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "",
|
|
+ return sprintf(buf, "%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled],
|
|
+ ibpb_state(),
|
|
boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
|
|
+ stibp_state(),
|
|
+ boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
|
|
spectre_v2_module_string());
|
|
|
|
case X86_BUG_SPEC_STORE_BYPASS:
|
|
@@ -839,6 +1273,10 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
|
|
if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV))
|
|
return l1tf_show_state(buf);
|
|
break;
|
|
+
|
|
+ case X86_BUG_MDS:
|
|
+ return mds_show_state(buf);
|
|
+
|
|
default:
|
|
break;
|
|
}
|
|
@@ -870,4 +1308,9 @@ ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *b
|
|
{
|
|
return cpu_show_common(dev, attr, buf, X86_BUG_L1TF);
|
|
}
|
|
+
|
|
+ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf)
|
|
+{
|
|
+ return cpu_show_common(dev, attr, buf, X86_BUG_MDS);
|
|
+}
|
|
#endif
|
|
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
|
|
index 3c01610c5ba9..cda130dc56b9 100644
|
|
--- a/arch/x86/kernel/cpu/common.c
|
|
+++ b/arch/x86/kernel/cpu/common.c
|
|
@@ -752,6 +752,12 @@ static void init_speculation_control(struct cpuinfo_x86 *c)
|
|
set_cpu_cap(c, X86_FEATURE_STIBP);
|
|
set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
|
|
}
|
|
+
|
|
+ if (cpu_has(c, X86_FEATURE_AMD_SSBD)) {
|
|
+ set_cpu_cap(c, X86_FEATURE_SSBD);
|
|
+ set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL);
|
|
+ clear_cpu_cap(c, X86_FEATURE_VIRT_SSBD);
|
|
+ }
|
|
}
|
|
|
|
void get_cpu_cap(struct cpuinfo_x86 *c)
|
|
@@ -885,84 +891,95 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
|
|
c->x86_cache_bits = c->x86_phys_bits;
|
|
}
|
|
|
|
-static const __initconst struct x86_cpu_id cpu_no_speculation[] = {
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW, X86_FEATURE_ANY },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW, X86_FEATURE_ANY },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT, X86_FEATURE_ANY },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL, X86_FEATURE_ANY },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW, X86_FEATURE_ANY },
|
|
- { X86_VENDOR_CENTAUR, 5 },
|
|
- { X86_VENDOR_INTEL, 5 },
|
|
- { X86_VENDOR_NSC, 5 },
|
|
- { X86_VENDOR_ANY, 4 },
|
|
- {}
|
|
-};
|
|
+#define NO_SPECULATION BIT(0)
|
|
+#define NO_MELTDOWN BIT(1)
|
|
+#define NO_SSB BIT(2)
|
|
+#define NO_L1TF BIT(3)
|
|
+#define NO_MDS BIT(4)
|
|
+#define MSBDS_ONLY BIT(5)
|
|
|
|
-static const __initconst struct x86_cpu_id cpu_no_meltdown[] = {
|
|
- { X86_VENDOR_AMD },
|
|
- {}
|
|
-};
|
|
+#define VULNWL(_vendor, _family, _model, _whitelist) \
|
|
+ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist }
|
|
|
|
-static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = {
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
|
|
- { X86_VENDOR_CENTAUR, 5, },
|
|
- { X86_VENDOR_INTEL, 5, },
|
|
- { X86_VENDOR_NSC, 5, },
|
|
- { X86_VENDOR_AMD, 0x12, },
|
|
- { X86_VENDOR_AMD, 0x11, },
|
|
- { X86_VENDOR_AMD, 0x10, },
|
|
- { X86_VENDOR_AMD, 0xf, },
|
|
- { X86_VENDOR_ANY, 4, },
|
|
- {}
|
|
-};
|
|
+#define VULNWL_INTEL(model, whitelist) \
|
|
+ VULNWL(INTEL, 6, INTEL_FAM6_##model, whitelist)
|
|
+
|
|
+#define VULNWL_AMD(family, whitelist) \
|
|
+ VULNWL(AMD, family, X86_MODEL_ANY, whitelist)
|
|
+
|
|
+static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
|
|
+ VULNWL(ANY, 4, X86_MODEL_ANY, NO_SPECULATION),
|
|
+ VULNWL(CENTAUR, 5, X86_MODEL_ANY, NO_SPECULATION),
|
|
+ VULNWL(INTEL, 5, X86_MODEL_ANY, NO_SPECULATION),
|
|
+ VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION),
|
|
+
|
|
+ /* Intel Family 6 */
|
|
+ VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION),
|
|
+ VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION),
|
|
+ VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION),
|
|
+ VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION),
|
|
+ VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION),
|
|
+
|
|
+ VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY),
|
|
+ VULNWL_INTEL(ATOM_SILVERMONT_X, NO_SSB | NO_L1TF | MSBDS_ONLY),
|
|
+ VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY),
|
|
+ VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY),
|
|
+ VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY),
|
|
+ VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY),
|
|
+
|
|
+ VULNWL_INTEL(CORE_YONAH, NO_SSB),
|
|
+
|
|
+ VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY),
|
|
+
|
|
+ VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF),
|
|
+ VULNWL_INTEL(ATOM_GOLDMONT_X, NO_MDS | NO_L1TF),
|
|
+ VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF),
|
|
|
|
-static const __initconst struct x86_cpu_id cpu_no_l1tf[] = {
|
|
- /* in addition to cpu_no_speculation */
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MOOREFIELD },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GOLDMONT },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_DENVERTON },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_GEMINI_LAKE },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL },
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM },
|
|
+ /* AMD Family 0xf - 0x12 */
|
|
+ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
|
|
+ VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
|
|
+ VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
|
|
+ VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS),
|
|
+
|
|
+ /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */
|
|
+ VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS),
|
|
{}
|
|
};
|
|
|
|
-static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
|
|
+static bool __init cpu_matches(unsigned long which)
|
|
{
|
|
- u64 ia32_cap = 0;
|
|
+ const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist);
|
|
|
|
- if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
|
|
- rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
|
|
+ return m && !!(m->driver_data & which);
|
|
+}
|
|
|
|
- if (!x86_match_cpu(cpu_no_spec_store_bypass) &&
|
|
- !(ia32_cap & ARCH_CAP_SSB_NO))
|
|
- setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
|
|
+static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
|
|
+{
|
|
+ u64 ia32_cap = 0;
|
|
|
|
- if (x86_match_cpu(cpu_no_speculation))
|
|
+ if (cpu_matches(NO_SPECULATION))
|
|
return;
|
|
|
|
setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
|
|
setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
|
|
|
|
+ if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES))
|
|
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap);
|
|
+
|
|
+ if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) &&
|
|
+ !cpu_has(c, X86_FEATURE_AMD_SSB_NO))
|
|
+ setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
|
|
+
|
|
if (ia32_cap & ARCH_CAP_IBRS_ALL)
|
|
setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
|
|
|
|
- if (x86_match_cpu(cpu_no_meltdown))
|
|
+ if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) {
|
|
+ setup_force_cpu_bug(X86_BUG_MDS);
|
|
+ if (cpu_matches(MSBDS_ONLY))
|
|
+ setup_force_cpu_bug(X86_BUG_MSBDS_ONLY);
|
|
+ }
|
|
+
|
|
+ if (cpu_matches(NO_MELTDOWN))
|
|
return;
|
|
|
|
/* Rogue Data Cache Load? No! */
|
|
@@ -971,7 +988,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
|
|
|
|
setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
|
|
|
|
- if (x86_match_cpu(cpu_no_l1tf))
|
|
+ if (cpu_matches(NO_L1TF))
|
|
return;
|
|
|
|
setup_force_cpu_bug(X86_BUG_L1TF);
|
|
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
|
|
index cee0fec0d232..860f2fd9f540 100644
|
|
--- a/arch/x86/kernel/cpu/intel.c
|
|
+++ b/arch/x86/kernel/cpu/intel.c
|
|
@@ -14,6 +14,7 @@
|
|
#include <asm/bugs.h>
|
|
#include <asm/cpu.h>
|
|
#include <asm/intel-family.h>
|
|
+#include <asm/microcode_intel.h>
|
|
|
|
#ifdef CONFIG_X86_64
|
|
#include <linux/topology.h>
|
|
@@ -137,14 +138,8 @@ static void early_init_intel(struct cpuinfo_x86 *c)
|
|
(c->x86 == 0x6 && c->x86_model >= 0x0e))
|
|
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
|
|
|
|
- if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) {
|
|
- unsigned lower_word;
|
|
-
|
|
- wrmsr(MSR_IA32_UCODE_REV, 0, 0);
|
|
- /* Required by the SDM */
|
|
- sync_core();
|
|
- rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode);
|
|
- }
|
|
+ if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64))
|
|
+ c->microcode = intel_get_microcode_revision();
|
|
|
|
/* Now if any of them are set, check the blacklist and clear the lot */
|
|
if ((cpu_has(c, X86_FEATURE_SPEC_CTRL) ||
|
|
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
|
|
index 25310d2b8609..d9ad49ca3cbe 100644
|
|
--- a/arch/x86/kernel/cpu/mcheck/mce.c
|
|
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
|
|
@@ -139,6 +139,8 @@ void mce_setup(struct mce *m)
|
|
m->socketid = cpu_data(m->extcpu).phys_proc_id;
|
|
m->apicid = cpu_data(m->extcpu).initial_apicid;
|
|
rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
|
|
+
|
|
+ m->microcode = boot_cpu_data.microcode;
|
|
}
|
|
|
|
DEFINE_PER_CPU(struct mce, injectm);
|
|
@@ -309,7 +311,7 @@ static void print_mce(struct mce *m)
|
|
*/
|
|
pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
|
|
m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
|
|
- cpu_data(m->extcpu).microcode);
|
|
+ m->microcode);
|
|
|
|
pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
|
|
}
|
|
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
|
|
index 732bb03fcf91..a19fddfb6bf8 100644
|
|
--- a/arch/x86/kernel/cpu/microcode/amd.c
|
|
+++ b/arch/x86/kernel/cpu/microcode/amd.c
|
|
@@ -707,22 +707,26 @@ int apply_microcode_amd(int cpu)
|
|
return -1;
|
|
|
|
/* need to apply patch? */
|
|
- if (rev >= mc_amd->hdr.patch_id) {
|
|
- c->microcode = rev;
|
|
- uci->cpu_sig.rev = rev;
|
|
- return 0;
|
|
- }
|
|
+ if (rev >= mc_amd->hdr.patch_id)
|
|
+ goto out;
|
|
|
|
if (__apply_microcode_amd(mc_amd)) {
|
|
pr_err("CPU%d: update failed for patch_level=0x%08x\n",
|
|
cpu, mc_amd->hdr.patch_id);
|
|
return -1;
|
|
}
|
|
- pr_info("CPU%d: new patch_level=0x%08x\n", cpu,
|
|
- mc_amd->hdr.patch_id);
|
|
|
|
- uci->cpu_sig.rev = mc_amd->hdr.patch_id;
|
|
- c->microcode = mc_amd->hdr.patch_id;
|
|
+ rev = mc_amd->hdr.patch_id;
|
|
+
|
|
+ pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
|
|
+
|
|
+out:
|
|
+ uci->cpu_sig.rev = rev;
|
|
+ c->microcode = rev;
|
|
+
|
|
+ /* Update boot_cpu_data's revision too, if we're on the BSP: */
|
|
+ if (c->cpu_index == boot_cpu_data.cpu_index)
|
|
+ boot_cpu_data.microcode = rev;
|
|
|
|
return 0;
|
|
}
|
|
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
|
|
index 79291d6fb301..1308abfc4758 100644
|
|
--- a/arch/x86/kernel/cpu/microcode/intel.c
|
|
+++ b/arch/x86/kernel/cpu/microcode/intel.c
|
|
@@ -386,15 +386,8 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
|
|
native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
|
|
csig.pf = 1 << ((val[1] >> 18) & 7);
|
|
}
|
|
- native_wrmsrl(MSR_IA32_UCODE_REV, 0);
|
|
|
|
- /* As documented in the SDM: Do a CPUID 1 here */
|
|
- sync_core();
|
|
-
|
|
- /* get the current revision from MSR 0x8B */
|
|
- native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
|
|
-
|
|
- csig.rev = val[1];
|
|
+ csig.rev = intel_get_microcode_revision();
|
|
|
|
uci->cpu_sig = csig;
|
|
uci->valid = 1;
|
|
@@ -618,29 +611,35 @@ static inline void print_ucode(struct ucode_cpu_info *uci)
|
|
static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)
|
|
{
|
|
struct microcode_intel *mc;
|
|
- unsigned int val[2];
|
|
+ u32 rev;
|
|
|
|
mc = uci->mc;
|
|
if (!mc)
|
|
return 0;
|
|
|
|
+ /*
|
|
+ * Save us the MSR write below - which is a particularly expensive
|
|
+ * operation - when the other hyperthread has updated the microcode
|
|
+ * already.
|
|
+ */
|
|
+ rev = intel_get_microcode_revision();
|
|
+ if (rev >= mc->hdr.rev) {
|
|
+ uci->cpu_sig.rev = rev;
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
/* write microcode via MSR 0x79 */
|
|
native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
|
|
- native_wrmsrl(MSR_IA32_UCODE_REV, 0);
|
|
-
|
|
- /* As documented in the SDM: Do a CPUID 1 here */
|
|
- sync_core();
|
|
|
|
- /* get the current revision from MSR 0x8B */
|
|
- native_rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
|
|
- if (val[1] != mc->hdr.rev)
|
|
+ rev = intel_get_microcode_revision();
|
|
+ if (rev != mc->hdr.rev)
|
|
return -1;
|
|
|
|
#ifdef CONFIG_X86_64
|
|
/* Flush global tlb. This is precaution. */
|
|
flush_tlb_early();
|
|
#endif
|
|
- uci->cpu_sig.rev = val[1];
|
|
+ uci->cpu_sig.rev = rev;
|
|
|
|
if (early)
|
|
print_ucode(uci);
|
|
@@ -903,9 +902,9 @@ static int apply_microcode_intel(int cpu)
|
|
{
|
|
struct microcode_intel *mc;
|
|
struct ucode_cpu_info *uci;
|
|
- struct cpuinfo_x86 *c;
|
|
- unsigned int val[2];
|
|
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
|
|
static int prev_rev;
|
|
+ u32 rev;
|
|
|
|
/* We should bind the task to the CPU */
|
|
if (WARN_ON(raw_smp_processor_id() != cpu))
|
|
@@ -924,35 +923,42 @@ static int apply_microcode_intel(int cpu)
|
|
if (!get_matching_mc(mc, cpu))
|
|
return 0;
|
|
|
|
+ /*
|
|
+ * Save us the MSR write below - which is a particularly expensive
|
|
+ * operation - when the other hyperthread has updated the microcode
|
|
+ * already.
|
|
+ */
|
|
+ rev = intel_get_microcode_revision();
|
|
+ if (rev >= mc->hdr.rev)
|
|
+ goto out;
|
|
+
|
|
/* write microcode via MSR 0x79 */
|
|
wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits);
|
|
- wrmsrl(MSR_IA32_UCODE_REV, 0);
|
|
|
|
- /* As documented in the SDM: Do a CPUID 1 here */
|
|
- sync_core();
|
|
+ rev = intel_get_microcode_revision();
|
|
|
|
- /* get the current revision from MSR 0x8B */
|
|
- rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
|
|
-
|
|
- if (val[1] != mc->hdr.rev) {
|
|
+ if (rev != mc->hdr.rev) {
|
|
pr_err("CPU%d update to revision 0x%x failed\n",
|
|
cpu, mc->hdr.rev);
|
|
return -1;
|
|
}
|
|
|
|
- if (val[1] != prev_rev) {
|
|
+ if (rev != prev_rev) {
|
|
pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n",
|
|
- val[1],
|
|
+ rev,
|
|
mc->hdr.date & 0xffff,
|
|
mc->hdr.date >> 24,
|
|
(mc->hdr.date >> 16) & 0xff);
|
|
- prev_rev = val[1];
|
|
+ prev_rev = rev;
|
|
}
|
|
|
|
- c = &cpu_data(cpu);
|
|
+out:
|
|
+ uci->cpu_sig.rev = rev;
|
|
+ c->microcode = rev;
|
|
|
|
- uci->cpu_sig.rev = val[1];
|
|
- c->microcode = val[1];
|
|
+ /* Update boot_cpu_data's revision too, if we're on the BSP: */
|
|
+ if (c->cpu_index == boot_cpu_data.cpu_index)
|
|
+ boot_cpu_data.microcode = rev;
|
|
|
|
return 0;
|
|
}
|
|
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
|
|
index bfe4d6c96fbd..6b7b35d80264 100644
|
|
--- a/arch/x86/kernel/nmi.c
|
|
+++ b/arch/x86/kernel/nmi.c
|
|
@@ -32,6 +32,7 @@
|
|
#include <asm/x86_init.h>
|
|
#include <asm/reboot.h>
|
|
#include <asm/cache.h>
|
|
+#include <asm/nospec-branch.h>
|
|
|
|
#define CREATE_TRACE_POINTS
|
|
#include <trace/events/nmi.h>
|
|
@@ -544,6 +545,9 @@ nmi_restart:
|
|
write_cr2(this_cpu_read(nmi_cr2));
|
|
if (this_cpu_dec_return(nmi_state))
|
|
goto nmi_restart;
|
|
+
|
|
+ if (user_mode(regs))
|
|
+ mds_user_clear_cpu_buffers();
|
|
}
|
|
NOKPROBE_SYMBOL(do_nmi);
|
|
|
|
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
|
|
index 00a9047539d7..2e4eab22ca37 100644
|
|
--- a/arch/x86/kernel/process.c
|
|
+++ b/arch/x86/kernel/process.c
|
|
@@ -35,6 +35,8 @@
|
|
#include <asm/switch_to.h>
|
|
#include <asm/spec-ctrl.h>
|
|
|
|
+#include "process.h"
|
|
+
|
|
/*
|
|
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
|
|
* no more per-task TSS's. The TSS size is kept cacheline-aligned
|
|
@@ -183,11 +185,12 @@ int set_tsc_mode(unsigned int val)
|
|
return 0;
|
|
}
|
|
|
|
-static inline void switch_to_bitmap(struct tss_struct *tss,
|
|
- struct thread_struct *prev,
|
|
+static inline void switch_to_bitmap(struct thread_struct *prev,
|
|
struct thread_struct *next,
|
|
unsigned long tifp, unsigned long tifn)
|
|
{
|
|
+ struct tss_struct *tss = this_cpu_ptr(&cpu_tss);
|
|
+
|
|
if (tifn & _TIF_IO_BITMAP) {
|
|
/*
|
|
* Copy the relevant range of the IO bitmap.
|
|
@@ -321,32 +324,85 @@ static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
|
|
wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
|
|
}
|
|
|
|
-static __always_inline void intel_set_ssb_state(unsigned long tifn)
|
|
+/*
|
|
+ * Update the MSRs managing speculation control during a context switch.
|
|
+ *
|
|
+ * tifp: Previous task's thread flags
|
|
+ * tifn: Next task's thread flags
|
|
+ */
|
|
+static __always_inline void __speculation_ctrl_update(unsigned long tifp,
|
|
+ unsigned long tifn)
|
|
{
|
|
- u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn);
|
|
+ unsigned long tif_diff = tifp ^ tifn;
|
|
+ u64 msr = x86_spec_ctrl_base;
|
|
+ bool updmsr = false;
|
|
+
|
|
+ /*
|
|
+ * If TIF_SSBD is different, select the proper mitigation
|
|
+ * method. Note that if SSBD mitigation is disabled or permanently
|
|
+ * enabled, this branch can't be taken because nothing can set
|
|
+ * TIF_SSBD.
|
|
+ */
|
|
+ if (tif_diff & _TIF_SSBD) {
|
|
+ if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
|
|
+ amd_set_ssb_virt_state(tifn);
|
|
+ } else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
|
|
+ amd_set_core_ssb_state(tifn);
|
|
+ } else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
|
|
+ static_cpu_has(X86_FEATURE_AMD_SSBD)) {
|
|
+ msr |= ssbd_tif_to_spec_ctrl(tifn);
|
|
+ updmsr = true;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled,
|
|
+ * otherwise avoid the MSR write.
|
|
+ */
|
|
+ if (IS_ENABLED(CONFIG_SMP) &&
|
|
+ static_branch_unlikely(&switch_to_cond_stibp)) {
|
|
+ updmsr |= !!(tif_diff & _TIF_SPEC_IB);
|
|
+ msr |= stibp_tif_to_spec_ctrl(tifn);
|
|
+ }
|
|
|
|
- wrmsrl(MSR_IA32_SPEC_CTRL, msr);
|
|
+ if (updmsr)
|
|
+ wrmsrl(MSR_IA32_SPEC_CTRL, msr);
|
|
}
|
|
|
|
-static __always_inline void __speculative_store_bypass_update(unsigned long tifn)
|
|
+static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
|
|
{
|
|
- if (static_cpu_has(X86_FEATURE_VIRT_SSBD))
|
|
- amd_set_ssb_virt_state(tifn);
|
|
- else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD))
|
|
- amd_set_core_ssb_state(tifn);
|
|
- else
|
|
- intel_set_ssb_state(tifn);
|
|
+ if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) {
|
|
+ if (task_spec_ssb_disable(tsk))
|
|
+ set_tsk_thread_flag(tsk, TIF_SSBD);
|
|
+ else
|
|
+ clear_tsk_thread_flag(tsk, TIF_SSBD);
|
|
+
|
|
+ if (task_spec_ib_disable(tsk))
|
|
+ set_tsk_thread_flag(tsk, TIF_SPEC_IB);
|
|
+ else
|
|
+ clear_tsk_thread_flag(tsk, TIF_SPEC_IB);
|
|
+ }
|
|
+ /* Return the updated thread_info flags */
|
|
+ return task_thread_info(tsk)->flags;
|
|
}
|
|
|
|
-void speculative_store_bypass_update(unsigned long tif)
|
|
+void speculation_ctrl_update(unsigned long tif)
|
|
{
|
|
+ /* Forced update. Make sure all relevant TIF flags are different */
|
|
preempt_disable();
|
|
- __speculative_store_bypass_update(tif);
|
|
+ __speculation_ctrl_update(~tif, tif);
|
|
preempt_enable();
|
|
}
|
|
|
|
-void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
|
|
- struct tss_struct *tss)
|
|
+/* Called from seccomp/prctl update */
|
|
+void speculation_ctrl_update_current(void)
|
|
+{
|
|
+ preempt_disable();
|
|
+ speculation_ctrl_update(speculation_ctrl_update_tif(current));
|
|
+ preempt_enable();
|
|
+}
|
|
+
|
|
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
|
|
{
|
|
struct thread_struct *prev, *next;
|
|
unsigned long tifp, tifn;
|
|
@@ -356,7 +412,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
|
|
|
|
tifn = READ_ONCE(task_thread_info(next_p)->flags);
|
|
tifp = READ_ONCE(task_thread_info(prev_p)->flags);
|
|
- switch_to_bitmap(tss, prev, next, tifp, tifn);
|
|
+ switch_to_bitmap(prev, next, tifp, tifn);
|
|
|
|
propagate_user_return_notify(prev_p, next_p);
|
|
|
|
@@ -374,8 +430,15 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
|
|
if ((tifp ^ tifn) & _TIF_NOTSC)
|
|
cr4_toggle_bits(X86_CR4_TSD);
|
|
|
|
- if ((tifp ^ tifn) & _TIF_SSBD)
|
|
- __speculative_store_bypass_update(tifn);
|
|
+ if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) {
|
|
+ __speculation_ctrl_update(tifp, tifn);
|
|
+ } else {
|
|
+ speculation_ctrl_update_tif(prev_p);
|
|
+ tifn = speculation_ctrl_update_tif(next_p);
|
|
+
|
|
+ /* Enforce MSR update to ensure consistent state */
|
|
+ __speculation_ctrl_update(~tifn, tifn);
|
|
+ }
|
|
}
|
|
|
|
/*
|
|
diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h
|
|
new file mode 100644
|
|
index 000000000000..898e97cf6629
|
|
--- /dev/null
|
|
+++ b/arch/x86/kernel/process.h
|
|
@@ -0,0 +1,39 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+//
|
|
+// Code shared between 32 and 64 bit
|
|
+
|
|
+#include <asm/spec-ctrl.h>
|
|
+
|
|
+void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p);
|
|
+
|
|
+/*
|
|
+ * This needs to be inline to optimize for the common case where no extra
|
|
+ * work needs to be done.
|
|
+ */
|
|
+static inline void switch_to_extra(struct task_struct *prev,
|
|
+ struct task_struct *next)
|
|
+{
|
|
+ unsigned long next_tif = task_thread_info(next)->flags;
|
|
+ unsigned long prev_tif = task_thread_info(prev)->flags;
|
|
+
|
|
+ if (IS_ENABLED(CONFIG_SMP)) {
|
|
+ /*
|
|
+ * Avoid __switch_to_xtra() invocation when conditional
|
|
+ * STIBP is disabled and the only different bit is
|
|
+ * TIF_SPEC_IB. For CONFIG_SMP=n TIF_SPEC_IB is not
|
|
+ * in the TIF_WORK_CTXSW masks.
|
|
+ */
|
|
+ if (!static_branch_likely(&switch_to_cond_stibp)) {
|
|
+ prev_tif &= ~_TIF_SPEC_IB;
|
|
+ next_tif &= ~_TIF_SPEC_IB;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * __switch_to_xtra() handles debug registers, i/o bitmaps,
|
|
+ * speculation mitigations etc.
|
|
+ */
|
|
+ if (unlikely(next_tif & _TIF_WORK_CTXSW_NEXT ||
|
|
+ prev_tif & _TIF_WORK_CTXSW_PREV))
|
|
+ __switch_to_xtra(prev, next);
|
|
+}
|
|
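switch_to_extra() above filters TIF_SPEC_IB out of both flag words when conditional STIBP is not in use, so a difference in that bit alone no longer drops into __switch_to_xtra(). A small sketch of the same test; the bit positions and the single combined work mask are illustrative, not the kernel's _TIF_WORK_CTXSW_PREV/NEXT definitions:

        #include <stdbool.h>
        #include <stdio.h>

        #define TIF_SPEC_IB   (1u << 9)               /* illustrative bit positions */
        #define TIF_NOTSC     (1u << 10)
        #define TIF_WORK_MASK (TIF_SPEC_IB | TIF_NOTSC)

        static bool cond_stibp_enabled;                /* models the static branch */

        static bool need_switch_to_xtra(unsigned int prev_tif, unsigned int next_tif)
        {
                if (!cond_stibp_enabled) {
                        prev_tif &= ~TIF_SPEC_IB;
                        next_tif &= ~TIF_SPEC_IB;
                }
                return (prev_tif | next_tif) & TIF_WORK_MASK;
        }

        int main(void)
        {
                /* Only TIF_SPEC_IB differs: the slow path is skipped unless
                 * conditional STIBP is actually in use. */
                printf("%d\n", need_switch_to_xtra(TIF_SPEC_IB, 0));   /* 0 */
                cond_stibp_enabled = true;
                printf("%d\n", need_switch_to_xtra(TIF_SPEC_IB, 0));   /* 1 */
                return 0;
        }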
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
|
|
index bd7be8efdc4c..912246fd6cd9 100644
|
|
--- a/arch/x86/kernel/process_32.c
|
|
+++ b/arch/x86/kernel/process_32.c
|
|
@@ -55,6 +55,8 @@
|
|
#include <asm/switch_to.h>
|
|
#include <asm/vm86.h>
|
|
|
|
+#include "process.h"
|
|
+
|
|
void __show_regs(struct pt_regs *regs, int all)
|
|
{
|
|
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
|
|
@@ -264,12 +266,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
|
|
if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
|
|
set_iopl_mask(next->iopl);
|
|
|
|
- /*
|
|
- * Now maybe handle debug registers and/or IO bitmaps
|
|
- */
|
|
- if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
|
|
- task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
|
|
- __switch_to_xtra(prev_p, next_p, tss);
|
|
+ switch_to_extra(prev_p, next_p);
|
|
|
|
/*
|
|
* Leave lazy mode, flushing any hypercalls made here.
|
|
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
|
|
index a2661814bde0..81eec65fe053 100644
|
|
--- a/arch/x86/kernel/process_64.c
|
|
+++ b/arch/x86/kernel/process_64.c
|
|
@@ -51,6 +51,8 @@
|
|
#include <asm/xen/hypervisor.h>
|
|
#include <asm/vdso.h>
|
|
|
|
+#include "process.h"
|
|
+
|
|
__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
|
|
|
|
/* Prints also some state that isn't saved in the pt_regs */
|
|
@@ -454,12 +456,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
|
|
/* Reload esp0 and ss1. This changes current_thread_info(). */
|
|
load_sp0(tss, next);
|
|
|
|
- /*
|
|
- * Now maybe reload the debug registers and handle I/O bitmaps
|
|
- */
|
|
- if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
|
|
- task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
|
|
- __switch_to_xtra(prev_p, next_p, tss);
|
|
+ switch_to_extra(prev_p, next_p);
|
|
|
|
#ifdef CONFIG_XEN
|
|
/*
|
|
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
|
|
index 5bbfa2f63b8c..ef225fa8e928 100644
|
|
--- a/arch/x86/kernel/traps.c
|
|
+++ b/arch/x86/kernel/traps.c
|
|
@@ -62,6 +62,7 @@
|
|
#include <asm/alternative.h>
|
|
#include <asm/fpu/xstate.h>
|
|
#include <asm/trace/mpx.h>
|
|
+#include <asm/nospec-branch.h>
|
|
#include <asm/mpx.h>
|
|
#include <asm/vm86.h>
|
|
|
|
@@ -340,6 +341,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
|
|
regs->ip = (unsigned long)general_protection;
|
|
regs->sp = (unsigned long)&normal_regs->orig_ax;
|
|
|
|
+ /*
|
|
+ * This situation can be triggered by userspace via
|
|
+ * modify_ldt(2) and the return does not take the regular
|
|
+ * user space exit, so a CPU buffer clear is required when
|
|
+ * MDS mitigation is enabled.
|
|
+ */
|
|
+ mds_user_clear_cpu_buffers();
|
|
return;
|
|
}
|
|
#endif
|
|
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
|
|
index 769c370011d6..cb768417429d 100644
|
|
--- a/arch/x86/kernel/tsc.c
|
|
+++ b/arch/x86/kernel/tsc.c
|
|
@@ -713,7 +713,7 @@ unsigned long native_calibrate_tsc(void)
|
|
case INTEL_FAM6_KABYLAKE_DESKTOP:
|
|
crystal_khz = 24000; /* 24.0 MHz */
|
|
break;
|
|
- case INTEL_FAM6_ATOM_DENVERTON:
|
|
+ case INTEL_FAM6_ATOM_GOLDMONT_X:
|
|
crystal_khz = 25000; /* 25.0 MHz */
|
|
break;
|
|
case INTEL_FAM6_ATOM_GOLDMONT:
|
|
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
|
|
index c17d3893ae60..fc8236fd2495 100644
|
|
--- a/arch/x86/kvm/cpuid.c
|
|
+++ b/arch/x86/kvm/cpuid.c
|
|
@@ -355,7 +355,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
|
|
|
|
/* cpuid 0x80000008.ebx */
|
|
const u32 kvm_cpuid_8000_0008_ebx_x86_features =
|
|
- F(AMD_IBPB) | F(AMD_IBRS) | F(VIRT_SSBD);
|
|
+ F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
|
|
+ F(AMD_SSB_NO) | F(AMD_STIBP);
|
|
|
|
/* cpuid 0xC0000001.edx */
|
|
const u32 kvm_cpuid_C000_0001_edx_x86_features =
|
|
@@ -380,7 +381,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
|
|
|
|
/* cpuid 7.0.edx*/
|
|
const u32 kvm_cpuid_7_0_edx_x86_features =
|
|
- F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES);
|
|
+ F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) |
|
|
+ F(INTEL_STIBP) | F(MD_CLEAR);
|
|
|
|
/* all calls to cpuid_count() should be made on the same cpu */
|
|
get_cpu();
|
|
@@ -633,7 +635,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
|
|
entry->ebx |= F(VIRT_SSBD);
|
|
entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
|
|
cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX);
|
|
- if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD))
|
|
+ /*
|
|
+ * The preference is to use SPEC CTRL MSR instead of the
|
|
+ * VIRT_SPEC MSR.
|
|
+ */
|
|
+ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) &&
|
|
+ !boot_cpu_has(X86_FEATURE_AMD_SSBD))
|
|
entry->ebx |= F(VIRT_SSBD);
|
|
break;
|
|
}
|
|
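The extra guest CPUID bits can be cross-checked on the host from user space with the compiler's <cpuid.h> helper; the bit numbers below follow the published CPUID.(EAX=7,ECX=0):EDX assignments (bit 10 MD_CLEAR, bit 26 IBRS/IBPB, bit 27 STIBP, bit 31 SSBD). This reports what the host CPU has, while the hunk above only controls what KVM advertises to a guest:

        #include <cpuid.h>
        #include <stdio.h>

        int main(void)
        {
                unsigned int eax, ebx, ecx, edx;

                if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
                        puts("CPUID leaf 7 not supported");
                        return 1;
                }
                printf("MD_CLEAR : %s\n", (edx & (1u << 10)) ? "yes" : "no");
                printf("SPEC_CTRL: %s\n", (edx & (1u << 26)) ? "yes" : "no");
                printf("STIBP    : %s\n", (edx & (1u << 27)) ? "yes" : "no");
                printf("SSBD     : %s\n", (edx & (1u << 31)) ? "yes" : "no");
                return 0;
        }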
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
|
|
index 8a841b9d8f84..b2bf8e1d5782 100644
|
|
--- a/arch/x86/kvm/cpuid.h
|
|
+++ b/arch/x86/kvm/cpuid.h
|
|
@@ -176,7 +176,7 @@ static inline bool guest_cpuid_has_spec_ctrl(struct kvm_vcpu *vcpu)
|
|
struct kvm_cpuid_entry2 *best;
|
|
|
|
best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
|
|
- if (best && (best->ebx & bit(X86_FEATURE_AMD_IBRS)))
|
|
+ if (best && (best->ebx & (bit(X86_FEATURE_AMD_IBRS | bit(X86_FEATURE_AMD_SSBD)))))
|
|
return true;
|
|
best = kvm_find_cpuid_entry(vcpu, 7, 0);
|
|
return best && (best->edx & (bit(X86_FEATURE_SPEC_CTRL) | bit(X86_FEATURE_SPEC_CTRL_SSBD)));
|
|
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
|
|
index 9a6d258c3c16..9338136a6a23 100644
|
|
--- a/arch/x86/kvm/svm.c
|
|
+++ b/arch/x86/kvm/svm.c
|
|
@@ -3704,7 +3704,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
|
|
return 1;
|
|
|
|
/* The STIBP bit doesn't fault even if it's not advertised */
|
|
- if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
|
|
+ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
|
|
return 1;
|
|
|
|
svm->spec_ctrl = data;
|
|
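svm_set_msr() above rejects any IA32_SPEC_CTRL value with reserved bits set; with this change SSBD (bit 2) is accepted alongside IBRS (bit 0) and STIBP (bit 1). The same reserved-bit test as a stand-alone sketch, using the architectural bit positions:

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        #define SPEC_CTRL_IBRS  (1ULL << 0)
        #define SPEC_CTRL_STIBP (1ULL << 1)
        #define SPEC_CTRL_SSBD  (1ULL << 2)

        /* Mirrors the check: refuse values with any unsupported bit set. */
        static bool spec_ctrl_valid(uint64_t data)
        {
                return !(data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD));
        }

        int main(void)
        {
                printf("%d\n", spec_ctrl_valid(SPEC_CTRL_SSBD));   /* 1: accepted */
                printf("%d\n", spec_ctrl_valid(1ULL << 3));        /* 0: reserved bit */
                return 0;
        }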
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
|
|
index 75466d9417b8..8feb4f7e2e59 100644
|
|
--- a/arch/x86/kvm/vmx.c
|
|
+++ b/arch/x86/kvm/vmx.c
|
|
@@ -9206,8 +9206,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
|
|
|
|
vmx->__launched = vmx->loaded_vmcs->launched;
|
|
|
|
+ /* L1D Flush includes CPU buffer clear to mitigate MDS */
|
|
if (static_branch_unlikely(&vmx_l1d_should_flush))
|
|
vmx_l1d_flush(vcpu);
|
|
+ else if (static_branch_unlikely(&mds_user_clear))
|
|
+ mds_clear_cpu_buffers();
|
|
|
|
asm(
|
|
/* Store host registers */
|
|
@@ -9566,8 +9569,8 @@ free_vcpu:
|
|
return ERR_PTR(err);
|
|
}
|
|
|
|
-#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
|
|
-#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/l1tf.html for details.\n"
|
|
+#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
|
|
+#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
|
|
|
|
static int vmx_vm_init(struct kvm *kvm)
|
|
{
|
|
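The ordering of the two checks before VMENTER encodes a precedence: the L1D flush already overwrites the buffers that the MDS clear would scrub, so the separate clear is only issued when the flush branch is off but mds_user_clear is on. Expressed as a plain decision function, with the two booleans standing in for the static branches:

        #include <stdbool.h>
        #include <stdio.h>

        static bool need_separate_mds_clear(bool l1d_flush, bool mds_user_clear)
        {
                /* The L1D flush subsumes the CPU buffer clear. */
                return !l1d_flush && mds_user_clear;
        }

        int main(void)
        {
                printf("%d\n", need_separate_mds_clear(true,  true));   /* 0 */
                printf("%d\n", need_separate_mds_clear(false, true));   /* 1 */
                printf("%d\n", need_separate_mds_clear(false, false));  /* 0 */
                return 0;
        }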
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
|
|
index 90801a8f19c9..ce092a62fc5d 100644
|
|
--- a/arch/x86/mm/init.c
|
|
+++ b/arch/x86/mm/init.c
|
|
@@ -790,7 +790,7 @@ unsigned long max_swapfile_size(void)
|
|
|
|
pages = generic_max_swapfile_size();
|
|
|
|
- if (boot_cpu_has_bug(X86_BUG_L1TF)) {
|
|
+ if (boot_cpu_has_bug(X86_BUG_L1TF) && l1tf_mitigation != L1TF_MITIGATION_OFF) {
|
|
/* Limit the swap file size to MAX_PA/2 for L1TF workaround */
|
|
unsigned long long l1tf_limit = l1tf_pfn_limit();
|
|
/*
|
|
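The new condition ties the swap-size clamp to the L1TF mitigation state: with the mitigation off there is no reason to restrict swap, otherwise swap offsets stored in non-present PTEs have to stay below half of the maximum physical address. A worked example of that arithmetic, assuming 39 usable physical-address bits and 4 KiB pages (illustrative numbers, the kernel derives them from CPUID):

        #include <stdio.h>

        int main(void)
        {
                unsigned int phys_bits = 39;    /* assumed example value */
                unsigned int page_shift = 12;   /* 4 KiB pages */

                /* Half of the physical address space, expressed in page frames. */
                unsigned long long pfn_limit = 1ULL << (phys_bits - 1 - page_shift);
                unsigned long long max_swap_bytes = pfn_limit << page_shift;

                printf("PFN limit    : %llu pages\n", pfn_limit);
                printf("max swap size: %llu GiB\n", max_swap_bytes >> 30);
                return 0;
        }

With those assumed values the clamp works out to 2^26 page frames, i.e. 256 GiB of swap.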
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
|
|
index 3f729e20f0e3..12522dbae615 100644
|
|
--- a/arch/x86/mm/kaiser.c
|
|
+++ b/arch/x86/mm/kaiser.c
|
|
@@ -9,6 +9,7 @@
|
|
#include <linux/spinlock.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/uaccess.h>
|
|
+#include <linux/cpu.h>
|
|
|
|
#undef pr_fmt
|
|
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
|
|
@@ -297,7 +298,8 @@ void __init kaiser_check_boottime_disable(void)
|
|
goto skip;
|
|
}
|
|
|
|
- if (cmdline_find_option_bool(boot_command_line, "nopti"))
|
|
+ if (cmdline_find_option_bool(boot_command_line, "nopti") ||
|
|
+ cpu_mitigations_off())
|
|
goto disable;
|
|
|
|
skip:
|
|
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
|
|
index e30baa8ad94f..dff8ac2d255c 100644
|
|
--- a/arch/x86/mm/pgtable.c
|
|
+++ b/arch/x86/mm/pgtable.c
|
|
@@ -251,7 +251,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
|
|
if (pgd_val(pgd) != 0) {
|
|
pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
|
|
|
|
- pgdp[i] = native_make_pgd(0);
|
|
+ pgd_clear(&pgdp[i]);
|
|
|
|
paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
|
|
pmd_free(mm, pmd);
|
|
@@ -419,7 +419,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
|
|
int changed = !pte_same(*ptep, entry);
|
|
|
|
if (changed && dirty) {
|
|
- *ptep = entry;
|
|
+ set_pte(ptep, entry);
|
|
pte_update(vma->vm_mm, address, ptep);
|
|
}
|
|
|
|
@@ -436,7 +436,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
|
|
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
|
|
|
|
if (changed && dirty) {
|
|
- *pmdp = entry;
|
|
+ set_pmd(pmdp, entry);
|
|
/*
|
|
* We had a write-protection fault here and changed the pmd
|
|
* to to more permissive. No need to flush the TLB for that,
|
|
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
|
|
index eac92e2d171b..a112bb175dd4 100644
|
|
--- a/arch/x86/mm/tlb.c
|
|
+++ b/arch/x86/mm/tlb.c
|
|
@@ -30,6 +30,12 @@
|
|
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
|
|
*/
|
|
|
|
+/*
|
|
+ * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
|
|
+ * stored in cpu_tlb_state.last_user_mm_ibpb.
|
|
+ */
|
|
+#define LAST_USER_MM_IBPB 0x1UL
|
|
+
|
|
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
|
|
|
|
struct flush_tlb_info {
|
|
@@ -101,33 +107,101 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
|
|
local_irq_restore(flags);
|
|
}
|
|
|
|
+static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
|
|
+{
|
|
+ unsigned long next_tif = task_thread_info(next)->flags;
|
|
+ unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
|
|
+
|
|
+ return (unsigned long)next->mm | ibpb;
|
|
+}
|
|
+
|
|
+static void cond_ibpb(struct task_struct *next)
|
|
+{
|
|
+ if (!next || !next->mm)
|
|
+ return;
|
|
+
|
|
+ /*
|
|
+ * Both, the conditional and the always IBPB mode use the mm
|
|
+ * pointer to avoid the IBPB when switching between tasks of the
|
|
+ * same process. Using the mm pointer instead of mm->context.ctx_id
|
|
+ * opens a hypothetical hole vs. mm_struct reuse, which is more or
|
|
+ * less impossible to control by an attacker. Aside of that it
|
|
+ * would only affect the first schedule so the theoretically
|
|
+ * exposed data is not really interesting.
|
|
+ */
|
|
+ if (static_branch_likely(&switch_mm_cond_ibpb)) {
|
|
+ unsigned long prev_mm, next_mm;
|
|
+
|
|
+ /*
|
|
+ * This is a bit more complex than the always mode because
|
|
+ * it has to handle two cases:
|
|
+ *
|
|
+ * 1) Switch from a user space task (potential attacker)
|
|
+ * which has TIF_SPEC_IB set to a user space task
|
|
+ * (potential victim) which has TIF_SPEC_IB not set.
|
|
+ *
|
|
+ * 2) Switch from a user space task (potential attacker)
|
|
+ * which has TIF_SPEC_IB not set to a user space task
|
|
+ * (potential victim) which has TIF_SPEC_IB set.
|
|
+ *
|
|
+ * This could be done by unconditionally issuing IBPB when
|
|
+ * a task which has TIF_SPEC_IB set is either scheduled in
|
|
+ * or out. Though that results in two flushes when:
|
|
+ *
|
|
+ * - the same user space task is scheduled out and later
|
|
+ * scheduled in again and only a kernel thread ran in
|
|
+ * between.
|
|
+ *
|
|
+ * - a user space task belonging to the same process is
|
|
+ * scheduled in after a kernel thread ran in between
|
|
+ *
|
|
+ * - a user space task belonging to the same process is
|
|
+ * scheduled in immediately.
|
|
+ *
|
|
+ * Optimize this with reasonably small overhead for the
|
|
+ * above cases. Mangle the TIF_SPEC_IB bit into the mm
|
|
+ * pointer of the incoming task which is stored in
|
|
+ * cpu_tlbstate.last_user_mm_ibpb for comparison.
|
|
+ */
|
|
+ next_mm = mm_mangle_tif_spec_ib(next);
|
|
+ prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
|
|
+
|
|
+ /*
|
|
+ * Issue IBPB only if the mm's are different and one or
|
|
+ * both have the IBPB bit set.
|
|
+ */
|
|
+ if (next_mm != prev_mm &&
|
|
+ (next_mm | prev_mm) & LAST_USER_MM_IBPB)
|
|
+ indirect_branch_prediction_barrier();
|
|
+
|
|
+ this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
|
|
+ }
|
|
+
|
|
+ if (static_branch_unlikely(&switch_mm_always_ibpb)) {
|
|
+ /*
|
|
+ * Only flush when switching to a user space task with a
|
|
+ * different context than the user space task which ran
|
|
+ * last on this CPU.
|
|
+ */
|
|
+ if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
|
|
+ indirect_branch_prediction_barrier();
|
|
+ this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
|
|
+ }
|
|
+ }
|
|
+}
|
|
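cond_ibpb() above leans on mm_struct pointers being word aligned, so bit 0 is free to carry the incoming task's TIF_SPEC_IB state, and one compare plus one OR answers "different mm, and did either side ask for IBPB?". A user-space model of the encoding; the struct and the flag source are stand-ins for mm_struct and the thread-info flag:

        #include <stdbool.h>
        #include <stdint.h>
        #include <stdio.h>

        #define LAST_USER_MM_IBPB 0x1UL          /* low bit of an aligned pointer */

        struct mm { int dummy; };                /* stand-in for mm_struct */

        static uintptr_t mangle(const struct mm *mm, bool spec_ib)
        {
                return (uintptr_t)mm | (spec_ib ? LAST_USER_MM_IBPB : 0);
        }

        /* IBPB is needed only when the mm differs and either side set the bit. */
        static bool need_ibpb(uintptr_t prev_mm, uintptr_t next_mm)
        {
                return next_mm != prev_mm && ((next_mm | prev_mm) & LAST_USER_MM_IBPB);
        }

        int main(void)
        {
                static struct mm a, b;           /* aligned, so bit 0 is unused */

                printf("%d\n", need_ibpb(mangle(&a, true),  mangle(&a, true)));   /* 0 */
                printf("%d\n", need_ibpb(mangle(&a, true),  mangle(&b, false)));  /* 1 */
                printf("%d\n", need_ibpb(mangle(&a, false), mangle(&b, false)));  /* 0 */
                return 0;
        }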
+
|
|
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
|
struct task_struct *tsk)
|
|
{
|
|
unsigned cpu = smp_processor_id();
|
|
|
|
if (likely(prev != next)) {
|
|
- u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
|
|
-
|
|
/*
|
|
* Avoid user/user BTB poisoning by flushing the branch
|
|
* predictor when switching between processes. This stops
|
|
* one process from doing Spectre-v2 attacks on another.
|
|
- *
|
|
- * As an optimization, flush indirect branches only when
|
|
- * switching into processes that disable dumping. This
|
|
- * protects high value processes like gpg, without having
|
|
- * too high performance overhead. IBPB is *expensive*!
|
|
- *
|
|
- * This will not flush branches when switching into kernel
|
|
- * threads. It will also not flush if we switch to idle
|
|
- * thread and back to the same process. It will flush if we
|
|
- * switch to a different non-dumpable process.
|
|
*/
|
|
- if (tsk && tsk->mm &&
|
|
- tsk->mm->context.ctx_id != last_ctx_id &&
|
|
- get_dumpable(tsk->mm) != SUID_DUMP_USER)
|
|
- indirect_branch_prediction_barrier();
|
|
+ cond_ibpb(tsk);
|
|
|
|
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
|
|
/*
|
|
@@ -143,14 +217,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
|
set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
|
|
}
|
|
|
|
- /*
|
|
- * Record last user mm's context id, so we can avoid
|
|
- * flushing branch buffer with IBPB if we switch back
|
|
- * to the same user.
|
|
- */
|
|
- if (next != &init_mm)
|
|
- this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
|
|
-
|
|
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
|
|
this_cpu_write(cpu_tlbstate.active_mm, next);
|
|
|
|
diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c
|
|
index d49d3be81953..ecb5866aaf84 100644
|
|
--- a/arch/x86/platform/atom/punit_atom_debug.c
|
|
+++ b/arch/x86/platform/atom/punit_atom_debug.c
|
|
@@ -154,8 +154,8 @@ static void punit_dbgfs_unregister(void)
|
|
(kernel_ulong_t)&drv_data }
|
|
|
|
static const struct x86_cpu_id intel_punit_cpu_ids[] = {
|
|
- ICPU(INTEL_FAM6_ATOM_SILVERMONT1, punit_device_byt),
|
|
- ICPU(INTEL_FAM6_ATOM_MERRIFIELD, punit_device_tng),
|
|
+ ICPU(INTEL_FAM6_ATOM_SILVERMONT, punit_device_byt),
|
|
+ ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, punit_device_tng),
|
|
ICPU(INTEL_FAM6_ATOM_AIRMONT, punit_device_cht),
|
|
{}
|
|
};
|
|
diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c
|
|
index 957d3fa3b543..8e38249311bd 100644
|
|
--- a/drivers/acpi/acpi_lpss.c
|
|
+++ b/drivers/acpi/acpi_lpss.c
|
|
@@ -243,7 +243,7 @@ static const struct lpss_device_desc bsw_spi_dev_desc = {
|
|
#define ICPU(model) { X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, }
|
|
|
|
static const struct x86_cpu_id lpss_cpu_ids[] = {
|
|
- ICPU(INTEL_FAM6_ATOM_SILVERMONT1), /* Valleyview, Bay Trail */
|
|
+ ICPU(INTEL_FAM6_ATOM_SILVERMONT), /* Valleyview, Bay Trail */
|
|
ICPU(INTEL_FAM6_ATOM_AIRMONT), /* Braswell, Cherry Trail */
|
|
{}
|
|
};
|
|
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
|
|
index f1f4ce7ddb47..3b123735a1c4 100644
|
|
--- a/drivers/base/cpu.c
|
|
+++ b/drivers/base/cpu.c
|
|
@@ -531,11 +531,18 @@ ssize_t __weak cpu_show_l1tf(struct device *dev,
|
|
return sprintf(buf, "Not affected\n");
|
|
}
|
|
|
|
+ssize_t __weak cpu_show_mds(struct device *dev,
|
|
+ struct device_attribute *attr, char *buf)
|
|
+{
|
|
+ return sprintf(buf, "Not affected\n");
|
|
+}
|
|
+
|
|
static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL);
|
|
static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL);
|
|
static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL);
|
|
static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL);
|
|
static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL);
|
|
+static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL);
|
|
|
|
static struct attribute *cpu_root_vulnerabilities_attrs[] = {
|
|
&dev_attr_meltdown.attr,
|
|
@@ -543,6 +550,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = {
|
|
&dev_attr_spectre_v2.attr,
|
|
&dev_attr_spec_store_bypass.attr,
|
|
&dev_attr_l1tf.attr,
|
|
+ &dev_attr_mds.attr,
|
|
NULL
|
|
};
|
|
|
|
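With the attribute registered above, the MDS state shows up as a one-line text file under /sys/devices/system/cpu/vulnerabilities, and architectures that do not override the weak cpu_show_mds() report "Not affected". A minimal user-space reader:

        #include <stdio.h>

        int main(void)
        {
                const char *path = "/sys/devices/system/cpu/vulnerabilities/mds";
                char line[256];
                FILE *f = fopen(path, "r");

                if (!f) {
                        perror(path);   /* kernels without MDS support lack the file */
                        return 1;
                }
                if (fgets(line, sizeof(line), f))
                        printf("mds: %s", line);
                fclose(f);
                return 0;
        }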
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
|
|
index f690085b1ad9..4fe999687415 100644
|
|
--- a/drivers/cpufreq/intel_pstate.c
|
|
+++ b/drivers/cpufreq/intel_pstate.c
|
|
@@ -1413,7 +1413,7 @@ static void intel_pstate_update_util(struct update_util_data *data, u64 time,
|
|
static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
|
|
ICPU(INTEL_FAM6_SANDYBRIDGE, core_params),
|
|
ICPU(INTEL_FAM6_SANDYBRIDGE_X, core_params),
|
|
- ICPU(INTEL_FAM6_ATOM_SILVERMONT1, silvermont_params),
|
|
+ ICPU(INTEL_FAM6_ATOM_SILVERMONT, silvermont_params),
|
|
ICPU(INTEL_FAM6_IVYBRIDGE, core_params),
|
|
ICPU(INTEL_FAM6_HASWELL_CORE, core_params),
|
|
ICPU(INTEL_FAM6_BROADWELL_CORE, core_params),
|
|
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
|
|
index 5ded9b22b015..a6fa32c7e068 100644
|
|
--- a/drivers/idle/intel_idle.c
|
|
+++ b/drivers/idle/intel_idle.c
|
|
@@ -1107,14 +1107,14 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
|
|
ICPU(INTEL_FAM6_WESTMERE, idle_cpu_nehalem),
|
|
ICPU(INTEL_FAM6_WESTMERE_EP, idle_cpu_nehalem),
|
|
ICPU(INTEL_FAM6_NEHALEM_EX, idle_cpu_nehalem),
|
|
- ICPU(INTEL_FAM6_ATOM_PINEVIEW, idle_cpu_atom),
|
|
- ICPU(INTEL_FAM6_ATOM_LINCROFT, idle_cpu_lincroft),
|
|
+ ICPU(INTEL_FAM6_ATOM_BONNELL, idle_cpu_atom),
|
|
+ ICPU(INTEL_FAM6_ATOM_BONNELL_MID, idle_cpu_lincroft),
|
|
ICPU(INTEL_FAM6_WESTMERE_EX, idle_cpu_nehalem),
|
|
ICPU(INTEL_FAM6_SANDYBRIDGE, idle_cpu_snb),
|
|
ICPU(INTEL_FAM6_SANDYBRIDGE_X, idle_cpu_snb),
|
|
- ICPU(INTEL_FAM6_ATOM_CEDARVIEW, idle_cpu_atom),
|
|
- ICPU(INTEL_FAM6_ATOM_SILVERMONT1, idle_cpu_byt),
|
|
- ICPU(INTEL_FAM6_ATOM_MERRIFIELD, idle_cpu_tangier),
|
|
+ ICPU(INTEL_FAM6_ATOM_SALTWELL, idle_cpu_atom),
|
|
+ ICPU(INTEL_FAM6_ATOM_SILVERMONT, idle_cpu_byt),
|
|
+ ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID, idle_cpu_tangier),
|
|
ICPU(INTEL_FAM6_ATOM_AIRMONT, idle_cpu_cht),
|
|
ICPU(INTEL_FAM6_IVYBRIDGE, idle_cpu_ivb),
|
|
ICPU(INTEL_FAM6_IVYBRIDGE_X, idle_cpu_ivt),
|
|
@@ -1122,7 +1122,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
|
|
ICPU(INTEL_FAM6_HASWELL_X, idle_cpu_hsw),
|
|
ICPU(INTEL_FAM6_HASWELL_ULT, idle_cpu_hsw),
|
|
ICPU(INTEL_FAM6_HASWELL_GT3E, idle_cpu_hsw),
|
|
- ICPU(INTEL_FAM6_ATOM_SILVERMONT2, idle_cpu_avn),
|
|
+ ICPU(INTEL_FAM6_ATOM_SILVERMONT_X, idle_cpu_avn),
|
|
ICPU(INTEL_FAM6_BROADWELL_CORE, idle_cpu_bdw),
|
|
ICPU(INTEL_FAM6_BROADWELL_GT3E, idle_cpu_bdw),
|
|
ICPU(INTEL_FAM6_BROADWELL_X, idle_cpu_bdw),
|
|
@@ -1134,7 +1134,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
|
|
ICPU(INTEL_FAM6_SKYLAKE_X, idle_cpu_skx),
|
|
ICPU(INTEL_FAM6_XEON_PHI_KNL, idle_cpu_knl),
|
|
ICPU(INTEL_FAM6_ATOM_GOLDMONT, idle_cpu_bxt),
|
|
- ICPU(INTEL_FAM6_ATOM_DENVERTON, idle_cpu_dnv),
|
|
+ ICPU(INTEL_FAM6_ATOM_GOLDMONT_X, idle_cpu_dnv),
|
|
{}
|
|
};
|
|
|
|
diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c
|
|
index 80918abfc468..4398398c0935 100644
|
|
--- a/drivers/mmc/host/sdhci-acpi.c
|
|
+++ b/drivers/mmc/host/sdhci-acpi.c
|
|
@@ -127,7 +127,7 @@ static const struct sdhci_acpi_chip sdhci_acpi_chip_int = {
|
|
static bool sdhci_acpi_byt(void)
|
|
{
|
|
static const struct x86_cpu_id byt[] = {
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 },
|
|
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT },
|
|
{}
|
|
};
|
|
|
|
diff --git a/drivers/pci/pci-mid.c b/drivers/pci/pci-mid.c
|
|
index c7f3408e3148..54b3f9bc5ad8 100644
|
|
--- a/drivers/pci/pci-mid.c
|
|
+++ b/drivers/pci/pci-mid.c
|
|
@@ -71,8 +71,8 @@ static struct pci_platform_pm_ops mid_pci_platform_pm = {
|
|
* arch/x86/platform/intel-mid/pwr.c.
|
|
*/
|
|
static const struct x86_cpu_id lpss_cpu_ids[] = {
|
|
- ICPU(INTEL_FAM6_ATOM_PENWELL),
|
|
- ICPU(INTEL_FAM6_ATOM_MERRIFIELD),
|
|
+ ICPU(INTEL_FAM6_ATOM_SALTWELL_MID),
|
|
+ ICPU(INTEL_FAM6_ATOM_SILVERMONT_MID),
|
|
{}
|
|
};
|
|
|
|
diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
|
|
index 3c71f608b444..8809c1a20bed 100644
|
|
--- a/drivers/powercap/intel_rapl.c
|
|
+++ b/drivers/powercap/intel_rapl.c
|
|
@@ -1175,12 +1175,12 @@ static const struct x86_cpu_id rapl_ids[] __initconst = {
|
|
RAPL_CPU(INTEL_FAM6_KABYLAKE_MOBILE, rapl_defaults_core),
|
|
RAPL_CPU(INTEL_FAM6_KABYLAKE_DESKTOP, rapl_defaults_core),
|
|
|
|
- RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT1, rapl_defaults_byt),
|
|
+ RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT, rapl_defaults_byt),
|
|
RAPL_CPU(INTEL_FAM6_ATOM_AIRMONT, rapl_defaults_cht),
|
|
- RAPL_CPU(INTEL_FAM6_ATOM_MERRIFIELD, rapl_defaults_tng),
|
|
- RAPL_CPU(INTEL_FAM6_ATOM_MOOREFIELD, rapl_defaults_ann),
|
|
+ RAPL_CPU(INTEL_FAM6_ATOM_SILVERMONT_MID,rapl_defaults_tng),
|
|
+ RAPL_CPU(INTEL_FAM6_ATOM_AIRMONT_MID, rapl_defaults_ann),
|
|
RAPL_CPU(INTEL_FAM6_ATOM_GOLDMONT, rapl_defaults_core),
|
|
- RAPL_CPU(INTEL_FAM6_ATOM_DENVERTON, rapl_defaults_core),
|
|
+ RAPL_CPU(INTEL_FAM6_ATOM_GOLDMONT_X, rapl_defaults_core),
|
|
|
|
RAPL_CPU(INTEL_FAM6_XEON_PHI_KNL, rapl_defaults_hsw_server),
|
|
{}
|
|
diff --git a/drivers/thermal/intel_soc_dts_thermal.c b/drivers/thermal/intel_soc_dts_thermal.c
|
|
index b2bbaa1c60b0..18788109cae6 100644
|
|
--- a/drivers/thermal/intel_soc_dts_thermal.c
|
|
+++ b/drivers/thermal/intel_soc_dts_thermal.c
|
|
@@ -43,7 +43,7 @@ static irqreturn_t soc_irq_thread_fn(int irq, void *dev_data)
|
|
}
|
|
|
|
static const struct x86_cpu_id soc_thermal_ids[] = {
|
|
- { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1, 0,
|
|
+ { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT, 0,
|
|
BYT_SOC_DTS_APIC_IRQ},
|
|
{}
|
|
};
|
|
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
|
|
index a83c822c35c2..d4b167fc9ecb 100644
|
|
--- a/include/linux/bitops.h
|
|
+++ b/include/linux/bitops.h
|
|
@@ -1,28 +1,9 @@
|
|
#ifndef _LINUX_BITOPS_H
|
|
#define _LINUX_BITOPS_H
|
|
#include <asm/types.h>
|
|
+#include <linux/bits.h>
|
|
|
|
-#ifdef __KERNEL__
|
|
-#define BIT(nr) (1UL << (nr))
|
|
-#define BIT_ULL(nr) (1ULL << (nr))
|
|
-#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
|
|
-#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
|
|
-#define BIT_ULL_MASK(nr) (1ULL << ((nr) % BITS_PER_LONG_LONG))
|
|
-#define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG)
|
|
-#define BITS_PER_BYTE 8
|
|
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long))
|
|
-#endif
|
|
-
|
|
-/*
|
|
- * Create a contiguous bitmask starting at bit position @l and ending at
|
|
- * position @h. For example
|
|
- * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000.
|
|
- */
|
|
-#define GENMASK(h, l) \
|
|
- (((~0UL) << (l)) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
|
|
-
|
|
-#define GENMASK_ULL(h, l) \
|
|
- (((~0ULL) << (l)) & (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h))))
|
|
|
|
extern unsigned int __sw_hweight8(unsigned int w);
|
|
extern unsigned int __sw_hweight16(unsigned int w);
|
|
diff --git a/include/linux/bits.h b/include/linux/bits.h
new file mode 100644
index 000000000000..2b7b532c1d51
--- /dev/null
+++ b/include/linux/bits.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_BITS_H
+#define __LINUX_BITS_H
+#include <asm/bitsperlong.h>
+
+#define BIT(nr) (1UL << (nr))
+#define BIT_ULL(nr) (1ULL << (nr))
+#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
+#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
+#define BIT_ULL_MASK(nr) (1ULL << ((nr) % BITS_PER_LONG_LONG))
+#define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG)
+#define BITS_PER_BYTE 8
+
+/*
+ * Create a contiguous bitmask starting at bit position @l and ending at
+ * position @h. For example
+ * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000.
+ */
+#define GENMASK(h, l) \
+ (((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h))))
+
+#define GENMASK_ULL(h, l) \
+ (((~0ULL) - (1ULL << (l)) + 1) & \
+ (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h))))
+
+#endif /* __LINUX_BITS_H */
|
|
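The GENMASK comment's example can be checked directly: (~0ULL - (1ULL << l) + 1) equals ~0ULL << l for any valid l, and the right shift clears everything above h. A quick self-test built from the same expression as the new header, with BITS_PER_LONG_LONG fixed at 64 for the sketch:

        #include <assert.h>
        #include <stdio.h>

        #define BITS_PER_LONG_LONG 64

        /* Same expression as the new include/linux/bits.h GENMASK_ULL(). */
        #define GENMASK_ULL(h, l) \
                (((~0ULL) - (1ULL << (l)) + 1) & \
                 (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h))))

        int main(void)
        {
                assert(GENMASK_ULL(39, 21) == 0x000000ffffe00000ULL);
                assert(GENMASK_ULL(7, 0) == 0xffULL);
                assert(GENMASK_ULL(63, 0) == ~0ULL);
                printf("GENMASK_ULL(39, 21) = %#llx\n", GENMASK_ULL(39, 21));
                return 0;
        }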
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
|
|
index ae5ac89324df..166686209f2c 100644
|
|
--- a/include/linux/cpu.h
|
|
+++ b/include/linux/cpu.h
|
|
@@ -54,6 +54,8 @@ extern ssize_t cpu_show_spec_store_bypass(struct device *dev,
|
|
struct device_attribute *attr, char *buf);
|
|
extern ssize_t cpu_show_l1tf(struct device *dev,
|
|
struct device_attribute *attr, char *buf);
|
|
+extern ssize_t cpu_show_mds(struct device *dev,
|
|
+ struct device_attribute *attr, char *buf);
|
|
|
|
extern __printf(4, 5)
|
|
struct device *cpu_device_create(struct device *parent, void *drvdata,
|
|
@@ -276,4 +278,28 @@ static inline void cpu_smt_check_topology_early(void) { }
|
|
static inline void cpu_smt_check_topology(void) { }
|
|
#endif
|
|
|
|
+/*
|
|
+ * These are used for a global "mitigations=" cmdline option for toggling
|
|
+ * optional CPU mitigations.
|
|
+ */
|
|
+enum cpu_mitigations {
|
|
+ CPU_MITIGATIONS_OFF,
|
|
+ CPU_MITIGATIONS_AUTO,
|
|
+ CPU_MITIGATIONS_AUTO_NOSMT,
|
|
+};
|
|
+
|
|
+extern enum cpu_mitigations cpu_mitigations;
|
|
+
|
|
+/* mitigations=off */
|
|
+static inline bool cpu_mitigations_off(void)
|
|
+{
|
|
+ return cpu_mitigations == CPU_MITIGATIONS_OFF;
|
|
+}
|
|
+
|
|
+/* mitigations=auto,nosmt */
|
|
+static inline bool cpu_mitigations_auto_nosmt(void)
|
|
+{
|
|
+ return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
|
|
+}
|
|
+
|
|
#endif /* _LINUX_CPU_H_ */
|
|
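The helpers above give the individual mitigations a single switch to consult; the kaiser.c hunk earlier already uses cpu_mitigations_off() to disable page-table isolation. A user-space sketch of how the three "mitigations=" values map onto the enum (the real parsing happens on the kernel command line, this is only an illustration):

        #include <stdio.h>
        #include <string.h>

        enum cpu_mitigations {
                CPU_MITIGATIONS_OFF,
                CPU_MITIGATIONS_AUTO,
                CPU_MITIGATIONS_AUTO_NOSMT,
        };

        /* Illustrative mapping of the "mitigations=" values to the enum above. */
        static enum cpu_mitigations parse_mitigations(const char *arg)
        {
                if (!strcmp(arg, "off"))
                        return CPU_MITIGATIONS_OFF;
                if (!strcmp(arg, "auto,nosmt"))
                        return CPU_MITIGATIONS_AUTO_NOSMT;
                return CPU_MITIGATIONS_AUTO;    /* "auto" and anything unrecognised */
        }

        int main(void)
        {
                const char *args[] = { "off", "auto", "auto,nosmt" };

                for (unsigned int i = 0; i < 3; i++)
                        printf("mitigations=%s -> %d\n", args[i],
                               parse_mitigations(args[i]));
                return 0;
        }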
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
|
|
index d53a23100401..58ae371556bc 100644
|
|
--- a/include/linux/ptrace.h
|
|
+++ b/include/linux/ptrace.h
|
|
@@ -60,14 +60,17 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead);
|
|
#define PTRACE_MODE_READ 0x01
|
|
#define PTRACE_MODE_ATTACH 0x02
|
|
#define PTRACE_MODE_NOAUDIT 0x04
|
|
-#define PTRACE_MODE_FSCREDS 0x08
|
|
-#define PTRACE_MODE_REALCREDS 0x10
|
|
+#define PTRACE_MODE_FSCREDS 0x08
|
|
+#define PTRACE_MODE_REALCREDS 0x10
|
|
+#define PTRACE_MODE_SCHED 0x20
|
|
+#define PTRACE_MODE_IBPB 0x40
|
|
|
|
/* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */
|
|
#define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS)
|
|
#define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS)
|
|
#define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS)
|
|
#define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS)
|
|
+#define PTRACE_MODE_SPEC_IBPB (PTRACE_MODE_ATTACH_REALCREDS | PTRACE_MODE_IBPB)
|
|
|
|
/**
|
|
* ptrace_may_access - check whether the caller is permitted to access
|
|
@@ -85,6 +88,20 @@ extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead);
|
|
*/
|
|
extern bool ptrace_may_access(struct task_struct *task, unsigned int mode);
|
|
|
|
+/**
|
|
+ * ptrace_may_access - check whether the caller is permitted to access
|
|
+ * a target task.
|
|
+ * @task: target task
|
|
+ * @mode: selects type of access and caller credentials
|
|
+ *
|
|
+ * Returns true on success, false on denial.
|
|
+ *
|
|
+ * Similar to ptrace_may_access(). Only to be called from context switch
|
|
+ * code. Does not call into audit and the regular LSM hooks due to locking
|
|
+ * constraints.
|
|
+ */
|
|
+extern bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode);
|
|
+
|
|
static inline int ptrace_reparented(struct task_struct *child)
|
|
{
|
|
return !same_thread_group(child->real_parent, child->parent);
|
|
diff --git a/include/linux/sched.h b/include/linux/sched.h
|
|
index ebd0afb35d16..1c487a3abd84 100644
|
|
--- a/include/linux/sched.h
|
|
+++ b/include/linux/sched.h
|
|
@@ -2357,6 +2357,8 @@ static inline void memalloc_noio_restore(unsigned int flags)
|
|
#define PFA_LMK_WAITING 3 /* Lowmemorykiller is waiting */
|
|
#define PFA_SPEC_SSB_DISABLE 4 /* Speculative Store Bypass disabled */
|
|
#define PFA_SPEC_SSB_FORCE_DISABLE 5 /* Speculative Store Bypass force disabled*/
|
|
+#define PFA_SPEC_IB_DISABLE 6 /* Indirect branch speculation restricted */
|
|
+#define PFA_SPEC_IB_FORCE_DISABLE 7 /* Indirect branch speculation permanently restricted */
|
|
|
|
|
|
#define TASK_PFA_TEST(name, func) \
|
|
@@ -2390,6 +2392,13 @@ TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable)
|
|
TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
|
|
TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
|
|
|
|
+TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable)
|
|
+TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable)
|
|
+TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable)
|
|
+
|
|
+TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
|
|
+TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
|
|
+
|
|
/*
|
|
* task->jobctl flags
|
|
*/
|
|
diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h
new file mode 100644
index 000000000000..559ac4590593
--- /dev/null
+++ b/include/linux/sched/smt.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_SMT_H
+#define _LINUX_SCHED_SMT_H
+
+#include <linux/atomic.h>
+
+#ifdef CONFIG_SCHED_SMT
+extern atomic_t sched_smt_present;
+
+static __always_inline bool sched_smt_active(void)
+{
+ return atomic_read(&sched_smt_present);
+}
+#else
+static inline bool sched_smt_active(void) { return false; }
+#endif
+
+void arch_smt_update(void);
+
+#endif
|
|
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
|
|
index 64776b72e1eb..64ec0d62e5f5 100644
|
|
--- a/include/uapi/linux/prctl.h
|
|
+++ b/include/uapi/linux/prctl.h
|
|
@@ -202,6 +202,7 @@ struct prctl_mm_map {
|
|
#define PR_SET_SPECULATION_CTRL 53
|
|
/* Speculation control variants */
|
|
# define PR_SPEC_STORE_BYPASS 0
|
|
+# define PR_SPEC_INDIRECT_BRANCH 1
|
|
/* Return and control values for PR_SET/GET_SPECULATION_CTRL */
|
|
# define PR_SPEC_NOT_AFFECTED 0
|
|
# define PR_SPEC_PRCTL (1UL << 0)
|
|
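PR_SPEC_INDIRECT_BRANCH extends the existing speculation prctl: PR_GET_SPECULATION_CTRL reports whether the control is implemented, and PR_SET_SPECULATION_CTRL with PR_SPEC_DISABLE restricts indirect-branch speculation for the calling task, which is what ends up setting the PFA_SPEC_IB_DISABLE and TIF_SPEC_IB bits added in this patch. A minimal caller; the numeric fallbacks mirror the uapi header for builds against older installed headers:

        #include <stdio.h>
        #include <sys/prctl.h>

        #ifndef PR_SET_SPECULATION_CTRL   /* values from include/uapi/linux/prctl.h */
        #define PR_GET_SPECULATION_CTRL 52
        #define PR_SET_SPECULATION_CTRL 53
        #endif
        #ifndef PR_SPEC_INDIRECT_BRANCH
        #define PR_SPEC_INDIRECT_BRANCH 1
        #endif
        #ifndef PR_SPEC_DISABLE
        #define PR_SPEC_PRCTL   (1UL << 0)
        #define PR_SPEC_ENABLE  (1UL << 1)
        #define PR_SPEC_DISABLE (1UL << 2)
        #endif

        int main(void)
        {
                long state = prctl(PR_GET_SPECULATION_CTRL,
                                   PR_SPEC_INDIRECT_BRANCH, 0, 0, 0);

                if (state < 0) {
                        perror("PR_GET_SPECULATION_CTRL");  /* kernel lacks the control */
                        return 1;
                }
                printf("indirect branch speculation state: 0x%lx\n", state);

                if ((state & PR_SPEC_PRCTL) &&
                    prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_INDIRECT_BRANCH,
                          PR_SPEC_DISABLE, 0, 0) == 0)
                        puts("indirect branch speculation now restricted for this task");
                return 0;
        }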
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
|
|
index f39a7be98fc1..efba851ee018 100644
|
|
--- a/kernel/ptrace.c
|
|
+++ b/kernel/ptrace.c
|
|
@@ -258,6 +258,9 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
|
|
|
|
static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
|
|
{
|
|
+ if (mode & PTRACE_MODE_SCHED)
|
|
+ return false;
|
|
+
|
|
if (mode & PTRACE_MODE_NOAUDIT)
|
|
return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
|
|
else
|
|
@@ -325,9 +328,16 @@ ok:
|
|
!ptrace_has_cap(mm->user_ns, mode)))
|
|
return -EPERM;
|
|
|
|
+ if (mode & PTRACE_MODE_SCHED)
|
|
+ return 0;
|
|
return security_ptrace_access_check(task, mode);
|
|
}
|
|
|
|
+bool ptrace_may_access_sched(struct task_struct *task, unsigned int mode)
|
|
+{
|
|
+ return __ptrace_may_access(task, mode | PTRACE_MODE_SCHED);
|
|
+}
|
|
+
|
|
bool ptrace_may_access(struct task_struct *task, unsigned int mode)
|
|
{
|
|
int err;
|
|
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
|
|
index 6b3fff6a6437..50e80b1be2c8 100644
|
|
--- a/kernel/sched/core.c
|
|
+++ b/kernel/sched/core.c
|
|
@@ -7355,11 +7355,22 @@ static int cpuset_cpu_inactive(unsigned int cpu)
|
|
return 0;
|
|
}
|
|
|
|
+#ifdef CONFIG_SCHED_SMT
|
|
+atomic_t sched_smt_present = ATOMIC_INIT(0);
|
|
+#endif
|
|
+
|
|
int sched_cpu_activate(unsigned int cpu)
|
|
{
|
|
struct rq *rq = cpu_rq(cpu);
|
|
unsigned long flags;
|
|
|
|
+#ifdef CONFIG_SCHED_SMT
|
|
+ /*
|
|
+ * When going up, increment the number of cores with SMT present.
|
|
+ */
|
|
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
|
|
+ atomic_inc(&sched_smt_present);
|
|
+#endif
|
|
set_cpu_active(cpu, true);
|
|
|
|
if (sched_smp_initialized) {
|
|
@@ -7408,6 +7419,14 @@ int sched_cpu_deactivate(unsigned int cpu)
|
|
else
|
|
synchronize_rcu();
|
|
|
|
+#ifdef CONFIG_SCHED_SMT
|
|
+ /*
|
|
+ * When going down, decrement the number of cores with SMT present.
|
|
+ */
|
|
+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
|
|
+ atomic_dec(&sched_smt_present);
|
|
+#endif
|
|
+
|
|
if (!sched_smp_initialized)
|
|
return 0;
|
|
|
|
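sched_cpu_activate()/sched_cpu_deactivate() above keep sched_smt_present counting the cores that currently have two online siblings, so sched_smt_active() from the new <linux/sched/smt.h> is simply "is the counter non-zero". A small single-threaded model of that bookkeeping, with sibling counts passed in as plain integers instead of cpumask weights and a plain int standing in for the atomic_t:

        #include <stdio.h>

        static int sched_smt_present;            /* models the atomic_t counter */

        /* weight = online siblings in the core at the time of the check */
        static void cpu_up(int smt_weight)
        {
                if (smt_weight == 2)             /* second sibling just came online */
                        sched_smt_present++;
        }

        static void cpu_down(int smt_weight)
        {
                if (smt_weight == 2)             /* about to drop back to one sibling */
                        sched_smt_present--;
        }

        static int sched_smt_active(void)
        {
                return sched_smt_present != 0;
        }

        int main(void)
        {
                cpu_up(1);                       /* first sibling of a core */
                cpu_up(2);                       /* second sibling: SMT now present */
                printf("smt active: %d\n", sched_smt_active());   /* 1 */
                cpu_down(2);                     /* one sibling goes away again */
                printf("smt active: %d\n", sched_smt_active());   /* 0 */
                return 0;
        }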
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
|
|
index ec6e838e991a..15c08752926b 100644
|
|
--- a/kernel/sched/sched.h
|
|
+++ b/kernel/sched/sched.h
|
|
@@ -2,6 +2,7 @@
|
|
#include <linux/sched.h>
|
|
#include <linux/sched/sysctl.h>
|
|
#include <linux/sched/rt.h>
|
|
+#include <linux/sched/smt.h>
|
|
#include <linux/u64_stats_sync.h>
|
|
#include <linux/sched/deadline.h>
|
|
#include <linux/kernel_stat.h>
|
|
diff --git a/tools/power/x86/turbostat/Makefile b/tools/power/x86/turbostat/Makefile
|
|
index 8561e7ddca59..92be948c922d 100644
|
|
--- a/tools/power/x86/turbostat/Makefile
|
|
+++ b/tools/power/x86/turbostat/Makefile
|
|
@@ -8,7 +8,7 @@ ifeq ("$(origin O)", "command line")
|
|
endif
|
|
|
|
turbostat : turbostat.c
|
|
-CFLAGS += -Wall
|
|
+CFLAGS += -Wall -I../../../include
|
|
CFLAGS += -DMSRHEADER='"../../../../arch/x86/include/asm/msr-index.h"'
|
|
|
|
%: %.c
|