Merge tag 'x86-urgent-2022-08-13' of git://git.kernel.org/pub/scm/linux/kernel/git...
authorLinus Torvalds <torvalds@linux-foundation.org>
Sat, 13 Aug 2022 21:24:12 +0000 (14:24 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Sat, 13 Aug 2022 21:24:12 +0000 (14:24 -0700)
Pull x86 fix from Ingo Molnar:
 "Fix the 'IBPB mitigated RETBleed' mode of operation on AMD CPUs (not
  turned on by default), which also need STIBP enabled (if available) to
  be '100% safe' on even the shortest speculation windows"

* tag 'x86-urgent-2022-08-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/bugs: Enable STIBP for IBPB mitigated RETBleed

1  2 
Documentation/admin-guide/kernel-parameters.txt
arch/x86/kernel/cpu/bugs.c

                        nosocket -- Disable socket memory accounting.
                        nokmem -- Disable kernel memory accounting.
  
 -      checkreqprot    [SELINUX] Set initial checkreqprot flag value.
 +      checkreqprot=   [SELINUX] Set initial checkreqprot flag value.
                        Format: { "0" | "1" }
                        See security/selinux/Kconfig help text.
                        0 -- check protection applied by kernel (includes
        nopku           [X86] Disable Memory Protection Keys CPU feature found
                        in some Intel CPUs.
  
 -      <module>.async_probe [KNL]
 -                      Enable asynchronous probe on this module.
 +      <module>.async_probe[=<bool>] [KNL]
 +                      If no <bool> value is specified or if the value
 +                      specified is not a valid <bool>, enable asynchronous
 +                      probe on this module.  Otherwise, enable/disable
 +                      asynchronous probe on this module as indicated by the
 +                      <bool> value. See also: module.async_probe
  
        early_ioremap_debug [KNL]
                        Enable debug messages in early_ioremap support. This
                        (in particular on some ATI chipsets).
                        The kernel tries to set a reasonable default.
  
 -      enforcing       [SELINUX] Set initial enforcing status.
 +      enforcing=      [SELINUX] Set initial enforcing status.
                        Format: {"0" | "1"}
                        See security/selinux/Kconfig help text.
                        0 -- permissive (log only, no denials).
  
        hlt             [BUGS=ARM,SH]
  
 +      hostname=       [KNL] Set the hostname (aka UTS nodename).
 +                      Format: <string>
 +                      This allows setting the system's hostname during early
 +                      startup. This sets the name returned by gethostname.
 +                      Using this parameter to set the hostname makes it
 +                      possible to ensure the hostname is correctly set before
 +                      any userspace processes run, avoiding the possibility
 +                      that a process may call gethostname before the hostname
 +                      has been explicitly set, resulting in the calling
 +                      process getting an incorrect result. The string must
 +                      not exceed the maximum allowed hostname length (usually
 +                      64 characters) and will be truncated otherwise.
 +
        hpet=           [X86-32,HPET] option to control HPET usage
                        Format: { enable (default) | disable | force |
                                verbose }
        hugetlb_free_vmemmap=
                        [KNL] Requires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
                        enabled.
 +                      Control if HugeTLB Vmemmap Optimization (HVO) is enabled.
                        Allows heavy hugetlb users to free up some more
                        memory (7 * PAGE_SIZE for each 2MB hugetlb page).
 -                      Format: { [oO][Nn]/Y/y/1 | [oO][Ff]/N/n/0 (default) }
 +                      Format: { on | off (default) }
  
 -                      [oO][Nn]/Y/y/1: enable the feature
 -                      [oO][Ff]/N/n/0: disable the feature
 +                      on: enable HVO
 +                      off: disable HVO
  
                        Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y,
                        the default is on.
  
 -                      This is not compatible with memory_hotplug.memmap_on_memory.
 -                      If both parameters are enabled, hugetlb_free_vmemmap takes
 -                      precedence over memory_hotplug.memmap_on_memory.
 +                      Note that the vmemmap pages may be allocated from the added
 +                      memory block itself when memory_hotplug.memmap_on_memory is
 +                      enabled, those vmemmap pages cannot be optimized even if this
 +                      feature is enabled.  Other vmemmap pages not allocated from
 +                      the added memory block itself are not affected.
  
        hung_task_panic=
                        [KNL] Should the hung task detector generate panics.
  
        ivrs_ioapic     [HW,X86-64]
                        Provide an override to the IOAPIC-ID<->DEVICE-ID
 -                      mapping provided in the IVRS ACPI table. For
 -                      example, to map IOAPIC-ID decimal 10 to
 -                      PCI device 00:14.0 write the parameter as:
 +                      mapping provided in the IVRS ACPI table.
 +                      By default, PCI segment is 0, and can be omitted.
 +                      For example:
 +                      * To map IOAPIC-ID decimal 10 to PCI device 00:14.0
 +                        write the parameter as:
                                ivrs_ioapic[10]=00:14.0
 +                      * To map IOAPIC-ID decimal 10 to PCI segment 0x1 and
 +                        PCI device 00:14.0 write the parameter as:
 +                              ivrs_ioapic[10]=0001:00:14.0
  
        ivrs_hpet       [HW,X86-64]
                        Provide an override to the HPET-ID<->DEVICE-ID
 -                      mapping provided in the IVRS ACPI table. For
 -                      example, to map HPET-ID decimal 0 to
 -                      PCI device 00:14.0 write the parameter as:
 +                      mapping provided in the IVRS ACPI table.
 +                      By default, PCI segment is 0, and can be omitted.
 +                      For example:
 +                      * To map HPET-ID decimal 0 to PCI device 00:14.0
 +                        write the parameter as:
                                ivrs_hpet[0]=00:14.0
 +                      * To map HPET-ID decimal 10 to PCI segment 0x1 and
 +                        PCI device 00:14.0 write the parameter as:
 +                              ivrs_hpet[10]=0001:00:14.0
  
        ivrs_acpihid    [HW,X86-64]
                        Provide an override to the ACPI-HID:UID<->DEVICE-ID
 -                      mapping provided in the IVRS ACPI table. For
 -                      example, to map UART-HID:UID AMD0020:0 to
 -                      PCI device 00:14.5 write the parameter as:
 +                      mapping provided in the IVRS ACPI table.
 +
 +                      For example, to map UART-HID:UID AMD0020:0 to
 +                      PCI segment 0x1 and PCI device ID 00:14.5,
 +                      write the parameter as:
 +                              ivrs_acpihid[0001:00:14.5]=AMD0020:0
 +
 +                      By default, PCI segment is 0, and can be omitted.
 +                      For example, to map to PCI device 00:14.5 write the parameter as:
                                ivrs_acpihid[00:14.5]=AMD0020:0
  
        js=             [HW,JOY] Analog joystick
                        the KVM_CLEAR_DIRTY ioctl, and only for the pages being
                        cleared.
  
 -                      Eager page splitting currently only supports splitting
 -                      huge pages mapped by the TDP MMU.
 +                      Eager page splitting is only supported when kvm.tdp_mmu=Y.
  
                        Default is Y (on).
  
                        [KNL,X86,ARM] Boolean flag to enable this feature.
                        Format: {on | off (default)}
                        When enabled, runtime hotplugged memory will
 -                      allocate its internal metadata (struct pages)
 -                      from the hotadded memory which will allow to
 -                      hotadd a lot of memory without requiring
 -                      additional memory to do so.
 +                      allocate its internal metadata (struct pages,
 +                      those vmemmap pages cannot be optimized even
 +                      if hugetlb_free_vmemmap is enabled) from the
 +                      hotadded memory which will allow to hotadd a
 +                      lot of memory without requiring additional
 +                      memory to do so.
                        This feature is disabled by default because it
                        has some implication on large (e.g. GB)
                        allocations in some configurations (e.g. small
                        Note that even when enabled, there are a few cases where
                        the feature is not effective.
  
 -                      This is not compatible with hugetlb_free_vmemmap. If
 -                      both parameters are enabled, hugetlb_free_vmemmap takes
 -                      precedence over memory_hotplug.memmap_on_memory.
 -
        memtest=        [KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest
                        Format: <integer>
                        default : 0 <disable>
                        mem_encrypt=on:         Activate SME
                        mem_encrypt=off:        Do not activate SME
  
 -                      Refer to Documentation/virt/kvm/amd-memory-encryption.rst
 +                      Refer to Documentation/virt/kvm/x86/amd-memory-encryption.rst
                        for details on when memory encryption can be activated.
  
        mem_sleep_default=      [SUSPEND] Default system suspend mode:
                        For details see:
                        Documentation/admin-guide/hw-vuln/processor_mmio_stale_data.rst
  
 +      module.async_probe=<bool>
 +                      [KNL] When set to true, modules will use async probing
 +                      by default. To enable/disable async probing for a
 +                      specific module, use the module specific control that
 +                      is documented under <module>.async_probe. When both
 +                      module.async_probe and <module>.async_probe are
 +                      specified, <module>.async_probe takes precedence for
 +                      the specific module.
 +
        module.sig_enforce
                        [KNL] When CONFIG_MODULE_SIG is set, this means that
                        modules without (valid) signatures will fail to load.
  
        noautogroup     Disable scheduler automatic task group creation.
  
 -      nobats          [PPC] Do not use BATs for mapping kernel lowmem
 -                      on "Classic" PPC cores.
 -
        nocache         [ARM]
  
        nodsp           [SH] Disable hardware DSP at boot time.
                        just as if they had also been called out in the
                        rcu_nocbs= boot parameter.
  
 +                      Note that this argument takes precedence over
 +                      the CONFIG_RCU_NOCB_CPU_DEFAULT_ALL option.
 +
        noiotrap        [SH] Disables trapped I/O port accesses.
  
        noirqdebug      [X86-32] Disables the code which attempts to detect and
  
        nolapic_timer   [X86-32,APIC] Do not use the local APIC timer.
  
 -      noltlbs         [PPC] Do not use large page/tlb entries for kernel
 -                      lowmem mapping on PPC40x and PPC8xx
 -
        nomca           [IA-64] Disable machine check abort handling
  
        nomce           [X86-32] Disable Machine Check Exception
        noreplace-smp   [X86-32,SMP] Don't replace SMP instructions
                        with UP alternatives
  
 -      nordrand        [X86] Disable kernel use of the RDRAND and
 -                      RDSEED instructions even if they are supported
 -                      by the processor.  RDRAND and RDSEED are still
 -                      available to user space applications.
 -
        noresume        [SWSUSP] Disables resume and restores original swap
                        space.
  
                        no-callback mode from boot but the mode may be
                        toggled at runtime via cpusets.
  
 +                      Note that this argument takes precedence over
 +                      the CONFIG_RCU_NOCB_CPU_DEFAULT_ALL option.
 +
        rcu_nocb_poll   [KNL]
                        Rather than requiring that offloaded CPUs
                        (specified by rcu_nocbs= above) explicitly
                        When RCU_NOCB_CPU is set, also adjust the
                        priority of NOCB callback kthreads.
  
 +      rcutree.rcu_divisor= [KNL]
 +                      Set the shift-right count to use to compute
 +                      the callback-invocation batch limit bl from
 +                      the number of callbacks queued on this CPU.
 +                      The result will be bounded below by the value of
 +                      the rcutree.blimit kernel parameter.  Every bl
 +                      callbacks, the softirq handler will exit in
 +                      order to allow the CPU to do other work.
 +
 +                      Please note that this callback-invocation batch
 +                      limit applies only to non-offloaded callback
 +                      invocation.  Offloaded callbacks are instead
 +                      invoked in the context of an rcuoc kthread, which
 +                      the scheduler will preempt as it does any other task.
 +
 +      rcutree.nocb_nobypass_lim_per_jiffy= [KNL]
 +                      On callback-offloaded (rcu_nocbs) CPUs,
 +                      RCU reduces the lock contention that would
 +                      otherwise be caused by callback floods through
 +                      use of the ->nocb_bypass list.  However, in the
 +                      common non-flooded case, RCU queues directly to
 +                      the main ->cblist in order to avoid the extra
 +                      overhead of the ->nocb_bypass list and its lock.
 +                      But if there are too many callbacks queued during
 +                      a single jiffy, RCU pre-queues the callbacks into
 +                      the ->nocb_bypass queue.  The definition of "too
 +                      many" is supplied by this kernel boot parameter.
 +
        rcutree.rcu_nocb_gp_stride= [KNL]
                        Set the number of NOCB callback kthreads in
                        each group, which defaults to the square root
                        Speculative Code Execution with Return Instructions)
                        vulnerability.
  
+                       AMD-based UNRET and IBPB mitigations alone do not stop
+                       sibling threads from influencing the predictions of other
+                       sibling threads. For that reason, STIBP is used on pro-
+                       cessors that support it, and mitigate SMT on processors
+                       that don't.
                        off          - no mitigation
                        auto         - automatically select a mitigation
                        auto,nosmt   - automatically select a mitigation,
                                       disabling SMT if necessary for
                                       the full mitigation (only on Zen1
                                       and older without STIBP).
-                       ibpb         - mitigate short speculation windows on
-                                      basic block boundaries too. Safe, highest
-                                      perf impact.
-                       unret        - force enable untrained return thunks,
-                                      only effective on AMD f15h-f17h
-                                      based systems.
-                       unret,nosmt  - like unret, will disable SMT when STIBP
-                                      is not available.
+                       ibpb         - On AMD, mitigate short speculation
+                                      windows on basic block boundaries too.
+                                      Safe, highest perf impact. It also
+                                      enables STIBP if present. Not suitable
+                                      on Intel.
+                       ibpb,nosmt   - Like "ibpb" above but will disable SMT
+                                      when STIBP is not available. This is
+                                      the alternative for systems which do not
+                                      have STIBP.
+                       unret        - Force enable untrained return thunks,
+                                      only effective on AMD f15h-f17h based
+                                      systems.
+                       unret,nosmt  - Like unret, but will disable SMT when STIBP
+                                      is not available. This is the alternative for
+                                      systems which do not have STIBP.
  
                        Selecting 'auto' will choose a mitigation method at run
                        time according to the CPU.
                        cache (risks via metadata attacks are mostly
                        unchanged). Debug options disable merging on their
                        own.
 -                      For more information see Documentation/vm/slub.rst.
 +                      For more information see Documentation/mm/slub.rst.
  
        slab_max_order= [MM, SLAB]
                        Determines the maximum allowed order for slabs.
                        slub_debug can create guard zones around objects and
                        may poison objects when not in use. Also tracks the
                        last alloc / free. For more information see
 -                      Documentation/vm/slub.rst.
 +                      Documentation/mm/slub.rst.
  
        slub_max_order= [MM, SLUB]
                        Determines the maximum allowed order for slabs.
                        A high setting may cause OOMs due to memory
                        fragmentation. For more information see
 -                      Documentation/vm/slub.rst.
 +                      Documentation/mm/slub.rst.
  
        slub_min_objects=       [MM, SLUB]
                        The minimum number of objects per slab. SLUB will
                        the number of objects indicated. The higher the number
                        of objects the smaller the overhead of tracking slabs
                        and the less frequently locks need to be acquired.
 -                      For more information see Documentation/vm/slub.rst.
 +                      For more information see Documentation/mm/slub.rst.
  
        slub_min_order= [MM, SLUB]
                        Determines the minimum page order for slabs. Must be
                        lower than slub_max_order.
 -                      For more information see Documentation/vm/slub.rst.
 +                      For more information see Documentation/mm/slub.rst.
  
        slub_merge      [MM, SLUB]
                        Same with slab_merge.
                        it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst)
  
        swiotlb=        [ARM,IA-64,PPC,MIPS,X86]
 -                      Format: { <int> | force | noforce }
 +                      Format: { <int> [,<int>] | force | noforce }
                        <int> -- Number of I/O TLB slabs
 +                      <int> -- Second integer after comma. Number of swiotlb
 +                               areas with their own lock. Will be rounded up
 +                               to a power of 2.
                        force -- force using of bounce buffers even if they
                                 wouldn't be automatically used by the kernel
                        noforce -- Never use bounce buffers (for debugging)
@@@ -152,7 -152,7 +152,7 @@@ void __init check_bugs(void
        /*
         * spectre_v2_user_select_mitigation() relies on the state set by
         * retbleed_select_mitigation(); specifically the STIBP selection is
-        * forced for UNRET.
+        * forced for UNRET or IBPB.
         */
        spectre_v2_user_select_mitigation();
        ssb_select_mitigation();
@@@ -1179,7 -1179,8 +1179,8 @@@ spectre_v2_user_select_mitigation(void
            boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON))
                mode = SPECTRE_V2_USER_STRICT_PREFERRED;
  
-       if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET) {
+       if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET ||
+           retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
                if (mode != SPECTRE_V2_USER_STRICT &&
                    mode != SPECTRE_V2_USER_STRICT_PREFERRED)
                        pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n");
@@@ -1335,53 -1336,6 +1336,53 @@@ static void __init spec_ctrl_disable_ke
        }
  }
  
 +static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode)
 +{
 +      /*
 +       * Similar to context switches, there are two types of RSB attacks
 +       * after VM exit:
 +       *
 +       * 1) RSB underflow
 +       *
 +       * 2) Poisoned RSB entry
 +       *
 +       * When retpoline is enabled, both are mitigated by filling/clearing
 +       * the RSB.
 +       *
 +       * When IBRS is enabled, while #1 would be mitigated by the IBRS branch
 +       * prediction isolation protections, RSB still needs to be cleared
 +       * because of #2.  Note that SMEP provides no protection here, unlike
 +       * user-space-poisoned RSB entries.
 +       *
 +       * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB
 +       * bug is present then a LITE version of RSB protection is required,
 +       * just a single call needs to retire before a RET is executed.
 +       */
 +      switch (mode) {
 +      case SPECTRE_V2_NONE:
 +              return;
 +
 +      case SPECTRE_V2_EIBRS_LFENCE:
 +      case SPECTRE_V2_EIBRS:
 +              if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
 +                      setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE);
 +                      pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n");
 +              }
 +              return;
 +
 +      case SPECTRE_V2_EIBRS_RETPOLINE:
 +      case SPECTRE_V2_RETPOLINE:
 +      case SPECTRE_V2_LFENCE:
 +      case SPECTRE_V2_IBRS:
 +              setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
 +              pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n");
 +              return;
 +      }
 +
 +      pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit");
 +      dump_stack();
 +}
 +
  static void __init spectre_v2_select_mitigation(void)
  {
        enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
        setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW);
        pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n");
  
 -      /*
 -       * Similar to context switches, there are two types of RSB attacks
 -       * after vmexit:
 -       *
 -       * 1) RSB underflow
 -       *
 -       * 2) Poisoned RSB entry
 -       *
 -       * When retpoline is enabled, both are mitigated by filling/clearing
 -       * the RSB.
 -       *
 -       * When IBRS is enabled, while #1 would be mitigated by the IBRS branch
 -       * prediction isolation protections, RSB still needs to be cleared
 -       * because of #2.  Note that SMEP provides no protection here, unlike
 -       * user-space-poisoned RSB entries.
 -       *
 -       * eIBRS, on the other hand, has RSB-poisoning protections, so it
 -       * doesn't need RSB clearing after vmexit.
 -       */
 -      if (boot_cpu_has(X86_FEATURE_RETPOLINE) ||
 -          boot_cpu_has(X86_FEATURE_KERNEL_IBRS))
 -              setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT);
 +      spectre_v2_determine_rsb_fill_type_at_vmexit(mode);
  
        /*
         * Retpoline protects the kernel, but doesn't protect firmware.  IBRS
@@@ -2318,19 -2293,6 +2319,19 @@@ static char *ibpb_state(void
        return "";
  }
  
 +static char *pbrsb_eibrs_state(void)
 +{
 +      if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) {
 +              if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) ||
 +                  boot_cpu_has(X86_FEATURE_RSB_VMEXIT))
 +                      return ", PBRSB-eIBRS: SW sequence";
 +              else
 +                      return ", PBRSB-eIBRS: Vulnerable";
 +      } else {
 +              return ", PBRSB-eIBRS: Not affected";
 +      }
 +}
 +
  static ssize_t spectre_v2_show_state(char *buf)
  {
        if (spectre_v2_enabled == SPECTRE_V2_LFENCE)
            spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE)
                return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n");
  
 -      return sprintf(buf, "%s%s%s%s%s%s\n",
 +      return sprintf(buf, "%s%s%s%s%s%s%s\n",
                       spectre_v2_strings[spectre_v2_enabled],
                       ibpb_state(),
                       boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "",
                       stibp_state(),
                       boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "",
 +                     pbrsb_eibrs_state(),
                       spectre_v2_module_string());
  }
  
@@@ -2360,10 -2321,11 +2361,11 @@@ static ssize_t srbds_show_state(char *b
  
  static ssize_t retbleed_show_state(char *buf)
  {
-       if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET) {
+       if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET ||
+           retbleed_mitigation == RETBLEED_MITIGATION_IBPB) {
            if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD &&
                boot_cpu_data.x86_vendor != X86_VENDOR_HYGON)
-                   return sprintf(buf, "Vulnerable: untrained return thunk on non-Zen uarch\n");
+                   return sprintf(buf, "Vulnerable: untrained return thunk / IBPB on non-AMD based uarch\n");
  
            return sprintf(buf, "%s; SMT %s\n",
                           retbleed_strings[retbleed_mitigation],