Merge tag 'locking-core-2023-06-27' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 27 Jun 2023 21:14:30 +0000 (14:14 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 27 Jun 2023 21:14:30 +0000 (14:14 -0700)
diff --combined arch/x86/kernel/alternative.c

index a7e1ec5,18f16e9..72646d7
--- 1/arch/x86/kernel/alternative.c
--- 2/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@@ -37,23 -37,11 +37,23 @@@ EXPORT_SYMBOL_GPL(alternatives_patched)
   
   #define MAX_PATCH_LEN (255-1)
   
- -static int __initdata_or_module debug_alternative;
+ +#define DA_ALL                (~0)
+ +#define DA_ALT                0x01
+ +#define DA_RET                0x02
+ +#define DA_RETPOLINE  0x04
+ +#define DA_ENDBR      0x08
+ +#define DA_SMP                0x10
+ +
+ +static unsigned int __initdata_or_module debug_alternative;
   
   static int __init debug_alt(char *str)
   {
- -      debug_alternative = 1;
+ +      if (str && *str == '=')
+ +              str++;
+ +
+ +      if (!str || kstrtouint(str, 0, &debug_alternative))
+ +              debug_alternative = DA_ALL;
+ +
         return 1;
   }
   __setup("debug-alternative", debug_alt);
@@@ -67,15 -55,15 +67,15 @@@ static int __init setup_noreplace_smp(c
   }
   __setup("noreplace-smp", setup_noreplace_smp);
   
- -#define DPRINTK(fmt, args...)                                         \
+ +#define DPRINTK(type, fmt, args...)                                   \
   do {                                                                  \
- -      if (debug_alternative)                                          \
+ +      if (debug_alternative & DA_##type)                              \
                 printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args);            \
   } while (0)
   
- -#define DUMP_BYTES(buf, len, fmt, args...)                            \
+ +#define DUMP_BYTES(type, buf, len, fmt, args...)                      \
   do {                                                                  \
- -      if (unlikely(debug_alternative)) {                              \
+ +      if (unlikely(debug_alternative & DA_##type)) {                  \
                 int j;                                                  \
                                                                         \
                 if (!(len))                                             \
@@@ -98,11 -86,6 +98,11 @@@ static const unsigned char x86nops[] 
         BYTES_NOP6,
         BYTES_NOP7,
         BYTES_NOP8,
+ +#ifdef CONFIG_64BIT
+ +      BYTES_NOP9,
+ +      BYTES_NOP10,
+ +      BYTES_NOP11,
+ +#endif
   };
   
   const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
@@@ -116,44 -99,19 +116,44 @@@
         x86nops + 1 + 2 + 3 + 4 + 5,
         x86nops + 1 + 2 + 3 + 4 + 5 + 6,
         x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+ +#ifdef CONFIG_64BIT
+ +      x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
+ +      x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9,
+ +      x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10,
+ +#endif
   };
   
- -/* Use this to add nops to a buffer, then text_poke the whole buffer. */
- -static void __init_or_module add_nops(void *insns, unsigned int len)
+ +/*
+ + * Fill the buffer with a single effective instruction of size @len.
+ + *
+ + * In order not to issue an ORC stack depth tracking CFI entry (Call Frame Info)
+ + * for every single-byte NOP, try to generate the maximally available NOP of
+ + * size <= ASM_NOP_MAX such that only a single CFI entry is generated (vs one for
+ + * each single-byte NOP). If @len to fill out is > ASM_NOP_MAX, pad with INT3 and
+ + * *jump* over instead of executing long and daft NOPs.
+ + */
+ +static void __init_or_module add_nop(u8 *instr, unsigned int len)
   {
- -      while (len > 0) {
- -              unsigned int noplen = len;
- -              if (noplen > ASM_NOP_MAX)
- -                      noplen = ASM_NOP_MAX;
- -              memcpy(insns, x86_nops[noplen], noplen);
- -              insns += noplen;
- -              len -= noplen;
+ +      u8 *target = instr + len;
+ +
+ +      if (!len)
+ +              return;
+ +
+ +      if (len <= ASM_NOP_MAX) {
+ +              memcpy(instr, x86_nops[len], len);
+ +              return;
+ +      }
+ +
+ +      if (len < 128) {
+ +              __text_gen_insn(instr, JMP8_INSN_OPCODE, instr, target, JMP8_INSN_SIZE);
+ +              instr += JMP8_INSN_SIZE;
+ +      } else {
+ +              __text_gen_insn(instr, JMP32_INSN_OPCODE, instr, target, JMP32_INSN_SIZE);
+ +              instr += JMP32_INSN_SIZE;
         }
+ +
+ +      for (;instr < target; instr++)
+ +              *instr = INT3_INSN_OPCODE;
   }
   
   extern s32 __retpoline_sites[], __retpoline_sites_end[];
@@@ -165,223 -123,133 +165,223 @@@ extern s32 __smp_locks[], __smp_locks_e
   void text_poke_early(void *addr, const void *opcode, size_t len);
   
   /*
- - * Are we looking at a near JMP with a 1 or 4-byte displacement.
+ + * Matches NOP and NOPL, not any of the other possible NOPs.
    */
- -static inline bool is_jmp(const u8 opcode)
+ +static bool insn_is_nop(struct insn *insn)
   {
- -      return opcode == 0xeb || opcode == 0xe9;
+ +      /* Anything NOP, but no REP NOP */
+ +      if (insn->opcode.bytes[0] == 0x90 &&
+ +          (!insn->prefixes.nbytes || insn->prefixes.bytes[0] != 0xF3))
+ +              return true;
+ +
+ +      /* NOPL */
+ +      if (insn->opcode.bytes[0] == 0x0F && insn->opcode.bytes[1] == 0x1F)
+ +              return true;
+ +
+ +      /* TODO: more nops */
+ +
+ +      return false;
   }
   
- -static void __init_or_module
- -recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
+ +/*
+ + * Find the offset of the first non-NOP instruction starting at @offset
+ + * but no further than @len.
+ + */
+ +static int skip_nops(u8 *instr, int offset, int len)
   {
- -      u8 *next_rip, *tgt_rip;
- -      s32 n_dspl, o_dspl;
- -      int repl_len;
+ +      struct insn insn;
   
- -      if (a->replacementlen != 5)
- -              return;
+ +      for (; offset < len; offset += insn.length) {
+ +              if (insn_decode_kernel(&insn, &instr[offset]))
+ +                      break;
   
- -      o_dspl = *(s32 *)(insn_buff + 1);
+ +              if (!insn_is_nop(&insn))
+ +                      break;
+ +      }
   
- -      /* next_rip of the replacement JMP */
- -      next_rip = repl_insn + a->replacementlen;
- -      /* target rip of the replacement JMP */
- -      tgt_rip  = next_rip + o_dspl;
- -      n_dspl = tgt_rip - orig_insn;
+ +      return offset;
+ +}
   
- -      DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
+ +/*
+ + * Optimize a sequence of NOPs, possibly preceded by an unconditional jump
+ + * to the end of the NOP sequence into a single NOP.
+ + */
+ +static bool __init_or_module
+ +__optimize_nops(u8 *instr, size_t len, struct insn *insn, int *next, int *prev, int *target)
+ +{
+ +      int i = *next - insn->length;
   
- -      if (tgt_rip - orig_insn >= 0) {
- -              if (n_dspl - 2 <= 127)
- -                      goto two_byte_jmp;
- -              else
- -                      goto five_byte_jmp;
- -      /* negative offset */
- -      } else {
- -              if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
- -                      goto two_byte_jmp;
- -              else
- -                      goto five_byte_jmp;
+ +      switch (insn->opcode.bytes[0]) {
+ +      case JMP8_INSN_OPCODE:
+ +      case JMP32_INSN_OPCODE:
+ +              *prev = i;
+ +              *target = *next + insn->immediate.value;
+ +              return false;
         }
   
- -two_byte_jmp:
- -      n_dspl -= 2;
+ +      if (insn_is_nop(insn)) {
+ +              int nop = i;
   
- -      insn_buff[0] = 0xeb;
- -      insn_buff[1] = (s8)n_dspl;
- -      add_nops(insn_buff + 2, 3);
+ +              *next = skip_nops(instr, *next, len);
+ +              if (*target && *next == *target)
+ +                      nop = *prev;
   
- -      repl_len = 2;
- -      goto done;
+ +              add_nop(instr + nop, *next - nop);
+ +              DUMP_BYTES(ALT, instr, len, "%px: [%d:%d) optimized NOPs: ", instr, nop, *next);
+ +              return true;
+ +      }
   
- -five_byte_jmp:
- -      n_dspl -= 5;
+ +      *target = 0;
+ +      return false;
+ +}
   
- -      insn_buff[0] = 0xe9;
- -      *(s32 *)&insn_buff[1] = n_dspl;
+ +/*
+ + * "noinline" to cause control flow change and thus invalidate I$ and
+ + * cause refetch after modification.
+ + */
+ +static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
+ +{
+ +      int prev, target = 0;
   
- -      repl_len = 5;
+ +      for (int next, i = 0; i < len; i = next) {
+ +              struct insn insn;
   
- -done:
+ +              if (insn_decode_kernel(&insn, &instr[i]))
+ +                      return;
   
- -      DPRINTK("final displ: 0x%08x, JMP 0x%lx",
- -              n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
+ +              next = i + insn.length;
+ +
+ +              __optimize_nops(instr, len, &insn, &next, &prev, &target);
+ +      }
   }
   
   /*
- - * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
+ + * In this context, "source" is where the instructions are placed in the
+ + * section .altinstr_replacement, for example during kernel build by the
+ + * toolchain.
+ + * "Destination" is where the instructions are being patched in by this
+ + * machinery.
+ + *
+ + * The source offset is:
+ + *
+ + *   src_imm = target - src_next_ip                  (1)
    *
- - * @instr: instruction byte stream
- - * @instrlen: length of the above
- - * @off: offset within @instr where the first NOP has been detected
+ + * and the target offset is:
    *
- - * Return: number of NOPs found (and replaced).
+ + *   dst_imm = target - dst_next_ip                  (2)
+ + *
+ + * so rework (1) as an expression for target like:
+ + *
+ + *   target = src_imm + src_next_ip                  (1a)
+ + *
+ + * and substitute in (2) to get:
+ + *
+ + *   dst_imm = (src_imm + src_next_ip) - dst_next_ip (3)
+ + *
+ + * Now, since the instruction stream is 'identical' at src and dst (it
+ + * is being copied after all) it can be stated that:
+ + *
+ + *   src_next_ip = src + ip_offset
+ + *   dst_next_ip = dst + ip_offset                   (4)
+ + *
+ + * Substitute (4) in (3) and observe ip_offset being cancelled out to
+ + * obtain:
+ + *
+ + *   dst_imm = src_imm + (src + ip_offset) - (dst + ip_offset)
+ + *           = src_imm + src - dst + ip_offset - ip_offset
+ + *           = src_imm + src - dst                   (5)
+ + *
+ + * IOW, only the relative displacement of the code block matters.
    */
- -static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
- -{
- -      unsigned long flags;
- -      int i = off, nnops;
   
- -      while (i < instrlen) {
- -              if (instr[i] != 0x90)
- -                      break;
+ +#define apply_reloc_n(n_, p_, d_)                             \
+ +      do {                                                    \
+ +              s32 v = *(s##n_ *)(p_);                         \
+ +              v += (d_);                                      \
+ +              BUG_ON((v >> 31) != (v >> (n_-1)));             \
+ +              *(s##n_ *)(p_) = (s##n_)v;                      \
+ +      } while (0)
+ +
   
- -              i++;
+ +static __always_inline
+ +void apply_reloc(int n, void *ptr, uintptr_t diff)
+ +{
+ +      switch (n) {
+ +      case 1: apply_reloc_n(8, ptr, diff); break;
+ +      case 2: apply_reloc_n(16, ptr, diff); break;
+ +      case 4: apply_reloc_n(32, ptr, diff); break;
+ +      default: BUG();
         }
+ +}
   
- -      nnops = i - off;
+ +static __always_inline
+ +bool need_reloc(unsigned long offset, u8 *src, size_t src_len)
+ +{
+ +      u8 *target = src + offset;
+ +      /*
+ +       * If the target is inside the patched block, it's relative to the
+ +       * block itself and does not need relocation.
+ +       */
+ +      return (target < src || target > src + src_len);
+ +}
   
- -      if (nnops <= 1)
- -              return nnops;
+ +static void __init_or_module noinline
+ +apply_relocation(u8 *buf, size_t len, u8 *dest, u8 *src, size_t src_len)
+ +{
+ +      int prev, target = 0;
   
- -      local_irq_save(flags);
- -      add_nops(instr + off, nnops);
- -      local_irq_restore(flags);
+ +      for (int next, i = 0; i < len; i = next) {
+ +              struct insn insn;
   
- -      DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
+ +              if (WARN_ON_ONCE(insn_decode_kernel(&insn, &buf[i])))
+ +                      return;
   
- -      return nnops;
- -}
+ +              next = i + insn.length;
   
- -/*
- - * "noinline" to cause control flow change and thus invalidate I$ and
- - * cause refetch after modification.
- - */
- -static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
- -{
- -      struct insn insn;
- -      int i = 0;
+ +              if (__optimize_nops(buf, len, &insn, &next, &prev, &target))
+ +                      continue;
   
- -      /*
- -       * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
- -       * ones.
- -       */
- -      for (;;) {
- -              if (insn_decode_kernel(&insn, &instr[i]))
- -                      return;
+ +              switch (insn.opcode.bytes[0]) {
+ +              case 0x0f:
+ +                      if (insn.opcode.bytes[1] < 0x80 ||
+ +                          insn.opcode.bytes[1] > 0x8f)
+ +                              break;
   
- -              /*
- -               * See if this and any potentially following NOPs can be
- -               * optimized.
- -               */
- -              if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
- -                      i += optimize_nops_range(instr, len, i);
- -              else
- -                      i += insn.length;
+ +                      fallthrough;    /* Jcc.d32 */
+ +              case 0x70 ... 0x7f:     /* Jcc.d8 */
+ +              case JMP8_INSN_OPCODE:
+ +              case JMP32_INSN_OPCODE:
+ +              case CALL_INSN_OPCODE:
+ +                      if (need_reloc(next + insn.immediate.value, src, src_len)) {
+ +                              apply_reloc(insn.immediate.nbytes,
+ +                                          buf + i + insn_offset_immediate(&insn),
+ +                                          src - dest);
+ +                      }
+ +
+ +                      /*
+ +                       * Where possible, convert JMP.d32 into JMP.d8.
+ +                       */
+ +                      if (insn.opcode.bytes[0] == JMP32_INSN_OPCODE) {
+ +                              s32 imm = insn.immediate.value;
+ +                              imm += src - dest;
+ +                              imm += JMP32_INSN_SIZE - JMP8_INSN_SIZE;
+ +                              if ((imm >> 31) == (imm >> 7)) {
+ +                                      buf[i+0] = JMP8_INSN_OPCODE;
+ +                                      buf[i+1] = (s8)imm;
+ +
+ +                                      memset(&buf[i+2], INT3_INSN_OPCODE, insn.length - 2);
+ +                              }
+ +                      }
+ +                      break;
+ +              }
   
- -              if (i >= len)
- -                      return;
+ +              if (insn_rip_relative(&insn)) {
+ +                      if (need_reloc(next + insn.displacement.value, src, src_len)) {
+ +                              apply_reloc(insn.displacement.nbytes,
+ +                                          buf + i + insn_offset_displacement(&insn),
+ +                                          src - dest);
+ +                      }
+ +              }
         }
   }
   
@@@ -402,7 -270,7 +402,7 @@@ void __init_or_module noinline apply_al
         u8 *instr, *replacement;
         u8 insn_buff[MAX_PATCH_LEN];
   
- -      DPRINTK("alt table %px, -> %px", start, end);
+ +      DPRINTK(ALT, "alt table %px, -> %px", start, end);
         /*
          * The scan order should be from start to end. A later scanned
          * alternative code can overwrite previously scanned alternative code.
@@@ -426,31 -294,47 +426,31 @@@
                  * - feature not present but ALT_FLAG_NOT is set to mean,
                  *   patch if feature is *NOT* present.
                  */
- -              if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT))
- -                      goto next;
+ +              if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
+ +                      optimize_nops(instr, a->instrlen);
+ +                      continue;
+ +              }
   
- -              DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
+ +              DPRINTK(ALT, "feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
                         (a->flags & ALT_FLAG_NOT) ? "!" : "",
                         a->cpuid >> 5,
                         a->cpuid & 0x1f,
                         instr, instr, a->instrlen,
                         replacement, a->replacementlen);
   
- -              DUMP_BYTES(instr, a->instrlen, "%px:   old_insn: ", instr);
- -              DUMP_BYTES(replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
- -
                 memcpy(insn_buff, replacement, a->replacementlen);
                 insn_buff_sz = a->replacementlen;
   
- -              /*
- -               * 0xe8 is a relative jump; fix the offset.
- -               *
- -               * Instruction length is checked before the opcode to avoid
- -               * accessing uninitialized bytes for zero-length replacements.
- -               */
- -              if (a->replacementlen == 5 && *insn_buff == 0xe8) {
- -                      *(s32 *)(insn_buff + 1) += replacement - instr;
- -                      DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
- -                              *(s32 *)(insn_buff + 1),
- -                              (unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
- -              }
- -
- -              if (a->replacementlen && is_jmp(replacement[0]))
- -                      recompute_jump(a, instr, replacement, insn_buff);
- -
                 for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
                         insn_buff[insn_buff_sz] = 0x90;
   
- -              DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
+ +              apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen);
   
- -              text_poke_early(instr, insn_buff, insn_buff_sz);
+ +              DUMP_BYTES(ALT, instr, a->instrlen, "%px:   old_insn: ", instr);
+ +              DUMP_BYTES(ALT, replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
+ +              DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
   
- -next:
- -              optimize_nops(instr, a->instrlen);
+ +              text_poke_early(instr, insn_buff, insn_buff_sz);
         }
   }
   
@@@ -671,15 -555,15 +671,15 @@@ void __init_or_module noinline apply_re
                         continue;
                 }
   
- -              DPRINTK("retpoline at: %pS (%px) len: %d to: %pS",
+ +              DPRINTK(RETPOLINE, "retpoline at: %pS (%px) len: %d to: %pS",
                         addr, addr, insn.length,
                         addr + insn.length + insn.immediate.value);
   
                 len = patch_retpoline(addr, &insn, bytes);
                 if (len == insn.length) {
                         optimize_nops(bytes, len);
- -                      DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
- -                      DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
+ +                      DUMP_BYTES(RETPOLINE, ((u8*)addr),  len, "%px: orig: ", addr);
+ +                      DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr);
                         text_poke_early(addr, bytes, len);
                 }
         }
@@@ -706,12 -590,13 +706,12 @@@ static int patch_return(void *addr, str
   {
         int i = 0;
   
+ +      /* Patch the custom return thunks... */
         if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
- -              if (x86_return_thunk == __x86_return_thunk)
- -                      return -1;
- -
                 i = JMP32_INSN_SIZE;
                 __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
         } else {
+ +              /* ... or patch them out if not needed. */
                 bytes[i++] = RET_INSN_OPCODE;
         }
   
@@@ -724,14 -609,6 +724,14 @@@ void __init_or_module noinline apply_re
   {
         s32 *s;
   
+ +      /*
+ +       * Do not patch out the default return thunks if those needed are the
+ +       * ones generated by the compiler.
+ +       */
+ +      if (cpu_feature_enabled(X86_FEATURE_RETHUNK) &&
+ +          (x86_return_thunk == __x86_return_thunk))
+ +              return;
+ +
         for (s = start; s < end; s++) {
                 void *dest = NULL, *addr = (void *)s + *s;
                 struct insn insn;
@@@ -753,14 -630,14 +753,14 @@@
                               addr, dest, 5, addr))
                         continue;
   
- -              DPRINTK("return thunk at: %pS (%px) len: %d to: %pS",
+ +              DPRINTK(RET, "return thunk at: %pS (%px) len: %d to: %pS",
                         addr, addr, insn.length,
                         addr + insn.length + insn.immediate.value);
   
                 len = patch_return(addr, &insn, bytes);
                 if (len == insn.length) {
- -                      DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
- -                      DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
+ +                      DUMP_BYTES(RET, ((u8*)addr),  len, "%px: orig: ", addr);
+ +                      DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr);
                         text_poke_early(addr, bytes, len);
                 }
         }
@@@ -778,7 -655,7 +778,7 @@@ void __init_or_module noinline apply_re
   
   #ifdef CONFIG_X86_KERNEL_IBT
   
- -static void poison_endbr(void *addr, bool warn)
+ +static void __init_or_module poison_endbr(void *addr, bool warn)
   {
         u32 endbr, poison = gen_endbr_poison();
   
@@@ -790,13 -667,13 +790,13 @@@
                 return;
         }
   
- -      DPRINTK("ENDBR at: %pS (%px)", addr, addr);
+ +      DPRINTK(ENDBR, "ENDBR at: %pS (%px)", addr, addr);
   
         /*
          * When we have IBT, the lack of ENDBR will trigger #CP
          */
- -      DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
- -      DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
+ +      DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr);
+ +      DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr);
         text_poke_early(addr, &poison, 4);
   }
   
@@@ -1271,7 -1148,7 +1271,7 @@@ void __init_or_module alternatives_smp_
         smp->locks_end  = locks_end;
         smp->text       = text;
         smp->text_end   = text_end;
- -      DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
+ +      DPRINTK(SMP, "locks %p -> %p, text %p -> %p, name %s\n",
                 smp->locks, smp->locks_end,
                 smp->text, smp->text_end, smp->name);
   
@@@ -1348,20 -1225,6 +1348,20 @@@ int alternatives_text_reserved(void *st
   #endif /* CONFIG_SMP */
   
   #ifdef CONFIG_PARAVIRT
+ +
+ +/* Use this to add nops to a buffer, then text_poke the whole buffer. */
+ +static void __init_or_module add_nops(void *insns, unsigned int len)
+ +{
+ +      while (len > 0) {
+ +              unsigned int noplen = len;
+ +              if (noplen > ASM_NOP_MAX)
+ +                      noplen = ASM_NOP_MAX;
+ +              memcpy(insns, x86_nops[noplen], noplen);
+ +              insns += noplen;
+ +              len -= noplen;
+ +      }
+ +}
+ +
   void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
                                      struct paravirt_patch_site *end)
   {
@@@ -1469,35 -1332,6 +1469,35 @@@ static noinline void __init int3_selfte
         unregister_die_notifier(&int3_exception_nb);
   }
   
+ +static __initdata int __alt_reloc_selftest_addr;
+ +
+ +__visible noinline void __init __alt_reloc_selftest(void *arg)
+ +{
+ +      WARN_ON(arg != &__alt_reloc_selftest_addr);
+ +}
+ +
+ +static noinline void __init alt_reloc_selftest(void)
+ +{
+ +      /*
+ +       * Tests apply_relocation().
+ +       *
+ +       * This has a relative immediate (CALL) in a place other than the first
+ +       * instruction and additionally on x86_64 we get a RIP-relative LEA:
+ +       *
+ +       *   lea    0x0(%rip),%rdi  # 5d0: R_X86_64_PC32    .init.data+0x5566c
+ +       *   call   +0              # 5d5: R_X86_64_PLT32   __alt_reloc_selftest-0x4
+ +       *
+ +       * Getting this wrong will either crash and burn or tickle the WARN
+ +       * above.
+ +       */
+ +      asm_inline volatile (
+ +              ALTERNATIVE("", "lea %[mem], %%" _ASM_ARG1 "; call __alt_reloc_selftest;", X86_FEATURE_ALWAYS)
+ +              : /* output */
+ +              : [mem] "m" (__alt_reloc_selftest_addr)
+ +              : _ASM_ARG1
+ +      );
+ +}
+ +
   void __init alternative_instructions(void)
   {
         int3_selftest();
@@@ -1585,8 -1419,6 +1585,8 @@@
   
         restart_nmi();
         alternatives_patched = 1;
+ +
+ +      alt_reloc_selftest();
   }
   
   /**
@@@ -1967,7 -1799,7 +1967,7 @@@ struct bp_patching_desc *try_get_desc(v
   {
         struct bp_patching_desc *desc = &bp_desc;
   
-       if (!arch_atomic_inc_not_zero(&desc->refs))
+       if (!raw_atomic_inc_not_zero(&desc->refs))
                 return NULL;
   
         return desc;
@@@ -1978,7 -1810,7 +1978,7 @@@ static __always_inline void put_desc(vo
         struct bp_patching_desc *desc = &bp_desc;
   
         smp_mb__before_atomic();
-       arch_atomic_dec(&desc->refs);
+       raw_atomic_dec(&desc->refs);
   }
   
   static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
@@@ -2122,16 -1954,6 +2122,16 @@@ static void text_poke_bp_batch(struct t
         atomic_set_release(&bp_desc.refs, 1);
   
         /*
+ +       * Function tracing can enable thousands of places that need to be
+ +       * updated. This can take quite some time, and with full kernel debugging
+ +       * enabled, this could cause the softlockup watchdog to trigger.
+ +       * This function gets called every 256 entries added to be patched.
+ +       * Call cond_resched() here to make sure that other tasks can get scheduled
+ +       * while processing all the functions being patched.
+ +       */
+ +      cond_resched();
+ +
+ +      /*
          * Corresponding read barrier in int3 notifier for making sure the
          * nr_entries and handler are correctly ordered wrt. patching.
          */
diff --combined arch/x86/kernel/cpu/mce/core.c

index 22dfcb2,ab156e6..89e2aab
--- 1/arch/x86/kernel/cpu/mce/core.c
--- 2/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@@ -1022,12 -1022,12 +1022,12 @@@ static noinstr int mce_start(int *no_wa
         if (!timeout)
                 return ret;
   
-       arch_atomic_add(*no_way_out, &global_nwo);
+       raw_atomic_add(*no_way_out, &global_nwo);
         /*
          * Rely on the implied barrier below, such that global_nwo
          * is updated before mce_callin.
          */
-       order = arch_atomic_inc_return(&mce_callin);
+       order = raw_atomic_inc_return(&mce_callin);
         arch_cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
   
         /* Enable instrumentation around calls to external facilities */
@@@ -1036,10 -1036,10 +1036,10 @@@
         /*
          * Wait for everyone.
          */
-       while (arch_atomic_read(&mce_callin) != num_online_cpus()) {
+       while (raw_atomic_read(&mce_callin) != num_online_cpus()) {
                 if (mce_timed_out(&timeout,
                                   "Timeout: Not all CPUs entered broadcast exception handler")) {
-                       arch_atomic_set(&global_nwo, 0);
+                       raw_atomic_set(&global_nwo, 0);
                         goto out;
                 }
                 ndelay(SPINUNIT);
@@@ -1054,7 -1054,7 +1054,7 @@@
                 /*
                  * Monarch: Starts executing now, the others wait.
                  */
-               arch_atomic_set(&mce_executing, 1);
+               raw_atomic_set(&mce_executing, 1);
         } else {
                 /*
                  * Subject: Now start the scanning loop one by one in
@@@ -1062,10 -1062,10 +1062,10 @@@
                  * This way when there are any shared banks it will be
                  * only seen by one CPU before cleared, avoiding duplicates.
                  */
-               while (arch_atomic_read(&mce_executing) < order) {
+               while (raw_atomic_read(&mce_executing) < order) {
                         if (mce_timed_out(&timeout,
                                           "Timeout: Subject CPUs unable to finish machine check processing")) {
-                               arch_atomic_set(&global_nwo, 0);
+                               raw_atomic_set(&global_nwo, 0);
                                 goto out;
                         }
                         ndelay(SPINUNIT);
@@@ -1075,7 -1075,7 +1075,7 @@@
         /*
          * Cache the global no_way_out state.
          */
-       *no_way_out = arch_atomic_read(&global_nwo);
+       *no_way_out = raw_atomic_read(&global_nwo);
   
         ret = order;
   
@@@ -1533,7 -1533,7 +1533,7 @@@ noinstr void do_machine_check(struct pt
                 /* If this triggers there is no way to recover. Die hard. */
                 BUG_ON(!on_thread_stack() || !user_mode(regs));
   
- -              if (kill_current_task)
+ +              if (!mce_usable_address(&m))
                         queue_task_work(&m, msg, kill_me_now);
                 else
                         queue_task_work(&m, msg, kill_me_maybe);
diff --combined arch/x86/kvm/x86.c

index bc68a39,ac6f609..7f70207
--- 1/arch/x86/kvm/x86.c
--- 2/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@@ -1446,7 -1446,7 +1446,7 @@@ static const u32 msrs_to_save_base[] = 
   #endif
         MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
         MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
- -      MSR_IA32_SPEC_CTRL,
+ +      MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
         MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
         MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
         MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
@@@ -2799,13 -2799,14 +2799,13 @@@ static u64 read_tsc(void
   static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
                           int *mode)
   {
- -      long v;
         u64 tsc_pg_val;
+ +      long v;
   
         switch (clock->vclock_mode) {
         case VDSO_CLOCKMODE_HVCLOCK:
- -              tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
- -                                                tsc_timestamp);
- -              if (tsc_pg_val != U64_MAX) {
+ +              if (hv_read_tsc_page_tsc(hv_get_tsc_page(),
+ +                                       tsc_timestamp, &tsc_pg_val)) {
                         /* TSC page valid */
                         *mode = VDSO_CLOCKMODE_HVCLOCK;
                         v = (tsc_pg_val - clock->cycle_last) &
@@@ -7154,10 -7155,6 +7154,10 @@@ static void kvm_probe_msr_to_save(u32 m
                 if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
                         return;
                 break;
+ +      case MSR_IA32_TSX_CTRL:
+ +              if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR))
+ +                      return;
+ +              break;
         default:
                 break;
         }
@@@ -10757,9 -10754,6 +10757,9 @@@ static int vcpu_enter_guest(struct kvm_
                         exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
                         break;
                 }
+ +
+ +              /* Note, VM-Exits that go down the "slow" path are accounted below. */
+ +              ++vcpu->stat.exits;
         }
   
         /*
@@@ -13161,7 -13155,7 +13161,7 @@@ EXPORT_SYMBOL_GPL(kvm_arch_end_assignme
   
   bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
   {
-       return arch_atomic_read(&kvm->arch.assigned_device_count);
+       return raw_atomic_read(&kvm->arch.assigned_device_count);
   }
   EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
   
diff --combined drivers/iommu/amd/iommu.c

index e8a2e59,1e9f85e..9ea4096
--- 1/drivers/iommu/amd/iommu.c
--- 2/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@@ -845,7 -845,6 +845,7 @@@ amd_iommu_set_pci_msi_domain(struct dev
         (MMIO_STATUS_EVT_OVERFLOW_INT_MASK | \
          MMIO_STATUS_EVT_INT_MASK | \
          MMIO_STATUS_PPR_INT_MASK | \
+ +       MMIO_STATUS_GALOG_OVERFLOW_MASK | \
          MMIO_STATUS_GALOG_INT_MASK)
   
   irqreturn_t amd_iommu_int_thread(int irq, void *data)
@@@ -869,16 -868,10 +869,16 @@@
                 }
   
   #ifdef CONFIG_IRQ_REMAP
- -              if (status & MMIO_STATUS_GALOG_INT_MASK) {
+ +              if (status & (MMIO_STATUS_GALOG_INT_MASK |
+ +                            MMIO_STATUS_GALOG_OVERFLOW_MASK)) {
                         pr_devel("Processing IOMMU GA Log\n");
                         iommu_poll_ga_log(iommu);
                 }
+ +
+ +              if (status & MMIO_STATUS_GALOG_OVERFLOW_MASK) {
+ +                      pr_info_ratelimited("IOMMU GA Log overflow\n");
+ +                      amd_iommu_restart_ga_log(iommu);
+ +              }
   #endif
   
                 if (status & MMIO_STATUS_EVT_OVERFLOW_INT_MASK) {
@@@ -2074,10 -2067,14 +2074,10 @@@ static struct protection_domain *protec
   {
         struct io_pgtable_ops *pgtbl_ops;
         struct protection_domain *domain;
- -      int pgtable = amd_iommu_pgtable;
+ +      int pgtable;
         int mode = DEFAULT_PGTABLE_LEVEL;
         int ret;
   
- -      domain = kzalloc(sizeof(*domain), GFP_KERNEL);
- -      if (!domain)
- -              return NULL;
- -
         /*
          * Force IOMMU v1 page table when iommu=pt and
          * when allocating domain for pass-through devices.
@@@ -2087,16 -2084,8 +2087,16 @@@
                 mode = PAGE_MODE_NONE;
         } else if (type == IOMMU_DOMAIN_UNMANAGED) {
                 pgtable = AMD_IOMMU_V1;
+ +      } else if (type == IOMMU_DOMAIN_DMA || type == IOMMU_DOMAIN_DMA_FQ) {
+ +              pgtable = amd_iommu_pgtable;
+ +      } else {
+ +              return NULL;
         }
   
+ +      domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+ +      if (!domain)
+ +              return NULL;
+ +
         switch (pgtable) {
         case AMD_IOMMU_V1:
                 ret = protection_domain_init_v1(domain, mode);
@@@ -2129,15 -2118,6 +2129,15 @@@ out_err
         return NULL;
   }
   
+ +static inline u64 dma_max_address(void)
+ +{
+ +      if (amd_iommu_pgtable == AMD_IOMMU_V1)
+ +              return ~0ULL;
+ +
+ +      /* V2 with 4/5 level page table */
+ +      return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
+ +}
+ +
   static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
   {
         struct protection_domain *domain;
@@@ -2154,7 -2134,7 +2154,7 @@@
                 return NULL;
   
         domain->domain.geometry.aperture_start = 0;
- -      domain->domain.geometry.aperture_end   = ~0ULL;
+ +      domain->domain.geometry.aperture_end   = dma_max_address();
         domain->domain.geometry.force_aperture = true;
   
         return &domain->domain;
@@@ -2407,7 -2387,7 +2407,7 @@@ static void amd_iommu_iotlb_sync(struc
         unsigned long flags;
   
         spin_lock_irqsave(&dom->lock, flags);
- -      domain_flush_pages(dom, gather->start, gather->end - gather->start, 1);
+ +      domain_flush_pages(dom, gather->start, gather->end - gather->start + 1, 1);
         amd_iommu_domain_flush_complete(dom);
         spin_unlock_irqrestore(&dom->lock, flags);
   }
@@@ -3023,10 -3003,10 +3023,10 @@@ out
   static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index,
                           struct irte_ga *irte, struct amd_ir_data *data)
   {
-       bool ret;
         struct irq_remap_table *table;
-       unsigned long flags;
         struct irte_ga *entry;
+       unsigned long flags;
+       u128 old;
   
         table = get_irq_table(iommu, devid);
         if (!table)
@@@ -3037,16 -3017,14 +3037,14 @@@
         entry = (struct irte_ga *)table->table;
         entry = &entry[index];
   
-       ret = cmpxchg_double(&entry->lo.val, &entry->hi.val,
-                            entry->lo.val, entry->hi.val,
-                            irte->lo.val, irte->hi.val);
         /*
          * We use cmpxchg16 to atomically update the 128-bit IRTE,
          * and it cannot be updated by the hardware or other processors
          * behind us, so the return value of cmpxchg16 should be the
          * same as the old value.
          */
-       WARN_ON(!ret);
+       old = entry->irte;
+       WARN_ON(!try_cmpxchg128(&entry->irte, &old, irte->irte));
   
         if (data)
                 data->ref = entry;
@@@ -3513,7 -3491,8 +3511,7 @@@ int amd_iommu_activate_guest_mode(void 
         struct irte_ga *entry = (struct irte_ga *) ir_data->entry;
         u64 valid;
   
- -      if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) ||
- -          !entry || entry->lo.fields_vapic.guest_mode)
+ +      if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir) || !entry)
                 return 0;
   
         valid = entry->lo.fields_vapic.valid;
diff --combined drivers/md/bcache/btree.c

index 68b9d7c,569f489..fd121a6
--- 1/drivers/md/bcache/btree.c
--- 2/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@@ -559,6 -559,27 +559,27 @@@ static void mca_data_alloc(struct btre
         }
   }
   
+ #define cmp_int(l, r)         ((l > r) - (l < r))
+ 
+ #ifdef CONFIG_PROVE_LOCKING
+ static int btree_lock_cmp_fn(const struct lockdep_map *_a,
+                            const struct lockdep_map *_b)
+ {
+       const struct btree *a = container_of(_a, struct btree, lock.dep_map);
+       const struct btree *b = container_of(_b, struct btree, lock.dep_map);
+ 
+       return -cmp_int(a->level, b->level) ?: bkey_cmp(&a->key, &b->key);
+ }
+ 
+ static void btree_lock_print_fn(const struct lockdep_map *map)
+ {
+       const struct btree *b = container_of(map, struct btree, lock.dep_map);
+ 
+       printk(KERN_CONT " l=%u %llu:%llu", b->level,
+              KEY_INODE(&b->key), KEY_OFFSET(&b->key));
+ }
+ #endif
+ 
   static struct btree *mca_bucket_alloc(struct cache_set *c,
                                       struct bkey *k, gfp_t gfp)
   {
@@@ -572,7 -593,7 +593,7 @@@
                 return NULL;
   
         init_rwsem(&b->lock);
-       lockdep_set_novalidate_class(&b->lock);
+       lock_set_cmp_fn(&b->lock, btree_lock_cmp_fn, btree_lock_print_fn);
         mutex_init(&b->write_lock);
         lockdep_set_novalidate_class(&b->write_lock);
         INIT_LIST_HEAD(&b->list);
@@@ -885,7 -906,7 +906,7 @@@ static struct btree *mca_cannibalize(st
    * cannibalize_bucket() will take. This means every time we unlock the root of
    * the btree, we need to release this lock if we have it held.
    */
- -static void bch_cannibalize_unlock(struct cache_set *c)
+ +void bch_cannibalize_unlock(struct cache_set *c)
   {
         spin_lock(&c->btree_cannibalize_lock);
         if (c->btree_cache_alloc_lock == current) {
@@@ -1090,12 -1111,10 +1111,12 @@@ struct btree *__bch_btree_node_alloc(st
                                      struct btree *parent)
   {
         BKEY_PADDED(key) k;
- -      struct btree *b = ERR_PTR(-EAGAIN);
+ +      struct btree *b;
   
         mutex_lock(&c->bucket_lock);
   retry:
+ +      /* return ERR_PTR(-EAGAIN) when it fails */
+ +      b = ERR_PTR(-EAGAIN);
         if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait))
                 goto err;
   
@@@ -1140,7 -1159,7 +1161,7 @@@ static struct btree *btree_node_alloc_r
   {
         struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);
   
- -      if (!IS_ERR_OR_NULL(n)) {
+ +      if (!IS_ERR(n)) {
                 mutex_lock(&n->write_lock);
                 bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
                 bkey_copy_key(&n->key, &b->key);
@@@ -1342,7 -1361,7 +1363,7 @@@ static int btree_gc_coalesce(struct btr
         memset(new_nodes, 0, sizeof(new_nodes));
         closure_init_stack(&cl);
   
- -      while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
+ +      while (nodes < GC_MERGE_NODES && !IS_ERR(r[nodes].b))
                 keys += r[nodes++].keys;
   
         blocks = btree_default_blocks(b->c) * 2 / 3;
@@@ -1354,7 -1373,7 +1375,7 @@@
   
         for (i = 0; i < nodes; i++) {
                 new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
- -              if (IS_ERR_OR_NULL(new_nodes[i]))
+ +              if (IS_ERR(new_nodes[i]))
                         goto out_nocoalesce;
         }
   
@@@ -1489,7 -1508,7 +1510,7 @@@ out_nocoalesce
         bch_keylist_free(&keylist);
   
         for (i = 0; i < nodes; i++)
- -              if (!IS_ERR_OR_NULL(new_nodes[i])) {
+ +              if (!IS_ERR(new_nodes[i])) {
                         btree_node_free(new_nodes[i]);
                         rw_unlock(true, new_nodes[i]);
                 }
@@@ -1671,7 -1690,7 +1692,7 @@@ static int bch_btree_gc_root(struct btr
         if (should_rewrite) {
                 n = btree_node_alloc_replacement(b, NULL);
   
- -              if (!IS_ERR_OR_NULL(n)) {
+ +              if (!IS_ERR(n)) {
                         bch_btree_node_write_sync(n);
   
                         bch_btree_set_root(n);
@@@ -1970,15 -1989,6 +1991,15 @@@ static int bch_btree_check_thread(void 
                         c->gc_stats.nodes++;
                         bch_btree_op_init(&op, 0);
                         ret = bcache_btree(check_recurse, p, c->root, &op);
+ +                      /*
+ +                       * The op may be added to cache_set's btree_cache_wait
+ +                       * in mca_cannibalize(), so we must ensure it is removed
+ +                       * from the list and btree_cache_alloc_lock is released
+ +                       * before the op memory is freed.
+ +                       * Otherwise, the btree_cache_wait list will be corrupted.
+ +                       */
+ +                      bch_cannibalize_unlock(c);
+ +                      finish_wait(&c->btree_cache_wait, &(&op)->wait);
                         if (ret)
                                 goto out;
                 }
diff --combined drivers/md/bcache/btree.h

index a2920bb,17b1d20..45d64b5
--- 1/drivers/md/bcache/btree.h
--- 2/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@@ -247,8 -247,8 +247,8 @@@ static inline void bch_btree_op_init(st
   
   static inline void rw_lock(bool w, struct btree *b, int level)
   {
-       w ? down_write_nested(&b->lock, level + 1)
-         : down_read_nested(&b->lock, level + 1);
+       w ? down_write(&b->lock)
+         : down_read(&b->lock);
         if (w)
                 b->seq++;
   }
@@@ -282,7 -282,6 +282,7 @@@ void bch_initial_gc_finish(struct cache
   void bch_moving_gc(struct cache_set *c);
   int bch_btree_check(struct cache_set *c);
   void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k);
+ +void bch_cannibalize_unlock(struct cache_set *c);
   
   static inline void wake_up_gc(struct cache_set *c)
   {
diff --combined include/linux/lockdep.h

index 74bd269,3bac150..310f859
--- 1/include/linux/lockdep.h
--- 2/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@@ -344,16 -344,6 +344,16 @@@ extern void lock_unpin_lock(struct lock
   #define lockdep_repin_lock(l,c)       lock_repin_lock(&(l)->dep_map, (c))
   #define lockdep_unpin_lock(l,c)       lock_unpin_lock(&(l)->dep_map, (c))
   
+ +/*
+ + * Must use lock_map_acquire_try() with override maps to avoid
+ + * lockdep thinking they participate in the block chain.
+ + */
+ +#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type)   \
+ +      struct lockdep_map _name = {                    \
+ +              .name = #_name "-wait-type-override",   \
+ +              .wait_type_inner = _wait_type,          \
+ +              .lock_type = LD_LOCK_WAIT_OVERRIDE, }
+ +
   #else /* !CONFIG_LOCKDEP */
   
   static inline void lockdep_init_task(struct task_struct *task)
@@@ -442,11 -432,16 +442,19 @@@ extern int lockdep_is_held(const void *
   #define lockdep_repin_lock(l, c)              do { (void)(l); (void)(c); } while (0)
   #define lockdep_unpin_lock(l, c)              do { (void)(l); (void)(c); } while (0)
   
+ +#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type)   \
+ +      struct lockdep_map __maybe_unused _name = {}
+ +
   #endif /* !LOCKDEP */
   
+ #ifdef CONFIG_PROVE_LOCKING
+ void lockdep_set_lock_cmp_fn(struct lockdep_map *, lock_cmp_fn, lock_print_fn);
+ 
+ #define lock_set_cmp_fn(lock, ...)    lockdep_set_lock_cmp_fn(&(lock)->dep_map, __VA_ARGS__)
+ #else
+ #define lock_set_cmp_fn(lock, ...)    do { } while (0)
+ #endif
+ 
   enum xhlock_context_t {
         XHLOCK_HARD,
         XHLOCK_SOFT,
@@@ -569,7 -564,6 +577,7 @@@ do {                                                                       
   #define rwsem_release(l, i)                   lock_release(l, i)
   
   #define lock_map_acquire(l)                   lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_)
+ +#define lock_map_acquire_try(l)                       lock_acquire_exclusive(l, 0, 1, NULL, _THIS_IP_)
   #define lock_map_acquire_read(l)              lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_)
   #define lock_map_acquire_tryread(l)           lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_)
   #define lock_map_release(l)                   lock_release(l, _THIS_IP_)
diff --combined include/linux/lockdep_types.h

index 59f4fb1,8bf79c4..2ebc323
--- 1/include/linux/lockdep_types.h
--- 2/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@@ -33,7 -33,6 +33,7 @@@ enum lockdep_wait_type 
   enum lockdep_lock_type {
         LD_LOCK_NORMAL = 0,     /* normal, catch all */
         LD_LOCK_PERCPU,         /* percpu */
+ +      LD_LOCK_WAIT_OVERRIDE,  /* annotation */
         LD_LOCK_MAX,
   };
   
@@@ -85,6 -84,11 +85,11 @@@ struct lock_trace
   
   #define LOCKSTAT_POINTS               4
   
+ struct lockdep_map;
+ typedef int (*lock_cmp_fn)(const struct lockdep_map *a,
+                          const struct lockdep_map *b);
+ typedef void (*lock_print_fn)(const struct lockdep_map *map);
+ 
   /*
    * The lock-class itself. The order of the structure members matters.
    * reinit_class() zeroes the key member and all subsequent members.
@@@ -110,6 -114,9 +115,9 @@@ struct lock_class 
         struct list_head                locks_after, locks_before;
   
         const struct lockdep_subclass_key *key;
+       lock_cmp_fn                     cmp_fn;
+       lock_print_fn                   print_fn;
+ 
         unsigned int                    subclass;
         unsigned int                    dep_gen_id;
   
diff --combined kernel/locking/lockdep.c

index 4dfd2f3,3e8950f..111607d
--- 1/kernel/locking/lockdep.c
--- 2/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@@ -709,7 -709,7 +709,7 @@@ void get_usage_chars(struct lock_class 
         usage[i] = '\0';
   }
   
- static void __print_lock_name(struct lock_class *class)
+ static void __print_lock_name(struct held_lock *hlock, struct lock_class *class)
   {
         char str[KSYM_NAME_LEN];
         const char *name;
@@@ -724,17 -724,19 +724,19 @@@
                         printk(KERN_CONT "#%d", class->name_version);
                 if (class->subclass)
                         printk(KERN_CONT "/%d", class->subclass);
+               if (hlock && class->print_fn)
+                       class->print_fn(hlock->instance);
         }
   }
   
- static void print_lock_name(struct lock_class *class)
+ static void print_lock_name(struct held_lock *hlock, struct lock_class *class)
   {
         char usage[LOCK_USAGE_CHARS];
   
         get_usage_chars(class, usage);
   
         printk(KERN_CONT " (");
-       __print_lock_name(class);
+       __print_lock_name(hlock, class);
         printk(KERN_CONT "){%s}-{%d:%d}", usage,
                         class->wait_type_outer ?: class->wait_type_inner,
                         class->wait_type_inner);
@@@ -772,7 -774,7 +774,7 @@@ static void print_lock(struct held_loc
         }
   
         printk(KERN_CONT "%px", hlock->instance);
-       print_lock_name(lock);
+       print_lock_name(hlock, lock);
         printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip);
   }
   
@@@ -1868,7 -1870,7 +1870,7 @@@ print_circular_bug_entry(struct lock_li
         if (debug_locks_silent)
                 return;
         printk("\n-> #%u", depth);
-       print_lock_name(target->class);
+       print_lock_name(NULL, target->class);
         printk(KERN_CONT ":\n");
         print_lock_trace(target->trace, 6);
   }
@@@ -1899,11 -1901,11 +1901,11 @@@ print_circular_lock_scenario(struct hel
          */
         if (parent != source) {
                 printk("Chain exists of:\n  ");
-               __print_lock_name(source);
+               __print_lock_name(src, source);
                 printk(KERN_CONT " --> ");
-               __print_lock_name(parent);
+               __print_lock_name(NULL, parent);
                 printk(KERN_CONT " --> ");
-               __print_lock_name(target);
+               __print_lock_name(tgt, target);
                 printk(KERN_CONT "\n\n");
         }
   
@@@ -1914,13 -1916,13 +1916,13 @@@
                 printk("  rlock(");
         else
                 printk("  lock(");
-       __print_lock_name(target);
+       __print_lock_name(tgt, target);
         printk(KERN_CONT ");\n");
         printk("                               lock(");
-       __print_lock_name(parent);
+       __print_lock_name(NULL, parent);
         printk(KERN_CONT ");\n");
         printk("                               lock(");
-       __print_lock_name(target);
+       __print_lock_name(tgt, target);
         printk(KERN_CONT ");\n");
         if (src_read != 0)
                 printk("  rlock(");
@@@ -1928,7 -1930,7 +1930,7 @@@
                 printk("  sync(");
         else
                 printk("  lock(");
-       __print_lock_name(source);
+       __print_lock_name(src, source);
         printk(KERN_CONT ");\n");
         printk("\n *** DEADLOCK ***\n\n");
   }
@@@ -2154,6 -2156,8 +2156,8 @@@ check_path(struct held_lock *target, st
         return ret;
   }
   
+ static void print_deadlock_bug(struct task_struct *, struct held_lock *, struct held_lock *);
+ 
   /*
    * Prove that the dependency graph starting at <src> can not
    * lead to <target>. If it can, there is a circle when adding
@@@ -2185,7 -2189,10 +2189,10 @@@ check_noncircular(struct held_lock *src
                         *trace = save_trace();
                 }
   
-               print_circular_bug(&src_entry, target_entry, src, target);
+               if (src->class_idx == target->class_idx)
+                       print_deadlock_bug(current, src, target);
+               else
+                       print_circular_bug(&src_entry, target_entry, src, target);
         }
   
         return ret;
@@@ -2263,9 -2270,6 +2270,9 @@@ static inline bool usage_match(struct l
   
   static inline bool usage_skip(struct lock_list *entry, void *mask)
   {
+ +      if (entry->class->lock_type == LD_LOCK_NORMAL)
+ +              return false;
+ +
         /*
          * Skip local_lock() for irq inversion detection.
          *
@@@ -2292,16 -2296,14 +2299,16 @@@
          * As a result, we will skip local_lock(), when we search for irq
          * inversion bugs.
          */
- -      if (entry->class->lock_type == LD_LOCK_PERCPU) {
- -              if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
- -                      return false;
+ +      if (entry->class->lock_type == LD_LOCK_PERCPU &&
+ +          DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG))
+ +              return false;
   
- -              return true;
- -      }
+ +      /*
+ +       * Skip WAIT_OVERRIDE for irq inversion detection -- it's not actually
+ +       * a lock and only used to override the wait_type.
+ +       */
   
- -      return false;
+ +      return true;
   }
   
   /*
@@@ -2346,7 -2348,7 +2353,7 @@@ static void print_lock_class_header(str
         int bit;
   
         printk("%*s->", depth, "");
-       print_lock_name(class);
+       print_lock_name(NULL, class);
   #ifdef CONFIG_DEBUG_LOCKDEP
         printk(KERN_CONT " ops: %lu", debug_class_ops_read(class));
   #endif
@@@ -2528,11 -2530,11 +2535,11 @@@ print_irq_lock_scenario(struct lock_lis
          */
         if (middle_class != unsafe_class) {
                 printk("Chain exists of:\n  ");
-               __print_lock_name(safe_class);
+               __print_lock_name(NULL, safe_class);
                 printk(KERN_CONT " --> ");
-               __print_lock_name(middle_class);
+               __print_lock_name(NULL, middle_class);
                 printk(KERN_CONT " --> ");
-               __print_lock_name(unsafe_class);
+               __print_lock_name(NULL, unsafe_class);
                 printk(KERN_CONT "\n\n");
         }
   
@@@ -2540,18 -2542,18 +2547,18 @@@
         printk("       CPU0                    CPU1\n");
         printk("       ----                    ----\n");
         printk("  lock(");
-       __print_lock_name(unsafe_class);
+       __print_lock_name(NULL, unsafe_class);
         printk(KERN_CONT ");\n");
         printk("                               local_irq_disable();\n");
         printk("                               lock(");
-       __print_lock_name(safe_class);
+       __print_lock_name(NULL, safe_class);
         printk(KERN_CONT ");\n");
         printk("                               lock(");
-       __print_lock_name(middle_class);
+       __print_lock_name(NULL, middle_class);
         printk(KERN_CONT ");\n");
         printk("  <Interrupt>\n");
         printk("    lock(");
-       __print_lock_name(safe_class);
+       __print_lock_name(NULL, safe_class);
         printk(KERN_CONT ");\n");
         printk("\n *** DEADLOCK ***\n\n");
   }
@@@ -2588,20 -2590,20 +2595,20 @@@ print_bad_irq_dependency(struct task_st
         pr_warn("\nand this task is already holding:\n");
         print_lock(prev);
         pr_warn("which would create a new lock dependency:\n");
-       print_lock_name(hlock_class(prev));
+       print_lock_name(prev, hlock_class(prev));
         pr_cont(" ->");
-       print_lock_name(hlock_class(next));
+       print_lock_name(next, hlock_class(next));
         pr_cont("\n");
   
         pr_warn("\nbut this new dependency connects a %s-irq-safe lock:\n",
                 irqclass);
-       print_lock_name(backwards_entry->class);
+       print_lock_name(NULL, backwards_entry->class);
         pr_warn("\n... which became %s-irq-safe at:\n", irqclass);
   
         print_lock_trace(backwards_entry->class->usage_traces[bit1], 1);
   
         pr_warn("\nto a %s-irq-unsafe lock:\n", irqclass);
-       print_lock_name(forwards_entry->class);
+       print_lock_name(NULL, forwards_entry->class);
         pr_warn("\n... which became %s-irq-unsafe at:\n", irqclass);
         pr_warn("...");
   
@@@ -2971,10 -2973,10 +2978,10 @@@ print_deadlock_scenario(struct held_loc
         printk("       CPU0\n");
         printk("       ----\n");
         printk("  lock(");
-       __print_lock_name(prev);
+       __print_lock_name(prv, prev);
         printk(KERN_CONT ");\n");
         printk("  lock(");
-       __print_lock_name(next);
+       __print_lock_name(nxt, next);
         printk(KERN_CONT ");\n");
         printk("\n *** DEADLOCK ***\n\n");
         printk(" May be due to missing lock nesting notation\n\n");
@@@ -2984,6 -2986,8 +2991,8 @@@ static voi
   print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
                    struct held_lock *next)
   {
+       struct lock_class *class = hlock_class(prev);
+ 
         if (!debug_locks_off_graph_unlock() || debug_locks_silent)
                 return;
   
@@@ -2998,6 -3002,11 +3007,11 @@@
         pr_warn("\nbut task is already holding lock:\n");
         print_lock(prev);
   
+       if (class->cmp_fn) {
+               pr_warn("and the lock comparison function returns %i:\n",
+                       class->cmp_fn(prev->instance, next->instance));
+       }
+ 
         pr_warn("\nother info that might help us debug this:\n");
         print_deadlock_scenario(next, prev);
         lockdep_print_held_locks(curr);
@@@ -3019,6 -3028,7 +3033,7 @@@
   static int
   check_deadlock(struct task_struct *curr, struct held_lock *next)
   {
+       struct lock_class *class;
         struct held_lock *prev;
         struct held_lock *nest = NULL;
         int i;
@@@ -3039,6 -3049,12 +3054,12 @@@
                 if ((next->read == 2) && prev->read)
                         continue;
   
+               class = hlock_class(prev);
+ 
+               if (class->cmp_fn &&
+                   class->cmp_fn(prev->instance, next->instance) < 0)
+                       continue;
+ 
                 /*
                  * We're holding the nest_lock, which serializes this lock's
                  * nesting behaviour.
@@@ -3100,6 -3116,14 +3121,14 @@@ check_prev_add(struct task_struct *curr
                 return 2;
         }
   
+       if (prev->class_idx == next->class_idx) {
+               struct lock_class *class = hlock_class(prev);
+ 
+               if (class->cmp_fn &&
+                   class->cmp_fn(prev->instance, next->instance) < 0)
+                       return 2;
+       }
+ 
         /*
          * Prove that the new <prev> -> <next> dependency would not
          * create a circular dependency in the graph. (We do this by
@@@ -3576,7 -3600,7 +3605,7 @@@ static void print_chain_keys_chain(stru
                 hlock_id = chain_hlocks[chain->base + i];
                 chain_key = print_chain_key_iteration(hlock_id, chain_key);
   
-               print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id));
+               print_lock_name(NULL, lock_classes + chain_hlock_class_idx(hlock_id));
                 printk("\n");
         }
   }
@@@ -3933,11 -3957,11 +3962,11 @@@ static void print_usage_bug_scenario(st
         printk("       CPU0\n");
         printk("       ----\n");
         printk("  lock(");
-       __print_lock_name(class);
+       __print_lock_name(lock, class);
         printk(KERN_CONT ");\n");
         printk("  <Interrupt>\n");
         printk("    lock(");
-       __print_lock_name(class);
+       __print_lock_name(lock, class);
         printk(KERN_CONT ");\n");
         printk("\n *** DEADLOCK ***\n\n");
   }
@@@ -4023,7 -4047,7 +4052,7 @@@ print_irq_inversion_bug(struct task_str
                 pr_warn("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
         else
                 pr_warn("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
-       print_lock_name(other->class);
+       print_lock_name(NULL, other->class);
         pr_warn("\n\nand interrupts could create inverse lock ordering between them.\n\n");
   
         pr_warn("\nother info that might help us debug this:\n");
@@@ -4773,8 -4797,7 +4802,8 @@@ static int check_wait_context(struct ta
   
         for (; depth < curr->lockdep_depth; depth++) {
                 struct held_lock *prev = curr->held_locks + depth;
- -              u8 prev_inner = hlock_class(prev)->wait_type_inner;
+ +              struct lock_class *class = hlock_class(prev);
+ +              u8 prev_inner = class->wait_type_inner;
   
                 if (prev_inner) {
                         /*
@@@ -4784,14 -4807,6 +4813,14 @@@
                          * Also due to trylocks.
                          */
                         curr_inner = min(curr_inner, prev_inner);
+ +
+ +                      /*
+ +                       * Allow override for annotations -- this is typically
+ +                       * only valid/needed for code that only exists when
+ +                       * CONFIG_PREEMPT_RT=n.
+ +                       */
+ +                      if (unlikely(class->lock_type == LD_LOCK_WAIT_OVERRIDE))
+ +                              curr_inner = prev_inner;
                 }
         }
   
@@@ -4896,6 -4911,33 +4925,33 @@@ EXPORT_SYMBOL_GPL(lockdep_init_map_type
   struct lock_class_key __lockdep_no_validate__;
   EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
   
+ #ifdef CONFIG_PROVE_LOCKING
+ void lockdep_set_lock_cmp_fn(struct lockdep_map *lock, lock_cmp_fn cmp_fn,
+                            lock_print_fn print_fn)
+ {
+       struct lock_class *class = lock->class_cache[0];
+       unsigned long flags;
+ 
+       raw_local_irq_save(flags);
+       lockdep_recursion_inc();
+ 
+       if (!class)
+               class = register_lock_class(lock, 0, 0);
+ 
+       if (class) {
+               WARN_ON(class->cmp_fn   && class->cmp_fn != cmp_fn);
+               WARN_ON(class->print_fn && class->print_fn != print_fn);
+ 
+               class->cmp_fn   = cmp_fn;
+               class->print_fn = print_fn;
+       }
+ 
+       lockdep_recursion_finish();
+       raw_local_irq_restore(flags);
+ }
+ EXPORT_SYMBOL_GPL(lockdep_set_lock_cmp_fn);
+ #endif
+ 
   static void
   print_lock_nested_lock_not_held(struct task_struct *curr,
                                 struct held_lock *hlock)
diff --combined kernel/sched/clock.c

index 5a575a0,71443cf..3c6193d
--- 1/kernel/sched/clock.c
--- 2/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@@ -266,7 -266,7 +266,7 @@@ static __always_inline u64 sched_clock_
         s64 delta;
   
   again:
- -      now = sched_clock();
+ +      now = sched_clock_noinstr();
         delta = now - scd->tick_raw;
         if (unlikely(delta < 0))
                 delta = 0;
@@@ -287,35 -287,28 +287,35 @@@
         clock = wrap_max(clock, min_clock);
         clock = wrap_min(clock, max_clock);
   
-       if (!arch_try_cmpxchg64(&scd->clock, &old_clock, clock))
+       if (!raw_try_cmpxchg64(&scd->clock, &old_clock, clock))
                 goto again;
   
         return clock;
   }
   
- -noinstr u64 local_clock(void)
+ +noinstr u64 local_clock_noinstr(void)
   {
         u64 clock;
   
         if (static_branch_likely(&__sched_clock_stable))
- -              return sched_clock() + __sched_clock_offset;
+ +              return sched_clock_noinstr() + __sched_clock_offset;
   
         if (!static_branch_likely(&sched_clock_running))
- -              return sched_clock();
+ +              return sched_clock_noinstr();
   
- -      preempt_disable_notrace();
         clock = sched_clock_local(this_scd());
- -      preempt_enable_notrace();
   
         return clock;
   }
+ +
+ +u64 local_clock(void)
+ +{
+ +      u64 now;
+ +      preempt_disable_notrace();
+ +      now = local_clock_noinstr();
+ +      preempt_enable_notrace();
+ +      return now;
+ +}
   EXPORT_SYMBOL_GPL(local_clock);
   
   static notrace u64 sched_clock_remote(struct sched_clock_data *scd)
diff --combined scripts/min-tool-version.sh

index 131be76,367c37f..dfd1863
--- 1/scripts/min-tool-version.sh
--- 2/scripts/min-tool-version.sh
+++ b/scripts/min-tool-version.sh
@@@ -17,7 -17,11 +17,11 @@@ binutils
         echo 2.25.0
         ;;
   gcc)
-       echo 5.1.0
+       if [ "$SRCARCH" = parisc ]; then
+               echo 11.0.0
+       else
+               echo 5.1.0
+       fi
         ;;
   llvm)
         if [ "$SRCARCH" = s390 ]; then
@@@ -27,7 -31,7 +31,7 @@@
         fi
         ;;
   rustc)
- -      echo 1.62.0
+ +      echo 1.68.2
         ;;
   bindgen)
         echo 0.56.0
author	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 27 Jun 2023 21:14:30 +0000 (14:14 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 27 Jun 2023 21:14:30 +0000 (14:14 -0700)
		1	2
arch/x86/kernel/alternative.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/cpu/mce/core.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/x86.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/iommu/amd/iommu.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/md/bcache/btree.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/md/bcache/btree.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/lockdep.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/lockdep_types.h	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/locking/lockdep.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched/clock.c	patch \|	diff1 \|	diff2 \|	blob \| history
scripts/min-tool-version.sh	patch \|	diff1 \|	diff2 \|	blob \| history