Merge tag 'sched-core-2023-06-27' of git://git.kernel.org/pub/scm/linux/kernel/git...

author Linus Torvalds <torvalds@linux-foundation.org>

Tue, 27 Jun 2023 21:03:21 +0000 (14:03 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 27 Jun 2023 21:03:21 +0000 (14:03 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 27 Jun 2023 21:03:21 +0000 (14:03 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 27 Jun 2023 21:03:21 +0000 (14:03 -0700)
diff --git a/Documentation/scheduler/sched-deadline.rst b/Documentation/scheduler/sched-deadline.rst

index 9d9be52..9fe4846 100644 (file)
--- a/Documentation/scheduler/sched-deadline.rst
+++ b/Documentation/scheduler/sched-deadline.rst
@@ -203,12 +203,15 @@ Deadline Task Scheduling
    - Total bandwidth (this_bw): this is the sum of all tasks "belonging" to the
      runqueue, including the tasks in Inactive state.
  
+  - Maximum usable bandwidth (max_bw): This is the maximum bandwidth usable by
+    deadline tasks and is currently set to the RT capacity.
+
  
   The algorithm reclaims the bandwidth of the tasks in Inactive state.
   It does so by decrementing the runtime of the executing task Ti at a pace equal
   to
  
-           dq = -max{ Ui / Umax, (1 - Uinact - Uextra) } dt
+           dq = -(max{ Ui, (Umax - Uinact - Uextra) } / Umax) dt
  
   where:
  
diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h

index af1fafb..934c658 100644 (file)
--- a/arch/arm64/include/asm/arch_timer.h
+++ b/arch/arm64/include/asm/arch_timer.h
@@ -88,13 +88,7 @@ static inline notrace u64 arch_timer_read_cntvct_el0(void)
  
  #define arch_timer_reg_read_stable(reg)                                        \
         ({                                                              \
-               u64 _val;                                               \
-                                                                       \
-               preempt_disable_notrace();                              \
-               _val = erratum_handler(read_ ## reg)();                 \
-               preempt_enable_notrace();                               \
-                                                                       \
-               _val;                                                   \
+               erratum_handler(read_ ## reg)();                        \
         })
  
  /*
diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h

index 877495a..51d92ab 100644 (file)
--- a/arch/arm64/include/asm/io.h
+++ b/arch/arm64/include/asm/io.h
@@ -22,13 +22,13 @@
   * Generic IO read/write.  These perform native-endian accesses.
   */
  #define __raw_writeb __raw_writeb
-static inline void __raw_writeb(u8 val, volatile void __iomem *addr)
+static __always_inline void __raw_writeb(u8 val, volatile void __iomem *addr)
  {
         asm volatile("strb %w0, [%1]" : : "rZ" (val), "r" (addr));
  }
  
  #define __raw_writew __raw_writew
-static inline void __raw_writew(u16 val, volatile void __iomem *addr)
+static __always_inline void __raw_writew(u16 val, volatile void __iomem *addr)
  {
         asm volatile("strh %w0, [%1]" : : "rZ" (val), "r" (addr));
  }
@@ -40,13 +40,13 @@ static __always_inline void __raw_writel(u32 val, volatile void __iomem *addr)
  }
  
  #define __raw_writeq __raw_writeq
-static inline void __raw_writeq(u64 val, volatile void __iomem *addr)
+static __always_inline void __raw_writeq(u64 val, volatile void __iomem *addr)
  {
         asm volatile("str %x0, [%1]" : : "rZ" (val), "r" (addr));
  }
  
  #define __raw_readb __raw_readb
-static inline u8 __raw_readb(const volatile void __iomem *addr)
+static __always_inline u8 __raw_readb(const volatile void __iomem *addr)
  {
         u8 val;
         asm volatile(ALTERNATIVE("ldrb %w0, [%1]",
@@ -57,7 +57,7 @@ static inline u8 __raw_readb(const volatile void __iomem *addr)
  }
  
  #define __raw_readw __raw_readw
-static inline u16 __raw_readw(const volatile void __iomem *addr)
+static __always_inline u16 __raw_readw(const volatile void __iomem *addr)
  {
         u16 val;
  
@@ -80,7 +80,7 @@ static __always_inline u32 __raw_readl(const volatile void __iomem *addr)
  }
  
  #define __raw_readq __raw_readq
-static inline u64 __raw_readq(const volatile void __iomem *addr)
+static __always_inline u64 __raw_readq(const volatile void __iomem *addr)
  {
         u64 val;
         asm volatile(ALTERNATIVE("ldr %0, [%1]",
diff --git a/arch/loongarch/include/asm/loongarch.h b/arch/loongarch/include/asm/loongarch.h

index 35e8a52..1c2a0a2 100644 (file)
--- a/arch/loongarch/include/asm/loongarch.h
+++ b/arch/loongarch/include/asm/loongarch.h
@@ -1167,7 +1167,7 @@ static __always_inline void iocsr_write64(u64 val, u32 reg)
  
  #ifndef __ASSEMBLY__
  
-static inline u64 drdtime(void)
+static __always_inline u64 drdtime(void)
  {
         int rID = 0;
         u64 val = 0;
diff --git a/arch/loongarch/kernel/time.c b/arch/loongarch/kernel/time.c

index f377e50..c189e03 100644 (file)
--- a/arch/loongarch/kernel/time.c
+++ b/arch/loongarch/kernel/time.c
@@ -190,9 +190,9 @@ static u64 read_const_counter(struct clocksource *clk)
         return drdtime();
  }
  
-static u64 native_sched_clock(void)
+static noinstr u64 sched_clock_read(void)
  {
-       return read_const_counter(NULL);
+       return drdtime();
  }
  
  static struct clocksource clocksource_const = {
@@ -211,7 +211,7 @@ int __init constant_clocksource_init(void)
  
         res = clocksource_register_hz(&clocksource_const, freq);
  
-       sched_clock_register(native_sched_clock, 64, freq);
+       sched_clock_register(sched_clock_read, 64, freq);
  
         pr_info("Constant clock source device register\n");
  
diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h

index ce878e8..4d64665 100644 (file)
--- a/arch/s390/include/asm/timex.h
+++ b/arch/s390/include/asm/timex.h
@@ -63,7 +63,7 @@ static inline int store_tod_clock_ext_cc(union tod_clock *clk)
         return cc;
  }
  
-static inline void store_tod_clock_ext(union tod_clock *tod)
+static __always_inline void store_tod_clock_ext(union tod_clock *tod)
  {
         asm volatile("stcke %0" : "=Q" (*tod) : : "cc");
  }
@@ -177,7 +177,7 @@ static inline void local_tick_enable(unsigned long comp)
  
  typedef unsigned long cycles_t;
  
-static inline unsigned long get_tod_clock(void)
+static __always_inline unsigned long get_tod_clock(void)
  {
         union tod_clock clk;
  
@@ -204,6 +204,11 @@ void init_cpu_timer(void);
  
  extern union tod_clock tod_clock_base;
  
+static __always_inline unsigned long __get_tod_clock_monotonic(void)
+{
+       return get_tod_clock() - tod_clock_base.tod;
+}
+
  /**
   * get_clock_monotonic - returns current time in clock rate units
   *
@@ -216,7 +221,7 @@ static inline unsigned long get_tod_clock_monotonic(void)
         unsigned long tod;
  
         preempt_disable_notrace();
-       tod = get_tod_clock() - tod_clock_base.tod;
+       tod = __get_tod_clock_monotonic();
         preempt_enable_notrace();
         return tod;
  }
@@ -240,7 +245,7 @@ static inline unsigned long get_tod_clock_monotonic(void)
   * -> ns = (th * 125) + ((tl * 125) >> 9);
   *
   */
-static inline unsigned long tod_to_ns(unsigned long todval)
+static __always_inline unsigned long tod_to_ns(unsigned long todval)
  {
         return ((todval >> 9) * 125) + (((todval & 0x1ff) * 125) >> 9);
  }
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c

index 6b7b6d5..2762781 100644 (file)
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -102,6 +102,11 @@ void __init time_early_init(void)
                         ((long) qui.old_leap * 4096000000L);
  }
  
+unsigned long long noinstr sched_clock_noinstr(void)
+{
+       return tod_to_ns(__get_tod_clock_monotonic());
+}
+
  /*
   * Scheduler clock - returns current time in nanosec units.
   */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h

index 49bb4f2..88d9ef9 100644 (file)
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -257,6 +257,11 @@ void hv_set_register(unsigned int reg, u64 value);
  u64 hv_get_non_nested_register(unsigned int reg);
  void hv_set_non_nested_register(unsigned int reg, u64 value);
  
+static __always_inline u64 hv_raw_get_register(unsigned int reg)
+{
+       return __rdmsr(reg);
+}
+
  #else /* CONFIG_HYPERV */
  static inline void hyperv_init(void) {}
  static inline void hyperv_setup_mmu_ops(void) {}
diff --git a/arch/x86/include/asm/vdso/gettimeofday.h b/arch/x86/include/asm/vdso/gettimeofday.h

index 4cf6794..c81858d 100644 (file)
--- a/arch/x86/include/asm/vdso/gettimeofday.h
+++ b/arch/x86/include/asm/vdso/gettimeofday.h
@@ -231,14 +231,19 @@ static u64 vread_pvclock(void)
                 ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
         } while (pvclock_read_retry(pvti, version));
  
-       return ret;
+       return ret & S64_MAX;
  }
  #endif
  
  #ifdef CONFIG_HYPERV_TIMER
  static u64 vread_hvclock(void)
  {
-       return hv_read_tsc_page(&hvclock_page);
+       u64 tsc, time;
+
+       if (hv_read_tsc_page_tsc(&hvclock_page, &tsc, &time))
+               return time & S64_MAX;
+
+       return U64_MAX;
  }
  #endif
  
@@ -246,7 +251,7 @@ static inline u64 __arch_get_hw_counter(s32 clock_mode,
                                         const struct vdso_data *vd)
  {
         if (likely(clock_mode == VDSO_CLOCKMODE_TSC))
-               return (u64)rdtsc_ordered();
+               return (u64)rdtsc_ordered() & S64_MAX;
         /*
          * For any memory-mapped vclock type, we need to make sure that gcc
          * doesn't cleverly hoist a load before the mode check.  Otherwise we
@@ -284,6 +289,9 @@ static inline bool arch_vdso_clocksource_ok(const struct vdso_data *vd)
   * which can be invalidated asynchronously and indicate invalidation by
   * returning U64_MAX, which can be effectively tested by checking for a
   * negative value after casting it to s64.
+ *
+ * This effectively forces a S64_MAX mask on the calculations, unlike the
+ * U64_MAX mask normally used by x86 clocksources.
   */
  static inline bool arch_vdso_cycles_ok(u64 cycles)
  {
@@ -303,18 +311,29 @@ static inline bool arch_vdso_cycles_ok(u64 cycles)
   * @last. If not then use @last, which is the base time of the current
   * conversion period.
   *
- * This variant also removes the masking of the subtraction because the
- * clocksource mask of all VDSO capable clocksources on x86 is U64_MAX
- * which would result in a pointless operation. The compiler cannot
- * optimize it away as the mask comes from the vdso data and is not compile
- * time constant.
+ * This variant also uses a custom mask because while the clocksource mask of
+ * all the VDSO capable clocksources on x86 is U64_MAX, the above code uses
+ * U64_MAX as an exception value, additionally arch_vdso_cycles_ok() above
+ * declares everything with the MSB/Sign-bit set as invalid. Therefore the
+ * effective mask is S64_MAX.
   */
  static __always_inline
  u64 vdso_calc_delta(u64 cycles, u64 last, u64 mask, u32 mult)
  {
-       if (cycles > last)
-               return (cycles - last) * mult;
-       return 0;
+       /*
+        * Due to the MSB/Sign-bit being used as an invalid marker (see
+        * arch_vdso_cycles_ok() above), the effective mask is S64_MAX.
+        */
+       u64 delta = (cycles - last) & S64_MAX;
+
+       /*
+        * Due to the above mentioned TSC wobbles, filter out negative motion.
+        * Per the above masking, the effective sign bit is now bit 62.
+        */
+       if (unlikely(delta & (1ULL << 62)))
+               return 0;
+
+       return delta * mult;
  }
  #define vdso_calc_delta vdso_calc_delta
  
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c

index 670eb08..ee4fe8c 100644 (file)
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -165,32 +165,19 @@ int arch_asym_cpu_priority(int cpu)
  
  /**
   * sched_set_itmt_core_prio() - Set CPU priority based on ITMT
- * @prio:      Priority of cpu core
- * @core_cpu:  The cpu number associated with the core
+ * @prio:      Priority of @cpu
+ * @cpu:       The CPU number
   *
   * The pstate driver will find out the max boost frequency
   * and call this function to set a priority proportional
- * to the max boost frequency. CPU with higher boost
+ * to the max boost frequency. CPUs with higher boost
   * frequency will receive higher priority.
   *
   * No need to rebuild sched domain after updating
   * the CPU priorities. The sched domains have no
   * dependency on CPU priorities.
   */
-void sched_set_itmt_core_prio(int prio, int core_cpu)
+void sched_set_itmt_core_prio(int prio, int cpu)
  {
-       int cpu, i = 1;
-
-       for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
-               int smt_prio;
-
-               /*
-                * Ensure that the siblings are moved to the end
-                * of the priority chain and only used when
-                * all other high priority cpus are out of capacity.
-                */
-               smt_prio = prio * smp_num_siblings / (i * i);
-               per_cpu(sched_core_priority, cpu) = smt_prio;
-               i++;
-       }
+       per_cpu(sched_core_priority, cpu) = prio;
  }
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c

index 0f35d44..fb8f521 100644 (file)
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -71,7 +71,7 @@ static int kvm_set_wallclock(const struct timespec64 *now)
         return -ENODEV;
  }
  
-static noinstr u64 kvm_clock_read(void)
+static u64 kvm_clock_read(void)
  {
         u64 ret;
  
@@ -88,7 +88,7 @@ static u64 kvm_clock_get_cycles(struct clocksource *cs)
  
  static noinstr u64 kvm_sched_clock_read(void)
  {
-       return kvm_clock_read() - kvm_sched_clock_offset;
+       return pvclock_clocksource_read_nowd(this_cpu_pvti()) - kvm_sched_clock_offset;
  }
  
  static inline void kvm_sched_clock_init(bool stable)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c

index 8779a7e..ed2d519 100644 (file)
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -602,7 +602,7 @@ static int x86_core_flags(void)
  #ifdef CONFIG_SCHED_SMT
  static int x86_smt_flags(void)
  {
-       return cpu_smt_flags() | x86_sched_itmt_flags();
+       return cpu_smt_flags();
  }
  #endif
  #ifdef CONFIG_SCHED_CLUSTER
@@ -613,50 +613,57 @@ static int x86_cluster_flags(void)
  #endif
  #endif
  
-static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-       { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-#ifdef CONFIG_SCHED_CLUSTER
-       { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
-#endif
-#ifdef CONFIG_SCHED_MC
-       { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
-#endif
-       { NULL, },
-};
+/*
+ * Set if a package/die has multiple NUMA nodes inside.
+ * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
+ * Sub-NUMA Clustering have this.
+ */
+static bool x86_has_numa_in_package;
  
-static struct sched_domain_topology_level x86_hybrid_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-       { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-#ifdef CONFIG_SCHED_MC
-       { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
-#endif
-       { cpu_cpu_mask, SD_INIT_NAME(DIE) },
-       { NULL, },
-};
+static struct sched_domain_topology_level x86_topology[6];
+
+static void __init build_sched_topology(void)
+{
+       int i = 0;
  
-static struct sched_domain_topology_level x86_topology[] = {
  #ifdef CONFIG_SCHED_SMT
-       { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
+       x86_topology[i++] = (struct sched_domain_topology_level){
+               cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT)
+       };
  #endif
  #ifdef CONFIG_SCHED_CLUSTER
-       { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) },
+       /*
+        * For now, skip the cluster domain on Hybrid.
+        */
+       if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) {
+               x86_topology[i++] = (struct sched_domain_topology_level){
+                       cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS)
+               };
+       }
  #endif
  #ifdef CONFIG_SCHED_MC
-       { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
+       x86_topology[i++] = (struct sched_domain_topology_level){
+               cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC)
+       };
  #endif
-       { cpu_cpu_mask, SD_INIT_NAME(DIE) },
-       { NULL, },
-};
+       /*
+        * When there is NUMA topology inside the package skip the DIE domain
+        * since the NUMA domains will auto-magically create the right spanning
+        * domains based on the SLIT.
+        */
+       if (!x86_has_numa_in_package) {
+               x86_topology[i++] = (struct sched_domain_topology_level){
+                       cpu_cpu_mask, SD_INIT_NAME(DIE)
+               };
+       }
  
-/*
- * Set if a package/die has multiple NUMA nodes inside.
- * AMD Magny-Cours, Intel Cluster-on-Die, and Intel
- * Sub-NUMA Clustering have this.
- */
-static bool x86_has_numa_in_package;
+       /*
+        * There must be one trailing NULL entry left.
+        */
+       BUG_ON(i >= ARRAY_SIZE(x86_topology)-1);
+
+       set_sched_topology(x86_topology);
+}
  
  void set_cpu_sibling_map(int cpu)
  {
@@ -1264,15 +1271,6 @@ void __init smp_prepare_cpus_common(void)
                 zalloc_cpumask_var(&per_cpu(cpu_l2c_shared_map, i), GFP_KERNEL);
         }
  
-       /*
-        * Set 'default' x86 topology, this matches default_topology() in that
-        * it has NUMA nodes as a topology level. See also
-        * native_smp_cpus_done().
-        *
-        * Must be done before set_cpus_sibling_map() is ran.
-        */
-       set_sched_topology(x86_topology);
-
         set_cpu_sibling_map(0);
  }
  
@@ -1393,13 +1391,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
         pr_debug("Boot done\n");
  
         calculate_max_logical_packages();
-
-       /* XXX for now assume numa-in-package and hybrid don't overlap */
-       if (x86_has_numa_in_package)
-               set_sched_topology(x86_numa_in_package_topology);
-       if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU))
-               set_sched_topology(x86_hybrid_topology);
-
+       build_sched_topology();
         nmi_selftest();
         impress_friends();
         cache_aps_init();
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c

index 1412b77..3425c6a 100644 (file)
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -69,12 +69,10 @@ static int __init tsc_early_khz_setup(char *buf)
  }
  early_param("tsc_early_khz", tsc_early_khz_setup);
  
-__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
+__always_inline void __cyc2ns_read(struct cyc2ns_data *data)
  {
         int seq, idx;
  
-       preempt_disable_notrace();
-
         do {
                 seq = this_cpu_read(cyc2ns.seq.seqcount.sequence);
                 idx = seq & 1;
@@ -86,6 +84,12 @@ __always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
         } while (unlikely(seq != this_cpu_read(cyc2ns.seq.seqcount.sequence)));
  }
  
+__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
+{
+       preempt_disable_notrace();
+       __cyc2ns_read(data);
+}
+
  __always_inline void cyc2ns_read_end(void)
  {
         preempt_enable_notrace();
@@ -115,18 +119,25 @@ __always_inline void cyc2ns_read_end(void)
   *                      -johnstul@us.ibm.com "math is hard, lets go shopping!"
   */
  
-static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
+static __always_inline unsigned long long __cycles_2_ns(unsigned long long cyc)
  {
         struct cyc2ns_data data;
         unsigned long long ns;
  
-       cyc2ns_read_begin(&data);
+       __cyc2ns_read(&data);
  
         ns = data.cyc2ns_offset;
         ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);
  
-       cyc2ns_read_end();
+       return ns;
+}
  
+static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+       unsigned long long ns;
+       preempt_disable_notrace();
+       ns = __cycles_2_ns(cyc);
+       preempt_enable_notrace();
         return ns;
  }
  
@@ -223,7 +234,7 @@ noinstr u64 native_sched_clock(void)
                 u64 tsc_now = rdtsc();
  
                 /* return the value in ns */
-               return cycles_2_ns(tsc_now);
+               return __cycles_2_ns(tsc_now);
         }
  
         /*
@@ -250,7 +261,7 @@ u64 native_sched_clock_from_tsc(u64 tsc)
  /* We need to define a real function for sched_clock, to override the
     weak default version */
  #ifdef CONFIG_PARAVIRT
-noinstr u64 sched_clock(void)
+noinstr u64 sched_clock_noinstr(void)
  {
         return paravirt_sched_clock();
  }
@@ -260,11 +271,20 @@ bool using_native_sched_clock(void)
         return static_call_query(pv_sched_clock) == native_sched_clock;
  }
  #else
-u64 sched_clock(void) __attribute__((alias("native_sched_clock")));
+u64 sched_clock_noinstr(void) __attribute__((alias("native_sched_clock")));
  
  bool using_native_sched_clock(void) { return true; }
  #endif
  
+notrace u64 sched_clock(void)
+{
+       u64 now;
+       preempt_disable_notrace();
+       now = sched_clock_noinstr();
+       preempt_enable_notrace();
+       return now;
+}
+
  int check_tsc_unstable(void)
  {
         return tsc_unstable;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c

index 04b57a3..bc68a39 100644 (file)
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2799,14 +2799,13 @@ static u64 read_tsc(void)
  static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
                           int *mode)
  {
-       long v;
         u64 tsc_pg_val;
+       long v;
  
         switch (clock->vclock_mode) {
         case VDSO_CLOCKMODE_HVCLOCK:
-               tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
-                                                 tsc_timestamp);
-               if (tsc_pg_val != U64_MAX) {
+               if (hv_read_tsc_page_tsc(hv_get_tsc_page(),
+                                        tsc_timestamp, &tsc_pg_val)) {
                         /* TSC page valid */
                         *mode = VDSO_CLOCKMODE_HVCLOCK;
                         v = (tsc_pg_val - clock->cycle_last) &
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c

index b74ac25..52fa560 100644 (file)
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -66,11 +66,10 @@ static noinstr u64 xen_sched_clock(void)
          struct pvclock_vcpu_time_info *src;
         u64 ret;
  
-       preempt_disable_notrace();
         src = &__this_cpu_read(xen_vcpu)->time;
         ret = pvclock_clocksource_read_nowd(src);
         ret -= xen_sched_clock_offset;
-       preempt_enable_notrace();
+
         return ret;
  }
  
diff --git a/drivers/clocksource/arm_arch_timer.c b/drivers/clocksource/arm_arch_timer.c

index e09d442..e733a2a 100644 (file)
--- a/drivers/clocksource/arm_arch_timer.c
+++ b/drivers/clocksource/arm_arch_timer.c
@@ -191,22 +191,40 @@ u32 arch_timer_reg_read(int access, enum arch_timer_reg reg,
         return val;
  }
  
-static notrace u64 arch_counter_get_cntpct_stable(void)
+static noinstr u64 raw_counter_get_cntpct_stable(void)
  {
         return __arch_counter_get_cntpct_stable();
  }
  
-static notrace u64 arch_counter_get_cntpct(void)
+static notrace u64 arch_counter_get_cntpct_stable(void)
+{
+       u64 val;
+       preempt_disable_notrace();
+       val = __arch_counter_get_cntpct_stable();
+       preempt_enable_notrace();
+       return val;
+}
+
+static noinstr u64 arch_counter_get_cntpct(void)
  {
         return __arch_counter_get_cntpct();
  }
  
-static notrace u64 arch_counter_get_cntvct_stable(void)
+static noinstr u64 raw_counter_get_cntvct_stable(void)
  {
         return __arch_counter_get_cntvct_stable();
  }
  
-static notrace u64 arch_counter_get_cntvct(void)
+static notrace u64 arch_counter_get_cntvct_stable(void)
+{
+       u64 val;
+       preempt_disable_notrace();
+       val = __arch_counter_get_cntvct_stable();
+       preempt_enable_notrace();
+       return val;
+}
+
+static noinstr u64 arch_counter_get_cntvct(void)
  {
         return __arch_counter_get_cntvct();
  }
@@ -753,14 +771,14 @@ static int arch_timer_set_next_event_phys(unsigned long evt,
         return 0;
  }
  
-static u64 arch_counter_get_cnt_mem(struct arch_timer *t, int offset_lo)
+static noinstr u64 arch_counter_get_cnt_mem(struct arch_timer *t, int offset_lo)
  {
         u32 cnt_lo, cnt_hi, tmp_hi;
  
         do {
-               cnt_hi = readl_relaxed(t->base + offset_lo + 4);
-               cnt_lo = readl_relaxed(t->base + offset_lo);
-               tmp_hi = readl_relaxed(t->base + offset_lo + 4);
+               cnt_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4));
+               cnt_lo = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo));
+               tmp_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4));
         } while (cnt_hi != tmp_hi);
  
         return ((u64) cnt_hi << 32) | cnt_lo;
@@ -1060,7 +1078,7 @@ bool arch_timer_evtstrm_available(void)
         return cpumask_test_cpu(raw_smp_processor_id(), &evtstrm_available);
  }
  
-static u64 arch_counter_get_cntvct_mem(void)
+static noinstr u64 arch_counter_get_cntvct_mem(void)
  {
         return arch_counter_get_cnt_mem(arch_timer_mem, CNTVCT_LO);
  }
@@ -1074,6 +1092,7 @@ struct arch_timer_kvm_info *arch_timer_get_kvm_info(void)
  
  static void __init arch_counter_register(unsigned type)
  {
+       u64 (*scr)(void);
         u64 start_count;
         int width;
  
@@ -1083,21 +1102,28 @@ static void __init arch_counter_register(unsigned type)
  
                 if ((IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) ||
                     arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) {
-                       if (arch_timer_counter_has_wa())
+                       if (arch_timer_counter_has_wa()) {
                                 rd = arch_counter_get_cntvct_stable;
-                       else
+                               scr = raw_counter_get_cntvct_stable;
+                       } else {
                                 rd = arch_counter_get_cntvct;
+                               scr = arch_counter_get_cntvct;
+                       }
                 } else {
-                       if (arch_timer_counter_has_wa())
+                       if (arch_timer_counter_has_wa()) {
                                 rd = arch_counter_get_cntpct_stable;
-                       else
+                               scr = raw_counter_get_cntpct_stable;
+                       } else {
                                 rd = arch_counter_get_cntpct;
+                               scr = arch_counter_get_cntpct;
+                       }
                 }
  
                 arch_timer_read_counter = rd;
                 clocksource_counter.vdso_clock_mode = vdso_default;
         } else {
                 arch_timer_read_counter = arch_counter_get_cntvct_mem;
+               scr = arch_counter_get_cntvct_mem;
         }
  
         width = arch_counter_get_width();
@@ -1113,7 +1139,7 @@ static void __init arch_counter_register(unsigned type)
         timecounter_init(&arch_timer_kvm_info.timecounter,
                          &cyclecounter, start_count);
  
-       sched_clock_register(arch_timer_read_counter, width, arch_timer_rate);
+       sched_clock_register(scr, width, arch_timer_rate);
  }
  
  static void arch_timer_stop(struct clock_event_device *clk)
diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c

index 9fc008c..e56307a 100644 (file)
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -365,6 +365,20 @@ void hv_stimer_global_cleanup(void)
  }
  EXPORT_SYMBOL_GPL(hv_stimer_global_cleanup);
  
+static __always_inline u64 read_hv_clock_msr(void)
+{
+       /*
+        * Read the partition counter to get the current tick count. This count
+        * is set to 0 when the partition is created and is incremented in 100
+        * nanosecond units.
+        *
+        * Use hv_raw_get_register() because this function is used from
+        * noinstr. Notably, while HV_REGISTER_TIME_REF_COUNT is a synthetic
+        * register it doesn't need the GHCB path.
+        */
+       return hv_raw_get_register(HV_REGISTER_TIME_REF_COUNT);
+}
+
  /*
   * Code and definitions for the Hyper-V clocksources.  Two
   * clocksources are defined: one that reads the Hyper-V defined MSR, and
@@ -393,14 +407,20 @@ struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
  }
  EXPORT_SYMBOL_GPL(hv_get_tsc_page);
  
-static u64 notrace read_hv_clock_tsc(void)
+static __always_inline u64 read_hv_clock_tsc(void)
  {
-       u64 current_tick = hv_read_tsc_page(hv_get_tsc_page());
+       u64 cur_tsc, time;
  
-       if (current_tick == U64_MAX)
-               current_tick = hv_get_register(HV_REGISTER_TIME_REF_COUNT);
+       /*
+        * The Hyper-V Top-Level Function Spec (TLFS), section Timers,
+        * subsection Reference Counter, guarantees that the TSC and MSR
+        * times are in sync and monotonic. Therefore we can fall back
+        * to the MSR in case the TSC page indicates unavailability.
+        */
+       if (!hv_read_tsc_page_tsc(tsc_page, &cur_tsc, &time))
+               time = read_hv_clock_msr();
  
-       return current_tick;
+       return time;
  }
  
  static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg)
@@ -408,7 +428,7 @@ static u64 notrace read_hv_clock_tsc_cs(struct clocksource *arg)
         return read_hv_clock_tsc();
  }
  
-static u64 notrace read_hv_sched_clock_tsc(void)
+static u64 noinstr read_hv_sched_clock_tsc(void)
  {
         return (read_hv_clock_tsc() - hv_sched_clock_offset) *
                 (NSEC_PER_SEC / HV_CLOCK_HZ);
@@ -460,16 +480,6 @@ static struct clocksource hyperv_cs_tsc = {
  #endif
  };
  
-static u64 notrace read_hv_clock_msr(void)
-{
-       /*
-        * Read the partition counter to get the current tick count. This count
-        * is set to 0 when the partition is created and is incremented in
-        * 100 nanosecond units.
-        */
-       return hv_get_register(HV_REGISTER_TIME_REF_COUNT);
-}
-
  static u64 notrace read_hv_clock_msr_cs(struct clocksource *arg)
  {
         return read_hv_clock_msr();
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c

index 8e929f6..737a026 100644 (file)
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -145,7 +145,7 @@ static noinstr void enter_s2idle_proper(struct cpuidle_driver *drv,
  
         instrumentation_begin();
  
-       time_start = ns_to_ktime(local_clock());
+       time_start = ns_to_ktime(local_clock_noinstr());
  
         tick_freeze();
         /*
@@ -169,7 +169,7 @@ static noinstr void enter_s2idle_proper(struct cpuidle_driver *drv,
         tick_unfreeze();
         start_critical_timings();
  
-       time_end = ns_to_ktime(local_clock());
+       time_end = ns_to_ktime(local_clock_noinstr());
  
         dev->states_usage[index].s2idle_time += ktime_us_delta(time_end, time_start);
         dev->states_usage[index].s2idle_usage++;
@@ -243,7 +243,7 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev,
         sched_idle_set_state(target_state);
  
         trace_cpu_idle(index, dev->cpu);
-       time_start = ns_to_ktime(local_clock());
+       time_start = ns_to_ktime(local_clock_noinstr());
  
         stop_critical_timings();
         if (!(target_state->flags & CPUIDLE_FLAG_RCU_IDLE)) {
@@ -276,7 +276,7 @@ noinstr int cpuidle_enter_state(struct cpuidle_device *dev,
         start_critical_timings();
  
         sched_clock_idle_wakeup_event();
-       time_end = ns_to_ktime(local_clock());
+       time_end = ns_to_ktime(local_clock_noinstr());
         trace_cpu_idle(PWR_EVENT_EXIT, dev->cpu);
  
         /* The cpu is no longer idle or about to enter idle. */
diff --git a/drivers/cpuidle/poll_state.c b/drivers/cpuidle/poll_state.c

index bdcfeae..9b6d90a 100644 (file)
--- a/drivers/cpuidle/poll_state.c
+++ b/drivers/cpuidle/poll_state.c
@@ -15,7 +15,7 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev,
  {
         u64 time_start;
  
-       time_start = local_clock();
+       time_start = local_clock_noinstr();
  
         dev->poll_time_limit = false;
  
@@ -32,7 +32,7 @@ static int __cpuidle poll_idle(struct cpuidle_device *dev,
                                 continue;
  
                         loop_count = 0;
-                       if (local_clock() - time_start > limit) {
+                       if (local_clock_noinstr() - time_start > limit) {
                                 dev->poll_time_limit = true;
                                 break;
                         }
diff --git a/include/clocksource/hyperv_timer.h b/include/clocksource/hyperv_timer.h

index 536f897..6cdc873 100644 (file)
--- a/include/clocksource/hyperv_timer.h
+++ b/include/clocksource/hyperv_timer.h
@@ -38,8 +38,9 @@ extern void hv_remap_tsc_clocksource(void);
  extern unsigned long hv_get_tsc_pfn(void);
  extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void);
  
-static inline notrace u64
-hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc)
+static __always_inline bool
+hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
+                    u64 *cur_tsc, u64 *time)
  {
         u64 scale, offset;
         u32 sequence;
@@ -63,7 +64,7 @@ hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc)
         do {
                 sequence = READ_ONCE(tsc_pg->tsc_sequence);
                 if (!sequence)
-                       return U64_MAX;
+                       return false;
                 /*
                  * Make sure we read sequence before we read other values from
                  * TSC page.
@@ -82,15 +83,8 @@ hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc)
  
         } while (READ_ONCE(tsc_pg->tsc_sequence) != sequence);
  
-       return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset;
-}
-
-static inline notrace u64
-hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg)
-{
-       u64 cur_tsc;
-
-       return hv_read_tsc_page_tsc(tsc_pg, &cur_tsc);
+       *time = mul_u64_u64_shr(*cur_tsc, scale, 64) + offset;
+       return true;
  }
  
  #else /* CONFIG_HYPERV_TIMER */
@@ -104,10 +98,10 @@ static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void)
         return NULL;
  }
  
-static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
-                                      u64 *cur_tsc)
+static __always_inline bool
+hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, u64 *cur_tsc, u64 *time)
  {
-       return U64_MAX;
+       return false;
  }
  
  static inline int hv_stimer_cleanup(unsigned int cpu) { return 0; }
diff --git a/include/linux/kthread.h b/include/linux/kthread.h

index 30e5bec..f1f95a7 100644 (file)
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -89,6 +89,7 @@ int kthread_stop(struct task_struct *k);
  bool kthread_should_stop(void);
  bool kthread_should_park(void);
  bool __kthread_should_park(struct task_struct *k);
+bool kthread_should_stop_or_park(void);
  bool kthread_freezable_should_stop(bool *was_frozen);
  void *kthread_func(struct task_struct *k);
  void *kthread_data(struct task_struct *k);
diff --git a/include/linux/math64.h b/include/linux/math64.h

index 8b9191a..bf74478 100644 (file)
--- a/include/linux/math64.h
+++ b/include/linux/math64.h
@@ -168,7 +168,7 @@ static __always_inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
  #endif /* mul_u64_u32_shr */
  
  #ifndef mul_u64_u64_shr
-static inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift)
+static __always_inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift)
  {
         return (u64)(((unsigned __int128)a * mul) >> shift);
  }
diff --git a/include/linux/rbtree_latch.h b/include/linux/rbtree_latch.h

index 3d1a9e7..6a0999c 100644 (file)
--- a/include/linux/rbtree_latch.h
+++ b/include/linux/rbtree_latch.h
@@ -206,7 +206,7 @@ latch_tree_find(void *key, struct latch_tree_root *root,
         do {
                 seq = raw_read_seqcount_latch(&root->seq);
                 node = __lt_find(key, root, seq & 1, ops->comp);
-       } while (read_seqcount_latch_retry(&root->seq, seq));
+       } while (raw_read_seqcount_latch_retry(&root->seq, seq));
  
         return node;
  }
diff --git a/include/linux/sched.h b/include/linux/sched.h

index eed5d65..1292d38 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2006,15 +2006,12 @@ static __always_inline void scheduler_ipi(void)
          */
         preempt_fold_need_resched();
  }
-extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
  #else
  static inline void scheduler_ipi(void) { }
-static inline unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
-{
-       return 1;
-}
  #endif
  
+extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
+
  /*
   * Set thread flags in other task's structures.
   * See asm/thread_info.h for TIF_xxxx flags available:
diff --git a/include/linux/sched/clock.h b/include/linux/sched/clock.h

index ca008f7..196f0ca 100644 (file)
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -12,7 +12,16 @@
   *
   * Please use one of the three interfaces below.
   */
-extern unsigned long long notrace sched_clock(void);
+extern u64 sched_clock(void);
+
+#if defined(CONFIG_ARCH_WANTS_NO_INSTR) || defined(CONFIG_GENERIC_SCHED_CLOCK)
+extern u64 sched_clock_noinstr(void);
+#else
+static __always_inline u64 sched_clock_noinstr(void)
+{
+       return sched_clock();
+}
+#endif
  
  /*
   * See the comment in kernel/sched/clock.c
@@ -45,6 +54,11 @@ static inline u64 cpu_clock(int cpu)
         return sched_clock();
  }
  
+static __always_inline u64 local_clock_noinstr(void)
+{
+       return sched_clock_noinstr();
+}
+
  static __always_inline u64 local_clock(void)
  {
         return sched_clock();
@@ -79,6 +93,7 @@ static inline u64 cpu_clock(int cpu)
         return sched_clock_cpu(cpu);
  }
  
+extern u64 local_clock_noinstr(void);
  extern u64 local_clock(void);
  
  #endif
diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h

index 57bde66..fad77b5 100644 (file)
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -132,12 +132,9 @@ SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
  /*
   * Place busy tasks earlier in the domain
   *
- * SHARED_CHILD: Usually set on the SMT level. Technically could be set further
- *               up, but currently assumed to be set from the base domain
- *               upwards (see update_top_cache_domain()).
   * NEEDS_GROUPS: Load balancing flag.
   */
-SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
+SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS)
  
  /*
   * Prefer to place tasks in a sibling domain
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h

index 816df6c..67b573d 100644 (file)
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -203,7 +203,7 @@ struct sched_domain_topology_level {
  #endif
  };
  
-extern void set_sched_topology(struct sched_domain_topology_level *tl);
+extern void __init set_sched_topology(struct sched_domain_topology_level *tl);
  
  #ifdef CONFIG_SCHED_DEBUG
  # define SD_INIT_NAME(type)            .name = #type
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h

index 3926e90..987a59d 100644 (file)
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -671,9 +671,9 @@ typedef struct {
   *
   * Return: sequence counter raw value. Use the lowest bit as an index for
   * picking which data copy to read. The full counter must then be checked
- * with read_seqcount_latch_retry().
+ * with raw_read_seqcount_latch_retry().
   */
-static inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
+static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
  {
         /*
          * Pairs with the first smp_wmb() in raw_write_seqcount_latch().
@@ -683,16 +683,17 @@ static inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
  }
  
  /**
- * read_seqcount_latch_retry() - end a seqcount_latch_t read section
+ * raw_read_seqcount_latch_retry() - end a seqcount_latch_t read section
   * @s:         Pointer to seqcount_latch_t
   * @start:     count, from raw_read_seqcount_latch()
   *
   * Return: true if a read section retry is required, else false
   */
-static inline int
-read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
+static __always_inline int
+raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
  {
-       return read_seqcount_retry(&s->seqcount, start);
+       smp_rmb();
+       return unlikely(READ_ONCE(s->seqcount.sequence) != start);
  }
  
  /**
@@ -752,7 +753,7 @@ read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
   *                     entry = data_query(latch->data[idx], ...);
   *
   *             // This includes needed smp_rmb()
- *             } while (read_seqcount_latch_retry(&latch->seq, seq));
+ *             } while (raw_read_seqcount_latch_retry(&latch->seq, seq));
   *
   *             return entry;
   *     }
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c

index 4d42f0c..8f917f6 100644 (file)
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3891,6 +3891,14 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
         return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
  }
  
+static int cgroup_pressure_open(struct kernfs_open_file *of)
+{
+       if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
+               return -EPERM;
+
+       return 0;
+}
+
  static void cgroup_pressure_release(struct kernfs_open_file *of)
  {
         struct cgroup_file_ctx *ctx = of->priv;
@@ -5290,6 +5298,7 @@ static struct cftype cgroup_psi_files[] = {
         {
                 .name = "io.pressure",
                 .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
+               .open = cgroup_pressure_open,
                 .seq_show = cgroup_io_pressure_show,
                 .write = cgroup_io_pressure_write,
                 .poll = cgroup_pressure_poll,
@@ -5298,6 +5307,7 @@ static struct cftype cgroup_psi_files[] = {
         {
                 .name = "memory.pressure",
                 .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
+               .open = cgroup_pressure_open,
                 .seq_show = cgroup_memory_pressure_show,
                 .write = cgroup_memory_pressure_write,
                 .poll = cgroup_pressure_poll,
@@ -5306,6 +5316,7 @@ static struct cftype cgroup_psi_files[] = {
         {
                 .name = "cpu.pressure",
                 .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
+               .open = cgroup_pressure_open,
                 .seq_show = cgroup_cpu_pressure_show,
                 .write = cgroup_cpu_pressure_write,
                 .poll = cgroup_pressure_poll,
@@ -5315,6 +5326,7 @@ static struct cftype cgroup_psi_files[] = {
         {
                 .name = "irq.pressure",
                 .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+               .open = cgroup_pressure_open,
                 .seq_show = cgroup_irq_pressure_show,
                 .write = cgroup_irq_pressure_write,
                 .poll = cgroup_pressure_poll,
diff --git a/kernel/kthread.c b/kernel/kthread.c

index 490792b..07a0570 100644 (file)
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -182,6 +182,16 @@ bool kthread_should_park(void)
  }
  EXPORT_SYMBOL_GPL(kthread_should_park);
  
+bool kthread_should_stop_or_park(void)
+{
+       struct kthread *kthread = __to_kthread(current);
+
+       if (!kthread)
+               return false;
+
+       return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK));
+}
+
  /**
   * kthread_freezable_should_stop - should this freezable kthread return now?
   * @was_frozen: optional out parameter, indicates whether %current was frozen
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c

index 6a333ad..357a4d1 100644 (file)
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -528,7 +528,7 @@ static u64 latched_seq_read_nolock(struct latched_seq *ls)
                 seq = raw_read_seqcount_latch(&ls->latch);
                 idx = seq & 0x1;
                 val = ls->val[idx];
-       } while (read_seqcount_latch_retry(&ls->latch, seq));
+       } while (raw_read_seqcount_latch_retry(&ls->latch, seq));
  
         return val;
  }
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c

index b5cc2b5..5a575a0 100644 (file)
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -266,7 +266,7 @@ static __always_inline u64 sched_clock_local(struct sched_clock_data *scd)
         s64 delta;
  
  again:
-       now = sched_clock();
+       now = sched_clock_noinstr();
         delta = now - scd->tick_raw;
         if (unlikely(delta < 0))
                 delta = 0;
@@ -293,22 +293,29 @@ again:
         return clock;
  }
  
-noinstr u64 local_clock(void)
+noinstr u64 local_clock_noinstr(void)
  {
         u64 clock;
  
         if (static_branch_likely(&__sched_clock_stable))
-               return sched_clock() + __sched_clock_offset;
+               return sched_clock_noinstr() + __sched_clock_offset;
  
         if (!static_branch_likely(&sched_clock_running))
-               return sched_clock();
+               return sched_clock_noinstr();
  
-       preempt_disable_notrace();
         clock = sched_clock_local(this_scd());
-       preempt_enable_notrace();
  
         return clock;
  }
+
+u64 local_clock(void)
+{
+       u64 now;
+       preempt_disable_notrace();
+       now = local_clock_noinstr();
+       preempt_enable_notrace();
+       return now;
+}
  EXPORT_SYMBOL_GPL(local_clock);
  
  static notrace u64 sched_clock_remote(struct sched_clock_data *scd)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index a68d127..7eb6e29 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2213,6 +2213,154 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
                 rq_clock_skip_update(rq);
  }
  
+static __always_inline
+int __task_state_match(struct task_struct *p, unsigned int state)
+{
+       if (READ_ONCE(p->__state) & state)
+               return 1;
+
+#ifdef CONFIG_PREEMPT_RT
+       if (READ_ONCE(p->saved_state) & state)
+               return -1;
+#endif
+       return 0;
+}
+
+static __always_inline
+int task_state_match(struct task_struct *p, unsigned int state)
+{
+#ifdef CONFIG_PREEMPT_RT
+       int match;
+
+       /*
+        * Serialize against current_save_and_set_rtlock_wait_state() and
+        * current_restore_rtlock_saved_state().
+        */
+       raw_spin_lock_irq(&p->pi_lock);
+       match = __task_state_match(p, state);
+       raw_spin_unlock_irq(&p->pi_lock);
+
+       return match;
+#else
+       return __task_state_match(p, state);
+#endif
+}
+
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * Wait for the thread to block in any of the states set in @match_state.
+ * If it changes, i.e. @p might have woken up, then return zero.  When we
+ * succeed in waiting for @p to be off its CPU, we return a positive number
+ * (its total switch count).  If a second call a short while later returns the
+ * same number, the caller can be sure that @p has remained unscheduled the
+ * whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
+{
+       int running, queued, match;
+       struct rq_flags rf;
+       unsigned long ncsw;
+       struct rq *rq;
+
+       for (;;) {
+               /*
+                * We do the initial early heuristics without holding
+                * any task-queue locks at all. We'll only try to get
+                * the runqueue lock when things look like they will
+                * work out!
+                */
+               rq = task_rq(p);
+
+               /*
+                * If the task is actively running on another CPU
+                * still, just relax and busy-wait without holding
+                * any locks.
+                *
+                * NOTE! Since we don't hold any locks, it's not
+                * even sure that "rq" stays as the right runqueue!
+                * But we don't care, since "task_on_cpu()" will
+                * return false if the runqueue has changed and p
+                * is actually now running somewhere else!
+                */
+               while (task_on_cpu(rq, p)) {
+                       if (!task_state_match(p, match_state))
+                               return 0;
+                       cpu_relax();
+               }
+
+               /*
+                * Ok, time to look more closely! We need the rq
+                * lock now, to be *sure*. If we're wrong, we'll
+                * just go back and repeat.
+                */
+               rq = task_rq_lock(p, &rf);
+               trace_sched_wait_task(p);
+               running = task_on_cpu(rq, p);
+               queued = task_on_rq_queued(p);
+               ncsw = 0;
+               if ((match = __task_state_match(p, match_state))) {
+                       /*
+                        * When matching on p->saved_state, consider this task
+                        * still queued so it will wait.
+                        */
+                       if (match < 0)
+                               queued = 1;
+                       ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+               }
+               task_rq_unlock(rq, p, &rf);
+
+               /*
+                * If it changed from the expected state, bail out now.
+                */
+               if (unlikely(!ncsw))
+                       break;
+
+               /*
+                * Was it really running after all now that we
+                * checked with the proper locks actually held?
+                *
+                * Oops. Go back and try again..
+                */
+               if (unlikely(running)) {
+                       cpu_relax();
+                       continue;
+               }
+
+               /*
+                * It's not enough that it's not actively running,
+                * it must be off the runqueue _entirely_, and not
+                * preempted!
+                *
+                * So if it was still runnable (but just not actively
+                * running right now), it's preempted, and we should
+                * yield - it could be a while.
+                */
+               if (unlikely(queued)) {
+                       ktime_t to = NSEC_PER_SEC / HZ;
+
+                       set_current_state(TASK_UNINTERRUPTIBLE);
+                       schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
+                       continue;
+               }
+
+               /*
+                * Ahh, all good. It wasn't running, and it wasn't
+                * runnable, which means that it will never become
+                * running in the future either. We're all done!
+                */
+               break;
+       }
+
+       return ncsw;
+}
+
  #ifdef CONFIG_SMP
  
  static void
@@ -2398,7 +2546,6 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
         if (!is_cpu_allowed(p, dest_cpu))
                 return rq;
  
-       update_rq_clock(rq);
         rq = move_queued_task(rq, rf, p, dest_cpu);
  
         return rq;
@@ -2456,10 +2603,12 @@ static int migration_cpu_stop(void *data)
                                 goto out;
                 }
  
-               if (task_on_rq_queued(p))
+               if (task_on_rq_queued(p)) {
+                       update_rq_clock(rq);
                         rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
-               else
+               } else {
                         p->wake_cpu = arg->dest_cpu;
+               }
  
                 /*
                  * XXX __migrate_task() can fail, at which point we might end
@@ -3341,114 +3490,6 @@ out:
  }
  #endif /* CONFIG_NUMA_BALANCING */
  
-/*
- * wait_task_inactive - wait for a thread to unschedule.
- *
- * Wait for the thread to block in any of the states set in @match_state.
- * If it changes, i.e. @p might have woken up, then return zero.  When we
- * succeed in waiting for @p to be off its CPU, we return a positive number
- * (its total switch count).  If a second call a short while later returns the
- * same number, the caller can be sure that @p has remained unscheduled the
- * whole time.
- *
- * The caller must ensure that the task *will* unschedule sometime soon,
- * else this function might spin for a *long* time. This function can't
- * be called with interrupts off, or it may introduce deadlock with
- * smp_call_function() if an IPI is sent by the same process we are
- * waiting to become inactive.
- */
-unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
-{
-       int running, queued;
-       struct rq_flags rf;
-       unsigned long ncsw;
-       struct rq *rq;
-
-       for (;;) {
-               /*
-                * We do the initial early heuristics without holding
-                * any task-queue locks at all. We'll only try to get
-                * the runqueue lock when things look like they will
-                * work out!
-                */
-               rq = task_rq(p);
-
-               /*
-                * If the task is actively running on another CPU
-                * still, just relax and busy-wait without holding
-                * any locks.
-                *
-                * NOTE! Since we don't hold any locks, it's not
-                * even sure that "rq" stays as the right runqueue!
-                * But we don't care, since "task_on_cpu()" will
-                * return false if the runqueue has changed and p
-                * is actually now running somewhere else!
-                */
-               while (task_on_cpu(rq, p)) {
-                       if (!(READ_ONCE(p->__state) & match_state))
-                               return 0;
-                       cpu_relax();
-               }
-
-               /*
-                * Ok, time to look more closely! We need the rq
-                * lock now, to be *sure*. If we're wrong, we'll
-                * just go back and repeat.
-                */
-               rq = task_rq_lock(p, &rf);
-               trace_sched_wait_task(p);
-               running = task_on_cpu(rq, p);
-               queued = task_on_rq_queued(p);
-               ncsw = 0;
-               if (READ_ONCE(p->__state) & match_state)
-                       ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
-               task_rq_unlock(rq, p, &rf);
-
-               /*
-                * If it changed from the expected state, bail out now.
-                */
-               if (unlikely(!ncsw))
-                       break;
-
-               /*
-                * Was it really running after all now that we
-                * checked with the proper locks actually held?
-                *
-                * Oops. Go back and try again..
-                */
-               if (unlikely(running)) {
-                       cpu_relax();
-                       continue;
-               }
-
-               /*
-                * It's not enough that it's not actively running,
-                * it must be off the runqueue _entirely_, and not
-                * preempted!
-                *
-                * So if it was still runnable (but just not actively
-                * running right now), it's preempted, and we should
-                * yield - it could be a while.
-                */
-               if (unlikely(queued)) {
-                       ktime_t to = NSEC_PER_SEC / HZ;
-
-                       set_current_state(TASK_UNINTERRUPTIBLE);
-                       schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
-                       continue;
-               }
-
-               /*
-                * Ahh, all good. It wasn't running, and it wasn't
-                * runnable, which means that it will never become
-                * running in the future either. We're all done!
-                */
-               break;
-       }
-
-       return ncsw;
-}
-
  /***
   * kick_process - kick a running thread to enter/exit the kernel
   * @p: the to-be-kicked thread
@@ -4003,15 +4044,14 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  static __always_inline
  bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
  {
+       int match;
+
         if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
                 WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
                              state != TASK_RTLOCK_WAIT);
         }
  
-       if (READ_ONCE(p->__state) & state) {
-               *success = 1;
-               return true;
-       }
+       *success = !!(match = __task_state_match(p, state));
  
  #ifdef CONFIG_PREEMPT_RT
         /*
@@ -4027,12 +4067,10 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
          * p::saved_state to TASK_RUNNING so any further tests will
          * not result in false positives vs. @success
          */
-       if (p->saved_state & state) {
+       if (match < 0)
                 p->saved_state = TASK_RUNNING;
-               *success = 1;
-       }
  #endif
-       return false;
+       return match > 0;
  }
  
  /*
@@ -9548,6 +9586,7 @@ void set_rq_offline(struct rq *rq)
         if (rq->online) {
                 const struct sched_class *class;
  
+               update_rq_clock(rq);
                 for_each_class(class) {
                         if (class->rq_offline)
                                 class->rq_offline(rq);
@@ -9689,7 +9728,6 @@ int sched_cpu_deactivate(unsigned int cpu)
  
         rq_lock_irqsave(rq, &rf);
         if (rq->rd) {
-               update_rq_clock(rq);
                 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                 set_rq_offline(rq);
         }
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c

index e321145..4492608 100644 (file)
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -155,10 +155,11 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
  
  static void sugov_get_util(struct sugov_cpu *sg_cpu)
  {
+       unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu);
         struct rq *rq = cpu_rq(sg_cpu->cpu);
  
         sg_cpu->bw_dl = cpu_bw_dl(rq);
-       sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
+       sg_cpu->util = effective_cpu_util(sg_cpu->cpu, util,
                                           FREQUENCY_UTIL, NULL);
  }
  
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c

index 5a9a4b8..e41a36b 100644 (file)
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -489,13 +489,6 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
  
  static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
  
-void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
-{
-       raw_spin_lock_init(&dl_b->dl_runtime_lock);
-       dl_b->dl_period = period;
-       dl_b->dl_runtime = runtime;
-}
-
  void init_dl_bw(struct dl_bw *dl_b)
  {
         raw_spin_lock_init(&dl_b->lock);
@@ -1260,43 +1253,39 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
  }
  
  /*
- * This function implements the GRUB accounting rule:
- * according to the GRUB reclaiming algorithm, the runtime is
- * not decreased as "dq = -dt", but as
- * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt",
+ * This function implements the GRUB accounting rule. According to the
+ * GRUB reclaiming algorithm, the runtime is not decreased as "dq = -dt",
+ * but as "dq = -(max{u, (Umax - Uinact - Uextra)} / Umax) dt",
   * where u is the utilization of the task, Umax is the maximum reclaimable
   * utilization, Uinact is the (per-runqueue) inactive utilization, computed
   * as the difference between the "total runqueue utilization" and the
- * runqueue active utilization, and Uextra is the (per runqueue) extra
+ * "runqueue active utilization", and Uextra is the (per runqueue) extra
   * reclaimable utilization.
- * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
- * multiplied by 2^BW_SHIFT, the result has to be shifted right by
- * BW_SHIFT.
- * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT,
- * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
- * Since delta is a 64 bit variable, to have an overflow its value
- * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
- * So, overflow is not an issue here.
+ * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied
+ * by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT.
+ * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, u_act
+ * is multiplied by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
+ * Since delta is a 64 bit variable, to have an overflow its value should be
+ * larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is
+ * not an issue here.
   */
  static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
  {
-       u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
         u64 u_act;
-       u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT;
+       u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
  
         /*
-        * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)},
-        * we compare u_inact + rq->dl.extra_bw with
-        * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because
-        * u_inact + rq->dl.extra_bw can be larger than
-        * 1 * (so, 1 - u_inact - rq->dl.extra_bw would be negative
-        * leading to wrong results)
+        * Instead of computing max{u, (u_max - u_inact - u_extra)}, we
+        * compare u_inact + u_extra with u_max - u, because u_inact + u_extra
+        * can be larger than u_max. So, u_max - u_inact - u_extra would be
+        * negative leading to wrong results.
          */
-       if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min)
-               u_act = u_act_min;
+       if (u_inact + rq->dl.extra_bw > rq->dl.max_bw - dl_se->dl_bw)
+               u_act = dl_se->dl_bw;
         else
-               u_act = BW_UNIT - u_inact - rq->dl.extra_bw;
+               u_act = rq->dl.max_bw - u_inact - rq->dl.extra_bw;
  
+       u_act = (u_act * rq->dl.bw_ratio) >> RATIO_SHIFT;
         return (delta * u_act) >> BW_SHIFT;
  }
  
@@ -2795,12 +2784,12 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
  {
         if (global_rt_runtime() == RUNTIME_INF) {
                 dl_rq->bw_ratio = 1 << RATIO_SHIFT;
-               dl_rq->extra_bw = 1 << BW_SHIFT;
+               dl_rq->max_bw = dl_rq->extra_bw = 1 << BW_SHIFT;
         } else {
                 dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
                           global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
-               dl_rq->extra_bw = to_ratio(global_rt_period(),
-                                                   global_rt_runtime());
+               dl_rq->max_bw = dl_rq->extra_bw =
+                       to_ratio(global_rt_period(), global_rt_runtime());
         }
  }
  
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c

index 0b2340a..066ff1c 100644 (file)
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -777,7 +777,7 @@ static void print_cpu(struct seq_file *m, int cpu)
  #define P(x)                                                           \
  do {                                                                   \
         if (sizeof(rq->x) == 4)                                         \
-               SEQ_printf(m, "  .%-30s: %ld\n", #x, (long)(rq->x));    \
+               SEQ_printf(m, "  .%-30s: %d\n", #x, (int)(rq->x));      \
         else                                                            \
                 SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rq->x));\
  } while (0)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 373ff5f..a80a739 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1064,6 +1064,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
   * Scheduling class queueing methods:
   */
  
+static inline bool is_core_idle(int cpu)
+{
+#ifdef CONFIG_SCHED_SMT
+       int sibling;
+
+       for_each_cpu(sibling, cpu_smt_mask(cpu)) {
+               if (cpu == sibling)
+                       continue;
+
+               if (!idle_cpu(sibling))
+                       return false;
+       }
+#endif
+
+       return true;
+}
+
  #ifdef CONFIG_NUMA
  #define NUMA_IMBALANCE_MIN 2
  
@@ -1700,23 +1717,6 @@ struct numa_stats {
         int idle_cpu;
  };
  
-static inline bool is_core_idle(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
-       int sibling;
-
-       for_each_cpu(sibling, cpu_smt_mask(cpu)) {
-               if (cpu == sibling)
-                       continue;
-
-               if (!idle_cpu(sibling))
-                       return false;
-       }
-#endif
-
-       return true;
-}
-
  struct task_numa_env {
         struct task_struct *p;
  
@@ -5577,6 +5577,14 @@ static void __cfsb_csd_unthrottle(void *arg)
         rq_lock(rq, &rf);
  
         /*
+        * Iterating over the list can trigger several calls to
+        * update_rq_clock() in unthrottle_cfs_rq().
+        * Do it once and skip the potential next ones.
+        */
+       update_rq_clock(rq);
+       rq_clock_start_loop_update(rq);
+
+       /*
          * Since we hold rq lock we're safe from concurrent manipulation of
          * the CSD list. However, this RCU critical section annotates the
          * fact that we pair with sched_free_group_rcu(), so that we cannot
@@ -5595,6 +5603,7 @@ static void __cfsb_csd_unthrottle(void *arg)
  
         rcu_read_unlock();
  
+       rq_clock_stop_loop_update(rq);
         rq_unlock(rq, &rf);
  }
  
@@ -6115,6 +6124,13 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
  
         lockdep_assert_rq_held(rq);
  
+       /*
+        * The rq clock has already been updated in
+        * set_rq_offline(), so we should skip updating
+        * the rq clock again in unthrottle_cfs_rq().
+        */
+       rq_clock_start_loop_update(rq);
+
         rcu_read_lock();
         list_for_each_entry_rcu(tg, &task_groups, list) {
                 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
@@ -6137,6 +6153,8 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
                         unthrottle_cfs_rq(cfs_rq);
         }
         rcu_read_unlock();
+
+       rq_clock_stop_loop_update(rq);
  }
  
  #else /* CONFIG_CFS_BANDWIDTH */
@@ -7202,14 +7220,58 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
         return target;
  }
  
-/*
- * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu
- * (@dst_cpu = -1) or migrated to @dst_cpu.
- */
-static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
+/**
+ * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks.
+ * @cpu: the CPU to get the utilization for
+ * @p: task for which the CPU utilization should be predicted or NULL
+ * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL
+ * @boost: 1 to enable boosting, otherwise 0
+ *
+ * The unit of the return value must be the same as the one of CPU capacity
+ * so that CPU utilization can be compared with CPU capacity.
+ *
+ * CPU utilization is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on that CPU.
+ * It represents the amount of CPU capacity currently used by CFS tasks in
+ * the range [0..max CPU capacity] with max CPU capacity being the CPU
+ * capacity at f_max.
+ *
+ * The estimated CPU utilization is defined as the maximum between CPU
+ * utilization and sum of the estimated utilization of the currently
+ * runnable tasks on that CPU. It preserves a utilization "snapshot" of
+ * previously-executed tasks, which helps better deduce how busy a CPU will
+ * be when a long-sleeping task wakes up. The contribution to CPU utilization
+ * of such a task would be significantly decayed at this point of time.
+ *
+ * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization).
+ * CPU contention for CFS tasks can be detected by CPU runnable > CPU
+ * utilization. Boosting is implemented in cpu_util() so that internal
+ * users (e.g. EAS) can use it next to external users (e.g. schedutil),
+ * latter via cpu_util_cfs_boost().
+ *
+ * CPU utilization can be higher than the current CPU capacity
+ * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
+ * of rounding errors as well as task migrations or wakeups of new tasks.
+ * CPU utilization has to be capped to fit into the [0..max CPU capacity]
+ * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
+ * could be seen as over-utilized even though CPU1 has 20% of spare CPU
+ * capacity. CPU utilization is allowed to overshoot current CPU capacity
+ * though since this is useful for predicting the CPU capacity required
+ * after task migrations (scheduler-driven DVFS).
+ *
+ * Return: (Boosted) (estimated) utilization for the specified CPU.
+ */
+static unsigned long
+cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost)
  {
         struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
         unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
+       unsigned long runnable;
+
+       if (boost) {
+               runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
+               util = max(util, runnable);
+       }
  
         /*
          * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
@@ -7217,9 +7279,9 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
          * contribution. In all the other cases @cpu is not impacted by the
          * migration so its util_avg is already correct.
          */
-       if (task_cpu(p) == cpu && dst_cpu != cpu)
+       if (p && task_cpu(p) == cpu && dst_cpu != cpu)
                 lsub_positive(&util, task_util(p));
-       else if (task_cpu(p) != cpu && dst_cpu == cpu)
+       else if (p && task_cpu(p) != cpu && dst_cpu == cpu)
                 util += task_util(p);
  
         if (sched_feat(UTIL_EST)) {
@@ -7227,6 +7289,9 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
  
                 util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
  
+               if (boost)
+                       util_est = max(util_est, runnable);
+
                 /*
                  * During wake-up @p isn't enqueued yet and doesn't contribute
                  * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
@@ -7255,7 +7320,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
                  */
                 if (dst_cpu == cpu)
                         util_est += _task_util_est(p);
-               else if (unlikely(task_on_rq_queued(p) || current == p))
+               else if (p && unlikely(task_on_rq_queued(p) || current == p))
                         lsub_positive(&util_est, _task_util_est(p));
  
                 util = max(util, util_est);
@@ -7264,6 +7329,16 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
         return min(util, capacity_orig_of(cpu));
  }
  
+unsigned long cpu_util_cfs(int cpu)
+{
+       return cpu_util(cpu, NULL, -1, 0);
+}
+
+unsigned long cpu_util_cfs_boost(int cpu)
+{
+       return cpu_util(cpu, NULL, -1, 1);
+}
+
  /*
   * cpu_util_without: compute cpu utilization without any contributions from *p
   * @cpu: the CPU which utilization is requested
@@ -7281,9 +7356,9 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
  {
         /* Task has no contribution or is new */
         if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
-               return cpu_util_cfs(cpu);
+               p = NULL;
  
-       return cpu_util_next(cpu, p, -1);
+       return cpu_util(cpu, p, -1, 0);
  }
  
  /*
@@ -7330,7 +7405,7 @@ static inline void eenv_task_busy_time(struct energy_env *eenv,
   * cpu_capacity.
   *
   * The contribution of the task @p for which we want to estimate the
- * energy cost is removed (by cpu_util_next()) and must be calculated
+ * energy cost is removed (by cpu_util()) and must be calculated
   * separately (see eenv_task_busy_time). This ensures:
   *
   *   - A stable PD utilization, no matter which CPU of that PD we want to place
@@ -7351,7 +7426,7 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv,
         int cpu;
  
         for_each_cpu(cpu, pd_cpus) {
-               unsigned long util = cpu_util_next(cpu, p, -1);
+               unsigned long util = cpu_util(cpu, p, -1, 0);
  
                 busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL);
         }
@@ -7375,8 +7450,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
  
         for_each_cpu(cpu, pd_cpus) {
                 struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
-               unsigned long util = cpu_util_next(cpu, p, dst_cpu);
-               unsigned long cpu_util;
+               unsigned long util = cpu_util(cpu, p, dst_cpu, 1);
+               unsigned long eff_util;
  
                 /*
                  * Performance domain frequency: utilization clamping
@@ -7385,8 +7460,8 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus,
                  * NOTE: in case RT tasks are running, by default the
                  * FREQUENCY_UTIL's utilization can be max OPP.
                  */
-               cpu_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
-               max_util = max(max_util, cpu_util);
+               eff_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
+               max_util = max(max_util, eff_util);
         }
  
         return min(max_util, eenv->cpu_cap);
@@ -7521,7 +7596,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
                         if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                                 continue;
  
-                       util = cpu_util_next(cpu, p, cpu);
+                       util = cpu_util(cpu, p, cpu, 0);
                         cpu_cap = capacity_of(cpu);
  
                         /*
@@ -9331,96 +9406,61 @@ group_type group_classify(unsigned int imbalance_pct,
  }
  
  /**
- * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks
- * @dst_cpu:   Destination CPU of the load balancing
+ * sched_use_asym_prio - Check whether asym_packing priority must be used
+ * @sd:                The scheduling domain of the load balancing
+ * @cpu:       A CPU
+ *
+ * Always use CPU priority when balancing load between SMT siblings. When
+ * balancing load between cores, it is not sufficient that @cpu is idle. Only
+ * use CPU priority if the whole core is idle.
+ *
+ * Returns: True if the priority of @cpu must be followed. False otherwise.
+ */
+static bool sched_use_asym_prio(struct sched_domain *sd, int cpu)
+{
+       if (!sched_smt_active())
+               return true;
+
+       return sd->flags & SD_SHARE_CPUCAPACITY || is_core_idle(cpu);
+}
+
+/**
+ * sched_asym - Check if the destination CPU can do asym_packing load balance
+ * @env:       The load balancing environment
   * @sds:       Load-balancing data with statistics of the local group
   * @sgs:       Load-balancing statistics of the candidate busiest group
- * @sg:                The candidate busiest group
+ * @group:     The candidate busiest group
   *
- * Check the state of the SMT siblings of both @sds::local and @sg and decide
- * if @dst_cpu can pull tasks.
+ * @env::dst_cpu can do asym_packing if it has higher priority than the
+ * preferred CPU of @group.
   *
- * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of
- * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks
- * only if @dst_cpu has higher priority.
+ * SMT is a special case. If we are balancing load between cores, @env::dst_cpu
+ * can do asym_packing balance only if all its SMT siblings are idle. Also, it
+ * can only do it if @group is an SMT group and has exactly one busy CPU. Larger
+ * imbalances in the number of CPUs are dealt with in find_busiest_group().
   *
- * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more
- * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority.
- * Bigger imbalances in the number of busy CPUs will be dealt with in
- * update_sd_pick_busiest().
+ * If we are balancing load within an SMT core, or at DIE domain level, always
+ * proceed.
   *
- * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
- * of @dst_cpu are idle and @sg has lower priority.
- *
- * Return: true if @dst_cpu can pull tasks, false otherwise.
+ * Return: true if @env::dst_cpu can do asym_packing load balance. False
+ * otherwise.
   */
-static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
-                                   struct sg_lb_stats *sgs,
-                                   struct sched_group *sg)
+static inline bool
+sched_asym(struct lb_env *env, struct sd_lb_stats *sds,  struct sg_lb_stats *sgs,
+          struct sched_group *group)
  {
-#ifdef CONFIG_SCHED_SMT
-       bool local_is_smt, sg_is_smt;
-       int sg_busy_cpus;
-
-       local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
-       sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY;
-
-       sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
-
-       if (!local_is_smt) {
-               /*
-                * If we are here, @dst_cpu is idle and does not have SMT
-                * siblings. Pull tasks if candidate group has two or more
-                * busy CPUs.
-                */
-               if (sg_busy_cpus >= 2) /* implies sg_is_smt */
-                       return true;
-
-               /*
-                * @dst_cpu does not have SMT siblings. @sg may have SMT
-                * siblings and only one is busy. In such case, @dst_cpu
-                * can help if it has higher priority and is idle (i.e.,
-                * it has no running tasks).
-                */
-               return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
-       }
-
-       /* @dst_cpu has SMT siblings. */
-
-       if (sg_is_smt) {
-               int local_busy_cpus = sds->local->group_weight -
-                                     sds->local_stat.idle_cpus;
-               int busy_cpus_delta = sg_busy_cpus - local_busy_cpus;
-
-               if (busy_cpus_delta == 1)
-                       return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
-
+       /* Ensure that the whole local core is idle, if applicable. */
+       if (!sched_use_asym_prio(env->sd, env->dst_cpu))
                 return false;
-       }
  
         /*
-        * @sg does not have SMT siblings. Ensure that @sds::local does not end
-        * up with more than one busy SMT sibling and only pull tasks if there
-        * are not busy CPUs (i.e., no CPU has running tasks).
+        * CPU priorities do not make sense for SMT cores with more than one
+        * busy sibling.
          */
-       if (!sds->local_stat.sum_nr_running)
-               return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
-
-       return false;
-#else
-       /* Always return false so that callers deal with non-SMT cases. */
-       return false;
-#endif
-}
-
-static inline bool
-sched_asym(struct lb_env *env, struct sd_lb_stats *sds,  struct sg_lb_stats *sgs,
-          struct sched_group *group)
-{
-       /* Only do SMT checks if either local or candidate have SMT siblings */
-       if ((sds->local->flags & SD_SHARE_CPUCAPACITY) ||
-           (group->flags & SD_SHARE_CPUCAPACITY))
-               return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group);
+       if (group->flags & SD_SHARE_CPUCAPACITY) {
+               if (sgs->group_weight - sgs->idle_cpus != 1)
+                       return false;
+       }
  
         return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
  }
@@ -9610,10 +9650,22 @@ static bool update_sd_pick_busiest(struct lb_env *env,
                  * contention when accessing shared HW resources.
                  *
                  * XXX for now avg_load is not computed and always 0 so we
-                * select the 1st one.
+                * select the 1st one, except if @sg is composed of SMT
+                * siblings.
                  */
-               if (sgs->avg_load <= busiest->avg_load)
+
+               if (sgs->avg_load < busiest->avg_load)
                         return false;
+
+               if (sgs->avg_load == busiest->avg_load) {
+                       /*
+                        * SMT sched groups need more help than non-SMT groups.
+                        * If @sg happens to also be SMT, either choice is good.
+                        */
+                       if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
+                               return false;
+               }
+
                 break;
  
         case group_has_spare:
@@ -10088,7 +10140,6 @@ static void update_idle_cpu_scan(struct lb_env *env,
  
  static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
  {
-       struct sched_domain *child = env->sd->child;
         struct sched_group *sg = env->sd->groups;
         struct sg_lb_stats *local = &sds->local_stat;
         struct sg_lb_stats tmp_sgs;
@@ -10129,8 +10180,13 @@ next_group:
                 sg = sg->next;
         } while (sg != env->sd->groups);
  
-       /* Tag domain that child domain prefers tasks go to siblings first */
-       sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+       /*
+        * Indicate that the child domain of the busiest group prefers tasks
+        * go to a child's sibling domains first. NB the flags of a sched group
+        * are those of the child domain.
+        */
+       if (sds->busiest)
+               sds->prefer_sibling = !!(sds->busiest->flags & SD_PREFER_SIBLING);
  
  
         if (env->sd->flags & SD_NUMA)
@@ -10440,7 +10496,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
                         goto out_balanced;
         }
  
-       /* Try to move all excess tasks to child's sibling domain */
+       /*
+        * Try to move all excess tasks to a sibling domain of the busiest
+        * group's child domain.
+        */
         if (sds.prefer_sibling && local->group_type == group_has_spare &&
             busiest->sum_nr_running > local->sum_nr_running + 1)
                 goto force_balance;
@@ -10542,8 +10601,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
                     nr_running == 1)
                         continue;
  
-               /* Make sure we only pull tasks from a CPU of lower priority */
+               /*
+                * Make sure we only pull tasks from a CPU of lower priority
+                * when balancing between SMT siblings.
+                *
+                * If balancing between cores, let lower priority CPUs help
+                * SMT cores with more than one busy sibling.
+                */
                 if ((env->sd->flags & SD_ASYM_PACKING) &&
+                   sched_use_asym_prio(env->sd, i) &&
                     sched_asym_prefer(i, env->dst_cpu) &&
                     nr_running == 1)
                         continue;
@@ -10581,7 +10647,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
                         break;
  
                 case migrate_util:
-                       util = cpu_util_cfs(i);
+                       util = cpu_util_cfs_boost(i);
  
                         /*
                          * Don't try to pull utilization from a CPU with one
@@ -10632,12 +10698,19 @@ static inline bool
  asym_active_balance(struct lb_env *env)
  {
         /*
-        * ASYM_PACKING needs to force migrate tasks from busy but
-        * lower priority CPUs in order to pack all tasks in the
-        * highest priority CPUs.
+        * ASYM_PACKING needs to force migrate tasks from busy but lower
+        * priority CPUs in order to pack all tasks in the highest priority
+        * CPUs. When done between cores, do it only if the whole core is
+        * idle.
+        *
+        * If @env::src_cpu is an SMT core with busy siblings, let
+        * the lower priority @env::dst_cpu help it. Do not follow
+        * CPU priority.
          */
         return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
-              sched_asym_prefer(env->dst_cpu, env->src_cpu);
+              sched_use_asym_prio(env->sd, env->dst_cpu) &&
+              (sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
+               !sched_use_asym_prio(env->sd, env->src_cpu));
  }
  
  static inline bool
@@ -10744,7 +10817,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                 .sd             = sd,
                 .dst_cpu        = this_cpu,
                 .dst_rq         = this_rq,
-               .dst_grpmask    = sched_group_span(sd->groups),
+               .dst_grpmask    = group_balance_mask(sd->groups),
                 .idle           = idle,
                 .loop_break     = SCHED_NR_MIGRATE_BREAK,
                 .cpus           = cpus,
@@ -11371,9 +11444,13 @@ static void nohz_balancer_kick(struct rq *rq)
                  * When ASYM_PACKING; see if there's a more preferred CPU
                  * currently idle; in which case, kick the ILB to move tasks
                  * around.
+                *
+                * When balancing between cores, all the SMT siblings of the
+                * preferred CPU must be idle.
                  */
                 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
-                       if (sched_asym_prefer(i, cpu)) {
+                       if (sched_use_asym_prio(sd, i) &&
+                           sched_asym_prefer(i, cpu)) {
                                 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
                                 goto unlock;
                         }
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c

index e072f6b..81fca77 100644 (file)
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -160,7 +160,6 @@ __setup("psi=", setup_psi);
  #define EXP_300s       2034            /* 1/exp(2s/300s) */
  
  /* PSI trigger definitions */
-#define WINDOW_MIN_US 500000   /* Min window size is 500ms */
  #define WINDOW_MAX_US 10000000 /* Max window size is 10s */
  #define UPDATES_PER_WINDOW 10  /* 10 updates per window */
  
@@ -1305,8 +1304,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
         if (state >= PSI_NONIDLE)
                 return ERR_PTR(-EINVAL);
  
-       if (window_us < WINDOW_MIN_US ||
-               window_us > WINDOW_MAX_US)
+       if (window_us == 0 || window_us > WINDOW_MAX_US)
                 return ERR_PTR(-EINVAL);
  
         /*
@@ -1409,11 +1407,16 @@ void psi_trigger_destroy(struct psi_trigger *t)
                         group->rtpoll_nr_triggers[t->state]--;
                         if (!group->rtpoll_nr_triggers[t->state])
                                 group->rtpoll_states &= ~(1 << t->state);
-                       /* reset min update period for the remaining triggers */
-                       list_for_each_entry(tmp, &group->rtpoll_triggers, node)
-                               period = min(period, div_u64(tmp->win.size,
-                                               UPDATES_PER_WINDOW));
-                       group->rtpoll_min_period = period;
+                       /*
+                        * Reset min update period for the remaining triggers
+                        * iff the destroying trigger had the min window size.
+                        */
+                       if (group->rtpoll_min_period == div_u64(t->win.size, UPDATES_PER_WINDOW)) {
+                               list_for_each_entry(tmp, &group->rtpoll_triggers, node)
+                                       period = min(period, div_u64(tmp->win.size,
+                                                       UPDATES_PER_WINDOW));
+                               group->rtpoll_min_period = period;
+                       }
                         /* Destroy rtpoll_task when the last trigger is destroyed */
                         if (group->rtpoll_states == 0) {
                                 group->rtpoll_until = 0;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index ec7b3e0..50d4b61 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -286,12 +286,6 @@ struct rt_bandwidth {
  
  void __dl_clear_params(struct task_struct *p);
  
-struct dl_bandwidth {
-       raw_spinlock_t          dl_runtime_lock;
-       u64                     dl_runtime;
-       u64                     dl_period;
-};
-
  static inline int dl_bandwidth_enabled(void)
  {
         return sysctl_sched_rt_runtime >= 0;
@@ -754,6 +748,12 @@ struct dl_rq {
         u64                     extra_bw;
  
         /*
+        * Maximum available bandwidth for reclaiming by SCHED_FLAG_RECLAIM
+        * tasks of this rq. Used in calculation of reclaimable bandwidth (GRUB).
+        */
+       u64                     max_bw;
+
+       /*
          * Inverse of the fraction of CPU utilization that can be reclaimed
          * by the GRUB algorithm.
          */
@@ -1546,6 +1546,28 @@ static inline void rq_clock_cancel_skipupdate(struct rq *rq)
         rq->clock_update_flags &= ~RQCF_REQ_SKIP;
  }
  
+/*
+ * During cpu offlining and rq wide unthrottling, we can trigger
+ * an update_rq_clock() for several cfs and rt runqueues (Typically
+ * when using list_for_each_entry_*)
+ * rq_clock_start_loop_update() can be called after updating the clock
+ * once and before iterating over the list to prevent multiple updates.
+ * After the iterative traversal, we need to call rq_clock_stop_loop_update()
+ * to clear RQCF_ACT_SKIP of rq->clock_update_flags.
+ */
+static inline void rq_clock_start_loop_update(struct rq *rq)
+{
+       lockdep_assert_rq_held(rq);
+       SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP);
+       rq->clock_update_flags |= RQCF_ACT_SKIP;
+}
+
+static inline void rq_clock_stop_loop_update(struct rq *rq)
+{
+       lockdep_assert_rq_held(rq);
+       rq->clock_update_flags &= ~RQCF_ACT_SKIP;
+}
+
  struct rq_flags {
         unsigned long flags;
         struct pin_cookie cookie;
@@ -1772,6 +1794,13 @@ queue_balance_callback(struct rq *rq,
         for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
                         __sd; __sd = __sd->parent)
  
+/* A mask of all the SD flags that have the SDF_SHARED_CHILD metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_SHARED_CHILD)) |
+static const unsigned int SD_SHARED_CHILD_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
  /**
   * highest_flag_domain - Return highest sched_domain containing flag.
   * @cpu:       The CPU whose highest level of sched domain is to
@@ -1779,16 +1808,25 @@ queue_balance_callback(struct rq *rq,
   * @flag:      The flag to check for the highest sched_domain
   *             for the given CPU.
   *
- * Returns the highest sched_domain of a CPU which contains the given flag.
+ * Returns the highest sched_domain of a CPU which contains @flag. If @flag has
+ * the SDF_SHARED_CHILD metaflag, all the children domains also have @flag.
   */
  static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
  {
         struct sched_domain *sd, *hsd = NULL;
  
         for_each_domain(cpu, sd) {
-               if (!(sd->flags & flag))
+               if (sd->flags & flag) {
+                       hsd = sd;
+                       continue;
+               }
+
+               /*
+                * Stop the search if @flag is known to be shared at lower
+                * levels. It will not be found further up.
+                */
+               if (flag & SD_SHARED_CHILD_MASK)
                         break;
-               hsd = sd;
         }
  
         return hsd;
@@ -2378,7 +2416,6 @@ extern struct rt_bandwidth def_rt_bandwidth;
  extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
  extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
  
-extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
  extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
  extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
  
@@ -2946,53 +2983,9 @@ static inline unsigned long cpu_util_dl(struct rq *rq)
         return READ_ONCE(rq->avg_dl.util_avg);
  }
  
-/**
- * cpu_util_cfs() - Estimates the amount of CPU capacity used by CFS tasks.
- * @cpu: the CPU to get the utilization for.
- *
- * The unit of the return value must be the same as the one of CPU capacity
- * so that CPU utilization can be compared with CPU capacity.
- *
- * CPU utilization is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on that CPU.
- * It represents the amount of CPU capacity currently used by CFS tasks in
- * the range [0..max CPU capacity] with max CPU capacity being the CPU
- * capacity at f_max.
- *
- * The estimated CPU utilization is defined as the maximum between CPU
- * utilization and sum of the estimated utilization of the currently
- * runnable tasks on that CPU. It preserves a utilization "snapshot" of
- * previously-executed tasks, which helps better deduce how busy a CPU will
- * be when a long-sleeping task wakes up. The contribution to CPU utilization
- * of such a task would be significantly decayed at this point of time.
- *
- * CPU utilization can be higher than the current CPU capacity
- * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because
- * of rounding errors as well as task migrations or wakeups of new tasks.
- * CPU utilization has to be capped to fit into the [0..max CPU capacity]
- * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%)
- * could be seen as over-utilized even though CPU1 has 20% of spare CPU
- * capacity. CPU utilization is allowed to overshoot current CPU capacity
- * though since this is useful for predicting the CPU capacity required
- * after task migrations (scheduler-driven DVFS).
- *
- * Return: (Estimated) utilization for the specified CPU.
- */
-static inline unsigned long cpu_util_cfs(int cpu)
-{
-       struct cfs_rq *cfs_rq;
-       unsigned long util;
-
-       cfs_rq = &cpu_rq(cpu)->cfs;
-       util = READ_ONCE(cfs_rq->avg.util_avg);
-
-       if (sched_feat(UTIL_EST)) {
-               util = max_t(unsigned long, util,
-                            READ_ONCE(cfs_rq->avg.util_est.enqueued));
-       }
  
-       return min(util, capacity_orig_of(cpu));
-}
+extern unsigned long cpu_util_cfs(int cpu);
+extern unsigned long cpu_util_cfs_boost(int cpu);
  
  static inline unsigned long cpu_util_rt(struct rq *rq)
  {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 6682535..d3a3b26 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -487,9 +487,9 @@ static void free_rootdomain(struct rcu_head *rcu)
  void rq_attach_root(struct rq *rq, struct root_domain *rd)
  {
         struct root_domain *old_rd = NULL;
-       unsigned long flags;
+       struct rq_flags rf;
  
-       raw_spin_rq_lock_irqsave(rq, flags);
+       rq_lock_irqsave(rq, &rf);
  
         if (rq->rd) {
                 old_rd = rq->rd;
@@ -515,7 +515,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
         if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
                 set_rq_online(rq);
  
-       raw_spin_rq_unlock_irqrestore(rq, flags);
+       rq_unlock_irqrestore(rq, &rf);
  
         if (old_rd)
                 call_rcu(&old_rd->rcu, free_rootdomain);
@@ -719,8 +719,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
  
                 if (sd_parent_degenerate(tmp, parent)) {
                         tmp->parent = parent->parent;
-                       if (parent->parent)
+
+                       if (parent->parent) {
                                 parent->parent->child = tmp;
+                               if (tmp->flags & SD_SHARE_CPUCAPACITY)
+                                       parent->parent->groups->flags |= SD_SHARE_CPUCAPACITY;
+                       }
+
                         /*
                          * Transfer SD_PREFER_SIBLING down in case of a
                          * degenerate parent; the spans match for this
@@ -1676,7 +1681,7 @@ static struct sched_domain_topology_level *sched_domain_topology_saved;
  #define for_each_sd_topology(tl)                       \
         for (tl = sched_domain_topology; tl->mask; tl++)
  
-void set_sched_topology(struct sched_domain_topology_level *tl)
+void __init set_sched_topology(struct sched_domain_topology_level *tl)
  {
         if (WARN_ON_ONCE(sched_smp_initialized))
                 return;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 133b747..48c53e4 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -425,11 +425,6 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
  }
  EXPORT_SYMBOL(autoremove_wake_function);
  
-static inline bool is_kthread_should_stop(void)
-{
-       return (current->flags & PF_KTHREAD) && kthread_should_stop();
-}
-
  /*
   * DEFINE_WAIT_FUNC(wait, woken_wake_func);
   *
@@ -459,7 +454,7 @@ long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout)
          * or woken_wake_function() sees our store to current->state.
          */
         set_current_state(mode); /* A */
-       if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
+       if (!(wq_entry->flags & WQ_FLAG_WOKEN) && !kthread_should_stop_or_park())
                 timeout = schedule_timeout(timeout);
         __set_current_state(TASK_RUNNING);
  
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 8464c5a..68d6c11 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -64,7 +64,7 @@ static struct clock_data cd ____cacheline_aligned = {
         .actual_read_sched_clock = jiffy_sched_clock_read,
  };
  
-static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
+static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift)
  {
         return (cyc * mult) >> shift;
  }
@@ -77,26 +77,36 @@ notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
  
  notrace int sched_clock_read_retry(unsigned int seq)
  {
-       return read_seqcount_latch_retry(&cd.seq, seq);
+       return raw_read_seqcount_latch_retry(&cd.seq, seq);
  }
  
-unsigned long long notrace sched_clock(void)
+unsigned long long noinstr sched_clock_noinstr(void)
  {
-       u64 cyc, res;
-       unsigned int seq;
         struct clock_read_data *rd;
+       unsigned int seq;
+       u64 cyc, res;
  
         do {
-               rd = sched_clock_read_begin(&seq);
+               seq = raw_read_seqcount_latch(&cd.seq);
+               rd = cd.read_data + (seq & 1); /* low bit of seq selects entry 0 or 1 of cd.read_data */
  
                 cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
                       rd->sched_clock_mask;
                 res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
-       } while (sched_clock_read_retry(seq));
+       } while (raw_read_seqcount_latch_retry(&cd.seq, seq)); /* recompute while the retry check is non-zero; presumably that signals a concurrent update of cd -- confirm in seqlock.h */
  
         return res;
  }
  
+unsigned long long notrace sched_clock(void)
+{
+       unsigned long long ns;
+       preempt_disable_notrace(); /* NOTE(review): sched_clock_noinstr() is presumably meant to run with preemption off -- confirm against its other callers */
+       ns = sched_clock_noinstr(); /* the actual clock read now lives in sched_clock_noinstr() */
+       preempt_enable_notrace();
+       return ns;
+}
+
  /*
   * Updating the data required to read the clock.
   *
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 09d5949..266d028 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -450,7 +450,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
                 tkr = tkf->base + (seq & 0x01);
                 now = ktime_to_ns(tkr->base);
                 now += fast_tk_get_delta_ns(tkr);
-       } while (read_seqcount_latch_retry(&tkf->seq, seq));
+       } while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
  
         return now;
  }
@@ -566,7 +566,7 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
                 basem = ktime_to_ns(tkr->base);
                 baser = ktime_to_ns(tkr->base_real);
                 delta = fast_tk_get_delta_ns(tkr);
-       } while (read_seqcount_latch_retry(&tkf->seq, seq));
+       } while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
  
         if (mono)
                 *mono = basem + delta;
author	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 27 Jun 2023 21:03:21 +0000 (14:03 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 27 Jun 2023 21:03:21 +0000 (14:03 -0700)
Documentation/scheduler/sched-deadline.rst		patch \| blob \| history
arch/arm64/include/asm/arch_timer.h		patch \| blob \| history
arch/arm64/include/asm/io.h		patch \| blob \| history
arch/loongarch/include/asm/loongarch.h		patch \| blob \| history
arch/loongarch/kernel/time.c		patch \| blob \| history
arch/s390/include/asm/timex.h		patch \| blob \| history
arch/s390/kernel/time.c		patch \| blob \| history
arch/x86/include/asm/mshyperv.h		patch \| blob \| history
arch/x86/include/asm/vdso/gettimeofday.h		patch \| blob \| history
arch/x86/kernel/itmt.c		patch \| blob \| history
arch/x86/kernel/kvmclock.c		patch \| blob \| history
arch/x86/kernel/smpboot.c		patch \| blob \| history
arch/x86/kernel/tsc.c		patch \| blob \| history
arch/x86/kvm/x86.c		patch \| blob \| history
arch/x86/xen/time.c		patch \| blob \| history
drivers/clocksource/arm_arch_timer.c		patch \| blob \| history
drivers/clocksource/hyperv_timer.c		patch \| blob \| history
drivers/cpuidle/cpuidle.c		patch \| blob \| history
drivers/cpuidle/poll_state.c		patch \| blob \| history
include/clocksource/hyperv_timer.h		patch \| blob \| history
include/linux/kthread.h		patch \| blob \| history
include/linux/math64.h		patch \| blob \| history
include/linux/rbtree_latch.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
include/linux/sched/clock.h		patch \| blob \| history
include/linux/sched/sd_flags.h		patch \| blob \| history
include/linux/sched/topology.h		patch \| blob \| history
include/linux/seqlock.h		patch \| blob \| history
kernel/cgroup/cgroup.c		patch \| blob \| history
kernel/kthread.c		patch \| blob \| history
kernel/printk/printk.c		patch \| blob \| history
kernel/sched/clock.c		patch \| blob \| history
kernel/sched/core.c		patch \| blob \| history
kernel/sched/cpufreq_schedutil.c		patch \| blob \| history
kernel/sched/deadline.c		patch \| blob \| history
kernel/sched/debug.c		patch \| blob \| history
kernel/sched/fair.c		patch \| blob \| history
kernel/sched/psi.c		patch \| blob \| history
kernel/sched/sched.h		patch \| blob \| history
kernel/sched/topology.c		patch \| blob \| history
kernel/sched/wait.c		patch \| blob \| history
kernel/time/sched_clock.c		patch \| blob \| history
kernel/time/timekeeping.c		patch \| blob \| history