Merge branches 'pm-core', 'pm-pci', 'pm-sleep', 'pm-domains' and 'powercap'
author Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Mon, 26 Apr 2021 14:57:17 +0000 (16:57 +0200)
committer Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Mon, 26 Apr 2021 14:57:17 +0000 (16:57 +0200)
* pm-core:
  PM: runtime: Add documentation for pm_runtime_resume_and_get()
  PM: runtime: Replace inline function pm_runtime_callbacks_present()
  PM: core: Remove duplicate declaration from header file
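
A quick note on the calling convention that the pm_runtime_resume_and_get() documentation added above describes: the helper increments the device's usage counter and resumes it, and on failure it drops the reference itself, so only the success path needs a matching pm_runtime_put(). A minimal usage sketch (my_driver_do_io() is a hypothetical caller):

    #include <linux/pm_runtime.h>

    static int my_driver_do_io(struct device *dev)
    {
            int ret;

            /*
             * Resume the device and take a usage reference; on error the
             * reference has already been dropped for us, so just return.
             */
            ret = pm_runtime_resume_and_get(dev);
            if (ret < 0)
                    return ret;

            /* ... access the hardware ... */

            pm_runtime_put(dev);
            return 0;
    }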

* pm-pci:
  PCI: PM: Do not read power state in pci_enable_device_flags()

* pm-sleep:
  PM: wakeup: remove redundant assignment to variable retval
  PM: hibernate: x86: Use crc32 instead of md5 for hibernation e820 integrity check
  PM: wakeup: use dev_set_name() directly
  PM: sleep: fix typos in comments
  freezer: Remove unused inline function try_to_freeze_nowarn()

* pm-domains:
  PM: domains: Don't runtime resume devices at genpd_prepare()

* powercap:
  powercap: RAPL: Fix struct declaration in header file
  MAINTAINERS: Add DTPM subsystem maintainer
  powercap: Add Hygon Fam18h RAPL support

37 files changed:
MAINTAINERS
arch/arm64/include/asm/topology.h
arch/arm64/kernel/topology.c
arch/x86/kernel/e820.c
arch/x86/power/hibernate.c
drivers/base/arch_topology.c
drivers/base/power/domain.c
drivers/base/power/wakeup_stats.c
drivers/clk/mvebu/armada-37xx-periph.c
drivers/cpufreq/Kconfig
drivers/cpufreq/Kconfig.arm
drivers/cpufreq/armada-37xx-cpufreq.c
drivers/cpufreq/cppc_cpufreq.c
drivers/cpufreq/cpufreq-dt.c
drivers/cpufreq/cpufreq.c
drivers/cpufreq/ia64-acpi-cpufreq.c
drivers/cpufreq/intel_pstate.c
drivers/cpufreq/s5pv210-cpufreq.c
drivers/cpuidle/Kconfig.arm
drivers/cpuidle/cpuidle-tegra.c
drivers/cpuidle/driver.c
drivers/cpuidle/governors/menu.c
drivers/cpuidle/governors/teo.c
drivers/idle/intel_idle.c
drivers/pci/pci.c
drivers/powercap/intel_rapl_common.c
drivers/powercap/intel_rapl_msr.c
include/linux/arch_topology.h
include/linux/cpuidle.h
include/linux/freezer.h
include/linux/intel_rapl.h
kernel/power/autosleep.c
kernel/power/snapshot.c
kernel/power/swap.c
kernel/sched/core.c
kernel/sched/cpufreq_schedutil.c
kernel/time/tick-sched.c

index c80ad73..154132b 100644 (file)
@@ -14312,6 +14312,15 @@ F:     include/linux/pm_*
 F:     include/linux/powercap.h
 F:     kernel/configs/nopm.config
 
+DYNAMIC THERMAL POWER MANAGEMENT (DTPM)
+M:     Daniel Lezcano <daniel.lezcano@kernel.org>
+L:     linux-pm@vger.kernel.org
+S:     Supported
+B:     https://bugzilla.kernel.org
+T:     git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
+F:     drivers/powercap/dtpm*
+F:     include/linux/dtpm.h
+
 POWER STATE COORDINATION INTERFACE (PSCI)
 M:     Mark Rutland <mark.rutland@arm.com>
 M:     Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
index 3b8dca4..ec2db34 100644 (file)
@@ -17,17 +17,9 @@ int pcibus_to_node(struct pci_bus *bus);
 #include <linux/arch_topology.h>
 
 void update_freq_counters_refs(void);
-void topology_scale_freq_tick(void);
-
-#ifdef CONFIG_ARM64_AMU_EXTN
-/*
- * Replace task scheduler's default counter-based
- * frequency-invariance scale factor setting.
- */
-#define arch_scale_freq_tick topology_scale_freq_tick
-#endif /* CONFIG_ARM64_AMU_EXTN */
 
 /* Replace task scheduler's default frequency-invariant accounting */
+#define arch_scale_freq_tick topology_scale_freq_tick
 #define arch_set_freq_scale topology_set_freq_scale
 #define arch_scale_freq_capacity topology_get_freq_scale
 #define arch_scale_freq_invariant topology_scale_freq_invariant
index e08a412..4dd14a6 100644 (file)
@@ -199,12 +199,47 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate)
        return 0;
 }
 
-static DEFINE_STATIC_KEY_FALSE(amu_fie_key);
-#define amu_freq_invariant() static_branch_unlikely(&amu_fie_key)
+static void amu_scale_freq_tick(void)
+{
+       u64 prev_core_cnt, prev_const_cnt;
+       u64 core_cnt, const_cnt, scale;
+
+       prev_const_cnt = this_cpu_read(arch_const_cycles_prev);
+       prev_core_cnt = this_cpu_read(arch_core_cycles_prev);
+
+       update_freq_counters_refs();
+
+       const_cnt = this_cpu_read(arch_const_cycles_prev);
+       core_cnt = this_cpu_read(arch_core_cycles_prev);
+
+       if (unlikely(core_cnt <= prev_core_cnt ||
+                    const_cnt <= prev_const_cnt))
+               return;
+
+       /*
+        *          /\core    arch_max_freq_scale
+        * scale =  ------- * --------------------
+        *          /\const   SCHED_CAPACITY_SCALE
+        *
+        * See validate_cpu_freq_invariance_counters() for details on
+        * arch_max_freq_scale and the use of SCHED_CAPACITY_SHIFT.
+        */
+       scale = core_cnt - prev_core_cnt;
+       scale *= this_cpu_read(arch_max_freq_scale);
+       scale = div64_u64(scale >> SCHED_CAPACITY_SHIFT,
+                         const_cnt - prev_const_cnt);
+
+       scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE);
+       this_cpu_write(arch_freq_scale, (unsigned long)scale);
+}
+
+static struct scale_freq_data amu_sfd = {
+       .source = SCALE_FREQ_SOURCE_ARCH,
+       .set_freq_scale = amu_scale_freq_tick,
+};
 
 static void amu_fie_setup(const struct cpumask *cpus)
 {
-       bool invariant;
        int cpu;
 
        /* We are already set since the last insmod of cpufreq driver */
@@ -221,25 +256,10 @@ static void amu_fie_setup(const struct cpumask *cpus)
 
        cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus);
 
-       invariant = topology_scale_freq_invariant();
-
-       /* We aren't fully invariant yet */
-       if (!invariant && !cpumask_equal(amu_fie_cpus, cpu_present_mask))
-               return;
-
-       static_branch_enable(&amu_fie_key);
+       topology_set_scale_freq_source(&amu_sfd, amu_fie_cpus);
 
        pr_debug("CPUs[%*pbl]: counters will be used for FIE.",
                 cpumask_pr_args(cpus));
-
-       /*
-        * Task scheduler behavior depends on frequency invariance support,
-        * either cpufreq or counter driven. If the support status changes as
-        * a result of counter initialisation and use, retrigger the build of
-        * scheduling domains to ensure the information is propagated properly.
-        */
-       if (!invariant)
-               rebuild_sched_domains_energy();
 }
 
 static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val,
@@ -256,8 +276,8 @@ static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val,
         * initialized AMU support and enabled invariance. The AMU counters will
         * keep on working just fine in the absence of the cpufreq driver, and
         * for the CPUs for which there are no counters available, the last set
-        * value of freq_scale will remain valid as that is the frequency those
-        * CPUs are running at.
+        * value of arch_freq_scale will remain valid as that is the frequency
+        * those CPUs are running at.
         */
 
        return 0;
@@ -283,53 +303,6 @@ static int __init init_amu_fie(void)
 }
 core_initcall(init_amu_fie);
 
-bool arch_freq_counters_available(const struct cpumask *cpus)
-{
-       return amu_freq_invariant() &&
-              cpumask_subset(cpus, amu_fie_cpus);
-}
-
-void topology_scale_freq_tick(void)
-{
-       u64 prev_core_cnt, prev_const_cnt;
-       u64 core_cnt, const_cnt, scale;
-       int cpu = smp_processor_id();
-
-       if (!amu_freq_invariant())
-               return;
-
-       if (!cpumask_test_cpu(cpu, amu_fie_cpus))
-               return;
-
-       prev_const_cnt = this_cpu_read(arch_const_cycles_prev);
-       prev_core_cnt = this_cpu_read(arch_core_cycles_prev);
-
-       update_freq_counters_refs();
-
-       const_cnt = this_cpu_read(arch_const_cycles_prev);
-       core_cnt = this_cpu_read(arch_core_cycles_prev);
-
-       if (unlikely(core_cnt <= prev_core_cnt ||
-                    const_cnt <= prev_const_cnt))
-               return;
-
-       /*
-        *          /\core    arch_max_freq_scale
-        * scale =  ------- * --------------------
-        *          /\const   SCHED_CAPACITY_SCALE
-        *
-        * See validate_cpu_freq_invariance_counters() for details on
-        * arch_max_freq_scale and the use of SCHED_CAPACITY_SHIFT.
-        */
-       scale = core_cnt - prev_core_cnt;
-       scale *= this_cpu_read(arch_max_freq_scale);
-       scale = div64_u64(scale >> SCHED_CAPACITY_SHIFT,
-                         const_cnt - prev_const_cnt);
-
-       scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE);
-       this_cpu_write(freq_scale, (unsigned long)scale);
-}
-
 #ifdef CONFIG_ACPI_CPPC_LIB
 #include <acpi/cppc_acpi.h>
 
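To make the scale formula in amu_scale_freq_tick() above concrete, here is a hypothetical walk-through with made-up numbers: a 100 MHz constant-rate counter, a 2 GHz maximum CPU frequency (for which arch_max_freq_scale is assumed to be precomputed as roughly ref_rate / max_rate * SCHED_CAPACITY_SCALE^2, i.e. about 52428), and a CPU that actually ran at 1 GHz across a 4 ms tick:

    #include <linux/types.h>

    #define EXAMPLE_CAPACITY_SHIFT 10       /* SCHED_CAPACITY_SHIFT in the kernel */

    static u64 example_amu_scale(void)
    {
            u64 d_const = 400000;           /* constant counter delta over 4 ms  */
            u64 d_core = 4000000;           /* core counter delta (CPU at 1 GHz) */
            u64 max_scale = 52428;          /* assumed arch_max_freq_scale value */
            u64 scale = d_core * max_scale;

            /* (4000000 * 52428 >> 10) / 400000 == 511, i.e. roughly 1024 / 2 */
            return (scale >> EXAMPLE_CAPACITY_SHIFT) / d_const;
    }

So a CPU running at half of its maximum frequency ends up with arch_freq_scale at about half of SCHED_CAPACITY_SCALE.
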
index 22aad41..629c499 100644 (file)
@@ -31,8 +31,8 @@
  *       - inform the user about the firmware's notion of memory layout
  *         via /sys/firmware/memmap
  *
- *       - the hibernation code uses it to generate a kernel-independent MD5
- *         fingerprint of the physical memory layout of a system.
+ *       - the hibernation code uses it to generate a kernel-independent CRC32
+ *         checksum of the physical memory layout of a system.
  *
  * - 'e820_table_kexec': a slightly modified (by the kernel) firmware version
  *   passed to us by the bootloader - the major difference between
index cd3914f..e94e005 100644 (file)
@@ -13,8 +13,8 @@
 #include <linux/kdebug.h>
 #include <linux/cpu.h>
 #include <linux/pgtable.h>
-
-#include <crypto/hash.h>
+#include <linux/types.h>
+#include <linux/crc32.h>
 
 #include <asm/e820/api.h>
 #include <asm/init.h>
@@ -54,95 +54,33 @@ int pfn_is_nosave(unsigned long pfn)
        return pfn >= nosave_begin_pfn && pfn < nosave_end_pfn;
 }
 
-
-#define MD5_DIGEST_SIZE 16
-
 struct restore_data_record {
        unsigned long jump_address;
        unsigned long jump_address_phys;
        unsigned long cr3;
        unsigned long magic;
-       u8 e820_digest[MD5_DIGEST_SIZE];
+       unsigned long e820_checksum;
 };
 
-#if IS_BUILTIN(CONFIG_CRYPTO_MD5)
 /**
- * get_e820_md5 - calculate md5 according to given e820 table
+ * compute_e820_crc32 - calculate crc32 of a given e820 table
  *
  * @table: the e820 table to be calculated
- * @buf: the md5 result to be stored to
+ *
+ * Return: the resulting checksum
  */
-static int get_e820_md5(struct e820_table *table, void *buf)
+static inline u32 compute_e820_crc32(struct e820_table *table)
 {
-       struct crypto_shash *tfm;
-       struct shash_desc *desc;
-       int size;
-       int ret = 0;
-
-       tfm = crypto_alloc_shash("md5", 0, 0);
-       if (IS_ERR(tfm))
-               return -ENOMEM;
-
-       desc = kmalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm),
-                      GFP_KERNEL);
-       if (!desc) {
-               ret = -ENOMEM;
-               goto free_tfm;
-       }
-
-       desc->tfm = tfm;
-
-       size = offsetof(struct e820_table, entries) +
+       int size = offsetof(struct e820_table, entries) +
                sizeof(struct e820_entry) * table->nr_entries;
 
-       if (crypto_shash_digest(desc, (u8 *)table, size, buf))
-               ret = -EINVAL;
-
-       kfree_sensitive(desc);
-
-free_tfm:
-       crypto_free_shash(tfm);
-       return ret;
-}
-
-static int hibernation_e820_save(void *buf)
-{
-       return get_e820_md5(e820_table_firmware, buf);
-}
-
-static bool hibernation_e820_mismatch(void *buf)
-{
-       int ret;
-       u8 result[MD5_DIGEST_SIZE];
-
-       memset(result, 0, MD5_DIGEST_SIZE);
-       /* If there is no digest in suspend kernel, let it go. */
-       if (!memcmp(result, buf, MD5_DIGEST_SIZE))
-               return false;
-
-       ret = get_e820_md5(e820_table_firmware, result);
-       if (ret)
-               return true;
-
-       return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false;
-}
-#else
-static int hibernation_e820_save(void *buf)
-{
-       return 0;
-}
-
-static bool hibernation_e820_mismatch(void *buf)
-{
-       /* If md5 is not builtin for restore kernel, let it go. */
-       return false;
+       return ~crc32_le(~0, (unsigned char const *)table, size);
 }
-#endif
 
 #ifdef CONFIG_X86_64
-#define RESTORE_MAGIC  0x23456789ABCDEF01UL
+#define RESTORE_MAGIC  0x23456789ABCDEF02UL
 #else
-#define RESTORE_MAGIC  0x12345678UL
+#define RESTORE_MAGIC  0x12345679UL
 #endif
 
 /**
@@ -179,7 +117,8 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size)
         */
        rdr->cr3 = restore_cr3 & ~CR3_PCID_MASK;
 
-       return hibernation_e820_save(rdr->e820_digest);
+       rdr->e820_checksum = compute_e820_crc32(e820_table_firmware);
+       return 0;
 }
 
 /**
@@ -200,7 +139,7 @@ int arch_hibernation_header_restore(void *addr)
        jump_address_phys = rdr->jump_address_phys;
        restore_cr3 = rdr->cr3;
 
-       if (hibernation_e820_mismatch(rdr->e820_digest)) {
+       if (rdr->e820_checksum != compute_e820_crc32(e820_table_firmware)) {
                pr_crit("Hibernate inconsistent memory map detected!\n");
                return -ENODEV;
        }
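
For reference, the all-ones seed plus final inversion used by compute_e820_crc32() above is the usual convention for producing a standard CRC-32 with the kernel's crc32 library. A minimal sketch of the same pattern on an arbitrary buffer (buf_crc32() is a hypothetical helper; CONFIG_CRC32 assumed):

    #include <linux/crc32.h>
    #include <linux/types.h>

    static u32 buf_crc32(const void *buf, size_t len)
    {
            /* Seed with all ones and invert the result, as compute_e820_crc32() does. */
            return ~crc32_le(~0, buf, len);
    }
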
index de8587c..c1179ed 100644 (file)
 #include <linux/sched.h>
 #include <linux/smp.h>
 
+static DEFINE_PER_CPU(struct scale_freq_data *, sft_data);
+static struct cpumask scale_freq_counters_mask;
+static bool scale_freq_invariant;
+
+static bool supports_scale_freq_counters(const struct cpumask *cpus)
+{
+       return cpumask_subset(cpus, &scale_freq_counters_mask);
+}
+
 bool topology_scale_freq_invariant(void)
 {
        return cpufreq_supports_freq_invariance() ||
-              arch_freq_counters_available(cpu_online_mask);
+              supports_scale_freq_counters(cpu_online_mask);
 }
 
-__weak bool arch_freq_counters_available(const struct cpumask *cpus)
+static void update_scale_freq_invariant(bool status)
 {
-       return false;
+       if (scale_freq_invariant == status)
+               return;
+
+       /*
+        * Task scheduler behavior depends on frequency invariance support,
+        * either cpufreq or counter driven. If the support status changes as
+        * a result of counter initialisation and use, retrigger the build of
+        * scheduling domains to ensure the information is propagated properly.
+        */
+       if (topology_scale_freq_invariant() == status) {
+               scale_freq_invariant = status;
+               rebuild_sched_domains_energy();
+       }
 }
-DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
+
+void topology_set_scale_freq_source(struct scale_freq_data *data,
+                                   const struct cpumask *cpus)
+{
+       struct scale_freq_data *sfd;
+       int cpu;
+
+       /*
+        * Avoid calling rebuild_sched_domains() unnecessarily if FIE is
+        * supported by cpufreq.
+        */
+       if (cpumask_empty(&scale_freq_counters_mask))
+               scale_freq_invariant = topology_scale_freq_invariant();
+
+       for_each_cpu(cpu, cpus) {
+               sfd = per_cpu(sft_data, cpu);
+
+               /* Use ARCH provided counters whenever possible */
+               if (!sfd || sfd->source != SCALE_FREQ_SOURCE_ARCH) {
+                       per_cpu(sft_data, cpu) = data;
+                       cpumask_set_cpu(cpu, &scale_freq_counters_mask);
+               }
+       }
+
+       update_scale_freq_invariant(true);
+}
+EXPORT_SYMBOL_GPL(topology_set_scale_freq_source);
+
+void topology_clear_scale_freq_source(enum scale_freq_source source,
+                                     const struct cpumask *cpus)
+{
+       struct scale_freq_data *sfd;
+       int cpu;
+
+       for_each_cpu(cpu, cpus) {
+               sfd = per_cpu(sft_data, cpu);
+
+               if (sfd && sfd->source == source) {
+                       per_cpu(sft_data, cpu) = NULL;
+                       cpumask_clear_cpu(cpu, &scale_freq_counters_mask);
+               }
+       }
+
+       update_scale_freq_invariant(false);
+}
+EXPORT_SYMBOL_GPL(topology_clear_scale_freq_source);
+
+void topology_scale_freq_tick(void)
+{
+       struct scale_freq_data *sfd = *this_cpu_ptr(&sft_data);
+
+       if (sfd)
+               sfd->set_freq_scale();
+}
+
+DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
+EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);
 
 void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq,
                             unsigned long max_freq)
@@ -47,13 +124,13 @@ void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq,
         * want to update the scale factor with information from CPUFREQ.
         * Instead the scale factor will be updated from arch_scale_freq_tick.
         */
-       if (arch_freq_counters_available(cpus))
+       if (supports_scale_freq_counters(cpus))
                return;
 
        scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq;
 
        for_each_cpu(i, cpus)
-               per_cpu(freq_scale, i) = scale;
+               per_cpu(arch_freq_scale, i) = scale;
 }
 
 DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
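
Based on the two users visible in this merge (the arm64 AMU code above and the CPPC cpufreq driver further below), a minimal sketch of how a counter-based frequency-invariance source would register with this interface; the my_* names are hypothetical, and the declarations are presumably in include/linux/arch_topology.h, which is among the files changed here:

    #include <linux/arch_topology.h>
    #include <linux/cpumask.h>

    /* Per-tick hook: read this CPU's counters and update its arch_freq_scale. */
    static void my_set_freq_scale(void)
    {
            /* ... counter reads and per-CPU arch_freq_scale update ... */
    }

    static struct scale_freq_data my_sfd = {
            .source = SCALE_FREQ_SOURCE_ARCH,       /* SCALE_FREQ_SOURCE_CPPC for CPPC */
            .set_freq_scale = my_set_freq_scale,
    };

    static void my_fie_register(const struct cpumask *cpus)
    {
            /* Claim the CPUs whose counters this source can read. */
            topology_set_scale_freq_source(&my_sfd, cpus);
    }

    static void my_fie_unregister(const struct cpumask *cpus)
    {
            topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_ARCH, cpus);
    }
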
index 78c310d..b6a782c 100644 (file)
@@ -1088,34 +1088,6 @@ static void genpd_sync_power_on(struct generic_pm_domain *genpd, bool use_lock,
 }
 
 /**
- * resume_needed - Check whether to resume a device before system suspend.
- * @dev: Device to check.
- * @genpd: PM domain the device belongs to.
- *
- * There are two cases in which a device that can wake up the system from sleep
- * states should be resumed by genpd_prepare(): (1) if the device is enabled
- * to wake up the system and it has to remain active for this purpose while the
- * system is in the sleep state and (2) if the device is not enabled to wake up
- * the system from sleep states and it generally doesn't generate wakeup signals
- * by itself (those signals are generated on its behalf by other parts of the
- * system).  In the latter case it may be necessary to reconfigure the device's
- * wakeup settings during system suspend, because it may have been set up to
- * signal remote wakeup from the system's working state as needed by runtime PM.
- * Return 'true' in either of the above cases.
- */
-static bool resume_needed(struct device *dev,
-                         const struct generic_pm_domain *genpd)
-{
-       bool active_wakeup;
-
-       if (!device_can_wakeup(dev))
-               return false;
-
-       active_wakeup = genpd_is_active_wakeup(genpd);
-       return device_may_wakeup(dev) ? active_wakeup : !active_wakeup;
-}
-
-/**
  * genpd_prepare - Start power transition of a device in a PM domain.
  * @dev: Device to start the transition of.
  *
@@ -1135,14 +1107,6 @@ static int genpd_prepare(struct device *dev)
        if (IS_ERR(genpd))
                return -EINVAL;
 
-       /*
-        * If a wakeup request is pending for the device, it should be woken up
-        * at this point and a system wakeup event should be reported if it's
-        * set up to wake up the system from sleep states.
-        */
-       if (resume_needed(dev, genpd))
-               pm_runtime_resume(dev);
-
        genpd_lock(genpd);
 
        if (genpd->prepared_count++ == 0)
index d638259..924fac4 100644 (file)
@@ -137,7 +137,7 @@ static struct device *wakeup_source_device_create(struct device *parent,
                                                  struct wakeup_source *ws)
 {
        struct device *dev = NULL;
-       int retval = -ENODEV;
+       int retval;
 
        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
        if (!dev) {
@@ -154,7 +154,7 @@ static struct device *wakeup_source_device_create(struct device *parent,
        dev_set_drvdata(dev, ws);
        device_set_pm_not_required(dev);
 
-       retval = kobject_set_name(&dev->kobj, "wakeup%d", ws->id);
+       retval = dev_set_name(dev, "wakeup%d", ws->id);
        if (retval)
                goto error;
 
index f5746f9..32ac6b6 100644 (file)
@@ -84,6 +84,7 @@ struct clk_pm_cpu {
        void __iomem *reg_div;
        u8 shift_div;
        struct regmap *nb_pm_base;
+       unsigned long l1_expiration;
 };
 
 #define to_clk_double_div(_hw) container_of(_hw, struct clk_double_div, hw)
@@ -440,33 +441,6 @@ static u8 clk_pm_cpu_get_parent(struct clk_hw *hw)
        return val;
 }
 
-static int clk_pm_cpu_set_parent(struct clk_hw *hw, u8 index)
-{
-       struct clk_pm_cpu *pm_cpu = to_clk_pm_cpu(hw);
-       struct regmap *base = pm_cpu->nb_pm_base;
-       int load_level;
-
-       /*
-        * We set the clock parent only if the DVFS is available but
-        * not enabled.
-        */
-       if (IS_ERR(base) || armada_3700_pm_dvfs_is_enabled(base))
-               return -EINVAL;
-
-       /* Set the parent clock for all the load level */
-       for (load_level = 0; load_level < LOAD_LEVEL_NR; load_level++) {
-               unsigned int reg, mask,  val,
-                       offset = ARMADA_37XX_NB_TBG_SEL_OFF;
-
-               armada_3700_pm_dvfs_update_regs(load_level, &reg, &offset);
-
-               val = index << offset;
-               mask = ARMADA_37XX_NB_TBG_SEL_MASK << offset;
-               regmap_update_bits(base, reg, mask, val);
-       }
-       return 0;
-}
-
 static unsigned long clk_pm_cpu_recalc_rate(struct clk_hw *hw,
                                            unsigned long parent_rate)
 {
@@ -514,8 +488,10 @@ static long clk_pm_cpu_round_rate(struct clk_hw *hw, unsigned long rate,
 }
 
 /*
- * Switching the CPU from the L2 or L3 frequencies (300 and 200 Mhz
- * respectively) to L0 frequency (1.2 Ghz) requires a significant
+ * Workaround when base CPU frequency is 1000 or 1200 MHz
+ *
+ * Switching the CPU from the L2 or L3 frequencies (250/300 or 200 MHz
+ * respectively) to L0 frequency (1/1.2 GHz) requires a significant
  * amount of time to let VDD stabilize to the appropriate
  * voltage. This amount of time is large enough that it cannot be
  * covered by the hardware countdown register. Due to this, the CPU
@@ -525,26 +501,56 @@ static long clk_pm_cpu_round_rate(struct clk_hw *hw, unsigned long rate,
  * To work around this problem, we prevent switching directly from the
  * L2/L3 frequencies to the L0 frequency, and instead switch to the L1
  * frequency in-between. The sequence therefore becomes:
- * 1. First switch from L2/L3(200/300MHz) to L1(600MHZ)
+ * 1. First switch from L2/L3 (200/250/300 MHz) to L1 (500/600 MHz)
  * 2. Sleep 20ms for stabilizing VDD voltage
- * 3. Then switch from L1(600MHZ) to L0(1200Mhz).
+ * 3. Then switch from L1 (500/600 MHz) to L0 (1000/1200 MHz).
  */
-static void clk_pm_cpu_set_rate_wa(unsigned long rate, struct regmap *base)
+static void clk_pm_cpu_set_rate_wa(struct clk_pm_cpu *pm_cpu,
+                                  unsigned int new_level, unsigned long rate,
+                                  struct regmap *base)
 {
        unsigned int cur_level;
 
-       if (rate != 1200 * 1000 * 1000)
-               return;
-
        regmap_read(base, ARMADA_37XX_NB_CPU_LOAD, &cur_level);
        cur_level &= ARMADA_37XX_NB_CPU_LOAD_MASK;
-       if (cur_level <= ARMADA_37XX_DVFS_LOAD_1)
+
+       if (cur_level == new_level)
+               return;
+
+       /*
+        * System wants to go to L1 on its own. If we are going from L2/L3,
+        * remember when 20ms will expire. If from L0, set the value so that
+        * next switch to L0 won't have to wait.
+        */
+       if (new_level == ARMADA_37XX_DVFS_LOAD_1) {
+               if (cur_level == ARMADA_37XX_DVFS_LOAD_0)
+                       pm_cpu->l1_expiration = jiffies;
+               else
+                       pm_cpu->l1_expiration = jiffies + msecs_to_jiffies(20);
                return;
+       }
+
+       /*
+        * If we are setting to L2/L3, just invalidate L1 expiration time,
+        * sleeping is not needed.
+        */
+       if (rate < 1000*1000*1000)
+               goto invalidate_l1_exp;
+
+       /*
+        * We are going to L0 with rate >= 1GHz. Check whether we have been at
+        * L1 for long enough time. If not, go to L1 for 20ms.
+        */
+       if (pm_cpu->l1_expiration && jiffies >= pm_cpu->l1_expiration)
+               goto invalidate_l1_exp;
 
        regmap_update_bits(base, ARMADA_37XX_NB_CPU_LOAD,
                           ARMADA_37XX_NB_CPU_LOAD_MASK,
                           ARMADA_37XX_DVFS_LOAD_1);
        msleep(20);
+
+invalidate_l1_exp:
+       pm_cpu->l1_expiration = 0;
 }
 
 static int clk_pm_cpu_set_rate(struct clk_hw *hw, unsigned long rate,
@@ -578,7 +584,9 @@ static int clk_pm_cpu_set_rate(struct clk_hw *hw, unsigned long rate,
                        reg = ARMADA_37XX_NB_CPU_LOAD;
                        mask = ARMADA_37XX_NB_CPU_LOAD_MASK;
 
-                       clk_pm_cpu_set_rate_wa(rate, base);
+                       /* Apply workaround when base CPU frequency is 1000 or 1200 MHz */
+                       if (parent_rate >= 1000*1000*1000)
+                               clk_pm_cpu_set_rate_wa(pm_cpu, load_level, rate, base);
 
                        regmap_update_bits(base, reg, mask, load_level);
 
@@ -592,7 +600,6 @@ static int clk_pm_cpu_set_rate(struct clk_hw *hw, unsigned long rate,
 
 static const struct clk_ops clk_pm_cpu_ops = {
        .get_parent = clk_pm_cpu_get_parent,
-       .set_parent = clk_pm_cpu_set_parent,
        .round_rate = clk_pm_cpu_round_rate,
        .set_rate = clk_pm_cpu_set_rate,
        .recalc_rate = clk_pm_cpu_recalc_rate,
index 85de313..c3038cd 100644 (file)
@@ -13,7 +13,8 @@ config CPU_FREQ
          clock speed, you need to either enable a dynamic cpufreq governor
          (see below) after boot, or use a userspace tool.
 
-         For details, take a look at <file:Documentation/cpu-freq>.
+         For details, take a look at
+         <file:Documentation/admin-guide/pm/cpufreq.rst>.
 
          If in doubt, say N.
 
@@ -140,8 +141,6 @@ config CPU_FREQ_GOV_USERSPACE
          To compile this driver as a module, choose M here: the
          module will be called cpufreq_userspace.
 
-         For details, take a look at <file:Documentation/cpu-freq/>.
-
          If in doubt, say Y.
 
 config CPU_FREQ_GOV_ONDEMAND
@@ -158,7 +157,8 @@ config CPU_FREQ_GOV_ONDEMAND
          To compile this driver as a module, choose M here: the
          module will be called cpufreq_ondemand.
 
-         For details, take a look at linux/Documentation/cpu-freq.
+         For details, take a look at
+         <file:Documentation/admin-guide/pm/cpufreq.rst>.
 
          If in doubt, say N.
 
@@ -182,7 +182,8 @@ config CPU_FREQ_GOV_CONSERVATIVE
          To compile this driver as a module, choose M here: the
          module will be called cpufreq_conservative.
 
-         For details, take a look at linux/Documentation/cpu-freq.
+         For details, take a look at
+         <file:Documentation/admin-guide/pm/cpufreq.rst>.
 
          If in doubt, say N.
 
@@ -246,8 +247,6 @@ config IA64_ACPI_CPUFREQ
        This driver adds a CPUFreq driver which utilizes the ACPI
        Processor Performance States.
 
-       For details, take a look at <file:Documentation/cpu-freq/>.
-
        If in doubt, say N.
 endif
 
@@ -271,8 +270,6 @@ config LOONGSON2_CPUFREQ
 
          Loongson2F and it's successors support this feature.
 
-         For details, take a look at <file:Documentation/cpu-freq/>.
-
          If in doubt, say N.
 
 config LOONGSON1_CPUFREQ
@@ -282,8 +279,6 @@ config LOONGSON1_CPUFREQ
          This option adds a CPUFreq driver for loongson1 processors which
          support software configurable cpu frequency.
 
-         For details, take a look at <file:Documentation/cpu-freq/>.
-
          If in doubt, say N.
 endif
 
@@ -293,8 +288,6 @@ config SPARC_US3_CPUFREQ
        help
          This adds the CPUFreq driver for UltraSPARC-III processors.
 
-         For details, take a look at <file:Documentation/cpu-freq>.
-
          If in doubt, say N.
 
 config SPARC_US2E_CPUFREQ
@@ -302,8 +295,6 @@ config SPARC_US2E_CPUFREQ
        help
          This adds the CPUFreq driver for UltraSPARC-IIe processors.
 
-         For details, take a look at <file:Documentation/cpu-freq>.
-
          If in doubt, say N.
 endif
 
@@ -318,8 +309,6 @@ config SH_CPU_FREQ
          will also generate a notice in the boot log before disabling
          itself if the CPU in question is not capable of rate rounding.
 
-         For details, take a look at <file:Documentation/cpu-freq>.
-
          If unsure, say N.
 endif
 
index e65e0a4..a5c5f70 100644 (file)
@@ -19,6 +19,16 @@ config ACPI_CPPC_CPUFREQ
 
          If in doubt, say N.
 
+config ACPI_CPPC_CPUFREQ_FIE
+       bool "Frequency Invariance support for CPPC cpufreq driver"
+       depends on ACPI_CPPC_CPUFREQ && GENERIC_ARCH_TOPOLOGY
+       default y
+       help
+         This extends frequency invariance support in the CPPC cpufreq driver,
+         by using CPPC delivered and reference performance counters.
+
+         If in doubt, say N.
+
 config ARM_ALLWINNER_SUN50I_CPUFREQ_NVMEM
        tristate "Allwinner nvmem based SUN50I CPUFreq driver"
        depends on ARCH_SUNXI
index b4af409..3fc98a3 100644 (file)
 
 #include "cpufreq-dt.h"
 
+/* Clk register set */
+#define ARMADA_37XX_CLK_TBG_SEL                0
+#define ARMADA_37XX_CLK_TBG_SEL_CPU_OFF        22
+
 /* Power management in North Bridge register set */
 #define ARMADA_37XX_NB_L0L1    0x18
 #define ARMADA_37XX_NB_L2L3    0x1C
@@ -69,6 +73,8 @@
 #define LOAD_LEVEL_NR  4
 
 #define MIN_VOLT_MV 1000
+#define MIN_VOLT_MV_FOR_L1_1000MHZ 1108
+#define MIN_VOLT_MV_FOR_L1_1200MHZ 1155
 
 /*  AVS value for the corresponding voltage (in mV) */
 static int avs_map[] = {
@@ -80,6 +86,8 @@ static int avs_map[] = {
 };
 
 struct armada37xx_cpufreq_state {
+       struct platform_device *pdev;
+       struct device *cpu_dev;
        struct regmap *regmap;
        u32 nb_l0l1;
        u32 nb_l2l3;
@@ -120,10 +128,15 @@ static struct armada_37xx_dvfs *armada_37xx_cpu_freq_info_get(u32 freq)
  * will be configured then the DVFS will be enabled.
  */
 static void __init armada37xx_cpufreq_dvfs_setup(struct regmap *base,
-                                                struct clk *clk, u8 *divider)
+                                                struct regmap *clk_base, u8 *divider)
 {
+       u32 cpu_tbg_sel;
        int load_lvl;
-       struct clk *parent;
+
+       /* Determine which TBG clock the CPU is connected to */
+       regmap_read(clk_base, ARMADA_37XX_CLK_TBG_SEL, &cpu_tbg_sel);
+       cpu_tbg_sel >>= ARMADA_37XX_CLK_TBG_SEL_CPU_OFF;
+       cpu_tbg_sel &= ARMADA_37XX_NB_TBG_SEL_MASK;
 
        for (load_lvl = 0; load_lvl < LOAD_LEVEL_NR; load_lvl++) {
                unsigned int reg, mask, val, offset = 0;
@@ -142,6 +155,11 @@ static void __init armada37xx_cpufreq_dvfs_setup(struct regmap *base,
                mask = (ARMADA_37XX_NB_CLK_SEL_MASK
                        << ARMADA_37XX_NB_CLK_SEL_OFF);
 
+               /* Set TBG index, for all levels we use the same TBG */
+               val = cpu_tbg_sel << ARMADA_37XX_NB_TBG_SEL_OFF;
+               mask = (ARMADA_37XX_NB_TBG_SEL_MASK
+                       << ARMADA_37XX_NB_TBG_SEL_OFF);
+
                /*
                 * Set cpu divider based on the pre-computed array in
                 * order to have balanced step.
@@ -160,14 +178,6 @@ static void __init armada37xx_cpufreq_dvfs_setup(struct regmap *base,
 
                regmap_update_bits(base, reg, mask, val);
        }
-
-       /*
-        * Set cpu clock source, for all the level we keep the same
-        * clock source that the one already configured. For this one
-        * we need to use the clock framework
-        */
-       parent = clk_get_parent(clk);
-       clk_set_parent(clk, parent);
 }
 
 /*
@@ -202,6 +212,8 @@ static u32 armada_37xx_avs_val_match(int target_vm)
  * - L2 & L3 voltage should be about 150mv smaller than L0 voltage.
  * This function calculates L1 & L2 & L3 AVS values dynamically based
  * on L0 voltage and fill all AVS values to the AVS value table.
+ * When the base CPU frequency is 1000 or 1200 MHz there is an additional
+ * minimal avs value for load L1.
  */
 static void __init armada37xx_cpufreq_avs_configure(struct regmap *base,
                                                struct armada_37xx_dvfs *dvfs)
@@ -233,6 +245,19 @@ static void __init armada37xx_cpufreq_avs_configure(struct regmap *base,
                for (load_level = 1; load_level < LOAD_LEVEL_NR; load_level++)
                        dvfs->avs[load_level] = avs_min;
 
+               /*
+                * Set the avs values for load L0 and L1 when base CPU frequency
+                * is 1000/1200 MHz to their typical initial values according to
+                * the Armada 3700 Hardware Specifications.
+                */
+               if (dvfs->cpu_freq_max >= 1000*1000*1000) {
+                       if (dvfs->cpu_freq_max >= 1200*1000*1000)
+                               avs_min = armada_37xx_avs_val_match(MIN_VOLT_MV_FOR_L1_1200MHZ);
+                       else
+                               avs_min = armada_37xx_avs_val_match(MIN_VOLT_MV_FOR_L1_1000MHZ);
+                       dvfs->avs[0] = dvfs->avs[1] = avs_min;
+               }
+
                return;
        }
 
@@ -252,6 +277,26 @@ static void __init armada37xx_cpufreq_avs_configure(struct regmap *base,
        target_vm = avs_map[l0_vdd_min] - 150;
        target_vm = target_vm > MIN_VOLT_MV ? target_vm : MIN_VOLT_MV;
        dvfs->avs[2] = dvfs->avs[3] = armada_37xx_avs_val_match(target_vm);
+
+       /*
+        * Fix the avs value for load L1 when base CPU frequency is 1000/1200 MHz,
+        * otherwise the CPU gets stuck when switching from load L1 to load L0.
+        * Also ensure that avs value for load L1 is not higher than for L0.
+        */
+       if (dvfs->cpu_freq_max >= 1000*1000*1000) {
+               u32 avs_min_l1;
+
+               if (dvfs->cpu_freq_max >= 1200*1000*1000)
+                       avs_min_l1 = armada_37xx_avs_val_match(MIN_VOLT_MV_FOR_L1_1200MHZ);
+               else
+                       avs_min_l1 = armada_37xx_avs_val_match(MIN_VOLT_MV_FOR_L1_1000MHZ);
+
+               if (avs_min_l1 > dvfs->avs[0])
+                       avs_min_l1 = dvfs->avs[0];
+
+               if (dvfs->avs[1] < avs_min_l1)
+                       dvfs->avs[1] = avs_min_l1;
+       }
 }
 
 static void __init armada37xx_cpufreq_avs_setup(struct regmap *base,
@@ -357,12 +402,17 @@ static int __init armada37xx_cpufreq_driver_init(void)
        struct armada_37xx_dvfs *dvfs;
        struct platform_device *pdev;
        unsigned long freq;
-       unsigned int cur_frequency, base_frequency;
-       struct regmap *nb_pm_base, *avs_base;
+       unsigned int base_frequency;
+       struct regmap *nb_clk_base, *nb_pm_base, *avs_base;
        struct device *cpu_dev;
        int load_lvl, ret;
        struct clk *clk, *parent;
 
+       nb_clk_base =
+               syscon_regmap_lookup_by_compatible("marvell,armada-3700-periph-clock-nb");
+       if (IS_ERR(nb_clk_base))
+               return -ENODEV;
+
        nb_pm_base =
                syscon_regmap_lookup_by_compatible("marvell,armada-3700-nb-pm");
 
@@ -413,15 +463,7 @@ static int __init armada37xx_cpufreq_driver_init(void)
                return -EINVAL;
        }
 
-       /* Get nominal (current) CPU frequency */
-       cur_frequency = clk_get_rate(clk);
-       if (!cur_frequency) {
-               dev_err(cpu_dev, "Failed to get clock rate for CPU\n");
-               clk_put(clk);
-               return -EINVAL;
-       }
-
-       dvfs = armada_37xx_cpu_freq_info_get(cur_frequency);
+       dvfs = armada_37xx_cpu_freq_info_get(base_frequency);
        if (!dvfs) {
                clk_put(clk);
                return -EINVAL;
@@ -439,7 +481,7 @@ static int __init armada37xx_cpufreq_driver_init(void)
        armada37xx_cpufreq_avs_configure(avs_base, dvfs);
        armada37xx_cpufreq_avs_setup(avs_base, dvfs);
 
-       armada37xx_cpufreq_dvfs_setup(nb_pm_base, clk, dvfs->divider);
+       armada37xx_cpufreq_dvfs_setup(nb_pm_base, nb_clk_base, dvfs->divider);
        clk_put(clk);
 
        for (load_lvl = ARMADA_37XX_DVFS_LOAD_0; load_lvl < LOAD_LEVEL_NR;
@@ -466,6 +508,9 @@ static int __init armada37xx_cpufreq_driver_init(void)
        if (ret)
                goto disable_dvfs;
 
+       armada37xx_cpufreq_state->cpu_dev = cpu_dev;
+       armada37xx_cpufreq_state->pdev = pdev;
+       platform_set_drvdata(pdev, dvfs);
        return 0;
 
 disable_dvfs:
@@ -473,7 +518,7 @@ disable_dvfs:
 remove_opp:
        /* clean-up the already added opp before leaving */
        while (load_lvl-- > ARMADA_37XX_DVFS_LOAD_0) {
-               freq = cur_frequency / dvfs->divider[load_lvl];
+               freq = base_frequency / dvfs->divider[load_lvl];
                dev_pm_opp_remove(cpu_dev, freq);
        }
 
@@ -484,6 +529,26 @@ remove_opp:
 /* late_initcall, to guarantee the driver is loaded after A37xx clock driver */
 late_initcall(armada37xx_cpufreq_driver_init);
 
+static void __exit armada37xx_cpufreq_driver_exit(void)
+{
+       struct platform_device *pdev = armada37xx_cpufreq_state->pdev;
+       struct armada_37xx_dvfs *dvfs = platform_get_drvdata(pdev);
+       unsigned long freq;
+       int load_lvl;
+
+       platform_device_unregister(pdev);
+
+       armada37xx_cpufreq_disable_dvfs(armada37xx_cpufreq_state->regmap);
+
+       for (load_lvl = ARMADA_37XX_DVFS_LOAD_0; load_lvl < LOAD_LEVEL_NR; load_lvl++) {
+               freq = dvfs->cpu_freq_max / dvfs->divider[load_lvl];
+               dev_pm_opp_remove(armada37xx_cpufreq_state->cpu_dev, freq);
+       }
+
+       kfree(armada37xx_cpufreq_state);
+}
+module_exit(armada37xx_cpufreq_driver_exit);
+
 static const struct of_device_id __maybe_unused armada37xx_cpufreq_of_match[] = {
        { .compatible = "marvell,armada-3700-nb-pm" },
        { },
index 8a482c4..3848b4c 100644 (file)
 
 #define pr_fmt(fmt)    "CPPC Cpufreq:" fmt
 
+#include <linux/arch_topology.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/delay.h>
 #include <linux/cpu.h>
 #include <linux/cpufreq.h>
 #include <linux/dmi.h>
+#include <linux/irq_work.h>
+#include <linux/kthread.h>
 #include <linux/time.h>
 #include <linux/vmalloc.h>
+#include <uapi/linux/sched/types.h>
 
 #include <asm/unaligned.h>
 
@@ -57,6 +61,204 @@ static struct cppc_workaround_oem_info wa_info[] = {
        }
 };
 
+#ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE
+
+/* Frequency invariance support */
+struct cppc_freq_invariance {
+       int cpu;
+       struct irq_work irq_work;
+       struct kthread_work work;
+       struct cppc_perf_fb_ctrs prev_perf_fb_ctrs;
+       struct cppc_cpudata *cpu_data;
+};
+
+static DEFINE_PER_CPU(struct cppc_freq_invariance, cppc_freq_inv);
+static struct kthread_worker *kworker_fie;
+static bool fie_disabled;
+
+static struct cpufreq_driver cppc_cpufreq_driver;
+static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu);
+static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data,
+                                struct cppc_perf_fb_ctrs fb_ctrs_t0,
+                                struct cppc_perf_fb_ctrs fb_ctrs_t1);
+
+/**
+ * cppc_scale_freq_workfn - CPPC arch_freq_scale updater for frequency invariance
+ * @work: The work item.
+ *
+ * The CPPC driver registers itself with the topology core to provide its own
+ * implementation (cppc_scale_freq_tick()) of topology_scale_freq_tick() which
+ * gets called by the scheduler on every tick.
+ *
+ * Note that the arch specific counters have higher priority than CPPC counters,
+ * if available, though the CPPC driver doesn't need to have any special
+ * handling for that.
+ *
+ * On an invocation of cppc_scale_freq_tick(), we schedule an irq work (since we
+ * reach here from hard-irq context), which then schedules a normal work item
+ * and cppc_scale_freq_workfn() updates the per_cpu arch_freq_scale variable
+ * based on the counter updates since the last tick.
+ */
+static void cppc_scale_freq_workfn(struct kthread_work *work)
+{
+       struct cppc_freq_invariance *cppc_fi;
+       struct cppc_perf_fb_ctrs fb_ctrs = {0};
+       struct cppc_cpudata *cpu_data;
+       unsigned long local_freq_scale;
+       u64 perf;
+
+       cppc_fi = container_of(work, struct cppc_freq_invariance, work);
+       cpu_data = cppc_fi->cpu_data;
+
+       if (cppc_get_perf_ctrs(cppc_fi->cpu, &fb_ctrs)) {
+               pr_warn("%s: failed to read perf counters\n", __func__);
+               return;
+       }
+
+       cppc_fi->prev_perf_fb_ctrs = fb_ctrs;
+       perf = cppc_perf_from_fbctrs(cpu_data, cppc_fi->prev_perf_fb_ctrs,
+                                    fb_ctrs);
+
+       perf <<= SCHED_CAPACITY_SHIFT;
+       local_freq_scale = div64_u64(perf, cpu_data->perf_caps.highest_perf);
+       if (WARN_ON(local_freq_scale > 1024))
+               local_freq_scale = 1024;
+
+       per_cpu(arch_freq_scale, cppc_fi->cpu) = local_freq_scale;
+}
+
+static void cppc_irq_work(struct irq_work *irq_work)
+{
+       struct cppc_freq_invariance *cppc_fi;
+
+       cppc_fi = container_of(irq_work, struct cppc_freq_invariance, irq_work);
+       kthread_queue_work(kworker_fie, &cppc_fi->work);
+}
+
+static void cppc_scale_freq_tick(void)
+{
+       struct cppc_freq_invariance *cppc_fi = &per_cpu(cppc_freq_inv, smp_processor_id());
+
+       /*
+        * cppc_get_perf_ctrs() can potentially sleep, call that from the right
+        * context.
+        */
+       irq_work_queue(&cppc_fi->irq_work);
+}
+
+static struct scale_freq_data cppc_sftd = {
+       .source = SCALE_FREQ_SOURCE_CPPC,
+       .set_freq_scale = cppc_scale_freq_tick,
+};
+
+static void cppc_freq_invariance_policy_init(struct cpufreq_policy *policy,
+                                            struct cppc_cpudata *cpu_data)
+{
+       struct cppc_perf_fb_ctrs fb_ctrs = {0};
+       struct cppc_freq_invariance *cppc_fi;
+       int i, ret;
+
+       if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
+               return;
+
+       if (fie_disabled)
+               return;
+
+       for_each_cpu(i, policy->cpus) {
+               cppc_fi = &per_cpu(cppc_freq_inv, i);
+               cppc_fi->cpu = i;
+               cppc_fi->cpu_data = cpu_data;
+               kthread_init_work(&cppc_fi->work, cppc_scale_freq_workfn);
+               init_irq_work(&cppc_fi->irq_work, cppc_irq_work);
+
+               ret = cppc_get_perf_ctrs(i, &fb_ctrs);
+               if (ret) {
+                       pr_warn("%s: failed to read perf counters: %d\n",
+                               __func__, ret);
+                       fie_disabled = true;
+               } else {
+                       cppc_fi->prev_perf_fb_ctrs = fb_ctrs;
+               }
+       }
+}
+
+static void __init cppc_freq_invariance_init(void)
+{
+       struct sched_attr attr = {
+               .size           = sizeof(struct sched_attr),
+               .sched_policy   = SCHED_DEADLINE,
+               .sched_nice     = 0,
+               .sched_priority = 0,
+               /*
+                * Fake (unused) bandwidth; workaround to "fix"
+                * priority inheritance.
+                */
+               .sched_runtime  = 1000000,
+               .sched_deadline = 10000000,
+               .sched_period   = 10000000,
+       };
+       int ret;
+
+       if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
+               return;
+
+       if (fie_disabled)
+               return;
+
+       kworker_fie = kthread_create_worker(0, "cppc_fie");
+       if (IS_ERR(kworker_fie))
+               return;
+
+       ret = sched_setattr_nocheck(kworker_fie->task, &attr);
+       if (ret) {
+               pr_warn("%s: failed to set SCHED_DEADLINE: %d\n", __func__,
+                       ret);
+               kthread_destroy_worker(kworker_fie);
+               return;
+       }
+
+       /* Register for freq-invariance */
+       topology_set_scale_freq_source(&cppc_sftd, cpu_present_mask);
+}
+
+static void cppc_freq_invariance_exit(void)
+{
+       struct cppc_freq_invariance *cppc_fi;
+       int i;
+
+       if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
+               return;
+
+       if (fie_disabled)
+               return;
+
+       topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_CPPC, cpu_present_mask);
+
+       for_each_possible_cpu(i) {
+               cppc_fi = &per_cpu(cppc_freq_inv, i);
+               irq_work_sync(&cppc_fi->irq_work);
+       }
+
+       kthread_destroy_worker(kworker_fie);
+       kworker_fie = NULL;
+}
+
+#else
+static inline void
+cppc_freq_invariance_policy_init(struct cpufreq_policy *policy,
+                                struct cppc_cpudata *cpu_data)
+{
+}
+
+static inline void cppc_freq_invariance_init(void)
+{
+}
+
+static inline void cppc_freq_invariance_exit(void)
+{
+}
+#endif /* CONFIG_ACPI_CPPC_CPUFREQ_FIE */
+
 /* Callback function used to retrieve the max frequency from DMI */
 static void cppc_find_dmi_mhz(const struct dmi_header *dm, void *private)
 {
@@ -216,26 +418,16 @@ static unsigned int cppc_cpufreq_get_transition_delay_us(unsigned int cpu)
 {
        unsigned long implementor = read_cpuid_implementor();
        unsigned long part_num = read_cpuid_part_number();
-       unsigned int delay_us = 0;
 
        switch (implementor) {
        case ARM_CPU_IMP_QCOM:
                switch (part_num) {
                case QCOM_CPU_PART_FALKOR_V1:
                case QCOM_CPU_PART_FALKOR:
-                       delay_us = 10000;
-                       break;
-               default:
-                       delay_us = cppc_get_transition_latency(cpu) / NSEC_PER_USEC;
-                       break;
+                       return 10000;
                }
-               break;
-       default:
-               delay_us = cppc_get_transition_latency(cpu) / NSEC_PER_USEC;
-               break;
        }
-
-       return delay_us;
+       return cppc_get_transition_latency(cpu) / NSEC_PER_USEC;
 }
 
 #else
@@ -355,9 +547,12 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy)
        cpu_data->perf_ctrls.desired_perf =  caps->highest_perf;
 
        ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls);
-       if (ret)
+       if (ret) {
                pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n",
                         caps->highest_perf, cpu, ret);
+       } else {
+               cppc_freq_invariance_policy_init(policy, cpu_data);
+       }
 
        return ret;
 }
@@ -370,12 +565,12 @@ static inline u64 get_delta(u64 t1, u64 t0)
        return (u32)t1 - (u32)t0;
 }
 
-static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data,
-                                    struct cppc_perf_fb_ctrs fb_ctrs_t0,
-                                    struct cppc_perf_fb_ctrs fb_ctrs_t1)
+static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data,
+                                struct cppc_perf_fb_ctrs fb_ctrs_t0,
+                                struct cppc_perf_fb_ctrs fb_ctrs_t1)
 {
        u64 delta_reference, delta_delivered;
-       u64 reference_perf, delivered_perf;
+       u64 reference_perf;
 
        reference_perf = fb_ctrs_t0.reference_perf;
 
@@ -384,12 +579,21 @@ static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data,
        delta_delivered = get_delta(fb_ctrs_t1.delivered,
                                    fb_ctrs_t0.delivered);
 
-       /* Check to avoid divide-by zero */
-       if (delta_reference || delta_delivered)
-               delivered_perf = (reference_perf * delta_delivered) /
-                                       delta_reference;
-       else
-               delivered_perf = cpu_data->perf_ctrls.desired_perf;
+       /* Check to avoid divide-by-zero and invalid delivered_perf */
+       if (!delta_reference || !delta_delivered)
+               return cpu_data->perf_ctrls.desired_perf;
+
+       return (reference_perf * delta_delivered) / delta_reference;
+}
+
+static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data,
+                                    struct cppc_perf_fb_ctrs fb_ctrs_t0,
+                                    struct cppc_perf_fb_ctrs fb_ctrs_t1)
+{
+       u64 delivered_perf;
+
+       delivered_perf = cppc_perf_from_fbctrs(cpu_data, fb_ctrs_t0,
+                                              fb_ctrs_t1);
 
        return cppc_cpufreq_perf_to_khz(cpu_data, delivered_perf);
 }
@@ -514,6 +718,8 @@ static void cppc_check_hisi_workaround(void)
 
 static int __init cppc_cpufreq_init(void)
 {
+       int ret;
+
        if ((acpi_disabled) || !acpi_cpc_valid())
                return -ENODEV;
 
@@ -521,7 +727,11 @@ static int __init cppc_cpufreq_init(void)
 
        cppc_check_hisi_workaround();
 
-       return cpufreq_register_driver(&cppc_cpufreq_driver);
+       ret = cpufreq_register_driver(&cppc_cpufreq_driver);
+       if (!ret)
+               cppc_freq_invariance_init();
+
+       return ret;
 }
 
 static inline void free_cpu_data(void)
@@ -538,6 +748,7 @@ static inline void free_cpu_data(void)
 
 static void __exit cppc_cpufreq_exit(void)
 {
+       cppc_freq_invariance_exit();
        cpufreq_unregister_driver(&cppc_cpufreq_driver);
 
        free_cpu_data();
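
To make the CPPC frequency-scale math above concrete, a hypothetical walk-through with made-up counter values: reference_perf = 100, highest_perf = 400, and the delivered counter advancing twice as fast as the reference counter since the last tick:

    #include <linux/types.h>

    static u64 example_cppc_scale(void)
    {
            u64 reference_perf = 100, highest_perf = 400;
            u64 delta_reference = 50000, delta_delivered = 100000;
            u64 perf, scale;

            /* cppc_perf_from_fbctrs(): 100 * 100000 / 50000 == 200 */
            perf = (reference_perf * delta_delivered) / delta_reference;

            /* cppc_scale_freq_workfn(): (200 << 10) / 400 == 512, i.e. half capacity */
            scale = (perf << 10) / highest_perf;

            return scale;
    }
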
index b1e1bdc..ece5286 100644 (file)
@@ -255,10 +255,15 @@ static int dt_cpufreq_early_init(struct device *dev, int cpu)
         * before updating priv->cpus. Otherwise, we will end up creating
         * duplicate OPPs for the CPUs.
         *
-        * OPPs might be populated at runtime, don't check for error here.
+        * OPPs might be populated at runtime, so don't fail on errors here unless
+        * the error is -EPROBE_DEFER.
         */
-       if (!dev_pm_opp_of_cpumask_add_table(priv->cpus))
+       ret = dev_pm_opp_of_cpumask_add_table(priv->cpus);
+       if (!ret) {
                priv->have_static_opps = true;
+       } else if (ret == -EPROBE_DEFER) {
+               goto out;
+       }
 
        /*
         * The OPP table must be initialized, statically or dynamically, by this
index 1d1b563..802abc9 100644 (file)
@@ -42,9 +42,6 @@ static LIST_HEAD(cpufreq_policy_list);
 #define for_each_inactive_policy(__policy)             \
        for_each_suitable_policy(__policy, false)
 
-#define for_each_policy(__policy)                      \
-       list_for_each_entry(__policy, &cpufreq_policy_list, policy_list)
-
 /* Iterate over governors */
 static LIST_HEAD(cpufreq_governor_list);
 #define for_each_governor(__governor)                          \
index 2efe718..c6bdc45 100644 (file)
@@ -54,7 +54,7 @@ processor_set_pstate (
        retval = ia64_pal_set_pstate((u64)value);
 
        if (retval) {
-               pr_debug("Failed to set freq to 0x%x, with error 0x%lx\n",
+               pr_debug("Failed to set freq to 0x%x, with error 0x%llx\n",
                        value, retval);
                return -ENODEV;
        }
@@ -77,7 +77,7 @@ processor_get_pstate (
 
        if (retval)
                pr_debug("Failed to get current freq with "
-                       "error 0x%lx, idx 0x%x\n", retval, *value);
+                       "error 0x%llx, idx 0x%x\n", retval, *value);
 
        return (int)retval;
 }
index 5175ae3..f040106 100644 (file)
@@ -819,19 +819,21 @@ static struct freq_attr *hwp_cpufreq_attrs[] = {
        NULL,
 };
 
-static void intel_pstate_get_hwp_max(struct cpudata *cpu, int *phy_max,
-                                    int *current_max)
+static void __intel_pstate_get_hwp_cap(struct cpudata *cpu)
 {
        u64 cap;
 
        rdmsrl_on_cpu(cpu->cpu, MSR_HWP_CAPABILITIES, &cap);
        WRITE_ONCE(cpu->hwp_cap_cached, cap);
-       if (global.no_turbo || global.turbo_disabled)
-               *current_max = HWP_GUARANTEED_PERF(cap);
-       else
-               *current_max = HWP_HIGHEST_PERF(cap);
+       cpu->pstate.max_pstate = HWP_GUARANTEED_PERF(cap);
+       cpu->pstate.turbo_pstate = HWP_HIGHEST_PERF(cap);
+}
 
-       *phy_max = HWP_HIGHEST_PERF(cap);
+static void intel_pstate_get_hwp_cap(struct cpudata *cpu)
+{
+       __intel_pstate_get_hwp_cap(cpu);
+       cpu->pstate.max_freq = cpu->pstate.max_pstate * cpu->pstate.scaling;
+       cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
 }
 
 static void intel_pstate_hwp_set(unsigned int cpu)
@@ -1195,12 +1197,13 @@ static ssize_t store_no_turbo(struct kobject *a, struct kobj_attribute *b,
 
 static void update_qos_request(enum freq_qos_req_type type)
 {
-       int max_state, turbo_max, freq, i, perf_pct;
        struct freq_qos_request *req;
        struct cpufreq_policy *policy;
+       int i;
 
        for_each_possible_cpu(i) {
                struct cpudata *cpu = all_cpu_data[i];
+               unsigned int freq, perf_pct;
 
                policy = cpufreq_cpu_get(i);
                if (!policy)
@@ -1213,9 +1216,7 @@ static void update_qos_request(enum freq_qos_req_type type)
                        continue;
 
                if (hwp_active)
-                       intel_pstate_get_hwp_max(cpu, &turbo_max, &max_state);
-               else
-                       turbo_max = cpu->pstate.turbo_pstate;
+                       intel_pstate_get_hwp_cap(cpu);
 
                if (type == FREQ_QOS_MIN) {
                        perf_pct = global.min_perf_pct;
@@ -1224,8 +1225,7 @@ static void update_qos_request(enum freq_qos_req_type type)
                        perf_pct = global.max_perf_pct;
                }
 
-               freq = DIV_ROUND_UP(turbo_max * perf_pct, 100);
-               freq *= cpu->pstate.scaling;
+               freq = DIV_ROUND_UP(cpu->pstate.turbo_freq * perf_pct, 100);
 
                if (freq_qos_update_request(req, freq) < 0)
                        pr_warn("Failed to update freq constraint: CPU%d\n", i);
@@ -1715,21 +1715,17 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
 {
        cpu->pstate.min_pstate = pstate_funcs.get_min();
        cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical();
-       cpu->pstate.turbo_pstate = pstate_funcs.get_turbo();
        cpu->pstate.scaling = pstate_funcs.get_scaling();
 
        if (hwp_active && !hwp_mode_bdw) {
-               unsigned int phy_max, current_max;
-
-               intel_pstate_get_hwp_max(cpu, &phy_max, &current_max);
-               cpu->pstate.turbo_freq = phy_max * cpu->pstate.scaling;
-               cpu->pstate.turbo_pstate = phy_max;
-               cpu->pstate.max_pstate = HWP_GUARANTEED_PERF(READ_ONCE(cpu->hwp_cap_cached));
+               __intel_pstate_get_hwp_cap(cpu);
        } else {
-               cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
                cpu->pstate.max_pstate = pstate_funcs.get_max();
+               cpu->pstate.turbo_pstate = pstate_funcs.get_turbo();
        }
+
        cpu->pstate.max_freq = cpu->pstate.max_pstate * cpu->pstate.scaling;
+       cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
 
        if (pstate_funcs.get_aperf_mperf_shift)
                cpu->aperf_mperf_shift = pstate_funcs.get_aperf_mperf_shift();
@@ -2199,41 +2195,34 @@ static void intel_pstate_update_perf_limits(struct cpudata *cpu,
                                            unsigned int policy_min,
                                            unsigned int policy_max)
 {
+       int scaling = cpu->pstate.scaling;
        int32_t max_policy_perf, min_policy_perf;
-       int max_state, turbo_max;
-       int max_freq;
 
        /*
-        * HWP needs some special consideration, because on BDX the
-        * HWP_REQUEST uses abstract value to represent performance
-        * rather than pure ratios.
+        * HWP needs some special consideration, because HWP_REQUEST uses
+        * abstract values to represent performance rather than pure ratios.
         */
-       if (hwp_active) {
-               intel_pstate_get_hwp_max(cpu, &turbo_max, &max_state);
-       } else {
-               max_state = global.no_turbo || global.turbo_disabled ?
-                       cpu->pstate.max_pstate : cpu->pstate.turbo_pstate;
-               turbo_max = cpu->pstate.turbo_pstate;
-       }
-       max_freq = max_state * cpu->pstate.scaling;
+       if (hwp_active)
+               intel_pstate_get_hwp_cap(cpu);
 
-       max_policy_perf = max_state * policy_max / max_freq;
+       max_policy_perf = policy_max / scaling;
        if (policy_max == policy_min) {
                min_policy_perf = max_policy_perf;
        } else {
-               min_policy_perf = max_state * policy_min / max_freq;
+               min_policy_perf = policy_min / scaling;
                min_policy_perf = clamp_t(int32_t, min_policy_perf,
                                          0, max_policy_perf);
        }
 
-       pr_debug("cpu:%d max_state %d min_policy_perf:%d max_policy_perf:%d\n",
-                cpu->cpu, max_state, min_policy_perf, max_policy_perf);
+       pr_debug("cpu:%d min_policy_perf:%d max_policy_perf:%d\n",
+                cpu->cpu, min_policy_perf, max_policy_perf);
 
        /* Normalize user input to [min_perf, max_perf] */
        if (per_cpu_limits) {
                cpu->min_perf_ratio = min_policy_perf;
                cpu->max_perf_ratio = max_policy_perf;
        } else {
+               int turbo_max = cpu->pstate.turbo_pstate;
                int32_t global_min, global_max;
 
                /* Global limits are in percent of the maximum turbo P-state. */
@@ -2322,10 +2311,9 @@ static void intel_pstate_verify_cpu_policy(struct cpudata *cpu,
 
        update_turbo_state();
        if (hwp_active) {
-               int max_state, turbo_max;
-
-               intel_pstate_get_hwp_max(cpu, &turbo_max, &max_state);
-               max_freq = max_state * cpu->pstate.scaling;
+               intel_pstate_get_hwp_cap(cpu);
+               max_freq = global.no_turbo || global.turbo_disabled ?
+                               cpu->pstate.max_freq : cpu->pstate.turbo_freq;
        } else {
                max_freq = intel_pstate_get_max_freq(cpu);
        }
@@ -2416,25 +2404,15 @@ static int __intel_pstate_cpu_init(struct cpufreq_policy *policy)
        cpu->max_perf_ratio = 0xFF;
        cpu->min_perf_ratio = 0;
 
-       policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling;
-       policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
-
        /* cpuinfo and default policy values */
        policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling;
        update_turbo_state();
        global.turbo_disabled_mf = global.turbo_disabled;
        policy->cpuinfo.max_freq = global.turbo_disabled ?
-                       cpu->pstate.max_pstate : cpu->pstate.turbo_pstate;
-       policy->cpuinfo.max_freq *= cpu->pstate.scaling;
-
-       if (hwp_active) {
-               unsigned int max_freq;
-
-               max_freq = global.turbo_disabled ?
                        cpu->pstate.max_freq : cpu->pstate.turbo_freq;
-               if (max_freq < policy->cpuinfo.max_freq)
-                       policy->cpuinfo.max_freq = max_freq;
-       }
+
+       policy->min = policy->cpuinfo.min_freq;
+       policy->max = policy->cpuinfo.max_freq;
 
        intel_pstate_init_acpi_perf_limits(policy);
 
@@ -2683,10 +2661,10 @@ static void intel_cpufreq_adjust_perf(unsigned int cpunum,
 
 static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy)
 {
-       int max_state, turbo_max, min_freq, max_freq, ret;
        struct freq_qos_request *req;
        struct cpudata *cpu;
        struct device *dev;
+       int ret, freq;
 
        dev = get_cpu_device(policy->cpu);
        if (!dev)
@@ -2711,30 +2689,31 @@ static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy)
        if (hwp_active) {
                u64 value;
 
-               intel_pstate_get_hwp_max(cpu, &turbo_max, &max_state);
                policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY_HWP;
+
+               intel_pstate_get_hwp_cap(cpu);
+
                rdmsrl_on_cpu(cpu->cpu, MSR_HWP_REQUEST, &value);
                WRITE_ONCE(cpu->hwp_req_cached, value);
+
                cpu->epp_cached = intel_pstate_get_epp(cpu, value);
        } else {
-               turbo_max = cpu->pstate.turbo_pstate;
                policy->transition_delay_us = INTEL_CPUFREQ_TRANSITION_DELAY;
        }
 
-       min_freq = DIV_ROUND_UP(turbo_max * global.min_perf_pct, 100);
-       min_freq *= cpu->pstate.scaling;
-       max_freq = DIV_ROUND_UP(turbo_max * global.max_perf_pct, 100);
-       max_freq *= cpu->pstate.scaling;
+       freq = DIV_ROUND_UP(cpu->pstate.turbo_freq * global.min_perf_pct, 100);
 
        ret = freq_qos_add_request(&policy->constraints, req, FREQ_QOS_MIN,
-                                  min_freq);
+                                  freq);
        if (ret < 0) {
                dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret);
                goto free_req;
        }
 
+       freq = DIV_ROUND_UP(cpu->pstate.turbo_freq * global.max_perf_pct, 100);
+
        ret = freq_qos_add_request(&policy->constraints, req + 1, FREQ_QOS_MAX,
-                                  max_freq);
+                                  freq);
        if (ret < 0) {
                dev_err(dev, "Failed to add max-freq constraint (%d)\n", ret);
                goto remove_min_req;
index 69786e5..ad7d4f2 100644 (file)
@@ -91,7 +91,7 @@ static DEFINE_MUTEX(set_freq_lock);
 /* Use 800MHz when entering sleep mode */
 #define SLEEP_FREQ     (800 * 1000)
 
-/* Tracks if cpu freqency can be updated anymore */
+/* Tracks if CPU frequency can be updated anymore */
 static bool no_cpufreq_access;
 
 /*
@@ -190,7 +190,7 @@ static u32 clkdiv_val[5][11] = {
 
 /*
  * This function set DRAM refresh counter
- * accoriding to operating frequency of DRAM
+ * according to operating frequency of DRAM
  * ch: DMC port number 0 or 1
  * freq: Operating frequency of DRAM(KHz)
  */
@@ -320,7 +320,7 @@ static int s5pv210_target(struct cpufreq_policy *policy, unsigned int index)
 
                /*
                 * 3. DMC1 refresh count for 133Mhz if (index == L4) is
-                * true refresh counter is already programed in upper
+                * true refresh counter is already programmed in upper
                 * code. 0x287@83Mhz
                 */
                if (!bus_speed_changing)
@@ -378,7 +378,7 @@ static int s5pv210_target(struct cpufreq_policy *policy, unsigned int index)
                /*
                 * 6. Turn on APLL
                 * 6-1. Set PMS values
-                * 6-2. Wait untile the PLL is locked
+                * 6-2. Wait until the PLL is locked
                 */
                if (index == L0)
                        writel_relaxed(APLL_VAL_1000, S5P_APLL_CON);
@@ -390,7 +390,7 @@ static int s5pv210_target(struct cpufreq_policy *policy, unsigned int index)
                } while (!(reg & (0x1 << 29)));
 
                /*
-                * 7. Change souce clock from SCLKMPLL(667Mhz)
+                * 7. Change source clock from SCLKMPLL(667Mhz)
                 * to SCLKA2M(200Mhz) in MFC_MUX and G3D MUX
                 * (667/4=166)->(200/4=50)Mhz
                 */
@@ -439,8 +439,8 @@ static int s5pv210_target(struct cpufreq_policy *policy, unsigned int index)
        }
 
        /*
-        * L4 level need to change memory bus speed, hence onedram clock divier
-        * and memory refresh parameter should be changed
+        * L4 level needs to change memory bus speed, hence ONEDRAM clock
+        * divider and memory refresh parameter should be changed
         */
        if (bus_speed_changing) {
                reg = readl_relaxed(S5P_CLK_DIV6);
index 0844fad..334f83e 100644 (file)
@@ -107,7 +107,7 @@ config ARM_TEGRA_CPUIDLE
 
 config ARM_QCOM_SPM_CPUIDLE
        bool "CPU Idle Driver for Qualcomm Subsystem Power Manager (SPM)"
-       depends on (ARCH_QCOM || COMPILE_TEST) && !ARM64
+       depends on (ARCH_QCOM || COMPILE_TEST) && !ARM64 && MMU
        select ARM_CPU_SUSPEND
        select CPU_IDLE_MULTIPLE_DRIVERS
        select DT_IDLE_STATES
index 191966d..508bd9f 100644 (file)
@@ -48,11 +48,6 @@ enum tegra_state {
 static atomic_t tegra_idle_barrier;
 static atomic_t tegra_abort_flag;
 
-static inline bool tegra_cpuidle_using_firmware(void)
-{
-       return firmware_ops->prepare_idle && firmware_ops->do_idle;
-}
-
 static void tegra_cpuidle_report_cpus_state(void)
 {
        unsigned long cpu, lcpu, csr;
@@ -135,13 +130,9 @@ static int tegra_cpuidle_c7_enter(void)
 {
        int err;
 
-       if (tegra_cpuidle_using_firmware()) {
-               err = call_firmware_op(prepare_idle, TF_PM_MODE_LP2_NOFLUSH_L2);
-               if (err)
-                       return err;
-
-               return call_firmware_op(do_idle, 0);
-       }
+       err = call_firmware_op(prepare_idle, TF_PM_MODE_LP2_NOFLUSH_L2);
+       if (err && err != -ENOSYS)
+               return err;
 
        return cpu_suspend(0, tegra30_pm_secondary_cpu_suspend);
 }
@@ -356,9 +347,7 @@ static int tegra_cpuidle_probe(struct platform_device *pdev)
         * is disabled.
         */
        if (!IS_ENABLED(CONFIG_PM_SLEEP)) {
-               if (!tegra_cpuidle_using_firmware())
-                       tegra_cpuidle_disable_state(TEGRA_C7);
-
+               tegra_cpuidle_disable_state(TEGRA_C7);
                tegra_cpuidle_disable_state(TEGRA_CC6);
        }
 
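The Tegra cpuidle rework above relies on call_firmware_op() evaluating to -ENOSYS when the requested operation has no implementation, which is what makes the explicit tegra_cpuidle_using_firmware() check redundant. The sketch below is a user-space model of that fallback convention; the function names and the mode argument are illustrative, not the kernel's firmware_ops interface.

    /* Model of the "-ENOSYS means fall through" convention used above. */
    #include <errno.h>
    #include <stdio.h>

    /* NULL when no firmware backend registered an implementation. */
    static int (*prepare_idle_op)(int mode);

    static int call_prepare_idle(int mode)
    {
            return prepare_idle_op ? prepare_idle_op(mode) : -ENOSYS;
    }

    static int enter_c7(void)
    {
            int err = call_prepare_idle(0);  /* mode value is a placeholder */

            /* Only a real firmware failure aborts; "not implemented" falls through. */
            if (err && err != -ENOSYS)
                    return err;

            printf("falling back to the cpu_suspend() path\n");
            return 0;
    }

    int main(void)
    {
            return enter_c7();
    }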
index 4070e57..f70aa17 100644 (file)
@@ -181,9 +181,13 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv)
                 */
                if (s->target_residency > 0)
                        s->target_residency_ns = s->target_residency * NSEC_PER_USEC;
+               else if (s->target_residency_ns < 0)
+                       s->target_residency_ns = 0;
 
                if (s->exit_latency > 0)
                        s->exit_latency_ns = s->exit_latency * NSEC_PER_USEC;
+               else if (s->exit_latency_ns < 0)
+                       s->exit_latency_ns =  0;
        }
 }
 
index b0a7ad5..c3aa8d6 100644 (file)
@@ -271,7 +271,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
        u64 predicted_ns;
        u64 interactivity_req;
        unsigned long nr_iowaiters;
-       ktime_t delta_next;
+       ktime_t delta, delta_tick;
        int i, idx;
 
        if (data->needs_update) {
@@ -280,7 +280,12 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
        }
 
        /* determine the expected residency time, round up */
-       data->next_timer_ns = tick_nohz_get_sleep_length(&delta_next);
+       delta = tick_nohz_get_sleep_length(&delta_tick);
+       if (unlikely(delta < 0)) {
+               delta = 0;
+               delta_tick = 0;
+       }
+       data->next_timer_ns = delta;
 
        nr_iowaiters = nr_iowait_cpu(dev->cpu);
        data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);
@@ -318,7 +323,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
                 * state selection.
                 */
                if (predicted_ns < TICK_NSEC)
-                       predicted_ns = delta_next;
+                       predicted_ns = data->next_timer_ns;
        } else {
                /*
                 * Use the performance multiplier and the user-configurable
@@ -377,7 +382,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
                         * stuck in the shallow one for too long.
                         */
                        if (drv->states[idx].target_residency_ns < TICK_NSEC &&
-                           s->target_residency_ns <= delta_next)
+                           s->target_residency_ns <= delta_tick)
                                idx = i;
 
                        return idx;
@@ -399,7 +404,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
             predicted_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) {
                *stop_tick = false;
 
-               if (idx > 0 && drv->states[idx].target_residency_ns > delta_next) {
+               if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) {
                        /*
                         * The tick is not going to be stopped and the target
                         * residency of the state to be returned is not within
@@ -411,7 +416,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
                                        continue;
 
                                idx = i;
-                               if (drv->states[i].target_residency_ns <= delta_next)
+                               if (drv->states[i].target_residency_ns <= delta_tick)
                                        break;
                        }
                }
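The menu governor now treats the value reported by tick_nohz_get_sleep_length() as signed: a negative delta means the next timer is already due, and both the main delta and the tick-bound delta are clamped to zero before the prediction logic uses them. A minimal user-space model of that sanitization follows; the numeric values are made up.

    /* Minimal model of clamping a possibly-negative sleep length to zero. */
    #include <stdint.h>
    #include <stdio.h>

    typedef int64_t ktime_t;        /* nanoseconds, signed, as in the kernel */

    static void sanitize_sleep_length(ktime_t *delta, ktime_t *delta_tick)
    {
            if (*delta < 0) {       /* the next timer is already due */
                    *delta = 0;
                    *delta_tick = 0;
            }
    }

    int main(void)
    {
            ktime_t delta = -1500, delta_tick = -1500;      /* made-up values */

            sanitize_sleep_length(&delta, &delta_tick);
            printf("next_timer_ns=%lld delta_tick=%lld\n",
                   (long long)delta, (long long)delta_tick);
            return 0;
    }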
index 6deaaf5..ac4bb27 100644 (file)
@@ -100,8 +100,8 @@ struct teo_idle_state {
  * @intervals: Saved idle duration values.
  */
 struct teo_cpu {
-       u64 time_span_ns;
-       u64 sleep_length_ns;
+       s64 time_span_ns;
+       s64 sleep_length_ns;
        struct teo_idle_state states[CPUIDLE_STATE_MAX];
        int interval_idx;
        u64 intervals[INTERVALS];
@@ -117,7 +117,8 @@ static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
 static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
 {
        struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
-       int i, idx_hit = -1, idx_timer = -1;
+       int i, idx_hit = 0, idx_timer = 0;
+       unsigned int hits, misses;
        u64 measured_ns;
 
        if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) {
@@ -174,25 +175,22 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
         * also increase the "early hits" metric for the state that actually
         * matches the measured idle duration.
         */
-       if (idx_timer >= 0) {
-               unsigned int hits = cpu_data->states[idx_timer].hits;
-               unsigned int misses = cpu_data->states[idx_timer].misses;
-
-               hits -= hits >> DECAY_SHIFT;
-               misses -= misses >> DECAY_SHIFT;
-
-               if (idx_timer > idx_hit) {
-                       misses += PULSE;
-                       if (idx_hit >= 0)
-                               cpu_data->states[idx_hit].early_hits += PULSE;
-               } else {
-                       hits += PULSE;
-               }
+       hits = cpu_data->states[idx_timer].hits;
+       hits -= hits >> DECAY_SHIFT;
+
+       misses = cpu_data->states[idx_timer].misses;
+       misses -= misses >> DECAY_SHIFT;
 
-               cpu_data->states[idx_timer].misses = misses;
-               cpu_data->states[idx_timer].hits = hits;
+       if (idx_timer == idx_hit) {
+               hits += PULSE;
+       } else {
+               misses += PULSE;
+               cpu_data->states[idx_hit].early_hits += PULSE;
        }
 
+       cpu_data->states[idx_timer].misses = misses;
+       cpu_data->states[idx_timer].hits = hits;
+
        /*
         * Save idle duration values corresponding to non-timer wakeups for
         * pattern detection.
@@ -216,7 +214,7 @@ static bool teo_time_ok(u64 interval_ns)
  */
 static int teo_find_shallower_state(struct cpuidle_driver *drv,
                                    struct cpuidle_device *dev, int state_idx,
-                                   u64 duration_ns)
+                                   s64 duration_ns)
 {
        int i;
 
@@ -242,10 +240,10 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 {
        struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
        s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
-       u64 duration_ns;
+       int max_early_idx, prev_max_early_idx, constraint_idx, idx0, idx, i;
        unsigned int hits, misses, early_hits;
-       int max_early_idx, prev_max_early_idx, constraint_idx, idx, i;
        ktime_t delta_tick;
+       s64 duration_ns;
 
        if (dev->last_state_idx >= 0) {
                teo_update(drv, dev);
@@ -264,6 +262,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
        prev_max_early_idx = -1;
        constraint_idx = drv->state_count;
        idx = -1;
+       idx0 = idx;
 
        for (i = 0; i < drv->state_count; i++) {
                struct cpuidle_state *s = &drv->states[i];
@@ -324,6 +323,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
                        idx = i; /* first enabled state */
                        hits = cpu_data->states[i].hits;
                        misses = cpu_data->states[i].misses;
+                       idx0 = i;
                }
 
                if (s->target_residency_ns > duration_ns)
@@ -376,11 +376,16 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
 
        if (idx < 0) {
                idx = 0; /* No states enabled. Must use 0. */
-       } else if (idx > 0) {
+       } else if (idx > idx0) {
                unsigned int count = 0;
                u64 sum = 0;
 
                /*
+                * The target residencies of at least two different enabled idle
+                * states are less than or equal to the current expected idle
+                * duration.  Try to refine the selection using the most recent
+                * measured idle duration values.
+                *
                 * Count and sum the most recent idle duration values less than
                 * the current expected idle duration value.
                 */
@@ -428,7 +433,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
                 * till the closest timer including the tick, try to correct
                 * that.
                 */
-               if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick)
+               if (idx > idx0 &&
+                   drv->states[idx].target_residency_ns > delta_tick)
                        idx = teo_find_shallower_state(drv, dev, idx, delta_tick);
        }
 
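With idx_timer and idx_hit now initialized to 0 instead of -1, teo_update() can apply the decayed hit/miss accounting unconditionally. The bookkeeping is a shift-based exponential decay plus a fixed PULSE credited to whichever bucket matched the wakeup. Below is a user-space model of that arithmetic; the DECAY_SHIFT and PULSE values are placeholders rather than the governor's actual constants, and the two-state setup is purely illustrative.

    /* Model of the decayed hit/miss bookkeeping shown above. */
    #include <stdio.h>

    #define DECAY_SHIFT     3       /* placeholder constant */
    #define PULSE           1024    /* placeholder constant */

    struct state_stats {
            unsigned int hits, misses, early_hits;
    };

    static void update_stats(struct state_stats *s, int idx_timer, int idx_hit)
    {
            unsigned int hits = s[idx_timer].hits;
            unsigned int misses = s[idx_timer].misses;

            /* Decay both counters, then credit whichever bucket matched. */
            hits -= hits >> DECAY_SHIFT;
            misses -= misses >> DECAY_SHIFT;

            if (idx_timer == idx_hit) {
                    hits += PULSE;          /* the timer-based guess was right */
            } else {
                    misses += PULSE;        /* woke up before the timer */
                    s[idx_hit].early_hits += PULSE;
            }

            s[idx_timer].misses = misses;
            s[idx_timer].hits = hits;
    }

    int main(void)
    {
            struct state_stats st[2] = { { 4096, 0, 0 }, { 0, 4096, 0 } };

            update_stats(st, 1, 0);         /* woke up earlier than the timer */
            printf("state1 hits=%u misses=%u, state0 early_hits=%u\n",
                   st[1].hits, st[1].misses, st[0].early_hits);
            return 0;
    }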
index 3273360..ec1b9d3 100644 (file)
@@ -744,8 +744,8 @@ static struct cpuidle_state icx_cstates[] __initdata = {
                .name = "C6",
                .desc = "MWAIT 0x20",
                .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
-               .exit_latency = 128,
-               .target_residency = 384,
+               .exit_latency = 170,
+               .target_residency = 600,
                .enter = &intel_idle,
                .enter_s2idle = intel_idle_s2idle, },
        {
@@ -1156,6 +1156,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
        X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,            &idle_cpu_skl),
        X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,           &idle_cpu_skx),
        X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,           &idle_cpu_icx),
+       X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,           &idle_cpu_icx),
        X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,        &idle_cpu_knl),
        X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,        &idle_cpu_knl),
        X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,       &idle_cpu_bxt),
index 16a1721..e4d4e39 100644 (file)
@@ -1870,20 +1870,10 @@ static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags)
        int err;
        int i, bars = 0;
 
-       /*
-        * Power state could be unknown at this point, either due to a fresh
-        * boot or a device removal call.  So get the current power state
-        * so that things like MSI message writing will behave as expected
-        * (e.g. if the device really is in D0 at enable time).
-        */
-       if (dev->pm_cap) {
-               u16 pmcsr;
-               pci_read_config_word(dev, dev->pm_cap + PCI_PM_CTRL, &pmcsr);
-               dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK);
-       }
-
-       if (atomic_inc_return(&dev->enable_cnt) > 1)
+       if (atomic_inc_return(&dev->enable_cnt) > 1) {
+               pci_update_current_state(dev, dev->current_state);
                return 0;               /* already enabled */
+       }
 
        bridge = pci_upstream_bridge(dev);
        if (bridge)
index fdda2a7..73cf68a 100644 (file)
@@ -1069,6 +1069,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = {
 
        X86_MATCH_VENDOR_FAM(AMD, 0x17, &rapl_defaults_amd),
        X86_MATCH_VENDOR_FAM(AMD, 0x19, &rapl_defaults_amd),
+       X86_MATCH_VENDOR_FAM(HYGON, 0x18, &rapl_defaults_amd),
        {}
 };
 MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
index 78213d4..cc3b228 100644 (file)
@@ -150,6 +150,7 @@ static int rapl_msr_probe(struct platform_device *pdev)
        case X86_VENDOR_INTEL:
                rapl_msr_priv = &rapl_msr_priv_intel;
                break;
+       case X86_VENDOR_HYGON:
        case X86_VENDOR_AMD:
                rapl_msr_priv = &rapl_msr_priv_amd;
                break;
index 0f6cd6b..f180240 100644 (file)
@@ -23,18 +23,31 @@ static inline unsigned long topology_get_cpu_scale(int cpu)
 
 void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity);
 
-DECLARE_PER_CPU(unsigned long, freq_scale);
+DECLARE_PER_CPU(unsigned long, arch_freq_scale);
 
 static inline unsigned long topology_get_freq_scale(int cpu)
 {
-       return per_cpu(freq_scale, cpu);
+       return per_cpu(arch_freq_scale, cpu);
 }
 
 void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq,
                             unsigned long max_freq);
 bool topology_scale_freq_invariant(void);
 
-bool arch_freq_counters_available(const struct cpumask *cpus);
+enum scale_freq_source {
+       SCALE_FREQ_SOURCE_CPUFREQ = 0,
+       SCALE_FREQ_SOURCE_ARCH,
+       SCALE_FREQ_SOURCE_CPPC,
+};
+
+struct scale_freq_data {
+       enum scale_freq_source source;
+       void (*set_freq_scale)(void);
+};
+
+void topology_scale_freq_tick(void);
+void topology_set_scale_freq_source(struct scale_freq_data *data, const struct cpumask *cpus);
+void topology_clear_scale_freq_source(enum scale_freq_source source, const struct cpumask *cpus);
 
 DECLARE_PER_CPU(unsigned long, thermal_pressure);
 
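The new arch_topology interface replaces the single arch_freq_counters_available() hook with per-source registration: a provider fills in struct scale_freq_data with its enum scale_freq_source tag and a set_freq_scale() callback, and topology_scale_freq_tick() invokes whatever source is installed for the CPU. The sketch below is a self-contained user-space model of that registration-plus-dispatch shape, not the kernel API; the single global slot, the last-registration-wins policy, and the helper names are all illustrative.

    /* Model of the per-source frequency-scale callback design shown above. */
    #include <stdio.h>

    enum scale_freq_source {
            SCALE_FREQ_SOURCE_CPUFREQ = 0,
            SCALE_FREQ_SOURCE_ARCH,
            SCALE_FREQ_SOURCE_CPPC,
    };

    struct scale_freq_data {
            enum scale_freq_source source;
            void (*set_freq_scale)(void);
    };

    /* One slot stands in for the per-CPU data the kernel keeps. */
    static struct scale_freq_data *installed;

    static void set_scale_freq_source(struct scale_freq_data *data)
    {
            installed = data;       /* illustrative policy: last registration wins */
    }

    static void scale_freq_tick(void)
    {
            if (installed)
                    installed->set_freq_scale();
    }

    static void amu_set_freq_scale(void)
    {
            printf("scale factor from activity counters\n");
    }

    static struct scale_freq_data amu_sfd = {
            .source = SCALE_FREQ_SOURCE_ARCH,
            .set_freq_scale = amu_set_freq_scale,
    };

    int main(void)
    {
            set_scale_freq_source(&amu_sfd);
            scale_freq_tick();      /* what the tick path would invoke per CPU */
            return 0;
    }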
index bd605b5..fce4762 100644 (file)
@@ -49,8 +49,8 @@ struct cpuidle_state {
        char            name[CPUIDLE_NAME_LEN];
        char            desc[CPUIDLE_DESC_LEN];
 
-       u64             exit_latency_ns;
-       u64             target_residency_ns;
+       s64             exit_latency_ns;
+       s64             target_residency_ns;
        unsigned int    flags;
        unsigned int    exit_latency; /* in US */
        int             power_usage; /* in mW */
index 2782814..0621c5f 100644 (file)
@@ -279,7 +279,6 @@ static inline int freeze_kernel_threads(void) { return -ENOSYS; }
 static inline void thaw_processes(void) {}
 static inline void thaw_kernel_threads(void) {}
 
-static inline bool try_to_freeze_nowarn(void) { return false; }
 static inline bool try_to_freeze(void) { return false; }
 
 static inline void freezer_do_not_count(void) {}
index 50b8398..9378083 100644 (file)
@@ -33,7 +33,7 @@ enum rapl_domain_reg_id {
        RAPL_DOMAIN_REG_MAX,
 };
 
-struct rapl_package;
+struct rapl_domain;
 
 enum rapl_primitives {
        ENERGY_COUNTER,
index 9af5a50..b29c8ac 100644 (file)
@@ -54,7 +54,7 @@ static void try_to_suspend(struct work_struct *work)
                goto out;
 
        /*
-        * If the wakeup occured for an unknown reason, wait to prevent the
+        * If the wakeup occurred for an unknown reason, wait to prevent the
         * system from trying to suspend and waking up in a tight loop.
         */
        if (final_count == initial_count)
index d63560e..1a221dc 100644 (file)
@@ -329,7 +329,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
 /**
  * Data types related to memory bitmaps.
  *
- * Memory bitmap is a structure consiting of many linked lists of
+ * Memory bitmap is a structure consisting of many linked lists of
  * objects.  The main list's elements are of type struct zone_bitmap
  * and each of them corresponds to one zone.  For each zone bitmap
  * object there is a list of objects of type struct bm_block that
index 72e3305..bea3cb8 100644 (file)
@@ -884,7 +884,7 @@ out_clean:
  *     enough_swap - Make sure we have enough swap to save the image.
  *
  *     Returns TRUE or FALSE after checking the total amount of swap
- *     space avaiable from the resume partition.
+ *     space available from the resume partition.
  */
 
 static int enough_swap(unsigned int nr_pages)
index 9819121..b2890f6 100644 (file)
@@ -6384,6 +6384,7 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
 {
        return __sched_setscheduler(p, attr, false, true);
 }
+EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
 
 /**
  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
index 50cbad8..6ee9c9b 100644 (file)
@@ -114,19 +114,8 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
        return true;
 }
 
-static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
-                             unsigned int next_freq)
+static void sugov_deferred_update(struct sugov_policy *sg_policy)
 {
-       if (sugov_update_next_freq(sg_policy, time, next_freq))
-               cpufreq_driver_fast_switch(sg_policy->policy, next_freq);
-}
-
-static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
-                                 unsigned int next_freq)
-{
-       if (!sugov_update_next_freq(sg_policy, time, next_freq))
-               return;
-
        if (!sg_policy->work_in_progress) {
                sg_policy->work_in_progress = true;
                irq_work_queue(&sg_policy->irq_work);
@@ -366,16 +355,19 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
                sg_policy->cached_raw_freq = cached_freq;
        }
 
+       if (!sugov_update_next_freq(sg_policy, time, next_f))
+               return;
+
        /*
         * This code runs under rq->lock for the target CPU, so it won't run
         * concurrently on two different CPUs for the same target and it is not
         * necessary to acquire the lock in the fast switch case.
         */
        if (sg_policy->policy->fast_switch_enabled) {
-               sugov_fast_switch(sg_policy, time, next_f);
+               cpufreq_driver_fast_switch(sg_policy->policy, next_f);
        } else {
                raw_spin_lock(&sg_policy->update_lock);
-               sugov_deferred_update(sg_policy, time, next_f);
+               sugov_deferred_update(sg_policy);
                raw_spin_unlock(&sg_policy->update_lock);
        }
 }
@@ -454,12 +446,15 @@ sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
        if (sugov_should_update_freq(sg_policy, time)) {
                next_f = sugov_next_freq_shared(sg_cpu, time);
 
+               if (!sugov_update_next_freq(sg_policy, time, next_f))
+                       goto unlock;
+
                if (sg_policy->policy->fast_switch_enabled)
-                       sugov_fast_switch(sg_policy, time, next_f);
+                       cpufreq_driver_fast_switch(sg_policy->policy, next_f);
                else
-                       sugov_deferred_update(sg_policy, time, next_f);
+                       sugov_deferred_update(sg_policy);
        }
-
+unlock:
        raw_spin_unlock(&sg_policy->update_lock);
 }
 
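In the schedutil rework, sugov_update_next_freq() becomes the single gate for both delivery paths: only when it accepts the new frequency does the caller either fast-switch immediately or queue the irq work, which is why sugov_deferred_update() loses its time and frequency arguments. The sketch below is a user-space model of that gate-then-dispatch shape; the struct layout, names, and the simplified skip condition are illustrative, not the scheduler's actual checks.

    /* Model of the "gate once, then pick a delivery path" shape used above. */
    #include <stdbool.h>
    #include <stdio.h>

    struct sg_policy {
            unsigned int next_freq;
            bool limits_changed;
            bool fast_switch_enabled;
            bool work_in_progress;
    };

    /* Single gate: returns false when there is nothing new to deliver. */
    static bool update_next_freq(struct sg_policy *sg, unsigned int freq)
    {
            if (!sg->limits_changed && sg->next_freq == freq)
                    return false;

            sg->next_freq = freq;
            sg->limits_changed = false;
            return true;
    }

    static void deferred_update(struct sg_policy *sg)
    {
            if (!sg->work_in_progress) {
                    sg->work_in_progress = true;
                    printf("queue irq work for %u kHz\n", sg->next_freq);
            }
    }

    static void update(struct sg_policy *sg, unsigned int freq)
    {
            if (!update_next_freq(sg, freq))
                    return;                 /* gated once for both paths */

            if (sg->fast_switch_enabled)
                    printf("fast switch to %u kHz\n", sg->next_freq);
            else
                    deferred_update(sg);
    }

    int main(void)
    {
            struct sg_policy sg = { .limits_changed = true };

            update(&sg, 1800000);
            update(&sg, 1800000);           /* same request again: gate rejects it */
            return 0;
    }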
index e10a4af..ee0032b 100644 (file)
@@ -1124,7 +1124,11 @@ ktime_t tick_nohz_get_next_hrtimer(void)
  * tick_nohz_get_sleep_length - return the expected length of the current sleep
  * @delta_next: duration until the next event if the tick cannot be stopped
  *
- * Called from power state control code with interrupts disabled
+ * Called from power state control code with interrupts disabled.
+ *
+ * The return value of this function and/or the value returned by it through the
+ * @delta_next pointer can be negative which must be taken into account by its
+ * callers.
  */
 ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
 {