nvme: extend and modify the APST configuration algorithm

author Alexey Bogoslavsky <Alexey.Bogoslavsky@wdc.com>

Wed, 28 Apr 2021 09:27:36 +0000 (09:27 +0000)

committer Christoph Hellwig <hch@lst.de>

Thu, 3 Jun 2021 07:29:24 +0000 (10:29 +0300)
author Alexey Bogoslavsky <Alexey.Bogoslavsky@wdc.com>
Wed, 28 Apr 2021 09:27:36 +0000 (09:27 +0000)
committer Christoph Hellwig <hch@lst.de>
Thu, 3 Jun 2021 07:29:24 +0000 (10:29 +0300)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c

index 762125f..e7441cc 100644 (file)
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -57,6 +57,26 @@ static bool force_apst;
  module_param(force_apst, bool, 0644);
  MODULE_PARM_DESC(force_apst, "allow APST for newly enumerated devices even if quirked off");
  
+static unsigned long apst_primary_timeout_ms = 100;
+module_param(apst_primary_timeout_ms, ulong, 0644);
+MODULE_PARM_DESC(apst_primary_timeout_ms,
+       "primary APST timeout in ms");
+
+static unsigned long apst_secondary_timeout_ms = 2000;
+module_param(apst_secondary_timeout_ms, ulong, 0644);
+MODULE_PARM_DESC(apst_secondary_timeout_ms,
+       "secondary APST timeout in ms");
+
+static unsigned long apst_primary_latency_tol_us = 15000;
+module_param(apst_primary_latency_tol_us, ulong, 0644);
+MODULE_PARM_DESC(apst_primary_latency_tol_us,
+       "primary APST latency tolerance in us");
+
+static unsigned long apst_secondary_latency_tol_us = 100000;
+module_param(apst_secondary_latency_tol_us, ulong, 0644);
+MODULE_PARM_DESC(apst_secondary_latency_tol_us,
+       "secondary APST latency tolerance in us");
+
  static bool streams;
  module_param(streams, bool, 0644);
  MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
@@ -2218,13 +2238,53 @@ static int nvme_configure_acre(struct nvme_ctrl *ctrl)
  }
  
  /*
+ * The function checks whether the given total (exlat + enlat) latency of
+ * a power state allows the latter to be used as an APST transition target.
+ * It does so by comparing the latency to the primary and secondary latency
+ * tolerances defined by module params. If a tolerance is selected, its
+ * timeout is stored in *transition_time, its index (1 or 2) is stored in
+ * *last_index and true is returned; otherwise false is returned.
+ */
+static bool nvme_apst_get_transition_time(u64 total_latency,
+               u64 *transition_time, unsigned *last_index)
+{
+       if (total_latency <= apst_primary_latency_tol_us) {
+               if (*last_index == 1)
+                       return false;
+               *last_index = 1;
+               *transition_time = apst_primary_timeout_ms;
+               return true;
+       }
+       if (apst_secondary_timeout_ms &&
+               total_latency <= apst_secondary_latency_tol_us) {
+               if (*last_index <= 2)
+                       return false;
+               *last_index = 2;
+               *transition_time = apst_secondary_timeout_ms;
+               return true;
+       }
+       return false;
+}
+
+/*
   * APST (Autonomous Power State Transition) lets us program a table of power
   * state transitions that the controller will perform automatically.
- * We configure it with a simple heuristic: we are willing to spend at most 2%
- * of the time transitioning between power states.  Therefore, when running in
- * any given state, we will enter the next lower-power non-operational state
- * after waiting 50 * (enlat + exlat) microseconds, as long as that state's exit
- * latency is under the requested maximum latency.
+ *
+ * Depending on module params, one of the two supported techniques will be used:
+ *
+ * - If the parameters provide explicit timeouts and tolerances, they will be
+ *   used to build a table with up to 2 non-operational states to transition to.
+ *   The default parameter values were selected based on the values used by
+ *   Microsoft's and Intel's NVMe drivers. Yet, since we don't implement dynamic
+ *   regeneration of the APST table in the event of switching between external
+ *   and battery power, the timeouts and tolerances reflect a compromise
+ *   between values used by Microsoft for AC and battery scenarios.
+ * - If not, we'll configure the table with a simple heuristic: we are willing
+ *   to spend at most 2% of the time transitioning between power states.
+ *   Therefore, when running in any given state, we will enter the next
+ *   lower-power non-operational state after waiting 50 * (enlat + exlat)
+ *   microseconds, as long as that state's exit latency is under the requested
+ *   maximum latency.
   *
   * We will not autonomously enter any non-operational state for which the total
   * latency exceeds ps_max_latency_us.
@@ -2240,6 +2300,7 @@ static int nvme_configure_apst(struct nvme_ctrl *ctrl)
         int max_ps = -1;
         int state;
         int ret;
+       unsigned last_lt_index = UINT_MAX;
  
         /*
          * If APST isn't supported or if we haven't been initialized yet,
@@ -2298,13 +2359,19 @@ static int nvme_configure_apst(struct nvme_ctrl *ctrl)
                         le32_to_cpu(ctrl->psd[state].entry_lat);
  
                 /*
-                * This state is good.  Use it as the APST idle target for
-                * higher power states.
+                * This state is good. It can be used as the APST idle target
+                * for higher power states.
                  */
-               transition_ms = total_latency_us + 19;
-               do_div(transition_ms, 20);
-               if (transition_ms > (1 << 24) - 1)
-                       transition_ms = (1 << 24) - 1;
+               if (apst_primary_timeout_ms && apst_primary_latency_tol_us) {
+                       if (!nvme_apst_get_transition_time(total_latency_us,
+                                       &transition_ms, &last_lt_index))
+                               continue;
+               } else {
+                       transition_ms = total_latency_us + 19;
+                       do_div(transition_ms, 20);
+                       if (transition_ms > (1 << 24) - 1)
+                               transition_ms = (1 << 24) - 1;
+               }
  
                 target = cpu_to_le64((state << 3) | (transition_ms << 8));
                 if (max_ps == -1)
author	Alexey Bogoslavsky <Alexey.Bogoslavsky@wdc.com>
	Wed, 28 Apr 2021 09:27:36 +0000 (09:27 +0000)
committer	Christoph Hellwig <hch@lst.de>
	Thu, 3 Jun 2021 07:29:24 +0000 (10:29 +0300)