habanalabs: expand clock throttling information uAPI

author Ofir Bitton <obitton@habana.ai>

Mon, 25 Oct 2021 06:47:04 +0000 (09:47 +0300)

committer Oded Gabbay <ogabbay@kernel.org>

Sun, 26 Dec 2021 06:59:05 +0000 (08:59 +0200)
author Ofir Bitton <obitton@habana.ai>
Mon, 25 Oct 2021 06:47:04 +0000 (09:47 +0300)
committer Oded Gabbay <ogabbay@kernel.org>
Sun, 26 Dec 2021 06:59:05 +0000 (08:59 +0200)
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c

index eb5800b403b6f9e60abdd6d5c435951bc9a7eb96..0da5a55490ffcbab311f4eb2ad366fb4d273529b 100644 (file)
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -455,6 +455,7 @@ static int device_early_init(struct hl_device *hdev)
         INIT_LIST_HEAD(&hdev->fpriv_list);
         mutex_init(&hdev->fpriv_list_lock);
         atomic_set(&hdev->in_reset, 0);
+       mutex_init(&hdev->clk_throttling.lock);
  
         return 0;
  
@@ -495,6 +496,8 @@ static void device_early_fini(struct hl_device *hdev)
  
         mutex_destroy(&hdev->fpriv_list_lock);
  
+       mutex_destroy(&hdev->clk_throttling.lock);
+
         hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);
  
         kfree(hdev->hl_chip_info);
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h

index 92d12c8ba569d97a28aa73f9fbbb8a405d5e057e..fc201537f7a905e54b845b93b1164f231ee19c48 100644 (file)
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -2378,6 +2378,32 @@ struct multi_cs_data {
         u8              update_ts;
  };
  
+/**
+ * struct hl_clk_throttle_timestamp - current/last clock throttling timestamp
+ * @start: timestamp taken when 'start' event is received in driver
+ * @end: timestamp taken when 'end' event is received in driver
+ */
+struct hl_clk_throttle_timestamp {
+       ktime_t         start;
+       ktime_t         end;
+};
+
+/**
+ * struct hl_clk_throttle - keeps current/last clock throttling timestamps
+ * @timestamp: timestamp taken by driver and firmware, index 0 refers to POWER
+ *             index 1 refers to THERMAL
+ * @lock: protects this structure as it can be accessed from both event queue
+ *        context and info_ioctl context
+ * @current_reason: bitmask represents the current clk throttling reasons
+ * @aggregated_reason: bitmask represents aggregated clk throttling reasons since driver load
+ */
+struct hl_clk_throttle {
+       struct hl_clk_throttle_timestamp timestamp[HL_CLK_THROTTLE_TYPE_MAX];
+       struct mutex    lock;
+       u32             current_reason;
+       u32             aggregated_reason;
+};
+
  /**
   * struct hl_device - habanalabs device structure.
   * @pdev: pointer to PCI device, can be NULL in case of simulator device.
@@ -2445,6 +2471,7 @@ struct multi_cs_data {
   * @pci_mem_region: array of memory regions in the PCI
   * @state_dump_specs: constants and dictionaries needed to dump system state.
   * @multi_cs_completion: array of multi-CS completion.
+ * @clk_throttling: holds information about current/previous clock throttling events
   * @dram_used_mem: current DRAM memory consumption.
   * @timeout_jiffies: device CS timeout value.
   * @max_power: the max power of the device, as configured by the sysadmin. This
@@ -2474,7 +2501,6 @@ struct multi_cs_data {
   * @high_pll: high PLL profile frequency.
   * @soft_reset_cnt: number of soft reset since the driver was loaded.
   * @hard_reset_cnt: number of hard reset since the driver was loaded.
- * @clk_throttling_reason: bitmask represents the current clk throttling reasons
   * @id: device minor.
   * @id_control: minor of the control device
   * @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit
@@ -2604,6 +2630,8 @@ struct hl_device {
  
         struct multi_cs_completion      multi_cs_completion[
                                                         MULTI_CS_MAX_USER_CTX];
+       struct hl_clk_throttle          clk_throttling;
+
         u32                             *stream_master_qid_arr;
         atomic64_t                      dram_used_mem;
         u64                             timeout_jiffies;
@@ -2622,7 +2650,6 @@ struct hl_device {
         u32                             high_pll;
         u32                             soft_reset_cnt;
         u32                             hard_reset_cnt;
-       u32                             clk_throttling_reason;
         u16                             id;
         u16                             id_control;
         u16                             cpu_pci_msb_addr;
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c

index 86c3257d9ae1fbd4d9f28167d11eba1c70f811c0..19726c6b642a1ab2f1ee37f04160e3abe2d701f4 100644 (file)
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -313,15 +313,38 @@ static int pci_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
  
  static int clk_throttle_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
  {
+       void __user *out = (void __user *) (uintptr_t) args->return_pointer;
         struct hl_device *hdev = hpriv->hdev;
         struct hl_info_clk_throttle clk_throttle = {0};
+       ktime_t end_time, zero_time = ktime_set(0, 0);
         u32 max_size = args->return_size;
-       void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+       int i;
  
         if ((!max_size) || (!out))
                 return -EINVAL;
  
-       clk_throttle.clk_throttling_reason = hdev->clk_throttling_reason;
+       mutex_lock(&hdev->clk_throttling.lock);
+
+       clk_throttle.clk_throttling_reason = hdev->clk_throttling.current_reason;
+
+       for (i = 0 ; i < HL_CLK_THROTTLE_TYPE_MAX ; i++) {
+               if (!(hdev->clk_throttling.aggregated_reason & BIT(i)))
+                       continue;
+
+               clk_throttle.clk_throttling_timestamp_us[i] =
+                       ktime_to_us(hdev->clk_throttling.timestamp[i].start);
+
+               if (ktime_compare(hdev->clk_throttling.timestamp[i].end, zero_time))
+                       end_time = ktime_get();
+               else
+                       end_time = hdev->clk_throttling.timestamp[i].end;
+
+               clk_throttle.clk_throttling_duration_ns[i] =
+                       ktime_to_ns(ktime_sub(end_time,
+                               hdev->clk_throttling.timestamp[i].start));
+
+       }
+       mutex_unlock(&hdev->clk_throttling.lock);
  
         return copy_to_user(out, &clk_throttle,
                 min((size_t) max_size, sizeof(clk_throttle))) ? -EFAULT : 0;
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c

index 2724ab3747f2291b46fa1645eee6f46d5b891819..b4814369062efae3b3b3368b3f2066c4c664dc77 100644 (file)
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -7925,27 +7925,39 @@ static int tpc_krn_event_to_tpc_id(u16 tpc_dec_event_type)
  static void gaudi_print_clk_change_info(struct hl_device *hdev,
                                         u16 event_type)
  {
+       ktime_t zero_time = ktime_set(0, 0);
+
+       mutex_lock(&hdev->clk_throttling.lock);
+
         switch (event_type) {
         case GAUDI_EVENT_FIX_POWER_ENV_S:
-               hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
+               hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_POWER;
+               hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_POWER;
+               hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get();
+               hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time;
                 dev_info_ratelimited(hdev->dev,
                         "Clock throttling due to power consumption\n");
                 break;
  
         case GAUDI_EVENT_FIX_POWER_ENV_E:
-               hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
+               hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_POWER;
+               hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get();
                 dev_info_ratelimited(hdev->dev,
                         "Power envelop is safe, back to optimal clock\n");
                 break;
  
         case GAUDI_EVENT_FIX_THERMAL_ENV_S:
-               hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
+               hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_THERMAL;
+               hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_THERMAL;
+               hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get();
+               hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time;
                 dev_info_ratelimited(hdev->dev,
                         "Clock throttling due to overheating\n");
                 break;
  
         case GAUDI_EVENT_FIX_THERMAL_ENV_E:
-               hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
+               hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_THERMAL;
+               hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get();
                 dev_info_ratelimited(hdev->dev,
                         "Thermal envelop is safe, back to optimal clock\n");
                 break;
@@ -7955,6 +7967,8 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev,
                         event_type);
                 break;
         }
+
+       mutex_unlock(&hdev->clk_throttling.lock);
  }
  
  static void gaudi_handle_eqe(struct hl_device *hdev,
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c

index 3bbcab7da25eaaf76c49946b6179fcae187c5cf1..7b3683f2a6dc67779a59219afb31230f8c14e21d 100644 (file)
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -4768,24 +4768,39 @@ static int goya_unmask_irq(struct hl_device *hdev, u16 event_type)
  
  static void goya_print_clk_change_info(struct hl_device *hdev, u16 event_type)
  {
+       ktime_t zero_time = ktime_set(0, 0);
+
+       mutex_lock(&hdev->clk_throttling.lock);
+
         switch (event_type) {
         case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_S:
-               hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
+               hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_POWER;
+               hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_POWER;
+               hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get();
+               hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time;
                 dev_info_ratelimited(hdev->dev,
                         "Clock throttling due to power consumption\n");
                 break;
+
         case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_E:
-               hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
+               hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_POWER;
+               hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get();
                 dev_info_ratelimited(hdev->dev,
                         "Power envelop is safe, back to optimal clock\n");
                 break;
+
         case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_S:
-               hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
+               hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_THERMAL;
+               hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_THERMAL;
+               hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get();
+               hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time;
                 dev_info_ratelimited(hdev->dev,
                         "Clock throttling due to overheating\n");
                 break;
+
         case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E:
-               hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
+               hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_THERMAL;
+               hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get();
                 dev_info_ratelimited(hdev->dev,
                         "Thermal envelop is safe, back to optimal clock\n");
                 break;
@@ -4795,6 +4810,8 @@ static void goya_print_clk_change_info(struct hl_device *hdev, u16 event_type)
                         event_type);
                 break;
         }
+
+       mutex_unlock(&hdev->clk_throttling.lock);
  }
  
  void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h

index c5760acebdd1d89e04212026ab61702e29e735c5..257b9630773eae57069f44c66f4e01875902e2ca 100644 (file)
--- a/include/uapi/misc/habanalabs.h
+++ b/include/uapi/misc/habanalabs.h
@@ -473,15 +473,27 @@ struct hl_info_pci_counters {
         __u64 replay_cnt;
  };
  
-#define HL_CLK_THROTTLE_POWER  0x1
-#define HL_CLK_THROTTLE_THERMAL        0x2
+enum hl_clk_throttling_type {
+       HL_CLK_THROTTLE_TYPE_POWER,
+       HL_CLK_THROTTLE_TYPE_THERMAL,
+       HL_CLK_THROTTLE_TYPE_MAX
+};
+
+/* clk_throttling_reason masks */
+#define HL_CLK_THROTTLE_POWER          (1 << HL_CLK_THROTTLE_TYPE_POWER)
+#define HL_CLK_THROTTLE_THERMAL                (1 << HL_CLK_THROTTLE_TYPE_THERMAL)
  
  /**
   * struct hl_info_clk_throttle - clock throttling reason
   * @clk_throttling_reason: each bit represents a clk throttling reason
+ * @clk_throttling_timestamp_us: represents CPU timestamp in microseconds of the start-event
+ * @clk_throttling_duration_ns: the clock throttle time in nanosec
   */
  struct hl_info_clk_throttle {
         __u32 clk_throttling_reason;
+       __u32 pad;
+       __u64 clk_throttling_timestamp_us[HL_CLK_THROTTLE_TYPE_MAX];
+       __u64 clk_throttling_duration_ns[HL_CLK_THROTTLE_TYPE_MAX];
  };
  
  /**
author	Ofir Bitton <obitton@habana.ai>
	Mon, 25 Oct 2021 06:47:04 +0000 (09:47 +0300)
committer	Oded Gabbay <ogabbay@kernel.org>
	Sun, 26 Dec 2021 06:59:05 +0000 (08:59 +0200)
drivers/misc/habanalabs/common/device.c		patch \| blob \| history
drivers/misc/habanalabs/common/habanalabs.h		patch \| blob \| history
drivers/misc/habanalabs/common/habanalabs_ioctl.c		patch \| blob \| history
drivers/misc/habanalabs/gaudi/gaudi.c		patch \| blob \| history
drivers/misc/habanalabs/goya/goya.c		patch \| blob \| history
include/uapi/misc/habanalabs.h		patch \| blob \| history