drm/amdkfd: Add thermal throttling SMI event
author Mukul Joshi <mukul.joshi@amd.com>
Fri, 24 Jul 2020 03:09:57 +0000 (23:09 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 27 Jul 2020 20:21:50 +0000 (16:21 -0400)
Add support for reporting thermal throttling events through SMI.
Also, add a counter to count the number of throttling interrupts
observed and report the count in the SMI event message.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
drivers/gpu/drm/amd/amdkfd/kfd_device.c
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h
drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
drivers/gpu/drm/amd/powerplay/arcturus_ppt.c
drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
drivers/gpu/drm/amd/powerplay/smu_v11_0.c
include/uapi/linux/kfd_ioctl.h

index a0ea663..92790db 100644 (file)
@@ -789,4 +789,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
 void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
 {
 }
+
+void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
+{
+}
 #endif
index ffe149a..a10507e 100644 (file)
@@ -270,5 +270,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm);
 int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
                                               struct dma_fence *fence);
 void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
+void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask);
 
 #endif /* AMDGPU_AMDKFD_H_INCLUDED */
index 4bfedaa..d5e790f 100644 (file)
@@ -29,6 +29,7 @@
 #include "cwsr_trap_handler.h"
 #include "kfd_iommu.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 #define MQD_SIZE_ALIGNED 768
 
@@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd)
        WARN_ONCE(count < 0, "Compute profile ref. count error");
 }
 
+void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
+{
+       if (kfd)
+               kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask);
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 /* This function will send a package to HIQ to hang the HWS
index 7b348bf..86c2c3e 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/wait.h>
 #include <linux/anon_inodes.h>
 #include <uapi/linux/kfd_ioctl.h>
+#include "amdgpu.h"
 #include "amdgpu_vm.h"
 #include "kfd_priv.h"
 #include "kfd_smi_events.h"
@@ -148,6 +149,54 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
        return 0;
 }
 
+static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event,
+                             char *event_msg, int len)
+{
+       struct kfd_smi_client *client;
+
+       rcu_read_lock();
+
+       list_for_each_entry_rcu(client, &dev->smi_clients, list) {
+               if (!(READ_ONCE(client->events) & smi_event))
+                       continue;
+               spin_lock(&client->lock);
+               if (kfifo_avail(&client->fifo) >= len) {
+                       kfifo_in(&client->fifo, event_msg, len);
+                       wake_up_all(&client->wait_queue);
+               } else {
+                       pr_debug("smi_event(EventID: %llu): no space left\n",
+                                       smi_event);
+               }
+               spin_unlock(&client->lock);
+       }
+
+       rcu_read_unlock();
+}
+
+void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
+                                            uint32_t throttle_bitmask)
+{
+       struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
+       /*
+        * ThermalThrottle msg = throttle_bitmask(8):
+        *                       thermal_interrupt_count(16)
+        * 16 bytes event + 1 byte space + 8 byte throttle_bitmask +
+        * 1 byte : + 16 byte thermal_interrupt_counter + 1 byte \n +
+        * 1 byte \0 = 44
+        */
+       char fifo_in[44];
+       int len;
+
+       if (list_empty(&dev->smi_clients))
+               return;
+
+       len = snprintf(fifo_in, 44, "%x %x:%llx\n",
+                      KFD_SMI_EVENT_THERMAL_THROTTLE, throttle_bitmask,
+                      atomic64_read(&adev->smu.throttle_int_counter));
+
+       add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in, len);
+}
+
 void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
@@ -156,7 +205,6 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
        /* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43
         */
        char fifo_in[43];
-       struct kfd_smi_client *client;
        int len;
 
        if (list_empty(&dev->smi_clients))
@@ -171,22 +219,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
        len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
                task_info.pid, task_info.task_name);
 
-       rcu_read_lock();
-
-       list_for_each_entry_rcu(client, &dev->smi_clients, list) {
-               if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT))
-                       continue;
-               spin_lock(&client->lock);
-               if (kfifo_avail(&client->fifo) >= len) {
-                       kfifo_in(&client->fifo, fifo_in, len);
-                       wake_up_all(&client->wait_queue);
-               }
-               else
-                       pr_debug("smi_event(vmfault): no space left\n");
-               spin_unlock(&client->lock);
-       }
-
-       rcu_read_unlock();
+       add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
 }
 
 int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
index a9cb218..15537b2 100644 (file)
@@ -25,5 +25,7 @@
 
 int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
 void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
+void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
+                                            uint32_t throttle_bitmask);
 
 #endif
index 0eeccf3..7d9c40a 100644 (file)
@@ -640,6 +640,7 @@ static int smu_sw_init(void *handle)
        mutex_init(&smu->message_lock);
 
        INIT_WORK(&smu->throttling_logging_work, smu_throttling_logging_work_fn);
+       atomic64_set(&smu->throttle_int_counter, 0);
        smu->watermarks_bitmap = 0;
        smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
        smu->default_power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
index 3b9182c..f139796 100644 (file)
@@ -2251,6 +2251,7 @@ static void arcturus_log_thermal_throttling_event(struct smu_context *smu)
 
        dev_warn(adev->dev, "WARN: GPU thermal throttling temperature reached, expect performance decrease. %s.\n",
                        log_buf);
+       kgd2kfd_smi_event_throttle(smu->adev->kfd.dev, throttler_status);
 }
 
 static const struct pptable_funcs arcturus_ppt_funcs = {
index 28312d6..b57b104 100644 (file)
@@ -446,6 +446,7 @@ struct smu_context
        bool dc_controlled_by_gpio;
 
        struct work_struct throttling_logging_work;
+       atomic64_t throttle_int_counter;
 };
 
 struct i2c_adapter;
index fd82402..a9453ec 100644 (file)
@@ -1311,6 +1311,11 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
                                smu_v11_0_ack_ac_dc_interrupt(&adev->smu);
                                break;
                        case 0x7:
+                               /*
+                                * Increment the throttle interrupt counter
+                                */
+                               atomic64_inc(&smu->throttle_int_counter);
+
                                if (!atomic_read(&adev->throttling_logging_enabled))
                                        return 0;
 
index f738c3b..df6c7a4 100644 (file)
@@ -450,7 +450,8 @@ struct kfd_ioctl_import_dmabuf_args {
  * KFD SMI(System Management Interface) events
  */
 /* Event type (defined by bitmask) */
-#define KFD_SMI_EVENT_VMFAULT     0x0000000000000001
+#define KFD_SMI_EVENT_VMFAULT                  0x0000000000000001
+#define KFD_SMI_EVENT_THERMAL_THROTTLE         0x0000000000000002
 
 struct kfd_ioctl_smi_events_args {
        __u32 gpuid;    /* to KFD */