drm/amdkfd: Add eviction debug messages
author Felix Kuehling <Felix.Kuehling@amd.com>
Fri, 12 Jun 2020 03:19:37 +0000 (23:19 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 1 Jul 2020 05:59:21 +0000 (01:59 -0400)
Use WARN to print messages with backtrace when evictions are triggered.
This can help determine the root cause of evictions and help spot driver
bugs triggering evictions unintentionally, or help with performance tuning
by avoiding conditions that cause evictions in a specific workload.

The messages are controlled by a new module parameter that can be changed
at runtime:

  echo Y > /sys/module/amdgpu/parameters/debug_evictions
  echo N > /sys/module/amdgpu/parameters/debug_evictions

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Philip Yang <Philip.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu.h
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
drivers/gpu/drm/amd/amdkfd/kfd_device.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index 905cf0bac100cd930941bf3c967cc3cb2c3165c8..3d2625beacf7179dd469c69a34d27cfdbe53e91f 100644 (file)
@@ -186,8 +186,10 @@ extern int amdgpu_noretry;
 extern int amdgpu_force_asic_type;
 #ifdef CONFIG_HSA_AMD
 extern int sched_policy;
+extern bool debug_evictions;
 #else
 static const int sched_policy = KFD_SCHED_POLICY_HWS;
+static const bool debug_evictions; /* = false */
 #endif
 
 extern int amdgpu_tmz;
index 75bcd17891853937dfdaa47614cc13882e1a81de..653a377dd3428cf7040994ae7381d295d52907c0 100644 (file)
@@ -705,6 +705,14 @@ MODULE_PARM_DESC(hws_gws_support, "Assume MEC2 FW supports GWS barriers (false =
 int queue_preemption_timeout_ms = 9000;
 module_param(queue_preemption_timeout_ms, int, 0644);
 MODULE_PARM_DESC(queue_preemption_timeout_ms, "queue preemption timeout in ms (1 = Minimum, 9000 = default)");
+
+/**
+ * DOC: debug_evictions(bool)
+ * Enable extra debug messages to help determine the cause of evictions
+ */
+bool debug_evictions;
+module_param(debug_evictions, bool, 0644);
+MODULE_PARM_DESC(debug_evictions, "enable eviction debug messages (false = default)");
 #endif
 
 /**
index b87ca171986af0879394d0f09634f641b006d26f..072f0e1185a8921dbbc91fa5331e0f9b359c7fa1 100644 (file)
@@ -275,6 +275,8 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync *sync,
                        continue;
                }
 
+               WARN(debug_evictions && fence_owner == AMDGPU_FENCE_OWNER_KFD,
+                    "Adding eviction fence to sync obj");
                r = amdgpu_sync_fence(sync, f, false);
                if (r)
                        break;
index 314c4b99671dff5dd310160a09c4fb86e09c9ae0..7f6d0958ed62ef4b9e6eb4010f536dbec39584b0 100644 (file)
@@ -935,6 +935,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
        if (!p)
                return -ESRCH;
 
+       WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
        r = kfd_process_evict_queues(p);
 
        kfd_unref_process(p);
@@ -1002,6 +1003,8 @@ int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
        /* During process initialization eviction_work.dwork is initialized
         * to kfd_evict_bo_worker
         */
+       WARN(debug_evictions, "Scheduling eviction of pid %d in %ld jiffies",
+            p->lead_thread->pid, delay_jiffies);
        schedule_delayed_work(&p->eviction_work, delay_jiffies);
 out:
        kfd_unref_process(p);
index 3a4fbb6a9aca3299ae8aaed89763615f21316b96..308e96f1dab5686303d28a0466fdf2403063b627 100644 (file)
@@ -177,6 +177,11 @@ extern bool hws_gws_support;
  */
 extern int queue_preemption_timeout_ms;
 
+/*
+ * Enable eviction debug messages
+ */
+extern bool debug_evictions;
+
 enum cache_policy {
        cache_policy_coherent,
        cache_policy_noncoherent