drm/amdkfd: add per process hw trap enable and disable functions
authorJonathan Kim <jonathan.kim@amd.com>
Tue, 5 Apr 2022 16:34:55 +0000 (12:34 -0400)
committerAlex Deucher <alexander.deucher@amd.com>
Fri, 9 Jun 2023 16:35:48 +0000 (12:35 -0400)
To enable HW debug mode per process, all devices must be debug enabled
successfully.  If a failure occurs, rewind the enablement of debug mode
on the enabled devices.

A power management scenario that needs to be considered is HW
debug mode setting during GFXOFF.  During GFXOFF, these registers
will be unreachable so we have to transiently disable GFXOFF when
setting.  Also, some devices don't support the RLC save restore
function for these debug registers so we have to disable GFXOFF
completely during a debug session.

Cooperative launch also has debugging restrictions based on HW/FW bugs.
If such bugs exist, the debugger cannot attach to a process that uses GWS
resources nor can GWS resources be requested if a process is being
debugged.

Multi-process debug devices can only enable trap temporaries based
on certain runtime scenarios, which will be explained when the
runtime enable functions are implemented in a follow up patch.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
drivers/gpu/drm/amd/amdkfd/kfd_debug.c
drivers/gpu/drm/amd/amdkfd/kfd_debug.h
drivers/gpu/drm/amd/amdkfd/kfd_process.c

index ee086a0a46dfe36f760668a0a03f8c14c4edd23a..826a99acb6fb9c7fd8da996a49ccb9873ebec396 100644 (file)
@@ -1488,6 +1488,11 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
                goto out_unlock;
        }
 
+       if (!kfd_dbg_has_gws_support(dev) && p->debug_trap_enabled) {
+               retval = -EBUSY;
+               goto out_unlock;
+       }
+
        retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? dev->gws : NULL);
        mutex_unlock(&p->mutex);
 
index 898cc1fe3d13547b539e9f2c8dd4a85f728ce4ec..73b07b5f17f149c4f6d9d21e40f1b91ee39d405c 100644 (file)
  */
 
 #include "kfd_debug.h"
+#include "kfd_device_queue_manager.h"
 #include <linux/file.h>
 
+static int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd)
+{      /* Send one process device's shader debug settings to MES.
+        *
+        * The SPI debug control word is the OR of pdd->spi_dbg_override and
+        * pdd->spi_dbg_launch_mode.  It is handed to
+        * amdgpu_mes_set_shader_debugger() together with
+        * pdd->proc_ctx_gpu_addr, pdd->watch_points and the owning process'
+        * dbg_flags.
+        *
+        * Returns 0 without calling MES when the device lacks per-VMID debug
+        * support, otherwise whatever amdgpu_mes_set_shader_debugger() returns.
+        */
+       uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode;
+       uint32_t flags = pdd->process->dbg_flags;
+       /* Devices without per-VMID debug support have nothing to program
+        * through MES, so report success and leave them alone.
+        */
+       if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
+               return 0;
+
+       return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
+                                               pdd->watch_points, flags);
+}
+
+/* kfd_dbg_trap_deactivate:
+ *     Walk target->pdds[] and undo the per-device debug trap setup done by
+ *     kfd_dbg_trap_activate(): disable the debug trap (storing the value
+ *     returned by disable_debug_trap() in pdd->spi_dbg_override), release the
+ *     debug VMID on devices without per-VMID debug support, then refresh the
+ *     runlist or update MES.
+ *
+ *     target: target process
+ *     unwind: true when rolling back a partially completed
+ *             kfd_dbg_trap_activate()
+ *     unwind_count:
+ *             If unwind == true, how far down the pdd list we need
+ *                             to unwind; the loop stops when it reaches
+ *                             this index, so entries [0, unwind_count)
+ *                             are deactivated
+ *             else: ignored
+ *
+ *     Nothing is returned: a failed VMID release is only logged and the
+ *     results of the runlist refresh / MES update are discarded.
+ */
+static void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count)
+{
+       int i;
+
+       for (i = 0; i < target->n_pdds; i++) {
+               struct kfd_process_device *pdd = target->pdds[i];
+
+               /* If this is an unwind, and we have unwound the required
+                * enable calls on the pdd list, we need to stop now
+                * otherwise we may mess up another debugger session.
+                */
+               if (unwind && i == unwind_count)
+                       break;
+
+               /* GFX off is already disabled by debug activate if not RLC restore supported.
+                *
+                * Devices that do support RLC restore had GFXOFF re-enabled at
+                * the end of activation, so disable it again here for the
+                * duration of the register access.  The unconditional
+                * amdgpu_gfx_off_ctrl(..., true) after disable_debug_trap()
+                * then pairs with whichever disable is outstanding: the one
+                * just above, or the one left in place by activation.
+                */
+               if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
+                       amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+               pdd->spi_dbg_override =
+                               pdd->dev->kfd2kgd->disable_debug_trap(
+                               pdd->dev->adev,
+                               target->runtime_info.ttmp_setup,
+                               pdd->dev->vm_info.last_vmid_kfd);
+               amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+               /* Devices without per-VMID debug support reserved a debug VMID
+                * in kfd_dbg_trap_activate(); give it back.  A failure is only
+                * logged, deactivation carries on regardless.
+                */
+               if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
+                               release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
+                       pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);
+               /* Propagate the change through whichever path this device
+                * uses: debug_refresh_runlist() when MES is not enabled,
+                * kfd_dbg_set_mes_debug_mode() when it is.  Both return values
+                * are intentionally not checked here.
+                */
+               if (!pdd->dev->kfd->shared_resources.enable_mes)
+                       debug_refresh_runlist(pdd->dev->dqm);
+               else
+                       kfd_dbg_set_mes_debug_mode(pdd);
+       }
+}
+
 int kfd_dbg_trap_disable(struct kfd_process *target)
 {
        if (!target->debug_trap_enabled)
                return 0;
 
+       /*
+        * Defer deactivation to runtime if runtime not enabled otherwise reset
+        * attached running target runtime state to enable for re-attach.
+        */
+       if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
+               kfd_dbg_trap_deactivate(target, false, 0);
+       else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED)
+               target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED;
+
        fput(target->dbg_ev_file);
        target->dbg_ev_file = NULL;
 
@@ -42,16 +107,89 @@ int kfd_dbg_trap_disable(struct kfd_process *target)
        return 0;
 }
 
+static int kfd_dbg_trap_activate(struct kfd_process *target)
+{      /* Enable the HW debug trap on every device of the target process.
+        *
+        * For each entry of target->pdds[] this reserves a debug VMID (only on
+        * devices without per-VMID debug support), enables the debug trap with
+        * GFXOFF disabled, and then refreshes the runlist or updates MES.
+        *
+        * Returns 0 when every device was set up.  On the first failure the
+        * error is recorded in target->runtime_info.runtime_state, the devices
+        * handled so far are rolled back through kfd_dbg_trap_deactivate() and
+        * the error code is returned.
+        */
+       int i, r = 0;
+
+       for (i = 0; i < target->n_pdds; i++) {
+               struct kfd_process_device *pdd = target->pdds[i];
+
+               if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
+                       r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);
+                       /* Record the failure reason in runtime_state: -EBUSY
+                        * becomes ENABLED_BUSY, any other error ENABLED_ERROR.
+                        */
+                       if (r) {
+                               target->runtime_info.runtime_state = (r == -EBUSY) ?
+                                                       DEBUG_RUNTIME_STATE_ENABLED_BUSY :
+                                                       DEBUG_RUNTIME_STATE_ENABLED_ERROR;
+
+                               goto unwind_err;
+                       }
+               }
+
+               /* Disable GFX OFF to prevent garbage read/writes to debug registers.
+                * If RLC restore of debug registers is not supported and runtime enable
+                * hasn't done so already on ttmp setup request, restore the trap config registers.
+                *
+                * If RLC restore of debug registers is not supported, keep gfx off disabled for
+                * the debug session.
+                *
+                * NOTE(review): the restore is the enable_debug_trap() call that
+                * passes 'true' as its second argument; presumably that argument
+                * selects "restore debug registers" - confirm against the
+                * kfd2kgd enable_debug_trap() contract.  Its return value is
+                * not used.
+                */
+               amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
+               if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
+                                               target->runtime_info.ttmp_setup))
+                       pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
+                                                               pdd->dev->vm_info.last_vmid_kfd);
+               /* Enable the trap and keep the value enable_debug_trap()
+                * returns in spi_dbg_override; kfd_dbg_set_mes_debug_mode()
+                * folds it into the SPI debug control word it sends to MES.
+                */
+               pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
+                                       pdd->dev->adev,
+                                       false,
+                                       pdd->dev->vm_info.last_vmid_kfd);
+               /* Only devices that can restore the debug registers get GFXOFF
+                * back now; on the others it stays disabled until
+                * kfd_dbg_trap_deactivate() re-enables it.
+                */
+               if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
+                       amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+               /* Propagate the new debug state: refresh the runlist when MES
+                * is not enabled, otherwise update the MES shader debugger.
+                */
+               if (!pdd->dev->kfd->shared_resources.enable_mes)
+                       r = debug_refresh_runlist(pdd->dev->dqm);
+               else
+                       r = kfd_dbg_set_mes_debug_mode(pdd);
+
+               if (r) {
+                       target->runtime_info.runtime_state =
+                                       DEBUG_RUNTIME_STATE_ENABLED_ERROR;
+                       goto unwind_err;
+               }
+       }
+
+       return 0;
+
+unwind_err:
+       /* Enabling debug failed, we need to disable on
+        * all GPUs so the enable is all or nothing.
+        *
+        * NOTE(review): i is the index of the device that failed and
+        * kfd_dbg_trap_deactivate() stops when it reaches that index, so
+        * only devices [0, i) are rolled back.  When the failure comes from
+        * the runlist/MES step, device i has already had its debug VMID
+        * reserved and its trap enabled above (and, without RLC restore
+        * support, GFXOFF left disabled) - confirm that leaving device i
+        * untouched here is intended.
+        */
+       kfd_dbg_trap_deactivate(target, true, i);
+       return r;
+}
+
 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
                        void __user *runtime_info, uint32_t *runtime_size)
 {
        struct file *f;
        uint32_t copy_size;
-       int r = 0;
+       int i, r = 0;
 
        if (target->debug_trap_enabled)
                return -EALREADY;
 
+       /* Enable pre-checks */
+       for (i = 0; i < target->n_pdds; i++) {
+               struct kfd_process_device *pdd = target->pdds[i];
+
+               if (!KFD_IS_SOC15(pdd->dev))
+                       return -ENODEV;
+
+               if (!kfd_dbg_has_gws_support(pdd->dev) && pdd->qpd.num_gws)
+                       return -EBUSY;
+       }
+
        copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info));
 
        f = fget(fd);
@@ -62,6 +200,10 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
 
        target->dbg_ev_file = f;
 
+       /* defer activation to runtime if not runtime enabled */
+       if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED)
+               kfd_dbg_trap_activate(target);
+
        /* We already hold the process reference but hold another one for the
         * debug session.
         */
@@ -71,8 +213,10 @@ int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
        if (target->debugger_process)
                atomic_inc(&target->debugger_process->debugged_process_count);
 
-       if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size))
+       if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
+               kfd_dbg_trap_deactivate(target, false, 0);
                r = -EFAULT;
+       }
 
        *runtime_size = sizeof(target->runtime_info);
 
index db6d72e7930f7080ebf956348816e0e8ddb1904f..17481f824647d6170e8122e1da51b1c45a4b2552 100644 (file)
@@ -34,4 +34,33 @@ static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_node *dev)
        return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2);
 }
 
+/*
+ * Report whether the device can have its debug registers restored by the RLC.
+ *
+ * If GFX off is enabled, chips that do not support RLC restore for the debug
+ * registers will disable GFX off temporarily for the entire debug session.
+ * See disable_on_trap_action_entry and enable_on_trap_action_exit for details.
+ *
+ * Returns false only for GC IP versions 10.1.10 and 10.1.1; every other
+ * version is reported as supporting RLC restore.
+ */
+static inline bool kfd_dbg_is_rlc_restore_supported(struct kfd_node *dev)
+{
+       return !(KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 10) ||
+                KFD_GC_VERSION(dev) == IP_VERSION(10, 1, 1));
+}
+
+static inline bool kfd_dbg_has_gws_support(struct kfd_node *dev)
+{      /* Report whether debugging and GWS (cooperative launch) can be used
+        * together on this device.
+        *
+        * Returns false for:
+        *   - GC 9.0.1            with mec2_fw_version < 0x81b6
+        *   - GC 9.1.0 .. 9.2.2   with mec2_fw_version < 0x1b6 (range inclusive)
+        *   - GC 9.4.0            with mec2_fw_version < 0x1b6
+        *   - GC 9.4.1            with mec2_fw_version < 0x30
+        *   - GC 11.x (>= 11.0.0 and < 12.0.0), whatever the firmware version
+        * and true for everything else.
+        *
+        * Callers use a false result to refuse a debug attach to a process
+        * that already holds GWS, and to refuse a GWS allocation while the
+        * process is being debugged.
+        */
+       if ((KFD_GC_VERSION(dev) == IP_VERSION(9, 0, 1)
+                       && dev->kfd->mec2_fw_version < 0x81b6) ||
+               (KFD_GC_VERSION(dev) >= IP_VERSION(9, 1, 0)
+                       && KFD_GC_VERSION(dev) <= IP_VERSION(9, 2, 2)
+                       && dev->kfd->mec2_fw_version < 0x1b6) ||
+               (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 0)
+                       && dev->kfd->mec2_fw_version < 0x1b6) ||
+               (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 1)
+                       && dev->kfd->mec2_fw_version < 0x30) ||
+               (KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0) &&
+                       KFD_GC_VERSION(dev) < IP_VERSION(12, 0, 0)))
+               return false;
+
+       /* Assume debugging and cooperative launch supported otherwise. */
+       return true;
+}
 #endif
index 725d936b2cc7df18d6356d7aa072f76e12fd7930..e77cadadb09b64179113797ab4049327577dfe74 100644 (file)
@@ -1165,9 +1165,19 @@ static void kfd_process_free_notifier(struct mmu_notifier *mn)
 
 static void kfd_process_notifier_release_internal(struct kfd_process *p)
 {
+       int i;
+
        cancel_delayed_work_sync(&p->eviction_work);
        cancel_delayed_work_sync(&p->restore_work);
 
+       for (i = 0; i < p->n_pdds; i++) {
+               struct kfd_process_device *pdd = p->pdds[i];
+
+               /* re-enable GFX OFF since runtime enable with ttmp setup disabled it. */
+               if (!kfd_dbg_is_rlc_restore_supported(pdd->dev) && p->runtime_info.ttmp_setup)
+                       amdgpu_gfx_off_ctrl(pdd->dev->adev, true);
+       }
+
        /* Indicate to other users that MM is no longer valid */
        p->mm = NULL;
        kfd_dbg_trap_disable(p);