drm/amdkfd: add send exception operation
author Jonathan Kim <jonathan.kim@amd.com>
Fri, 8 Apr 2022 16:49:48 +0000 (12:49 -0400)
committer Alex Deucher <alexander.deucher@amd.com>
Fri, 9 Jun 2023 16:36:01 +0000 (12:36 -0400)
Add a debug operation that allows the debugger to send an exception
directly to the runtime through a payload address.

For memory violations, the runtime is instead notified through the
normal vmfault signal, which is passed the exception data that was
saved when the memory violation was raised to the debugger.

For runtime exceptions, this will unblock the runtime enable
function, which will be explained and implemented in a follow-up
patch.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Felix Kuehling <felix.kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
drivers/gpu/drm/amd/amdkfd/kfd_debug.c
drivers/gpu/drm/amd/amdkfd/kfd_debug.h
drivers/gpu/drm/amd/amdkfd/kfd_events.c
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/amdkfd/kfd_process.c

index 4ebfff6b6c55a44c0acde41c700e4d7e58442a65..795382b55e0a916cf61207622a994ac55c7971b1 100644 (file)
@@ -118,9 +118,9 @@ static void cik_event_interrupt_wq(struct kfd_node *dev,
                        return;
 
                if (info.vmid == vmid)
-                       kfd_signal_vm_fault_event(dev, pasid, &info);
+                       kfd_signal_vm_fault_event(dev, pasid, &info, NULL);
                else
-                       kfd_signal_vm_fault_event(dev, pasid, NULL);
+                       kfd_signal_vm_fault_event(dev, pasid, NULL, NULL);
        }
 }
 
index d4df424e45144b6f522fd638a2ba57ef524d01cc..5e57b3e96ff90dde454920bbdbcd3c99102ed359 100644 (file)
@@ -2833,6 +2833,11 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, v
                r = kfd_dbg_trap_disable(target);
                break;
        case KFD_IOC_DBG_TRAP_SEND_RUNTIME_EVENT:
+               r = kfd_dbg_send_exception_to_runtime(target,
+                               args->send_runtime_event.gpu_id,
+                               args->send_runtime_event.queue_id,
+                               args->send_runtime_event.exception_mask);
+               break;
        case KFD_IOC_DBG_TRAP_SET_EXCEPTIONS_ENABLED:
        case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_OVERRIDE:
        case KFD_IOC_DBG_TRAP_SET_WAVE_LAUNCH_MODE:
index dccb27fc764b06600fb1cb4f1b113c151fec06e5..61098975bb0ed52bdf4b367ded18c636c9bab59f 100644 (file)
@@ -125,6 +125,49 @@ bool kfd_dbg_ev_raise(uint64_t event_mask,
        return is_subscribed;
 }
 
+int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
+                                       unsigned int dev_id, /* matched against each pdd's dev->id */
+                                       unsigned int queue_id, /* only forwarded to kfd_send_exception_to_runtime() */
+                                       uint64_t error_reason) /* KFD_EC_MASK() bits; each handled bit is cleared below */
+{ /* Returns -ENODEV, the result of kfd_send_exception_to_runtime(), or 0. */
+       if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
+               struct kfd_process_device *pdd = NULL;
+               struct kfd_hsa_memory_exception_data *data;
+               int i;
+
+               for (i = 0; i < p->n_pdds; i++) { /* look up the pdd for dev_id */
+                       if (p->pdds[i]->dev->id == dev_id) {
+                               pdd = p->pdds[i];
+                               break;
+                       }
+               }
+
+               if (!pdd)
+                       return -ENODEV; /* early exit: other bits in error_reason are not processed */
+
+               data = (struct kfd_hsa_memory_exception_data *)
+                                               pdd->vm_fault_exc_data; /* NOTE(review): not NULL-checked here; confirm the callee accepts NULL */
+
+               kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
+               kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data); /* no kfd_vm_fault_info; the pdd's data is passed instead */
+               error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
+       }
+
+       if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) {
+               /*
+                * Release runtime_enable_sema. The matching block should only
+                * happen after the debugger receives the runtime enable notice.
+                */
+               up(&p->runtime_enable_sema);
+               error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME);
+       }
+
+       if (error_reason) /* whatever is left goes through the generic path */
+               return kfd_send_exception_to_runtime(p, queue_id, error_reason);
+
+       return 0;
+}
+
 static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable)
 {
        struct mqd_update_info minfo = {0};
index 66ee7b95d08a962909906e34222e7fb5938cc63f..2c6866bb8850675bfbde34716759c5f78f935956 100644 (file)
@@ -34,6 +34,12 @@ int kfd_dbg_trap_disable(struct kfd_process *target);
 int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd,
                        void __user *runtime_info,
                        uint32_t *runtime_info_size);
+
+int kfd_dbg_send_exception_to_runtime(struct kfd_process *p,
+                                       unsigned int dev_id,
+                                       unsigned int queue_id,
+                                       uint64_t error_reason);
+
 static inline bool kfd_dbg_is_per_vmid_supported(struct kfd_node *dev)
 {
        return KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2) ||
index 9926186f88a65462fe9b4acb7bf7f8de8c602074..0a5e7b172a648cb5677dc7ce4f2dd38bc63db0e1 100644 (file)
@@ -1222,7 +1222,8 @@ void kfd_signal_hw_exception_event(u32 pasid)
 }
 
 void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid,
-                              struct kfd_vm_fault_info *info)
+                               struct kfd_vm_fault_info *info,
+                               struct kfd_hsa_memory_exception_data *data)
 {
        struct kfd_event *ev;
        uint32_t id;
index 861bccb1e9dc5f89fe03851375b85cd505043086..8cf58be80f4ea3f2972181a8b938e974c962a995 100644 (file)
@@ -362,7 +362,7 @@ static void event_interrupt_wq_v9(struct kfd_node *dev,
 
                kfd_smi_event_update_vmfault(dev, pasid);
                kfd_dqm_evict_pasid(dev->dqm, pasid);
-               kfd_signal_vm_fault_event(dev, pasid, &info);
+               kfd_signal_vm_fault_event(dev, pasid, &info, NULL);
        }
 }
 
index b18cd4bf76bfbe8af15e792628210bfd3c10dd0d..58b82fa59584df24b231823b5196227e2007529c 100644 (file)
@@ -979,6 +979,7 @@ struct kfd_process {
        bool queues_paused;
 
        /* Tracks runtime enable status */
+       struct semaphore runtime_enable_sema;
        struct kfd_runtime_info runtime_info;
 
 };
@@ -1447,7 +1448,8 @@ int kfd_get_num_events(struct kfd_process *p);
 int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
 
 void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid,
-                               struct kfd_vm_fault_info *info);
+                               struct kfd_vm_fault_info *info,
+                               struct kfd_hsa_memory_exception_data *data);
 
 void kfd_signal_reset_event(struct kfd_node *dev);
 
@@ -1463,6 +1465,9 @@ static inline bool kfd_flush_tlb_after_unmap(struct kfd_dev *dev)
               KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 0);
 }
 
+int kfd_send_exception_to_runtime(struct kfd_process *p,
+                               unsigned int queue_id,
+                               uint64_t error_reason);
 bool kfd_is_locked(void);
 
 /* Compute profile */
index f904d6d6e01cd55cd4d5faa0e8519a79f250d2c3..5cbfcaf08c8f617cd4bdcdfcd6e1c96b93e57bd7 100644 (file)
@@ -1462,6 +1462,7 @@ static struct kfd_process *create_process(const struct task_struct *thread)
        process->debugger_process = NULL;
        process->exception_enable_mask = 0;
        atomic_set(&process->debugged_process_count, 0);
+       sema_init(&process->runtime_enable_sema, 0);
 
        process->pasid = kfd_pasid_alloc();
        if (process->pasid == 0) {
@@ -2120,6 +2121,75 @@ void kfd_flush_tlb(struct kfd_process_device *pdd, enum TLB_FLUSH_TYPE type)
        }
 }
 
+struct send_exception_work_handler_workarea { /* arguments handed to send_exception_work_handler() */
+       struct work_struct work; /* embedded so the handler can container_of() back to this struct */
+       struct kfd_process *p; /* process whose lead thread's mm and queues are used */
+       unsigned int queue_id; /* looked up with pqm_get_user_queue() */
+       uint64_t error_reason; /* bits OR-ed into the user-space error payload */
+};
+
+static void send_exception_work_handler(struct work_struct *work)
+{ /* Work item: OR error_reason into the queue's error payload, then set its event. */
+       struct send_exception_work_handler_workarea *workarea;
+       struct kfd_process *p;
+       struct queue *q;
+       struct mm_struct *mm;
+       struct kfd_context_save_area_header __user *csa_header;
+       uint64_t __user *err_payload_ptr;
+       uint64_t cur_err;
+       uint32_t ev_id;
+
+       workarea = container_of(work,
+                               struct send_exception_work_handler_workarea,
+                               work);
+       p = workarea->p;
+
+       mm = get_task_mm(p->lead_thread); /* paired with mmput() at the end */
+
+       if (!mm)
+               return; /* no mm available: nothing is written or signalled */
+
+       kthread_use_mm(mm); /* presumably so the __user accesses below resolve in this mm */
+
+       q = pqm_get_user_queue(&p->pqm, workarea->queue_id);
+
+       if (!q)
+               goto out; /* queue_id not found: skip the payload update and the event */
+
+       csa_header = (void __user *)q->properties.ctx_save_restore_area_address;
+
+       get_user(err_payload_ptr, (uint64_t __user **)&csa_header->err_payload_addr); /* NOTE(review): get_user()/put_user() results are ignored in this sequence */
+       get_user(cur_err, err_payload_ptr);
+       cur_err |= workarea->error_reason; /* add the new bits to whatever is already pending */
+       put_user(cur_err, err_payload_ptr);
+       get_user(ev_id, &csa_header->err_event_id);
+
+       kfd_set_event(p, ev_id); /* event id read from the save-area header above */
+
+out:
+       kthread_unuse_mm(mm);
+       mmput(mm);
+}
+
+int kfd_send_exception_to_runtime(struct kfd_process *p,
+                       unsigned int queue_id,
+                       uint64_t error_reason)
+{ /* Runs send_exception_work_handler() via schedule_work() and waits for it. */
+       struct send_exception_work_handler_workarea worker; /* on this stack frame */
+
+       INIT_WORK_ONSTACK(&worker.work, send_exception_work_handler);
+
+       worker.p = p;
+       worker.queue_id = queue_id;
+       worker.error_reason = error_reason;
+
+       schedule_work(&worker.work);
+       flush_work(&worker.work); /* handler must finish before `worker` goes out of scope */
+       destroy_work_on_stack(&worker.work);
+
+       return 0; /* always 0: the handler's early exits are not reported back */
+}
+
 struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *p, uint32_t gpu_id)
 {
        int i;
@@ -2179,4 +2249,3 @@ int kfd_debugfs_mqds_by_process(struct seq_file *m, void *data)
 }
 
 #endif
-