drm/amdkfd: Handle VM faults in KFD
author shaoyunl <Shaoyun.Liu@amd.com>
Thu, 12 Jul 2018 02:32:50 +0000 (22:32 -0400)
committer Oded Gabbay <oded.gabbay@gmail.com>
Thu, 12 Jul 2018 02:32:50 +0000 (22:32 -0400)
1. Pre-GFX9 the amdgpu ISR saves the vm-fault status and address
   per-vmid. amdkfd needs to get the information from amdgpu through the
   new get_vm_fault_info interface. On GFX9 and later, all the required
   information is in the IH ring.
2. amdkfd unmaps all queues from the faulting process and creates a new
   run-list without the guilty process.
3. amdkfd notifies the runtime of the vm fault trap via a
   KFD_EVENT_TYPE_MEMORY event.

Signed-off-by: shaoyun liu <shaoyun.liu@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
drivers/gpu/drm/amd/amdkfd/cik_int.h
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_events.c
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
include/uapi/linux/kfd_ioctl.h

index 49df6c7..cc33870 100644 (file)
@@ -48,18 +48,19 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev,
        return ihre->source_id == CIK_INTSRC_CP_END_OF_PIPE ||
                ihre->source_id == CIK_INTSRC_SDMA_TRAP ||
                ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG ||
-               ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE;
+               ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE ||
+               ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
+               ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT;
 }
 
 static void cik_event_interrupt_wq(struct kfd_dev *dev,
                                        const uint32_t *ih_ring_entry)
 {
-       unsigned int pasid;
        const struct cik_ih_ring_entry *ihre =
                        (const struct cik_ih_ring_entry *)ih_ring_entry;
        uint32_t context_id = ihre->data & 0xfffffff;
-
-       pasid = (ihre->ring_id & 0xffff0000) >> 16;
+       unsigned int vmid  = (ihre->ring_id & 0x0000ff00) >> 8;
+       unsigned int pasid = (ihre->ring_id & 0xffff0000) >> 16;
 
        if (pasid == 0)
                return;
@@ -72,6 +73,22 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
                kfd_signal_event_interrupt(pasid, context_id & 0xff, 8);
        else if (ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE)
                kfd_signal_hw_exception_event(pasid);
+       else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
+               ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
+               struct kfd_vm_fault_info info;
+
+               kfd_process_vm_fault(dev->dqm, pasid);
+
+               memset(&info, 0, sizeof(info));
+               dev->kfd2kgd->get_vm_fault_info(dev->kgd, &info);
+               if (!info.page_addr && !info.status)
+                       return;
+
+               if (info.vmid == vmid)
+                       kfd_signal_vm_fault_event(dev, pasid, &info);
+               else
+                       kfd_signal_vm_fault_event(dev, pasid, NULL);
+       }
 }
 
 const struct kfd_event_interrupt_class event_interrupt_class_cik = {
index 109298b..a2079a0 100644 (file)
@@ -37,6 +37,8 @@ struct cik_ih_ring_entry {
 #define CIK_INTSRC_DEQUEUE_COMPLETE    0xC6
 #define CIK_INTSRC_SDMA_TRAP           0xE0
 #define CIK_INTSRC_SQ_INTERRUPT_MSG    0xEF
+#define CIK_INTSRC_GFX_PAGE_INV_FAULT  0x92
+#define CIK_INTSRC_GFX_MEM_PROT_FAULT  0x93
 
 #endif
 
index f2f81d2..44fc203 100644 (file)
@@ -1684,6 +1684,23 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
        kfree(dqm);
 }
 
+int kfd_process_vm_fault(struct device_queue_manager *dqm,
+                        unsigned int pasid)
+{
+       struct kfd_process_device *pdd;
+       struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+       int ret = 0;
+
+       if (!p)
+               return -EINVAL;
+       pdd = kfd_get_process_device_data(dqm->dev, p);
+       if (pdd)
+               ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
+       kfd_unref_process(p);
+
+       return ret;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
 static void seq_reg_dump(struct seq_file *m,
index 3d5a833..b58a0e6 100644 (file)
@@ -963,3 +963,40 @@ void kfd_signal_hw_exception_event(unsigned int pasid)
        mutex_unlock(&p->event_mutex);
        kfd_unref_process(p);
 }
+
+void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
+                               struct kfd_vm_fault_info *info)
+{
+       struct kfd_event *ev;
+       uint32_t id;
+       struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+       struct kfd_hsa_memory_exception_data memory_exception_data;
+
+       if (!p)
+               return; /* Presumably process exited. */
+       memset(&memory_exception_data, 0, sizeof(memory_exception_data));
+       memory_exception_data.gpu_id = dev->id;
+       memory_exception_data.failure.imprecise = 1;
+       /* Set failure reason */
+       if (info) {
+               memory_exception_data.va = (info->page_addr) << PAGE_SHIFT;
+               memory_exception_data.failure.NotPresent =
+                       info->prot_valid ? 1 : 0;
+               memory_exception_data.failure.NoExecute =
+                       info->prot_exec ? 1 : 0;
+               memory_exception_data.failure.ReadOnly =
+                       info->prot_write ? 1 : 0;
+               memory_exception_data.failure.imprecise = 0;
+       }
+       mutex_lock(&p->event_mutex);
+
+       id = KFD_FIRST_NONSIGNAL_EVENT_ID;
+       idr_for_each_entry_continue(&p->event_idr, ev, id)
+               if (ev->type == KFD_EVENT_TYPE_MEMORY) {
+                       ev->memory_exception_data = memory_exception_data;
+                       set_event(ev);
+               }
+
+       mutex_unlock(&p->event_mutex);
+       kfd_unref_process(p);
+}
index 37029ba..d6b64e6 100644 (file)
@@ -57,7 +57,9 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
        return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
                source_id == SOC15_INTSRC_SDMA_TRAP ||
                source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
-               source_id == SOC15_INTSRC_CP_BAD_OPCODE;
+               source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
+               client_id == SOC15_IH_CLIENTID_VMC ||
+               client_id == SOC15_IH_CLIENTID_UTCL2;
 }
 
 static void event_interrupt_wq_v9(struct kfd_dev *dev,
@@ -82,7 +84,19 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
                kfd_signal_hw_exception_event(pasid);
        else if (client_id == SOC15_IH_CLIENTID_VMC ||
                 client_id == SOC15_IH_CLIENTID_UTCL2) {
-               /* TODO */
+               struct kfd_vm_fault_info info = {0};
+               uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
+
+               info.vmid = vmid;
+               info.mc_id = client_id;
+               info.page_addr = ih_ring_entry[4] |
+                       (uint64_t)(ih_ring_entry[5] & 0xf) << 32;
+               info.prot_valid = ring_id & 0x08;
+               info.prot_read  = ring_id & 0x10;
+               info.prot_write = ring_id & 0x20;
+
+               kfd_process_vm_fault(dev->dqm, pasid);
+               kfd_signal_vm_fault_event(dev, pasid, &info);
        }
 }
 
index 5e3990b..91a3368 100644 (file)
@@ -838,6 +838,7 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm);
 struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
                                        enum kfd_queue_type type);
 void kernel_queue_uninit(struct kernel_queue *kq);
+int kfd_process_vm_fault(struct device_queue_manager *dqm, unsigned int pasid);
 
 /* Process Queue Manager */
 struct process_queue_node {
@@ -964,6 +965,9 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
                     uint64_t *event_page_offset, uint32_t *event_slot_index);
 int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
 
+void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
+                               struct kfd_vm_fault_info *info);
+
 void kfd_flush_tlb(struct kfd_process_device *pdd);
 
 int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
index b4f5073..46a54ab 100644 (file)
@@ -219,7 +219,7 @@ struct kfd_memory_exception_failure {
        __u32 NotPresent;       /* Page not present or supervisor privilege */
        __u32 ReadOnly; /* Write access to a read-only page */
        __u32 NoExecute;        /* Execute access to a page marked NX */
-       __u32 pad;
+       __u32 imprecise;        /* Can't determine the  exact fault address */
 };
 
 /* memory exception data*/