drm/amdkfd: Add kernel parameter to stop queue eviction on vm fault
authorOak Zeng <Oak.Zeng@amd.com>
Tue, 23 Jun 2020 00:27:45 +0000 (19:27 -0500)
committerAlex Deucher <alexander.deucher@amd.com>
Wed, 24 Mar 2021 02:59:22 +0000 (22:59 -0400)
When the parameter is set, process queues are not evicted on a VM fault, which preserves the wavefront context for debugging purposes.

Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index 7294a80..5179d5f 100644 (file)
@@ -751,6 +751,13 @@ bool no_system_mem_limit;
 module_param(no_system_mem_limit, bool, 0644);
 MODULE_PARM_DESC(no_system_mem_limit, "disable system memory limit (false = default)");
 
+/**
+ * DOC: no_queue_eviction_on_vm_fault (int)
+ * If set, process queues will not be evicted on gpuvm fault. This is to keep the wavefront context for debugging (0 = queue eviction, 1 = no queue eviction). The default is 0 (queue eviction).
+ */
+int amdgpu_no_queue_eviction_on_vm_fault = 0;
+MODULE_PARM_DESC(no_queue_eviction_on_vm_fault, "No queue eviction on VM fault (0 = queue eviction, 1 = no queue eviction)");
+module_param_named(no_queue_eviction_on_vm_fault, amdgpu_no_queue_eviction_on_vm_fault, int, 0444);
 #endif
 
 /**
index fe14e47..f623301 100644 (file)
@@ -80,8 +80,9 @@ static bool cik_event_interrupt_isr(struct kfd_dev *dev,
                ihre->source_id == CIK_INTSRC_SDMA_TRAP ||
                ihre->source_id == CIK_INTSRC_SQ_INTERRUPT_MSG ||
                ihre->source_id == CIK_INTSRC_CP_BAD_OPCODE ||
-               ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
-               ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT;
+               ((ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
+               ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) &&
+               !amdgpu_no_queue_eviction_on_vm_fault);
 }
 
 static void cik_event_interrupt_wq(struct kfd_dev *dev,
index 74a460b..1c20458 100644 (file)
@@ -98,9 +98,10 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
                source_id == SOC15_INTSRC_SDMA_TRAP ||
                source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
                source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
-               client_id == SOC15_IH_CLIENTID_VMC ||
+               ((client_id == SOC15_IH_CLIENTID_VMC ||
                client_id == SOC15_IH_CLIENTID_VMC1 ||
-               client_id == SOC15_IH_CLIENTID_UTCL2;
+               client_id == SOC15_IH_CLIENTID_UTCL2) &&
+               !amdgpu_no_queue_eviction_on_vm_fault);
 }
 
 static void event_interrupt_wq_v9(struct kfd_dev *dev,
index 0b59748..d8c8b5f 100644 (file)
@@ -169,6 +169,11 @@ extern bool hws_gws_support;
 /* Queue preemption timeout in ms */
 extern int queue_preemption_timeout_ms;
 
+/*
+ * Don't evict process queues on vm fault
+ */
+extern int amdgpu_no_queue_eviction_on_vm_fault;
+
 /* Enable eviction debug messages */
 extern bool debug_evictions;