drm/amdgpu: Fix the race condition for draining retry fault

author Emily Deng <Emily.Deng@amd.com>

Thu, 6 Mar 2025 00:40:01 +0000 (08:40 +0800)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Sun, 20 Apr 2025 08:15:26 +0000 (10:15 +0200)
author Emily Deng <Emily.Deng@amd.com>
Thu, 6 Mar 2025 00:40:01 +0000 (08:40 +0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 20 Apr 2025 08:15:26 +0000 (10:15 +0200)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c

index 8c61dee5ca0db10668b1d73ad1979dc66d62e5a3..b50283864dcd26d140009434d71c0aa3fec8b459 100644 (file)
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -2992,19 +2992,6 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
                 goto out;
         }
  
-       /* check if this page fault time stamp is before svms->checkpoint_ts */
-       if (svms->checkpoint_ts[gpuidx] != 0) {
-               if (amdgpu_ih_ts_after(ts,  svms->checkpoint_ts[gpuidx])) {
-                       pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
-                       r = 0;
-                       goto out;
-               } else
-                       /* ts is after svms->checkpoint_ts now, reset svms->checkpoint_ts
-                        * to zero to avoid following ts wrap around give wrong comparing
-                        */
-                       svms->checkpoint_ts[gpuidx] = 0;
-       }
-
         if (!p->xnack_enabled) {
                 pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
                 r = -EFAULT;
@@ -3024,6 +3011,21 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
         mmap_read_lock(mm);
  retry_write_locked:
         mutex_lock(&svms->lock);
+
+       /* check if this page fault time stamp is before svms->checkpoint_ts */
+       if (svms->checkpoint_ts[gpuidx] != 0) {
+               if (amdgpu_ih_ts_after(ts,  svms->checkpoint_ts[gpuidx])) {
+                       pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
+                       r = -EAGAIN;
+                       goto out_unlock_svms;
+               } else {
+                       /* ts is now after svms->checkpoint_ts; reset it to zero so that
+                        * a later ts wrap-around cannot produce a wrong comparison
+                        */
+                       svms->checkpoint_ts[gpuidx] = 0;
+               }
+       }
+
         prange = svm_range_from_addr(svms, addr, NULL);
         if (!prange) {
                 pr_debug("failed to find prange svms 0x%p address [0x%llx]\n",
@@ -3148,7 +3150,8 @@ out_unlock_svms:
         mutex_unlock(&svms->lock);
         mmap_read_unlock(mm);
  
-       svm_range_count_fault(node, p, gpuidx);
+       if (r != -EAGAIN)
+               svm_range_count_fault(node, p, gpuidx);
  
         mmput(mm);
  out:
author	Emily Deng <Emily.Deng@amd.com>
	Thu, 6 Mar 2025 00:40:01 +0000 (08:40 +0800)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Sun, 20 Apr 2025 08:15:26 +0000 (10:15 +0200)