drm/amdkfd: add RAS ECC event support (v3)

author Eric Huang <JinhuiEric.Huang@amd.com>

Fri, 11 Jan 2019 19:38:51 +0000 (14:38 -0500)

committer Alex Deucher <alexander.deucher@amd.com>

Tue, 19 Mar 2019 20:36:51 +0000 (15:36 -0500)
author Eric Huang <JinhuiEric.Huang@amd.com>
Fri, 11 Jan 2019 19:38:51 +0000 (14:38 -0500)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 19 Mar 2019 20:36:51 +0000 (15:36 -0500)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c

index fe1d736..acf8ae0 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -640,4 +640,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
  void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
  {
  }
+
+void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
+{
+}
  #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h

index 0e1711a..e6a5037 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -229,5 +229,6 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm);
  int kgd2kfd_resume_mm(struct mm_struct *mm);
  int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
                                                struct dma_fence *fence);
+void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
  
  #endif /* AMDGPU_AMDKFD_H_INCLUDED */
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

index 88c45f9..6bb71f6 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4805,6 +4805,7 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
                 struct amdgpu_iv_entry *entry)
  {
         /* TODO ue will trigger an interrupt. */
+       kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
         amdgpu_ras_reset_gpu(adev, 0);
         return AMDGPU_RAS_UE;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

index 2daa5ea..0252345 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -354,6 +354,7 @@ static int gmc_v9_0_ecc_interrupt_state(struct amdgpu_device *adev,
  static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
                 struct amdgpu_iv_entry *entry)
  {
+       kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
         amdgpu_ras_reset_gpu(adev, 0);
         return AMDGPU_RAS_UE;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c

index 058b9da..f7a6faf 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1851,6 +1851,8 @@ static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,
                 return 0;
         }
  
+       kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+
         amdgpu_ras_reset_gpu(adev, 0);
  
         return AMDGPU_RAS_UE;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c

index 8be9677..b3cdbf7 100644 (file)
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -466,6 +466,8 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
         memset(&kfd->doorbell_available_index, 0,
                 sizeof(kfd->doorbell_available_index));
  
+       atomic_set(&kfd->sram_ecc_flag, 0);
+
         return kfd;
  }
  
@@ -661,6 +663,9 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
                 return ret;
         count = atomic_dec_return(&kfd_locked);
         WARN_ONCE(count != 0, "KFD reset ref. error");
+
+       atomic_set(&kfd->sram_ecc_flag, 0);
+
         return 0;
  }
  
@@ -1024,6 +1029,12 @@ int kfd_gtt_sa_free(struct kfd_dev *kfd, struct kfd_mem_obj *mem_obj)
         return 0;
  }
  
+void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
+{
+       if (kfd)
+               atomic_inc(&kfd->sram_ecc_flag);
+}
+
  #if defined(CONFIG_DEBUG_FS)
  
  /* This function will send a package to HIQ to hang the HWS
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c

index e9f0e0a..6e1d41c 100644 (file)
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1011,25 +1011,41 @@ void kfd_signal_vm_fault_event(struct kfd_dev *dev, unsigned int pasid,
  void kfd_signal_reset_event(struct kfd_dev *dev)
  {
         struct kfd_hsa_hw_exception_data hw_exception_data;
+       struct kfd_hsa_memory_exception_data memory_exception_data;
         struct kfd_process *p;
         struct kfd_event *ev;
         unsigned int temp;
         uint32_t id, idx;
+       int reset_cause = atomic_read(&dev->sram_ecc_flag) ?
+                       KFD_HW_EXCEPTION_ECC :
+                       KFD_HW_EXCEPTION_GPU_HANG;
  
         /* Whole gpu reset caused by GPU hang and memory is lost */
         memset(&hw_exception_data, 0, sizeof(hw_exception_data));
         hw_exception_data.gpu_id = dev->id;
         hw_exception_data.memory_lost = 1;
+       hw_exception_data.reset_cause = reset_cause;
+
+       memset(&memory_exception_data, 0, sizeof(memory_exception_data));
+       memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
+       memory_exception_data.gpu_id = dev->id;
+       memory_exception_data.failure.imprecise = true;
  
         idx = srcu_read_lock(&kfd_processes_srcu);
         hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
                 mutex_lock(&p->event_mutex);
                 id = KFD_FIRST_NONSIGNAL_EVENT_ID;
-               idr_for_each_entry_continue(&p->event_idr, ev, id)
+               idr_for_each_entry_continue(&p->event_idr, ev, id) {
                         if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
                                 ev->hw_exception_data = hw_exception_data;
                                 set_event(ev);
                         }
+                       if (ev->type == KFD_EVENT_TYPE_MEMORY &&
+                           reset_cause == KFD_HW_EXCEPTION_ECC) {
+                               ev->memory_exception_data = memory_exception_data;
+                               set_event(ev);
+                       }
+               }
                 mutex_unlock(&p->event_mutex);
         }
         srcu_read_unlock(&kfd_processes_srcu, idx);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index 0eeee3c..9e02309 100644 (file)
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -276,6 +276,9 @@ struct kfd_dev {
         uint64_t hive_id;
  
         bool pci_atomic_requested;
+
+       /* SRAM ECC flag */
+       atomic_t sram_ecc_flag;
  };
  
  enum kfd_mempool {
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h

index e622fd1..dc067ed 100644 (file)
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -211,6 +211,11 @@ struct kfd_ioctl_dbg_wave_control_args {
  #define KFD_HW_EXCEPTION_GPU_HANG      0
  #define KFD_HW_EXCEPTION_ECC           1
  
+/* For kfd_hsa_memory_exception_data.ErrorType */
+#define KFD_MEM_ERR_NO_RAS             0
+#define KFD_MEM_ERR_SRAM_ECC           1
+#define KFD_MEM_ERR_POISON_CONSUMED    2
+#define KFD_MEM_ERR_GPU_HANG           3
  
  struct kfd_ioctl_create_event_args {
         __u64 event_page_offset;        /* from KFD */
@@ -250,7 +255,12 @@ struct kfd_hsa_memory_exception_data {
         struct kfd_memory_exception_failure failure;
         __u64 va;
         __u32 gpu_id;
-       __u32 pad;
+       __u32 ErrorType; /* 0 = no RAS error,
+                         * 1 = ECC_SRAM,
+                         * 2 = Link_SYNFLOOD (poison),
+                         * 3 = GPU hang (not attributable to a specific cause),
+                         * other values reserved
+                         */
  };
  
  /* hw exception data */
author	Eric Huang <JinhuiEric.Huang@amd.com>
	Fri, 11 Jan 2019 19:38:51 +0000 (14:38 -0500)
committer	Alex Deucher <alexander.deucher@amd.com>
	Tue, 19 Mar 2019 20:36:51 +0000 (15:36 -0500)
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c		patch \| blob \| history
drivers/gpu/drm/amd/amdkfd/kfd_device.c		patch \| blob \| history
drivers/gpu/drm/amd/amdkfd/kfd_events.c		patch \| blob \| history
drivers/gpu/drm/amd/amdkfd/kfd_priv.h		patch \| blob \| history
include/uapi/linux/kfd_ioctl.h		patch \| blob \| history