void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
{
}
+
+void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
+{
+}
#endif
int kgd2kfd_resume_mm(struct mm_struct *mm);
int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
struct dma_fence *fence);
+void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
#endif /* AMDGPU_AMDKFD_H_INCLUDED */
struct amdgpu_iv_entry *entry)
{
/* TODO ue will trigger an interrupt. */
+ kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
amdgpu_ras_reset_gpu(adev, 0);
return AMDGPU_RAS_UE;
}
static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry)
{
+ kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
amdgpu_ras_reset_gpu(adev, 0);
return AMDGPU_RAS_UE;
}
return 0;
}
+ kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+
amdgpu_ras_reset_gpu(adev, 0);
return AMDGPU_RAS_UE;
memset(&kfd->doorbell_available_index, 0,
sizeof(kfd->doorbell_available_index));
+ atomic_set(&kfd->sram_ecc_flag, 0);
+
return kfd;
}
return ret;
count = atomic_dec_return(&kfd_locked);
WARN_ONCE(count != 0, "KFD reset ref. error");
+
+ atomic_set(&kfd->sram_ecc_flag, 0);
+
return 0;
}
return 0;
}
+void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
+{
+ if (kfd)
+ atomic_inc(&kfd->sram_ecc_flag);
+}
+
#if defined(CONFIG_DEBUG_FS)
/* This function will send a package to HIQ to hang the HWS
void kfd_signal_reset_event(struct kfd_dev *dev)
{
struct kfd_hsa_hw_exception_data hw_exception_data;
+ struct kfd_hsa_memory_exception_data memory_exception_data;
struct kfd_process *p;
struct kfd_event *ev;
unsigned int temp;
uint32_t id, idx;
+ int reset_cause = atomic_read(&dev->sram_ecc_flag) ?
+ KFD_HW_EXCEPTION_ECC :
+ KFD_HW_EXCEPTION_GPU_HANG;
/* Whole gpu reset caused by GPU hang and memory is lost */
memset(&hw_exception_data, 0, sizeof(hw_exception_data));
hw_exception_data.gpu_id = dev->id;
hw_exception_data.memory_lost = 1;
+ hw_exception_data.reset_cause = reset_cause;
+
+ memset(&memory_exception_data, 0, sizeof(memory_exception_data));
+ memory_exception_data.ErrorType = KFD_MEM_ERR_SRAM_ECC;
+ memory_exception_data.gpu_id = dev->id;
+ memory_exception_data.failure.imprecise = true;
idx = srcu_read_lock(&kfd_processes_srcu);
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
mutex_lock(&p->event_mutex);
id = KFD_FIRST_NONSIGNAL_EVENT_ID;
- idr_for_each_entry_continue(&p->event_idr, ev, id)
+ idr_for_each_entry_continue(&p->event_idr, ev, id) {
if (ev->type == KFD_EVENT_TYPE_HW_EXCEPTION) {
ev->hw_exception_data = hw_exception_data;
set_event(ev);
}
+ if (ev->type == KFD_EVENT_TYPE_MEMORY &&
+ reset_cause == KFD_HW_EXCEPTION_ECC) {
+ ev->memory_exception_data = memory_exception_data;
+ set_event(ev);
+ }
+ }
mutex_unlock(&p->event_mutex);
}
srcu_read_unlock(&kfd_processes_srcu, idx);
uint64_t hive_id;
bool pci_atomic_requested;
+
+ /* SRAM ECC flag */
+ atomic_t sram_ecc_flag;
};
enum kfd_mempool {
#define KFD_HW_EXCEPTION_GPU_HANG 0
#define KFD_HW_EXCEPTION_ECC 1
+/* For kfd_hsa_memory_exception_data.ErrorType */
+#define KFD_MEM_ERR_NO_RAS 0
+#define KFD_MEM_ERR_SRAM_ECC 1
+#define KFD_MEM_ERR_POISON_CONSUMED 2
+#define KFD_MEM_ERR_GPU_HANG 3
struct kfd_ioctl_create_event_args {
__u64 event_page_offset; /* from KFD */
struct kfd_memory_exception_failure failure;
__u64 va;
__u32 gpu_id;
- __u32 pad;
+ __u32 ErrorType; /* 0 = no RAS error,
+ * 1 = ECC_SRAM,
+ * 2 = Link_SYNFLOOD (poison),
+ * 3 = GPU hang (not attributable to a specific cause),
+ * other values reserved
+ */
};
/* hw exception data */