From 6dd84983d0c131fb9fc5e130c80b51b74e397d48 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 2 Feb 2023 10:29:47 -0600 Subject: [PATCH] [Libomptarget] Improve next-gen AMDGPU plugin error messages The next-gen plugin properly prints errors. This patch improves the error messages by including the Node-ID of the GPU that failed as well as a textual representation of the enumeration values. Reviewed By: kevinsala Differential Revision: https://reviews.llvm.org/D143192 --- openmp/libomptarget/include/Debug.h | 2 +- .../plugins-nextgen/amdgpu/src/rtl.cpp | 37 +++++++++++----------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/openmp/libomptarget/include/Debug.h b/openmp/libomptarget/include/Debug.h index 387dfe5..91c1268 100644 --- a/openmp/libomptarget/include/Debug.h +++ b/openmp/libomptarget/include/Debug.h @@ -119,7 +119,7 @@ inline uint32_t getDebugLevel() { /// Print fatal error message with a printf string and error identifier #define FATAL_MESSAGE(_num, _str, ...) \ do { \ - fprintf(stderr, GETNAME(TARGET_NAME) " fatal error %d:" _str "\n", _num, \ + fprintf(stderr, GETNAME(TARGET_NAME) " fatal error %d: " _str "\n", _num, \ __VA_ARGS__); \ abort(); \ } while (0) diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index ca313f5..6f57e19 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -431,8 +431,7 @@ struct AMDGPUKernelTy : public GenericKernelTy { /// Launch the AMDGPU kernel function. Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads, - uint64_t NumBlocks, - KernelArgsTy &KernelArgs, void *Args, + uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args, AsyncInfoWrapperTy &AsyncInfoWrapper) const override; /// The default number of blocks is common to the whole device. 
@@ -2431,38 +2430,40 @@ private: if (Event->event_type != HSA_AMD_GPU_MEMORY_FAULT_EVENT) return HSA_STATUS_SUCCESS; - std::string Reasons; + SmallVector<std::string> Reasons; uint32_t ReasonsMask = Event->memory_fault.fault_reason_mask; if (ReasonsMask & HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT) - Reasons += "HSA_AMD_MEMORY_FAULT_PAGE_NOT_PRESENT, "; + Reasons.emplace_back("Page not present or supervisor privilege"); if (ReasonsMask & HSA_AMD_MEMORY_FAULT_READ_ONLY) - Reasons += " HSA_AMD_MEMORY_FAULT_READ_ONLY, "; + Reasons.emplace_back("Write access to a read-only page"); if (ReasonsMask & HSA_AMD_MEMORY_FAULT_NX) - Reasons += " HSA_AMD_MEMORY_FAULT_NX, "; + Reasons.emplace_back("Execute access to a page marked NX"); if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HOST_ONLY) - Reasons += " HSA_AMD_MEMORY_FAULT_HOST_ONLY, "; + Reasons.emplace_back("GPU attempted access to a host only page"); if (ReasonsMask & HSA_AMD_MEMORY_FAULT_DRAMECC) - Reasons += " HSA_AMD_MEMORY_FAULT_DRAMECC, "; + Reasons.emplace_back("DRAM ECC failure"); if (ReasonsMask & HSA_AMD_MEMORY_FAULT_IMPRECISE) - Reasons += " HSA_AMD_MEMORY_FAULT_IMPRECISE, "; + Reasons.emplace_back("Can't determine the exact fault address"); if (ReasonsMask & HSA_AMD_MEMORY_FAULT_SRAMECC) - Reasons += " HSA_AMD_MEMORY_FAULT_SRAMECC, "; + Reasons.emplace_back("SRAM ECC failure (ie registers, no fault address)"); if (ReasonsMask & HSA_AMD_MEMORY_FAULT_HANG) - Reasons += " HSA_AMD_MEMORY_FAULT_HANG, "; + Reasons.emplace_back("GPU reset following unspecified hang"); // If we do not know the reason, say so, otherwise remove the trailing comma // and space. if (Reasons.empty()) - Reasons = "Unknown (Mask: " + std::to_string(ReasonsMask) + ")"; - else - Reasons.resize(Reasons.size() - /* ', ' */ 2); + Reasons.emplace_back("Unknown (" + std::to_string(ReasonsMask) + ")"); + + uint32_t Node = -1; + hsa_agent_get_info(Event->memory_fault.agent, HSA_AGENT_INFO_NODE, &Node); // Abort the execution since we do not recover from this error. 
FATAL_MESSAGE(1, - "Found HSA_AMD_GPU_MEMORY_FAULT_EVENT in agent %" PRIu64 - " at virtual address %p and reasons: %s", - Event->memory_fault.agent.handle, - (void *)Event->memory_fault.virtual_address, Reasons.data()); + "Memory access fault by GPU %" PRIu32 " (agent 0x%" PRIx64 + ") at virtual address %p. Reasons: %s", + Node, Event->memory_fault.agent.handle, + (void *)Event->memory_fault.virtual_address, + llvm::join(Reasons, ", ").c_str()); return HSA_STATUS_ERROR; } -- 2.7.4