From 53a4adc0deb29fcc1f907ea7bc151fdebecf406d Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak
Date: Fri, 5 May 2023 22:03:40 +0000
Subject: [PATCH] [AMDGPU] Fix crash with 160-bit p7's by manually defining
 getPointerTy

While pointers in address space 7 (128-bit rsrc + 32-bit offset) should
be rewritten out of the code before IR translation on AMDGPU,
higher-level analyses may still call MVT getPointerTy() and the like on
the target machine. Currently, since there is no MVT::i160, such calls
end up crashing. The data layout changes that exposed these crashes
landed in D149776.

This patch makes getPointerTy() return MVT::v5i32 and getPointerMemTy()
return MVT::v8i32. These are accurate types, but they mean that we
cannot use vectors of address space 7 pointers during codegen. This is
mostly OK, since vectors of buffer pointers aren't supported in LLPC
anyway, but it is a noticeable limitation. Potential alternative
solutions include adjusting getPointerTy() to return an EVT, or adding
MVT::i160 and MVT::i256, both of which would be rather disruptive to
the rest of the compiler.

Reviewed By: foad

Differential Revision: https://reviews.llvm.org/D150002
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp          | 20 ++++++++
 llvm/lib/Target/AMDGPU/SIISelLowering.h            |  6 +++
 ...anslator-non-integral-address-spaces-vectors.ll | 14 +++++
 .../irtranslator-non-integral-address-spaces.ll    | 20 ++++++++
 .../AMDGPU/addrspace-7-doesnt-crash.ll             | 60 ++++++++++++++++++++++
 5 files changed, 120 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces.ll
 create mode 100644 llvm/test/Transforms/IndVarSimplify/AMDGPU/addrspace-7-doesnt-crash.ll

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 6c8aa6c..6fad74c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -979,6 +979,26 @@ static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
   return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
 }
 
+/// Map address space 7 to MVT::v5i32 because that's its in-register
+/// representation. This return value is vector-typed because there is no
+/// MVT::i160 and it is not clear if one can be added. While this could
+/// cause issues during codegen, these address space 7 pointers will be
+/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
+/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
+/// modeling, to work.
+MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
+  if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
+    return MVT::v5i32;
+  return AMDGPUTargetLowering::getPointerTy(DL, AS);
+}
+/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
+/// v8i32 when padding is added.
+MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
+  if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
+    return MVT::v8i32;
+  return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
+}
+
 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &CI,
                                           MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 7e4b431..e82c97e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -273,6 +273,12 @@ public:
   bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const override;
 
+  // While address space 7 should never make it to codegen, it still needs to
+  // have an MVT to keep some analyses that query this function from breaking,
+  // so, to work around the lack of i160, map it to v5i32.
+  MVT getPointerTy(const DataLayout &DL, unsigned AS) const override;
+  MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override;
+
   bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &,
                           MachineFunction &MF,
                           unsigned IntrinsicID) const override;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll
new file mode 100644
index 0000000..43ade02
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces-vectors.ll
@@ -0,0 +1,14 @@
+; RUN: not --crash llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - -stop-after=irtranslator < %s
+
+; Confirm that vectors of addrspace(7) pointers haven't incidentally been
+; allowed through the IR translator.
+
+define <2 x ptr addrspace(7)> @no_auto_constfold_gep_vector() {
+  %gep = getelementptr i8, <2 x ptr addrspace(7)> zeroinitializer, <2 x i32> <i32 123, i32 123>
+  ret <2 x ptr addrspace(7)> %gep
+}
+
+define <2 x ptr addrspace(7)> @gep_vector_splat(<2 x ptr addrspace(7)> %ptrs, i64 %idx) {
+  %gep = getelementptr i8, <2 x ptr addrspace(7)> %ptrs, i64 %idx
+  ret <2 x ptr addrspace(7)> %gep
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces.ll
new file mode 100644
index 0000000..93cc0d9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-non-integral-address-spaces.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - -stop-after=irtranslator %s | FileCheck %s
+
+; Check that the CSEMIRBuilder doesn't fold away the getelementptr during IRTranslator
+define ptr addrspace(7) @no_auto_constfold_gep() {
+  ; CHECK-LABEL: name: no_auto_constfold_gep
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(p7) = G_CONSTANT i160 0
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s160) = G_CONSTANT i160 123
+  ; CHECK-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p7) = G_PTR_ADD [[C]], [[C1]](s160)
+  ; CHECK-NEXT:   [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[PTR_ADD]](p7)
+  ; CHECK-NEXT:   $vgpr0 = COPY [[UV]](s32)
+  ; CHECK-NEXT:   $vgpr1 = COPY [[UV1]](s32)
+  ; CHECK-NEXT:   $vgpr2 = COPY [[UV2]](s32)
+  ; CHECK-NEXT:   $vgpr3 = COPY [[UV3]](s32)
+  ; CHECK-NEXT:   $vgpr4 = COPY [[UV4]](s32)
+  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4
+  %gep = getelementptr i8, ptr addrspace(7) null, i32 123
+  ret ptr addrspace(7) %gep
+}
diff --git a/llvm/test/Transforms/IndVarSimplify/AMDGPU/addrspace-7-doesnt-crash.ll b/llvm/test/Transforms/IndVarSimplify/AMDGPU/addrspace-7-doesnt-crash.ll
new file mode 100644
index 0000000..98d7b46
--- /dev/null
+++ b/llvm/test/Transforms/IndVarSimplify/AMDGPU/addrspace-7-doesnt-crash.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -passes=indvars -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
+target triple = "amdgcn--amdpal"
+
+define void @f(ptr addrspace(7) %arg) {
+; CHECK-LABEL: define void @f
+; CHECK-SAME: (ptr addrspace(7) [[ARG:%.*]]) {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br i1 false, label [[BB2:%.*]], label [[BB1]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr addrspace(7) [[ARG]], i32 8
+; CHECK-NEXT:    br label [[BB3:%.*]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[I4:%.*]] = load i32, ptr addrspace(7) [[SCEVGEP]], align 4
+; CHECK-NEXT:    br label [[BB3]]
+;
+bb:
+  br label %bb1
+bb1:
+  %i = getelementptr i32, ptr addrspace(7) %arg, i32 2
+  br i1 false, label %bb2, label %bb1
+bb2:
+  br label %bb3
+bb3:
+  %i4 = load i32, ptr addrspace(7) %i, align 4
+  br label %bb3
+}
+
+define void @f2(<2 x ptr addrspace(7)> %arg) {
+; CHECK-LABEL: define void @f2
+; CHECK-SAME: (<2 x ptr addrspace(7)> [[ARG:%.*]]) {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[P:%.*]] = extractelement <2 x ptr addrspace(7)> [[ARG]], i32 0
+; CHECK-NEXT:    [[I:%.*]] = getelementptr i32, ptr addrspace(7) [[P]], i32 2
+; CHECK-NEXT:    br i1 false, label [[BB2:%.*]], label [[BB1]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[I_LCSSA:%.*]] = phi ptr addrspace(7) [ [[I]], [[BB1]] ]
+; CHECK-NEXT:    br label [[BB3:%.*]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[I4:%.*]] = load i32, ptr addrspace(7) [[I_LCSSA]], align 4
+; CHECK-NEXT:    br label [[BB3]]
+;
+bb:
+  br label %bb1
+bb1:
+  %p = extractelement <2 x ptr addrspace(7)> %arg, i32 0
+  %i = getelementptr i32, ptr addrspace(7) %p, i32 2
+  br i1 false, label %bb2, label %bb1
+bb2:
+  br label %bb3
+bb3:
+  %i4 = load i32, ptr addrspace(7) %i, align 4
+  br label %bb3
+}
-- 
2.7.4
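
For readers who want to see the failure mode concretely: the crash path this
patch addresses is any pre-codegen analysis asking the target for the MVT of
an address space 7 pointer. Below is a minimal C++ sketch of such a query,
not part of the patch itself; it assumes an already-constructed TargetLowering
and module DataLayout, and the function name queryBufferFatPointerTys is
hypothetical.

    #include "llvm/CodeGen/TargetLowering.h"
    #include "llvm/IR/DataLayout.h"

    using namespace llvm;

    // Sketch: what a cost-modeling analysis effectively does when it sees a
    // ptr addrspace(7) value. Before this patch, the default implementation
    // computed MVT::getIntegerVT(160); since no MVT::i160 exists, the invalid
    // result led to crashes downstream. With the overrides, address space 7
    // maps to MVT::v5i32 (register type) and MVT::v8i32 (memory type).
    static void queryBufferFatPointerTys(const TargetLowering &TLI,
                                         const DataLayout &DL) {
      MVT RegTy = TLI.getPointerTy(DL, /*AS=*/7);    // MVT::v5i32 after this patch
      MVT MemTy = TLI.getPointerMemTy(DL, /*AS=*/7); // MVT::v8i32 after this patch
      (void)RegTy;
      (void)MemTy;
    }

Note the asymmetry the two return values encode: the 160-bit pointer occupies
five 32-bit registers, but its in-memory size is padded to 256 bits, hence the
distinct register and memory types.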