From b5613ecf173d5eeca82dce3be2b269feb4a75082 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 7 Dec 2018 22:12:17 +0000
Subject: [PATCH] AMDGPU: Fix offsets for < 4-byte aggregate kernel arguments

We were still using the rounded down offset and alignment even though
they aren't handled because you can't trivially bitcast the loaded
value.

llvm-svn: 348658
---
 llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 11 +++++++----
 llvm/test/CodeGen/AMDGPU/kernel-args.ll               | 16 ++++++++++++----
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index fae1da9..743dc7a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -122,14 +122,17 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
 
     VectorType *VT = dyn_cast<VectorType>(ArgTy);
     bool IsV3 = VT && VT->getNumElements() == 3;
+    bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();
+
     VectorType *V4Ty = nullptr;
 
     int64_t AlignDownOffset = alignDown(EltOffset, 4);
     int64_t OffsetDiff = EltOffset - AlignDownOffset;
-    unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset);
+    unsigned AdjustedAlign = MinAlign(DoShiftOpt ? AlignDownOffset : EltOffset,
+                                      KernArgBaseAlign);
 
     Value *ArgPtr;
-    if (Size < 32 && !ArgTy->isAggregateType()) { // FIXME: Handle aggregate types
+    if (DoShiftOpt) { // FIXME: Handle aggregate types
       // Since we don't have sub-dword scalar loads, avoid doing an extload by
       // loading earlier than the argument address, and extracting the relevant
       // bits.
@@ -147,7 +150,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
     } else {
       ArgPtr = Builder.CreateConstInBoundsGEP1_64(
         KernArgSegment,
-        AlignDownOffset,
+        EltOffset,
         Arg.getName() + ".kernarg.offset");
       ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS),
                                      ArgPtr->getName() + ".cast");
@@ -198,7 +201,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
 
     // TODO: Convert noalias arg to !noalias
 
-    if (Size < 32 && !ArgTy->isAggregateType()) {
+    if (DoShiftOpt) {
       Value *ExtractBits = OffsetDiff == 0 ?
         Load : Builder.CreateLShr(Load, OffsetDiff * 8);
 
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 941c155..6b64731 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -739,10 +739,10 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
 ; multiple.
 ; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
 ; HSA-GFX9: kernarg_segment_byte_size = 28
+; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
+; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
 ; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
 ; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
-; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
-; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
   %val0 = extractvalue <{i32, i64}> %arg0, 0
   %val1 = extractvalue <{i32, i64}> %arg0, 1
@@ -789,10 +789,18 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
 ; FIXME: Why not all scalar loads?
 ; GCN-LABEL: {{^}}array_3xi16:
 ; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:2
-; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
+; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4
+; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:6
 define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
   store volatile i8 %arg0, i8 addrspace(1)* undef
   store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
   ret void
 }
+
+; GCN-LABEL: {{^}}small_array_round_down_offset:
+; HSA-GFX9: global_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:1
+define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) {
+  %val = extractvalue [1 x i8] %arg, 0
+  store volatile i8 %val, i8 addrspace(1)* undef
+  ret void
+}
-- 
2.7.4
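
Reviewer note, not part of the patch: below is a minimal, standalone C++ sketch of the offset computation this change fixes. The helpers alignDown and MinAlign are local stand-ins mirroring llvm/Support/MathExtras.h, and the concrete values (a kernarg base alignment of 16, a [1 x i8] aggregate at byte offset 1) are illustrative assumptions chosen to match the new small_array_round_down_offset test, not values taken from the patch itself.

// Sketch of the kernarg offset/alignment logic before and after the fix.
#include <cstdint>
#include <cstdio>

// Round Value down to the nearest multiple of Align (a power of two),
// mirroring llvm::alignDown.
static uint64_t alignDown(uint64_t Value, uint64_t Align) {
  return Value & ~(Align - 1);
}

// Largest power of two dividing both A and B, mirroring llvm::MinAlign.
static uint64_t MinAlign(uint64_t A, uint64_t B) {
  return (A | B) & (1 + ~(A | B));
}

int main() {
  const uint64_t KernArgBaseAlign = 16; // assumed kernarg segment alignment
  const uint64_t EltOffset = 1;         // byte offset of the [1 x i8] arg
  const unsigned Size = 8;              // argument size in bits
  const bool IsAggregate = true;

  // The shift optimization only applies to sub-dword scalar arguments;
  // an aggregate can't simply be bitcast out of the widened dword load.
  const bool DoShiftOpt = Size < 32 && !IsAggregate;

  const uint64_t AlignDownOffset = alignDown(EltOffset, 4);

  // Before the patch: the rounded-down offset and its alignment were used
  // even for aggregates, so the load read from the wrong address.
  printf("old: offset=%llu align=%llu\n",
         (unsigned long long)AlignDownOffset,
         (unsigned long long)MinAlign(KernArgBaseAlign, AlignDownOffset));

  // After the patch: aggregates load from their exact offset, with the
  // alignment derived from that offset.
  const uint64_t UseOffset = DoShiftOpt ? AlignDownOffset : EltOffset;
  printf("new: offset=%llu align=%llu\n",
         (unsigned long long)UseOffset,
         (unsigned long long)MinAlign(UseOffset, KernArgBaseAlign));
  return 0;
}

Compiled as-is, this prints "old: offset=0 align=16" and "new: offset=1 align=1"; the new offset and byte alignment line up with the "global_load_ubyte ... offset:1" check added in small_array_round_down_offset above.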