AMDGPU: Fix not preserving alignent in call setups

author Matt Arsenault <Matthew.Arsenault@amd.com>

Thu, 13 Sep 2018 12:14:31 +0000 (12:14 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Thu, 13 Sep 2018 12:14:31 +0000 (12:14 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Thu, 13 Sep 2018 12:14:31 +0000 (12:14 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Thu, 13 Sep 2018 12:14:31 +0000 (12:14 +0000)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp

index ccb0072..733cff7 100644 (file)
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2518,12 +2518,17 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
        int32_t Offset = LocMemOffset;
  
        SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
+      unsigned Align = 0;
  
        if (IsTailCall) {
          ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
          unsigned OpSize = Flags.isByVal() ?
            Flags.getByValSize() : VA.getValVT().getStoreSize();
  
+        // FIXME: We can have better than the minimum byval required alignment.
+        Align = Flags.isByVal() ? Flags.getByValAlign() :
+          MinAlign(Subtarget->getStackAlignment(), Offset);
+
          Offset = Offset + FPDiff;
          int FI = MFI.CreateFixedObject(OpSize, Offset, true);
  
@@ -2541,6 +2546,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
        } else {
          DstAddr = PtrOff;
          DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
+        Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
        }
  
        if (Outs[i].Flags.isByVal()) {
@@ -2555,7 +2561,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
  
          MemOpChains.push_back(Cpy);
        } else {
-        SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
+        SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
          MemOpChains.push_back(Store);
        }
      }
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll

index e98dd84..581df1c 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -721,6 +721,56 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
    ret void
  }
  
+; GCN-LABEL: {{^}}stack_passed_arg_alignment_v32i32_f64:
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:8
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4
+; GCN: s_swappc_b64
+define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {
+entry:
+  call void @stack_passed_f64_arg(<32 x i32> %val, double %tmp)
+  ret void
+}
+
+; GCN-LABEL: {{^}}tail_call_byval_align16:
+; GCN: s_mov_b32 s5, s32
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:28 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:24 ; 4-byte Folded Spill
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:32
+; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:36
+; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:20
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:16
+; GCN: s_getpc_b64
+; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:24 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:28 ; 4-byte Folded Reload
+; GCN: s_setpc_b64
+define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
+entry:
+  %alloca = alloca double, align 8, addrspace(5)
+  tail call void @byval_align16_f64_arg(<32 x i32> %val, double addrspace(5)* byval align 16 %alloca)
+  ret void
+}
+
+; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64:
+; GCN: s_mov_b32 s5, s32
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Spill
+; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
+; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:8
+; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:8
+; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
+; GCN: s_getpc_b64
+; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload
+; GCN: s_setpc_b64
+define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {
+entry:
+  tail call void @stack_passed_f64_arg(<32 x i32> %val, double %tmp)
+  ret void
+}
+
+declare void @byval_align16_f64_arg(<32 x i32>, double addrspace(5)* byval align 16) #0
+declare void @stack_passed_f64_arg(<32 x i32>, double) #0
+
  attributes #0 = { nounwind }
  attributes #1 = { nounwind readnone }
  attributes #2 = { nounwind noinline }
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Thu, 13 Sep 2018 12:14:31 +0000 (12:14 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Thu, 13 Sep 2018 12:14:31 +0000 (12:14 +0000)
llvm/lib/Target/AMDGPU/SIISelLowering.cpp		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/call-argument-types.ll		patch \| blob \| history