; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] poison, align 4
; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] poison, align 4
-define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @promote_alloca_size_63(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
entry:
%stack = alloca [5 x i32], align 4, addrspace(5)
- %0 = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
- store i32 4, i32 addrspace(5)* %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
- %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
- store i32 5, i32 addrspace(5)* %arrayidx3, align 4
- %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
- %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
- store i32 %2, i32 addrspace(1)* %out, align 4
- %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
- %3 = load i32, i32 addrspace(5)* %arrayidx12
- %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
- store i32 %3, i32 addrspace(1)* %arrayidx13
+ %0 = load i32, ptr addrspace(1) %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
+ store i32 4, ptr addrspace(5) %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+ %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
+ store i32 5, ptr addrspace(5) %arrayidx3, align 4
+ %2 = load i32, ptr addrspace(5) %stack, align 4
+ store i32 %2, ptr addrspace(1) %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %3 = load i32, ptr addrspace(5) %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+ store i32 %3, ptr addrspace(1) %arrayidx13
ret void
}
; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] poison, align 4
-define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
+define amdgpu_kernel void @promote_alloca_size_256(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #1 {
entry:
%stack = alloca [5 x i32], align 4, addrspace(5)
- %0 = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
- store i32 4, i32 addrspace(5)* %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
- %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
- store i32 5, i32 addrspace(5)* %arrayidx3, align 4
- %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
- %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
- store i32 %2, i32 addrspace(1)* %out, align 4
- %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
- %3 = load i32, i32 addrspace(5)* %arrayidx12
- %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
- store i32 %3, i32 addrspace(1)* %arrayidx13
+ %0 = load i32, ptr addrspace(1) %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
+ store i32 4, ptr addrspace(5) %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+ %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
+ store i32 5, ptr addrspace(5) %arrayidx3, align 4
+ %2 = load i32, ptr addrspace(5) %stack, align 4
+ store i32 %2, ptr addrspace(1) %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %3 = load i32, ptr addrspace(5) %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+ store i32 %3, ptr addrspace(1) %arrayidx13
ret void
}
; CI: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1024 x [5 x i32]] poison, align 4
; GFX10PLUS: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1024 x [5 x i32]] poison, align 4
-define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
+define amdgpu_kernel void @promote_alloca_size_1600(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #2 {
entry:
%stack = alloca [5 x i32], align 4, addrspace(5)
- %0 = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
- store i32 4, i32 addrspace(5)* %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
- %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
- store i32 5, i32 addrspace(5)* %arrayidx3, align 4
- %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
- %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
- store i32 %2, i32 addrspace(1)* %out, align 4
- %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
- %3 = load i32, i32 addrspace(5)* %arrayidx12
- %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
- store i32 %3, i32 addrspace(1)* %arrayidx13
+ %0 = load i32, ptr addrspace(1) %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
+ store i32 4, ptr addrspace(5) %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+ %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
+ store i32 5, ptr addrspace(5) %arrayidx3, align 4
+ %2 = load i32, ptr addrspace(5) %stack, align 4
+ store i32 %2, ptr addrspace(1) %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %3 = load i32, ptr addrspace(5) %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+ store i32 %3, ptr addrspace(1) %arrayidx13
ret void
}
; ALL-LABEL: @occupancy_0(
; CI-NOT: alloca [5 x i32]
; SI: alloca [5 x i32]
-define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
+define amdgpu_kernel void @occupancy_0(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #3 {
entry:
%stack = alloca [5 x i32], align 4, addrspace(5)
- %0 = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
- store i32 4, i32 addrspace(5)* %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
- %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
- store i32 5, i32 addrspace(5)* %arrayidx3, align 4
- %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
- %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
- store i32 %2, i32 addrspace(1)* %out, align 4
- %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
- %3 = load i32, i32 addrspace(5)* %arrayidx12
- %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
- store i32 %3, i32 addrspace(1)* %arrayidx13
+ %0 = load i32, ptr addrspace(1) %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
+ store i32 4, ptr addrspace(5) %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+ %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
+ store i32 5, ptr addrspace(5) %arrayidx3, align 4
+ %2 = load i32, ptr addrspace(5) %stack, align 4
+ store i32 %2, ptr addrspace(1) %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %3 = load i32, ptr addrspace(5) %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+ store i32 %3, ptr addrspace(1) %arrayidx13
ret void
}
; ALL-LABEL: @occupancy_max(
; CI-NOT: alloca [5 x i32]
; SI: alloca [5 x i32]
-define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
+define amdgpu_kernel void @occupancy_max(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #4 {
entry:
%stack = alloca [5 x i32], align 4, addrspace(5)
- %0 = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
- store i32 4, i32 addrspace(5)* %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
- %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
- store i32 5, i32 addrspace(5)* %arrayidx3, align 4
- %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
- %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
- store i32 %2, i32 addrspace(1)* %out, align 4
- %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
- %3 = load i32, i32 addrspace(5)* %arrayidx12
- %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
- store i32 %3, i32 addrspace(1)* %arrayidx13
+ %0 = load i32, ptr addrspace(1) %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
+ store i32 4, ptr addrspace(5) %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+ %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
+ store i32 5, ptr addrspace(5) %arrayidx3, align 4
+ %2 = load i32, ptr addrspace(5) %stack, align 4
+ store i32 %2, ptr addrspace(1) %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %3 = load i32, ptr addrspace(5) %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+ store i32 %3, ptr addrspace(1) %arrayidx13
ret void
}
; CI-LABEL: @occupancy_6(
; SI: alloca
; CI-NOT: alloca
-define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
+define amdgpu_kernel void @occupancy_6(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #5 {
entry:
%stack = alloca [42 x i8], align 4, addrspace(5)
- %tmp = load i8, i8 addrspace(1)* %in, align 1
+ %tmp = load i8, ptr addrspace(1) %in, align 1
%tmp4 = sext i8 %tmp to i64
- %arrayidx1 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4
- store i8 4, i8 addrspace(5)* %arrayidx1, align 1
- %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
- %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
+ %arrayidx1 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
+ store i8 4, ptr addrspace(5) %arrayidx1, align 1
+ %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
+ %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
%tmp5 = sext i8 %tmp1 to i64
- %arrayidx3 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5
- store i8 5, i8 addrspace(5)* %arrayidx3, align 1
- %arrayidx10 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 0
- %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1
- store i8 %tmp2, i8 addrspace(1)* %out, align 1
- %arrayidx12 = getelementptr inbounds [42 x i8], [42 x i8] addrspace(5)* %stack, i64 0, i64 1
- %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1
- %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
- store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
+ %arrayidx3 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
+ store i8 5, ptr addrspace(5) %arrayidx3, align 1
+ %tmp2 = load i8, ptr addrspace(5) %stack, align 1
+ store i8 %tmp2, ptr addrspace(1) %out, align 1
+ %arrayidx12 = getelementptr inbounds [42 x i8], ptr addrspace(5) %stack, i64 0, i64 1
+ %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
+ %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
+ store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
ret void
}
; SICI: alloca [43 x i8]
; GFX10PLUS-NOT: alloca
-define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
+define amdgpu_kernel void @occupancy_6_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #5 {
entry:
%stack = alloca [43 x i8], align 4, addrspace(5)
- %tmp = load i8, i8 addrspace(1)* %in, align 1
+ %tmp = load i8, ptr addrspace(1) %in, align 1
%tmp4 = sext i8 %tmp to i64
- %arrayidx1 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4
- store i8 4, i8 addrspace(5)* %arrayidx1, align 1
- %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
- %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
+ %arrayidx1 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
+ store i8 4, ptr addrspace(5) %arrayidx1, align 1
+ %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
+ %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
%tmp5 = sext i8 %tmp1 to i64
- %arrayidx3 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5
- store i8 5, i8 addrspace(5)* %arrayidx3, align 1
- %arrayidx10 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 0
- %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1
- store i8 %tmp2, i8 addrspace(1)* %out, align 1
- %arrayidx12 = getelementptr inbounds [43 x i8], [43 x i8] addrspace(5)* %stack, i64 0, i64 1
- %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1
- %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
- store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
+ %arrayidx3 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
+ store i8 5, ptr addrspace(5) %arrayidx3, align 1
+ %tmp2 = load i8, ptr addrspace(5) %stack, align 1
+ store i8 %tmp2, ptr addrspace(1) %out, align 1
+ %arrayidx12 = getelementptr inbounds [43 x i8], ptr addrspace(5) %stack, i64 0, i64 1
+ %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
+ %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
+ store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
ret void
}
; CI-LABEL: @occupancy_8(
; SI: alloca
; CI-NOT: alloca
-define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
+define amdgpu_kernel void @occupancy_8(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #6 {
entry:
%stack = alloca [32 x i8], align 4, addrspace(5)
- %tmp = load i8, i8 addrspace(1)* %in, align 1
+ %tmp = load i8, ptr addrspace(1) %in, align 1
%tmp4 = sext i8 %tmp to i64
- %arrayidx1 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4
- store i8 4, i8 addrspace(5)* %arrayidx1, align 1
- %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
- %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
+ %arrayidx1 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
+ store i8 4, ptr addrspace(5) %arrayidx1, align 1
+ %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
+ %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
%tmp5 = sext i8 %tmp1 to i64
- %arrayidx3 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5
- store i8 5, i8 addrspace(5)* %arrayidx3, align 1
- %arrayidx10 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 0
- %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1
- store i8 %tmp2, i8 addrspace(1)* %out, align 1
- %arrayidx12 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %stack, i64 0, i64 1
- %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1
- %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
- store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
+ %arrayidx3 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
+ store i8 5, ptr addrspace(5) %arrayidx3, align 1
+ %tmp2 = load i8, ptr addrspace(5) %stack, align 1
+ store i8 %tmp2, ptr addrspace(1) %out, align 1
+ %arrayidx12 = getelementptr inbounds [32 x i8], ptr addrspace(5) %stack, i64 0, i64 1
+ %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
+ %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
+ store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
ret void
}
; SICI: alloca [33 x i8]
; GFX10PLUS-NOT: alloca
-define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
+define amdgpu_kernel void @occupancy_8_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #6 {
entry:
%stack = alloca [33 x i8], align 4, addrspace(5)
- %tmp = load i8, i8 addrspace(1)* %in, align 1
+ %tmp = load i8, ptr addrspace(1) %in, align 1
%tmp4 = sext i8 %tmp to i64
- %arrayidx1 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4
- store i8 4, i8 addrspace(5)* %arrayidx1, align 1
- %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
- %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
+ %arrayidx1 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
+ store i8 4, ptr addrspace(5) %arrayidx1, align 1
+ %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
+ %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
%tmp5 = sext i8 %tmp1 to i64
- %arrayidx3 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5
- store i8 5, i8 addrspace(5)* %arrayidx3, align 1
- %arrayidx10 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 0
- %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1
- store i8 %tmp2, i8 addrspace(1)* %out, align 1
- %arrayidx12 = getelementptr inbounds [33 x i8], [33 x i8] addrspace(5)* %stack, i64 0, i64 1
- %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1
- %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
- store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
+ %arrayidx3 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
+ store i8 5, ptr addrspace(5) %arrayidx3, align 1
+ %tmp2 = load i8, ptr addrspace(5) %stack, align 1
+ store i8 %tmp2, ptr addrspace(1) %out, align 1
+ %arrayidx12 = getelementptr inbounds [33 x i8], ptr addrspace(5) %stack, i64 0, i64 1
+ %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
+ %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
+ store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
ret void
}
; CI-LABEL: @occupancy_9(
; SI: alloca
; CI-NOT: alloca
-define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
+define amdgpu_kernel void @occupancy_9(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #7 {
entry:
%stack = alloca [28 x i8], align 4, addrspace(5)
- %tmp = load i8, i8 addrspace(1)* %in, align 1
+ %tmp = load i8, ptr addrspace(1) %in, align 1
%tmp4 = sext i8 %tmp to i64
- %arrayidx1 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4
- store i8 4, i8 addrspace(5)* %arrayidx1, align 1
- %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
- %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
+ %arrayidx1 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
+ store i8 4, ptr addrspace(5) %arrayidx1, align 1
+ %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
+ %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
%tmp5 = sext i8 %tmp1 to i64
- %arrayidx3 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5
- store i8 5, i8 addrspace(5)* %arrayidx3, align 1
- %arrayidx10 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 0
- %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1
- store i8 %tmp2, i8 addrspace(1)* %out, align 1
- %arrayidx12 = getelementptr inbounds [28 x i8], [28 x i8] addrspace(5)* %stack, i64 0, i64 1
- %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1
- %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
- store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
+ %arrayidx3 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
+ store i8 5, ptr addrspace(5) %arrayidx3, align 1
+ %tmp2 = load i8, ptr addrspace(5) %stack, align 1
+ store i8 %tmp2, ptr addrspace(1) %out, align 1
+ %arrayidx12 = getelementptr inbounds [28 x i8], ptr addrspace(5) %stack, i64 0, i64 1
+ %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
+ %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
+ store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
ret void
}
; SICI: alloca [29 x i8]
; GFX10PLUS-NOT: alloca
-define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
+define amdgpu_kernel void @occupancy_9_over(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #7 {
entry:
%stack = alloca [29 x i8], align 4, addrspace(5)
- %tmp = load i8, i8 addrspace(1)* %in, align 1
+ %tmp = load i8, ptr addrspace(1) %in, align 1
%tmp4 = sext i8 %tmp to i64
- %arrayidx1 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 %tmp4
- store i8 4, i8 addrspace(5)* %arrayidx1, align 1
- %arrayidx2 = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 1
- %tmp1 = load i8, i8 addrspace(1)* %arrayidx2, align 1
+ %arrayidx1 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp4
+ store i8 4, ptr addrspace(5) %arrayidx1, align 1
+ %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %in, i64 1
+ %tmp1 = load i8, ptr addrspace(1) %arrayidx2, align 1
%tmp5 = sext i8 %tmp1 to i64
- %arrayidx3 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 %tmp5
- store i8 5, i8 addrspace(5)* %arrayidx3, align 1
- %arrayidx10 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 0
- %tmp2 = load i8, i8 addrspace(5)* %arrayidx10, align 1
- store i8 %tmp2, i8 addrspace(1)* %out, align 1
- %arrayidx12 = getelementptr inbounds [29 x i8], [29 x i8] addrspace(5)* %stack, i64 0, i64 1
- %tmp3 = load i8, i8 addrspace(5)* %arrayidx12, align 1
- %arrayidx13 = getelementptr inbounds i8, i8 addrspace(1)* %out, i64 1
- store i8 %tmp3, i8 addrspace(1)* %arrayidx13, align 1
+ %arrayidx3 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 %tmp5
+ store i8 5, ptr addrspace(5) %arrayidx3, align 1
+ %tmp2 = load i8, ptr addrspace(5) %stack, align 1
+ store i8 %tmp2, ptr addrspace(1) %out, align 1
+ %arrayidx12 = getelementptr inbounds [29 x i8], ptr addrspace(5) %stack, i64 0, i64 1
+ %tmp3 = load i8, ptr addrspace(5) %arrayidx12, align 1
+ %arrayidx13 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 1
+ store i8 %tmp3, ptr addrspace(1) %arrayidx13, align 1
ret void
}
; CHECK-LABEL: @promote_1d_aggr(
; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
; CHECK-NEXT: [[F1:%.*]] = alloca [1 x float], align 4, addrspace(5)
-; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], [[BLOCK]] addrspace(1)* @block, i32 0, i32 1
-; CHECK-NEXT: [[FOO1:%.*]] = load i32, i32 addrspace(1)* [[FOO]], align 4
-; CHECK-NEXT: store i32 [[FOO1]], i32 addrspace(5)* [[I]], align 4
-; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [[BLOCK]], [[BLOCK]] addrspace(1)* @block, i32 0, i32 0
-; CHECK-NEXT: [[FOO3:%.*]] = load [1 x float], [1 x float] addrspace(1)* [[FOO2]], align 4
-; CHECK-NEXT: store [1 x float] [[FOO3]], [1 x float] addrspace(5)* [[F1]], align 4
-; CHECK-NEXT: [[FOO4:%.*]] = load i32, i32 addrspace(5)* [[I]], align 4
-; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [1 x float], [1 x float] addrspace(5)* [[F1]], i32 0, i32 [[FOO4]]
-; CHECK-NEXT: [[FOO6:%.*]] = load float, float addrspace(5)* [[FOO5]], align 4
+; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK:%.*]], ptr addrspace(1) @block, i32 0, i32 1
+; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
+; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
+; CHECK-NEXT: [[FOO3:%.*]] = load [1 x float], ptr addrspace(1) @block, align 4
+; CHECK-NEXT: store [1 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
+; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [1 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
+; CHECK-NEXT: [[FOO6:%.*]] = load float, ptr addrspace(5) [[FOO5]], align 4
; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5)
-; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, <4 x float> addrspace(5)* [[FOO7]], align 16
+; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16
; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[FOO6]], i32 0
; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[FOO6]], i32 1
; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[FOO6]], i32 2
; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[FOO6]], i32 3
-; CHECK-NEXT: [[FOO13:%.*]] = getelementptr [[GL_PERVERTEX:%.*]], [[GL_PERVERTEX]] addrspace(1)* @pv, i32 0, i32 0
-; CHECK-NEXT: store <4 x float> [[FOO12]], <4 x float> addrspace(1)* [[FOO13]], align 16
+; CHECK-NEXT: store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16
; CHECK-NEXT: ret void
;
%i = alloca i32, addrspace(5)
%f1 = alloca [1 x float], addrspace(5)
- %foo = getelementptr %Block, %Block addrspace(1)* @block, i32 0, i32 1
- %foo1 = load i32, i32 addrspace(1)* %foo
- store i32 %foo1, i32 addrspace(5)* %i
- %foo2 = getelementptr %Block, %Block addrspace(1)* @block, i32 0, i32 0
- %foo3 = load [1 x float], [1 x float] addrspace(1)* %foo2
- store [1 x float] %foo3, [1 x float] addrspace(5)* %f1
- %foo4 = load i32, i32 addrspace(5)* %i
- %foo5 = getelementptr [1 x float], [1 x float] addrspace(5)* %f1, i32 0, i32 %foo4
- %foo6 = load float, float addrspace(5)* %foo5
+ %foo = getelementptr %Block, ptr addrspace(1) @block, i32 0, i32 1
+ %foo1 = load i32, ptr addrspace(1) %foo
+ store i32 %foo1, ptr addrspace(5) %i
+ %foo3 = load [1 x float], ptr addrspace(1) @block
+ store [1 x float] %foo3, ptr addrspace(5) %f1
+ %foo4 = load i32, ptr addrspace(5) %i
+ %foo5 = getelementptr [1 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
+ %foo6 = load float, ptr addrspace(5) %foo5
%foo7 = alloca <4 x float>, addrspace(5)
- %foo8 = load <4 x float>, <4 x float> addrspace(5)* %foo7
+ %foo8 = load <4 x float>, ptr addrspace(5) %foo7
%foo9 = insertelement <4 x float> %foo8, float %foo6, i32 0
%foo10 = insertelement <4 x float> %foo9, float %foo6, i32 1
%foo11 = insertelement <4 x float> %foo10, float %foo6, i32 2
%foo12 = insertelement <4 x float> %foo11, float %foo6, i32 3
- %foo13 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(1)* @pv, i32 0, i32 0
- store <4 x float> %foo12, <4 x float> addrspace(1)* %foo13
+ store <4 x float> %foo12, ptr addrspace(1) @pv
ret void
}
; CHECK-LABEL: @promote_store_aggr(
; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
; CHECK-NEXT: [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5)
-; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK2:%.*]], [[BLOCK2]] addrspace(1)* @block2, i32 0, i32 0
-; CHECK-NEXT: [[FOO1:%.*]] = load i32, i32 addrspace(1)* [[FOO]], align 4
-; CHECK-NEXT: store i32 [[FOO1]], i32 addrspace(5)* [[I]], align 4
-; CHECK-NEXT: [[FOO2:%.*]] = load i32, i32 addrspace(5)* [[I]], align 4
+; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) @block2, align 4
+; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
+; CHECK-NEXT: [[FOO2:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
; CHECK-NEXT: [[FOO3:%.*]] = sitofp i32 [[FOO2]] to float
-; CHECK-NEXT: [[FOO4:%.*]] = getelementptr [2 x float], [2 x float] addrspace(5)* [[F1]], i32 0, i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x float] addrspace(5)* [[F1]] to <2 x float> addrspace(5)*
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float> addrspace(5)* [[TMP1]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[FOO3]], i32 0
-; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float> addrspace(5)* [[TMP1]], align 8
-; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], [2 x float] addrspace(5)* [[F1]], i32 0, i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x float] addrspace(5)* [[F1]] to <2 x float> addrspace(5)*
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float> addrspace(5)* [[TMP4]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float 2.000000e+00, i64 1
-; CHECK-NEXT: store <2 x float> [[TMP6]], <2 x float> addrspace(5)* [[TMP4]], align 8
-; CHECK-NEXT: [[FOO6:%.*]] = load [2 x float], [2 x float] addrspace(5)* [[F1]], align 4
-; CHECK-NEXT: [[FOO7:%.*]] = getelementptr [[BLOCK2]], [[BLOCK2]] addrspace(1)* @block2, i32 0, i32 1
-; CHECK-NEXT: store [2 x float] [[FOO6]], [2 x float] addrspace(1)* [[FOO7]], align 4
-; CHECK-NEXT: [[FOO8:%.*]] = getelementptr [[GL_PERVERTEX:%.*]], [[GL_PERVERTEX]] addrspace(1)* @pv, i32 0, i32 0
-; CHECK-NEXT: store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float> addrspace(1)* [[FOO8]], align 16
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[FOO3]], i32 0
+; CHECK-NEXT: store <2 x float> [[TMP2]], ptr addrspace(5) [[F1]], align 8
+; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float 2.000000e+00, i64 1
+; CHECK-NEXT: store <2 x float> [[TMP4]], ptr addrspace(5) [[F1]], align 8
+; CHECK-NEXT: [[FOO6:%.*]] = load [2 x float], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[FOO7:%.*]] = getelementptr [[BLOCK2:%.*]], ptr addrspace(1) @block2, i32 0, i32 1
+; CHECK-NEXT: store [2 x float] [[FOO6]], ptr addrspace(1) [[FOO7]], align 4
+; CHECK-NEXT: store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, ptr addrspace(1) @pv, align 16
; CHECK-NEXT: ret void
;
%i = alloca i32, addrspace(5)
%f1 = alloca [2 x float], addrspace(5)
- %foo = getelementptr %Block2, %Block2 addrspace(1)* @block2, i32 0, i32 0
- %foo1 = load i32, i32 addrspace(1)* %foo
- store i32 %foo1, i32 addrspace(5)* %i
- %foo2 = load i32, i32 addrspace(5)* %i
+ %foo1 = load i32, ptr addrspace(1) @block2
+ store i32 %foo1, ptr addrspace(5) %i
+ %foo2 = load i32, ptr addrspace(5) %i
%foo3 = sitofp i32 %foo2 to float
- %foo4 = getelementptr [2 x float], [2 x float] addrspace(5)* %f1, i32 0, i32 0
- store float %foo3, float addrspace(5)* %foo4
- %foo5 = getelementptr [2 x float], [2 x float] addrspace(5)* %f1, i32 0, i32 1
- store float 2.000000e+00, float addrspace(5)* %foo5
- %foo6 = load [2 x float], [2 x float] addrspace(5)* %f1
- %foo7 = getelementptr %Block2, %Block2 addrspace(1)* @block2, i32 0, i32 1
- store [2 x float] %foo6, [2 x float] addrspace(1)* %foo7
- %foo8 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(1)* @pv, i32 0, i32 0
- store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float> addrspace(1)* %foo8
+ store float %foo3, ptr addrspace(5) %f1
+ %foo5 = getelementptr [2 x float], ptr addrspace(5) %f1, i32 0, i32 1
+ store float 2.000000e+00, ptr addrspace(5) %foo5
+ %foo6 = load [2 x float], ptr addrspace(5) %f1
+ %foo7 = getelementptr %Block2, ptr addrspace(1) @block2, i32 0, i32 1
+ store [2 x float] %foo6, ptr addrspace(1) %foo7
+ store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, ptr addrspace(1) @pv
ret void
}
; NOTE(review): opaque-pointer migration hunk. '-' lines are the old typed-pointer
; IR/CHECKs, '+' lines the ptr-based replacements; the function's `define` line is
; not visible in this hunk. CHECK lines show the pass rewriting the dynamic
; float load from the [2 x float] alloca into a <2 x float> load + extractelement.
; Test code itself left byte-identical (autogenerated golden CHECKs).
; CHECK-LABEL: @promote_load_from_store_aggr(
; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4, addrspace(5)
; CHECK-NEXT: [[F1:%.*]] = alloca [2 x float], align 4, addrspace(5)
-; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], [[BLOCK3]] addrspace(1)* @block3, i32 0, i32 1
-; CHECK-NEXT: [[FOO1:%.*]] = load i32, i32 addrspace(1)* [[FOO]], align 4
-; CHECK-NEXT: store i32 [[FOO1]], i32 addrspace(5)* [[I]], align 4
-; CHECK-NEXT: [[FOO2:%.*]] = getelementptr [[BLOCK3]], [[BLOCK3]] addrspace(1)* @block3, i32 0, i32 0
-; CHECK-NEXT: [[FOO3:%.*]] = load [2 x float], [2 x float] addrspace(1)* [[FOO2]], align 4
-; CHECK-NEXT: store [2 x float] [[FOO3]], [2 x float] addrspace(5)* [[F1]], align 4
-; CHECK-NEXT: [[FOO4:%.*]] = load i32, i32 addrspace(5)* [[I]], align 4
-; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], [2 x float] addrspace(5)* [[F1]], i32 0, i32 [[FOO4]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x float] addrspace(5)* [[F1]] to <2 x float> addrspace(5)*
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float> addrspace(5)* [[TMP1]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 [[FOO4]]
+; CHECK-NEXT: [[FOO:%.*]] = getelementptr [[BLOCK3:%.*]], ptr addrspace(1) @block3, i32 0, i32 1
+; CHECK-NEXT: [[FOO1:%.*]] = load i32, ptr addrspace(1) [[FOO]], align 4
+; CHECK-NEXT: store i32 [[FOO1]], ptr addrspace(5) [[I]], align 4
+; CHECK-NEXT: [[FOO3:%.*]] = load [2 x float], ptr addrspace(1) @block3, align 4
+; CHECK-NEXT: store [2 x float] [[FOO3]], ptr addrspace(5) [[F1]], align 4
+; CHECK-NEXT: [[FOO4:%.*]] = load i32, ptr addrspace(5) [[I]], align 4
+; CHECK-NEXT: [[FOO5:%.*]] = getelementptr [2 x float], ptr addrspace(5) [[F1]], i32 0, i32 [[FOO4]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr addrspace(5) [[F1]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 [[FOO4]]
; CHECK-NEXT: [[FOO7:%.*]] = alloca <4 x float>, align 16, addrspace(5)
-; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, <4 x float> addrspace(5)* [[FOO7]], align 16
-; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP3]], i32 0
-; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP3]], i32 1
-; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP3]], i32 2
-; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP3]], i32 3
-; CHECK-NEXT: [[FOO13:%.*]] = getelementptr [[GL_PERVERTEX:%.*]], [[GL_PERVERTEX]] addrspace(1)* @pv, i32 0, i32 0
-; CHECK-NEXT: store <4 x float> [[FOO12]], <4 x float> addrspace(1)* [[FOO13]], align 16
+; CHECK-NEXT: [[FOO8:%.*]] = load <4 x float>, ptr addrspace(5) [[FOO7]], align 16
+; CHECK-NEXT: [[FOO9:%.*]] = insertelement <4 x float> [[FOO8]], float [[TMP2]], i32 0
+; CHECK-NEXT: [[FOO10:%.*]] = insertelement <4 x float> [[FOO9]], float [[TMP2]], i32 1
+; CHECK-NEXT: [[FOO11:%.*]] = insertelement <4 x float> [[FOO10]], float [[TMP2]], i32 2
+; CHECK-NEXT: [[FOO12:%.*]] = insertelement <4 x float> [[FOO11]], float [[TMP2]], i32 3
+; CHECK-NEXT: store <4 x float> [[FOO12]], ptr addrspace(1) @pv, align 16
; CHECK-NEXT: ret void
;
%i = alloca i32, addrspace(5)
%f1 = alloca [2 x float], addrspace(5)
- %foo = getelementptr %Block3, %Block3 addrspace(1)* @block3, i32 0, i32 1
- %foo1 = load i32, i32 addrspace(1)* %foo
- store i32 %foo1, i32 addrspace(5)* %i
- %foo2 = getelementptr %Block3, %Block3 addrspace(1)* @block3, i32 0, i32 0
- %foo3 = load [2 x float], [2 x float] addrspace(1)* %foo2
- store [2 x float] %foo3, [2 x float] addrspace(5)* %f1
- %foo4 = load i32, i32 addrspace(5)* %i
- %foo5 = getelementptr [2 x float], [2 x float] addrspace(5)* %f1, i32 0, i32 %foo4
- %foo6 = load float, float addrspace(5)* %foo5
+ %foo = getelementptr %Block3, ptr addrspace(1) @block3, i32 0, i32 1
+ %foo1 = load i32, ptr addrspace(1) %foo
+ store i32 %foo1, ptr addrspace(5) %i
+ %foo3 = load [2 x float], ptr addrspace(1) @block3
+ store [2 x float] %foo3, ptr addrspace(5) %f1
+ %foo4 = load i32, ptr addrspace(5) %i
+ %foo5 = getelementptr [2 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
+ %foo6 = load float, ptr addrspace(5) %foo5
%foo7 = alloca <4 x float>, addrspace(5)
- %foo8 = load <4 x float>, <4 x float> addrspace(5)* %foo7
+ %foo8 = load <4 x float>, ptr addrspace(5) %foo7
%foo9 = insertelement <4 x float> %foo8, float %foo6, i32 0
%foo10 = insertelement <4 x float> %foo9, float %foo6, i32 1
%foo11 = insertelement <4 x float> %foo10, float %foo6, i32 2
%foo12 = insertelement <4 x float> %foo11, float %foo6, i32 3
- %foo13 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(1)* @pv, i32 0, i32 0
- store <4 x float> %foo12, <4 x float> addrspace(1)* %foo13
+ store <4 x float> %foo12, ptr addrspace(1) @pv
ret void
}
; NOTE(review): opaque-pointer migration hunk for @promote_double_aggr.
; Tests promotion of a private [2 x double] alloca to <2 x double> vector ops:
; the '+' CHECK lines show element accesses rewritten as whole-vector
; load / extractelement / insertelement / store, with the typed bitcasts of the
; '-' lines gone. Golden autogenerated CHECKs — code left byte-identical.
define amdgpu_ps void @promote_double_aggr() #0 {
; CHECK-LABEL: @promote_double_aggr(
; CHECK-NEXT: [[S:%.*]] = alloca [2 x double], align 8, addrspace(5)
-; CHECK-NEXT: [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 0
-; CHECK-NEXT: [[FOO1:%.*]] = load double, double addrspace(1)* [[FOO]], align 8
-; CHECK-NEXT: [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 1
-; CHECK-NEXT: [[FOO3:%.*]] = load double, double addrspace(1)* [[FOO2]], align 8
+; CHECK-NEXT: [[FOO:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0
+; CHECK-NEXT: [[FOO1:%.*]] = load double, ptr addrspace(1) [[FOO]], align 8
+; CHECK-NEXT: [[FOO2:%.*]] = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1
+; CHECK-NEXT: [[FOO3:%.*]] = load double, ptr addrspace(1) [[FOO2]], align 8
; CHECK-NEXT: [[FOO4:%.*]] = insertvalue [2 x double] undef, double [[FOO1]], 0
; CHECK-NEXT: [[FOO5:%.*]] = insertvalue [2 x double] [[FOO4]], double [[FOO3]], 1
-; CHECK-NEXT: store [2 x double] [[FOO5]], [2 x double] addrspace(5)* [[S]], align 8
-; CHECK-NEXT: [[FOO6:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 1
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)*
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP1]], align 16
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i64 1
-; CHECK-NEXT: [[FOO8:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)*
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP4]], align 16
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP5]], i64 1
-; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[TMP3]], [[TMP6]]
-; CHECK-NEXT: [[FOO11:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)*
-; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP7]], align 16
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP8]], double [[FOO10]], i32 0
-; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double> addrspace(5)* [[TMP7]], align 16
-; CHECK-NEXT: [[FOO12:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)*
-; CHECK-NEXT: [[TMP11:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP10]], align 16
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP11]], i32 0
-; CHECK-NEXT: [[FOO14:%.*]] = getelementptr [2 x double], [2 x double] addrspace(5)* [[S]], i32 0, i32 1
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x double] addrspace(5)* [[S]] to <2 x double> addrspace(5)*
-; CHECK-NEXT: [[TMP14:%.*]] = load <2 x double>, <2 x double> addrspace(5)* [[TMP13]], align 16
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[TMP14]], i64 1
-; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[TMP12]], [[TMP15]]
+; CHECK-NEXT: store [2 x double] [[FOO5]], ptr addrspace(5) [[S]], align 8
+; CHECK-NEXT: [[FOO6:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i64 1
+; CHECK-NEXT: [[FOO8:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i64 1
+; CHECK-NEXT: [[FOO10:%.*]] = fadd double [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[FOO10]], i32 0
+; CHECK-NEXT: store <2 x double> [[TMP6]], ptr addrspace(5) [[S]], align 16
+; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
+; CHECK-NEXT: [[FOO14:%.*]] = getelementptr [2 x double], ptr addrspace(5) [[S]], i32 0, i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr addrspace(5) [[S]], align 16
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP9]], i64 1
+; CHECK-NEXT: [[FOO16:%.*]] = fadd double [[TMP8]], [[TMP10]]
; CHECK-NEXT: [[FOO17:%.*]] = fptrunc double [[FOO16]] to float
; CHECK-NEXT: [[FOO18:%.*]] = insertelement <4 x float> undef, float [[FOO17]], i32 0
; CHECK-NEXT: [[FOO19:%.*]] = insertelement <4 x float> [[FOO18]], float [[FOO17]], i32 1
; CHECK-NEXT: [[FOO20:%.*]] = insertelement <4 x float> [[FOO19]], float [[FOO17]], i32 2
; CHECK-NEXT: [[FOO21:%.*]] = insertelement <4 x float> [[FOO20]], float [[FOO17]], i32 3
-; CHECK-NEXT: store <4 x float> [[FOO21]], <4 x float> addrspace(1)* @frag_color, align 16
+; CHECK-NEXT: store <4 x float> [[FOO21]], ptr addrspace(1) @frag_color, align 16
; CHECK-NEXT: ret void
;
%s = alloca [2 x double], addrspace(5)
- %foo = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 0
- %foo1 = load double, double addrspace(1)* %foo
- %foo2 = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 1
- %foo3 = load double, double addrspace(1)* %foo2
+ %foo = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 0
+ %foo1 = load double, ptr addrspace(1) %foo
+ %foo2 = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, ptr addrspace(1) @tmp_g, i32 0, i32 0, i32 1
+ %foo3 = load double, ptr addrspace(1) %foo2
%foo4 = insertvalue [2 x double] undef, double %foo1, 0
%foo5 = insertvalue [2 x double] %foo4, double %foo3, 1
- store [2 x double] %foo5, [2 x double] addrspace(5)* %s
- %foo6 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 1
- %foo7 = load double, double addrspace(5)* %foo6
- %foo8 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 1
- %foo9 = load double, double addrspace(5)* %foo8
+ store [2 x double] %foo5, ptr addrspace(5) %s
+ %foo6 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1
+ %foo7 = load double, ptr addrspace(5) %foo6
+ %foo8 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1
+ %foo9 = load double, ptr addrspace(5) %foo8
%foo10 = fadd double %foo7, %foo9
- %foo11 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 0
- store double %foo10, double addrspace(5)* %foo11
- %foo12 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 0
- %foo13 = load double, double addrspace(5)* %foo12
- %foo14 = getelementptr [2 x double], [2 x double] addrspace(5)* %s, i32 0, i32 1
- %foo15 = load double, double addrspace(5)* %foo14
+ store double %foo10, ptr addrspace(5) %s
+ %foo13 = load double, ptr addrspace(5) %s
+ %foo14 = getelementptr [2 x double], ptr addrspace(5) %s, i32 0, i32 1
+ %foo15 = load double, ptr addrspace(5) %foo14
%foo16 = fadd double %foo13, %foo15
%foo17 = fptrunc double %foo16 to float
%foo18 = insertelement <4 x float> undef, float %foo17, i32 0
%foo19 = insertelement <4 x float> %foo18, float %foo17, i32 1
%foo20 = insertelement <4 x float> %foo19, float %foo17, i32 2
%foo21 = insertelement <4 x float> %foo20, float %foo17, i32 3
- store <4 x float> %foo21, <4 x float> addrspace(1)* @frag_color
+ store <4 x float> %foo21, ptr addrspace(1) @frag_color
ret void
}
; NOTE(review): opaque-pointer migration hunk for @alloca_struct. CHECK lines
; expect the struct-array alloca replaced by an LDS global
; (@alloca_struct.alloca) indexed by a flattened workitem id computed from
; dispatch-ptr workgroup sizes and workitem.id.{x,y,z}. The IR body after
; `entry:` is not visible in this hunk — TODO confirm against the full file.
define amdgpu_kernel void @alloca_struct() #0 {
; CHECK-LABEL: @alloca_struct(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(4)* [[TMP0]] to i32 addrspace(4)*
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(4)* [[TMP1]], i64 1
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(4)* [[TMP2]], align 4, !invariant.load !0
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(4)* [[TMP1]], i64 2
-; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32 addrspace(4)* [[TMP4]], align 4, !range [[RNG1:![0-9]+]], !invariant.load !0
-; CHECK-NEXT: [[TMP6:%.*]] = lshr i32 [[TMP3]], 16
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2:![0-9]+]]
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]]
-; CHECK-NEXT: [[TMP10:%.*]] = mul nuw nsw i32 [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], [[TMP7]]
-; CHECK-NEXT: [[TMP12:%.*]] = mul nuw nsw i32 [[TMP8]], [[TMP5]]
-; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP9]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x [2 x %struct]], [1024 x [2 x %struct]] addrspace(3)* @alloca_struct.alloca, i32 0, i32 [[TMP14]]
+; CHECK-NEXT: [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load !0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1:![0-9]+]], !invariant.load !0
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.workitem.id.x(), !range [[RNG2:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.workitem.id.y(), !range [[RNG2]]
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.workitem.id.z(), !range [[RNG2]]
+; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x [2 x %struct]], ptr addrspace(3) @alloca_struct.alloca, i32 0, i32 [[TMP13]]
; CHECK-NEXT: ret void
;
entry:
; NOTE(review): opaque-pointer migration hunk for @array_alloca. The CHECK at
; the top pins that the non-array-typed `alloca i32, i32 5` is kept as an
; alloca (not promoted). With opaque pointers the `i32 0` GEPs to element 0
; fold away, so the '+' IR loads directly through %stack.
; CHECK-LABEL: @array_alloca(
; CHECK: %stack = alloca i32, i32 5, align 4, addrspace(5)
-define amdgpu_kernel void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @array_alloca(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
entry:
%stack = alloca i32, i32 5, align 4, addrspace(5)
- %ld0 = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld0
- store i32 4, i32 addrspace(5)* %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
- %ld1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld1
- store i32 5, i32 addrspace(5)* %arrayidx3, align 4
- %arrayidx10 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 0
- %ld2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
- store i32 %ld2, i32 addrspace(1)* %out, align 4
- %arrayidx12 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 1
- %ld3 = load i32, i32 addrspace(5)* %arrayidx12
- %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
- store i32 %ld3, i32 addrspace(1)* %arrayidx13
+ %ld0 = load i32, ptr addrspace(1) %in, align 4
+ %arrayidx1 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 %ld0
+ store i32 4, ptr addrspace(5) %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+ %ld1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 %ld1
+ store i32 5, ptr addrspace(5) %arrayidx3, align 4
+ %ld2 = load i32, ptr addrspace(5) %stack, align 4
+ store i32 %ld2, ptr addrspace(1) %out, align 4
+ %arrayidx12 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 1
+ %ld3 = load i32, ptr addrspace(5) %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+ store i32 %ld3, ptr addrspace(1) %arrayidx13
ret void
}
; NOTE(review): opaque-pointer migration hunk for @array_alloca_dynamic.
; The alloca has a runtime element count (%size), and the CHECK pins that it
; stays an alloca — a dynamically sized alloca is not promoted.
; CHECK-LABEL: @array_alloca_dynamic(
; CHECK: %stack = alloca i32, i32 %size, align 4, addrspace(5)
-define amdgpu_kernel void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 {
+define amdgpu_kernel void @array_alloca_dynamic(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in, i32 %size) #0 {
entry:
%stack = alloca i32, i32 %size, align 4, addrspace(5)
- %ld0 = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld0
- store i32 4, i32 addrspace(5)* %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
- %ld1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 %ld1
- store i32 5, i32 addrspace(5)* %arrayidx3, align 4
- %arrayidx10 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 0
- %ld2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
- store i32 %ld2, i32 addrspace(1)* %out, align 4
- %arrayidx12 = getelementptr inbounds i32, i32 addrspace(5)* %stack, i32 1
- %ld3 = load i32, i32 addrspace(5)* %arrayidx12
- %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
- store i32 %ld3, i32 addrspace(1)* %arrayidx13
+ %ld0 = load i32, ptr addrspace(1) %in, align 4
+ %arrayidx1 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 %ld0
+ store i32 4, ptr addrspace(5) %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+ %ld1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 %ld1
+ store i32 5, ptr addrspace(5) %arrayidx3, align 4
+ %ld2 = load i32, ptr addrspace(5) %stack, align 4
+ store i32 %ld2, ptr addrspace(1) %out, align 4
+ %arrayidx12 = getelementptr inbounds i32, ptr addrspace(5) %stack, i32 1
+ %ld3 = load i32, ptr addrspace(5) %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+ store i32 %ld3, ptr addrspace(1) %arrayidx13
ret void
}
; NOTE(review): start of another test file inside this migration hunk (its own
; RUN lines follow). For an amdgpu_vs shader with -disable-promote-alloca-to-vector
; the IR check pins that the [5 x i32] alloca is NOT promoted, and the ASM check
; pins ScratchSize: 24 (stack stays in scratch).
; RUN: opt -data-layout=A5 -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-promote-alloca -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=IR %s
; RUN: llc -march=amdgcn -mcpu=fiji -disable-promote-alloca-to-vector < %s | FileCheck -check-prefix=ASM %s
-; IR-LABEL: define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
+; IR-LABEL: define amdgpu_vs void @promote_alloca_shaders(ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %in) #0 {
; IR: alloca [5 x i32]
; ASM-LABEL: {{^}}promote_alloca_shaders:
; ASM: ; ScratchSize: 24
-define amdgpu_vs void @promote_alloca_shaders(i32 addrspace(1)* inreg %out, i32 addrspace(1)* inreg %in) #0 {
+define amdgpu_vs void @promote_alloca_shaders(ptr addrspace(1) inreg %out, ptr addrspace(1) inreg %in) #0 {
entry:
%stack = alloca [5 x i32], align 4, addrspace(5)
- %tmp0 = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0
- store i32 4, i32 addrspace(5)* %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
- %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1
- store i32 5, i32 addrspace(5)* %arrayidx3, align 4
- %arrayidx4 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
- %tmp2 = load i32, i32 addrspace(5)* %arrayidx4, align 4
- store i32 %tmp2, i32 addrspace(1)* %out, align 4
- %arrayidx5 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
- %tmp3 = load i32, i32 addrspace(5)* %arrayidx5
- %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
- store i32 %tmp3, i32 addrspace(1)* %arrayidx6
+ %tmp0 = load i32, ptr addrspace(1) %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp0
+ store i32 4, ptr addrspace(5) %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+ %tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1
+ store i32 5, ptr addrspace(5) %arrayidx3, align 4
+ %tmp2 = load i32, ptr addrspace(5) %stack, align 4
+ store i32 %tmp2, ptr addrspace(1) %out, align 4
+ %arrayidx5 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %tmp3 = load i32, ptr addrspace(5) %arrayidx5
+ %arrayidx6 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+ store i32 %tmp3, ptr addrspace(1) %arrayidx6
ret void
}
; NOTE(review): opaque-pointer migration hunk. Non-kernel (C calling convention)
; function: ASM-NOT pins that no LDS is used (no LDSByteSize), and ScratchSize: 12
; pins the alloca staying in scratch. With opaque pointers the element-0 GEP
; (%tmp1 in the '-' IR) is dropped and %tmp is stored through directly.
; ASM-LABEL: {{^}}promote_to_vector_call_c:
; ASM-NOT: LDSByteSize
; ASM: ; ScratchSize: 12
-define void @promote_to_vector_call_c(i32 addrspace(1)* %out, i32 %in) #0 {
+define void @promote_to_vector_call_c(ptr addrspace(1) %out, i32 %in) #0 {
entry:
%tmp = alloca [2 x i32], addrspace(5)
- %tmp1 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0
- %tmp2 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1
- store i32 0, i32 addrspace(5)* %tmp1
- store i32 1, i32 addrspace(5)* %tmp2
- %tmp3 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 %in
- %tmp4 = load i32, i32 addrspace(5)* %tmp3
- %tmp5 = load volatile i32, i32 addrspace(1)* undef
+ %tmp2 = getelementptr [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+ store i32 0, ptr addrspace(5) %tmp
+ store i32 1, ptr addrspace(5) %tmp2
+ %tmp3 = getelementptr [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 %in
+ %tmp4 = load i32, ptr addrspace(5) %tmp3
+ %tmp5 = load volatile i32, ptr addrspace(1) undef
%tmp6 = add i32 %tmp4, %tmp5
- store i32 %tmp6, i32 addrspace(1)* %out
+ store i32 %tmp6, ptr addrspace(1) %out
ret void
}
; NOTE(review): opaque-pointer migration hunk. Plain (non-kernel) function:
; ASM-NOT pins that the [5 x i32] alloca is NOT promoted to LDS (no LDSByteSize),
; and ScratchSize: 24 pins it remaining in scratch memory.
; ASM-LABEL: {{^}}no_promote_to_lds_c:
; ASM-NOT: LDSByteSize
; ASM: ; ScratchSize: 24
-define void @no_promote_to_lds_c(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define void @no_promote_to_lds_c(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) #0 {
entry:
%stack = alloca [5 x i32], align 4, addrspace(5)
- %0 = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
- store i32 4, i32 addrspace(5)* %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
- %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
- store i32 5, i32 addrspace(5)* %arrayidx3, align 4
- %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
- %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
- store i32 %2, i32 addrspace(1)* %out, align 4
- %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
- %3 = load i32, i32 addrspace(5)* %arrayidx12
- %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
- store i32 %3, i32 addrspace(1)* %arrayidx13
+ %0 = load i32, ptr addrspace(1) %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %0
+ store i32 4, ptr addrspace(5) %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+ %1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %1
+ store i32 5, ptr addrspace(5) %arrayidx3, align 4
+ %2 = load i32, ptr addrspace(5) %stack, align 4
+ store i32 %2, ptr addrspace(1) %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %3 = load i32, ptr addrspace(5) %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+ store i32 %3, ptr addrspace(1) %arrayidx13
ret void
}
; NOTE(review): opaque-pointer migration hunk. The alloca's address escapes via
; the call to @foo, so it cannot be promoted; ASM checks pin buffer_store_dword
; (scratch) plus s_swappc_b64 (real call) and ScratchSize: 16400.
-declare i32 @foo(i32 addrspace(5)*) #0
+declare i32 @foo(ptr addrspace(5)) #0
; ASM-LABEL: {{^}}call_private:
; ASM: buffer_store_dword
; ASM: buffer_store_dword
; ASM: s_swappc_b64
; ASM: ScratchSize: 16400
-define amdgpu_kernel void @call_private(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @call_private(ptr addrspace(1) %out, i32 %in) #0 {
entry:
%tmp = alloca [2 x i32], addrspace(5)
- %tmp1 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0
- %tmp2 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1
- store i32 0, i32 addrspace(5)* %tmp1
- store i32 1, i32 addrspace(5)* %tmp2
- %tmp3 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 %in
- %val = call i32 @foo(i32 addrspace(5)* %tmp3)
- store i32 %val, i32 addrspace(1)* %out
+ %tmp2 = getelementptr [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+ store i32 0, ptr addrspace(5) %tmp
+ store i32 1, ptr addrspace(5) %tmp2
+ %tmp3 = getelementptr [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 %in
+ %val = call i32 @foo(ptr addrspace(5) %tmp3)
+ store i32 %val, ptr addrspace(1) %out
ret void
}
; NOTE(review): start of another test file in this hunk (own RUN line + A5
; datalayout). The llvm.invariant.* intrinsic declarations lose the p5i8 type
; mangling in favor of the opaque 'p5' form, and the bitcast feeding them is
; dropped. GCN checks pin ds_write_b32, i.e. the alloca IS promoted to LDS
; despite the invariant.start/end/launder users.
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mtriple=amdgcn-- -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
target datalayout = "A5"
-declare {}* @llvm.invariant.start.p5i8(i64, i8 addrspace(5)* nocapture) #0
-declare void @llvm.invariant.end.p5i8({}*, i64, i8 addrspace(5)* nocapture) #0
-declare i8 addrspace(5)* @llvm.launder.invariant.group.p5i8(i8 addrspace(5)*) #1
+declare ptr @llvm.invariant.start.p5(i64, ptr addrspace(5) nocapture) #0
+declare void @llvm.invariant.end.p5(ptr, i64, ptr addrspace(5) nocapture) #0
+declare ptr addrspace(5) @llvm.launder.invariant.group.p5(ptr addrspace(5)) #1
; GCN-LABEL: {{^}}use_invariant_promotable_lds:
; GCN: buffer_load_dword
; GCN: ds_write_b32
-define amdgpu_kernel void @use_invariant_promotable_lds(i32 addrspace(1)* %arg) #2 {
+define amdgpu_kernel void @use_invariant_promotable_lds(ptr addrspace(1) %arg) #2 {
bb:
%tmp = alloca i32, align 4, addrspace(5)
- %tmp1 = bitcast i32 addrspace(5)* %tmp to i8 addrspace(5)*
- %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
- %tmp3 = load i32, i32 addrspace(1)* %tmp2
- store i32 %tmp3, i32 addrspace(5)* %tmp
- %tmp4 = call {}* @llvm.invariant.start.p5i8(i64 4, i8 addrspace(5)* %tmp1) #0
- call void @llvm.invariant.end.p5i8({}* %tmp4, i64 4, i8 addrspace(5)* %tmp1) #0
- %tmp5 = call i8 addrspace(5)* @llvm.launder.invariant.group.p5i8(i8 addrspace(5)* %tmp1) #1
+ %tmp2 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
+ %tmp3 = load i32, ptr addrspace(1) %tmp2
+ store i32 %tmp3, ptr addrspace(5) %tmp
+ %tmp4 = call ptr @llvm.invariant.start.p5(i64 4, ptr addrspace(5) %tmp) #0
+ call void @llvm.invariant.end.p5(ptr %tmp4, i64 4, ptr addrspace(5) %tmp) #0
+ %tmp5 = call ptr addrspace(5) @llvm.launder.invariant.group.p5(ptr addrspace(5) %tmp) #1
ret void
}
; NOTE(review): opaque-pointer migration hunk. lifetime.start/end declarations
; drop the p5i8 mangling for 'p5', and the i8 bitcast of the alloca goes away.
; OPT checks pin that the alloca and the lifetime markers are eliminated and the
; store is rewritten to addrspace(3) (promoted to LDS).
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
-declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #0
-declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture) #0
+declare void @llvm.lifetime.start.p5(i64, ptr addrspace(5) nocapture) #0
+declare void @llvm.lifetime.end.p5(i64, ptr addrspace(5) nocapture) #0
; OPT-LABEL: @use_lifetime_promotable_lds(
; OPT-NOT: alloca i32
; OPT-NOT: llvm.lifetime
-; OPT: store i32 %tmp3, i32 addrspace(3)*
+; OPT: store i32 %tmp3, ptr addrspace(3)
-define amdgpu_kernel void @use_lifetime_promotable_lds(i32 addrspace(1)* %arg) #2 {
+define amdgpu_kernel void @use_lifetime_promotable_lds(ptr addrspace(1) %arg) #2 {
bb:
%tmp = alloca i32, align 4, addrspace(5)
- %tmp1 = bitcast i32 addrspace(5)* %tmp to i8 addrspace(5)*
- call void @llvm.lifetime.start.p5i8(i64 4, i8 addrspace(5)* %tmp1)
- %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
- %tmp3 = load i32, i32 addrspace(1)* %tmp2
- store i32 %tmp3, i32 addrspace(5)* %tmp
- call void @llvm.lifetime.end.p5i8(i64 4, i8 addrspace(5)* %tmp1)
+ call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %tmp)
+ %tmp2 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
+ %tmp3 = load i32, ptr addrspace(1) %tmp2
+ store i32 %tmp3, ptr addrspace(5) %tmp
+ call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %tmp)
ret void
}
; NOTE(review): kernel whose alloca's only user is a lifetime.start marker —
; presumably a regression test for iterator invalidation when the pass erases
; that intrinsic (name suggests so; TODO confirm against the original commit).
define amdgpu_kernel void @iterator_erased_lifetime() {
entry:
%alloca = alloca i8, align 1, addrspace(5)
- call void @llvm.lifetime.start.p5i8(i64 1, i8 addrspace(5)* %alloca)
+ call void @llvm.lifetime.start.p5(i64 1, ptr addrspace(5) %alloca)
ret void
}
; RUN line gains --enable-var-scope so [[GEP]]/[[PTR]] FileCheck variables
; defined below are scoped per CHECK-LABEL. The memcpy/memmove/memset/
; objectsize declarations drop the i8 element mangling (e.g. p5i8.p1i8 ->
; p5.p1) per the opaque-pointer intrinsic naming scheme.
-; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -passes=amdgpu-promote-alloca < %s | FileCheck --enable-var-scope %s
-declare void @llvm.memcpy.p5i8.p1i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0
-declare void @llvm.memcpy.p1i8.p5i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(5)* nocapture, i32, i1) #0
-declare void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)* nocapture, i8 addrspace(5)* nocapture, i64, i1) #0
+declare void @llvm.memcpy.p5.p1.i32(ptr addrspace(5) nocapture, ptr addrspace(1) nocapture, i32, i1) #0
+declare void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) nocapture, ptr addrspace(5) nocapture, i32, i1) #0
+declare void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) nocapture, ptr addrspace(5) nocapture, i64, i1) #0
-declare void @llvm.memmove.p5i8.p1i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(1)* nocapture, i32, i1) #0
-declare void @llvm.memmove.p1i8.p5i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(5)* nocapture, i32, i1) #0
-declare void @llvm.memmove.p5i8.p5i8.i64(i8 addrspace(5)* nocapture, i8 addrspace(5)* nocapture, i64, i1) #0
+declare void @llvm.memmove.p5.p1.i32(ptr addrspace(5) nocapture, ptr addrspace(1) nocapture, i32, i1) #0
+declare void @llvm.memmove.p1.p5.i32(ptr addrspace(1) nocapture, ptr addrspace(5) nocapture, i32, i1) #0
+declare void @llvm.memmove.p5.p5.i64(ptr addrspace(5) nocapture, ptr addrspace(5) nocapture, i64, i1) #0
-declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture, i8, i32, i1) #0
+declare void @llvm.memset.p5.i32(ptr addrspace(5) nocapture, i8, i32, i1) #0
-declare i32 @llvm.objectsize.i32.p5i8(i8 addrspace(5)*, i1, i1, i1) #1
+declare i32 @llvm.objectsize.i32.p5(ptr addrspace(5), i1, i1, i1) #1
; The [17 x i32] private alloca is promoted to a slice of the workgroup LDS
; array [64 x [17 x i32]] @promote_with_memcpy.alloca; [[GEP]] captures that
; per-lane slot and the memcpy calls are retargeted p5<->p1 to p3<->p1.
; The .bc bitcasts of alloca/in/out are gone under opaque pointers.
; CHECK-LABEL: @promote_with_memcpy(
-; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}}
-; CHECK: call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false)
-; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(3)* align 4 %alloca.bc, i32 68, i1 false)
-define amdgpu_kernel void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+; CHECK: [[GEP:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], ptr addrspace(3) @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: call void @llvm.memcpy.p3.p1.i32(ptr addrspace(3) align 4 [[GEP]], ptr addrspace(1) align 4 %in, i32 68, i1 false)
+; CHECK: call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 [[GEP]], i32 68, i1 false)
+define amdgpu_kernel void @promote_with_memcpy(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%alloca = alloca [17 x i32], align 4, addrspace(5)
- %alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
- %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
- %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
- call void @llvm.memcpy.p5i8.p1i8.i32(i8 addrspace(5)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false)
- call void @llvm.memcpy.p1i8.p5i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(5)* align 4 %alloca.bc, i32 68, i1 false)
+ call void @llvm.memcpy.p5.p1.i32(ptr addrspace(5) align 4 %alloca, ptr addrspace(1) align 4 %in, i32 68, i1 false)
+ call void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) align 4 %out, ptr addrspace(5) align 4 %alloca, i32 68, i1 false)
ret void
}
; Same shape as promote_with_memcpy but for memmove: alloca promoted to the
; LDS array slot [[GEP]], intrinsics retargeted to addrspace(3), bitcasts gone.
; CHECK-LABEL: @promote_with_memmove(
-; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}}
-; CHECK: call void @llvm.memmove.p3i8.p1i8.i32(i8 addrspace(3)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false)
-; CHECK: call void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(3)* align 4 %alloca.bc, i32 68, i1 false)
-define amdgpu_kernel void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+; CHECK: [[GEP:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], ptr addrspace(3) @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: call void @llvm.memmove.p3.p1.i32(ptr addrspace(3) align 4 [[GEP]], ptr addrspace(1) align 4 %in, i32 68, i1 false)
+; CHECK: call void @llvm.memmove.p1.p3.i32(ptr addrspace(1) align 4 %out, ptr addrspace(3) align 4 [[GEP]], i32 68, i1 false)
+define amdgpu_kernel void @promote_with_memmove(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%alloca = alloca [17 x i32], align 4, addrspace(5)
- %alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
- %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
- %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
- call void @llvm.memmove.p5i8.p1i8.i32(i8 addrspace(5)* align 4 %alloca.bc, i8 addrspace(1)* align 4 %in.bc, i32 68, i1 false)
- call void @llvm.memmove.p1i8.p5i8.i32(i8 addrspace(1)* align 4 %out.bc, i8 addrspace(5)* align 4 %alloca.bc, i32 68, i1 false)
+ call void @llvm.memmove.p5.p1.i32(ptr addrspace(5) align 4 %alloca, ptr addrspace(1) align 4 %in, i32 68, i1 false)
+ call void @llvm.memmove.p1.p5.i32(ptr addrspace(1) align 4 %out, ptr addrspace(5) align 4 %alloca, i32 68, i1 false)
ret void
}
; memset variant: the 68-byte fill of the promoted alloca must hit the LDS
; slot [[GEP]] via @llvm.memset.p3.i32. %in/%out bitcasts were dead here
; even before the migration (memset only touches the alloca).
; CHECK-LABEL: @promote_with_memset(
-; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}}
-; CHECK: call void @llvm.memset.p3i8.i32(i8 addrspace(3)* align 4 %alloca.bc, i8 7, i32 68, i1 false)
-define amdgpu_kernel void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+; CHECK: [[GEP:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], ptr addrspace(3) @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: call void @llvm.memset.p3.i32(ptr addrspace(3) align 4 [[GEP]], i8 7, i32 68, i1 false)
+define amdgpu_kernel void @promote_with_memset(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
%alloca = alloca [17 x i32], align 4, addrspace(5)
- %alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
- %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
- %out.bc = bitcast i32 addrspace(1)* %out to i8 addrspace(1)*
- call void @llvm.memset.p5i8.i32(i8 addrspace(5)* align 4 %alloca.bc, i8 7, i32 68, i1 false)
+ call void @llvm.memset.p5.i32(ptr addrspace(5) align 4 %alloca, i8 7, i32 68, i1 false)
ret void
}
; @llvm.objectsize must be rewritten to query the promoted LDS pointer
; ([[PTR]], addrspace(3)) rather than the private alloca; the new check binds
; and reuses [[PTR]] instead of the removed %alloca.bc bitcast.
; CHECK-LABEL: @promote_with_objectsize(
-; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}}
-; CHECK: call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %alloca.bc, i1 false, i1 false, i1 false)
-define amdgpu_kernel void @promote_with_objectsize(i32 addrspace(1)* %out) #0 {
+; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], ptr addrspace(3) @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: call i32 @llvm.objectsize.i32.p3(ptr addrspace(3) [[PTR]], i1 false, i1 false, i1 false)
+define amdgpu_kernel void @promote_with_objectsize(ptr addrspace(1) %out) #0 {
%alloca = alloca [17 x i32], align 4, addrspace(5)
- %alloca.bc = bitcast [17 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
- %size = call i32 @llvm.objectsize.i32.p5i8(i8 addrspace(5)* %alloca.bc, i1 false, i1 false, i1 false)
- store i32 %size, i32 addrspace(1)* %out
+ %size = call i32 @llvm.objectsize.i32.p5(ptr addrspace(5) %alloca, i1 false, i1 false, i1 false)
+ store i32 %size, ptr addrspace(1) %out
ret void
}
; Both memcpy operands are derived from the same alloca (fixed index 1 and
; variable index %c). With opaque pointers the %i/%i1 bitcasts vanish, so the
; promoted call is checked to use the two addrspace(3) GEPs directly.
; CHECK-LABEL: @promote_alloca_used_twice_in_memcpy(
-; CHECK: %i = bitcast double addrspace(3)* %arrayidx1 to i8 addrspace(3)*
-; CHECK: %i1 = bitcast double addrspace(3)* %arrayidx2 to i8 addrspace(3)*
-; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* align 8 dereferenceable(16) %i, i8 addrspace(3)* align 8 dereferenceable(16) %i1, i64 16, i1 false)
+; CHECK: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 8 dereferenceable(16) %arrayidx1, ptr addrspace(3) align 8 dereferenceable(16) %arrayidx2, i64 16, i1 false)
define amdgpu_kernel void @promote_alloca_used_twice_in_memcpy(i32 %c) {
entry:
%r = alloca double, align 8, addrspace(5)
- %arrayidx1 = getelementptr inbounds double, double addrspace(5)* %r, i32 1
- %i = bitcast double addrspace(5)* %arrayidx1 to i8 addrspace(5)*
- %arrayidx2 = getelementptr inbounds double, double addrspace(5)* %r, i32 %c
- %i1 = bitcast double addrspace(5)* %arrayidx2 to i8 addrspace(5)*
- call void @llvm.memcpy.p5i8.p5i8.i64(i8 addrspace(5)* align 8 dereferenceable(16) %i, i8 addrspace(5)* align 8 dereferenceable(16) %i1, i64 16, i1 false)
+ %arrayidx1 = getelementptr inbounds double, ptr addrspace(5) %r, i32 1
+ %arrayidx2 = getelementptr inbounds double, ptr addrspace(5) %r, i32 %c
+ call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 dereferenceable(16) %arrayidx1, ptr addrspace(5) align 8 dereferenceable(16) %arrayidx2, i64 16, i1 false)
ret void
}
; memmove twin of the previous test: same alloca used as both source and
; destination; checks collapse to the two promoted addrspace(3) GEP operands.
; CHECK-LABEL: @promote_alloca_used_twice_in_memmove(
-; CHECK: %i = bitcast double addrspace(3)* %arrayidx1 to i8 addrspace(3)*
-; CHECK: %i1 = bitcast double addrspace(3)* %arrayidx2 to i8 addrspace(3)*
-; CHECK: call void @llvm.memmove.p3i8.p3i8.i64(i8 addrspace(3)* align 8 dereferenceable(16) %i, i8 addrspace(3)* align 8 dereferenceable(16) %i1, i64 16, i1 false)
+; CHECK: call void @llvm.memmove.p3.p3.i64(ptr addrspace(3) align 8 dereferenceable(16) %arrayidx1, ptr addrspace(3) align 8 dereferenceable(16) %arrayidx2, i64 16, i1 false)
define amdgpu_kernel void @promote_alloca_used_twice_in_memmove(i32 %c) {
entry:
%r = alloca double, align 8, addrspace(5)
- %arrayidx1 = getelementptr inbounds double, double addrspace(5)* %r, i32 1
- %i = bitcast double addrspace(5)* %arrayidx1 to i8 addrspace(5)*
- %arrayidx2 = getelementptr inbounds double, double addrspace(5)* %r, i32 %c
- %i1 = bitcast double addrspace(5)* %arrayidx2 to i8 addrspace(5)*
- call void @llvm.memmove.p5i8.p5i8.i64(i8 addrspace(5)* align 8 dereferenceable(16) %i, i8 addrspace(5)* align 8 dereferenceable(16) %i1, i64 16, i1 false)
+ %arrayidx1 = getelementptr inbounds double, ptr addrspace(5) %r, i32 1
+ %arrayidx2 = getelementptr inbounds double, ptr addrspace(5) %r, i32 %c
+ call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) align 8 dereferenceable(16) %arrayidx1, ptr addrspace(5) align 8 dereferenceable(16) %arrayidx2, i64 16, i1 false)
ret void
}
; Nested-array alloca; with optimizations the store must land in LDS
; (ds_write), without them LDS stays empty. The all-zero-index %gep0 is
; dropped in the new form: under opaque pointers it equals %alloca itself.
; NOOPTS: workgroup_group_segment_byte_size = 0{{$}}
; NOOPTS-NOT: ds_write
; OPTS: ds_write
-define amdgpu_kernel void @promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @promote_alloca_i32_array_array(ptr addrspace(1) %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x [2 x i32]], addrspace(5)
- %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
- %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
- store i32 0, i32 addrspace(5)* %gep0
- store i32 1, i32 addrspace(5)* %gep1
- %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
- %load = load i32, i32 addrspace(5)* %gep2
- store i32 %load, i32 addrspace(1)* %out
+ %gep1 = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
+ store i32 0, ptr addrspace(5) %alloca
+ store i32 1, ptr addrspace(5) %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 %index
+ %load = load i32, ptr addrspace(5) %gep2
+ store i32 %load, ptr addrspace(1) %out
ret void
}
; Same body as promote_alloca_i32_array_array but with attribute set #1
; (per the label, expected to prevent promotion): no LDS is allocated and
; no ds_write may appear.
; ALL-LABEL: {{^}}optnone_promote_alloca_i32_array_array:
; ALL: workgroup_group_segment_byte_size = 0{{$}}
; ALL-NOT: ds_write
-define amdgpu_kernel void @optnone_promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #1 {
+define amdgpu_kernel void @optnone_promote_alloca_i32_array_array(ptr addrspace(1) %out, i32 %index) #1 {
entry:
%alloca = alloca [2 x [2 x i32]], addrspace(5)
- %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
- %gep1 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
- store i32 0, i32 addrspace(5)* %gep0
- store i32 1, i32 addrspace(5)* %gep1
- %gep2 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
- %load = load i32, i32 addrspace(5)* %gep2
- store i32 %load, i32 addrspace(1)* %out
+ %gep1 = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 1
+ store i32 0, ptr addrspace(5) %alloca
+ store i32 1, ptr addrspace(5) %gep1
+ %gep2 = getelementptr inbounds [2 x [2 x i32]], ptr addrspace(5) %alloca, i32 0, i32 0, i32 %index
+ %load = load i32, ptr addrspace(5) %gep2
+ store i32 %load, ptr addrspace(1) %out
ret void
}
; LDS budget test: the kernel touches @lds1 (i32), @lds2 (i64) then @lds0
; (<4 x i32>) via volatile stores, and the expected group segment size is
; 1060 bytes. The companion test below uses the reverse order and expects
; 1072, so the declaration/use order affects padding — see size_order_1.
; GCN-LABEL: {{^}}promote_alloca_size_order_0:
; GCN: workgroup_group_segment_byte_size = 1060
-define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+define amdgpu_kernel void @promote_alloca_size_order_0(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in, i32 %idx) #0 {
entry:
%stack = alloca [5 x i32], align 4, addrspace(5)
- %tmp0 = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0
- store i32 4, i32 addrspace(5)* %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
- %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1
- store i32 5, i32 addrspace(5)* %arrayidx3, align 4
- %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
- %tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
- store i32 %tmp2, i32 addrspace(1)* %out, align 4
- %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
- %tmp3 = load i32, i32 addrspace(5)* %arrayidx12
- %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
- store i32 %tmp3, i32 addrspace(1)* %arrayidx13
-
- %gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx
- store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4
-
- %gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx
- store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8
-
- %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx
- store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16
+ %tmp0 = load i32, ptr addrspace(1) %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp0
+ store i32 4, ptr addrspace(5) %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+ %tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1
+ store i32 5, ptr addrspace(5) %arrayidx3, align 4
+ %tmp2 = load i32, ptr addrspace(5) %stack, align 4
+ store i32 %tmp2, ptr addrspace(1) %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %tmp3 = load i32, ptr addrspace(5) %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+ store i32 %tmp3, ptr addrspace(1) %arrayidx13
+
+ %gep.lds1 = getelementptr inbounds [73 x i32], ptr addrspace(3) @lds1, i32 0, i32 %idx
+ store volatile i32 0, ptr addrspace(3) %gep.lds1, align 4
+
+ %gep.lds2 = getelementptr inbounds [32 x i64], ptr addrspace(3) @lds2, i32 0, i32 %idx
+ store volatile i64 0, ptr addrspace(3) %gep.lds2, align 8
+
+ %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], ptr addrspace(3) @lds0, i32 0, i32 %idx
+ store volatile <4 x i32> zeroinitializer, ptr addrspace(3) %gep.lds0, align 16
ret void
}
; Reverse LDS use order (@lds0, @lds2, @lds1) from size_order_0; alignment
; padding between the differently-aligned LDS arrays yields a larger group
; segment size (1072 vs 1060).
; GCN-LABEL: {{^}}promote_alloca_size_order_1:
; GCN: workgroup_group_segment_byte_size = 1072
-define amdgpu_kernel void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+define amdgpu_kernel void @promote_alloca_size_order_1(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in, i32 %idx) #0 {
entry:
%stack = alloca [5 x i32], align 4, addrspace(5)
- %tmp0 = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0
- store i32 4, i32 addrspace(5)* %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
- %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1
- store i32 5, i32 addrspace(5)* %arrayidx3, align 4
- %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
- %tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
- store i32 %tmp2, i32 addrspace(1)* %out, align 4
- %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
- %tmp3 = load i32, i32 addrspace(5)* %arrayidx12
- %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
- store i32 %tmp3, i32 addrspace(1)* %arrayidx13
-
- %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], [32 x <4 x i32>] addrspace(3)* @lds0, i32 0, i32 %idx
- store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds0, align 16
-
- %gep.lds2 = getelementptr inbounds [32 x i64], [32 x i64] addrspace(3)* @lds2, i32 0, i32 %idx
- store volatile i64 0, i64 addrspace(3)* %gep.lds2, align 8
-
- %gep.lds1 = getelementptr inbounds [73 x i32], [73 x i32] addrspace(3)* @lds1, i32 0, i32 %idx
- store volatile i32 0, i32 addrspace(3)* %gep.lds1, align 4
+ %tmp0 = load i32, ptr addrspace(1) %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp0
+ store i32 4, ptr addrspace(5) %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+ %tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1
+ store i32 5, ptr addrspace(5) %arrayidx3, align 4
+ %tmp2 = load i32, ptr addrspace(5) %stack, align 4
+ store i32 %tmp2, ptr addrspace(1) %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %tmp3 = load i32, ptr addrspace(5) %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+ store i32 %tmp3, ptr addrspace(1) %arrayidx13
+
+ %gep.lds0 = getelementptr inbounds [32 x <4 x i32>], ptr addrspace(3) @lds0, i32 0, i32 %idx
+ store volatile <4 x i32> zeroinitializer, ptr addrspace(3) %gep.lds0, align 16
+
+ %gep.lds2 = getelementptr inbounds [32 x i64], ptr addrspace(3) @lds2, i32 0, i32 %idx
+ store volatile i64 0, ptr addrspace(3) %gep.lds2, align 8
+
+ %gep.lds1 = getelementptr inbounds [73 x i32], ptr addrspace(3) @lds1, i32 0, i32 %idx
+ store volatile i32 0, ptr addrspace(3) %gep.lds1, align 4
ret void
}
; Uses @lds3 (i32 array) and @lds4 (<4 x i32> array); expected group segment
; size is 1060, per the label presumably checking that padding guesses do not
; push the estimate over the promotion limit — TODO confirm in pass source.
; GCN-LABEL: {{^}}promote_alloca_align_pad_guess_over_limit:
; GCN: workgroup_group_segment_byte_size = 1060
-define amdgpu_kernel void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+define amdgpu_kernel void @promote_alloca_align_pad_guess_over_limit(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in, i32 %idx) #0 {
entry:
%stack = alloca [5 x i32], align 4, addrspace(5)
- %tmp0 = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp0
- store i32 4, i32 addrspace(5)* %arrayidx1, align 4
- %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
- %tmp1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
- %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %tmp1
- store i32 5, i32 addrspace(5)* %arrayidx3, align 4
- %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
- %tmp2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
- store i32 %tmp2, i32 addrspace(1)* %out, align 4
- %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
- %tmp3 = load i32, i32 addrspace(5)* %arrayidx12
- %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
- store i32 %tmp3, i32 addrspace(1)* %arrayidx13
-
- %gep.lds3 = getelementptr inbounds [13 x i32], [13 x i32] addrspace(3)* @lds3, i32 0, i32 %idx
- store volatile i32 0, i32 addrspace(3)* %gep.lds3, align 4
-
- %gep.lds4 = getelementptr inbounds [63 x <4 x i32>], [63 x <4 x i32>] addrspace(3)* @lds4, i32 0, i32 %idx
- store volatile <4 x i32> zeroinitializer, <4 x i32> addrspace(3)* %gep.lds4, align 16
+ %tmp0 = load i32, ptr addrspace(1) %in, align 4
+ %arrayidx1 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp0
+ store i32 4, ptr addrspace(5) %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %in, i32 1
+ %tmp1 = load i32, ptr addrspace(1) %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp1
+ store i32 5, ptr addrspace(5) %arrayidx3, align 4
+ %tmp2 = load i32, ptr addrspace(5) %stack, align 4
+ store i32 %tmp2, ptr addrspace(1) %out, align 4
+ %arrayidx12 = getelementptr inbounds [5 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %tmp3 = load i32, ptr addrspace(5) %arrayidx12
+ %arrayidx13 = getelementptr inbounds i32, ptr addrspace(1) %out, i32 1
+ store i32 %tmp3, ptr addrspace(1) %arrayidx13
+
+ %gep.lds3 = getelementptr inbounds [13 x i32], ptr addrspace(3) @lds3, i32 0, i32 %idx
+ store volatile i32 0, ptr addrspace(3) %gep.lds3, align 4
+
+ %gep.lds4 = getelementptr inbounds [63 x <4 x i32>], ptr addrspace(3) @lds4, i32 0, i32 %idx
+ store volatile <4 x i32> zeroinitializer, ptr addrspace(3) %gep.lds4, align 16
ret void
}
; Pointer-array alloca vectorized to <3 x ptr>: the i64 value is stored via
; the alloca (typed bitcast chain removed) and read back through element 0
; of the loaded vector. OPT-NEXT checks pin the exact promoted sequence:
; load vector, inttoptr, insertelement, store, reload, extractelement,
; ptrtoint — so the function still returns %v round-tripped through slot 0.
define i64 @test_pointer_array(i64 %v) {
; OPT-LABEL: @test_pointer_array(
; OPT-NEXT: entry:
-; OPT-NEXT: [[A:%.*]] = alloca [3 x i8*], align 16, addrspace(5)
-; OPT-NEXT: [[GEP:%.*]] = getelementptr inbounds [3 x i8*], [3 x i8*] addrspace(5)* [[A]], i32 0, i32 0
-; OPT-NEXT: [[CAST:%.*]] = bitcast i8* addrspace(5)* [[GEP]] to i64 addrspace(5)*
-; OPT-NEXT: [[TMP0:%.*]] = bitcast [3 x i8*] addrspace(5)* [[A]] to <3 x i8*> addrspace(5)*
-; OPT-NEXT: [[TMP1:%.*]] = load <3 x i8*>, <3 x i8*> addrspace(5)* [[TMP0]], align 32
-; OPT-NEXT: [[TMP2:%.*]] = inttoptr i64 [[V:%.*]] to i8*
-; OPT-NEXT: [[TMP3:%.*]] = insertelement <3 x i8*> [[TMP1]], i8* [[TMP2]], i32 0
-; OPT-NEXT: store <3 x i8*> [[TMP3]], <3 x i8*> addrspace(5)* [[TMP0]], align 32
-; OPT-NEXT: [[TMP4:%.*]] = bitcast [3 x i8*] addrspace(5)* [[A]] to <3 x i8*> addrspace(5)*
-; OPT-NEXT: [[TMP5:%.*]] = load <3 x i8*>, <3 x i8*> addrspace(5)* [[TMP4]], align 32
-; OPT-NEXT: [[TMP6:%.*]] = extractelement <3 x i8*> [[TMP5]], i32 0
-; OPT-NEXT: [[TMP7:%.*]] = ptrtoint i8* [[TMP6]] to i64
+; OPT-NEXT: [[A:%.*]] = alloca [3 x ptr], align 16, addrspace(5)
+; OPT-NEXT: [[TMP1:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 32
+; OPT-NEXT: [[TMP2:%.*]] = inttoptr i64 [[V:%.*]] to ptr
+; OPT-NEXT: [[TMP3:%.*]] = insertelement <3 x ptr> [[TMP1]], ptr [[TMP2]], i32 0
+; OPT-NEXT: store <3 x ptr> [[TMP3]], ptr addrspace(5) [[A]], align 32
+; OPT-NEXT: [[TMP5:%.*]] = load <3 x ptr>, ptr addrspace(5) [[A]], align 32
+; OPT-NEXT: [[TMP6:%.*]] = extractelement <3 x ptr> [[TMP5]], i32 0
+; OPT-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64
; OPT-NEXT: ret i64 [[TMP7]]
;
entry:
- %a = alloca [3 x i8*], align 16, addrspace(5)
- %gep = getelementptr inbounds [3 x i8*], [3 x i8*] addrspace(5)* %a, i32 0, i32 0
- %cast = bitcast i8* addrspace(5)* %gep to i64 addrspace(5)*
- store i64 %v, i64 addrspace(5)* %cast, align 16
- %ld = load i64, i64 addrspace(5)* %cast, align 16
+ %a = alloca [3 x ptr], align 16, addrspace(5)
+ store i64 %v, ptr addrspace(5) %a, align 16
+ %ld = load i64, ptr addrspace(5) %a, align 16
ret i64 %ld
}
; The alloca's address escapes by being stored to %ptr, so it cannot be
; promoted: the check requires a scratch buffer store (buffer_store_dword).
; GCN-LABEL: {{^}}stored_lds_pointer_value:
; GCN: buffer_store_dword v
-define amdgpu_kernel void @stored_lds_pointer_value(float addrspace(5)* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_lds_pointer_value(ptr addrspace(1) %ptr) #0 {
%tmp = alloca float, addrspace(5)
- store float 0.0, float addrspace(5)*%tmp
- store float addrspace(5)* %tmp, float addrspace(5)* addrspace(1)* %ptr
+ store float 0.0, ptr addrspace(5) %tmp
+ store ptr addrspace(5) %tmp, ptr addrspace(1) %ptr
ret void
}
; Two allocas whose addresses both escape via volatile stores to %ptr;
; neither may be promoted (scratch buffer_store_dword expected).
; GCN-LABEL: {{^}}stored_lds_pointer_value_offset:
; GCN: buffer_store_dword v
-define amdgpu_kernel void @stored_lds_pointer_value_offset(float addrspace(5)* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_lds_pointer_value_offset(ptr addrspace(1) %ptr) #0 {
%tmp0 = alloca float, addrspace(5)
%tmp1 = alloca float, addrspace(5)
- store float 0.0, float addrspace(5)*%tmp0
- store float 0.0, float addrspace(5)*%tmp1
- store volatile float addrspace(5)* %tmp0, float addrspace(5)* addrspace(1)* %ptr
- store volatile float addrspace(5)* %tmp1, float addrspace(5)* addrspace(1)* %ptr
+ store float 0.0, ptr addrspace(5) %tmp0
+ store float 0.0, ptr addrspace(5) %tmp1
+ store volatile ptr addrspace(5) %tmp0, ptr addrspace(1) %ptr
+ store volatile ptr addrspace(5) %tmp1, ptr addrspace(1) %ptr
ret void
}
; A GEP off the alloca (variable index %idx) escapes via the store to %ptr,
; so promotion is blocked: scratch resource setup and buffer stores expected.
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN: buffer_store_dword v
; GCN: buffer_store_dword v
-define amdgpu_kernel void @stored_lds_pointer_value_gep(float addrspace(5)* addrspace(1)* %ptr, i32 %idx) #0 {
+define amdgpu_kernel void @stored_lds_pointer_value_gep(ptr addrspace(1) %ptr, i32 %idx) #0 {
bb:
%tmp = alloca float, i32 16, addrspace(5)
- store float 0.0, float addrspace(5)* %tmp
- %tmp2 = getelementptr inbounds float, float addrspace(5)* %tmp, i32 %idx
- store float addrspace(5)* %tmp2, float addrspace(5)* addrspace(1)* %ptr
+ store float 0.0, ptr addrspace(5) %tmp
+ %tmp2 = getelementptr inbounds float, ptr addrspace(5) %tmp, i32 %idx
+ store ptr addrspace(5) %tmp2, ptr addrspace(1) %ptr
ret void
}
; Escape through a variable-index GEP stored to %out keeps the [4 x i32]
; alloca in scratch (multiple buffer_store_dword). The zero-index %x GEP is
; dropped in the new form — under opaque pointers it equals %tmp0 itself.
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
-define amdgpu_kernel void @stored_vector_pointer_value(i32 addrspace(5)* addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @stored_vector_pointer_value(ptr addrspace(1) %out, i32 %index) {
entry:
%tmp0 = alloca [4 x i32], addrspace(5)
- %x = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 0
- %y = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 1
- %z = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 2
- %w = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 3
- store i32 0, i32 addrspace(5)* %x
- store i32 1, i32 addrspace(5)* %y
- store i32 2, i32 addrspace(5)* %z
- store i32 3, i32 addrspace(5)* %w
- %tmp1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp0, i32 0, i32 %index
- store i32 addrspace(5)* %tmp1, i32 addrspace(5)* addrspace(1)* %out
+ %y = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp0, i32 0, i32 1
+ %z = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp0, i32 0, i32 2
+ %w = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp0, i32 0, i32 3
+ store i32 0, ptr addrspace(5) %tmp0
+ store i32 1, ptr addrspace(5) %y
+ store i32 2, ptr addrspace(5) %z
+ store i32 3, ptr addrspace(5) %w
+ %tmp1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp0, i32 0, i32 %index
+ store ptr addrspace(5) %tmp1, ptr addrspace(1) %out
ret void
}
; Alloca storing its own address into itself (self-escape); must not be
; promoted to LDS (GCN-NOT: ds_). The bitcast between the two stores
; disappears under opaque pointers — %tmp is stored into %tmp directly.
; GCN-LABEL: {{^}}stored_fi_to_self:
; GCN-NOT: ds_
define amdgpu_kernel void @stored_fi_to_self() #0 {
- %tmp = alloca i32 addrspace(5)*, addrspace(5)
- store volatile i32 addrspace(5)* inttoptr (i32 1234 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp
- %bitcast = bitcast i32 addrspace(5)* addrspace(5)* %tmp to i32 addrspace(5)*
- store volatile i32 addrspace(5)* %bitcast, i32 addrspace(5)* addrspace(5)* %tmp
+ %tmp = alloca ptr addrspace(5), addrspace(5)
+ store volatile ptr addrspace(5) inttoptr (i32 1234 to ptr addrspace(5)), ptr addrspace(5) %tmp
+ store volatile ptr addrspace(5) %tmp, ptr addrspace(5) %tmp
ret void
}
; Promotion introduces dispatch.ptr and workitem.id.{x,y,z} intrinsic calls
; (checked with !range !2), so the kernel's amdgpu-no-workitem-id-* attributes
; must be removed by the pass. The zero-index %tmp1 GEP is dropped in the new
; form since it equals %tmp under opaque pointers.
; This kernel starts with the amdgpu-no-workitem-id-* attributes, but
; need to be removed when these intrinsic uses are introduced.
-; CHECK-LABEL: define amdgpu_kernel void @promote_to_lds(i32 addrspace(1)* %out, i32 %in) #0 {
-; CHECK: call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
+; CHECK-LABEL: define amdgpu_kernel void @promote_to_lds(ptr addrspace(1) %out, i32 %in) #0 {
+; CHECK: call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
; CHECK: call i32 @llvm.amdgcn.workitem.id.x(), !range !2
; CHECK: call i32 @llvm.amdgcn.workitem.id.y(), !range !2
; CHECK: call i32 @llvm.amdgcn.workitem.id.z(), !range !2
-define amdgpu_kernel void @promote_to_lds(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @promote_to_lds(ptr addrspace(1) %out, i32 %in) #0 {
entry:
%tmp = alloca [2 x i32], addrspace(5)
- %tmp1 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0
- %tmp2 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1
- store i32 0, i32 addrspace(5)* %tmp1
- store i32 1, i32 addrspace(5)* %tmp2
- %tmp3 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 %in
- %tmp4 = load i32, i32 addrspace(5)* %tmp3
- %tmp5 = load volatile i32, i32 addrspace(1)* undef
+ %tmp2 = getelementptr inbounds [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+ store i32 0, ptr addrspace(5) %tmp
+ store i32 1, ptr addrspace(5) %tmp2
+ %tmp3 = getelementptr inbounds [2 x i32], ptr addrspace(5) %tmp, i32 0, i32 %in
+ %tmp4 = load i32, ptr addrspace(5) %tmp3
+ %tmp5 = load volatile i32, ptr addrspace(1) undef
%tmp6 = add i32 %tmp4, %tmp5
- store i32 %tmp6, i32 addrspace(1)* %out
+ store i32 %tmp6, ptr addrspace(1) %out
ret void
}
; LDS globals plus global-initializer constant expressions that take their
; addresses via ptrtoint; the typed-array cast operands become bare
; ptr addrspace(3) under opaque pointers. @all_lds is defined elsewhere
; in the file (referenced by the following test).
@some_lds = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4
@some_dynamic_lds = external hidden addrspace(3) global [0 x i32], align 4
-@initializer_user_some = addrspace(1) global i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), align 4
-@initializer_user_all = addrspace(1) global i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), align 4
+@initializer_user_some = addrspace(1) global i32 ptrtoint (ptr addrspace(3) @some_lds to i32), align 4
+@initializer_user_all = addrspace(1) global i32 ptrtoint (ptr addrspace(3) @all_lds to i32), align 4
; This function cannot promote to using LDS because of the size of the
; constant expression use in the function, which was previously not detected.
; ASM-LABEL: constant_expression_uses_all_lds:
; ASM: .amdhsa_group_segment_fixed_size 65536
-define amdgpu_kernel void @constant_expression_uses_all_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+define amdgpu_kernel void @constant_expression_uses_all_lds(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
entry:
%stack = alloca [4 x i32], align 4, addrspace(5)
- %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
- %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
- %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
- %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
- store i32 9, i32 addrspace(5)* %gep0
- store i32 10, i32 addrspace(5)* %gep1
- store i32 99, i32 addrspace(5)* %gep2
- store i32 43, i32 addrspace(5)* %gep3
- %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
- %load = load i32, i32 addrspace(5)* %arrayidx, align 4
- store i32 %load, i32 addrspace(1)* %out
-
- store volatile i32 ptrtoint ([16384 x i32] addrspace(3)* @all_lds to i32), i32 addrspace(1)* undef
+ %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
+ %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
+ store i32 9, ptr addrspace(5) %stack
+ store i32 10, ptr addrspace(5) %gep1
+ store i32 99, ptr addrspace(5) %gep2
+ store i32 43, ptr addrspace(5) %gep3
+ %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
+ %load = load i32, ptr addrspace(5) %arrayidx, align 4
+ store i32 %load, ptr addrspace(1) %out
+
+ store volatile i32 ptrtoint (ptr addrspace(3) @all_lds to i32), ptr addrspace(1) undef
ret void
}
; ASM-LABEL: {{^}}constant_expression_uses_some_lds:
; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
-define amdgpu_kernel void @constant_expression_uses_some_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+define amdgpu_kernel void @constant_expression_uses_some_lds(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
entry:
%stack = alloca [4 x i32], align 4, addrspace(5)
- %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
- %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
- %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
- %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
- store i32 9, i32 addrspace(5)* %gep0
- store i32 10, i32 addrspace(5)* %gep1
- store i32 99, i32 addrspace(5)* %gep2
- store i32 43, i32 addrspace(5)* %gep3
- %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
- %load = load i32, i32 addrspace(5)* %arrayidx, align 4
- store i32 %load, i32 addrspace(1)* %out
- store volatile i32 ptrtoint ([32 x i32] addrspace(3)* @some_lds to i32), i32 addrspace(1)* undef
+ %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
+ %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
+ store i32 9, ptr addrspace(5) %stack
+ store i32 10, ptr addrspace(5) %gep1
+ store i32 99, ptr addrspace(5) %gep2
+ store i32 43, ptr addrspace(5) %gep3
+ %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
+ %load = load i32, ptr addrspace(5) %arrayidx, align 4
+ store i32 %load, ptr addrspace(1) %out
+ store volatile i32 ptrtoint (ptr addrspace(3) @some_lds to i32), ptr addrspace(1) undef
ret void
}
; ASM-LABEL: {{^}}constant_expression_uses_some_dynamic_lds:
; ASM: .amdhsa_group_segment_fixed_size 0{{$}}
-define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
entry:
%stack = alloca [4 x i32], align 4, addrspace(5)
- %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
- %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
- %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
- %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
- store i32 9, i32 addrspace(5)* %gep0
- store i32 10, i32 addrspace(5)* %gep1
- store i32 99, i32 addrspace(5)* %gep2
- store i32 43, i32 addrspace(5)* %gep3
- %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
- %load = load i32, i32 addrspace(5)* %arrayidx, align 4
- store i32 %load, i32 addrspace(1)* %out
- %gep_dyn_lds = getelementptr inbounds [0 x i32], [0 x i32]* addrspacecast ([0 x i32] addrspace(3)* @some_dynamic_lds to [0 x i32]*), i64 0, i64 0
- store i32 1234, i32* %gep_dyn_lds, align 4
+ %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
+ %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
+ store i32 9, ptr addrspace(5) %stack
+ store i32 10, ptr addrspace(5) %gep1
+ store i32 99, ptr addrspace(5) %gep2
+ store i32 43, ptr addrspace(5) %gep3
+ %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
+ %load = load i32, ptr addrspace(5) %arrayidx, align 4
+ store i32 %load, ptr addrspace(1) %out
+ store i32 1234, ptr addrspacecast (ptr addrspace(3) @some_dynamic_lds to ptr), align 4
ret void
}
-declare void @callee(i8*)
+declare void @callee(ptr)
; IR-LABEL: @constant_expression_uses_all_lds_multi_level(
; IR: alloca
; ASM-LABEL: {{^}}constant_expression_uses_all_lds_multi_level:
; ASM: .amdhsa_group_segment_fixed_size 65536{{$}}
-define amdgpu_kernel void @constant_expression_uses_all_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+define amdgpu_kernel void @constant_expression_uses_all_lds_multi_level(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
entry:
%stack = alloca [4 x i32], align 4, addrspace(5)
- %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
- %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
- %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
- %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
- store i32 9, i32 addrspace(5)* %gep0
- store i32 10, i32 addrspace(5)* %gep1
- store i32 99, i32 addrspace(5)* %gep2
- store i32 43, i32 addrspace(5)* %gep3
- %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
- %load = load i32, i32 addrspace(5)* %arrayidx, align 4
- store i32 %load, i32 addrspace(1)* %out
- call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([16384 x i32], [16384 x i32] addrspace(3)* @all_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*))
+ %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
+ %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
+ store i32 9, ptr addrspace(5) %stack
+ store i32 10, ptr addrspace(5) %gep1
+ store i32 99, ptr addrspace(5) %gep2
+ store i32 43, ptr addrspace(5) %gep3
+ %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
+ %load = load i32, ptr addrspace(5) %arrayidx, align 4
+ store i32 %load, ptr addrspace(1) %out
+ call void @callee(ptr addrspacecast (ptr addrspace(3) getelementptr inbounds ([16384 x i32], ptr addrspace(3) @all_lds, i32 0, i32 8) to ptr))
ret void
}
; ASM-LABEL: {{^}}constant_expression_uses_some_lds_multi_level:
; ASM: .amdhsa_group_segment_fixed_size 4224{{$}}
-define amdgpu_kernel void @constant_expression_uses_some_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+define amdgpu_kernel void @constant_expression_uses_some_lds_multi_level(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
entry:
%stack = alloca [4 x i32], align 4, addrspace(5)
- %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
- %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
- %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
- %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
- store i32 9, i32 addrspace(5)* %gep0
- store i32 10, i32 addrspace(5)* %gep1
- store i32 99, i32 addrspace(5)* %gep2
- store i32 43, i32 addrspace(5)* %gep3
- %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
- %load = load i32, i32 addrspace(5)* %arrayidx, align 4
- store i32 %load, i32 addrspace(1)* %out
- call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([32 x i32], [32 x i32] addrspace(3)* @some_lds, i32 0, i32 8) to i8 addrspace(3)*) to i8*))
+ %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
+ %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
+ store i32 9, ptr addrspace(5) %stack
+ store i32 10, ptr addrspace(5) %gep1
+ store i32 99, ptr addrspace(5) %gep2
+ store i32 43, ptr addrspace(5) %gep3
+ %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
+ %load = load i32, ptr addrspace(5) %arrayidx, align 4
+ store i32 %load, ptr addrspace(1) %out
+ call void @callee(ptr addrspacecast (ptr addrspace(3) getelementptr inbounds ([32 x i32], ptr addrspace(3) @some_lds, i32 0, i32 8) to ptr))
ret void
}
; ASM-LABEL: {{^}}constant_expression_uses_some_dynamic_lds_multi_level:
; ASM: .amdhsa_group_segment_fixed_size 0{{$}}
-define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds_multi_level(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+define amdgpu_kernel void @constant_expression_uses_some_dynamic_lds_multi_level(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
entry:
%stack = alloca [4 x i32], align 4, addrspace(5)
- %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
- %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
- %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
- %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
- store i32 9, i32 addrspace(5)* %gep0
- store i32 10, i32 addrspace(5)* %gep1
- store i32 99, i32 addrspace(5)* %gep2
- store i32 43, i32 addrspace(5)* %gep3
- %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
- %load = load i32, i32 addrspace(5)* %arrayidx, align 4
- store i32 %load, i32 addrspace(1)* %out
- call void @callee(i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* getelementptr inbounds ([0 x i32], [0 x i32] addrspace(3)* @some_dynamic_lds, i32 0, i32 0) to i8 addrspace(3)*) to i8*))
+ %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
+ %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
+ store i32 9, ptr addrspace(5) %stack
+ store i32 10, ptr addrspace(5) %gep1
+ store i32 99, ptr addrspace(5) %gep2
+ store i32 43, ptr addrspace(5) %gep3
+ %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
+ %load = load i32, ptr addrspace(5) %arrayidx, align 4
+ store i32 %load, ptr addrspace(1) %out
+ call void @callee(ptr addrspacecast (ptr addrspace(3) @some_dynamic_lds to ptr))
ret void
}
; ASM-LABEL: {{^}}constant_expression_uses_some_lds_global_initializer:
; ASM: .amdhsa_group_segment_fixed_size 4096{{$}}
-define amdgpu_kernel void @constant_expression_uses_some_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+define amdgpu_kernel void @constant_expression_uses_some_lds_global_initializer(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
entry:
%stack = alloca [4 x i32], align 4, addrspace(5)
- %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
- %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
- %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
- %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
- store i32 9, i32 addrspace(5)* %gep0
- store i32 10, i32 addrspace(5)* %gep1
- store i32 99, i32 addrspace(5)* %gep2
- store i32 43, i32 addrspace(5)* %gep3
- %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
- %load = load i32, i32 addrspace(5)* %arrayidx, align 4
- store i32 %load, i32 addrspace(1)* %out
-
- store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_some to i32), i32 addrspace(1)* undef
+ %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
+ %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
+ store i32 9, ptr addrspace(5) %stack
+ store i32 10, ptr addrspace(5) %gep1
+ store i32 99, ptr addrspace(5) %gep2
+ store i32 43, ptr addrspace(5) %gep3
+ %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
+ %load = load i32, ptr addrspace(5) %arrayidx, align 4
+ store i32 %load, ptr addrspace(1) %out
+
+ store volatile i32 ptrtoint (ptr addrspace(1) @initializer_user_some to i32), ptr addrspace(1) undef
ret void
}
; ASM-LABEL: {{^}}constant_expression_uses_all_lds_global_initializer:
; ASM: .group_segment_fixed_size: 65536
-define amdgpu_kernel void @constant_expression_uses_all_lds_global_initializer(i32 addrspace(1)* nocapture %out, i32 %idx) #0 {
+define amdgpu_kernel void @constant_expression_uses_all_lds_global_initializer(ptr addrspace(1) nocapture %out, i32 %idx) #0 {
entry:
%stack = alloca [4 x i32], align 4, addrspace(5)
- %gep0 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 0
- %gep1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 1
- %gep2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 2
- %gep3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 3
- store i32 9, i32 addrspace(5)* %gep0
- store i32 10, i32 addrspace(5)* %gep1
- store i32 99, i32 addrspace(5)* %gep2
- store i32 43, i32 addrspace(5)* %gep3
- %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %idx
- %load = load i32, i32 addrspace(5)* %arrayidx, align 4
- store i32 %load, i32 addrspace(1)* %out
- store volatile i32 ptrtoint (i32 addrspace(1)* @initializer_user_all to i32), i32 addrspace(1)* undef
+ %gep1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 1
+ %gep2 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 2
+ %gep3 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 3
+ store i32 9, ptr addrspace(5) %stack
+ store i32 10, ptr addrspace(5) %gep1
+ store i32 99, ptr addrspace(5) %gep2
+ store i32 43, ptr addrspace(5) %gep3
+ %arrayidx = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
+ %load = load i32, ptr addrspace(5) %arrayidx, align 4
+ store i32 %load, ptr addrspace(1) %out
+ store volatile i32 ptrtoint (ptr addrspace(1) @initializer_user_all to i32), ptr addrspace(1) undef
ret void
}
; CHECK-LABEL: @branch_ptr_var_same_alloca(
-; CHECK: getelementptr inbounds [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @branch_ptr_var_same_alloca.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: [[GEP:%[0-9]+]] = getelementptr inbounds [256 x [64 x i32]], ptr addrspace(3) @branch_ptr_var_same_alloca.alloca, i32 0, i32 %{{[0-9]+}}
; CHECK: if:
-; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
+; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(3) [[GEP]], i32 0, i32 %a
; CHECK: else:
-; CHECK: %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %15, i32 0, i32 %b
+; CHECK: %arrayidx1 = getelementptr inbounds [64 x i32], ptr addrspace(3) [[GEP]], i32 0, i32 %b
; CHECK: endif:
-; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
-; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
+; CHECK: %phi.ptr = phi ptr addrspace(3) [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+; CHECK: store i32 0, ptr addrspace(3) %phi.ptr, align 4
define amdgpu_kernel void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 {
entry:
%alloca = alloca [64 x i32], align 4, addrspace(5)
br i1 undef, label %if, label %else
if:
- %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a
+ %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
br label %endif
else:
- %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %b
+ %arrayidx1 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %b
br label %endif
endif:
- %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
- store i32 0, i32 addrspace(5)* %phi.ptr, align 4
+ %phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+ store i32 0, ptr addrspace(5) %phi.ptr, align 4
ret void
}
; CHECK-LABEL: @branch_ptr_phi_alloca_null_0(
-; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ null, %entry ]
+; CHECK: %phi.ptr = phi ptr addrspace(3) [ %arrayidx0, %if ], [ null, %entry ]
define amdgpu_kernel void @branch_ptr_phi_alloca_null_0(i32 %a, i32 %b) #0 {
entry:
%alloca = alloca [64 x i32], align 4, addrspace(5)
br i1 undef, label %if, label %endif
if:
- %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a
+ %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
br label %endif
endif:
- %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ null, %entry ]
- store i32 0, i32 addrspace(5)* %phi.ptr, align 4
+ %phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %if ], [ null, %entry ]
+ store i32 0, ptr addrspace(5) %phi.ptr, align 4
ret void
}
; CHECK-LABEL: @branch_ptr_phi_alloca_null_1(
-; CHECK: %phi.ptr = phi i32 addrspace(3)* [ null, %entry ], [ %arrayidx0, %if ]
+; CHECK: %phi.ptr = phi ptr addrspace(3) [ null, %entry ], [ %arrayidx0, %if ]
define amdgpu_kernel void @branch_ptr_phi_alloca_null_1(i32 %a, i32 %b) #0 {
entry:
%alloca = alloca [64 x i32], align 4, addrspace(5)
br i1 undef, label %if, label %endif
if:
- %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a
+ %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
br label %endif
endif:
- %phi.ptr = phi i32 addrspace(5)* [ null, %entry ], [ %arrayidx0, %if ]
- store i32 0, i32 addrspace(5)* %phi.ptr, align 4
+ %phi.ptr = phi ptr addrspace(5) [ null, %entry ], [ %arrayidx0, %if ]
+ store i32 0, ptr addrspace(5) %phi.ptr, align 4
ret void
}
; CHECK-LABEL: @one_phi_value(
-; CHECK: getelementptr inbounds [256 x [64 x i32]], [256 x [64 x i32]] addrspace(3)* @one_phi_value.alloca, i32 0, i32 %14
-; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(3)* %{{[0-9]+}}, i32 0, i32 %a
+; CHECK: [[GEP0:%[0-9]+]] = getelementptr inbounds [256 x [64 x i32]], ptr addrspace(3) @one_phi_value.alloca, i32 0, i32 %{{[0-9]+}}
+; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(3) [[GEP0]], i32 0, i32 %a
; CHECK: br label %exit
-; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %entry ]
-; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
+; CHECK: %phi.ptr = phi ptr addrspace(3) [ %arrayidx0, %entry ]
+; CHECK: store i32 0, ptr addrspace(3) %phi.ptr, align 4
define amdgpu_kernel void @one_phi_value(i32 %a) #0 {
entry:
%alloca = alloca [64 x i32], align 4, addrspace(5)
- %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a
+ %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
br label %exit
exit:
- %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %entry ]
- store i32 0, i32 addrspace(5)* %phi.ptr, align 4
+ %phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %entry ]
+ store i32 0, ptr addrspace(5) %phi.ptr, align 4
ret void
}
; CHECK: %alloca = alloca [64 x i32], align 4
; CHECK: if:
-; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a
+; CHECK: %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
; CHECK: else:
-; CHECK: %arrayidx1 = call i32 addrspace(5)* @get_unknown_pointer()
+; CHECK: %arrayidx1 = call ptr addrspace(5) @get_unknown_pointer()
; CHECK: endif:
-; CHECK: %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
-; CHECK: store i32 0, i32 addrspace(5)* %phi.ptr, align 4
+; CHECK: %phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+; CHECK: store i32 0, ptr addrspace(5) %phi.ptr, align 4
define amdgpu_kernel void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 {
entry:
%alloca = alloca [64 x i32], align 4, addrspace(5)
br i1 undef, label %if, label %else
if:
- %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 %a
+ %arrayidx0 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a
br label %endif
else:
- %arrayidx1 = call i32 addrspace(5)* @get_unknown_pointer()
+ %arrayidx1 = call ptr addrspace(5) @get_unknown_pointer()
br label %endif
endif:
- %phi.ptr = phi i32 addrspace(5)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
- store i32 0, i32 addrspace(5)* %phi.ptr, align 4
+ %phi.ptr = phi ptr addrspace(5) [ %arrayidx0, %if ], [ %arrayidx1, %else ]
+ store i32 0, ptr addrspace(5) %phi.ptr, align 4
ret void
}
; CHECK-LABEL: @ptr_induction_var_same_alloca(
; CHECK: %alloca = alloca [64 x i32], align 4
-; CHECK: phi i32 addrspace(5)* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
+; CHECK: phi ptr addrspace(5) [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
define amdgpu_kernel void @ptr_induction_var_same_alloca() #0 {
entry:
%alloca = alloca [64 x i32], align 4, addrspace(5)
- %arrayidx = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 2
- %arrayidx1 = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 48
+ %arrayidx = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 2
+ %arrayidx1 = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 48
br label %for.body
for.cond.cleanup: ; preds = %for.body
for.body: ; preds = %for.body, %entry
%i.09 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
- %p.08 = phi i32 addrspace(5)* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
- store i32 %i.09, i32 addrspace(5)* %p.08, align 4
- %incdec.ptr = getelementptr inbounds i32, i32 addrspace(5)* %p.08, i32 1
+ %p.08 = phi ptr addrspace(5) [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
+ store i32 %i.09, ptr addrspace(5) %p.08, align 4
+ %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %p.08, i32 1
%inc = add nuw nsw i32 %i.09, 1
- %cmp = icmp eq i32 addrspace(5)* %incdec.ptr, %arrayidx1
+ %cmp = icmp eq ptr addrspace(5) %incdec.ptr, %arrayidx1
br i1 %cmp, label %for.cond.cleanup, label %for.body
}
; CHECK-LABEL: @ptr_induction_var_alloca_unknown(
; CHECK: %alloca = alloca [64 x i32], align 4
-; CHECK: %p.08 = phi i32 addrspace(5)* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
-; CHECK: %cmp = icmp eq i32 addrspace(5)* %incdec.ptr, %call
+; CHECK: %p.08 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
+; CHECK: %cmp = icmp eq ptr addrspace(5) %incdec.ptr, %call
define amdgpu_kernel void @ptr_induction_var_alloca_unknown() #0 {
entry:
%alloca = alloca [64 x i32], align 4, addrspace(5)
- %arrayidx = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %alloca, i32 0, i32 2
- %call = tail call i32 addrspace(5)* @get_unknown_pointer() #2
- %cmp.7 = icmp eq i32 addrspace(5)* %arrayidx, %call
+ %arrayidx = getelementptr inbounds [64 x i32], ptr addrspace(5) %alloca, i32 0, i32 2
+ %call = tail call ptr addrspace(5) @get_unknown_pointer() #2
+ %cmp.7 = icmp eq ptr addrspace(5) %arrayidx, %call
br i1 %cmp.7, label %for.cond.cleanup, label %for.body.preheader
for.body.preheader: ; preds = %entry
for.body: ; preds = %for.body, %for.body.preheader
%i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
- %p.08 = phi i32 addrspace(5)* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
- store i32 %i.09, i32 addrspace(5)* %p.08, align 4
- %incdec.ptr = getelementptr inbounds i32, i32 addrspace(5)* %p.08, i32 1
+ %p.08 = phi ptr addrspace(5) [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
+ store i32 %i.09, ptr addrspace(5) %p.08, align 4
+ %incdec.ptr = getelementptr inbounds i32, ptr addrspace(5) %p.08, i32 1
%inc = add nuw nsw i32 %i.09, 1
- %cmp = icmp eq i32 addrspace(5)* %incdec.ptr, %call
+ %cmp = icmp eq ptr addrspace(5) %incdec.ptr, %call
br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body
}
-declare i32 addrspace(5)* @get_unknown_pointer() #0
+declare ptr addrspace(5) @get_unknown_pointer() #0
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
; This is just an arbitrary intrinsic that shouldn't ever need to be
; handled to ensure it doesn't crash.
-declare void @llvm.stackrestore(i8*) #2
+declare void @llvm.stackrestore(ptr) #2
; CHECK-LABEL: @try_promote_unhandled_intrinsic(
; CHECK: alloca
-; CHECK: call void @llvm.stackrestore(i8* %tmp1)
-define amdgpu_kernel void @try_promote_unhandled_intrinsic(i32 addrspace(1)* %arg) #2 {
+; CHECK: call void @llvm.stackrestore(ptr %tmp)
+define amdgpu_kernel void @try_promote_unhandled_intrinsic(ptr addrspace(1) %arg) #2 {
bb:
%tmp = alloca i32, align 4
- %tmp1 = bitcast i32* %tmp to i8*
- %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
- %tmp3 = load i32, i32 addrspace(1)* %tmp2
- store i32 %tmp3, i32* %tmp
- call void @llvm.stackrestore(i8* %tmp1)
+ %tmp2 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
+ %tmp3 = load i32, ptr addrspace(1) %tmp2
+ store i32 %tmp3, ptr %tmp
+ call void @llvm.stackrestore(ptr %tmp)
ret void
}
; GCN: v_cndmask_b32_e32 [[RES:v[0-9]+]], 4.0,
; GCN: store_dword v{{.+}}, [[RES]]
-; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
-; OPT: store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, <4 x float> addrspace(5)* %alloca, align 4
-; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca
+; OPT: %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, ptr addrspace(5) %alloca, align 4
+; OPT: %0 = load <4 x float>, ptr addrspace(5) %alloca
; OPT: %1 = extractelement <4 x float> %0, i32 %sel2
-; OPT: store float %1, float addrspace(1)* %out, align 4
+; OPT: store float %1, ptr addrspace(1) %out, align 4
-define amdgpu_kernel void @float4_alloca_store4(float addrspace(1)* %out, float addrspace(3)* %dummy_lds) {
+define amdgpu_kernel void @float4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
entry:
%alloca = alloca <4 x float>, align 16, addrspace(5)
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%c2 = icmp uge i32 %y, 3
%sel1 = select i1 %c1, i32 1, i32 2
%sel2 = select i1 %c2, i32 0, i32 %sel1
- %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
- store <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, <4 x float> addrspace(5)* %alloca, align 4
- %load = load float, float addrspace(5)* %gep, align 4
- store float %load, float addrspace(1)* %out, align 4
+ %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+ store <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, ptr addrspace(5) %alloca, align 4
+ %load = load float, ptr addrspace(5) %gep, align 4
+ store float %load, ptr addrspace(1) %out, align 4
ret void
}
; GCN: v_mov_b32_e32 v{{[0-9]+}}, [[ONE]]
; GCN: store_dwordx4 v{{.+}},
-; OPT: %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x float>, <4 x float> addrspace(5)* %alloca
+; OPT: %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: %0 = load <4 x float>, ptr addrspace(5) %alloca
; OPT: %1 = insertelement <4 x float> %0, float 1.000000e+00, i32 %sel2
-; OPT: store <4 x float> %1, <4 x float> addrspace(5)* %alloca
-; OPT: %load = load <4 x float>, <4 x float> addrspace(5)* %alloca, align 4
-; OPT: store <4 x float> %load, <4 x float> addrspace(1)* %out, align 4
+; OPT: store <4 x float> %1, ptr addrspace(5) %alloca
+; OPT: %load = load <4 x float>, ptr addrspace(5) %alloca, align 4
+; OPT: store <4 x float> %load, ptr addrspace(1) %out, align 4
-define amdgpu_kernel void @float4_alloca_load4(<4 x float> addrspace(1)* %out, float addrspace(3)* %dummy_lds) {
+define amdgpu_kernel void @float4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
entry:
%alloca = alloca <4 x float>, align 16, addrspace(5)
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%c2 = icmp uge i32 %y, 3
%sel1 = select i1 %c1, i32 1, i32 2
%sel2 = select i1 %c2, i32 0, i32 %sel1
- %gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(5)* %alloca, i32 0, i32 %sel2
- store float 1.0, float addrspace(5)* %gep, align 4
- %load = load <4 x float>, <4 x float> addrspace(5)* %alloca, align 4
- store <4 x float> %load, <4 x float> addrspace(1)* %out, align 4
+ %gep = getelementptr inbounds <4 x float>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+ store float 1.0, ptr addrspace(5) %gep, align 4
+ %load = load <4 x float>, ptr addrspace(5) %alloca, align 4
+ store <4 x float> %load, ptr addrspace(1) %out, align 4
ret void
}
; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x40003c00
; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
-; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
-; OPT: store <4 x half> <half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400>, <4 x half> addrspace(5)* %alloca, align 2
-; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca
+; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: store <4 x half> <half 0xH3C00, half 0xH4000, half 0xH4200, half 0xH4400>, ptr addrspace(5) %alloca, align 2
+; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
; OPT: %1 = extractelement <4 x half> %0, i32 %sel2
-; OPT: store half %1, half addrspace(1)* %out, align 2
+; OPT: store half %1, ptr addrspace(1) %out, align 2
-define amdgpu_kernel void @half4_alloca_store4(half addrspace(1)* %out, half addrspace(3)* %dummy_lds) {
+define amdgpu_kernel void @half4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
entry:
%alloca = alloca <4 x half>, align 16, addrspace(5)
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%c2 = icmp uge i32 %y, 3
%sel1 = select i1 %c1, i32 1, i32 2
%sel2 = select i1 %c2, i32 0, i32 %sel1
- %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
- store <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, <4 x half> addrspace(5)* %alloca, align 2
- %load = load half, half addrspace(5)* %gep, align 2
- store half %load, half addrspace(1)* %out, align 2
+ %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+ store <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, ptr addrspace(5) %alloca, align 2
+ %load = load half, ptr addrspace(5) %gep, align 2
+ store half %load, ptr addrspace(1) %out, align 2
ret void
}
; GCN-NOT: buffer_
; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff
-; OPT: %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x half>, <4 x half> addrspace(5)* %alloca
+; OPT: %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: %0 = load <4 x half>, ptr addrspace(5) %alloca
; OPT: %1 = insertelement <4 x half> %0, half 0xH3C00, i32 %sel2
-; OPT: store <4 x half> %1, <4 x half> addrspace(5)* %alloca
-; OPT: %load = load <4 x half>, <4 x half> addrspace(5)* %alloca, align 2
-; OPT: store <4 x half> %load, <4 x half> addrspace(1)* %out, align 2
+; OPT: store <4 x half> %1, ptr addrspace(5) %alloca
+; OPT: %load = load <4 x half>, ptr addrspace(5) %alloca, align 2
+; OPT: store <4 x half> %load, ptr addrspace(1) %out, align 2
-define amdgpu_kernel void @half4_alloca_load4(<4 x half> addrspace(1)* %out, half addrspace(3)* %dummy_lds) {
+define amdgpu_kernel void @half4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
entry:
%alloca = alloca <4 x half>, align 16, addrspace(5)
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%c2 = icmp uge i32 %y, 3
%sel1 = select i1 %c1, i32 1, i32 2
%sel2 = select i1 %c2, i32 0, i32 %sel1
- %gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(5)* %alloca, i32 0, i32 %sel2
- store half 1.0, half addrspace(5)* %gep, align 4
- %load = load <4 x half>, <4 x half> addrspace(5)* %alloca, align 2
- store <4 x half> %load, <4 x half> addrspace(1)* %out, align 2
+ %gep = getelementptr inbounds <4 x half>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+ store half 1.0, ptr addrspace(5) %gep, align 4
+ %load = load <4 x half>, ptr addrspace(5) %alloca, align 2
+ store <4 x half> %load, ptr addrspace(1) %out, align 2
ret void
}
; GCN-DAG: s_mov_b32 s[[SL:[0-9]+]], 0x20001
; GCN: v_lshrrev_b64 v[{{[0-9:]+}}], v{{[0-9]+}}, s[[[SL]]:[[SH]]]
-; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
-; OPT: store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> addrspace(5)* %alloca, align 2
-; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca
+; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, ptr addrspace(5) %alloca, align 2
+; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
; OPT: %1 = extractelement <4 x i16> %0, i32 %sel2
-; OPT: store i16 %1, i16 addrspace(1)* %out, align 2
+; OPT: store i16 %1, ptr addrspace(1) %out, align 2
-define amdgpu_kernel void @short4_alloca_store4(i16 addrspace(1)* %out, i16 addrspace(3)* %dummy_lds) {
+define amdgpu_kernel void @short4_alloca_store4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
entry:
%alloca = alloca <4 x i16>, align 16, addrspace(5)
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%c2 = icmp uge i32 %y, 3
%sel1 = select i1 %c1, i32 1, i32 2
%sel2 = select i1 %c2, i32 0, i32 %sel1
- %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
- store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, <4 x i16> addrspace(5)* %alloca, align 2
- %load = load i16, i16 addrspace(5)* %gep, align 2
- store i16 %load, i16 addrspace(1)* %out, align 2
+ %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+ store <4 x i16> <i16 1, i16 2, i16 3, i16 4>, ptr addrspace(5) %alloca, align 2
+ %load = load i16, ptr addrspace(5) %gep, align 2
+ store i16 %load, ptr addrspace(1) %out, align 2
ret void
}
; GCN-NOT: buffer_
; GCN: s_mov_b64 s[{{[0-9:]+}}], 0xffff
-; OPT: %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
-; OPT: %0 = load <4 x i16>, <4 x i16> addrspace(5)* %alloca
+; OPT: %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+; OPT: %0 = load <4 x i16>, ptr addrspace(5) %alloca
; OPT: %1 = insertelement <4 x i16> %0, i16 1, i32 %sel2
-; OPT: store <4 x i16> %1, <4 x i16> addrspace(5)* %alloca
-; OPT: %load = load <4 x i16>, <4 x i16> addrspace(5)* %alloca, align 2
-; OPT: store <4 x i16> %load, <4 x i16> addrspace(1)* %out, align 2
+; OPT: store <4 x i16> %1, ptr addrspace(5) %alloca
+; OPT: %load = load <4 x i16>, ptr addrspace(5) %alloca, align 2
+; OPT: store <4 x i16> %load, ptr addrspace(1) %out, align 2
-define amdgpu_kernel void @short4_alloca_load4(<4 x i16> addrspace(1)* %out, i16 addrspace(3)* %dummy_lds) {
+define amdgpu_kernel void @short4_alloca_load4(ptr addrspace(1) %out, ptr addrspace(3) %dummy_lds) {
entry:
%alloca = alloca <4 x i16>, align 16, addrspace(5)
%x = tail call i32 @llvm.amdgcn.workitem.id.x()
%c2 = icmp uge i32 %y, 3
%sel1 = select i1 %c1, i32 1, i32 2
%sel2 = select i1 %c2, i32 0, i32 %sel1
- %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(5)* %alloca, i32 0, i32 %sel2
- store i16 1, i16 addrspace(5)* %gep, align 4
- %load = load <4 x i16>, <4 x i16> addrspace(5)* %alloca, align 2
- store <4 x i16> %load, <4 x i16> addrspace(1)* %out, align 2
+ %gep = getelementptr inbounds <4 x i16>, ptr addrspace(5) %alloca, i32 0, i32 %sel2
+ store i16 1, ptr addrspace(5) %gep, align 4
+ %load = load <4 x i16>, ptr addrspace(5) %alloca, align 2
+ store <4 x i16> %load, ptr addrspace(1) %out, align 2
ret void
}
; GCN: v_mov_b32_e32 v1, 0
; OPT: %private_iptr = alloca <2 x i32>, align 8, addrspace(5)
-; OPT: %cast = bitcast <2 x i32> addrspace(5)* %private_iptr to i64 addrspace(5)*
-; OPT: %tmp1 = load i64, i64 addrspace(5)* %cast, align 8
+; OPT: %tmp1 = load i64, ptr addrspace(5) %private_iptr, align 8
define i64 @ptr_alloca_bitcast() {
entry:
%private_iptr = alloca <2 x i32>, align 8, addrspace(5)
- %cast = bitcast <2 x i32> addrspace(5)* %private_iptr to i64 addrspace(5)*
- %tmp1 = load i64, i64 addrspace(5)* %cast, align 8
+ %tmp1 = load i64, ptr addrspace(5) %private_iptr, align 8
ret i64 %tmp1
}
; CHECK-LABEL: @volatile_load(
; CHECK: alloca [4 x i32]
-; CHECK: load volatile i32, i32 addrspace(5)*
-define amdgpu_kernel void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+; CHECK: load volatile i32, ptr addrspace(5)
+define amdgpu_kernel void @volatile_load(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) {
entry:
%stack = alloca [4 x i32], align 4, addrspace(5)
- %tmp = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %tmp
- %load = load volatile i32, i32 addrspace(5)* %arrayidx1
- store i32 %load, i32 addrspace(1)* %out
+ %tmp = load i32, ptr addrspace(1) %in, align 4
+ %arrayidx1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp
+ %load = load volatile i32, ptr addrspace(5) %arrayidx1
+ store i32 %load, ptr addrspace(1) %out
ret void
}
; CHECK-LABEL: @volatile_store(
; CHECK: alloca [4 x i32]
-; CHECK: store volatile i32 %tmp, i32 addrspace(5)*
-define amdgpu_kernel void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+; CHECK: store volatile i32 %tmp, ptr addrspace(5)
+define amdgpu_kernel void @volatile_store(ptr addrspace(1) nocapture %out, ptr addrspace(1) nocapture %in) {
entry:
%stack = alloca [4 x i32], align 4, addrspace(5)
- %tmp = load i32, i32 addrspace(1)* %in, align 4
- %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %stack, i32 0, i32 %tmp
- store volatile i32 %tmp, i32 addrspace(5)* %arrayidx1
+ %tmp = load i32, ptr addrspace(1) %in, align 4
+ %arrayidx1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %tmp
+ store volatile i32 %tmp, ptr addrspace(5) %arrayidx1
ret void
}
; CHECK: alloca double
; CHECK: load double
; CHECK: load volatile double
-define amdgpu_kernel void @volatile_and_non_volatile_load(double addrspace(1)* nocapture %arg, i32 %arg1) #0 {
+define amdgpu_kernel void @volatile_and_non_volatile_load(ptr addrspace(1) nocapture %arg, i32 %arg1) #0 {
bb:
%tmp = alloca double, align 8, addrspace(5)
- store double 0.000000e+00, double addrspace(5)* %tmp, align 8
+ store double 0.000000e+00, ptr addrspace(5) %tmp, align 8
- %tmp4 = load double, double addrspace(5)* %tmp, align 8
- %tmp5 = load volatile double, double addrspace(5)* %tmp, align 8
+ %tmp4 = load double, ptr addrspace(5) %tmp, align 8
+ %tmp5 = load volatile double, ptr addrspace(5) %tmp, align 8
- store double %tmp4, double addrspace(1)* %arg
+ store double %tmp4, ptr addrspace(1) %arg
ret void
}
; CHECK-LABEL: @test_insertelement(
; CHECK: %alloca = alloca i16
-; CHECK-NEXT: insertelement <2 x i16 addrspace(5)*> undef, i16 addrspace(5)* %alloca, i32 0
+; CHECK-NEXT: insertelement <2 x ptr addrspace(5)> undef, ptr addrspace(5) %alloca, i32 0
define amdgpu_kernel void @test_insertelement() #0 {
entry:
%alloca = alloca i16, align 4, addrspace(5)
- %in = insertelement <2 x i16 addrspace(5)*> undef, i16 addrspace(5)* %alloca, i32 0
- store <2 x i16 addrspace(5)*> %in, <2 x i16 addrspace(5)*>* undef, align 4
+ %in = insertelement <2 x ptr addrspace(5)> undef, ptr addrspace(5) %alloca, i32 0
+ store <2 x ptr addrspace(5)> %in, ptr undef, align 4
ret void
}
; CHECK-LABEL: @test_insertvalue(
; CHECK: %alloca = alloca i16
-; CHECK-NEXT: insertvalue { i16 addrspace(5)* } undef, i16 addrspace(5)* %alloca, 0
+; CHECK-NEXT: insertvalue { ptr addrspace(5) } undef, ptr addrspace(5) %alloca, 0
define amdgpu_kernel void @test_insertvalue() #0 {
entry:
%alloca = alloca i16, align 4, addrspace(5)
- %in = insertvalue { i16 addrspace(5)* } undef, i16 addrspace(5)* %alloca, 0
- store { i16 addrspace(5)* } %in, { i16 addrspace(5)* }* undef, align 4
+ %in = insertvalue { ptr addrspace(5) } undef, ptr addrspace(5) %alloca, 0
+ store { ptr addrspace(5) } %in, ptr undef, align 4
ret void
}
; CHECK-LABEL: @test_insertvalue_array(
; CHECK: %alloca = alloca i16
-; CHECK-NEXT: insertvalue [2 x i16 addrspace(5)*] undef, i16 addrspace(5)* %alloca, 0
+; CHECK-NEXT: insertvalue [2 x ptr addrspace(5)] undef, ptr addrspace(5) %alloca, 0
define amdgpu_kernel void @test_insertvalue_array() #0 {
entry:
%alloca = alloca i16, align 4, addrspace(5)
- %in = insertvalue [2 x i16 addrspace(5)*] undef, i16 addrspace(5)* %alloca, 0
- store [2 x i16 addrspace(5)*] %in, [2 x i16 addrspace(5)*]* undef, align 4
+ %in = insertvalue [2 x ptr addrspace(5)] undef, ptr addrspace(5) %alloca, 0
+ store [2 x ptr addrspace(5)] %in, ptr undef, align 4
ret void
}
; OPT-LABEL: @vector_alloca_not_atomic(
;
; OPT: extractelement <3 x i32> <i32 0, i32 1, i32 2>, i64 %index
-define amdgpu_kernel void @vector_alloca_not_atomic(i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @vector_alloca_not_atomic(ptr addrspace(1) %out, i64 %index) {
entry:
%alloca = alloca [3 x i32], addrspace(5)
- %a0 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 0
- %a1 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 1
- %a2 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 2
- store i32 0, i32 addrspace(5)* %a0
- store i32 1, i32 addrspace(5)* %a1
- store i32 2, i32 addrspace(5)* %a2
- %tmp = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i64 0, i64 %index
- %data = load i32, i32 addrspace(5)* %tmp
- store i32 %data, i32 addrspace(1)* %out
+ %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
+ %a2 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 2
+ store i32 0, ptr addrspace(5) %alloca
+ store i32 1, ptr addrspace(5) %a1
+ store i32 2, ptr addrspace(5) %a2
+ %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
+ %data = load i32, ptr addrspace(5) %tmp
+ store i32 %data, ptr addrspace(1) %out
ret void
}
; OPT-LABEL: @vector_alloca_atomic_read(
;
; OPT: alloca [3 x i32]
-; OPT: store i32 0, i32 addrspace(5)*
-; OPT: store i32 1, i32 addrspace(5)*
-; OPT: store i32 2, i32 addrspace(5)*
-; OPT: load atomic i32, i32 addrspace(5)*
-define amdgpu_kernel void @vector_alloca_atomic_read(i32 addrspace(1)* %out, i64 %index) {
+; OPT: store i32 0, ptr addrspace(5)
+; OPT: store i32 1, ptr addrspace(5)
+; OPT: store i32 2, ptr addrspace(5)
+; OPT: load atomic i32, ptr addrspace(5)
+define amdgpu_kernel void @vector_alloca_atomic_read(ptr addrspace(1) %out, i64 %index) {
entry:
%alloca = alloca [3 x i32], addrspace(5)
- %a0 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 0
- %a1 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 1
- %a2 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 2
- store i32 0, i32 addrspace(5)* %a0
- store i32 1, i32 addrspace(5)* %a1
- store i32 2, i32 addrspace(5)* %a2
- %tmp = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i64 0, i64 %index
- %data = load atomic i32, i32 addrspace(5)* %tmp acquire, align 4
- store i32 %data, i32 addrspace(1)* %out
+ %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
+ %a2 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 2
+ store i32 0, ptr addrspace(5) %alloca
+ store i32 1, ptr addrspace(5) %a1
+ store i32 2, ptr addrspace(5) %a2
+ %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
+ %data = load atomic i32, ptr addrspace(5) %tmp acquire, align 4
+ store i32 %data, ptr addrspace(1) %out
ret void
}
; OPT-LABEL: @vector_alloca_atomic_write(
;
; OPT: alloca [3 x i32]
-; OPT: store atomic i32 0, i32 addrspace(5)
-; OPT: store atomic i32 1, i32 addrspace(5)
-; OPT: store atomic i32 2, i32 addrspace(5)
-; OPT: load i32, i32 addrspace(5)*
-define amdgpu_kernel void @vector_alloca_atomic_write(i32 addrspace(1)* %out, i64 %index) {
+; OPT: store atomic i32 0, ptr addrspace(5)
+; OPT: store atomic i32 1, ptr addrspace(5)
+; OPT: store atomic i32 2, ptr addrspace(5)
+; OPT: load i32, ptr addrspace(5)
+define amdgpu_kernel void @vector_alloca_atomic_write(ptr addrspace(1) %out, i64 %index) {
entry:
%alloca = alloca [3 x i32], addrspace(5)
- %a0 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 0
- %a1 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 1
- %a2 = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i32 0, i32 2
- store atomic i32 0, i32 addrspace(5)* %a0 release, align 4
- store atomic i32 1, i32 addrspace(5)* %a1 release, align 4
- store atomic i32 2, i32 addrspace(5)* %a2 release, align 4
- %tmp = getelementptr [3 x i32], [3 x i32] addrspace(5)* %alloca, i64 0, i64 %index
- %data = load i32, i32 addrspace(5)* %tmp
- store i32 %data, i32 addrspace(1)* %out
+ %a1 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 1
+ %a2 = getelementptr [3 x i32], ptr addrspace(5) %alloca, i32 0, i32 2
+ store atomic i32 0, ptr addrspace(5) %alloca release, align 4
+ store atomic i32 1, ptr addrspace(5) %a1 release, align 4
+ store atomic i32 2, ptr addrspace(5) %a2 release, align 4
+ %tmp = getelementptr [3 x i32], ptr addrspace(5) %alloca, i64 0, i64 %index
+ %data = load i32, ptr addrspace(5) %tmp
+ store i32 %data, ptr addrspace(1) %out
ret void
}
; OPT-LABEL: @vector_read_alloca_bitcast(
; OPT-NOT: alloca
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
-; OPT-NEXT: store i32 %0, i32 addrspace(1)* %out, align 4
+; OPT-NEXT: store i32 %0, ptr addrspace(1) %out, align 4
; GCN-LABEL: {{^}}vector_read_alloca_bitcast:
; GCN-ALLOCA-COUNT-4: buffer_store_dword
; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], vcc
; GCN-PROMOTE: ScratchSize: 0
-define amdgpu_kernel void @vector_read_alloca_bitcast(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read_alloca_bitcast(ptr addrspace(1) %out, i32 %index) {
entry:
%tmp = alloca [4 x i32], addrspace(5)
- %x = bitcast [4 x i32] addrspace(5)* %tmp to i32 addrspace(5)*
- %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
- %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
- %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
- store i32 0, i32 addrspace(5)* %x
- store i32 1, i32 addrspace(5)* %y
- store i32 2, i32 addrspace(5)* %z
- store i32 3, i32 addrspace(5)* %w
- %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
- %tmp2 = load i32, i32 addrspace(5)* %tmp1
- store i32 %tmp2, i32 addrspace(1)* %out
+ %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+ %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
+ %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
+ store i32 0, ptr addrspace(5) %tmp
+ store i32 1, ptr addrspace(5) %y
+ store i32 2, ptr addrspace(5) %z
+ store i32 3, ptr addrspace(5) %w
+ %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
+ %tmp2 = load i32, ptr addrspace(5) %tmp1
+ store i32 %tmp2, ptr addrspace(1) %out
ret void
}
; OPT-NOT: alloca
; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
; OPT-NEXT: %1 = extractelement <4 x i32> %0, i32 %r_index
-; OPT-NEXT: store i32 %1, i32 addrspace(1)* %out, align
+; OPT-NEXT: store i32 %1, ptr addrspace(1) %out, align
; GCN-LABEL: {{^}}vector_write_alloca_bitcast:
; GCN-ALLOCA-COUNT-5: buffer_store_dword
; GCN-PROMOTE: ScratchSize: 0
-define amdgpu_kernel void @vector_write_alloca_bitcast(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+define amdgpu_kernel void @vector_write_alloca_bitcast(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) {
entry:
%tmp = alloca [4 x i32], addrspace(5)
- %x = bitcast [4 x i32] addrspace(5)* %tmp to i32 addrspace(5)*
- %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
- %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
- %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
- store i32 0, i32 addrspace(5)* %x
- store i32 0, i32 addrspace(5)* %y
- store i32 0, i32 addrspace(5)* %z
- store i32 0, i32 addrspace(5)* %w
- %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %w_index
- store i32 1, i32 addrspace(5)* %tmp1
- %tmp2 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %r_index
- %tmp3 = load i32, i32 addrspace(5)* %tmp2
- store i32 %tmp3, i32 addrspace(1)* %out
+ %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+ %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
+ %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
+ store i32 0, ptr addrspace(5) %tmp
+ store i32 0, ptr addrspace(5) %y
+ store i32 0, ptr addrspace(5) %z
+ store i32 0, ptr addrspace(5) %w
+ %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %w_index
+ store i32 1, ptr addrspace(5) %tmp1
+ %tmp2 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %r_index
+ %tmp3 = load i32, ptr addrspace(5) %tmp2
+ store i32 %tmp3, ptr addrspace(1) %out
ret void
}
; OPT-NOT: alloca
; OPT: bb2:
; OPT: %tmp.sroa.0.0 = phi <6 x float> [ undef, %bb ], [ %0, %bb2 ]
-; OPT: %0 = insertelement <6 x float> %tmp.sroa.0.0, float %tmp73, i32 %tmp10
+; OPT: %0 = insertelement <6 x float> %tmp.sroa.0.0, float %tmp72, i32 %tmp10
; OPT: .preheader:
; OPT: %bc = bitcast <6 x float> %0 to <6 x i32>
; OPT: %1 = extractelement <6 x i32> %bc, i32 %tmp20
; GCN-PROMOTE: ScratchSize: 0
-define amdgpu_kernel void @vector_write_read_bitcast_to_float(float addrspace(1)* %arg) {
+define amdgpu_kernel void @vector_write_read_bitcast_to_float(ptr addrspace(1) %arg) {
bb:
%tmp = alloca [6 x float], align 4, addrspace(5)
- %tmp1 = bitcast [6 x float] addrspace(5)* %tmp to i8 addrspace(5)*
- call void @llvm.lifetime.start.p5i8(i64 24, i8 addrspace(5)* %tmp1) #2
+ call void @llvm.lifetime.start.p5(i64 24, ptr addrspace(5) %tmp) #2
br label %bb2
bb2: ; preds = %bb2, %bb
%tmp3 = phi i32 [ 0, %bb ], [ %tmp13, %bb2 ]
%tmp4 = zext i32 %tmp3 to i64
- %tmp5 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp4
- %tmp6 = bitcast float addrspace(1)* %tmp5 to i32 addrspace(1)*
- %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp5 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmp4
+ %tmp7 = load i32, ptr addrspace(1) %tmp5, align 4
%tmp8 = trunc i32 %tmp3 to i16
%tmp9 = urem i16 %tmp8, 6
%tmp10 = zext i16 %tmp9 to i32
- %tmp11 = getelementptr inbounds [6 x float], [6 x float] addrspace(5)* %tmp, i32 0, i32 %tmp10
- %tmp12 = bitcast float addrspace(5)* %tmp11 to i32 addrspace(5)*
- store i32 %tmp7, i32 addrspace(5)* %tmp12, align 4
+ %tmp11 = getelementptr inbounds [6 x float], ptr addrspace(5) %tmp, i32 0, i32 %tmp10
+ store i32 %tmp7, ptr addrspace(5) %tmp11, align 4
%tmp13 = add nuw nsw i32 %tmp3, 1
%tmp14 = icmp eq i32 %tmp13, 1000
br i1 %tmp14, label %.preheader, label %bb2
bb15: ; preds = %.preheader
- call void @llvm.lifetime.end.p5i8(i64 24, i8 addrspace(5)* %tmp1) #2
+ call void @llvm.lifetime.end.p5(i64 24, ptr addrspace(5) %tmp) #2
ret void
.preheader: ; preds = %.preheader, %bb2
%tmp18 = urem i16 %tmp17, 6
%tmp19 = sub nuw nsw i16 5, %tmp18
%tmp20 = zext i16 %tmp19 to i32
- %tmp21 = getelementptr inbounds [6 x float], [6 x float] addrspace(5)* %tmp, i32 0, i32 %tmp20
- %tmp22 = bitcast float addrspace(5)* %tmp21 to i32 addrspace(5)*
- %tmp23 = load i32, i32 addrspace(5)* %tmp22, align 4
+ %tmp21 = getelementptr inbounds [6 x float], ptr addrspace(5) %tmp, i32 0, i32 %tmp20
+ %tmp23 = load i32, ptr addrspace(5) %tmp21, align 4
%tmp24 = zext i32 %tmp16 to i64
- %tmp25 = getelementptr inbounds float, float addrspace(1)* %arg, i64 %tmp24
- %tmp26 = bitcast float addrspace(1)* %tmp25 to i32 addrspace(1)*
- store i32 %tmp23, i32 addrspace(1)* %tmp26, align 4
+ %tmp25 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmp24
+ store i32 %tmp23, ptr addrspace(1) %tmp25, align 4
%tmp27 = add nuw nsw i32 %tmp16, 1
%tmp28 = icmp eq i32 %tmp27, 1000
br i1 %tmp28, label %bb15, label %.preheader
; OPT-NOT: alloca
; OPT: bb2:
; OPT: %tmp.sroa.0.0 = phi <6 x double> [ undef, %bb ], [ %0, %bb2 ]
-; OPT: %0 = insertelement <6 x double> %tmp.sroa.0.0, double %tmp73, i32 %tmp10
+; OPT: %0 = insertelement <6 x double> %tmp.sroa.0.0, double %tmp72, i32 %tmp10
; OPT: .preheader:
; OPT: %bc = bitcast <6 x double> %0 to <6 x i64>
; OPT: %1 = extractelement <6 x i64> %bc, i32 %tmp20
; GCN-PROMOTE: ScratchSize: 0
-define amdgpu_kernel void @vector_write_read_bitcast_to_double(double addrspace(1)* %arg) {
+define amdgpu_kernel void @vector_write_read_bitcast_to_double(ptr addrspace(1) %arg) {
bb:
%tmp = alloca [6 x double], align 8, addrspace(5)
- %tmp1 = bitcast [6 x double] addrspace(5)* %tmp to i8 addrspace(5)*
- call void @llvm.lifetime.start.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
+ call void @llvm.lifetime.start.p5(i64 48, ptr addrspace(5) %tmp) #2
br label %bb2
bb2: ; preds = %bb2, %bb
%tmp3 = phi i32 [ 0, %bb ], [ %tmp13, %bb2 ]
%tmp4 = zext i32 %tmp3 to i64
- %tmp5 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %tmp4
- %tmp6 = bitcast double addrspace(1)* %tmp5 to i64 addrspace(1)*
- %tmp7 = load i64, i64 addrspace(1)* %tmp6, align 8
+ %tmp5 = getelementptr inbounds double, ptr addrspace(1) %arg, i64 %tmp4
+ %tmp7 = load i64, ptr addrspace(1) %tmp5, align 8
%tmp8 = trunc i32 %tmp3 to i16
%tmp9 = urem i16 %tmp8, 6
%tmp10 = zext i16 %tmp9 to i32
- %tmp11 = getelementptr inbounds [6 x double], [6 x double] addrspace(5)* %tmp, i32 0, i32 %tmp10
- %tmp12 = bitcast double addrspace(5)* %tmp11 to i64 addrspace(5)*
- store i64 %tmp7, i64 addrspace(5)* %tmp12, align 8
+ %tmp11 = getelementptr inbounds [6 x double], ptr addrspace(5) %tmp, i32 0, i32 %tmp10
+ store i64 %tmp7, ptr addrspace(5) %tmp11, align 8
%tmp13 = add nuw nsw i32 %tmp3, 1
%tmp14 = icmp eq i32 %tmp13, 1000
br i1 %tmp14, label %.preheader, label %bb2
bb15: ; preds = %.preheader
- call void @llvm.lifetime.end.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
+ call void @llvm.lifetime.end.p5(i64 48, ptr addrspace(5) %tmp) #2
ret void
.preheader: ; preds = %.preheader, %bb2
%tmp18 = urem i16 %tmp17, 6
%tmp19 = sub nuw nsw i16 5, %tmp18
%tmp20 = zext i16 %tmp19 to i32
- %tmp21 = getelementptr inbounds [6 x double], [6 x double] addrspace(5)* %tmp, i32 0, i32 %tmp20
- %tmp22 = bitcast double addrspace(5)* %tmp21 to i64 addrspace(5)*
- %tmp23 = load i64, i64 addrspace(5)* %tmp22, align 8
+ %tmp21 = getelementptr inbounds [6 x double], ptr addrspace(5) %tmp, i32 0, i32 %tmp20
+ %tmp23 = load i64, ptr addrspace(5) %tmp21, align 8
%tmp24 = zext i32 %tmp16 to i64
- %tmp25 = getelementptr inbounds double, double addrspace(1)* %arg, i64 %tmp24
- %tmp26 = bitcast double addrspace(1)* %tmp25 to i64 addrspace(1)*
- store i64 %tmp23, i64 addrspace(1)* %tmp26, align 8
+ %tmp25 = getelementptr inbounds double, ptr addrspace(1) %arg, i64 %tmp24
+ store i64 %tmp23, ptr addrspace(1) %tmp25, align 8
%tmp27 = add nuw nsw i32 %tmp16, 1
%tmp28 = icmp eq i32 %tmp27, 1000
br i1 %tmp28, label %bb15, label %.preheader
; GCN-PROMOTE: ScratchSize: 0
-define amdgpu_kernel void @vector_write_read_bitcast_to_i64(i64 addrspace(1)* %arg) {
+define amdgpu_kernel void @vector_write_read_bitcast_to_i64(ptr addrspace(1) %arg) {
bb:
%tmp = alloca [6 x i64], align 8, addrspace(5)
- %tmp1 = bitcast [6 x i64] addrspace(5)* %tmp to i8 addrspace(5)*
- call void @llvm.lifetime.start.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
+ call void @llvm.lifetime.start.p5(i64 48, ptr addrspace(5) %tmp) #2
br label %bb2
bb2: ; preds = %bb2, %bb
%tmp3 = phi i32 [ 0, %bb ], [ %tmp11, %bb2 ]
%tmp4 = zext i32 %tmp3 to i64
- %tmp5 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp4
- %tmp6 = load i64, i64 addrspace(1)* %tmp5, align 8
+ %tmp5 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmp4
+ %tmp6 = load i64, ptr addrspace(1) %tmp5, align 8
%tmp7 = trunc i32 %tmp3 to i16
%tmp8 = urem i16 %tmp7, 6
%tmp9 = zext i16 %tmp8 to i32
- %tmp10 = getelementptr inbounds [6 x i64], [6 x i64] addrspace(5)* %tmp, i32 0, i32 %tmp9
- store i64 %tmp6, i64 addrspace(5)* %tmp10, align 8
+ %tmp10 = getelementptr inbounds [6 x i64], ptr addrspace(5) %tmp, i32 0, i32 %tmp9
+ store i64 %tmp6, ptr addrspace(5) %tmp10, align 8
%tmp11 = add nuw nsw i32 %tmp3, 1
%tmp12 = icmp eq i32 %tmp11, 1000
br i1 %tmp12, label %.preheader, label %bb2
bb13: ; preds = %.preheader
- call void @llvm.lifetime.end.p5i8(i64 48, i8 addrspace(5)* %tmp1) #2
+ call void @llvm.lifetime.end.p5(i64 48, ptr addrspace(5) %tmp) #2
ret void
.preheader: ; preds = %.preheader, %bb2
%tmp16 = urem i16 %tmp15, 6
%tmp17 = sub nuw nsw i16 5, %tmp16
%tmp18 = zext i16 %tmp17 to i32
- %tmp19 = getelementptr inbounds [6 x i64], [6 x i64] addrspace(5)* %tmp, i32 0, i32 %tmp18
- %tmp20 = load i64, i64 addrspace(5)* %tmp19, align 8
+ %tmp19 = getelementptr inbounds [6 x i64], ptr addrspace(5) %tmp, i32 0, i32 %tmp18
+ %tmp20 = load i64, ptr addrspace(5) %tmp19, align 8
%tmp21 = zext i32 %tmp14 to i64
- %tmp22 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp21
- store i64 %tmp20, i64 addrspace(1)* %tmp22, align 8
+ %tmp22 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmp21
+ store i64 %tmp20, ptr addrspace(1) %tmp22, align 8
%tmp23 = add nuw nsw i32 %tmp14, 1
%tmp24 = icmp eq i32 %tmp23, 1000
br i1 %tmp24, label %bb13, label %.preheader
; OPT-LABEL: @vector_read_alloca_bitcast_assume(
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
-; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
+; OPT: store i32 %0, ptr addrspace(1) %out, align 4
; GCN-LABEL: {{^}}vector_read_alloca_bitcast_assume:
; GCN-COUNT-4: buffer_store_dword
-define amdgpu_kernel void @vector_read_alloca_bitcast_assume(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read_alloca_bitcast_assume(ptr addrspace(1) %out, i32 %index) {
entry:
%tmp = alloca [4 x i32], addrspace(5)
- %x = bitcast [4 x i32] addrspace(5)* %tmp to i32 addrspace(5)*
- %cmp = icmp ne i32 addrspace(5)* %x, null
+ %cmp = icmp ne ptr addrspace(5) %tmp, null
call void @llvm.assume(i1 %cmp)
- %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
- %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
- %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
- store i32 0, i32 addrspace(5)* %x
- store i32 1, i32 addrspace(5)* %y
- store i32 2, i32 addrspace(5)* %z
- store i32 3, i32 addrspace(5)* %w
- %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
- %tmp2 = load i32, i32 addrspace(5)* %tmp1
- store i32 %tmp2, i32 addrspace(1)* %out
+ %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+ %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
+ %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
+ store i32 0, ptr addrspace(5) %tmp
+ store i32 1, ptr addrspace(5) %y
+ store i32 2, ptr addrspace(5) %z
+ store i32 3, ptr addrspace(5) %w
+ %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
+ %tmp2 = load i32, ptr addrspace(5) %tmp1
+ store i32 %tmp2, ptr addrspace(1) %out
ret void
}
; OPT-NOT: alloca
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
; OPT-NEXT: %add2 = add nuw nsw i32 %0, 1
-; OPT-NEXT: store i32 %add2, i32 addrspace(1)* %out, align 4
+; OPT-NEXT: store i32 %add2, ptr addrspace(1) %out, align 4
; GCN-LABEL: {{^}}vector_read_alloca_multiuse:
; GCN-ALLOCA-COUNT-4: buffer_store_dword
; GCN-PROMOTE: ScratchSize: 0
-define amdgpu_kernel void @vector_read_alloca_multiuse(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read_alloca_multiuse(ptr addrspace(1) %out, i32 %index) {
entry:
%tmp = alloca [4 x i32], addrspace(5)
- %b = bitcast [4 x i32] addrspace(5)* %tmp to float addrspace(5)*
- %x = bitcast float addrspace(5)* %b to i32 addrspace(5)*
- %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
- %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
- %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
- store i32 0, i32 addrspace(5)* %x
- store i32 1, i32 addrspace(5)* %y
- store i32 2, i32 addrspace(5)* %z
- store i32 3, i32 addrspace(5)* %w
- %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
- %tmp2 = load i32, i32 addrspace(5)* %tmp1
- %tmp3 = load i32, i32 addrspace(5)* %x
- %tmp4 = load i32, i32 addrspace(5)* %y
+ %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+ %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
+ %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
+ store i32 0, ptr addrspace(5) %tmp
+ store i32 1, ptr addrspace(5) %y
+ store i32 2, ptr addrspace(5) %z
+ store i32 3, ptr addrspace(5) %w
+ %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
+ %tmp2 = load i32, ptr addrspace(5) %tmp1
+ %tmp3 = load i32, ptr addrspace(5) %tmp
+ %tmp4 = load i32, ptr addrspace(5) %y
%add1 = add i32 %tmp2, %tmp3
%add2 = add i32 %add1, %tmp4
- store i32 %add2, i32 addrspace(1)* %out
+ store i32 %add2, ptr addrspace(1) %out
ret void
}
; OPT-LABEL: @bitcast_vector_to_vector(
; OPT-NOT: alloca
-; OPT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(1)* %out, align 16
+; OPT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(1) %out, align 16
; GCN-LABEL: {{^}}bitcast_vector_to_vector:
; GCN: v_mov_b32_e32 v0, 1
; GCN: ScratchSize: 0
-define amdgpu_kernel void @bitcast_vector_to_vector(<4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @bitcast_vector_to_vector(ptr addrspace(1) %out) {
.entry:
%alloca = alloca <4 x float>, align 16, addrspace(5)
- %cast = bitcast <4 x float> addrspace(5)* %alloca to <4 x i32> addrspace(5)*
- store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
- %load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16
- store <4 x i32> %load, <4 x i32> addrspace(1)* %out
+ store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %alloca
+ %load = load <4 x i32>, ptr addrspace(5) %alloca, align 16
+ store <4 x i32> %load, ptr addrspace(1) %out
ret void
}
; OPT-LABEL: @vector_bitcast_from_alloca_array(
; OPT-NOT: alloca
-; OPT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(1)* %out, align 16
+; OPT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(1) %out, align 16
; GCN-LABEL: {{^}}vector_bitcast_from_alloca_array:
; GCN: v_mov_b32_e32 v0, 1
; GCN: ScratchSize: 0
-define amdgpu_kernel void @vector_bitcast_from_alloca_array(<4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @vector_bitcast_from_alloca_array(ptr addrspace(1) %out) {
.entry:
%alloca = alloca [4 x float], align 16, addrspace(5)
- %cast = bitcast [4 x float] addrspace(5)* %alloca to <4 x i32> addrspace(5)*
- store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
- %load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16
- store <4 x i32> %load, <4 x i32> addrspace(1)* %out
+ store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %alloca
+ %load = load <4 x i32>, ptr addrspace(5) %alloca, align 16
+ store <4 x i32> %load, ptr addrspace(1) %out
ret void
}
; OPT-LABEL: @vector_bitcast_to_array_from_alloca_array(
; OPT-NOT: alloca
-; OPT: %out.repack = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 0
-; OPT-NEXT: store i32 1, i32 addrspace(1)* %out.repack, align 4
-; OPT-NEXT: %out.repack1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 1
-; OPT-NEXT: store i32 2, i32 addrspace(1)* %out.repack1, align 4
-; OPT-NEXT: %out.repack2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 2
-; OPT-NEXT: store i32 3, i32 addrspace(1)* %out.repack2, align 4
-; OPT-NEXT: %out.repack3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 3
-; OPT-NEXT: store i32 4, i32 addrspace(1)* %out.repack3, align 4
+; OPT-NEXT: store i32 1, ptr addrspace(1) %out, align 4
+; OPT-NEXT: %out.repack1 = getelementptr inbounds [4 x i32], ptr addrspace(1) %out, i64 0, i64 1
+; OPT-NEXT: store i32 2, ptr addrspace(1) %out.repack1, align 4
+; OPT-NEXT: %out.repack2 = getelementptr inbounds [4 x i32], ptr addrspace(1) %out, i64 0, i64 2
+; OPT-NEXT: store i32 3, ptr addrspace(1) %out.repack2, align 4
+; OPT-NEXT: %out.repack3 = getelementptr inbounds [4 x i32], ptr addrspace(1) %out, i64 0, i64 3
+; OPT-NEXT: store i32 4, ptr addrspace(1) %out.repack3, align 4
; GCN-LABEL: {{^}}vector_bitcast_to_array_from_alloca_array:
; GCN: v_mov_b32_e32 v0, 1
; GCN: ScratchSize: 0
-define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array([4 x i32] addrspace(1)* %out) {
-.entry:
+define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array(ptr addrspace(1) %out) {
%alloca = alloca [4 x float], align 16, addrspace(5)
- %cast = bitcast [4 x float] addrspace(5)* %alloca to [4 x i32] addrspace(5)*
- store [4 x i32] [i32 1, i32 2, i32 3, i32 4], [4 x i32] addrspace(5)* %cast
- %load = load [4 x i32], [4 x i32] addrspace(5)* %cast, align 16
- store [4 x i32] %load, [4 x i32] addrspace(1)* %out
+ store [4 x i32] [i32 1, i32 2, i32 3, i32 4], ptr addrspace(5) %alloca
+ %load = load [4 x i32], ptr addrspace(5) %alloca, align 16
+ store [4 x i32] %load, ptr addrspace(1) %out
ret void
}
; OPT-LABEL: @vector_bitcast_to_struct_from_alloca_array(
; OPT-NOT: alloca
-; OPT: %out.repack = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 0
-; OPT-NEXT: store i32 1, i32 addrspace(1)* %out.repack, align 4
-; OPT-NEXT: %out.repack1 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 1
-; OPT-NEXT: store i32 2, i32 addrspace(1)* %out.repack1, align 4
-; OPT-NEXT: %out.repack2 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 2
-; OPT-NEXT: store i32 3, i32 addrspace(1)* %out.repack2, align 4
-; OPT-NEXT: %out.repack3 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 3
-; OPT-NEXT: store i32 4, i32 addrspace(1)* %out.repack3, align 4
+; OPT-NEXT: store i32 1, ptr addrspace(1) %out, align 4
+; OPT-NEXT: %out.repack1 = getelementptr inbounds %struct.v4, ptr addrspace(1) %out, i64 0, i32 1
+; OPT-NEXT: store i32 2, ptr addrspace(1) %out.repack1, align 4
+; OPT-NEXT: %out.repack2 = getelementptr inbounds %struct.v4, ptr addrspace(1) %out, i64 0, i32 2
+; OPT-NEXT: store i32 3, ptr addrspace(1) %out.repack2, align 4
+; OPT-NEXT: %out.repack3 = getelementptr inbounds %struct.v4, ptr addrspace(1) %out, i64 0, i32 3
+; OPT-NEXT: store i32 4, ptr addrspace(1) %out.repack3, align 4
; GCN-LABEL: {{^}}vector_bitcast_to_struct_from_alloca_array:
; GCN: v_mov_b32_e32 v0, 1
%struct.v4 = type { i32, i32, i32, i32 }
-define amdgpu_kernel void @vector_bitcast_to_struct_from_alloca_array(%struct.v4 addrspace(1)* %out) {
-.entry:
+define amdgpu_kernel void @vector_bitcast_to_struct_from_alloca_array(ptr addrspace(1) %out) {
%alloca = alloca [4 x float], align 16, addrspace(5)
- %cast = bitcast [4 x float] addrspace(5)* %alloca to %struct.v4 addrspace(5)*
- store %struct.v4 { i32 1, i32 2, i32 3, i32 4 }, %struct.v4 addrspace(5)* %cast
- %load = load %struct.v4, %struct.v4 addrspace(5)* %cast, align 16
- store %struct.v4 %load, %struct.v4 addrspace(1)* %out
+ store %struct.v4 { i32 1, i32 2, i32 3, i32 4 }, ptr addrspace(5) %alloca
+ %load = load %struct.v4, ptr addrspace(5) %alloca, align 16
+ store %struct.v4 %load, ptr addrspace(1) %out
ret void
}
-declare void @llvm.lifetime.start.p5i8(i64 immarg, i8 addrspace(5)* nocapture)
+declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) nocapture)
-declare void @llvm.lifetime.end.p5i8(i64 immarg, i8 addrspace(5)* nocapture)
+declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture)
declare void @llvm.assume(i1)
; OPT: <8 x i64>
; LIMIT32: alloca
; LIMIT32-NOT: <8 x i64>
-define amdgpu_kernel void @alloca_8xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @alloca_8xi64_max1024(ptr addrspace(1) %out, i32 %index) #0 {
entry:
%tmp = alloca [8 x i64], addrspace(5)
- %x = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 0
- store i64 0, i64 addrspace(5)* %x
- %tmp1 = getelementptr [8 x i64], [8 x i64] addrspace(5)* %tmp, i32 0, i32 %index
- %tmp2 = load i64, i64 addrspace(5)* %tmp1
- store i64 %tmp2, i64 addrspace(1)* %out
+ store i64 0, ptr addrspace(5) %tmp
+ %tmp1 = getelementptr [8 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index
+ %tmp2 = load i64, ptr addrspace(5) %tmp1
+ store i64 %tmp2, ptr addrspace(1) %out
ret void
}
; OPT-NOT: <9 x i64>
; LIMIT32: alloca
; LIMIT32-NOT: <9 x i64>
-define amdgpu_kernel void @alloca_9xi64_max1024(i64 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @alloca_9xi64_max1024(ptr addrspace(1) %out, i32 %index) #0 {
entry:
%tmp = alloca [9 x i64], addrspace(5)
- %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
- store i64 0, i64 addrspace(5)* %x
- %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
- %tmp2 = load i64, i64 addrspace(5)* %tmp1
- store i64 %tmp2, i64 addrspace(1)* %out
+ store i64 0, ptr addrspace(5) %tmp
+ %tmp1 = getelementptr [9 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index
+ %tmp2 = load i64, ptr addrspace(5) %tmp1
+ store i64 %tmp2, ptr addrspace(1) %out
ret void
}
; OPT: <16 x i64>
; LIMIT32: alloca
; LIMIT32-NOT: <16 x i64>
-define amdgpu_kernel void @alloca_16xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
+define amdgpu_kernel void @alloca_16xi64_max512(ptr addrspace(1) %out, i32 %index) #1 {
entry:
%tmp = alloca [16 x i64], addrspace(5)
- %x = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 0
- store i64 0, i64 addrspace(5)* %x
- %tmp1 = getelementptr [16 x i64], [16 x i64] addrspace(5)* %tmp, i32 0, i32 %index
- %tmp2 = load i64, i64 addrspace(5)* %tmp1
- store i64 %tmp2, i64 addrspace(1)* %out
+ store i64 0, ptr addrspace(5) %tmp
+ %tmp1 = getelementptr [16 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index
+ %tmp2 = load i64, ptr addrspace(5) %tmp1
+ store i64 %tmp2, ptr addrspace(1) %out
ret void
}
; OPT-NOT: <17 x i64>
; LIMIT32: alloca
; LIMIT32-NOT: <17 x i64>
-define amdgpu_kernel void @alloca_17xi64_max512(i64 addrspace(1)* %out, i32 %index) #1 {
+define amdgpu_kernel void @alloca_17xi64_max512(ptr addrspace(1) %out, i32 %index) #1 {
entry:
%tmp = alloca [17 x i64], addrspace(5)
- %x = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 0
- store i64 0, i64 addrspace(5)* %x
- %tmp1 = getelementptr [17 x i64], [17 x i64] addrspace(5)* %tmp, i32 0, i32 %index
- %tmp2 = load i64, i64 addrspace(5)* %tmp1
- store i64 %tmp2, i64 addrspace(1)* %out
+ store i64 0, ptr addrspace(5) %tmp
+ %tmp1 = getelementptr [17 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index
+ %tmp2 = load i64, ptr addrspace(5) %tmp1
+ store i64 %tmp2, ptr addrspace(1) %out
ret void
}
; OPT-NOT: <9 x i128>
; LIMIT32: alloca
; LIMIT32-NOT: <9 x i128>
-define amdgpu_kernel void @alloca_9xi128_max512(i128 addrspace(1)* %out, i32 %index) #1 {
+define amdgpu_kernel void @alloca_9xi128_max512(ptr addrspace(1) %out, i32 %index) #1 {
entry:
%tmp = alloca [9 x i128], addrspace(5)
- %x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
- store i128 0, i128 addrspace(5)* %x
- %tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
- %tmp2 = load i128, i128 addrspace(5)* %tmp1
- store i128 %tmp2, i128 addrspace(1)* %out
+ store i128 0, ptr addrspace(5) %tmp
+ %tmp1 = getelementptr [9 x i128], ptr addrspace(5) %tmp, i32 0, i32 %index
+ %tmp2 = load i128, ptr addrspace(5) %tmp1
+ store i128 %tmp2, ptr addrspace(1) %out
ret void
}
; OPT: <9 x i128>
; LIMIT32: alloca
; LIMIT32-NOT: <9 x i128>
-define amdgpu_kernel void @alloca_9xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
+define amdgpu_kernel void @alloca_9xi128_max256(ptr addrspace(1) %out, i32 %index) #2 {
entry:
%tmp = alloca [9 x i128], addrspace(5)
- %x = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 0
- store i128 0, i128 addrspace(5)* %x
- %tmp1 = getelementptr [9 x i128], [9 x i128] addrspace(5)* %tmp, i32 0, i32 %index
- %tmp2 = load i128, i128 addrspace(5)* %tmp1
- store i128 %tmp2, i128 addrspace(1)* %out
+ store i128 0, ptr addrspace(5) %tmp
+ %tmp1 = getelementptr [9 x i128], ptr addrspace(5) %tmp, i32 0, i32 %index
+ %tmp2 = load i128, ptr addrspace(5) %tmp1
+ store i128 %tmp2, ptr addrspace(1) %out
ret void
}
; OPT: <16 x i128>
; LIMIT32: alloca
; LIMIT32-NOT: <16 x i128>
-define amdgpu_kernel void @alloca_16xi128_max256(i128 addrspace(1)* %out, i32 %index) #2 {
+define amdgpu_kernel void @alloca_16xi128_max256(ptr addrspace(1) %out, i32 %index) #2 {
entry:
%tmp = alloca [16 x i128], addrspace(5)
- %x = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 0
- store i128 0, i128 addrspace(5)* %x
- %tmp1 = getelementptr [16 x i128], [16 x i128] addrspace(5)* %tmp, i32 0, i32 %index
- %tmp2 = load i128, i128 addrspace(5)* %tmp1
- store i128 %tmp2, i128 addrspace(1)* %out
+ store i128 0, ptr addrspace(5) %tmp
+ %tmp1 = getelementptr [16 x i128], ptr addrspace(5) %tmp, i32 0, i32 %index
+ %tmp2 = load i128, ptr addrspace(5) %tmp1
+ store i128 %tmp2, ptr addrspace(1) %out
ret void
}
; OPT-NOT: <9 x i256>
; LIMIT32: alloca
; LIMIT32-NOT: <9 x i256>
-define amdgpu_kernel void @alloca_9xi256_max256(i256 addrspace(1)* %out, i32 %index) #2 {
+define amdgpu_kernel void @alloca_9xi256_max256(ptr addrspace(1) %out, i32 %index) #2 {
entry:
%tmp = alloca [9 x i256], addrspace(5)
- %x = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 0
- store i256 0, i256 addrspace(5)* %x
- %tmp1 = getelementptr [9 x i256], [9 x i256] addrspace(5)* %tmp, i32 0, i32 %index
- %tmp2 = load i256, i256 addrspace(5)* %tmp1
- store i256 %tmp2, i256 addrspace(1)* %out
+ store i256 0, ptr addrspace(5) %tmp
+ %tmp1 = getelementptr [9 x i256], ptr addrspace(5) %tmp, i32 0, i32 %index
+ %tmp2 = load i256, ptr addrspace(5) %tmp1
+ store i256 %tmp2, ptr addrspace(1) %out
ret void
}
; OPT: <9 x i64>
; LIMIT32: alloca
; LIMIT32-NOT: <9 x i64>
-define amdgpu_kernel void @alloca_9xi64_max256(i64 addrspace(1)* %out, i32 %index) #2 {
+define amdgpu_kernel void @alloca_9xi64_max256(ptr addrspace(1) %out, i32 %index) #2 {
entry:
%tmp = alloca [9 x i64], addrspace(5)
- %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
- store i64 0, i64 addrspace(5)* %x
- %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
- %tmp2 = load i64, i64 addrspace(5)* %tmp1
- store i64 %tmp2, i64 addrspace(1)* %out
+ store i64 0, ptr addrspace(5) %tmp
+ %tmp1 = getelementptr [9 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index
+ %tmp2 = load i64, ptr addrspace(5) %tmp1
+ store i64 %tmp2, ptr addrspace(1) %out
ret void
}
; OPT-NOT: <9 x i64>
; LIMIT32: alloca
; LIMIT32-NOT: <9 x i64>
-define void @func_alloca_9xi64_max256(i64 addrspace(1)* %out, i32 %index) #2 {
+define void @func_alloca_9xi64_max256(ptr addrspace(1) %out, i32 %index) #2 {
entry:
%tmp = alloca [9 x i64], addrspace(5)
- %x = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 0
- store i64 0, i64 addrspace(5)* %x
- %tmp1 = getelementptr [9 x i64], [9 x i64] addrspace(5)* %tmp, i32 0, i32 %index
- %tmp2 = load i64, i64 addrspace(5)* %tmp1
- store i64 %tmp2, i64 addrspace(1)* %out
+ store i64 0, ptr addrspace(5) %tmp
+ %tmp1 = getelementptr [9 x i64], ptr addrspace(5) %tmp, i32 0, i32 %index
+ %tmp2 = load i64, ptr addrspace(5) %tmp1
+ store i64 %tmp2, ptr addrspace(1) %out
ret void
}
; OPT-LABEL: @vector_read(
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
-; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
+; OPT: store i32 %0, ptr addrspace(1) %out, align 4
; FUNC-LABEL: {{^}}vector_read:
; EG: MOV
; EG: MOV
; EG: MOV
; EG: MOVA_INT
-define amdgpu_kernel void @vector_read(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read(ptr addrspace(1) %out, i32 %index) {
entry:
%tmp = alloca [4 x i32], addrspace(5)
- %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
- %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
- %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
- %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
- store i32 0, i32 addrspace(5)* %x
- store i32 1, i32 addrspace(5)* %y
- store i32 2, i32 addrspace(5)* %z
- store i32 3, i32 addrspace(5)* %w
- %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
- %tmp2 = load i32, i32 addrspace(5)* %tmp1
- store i32 %tmp2, i32 addrspace(1)* %out
+ %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+ %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
+ %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
+ store i32 0, ptr addrspace(5) %tmp
+ store i32 1, ptr addrspace(5) %y
+ store i32 2, ptr addrspace(5) %z
+ store i32 3, ptr addrspace(5) %w
+ %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
+ %tmp2 = load i32, ptr addrspace(5) %tmp1
+ store i32 %tmp2, ptr addrspace(1) %out
ret void
}
; OPT-LABEL: @vector_write(
; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index
; OPT: %1 = extractelement <4 x i32> %0, i32 %r_index
-; OPT: store i32 %1, i32 addrspace(1)* %out, align 4
+; OPT: store i32 %1, ptr addrspace(1) %out, align 4
; FUNC-LABEL: {{^}}vector_write:
; EG: MOV
; EG: MOV
; EG: MOVA_INT
; EG: MOVA_INT
-define amdgpu_kernel void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+define amdgpu_kernel void @vector_write(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) {
entry:
%tmp = alloca [4 x i32], addrspace(5)
- %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
- %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
- %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
- %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
- store i32 0, i32 addrspace(5)* %x
- store i32 0, i32 addrspace(5)* %y
- store i32 0, i32 addrspace(5)* %z
- store i32 0, i32 addrspace(5)* %w
- %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %w_index
- store i32 1, i32 addrspace(5)* %tmp1
- %tmp2 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %r_index
- %tmp3 = load i32, i32 addrspace(5)* %tmp2
- store i32 %tmp3, i32 addrspace(1)* %out
+ %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+ %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
+ %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
+ store i32 0, ptr addrspace(5) %tmp
+ store i32 0, ptr addrspace(5) %y
+ store i32 0, ptr addrspace(5) %z
+ store i32 0, ptr addrspace(5) %w
+ %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %w_index
+ store i32 1, ptr addrspace(5) %tmp1
+ %tmp2 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %r_index
+ %tmp3 = load i32, ptr addrspace(5) %tmp2
+ store i32 %tmp3, ptr addrspace(1) %out
ret void
}
; This test should be optimize to:
-; store i32 0, i32 addrspace(1)* %out
+; store i32 0, ptr addrspace(1) %out
; OPT-LABEL: @bitcast_gep(
-; OPT-LABEL: store i32 0, i32 addrspace(1)* %out, align 4
+; OPT: store i32 0, ptr addrspace(1) %out, align 4
; FUNC-LABEL: {{^}}bitcast_gep:
; EG: STORE_RAW
-define amdgpu_kernel void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+define amdgpu_kernel void @bitcast_gep(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) {
entry:
%tmp = alloca [4 x i32], addrspace(5)
- %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
- %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
- %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
- %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
- store i32 0, i32 addrspace(5)* %x
- store i32 0, i32 addrspace(5)* %y
- store i32 0, i32 addrspace(5)* %z
- store i32 0, i32 addrspace(5)* %w
- %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
- %tmp2 = bitcast i32 addrspace(5)* %tmp1 to [4 x i32] addrspace(5)*
- %tmp3 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp2, i32 0, i32 0
- %tmp4 = load i32, i32 addrspace(5)* %tmp3
- store i32 %tmp4, i32 addrspace(1)* %out
+ %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+ %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
+ %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
+ store i32 0, ptr addrspace(5) %tmp
+ store i32 0, ptr addrspace(5) %y
+ store i32 0, ptr addrspace(5) %z
+ store i32 0, ptr addrspace(5) %w
+ %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+ %tmp4 = load i32, ptr addrspace(5) %tmp1
+ store i32 %tmp4, ptr addrspace(1) %out
ret void
}
; OPT-LABEL: @vector_read_bitcast_gep(
; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index
-; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
-define amdgpu_kernel void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) {
+; OPT: store i32 %0, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @vector_read_bitcast_gep(ptr addrspace(1) %out, i32 %index) {
entry:
%tmp = alloca [4 x i32], addrspace(5)
- %x = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
- %y = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
- %z = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
- %w = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
- %bc = bitcast i32 addrspace(5)* %x to float addrspace(5)*
- store float 1.0, float addrspace(5)* %bc
- store i32 1, i32 addrspace(5)* %y
- store i32 2, i32 addrspace(5)* %z
- store i32 3, i32 addrspace(5)* %w
- %tmp1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
- %tmp2 = load i32, i32 addrspace(5)* %tmp1
- store i32 %tmp2, i32 addrspace(1)* %out
+ %y = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+ %z = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
+ %w = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
+ store float 1.0, ptr addrspace(5) %tmp
+ store i32 1, ptr addrspace(5) %y
+ store i32 2, ptr addrspace(5) %z
+ store i32 3, ptr addrspace(5) %w
+ %tmp1 = getelementptr inbounds [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
+ %tmp2 = load i32, ptr addrspace(5) %tmp1
+ store i32 %tmp2, ptr addrspace(1) %out
ret void
}
; OPT-LABEL: @vector_read_bitcast_alloca(
; OPT: %0 = extractelement <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, i32 %index
-; OPT: store float %0, float addrspace(1)* %out, align 4
-define amdgpu_kernel void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
+; OPT: store float %0, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @vector_read_bitcast_alloca(ptr addrspace(1) %out, i32 %index) {
entry:
%tmp = alloca [4 x i32], addrspace(5)
- %tmp.bc = bitcast [4 x i32] addrspace(5)* %tmp to [4 x float] addrspace(5)*
- %x = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 0
- %y = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 1
- %z = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 2
- %w = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 3
- store float 0.0, float addrspace(5)* %x
- store float 1.0, float addrspace(5)* %y
- store float 2.0, float addrspace(5)* %z
- store float 4.0, float addrspace(5)* %w
- %tmp1 = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %tmp.bc, i32 0, i32 %index
- %tmp2 = load float, float addrspace(5)* %tmp1
- store float %tmp2, float addrspace(1)* %out
+ %y = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 1
+ %z = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 2
+ %w = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 3
+ store float 0.0, ptr addrspace(5) %tmp
+ store float 1.0, ptr addrspace(5) %y
+ store float 2.0, ptr addrspace(5) %z
+ store float 4.0, ptr addrspace(5) %w
+ %tmp1 = getelementptr inbounds [4 x float], ptr addrspace(5) %tmp, i32 0, i32 %index
+ %tmp2 = load float, ptr addrspace(5) %tmp1
+ store float %tmp2, ptr addrspace(1) %out
ret void
}
; OPT-LABEL: @vector_read_with_local_arg(
; OPT: %0 = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
-; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
-define amdgpu_kernel void @vector_read_with_local_arg(i32 addrspace(3)* %stopper, i32 addrspace(1)* %out, i32 %index) {
+; OPT: store i32 %0, ptr addrspace(1) %out, align 4
+define amdgpu_kernel void @vector_read_with_local_arg(ptr addrspace(3) %stopper, ptr addrspace(1) %out, i32 %index) {
entry:
%tmp = alloca [4 x i32], addrspace(5)
- %x = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 0
- %y = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 1
- %z = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 2
- %w = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 3
- store i32 0, i32 addrspace(5)* %x
- store i32 1, i32 addrspace(5)* %y
- store i32 2, i32 addrspace(5)* %z
- store i32 3, i32 addrspace(5)* %w
- %tmp1 = getelementptr [4 x i32], [4 x i32] addrspace(5)* %tmp, i32 0, i32 %index
- %tmp2 = load i32, i32 addrspace(5)* %tmp1
- store i32 %tmp2, i32 addrspace(1)* %out
+ %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1
+ %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2
+ %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3
+ store i32 0, ptr addrspace(5) %tmp
+ store i32 1, ptr addrspace(5) %y
+ store i32 2, ptr addrspace(5) %z
+ store i32 3, ptr addrspace(5) %w
+ %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index
+ %tmp2 = load i32, ptr addrspace(5) %tmp1
+ store i32 %tmp2, ptr addrspace(1) %out
ret void
}