-; RUN: opt -S -passes=load-store-vectorizer --mcpu=hawaii -mattr=-unaligned-access-mode,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s
-; RUN: opt -S -passes=load-store-vectorizer --mcpu=hawaii -mattr=+unaligned-access-mode,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s
-; RUN: opt -S -passes='function(load-store-vectorizer)' --mcpu=hawaii -mattr=-unaligned-access-mode,+max-private-element-size-16 < %s | FileCheck -check-prefix=ALIGNED -check-prefix=ALL %s
-; RUN: opt -S -passes='function(load-store-vectorizer)' --mcpu=hawaii -mattr=+unaligned-access-mode,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefix=UNALIGNED -check-prefix=ALL %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=load-store-vectorizer --mcpu=hawaii -mattr=-unaligned-access-mode,+max-private-element-size-16 < %s | FileCheck -check-prefixes=CHECK,ALIGNED %s
+; RUN: opt -S -passes=load-store-vectorizer --mcpu=hawaii -mattr=+unaligned-access-mode,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefixes=CHECK,UNALIGNED %s
+; RUN: opt -S -passes='function(load-store-vectorizer)' --mcpu=hawaii -mattr=-unaligned-access-mode,+max-private-element-size-16 < %s | FileCheck -check-prefixes=CHECK,ALIGNED %s
+; RUN: opt -S -passes='function(load-store-vectorizer)' --mcpu=hawaii -mattr=+unaligned-access-mode,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefixes=CHECK,UNALIGNED %s
target triple = "amdgcn--"
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
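; The ALIGNED prefixes run with unaligned scratch access disabled, so the
; vectorizer must leave under-aligned private accesses scalar; the UNALIGNED
; prefixes enable +unaligned-access-mode and +unaligned-scratch-access, so the
; same accesses may be merged.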
-; ALL-LABEL: @load_unknown_offset_align1_i8(
-; ALL: alloca [128 x i8], align 1
-; UNALIGNED: load <2 x i8>, <2 x i8> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
define amdgpu_kernel void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
+; ALIGNED-LABEL: @load_unknown_offset_align1_i8(
+; ALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i8], align 1, addrspace(5)
+; ALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; ALIGNED-NEXT: [[VAL0:%.*]] = load i8, i8 addrspace(5)* [[PTR0]], align 1
+; ALIGNED-NEXT: [[PTR1:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[PTR0]], i32 1
+; ALIGNED-NEXT: [[VAL1:%.*]] = load i8, i8 addrspace(5)* [[PTR1]], align 1
+; ALIGNED-NEXT: [[ADD:%.*]] = add i8 [[VAL0]], [[VAL1]]
+; ALIGNED-NEXT: store i8 [[ADD]], i8 addrspace(1)* [[OUT:%.*]], align 1
+; ALIGNED-NEXT: ret void
+;
+; UNALIGNED-LABEL: @load_unknown_offset_align1_i8(
+; UNALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i8], align 1, addrspace(5)
+; UNALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; UNALIGNED-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[PTR0]] to <2 x i8> addrspace(5)*
+; UNALIGNED-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8> addrspace(5)* [[TMP1]], align 1
+; UNALIGNED-NEXT: [[VAL01:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
+; UNALIGNED-NEXT: [[VAL12:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
+; UNALIGNED-NEXT: [[ADD:%.*]] = add i8 [[VAL01]], [[VAL12]]
+; UNALIGNED-NEXT: store i8 [[ADD]], i8 addrspace(1)* [[OUT:%.*]], align 1
+; UNALIGNED-NEXT: ret void
+;
%alloca = alloca [128 x i8], align 1, addrspace(5)
%ptr0 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %alloca, i32 0, i32 %offset
%val0 = load i8, i8 addrspace(5)* %ptr0, align 1
%ptr1 = getelementptr inbounds i8, i8 addrspace(5)* %ptr0, i32 1
%val1 = load i8, i8 addrspace(5)* %ptr1, align 1
%add = add i8 %val0, %val1
store i8 %add, i8 addrspace(1)* %out
ret void
}
-; ALL-LABEL: @load_unknown_offset_align1_i16(
-; ALL: alloca [128 x i16], align 1, addrspace(5){{$}}
-; UNALIGNED: load <2 x i16>, <2 x i16> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; ALIGNED: load i16, i16 addrspace(5)* %ptr0, align 1{{$}}
-; ALIGNED: load i16, i16 addrspace(5)* %ptr1, align 1{{$}}
define amdgpu_kernel void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
+; ALIGNED-LABEL: @load_unknown_offset_align1_i16(
+; ALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i16], align 1, addrspace(5)
+; ALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; ALIGNED-NEXT: [[VAL0:%.*]] = load i16, i16 addrspace(5)* [[PTR0]], align 1
+; ALIGNED-NEXT: [[PTR1:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[PTR0]], i32 1
+; ALIGNED-NEXT: [[VAL1:%.*]] = load i16, i16 addrspace(5)* [[PTR1]], align 1
+; ALIGNED-NEXT: [[ADD:%.*]] = add i16 [[VAL0]], [[VAL1]]
+; ALIGNED-NEXT: store i16 [[ADD]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; ALIGNED-NEXT: ret void
+;
+; UNALIGNED-LABEL: @load_unknown_offset_align1_i16(
+; UNALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i16], align 1, addrspace(5)
+; UNALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; UNALIGNED-NEXT: [[TMP1:%.*]] = bitcast i16 addrspace(5)* [[PTR0]] to <2 x i16> addrspace(5)*
+; UNALIGNED-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16> addrspace(5)* [[TMP1]], align 1
+; UNALIGNED-NEXT: [[VAL01:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
+; UNALIGNED-NEXT: [[VAL12:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
+; UNALIGNED-NEXT: [[ADD:%.*]] = add i16 [[VAL01]], [[VAL12]]
+; UNALIGNED-NEXT: store i16 [[ADD]], i16 addrspace(1)* [[OUT:%.*]], align 2
+; UNALIGNED-NEXT: ret void
+;
%alloca = alloca [128 x i16], align 1, addrspace(5)
%ptr0 = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* %alloca, i32 0, i32 %offset
%val0 = load i16, i16 addrspace(5)* %ptr0, align 1
%ptr1 = getelementptr inbounds i16, i16 addrspace(5)* %ptr0, i32 1
%val1 = load i16, i16 addrspace(5)* %ptr1, align 1
%add = add i16 %val0, %val1
store i16 %add, i16 addrspace(1)* %out
ret void
}
; FIXME: Although the offset is unknown here, we know it is a multiple
; of the element size, so should still be align 4
-
-; ALL-LABEL: @load_unknown_offset_align1_i32(
-; ALL: alloca [128 x i32], align 1
-; UNALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; ALIGNED: load i32, i32 addrspace(5)* %ptr0, align 1
-; ALIGNED: load i32, i32 addrspace(5)* %ptr1, align 1
define amdgpu_kernel void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
+; ALIGNED-LABEL: @load_unknown_offset_align1_i32(
+; ALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i32], align 1, addrspace(5)
+; ALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; ALIGNED-NEXT: [[VAL0:%.*]] = load i32, i32 addrspace(5)* [[PTR0]], align 1
+; ALIGNED-NEXT: [[PTR1:%.*]] = getelementptr inbounds i32, i32 addrspace(5)* [[PTR0]], i32 1
+; ALIGNED-NEXT: [[VAL1:%.*]] = load i32, i32 addrspace(5)* [[PTR1]], align 1
+; ALIGNED-NEXT: [[ADD:%.*]] = add i32 [[VAL0]], [[VAL1]]
+; ALIGNED-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; ALIGNED-NEXT: ret void
+;
+; UNALIGNED-LABEL: @load_unknown_offset_align1_i32(
+; UNALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i32], align 1, addrspace(5)
+; UNALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; UNALIGNED-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(5)* [[PTR0]] to <2 x i32> addrspace(5)*
+; UNALIGNED-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32> addrspace(5)* [[TMP1]], align 1
+; UNALIGNED-NEXT: [[VAL01:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; UNALIGNED-NEXT: [[VAL12:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; UNALIGNED-NEXT: [[ADD:%.*]] = add i32 [[VAL01]], [[VAL12]]
+; UNALIGNED-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; UNALIGNED-NEXT: ret void
+;
%alloca = alloca [128 x i32], align 1, addrspace(5)
%ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
%val0 = load i32, i32 addrspace(5)* %ptr0, align 1
%ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1
%val1 = load i32, i32 addrspace(5)* %ptr1, align 1
%add = add i32 %val0, %val1
store i32 %add, i32 addrspace(1)* %out
ret void
}
; Make sure alloca alignment isn't decreased
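; The alloca stays align 16 while the merged <2 x i32> load only claims align 4.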
-; ALL-LABEL: @load_alloca16_unknown_offset_align1_i32(
-; ALL: alloca [128 x i32], align 16
-
-; ALL: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 4{{$}}
define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
+; CHECK-LABEL: @load_alloca16_unknown_offset_align1_i32(
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [128 x i32], align 16, addrspace(5)
+; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(5)* [[PTR0]] to <2 x i32> addrspace(5)*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32> addrspace(5)* [[TMP1]], align 4
+; CHECK-NEXT: [[VAL01:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[VAL12:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[VAL01]], [[VAL12]]
+; CHECK-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT: ret void
+;
%alloca = alloca [128 x i32], align 16, addrspace(5)
%ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
%val0 = load i32, i32 addrspace(5)* %ptr0, align 1
%ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1
%val1 = load i32, i32 addrspace(5)* %ptr1, align 1
%add = add i32 %val0, %val1
store i32 %add, i32 addrspace(1)* %out
ret void
}
-; ALL-LABEL: @store_unknown_offset_align1_i8(
-; ALL: alloca [128 x i8], align 1
-; UNALIGNED: store <2 x i8> <i8 9, i8 10>, <2 x i8> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; ALIGNED: store i8 9, i8 addrspace(5)* %ptr0, align 1{{$}}
-; ALIGNED: store i8 10, i8 addrspace(5)* %ptr1, align 1{{$}}
define amdgpu_kernel void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
+; ALIGNED-LABEL: @store_unknown_offset_align1_i8(
+; ALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i8], align 1, addrspace(5)
+; ALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; ALIGNED-NEXT: store i8 9, i8 addrspace(5)* [[PTR0]], align 1
+; ALIGNED-NEXT: [[PTR1:%.*]] = getelementptr inbounds i8, i8 addrspace(5)* [[PTR0]], i32 1
+; ALIGNED-NEXT: store i8 10, i8 addrspace(5)* [[PTR1]], align 1
+; ALIGNED-NEXT: ret void
+;
+; UNALIGNED-LABEL: @store_unknown_offset_align1_i8(
+; UNALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i8], align 1, addrspace(5)
+; UNALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; UNALIGNED-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[PTR0]] to <2 x i8> addrspace(5)*
+; UNALIGNED-NEXT: store <2 x i8> <i8 9, i8 10>, <2 x i8> addrspace(5)* [[TMP1]], align 1
+; UNALIGNED-NEXT: ret void
+;
%alloca = alloca [128 x i8], align 1, addrspace(5)
%ptr0 = getelementptr inbounds [128 x i8], [128 x i8] addrspace(5)* %alloca, i32 0, i32 %offset
store i8 9, i8 addrspace(5)* %ptr0, align 1
%ptr1 = getelementptr inbounds i8, i8 addrspace(5)* %ptr0, i32 1
store i8 10, i8 addrspace(5)* %ptr1, align 1
ret void
}
-; ALL-LABEL: @store_unknown_offset_align1_i16(
-; ALL: alloca [128 x i16], align 1
-; UNALIGNED: store <2 x i16> <i16 9, i16 10>, <2 x i16> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; ALIGNED: store i16 9, i16 addrspace(5)* %ptr0, align 1{{$}}
-; ALIGNED: store i16 10, i16 addrspace(5)* %ptr1, align 1{{$}}
define amdgpu_kernel void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
+; ALIGNED-LABEL: @store_unknown_offset_align1_i16(
+; ALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i16], align 1, addrspace(5)
+; ALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; ALIGNED-NEXT: store i16 9, i16 addrspace(5)* [[PTR0]], align 1
+; ALIGNED-NEXT: [[PTR1:%.*]] = getelementptr inbounds i16, i16 addrspace(5)* [[PTR0]], i32 1
+; ALIGNED-NEXT: store i16 10, i16 addrspace(5)* [[PTR1]], align 1
+; ALIGNED-NEXT: ret void
+;
+; UNALIGNED-LABEL: @store_unknown_offset_align1_i16(
+; UNALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i16], align 1, addrspace(5)
+; UNALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; UNALIGNED-NEXT: [[TMP1:%.*]] = bitcast i16 addrspace(5)* [[PTR0]] to <2 x i16> addrspace(5)*
+; UNALIGNED-NEXT: store <2 x i16> <i16 9, i16 10>, <2 x i16> addrspace(5)* [[TMP1]], align 1
+; UNALIGNED-NEXT: ret void
+;
%alloca = alloca [128 x i16], align 1, addrspace(5)
%ptr0 = getelementptr inbounds [128 x i16], [128 x i16] addrspace(5)* %alloca, i32 0, i32 %offset
store i16 9, i16 addrspace(5)* %ptr0, align 1
%ptr1 = getelementptr inbounds i16, i16 addrspace(5)* %ptr0, i32 1
store i16 10, i16 addrspace(5)* %ptr1, align 1
ret void
}
; FIXME: Although the offset is unknown here, we know it is a multiple
; of the element size, so it still should be align 4.
-; ALL-LABEL: @store_unknown_offset_align1_i32(
-; ALL: alloca [128 x i32], align 1
-
-; UNALIGNED: store <2 x i32> <i32 9, i32 10>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; ALIGNED: store i32 9, i32 addrspace(5)* %ptr0, align 1
-; ALIGNED: store i32 10, i32 addrspace(5)* %ptr1, align 1
define amdgpu_kernel void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
+; ALIGNED-LABEL: @store_unknown_offset_align1_i32(
+; ALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i32], align 1, addrspace(5)
+; ALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; ALIGNED-NEXT: store i32 9, i32 addrspace(5)* [[PTR0]], align 1
+; ALIGNED-NEXT: [[PTR1:%.*]] = getelementptr inbounds i32, i32 addrspace(5)* [[PTR0]], i32 1
+; ALIGNED-NEXT: store i32 10, i32 addrspace(5)* [[PTR1]], align 1
+; ALIGNED-NEXT: ret void
+;
+; UNALIGNED-LABEL: @store_unknown_offset_align1_i32(
+; UNALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i32], align 1, addrspace(5)
+; UNALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
+; UNALIGNED-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(5)* [[PTR0]] to <2 x i32> addrspace(5)*
+; UNALIGNED-NEXT: store <2 x i32> <i32 9, i32 10>, <2 x i32> addrspace(5)* [[TMP1]], align 1
+; UNALIGNED-NEXT: ret void
+;
%alloca = alloca [128 x i32], align 1, addrspace(5)
%ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
store i32 9, i32 addrspace(5)* %ptr0, align 1
%ptr1 = getelementptr inbounds i32, i32 addrspace(5)* %ptr0, i32 1
store i32 10, i32 addrspace(5)* %ptr1, align 1
ret void
}
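; With +max-private-element-size-16, four i32 stores to a private alloca merge
; into a single <4 x i32> store, and the alloca's alignment is raised to match.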
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32(
-; ALL: %alloca = alloca [8 x i32], align 4, addrspace(5)
-; ALL: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 4
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32() {
+; CHECK-LABEL: @merge_private_store_4_vector_elts_loads_v4i32(
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [8 x i32], align 4, addrspace(5)
+; CHECK-NEXT: [[OUT:%.*]] = bitcast [8 x i32] addrspace(5)* [[ALLOCA]] to i32 addrspace(5)*
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(5)* [[OUT]] to <4 x i32> addrspace(5)*
+; CHECK-NEXT: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* [[TMP1]], align 4
+; CHECK-NEXT: ret void
+;
%alloca = alloca [8 x i32], align 1, addrspace(5)
%out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
%out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
%out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
store i32 9, i32 addrspace(5)* %out
store i32 1, i32 addrspace(5)* %out.gep.1
store i32 23, i32 addrspace(5)* %out.gep.2
store i32 19, i32 addrspace(5)* %out.gep.3
ret void
}
-; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
-; ALL: %alloca = alloca [8 x i8], align 4, addrspace(5)
-; ALL: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 4
define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8() {
+; CHECK-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [8 x i8], align 4, addrspace(5)
+; CHECK-NEXT: [[OUT:%.*]] = bitcast [8 x i8] addrspace(5)* [[ALLOCA]] to i8 addrspace(5)*
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[OUT]] to <4 x i8> addrspace(5)*
+; CHECK-NEXT: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* [[TMP1]], align 4
+; CHECK-NEXT: ret void
+;
%alloca = alloca [8 x i8], align 1, addrspace(5)
%out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
%out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
%out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
%out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3
store i8 9, i8 addrspace(5)* %out
store i8 1, i8 addrspace(5)* %out.gep.1
store i8 23, i8 addrspace(5)* %out.gep.2
store i8 19, i8 addrspace(5)* %out.gep.3
ret void
}
-; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i32(
-; ALL: %alloca = alloca [8 x i32], align 4, addrspace(5)
-; ALL: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 4
define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i32() {
+; CHECK-LABEL: @merge_private_load_4_vector_elts_loads_v4i32(
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [8 x i32], align 4, addrspace(5)
+; CHECK-NEXT: [[OUT:%.*]] = bitcast [8 x i32] addrspace(5)* [[ALLOCA]] to i32 addrspace(5)*
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(5)* [[OUT]] to <4 x i32> addrspace(5)*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(5)* [[TMP1]], align 4
+; CHECK-NEXT: [[LOAD01:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[LOAD12:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[LOAD23:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; CHECK-NEXT: [[LOAD34:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT: ret void
+;
%alloca = alloca [8 x i32], align 1, addrspace(5)
%out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
%out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
%out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
%load0 = load i32, i32 addrspace(5)* %out
%load1 = load i32, i32 addrspace(5)* %out.gep.1
%load2 = load i32, i32 addrspace(5)* %out.gep.2
%load3 = load i32, i32 addrspace(5)* %out.gep.3
ret void
}
-; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i8(
-; ALL: %alloca = alloca [8 x i8], align 4, addrspace(5)
-; ALL: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 4
define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i8() {
+; CHECK-LABEL: @merge_private_load_4_vector_elts_loads_v4i8(
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [8 x i8], align 4, addrspace(5)
+; CHECK-NEXT: [[OUT:%.*]] = bitcast [8 x i8] addrspace(5)* [[ALLOCA]] to i8 addrspace(5)*
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(5)* [[OUT]] to <4 x i8> addrspace(5)*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8> addrspace(5)* [[TMP1]], align 4
+; CHECK-NEXT: [[LOAD01:%.*]] = extractelement <4 x i8> [[TMP2]], i32 0
+; CHECK-NEXT: [[LOAD12:%.*]] = extractelement <4 x i8> [[TMP2]], i32 1
+; CHECK-NEXT: [[LOAD23:%.*]] = extractelement <4 x i8> [[TMP2]], i32 2
+; CHECK-NEXT: [[LOAD34:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3
+; CHECK-NEXT: ret void
+;
%alloca = alloca [8 x i8], align 1, addrspace(5)
%out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
%out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
%out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
%out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3
%load0 = load i8, i8 addrspace(5)* %out
%load1 = load i8, i8 addrspace(5)* %out.gep.1
%load2 = load i8, i8 addrspace(5)* %out.gep.2
%load3 = load i8, i8 addrspace(5)* %out.gep.3
ret void
}
; Make sure we don't think the alignment will increase if the base address isn't an alloca
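; Only allocas can be realigned by the pass; for an arbitrary private pointer
; it must honor the alignment stated on the instructions.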
-; ALL-LABEL: @private_store_2xi16_align2_not_alloca(
-; ALL: store i16
-; ALL: store i16
define void @private_store_2xi16_align2_not_alloca(i16 addrspace(5)* %p, i16 addrspace(5)* %r) #0 {
+; CHECK-LABEL: @private_store_2xi16_align2_not_alloca(
+; CHECK-NEXT: [[GEP_R:%.*]] = getelementptr i16, i16 addrspace(5)* [[R:%.*]], i32 1
+; CHECK-NEXT: store i16 1, i16 addrspace(5)* [[R]], align 2
+; CHECK-NEXT: store i16 2, i16 addrspace(5)* [[GEP_R]], align 2
+; CHECK-NEXT: ret void
+;
%gep.r = getelementptr i16, i16 addrspace(5)* %r, i32 1
store i16 1, i16 addrspace(5)* %r, align 2
store i16 2, i16 addrspace(5)* %gep.r, align 2
ret void
}
-; ALL-LABEL: @private_store_2xi16_align1_not_alloca(
-; ALIGNED: store i16
-; ALIGNED: store i16
-; UNALIGNED: store <2 x i16>
define void @private_store_2xi16_align1_not_alloca(i16 addrspace(5)* %p, i16 addrspace(5)* %r) #0 {
+; ALIGNED-LABEL: @private_store_2xi16_align1_not_alloca(
+; ALIGNED-NEXT: [[GEP_R:%.*]] = getelementptr i16, i16 addrspace(5)* [[R:%.*]], i32 1
+; ALIGNED-NEXT: store i16 1, i16 addrspace(5)* [[R]], align 1
+; ALIGNED-NEXT: store i16 2, i16 addrspace(5)* [[GEP_R]], align 1
+; ALIGNED-NEXT: ret void
+;
+; UNALIGNED-LABEL: @private_store_2xi16_align1_not_alloca(
+; UNALIGNED-NEXT: [[TMP1:%.*]] = bitcast i16 addrspace(5)* [[R:%.*]] to <2 x i16> addrspace(5)*
+; UNALIGNED-NEXT: store <2 x i16> <i16 1, i16 2>, <2 x i16> addrspace(5)* [[TMP1]], align 1
+; UNALIGNED-NEXT: ret void
+;
%gep.r = getelementptr i16, i16 addrspace(5)* %r, i32 1
store i16 1, i16 addrspace(5)* %r, align 1
store i16 2, i16 addrspace(5)* %gep.r, align 1
ret void
}
-; ALL-LABEL: @private_load_2xi16_align2_not_alloca(
-; ALL: load i16
-; ALL: load i16
define i32 @private_load_2xi16_align2_not_alloca(i16 addrspace(5)* %p) #0 {
+; CHECK-LABEL: @private_load_2xi16_align2_not_alloca(
+; CHECK-NEXT: [[GEP_P:%.*]] = getelementptr i16, i16 addrspace(5)* [[P:%.*]], i64 1
+; CHECK-NEXT: [[P_0:%.*]] = load i16, i16 addrspace(5)* [[P]], align 2
+; CHECK-NEXT: [[P_1:%.*]] = load i16, i16 addrspace(5)* [[GEP_P]], align 2
+; CHECK-NEXT: [[ZEXT_0:%.*]] = zext i16 [[P_0]] to i32
+; CHECK-NEXT: [[ZEXT_1:%.*]] = zext i16 [[P_1]] to i32
+; CHECK-NEXT: [[SHL_1:%.*]] = shl i32 [[ZEXT_1]], 16
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[ZEXT_0]], [[SHL_1]]
+; CHECK-NEXT: ret i32 [[OR]]
+;
%gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
%p.0 = load i16, i16 addrspace(5)* %p, align 2
%p.1 = load i16, i16 addrspace(5)* %gep.p, align 2
%zext.0 = zext i16 %p.0 to i32
%zext.1 = zext i16 %p.1 to i32
%shl.1 = shl i32 %zext.1, 16
%or = or i32 %zext.0, %shl.1
ret i32 %or
}
-; ALL-LABEL: @private_load_2xi16_align1_not_alloca(
-; ALIGNED: load i16
-; ALIGNED: load i16
-; UNALIGNED: load <2 x i16>
define i32 @private_load_2xi16_align1_not_alloca(i16 addrspace(5)* %p) #0 {
+; ALIGNED-LABEL: @private_load_2xi16_align1_not_alloca(
+; ALIGNED-NEXT: [[GEP_P:%.*]] = getelementptr i16, i16 addrspace(5)* [[P:%.*]], i64 1
+; ALIGNED-NEXT: [[P_0:%.*]] = load i16, i16 addrspace(5)* [[P]], align 1
+; ALIGNED-NEXT: [[P_1:%.*]] = load i16, i16 addrspace(5)* [[GEP_P]], align 1
+; ALIGNED-NEXT: [[ZEXT_0:%.*]] = zext i16 [[P_0]] to i32
+; ALIGNED-NEXT: [[ZEXT_1:%.*]] = zext i16 [[P_1]] to i32
+; ALIGNED-NEXT: [[SHL_1:%.*]] = shl i32 [[ZEXT_1]], 16
+; ALIGNED-NEXT: [[OR:%.*]] = or i32 [[ZEXT_0]], [[SHL_1]]
+; ALIGNED-NEXT: ret i32 [[OR]]
+;
+; UNALIGNED-LABEL: @private_load_2xi16_align1_not_alloca(
+; UNALIGNED-NEXT: [[TMP1:%.*]] = bitcast i16 addrspace(5)* [[P:%.*]] to <2 x i16> addrspace(5)*
+; UNALIGNED-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16> addrspace(5)* [[TMP1]], align 1
+; UNALIGNED-NEXT: [[P_01:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0
+; UNALIGNED-NEXT: [[P_12:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1
+; UNALIGNED-NEXT: [[ZEXT_0:%.*]] = zext i16 [[P_01]] to i32
+; UNALIGNED-NEXT: [[ZEXT_1:%.*]] = zext i16 [[P_12]] to i32
+; UNALIGNED-NEXT: [[SHL_1:%.*]] = shl i32 [[ZEXT_1]], 16
+; UNALIGNED-NEXT: [[OR:%.*]] = or i32 [[ZEXT_0]], [[SHL_1]]
+; UNALIGNED-NEXT: ret i32 [[OR]]
+;
%gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
%p.0 = load i16, i16 addrspace(5)* %p, align 1
%p.1 = load i16, i16 addrspace(5)* %gep.p, align 1
%zext.0 = zext i16 %p.0 to i32
%zext.1 = zext i16 %p.1 to i32
%shl.1 = shl i32 %zext.1, 16
%or = or i32 %zext.0, %shl.1
ret i32 %or
}
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=amdgcn-amd-amdhsa --mcpu=hawaii -passes=load-store-vectorizer -S -o - %s | FileCheck %s
; Copy of test/CodeGen/AMDGPU/merge-stores.ll with some additions
; TODO: Same base addrspacecasted
-; CHECK-LABEL: @merge_global_store_2_constants_i8(
-; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(1)* %{{[0-9]+}}, align 2
define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
+; CHECK-LABEL: @merge_global_store_2_constants_i8(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[OUT:%.*]] to <2 x i8> addrspace(1)*
+; CHECK-NEXT: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(1)* [[TMP1]], align 2
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
store i8 123, i8 addrspace(1)* %out.gep.1
store i8 -56, i8 addrspace(1)* %out, align 2
ret void
}
-; CHECK-LABEL: @merge_global_store_2_constants_i8_natural_align
-; CHECK: store <2 x i8>
define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
+; CHECK-LABEL: @merge_global_store_2_constants_i8_natural_align(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[OUT:%.*]] to <2 x i8> addrspace(1)*
+; CHECK-NEXT: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(1)* [[TMP1]], align 1
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
store i8 123, i8 addrspace(1)* %out.gep.1
store i8 -56, i8 addrspace(1)* %out
ret void
}
-; CHECK-LABEL: @merge_global_store_2_constants_i16
-; CHECK: store <2 x i16> <i16 456, i16 123>, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4
define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
+; CHECK-LABEL: @merge_global_store_2_constants_i16(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 addrspace(1)* [[OUT:%.*]] to <2 x i16> addrspace(1)*
+; CHECK-NEXT: store <2 x i16> <i16 456, i16 123>, <2 x i16> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
store i16 123, i16 addrspace(1)* %out.gep.1
store i16 456, i16 addrspace(1)* %out, align 4
ret void
}
-; CHECK-LABEL: @merge_global_store_2_constants_0_i16
-; CHECK: store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4
define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
+; CHECK-LABEL: @merge_global_store_2_constants_0_i16(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 addrspace(1)* [[OUT:%.*]] to <2 x i16> addrspace(1)*
+; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
store i16 0, i16 addrspace(1)* %out.gep.1
store i16 0, i16 addrspace(1)* %out, align 4
ret void
}
-; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align
-; CHECK: store i16
-; CHECK: store i16
define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
+; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align(
+; CHECK-NEXT: [[OUT_GEP_1:%.*]] = getelementptr i16, i16 addrspace(1)* [[OUT:%.*]], i32 1
+; CHECK-NEXT: store i16 123, i16 addrspace(1)* [[OUT_GEP_1]], align 2
+; CHECK-NEXT: store i16 456, i16 addrspace(1)* [[OUT]], align 2
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
store i16 123, i16 addrspace(1)* %out.gep.1
store i16 456, i16 addrspace(1)* %out
ret void
}
-; CHECK-LABEL: @merge_global_store_2_constants_i16_align_1
-; CHECK: store <2 x i16>
define amdgpu_kernel void @merge_global_store_2_constants_i16_align_1(i16 addrspace(1)* %out) #0 {
+; CHECK-LABEL: @merge_global_store_2_constants_i16_align_1(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 addrspace(1)* [[OUT:%.*]] to <2 x i16> addrspace(1)*
+; CHECK-NEXT: store <2 x i16> <i16 456, i16 123>, <2 x i16> addrspace(1)* [[TMP1]], align 1
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
store i16 123, i16 addrspace(1)* %out.gep.1, align 1
store i16 456, i16 addrspace(1)* %out, align 1
ret void
}
-; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align
-; CHECK: store half
-; CHECK: store half
define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 {
+; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align(
+; CHECK-NEXT: [[OUT_GEP_1:%.*]] = getelementptr half, half addrspace(1)* [[OUT:%.*]], i32 1
+; CHECK-NEXT: store half 0xH4000, half addrspace(1)* [[OUT_GEP_1]], align 2
+; CHECK-NEXT: store half 0xH3C00, half addrspace(1)* [[OUT]], align 2
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
store half 2.0, half addrspace(1)* %out.gep.1
store half 1.0, half addrspace(1)* %out
ret void
}
-; CHECK-LABEL: @merge_global_store_2_constants_half_align_1
-; CHECK: store <2 x half>
define amdgpu_kernel void @merge_global_store_2_constants_half_align_1(half addrspace(1)* %out) #0 {
+; CHECK-LABEL: @merge_global_store_2_constants_half_align_1(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast half addrspace(1)* [[OUT:%.*]] to <2 x half> addrspace(1)*
+; CHECK-NEXT: store <2 x half> <half 0xH3C00, half 0xH4000>, <2 x half> addrspace(1)* [[TMP1]], align 1
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
store half 2.0, half addrspace(1)* %out.gep.1, align 1
store half 1.0, half addrspace(1)* %out, align 1
ret void
}
-; CHECK-LABEL: @merge_global_store_2_constants_i32
-; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
+; CHECK-LABEL: @merge_global_store_2_constants_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <2 x i32> addrspace(1)*
+; CHECK-NEXT: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
store i32 123, i32 addrspace(1)* %out.gep.1
store i32 456, i32 addrspace(1)* %out
ret void
}
-; CHECK-LABEL: @merge_global_store_2_constants_i32_f32
-; CHECK: store <2 x i32> <i32 456, i32 1065353216>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
+; CHECK-LABEL: @merge_global_store_2_constants_i32_f32(
+; CHECK-NEXT: [[OUT_GEP_1:%.*]] = getelementptr i32, i32 addrspace(1)* [[OUT:%.*]], i32 1
+; CHECK-NEXT: [[OUT_GEP_1_BC:%.*]] = bitcast i32 addrspace(1)* [[OUT_GEP_1]] to float addrspace(1)*
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[OUT]] to <2 x i32> addrspace(1)*
+; CHECK-NEXT: store <2 x i32> <i32 456, i32 1065353216>, <2 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
store float 1.0, float addrspace(1)* %out.gep.1.bc
store i32 456, i32 addrspace(1)* %out
ret void
}
-; CHECK-LABEL: @merge_global_store_2_constants_f32_i32
-; CHECK: store <2 x i32> <i32 1082130432, i32 123>, <2 x i32> addrspace(1)*
define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
+; CHECK-LABEL: @merge_global_store_2_constants_f32_i32(
+; CHECK-NEXT: [[OUT_GEP_1:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 1
+; CHECK-NEXT: [[OUT_GEP_1_BC:%.*]] = bitcast float addrspace(1)* [[OUT_GEP_1]] to i32 addrspace(1)*
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float addrspace(1)* [[OUT]] to <2 x i32> addrspace(1)*
+; CHECK-NEXT: store <2 x i32> <i32 1082130432, i32 123>, <2 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
%out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
store i32 123, i32 addrspace(1)* %out.gep.1.bc
store float 4.0, float addrspace(1)* %out
ret void
}
-; CHECK-LABEL: @merge_global_store_4_constants_i32
-; CHECK: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
+; CHECK-LABEL: @merge_global_store_4_constants_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <4 x i32> addrspace(1)*
+; CHECK-NEXT: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
%out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
store i32 123, i32 addrspace(1)* %out.gep.1
store i32 456, i32 addrspace(1)* %out.gep.2
store i32 333, i32 addrspace(1)* %out.gep.3
store i32 1234, i32 addrspace(1)* %out
ret void
}
-; CHECK-LABEL: @merge_global_store_4_constants_f32_order
-; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}
define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
+; CHECK-LABEL: @merge_global_store_4_constants_f32_order(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float addrspace(1)* [[OUT:%.*]] to <4 x float> addrspace(1)*
+; CHECK-NEXT: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
%out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
store float 8.0, float addrspace(1)* %out
store float 1.0, float addrspace(1)* %out.gep.1
store float 2.0, float addrspace(1)* %out.gep.2
store float 4.0, float addrspace(1)* %out.gep.3
ret void
}
; First store is out of order.
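; The vectorizer still merges the chain and emits the lanes in address order.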
-; CHECK-LABEL: @merge_global_store_4_constants_f32
-; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}, align 4
define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
+; CHECK-LABEL: @merge_global_store_4_constants_f32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float addrspace(1)* [[OUT:%.*]] to <4 x float> addrspace(1)*
+; CHECK-NEXT: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
%out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
store float 1.0, float addrspace(1)* %out.gep.1
store float 2.0, float addrspace(1)* %out.gep.2
store float 4.0, float addrspace(1)* %out.gep.3
store float 8.0, float addrspace(1)* %out
ret void
}
-; CHECK-LABEL: @merge_global_store_4_constants_mixed_i32_f32
-; CHECK: store <4 x i32> <i32 1090519040, i32 11, i32 1073741824, i32 17>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
+; CHECK-LABEL: @merge_global_store_4_constants_mixed_i32_f32(
+; CHECK-NEXT: [[OUT_GEP_1:%.*]] = getelementptr float, float addrspace(1)* [[OUT:%.*]], i32 1
+; CHECK-NEXT: [[OUT_GEP_3:%.*]] = getelementptr float, float addrspace(1)* [[OUT]], i32 3
+; CHECK-NEXT: [[OUT_GEP_1_BC:%.*]] = bitcast float addrspace(1)* [[OUT_GEP_1]] to i32 addrspace(1)*
+; CHECK-NEXT: [[OUT_GEP_3_BC:%.*]] = bitcast float addrspace(1)* [[OUT_GEP_3]] to i32 addrspace(1)*
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float addrspace(1)* [[OUT]] to <4 x i32> addrspace(1)*
+; CHECK-NEXT: store <4 x i32> <i32 1090519040, i32 11, i32 1073741824, i32 17>, <4 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
%out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
%out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
%out.gep.3.bc = bitcast float addrspace(1)* %out.gep.3 to i32 addrspace(1)*
store float 4.0, float addrspace(1)* %out
store i32 11, i32 addrspace(1)* %out.gep.1.bc
store float 2.0, float addrspace(1)* %out.gep.2
store i32 17, i32 addrspace(1)* %out.gep.3.bc
ret void
}
-; CHECK-LABEL: @merge_global_store_3_constants_i32
-; CHECK: store <3 x i32> <i32 1234, i32 123, i32 456>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4
define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
+; CHECK-LABEL: @merge_global_store_3_constants_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <3 x i32> addrspace(1)*
+; CHECK-NEXT: store <3 x i32> <i32 1234, i32 123, i32 456>, <3 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
store i32 1234, i32 addrspace(1)* %out
store i32 123, i32 addrspace(1)* %out.gep.1
store i32 456, i32 addrspace(1)* %out.gep.2
ret void
}
-; CHECK-LABEL: @merge_global_store_2_constants_i64
-; CHECK: store <2 x i64> <i64 456, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
+; CHECK-LABEL: @merge_global_store_2_constants_i64(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 addrspace(1)* [[OUT:%.*]] to <2 x i64> addrspace(1)*
+; CHECK-NEXT: store <2 x i64> <i64 456, i64 123>, <2 x i64> addrspace(1)* [[TMP1]], align 8
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
store i64 123, i64 addrspace(1)* %out.gep.1
store i64 456, i64 addrspace(1)* %out
ret void
}
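; The four i64 stores are merged as two <2 x i64> stores rather than a single
; <4 x i64>.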
-; CHECK-LABEL: @merge_global_store_4_constants_i64
-; CHECK: store <2 x i64> <i64 456, i64 333>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
-; CHECK: store <2 x i64> <i64 1234, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
+; CHECK-LABEL: @merge_global_store_4_constants_i64(
+; CHECK-NEXT: [[OUT_GEP_2:%.*]] = getelementptr i64, i64 addrspace(1)* [[OUT:%.*]], i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 addrspace(1)* [[OUT_GEP_2]] to <2 x i64> addrspace(1)*
+; CHECK-NEXT: store <2 x i64> <i64 456, i64 333>, <2 x i64> addrspace(1)* [[TMP1]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 addrspace(1)* [[OUT]] to <2 x i64> addrspace(1)*
+; CHECK-NEXT: store <2 x i64> <i64 1234, i64 123>, <2 x i64> addrspace(1)* [[TMP2]], align 8
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
%out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
%out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
store i64 123, i64 addrspace(1)* %out.gep.1
store i64 456, i64 addrspace(1)* %out.gep.2
store i64 333, i64 addrspace(1)* %out.gep.3
store i64 1234, i64 addrspace(1)* %out
ret void
}
-; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32
-; CHECK: [[LOAD:%[^ ]+]] = load <2 x i32>
-; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 0
-; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 1
-; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> poison, i32 [[ELT0]], i32 0
-; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT1]], i32 1
-; CHECK: store <2 x i32> [[INSERT1]]
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[IN:%.*]] to <2 x i32> addrspace(1)*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: [[LO1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[HI2:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LO1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[HI2]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <2 x i32> addrspace(1)*
+; CHECK-NEXT: store <2 x i32> [[TMP4]], <2 x i32> addrspace(1)* [[TMP5]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
%lo = load i32, i32 addrspace(1)* %in
%hi = load i32, i32 addrspace(1)* %in.gep.1
store i32 %lo, i32 addrspace(1)* %out
store i32 %hi, i32 addrspace(1)* %out.gep.1
ret void
}
-; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32_nonzero_base
-; CHECK: extractelement
-; CHECK: extractelement
-; CHECK: insertelement
-; CHECK: insertelement
-; CHECK: store <2 x i32>
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+; CHECK-LABEL: @merge_global_store_2_adjacent_loads_i32_nonzero_base(
+; CHECK-NEXT: [[IN_GEP_0:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i32 2
+; CHECK-NEXT: [[OUT_GEP_0:%.*]] = getelementptr i32, i32 addrspace(1)* [[OUT:%.*]], i32 2
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[IN_GEP_0]] to <2 x i32> addrspace(1)*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: [[LO1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[HI2:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LO1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[HI2]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[OUT_GEP_0]] to <2 x i32> addrspace(1)*
+; CHECK-NEXT: store <2 x i32> [[TMP4]], <2 x i32> addrspace(1)* [[TMP5]], align 4
+; CHECK-NEXT: ret void
+;
%in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
%out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 2
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 3
%lo = load i32, i32 addrspace(1)* %in.gep.0
%hi = load i32, i32 addrspace(1)* %in.gep.1
store i32 %lo, i32 addrspace(1)* %out.gep.0
store i32 %hi, i32 addrspace(1)* %out.gep.1
ret void
}
-; CHECK-LABEL: @merge_global_store_2_adjacent_loads_shuffle_i32
-; CHECK: [[LOAD:%[^ ]+]] = load <2 x i32>
-; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 0
-; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> [[LOAD]], i32 1
-; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> poison, i32 [[ELT1]], i32 0
-; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT0]], i32 1
-; CHECK: store <2 x i32> [[INSERT1]]
define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+; CHECK-LABEL: @merge_global_store_2_adjacent_loads_shuffle_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[IN:%.*]] to <2 x i32> addrspace(1)*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: [[LO1:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[HI2:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[HI2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LO1]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <2 x i32> addrspace(1)*
+; CHECK-NEXT: store <2 x i32> [[TMP4]], <2 x i32> addrspace(1)* [[TMP5]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
%lo = load i32, i32 addrspace(1)* %in
%hi = load i32, i32 addrspace(1)* %in.gep.1
store i32 %hi, i32 addrspace(1)* %out
store i32 %lo, i32 addrspace(1)* %out.gep.1
ret void
}
-; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32
-; CHECK: load <4 x i32>
-; CHECK: store <4 x i32>
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[IN:%.*]] to <4 x i32> addrspace(1)*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[X1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Y2]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[Z3]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[W4]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <4 x i32> addrspace(1)*
+; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32> addrspace(1)* [[TMP7]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
%out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
%in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
%in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
%x = load i32, i32 addrspace(1)* %in
%y = load i32, i32 addrspace(1)* %in.gep.1
%z = load i32, i32 addrspace(1)* %in.gep.2
%w = load i32, i32 addrspace(1)* %in.gep.3
store i32 %x, i32 addrspace(1)* %out
store i32 %y, i32 addrspace(1)* %out.gep.1
store i32 %z, i32 addrspace(1)* %out.gep.2
store i32 %w, i32 addrspace(1)* %out.gep.3
ret void
}
-; CHECK-LABEL: @merge_global_store_3_adjacent_loads_i32
-; CHECK: load <3 x i32>
-; CHECK: store <3 x i32>
define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+; CHECK-LABEL: @merge_global_store_3_adjacent_loads_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[IN:%.*]] to <3 x i32> addrspace(1)*
+; CHECK-NEXT: [[TMP2:%.*]] = load <3 x i32>, <3 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: [[X1:%.*]] = extractelement <3 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[Y2:%.*]] = extractelement <3 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[Z3:%.*]] = extractelement <3 x i32> [[TMP2]], i32 2
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x i32> poison, i32 [[X1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <3 x i32> [[TMP3]], i32 [[Y2]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <3 x i32> [[TMP4]], i32 [[Z3]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <3 x i32> addrspace(1)*
+; CHECK-NEXT: store <3 x i32> [[TMP5]], <3 x i32> addrspace(1)* [[TMP6]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
%in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
%x = load i32, i32 addrspace(1)* %in
%y = load i32, i32 addrspace(1)* %in.gep.1
%z = load i32, i32 addrspace(1)* %in.gep.2
store i32 %x, i32 addrspace(1)* %out
store i32 %y, i32 addrspace(1)* %out.gep.1
store i32 %z, i32 addrspace(1)* %out.gep.2
ret void
}
-; CHECK-LABEL: @merge_global_store_4_adjacent_loads_f32
-; CHECK: load <4 x float>
-; CHECK: store <4 x float>
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+; CHECK-LABEL: @merge_global_store_4_adjacent_loads_f32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float addrspace(1)* [[IN:%.*]] to <4 x float> addrspace(1)*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
+; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
+; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[X1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[Y2]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[Z3]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[W4]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast float addrspace(1)* [[OUT:%.*]] to <4 x float> addrspace(1)*
+; CHECK-NEXT: store <4 x float> [[TMP6]], <4 x float> addrspace(1)* [[TMP7]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
%out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
%in.gep.1 = getelementptr float, float addrspace(1)* %in, i32 1
%in.gep.2 = getelementptr float, float addrspace(1)* %in, i32 2
%in.gep.3 = getelementptr float, float addrspace(1)* %in, i32 3
%x = load float, float addrspace(1)* %in
%y = load float, float addrspace(1)* %in.gep.1
%z = load float, float addrspace(1)* %in.gep.2
%w = load float, float addrspace(1)* %in.gep.3
store float %x, float addrspace(1)* %out
store float %y, float addrspace(1)* %out.gep.1
store float %z, float addrspace(1)* %out.gep.2
store float %w, float addrspace(1)* %out.gep.3
ret void
}
-; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32_nonzero_base
-; CHECK: load <4 x i32>
-; CHECK: store <4 x i32>
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32_nonzero_base(
+; CHECK-NEXT: [[IN_GEP_0:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i32 11
+; CHECK-NEXT: [[OUT_GEP_0:%.*]] = getelementptr i32, i32 addrspace(1)* [[OUT:%.*]], i32 7
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[IN_GEP_0]] to <4 x i32> addrspace(1)*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[X1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Y2]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[Z3]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[W4]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[OUT_GEP_0]] to <4 x i32> addrspace(1)*
+; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32> addrspace(1)* [[TMP7]], align 4
+; CHECK-NEXT: ret void
+;
%in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
%in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
%in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 14
%out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i32 7
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 8
%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 9
%out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 10
%x = load i32, i32 addrspace(1)* %in.gep.0
%y = load i32, i32 addrspace(1)* %in.gep.1
%z = load i32, i32 addrspace(1)* %in.gep.2
%w = load i32, i32 addrspace(1)* %in.gep.3
store i32 %x, i32 addrspace(1)* %out.gep.0
store i32 %y, i32 addrspace(1)* %out.gep.1
store i32 %z, i32 addrspace(1)* %out.gep.2
store i32 %w, i32 addrspace(1)* %out.gep.3
ret void
}
-; CHECK-LABEL: @merge_global_store_4_adjacent_loads_inverse_i32
-; CHECK: load <4 x i32>
-; CHECK: store <4 x i32>
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+; CHECK-LABEL: @merge_global_store_4_adjacent_loads_inverse_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[IN:%.*]] to <4 x i32> addrspace(1)*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() #[[ATTR3:[0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[X1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Y2]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[Z3]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[W4]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <4 x i32> addrspace(1)*
+; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32> addrspace(1)* [[TMP7]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
%out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
%in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
%in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
%x = load i32, i32 addrspace(1)* %in
%y = load i32, i32 addrspace(1)* %in.gep.1
%z = load i32, i32 addrspace(1)* %in.gep.2
%w = load i32, i32 addrspace(1)* %in.gep.3
; The barrier between the loads and stores must not prevent the merge.
tail call void @llvm.amdgcn.s.barrier() #1
store i32 %w, i32 addrspace(1)* %out.gep.3
store i32 %z, i32 addrspace(1)* %out.gep.2
store i32 %y, i32 addrspace(1)* %out.gep.1
store i32 %x, i32 addrspace(1)* %out
ret void
}
-; CHECK-LABEL: @merge_global_store_4_adjacent_loads_shuffle_i32
-; CHECK: load <4 x i32>
-; CHECK: store <4 x i32>
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+; CHECK-LABEL: @merge_global_store_4_adjacent_loads_shuffle_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[IN:%.*]] to <4 x i32> addrspace(1)*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
+; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
+; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
+; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() #[[ATTR3]]
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[W4]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[Z3]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[Y2]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[X1]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <4 x i32> addrspace(1)*
+; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32> addrspace(1)* [[TMP7]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
%out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
%in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
%in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 2
%in.gep.3 = getelementptr i32, i32 addrspace(1)* %in, i32 3
%x = load i32, i32 addrspace(1)* %in
%y = load i32, i32 addrspace(1)* %in.gep.1
%z = load i32, i32 addrspace(1)* %in.gep.2
%w = load i32, i32 addrspace(1)* %in.gep.3
tail call void @llvm.amdgcn.s.barrier() #1
store i32 %w, i32 addrspace(1)* %out
store i32 %z, i32 addrspace(1)* %out.gep.1
store i32 %y, i32 addrspace(1)* %out.gep.2
store i32 %x, i32 addrspace(1)* %out.gep.3
ret void
}
-; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8
-; CHECK: load <4 x i8>
-; CHECK: extractelement <4 x i8>
-; CHECK: extractelement <4 x i8>
-; CHECK: extractelement <4 x i8>
-; CHECK: extractelement <4 x i8>
-; CHECK: insertelement <4 x i8>
-; CHECK: insertelement <4 x i8>
-; CHECK: insertelement <4 x i8>
-; CHECK: insertelement <4 x i8>
-; CHECK: store <4 x i8>
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[IN:%.*]] to <4 x i8> addrspace(1)*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[TMP2]], i32 0
+; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[TMP2]], i32 1
+; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 2
+; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[X1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 [[Y2]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[Z3]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[W4]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 addrspace(1)* [[OUT:%.*]] to <4 x i8> addrspace(1)*
+; CHECK-NEXT: store <4 x i8> [[TMP6]], <4 x i8> addrspace(1)* [[TMP7]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
%out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
%out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
%in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
%in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
%in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
%x = load i8, i8 addrspace(1)* %in, align 4
%y = load i8, i8 addrspace(1)* %in.gep.1
%z = load i8, i8 addrspace(1)* %in.gep.2
%w = load i8, i8 addrspace(1)* %in.gep.3
store i8 %x, i8 addrspace(1)* %out, align 4
store i8 %y, i8 addrspace(1)* %out.gep.1
store i8 %z, i8 addrspace(1)* %out.gep.2
store i8 %w, i8 addrspace(1)* %out.gep.3
ret void
}
-; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8_natural_align
-; CHECK: load <4 x i8>
-; CHECK: store <4 x i8>
define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8_natural_align(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(1)* [[IN:%.*]] to <4 x i8> addrspace(1)*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, <4 x i8> addrspace(1)* [[TMP1]], align 1
+; CHECK-NEXT: [[X1:%.*]] = extractelement <4 x i8> [[TMP2]], i32 0
+; CHECK-NEXT: [[Y2:%.*]] = extractelement <4 x i8> [[TMP2]], i32 1
+; CHECK-NEXT: [[Z3:%.*]] = extractelement <4 x i8> [[TMP2]], i32 2
+; CHECK-NEXT: [[W4:%.*]] = extractelement <4 x i8> [[TMP2]], i32 3
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i8> poison, i8 [[X1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> [[TMP3]], i8 [[Y2]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i8> [[TMP4]], i8 [[Z3]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i8> [[TMP5]], i8 [[W4]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 addrspace(1)* [[OUT:%.*]] to <4 x i8> addrspace(1)*
+; CHECK-NEXT: store <4 x i8> [[TMP6]], <4 x i8> addrspace(1)* [[TMP7]], align 1
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
%out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
%out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
%in.gep.1 = getelementptr i8, i8 addrspace(1)* %in, i8 1
%in.gep.2 = getelementptr i8, i8 addrspace(1)* %in, i8 2
%in.gep.3 = getelementptr i8, i8 addrspace(1)* %in, i8 3
%x = load i8, i8 addrspace(1)* %in
%y = load i8, i8 addrspace(1)* %in.gep.1
%z = load i8, i8 addrspace(1)* %in.gep.2
%w = load i8, i8 addrspace(1)* %in.gep.3
store i8 %x, i8 addrspace(1)* %out
store i8 %y, i8 addrspace(1)* %out.gep.1
store i8 %z, i8 addrspace(1)* %out.gep.2
store i8 %w, i8 addrspace(1)* %out.gep.3
ret void
}
-; CHECK-LABEL: @merge_global_store_4_vector_elts_loads_v4i32
-; CHECK: load <4 x i32>
-; CHECK: store <4 x i32>
define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+; CHECK-LABEL: @merge_global_store_4_vector_elts_loads_v4i32(
+; CHECK-NEXT: [[VEC:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[IN:%.*]], align 16
+; CHECK-NEXT: [[X:%.*]] = extractelement <4 x i32> [[VEC]], i32 0
+; CHECK-NEXT: [[Y:%.*]] = extractelement <4 x i32> [[VEC]], i32 1
+; CHECK-NEXT: [[Z:%.*]] = extractelement <4 x i32> [[VEC]], i32 2
+; CHECK-NEXT: [[W:%.*]] = extractelement <4 x i32> [[VEC]], i32 3
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[Y]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[Z]], i32 2
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[W]], i32 3
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <4 x i32> addrspace(1)*
+; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP5]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
%out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
%vec = load <4 x i32>, <4 x i32> addrspace(1)* %in
%x = extractelement <4 x i32> %vec, i32 0
%y = extractelement <4 x i32> %vec, i32 1
%z = extractelement <4 x i32> %vec, i32 2
%w = extractelement <4 x i32> %vec, i32 3
store i32 %x, i32 addrspace(1)* %out
store i32 %y, i32 addrspace(1)* %out.gep.1
store i32 %z, i32 addrspace(1)* %out.gep.2
store i32 %w, i32 addrspace(1)* %out.gep.3
ret void
}
-; CHECK-LABEL: @merge_local_store_2_constants_i8
-; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(3)* %{{[0-9]+}}, align 2
define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
+; CHECK-LABEL: @merge_local_store_2_constants_i8(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 addrspace(3)* [[OUT:%.*]] to <2 x i8> addrspace(3)*
+; CHECK-NEXT: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(3)* [[TMP1]], align 2
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
store i8 123, i8 addrspace(3)* %out.gep.1
store i8 -56, i8 addrspace(3)* %out, align 2
ret void
}
-; CHECK-LABEL: @merge_local_store_2_constants_i32
-; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(3)* %{{[0-9]+}}, align 4
define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
+; CHECK-LABEL: @merge_local_store_2_constants_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(3)* [[OUT:%.*]] to <2 x i32> addrspace(3)*
+; CHECK-NEXT: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(3)* [[TMP1]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
store i32 123, i32 addrspace(3)* %out.gep.1
store i32 456, i32 addrspace(3)* %out
ret void
}
-; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2
-; CHECK: store i32
-; CHECK: store i32
define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 {
+; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2(
+; CHECK-NEXT: [[OUT_GEP_1:%.*]] = getelementptr i32, i32 addrspace(3)* [[OUT:%.*]], i32 1
+; CHECK-NEXT: store i32 123, i32 addrspace(3)* [[OUT_GEP_1]], align 2
+; CHECK-NEXT: store i32 456, i32 addrspace(3)* [[OUT]], align 2
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
store i32 123, i32 addrspace(3)* %out.gep.1, align 2
store i32 456, i32 addrspace(3)* %out, align 2
ret void
}
-; CHECK-LABEL: @merge_local_store_4_constants_i32
-; CHECK: store <2 x i32> <i32 456, i32 333>, <2 x i32> addrspace(3)* %1, align 4
-; CHECK: store <2 x i32> <i32 1234, i32 123>, <2 x i32> addrspace(3)* %2, align 4
define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
+; CHECK-LABEL: @merge_local_store_4_constants_i32(
+; CHECK-NEXT: [[OUT_GEP_2:%.*]] = getelementptr i32, i32 addrspace(3)* [[OUT:%.*]], i32 2
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(3)* [[OUT_GEP_2]] to <2 x i32> addrspace(3)*
+; CHECK-NEXT: store <2 x i32> <i32 456, i32 333>, <2 x i32> addrspace(3)* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 addrspace(3)* [[OUT]] to <2 x i32> addrspace(3)*
+; CHECK-NEXT: store <2 x i32> <i32 1234, i32 123>, <2 x i32> addrspace(3)* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
%out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
%out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
%out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
store i32 123, i32 addrspace(3)* %out.gep.1
store i32 456, i32 addrspace(3)* %out.gep.2
store i32 333, i32 addrspace(3)* %out.gep.3
store i32 1234, i32 addrspace(3)* %out
ret void
}
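+; Five consecutive global i32 stores: the first four merge into a <4 x i32> store, the fifth stays scalar.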
-; CHECK-LABEL: @merge_global_store_5_constants_i32
-; CHECK: store <4 x i32> <i32 9, i32 12, i32 16, i32 -12>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-; CHECK: store i32
define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
+; CHECK-LABEL: @merge_global_store_5_constants_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <4 x i32> addrspace(1)*
+; CHECK-NEXT: store <4 x i32> <i32 9, i32 12, i32 16, i32 -12>, <4 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[OUT]], i64 4
+; CHECK-NEXT: store i32 11, i32 addrspace(1)* [[IDX4]], align 4
+; CHECK-NEXT: ret void
+;
store i32 9, i32 addrspace(1)* %out, align 4
%idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
store i32 12, i32 addrspace(1)* %idx1, align 4
%idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
store i32 16, i32 addrspace(1)* %idx2, align 4
%idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
store i32 -12, i32 addrspace(1)* %idx3, align 4
%idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
store i32 11, i32 addrspace(1)* %idx4, align 4
ret void
}
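+; Six stores split into a <4 x i32> store followed by a <2 x i32> store.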
-; CHECK-LABEL: @merge_global_store_6_constants_i32
-; CHECK: store <4 x i32> <i32 13, i32 15, i32 62, i32 63>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-; CHECK: store <2 x i32> <i32 11, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
+; CHECK-LABEL: @merge_global_store_6_constants_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <4 x i32> addrspace(1)*
+; CHECK-NEXT: store <4 x i32> <i32 13, i32 15, i32 62, i32 63>, <4 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[OUT]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 addrspace(1)* [[IDX4]] to <2 x i32> addrspace(1)*
+; CHECK-NEXT: store <2 x i32> <i32 11, i32 123>, <2 x i32> addrspace(1)* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
store i32 13, i32 addrspace(1)* %out, align 4
%idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
store i32 15, i32 addrspace(1)* %idx1, align 4
%idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
store i32 62, i32 addrspace(1)* %idx2, align 4
%idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
store i32 63, i32 addrspace(1)* %idx3, align 4
%idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
store i32 11, i32 addrspace(1)* %idx4, align 4
%idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
store i32 123, i32 addrspace(1)* %idx5, align 4
ret void
}
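+; Seven stores split into a <4 x i32> store followed by a <3 x i32> store.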
-; CHECK-LABEL: @merge_global_store_7_constants_i32
-; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-; CHECK: store <3 x i32> <i32 98, i32 91, i32 212>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4
define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
+; CHECK-LABEL: @merge_global_store_7_constants_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <4 x i32> addrspace(1)*
+; CHECK-NEXT: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[OUT]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 addrspace(1)* [[IDX4]] to <3 x i32> addrspace(1)*
+; CHECK-NEXT: store <3 x i32> <i32 98, i32 91, i32 212>, <3 x i32> addrspace(1)* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
store i32 34, i32 addrspace(1)* %out, align 4
%idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
store i32 999, i32 addrspace(1)* %idx1, align 4
%idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
store i32 65, i32 addrspace(1)* %idx2, align 4
%idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
store i32 33, i32 addrspace(1)* %idx3, align 4
%idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
store i32 98, i32 addrspace(1)* %idx4, align 4
%idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
store i32 91, i32 addrspace(1)* %idx5, align 4
%idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
store i32 212, i32 addrspace(1)* %idx6, align 4
ret void
}
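+; Eight stores merge into two <4 x i32> stores.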
-; CHECK-LABEL: @merge_global_store_8_constants_i32
-; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-; CHECK: store <4 x i32> <i32 98, i32 91, i32 212, i32 999>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
+; CHECK-LABEL: @merge_global_store_8_constants_i32(
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[OUT:%.*]] to <4 x i32> addrspace(1)*
+; CHECK-NEXT: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* [[TMP1]], align 4
+; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[OUT]], i64 4
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 addrspace(1)* [[IDX4]] to <4 x i32> addrspace(1)*
+; CHECK-NEXT: store <4 x i32> <i32 98, i32 91, i32 212, i32 999>, <4 x i32> addrspace(1)* [[TMP2]], align 4
+; CHECK-NEXT: ret void
+;
store i32 34, i32 addrspace(1)* %out, align 4
%idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
store i32 999, i32 addrspace(1)* %idx1, align 4
%idx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 2
store i32 65, i32 addrspace(1)* %idx2, align 4
%idx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 3
store i32 33, i32 addrspace(1)* %idx3, align 4
%idx4 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 4
store i32 98, i32 addrspace(1)* %idx4, align 4
%idx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 5
store i32 91, i32 addrspace(1)* %idx5, align 4
%idx6 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 6
store i32 212, i32 addrspace(1)* %idx6, align 4
%idx7 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 7
store i32 999, i32 addrspace(1)* %idx7, align 4
ret void
}
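+; A <3 x i32> copy has nothing left to vectorize; the unannotated store keeps its ABI alignment of 16 (v96:128 in the datalayout).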
-; CHECK-LABEL: @copy_v3i32_align4
-; CHECK: %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
-; CHECK: store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
+; CHECK-LABEL: @copy_v3i32_align4(
+; CHECK-NEXT: [[VEC:%.*]] = load <3 x i32>, <3 x i32> addrspace(1)* [[IN:%.*]], align 4
+; CHECK-NEXT: store <3 x i32> [[VEC]], <3 x i32> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
%vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
ret void
}
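+; Likewise for <3 x i64>, whose store gets ABI alignment 32 (v192:256).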
-; CHECK-LABEL: @copy_v3i64_align4
-; CHECK: %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
-; CHECK: store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
+; CHECK-LABEL: @copy_v3i64_align4(
+; CHECK-NEXT: [[VEC:%.*]] = load <3 x i64>, <3 x i64> addrspace(1)* [[IN:%.*]], align 4
+; CHECK-NEXT: store <3 x i64> [[VEC]], <3 x i64> addrspace(1)* [[OUT:%.*]], align 32
+; CHECK-NEXT: ret void
+;
%vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
ret void
}
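+; A <3 x float> load/fadd/store chain is unchanged apart from the store's ABI alignment of 16.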
-; CHECK-LABEL: @copy_v3f32_align4
-; CHECK: %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
-; CHECK: store <3 x float>
define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
+; CHECK-LABEL: @copy_v3f32_align4(
+; CHECK-NEXT: [[VEC:%.*]] = load <3 x float>, <3 x float> addrspace(1)* [[IN:%.*]], align 4
+; CHECK-NEXT: [[FADD:%.*]] = fadd <3 x float> [[VEC]], <float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>
+; CHECK-NEXT: store <3 x float> [[FADD]], <3 x float> addrspace(1)* [[OUT:%.*]], align 16
+; CHECK-NEXT: ret void
+;
%vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
%fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
store <3 x float> %fadd, <3 x float> addrspace(1)* %out
ret void
}
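+; The <3 x double> variant behaves the same, with store ABI alignment 32.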
-; CHECK-LABEL: @copy_v3f64_align4
-; CHECK: %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
-; CHECK: store <3 x double> %fadd, <3 x double> addrspace(1)* %out
define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
+; CHECK-LABEL: @copy_v3f64_align4(
+; CHECK-NEXT: [[VEC:%.*]] = load <3 x double>, <3 x double> addrspace(1)* [[IN:%.*]], align 4
+; CHECK-NEXT: [[FADD:%.*]] = fadd <3 x double> [[VEC]], <double 1.000000e+00, double 2.000000e+00, double 4.000000e+00>
+; CHECK-NEXT: store <3 x double> [[FADD]], <3 x double> addrspace(1)* [[OUT:%.*]], align 32
+; CHECK-NEXT: ret void
+;
%vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
%fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
store <3 x double> %fadd, <3 x double> addrspace(1)* %out
ret void
}
; Verify that we no longer hit asserts for this test case. No change expected.
-; CHECK-LABEL: @copy_vec_of_ptrs
-; CHECK: %in.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %in, i32 1
-; CHECK: %vec1 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in.gep.1
-; CHECK: %vec2 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in, align 4
-; CHECK: %out.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %out, i32 1
-; CHECK: store <2 x i16*> %vec1, <2 x i16*> addrspace(1)* %out.gep.1
-; CHECK: store <2 x i16*> %vec2, <2 x i16*> addrspace(1)* %out, align 4
define amdgpu_kernel void @copy_vec_of_ptrs(<2 x i16*> addrspace(1)* %out,
- <2 x i16*> addrspace(1)* %in ) #0 {
+; CHECK-LABEL: @copy_vec_of_ptrs(
+; CHECK-NEXT: [[IN_GEP_1:%.*]] = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* [[IN:%.*]], i32 1
+; CHECK-NEXT: [[VEC1:%.*]] = load <2 x i16*>, <2 x i16*> addrspace(1)* [[IN_GEP_1]], align 16
+; CHECK-NEXT: [[VEC2:%.*]] = load <2 x i16*>, <2 x i16*> addrspace(1)* [[IN]], align 4
+; CHECK-NEXT: [[OUT_GEP_1:%.*]] = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* [[OUT:%.*]], i32 1
+; CHECK-NEXT: store <2 x i16*> [[VEC1]], <2 x i16*> addrspace(1)* [[OUT_GEP_1]], align 16
+; CHECK-NEXT: store <2 x i16*> [[VEC2]], <2 x i16*> addrspace(1)* [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ <2 x i16*> addrspace(1)* %in ) #0 {
%in.gep.1 = getelementptr <2 x i16*>, <2 x i16*> addrspace(1)* %in, i32 1
%vec1 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in.gep.1
%vec2 = load <2 x i16*>, <2 x i16*> addrspace(1)* %in, align 4