// try again.
unsigned EltSzInBytes = Sz / 8;
unsigned SzInBytes = EltSzInBytes * ChainSize;
- if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
- auto Chains = splitOddVectorElts(Chain, Sz);
- return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
- vectorizeStoreChain(Chains.second, InstructionsProcessed);
- }
VectorType *VecTy;
VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
// If the store is going to be misaligned, don't vectorize it.
if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
- if (S0->getPointerAddressSpace() != 0)
- return false;
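+ // The alignment fixup below only works for objects in the alloca address
+ // space, so for any other address space split the chain and try the halves.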
+ if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
+ vectorizeStoreChain(Chains.second, InstructionsProcessed);
+ }
unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
StackAdjustedAlignment,
DL, S0, nullptr, &DT);
- if (NewAlign < StackAdjustedAlignment)
- return false;
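+ // Rather than bailing out when the enforced alignment falls short of
+ // StackAdjustedAlignment, keep whatever alignment could be guaranteed.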
+ if (NewAlign != 0)
+ Alignment = NewAlign;
+ }
+
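+ // If the chain is not legal to vectorize as a whole with the (possibly
+ // improved) alignment, split it and vectorize the halves instead.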
+ if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
+ vectorizeStoreChain(Chains.second, InstructionsProcessed);
}
BasicBlock::iterator First, Last;
}
}
- // This cast is safe because Builder.CreateStore() always creates a bona fide
- // StoreInst.
- StoreInst *SI = cast<StoreInst>(
- Builder.CreateStore(Vec, Builder.CreateBitCast(S0->getPointerOperand(),
- VecTy->getPointerTo(AS))));
+ StoreInst *SI = Builder.CreateAlignedStore(
+ Vec,
+ Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
+ Alignment);
propagateMetadata(SI, Chain);
- SI->setAlignment(Alignment);
eraseInstructions(Chain);
++NumVectorInstructions;
// try again.
unsigned EltSzInBytes = Sz / 8;
unsigned SzInBytes = EltSzInBytes * ChainSize;
- if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
- auto Chains = splitOddVectorElts(Chain, Sz);
- return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
- vectorizeLoadChain(Chains.second, InstructionsProcessed);
- }
-
VectorType *VecTy;
VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
if (VecLoadTy)
// If the load is going to be misaligned, don't vectorize it.
if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
- if (L0->getPointerAddressSpace() != 0)
- return false;
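+ // As in vectorizeStoreChain, alignment can only be fixed up for allocas;
+ // for any other address space split the chain and try the halves.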
+ if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
+ vectorizeLoadChain(Chains.second, InstructionsProcessed);
+ }
unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
StackAdjustedAlignment,
DL, L0, nullptr, &DT);
- if (NewAlign < StackAdjustedAlignment)
- return false;
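+ // Keep whatever alignment getOrEnforceKnownAlignment could guarantee for
+ // the alloca instead of giving up.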
+ if (NewAlign != 0)
+ Alignment = NewAlign;
- Alignment = NewAlign;
}
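+
+ // With the adjusted alignment, check whether the whole load chain is legal
+ // to vectorize; if not, split it and try the halves.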
+ if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
+ vectorizeLoadChain(Chains.second, InstructionsProcessed);
+ }
+
LLVM_DEBUG({
dbgs() << "LSV: Loads to vectorize:\n";
for (Instruction *I : Chain)
Value *Bitcast =
Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
- // This cast is safe because Builder.CreateLoad always creates a bona fide
- // LoadInst.
- LoadInst *LI = cast<LoadInst>(Builder.CreateLoad(Bitcast));
+ LoadInst *LI = Builder.CreateAlignedLoad(Bitcast, Alignment);
propagateMetadata(LI, Chain);
- LI->setAlignment(Alignment);
if (VecLoadTy) {
SmallVector<Instruction *, 16> InstrsToErase;
; ALL: alloca [128 x i32], align 16
; UNALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; FIXME: Should change alignment
-; ALIGNED: load i32
-; ALIGNED: load i32
+; ALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 4{{$}}
define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
%alloca = alloca [128 x i32], align 16, addrspace(5)
%ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
ret void
}
-attributes #0 = { nounwind }
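+
+; The stores below start from an alloca with align 1. They should still be
+; merged; the ALIGNED run additionally expects the alloca and the merged
+; store to be realigned to 4, while the UNALIGNED run keeps align 1.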
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32(
+; ALIGNED: %alloca = alloca [8 x i32], align 4, addrspace(5)
+; ALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i32], align 1, addrspace(5)
+; UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32() {
+ %alloca = alloca [8 x i32], align 1, addrspace(5)
+ %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
+ %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
+ %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
+ %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
+
+ store i32 9, i32 addrspace(5)* %out, align 1
+ store i32 1, i32 addrspace(5)* %out.gep.1, align 1
+ store i32 23, i32 addrspace(5)* %out.gep.2, align 1
+ store i32 19, i32 addrspace(5)* %out.gep.3, align 1
+ ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
+; ALIGNED: %alloca = alloca [8 x i8], align 4, addrspace(5)
+; ALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i8], align 1, addrspace(5)
+; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8() {
+ %alloca = alloca [8 x i8], align 1, addrspace(5)
+ %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
+ %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
+ %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
+ %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3
+
+ store i8 9, i8 addrspace(5)* %out, align 1
+ store i8 1, i8 addrspace(5)* %out.gep.1, align 1
+ store i8 23, i8 addrspace(5)* %out.gep.2, align 1
+ store i8 19, i8 addrspace(5)* %out.gep.3, align 1
+ ret void
+}
+
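+; Same as above, but for load chains: ALIGNED expects the alloca and the
+; merged load to be realigned to 4, while UNALIGNED keeps align 1.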
+; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i32(
+; ALIGNED: %alloca = alloca [8 x i32], align 4, addrspace(5)
+; ALIGNED: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i32], align 1, addrspace(5)
+; UNALIGNED: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i32() {
+ %alloca = alloca [8 x i32], align 1, addrspace(5)
+ %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
+ %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
+ %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
+ %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
+
+ %load0 = load i32, i32 addrspace(5)* %out, align 1
+ %load1 = load i32, i32 addrspace(5)* %out.gep.1, align 1
+ %load2 = load i32, i32 addrspace(5)* %out.gep.2, align 1
+ %load3 = load i32, i32 addrspace(5)* %out.gep.3, align 1
+ ret void
+}
+
+; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i8(
+; ALIGNED: %alloca = alloca [8 x i8], align 4, addrspace(5)
+; ALIGNED: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i8], align 1, addrspace(5)
+; UNALIGNED: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i8() {
+ %alloca = alloca [8 x i8], align 1, addrspace(5)
+ %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
+ %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
+ %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
+ %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3
+
+ %load0 = load i8, i8 addrspace(5)* %out, align 1
+ %load1 = load i8, i8 addrspace(5)* %out.gep.1, align 1
+ %load2 = load i8, i8 addrspace(5)* %out.gep.2, align 1
+ %load3 = load i8, i8 addrspace(5)* %out.gep.3, align 1
+ ret void
+}
+
+attributes #0 = { nounwind }