// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
// BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR4:[0-9]+]]
// BE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]]) #[[ATTR4]]
// BE-PWR9-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// BE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
//
// LE-PWR9-LABEL: @test_ldrmb1(
// LE-PWR9-NEXT: entry:
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]]
// LE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// LE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
//
// BE32-PWR9-LABEL: @test_ldrmb1(
// BE32-PWR9-NEXT: entry:
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: store i64 1, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb1(
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: store i64 2, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb2(
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: store i64 3, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb3(
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: store i64 4, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb4(
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: store i64 5, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb5(
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: store i64 6, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb6(
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: store i64 7, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb7(
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: store i64 8, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb8(
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
// BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]]) #[[ATTR4]]
// BE-PWR9-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// BE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
//
// LE-PWR9-LABEL: @test_ldrmb9(
// LE-PWR9-NEXT: entry:
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]]
// LE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
// LE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
//
// BE32-PWR9-LABEL: @test_ldrmb9(
// BE32-PWR9-NEXT: entry:
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: store i64 9, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
// BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb9(
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: store i64 10, i64* [[__C_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
-// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
-// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]]
+// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
+// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
+// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
+// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
+// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
+// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
+// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
+// BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb10(
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
+// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
+// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: store i64 11, i64* [[__C_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
-// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
-// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]]
+// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
+// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
+// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
+// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
+// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
+// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
+// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
+// BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb11(
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
+// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
+// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: store i64 12, i64* [[__C_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
-// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
-// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]]
+// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
+// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
+// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
+// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
+// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
+// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
+// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
+// BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb12(
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
+// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
+// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: store i64 13, i64* [[__C_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
-// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
-// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]]
+// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
+// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
+// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
+// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
+// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
+// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
+// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
+// BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb13(
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
+// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
+// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: store i64 14, i64* [[__C_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
-// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
-// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]]
+// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
+// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
+// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
+// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
+// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
+// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
+// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
+// BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb14(
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
+// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
+// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: store i64 15, i64* [[__C_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
-// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
-// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]]
+// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
+// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
+// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
+// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
+// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
+// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
+// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
+// BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb15(
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i64, align 8
// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
+// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: [[TMP0:%.*]] = load i8*, i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__A_ADDR_I]], align 8
// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP2]], 56
-// BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR3]]
+// BE-PWR9-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.ppc.vsx.lxvll(i8* [[TMP1]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__RES_I]], align 16
-// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
-// BE-PWR9-NEXT: ret <16 x i8> [[TMP5]]
+// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__B_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP5]]
+// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
+// BE-PWR9-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
+// BE-PWR9-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsr(i8* [[TMP6]]) #[[ATTR4]]
+// BE-PWR9-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[__MASK_I]], align 16
+// BE-PWR9-NEXT: [[TMP8:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
+// BE-PWR9-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP10:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
+// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
+// BE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
+// BE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
//
// LE-PWR9-LABEL: @test_ldrmb16(
// LE-PWR9-NEXT: entry:
// LE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
// LE-PWR9-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP9]], <4 x i32> [[TMP11]], <16 x i8> [[TMP12]]) #[[ATTR4]]
// LE-PWR9-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP13]] to <16 x i8>
-// LE-PWR9-NEXT: store <16 x i8> [[TMP14]], <16 x i8>* [[__RES_I]], align 16
-// LE-PWR9-NEXT: [[TMP15:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
-// LE-PWR9-NEXT: ret <16 x i8> [[TMP15]]
+// LE-PWR9-NEXT: ret <16 x i8> [[TMP14]]
//
// BE32-PWR9-LABEL: @test_ldrmb16(
// BE32-PWR9-NEXT: entry:
// BE-PWR9-NEXT: [[__A_ADDR_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[__B_ADDR_I:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[__C_ADDR_I:%.*]] = alloca i64, align 8
+// BE-PWR9-NEXT: [[__MASK_I:%.*]] = alloca <16 x i8>, align 16
+// BE-PWR9-NEXT: [[__RES_I:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: [[PTR_ADDR:%.*]] = alloca i8*, align 8
// BE-PWR9-NEXT: [[DATA_ADDR:%.*]] = alloca <16 x i8>, align 16
// BE-PWR9-NEXT: store i8* [[PTR:%.*]], i8** [[PTR_ADDR]], align 8
// BE-PWR9-NEXT: store <16 x i8> [[TMP0]], <16 x i8>* [[__A_ADDR_I]], align 16
// BE-PWR9-NEXT: store i8* [[TMP1]], i8** [[__B_ADDR_I]], align 8
// BE-PWR9-NEXT: store i64 16, i64* [[__C_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
-// BE-PWR9-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
-// BE-PWR9-NEXT: [[TMP4:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[TMP5:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
-// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP5]], 56
-// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP3]], i8* [[TMP4]], i64 [[SHL_I]]) #[[ATTR3]]
+// BE-PWR9-NEXT: [[TMP2:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[SUB_I:%.*]] = sub i64 16, [[TMP2]]
+// BE-PWR9-NEXT: [[CONV_I:%.*]] = trunc i64 [[SUB_I]] to i8
+// BE-PWR9-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* null, i8 [[CONV_I]]
+// BE-PWR9-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.ppc.altivec.lvsl(i8* [[TMP3]]) #[[ATTR4]]
+// BE-PWR9-NEXT: store <16 x i8> [[TMP4]], <16 x i8>* [[__MASK_I]], align 16
+// BE-PWR9-NEXT: [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
+// BE-PWR9-NEXT: [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP7:%.*]] = load <16 x i8>, <16 x i8>* [[__A_ADDR_I]], align 16
+// BE-PWR9-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP9:%.*]] = load <16 x i8>, <16 x i8>* [[__MASK_I]], align 16
+// BE-PWR9-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[TMP6]], <4 x i32> [[TMP8]], <16 x i8> [[TMP9]]) #[[ATTR4]]
+// BE-PWR9-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// BE-PWR9-NEXT: store <16 x i8> [[TMP11]], <16 x i8>* [[__RES_I]], align 16
+// BE-PWR9-NEXT: [[TMP12:%.*]] = load <16 x i8>, <16 x i8>* [[__RES_I]], align 16
+// BE-PWR9-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
+// BE-PWR9-NEXT: [[TMP14:%.*]] = load i8*, i8** [[__B_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[TMP15:%.*]] = load i64, i64* [[__C_ADDR_I]], align 8
+// BE-PWR9-NEXT: [[SHL_I:%.*]] = shl i64 [[TMP15]], 56
+// BE-PWR9-NEXT: call void @llvm.ppc.vsx.stxvll(<4 x i32> [[TMP13]], i8* [[TMP14]], i64 [[SHL_I]]) #[[ATTR4]]
// BE-PWR9-NEXT: ret void
//
// LE-PWR9-LABEL: @test_strmb16(