From: Simon Pilgrim Date: Thu, 10 Sep 2020 11:17:54 +0000 (+0100) Subject: [CostModel][X86] Add vXi32 division by uniform constant costs (PR47476) X-Git-Tag: llvmorg-13-init~12480 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=de25ebaac6d2fed371fcd03d95b35eaa2207f395;p=platform%2Fupstream%2Fllvm.git [CostModel][X86] Add vXi32 division by uniform constant costs (PR47476) Other types can be handled in future patches but their uniform / non-uniform costs are more similar and don't appear to cause many vectorization issues. --- diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index c917974..03f8be0 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -321,6 +321,11 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand. { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand. { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb. + + { ISD::SDIV, MVT::v16i32, 6 }, // pmuludq sequence + { ISD::SREM, MVT::v16i32, 8 }, // pmuludq+mul+sub sequence + { ISD::UDIV, MVT::v16i32, 5 }, // pmuludq sequence + { ISD::UREM, MVT::v16i32, 7 }, // pmuludq+mul+sub sequence }; if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && @@ -336,6 +341,11 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SRA, MVT::v32i8, 4 }, // psrlw, pand, pxor, psubb. { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. + + { ISD::SDIV, MVT::v8i32, 6 }, // pmuludq sequence + { ISD::SREM, MVT::v8i32, 8 }, // pmuludq+mul+sub sequence + { ISD::UDIV, MVT::v8i32, 5 }, // pmuludq sequence + { ISD::UREM, MVT::v8i32, 7 }, // pmuludq+mul+sub sequence }; if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && @@ -353,6 +363,15 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split. { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split. { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split. + + { ISD::SDIV, MVT::v8i32, 12+2 }, // 2*pmuludq sequence + split. + { ISD::SREM, MVT::v8i32, 16+2 }, // 2*pmuludq+mul+sub sequence + split. + { ISD::SDIV, MVT::v4i32, 6 }, // pmuludq sequence + { ISD::SREM, MVT::v4i32, 8 }, // pmuludq+mul+sub sequence + { ISD::UDIV, MVT::v8i32, 10+2 }, // 2*pmuludq sequence + split. + { ISD::UREM, MVT::v8i32, 14+2 }, // 2*pmuludq+mul+sub sequence + split. + { ISD::UDIV, MVT::v4i32, 5 }, // pmuludq sequence + { ISD::UREM, MVT::v4i32, 7 }, // pmuludq+mul+sub sequence }; // XOP has faster vXi8 shifts. diff --git a/llvm/test/Analysis/CostModel/X86/div.ll b/llvm/test/Analysis/CostModel/X86/div.ll index fb3b705..4bead92 100644 --- a/llvm/test/Analysis/CostModel/X86/div.ll +++ b/llvm/test/Analysis/CostModel/X86/div.ll @@ -450,62 +450,24 @@ define i32 @udiv_const() { } define i32 @sdiv_uniformconst() { -; SSE2-LABEL: 'sdiv_uniformconst' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4i32 = sdiv <4 x i32> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8i32 = sdiv <8 x i32> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16i32 = sdiv <16 x i32> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSSE3-LABEL: 'sdiv_uniformconst' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4i32 = sdiv <4 x i32> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8i32 = sdiv <8 x i32> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16i32 = sdiv <16 x i32> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'sdiv_uniformconst' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8i32 = sdiv <8 x i32> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16i32 = sdiv <16 x i32> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'sdiv_uniformconst' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = sdiv <2 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = sdiv <4 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8i32 = sdiv <8 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16i32 = sdiv <16 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32i16 = sdiv <32 x i16> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = sdiv i8 undef, 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i8 = sdiv <16 x i8> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32i8 = sdiv <32 x i8> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V64i8 = sdiv <64 x i8> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'sdiv_uniformconst' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = sdiv i64 undef, 7 @@ -513,9 +475,9 @@ define i32 @sdiv_uniformconst() { ; AVX1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = sdiv <8 x i32> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i32 = sdiv <16 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = sdiv <4 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8i32 = sdiv <8 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16i32 = sdiv <16 x i32> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 ; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = sdiv <16 x i16> undef, @@ -532,9 +494,9 @@ define i32 @sdiv_uniformconst() { ; AVX2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = sdiv <8 x i32> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16i32 = sdiv <16 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = sdiv <4 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = sdiv <8 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i32 = sdiv <16 x i32> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, @@ -551,9 +513,9 @@ define i32 @sdiv_uniformconst() { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = sdiv <8 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = sdiv <16 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = sdiv <4 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = sdiv <8 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = sdiv <16 x i32> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, @@ -570,9 +532,9 @@ define i32 @sdiv_uniformconst() { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = sdiv <8 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = sdiv <16 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = sdiv <4 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i32 = sdiv <8 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i32 = sdiv <16 x i32> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = sdiv <16 x i16> undef, @@ -589,9 +551,9 @@ define i32 @sdiv_uniformconst() { ; SLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 -; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8i32 = sdiv <8 x i32> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16i32 = sdiv <16 x i32> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = sdiv <4 x i32> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8i32 = sdiv <8 x i32> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16i32 = sdiv <16 x i32> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 ; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = sdiv <16 x i16> undef, @@ -608,9 +570,9 @@ define i32 @sdiv_uniformconst() { ; BTVER2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = sdiv <4 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = sdiv <8 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = sdiv i32 undef, 7 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = sdiv <4 x i32> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = sdiv <8 x i32> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i32 = sdiv <16 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4i32 = sdiv <4 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8i32 = sdiv <8 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16i32 = sdiv <16 x i32> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = sdiv i16 undef, 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = sdiv <8 x i16> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = sdiv <16 x i16> undef, @@ -651,9 +613,9 @@ define i32 @udiv_uniformconst() { ; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7 -; SSE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, -; SSE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8i32 = udiv <8 x i32> undef, -; SSE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16i32 = udiv <16 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = udiv <4 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8i32 = udiv <8 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16i32 = udiv <16 x i32> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7 ; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = udiv <16 x i16> undef, @@ -670,9 +632,9 @@ define i32 @udiv_uniformconst() { ; AVX1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = udiv <8 x i32> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i32 = udiv <16 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = udiv <4 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8i32 = udiv <8 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16i32 = udiv <16 x i32> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7 ; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = udiv <16 x i16> undef, @@ -689,9 +651,9 @@ define i32 @udiv_uniformconst() { ; AVX2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = udiv <8 x i32> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V16i32 = udiv <16 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = udiv <4 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = udiv <8 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16i32 = udiv <16 x i32> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, @@ -708,9 +670,9 @@ define i32 @udiv_uniformconst() { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = udiv <8 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = udiv <16 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = udiv <4 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = udiv <8 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16i32 = udiv <16 x i32> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, @@ -727,9 +689,9 @@ define i32 @udiv_uniformconst() { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V8i32 = udiv <8 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16i32 = udiv <16 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = udiv <4 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8i32 = udiv <8 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16i32 = udiv <16 x i32> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16i16 = udiv <16 x i16> undef, @@ -746,9 +708,9 @@ define i32 @udiv_uniformconst() { ; SLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7 -; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8i32 = udiv <8 x i32> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16i32 = udiv <16 x i32> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = udiv <4 x i32> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8i32 = udiv <8 x i32> undef, +; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16i32 = udiv <16 x i32> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7 ; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, ; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16i16 = udiv <16 x i16> undef, @@ -765,9 +727,9 @@ define i32 @udiv_uniformconst() { ; BTVER2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = udiv <4 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = udiv <8 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = udiv i32 undef, 7 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4i32 = udiv <4 x i32> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8i32 = udiv <8 x i32> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16i32 = udiv <16 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4i32 = udiv <4 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8i32 = udiv <8 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16i32 = udiv <16 x i32> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = udiv i16 undef, 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8i16 = udiv <8 x i16> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i16 = udiv <16 x i16> undef, diff --git a/llvm/test/Analysis/CostModel/X86/rem.ll b/llvm/test/Analysis/CostModel/X86/rem.ll index 7942cda..30dd9a7 100644 --- a/llvm/test/Analysis/CostModel/X86/rem.ll +++ b/llvm/test/Analysis/CostModel/X86/rem.ll @@ -450,62 +450,24 @@ define i32 @urem_const() { } define i32 @srem_uniformconst() { -; SSE2-LABEL: 'srem_uniformconst' -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4i32 = srem <4 x i32> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8i32 = srem <8 x i32> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16i32 = srem <16 x i32> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSSE3-LABEL: 'srem_uniformconst' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4i32 = srem <4 x i32> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V8i32 = srem <8 x i32> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16i32 = srem <16 x i32> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'srem_uniformconst' -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = srem <8 x i32> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = srem <16 x i32> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'srem_uniformconst' +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = srem <4 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i32 = srem <8 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i32 = srem <16 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7 +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'srem_uniformconst' ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7 @@ -513,9 +475,9 @@ define i32 @srem_uniformconst() { ; AVX1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8i32 = srem <8 x i32> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16i32 = srem <16 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = srem <4 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8i32 = srem <8 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16i32 = srem <16 x i32> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 ; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = srem <16 x i16> undef, @@ -532,9 +494,9 @@ define i32 @srem_uniformconst() { ; AVX2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = srem <8 x i32> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V16i32 = srem <16 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = srem <4 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i32 = srem <8 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i32 = srem <16 x i32> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, @@ -551,9 +513,9 @@ define i32 @srem_uniformconst() { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = srem <8 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16i32 = srem <16 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = srem <4 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i32 = srem <8 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i32 = srem <16 x i32> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, @@ -570,9 +532,9 @@ define i32 @srem_uniformconst() { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = srem <8 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16i32 = srem <16 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = srem <4 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i32 = srem <8 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i32 = srem <16 x i32> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = srem <16 x i16> undef, @@ -583,53 +545,15 @@ define i32 @srem_uniformconst() { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64i8 = srem <64 x i8> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; SLM-LABEL: 'srem_uniformconst' -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7 -; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 -; SLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = srem <8 x i32> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = srem <16 x i32> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7 -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, -; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; GLM-LABEL: 'srem_uniformconst' -; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7 -; GLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 -; GLM-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = srem <8 x i32> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = srem <16 x i32> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 -; GLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = srem <16 x i16> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i16 = srem <32 x i16> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = srem i8 undef, 7 -; GLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i8 = srem <16 x i8> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32i8 = srem <32 x i8> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V64i8 = srem <64 x i8> undef, -; GLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; ; BTVER2-LABEL: 'srem_uniformconst' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = srem i64 undef, 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2i64 = srem <2 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = srem <4 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = srem <8 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = srem i32 undef, 7 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = srem <4 x i32> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8i32 = srem <8 x i32> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V16i32 = srem <16 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4i32 = srem <4 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8i32 = srem <8 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V16i32 = srem <16 x i32> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = srem i16 undef, 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = srem <8 x i16> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = srem <16 x i16> undef, @@ -670,9 +594,9 @@ define i32 @urem_uniformconst() { ; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7 -; SSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, -; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8i32 = urem <8 x i32> undef, -; SSE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16i32 = urem <16 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = urem <4 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V8i32 = urem <8 x i32> undef, +; SSE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V16i32 = urem <16 x i32> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7 ; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, ; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16i16 = urem <16 x i16> undef, @@ -689,9 +613,9 @@ define i32 @urem_uniformconst() { ; AVX1-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7 -; AVX1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8i32 = urem <8 x i32> undef, -; AVX1-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V16i32 = urem <16 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = urem <4 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i32 = urem <8 x i32> undef, +; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i32 = urem <16 x i32> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7 ; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, ; AVX1-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = urem <16 x i16> undef, @@ -708,9 +632,9 @@ define i32 @urem_uniformconst() { ; AVX2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7 -; AVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = urem <8 x i32> undef, -; AVX2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V16i32 = urem <16 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = urem <4 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = urem <8 x i32> undef, +; AVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16i32 = urem <16 x i32> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7 ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, ; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, @@ -727,9 +651,9 @@ define i32 @urem_uniformconst() { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7 -; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = urem <8 x i32> undef, -; AVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16i32 = urem <16 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = urem <4 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = urem <8 x i32> undef, +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16i32 = urem <16 x i32> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7 ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, ; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, @@ -746,9 +670,9 @@ define i32 @urem_uniformconst() { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7 -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8i32 = urem <8 x i32> undef, -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16i32 = urem <16 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = urem <4 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8i32 = urem <8 x i32> undef, +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16i32 = urem <16 x i32> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16i16 = urem <16 x i16> undef, @@ -765,9 +689,9 @@ define i32 @urem_uniformconst() { ; BTVER2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4i64 = urem <4 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V8i64 = urem <8 x i64> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = urem i32 undef, 7 -; BTVER2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4i32 = urem <4 x i32> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8i32 = urem <8 x i32> undef, -; BTVER2-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V16i32 = urem <16 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4i32 = urem <4 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8i32 = urem <8 x i32> undef, +; BTVER2-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16i32 = urem <16 x i32> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = urem i16 undef, 7 ; BTVER2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8i16 = urem <8 x i16> undef, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16i16 = urem <16 x i16> undef, diff --git a/llvm/test/Analysis/CostModel/X86/vdiv-cost.ll b/llvm/test/Analysis/CostModel/X86/vdiv-cost.ll index d87d21c..8552509 100644 --- a/llvm/test/Analysis/CostModel/X86/vdiv-cost.ll +++ b/llvm/test/Analysis/CostModel/X86/vdiv-cost.ll @@ -10,7 +10,7 @@ define <4 x i32> @test1(<4 x i32> %a) { ; CHECK-LABEL: 'test1' -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %div = udiv <4 x i32> %a, +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %div = udiv <4 x i32> %a, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %div ; %div = udiv <4 x i32> %a, @@ -19,19 +19,19 @@ define <4 x i32> @test1(<4 x i32> %a) { define <8 x i32> @test2(<8 x i32> %a) { ; SSE-LABEL: 'test2' -; SSE-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %div = udiv <8 x i32> %a, +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %div = udiv <8 x i32> %a, ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div ; ; AVX1-LABEL: 'test2' -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %div = udiv <8 x i32> %a, +; AVX1-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %div = udiv <8 x i32> %a, ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div ; ; AVX2-LABEL: 'test2' -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %div = udiv <8 x i32> %a, +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %div = udiv <8 x i32> %a, ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div ; ; AVX512-LABEL: 'test2' -; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %div = udiv <8 x i32> %a, +; AVX512-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %div = udiv <8 x i32> %a, ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div ; %div = udiv <8 x i32> %a, @@ -108,53 +108,29 @@ define <16 x i8> @test7(<16 x i8> %a) { } define <4 x i32> @test8(<4 x i32> %a) { -; SSE2-LABEL: 'test8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %div = sdiv <4 x i32> %a, -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %div -; -; SSSE3-LABEL: 'test8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %div = sdiv <4 x i32> %a, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %div -; -; SSE42-LABEL: 'test8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %div = sdiv <4 x i32> %a, -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %div -; -; AVX-LABEL: 'test8' -; AVX-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %div = sdiv <4 x i32> %a, -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %div -; -; AVX512-LABEL: 'test8' -; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %div = sdiv <4 x i32> %a, -; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %div +; CHECK-LABEL: 'test8' +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %div = sdiv <4 x i32> %a, +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %div ; %div = sdiv <4 x i32> %a, ret <4 x i32> %div } define <8 x i32> @test9(<8 x i32> %a) { -; SSE2-LABEL: 'test9' -; SSE2-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %div = sdiv <8 x i32> %a, -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div -; -; SSSE3-LABEL: 'test9' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %div = sdiv <8 x i32> %a, -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div -; -; SSE42-LABEL: 'test9' -; SSE42-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %div = sdiv <8 x i32> %a, -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div +; SSE-LABEL: 'test9' +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %div = sdiv <8 x i32> %a, +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div ; ; AVX1-LABEL: 'test9' -; AVX1-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %div = sdiv <8 x i32> %a, +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %div = sdiv <8 x i32> %a, ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div ; ; AVX2-LABEL: 'test9' -; AVX2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %div = sdiv <8 x i32> %a, +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %div = sdiv <8 x i32> %a, ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div ; ; AVX512-LABEL: 'test9' -; AVX512-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %div = sdiv <8 x i32> %a, +; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %div = sdiv <8 x i32> %a, ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %div ; %div = sdiv <8 x i32> %a, diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll index 30930ea..fb4ec00 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-div.ll @@ -24,166 +24,43 @@ define void @sdiv_v16i32_uniformconst() { ; SSE-LABEL: @sdiv_v16i32_uniformconst( -; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; SSE-NEXT: [[R0:%.*]] = sdiv i32 [[A0]], 5 -; SSE-NEXT: [[R1:%.*]] = sdiv i32 [[A1]], 5 -; SSE-NEXT: [[R2:%.*]] = sdiv i32 [[A2]], 5 -; SSE-NEXT: [[R3:%.*]] = sdiv i32 [[A3]], 5 -; SSE-NEXT: [[R4:%.*]] = sdiv i32 [[A4]], 5 -; SSE-NEXT: [[R5:%.*]] = sdiv i32 [[A5]], 5 -; SSE-NEXT: [[R6:%.*]] = sdiv i32 [[A6]], 5 -; SSE-NEXT: [[R7:%.*]] = sdiv i32 [[A7]], 5 -; SSE-NEXT: [[R8:%.*]] = sdiv i32 [[A8]], 5 -; SSE-NEXT: [[R9:%.*]] = sdiv i32 [[A9]], 5 -; SSE-NEXT: [[R10:%.*]] = sdiv i32 [[A10]], 5 -; SSE-NEXT: [[R11:%.*]] = sdiv i32 [[A11]], 5 -; SSE-NEXT: [[R12:%.*]] = sdiv i32 [[A12]], 5 -; SSE-NEXT: [[R13:%.*]] = sdiv i32 [[A13]], 5 -; SSE-NEXT: [[R14:%.*]] = sdiv i32 [[A14]], 5 -; SSE-NEXT: [[R15:%.*]] = sdiv i32 [[A15]], 5 -; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = sdiv <4 x i32> [[TMP1]], +; SSE-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[TMP2]], +; SSE-NEXT: [[TMP7:%.*]] = sdiv <4 x i32> [[TMP3]], +; SSE-NEXT: [[TMP8:%.*]] = sdiv <4 x i32> [[TMP4]], +; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @sdiv_v16i32_uniformconst( -; SLM-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; SLM-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; SLM-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; SLM-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; SLM-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; SLM-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; SLM-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; SLM-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; SLM-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; SLM-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; SLM-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; SLM-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; SLM-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; SLM-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; SLM-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; SLM-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; SLM-NEXT: [[R0:%.*]] = sdiv i32 [[A0]], 5 -; SLM-NEXT: [[R1:%.*]] = sdiv i32 [[A1]], 5 -; SLM-NEXT: [[R2:%.*]] = sdiv i32 [[A2]], 5 -; SLM-NEXT: [[R3:%.*]] = sdiv i32 [[A3]], 5 -; SLM-NEXT: [[R4:%.*]] = sdiv i32 [[A4]], 5 -; SLM-NEXT: [[R5:%.*]] = sdiv i32 [[A5]], 5 -; SLM-NEXT: [[R6:%.*]] = sdiv i32 [[A6]], 5 -; SLM-NEXT: [[R7:%.*]] = sdiv i32 [[A7]], 5 -; SLM-NEXT: [[R8:%.*]] = sdiv i32 [[A8]], 5 -; SLM-NEXT: [[R9:%.*]] = sdiv i32 [[A9]], 5 -; SLM-NEXT: [[R10:%.*]] = sdiv i32 [[A10]], 5 -; SLM-NEXT: [[R11:%.*]] = sdiv i32 [[A11]], 5 -; SLM-NEXT: [[R12:%.*]] = sdiv i32 [[A12]], 5 -; SLM-NEXT: [[R13:%.*]] = sdiv i32 [[A13]], 5 -; SLM-NEXT: [[R14:%.*]] = sdiv i32 [[A14]], 5 -; SLM-NEXT: [[R15:%.*]] = sdiv i32 [[A15]], 5 -; SLM-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; SLM-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; SLM-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; SLM-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; SLM-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; SLM-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; SLM-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; SLM-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; SLM-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; SLM-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; SLM-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; SLM-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; SLM-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; SLM-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; SLM-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; SLM-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = sdiv <4 x i32> [[TMP1]], +; SLM-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[TMP2]], +; SLM-NEXT: [[TMP7:%.*]] = sdiv <4 x i32> [[TMP3]], +; SLM-NEXT: [[TMP8:%.*]] = sdiv <4 x i32> [[TMP4]], +; SLM-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; -; AVX1-LABEL: @sdiv_v16i32_uniformconst( -; AVX1-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; AVX1-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; AVX1-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; AVX1-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; AVX1-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; AVX1-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; AVX1-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; AVX1-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; AVX1-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; AVX1-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; AVX1-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; AVX1-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; AVX1-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; AVX1-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; AVX1-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; AVX1-NEXT: [[R0:%.*]] = sdiv i32 [[A0]], 5 -; AVX1-NEXT: [[R1:%.*]] = sdiv i32 [[A1]], 5 -; AVX1-NEXT: [[R2:%.*]] = sdiv i32 [[A2]], 5 -; AVX1-NEXT: [[R3:%.*]] = sdiv i32 [[A3]], 5 -; AVX1-NEXT: [[R4:%.*]] = sdiv i32 [[A4]], 5 -; AVX1-NEXT: [[R5:%.*]] = sdiv i32 [[A5]], 5 -; AVX1-NEXT: [[R6:%.*]] = sdiv i32 [[A6]], 5 -; AVX1-NEXT: [[R7:%.*]] = sdiv i32 [[A7]], 5 -; AVX1-NEXT: [[R8:%.*]] = sdiv i32 [[A8]], 5 -; AVX1-NEXT: [[R9:%.*]] = sdiv i32 [[A9]], 5 -; AVX1-NEXT: [[R10:%.*]] = sdiv i32 [[A10]], 5 -; AVX1-NEXT: [[R11:%.*]] = sdiv i32 [[A11]], 5 -; AVX1-NEXT: [[R12:%.*]] = sdiv i32 [[A12]], 5 -; AVX1-NEXT: [[R13:%.*]] = sdiv i32 [[A13]], 5 -; AVX1-NEXT: [[R14:%.*]] = sdiv i32 [[A14]], 5 -; AVX1-NEXT: [[R15:%.*]] = sdiv i32 [[A15]], 5 -; AVX1-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; AVX1-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; AVX1-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; AVX1-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; AVX1-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; AVX1-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; AVX1-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; AVX1-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; AVX1-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; AVX1-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; AVX1-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; AVX1-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; AVX1-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; AVX1-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; AVX1-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; AVX1-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @sdiv_v16i32_uniformconst( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX2-NEXT: [[TMP3:%.*]] = sdiv <8 x i32> [[TMP1]], -; AVX2-NEXT: [[TMP4:%.*]] = sdiv <8 x i32> [[TMP2]], -; AVX2-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 -; AVX2-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX2-NEXT: ret void +; AVX-LABEL: @sdiv_v16i32_uniformconst( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = sdiv <8 x i32> [[TMP1]], +; AVX-NEXT: [[TMP4:%.*]] = sdiv <8 x i32> [[TMP2]], +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: ret void ; ; AVX512-LABEL: @sdiv_v16i32_uniformconst( ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4 @@ -244,166 +121,43 @@ define void @sdiv_v16i32_uniformconst() { define void @srem_v16i32_uniformconst() { ; SSE-LABEL: @srem_v16i32_uniformconst( -; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; SSE-NEXT: [[R0:%.*]] = srem i32 [[A0]], 5 -; SSE-NEXT: [[R1:%.*]] = srem i32 [[A1]], 5 -; SSE-NEXT: [[R2:%.*]] = srem i32 [[A2]], 5 -; SSE-NEXT: [[R3:%.*]] = srem i32 [[A3]], 5 -; SSE-NEXT: [[R4:%.*]] = srem i32 [[A4]], 5 -; SSE-NEXT: [[R5:%.*]] = srem i32 [[A5]], 5 -; SSE-NEXT: [[R6:%.*]] = srem i32 [[A6]], 5 -; SSE-NEXT: [[R7:%.*]] = srem i32 [[A7]], 5 -; SSE-NEXT: [[R8:%.*]] = srem i32 [[A8]], 5 -; SSE-NEXT: [[R9:%.*]] = srem i32 [[A9]], 5 -; SSE-NEXT: [[R10:%.*]] = srem i32 [[A10]], 5 -; SSE-NEXT: [[R11:%.*]] = srem i32 [[A11]], 5 -; SSE-NEXT: [[R12:%.*]] = srem i32 [[A12]], 5 -; SSE-NEXT: [[R13:%.*]] = srem i32 [[A13]], 5 -; SSE-NEXT: [[R14:%.*]] = srem i32 [[A14]], 5 -; SSE-NEXT: [[R15:%.*]] = srem i32 [[A15]], 5 -; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = srem <4 x i32> [[TMP1]], +; SSE-NEXT: [[TMP6:%.*]] = srem <4 x i32> [[TMP2]], +; SSE-NEXT: [[TMP7:%.*]] = srem <4 x i32> [[TMP3]], +; SSE-NEXT: [[TMP8:%.*]] = srem <4 x i32> [[TMP4]], +; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @srem_v16i32_uniformconst( -; SLM-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; SLM-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; SLM-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; SLM-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; SLM-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; SLM-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; SLM-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; SLM-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; SLM-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; SLM-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; SLM-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; SLM-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; SLM-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; SLM-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; SLM-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; SLM-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; SLM-NEXT: [[R0:%.*]] = srem i32 [[A0]], 5 -; SLM-NEXT: [[R1:%.*]] = srem i32 [[A1]], 5 -; SLM-NEXT: [[R2:%.*]] = srem i32 [[A2]], 5 -; SLM-NEXT: [[R3:%.*]] = srem i32 [[A3]], 5 -; SLM-NEXT: [[R4:%.*]] = srem i32 [[A4]], 5 -; SLM-NEXT: [[R5:%.*]] = srem i32 [[A5]], 5 -; SLM-NEXT: [[R6:%.*]] = srem i32 [[A6]], 5 -; SLM-NEXT: [[R7:%.*]] = srem i32 [[A7]], 5 -; SLM-NEXT: [[R8:%.*]] = srem i32 [[A8]], 5 -; SLM-NEXT: [[R9:%.*]] = srem i32 [[A9]], 5 -; SLM-NEXT: [[R10:%.*]] = srem i32 [[A10]], 5 -; SLM-NEXT: [[R11:%.*]] = srem i32 [[A11]], 5 -; SLM-NEXT: [[R12:%.*]] = srem i32 [[A12]], 5 -; SLM-NEXT: [[R13:%.*]] = srem i32 [[A13]], 5 -; SLM-NEXT: [[R14:%.*]] = srem i32 [[A14]], 5 -; SLM-NEXT: [[R15:%.*]] = srem i32 [[A15]], 5 -; SLM-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; SLM-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; SLM-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; SLM-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; SLM-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; SLM-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; SLM-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; SLM-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; SLM-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; SLM-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; SLM-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; SLM-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; SLM-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; SLM-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; SLM-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; SLM-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = srem <4 x i32> [[TMP1]], +; SLM-NEXT: [[TMP6:%.*]] = srem <4 x i32> [[TMP2]], +; SLM-NEXT: [[TMP7:%.*]] = srem <4 x i32> [[TMP3]], +; SLM-NEXT: [[TMP8:%.*]] = srem <4 x i32> [[TMP4]], +; SLM-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; -; AVX1-LABEL: @srem_v16i32_uniformconst( -; AVX1-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; AVX1-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; AVX1-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; AVX1-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; AVX1-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; AVX1-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; AVX1-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; AVX1-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; AVX1-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; AVX1-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; AVX1-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; AVX1-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; AVX1-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; AVX1-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; AVX1-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; AVX1-NEXT: [[R0:%.*]] = srem i32 [[A0]], 5 -; AVX1-NEXT: [[R1:%.*]] = srem i32 [[A1]], 5 -; AVX1-NEXT: [[R2:%.*]] = srem i32 [[A2]], 5 -; AVX1-NEXT: [[R3:%.*]] = srem i32 [[A3]], 5 -; AVX1-NEXT: [[R4:%.*]] = srem i32 [[A4]], 5 -; AVX1-NEXT: [[R5:%.*]] = srem i32 [[A5]], 5 -; AVX1-NEXT: [[R6:%.*]] = srem i32 [[A6]], 5 -; AVX1-NEXT: [[R7:%.*]] = srem i32 [[A7]], 5 -; AVX1-NEXT: [[R8:%.*]] = srem i32 [[A8]], 5 -; AVX1-NEXT: [[R9:%.*]] = srem i32 [[A9]], 5 -; AVX1-NEXT: [[R10:%.*]] = srem i32 [[A10]], 5 -; AVX1-NEXT: [[R11:%.*]] = srem i32 [[A11]], 5 -; AVX1-NEXT: [[R12:%.*]] = srem i32 [[A12]], 5 -; AVX1-NEXT: [[R13:%.*]] = srem i32 [[A13]], 5 -; AVX1-NEXT: [[R14:%.*]] = srem i32 [[A14]], 5 -; AVX1-NEXT: [[R15:%.*]] = srem i32 [[A15]], 5 -; AVX1-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; AVX1-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; AVX1-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; AVX1-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; AVX1-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; AVX1-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; AVX1-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; AVX1-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; AVX1-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; AVX1-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; AVX1-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; AVX1-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; AVX1-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; AVX1-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; AVX1-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; AVX1-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @srem_v16i32_uniformconst( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX2-NEXT: [[TMP3:%.*]] = srem <8 x i32> [[TMP1]], -; AVX2-NEXT: [[TMP4:%.*]] = srem <8 x i32> [[TMP2]], -; AVX2-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 -; AVX2-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX2-NEXT: ret void +; AVX-LABEL: @srem_v16i32_uniformconst( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = srem <8 x i32> [[TMP1]], +; AVX-NEXT: [[TMP4:%.*]] = srem <8 x i32> [[TMP2]], +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: ret void ; ; AVX512-LABEL: @srem_v16i32_uniformconst( ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4 @@ -464,166 +218,43 @@ define void @srem_v16i32_uniformconst() { define void @udiv_v16i32_uniformconst() { ; SSE-LABEL: @udiv_v16i32_uniformconst( -; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; SSE-NEXT: [[R0:%.*]] = udiv i32 [[A0]], 5 -; SSE-NEXT: [[R1:%.*]] = udiv i32 [[A1]], 5 -; SSE-NEXT: [[R2:%.*]] = udiv i32 [[A2]], 5 -; SSE-NEXT: [[R3:%.*]] = udiv i32 [[A3]], 5 -; SSE-NEXT: [[R4:%.*]] = udiv i32 [[A4]], 5 -; SSE-NEXT: [[R5:%.*]] = udiv i32 [[A5]], 5 -; SSE-NEXT: [[R6:%.*]] = udiv i32 [[A6]], 5 -; SSE-NEXT: [[R7:%.*]] = udiv i32 [[A7]], 5 -; SSE-NEXT: [[R8:%.*]] = udiv i32 [[A8]], 5 -; SSE-NEXT: [[R9:%.*]] = udiv i32 [[A9]], 5 -; SSE-NEXT: [[R10:%.*]] = udiv i32 [[A10]], 5 -; SSE-NEXT: [[R11:%.*]] = udiv i32 [[A11]], 5 -; SSE-NEXT: [[R12:%.*]] = udiv i32 [[A12]], 5 -; SSE-NEXT: [[R13:%.*]] = udiv i32 [[A13]], 5 -; SSE-NEXT: [[R14:%.*]] = udiv i32 [[A14]], 5 -; SSE-NEXT: [[R15:%.*]] = udiv i32 [[A15]], 5 -; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = udiv <4 x i32> [[TMP1]], +; SSE-NEXT: [[TMP6:%.*]] = udiv <4 x i32> [[TMP2]], +; SSE-NEXT: [[TMP7:%.*]] = udiv <4 x i32> [[TMP3]], +; SSE-NEXT: [[TMP8:%.*]] = udiv <4 x i32> [[TMP4]], +; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @udiv_v16i32_uniformconst( -; SLM-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; SLM-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; SLM-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; SLM-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; SLM-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; SLM-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; SLM-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; SLM-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; SLM-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; SLM-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; SLM-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; SLM-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; SLM-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; SLM-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; SLM-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; SLM-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; SLM-NEXT: [[R0:%.*]] = udiv i32 [[A0]], 5 -; SLM-NEXT: [[R1:%.*]] = udiv i32 [[A1]], 5 -; SLM-NEXT: [[R2:%.*]] = udiv i32 [[A2]], 5 -; SLM-NEXT: [[R3:%.*]] = udiv i32 [[A3]], 5 -; SLM-NEXT: [[R4:%.*]] = udiv i32 [[A4]], 5 -; SLM-NEXT: [[R5:%.*]] = udiv i32 [[A5]], 5 -; SLM-NEXT: [[R6:%.*]] = udiv i32 [[A6]], 5 -; SLM-NEXT: [[R7:%.*]] = udiv i32 [[A7]], 5 -; SLM-NEXT: [[R8:%.*]] = udiv i32 [[A8]], 5 -; SLM-NEXT: [[R9:%.*]] = udiv i32 [[A9]], 5 -; SLM-NEXT: [[R10:%.*]] = udiv i32 [[A10]], 5 -; SLM-NEXT: [[R11:%.*]] = udiv i32 [[A11]], 5 -; SLM-NEXT: [[R12:%.*]] = udiv i32 [[A12]], 5 -; SLM-NEXT: [[R13:%.*]] = udiv i32 [[A13]], 5 -; SLM-NEXT: [[R14:%.*]] = udiv i32 [[A14]], 5 -; SLM-NEXT: [[R15:%.*]] = udiv i32 [[A15]], 5 -; SLM-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; SLM-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; SLM-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; SLM-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; SLM-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; SLM-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; SLM-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; SLM-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; SLM-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; SLM-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; SLM-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; SLM-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; SLM-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; SLM-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; SLM-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; SLM-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = udiv <4 x i32> [[TMP1]], +; SLM-NEXT: [[TMP6:%.*]] = udiv <4 x i32> [[TMP2]], +; SLM-NEXT: [[TMP7:%.*]] = udiv <4 x i32> [[TMP3]], +; SLM-NEXT: [[TMP8:%.*]] = udiv <4 x i32> [[TMP4]], +; SLM-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; -; AVX1-LABEL: @udiv_v16i32_uniformconst( -; AVX1-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; AVX1-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; AVX1-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; AVX1-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; AVX1-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; AVX1-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; AVX1-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; AVX1-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; AVX1-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; AVX1-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; AVX1-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; AVX1-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; AVX1-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; AVX1-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; AVX1-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; AVX1-NEXT: [[R0:%.*]] = udiv i32 [[A0]], 5 -; AVX1-NEXT: [[R1:%.*]] = udiv i32 [[A1]], 5 -; AVX1-NEXT: [[R2:%.*]] = udiv i32 [[A2]], 5 -; AVX1-NEXT: [[R3:%.*]] = udiv i32 [[A3]], 5 -; AVX1-NEXT: [[R4:%.*]] = udiv i32 [[A4]], 5 -; AVX1-NEXT: [[R5:%.*]] = udiv i32 [[A5]], 5 -; AVX1-NEXT: [[R6:%.*]] = udiv i32 [[A6]], 5 -; AVX1-NEXT: [[R7:%.*]] = udiv i32 [[A7]], 5 -; AVX1-NEXT: [[R8:%.*]] = udiv i32 [[A8]], 5 -; AVX1-NEXT: [[R9:%.*]] = udiv i32 [[A9]], 5 -; AVX1-NEXT: [[R10:%.*]] = udiv i32 [[A10]], 5 -; AVX1-NEXT: [[R11:%.*]] = udiv i32 [[A11]], 5 -; AVX1-NEXT: [[R12:%.*]] = udiv i32 [[A12]], 5 -; AVX1-NEXT: [[R13:%.*]] = udiv i32 [[A13]], 5 -; AVX1-NEXT: [[R14:%.*]] = udiv i32 [[A14]], 5 -; AVX1-NEXT: [[R15:%.*]] = udiv i32 [[A15]], 5 -; AVX1-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; AVX1-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; AVX1-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; AVX1-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; AVX1-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; AVX1-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; AVX1-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; AVX1-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; AVX1-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; AVX1-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; AVX1-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; AVX1-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; AVX1-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; AVX1-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; AVX1-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; AVX1-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @udiv_v16i32_uniformconst( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX2-NEXT: [[TMP3:%.*]] = udiv <8 x i32> [[TMP1]], -; AVX2-NEXT: [[TMP4:%.*]] = udiv <8 x i32> [[TMP2]], -; AVX2-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 -; AVX2-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX2-NEXT: ret void +; AVX-LABEL: @udiv_v16i32_uniformconst( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = udiv <8 x i32> [[TMP1]], +; AVX-NEXT: [[TMP4:%.*]] = udiv <8 x i32> [[TMP2]], +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: ret void ; ; AVX512-LABEL: @udiv_v16i32_uniformconst( ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4 @@ -684,166 +315,43 @@ define void @udiv_v16i32_uniformconst() { define void @urem_v16i32_uniformconst() { ; SSE-LABEL: @urem_v16i32_uniformconst( -; SSE-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; SSE-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; SSE-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; SSE-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; SSE-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; SSE-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; SSE-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; SSE-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; SSE-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; SSE-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; SSE-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; SSE-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; SSE-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; SSE-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; SSE-NEXT: [[R0:%.*]] = urem i32 [[A0]], 5 -; SSE-NEXT: [[R1:%.*]] = urem i32 [[A1]], 5 -; SSE-NEXT: [[R2:%.*]] = urem i32 [[A2]], 5 -; SSE-NEXT: [[R3:%.*]] = urem i32 [[A3]], 5 -; SSE-NEXT: [[R4:%.*]] = urem i32 [[A4]], 5 -; SSE-NEXT: [[R5:%.*]] = urem i32 [[A5]], 5 -; SSE-NEXT: [[R6:%.*]] = urem i32 [[A6]], 5 -; SSE-NEXT: [[R7:%.*]] = urem i32 [[A7]], 5 -; SSE-NEXT: [[R8:%.*]] = urem i32 [[A8]], 5 -; SSE-NEXT: [[R9:%.*]] = urem i32 [[A9]], 5 -; SSE-NEXT: [[R10:%.*]] = urem i32 [[A10]], 5 -; SSE-NEXT: [[R11:%.*]] = urem i32 [[A11]], 5 -; SSE-NEXT: [[R12:%.*]] = urem i32 [[A12]], 5 -; SSE-NEXT: [[R13:%.*]] = urem i32 [[A13]], 5 -; SSE-NEXT: [[R14:%.*]] = urem i32 [[A14]], 5 -; SSE-NEXT: [[R15:%.*]] = urem i32 [[A15]], 5 -; SSE-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; SSE-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; SSE-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; SSE-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; SSE-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; SSE-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; SSE-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; SSE-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; SSE-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; SSE-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; SSE-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; SSE-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; SSE-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; SSE-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; SSE-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; SSE-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; SSE-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SSE-NEXT: [[TMP5:%.*]] = urem <4 x i32> [[TMP1]], +; SSE-NEXT: [[TMP6:%.*]] = urem <4 x i32> [[TMP2]], +; SSE-NEXT: [[TMP7:%.*]] = urem <4 x i32> [[TMP3]], +; SSE-NEXT: [[TMP8:%.*]] = urem <4 x i32> [[TMP4]], +; SSE-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SSE-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SSE-NEXT: ret void ; ; SLM-LABEL: @urem_v16i32_uniformconst( -; SLM-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; SLM-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; SLM-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; SLM-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; SLM-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; SLM-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; SLM-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; SLM-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; SLM-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; SLM-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; SLM-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; SLM-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; SLM-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; SLM-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; SLM-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; SLM-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; SLM-NEXT: [[R0:%.*]] = urem i32 [[A0]], 5 -; SLM-NEXT: [[R1:%.*]] = urem i32 [[A1]], 5 -; SLM-NEXT: [[R2:%.*]] = urem i32 [[A2]], 5 -; SLM-NEXT: [[R3:%.*]] = urem i32 [[A3]], 5 -; SLM-NEXT: [[R4:%.*]] = urem i32 [[A4]], 5 -; SLM-NEXT: [[R5:%.*]] = urem i32 [[A5]], 5 -; SLM-NEXT: [[R6:%.*]] = urem i32 [[A6]], 5 -; SLM-NEXT: [[R7:%.*]] = urem i32 [[A7]], 5 -; SLM-NEXT: [[R8:%.*]] = urem i32 [[A8]], 5 -; SLM-NEXT: [[R9:%.*]] = urem i32 [[A9]], 5 -; SLM-NEXT: [[R10:%.*]] = urem i32 [[A10]], 5 -; SLM-NEXT: [[R11:%.*]] = urem i32 [[A11]], 5 -; SLM-NEXT: [[R12:%.*]] = urem i32 [[A12]], 5 -; SLM-NEXT: [[R13:%.*]] = urem i32 [[A13]], 5 -; SLM-NEXT: [[R14:%.*]] = urem i32 [[A14]], 5 -; SLM-NEXT: [[R15:%.*]] = urem i32 [[A15]], 5 -; SLM-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; SLM-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; SLM-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; SLM-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; SLM-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; SLM-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; SLM-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; SLM-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; SLM-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; SLM-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; SLM-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; SLM-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; SLM-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; SLM-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; SLM-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; SLM-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 +; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4 +; SLM-NEXT: [[TMP5:%.*]] = urem <4 x i32> [[TMP1]], +; SLM-NEXT: [[TMP6:%.*]] = urem <4 x i32> [[TMP2]], +; SLM-NEXT: [[TMP7:%.*]] = urem <4 x i32> [[TMP3]], +; SLM-NEXT: [[TMP8:%.*]] = urem <4 x i32> [[TMP4]], +; SLM-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP6]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4 +; SLM-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4 ; SLM-NEXT: ret void ; -; AVX1-LABEL: @urem_v16i32_uniformconst( -; AVX1-NEXT: [[A0:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 0), align 4 -; AVX1-NEXT: [[A1:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[A2:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 2), align 4 -; AVX1-NEXT: [[A3:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 3), align 4 -; AVX1-NEXT: [[A4:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4), align 4 -; AVX1-NEXT: [[A5:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 5), align 4 -; AVX1-NEXT: [[A6:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 6), align 4 -; AVX1-NEXT: [[A7:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 7), align 4 -; AVX1-NEXT: [[A8:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8), align 4 -; AVX1-NEXT: [[A9:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 9), align 4 -; AVX1-NEXT: [[A10:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 10), align 4 -; AVX1-NEXT: [[A11:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 11), align 4 -; AVX1-NEXT: [[A12:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12), align 4 -; AVX1-NEXT: [[A13:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 13), align 4 -; AVX1-NEXT: [[A14:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 14), align 4 -; AVX1-NEXT: [[A15:%.*]] = load i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 15), align 4 -; AVX1-NEXT: [[R0:%.*]] = urem i32 [[A0]], 5 -; AVX1-NEXT: [[R1:%.*]] = urem i32 [[A1]], 5 -; AVX1-NEXT: [[R2:%.*]] = urem i32 [[A2]], 5 -; AVX1-NEXT: [[R3:%.*]] = urem i32 [[A3]], 5 -; AVX1-NEXT: [[R4:%.*]] = urem i32 [[A4]], 5 -; AVX1-NEXT: [[R5:%.*]] = urem i32 [[A5]], 5 -; AVX1-NEXT: [[R6:%.*]] = urem i32 [[A6]], 5 -; AVX1-NEXT: [[R7:%.*]] = urem i32 [[A7]], 5 -; AVX1-NEXT: [[R8:%.*]] = urem i32 [[A8]], 5 -; AVX1-NEXT: [[R9:%.*]] = urem i32 [[A9]], 5 -; AVX1-NEXT: [[R10:%.*]] = urem i32 [[A10]], 5 -; AVX1-NEXT: [[R11:%.*]] = urem i32 [[A11]], 5 -; AVX1-NEXT: [[R12:%.*]] = urem i32 [[A12]], 5 -; AVX1-NEXT: [[R13:%.*]] = urem i32 [[A13]], 5 -; AVX1-NEXT: [[R14:%.*]] = urem i32 [[A14]], 5 -; AVX1-NEXT: [[R15:%.*]] = urem i32 [[A15]], 5 -; AVX1-NEXT: store i32 [[R0]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 0), align 4 -; AVX1-NEXT: store i32 [[R1]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 1), align 4 -; AVX1-NEXT: store i32 [[R2]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 2), align 4 -; AVX1-NEXT: store i32 [[R3]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 3), align 4 -; AVX1-NEXT: store i32 [[R4]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4), align 4 -; AVX1-NEXT: store i32 [[R5]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 5), align 4 -; AVX1-NEXT: store i32 [[R6]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 6), align 4 -; AVX1-NEXT: store i32 [[R7]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 7), align 4 -; AVX1-NEXT: store i32 [[R8]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8), align 4 -; AVX1-NEXT: store i32 [[R9]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 9), align 4 -; AVX1-NEXT: store i32 [[R10]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 10), align 4 -; AVX1-NEXT: store i32 [[R11]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 11), align 4 -; AVX1-NEXT: store i32 [[R12]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12), align 4 -; AVX1-NEXT: store i32 [[R13]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 13), align 4 -; AVX1-NEXT: store i32 [[R14]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 14), align 4 -; AVX1-NEXT: store i32 [[R15]], i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 15), align 4 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @urem_v16i32_uniformconst( -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 -; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX2-NEXT: [[TMP3:%.*]] = urem <8 x i32> [[TMP1]], -; AVX2-NEXT: [[TMP4:%.*]] = urem <8 x i32> [[TMP2]], -; AVX2-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 -; AVX2-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 -; AVX2-NEXT: ret void +; AVX-LABEL: @urem_v16i32_uniformconst( +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: [[TMP3:%.*]] = urem <8 x i32> [[TMP1]], +; AVX-NEXT: [[TMP4:%.*]] = urem <8 x i32> [[TMP2]], +; AVX-NEXT: store <8 x i32> [[TMP3]], <8 x i32>* bitcast ([16 x i32]* @c32 to <8 x i32>*), align 4 +; AVX-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <8 x i32>*), align 4 +; AVX-NEXT: ret void ; ; AVX512-LABEL: @urem_v16i32_uniformconst( ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([16 x i32]* @a32 to <16 x i32>*), align 4