From fa9c12ed964b8201e142d78e430ad4c76bd7af62 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Thu, 13 Oct 2022 14:34:01 +0100
Subject: [PATCH] [X86] Attempt to combine binary shuffles where both operands
 come from the same larger vector

Allows us to use combineX86ShuffleChainWithExtract to combine
targetshuffle(low_subvector(x),high_subvector(x)) ->
low_subvector(targetshuffle(x)) style patterns.

This is currently very limited (the result must be v2i64/v2f64), but
while triaging I noticed we might be able to extend this to allow more
types for targets with suitable variable cross-lane shuffle support.

Fixes #58339
---
 llvm/lib/Target/X86/X86ISelLowering.cpp            | 20 +++++--
 .../CodeGen/X86/avx512-shuffles/partial_permute.ll | 67 +++++++++++-----------
 llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll     | 22 ++++---
 .../CodeGen/X86/vector-shuffle-combining-avx2.ll   |  5 +-
 llvm/test/CodeGen/X86/vector-shuffle-combining.ll  | 25 +++++---
 5 files changed, 83 insertions(+), 56 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c9942ab..f3668b3 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39550,10 +39550,22 @@ static SDValue combineX86ShufflesRecursively(
     std::swap(Ops[0], Ops[1]);
   }
 
-  // Finally, try to combine into a single shuffle instruction.
-  return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
-                                AllowVariableCrossLaneMask,
-                                AllowVariablePerLaneMask, DAG, Subtarget);
+  // Try to combine into a single shuffle instruction.
+  if (SDValue Shuffle = combineX86ShuffleChain(
+          Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
+          AllowVariablePerLaneMask, DAG, Subtarget))
+    return Shuffle;
+
+  // If all the operands come from the same larger vector, fall through and
+  // try to use combineX86ShuffleChainWithExtract.
+  SDValue LHS = peekThroughBitcasts(Ops.front());
+  SDValue RHS = peekThroughBitcasts(Ops.back());
+  if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
+      (RootSizeInBits / Mask.size()) != 64 ||
+      LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+      RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+      LHS.getOperand(0) != RHS.getOperand(0))
+    return SDValue();
 }
 
 // If that failed and any input is extracted then try to combine as a
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index a182dee..fb22103 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -974,8 +974,7 @@ define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask1(<8 x i32> %vec, <4 x i
 define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_8xi32_to_4xi32_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm3
-; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm3[1],xmm0[1]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3]
 ; CHECK-NEXT:    vptestnmd %xmm2, %xmm2, %k1
 ; CHECK-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
@@ -989,8 +988,7 @@ define <4 x i32> @test_masked_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32
 define <4 x i32> @test_masked_z_8xi32_to_4xi32_perm_mask2(<8 x i32> %vec, <4 x i32> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi32_to_4xi32_perm_mask2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm2[1],xmm0[1]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[3,1,2,3]
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vzeroupper
@@ -2244,7 +2242,6 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
-
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask7:
 ; CHECK:       # %bb.0:
@@ -2259,11 +2256,12 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask7(<8 x i64> %vec, <4 x i
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
   ret <4 x i64> %res
 }
+
 define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
 ; CHECK-LABEL: test_8xi64_to_2xi64_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; CHECK-NEXT:    vpermpd {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7]
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
@@ -2272,10 +2270,9 @@ define <2 x i64> @test_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec) {
 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[3,0,2,3,7,4,6,7]
 ; CHECK-NEXT:    vptestnmq %xmm2, %xmm2, %k1
-; CHECK-NEXT:    valignq {{.*#+}} xmm1 {%k1} = xmm3[1],xmm0[0]
-; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
@@ -2283,13 +2280,12 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64
   %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
   ret <2 x i64> %res
 }
-
 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm2
 ; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT:    valignq {{.*#+}} xmm0 {%k1} {z} = xmm2[1],xmm0[0]
+; CHECK-NEXT:    vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,2,3,7,4,6,7]
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 3, i32 0>
@@ -2297,6 +2293,7 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i
   %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
   ret <2 x i64> %res
 }
+
 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask1:
 ; CHECK:       # %bb.0:
@@ -2311,7 +2308,6 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64
   %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
   ret <2 x i64> %res
 }
-
 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1:
 ; CHECK:       # %bb.0:
@@ -2326,6 +2322,7 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i
   %res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
   ret <2 x i64> %res
 }
+
 define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
@@ -2347,7 +2344,6 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64> %
   %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
   ret <4 x i64> %res
 }
-
 define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask0(ptr %vp, <4 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
@@ -2697,8 +2693,10 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64>
 define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
 ; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovaps 32(%rdi), %xmm0
-; CHECK-NEXT:    vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3]
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [4,1]
+; CHECK-NEXT:    vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
@@ -2707,10 +2705,11 @@ define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
 define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 32(%rdi), %xmm2
-; CHECK-NEXT:    vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,1]
+; CHECK-NEXT:    vpermq (%rdi), %zmm2, %zmm2
 ; CHECK-NEXT:    vptestnmq %xmm1, %xmm1, %k1
 ; CHECK-NEXT:    vmovdqa64 %xmm2, %xmm0 {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
@@ -2722,10 +2721,11 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %
 define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) {
 ; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa 32(%rdi), %xmm1
-; CHECK-NEXT:    vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
+; CHECK-NEXT:    vmovdqa {{.*#+}} xmm1 = [4,1]
 ; CHECK-NEXT:    vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %vec = load <8 x i64>, ptr %vp
   %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
@@ -4003,12 +4003,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double>
 define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
-; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm4 = [1,1,5,5]
-; CHECK-FAST-NEXT:    vpermi2pd %ymm3, %ymm0, %ymm4
-; CHECK-FAST-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
-; CHECK-FAST-NEXT:    vcmpeqpd %ymm0, %ymm2, %k1
-; CHECK-FAST-NEXT:    vblendmpd %ymm4, %ymm1, %ymm0 {%k1}
+; CHECK-FAST-NEXT:    vmovapd {{.*#+}} xmm3 = [1,5]
+; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm3, %zmm0
+; CHECK-FAST-NEXT:    vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm3, %ymm2, %k1
+; CHECK-FAST-NEXT:    vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
+; CHECK-FAST-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
@@ -4029,12 +4029,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %v
 define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
 ; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
 ; CHECK-FAST:       # %bb.0:
-; CHECK-FAST-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
-; CHECK-FAST-NEXT:    vmovapd {{.*#+}} ymm3 = [1,1,5,5]
-; CHECK-FAST-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-FAST-NEXT:    vcmpeqpd %ymm4, %ymm1, %k1
-; CHECK-FAST-NEXT:    vpermt2pd %ymm2, %ymm3, %ymm0 {%k1} {z}
-; CHECK-FAST-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT:    vmovapd {{.*#+}} xmm2 = [1,5]
+; CHECK-FAST-NEXT:    vpermpd %zmm0, %zmm2, %zmm0
+; CHECK-FAST-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT:    vcmpeqpd %ymm2, %ymm1, %k1
+; CHECK-FAST-NEXT:    vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
 ; CHECK-FAST-NEXT:    retq
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 3f2a809..6d2553f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -2204,13 +2204,21 @@ define <4 x double> @test_v8f64_2346 (<8 x double> %v) {
 
 ;FIXME: compressp
 define <2 x double> @test_v8f64_34 (<8 x double> %v) {
-; ALL-LABEL: test_v8f64_34:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vextractf32x4 $2, %zmm0, %xmm1
-; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
-; ALL-NEXT:    vzeroupper
-; ALL-NEXT:    ret{{[l|q]}}
+; AVX512F-LABEL: test_v8f64_34:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovaps {{.*#+}} xmm1 = [3,4]
+; AVX512F-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_v8f64_34:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vmovaps {{.*#+}} xmm1 = [3,0,4,0]
+; AVX512F-32-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512F-32-NEXT:    vzeroupper
+; AVX512F-32-NEXT:    retl
   %res = shufflevector <8 x double> %v, <8 x double> undef, <2 x i32> <i32 3, i32 4>
   ret <2 x double> %res
 }
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index 7438b0a..068cfff 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -636,13 +636,12 @@ define <16 x i16> @shuffle_combine_packusdw_pshufb(<8 x i32> %a0, <8 x i32> %a1)
 }
 declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
 
-; TODO: Failure to merge vpunpcklqdq(vextracti128(x,0),vextracti128(x,1)) -> vpermq
 define <8 x i16> @shuffle_combine_packusdw_permq_extract(<8 x i32> %a0) {
 ; CHECK-LABEL: shuffle_combine_packusdw_permq_extract:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpackusdw %ymm0, %ymm0, %ymm0
-; CHECK-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    ret{{[l|q]}}
   %1 = tail call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> poison)
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index f587720..b8fac4d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1566,14 +1566,23 @@ define <4 x i32> @combine_test21(<8 x i32> %a, ptr %ptr) {
 ; SSE-NEXT:    movaps %xmm2, (%rdi)
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: combine_test21:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX-NEXT:    vmovaps %xmm2, (%rdi)
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; AVX1-LABEL: combine_test21:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1-NEXT:    vmovaps %xmm2, (%rdi)
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: combine_test21:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[1,3,2,3]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    vmovaps %xmm0, (%rdi)
+; AVX2-NEXT:    vmovaps %xmm1, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
   %1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   %2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
   store <4 x i32> %1, ptr %ptr, align 16
-- 
2.7.4
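
For illustration, a reduced LLVM IR sketch of the pattern this enables,
modelled on the updated tests. This is not part of the patch: the function
name and shuffle masks below are hypothetical, and the exact lowering
depends on the subtarget.

  define <2 x i64> @low_high_shuffle(<4 x i64> %x) {
    ; Both operands of the final shuffle are the low and high halves of the
    ; same 256-bit vector, so rather than vextracti128 plus a binary xmm
    ; shuffle, the backend can emit one cross-lane permute of %x and then
    ; take the low 128-bit subvector of the result.
    %lo = shufflevector <4 x i64> %x, <4 x i64> poison, <2 x i32> <i32 0, i32 1>
    %hi = shufflevector <4 x i64> %x, <4 x i64> poison, <2 x i32> <i32 2, i32 3>
    %r = shufflevector <2 x i64> %lo, <2 x i64> %hi, <2 x i32> <i32 1, i32 2>
    ret <2 x i64> %r
  }

On AVX2 targets this satisfies the new guard in combineX86ShufflesRecursively
(exactly two operands, both EXTRACT_SUBVECTOR of the same source, and a
128-bit root with 64-bit elements), so combineX86ShuffleChainWithExtract
should be able to reduce it to a single vpermq of the full ymm register
followed by an implicit xmm extract, as in the
shuffle_combine_packusdw_permq_extract test above.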