From: John Brawn
Date: Thu, 25 Oct 2018 14:56:48 +0000 (+0000)
Subject: [AArch64] Do 64-bit vector move of 0 and -1 by extracting from the 128-bit move
X-Git-Tag: llvmorg-8.0.0-rc1~5798
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=49e61d90ca3087a8b8575b3b9192102c54e34292;p=platform%2Fupstream%2Fllvm.git

[AArch64] Do 64-bit vector move of 0 and -1 by extracting from the 128-bit move

Currently a vector move of 0 or -1 will use different instructions depending
on the size of the vector. Using a single instruction (the 128-bit one) for
both gives more opportunity for Machine CSE to eliminate instructions.

Differential Revision: https://reviews.llvm.org/D53579

llvm-svn: 345270
---

diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 88e5632..76ea2ac 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4920,16 +4920,6 @@ def MOVID : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi",
 def : Pat<(f64 (AArch64movi_edit imm0_255:$shift)),
           (MOVID imm0_255:$shift)>;
 
-def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>;
-def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>;
-def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>;
-def : Pat<(v8i8 immAllZerosV), (MOVID (i32 0))>;
-
-def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>;
-def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>;
-def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>;
-def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>;
-
 // EDIT byte mask: 2d
 
 // The movi_edit node has the immediate value already encoded, so we use
@@ -4950,6 +4940,18 @@ def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>;
 def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
 def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>;
 
+// Set 64-bit vectors to all 0/1 by extracting from a 128-bit register as the
+// extract is free and this gives better MachineCSE results.
+def : Pat<(v1i64 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+def : Pat<(v2i32 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+def : Pat<(v4i16 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+def : Pat<(v8i8 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+
+def : Pat<(v1i64 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+def : Pat<(v2i32 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+def : Pat<(v4i16 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+def : Pat<(v8i8 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+
 // EDIT per word & halfword: 2s, 4h, 4s, & 8h
 let isReMaterializable = 1, isAsCheapAsAMove = 1 in
 defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-be-bv.ll b/llvm/test/CodeGen/AArch64/aarch64-be-bv.ll
index 54b7c8f..0e1797f 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-be-bv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-be-bv.ll
@@ -746,7 +746,7 @@ define void @modimm_t10_call() {
 ; CHECK-NEXT: rev64 v{{[0-9]+}}.4h, v[[REG1]].4h
 ; CHECK-NEXT: bl f_v4i16
   call i16 @f_v4i16(<4 x i16> )
-; CHECK: movi d[[REG1:[0-9]+]], #0xffffffffffffffff
+; CHECK: movi v[[REG1:[0-9]+]].2d, #0xffffffffffffffff
 ; CHECK-NEXT: rev64 v{{[0-9]+}}.2s, v[[REG1]].2s
 ; CHECK-NEXT: bl f_v2i32
   call i32 @f_v2i32(<2 x i32> )
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smax-constantfold.ll b/llvm/test/CodeGen/AArch64/aarch64-smax-constantfold.ll
index 0e5b59f..32cd3c6 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smax-constantfold.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smax-constantfold.ll
@@ -6,7 +6,7 @@ declare <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16>, <4 x i16>)
 ; CHECK-LABEL: test
 define <4 x i16> @test() {
 entry:
-; CHECK: movi d{{[0-9]+}}, #0000000000000000
+; CHECK: movi v{{[0-9]+}}.2d, #0000000000000000
   %0 = tail call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> , <4 x i16> zeroinitializer)
   ret <4 x i16> %0
 }
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll
index 7cc5a43..bb3c36a 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll
@@ -975,7 +975,7 @@ define <2 x i64> @cmhiz2xi64(<2 x i64> %A) {
 define <8 x i8> @cmlsz8xi8(<8 x i8> %A) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; LS implemented as HS, so check reversed operands.
-;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
 ;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v[[ZERO]].8b, v0.8b
   %tmp3 = icmp ule <8 x i8> %A, zeroinitializer;
   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
@@ -995,7 +995,7 @@ define <16 x i8> @cmlsz16xi8(<16 x i8> %A) {
 define <4 x i16> @cmlsz4xi16(<4 x i16> %A) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; LS implemented as HS, so check reversed operands.
-;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
 ;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v[[ZERO]].4h, v0.4h
   %tmp3 = icmp ule <4 x i16> %A, zeroinitializer;
   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
@@ -1015,7 +1015,7 @@ define <8 x i16> @cmlsz8xi16(<8 x i16> %A) {
 define <2 x i32> @cmlsz2xi32(<2 x i32> %A) {
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; LS implemented as HS, so check reversed operands.
-;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
 ;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v[[ZERO]].2s, v0.2s
   %tmp3 = icmp ule <2 x i32> %A, zeroinitializer;
   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index 2a9e545..0b6132b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1401,7 +1401,7 @@ entry:
 
 define <4 x i16> @concat_vector_v4i16_const() {
 ; CHECK-LABEL: concat_vector_v4i16_const:
-; CHECK: movi {{d[0-9]+}}, #0
+; CHECK: movi {{v[0-9]+}}.2d, #0
   %r = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <4 x i32> zeroinitializer
   ret <4 x i16> %r
 }
@@ -1422,7 +1422,7 @@ define <4 x i32> @concat_vector_v4i32_const() {
 
 define <8 x i8> @concat_vector_v8i8_const() {
 ; CHECK-LABEL: concat_vector_v8i8_const:
-; CHECK: movi {{d[0-9]+}}, #0
+; CHECK: movi {{v[0-9]+}}.2d, #0
   %r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <8 x i32> zeroinitializer
   ret <8 x i8> %r
 }
diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-ext.ll b/llvm/test/CodeGen/AArch64/arm64-vector-ext.ll
index 68892ee..8debd21 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vector-ext.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vector-ext.ll
@@ -19,7 +19,7 @@ define void @func30(%T0_30 %v0, %T1_30* %p1) {
 ; sensible instead.
 define <1 x i32> @autogen_SD7918() {
 ; CHECK-LABEL: autogen_SD7918
-; CHECK: movi d0, #0000000000000000
+; CHECK: movi.2d v0, #0000000000000000
 ; CHECK-NEXT: ret
   %I29 = insertelement <1 x i1> zeroinitializer, i1 false, i32 0
   %ZE = zext <1 x i1> %I29 to <1 x i32>
diff --git a/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll b/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll
index b4f5767..fdd7cad 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: test1
-; CHECK: movi d[[REG0:[0-9]+]], #0000000000000000
+; CHECK: movi.16b v[[REG0:[0-9]+]], #0
 define <8 x i1> @test1() {
 entry:
   %Shuff = shufflevector <8 x i1>
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
 define <8 x i8> @tv8i8() {
 entry:
 ; ALL-LABEL: tv8i8:
-; ALL: movi d0, #0
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
   ret <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
 }
 
 define <4 x i16> @tv4i16() {
 entry:
 ; ALL-LABEL: tv4i16:
-; ALL: movi d0, #0
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
   ret <4 x i16> <i16 0, i16 0, i16 0, i16 0>
 }
 
 define <2 x i32> @tv2i32() {
 entry:
 ; ALL-LABEL: tv2i32:
-; ALL: movi d0, #0
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
   ret <2 x i32> <i32 0, i32 0>
 }
 
 define <2 x float> @tv2f32() {
 entry:
 ; ALL-LABEL: tv2f32:
-; ALL: movi d0, #0
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
   ret <2 x float> <float 0.0, float 0.0>
 }
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll
index e88ea9e..d60bd4a 100644
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -4,7 +4,7 @@
 
 define <4 x i16> @foo1(<2 x i32> %a) {
 ; CHECK-LABEL: foo1:
-; CHECK: movi d0, #0000000000000000
+; CHECK: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: ret
   %1 = shufflevector <2 x i32> , <2 x i32> %a, <2 x i32>
@@ -16,7 +16,7 @@ define <4 x i16> @foo1(<2 x i32> %a) {
 
 define <4 x i16> @foo2(<2 x i32> %a) {
 ; CHECK-LABEL: foo2:
-; CHECK: movi d0, #0000000000000000
+; CHECK: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: ret
   %1 = shufflevector <2 x i32> , <2 x i32> %a, <2 x i32>
diff --git a/llvm/test/CodeGen/AArch64/fast-isel-cmp-vec.ll b/llvm/test/CodeGen/AArch64/fast-isel-cmp-vec.ll
index d5b64c5..4211206 100644
--- a/llvm/test/CodeGen/AArch64/fast-isel-cmp-vec.ll
+++ b/llvm/test/CodeGen/AArch64/fast-isel-cmp-vec.ll
@@ -24,7 +24,7 @@ bb2:
 define <2 x i32> @icmp_constfold_v2i32(<2 x i32> %a) {
 ; CHECK-LABEL: icmp_constfold_v2i32:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: movi d[[CMP:[0-9]+]], #0xffffffffffffffff
+; CHECK-NEXT: movi.2d v[[CMP:[0-9]+]], #0xffffffffffffffff
 ; CHECK-NEXT: ; %bb.1:
 ; CHECK-NEXT: movi.2s [[MASK:v[0-9]+]], #1
 ; CHECK-NEXT: and.8b v0, v[[CMP]], [[MASK]]
@@ -56,7 +56,7 @@ bb2:
 define <4 x i32> @icmp_constfold_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: icmp_constfold_v4i32:
 ; CHECK: ; %bb.0:
-; CHECK-NEXT: movi d[[CMP:[0-9]+]], #0xffffffffffffffff
+; CHECK-NEXT: movi.2d v[[CMP:[0-9]+]], #0xffffffffffffffff
 ; CHECK-NEXT: ; %bb.1:
 ; CHECK-NEXT: movi.4h [[MASK:v[0-9]+]], #1
 ; CHECK-NEXT: and.8b [[ZEXT:v[0-9]+]], v[[CMP]], [[MASK]]
diff --git a/llvm/test/CodeGen/AArch64/fold-constants.ll b/llvm/test/CodeGen/AArch64/fold-constants.ll
index 719d3f4..ab13eb6 100644
--- a/llvm/test/CodeGen/AArch64/fold-constants.ll
+++ b/llvm/test/CodeGen/AArch64/fold-constants.ll
@@ -2,7 +2,7 @@
 
 define i64 @dotests_616() {
 ; CHECK-LABEL: dotests_616
-; CHECK: movi d0, #0000000000000000
+; CHECK: movi v0.2d, #0000000000000000
 ; CHECK-NEXT: fmov x0, d0
 ; CHECK-NEXT: ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/machine_cse.ll b/llvm/test/CodeGen/AArch64/machine_cse.ll
index e9fa680..51252a2 100644
--- a/llvm/test/CodeGen/AArch64/machine_cse.ll
+++ b/llvm/test/CodeGen/AArch64/machine_cse.ll
@@ -47,3 +47,27 @@ return:
   store i32 %a, i32 *%arg
   ret void
 }
+
+define void @combine_vector_zeros(<8 x i8>* %p, <16 x i8>* %q) {
+; CHECK-LABEL: combine_vector_zeros:
+; CHECK: movi v[[REG:[0-9]+]].2d, #0
+; CHECK-NOT: movi
+; CHECK: str d[[REG]], [x0]
+; CHECK: str q[[REG]], [x1]
+entry:
+  store <8 x i8> zeroinitializer, <8 x i8>* %p
+  store <16 x i8> zeroinitializer, <16 x i8>* %q
+  ret void
+}
+
+define void @combine_vector_ones(<2 x i32>* %p, <4 x i32>* %q) {
+; CHECK-LABEL: combine_vector_ones:
+; CHECK: movi v[[REG:[0-9]+]].2d, #0xffffffffffffffff
+; CHECK-NOT: movi
+; CHECK: str d[[REG]], [x0]
+; CHECK: str q[[REG]], [x1]
+entry:
+  store <2 x i32> <i32 -1, i32 -1>, <2 x i32>* %p
+  store <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32>* %q
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
index 8bb7cc8..9d7d0ab 100644
--- a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
@@ -1223,7 +1223,7 @@ define <8 x i8> @cmlsz8xi8(<8 x i8> %A) {
 ; CHECK-LABEL: cmlsz8xi8:
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; LS implemented as HS, so check reversed operands.
-; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK: movi {{v1.8b|v1.2d}}, #{{0x0|0}}
 ; CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
   %tmp3 = icmp ule <8 x i8> %A, zeroinitializer;
   %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
@@ -1245,7 +1245,7 @@ define <4 x i16> @cmlsz4xi16(<4 x i16> %A) {
 ; CHECK-LABEL: cmlsz4xi16:
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; LS implemented as HS, so check reversed operands.
-; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK: movi {{v1.8b|v1.2d}}, #{{0x0|0}}
 ; CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
   %tmp3 = icmp ule <4 x i16> %A, zeroinitializer;
   %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
@@ -1267,7 +1267,7 @@ define <2 x i32> @cmlsz2xi32(<2 x i32> %A) {
 ; CHECK-LABEL: cmlsz2xi32:
 ; Using registers other than v0, v1 are possible, but would be odd.
 ; LS implemented as HS, so check reversed operands.
-; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK: movi {{v1.8b|v1.2d}}, #{{0x0|0}}
 ; CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
   %tmp3 = icmp ule <2 x i32> %A, zeroinitializer;
   %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
diff --git a/llvm/test/CodeGen/AArch64/selectiondag-order.ll b/llvm/test/CodeGen/AArch64/selectiondag-order.ll
index 9427906..fb40653 100644
--- a/llvm/test/CodeGen/AArch64/selectiondag-order.ll
+++ b/llvm/test/CodeGen/AArch64/selectiondag-order.ll
@@ -21,7 +21,7 @@ end:                                              ; preds = %body
 }
 
 ; AARCH64-CHECK: simulate:
-; AARCH64-CHECK: movi d9, #0000000000000000
+; AARCH64-CHECK: movi v0.2d, #0000000000000000
 ; AARCH64-CHECK: bl lrand48
 ; AARCH64-CHECK: mov x19, x0
 ; AARCH64-CHECK: BB0_1:
@@ -47,7 +47,7 @@ end:                                              ; preds = %body
 }
 
 ; AARCH64-CHECK: simulateWithDebugIntrinsic
-; AARCH64-CHECK: movi d9, #0000000000000000
+; AARCH64-CHECK: movi v0.2d, #0000000000000000
 ; AARCH64-CHECK: bl lrand48
 ; AARCH64-CHECK: mov x19, x0
 ; AARCH64-CHECK: BB1_1:
@@ -73,7 +73,7 @@ end:                                              ; preds = %body
 }
 
 ; AARCH64-CHECK: simulateWithDbgDeclare:
-; AARCH64-CHECK: movi d9, #0000000000000000
+; AARCH64-CHECK: movi v0.2d, #0000000000000000
 ; AARCH64-CHECK: bl lrand48
 ; AARCH64-CHECK: mov x19, x0
 ; AARCH64-CHECK: BB2_1:
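
For illustration only (this sketch is not part of the commit): the new
combine_vector_zeros test above shows the intended Machine CSE win. A function
that stores both a 64-bit and a 128-bit zero vector previously materialized
the constant twice, once per register width; with both widths now lowered
through MOVIv2d_ns, a single movi remains and the 64-bit use simply reads the
dsub subregister, which costs nothing. Register numbers here are assumed, not
taken from actual compiler output.

  ; Before this patch: two materializations of the same zero
  ;   movi d0, #0000000000000000    ; 64-bit zero via MOVID
  ;   movi v1.2d, #0000000000000000 ; 128-bit zero via MOVIv2d_ns
  ;   str  d0, [x0]
  ;   str  q1, [x1]
  ;
  ; After: one materialization shared by both widths
  ;   movi v0.2d, #0000000000000000
  ;   str  d0, [x0]                 ; d0 is the dsub subregister of v0
  ;   str  q0, [x1]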