[AArch64] Do 64-bit vector move of 0 and -1 by extracting from the 128-bit move

author John Brawn <john.brawn@arm.com>

Thu, 25 Oct 2018 14:56:48 +0000 (14:56 +0000)

committer John Brawn <john.brawn@arm.com>

Thu, 25 Oct 2018 14:56:48 +0000 (14:56 +0000)
author John Brawn <john.brawn@arm.com>
Thu, 25 Oct 2018 14:56:48 +0000 (14:56 +0000)
committer John Brawn <john.brawn@arm.com>
Thu, 25 Oct 2018 14:56:48 +0000 (14:56 +0000)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td

index 88e5632..76ea2ac 100644 (file)
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4920,16 +4920,6 @@ def MOVID      : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi",
  def : Pat<(f64 (AArch64movi_edit imm0_255:$shift)),
            (MOVID imm0_255:$shift)>;
  
-def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>;
-def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>;
-def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>;
-def : Pat<(v8i8  immAllZerosV), (MOVID (i32 0))>;
-
-def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>;
-def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>;
-def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>;
-def : Pat<(v8i8  immAllOnesV), (MOVID (i32 255))>;
-
  // EDIT byte mask: 2d
  
  // The movi_edit node has the immediate value already encoded, so we use
@@ -4950,6 +4940,18 @@ def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>;
  def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
  def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>;
  
+// Set 64-bit vectors to all-zeros/all-ones by extracting from a 128-bit
+// register: the extract is free and this gives better MachineCSE results.
+def : Pat<(v1i64 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+def : Pat<(v2i32 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+def : Pat<(v4i16 immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+def : Pat<(v8i8  immAllZerosV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub)>;
+
+def : Pat<(v1i64 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+def : Pat<(v2i32 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+def : Pat<(v4i16 immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+def : Pat<(v8i8  immAllOnesV), (EXTRACT_SUBREG (MOVIv2d_ns (i32 255)), dsub)>;
+
  // EDIT per word & halfword: 2s, 4h, 4s, & 8h
  let isReMaterializable = 1, isAsCheapAsAMove = 1 in
  defm MOVI      : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-be-bv.ll b/llvm/test/CodeGen/AArch64/aarch64-be-bv.ll

index 54b7c8f..0e1797f 100644 (file)
--- a/llvm/test/CodeGen/AArch64/aarch64-be-bv.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-be-bv.ll
@@ -746,7 +746,7 @@ define void @modimm_t10_call() {
    ; CHECK-NEXT:    rev64   v{{[0-9]+}}.4h, v[[REG1]].4h
    ; CHECK-NEXT:    bl      f_v4i16
    call i16 @f_v4i16(<4 x i16> <i16 -1, i16 0, i16 -1, i16 0>)
-  ; CHECK:         movi    d[[REG1:[0-9]+]], #0xffffffffffffffff
+  ; CHECK:         movi    v[[REG1:[0-9]+]].2d, #0xffffffffffffffff
    ; CHECK-NEXT:    rev64   v{{[0-9]+}}.2s, v[[REG1]].2s
    ; CHECK-NEXT:    bl      f_v2i32
    call i32 @f_v2i32(<2 x i32> <i32 -1, i32 -1>)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smax-constantfold.ll b/llvm/test/CodeGen/AArch64/aarch64-smax-constantfold.ll

index 0e5b59f..32cd3c6 100644 (file)
--- a/llvm/test/CodeGen/AArch64/aarch64-smax-constantfold.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smax-constantfold.ll
@@ -6,7 +6,7 @@ declare <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16>, <4 x i16>)
  ; CHECK-LABEL: test
  define <4 x i16> @test() {
  entry:
-; CHECK: movi  d{{[0-9]+}}, #0000000000000000
+; CHECK: movi  v{{[0-9]+}}.2d, #0000000000000000
    %0 = tail call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>, <4 x i16> zeroinitializer)
    ret <4 x i16> %0
  }
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll

index 7cc5a43..bb3c36a 100644 (file)
--- a/llvm/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll
@@ -975,7 +975,7 @@ define <2 x i64> @cmhiz2xi64(<2 x i64> %A) {
  define <8 x i8> @cmlsz8xi8(<8 x i8> %A) {
  ; Using registers other than v0, v1 are possible, but would be odd.
  ; LS implemented as HS, so check reversed operands.
-;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
  ;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v[[ZERO]].8b, v0.8b
         %tmp3 = icmp ule <8 x i8> %A, zeroinitializer;
     %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
@@ -995,7 +995,7 @@ define <16 x i8> @cmlsz16xi8(<16 x i8> %A) {
  define <4 x i16> @cmlsz4xi16(<4 x i16> %A) {
  ; Using registers other than v0, v1 are possible, but would be odd.
  ; LS implemented as HS, so check reversed operands.
-;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
  ;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v[[ZERO]].4h, v0.4h
         %tmp3 = icmp ule <4 x i16> %A, zeroinitializer;
     %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
@@ -1015,7 +1015,7 @@ define <8 x i16> @cmlsz8xi16(<8 x i16> %A) {
  define <2 x i32> @cmlsz2xi32(<2 x i32> %A) {
  ; Using registers other than v0, v1 are possible, but would be odd.
  ; LS implemented as HS, so check reversed operands.
-;CHECK: movi d[[ZERO:[0-9]+]], #0
+;CHECK: movi v[[ZERO:[0-9]+]].2d, #0
  ;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v[[ZERO]].2s, v0.2s
         %tmp3 = icmp ule <2 x i32> %A, zeroinitializer;
     %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll

index 2a9e545..0b6132b 100644 (file)
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1401,7 +1401,7 @@ entry:
  
  define <4 x i16> @concat_vector_v4i16_const() {
  ; CHECK-LABEL: concat_vector_v4i16_const:
-; CHECK: movi {{d[0-9]+}}, #0
+; CHECK: movi {{v[0-9]+}}.2d, #0
   %r = shufflevector <1 x i16> zeroinitializer, <1 x i16> undef, <4 x i32> zeroinitializer
   ret <4 x i16> %r
  }
@@ -1422,7 +1422,7 @@ define <4 x i32> @concat_vector_v4i32_const() {
  
  define <8 x i8> @concat_vector_v8i8_const() {
  ; CHECK-LABEL: concat_vector_v8i8_const:
-; CHECK: movi {{d[0-9]+}}, #0
+; CHECK: movi {{v[0-9]+}}.2d, #0
   %r = shufflevector <1 x i8> zeroinitializer, <1 x i8> undef, <8 x i32> zeroinitializer
   ret <8 x i8> %r
  }
diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-ext.ll b/llvm/test/CodeGen/AArch64/arm64-vector-ext.ll

index 68892ee..8debd21 100644 (file)
--- a/llvm/test/CodeGen/AArch64/arm64-vector-ext.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vector-ext.ll
@@ -19,7 +19,7 @@ define void @func30(%T0_30 %v0, %T1_30* %p1) {
  ; sensible instead.
  define <1 x i32> @autogen_SD7918() {
  ; CHECK-LABEL: autogen_SD7918
-; CHECK: movi d0, #0000000000000000
+; CHECK: movi.2d v0, #0000000000000000
  ; CHECK-NEXT: ret
    %I29 = insertelement <1 x i1> zeroinitializer, i1 false, i32 0
    %ZE = zext <1 x i1> %I29 to <1 x i32>
diff --git a/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll b/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll

index b4f5767..fdd7cad 100644 (file)
--- a/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vshuffle.ll
@@ -2,7 +2,7 @@
  
  
  ; CHECK: test1
-; CHECK: movi d[[REG0:[0-9]+]], #0000000000000000
+; CHECK: movi.16b v[[REG0:[0-9]+]], #0
  define <8 x i1> @test1() {
  entry:
    %Shuff = shufflevector <8 x i1> <i1 0, i1 1, i1 2, i1 3, i1 4, i1 5, i1 6,
diff --git a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll

index 0335d0a..784b4c4 100644 (file)
--- a/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
@@ -162,28 +162,28 @@ entry:
  define <8 x i8> @tv8i8() {
  entry:
  ; ALL-LABEL: tv8i8:
-; ALL: movi d0, #0
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
    ret <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
  }
  
  define <4 x i16> @tv4i16() {
  entry:
  ; ALL-LABEL: tv4i16:
-; ALL: movi d0, #0
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
    ret <4 x i16> <i16 0, i16 0, i16 0, i16 0>
  }
  
  define <2 x i32> @tv2i32() {
  entry:
  ; ALL-LABEL: tv2i32:
-; ALL: movi d0, #0
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
    ret <2 x i32> <i32 0, i32 0>
  }
  
  define <2 x float> @tv2f32() {
  entry:
  ; ALL-LABEL: tv2f32:
-; ALL: movi d0, #0
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
    ret <2 x float> <float 0.0, float 0.0>
  }
  
diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll

index e88ea9e..d60bd4a 100644 (file)
--- a/llvm/test/CodeGen/AArch64/bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/bitcast.ll
@@ -4,7 +4,7 @@
  
  define <4 x i16> @foo1(<2 x i32> %a) {
  ; CHECK-LABEL: foo1:
-; CHECK:       movi    d0, #0000000000000000
+; CHECK:       movi    v0.2d, #0000000000000000
  ; CHECK-NEXT:  ret
  
    %1 = shufflevector <2 x i32> <i32 58712, i32 undef>, <2 x i32> %a, <2 x i32> <i32 0, i32 2>
@@ -16,7 +16,7 @@ define <4 x i16> @foo1(<2 x i32> %a) {
  
  define <4 x i16> @foo2(<2 x i32> %a) {
  ; CHECK-LABEL: foo2:
-; CHECK:       movi    d0, #0000000000000000
+; CHECK:       movi    v0.2d, #0000000000000000
  ; CHECK-NEXT:  ret
  
    %1 = shufflevector <2 x i32> <i32 712, i32 undef>, <2 x i32> %a, <2 x i32> <i32 0, i32 2>
diff --git a/llvm/test/CodeGen/AArch64/fast-isel-cmp-vec.ll b/llvm/test/CodeGen/AArch64/fast-isel-cmp-vec.ll

index d5b64c5..4211206 100644 (file)
--- a/llvm/test/CodeGen/AArch64/fast-isel-cmp-vec.ll
+++ b/llvm/test/CodeGen/AArch64/fast-isel-cmp-vec.ll
@@ -24,7 +24,7 @@ bb2:
  define <2 x i32> @icmp_constfold_v2i32(<2 x i32> %a) {
  ; CHECK-LABEL: icmp_constfold_v2i32:
  ; CHECK:      ; %bb.0:
-; CHECK-NEXT:  movi d[[CMP:[0-9]+]], #0xffffffffffffffff
+; CHECK-NEXT:  movi.2d v[[CMP:[0-9]+]], #0xffffffffffffffff
  ; CHECK-NEXT: ; %bb.1:
  ; CHECK-NEXT:  movi.2s [[MASK:v[0-9]+]], #1
  ; CHECK-NEXT:  and.8b v0, v[[CMP]], [[MASK]]
@@ -56,7 +56,7 @@ bb2:
  define <4 x i32> @icmp_constfold_v4i32(<4 x i32> %a) {
  ; CHECK-LABEL: icmp_constfold_v4i32:
  ; CHECK:      ; %bb.0:
-; CHECK-NEXT:  movi d[[CMP:[0-9]+]], #0xffffffffffffffff
+; CHECK-NEXT:  movi.2d v[[CMP:[0-9]+]], #0xffffffffffffffff
  ; CHECK-NEXT: ; %bb.1:
  ; CHECK-NEXT:  movi.4h [[MASK:v[0-9]+]], #1
  ; CHECK-NEXT:  and.8b [[ZEXT:v[0-9]+]], v[[CMP]], [[MASK]]
diff --git a/llvm/test/CodeGen/AArch64/fold-constants.ll b/llvm/test/CodeGen/AArch64/fold-constants.ll

index 719d3f4..ab13eb6 100644 (file)
--- a/llvm/test/CodeGen/AArch64/fold-constants.ll
+++ b/llvm/test/CodeGen/AArch64/fold-constants.ll
@@ -2,7 +2,7 @@
  
  define i64 @dotests_616() {
  ; CHECK-LABEL: dotests_616
-; CHECK:       movi d0, #0000000000000000
+; CHECK:       movi v0.2d, #0000000000000000
  ; CHECK-NEXT:  fmov x0, d0
  ; CHECK-NEXT:  ret
  entry:
diff --git a/llvm/test/CodeGen/AArch64/machine_cse.ll b/llvm/test/CodeGen/AArch64/machine_cse.ll

index e9fa680..51252a2 100644 (file)
--- a/llvm/test/CodeGen/AArch64/machine_cse.ll
+++ b/llvm/test/CodeGen/AArch64/machine_cse.ll
@@ -47,3 +47,27 @@ return:
    store i32 %a, i32 *%arg
    ret void
  }
+
+define void @combine_vector_zeros(<8 x i8>* %p, <16 x i8>* %q) {
+; CHECK-LABEL: combine_vector_zeros:
+; CHECK: movi v[[REG:[0-9]+]].2d, #0
+; CHECK-NOT: movi
+; CHECK: str d[[REG]], [x0]
+; CHECK: str q[[REG]], [x1]
+entry:
+  store <8 x i8> zeroinitializer, <8 x i8>* %p
+  store <16 x i8> zeroinitializer, <16 x i8>* %q
+  ret void
+}
+
+define void @combine_vector_ones(<2 x i32>* %p, <4 x i32>* %q) {
+; CHECK-LABEL: combine_vector_ones:
+; CHECK: movi v[[REG:[0-9]+]].2d, #0xffffffffffffffff
+; CHECK-NOT: movi
+; CHECK: str d[[REG]], [x0]
+; CHECK: str q[[REG]], [x1]
+entry:
+  store <2 x i32> <i32 -1, i32 -1>, <2 x i32>* %p
+  store <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32>* %q
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll

index 8bb7cc8..9d7d0ab 100644 (file)
--- a/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/neon-compare-instructions.ll
@@ -1223,7 +1223,7 @@ define <8 x i8> @cmlsz8xi8(<8 x i8> %A) {
  ; CHECK-LABEL: cmlsz8xi8:
  ; Using registers other than v0, v1 are possible, but would be odd.
  ; LS implemented as HS, so check reversed operands.
-; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK: movi {{v1.8b|v1.2d}}, #{{0x0|0}}
  ; CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
         %tmp3 = icmp ule <8 x i8> %A, zeroinitializer;
     %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
@@ -1245,7 +1245,7 @@ define <4 x i16> @cmlsz4xi16(<4 x i16> %A) {
  ; CHECK-LABEL: cmlsz4xi16:
  ; Using registers other than v0, v1 are possible, but would be odd.
  ; LS implemented as HS, so check reversed operands.
-; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK: movi {{v1.8b|v1.2d}}, #{{0x0|0}}
  ; CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
         %tmp3 = icmp ule <4 x i16> %A, zeroinitializer;
     %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
@@ -1267,7 +1267,7 @@ define <2 x i32> @cmlsz2xi32(<2 x i32> %A) {
  ; CHECK-LABEL: cmlsz2xi32:
  ; Using registers other than v0, v1 are possible, but would be odd.
  ; LS implemented as HS, so check reversed operands.
-; CHECK: movi {{v1.8b|d1}}, #{{0x0|0}}
+; CHECK: movi {{v1.8b|v1.2d}}, #{{0x0|0}}
  ; CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
         %tmp3 = icmp ule <2 x i32> %A, zeroinitializer;
     %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
diff --git a/llvm/test/CodeGen/AArch64/selectiondag-order.ll b/llvm/test/CodeGen/AArch64/selectiondag-order.ll

index 9427906..fb40653 100644 (file)
--- a/llvm/test/CodeGen/AArch64/selectiondag-order.ll
+++ b/llvm/test/CodeGen/AArch64/selectiondag-order.ll
@@ -21,7 +21,7 @@ end:                                        ; preds = %body
  }
  
  ; AARCH64-CHECK: simulate:
-; AARCH64-CHECK: movi d9, #0000000000000000
+; AARCH64-CHECK: movi v0.2d, #0000000000000000
  ; AARCH64-CHECK: bl lrand48
  ; AARCH64-CHECK: mov x19, x0
  ; AARCH64-CHECK: BB0_1:
@@ -47,7 +47,7 @@ end:                                        ; preds = %body
  }
  
  ; AARCH64-CHECK: simulateWithDebugIntrinsic
-; AARCH64-CHECK: movi d9, #0000000000000000
+; AARCH64-CHECK: movi v0.2d, #0000000000000000
  ; AARCH64-CHECK: bl lrand48
  ; AARCH64-CHECK: mov x19, x0
  ; AARCH64-CHECK: BB1_1:
@@ -73,7 +73,7 @@ end:                                        ; preds = %body
  }
  
  ; AARCH64-CHECK: simulateWithDbgDeclare:
-; AARCH64-CHECK: movi d9, #0000000000000000
+; AARCH64-CHECK: movi v0.2d, #0000000000000000
  ; AARCH64-CHECK: bl lrand48
  ; AARCH64-CHECK: mov x19, x0
  ; AARCH64-CHECK: BB2_1:
author	John Brawn <john.brawn@arm.com>
	Thu, 25 Oct 2018 14:56:48 +0000 (14:56 +0000)
committer	John Brawn <john.brawn@arm.com>
	Thu, 25 Oct 2018 14:56:48 +0000 (14:56 +0000)
llvm/lib/Target/AArch64/AArch64InstrInfo.td		patch | blob | history
llvm/test/CodeGen/AArch64/aarch64-be-bv.ll		patch | blob | history
llvm/test/CodeGen/AArch64/aarch64-smax-constantfold.ll		patch | blob | history
llvm/test/CodeGen/AArch64/arm64-neon-compare-instructions.ll		patch | blob | history
llvm/test/CodeGen/AArch64/arm64-neon-copy.ll		patch | blob | history
llvm/test/CodeGen/AArch64/arm64-vector-ext.ll		patch | blob | history
llvm/test/CodeGen/AArch64/arm64-vshuffle.ll		patch | blob | history
llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll		patch | blob | history
llvm/test/CodeGen/AArch64/bitcast.ll		patch | blob | history
llvm/test/CodeGen/AArch64/fast-isel-cmp-vec.ll		patch | blob | history
llvm/test/CodeGen/AArch64/fold-constants.ll		patch | blob | history
llvm/test/CodeGen/AArch64/machine_cse.ll		patch | blob | history
llvm/test/CodeGen/AArch64/neon-compare-instructions.ll		patch | blob | history
llvm/test/CodeGen/AArch64/selectiondag-order.ll		patch | blob | history