return true;
}
+// Try to rewrite a v16i8 BUILD_VECTOR whose sixteen operands are, in order,
+// lanes 0..3 of four source vectors (a0..a3, b0..b3, c0..c3, d0..d3), where
+// each source is a v4i16 or a v4i32. Such a node is really a truncate of the
+// concatenated sources, so it is rebuilt from TRUNCATE and CONCAT_VECTORS
+// nodes. Returns an empty SDValue when V does not have this shape.
+static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
+  if (V.getValueType() != MVT::v16i8)
+    return SDValue();
+  assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
+
+  // True when N is an EXTRACT_VECTOR_ELT whose index is the constant Lane.
+  auto IsExtractOfLane = [](SDValue N, unsigned Lane) {
+    return N.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+           isa<ConstantSDNode>(N.getOperand(1)) &&
+           N.getConstantOperandVal(1) == Lane;
+  };
+
+  // Validate each group of four operands and remember the vector it reads.
+  SDValue Sources[4];
+  for (unsigned Group = 0; Group != 4; ++Group) {
+    // The group must start with lane 0 of a v4i16 or v4i32 vector.
+    SDValue First = V.getOperand(Group * 4);
+    if (!IsExtractOfLane(First, 0))
+      return SDValue();
+    SDValue Src = First.getOperand(0);
+    bool IsSupportedSource = Src.getValueType() == MVT::v4i16 ||
+                             Src.getValueType() == MVT::v4i32;
+    if (!IsSupportedSource)
+      return SDValue();
+
+    // The remaining three operands must be lanes 1..3 of that same vector.
+    for (unsigned Lane = 1; Lane != 4; ++Lane) {
+      SDValue Elt = V.getOperand(Group * 4 + Lane);
+      if (!IsExtractOfLane(Elt, Lane) || Elt.getOperand(0) != Src)
+        return SDValue();
+    }
+    Sources[Group] = Src;
+  }
+
+  // Emit the truncate/concat tree (expected to select to uzp1s): narrow any
+  // v4i32 source to v4i16, pair the v4i16s into two v8i16s, truncate each of
+  // those to v8i8, and finally join the two halves into the v16i8 result.
+  SDLoc DL(V);
+  for (SDValue &Src : Sources)
+    if (Src.getValueType() == MVT::v4i32)
+      Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, Src);
+  SDValue Lo16 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Sources[0],
+                             Sources[1]);
+  SDValue Hi16 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Sources[2],
+                             Sources[3]);
+  SDValue Lo8 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Lo16);
+  SDValue Hi8 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Hi16);
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Lo8, Hi8);
+}
+
/// Check if a vector shuffle corresponds to a DUP instructions with a larger
/// element width than the vector lane type. If that is the case the function
/// returns true and writes the value of the DUP instruction lane operand into
return SDValue();
}
+ // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 where
+ // a..d are v4i16 or v4i32 vectors. This is really a truncate, which we can
+ // construct out of (legal) concat and truncate nodes.
+ if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
+ return M;
+
// Empirical tests suggest this is rarely worth it for vectors of length <= 2.
if (NumElts >= 4) {
if (SDValue shuffle = ReconstructShuffle(Op, DAG))
; CHECK-LABEL: test_signed_v16f32_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v4.4s, #127
+; CHECK-NEXT: fcvtzs v3.4s, v3.4s
+; CHECK-NEXT: fcvtzs v2.4s, v2.4s
+; CHECK-NEXT: fcvtzs v1.4s, v1.4s
; CHECK-NEXT: fcvtzs v0.4s, v0.4s
; CHECK-NEXT: mvni v5.4s, #127
-; CHECK-NEXT: fcvtzs v1.4s, v1.4s
-; CHECK-NEXT: fcvtzs v2.4s, v2.4s
-; CHECK-NEXT: smin v0.4s, v0.4s, v4.4s
-; CHECK-NEXT: smin v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: smin v3.4s, v3.4s, v4.4s
; CHECK-NEXT: smin v2.4s, v2.4s, v4.4s
-; CHECK-NEXT: smax v0.4s, v0.4s, v5.4s
-; CHECK-NEXT: smax v1.4s, v1.4s, v5.4s
-; CHECK-NEXT: smax v2.4s, v2.4s, v5.4s
-; CHECK-NEXT: xtn v6.4h, v0.4s
-; CHECK-NEXT: umov w8, v6.h[0]
-; CHECK-NEXT: umov w9, v6.h[1]
-; CHECK-NEXT: xtn v1.4h, v1.4s
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: umov w8, v6.h[2]
-; CHECK-NEXT: mov v0.b[1], w9
-; CHECK-NEXT: mov v0.b[2], w8
-; CHECK-NEXT: umov w8, v6.h[3]
-; CHECK-NEXT: mov v0.b[3], w8
-; CHECK-NEXT: umov w8, v1.h[0]
-; CHECK-NEXT: mov v0.b[4], w8
-; CHECK-NEXT: umov w8, v1.h[1]
-; CHECK-NEXT: mov v0.b[5], w8
-; CHECK-NEXT: umov w8, v1.h[2]
-; CHECK-NEXT: mov v0.b[6], w8
-; CHECK-NEXT: umov w8, v1.h[3]
-; CHECK-NEXT: xtn v1.4h, v2.4s
-; CHECK-NEXT: fcvtzs v2.4s, v3.4s
-; CHECK-NEXT: mov v0.b[7], w8
-; CHECK-NEXT: umov w8, v1.h[0]
-; CHECK-NEXT: smin v2.4s, v2.4s, v4.4s
-; CHECK-NEXT: mov v0.b[8], w8
-; CHECK-NEXT: umov w8, v1.h[1]
+; CHECK-NEXT: smin v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: smin v0.4s, v0.4s, v4.4s
+; CHECK-NEXT: smax v3.4s, v3.4s, v5.4s
; CHECK-NEXT: smax v2.4s, v2.4s, v5.4s
-; CHECK-NEXT: mov v0.b[9], w8
-; CHECK-NEXT: umov w8, v1.h[2]
-; CHECK-NEXT: mov v0.b[10], w8
-; CHECK-NEXT: umov w8, v1.h[3]
-; CHECK-NEXT: xtn v1.4h, v2.4s
-; CHECK-NEXT: mov v0.b[11], w8
-; CHECK-NEXT: umov w8, v1.h[0]
-; CHECK-NEXT: mov v0.b[12], w8
-; CHECK-NEXT: umov w8, v1.h[1]
-; CHECK-NEXT: mov v0.b[13], w8
-; CHECK-NEXT: umov w8, v1.h[2]
-; CHECK-NEXT: mov v0.b[14], w8
-; CHECK-NEXT: umov w8, v1.h[3]
-; CHECK-NEXT: mov v0.b[15], w8
+; CHECK-NEXT: smax v1.4s, v1.4s, v5.4s
+; CHECK-NEXT: smax v0.4s, v0.4s, v5.4s
+; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%x = call <16 x i8> @llvm.fptosi.sat.v16f32.v16i8(<16 x float> %f)
ret <16 x i8> %x
; CHECK-LABEL: test_unsigned_v16f32_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v4.2d, #0x0000ff000000ff
-; CHECK-NEXT: fcvtzu v0.4s, v0.4s
-; CHECK-NEXT: fcvtzu v1.4s, v1.4s
+; CHECK-NEXT: fcvtzu v3.4s, v3.4s
; CHECK-NEXT: fcvtzu v2.4s, v2.4s
-; CHECK-NEXT: umin v0.4s, v0.4s, v4.4s
-; CHECK-NEXT: umin v1.4s, v1.4s, v4.4s
-; CHECK-NEXT: umin v2.4s, v2.4s, v4.4s
-; CHECK-NEXT: xtn v5.4h, v0.4s
-; CHECK-NEXT: xtn v1.4h, v1.4s
-; CHECK-NEXT: umov w8, v5.h[0]
-; CHECK-NEXT: umov w9, v5.h[1]
-; CHECK-NEXT: fmov s0, w8
-; CHECK-NEXT: umov w8, v5.h[2]
-; CHECK-NEXT: mov v0.b[1], w9
-; CHECK-NEXT: mov v0.b[2], w8
-; CHECK-NEXT: umov w8, v5.h[3]
-; CHECK-NEXT: mov v0.b[3], w8
-; CHECK-NEXT: umov w8, v1.h[0]
-; CHECK-NEXT: mov v0.b[4], w8
-; CHECK-NEXT: umov w8, v1.h[1]
-; CHECK-NEXT: mov v0.b[5], w8
-; CHECK-NEXT: umov w8, v1.h[2]
-; CHECK-NEXT: mov v0.b[6], w8
-; CHECK-NEXT: umov w8, v1.h[3]
-; CHECK-NEXT: xtn v1.4h, v2.4s
-; CHECK-NEXT: fcvtzu v2.4s, v3.4s
-; CHECK-NEXT: mov v0.b[7], w8
-; CHECK-NEXT: umov w8, v1.h[0]
+; CHECK-NEXT: fcvtzu v1.4s, v1.4s
+; CHECK-NEXT: fcvtzu v0.4s, v0.4s
+; CHECK-NEXT: umin v3.4s, v3.4s, v4.4s
; CHECK-NEXT: umin v2.4s, v2.4s, v4.4s
-; CHECK-NEXT: mov v0.b[8], w8
-; CHECK-NEXT: umov w8, v1.h[1]
-; CHECK-NEXT: mov v0.b[9], w8
-; CHECK-NEXT: umov w8, v1.h[2]
-; CHECK-NEXT: mov v0.b[10], w8
-; CHECK-NEXT: umov w8, v1.h[3]
-; CHECK-NEXT: xtn v1.4h, v2.4s
-; CHECK-NEXT: mov v0.b[11], w8
-; CHECK-NEXT: umov w8, v1.h[0]
-; CHECK-NEXT: mov v0.b[12], w8
-; CHECK-NEXT: umov w8, v1.h[1]
-; CHECK-NEXT: mov v0.b[13], w8
-; CHECK-NEXT: umov w8, v1.h[2]
-; CHECK-NEXT: mov v0.b[14], w8
-; CHECK-NEXT: umov w8, v1.h[3]
-; CHECK-NEXT: mov v0.b[15], w8
+; CHECK-NEXT: umin v1.4s, v1.4s, v4.4s
+; CHECK-NEXT: umin v0.4s, v0.4s, v4.4s
+; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
%x = call <16 x i8> @llvm.fptoui.sat.v16f32.v16i8(<16 x float> %f)
ret <16 x i8> %x
define <16 x i8> @extract_4_v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; CHECK-LABEL: extract_4_v4i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w9, v0.h[0]
-; CHECK-NEXT: umov w10, v0.h[1]
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2
-; CHECK-NEXT: umov w8, v2.h[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
-; CHECK-NEXT: fmov s4, w9
-; CHECK-NEXT: umov w9, v0.h[2]
-; CHECK-NEXT: mov v4.b[1], w10
-; CHECK-NEXT: umov w10, v0.h[3]
-; CHECK-NEXT: mov v4.b[2], w9
-; CHECK-NEXT: umov w9, v1.h[0]
-; CHECK-NEXT: mov v4.b[3], w10
-; CHECK-NEXT: umov w10, v1.h[1]
-; CHECK-NEXT: mov v4.b[4], w9
-; CHECK-NEXT: umov w9, v1.h[2]
-; CHECK-NEXT: mov v4.b[5], w10
-; CHECK-NEXT: umov w10, v1.h[3]
-; CHECK-NEXT: mov v4.b[6], w9
-; CHECK-NEXT: umov w9, v2.h[1]
-; CHECK-NEXT: mov v4.b[7], w10
-; CHECK-NEXT: mov v4.b[8], w8
-; CHECK-NEXT: umov w8, v2.h[2]
-; CHECK-NEXT: mov v4.b[9], w9
-; CHECK-NEXT: umov w9, v2.h[3]
-; CHECK-NEXT: mov v4.b[10], w8
-; CHECK-NEXT: umov w8, v3.h[0]
-; CHECK-NEXT: mov v4.b[11], w9
-; CHECK-NEXT: umov w9, v3.h[1]
-; CHECK-NEXT: mov v4.b[12], w8
-; CHECK-NEXT: umov w8, v3.h[2]
-; CHECK-NEXT: mov v4.b[13], w9
-; CHECK-NEXT: umov w9, v3.h[3]
-; CHECK-NEXT: mov v4.b[14], w8
-; CHECK-NEXT: mov v4.b[15], w9
-; CHECK-NEXT: mov v0.16b, v4.16b
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov v2.d[1], v3.d[0]
+; CHECK-NEXT: mov v0.d[1], v1.d[0]
+; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
entry:
%a0 = extractelement <4 x i16> %a, i32 0
define <16 x i8> @extract_4_v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
; CHECK-LABEL: extract_4_v4i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: mov w9, v0.s[2]
-; CHECK-NEXT: mov w10, v0.s[3]
-; CHECK-NEXT: mov v0.b[1], w8
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov v0.b[2], w9
-; CHECK-NEXT: mov w9, v1.s[1]
-; CHECK-NEXT: mov v0.b[3], w10
-; CHECK-NEXT: mov v0.b[4], w8
-; CHECK-NEXT: mov w8, v1.s[2]
-; CHECK-NEXT: mov v0.b[5], w9
-; CHECK-NEXT: mov w9, v1.s[3]
-; CHECK-NEXT: mov v0.b[6], w8
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov v0.b[7], w9
-; CHECK-NEXT: mov w9, v2.s[1]
-; CHECK-NEXT: mov v0.b[8], w8
-; CHECK-NEXT: mov w8, v2.s[2]
-; CHECK-NEXT: mov v0.b[9], w9
-; CHECK-NEXT: mov w9, v2.s[3]
-; CHECK-NEXT: mov v0.b[10], w8
-; CHECK-NEXT: fmov w8, s3
-; CHECK-NEXT: mov v0.b[11], w9
-; CHECK-NEXT: mov w9, v3.s[1]
-; CHECK-NEXT: mov v0.b[12], w8
-; CHECK-NEXT: mov w8, v3.s[2]
-; CHECK-NEXT: mov v0.b[13], w9
-; CHECK-NEXT: mov w9, v3.s[3]
-; CHECK-NEXT: mov v0.b[14], w8
-; CHECK-NEXT: mov v0.b[15], w9
+; CHECK-NEXT: uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
entry:
%a0 = extractelement <4 x i32> %a, i32 0
define <16 x i8> @extract_4_mixed(<4 x i16> %a, <4 x i32> %b, <4 x i32> %c, <4 x i16> %d) {
; CHECK-LABEL: extract_4_mixed:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: umov w8, v0.h[0]
-; CHECK-NEXT: umov w9, v0.h[1]
+; CHECK-NEXT: xtn v2.4h, v2.4s
; CHECK-NEXT: // kill: def $d3 killed $d3 def $q3
-; CHECK-NEXT: fmov s4, w8
-; CHECK-NEXT: umov w8, v0.h[2]
-; CHECK-NEXT: mov v4.b[1], w9
-; CHECK-NEXT: umov w9, v0.h[3]
-; CHECK-NEXT: mov v4.b[2], w8
-; CHECK-NEXT: fmov w8, s1
-; CHECK-NEXT: mov v4.b[3], w9
-; CHECK-NEXT: mov w9, v1.s[1]
-; CHECK-NEXT: mov v4.b[4], w8
-; CHECK-NEXT: mov w8, v1.s[2]
-; CHECK-NEXT: mov v4.b[5], w9
-; CHECK-NEXT: mov w9, v1.s[3]
-; CHECK-NEXT: mov v4.b[6], w8
-; CHECK-NEXT: fmov w8, s2
-; CHECK-NEXT: mov v4.b[7], w9
-; CHECK-NEXT: mov w9, v2.s[1]
-; CHECK-NEXT: mov v4.b[8], w8
-; CHECK-NEXT: mov w8, v2.s[2]
-; CHECK-NEXT: mov v4.b[9], w9
-; CHECK-NEXT: mov w9, v2.s[3]
-; CHECK-NEXT: mov v4.b[10], w8
-; CHECK-NEXT: umov w8, v3.h[0]
-; CHECK-NEXT: mov v4.b[11], w9
-; CHECK-NEXT: umov w9, v3.h[1]
-; CHECK-NEXT: mov v4.b[12], w8
-; CHECK-NEXT: umov w8, v3.h[2]
-; CHECK-NEXT: mov v4.b[13], w9
-; CHECK-NEXT: umov w9, v3.h[3]
-; CHECK-NEXT: mov v4.b[14], w8
-; CHECK-NEXT: mov v4.b[15], w9
-; CHECK-NEXT: mov v0.16b, v4.16b
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: xtn2 v0.8h, v1.4s
+; CHECK-NEXT: mov v2.d[1], v3.d[0]
+; CHECK-NEXT: uzp1 v0.16b, v0.16b, v2.16b
; CHECK-NEXT: ret
entry:
%a0 = extractelement <4 x i16> %a, i32 0
define <16 x i8> @extract_4_v4i32_one(<4 x i32> %a) {
; CHECK-LABEL: extract_4_v4i32_one:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: fmov w9, s0
-; CHECK-NEXT: mov w10, v0.s[2]
-; CHECK-NEXT: mov w11, v0.s[3]
-; CHECK-NEXT: mov v0.b[1], w8
-; CHECK-NEXT: mov v0.b[2], w10
-; CHECK-NEXT: mov v0.b[3], w11
-; CHECK-NEXT: mov v0.b[4], w9
-; CHECK-NEXT: mov v0.b[5], w8
-; CHECK-NEXT: mov v0.b[6], w10
-; CHECK-NEXT: mov v0.b[7], w11
-; CHECK-NEXT: mov v0.b[8], w9
-; CHECK-NEXT: mov v0.b[9], w8
-; CHECK-NEXT: mov v0.b[10], w10
-; CHECK-NEXT: mov v0.b[11], w11
-; CHECK-NEXT: mov v0.b[12], w9
-; CHECK-NEXT: mov v0.b[13], w8
-; CHECK-NEXT: mov v0.b[14], w10
-; CHECK-NEXT: mov v0.b[15], w11
+; CHECK-NEXT: uzp1 v0.8h, v0.8h, v0.8h
+; CHECK-NEXT: uzp1 v0.16b, v0.16b, v0.16b
; CHECK-NEXT: ret
entry:
%a0 = extractelement <4 x i32> %a, i32 0