}
}
- return SDValue();
+ // Fold uzp1(xtn x, xtn y) -> xtn(uzp1(x, y)), so that one narrowing
+ // instruction after the unzip replaces the two that preceded it.
+ // Only implemented on little-endian subtargets.
+ bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
+
+ // Bail out on big-endian targets; the combine is not implemented there.
+ if (!IsLittleEndian)
+ return SDValue();
+
+ if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
+ return SDValue();
+
+ // Returns the value being truncated when Operand is a TRUNCATE, optionally
+ // wrapped in a single BITCAST; returns an empty SDValue for anything else.
+ auto getSourceOp = [](SDValue Operand) -> SDValue {
+ // Look through one bitcast, if present.
+ SDValue Inner = Operand;
+ if (Inner.getOpcode() == ISD::BITCAST)
+ Inner = Inner.getOperand(0);
+ // Only a truncate provides a usable source operand.
+ if (Inner.getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+ return Inner.getOperand(0);
+ };
+
+ SDValue SourceOp0 = getSourceOp(Op0);
+ SDValue SourceOp1 = getSourceOp(Op1);
+
+ if (!SourceOp0 || !SourceOp1)
+ return SDValue();
+
+ if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
+ !SourceOp0.getValueType().isSimple())
+ return SDValue();
+
+ EVT ResultTy;
+
+ switch (SourceOp0.getSimpleValueType().SimpleTy) {
+ case MVT::v2i64:
+ ResultTy = MVT::v4i32;
+ break;
+ case MVT::v4i32:
+ ResultTy = MVT::v8i16;
+ break;
+ case MVT::v8i16:
+ ResultTy = MVT::v16i8;
+ break;
+ default:
+ return SDValue();
+ }
+
+ SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
+ SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
+ SDValue UzpResult =
+ DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
+
+ EVT BitcastResultTy;
+
+ switch (ResVT.getSimpleVT().SimpleTy) {
+ case MVT::v2i32:
+ BitcastResultTy = MVT::v2i64;
+ break;
+ case MVT::v4i16:
+ BitcastResultTy = MVT::v4i32;
+ break;
+ case MVT::v8i8:
+ BitcastResultTy = MVT::v8i16;
+ break;
+ default:
+ llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
+ }
+
+ return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
+ DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
}
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
; RUN: llc < %s -mtriple aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LE %s
; RUN: llc < %s -mtriple aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-BE %s
-; Test cases to show when UZP1 (TRUNC, TRUNC) could be combined to TRUNC (UZP1) but not yet implemented.
-
define <4 x i16> @test_combine_v4i16_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LE-LABEL: test_combine_v4i16_v2i64:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: xtn v0.2s, v0.2d
-; CHECK-LE-NEXT: xtn v1.2s, v1.2d
-; CHECK-LE-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-LE-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-LE-NEXT: xtn v0.4h, v0.4s
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_combine_v4i16_v2i64:
define <4 x i16> @test_combine_v4i16_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LE-LABEL: test_combine_v4i16_v4i32:
; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
; CHECK-LE-NEXT: xtn v0.4h, v0.4s
-; CHECK-LE-NEXT: xtn v1.4h, v1.4s
-; CHECK-LE-NEXT: uzp1 v0.4h, v0.4h, v1.4h
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_combine_v4i16_v4i32:
define <4 x i16> @test_combine_v4i16_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LE-LABEL: test_combine_v4i16_v8i16:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: xtn v0.8b, v0.8h
-; CHECK-LE-NEXT: xtn v1.8b, v1.8h
-; CHECK-LE-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-LE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
+; CHECK-LE-NEXT: xtn v0.4h, v0.4s
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_combine_v4i16_v8i16:
define <8 x i8> @test_combine_v8i8_v2i64(<2 x i64> %a, <2 x i64> %b) {
; CHECK-LE-LABEL: test_combine_v8i8_v2i64:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: xtn v0.2s, v0.2d
-; CHECK-LE-NEXT: xtn v1.2s, v1.2d
-; CHECK-LE-NEXT: uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-LE-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-LE-NEXT: xtn v0.8b, v0.8h
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_combine_v8i8_v2i64:
define <8 x i8> @test_combine_v8i8_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LE-LABEL: test_combine_v8i8_v4i32:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: xtn v0.4h, v0.4s
-; CHECK-LE-NEXT: xtn v1.4h, v1.4s
-; CHECK-LE-NEXT: uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-LE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-LE-NEXT: xtn v0.8b, v0.8h
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_combine_v8i8_v4i32:
define <8 x i8> @test_combine_v8i8_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-LE-LABEL: test_combine_v8i8_v8i16:
; CHECK-LE: // %bb.0:
+; CHECK-LE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
; CHECK-LE-NEXT: xtn v0.8b, v0.8h
-; CHECK-LE-NEXT: xtn v1.8b, v1.8h
-; CHECK-LE-NEXT: uzp1 v0.8b, v0.8b, v1.8b
; CHECK-LE-NEXT: ret
;
; CHECK-BE-LABEL: test_combine_v8i8_v8i16:
define i8 @trunc_v4i64_v4i8(<4 x i64> %input) {
; CHECK-LE-LABEL: trunc_v4i64_v4i8:
; CHECK-LE: // %bb.0:
-; CHECK-LE-NEXT: xtn v1.2s, v1.2d
-; CHECK-LE-NEXT: xtn v0.2s, v0.2d
-; CHECK-LE-NEXT: uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-LE-NEXT: uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-LE-NEXT: xtn v0.4h, v0.4s
; CHECK-LE-NEXT: addv h0, v0.4h
; CHECK-LE-NEXT: fmov w0, s0
; CHECK-LE-NEXT: ret