From cd68e17bc2f9b7b54a3d3ab5f917793d41ce17cb Mon Sep 17 00:00:00 2001
From: Lawrence Benson
Date: Fri, 28 Apr 2023 11:19:45 +0100
Subject: [PATCH] [AArch64] Add support for efficient bitcast in vector truncate store.

Following the changes in D145301, we now also support the efficient
bitcast when storing the bool vector. Previously, this was expanded.

Differential Revision: https://reviews.llvm.org/D148316
---
 llvm/lib/Target/AArch64/AArch64ISelLowering.cpp    |  55 +++-
 llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll   |  15 +-
 .../AArch64/vec-combine-compare-to-bitmask.ll      |  60 ++++-
 .../AArch64/vec-combine-compare-truncate-store.ll  | 281 +++++++++++++++++++++
 llvm/test/CodeGen/AArch64/vec_uaddo.ll             |  20 +-
 llvm/test/CodeGen/AArch64/vec_umulo.ll             |  19 +-
 6 files changed, 413 insertions(+), 37 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1b51140..1fb9833 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -19775,20 +19775,25 @@ static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
 static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
   SDLoc DL(N);
   SDValue ComparisonResult(N, 0);
-  EVT BoolVecVT = ComparisonResult.getValueType();
-  assert(BoolVecVT.isVector() && "Must be a vector type");
+  EVT VecVT = ComparisonResult.getValueType();
+  assert(VecVT.isVector() && "Must be a vector type");
 
-  unsigned NumElts = BoolVecVT.getVectorNumElements();
+  unsigned NumElts = VecVT.getVectorNumElements();
   if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
     return SDValue();
 
+  if (VecVT.getVectorElementType() != MVT::i1 &&
+      !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
+    return SDValue();
+
   // If we can find the original types to work on instead of a vector of i1,
   // we can avoid extend/extract conversion instructions.
-  EVT VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
-  if (!VecVT.isSimple()) {
-    unsigned BitsPerElement = std::max(64 / NumElts, 8u); // min. 64-bit vector
-    VecVT =
-        BoolVecVT.changeVectorElementType(MVT::getIntegerVT(BitsPerElement));
+  if (VecVT.getVectorElementType() == MVT::i1) {
+    VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
+    if (!VecVT.isSimple()) {
+      unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
+      VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
+    }
   }
   VecVT = VecVT.changeVectorElementTypeToInteger();
 
@@ -19849,6 +19854,37 @@ static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
   return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
 }
 
+static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
+                                                 StoreSDNode *Store) {
+  if (!Store->isTruncatingStore())
+    return SDValue();
+
+  SDLoc DL(Store);
+  SDValue VecOp = Store->getValue();
+  EVT VT = VecOp.getValueType();
+  EVT MemVT = Store->getMemoryVT();
+
+  if (!MemVT.isVector() || !VT.isVector() ||
+      MemVT.getVectorElementType() != MVT::i1)
+    return SDValue();
+
+  // If we are storing a vector that we are currently building, let
+  // `scalarizeVectorStore()` handle this more efficiently.
+  if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
+    return SDValue();
+
+  VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
+  SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
+  if (!VectorBits)
+    return SDValue();
+
+  EVT StoreVT =
+      EVT::getIntegerVT(*DAG.getContext(), MemVT.getStoreSizeInBits());
+  SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
+  return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
+                      Store->getMemOperand());
+}
+
 static SDValue performSTORECombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG,
@@ -19887,6 +19923,9 @@ static SDValue performSTORECombine(SDNode *N,
   if (SDValue Store = foldTruncStoreOfExt(DAG, N))
     return Store;
 
+  if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
+    return Store;
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll b/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
index cf7b9b1..e482833 100644
--- a/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
+++ b/llvm/test/CodeGen/AArch64/setcc-type-mismatch.ll
@@ -1,9 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s
 
 define void @test_mismatched_setcc(<4 x i22> %l, <4 x i22> %r, ptr %addr) {
 ; CHECK-LABEL: test_mismatched_setcc:
-; CHECK: cmeq [[CMP128:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-; CHECK: xtn {{v[0-9]+}}.4h, [[CMP128]].4s
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    movi v2.4s, #63, msl #16
+; CHECK-NEXT:    adrp x8, .LCPI0_0
+; CHECK-NEXT:    ldr q3, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT:    and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT:    and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    cmeq v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w8, s0
+; CHECK-NEXT:    strb w8, [x0]
+; CHECK-NEXT:    ret
 
   %tst = icmp eq <4 x i22> %l, %r
   store <4 x i1> %tst, ptr %addr
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index e1daead..4938082 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -418,18 +418,59 @@ define i4 @convert_to_bitmask_float(<4 x float> %vec) {
   ret i4 %bitmask
 }
 
-; TODO(lawben): Change this in follow-up patch to #D145301, as truncating stores fix this.
-; Larger vector types don't map directly.
-define i8 @no_convert_large_vector(<8 x i32> %vec) {
+; Larger vector types don't map directly, but they can be split/truncated and then converted.
+; After the comparison against 0, this is truncated to <8 x i16>, which is valid again.
+define i8 @convert_large_vector(<8 x i32> %vec) {
+; CHECK-LABEL: lCPI15_0:
+; CHECK-NEXT: .short 1
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .short 4
+; CHECK-NEXT: .short 8
+; CHECK-NEXT: .short 16
+; CHECK-NEXT: .short 32
+; CHECK-NEXT: .short 64
+; CHECK-NEXT: .short 128
+
 ; CHECK-LABEL: convert_large_vector:
-; CHECK: cmeq.4s v1, v1, #0
-; CHECK-NOT: addv
+; CHECK: Lloh30:
+; CHECK-NEXT: adrp x8, lCPI15_0@PAGE
+; CHECK-NEXT: cmeq.4s v1, v1, #0
+; CHECK-NEXT: cmeq.4s v0, v0, #0
+; CHECK-NEXT: uzp1.8h v0, v0, v1
+; CHECK-NEXT: Lloh31:
+; CHECK-NEXT: ldr q1, [x8, lCPI15_0@PAGEOFF]
+; CHECK-NEXT: bic.16b v0, v1, v0
+; CHECK-NEXT: addv.8h h0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
 
   %cmp_result = icmp ne <8 x i32> %vec, zeroinitializer
   %bitmask = bitcast <8 x i1> %cmp_result to i8
   ret i8 %bitmask
 }
 
+define i4 @convert_legalized_illegal_element_size(<4 x i22> %vec) {
+; CHECK-LABEL: convert_legalized_illegal_element_size
+; CHECK: ; %bb.0:
+; CHECK-NEXT: movi.4s v1, #63, msl #16
+; CHECK-NEXT: Lloh32:
+; CHECK-NEXT: adrp x8, lCPI16_0@PAGE
+; CHECK-NEXT: cmtst.4s v0, v0, v1
+; CHECK-NEXT: Lloh33:
+; CHECK-NEXT: ldr d1, [x8, lCPI16_0@PAGEOFF]
+; CHECK-NEXT: xtn.4h v0, v0
+; CHECK-NEXT: and.8b v0, v0, v1
+; CHECK-NEXT: addv.4h h0, v0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+
+  %cmp_result = icmp ne <4 x i22> %vec, zeroinitializer
+  %bitmask = bitcast <4 x i1> %cmp_result to i4
+  ret i4 %bitmask
+}
+
 ; This may still be converted as a v8i8 after the vector concat (but not as v4iX).
 define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
 ; CHECK-LABEL: no_direct_convert_for_bad_concat:
@@ -450,3 +491,12 @@ define <8 x i1> @no_convert_without_direct_bitcast(<8 x i16> %vec) {
   %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
   ret <8 x i1> %cmp_result
 }
+
+define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) {
+; CHECK-LABEL: no_combine_illegal_num_elements
+; CHECK-NOT: addv
+
+  %cmp_result = icmp ne <6 x i32> %vec, zeroinitializer
+  %bitmask = bitcast <6 x i1> %cmp_result to i6
+  ret i6 %bitmask
+}
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
new file mode 100644
index 0000000..dc87809
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
@@ -0,0 +1,281 @@
+; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
+
+define void @store_16_elements(<16 x i8> %vec, ptr %out) {
+; Bits used in mask
+; CHECK-LABEL: lCPI0_0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 32
+; CHECK-NEXT: .byte 64
+; CHECK-NEXT: .byte 128
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 32
+; CHECK-NEXT: .byte 64
+; CHECK-NEXT: .byte 128
+
+; Actual conversion
+; CHECK-LABEL: store_16_elements
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh0:
+; CHECK-NEXT: adrp x8, lCPI0_0@PAGE
+; CHECK-NEXT: cmeq.16b v0, v0, #0
+; CHECK-NEXT: Lloh1:
+; CHECK-NEXT: ldr q1, [x8, lCPI0_0@PAGEOFF]
+; CHECK-NEXT: bic.16b v0, v1, v0
+; CHECK-NEXT: ext.16b v1, v0, v0, #8
+; CHECK-NEXT: addv.8b b0, v0
+; CHECK-NEXT: addv.8b b1, v1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: orr w8, w9, w8, lsl #8
+; CHECK-NEXT: strh w8, [x0]
+; CHECK-NEXT: ret
+
+  %cmp_result = icmp ne <16 x i8> %vec, zeroinitializer
+  store <16 x i1> %cmp_result, ptr %out
+  ret void
+}
+
+define void @store_8_elements(<8 x i16> %vec, ptr %out) {
+; CHECK-LABEL: lCPI1_0:
+; CHECK-NEXT: .short 1
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .short 4
+; CHECK-NEXT: .short 8
+; CHECK-NEXT: .short 16
+; CHECK-NEXT: .short 32
+; CHECK-NEXT: .short 64
+; CHECK-NEXT: .short 128
+
+; CHECK-LABEL: store_8_elements
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh2:
+; CHECK-NEXT: adrp x8, lCPI1_0@PAGE
+; CHECK-NEXT: cmeq.8h v0, v0, #0
+; CHECK-NEXT: Lloh3:
+; CHECK-NEXT: ldr q1, [x8, lCPI1_0@PAGEOFF]
+; CHECK-NEXT: bic.16b v0, v1, v0
+; CHECK-NEXT: addv.8h h0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+  %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
+  store <8 x i1> %cmp_result, ptr %out
+  ret void
+}
+
+define void @store_4_elements(<4 x i32> %vec, ptr %out) {
+; CHECK-LABEL: lCPI2_0:
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .long 2
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .long 8
+
+; CHECK-LABEL: store_4_elements
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh4:
+; CHECK-NEXT: adrp x8, lCPI2_0@PAGE
+; CHECK-NEXT: cmeq.4s v0, v0, #0
+; CHECK-NEXT: Lloh5:
+; CHECK-NEXT: ldr q1, [x8, lCPI2_0@PAGEOFF]
+; CHECK-NEXT: bic.16b v0, v1, v0
+; CHECK-NEXT: addv.4s s0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+  %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
+  store <4 x i1> %cmp_result, ptr %out
+  ret void
+}
+
+define void @store_2_elements(<2 x i64> %vec, ptr %out) {
+; CHECK-LABEL: lCPI3_0:
+; CHECK-NEXT: .quad 1
+; CHECK-NEXT: .quad 2
+
+; CHECK-LABEL: store_2_elements
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh6:
+; CHECK-NEXT: adrp x8, lCPI3_0@PAGE
+; CHECK-NEXT: cmeq.2d v0, v0, #0
+; CHECK-NEXT: Lloh7:
+; CHECK-NEXT: ldr q1, [x8, lCPI3_0@PAGEOFF]
+; CHECK-NEXT: bic.16b v0, v1, v0
+; CHECK-NEXT: addp.2d d0, v0
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+  %cmp_result = icmp ne <2 x i64> %vec, zeroinitializer
+  store <2 x i1> %cmp_result, ptr %out
+  ret void
+}
+
+define void @add_trunc_compare_before_store(<4 x i32> %vec, ptr %out) {
+; CHECK-LABEL: lCPI4_0:
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .long 2
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .long 8
+
+; CHECK-LABEL: add_trunc_compare_before_store
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh8:
+; CHECK-NEXT: adrp x8, lCPI4_0@PAGE
+; CHECK-NEXT: shl.4s v0, v0, #31
+; CHECK-NEXT: cmlt.4s v0, v0, #0
+; CHECK-NEXT: Lloh9:
+; CHECK-NEXT: ldr q1, [x8, lCPI4_0@PAGEOFF]
+; CHECK-NEXT: and.16b v0, v0, v1
+; CHECK-NEXT: addv.4s s0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+  %trunc = trunc <4 x i32> %vec to <4 x i1>
+  store <4 x i1> %trunc, ptr %out
+  ret void
+}
+
+define void @add_trunc_mask_unknown_vector_type(<4 x i1> %vec, ptr %out) {
+; CHECK-LABEL: lCPI5_0:
+; CHECK: .short 1
+; CHECK: .short 2
+; CHECK: .short 4
+; CHECK: .short 8
+
+; CHECK-LABEL: add_trunc_mask_unknown_vector_type
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh10:
+; CHECK-NEXT: adrp x8, lCPI5_0@PAGE
+; CHECK-NEXT: shl.4h v0, v0, #15
+; CHECK-NEXT: cmlt.4h v0, v0, #0
+; CHECK-NEXT: Lloh11:
+; CHECK-NEXT: ldr d1, [x8, lCPI5_0@PAGEOFF]
+; CHECK-NEXT: and.8b v0, v0, v1
+; CHECK-NEXT: addv.4h h0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+  store <4 x i1> %vec, ptr %out
+  ret void
+}
+
+define void @store_8_elements_64_bit_vector(<8 x i8> %vec, ptr %out) {
+; CHECK-LABEL: lCPI6_0:
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 32
+; CHECK-NEXT: .byte 64
+; CHECK-NEXT: .byte 128
+
+; CHECK-LABEL: store_8_elements_64_bit_vector
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh12:
+; CHECK-NEXT: adrp x8, lCPI6_0@PAGE
+; CHECK-NEXT: cmeq.8b v0, v0, #0
+; CHECK-NEXT: Lloh13:
+; CHECK-NEXT: ldr d1, [x8, lCPI6_0@PAGEOFF]
+; CHECK-NEXT: bic.8b v0, v1, v0
+; CHECK-NEXT: addv.8b b0, v0
+; CHECK-NEXT: st1.b { v0 }[0], [x0]
+; CHECK-NEXT: ret
+
+  %cmp_result = icmp ne <8 x i8> %vec, zeroinitializer
+  store <8 x i1> %cmp_result, ptr %out
+  ret void
+}
+
+define void @store_4_elements_64_bit_vector(<4 x i16> %vec, ptr %out) {
+; CHECK-LABEL: lCPI7_0:
+; CHECK-NEXT: .short 1
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .short 4
+; CHECK-NEXT: .short 8
+
+; CHECK-LABEL: store_4_elements_64_bit_vector
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh14:
+; CHECK-NEXT: adrp x8, lCPI7_0@PAGE
+; CHECK-NEXT: cmeq.4h v0, v0, #0
+; CHECK-NEXT: Lloh15:
+; CHECK-NEXT: ldr d1, [x8, lCPI7_0@PAGEOFF]
+; CHECK-NEXT: bic.8b v0, v1, v0
+; CHECK-NEXT: addv.4h h0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+  %cmp_result = icmp ne <4 x i16> %vec, zeroinitializer
+  store <4 x i1> %cmp_result, ptr %out
+  ret void
+}
+
+define void @store_2_elements_64_bit_vector(<2 x i32> %vec, ptr %out) {
+; CHECK-LABEL: lCPI8_0:
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .long 2
+
+; CHECK-LABEL: store_2_elements_64_bit_vector
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh16:
+; CHECK-NEXT: adrp x8, lCPI8_0@PAGE
+; CHECK-NEXT: cmeq.2s v0, v0, #0
+; CHECK-NEXT: Lloh17:
+; CHECK-NEXT: ldr d1, [x8, lCPI8_0@PAGEOFF]
+; CHECK-NEXT: bic.8b v0, v1, v0
+; CHECK-NEXT: addp.2s v0, v0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+  %cmp_result = icmp ne <2 x i32> %vec, zeroinitializer
+  store <2 x i1> %cmp_result, ptr %out
+  ret void
+}
+
+define void @no_combine_without_truncate(<16 x i8> %vec, ptr %out) {
+; CHECK-LABEL: no_combine_without_truncate
+; CHECK: cmtst.16b v0, v0, v0
+; CHECK-NOT: addv.8b b0, v0
+
+  %cmp_result = icmp ne <16 x i8> %vec, zeroinitializer
+  %extended_result = sext <16 x i1> %cmp_result to <16 x i8>
+  store <16 x i8> %extended_result, ptr %out
+  ret void
+}
+
+define void @no_combine_for_non_bool_truncate(<4 x i32> %vec, ptr %out) {
+; CHECK-LABEL: no_combine_for_non_bool_truncate
+; CHECK: xtn.4h v0, v0
+; CHECK-NOT: addv.4s s0, v0
+
+  %trunc = trunc <4 x i32> %vec to <4 x i8>
+  store <4 x i8> %trunc, ptr %out
+  ret void
+}
+
+define void @no_combine_for_build_vector(i1 %a, i1 %b, i1 %c, i1 %d, ptr %out) {
+; CHECK-LABEL: no_combine_for_build_vector
+; CHECK-NOT: addv
+
+  %1 = insertelement <4 x i1> undef, i1 %a, i64 0
+  %2 = insertelement <4 x i1> %1, i1 %b, i64 1
+  %3 = insertelement <4 x i1> %2, i1 %c, i64 2
+  %vec = insertelement <4 x i1> %3, i1 %d, i64 3
+  store <4 x i1> %vec, ptr %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
index 4ccc2c6..a4e1c80 100644
--- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll
@@ -246,22 +246,20 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ; CHECK-LABEL: uaddo_v4i1:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v2.4h, #1
+; CHECK-NEXT:    adrp x8, .LCPI10_0
+; CHECK-NEXT:    ldr d3, [x8, :lo12:.LCPI10_0]
 ; CHECK-NEXT:    and v1.8b, v1.8b, v2.8b
 ; CHECK-NEXT:    and v0.8b, v0.8b, v2.8b
 ; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    umov w8, v0.h[0]
-; CHECK-NEXT:    umov w9, v0.h[1]
-; CHECK-NEXT:    umov w10, v0.h[2]
-; CHECK-NEXT:    umov w11, v0.h[3]
-; CHECK-NEXT:    and v1.8b, v0.8b, v2.8b
-; CHECK-NEXT:    cmeq v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    and w8, w8, #0x1
-; CHECK-NEXT:    bfi w8, w9, #1, #1
+; CHECK-NEXT:    shl v1.4h, v0.4h, #15
+; CHECK-NEXT:    and v2.8b, v0.8b, v2.8b
+; CHECK-NEXT:    cmeq v0.4h, v2.4h, v0.4h
+; CHECK-NEXT:    cmlt v1.4h, v1.4h, #0
 ; CHECK-NEXT:    mvn v0.8b, v0.8b
-; CHECK-NEXT:    bfi w8, w10, #2, #1
-; CHECK-NEXT:    orr w8, w8, w11, lsl #3
-; CHECK-NEXT:    and w8, w8, #0xf
+; CHECK-NEXT:    and v1.8b, v1.8b, v3.8b
+; CHECK-NEXT:    addv h1, v1.4h
 ; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
+; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    strb w8, [x0]
 ; CHECK-NEXT:    ret
   %t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
diff --git a/llvm/test/CodeGen/AArch64/vec_umulo.ll b/llvm/test/CodeGen/AArch64/vec_umulo.ll
index a66cb6a..e40f477 100644
--- a/llvm/test/CodeGen/AArch64/vec_umulo.ll
+++ b/llvm/test/CodeGen/AArch64/vec_umulo.ll
@@ -296,18 +296,15 @@ define <4 x i32> @umulo_v4i24(<4 x i24> %a0, <4 x i24> %a1, ptr %p2) nounwind {
 define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ; CHECK-LABEL: umulo_v4i1:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov d2, d0
+; CHECK-NEXT:    adrp x8, .LCPI10_0
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    shl v0.4h, v0.4h, #15
+; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI10_0]
+; CHECK-NEXT:    cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT:    and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    addv h1, v0.4h
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
-; CHECK-NEXT:    and v1.8b, v2.8b, v1.8b
-; CHECK-NEXT:    umov w8, v1.h[0]
-; CHECK-NEXT:    umov w9, v1.h[1]
-; CHECK-NEXT:    umov w10, v1.h[2]
-; CHECK-NEXT:    umov w11, v1.h[3]
-; CHECK-NEXT:    and w8, w8, #0x1
-; CHECK-NEXT:    bfi w8, w9, #1, #1
-; CHECK-NEXT:    bfi w8, w10, #2, #1
-; CHECK-NEXT:    orr w8, w8, w11, lsl #3
-; CHECK-NEXT:    and w8, w8, #0xf
+; CHECK-NEXT:    fmov w8, s1
 ; CHECK-NEXT:    strb w8, [x0]
 ; CHECK-NEXT:    ret
   %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
-- 
2.7.4