static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue ComparisonResult(N, 0);
- EVT BoolVecVT = ComparisonResult.getValueType();
- assert(BoolVecVT.isVector() && "Must be a vector type");
+ EVT VecVT = ComparisonResult.getValueType();
+ assert(VecVT.isVector() && "Must be a vector type");
- unsigned NumElts = BoolVecVT.getVectorNumElements();
+ unsigned NumElts = VecVT.getVectorNumElements();
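+ // Only lane counts that map onto a single NEON horizontal reduction
+ // (addv/addp over a 64- or 128-bit vector) are handled.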
if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
return SDValue();
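+ // Bool vectors are remapped to a wider integer vector type below; vectors
+ // of any other element type must already be legal on the target.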
+ if (VecVT.getVectorElementType() != MVT::i1 &&
+ !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
+ return SDValue();
+
// If we can find the original types to work on instead of a vector of i1,
// we can avoid extend/extract conversion instructions.
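+ // For example, a comparison of two <4 x i32> vectors can keep operating on
+ // <4 x i32> lanes instead of widening a <4 x i1> mask back up.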
- EVT VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
- if (!VecVT.isSimple()) {
- unsigned BitsPerElement = std::max(64 / NumElts, 8u); // min. 64-bit vector
- VecVT =
- BoolVecVT.changeVectorElementType(MVT::getIntegerVT(BitsPerElement));
+ if (VecVT.getVectorElementType() == MVT::i1) {
+ VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
+ if (!VecVT.isSimple()) {
+ unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
+ VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
+ }
}
VecVT = VecVT.changeVectorElementTypeToInteger();
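// ... (elided in this excerpt: vectors wider than 128 bits are rejected,
// then each lane of the comparison result is ANDed with a distinct
// power-of-two mask constant to form RepresentativeBits, with ResultVT the
// matching scalar integer type)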
return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
}
+static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
+ StoreSDNode *Store) {
+ if (!Store->isTruncatingStore())
+ return SDValue();
+
+ SDLoc DL(Store);
+ SDValue VecOp = Store->getValue();
+ EVT VT = VecOp.getValueType();
+ EVT MemVT = Store->getMemoryVT();
+
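+ // Only truncating stores with an <N x i1> memory type, i.e. bool-vector
+ // stores, are of interest here.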
+ if (!MemVT.isVector() || !VT.isVector() ||
+ MemVT.getVectorElementType() != MVT::i1)
+ return SDValue();
+
+ // If we are storing a vector that we are currently building, let
+ // `scalarizeVectorStore()` handle this more efficiently.
+ if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
+ return SDValue();
+
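+ // Normalize the stored value to its <N x i1> form so that
+ // vectorToScalarBitmask sees a canonical bool vector.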
+ VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
+ SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
+ if (!VectorBits)
+ return SDValue();
+
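+ // Resize the bitmask to the in-memory size of the <N x i1> store (e.g., a
+ // <4 x i1> store writes a full byte).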
+ EVT StoreVT =
+ EVT::getIntegerVT(*DAG.getContext(), MemVT.getStoreSizeInBits());
+ SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
+ return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
+ Store->getMemOperand());
+}
+
static SDValue performSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
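// ... (earlier store combines elided; ST is cast<StoreSDNode>(N))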
if (SDValue Store = foldTruncStoreOfExt(DAG, N))
return Store;
+ if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
+ return Store;
+
return SDValue();
}
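
; Illustrative sketch (hypothetical IR, not taken from the patch or its
; tests; function and value names are made up): the combines above turn a
; bool-vector truncating store like the one below into a compare, an AND with
; per-lane power-of-two constants, an addv reduction, and a single scalar
; byte store.
define void @bool_store_sketch(<4 x i32> %v, ptr %p) {
  %cmp = icmp ne <4 x i32> %v, zeroinitializer ; <4 x i1> comparison result
  store <4 x i1> %cmp, ptr %p                  ; legalizes to a truncating store
  ret void
}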
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu %s -o - | FileCheck %s
define void @test_mismatched_setcc(<4 x i22> %l, <4 x i22> %r, ptr %addr) {
; CHECK-LABEL: test_mismatched_setcc:
-; CHECK: cmeq [[CMP128:v[0-9]+]].4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-; CHECK: xtn {{v[0-9]+}}.4h, [[CMP128]].4s
+; CHECK: // %bb.0:
+; CHECK-NEXT: movi v2.4s, #63, msl #16
+; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_0]
+; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
+; CHECK-NEXT: and v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: cmeq v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-NEXT: addv s0, v0.4s
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
%tst = icmp eq <4 x i22> %l, %r
store <4 x i1> %tst, ptr %addr
ret void
}
-; TODO(lawben): Change this in follow-up patch to #D145301, as truncating stores fix this.
-; Larger vector types don't map directly.
-define i8 @no_convert_large_vector(<8 x i32> %vec) {
+; Larger vector types don't map directly, but they can be split/truncated and then converted.
+; After the comparison against 0, the result is truncated to <8 x i16>, which the conversion handles again.
+define i8 @convert_large_vector(<8 x i32> %vec) {
+; CHECK-LABEL: lCPI15_0:
+; CHECK-NEXT: .short 1
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .short 4
+; CHECK-NEXT: .short 8
+; CHECK-NEXT: .short 16
+; CHECK-NEXT: .short 32
+; CHECK-NEXT: .short 64
+; CHECK-NEXT: .short 128
+
; CHECK-LABEL: convert_large_vector:
-; CHECK: cmeq.4s v1, v1, #0
-; CHECK-NOT: addv
+; CHECK: Lloh30:
+; CHECK-NEXT: adrp x8, lCPI15_0@PAGE
+; CHECK-NEXT: cmeq.4s v1, v1, #0
+; CHECK-NEXT: cmeq.4s v0, v0, #0
+; CHECK-NEXT: uzp1.8h v0, v0, v1
+; CHECK-NEXT: Lloh31:
+; CHECK-NEXT: ldr q1, [x8, lCPI15_0@PAGEOFF]
+; CHECK-NEXT: bic.16b v0, v1, v0
+; CHECK-NEXT: addv.8h h0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: and w0, w8, #0xff
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
%cmp_result = icmp ne <8 x i32> %vec, zeroinitializer
%bitmask = bitcast <8 x i1> %cmp_result to i8
ret i8 %bitmask
}
+define i4 @convert_legalized_illegal_element_size(<4 x i22> %vec) {
+; CHECK-LABEL: convert_legalized_illegal_element_size:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: movi.4s v1, #63, msl #16
+; CHECK-NEXT: Lloh32:
+; CHECK-NEXT: adrp x8, lCPI16_0@PAGE
+; CHECK-NEXT: cmtst.4s v0, v0, v1
+; CHECK-NEXT: Lloh33:
+; CHECK-NEXT: ldr d1, [x8, lCPI16_0@PAGEOFF]
+; CHECK-NEXT: xtn.4h v0, v0
+; CHECK-NEXT: and.8b v0, v0, v1
+; CHECK-NEXT: addv.4h h0, v0
+; CHECK-NEXT: fmov w0, s0
+; CHECK-NEXT: ret
+
+ %cmp_result = icmp ne <4 x i22> %vec, zeroinitializer
+ %bitmask = bitcast <4 x i1> %cmp_result to i4
+ ret i4 %bitmask
+}
+
; This may still be converted as a v8i8 after the vector concat (but not as v4iX).
define i8 @no_direct_convert_for_bad_concat(<4 x i32> %vec) {
; CHECK-LABEL: no_direct_convert_for_bad_concat:
%cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
%vector_pad = shufflevector <4 x i1> poison, <4 x i1> %cmp_result, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%bitmask = bitcast <8 x i1> %vector_pad to i8
ret i8 %bitmask
}
+
+define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) {
+; CHECK-LABEL: no_combine_illegal_num_elements
+; CHECK-NOT: addv
+
+ %cmp_result = icmp ne <6 x i32> %vec, zeroinitializer
+ %bitmask = bitcast <6 x i1> %cmp_result to i6
+ ret i6 %bitmask
+}
--- /dev/null
+; RUN: llc -mtriple=aarch64-apple-darwin -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
+
+define void @store_16_elements(<16 x i8> %vec, ptr %out) {
+; Bits used in mask
+; CHECK-LABEL: lCPI0_0:
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 32
+; CHECK-NEXT: .byte 64
+; CHECK-NEXT: .byte 128
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 32
+; CHECK-NEXT: .byte 64
+; CHECK-NEXT: .byte 128
+
+; Actual conversion
+; CHECK-LABEL: store_16_elements
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh0:
+; CHECK-NEXT: adrp x8, lCPI0_0@PAGE
+; CHECK-NEXT: cmeq.16b v0, v0, #0
+; CHECK-NEXT: Lloh1:
+; CHECK-NEXT: ldr q1, [x8, lCPI0_0@PAGEOFF]
+; CHECK-NEXT: bic.16b v0, v1, v0
+; CHECK-NEXT: ext.16b v1, v0, v0, #8
+; CHECK-NEXT: addv.8b b0, v0
+; CHECK-NEXT: addv.8b b1, v1
+; CHECK-NEXT: fmov w9, s0
+; CHECK-NEXT: fmov w8, s1
+; CHECK-NEXT: orr w8, w9, w8, lsl #8
+; CHECK-NEXT: strh w8, [x0]
+; CHECK-NEXT: ret
+
+ %cmp_result = icmp ne <16 x i8> %vec, zeroinitializer
+ store <16 x i1> %cmp_result, ptr %out
+ ret void
+}
+
+define void @store_8_elements(<8 x i16> %vec, ptr %out) {
+; CHECK-LABEL: lCPI1_0:
+; CHECK-NEXT: .short 1
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .short 4
+; CHECK-NEXT: .short 8
+; CHECK-NEXT: .short 16
+; CHECK-NEXT: .short 32
+; CHECK-NEXT: .short 64
+; CHECK-NEXT: .short 128
+
+; CHECK-LABEL: store_8_elements
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh2:
+; CHECK-NEXT: adrp x8, lCPI1_0@PAGE
+; CHECK-NEXT: cmeq.8h v0, v0, #0
+; CHECK-NEXT: Lloh3:
+; CHECK-NEXT: ldr q1, [x8, lCPI1_0@PAGEOFF]
+; CHECK-NEXT: bic.16b v0, v1, v0
+; CHECK-NEXT: addv.8h h0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+ %cmp_result = icmp ne <8 x i16> %vec, zeroinitializer
+ store <8 x i1> %cmp_result, ptr %out
+ ret void
+}
+
+define void @store_4_elements(<4 x i32> %vec, ptr %out) {
+; CHECK-LABEL: lCPI2_0:
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .long 2
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .long 8
+
+; CHECK-LABEL: store_4_elements
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh4:
+; CHECK-NEXT: adrp x8, lCPI2_0@PAGE
+; CHECK-NEXT: cmeq.4s v0, v0, #0
+; CHECK-NEXT: Lloh5:
+; CHECK-NEXT: ldr q1, [x8, lCPI2_0@PAGEOFF]
+; CHECK-NEXT: bic.16b v0, v1, v0
+; CHECK-NEXT: addv.4s s0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+ %cmp_result = icmp ne <4 x i32> %vec, zeroinitializer
+ store <4 x i1> %cmp_result, ptr %out
+ ret void
+}
+
+define void @store_2_elements(<2 x i64> %vec, ptr %out) {
+; CHECK-LABEL: lCPI3_0:
+; CHECK-NEXT: .quad 1
+; CHECK-NEXT: .quad 2
+
+; CHECK-LABEL: store_2_elements
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh6:
+; CHECK-NEXT: adrp x8, lCPI3_0@PAGE
+; CHECK-NEXT: cmeq.2d v0, v0, #0
+; CHECK-NEXT: Lloh7:
+; CHECK-NEXT: ldr q1, [x8, lCPI3_0@PAGEOFF]
+; CHECK-NEXT: bic.16b v0, v1, v0
+; CHECK-NEXT: addp.2d d0, v0
+; CHECK-NEXT: fmov x8, d0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+ %cmp_result = icmp ne <2 x i64> %vec, zeroinitializer
+ store <2 x i1> %cmp_result, ptr %out
+ ret void
+}
+
+define void @add_trunc_compare_before_store(<4 x i32> %vec, ptr %out) {
+; CHECK-LABEL: lCPI4_0:
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .long 2
+; CHECK-NEXT: .long 4
+; CHECK-NEXT: .long 8
+
+; CHECK-LABEL: add_trunc_compare_before_store
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh8:
+; CHECK-NEXT: adrp x8, lCPI4_0@PAGE
+; CHECK-NEXT: shl.4s v0, v0, #31
+; CHECK-NEXT: cmlt.4s v0, v0, #0
+; CHECK-NEXT: Lloh9:
+; CHECK-NEXT: ldr q1, [x8, lCPI4_0@PAGEOFF]
+; CHECK-NEXT: and.16b v0, v0, v1
+; CHECK-NEXT: addv.4s s0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+ %trunc = trunc <4 x i32> %vec to <4 x i1>
+ store <4 x i1> %trunc, ptr %out
+ ret void
+}
+
+define void @add_trunc_mask_unknown_vector_type(<4 x i1> %vec, ptr %out) {
+; CHECK-LABEL: lCPI5_0:
+; CHECK: .short 1
+; CHECK: .short 2
+; CHECK: .short 4
+; CHECK: .short 8
+
+; CHECK-LABEL: add_trunc_mask_unknown_vector_type
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh10:
+; CHECK-NEXT: adrp x8, lCPI5_0@PAGE
+; CHECK-NEXT: shl.4h v0, v0, #15
+; CHECK-NEXT: cmlt.4h v0, v0, #0
+; CHECK-NEXT: Lloh11:
+; CHECK-NEXT: ldr d1, [x8, lCPI5_0@PAGEOFF]
+; CHECK-NEXT: and.8b v0, v0, v1
+; CHECK-NEXT: addv.4h h0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+ store <4 x i1> %vec, ptr %out
+ ret void
+}
+
+define void @store_8_elements_64_bit_vector(<8 x i8> %vec, ptr %out) {
+; CHECK-LABEL: lCPI6_0:
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .byte 16
+; CHECK-NEXT: .byte 32
+; CHECK-NEXT: .byte 64
+; CHECK-NEXT: .byte 128
+
+; CHECK-LABEL: store_8_elements_64_bit_vector
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh12:
+; CHECK-NEXT: adrp x8, lCPI6_0@PAGE
+; CHECK-NEXT: cmeq.8b v0, v0, #0
+; CHECK-NEXT: Lloh13:
+; CHECK-NEXT: ldr d1, [x8, lCPI6_0@PAGEOFF]
+; CHECK-NEXT: bic.8b v0, v1, v0
+; CHECK-NEXT: addv.8b b0, v0
+; CHECK-NEXT: st1.b { v0 }[0], [x0]
+; CHECK-NEXT: ret
+
+ %cmp_result = icmp ne <8 x i8> %vec, zeroinitializer
+ store <8 x i1> %cmp_result, ptr %out
+ ret void
+}
+
+define void @store_4_elements_64_bit_vector(<4 x i16> %vec, ptr %out) {
+; CHECK-LABEL: lCPI7_0:
+; CHECK-NEXT: .short 1
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .short 4
+; CHECK-NEXT: .short 8
+
+; CHECK-LABEL: store_4_elements_64_bit_vector
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh14:
+; CHECK-NEXT: adrp x8, lCPI7_0@PAGE
+; CHECK-NEXT: cmeq.4h v0, v0, #0
+; CHECK-NEXT: Lloh15:
+; CHECK-NEXT: ldr d1, [x8, lCPI7_0@PAGEOFF]
+; CHECK-NEXT: bic.8b v0, v1, v0
+; CHECK-NEXT: addv.4h h0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+ %cmp_result = icmp ne <4 x i16> %vec, zeroinitializer
+ store <4 x i1> %cmp_result, ptr %out
+ ret void
+}
+
+define void @store_2_elements_64_bit_vector(<2 x i32> %vec, ptr %out) {
+; CHECK-LABEL: lCPI8_0:
+; CHECK-NEXT: .long 1
+; CHECK-NEXT: .long 2
+
+; CHECK-LABEL: store_2_elements_64_bit_vector
+; CHECK: ; %bb.0:
+; CHECK-NEXT: Lloh16:
+; CHECK-NEXT: adrp x8, lCPI8_0@PAGE
+; CHECK-NEXT: cmeq.2s v0, v0, #0
+; CHECK-NEXT: Lloh17:
+; CHECK-NEXT: ldr d1, [x8, lCPI8_0@PAGEOFF]
+; CHECK-NEXT: bic.8b v0, v1, v0
+; CHECK-NEXT: addp.2s v0, v0, v0
+; CHECK-NEXT: fmov w8, s0
+; CHECK-NEXT: strb w8, [x0]
+; CHECK-NEXT: ret
+
+ %cmp_result = icmp ne <2 x i32> %vec, zeroinitializer
+ store <2 x i1> %cmp_result, ptr %out
+ ret void
+}
+
+define void @no_combine_without_truncate(<16 x i8> %vec, ptr %out) {
+; CHECK-LABEL: no_combine_without_truncate
+; CHECK: cmtst.16b v0, v0, v0
+; CHECK-NOT: addv.8b b0, v0
+
+ %cmp_result = icmp ne <16 x i8> %vec, zeroinitializer
+ %extended_result = sext <16 x i1> %cmp_result to <16 x i8>
+ store <16 x i8> %extended_result, ptr %out
+ ret void
+}
+
+define void @no_combine_for_non_bool_truncate(<4 x i32> %vec, ptr %out) {
+; CHECK-LABEL: no_combine_for_non_bool_truncate
+; CHECK: xtn.4h v0, v0
+; CHECK-NOT: addv.4s s0, v0
+
+ %trunc = trunc <4 x i32> %vec to <4 x i8>
+ store <4 x i8> %trunc, ptr %out
+ ret void
+}
+
+define void @no_combine_for_build_vector(i1 %a, i1 %b, i1 %c, i1 %d, ptr %out) {
+; CHECK-LABEL: no_combine_for_build_vector
+; CHECK-NOT: addv
+
+ %1 = insertelement <4 x i1> undef, i1 %a, i64 0
+ %2 = insertelement <4 x i1> %1, i1 %b, i64 1
+ %3 = insertelement <4 x i1> %2, i1 %c, i64 2
+ %vec = insertelement <4 x i1> %3, i1 %d, i64 3
+ store <4 x i1> %vec, ptr %out
+ ret void
+}
; CHECK-LABEL: uaddo_v4i1:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v2.4h, #1
+; CHECK-NEXT: adrp x8, .LCPI10_0
+; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI10_0]
; CHECK-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: umov w8, v0.h[0]
-; CHECK-NEXT: umov w9, v0.h[1]
-; CHECK-NEXT: umov w10, v0.h[2]
-; CHECK-NEXT: umov w11, v0.h[3]
-; CHECK-NEXT: and v1.8b, v0.8b, v2.8b
-; CHECK-NEXT: cmeq v0.4h, v1.4h, v0.4h
-; CHECK-NEXT: and w8, w8, #0x1
-; CHECK-NEXT: bfi w8, w9, #1, #1
+; CHECK-NEXT: shl v1.4h, v0.4h, #15
+; CHECK-NEXT: and v2.8b, v0.8b, v2.8b
+; CHECK-NEXT: cmeq v0.4h, v2.4h, v0.4h
+; CHECK-NEXT: cmlt v1.4h, v1.4h, #0
; CHECK-NEXT: mvn v0.8b, v0.8b
-; CHECK-NEXT: bfi w8, w10, #2, #1
-; CHECK-NEXT: orr w8, w8, w11, lsl #3
-; CHECK-NEXT: and w8, w8, #0xf
+; CHECK-NEXT: and v1.8b, v1.8b, v3.8b
+; CHECK-NEXT: addv h1, v1.4h
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
+; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: strb w8, [x0]
; CHECK-NEXT: ret
%t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
; CHECK-LABEL: umulo_v4i1:
; CHECK: // %bb.0:
-; CHECK-NEXT: fmov d2, d0
+; CHECK-NEXT: adrp x8, .LCPI10_0
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: shl v0.4h, v0.4h, #15
+; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI10_0]
+; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
+; CHECK-NEXT: and v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: addv h1, v0.4h
; CHECK-NEXT: movi v0.2d, #0000000000000000
-; CHECK-NEXT: and v1.8b, v2.8b, v1.8b
-; CHECK-NEXT: umov w8, v1.h[0]
-; CHECK-NEXT: umov w9, v1.h[1]
-; CHECK-NEXT: umov w10, v1.h[2]
-; CHECK-NEXT: umov w11, v1.h[3]
-; CHECK-NEXT: and w8, w8, #0x1
-; CHECK-NEXT: bfi w8, w9, #1, #1
-; CHECK-NEXT: bfi w8, w10, #2, #1
-; CHECK-NEXT: orr w8, w8, w11, lsl #3
-; CHECK-NEXT: and w8, w8, #0xf
+; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: strb w8, [x0]
; CHECK-NEXT: ret
%t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)