From a69d9d61569b3c882b0b0a2c92dbf125ef64403e Mon Sep 17 00:00:00 2001
From: Sam Tebbs
Date: Thu, 22 Aug 2019 10:29:20 +0000
Subject: [PATCH] Reapply: [ARM] Fix lsrl with a 128/256 bit shift amount
 or a shift of 32

The CodeGen/Thumb2/mve-vaddv.ll test needed to be amended to reflect the
changes from the above patch.

This reverts commit cd53ff6, reapplying 7c6b229.

llvm-svn: 369638
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp    | 13 +++--
 llvm/test/CodeGen/ARM/shift_parts.ll       | 65 +++++++++++++++++++++-
 llvm/test/CodeGen/Thumb2/mve-abs.ll        | 58 +++++++++----------
 llvm/test/CodeGen/Thumb2/mve-div-expand.ll | 12 ++--
 llvm/test/CodeGen/Thumb2/mve-vaddv.ll      | 18 +++---
 llvm/test/CodeGen/Thumb2/mve-vcvt.ll       | 12 ++--
 6 files changed, 112 insertions(+), 66 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 04ac7777f5e0..275859a6b912 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -5938,14 +5938,15 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
   unsigned ShPartsOpc = ARMISD::LSLL;
   ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
 
-  // If the shift amount is greater than 32 then do the default optimisation
-  if (Con && Con->getZExtValue() > 32)
+  // If the shift amount is greater than 32 or has a greater bitwidth than 64
+  // then do the default optimisation
+  if (ShAmt->getValueType(0).getSizeInBits() > 64 ||
+      (Con && Con->getZExtValue() >= 32))
     return SDValue();
 
-  // Extract the lower 32 bits of the shift amount if it's an i64
-  if (ShAmt->getValueType(0) == MVT::i64)
-    ShAmt = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ShAmt,
-                        DAG.getConstant(0, dl, MVT::i32));
+  // Extract the lower 32 bits of the shift amount if it's not an i32
+  if (ShAmt->getValueType(0) != MVT::i32)
+    ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
 
   if (ShOpc == ISD::SRL) {
     if (!Con)
diff --git a/llvm/test/CodeGen/ARM/shift_parts.ll b/llvm/test/CodeGen/ARM/shift_parts.ll
index a3a98e6d2520..9bc77d585bf9 100644
--- a/llvm/test/CodeGen/ARM/shift_parts.ll
+++ b/llvm/test/CodeGen/ARM/shift_parts.ll
@@ -52,7 +52,8 @@ entry:
 define i64 @shift_left_imm_big2(i64 %x) {
 ; CHECK-MVE-LABEL: shift_left_imm_big2:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    lsll r0, r1, #32
+; CHECK-MVE-NEXT:    mov r1, r0
+; CHECK-MVE-NEXT:    movs r0, #0
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-NON-MVE-LABEL: shift_left_imm_big2:
@@ -128,7 +129,8 @@ entry:
 define i64 @shift_right_imm_big2(i64 %x) {
 ; CHECK-MVE-LABEL: shift_right_imm_big2:
 ; CHECK-MVE:       @ %bb.0: @ %entry
-; CHECK-MVE-NEXT:    lsrl r0, r1, #32
+; CHECK-MVE-NEXT:    mov r0, r1
+; CHECK-MVE-NEXT:    movs r1, #0
 ; CHECK-MVE-NEXT:    bx lr
 ;
 ; CHECK-NON-MVE-LABEL: shift_right_imm_big2:
@@ -219,3 +221,62 @@ entry:
   store i40 %bf.clear, i40* %0, align 1
   ret void
 }
+
+%struct.a = type { i96 }
+
+define void @lsll_128bit_shift(%struct.a* nocapture %x) local_unnamed_addr #0 {
+; CHECK-MVE-LABEL: lsll_128bit_shift:
+; CHECK-MVE:       @ %bb.0: @ %entry
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    strd r1, r1, [r0]
+; CHECK-MVE-NEXT:    str r1, [r0, #8]
+; CHECK-MVE-NEXT:    bx lr
+;
+; CHECK-NON-MVE-LABEL: lsll_128bit_shift:
+; CHECK-NON-MVE:       @ %bb.0: @ %entry
+; CHECK-NON-MVE-NEXT:    movs r1, #0
+; CHECK-NON-MVE-NEXT:    str r1, [r0]
+; CHECK-NON-MVE-NEXT:    str r1, [r0, #4]
+; CHECK-NON-MVE-NEXT:    str r1, [r0, #8]
+; CHECK-NON-MVE-NEXT:    bx lr
+entry:
+  %0 = bitcast %struct.a* %x to i128*
+  %bf.load = load i128, i128* %0, align 8
+  %bf.clear4 = and i128 %bf.load, -79228162514264337593543950336
+  store i128 %bf.clear4, i128* %0, align 8
+  ret void
+}
+
+%struct.b = type { i184 }
+
+define void @lsll_256bit_shift(%struct.b* nocapture %x) local_unnamed_addr #0 {
+; CHECK-MVE-LABEL: lsll_256bit_shift:
+; CHECK-MVE:       @ %bb.0: @ %entry
+; CHECK-MVE-NEXT:    movs r1, #0
+; CHECK-MVE-NEXT:    str r1, [r0, #16]
+; CHECK-MVE-NEXT:    strd r1, r1, [r0, #8]
+; CHECK-MVE-NEXT:    strd r1, r1, [r0]
+; CHECK-MVE-NEXT:    ldrb r1, [r0, #23]
+; CHECK-MVE-NEXT:    lsls r1, r1, #24
+; CHECK-MVE-NEXT:    str r1, [r0, #20]
+; CHECK-MVE-NEXT:    bx lr
+;
+; CHECK-NON-MVE-LABEL: lsll_256bit_shift:
+; CHECK-NON-MVE:       @ %bb.0: @ %entry
+; CHECK-NON-MVE-NEXT:    movs r1, #0
+; CHECK-NON-MVE-NEXT:    str r1, [r0, #16]
+; CHECK-NON-MVE-NEXT:    str r1, [r0, #8]
+; CHECK-NON-MVE-NEXT:    str r1, [r0, #12]
+; CHECK-NON-MVE-NEXT:    str r1, [r0]
+; CHECK-NON-MVE-NEXT:    str r1, [r0, #4]
+; CHECK-NON-MVE-NEXT:    ldrb r1, [r0, #23]
+; CHECK-NON-MVE-NEXT:    lsls r1, r1, #24
+; CHECK-NON-MVE-NEXT:    str r1, [r0, #20]
+; CHECK-NON-MVE-NEXT:    bx lr
+entry:
+  %0 = bitcast %struct.b* %x to i192*
+  %bf.load = load i192, i192* %0, align 8
+  %bf.clear4 = and i192 %bf.load, -24519928653854221733733552434404946937899825954937634816
+  store i192 %bf.clear4, i192* %0, align 8
+  ret void
+}
diff --git a/llvm/test/CodeGen/Thumb2/mve-abs.ll b/llvm/test/CodeGen/Thumb2/mve-abs.ll
index 6e2100e2f463..081157b07042 100644
--- a/llvm/test/CodeGen/Thumb2/mve-abs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-abs.ll
@@ -40,43 +40,39 @@ entry:
 define arm_aapcs_vfpcc <2 x i64> @abs_v2i64(<2 x i64> %s1) {
 ; CHECK-LABEL: abs_v2i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
 ; CHECK-NEXT:    vmov r12, s2
-; CHECK-NEXT:    movs r0, #0
-; CHECK-NEXT:    vmov r3, s3
-; CHECK-NEXT:    vmov r1, s0
-; CHECK-NEXT:    rsbs.w lr, r12, #0
-; CHECK-NEXT:    sbc.w r5, r0, r3
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    mov r2, lr
-; CHECK-NEXT:    lsrl r2, r5, #32
-; CHECK-NEXT:    mov.w r5, #0
+; CHECK-NEXT:    movs r2, #0
+; CHECK-NEXT:    vmov r0, s3
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    vmov r4, s0
+; CHECK-NEXT:    rsbs.w r3, r12, #0
+; CHECK-NEXT:    sbc.w lr, r2, r0
+; CHECK-NEXT:    cmp r0, #0
 ; CHECK-NEXT:    it mi
-; CHECK-NEXT:    movmi r5, #1
-; CHECK-NEXT:    cmp r5, #0
+; CHECK-NEXT:    movmi r1, #1
+; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    it eq
-; CHECK-NEXT:    moveq r2, r3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    rsbs r4, r1, #0
-; CHECK-NEXT:    mov r6, r4
-; CHECK-NEXT:    sbc.w r7, r0, r3
-; CHECK-NEXT:    cmp r3, #0
-; CHECK-NEXT:    lsrl r6, r7, #32
-; CHECK-NEXT:    it mi
-; CHECK-NEXT:    movmi r0, #1
+; CHECK-NEXT:    moveq lr, r0
+; CHECK-NEXT:    vmov r0, s1
+; CHECK-NEXT:    rsbs r5, r4, #0
+; CHECK-NEXT:    sbc.w r6, r2, r0
 ; CHECK-NEXT:    cmp r0, #0
-; CHECK-NEXT:    ite eq
-; CHECK-NEXT:    moveq r6, r3
-; CHECK-NEXT:    movne r1, r4
-; CHECK-NEXT:    vmov.32 q0[0], r1
-; CHECK-NEXT:    cmp r5, #0
+; CHECK-NEXT:    it mi
+; CHECK-NEXT:    movmi r2, #1
+; CHECK-NEXT:    cmp r2, #0
+; CHECK-NEXT:    itt eq
+; CHECK-NEXT:    moveq r6, r0
+; CHECK-NEXT:    moveq r5, r4
+; CHECK-NEXT:    vmov.32 q0[0], r5
 ; CHECK-NEXT:    vmov.32 q0[1], r6
+; CHECK-NEXT:    cmp r1, #0
 ; CHECK-NEXT:    it eq
-; CHECK-NEXT:    moveq lr, r12
-; CHECK-NEXT:    vmov.32 q0[2], lr
-; CHECK-NEXT:    vmov.32 q0[3], r2
-; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
+; CHECK-NEXT:    moveq r3, r12
+; CHECK-NEXT:    vmov.32 q0[2], r3
+; CHECK-NEXT:    vmov.32 q0[3], lr
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
 entry:
   %0 = icmp slt <2 x i64> %s1, zeroinitializer
   %1 = sub nsw <2 x i64> zeroinitializer, %s1
diff --git a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
index e0dddcd273c2..794b340ad723 100644
--- a/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-div-expand.ll
@@ -755,14 +755,12 @@ define arm_aapcs_vfpcc <2 x i64> @udiv_i64(<2 x i64> %in1, <2 x i64> %in2) {
 ; CHECK-NEXT:    vmov r2, s18
 ; CHECK-NEXT:    vmov r3, s19
 ; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    lsrl r0, r1, #32
-; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    vmov.32 q4[1], r1
 ; CHECK-NEXT:    mov r0, r12
 ; CHECK-NEXT:    mov r1, lr
 ; CHECK-NEXT:    bl __aeabi_uldivmod
 ; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    lsrl r0, r1, #32
-; CHECK-NEXT:    vmov.32 q4[3], r0
+; CHECK-NEXT:    vmov.32 q4[3], r1
 ; CHECK-NEXT:    vmov q0, q4
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    pop {r7, pc}
@@ -790,14 +788,12 @@ define arm_aapcs_vfpcc <2 x i64> @sdiv_i64(<2 x i64> %in1, <2 x i64> %in2) {
 ; CHECK-NEXT:    vmov r2, s18
 ; CHECK-NEXT:    vmov r3, s19
 ; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    lsrl r0, r1, #32
-; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    vmov.32 q4[1], r1
 ; CHECK-NEXT:    mov r0, r12
 ; CHECK-NEXT:    mov r1, lr
 ; CHECK-NEXT:    bl __aeabi_ldivmod
 ; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    lsrl r0, r1, #32
-; CHECK-NEXT:    vmov.32 q4[3], r0
+; CHECK-NEXT:    vmov.32 q4[3], r1
 ; CHECK-NEXT:    vmov q0, q4
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11}
 ; CHECK-NEXT:    pop {r7, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-vaddv.ll b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
index 3ebc67f80fc7..94356e3921bd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vaddv.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
 
 declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>)
@@ -14,9 +15,6 @@ define arm_aapcs_vfpcc i64 @vaddv_v2i64_i64(<2 x i64> %s1) {
 ; CHECK-NEXT:    vmov r2, s1
 ; CHECK-NEXT:    adds r0, r0, r3
 ; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    mov r2, r0
-; CHECK-NEXT:    lsrl r2, r1, #32
-; CHECK-NEXT:    mov r1, r2
 ; CHECK-NEXT:    bx lr
 entry:
   %r = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %s1)
@@ -56,19 +54,17 @@ entry:
 define arm_aapcs_vfpcc i64 @vaddva_v2i64_i64(<2 x i64> %s1, i64 %x) {
 ; CHECK-LABEL: vaddva_v2i64_i64:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, lr}
-; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
 ; CHECK-NEXT:    vmov r2, s2
 ; CHECK-NEXT:    vmov r3, s0
 ; CHECK-NEXT:    vmov r12, s3
 ; CHECK-NEXT:    vmov lr, s1
-; CHECK-NEXT:    adds r4, r3, r2
+; CHECK-NEXT:    adds r2, r2, r3
 ; CHECK-NEXT:    adc.w r3, lr, r12
-; CHECK-NEXT:    mov r2, r4
-; CHECK-NEXT:    lsrl r2, r3, #32
-; CHECK-NEXT:    adds r0, r0, r4
-; CHECK-NEXT:    adcs r1, r2
-; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    pop {r7, pc}
 entry:
   %t = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %s1)
   %r = add i64 %t, %x
diff --git a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
index 524ec692c8c1..ff2c7927b099 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vcvt.ll
@@ -382,14 +382,12 @@ define arm_aapcs_vfpcc <2 x i64> @foo_int64_float(<2 x double> %src) {
 ; CHECK-NEXT:    bl __aeabi_d2lz
 ; CHECK-NEXT:    vmov r2, r3, d9
 ; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    lsrl r0, r1, #32
-; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    vmov.32 q4[1], r1
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    mov r1, r3
 ; CHECK-NEXT:    bl __aeabi_d2lz
 ; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    lsrl r0, r1, #32
-; CHECK-NEXT:    vmov.32 q4[3], r0
+; CHECK-NEXT:    vmov.32 q4[3], r1
 ; CHECK-NEXT:    vmov q0, q4
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
@@ -410,14 +408,12 @@ define arm_aapcs_vfpcc <2 x i64> @foo_uint64_float(<2 x double> %src) {
 ; CHECK-NEXT:    bl __aeabi_d2ulz
 ; CHECK-NEXT:    vmov r2, r3, d9
 ; CHECK-NEXT:    vmov.32 q4[0], r0
-; CHECK-NEXT:    lsrl r0, r1, #32
-; CHECK-NEXT:    vmov.32 q4[1], r0
+; CHECK-NEXT:    vmov.32 q4[1], r1
 ; CHECK-NEXT:    mov r0, r2
 ; CHECK-NEXT:    mov r1, r3
 ; CHECK-NEXT:    bl __aeabi_d2ulz
 ; CHECK-NEXT:    vmov.32 q4[2], r0
-; CHECK-NEXT:    lsrl r0, r1, #32
-; CHECK-NEXT:    vmov.32 q4[3], r0
+; CHECK-NEXT:    vmov.32 q4[3], r1
 ; CHECK-NEXT:    vmov q0, q4
 ; CHECK-NEXT:    vpop {d8, d9}
 ; CHECK-NEXT:    pop {r7, pc}
-- 
2.34.1
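
For readers who want the essence of the change without the surrounding
SelectionDAG context: the patch widens the early bail-out in
Expand64BitShift so the MVE LSLL/LSRL "shift parts" expansion is only
attempted when the shift amount can be narrowed to an i32, and everything
else falls back to the default lowering. The standalone C++ program below
is a minimal sketch of just that guard; the names
canUseShiftPartsExpansion, ShAmtBits, IsConst and ConstVal are
hypothetical stand-ins for illustration, not LLVM API.

// Standalone model of the guard this patch adds to Expand64BitShift
// (illustrative only; names are invented, not LLVM API).
#include <cassert>
#include <cstdint>
#include <iostream>

// ShAmtBits: bit width of the shift-amount value (e.g. 32, 64, 128, 256).
// IsConst/ConstVal: whether the amount is a compile-time constant, and its
// value if so. Returns true when the LSLL/LSRL expansion may be used;
// false means take the default optimisation instead.
static bool canUseShiftPartsExpansion(unsigned ShAmtBits, bool IsConst,
                                      uint64_t ConstVal) {
  // Reject amounts wider than 64 bits (the i128/i192 bitfield cases in the
  // new shift_parts.ll tests) and constant amounts of 32 or more, which the
  // default lowering already handles correctly.
  if (ShAmtBits > 64 || (IsConst && ConstVal >= 32))
    return false;
  return true;
}

int main() {
  assert(canUseShiftPartsExpansion(32, true, 3));    // small constant: OK
  assert(!canUseShiftPartsExpansion(32, true, 32));  // shift of 32: fall back
  assert(!canUseShiftPartsExpansion(128, false, 0)); // i128 amount: fall back
  assert(canUseShiftPartsExpansion(64, false, 0));   // i64 amount: narrowed to i32
  std::cout << "guard model matches the patch's conditions\n";
  return 0;
}

Under this reading, the new lsll_128bit_shift/lsll_256bit_shift tests pin
down the ShAmtBits > 64 arm of the check, while shift_left_imm_big2 and
shift_right_imm_big2 exercise the constant shift-of-32 boundary that
previously selected lsll/lsrl incorrectly.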