From ad8e75caa2ebface54c92d7e4d7dc21c3166b6c6 Mon Sep 17 00:00:00 2001
From: David Green
Date: Fri, 16 Jul 2021 23:11:42 +0100
Subject: [PATCH] [ARM] Fix for matching reductions that are both sext and zext.

Fix a silly mistake that was not making sure that _both_ operands were
the correct extend code.
---
 llvm/lib/Target/ARM/ARMISelLowering.cpp           |  4 +-
 llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll     | 16 ++++++-
 llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll | 58 ++++++++++++++++++++++-
 3 files changed, 73 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index eeb8885..261be2c 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -16063,7 +16063,7 @@ static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
       return false;
     SDValue ExtA = Mul->getOperand(0);
     SDValue ExtB = Mul->getOperand(1);
-    if (ExtA->getOpcode() != ExtendCode && ExtB->getOpcode() != ExtendCode)
+    if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
       return false;
     A = ExtA->getOperand(0);
     B = ExtB->getOperand(0);
@@ -16097,7 +16097,7 @@ static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
       return false;
     SDValue ExtA = Mul->getOperand(0);
     SDValue ExtB = Mul->getOperand(1);
-    if (ExtA->getOpcode() != ExtendCode && ExtB->getOpcode() != ExtendCode)
+    if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
       return false;
     A = ExtA->getOperand(0);
     B = ExtB->getOperand(0);
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
index b02c8b4..ba5cf08 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
@@ -695,8 +695,22 @@ entry:
 define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_szext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: add_v16i8_v16i16_szext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vmlav.s8 r0, q0, q1
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
+; CHECK-NEXT:    add r0, sp, #16
+; CHECK-NEXT:    mov r1, sp
+; CHECK-NEXT:    vstrw.32 q1, [r0]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrb.u16 q0, [r0, #8]
+; CHECK-NEXT:    vldrb.s16 q1, [r1, #8]
+; CHECK-NEXT:    vldrb.s16 q2, [r1]
+; CHECK-NEXT:    vmul.i16 q0, q1, q0
+; CHECK-NEXT:    vldrb.u16 q1, [r0]
+; CHECK-NEXT:    vmul.i16 q1, q2, q1
+; CHECK-NEXT:    vadd.i16 q0, q1, q0
+; CHECK-NEXT:    vaddv.u16 r0, q0
 ; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    bx lr
 entry:
   %xx = sext <16 x i8> %x to <16 x i16>
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
index ec29c4d..dc42ecd 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
@@ -1112,9 +1112,63 @@ entry:
 define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_szext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
 ; CHECK-LABEL: add_v16i8_v16i16_szext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vpt.i8 eq, q2, zr
-; CHECK-NEXT:    vmlavt.s8 r0, q0, q1
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
+; CHECK-NEXT:    add r0, sp, #16
+; CHECK-NEXT:    mov r1, sp
+; CHECK-NEXT:    vstrw.32 q1, [r0]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vcmp.i8 eq, q2, zr
+; CHECK-NEXT:    vmov.i8 q0, #0x0
+; CHECK-NEXT:    vmov.i8 q1, #0xff
+; CHECK-NEXT:    vldrb.u16 q2, [r0]
+; CHECK-NEXT:    vpsel q0, q1, q0
+; CHECK-NEXT:    vldrb.s16 q3, [r1]
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    vmov.16 q1[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[1]
+; CHECK-NEXT:    vmov.16 q1[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[2]
+; CHECK-NEXT:    vmov.16 q1[2], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[3]
+; CHECK-NEXT:    vmov.16 q1[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[4]
+; CHECK-NEXT:    vmov.16 q1[4], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[5]
+; CHECK-NEXT:    vmov.16 q1[5], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[6]
+; CHECK-NEXT:    vmov.16 q1[6], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[7]
+; CHECK-NEXT:    vmov.16 q1[7], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[8]
+; CHECK-NEXT:    vcmp.i16 ne, q1, zr
+; CHECK-NEXT:    vmov.i32 q1, #0x0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vmult.i16 q1, q3, q2
+; CHECK-NEXT:    vmov.16 q2[0], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[9]
+; CHECK-NEXT:    vmov.16 q2[1], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[10]
+; CHECK-NEXT:    vmov.16 q2[2], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[11]
+; CHECK-NEXT:    vmov.16 q2[3], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[12]
+; CHECK-NEXT:    vmov.16 q2[4], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[13]
+; CHECK-NEXT:    vmov.16 q2[5], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[14]
+; CHECK-NEXT:    vmov.16 q2[6], r2
+; CHECK-NEXT:    vmov.u8 r2, q0[15]
+; CHECK-NEXT:    vmov.16 q2[7], r2
+; CHECK-NEXT:    vldrb.u16 q0, [r0, #8]
+; CHECK-NEXT:    vcmp.i16 ne, q2, zr
+; CHECK-NEXT:    vldrb.s16 q2, [r1, #8]
+; CHECK-NEXT:    vmul.i16 q0, q2, q0
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vaddt.i16 q1, q1, q0
+; CHECK-NEXT:    vaddv.u16 r0, q1
 ; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    bx lr
 entry:
   %c = icmp eq <16 x i8> %b, zeroinitializer
--
2.7.4
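
The effect of the one-character change can be illustrated with a small
standalone C++ sketch. This is not the LLVM code itself; the enum values and
function names below are illustrative placeholders chosen for this example.
With "&&", a multiply whose two operands use different extend opcodes still
slips past the check, while the corrected "||" only accepts the node when
both operands carry the expected extend:

  #include <cassert>

  enum Opcode { SIGN_EXTEND, ZERO_EXTEND };

  // Buggy form: only bails out when *both* operands mismatch, so a mul with
  // one sext operand and one zext operand is wrongly matched as an all-sext
  // (or all-zext) reduction pattern.
  static bool acceptBuggy(Opcode A, Opcode B, Opcode Expected) {
    if (A != Expected && B != Expected)
      return false;
    return true;
  }

  // Fixed form: bails out as soon as *either* operand is not the expected
  // extend, mirroring the "&&" -> "||" change in ARMISelLowering.cpp above.
  static bool acceptFixed(Opcode A, Opcode B, Opcode Expected) {
    if (A != Expected || B != Expected)
      return false;
    return true;
  }

  int main() {
    // Mixed sext/zext operands: the buggy check accepts, the fix rejects.
    assert(acceptBuggy(SIGN_EXTEND, ZERO_EXTEND, SIGN_EXTEND));
    assert(!acceptFixed(SIGN_EXTEND, ZERO_EXTEND, SIGN_EXTEND));
    return 0;
  }

This is why the updated add_v16i8_v16i16_szext tests above now expect a
longer, correct expansion for the mixed sext/zext reduction instead of a
single vmlav.s8 / vmlavt.s8.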