From ac5a5a9cfe7c83ee5fbbc48118b4239e7e6cf6c9 Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Fri, 11 Feb 2022 12:45:07 -0600 Subject: [PATCH] [PowerPC] Add default handling for single element vectors, and split/promote vNi1 vectors. This patch updates the handling of vectors in getPreferredVectorAction(): For single-element and scalable vectors, fall back to default vector legalization handling. For vNi1 vectors, add handling to either split or promote them in order to prevent the production of wide v256i1/v512i1 types. The following assertion is fixed by this patch, as we ended up producing the wide vector types (that are used for MMA) in the backend prior to this fix. ``` Assertion failed: VT.getSizeInBits() == Operand.getValueSizeInBits() && "Cannot BITCAST between types of different sizes!" ``` Differential Revision: https://reviews.llvm.org/D119521 --- llvm/lib/Target/PowerPC/PPCISelLowering.h | 15 +- .../PowerPC/p10-handle-split-promote-vec.ll | 212 +++++++++++++++++++++ 2 files changed, 225 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/PowerPC/p10-handle-split-promote-vec.ll diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index c3f31f7..4131fa4 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -765,8 +765,19 @@ namespace llvm { /// then the VPERM for the shuffle. All in all a very slow sequence. TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override { - if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && - VT.getScalarSizeInBits() % 8 == 0) + // Scalable and single-element vectors use the default legalization action. + if (VT.isScalableVector() || VT.getVectorNumElements() == 1) + return TargetLoweringBase::getPreferredVectorAction(VT); + + // Split vNi1 vectors wider than 16 bits and promote the rest, to avoid + // producing v256i1/v512i1 types, as those are only for MMA instructions. 
+ if (VT.getScalarSizeInBits() == 1 && VT.getSizeInBits() > 16) + return TypeSplitVector; + if (VT.getScalarSizeInBits() == 1) + return TypePromoteInteger; + + // Widen vectors whose element size is a whole number of bytes. + if (VT.getScalarSizeInBits() % 8 == 0) return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } diff --git a/llvm/test/CodeGen/PowerPC/p10-handle-split-promote-vec.ll b/llvm/test/CodeGen/PowerPC/p10-handle-split-promote-vec.ll new file mode 100644 index 0000000..ad0bd40 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/p10-handle-split-promote-vec.ll @@ -0,0 +1,212 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-ibm-aix -vec-extabi \ +; RUN: -mcpu=pwr10 < %s | FileCheck %s -check-prefix=CHECK-AIX + +define i32 @SplitPromoteVectorTest(i32 %Opc) align 2 { +; CHECK-LABEL: SplitPromoteVectorTest: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: plxv v3, .LCPI0_0@PCREL(0), 1 +; CHECK-NEXT: mtvsrws v2, r3 +; CHECK-NEXT: li r5, 4 +; CHECK-NEXT: li r8, 0 +; CHECK-NEXT: vcmpequw v3, v2, v3 +; CHECK-NEXT: vextubrx r6, r5, v3 +; CHECK-NEXT: vextubrx r4, r8, v3 +; CHECK-NEXT: rlwimi r4, r6, 1, 30, 30 +; CHECK-NEXT: li r6, 8 +; CHECK-NEXT: vextubrx r7, r6, v3 +; CHECK-NEXT: rlwimi r4, r7, 2, 29, 29 +; CHECK-NEXT: li r7, 12 +; CHECK-NEXT: vextubrx r9, r7, v3 +; CHECK-NEXT: plxv v3, .LCPI0_1@PCREL(0), 1 +; CHECK-NEXT: rlwimi r4, r9, 3, 28, 28 +; CHECK-NEXT: vcmpequw v3, v2, v3 +; CHECK-NEXT: vextubrx r9, r8, v3 +; CHECK-NEXT: rlwimi r4, r9, 4, 27, 27 +; CHECK-NEXT: vextubrx r9, r5, v3 +; CHECK-NEXT: rlwimi r4, r9, 5, 26, 26 +; CHECK-NEXT: vextubrx r9, r6, v3 +; CHECK-NEXT: rlwimi r4, r9, 6, 25, 25 +; CHECK-NEXT: vextubrx r9, r7, v3 +; CHECK-NEXT: plxv v3, .LCPI0_2@PCREL(0), 1 +; CHECK-NEXT: rlwimi r4, 
r9, 7, 24, 24 +; CHECK-NEXT: vcmpequw v3, v2, v3 +; CHECK-NEXT: vextubrx r9, r8, v3 +; CHECK-NEXT: rlwimi r4, r9, 8, 23, 23 +; CHECK-NEXT: vextubrx r9, r5, v3 +; CHECK-NEXT: rlwimi r4, r9, 9, 22, 22 +; CHECK-NEXT: vextubrx r9, r6, v3 +; CHECK-NEXT: rlwimi r4, r9, 10, 21, 21 +; CHECK-NEXT: vextubrx r9, r7, v3 +; CHECK-NEXT: plxv v3, .LCPI0_3@PCREL(0), 1 +; CHECK-NEXT: rlwimi r4, r9, 11, 20, 20 +; CHECK-NEXT: vcmpequw v3, v2, v3 +; CHECK-NEXT: vextubrx r9, r8, v3 +; CHECK-NEXT: rlwimi r4, r9, 12, 19, 19 +; CHECK-NEXT: vextubrx r9, r5, v3 +; CHECK-NEXT: rlwimi r4, r9, 13, 18, 18 +; CHECK-NEXT: vextubrx r9, r6, v3 +; CHECK-NEXT: rlwimi r4, r9, 14, 17, 17 +; CHECK-NEXT: vextubrx r9, r7, v3 +; CHECK-NEXT: plxv v3, .LCPI0_4@PCREL(0), 1 +; CHECK-NEXT: rlwimi r4, r9, 15, 0, 16 +; CHECK-NEXT: vcmpequw v3, v2, v3 +; CHECK-NEXT: vextubrx r10, r5, v3 +; CHECK-NEXT: vextubrx r9, r8, v3 +; CHECK-NEXT: rlwimi r9, r10, 1, 30, 30 +; CHECK-NEXT: vextubrx r10, r6, v3 +; CHECK-NEXT: rlwimi r9, r10, 2, 29, 29 +; CHECK-NEXT: vextubrx r10, r7, v3 +; CHECK-NEXT: plxv v3, .LCPI0_5@PCREL(0), 1 +; CHECK-NEXT: rlwimi r9, r10, 3, 28, 28 +; CHECK-NEXT: vcmpequw v3, v2, v3 +; CHECK-NEXT: vextubrx r10, r8, v3 +; CHECK-NEXT: rlwimi r9, r10, 4, 27, 27 +; CHECK-NEXT: vextubrx r10, r5, v3 +; CHECK-NEXT: rlwimi r9, r10, 5, 26, 26 +; CHECK-NEXT: vextubrx r10, r6, v3 +; CHECK-NEXT: rlwimi r9, r10, 6, 25, 25 +; CHECK-NEXT: vextubrx r10, r7, v3 +; CHECK-NEXT: plxv v3, .LCPI0_6@PCREL(0), 1 +; CHECK-NEXT: rlwimi r9, r10, 7, 24, 24 +; CHECK-NEXT: vcmpequw v3, v2, v3 +; CHECK-NEXT: vextubrx r10, r8, v3 +; CHECK-NEXT: rlwimi r9, r10, 8, 23, 23 +; CHECK-NEXT: vextubrx r10, r5, v3 +; CHECK-NEXT: rlwimi r9, r10, 9, 22, 22 +; CHECK-NEXT: vextubrx r10, r6, v3 +; CHECK-NEXT: rlwimi r9, r10, 10, 21, 21 +; CHECK-NEXT: vextubrx r10, r7, v3 +; CHECK-NEXT: plxv v3, .LCPI0_7@PCREL(0), 1 +; CHECK-NEXT: rlwimi r9, r10, 11, 20, 20 +; CHECK-NEXT: vcmpequw v2, v2, v3 +; CHECK-NEXT: vextubrx r8, r8, v2 +; CHECK-NEXT: vextubrx 
r5, r5, v2 +; CHECK-NEXT: rlwimi r9, r8, 12, 19, 19 +; CHECK-NEXT: rlwimi r9, r5, 13, 18, 18 +; CHECK-NEXT: vextubrx r5, r6, v2 +; CHECK-NEXT: rlwimi r9, r5, 14, 17, 17 +; CHECK-NEXT: vextubrx r5, r7, v2 +; CHECK-NEXT: rlwimi r9, r5, 15, 0, 16 +; CHECK-NEXT: or r4, r9, r4 +; CHECK-NEXT: andi. r4, r4, 65535 +; CHECK-NEXT: iseleq r3, 0, r3 +; CHECK-NEXT: blr +; +; CHECK-AIX-LABEL: SplitPromoteVectorTest: +; CHECK-AIX: # %bb.0: # %entry +; CHECK-AIX-NEXT: ld 4, L..C0(2) # %const.0 +; CHECK-AIX-NEXT: mtvsrws 34, 3 +; CHECK-AIX-NEXT: li 8, 15 +; CHECK-AIX-NEXT: li 5, 11 +; CHECK-AIX-NEXT: lxv 35, 0(4) +; CHECK-AIX-NEXT: vcmpequw 3, 2, 3 +; CHECK-AIX-NEXT: vextublx 4, 8, 3 +; CHECK-AIX-NEXT: vextublx 6, 5, 3 +; CHECK-AIX-NEXT: clrlwi 4, 4, 31 +; CHECK-AIX-NEXT: rlwimi 4, 6, 1, 30, 30 +; CHECK-AIX-NEXT: li 6, 7 +; CHECK-AIX-NEXT: vextublx 7, 6, 3 +; CHECK-AIX-NEXT: rlwimi 4, 7, 2, 29, 29 +; CHECK-AIX-NEXT: li 7, 3 +; CHECK-AIX-NEXT: vextublx 9, 7, 3 +; CHECK-AIX-NEXT: rlwimi 4, 9, 3, 28, 28 +; CHECK-AIX-NEXT: ld 9, L..C1(2) # %const.1 +; CHECK-AIX-NEXT: lxv 35, 0(9) +; CHECK-AIX-NEXT: vcmpequw 3, 2, 3 +; CHECK-AIX-NEXT: vextublx 9, 8, 3 +; CHECK-AIX-NEXT: rlwimi 4, 9, 4, 27, 27 +; CHECK-AIX-NEXT: vextublx 9, 5, 3 +; CHECK-AIX-NEXT: rlwimi 4, 9, 5, 26, 26 +; CHECK-AIX-NEXT: vextublx 9, 6, 3 +; CHECK-AIX-NEXT: rlwimi 4, 9, 6, 25, 25 +; CHECK-AIX-NEXT: vextublx 9, 7, 3 +; CHECK-AIX-NEXT: rlwimi 4, 9, 7, 24, 24 +; CHECK-AIX-NEXT: ld 9, L..C2(2) # %const.2 +; CHECK-AIX-NEXT: lxv 35, 0(9) +; CHECK-AIX-NEXT: vcmpequw 3, 2, 3 +; CHECK-AIX-NEXT: vextublx 9, 8, 3 +; CHECK-AIX-NEXT: rlwimi 4, 9, 8, 23, 23 +; CHECK-AIX-NEXT: vextublx 9, 5, 3 +; CHECK-AIX-NEXT: rlwimi 4, 9, 9, 22, 22 +; CHECK-AIX-NEXT: vextublx 9, 6, 3 +; CHECK-AIX-NEXT: rlwimi 4, 9, 10, 21, 21 +; CHECK-AIX-NEXT: vextublx 9, 7, 3 +; CHECK-AIX-NEXT: rlwimi 4, 9, 11, 20, 20 +; CHECK-AIX-NEXT: ld 9, L..C3(2) # %const.3 +; CHECK-AIX-NEXT: lxv 35, 0(9) +; CHECK-AIX-NEXT: vcmpequw 3, 2, 3 +; CHECK-AIX-NEXT: vextublx 9, 8, 3 
+; CHECK-AIX-NEXT: rlwimi 4, 9, 12, 19, 19 +; CHECK-AIX-NEXT: vextublx 9, 5, 3 +; CHECK-AIX-NEXT: rlwimi 4, 9, 13, 18, 18 +; CHECK-AIX-NEXT: vextublx 9, 6, 3 +; CHECK-AIX-NEXT: rlwimi 4, 9, 14, 17, 17 +; CHECK-AIX-NEXT: vextublx 9, 7, 3 +; CHECK-AIX-NEXT: rlwimi 4, 9, 15, 16, 16 +; CHECK-AIX-NEXT: ld 9, L..C4(2) # %const.4 +; CHECK-AIX-NEXT: lxv 35, 0(9) +; CHECK-AIX-NEXT: vcmpequw 3, 2, 3 +; CHECK-AIX-NEXT: vextublx 9, 8, 3 +; CHECK-AIX-NEXT: vextublx 10, 5, 3 +; CHECK-AIX-NEXT: clrlwi 9, 9, 31 +; CHECK-AIX-NEXT: rlwimi 9, 10, 1, 30, 30 +; CHECK-AIX-NEXT: vextublx 10, 6, 3 +; CHECK-AIX-NEXT: rlwimi 9, 10, 2, 29, 29 +; CHECK-AIX-NEXT: vextublx 10, 7, 3 +; CHECK-AIX-NEXT: rlwimi 9, 10, 3, 28, 28 +; CHECK-AIX-NEXT: ld 10, L..C5(2) # %const.5 +; CHECK-AIX-NEXT: lxv 35, 0(10) +; CHECK-AIX-NEXT: vcmpequw 3, 2, 3 +; CHECK-AIX-NEXT: vextublx 10, 8, 3 +; CHECK-AIX-NEXT: rlwimi 9, 10, 4, 27, 27 +; CHECK-AIX-NEXT: vextublx 10, 5, 3 +; CHECK-AIX-NEXT: rlwimi 9, 10, 5, 26, 26 +; CHECK-AIX-NEXT: vextublx 10, 6, 3 +; CHECK-AIX-NEXT: rlwimi 9, 10, 6, 25, 25 +; CHECK-AIX-NEXT: vextublx 10, 7, 3 +; CHECK-AIX-NEXT: rlwimi 9, 10, 7, 24, 24 +; CHECK-AIX-NEXT: ld 10, L..C6(2) # %const.6 +; CHECK-AIX-NEXT: lxv 35, 0(10) +; CHECK-AIX-NEXT: vcmpequw 3, 2, 3 +; CHECK-AIX-NEXT: vextublx 10, 8, 3 +; CHECK-AIX-NEXT: rlwimi 9, 10, 8, 23, 23 +; CHECK-AIX-NEXT: vextublx 10, 5, 3 +; CHECK-AIX-NEXT: rlwimi 9, 10, 9, 22, 22 +; CHECK-AIX-NEXT: vextublx 10, 6, 3 +; CHECK-AIX-NEXT: rlwimi 9, 10, 10, 21, 21 +; CHECK-AIX-NEXT: vextublx 10, 7, 3 +; CHECK-AIX-NEXT: rlwimi 9, 10, 11, 20, 20 +; CHECK-AIX-NEXT: ld 10, L..C7(2) # %const.7 +; CHECK-AIX-NEXT: lxv 35, 0(10) +; CHECK-AIX-NEXT: vcmpequw 2, 2, 3 +; CHECK-AIX-NEXT: vextublx 8, 8, 2 +; CHECK-AIX-NEXT: vextublx 5, 5, 2 +; CHECK-AIX-NEXT: rlwimi 9, 8, 12, 19, 19 +; CHECK-AIX-NEXT: rlwimi 9, 5, 13, 18, 18 +; CHECK-AIX-NEXT: vextublx 5, 6, 2 +; CHECK-AIX-NEXT: rlwimi 9, 5, 14, 17, 17 +; CHECK-AIX-NEXT: vextublx 5, 7, 2 +; CHECK-AIX-NEXT: rlwimi 9, 5, 15, 
16, 16 +; CHECK-AIX-NEXT: or 4, 9, 4 +; CHECK-AIX-NEXT: andi. 4, 4, 65535 +; CHECK-AIX-NEXT: iseleq 3, 0, 3 +; CHECK-AIX-NEXT: blr +entry: + %0 = insertelement <32 x i32> poison, i32 %Opc, i64 0 + %shuffle = shufflevector <32 x i32> %0, <32 x i32> poison, <32 x i32> zeroinitializer + %1 = icmp eq <32 x i32> %shuffle, + %2 = bitcast <32 x i1> %1 to i32 + %3 = icmp ne i32 %2, 0 + %op.rdx = or i1 %3, false + %op.rdx255 = or i1 %op.rdx, false + %4 = or i1 %op.rdx255, false + %5 = or i1 %4, false + %6 = or i1 %5, false + %7 = or i1 %6, false + %cond = select i1 %7, i32 %Opc, i32 0 + ret i32 %cond +} -- 2.7.4