From 7a7e9109a2d64a1c09b5dbe958893329fc30467e Mon Sep 17 00:00:00 2001 From: Lei Huang Date: Tue, 20 Dec 2022 10:11:55 -0600 Subject: [PATCH] [PowerPC] Implement P10 Byte Reverse Instructions Generate brh, brw and brd instructions for byte-swap operations on P10 and generate a single instruction for a 32-bit swap followed by a 16-bit right shift. Reviewed By: stefanp Differential Revision: https://reviews.llvm.org/D140414 --- llvm/lib/Target/PowerPC/P10InstrResources.td | 3 + llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 12 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 19 +-- llvm/lib/Target/PowerPC/PPCInstrP10.td | 15 +++ llvm/test/CodeGen/PowerPC/p10-bswap.ll | 137 +++++++++++++++++++++ .../Disassembler/PowerPC/ppc64-encoding-ISA31.txt | 9 ++ llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s | 9 ++ 7 files changed, 196 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/PowerPC/p10-bswap.ll diff --git a/llvm/lib/Target/PowerPC/P10InstrResources.td b/llvm/lib/Target/PowerPC/P10InstrResources.td index 27dfc82..f7d07a0 100644 --- a/llvm/lib/Target/PowerPC/P10InstrResources.td +++ b/llvm/lib/Target/PowerPC/P10InstrResources.td @@ -1626,6 +1626,9 @@ def : InstRW<[P10W_PM_4C, P10W_DISP_ANY], // 4 Cycles Permute operations, 1 input operands def : InstRW<[P10W_PM_4C, P10W_DISP_ANY, P10PM_Read], (instrs + BRD, + BRH, BRH8, + BRW, BRW8, LVSL, LVSR, LXVKQ, diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index ed98922..5a7367b 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -3936,9 +3936,19 @@ bool PPCDAGToDAGISel::tryBitPermutation(SDNode *N) { switch (N->getOpcode()) { default: break; + case ISD::SRL: + // If we are on P10, we have a pattern for 32-bit (srl (bswap r), 16) that + // uses the BRH instruction. 
+ if (Subtarget->isISA3_1() && N->getValueType(0) == MVT::i32 && + N->getOperand(0).getOpcode() == ISD::BSWAP) { + auto &OpRight = N->getOperand(1); + ConstantSDNode *SRLConst = dyn_cast<ConstantSDNode>(OpRight); + if (SRLConst && SRLConst->getSExtValue() == 16) + return false; + } + LLVM_FALLTHROUGH; case ISD::ROTL: case ISD::SHL: - case ISD::SRL: case ISD::AND: case ISD::OR: { BitPermutationSelector BPS(CurDAG); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index cb49b0a..1936c1f 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -452,14 +452,19 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::FROUND, MVT::f32, Legal); } - // PowerPC does not have BSWAP, but we can use vector BSWAP instruction xxbrd - // to speed up scalar BSWAP64. + // Prior to P10, PowerPC does not have BSWAP, but we can use vector BSWAP + // instruction xxbrd to speed up scalar BSWAP64. + if (Subtarget.isISA3_1()) { + setOperationAction(ISD::BSWAP, MVT::i32, Legal); + setOperationAction(ISD::BSWAP, MVT::i64, Legal); + } else { + setOperationAction(ISD::BSWAP, MVT::i32, Expand); + setOperationAction( + ISD::BSWAP, MVT::i64, + (Subtarget.hasP9Vector() && Subtarget.isPPC64()) ? 
Custom : Expand); + } + // CTPOP or CTTZ were introduced in P8/P9 respectively - setOperationAction(ISD::BSWAP, MVT::i32 , Expand); - if (Subtarget.hasP9Vector() && Subtarget.isPPC64()) - setOperationAction(ISD::BSWAP, MVT::i64 , Custom); - else - setOperationAction(ISD::BSWAP, MVT::i64 , Expand); if (Subtarget.isISA3_0()) { setOperationAction(ISD::CTTZ , MVT::i32 , Legal); setOperationAction(ISD::CTTZ , MVT::i64 , Legal); diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td index 853d81d..6e4d303 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrP10.td +++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td @@ -1744,6 +1744,21 @@ let Predicates = [IsISA3_1] in { v2i64:$vB))]>; def XVTLSBB : XX2_BF3_XO5_XB6_XO9<60, 2, 475, (outs crrc:$BF), (ins vsrc:$XB), "xvtlsbb $BF, $XB", IIC_VecGeneral, []>; + def BRH : XForm_11<31, 219, (outs gprc:$RA), (ins gprc:$RS), + "brh $RA, $RS", IIC_IntRotate, + [(set i32:$RA, (srl (bswap i32:$RS), (i32 16)))]>; + def BRW : XForm_11<31, 155, (outs gprc:$RA), (ins gprc:$RS), + "brw $RA, $RS", IIC_IntRotate, + [(set i32:$RA, (bswap i32:$RS))]>; + let isCodeGenOnly = 1 in { + def BRH8 : XForm_11<31, 219, (outs g8rc:$RA), (ins g8rc:$RS), + "brh $RA, $RS", IIC_IntRotate, []>; + def BRW8 : XForm_11<31, 155, (outs g8rc:$RA), (ins g8rc:$RS), + "brw $RA, $RS", IIC_IntRotate, []>; + } + def BRD : XForm_11<31, 187, (outs g8rc:$RA), (ins g8rc:$RS), + "brd $RA, $RS", IIC_IntRotate, + [(set i64:$RA, (bswap i64:$RS))]>; // The XFormMemOp flag for the following 8 instructions is set on // the instruction format. 
diff --git a/llvm/test/CodeGen/PowerPC/p10-bswap.ll b/llvm/test/CodeGen/PowerPC/p10-bswap.ll new file mode 100644 index 0000000..1b89e0c --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/p10-bswap.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names < %s | FileCheck %s + +; Check that the brh/brw/brd instructions are generated for the bswap +; intrinsic for register operand on P10 and that the lhbrx/lwbrx/ldbrw +; instructions are generated for memory operand. + +declare i16 @llvm.bswap.i16(i16) + +define zeroext i16 @test_nomem16(i16 zeroext %a) { +; CHECK-LABEL: test_nomem16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: brh r3, r3 +; CHECK-NEXT: clrldi r3, r3, 32 +; CHECK-NEXT: blr +entry: + %0 = tail call i16 @llvm.bswap.i16(i16 %a) + ret i16 %0 +} + +declare i32 @llvm.bswap.i32(i32) + +define zeroext i32 @test_nomem32(i32 zeroext %a) { +; CHECK-LABEL: test_nomem32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: brw r3, r3 +; CHECK-NEXT: clrldi r3, r3, 32 +; CHECK-NEXT: blr +entry: + %0 = tail call i32 @llvm.bswap.i32(i32 %a) + ret i32 %0 +} + +; Check that brh and clrldi are produced from a call to @llvm.bswap.i32 +; followed by a right shift of 16 (and a zero-extend at the end of the DAG). +define zeroext i32 @test_bswap_shift16(i32 zeroext %a) { +; CHECK-LABEL: test_bswap_shift16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: brh r3, r3 +; CHECK-NEXT: clrldi r3, r3, 32 +; CHECK-NEXT: blr +entry: + %0 = tail call i32 @llvm.bswap.i32(i32 %a) + %shr = lshr i32 %0, 16 + ret i32 %shr +} + +; Check that brh are produced from a call to @llvm.bswap.i32 +; followed by a right shift of 16. 
+declare i64 @call_1() +define void @test_bswap_shift16_2() { +; CHECK-LABEL: test_bswap_shift16_2: +; CHECK: # %bb.0: # %bb +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: std r0, 16(r1) +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: bl call_1@notoc +; CHECK-NEXT: brh r3, r3 +; CHECK-NEXT: sth r3, 0(r3) +bb: + switch i32 undef, label %bb1 [ + i32 78, label %bb2 + ] + +bb1: + unreachable + +bb2: + %i = call i64 @call_1() + %i3 = trunc i64 %i to i32 + %i4 = call i32 @llvm.bswap.i32(i32 %i3) + %i5 = lshr i32 %i4, 16 + %i6 = trunc i32 %i5 to i16 + store i16 %i6, ptr undef, align 2 + unreachable +} + +define zeroext i32 @test_bswap_shift18(i32 zeroext %a) { +; CHECK-LABEL: test_bswap_shift18: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: brw r3, r3 +; CHECK-NEXT: rlwinm r3, r3, 14, 18, 31 +; CHECK-NEXT: blr +entry: + %0 = tail call i32 @llvm.bswap.i32(i32 %a) + %shr = lshr i32 %0, 18 + ret i32 %shr +} + +declare i64 @llvm.bswap.i64(i64) + +define i64 @test_nomem64(i64 %a) { +; CHECK-LABEL: test_nomem64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: brd r3, r3 +; CHECK-NEXT: blr +entry: + %0 = tail call i64 @llvm.bswap.i64(i64 %a) + ret i64 %0 +} + +define i16 @test_mem16(ptr %a) { +; CHECK-LABEL: test_mem16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lhbrx r3, 0, r3 +; CHECK-NEXT: blr +entry: + %0 = load i16, ptr %a, align 2 + %1 = tail call i16 @llvm.bswap.i16(i16 %0) + ret i16 %1 +} + +define i32 @test_mem32(ptr %a) { +; CHECK-LABEL: test_mem32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lwbrx r3, 0, r3 +; CHECK-NEXT: blr +entry: + %0 = load i32, ptr %a, align 4 + %1 = tail call i32 @llvm.bswap.i32(i32 %0) + ret i32 %1 +} + +define i64 @test_mem64(ptr %a) { +; CHECK-LABEL: test_mem64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: ldbrx r3, 0, r3 +; CHECK-NEXT: blr +entry: + %0 = load i64, ptr %a, align 8 + %1 = tail call i64 @llvm.bswap.i64(i64 %0) + ret i64 %1 +} + diff --git 
a/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt index 5a42c39..1bbc612 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc64-encoding-ISA31.txt @@ -423,6 +423,15 @@ # CHECK: xxpermx 6, 63, 21, 34, 2 0x05 0x00 0x00 0x02 0x88 0xdf 0xa8 0x8c +# CHECK: brh 1, 2 +0x7c 0x41 0x01 0xb6 + +# CHECK: brw 1, 2 +0x7c 0x41 0x01 0x36 + +# CHECK: brd 1, 2 +0x7c 0x41 0x01 0x76 + # CHECK: xxblendvb 6, 63, 21, 34 0x05 0x00 0x00 0x00 0x84 0xdf 0xa8 0x8c diff --git a/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s b/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s index d18c84d..efd9354 100644 --- a/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s +++ b/llvm/test/MC/PowerPC/ppc64-encoding-ISA31.s @@ -613,6 +613,15 @@ # CHECK-LE: xxpermx 6, 63, 21, 34, 2 # encoding: [0x02,0x00,0x00,0x05, # CHECK-LE-SAME: 0x8c,0xa8,0xdf,0x88] xxpermx 6, 63, 21, 34, 2 +# CHECK-BE: brh 1, 2 # encoding: [0x7c,0x41,0x01,0xb6] +# CHECK-LE: brh 1, 2 # encoding: [0xb6,0x01,0x41,0x7c] + brh 1, 2 +# CHECK-BE: brw 1, 2 # encoding: [0x7c,0x41,0x01,0x36] +# CHECK-LE: brw 1, 2 # encoding: [0x36,0x01,0x41,0x7c] + brw 1, 2 +# CHECK-BE: brd 1, 2 # encoding: [0x7c,0x41,0x01,0x76] +# CHECK-LE: brd 1, 2 # encoding: [0x76,0x01,0x41,0x7c] + brd 1, 2 # CHECK-BE: xxblendvb 6, 63, 21, 34 # encoding: [0x05,0x00,0x00,0x00, # CHECK-BE-SAME: 0x84,0xdf,0xa8,0x8c] # CHECK-LE: xxblendvb 6, 63, 21, 34 # encoding: [0x00,0x00,0x00,0x05, -- 2.7.4