From e0ef4ebe2f6ac3523ee25081b36c114c0f0ea695 Mon Sep 17 00:00:00 2001
From: Simon Tatham
Date: Mon, 7 Oct 2019 17:03:46 +0100
Subject: [PATCH] [ARM] Add IR intrinsics for MVE VLD[24] and VST[24].

The VST2 and VST4 instructions take two or four vector registers as
input, and store part of each register to memory in an interleaved
pattern. They come in variants indicating which part of each register
they store (VST20 and VST21; VST40 to VST43 inclusive); the intention
is that issuing each of those variants in turn has the combined effect
of storing the whole set of registers to a memory block of equal size.
The corresponding VLD2 and VLD4 instructions load from memory in the
same interleaved format: each one overwrites only part of its output
register set, and again, the idea is that if you use VLD4{0,1,2,3} or
VLD2{0,1} together, you end up having written to the whole of each
register.

I've implemented the loads and stores quite differently. The loads were
easiest to implement as a single intrinsic that expands to all four
VLD4x instructions or both VLD2x instructions, delivering the complete
set of output registers. (Implementing each individual load as a
separate instruction taking four input registers to partially overwrite
is possible in theory, but pointless, and when I tried it, I found it
would need extra work to get the register allocation not to be
horrible.) Since that intrinsic delivers multiple outputs, it has to be
instruction-selected in custom C++.

The store instructions, by contrast, are easier to model individually,
because they don't overwrite any register at all, so you can write a
DAG ISel pattern in Tablegen for each one.

Hence, my new intrinsic `int_arm_mve_vld4q` expands to four load
instructions, delivers four full output vectors, and is handled by C++
code, whereas `int_arm_mve_vst4q` expands to just one store
instruction, takes four input vectors and a constant indicating which
lanes to store, and is handled entirely in Tablegen. (And similarly for
vld2q/vst2q.) This is asymmetric, but it was the easiest way to handle
each one.
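
As a usage sketch (not part of the patch itself: the function name is
invented here, and the v4i32 mangling of vld2q is inferred from the
same scheme the new test file uses for the other element types), a
single vld2q call returns both de-interleaved vectors, while the two
vst2q stages are issued as separate calls sharing the same operands:

  ; Hypothetical example: load an interleaved pair of v4i32 vectors
  ; from %src and store them back out to %dst, issuing both stages.
  define arm_aapcs_vfpcc void @copy_deinterleaved_u32(i32* %src, i32* %dst) {
  entry:
    %pair = tail call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* %src)
    %v0 = extractvalue { <4 x i32>, <4 x i32> } %pair, 0
    %v1 = extractvalue { <4 x i32>, <4 x i32> } %pair, 1
    ; stage 0 then stage 1: together the two stores cover the whole block
    tail call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* %dst, <4 x i32> %v0, <4 x i32> %v1, i32 0)
    tail call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* %dst, <4 x i32> %v0, <4 x i32> %v1, i32 1)
    ret void
  }

  declare { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32*)
  declare void @llvm.arm.mve.vst2q.p0i32.v4i32(i32*, <4 x i32>, <4 x i32>, i32)

This should select to a vld20.32/vld21.32 pair followed by a
vst20.32/vst21.32 pair, all addressing the same base register.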

Reviewers: dmgreen, miyuki, ostannard

Subscribers: kristof.beyls, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D68700
---
 llvm/include/llvm/IR/IntrinsicsARM.td            |   7 ++
 llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp          |  74 +++++++++++++++
 llvm/lib/Target/ARM/ARMInstrMVE.td               |  23 +++++
 llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll | 109 +++++++++++++++++++++++
 4 files changed, 213 insertions(+)
 create mode 100644 llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
index 885a231..db5111e 100644
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -836,4 +836,11 @@ def int_arm_mve_vadc_predicated: Intrinsic<
    [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
     llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
 
+def int_arm_mve_vld2q: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_anyptr_ty], [IntrReadMem]>;
+def int_arm_mve_vld4q: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_anyptr_ty], [IntrReadMem]>;
+
+def int_arm_mve_vst2q: Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty], [IntrWriteMem]>;
+def int_arm_mve_vst4q: Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, LLVMMatchType<1>, llvm_i32_ty], [IntrWriteMem]
+>;
+
 } // end TargetPrefix

diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 6fe5e59..59acc34 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -232,6 +232,14 @@ private:
   void SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry,
                          uint16_t OpcodeWithNoCarry, bool Add, bool Predicated);
 
+  /// SelectMVE_VLD - Select MVE interleaving load intrinsics. NumVecs
+  /// should be 2 or 4. The opcode array specifies the instructions
+  /// used for 8, 16 and 32-bit lane sizes respectively, and each
+  /// pointer points to a set of NumVecs sub-opcodes used for the
+  /// different stages (e.g. VLD20 versus VLD21) of each load family.
+  void SelectMVE_VLD(SDNode *N, unsigned NumVecs,
+                     const uint16_t *const *Opcodes);
+
   /// SelectVLDDup - Select NEON load-duplicate intrinsics.  NumVecs
   /// should be 1, 2, 3 or 4.  The opcode array specifies the instructions used
   /// for loading D registers.
@@ -2449,6 +2457,47 @@ void ARMDAGToDAGISel::SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry,
   CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
 }
 
+void ARMDAGToDAGISel::SelectMVE_VLD(SDNode *N, unsigned NumVecs,
+                                    const uint16_t *const *Opcodes) {
+  EVT VT = N->getValueType(0);
+  SDLoc Loc(N);
+
+  const uint16_t *OurOpcodes;
+  switch (VT.getVectorElementType().getSizeInBits()) {
+  case 8:
+    OurOpcodes = Opcodes[0];
+    break;
+  case 16:
+    OurOpcodes = Opcodes[1];
+    break;
+  case 32:
+    OurOpcodes = Opcodes[2];
+    break;
+  default:
+    llvm_unreachable("bad vector element size in SelectMVE_VLD");
+  }
+
+  EVT DataTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, NumVecs * 2);
+  EVT ResultTys[] = {DataTy, MVT::Other};
+
+  auto Data = SDValue(
+      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, Loc, DataTy), 0);
+  SDValue Chain = N->getOperand(0);
+  for (unsigned Stage = 0; Stage < NumVecs; ++Stage) {
+    SDValue Ops[] = {Data, N->getOperand(2), Chain};
+    auto LoadInst =
+        CurDAG->getMachineNode(OurOpcodes[Stage], Loc, ResultTys, Ops);
+    Data = SDValue(LoadInst, 0);
+    Chain = SDValue(LoadInst, 1);
+  }
+
+  for (unsigned i = 0; i < NumVecs; i++)
+    ReplaceUses(SDValue(N, i),
+                CurDAG->getTargetExtractSubreg(ARM::qsub_0 + i, Loc, VT, Data));
+  ReplaceUses(SDValue(N, NumVecs), Chain);
+  CurDAG->RemoveDeadNode(N);
+}
+
 void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
                                    bool isUpdating, unsigned NumVecs,
                                    const uint16_t *DOpcodes,
@@ -4182,6 +4231,31 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
                IntNo == Intrinsic::arm_mve_vldr_gather_base_wb_predicated);
       return;
     }
+
+    case Intrinsic::arm_mve_vld2q: {
+      static const uint16_t Opcodes8[] = {ARM::MVE_VLD20_8, ARM::MVE_VLD21_8};
+      static const uint16_t Opcodes16[] = {ARM::MVE_VLD20_16,
+                                           ARM::MVE_VLD21_16};
+      static const uint16_t Opcodes32[] = {ARM::MVE_VLD20_32,
+                                           ARM::MVE_VLD21_32};
+      static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32};
+      SelectMVE_VLD(N, 2, Opcodes);
+      return;
+    }
+
+    case Intrinsic::arm_mve_vld4q: {
+      static const uint16_t Opcodes8[] = {ARM::MVE_VLD40_8, ARM::MVE_VLD41_8,
+                                          ARM::MVE_VLD42_8, ARM::MVE_VLD43_8};
+      static const uint16_t Opcodes16[] = {ARM::MVE_VLD40_16, ARM::MVE_VLD41_16,
+                                           ARM::MVE_VLD42_16,
+                                           ARM::MVE_VLD43_16};
+      static const uint16_t Opcodes32[] = {ARM::MVE_VLD40_32, ARM::MVE_VLD41_32,
+                                           ARM::MVE_VLD42_32,
+                                           ARM::MVE_VLD43_32};
+      static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32};
+      SelectMVE_VLD(N, 4, Opcodes);
+      return;
+    }
     }
     break;
   }

diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 7d49df3..e43d643 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -4352,6 +4352,29 @@ foreach wb = [MVE_vldst24_writeback<
                              "vst" # n.nvecs # stage # "." # s.lanesize>;
 }
 
+multiclass MVE_vst24_patterns<int lanesize, ValueType VT> {
+  foreach stage = [0,1] in
+    def : Pat<(int_arm_mve_vst2q i32:$addr,
+                  (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage)),
+              (!cast<Instruction>("MVE_VST2"#stage#"_"#lanesize)
+                  (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
+                  t2_addr_offset_none:$addr)>;
+
+  foreach stage = [0,1,2,3] in
+    def : Pat<(int_arm_mve_vst4q i32:$addr,
+                  (VT MQPR:$v0), (VT MQPR:$v1),
+                  (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage)),
+              (!cast<Instruction>("MVE_VST4"#stage#"_"#lanesize)
+                  (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
+                                        VT:$v2, qsub_2, VT:$v3, qsub_3),
+                  t2_addr_offset_none:$addr)>;
+}
+defm : MVE_vst24_patterns<8, v16i8>;
+defm : MVE_vst24_patterns<16, v8i16>;
+defm : MVE_vst24_patterns<32, v4i32>;
+defm : MVE_vst24_patterns<16, v8f16>;
+defm : MVE_vst24_patterns<32, v4f32>;
+
 // end of MVE interleaving load/store
 
 // start of MVE predicable load/store

diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll
new file mode 100644
index 0000000..a8036a3
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vld24.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+%struct.float16x8x2_t = type { [2 x <8 x half>] }
+%struct.uint8x16x4_t = type { [4 x <16 x i8>] }
+%struct.uint32x4x2_t = type { [2 x <4 x i32>] }
+%struct.int8x16x4_t = type { [4 x <16 x i8>] }
+
+define arm_aapcs_vfpcc %struct.float16x8x2_t @test_vld2q_f16(half* %addr) {
+; CHECK-LABEL: test_vld2q_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
+; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0f16(half* %addr)
+  %1 = extractvalue { <8 x half>, <8 x half> } %0, 0
+  %2 = insertvalue %struct.float16x8x2_t undef, <8 x half> %1, 0, 0
+  %3 = extractvalue { <8 x half>, <8 x half> } %0, 1
+  %4 = insertvalue %struct.float16x8x2_t %2, <8 x half> %3, 0, 1
+  ret %struct.float16x8x2_t %4
+}
+
+declare { <8 x half>, <8 x half> } @llvm.arm.mve.vld2q.v8f16.p0f16(half*)
+
+define arm_aapcs_vfpcc %struct.uint8x16x4_t @test_vld4q_u8(i8* %addr) {
+; CHECK-LABEL: test_vld4q_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vld40.8 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld41.8 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld42.8 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vld43.8 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.mve.vld4q.v16i8.p0i8(i8* %addr)
+  %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %0, 0
+  %2 = insertvalue %struct.uint8x16x4_t undef, <16 x i8> %1, 0, 0
+  %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %0, 1
+  %4 = insertvalue %struct.uint8x16x4_t %2, <16 x i8> %3, 0, 1
+  %5 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %0, 2
+  %6 = insertvalue %struct.uint8x16x4_t %4, <16 x i8> %5, 0, 2
+  %7 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %0, 3
+  %8 = insertvalue %struct.uint8x16x4_t %6, <16 x i8> %7, 0, 3
+  ret %struct.uint8x16x4_t %8
+}
+
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.mve.vld4q.v16i8.p0i8(i8*)
+
+define arm_aapcs_vfpcc void @test_vst2q_u32(i32* %addr, %struct.uint32x4x2_t %value.coerce) {
+; CHECK-LABEL: test_vst2q_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    vst20.32 {q0, q1}, [r0]
+; CHECK-NEXT:    vst21.32 {q0, q1}, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %value.coerce.fca.0.0.extract = extractvalue %struct.uint32x4x2_t %value.coerce, 0, 0
+  %value.coerce.fca.0.1.extract = extractvalue %struct.uint32x4x2_t %value.coerce, 0, 1
+  tail call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* %addr, <4 x i32> %value.coerce.fca.0.0.extract, <4 x i32> %value.coerce.fca.0.1.extract, i32 0)
+  tail call void @llvm.arm.mve.vst2q.p0i32.v4i32(i32* %addr, <4 x i32> %value.coerce.fca.0.0.extract, <4 x i32> %value.coerce.fca.0.1.extract, i32 1)
+  ret void
+}
+
+declare void @llvm.arm.mve.vst2q.p0i32.v4i32(i32*, <4 x i32>, <4 x i32>, i32)
+
+define arm_aapcs_vfpcc void @test_vst2q_f16(half* %addr, %struct.float16x8x2_t %value.coerce) {
+; CHECK-LABEL: test_vst2q_f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1
+; CHECK-NEXT:    vst20.16 {q0, q1}, [r0]
+; CHECK-NEXT:    vst21.16 {q0, q1}, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %value.coerce.fca.0.0.extract = extractvalue %struct.float16x8x2_t %value.coerce, 0, 0
+  %value.coerce.fca.0.1.extract = extractvalue %struct.float16x8x2_t %value.coerce, 0, 1
+  call void @llvm.arm.mve.vst2q.p0f16.v8f16(half* %addr, <8 x half> %value.coerce.fca.0.0.extract, <8 x half> %value.coerce.fca.0.1.extract, i32 0)
+  call void @llvm.arm.mve.vst2q.p0f16.v8f16(half* %addr, <8 x half> %value.coerce.fca.0.0.extract, <8 x half> %value.coerce.fca.0.1.extract, i32 1)
+  ret void
+}
+
+declare void @llvm.arm.mve.vst2q.p0f16.v8f16(half*, <8 x half>, <8 x half>, i32)
+
+define arm_aapcs_vfpcc void @test_vst4q_s8(i8* %addr, %struct.int8x16x4_t %value.coerce) {
+; CHECK-LABEL: test_vst4q_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    @ kill: def $q3 killed $q3 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    @ kill: def $q2 killed $q2 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    @ kill: def $q1 killed $q1 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    @ kill: def $q0 killed $q0 killed $q0_q1_q2_q3 def $q0_q1_q2_q3
+; CHECK-NEXT:    vst40.8 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vst41.8 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vst42.8 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    vst43.8 {q0, q1, q2, q3}, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %value.coerce.fca.0.0.extract = extractvalue %struct.int8x16x4_t %value.coerce, 0, 0
+  %value.coerce.fca.0.1.extract = extractvalue %struct.int8x16x4_t %value.coerce, 0, 1
+  %value.coerce.fca.0.2.extract = extractvalue %struct.int8x16x4_t %value.coerce, 0, 2
+  %value.coerce.fca.0.3.extract = extractvalue %struct.int8x16x4_t %value.coerce, 0, 3
+  tail call void @llvm.arm.mve.vst4q.p0i8.v16i8(i8* %addr, <16 x i8> %value.coerce.fca.0.0.extract, <16 x i8> %value.coerce.fca.0.1.extract, <16 x i8> %value.coerce.fca.0.2.extract, <16 x i8> %value.coerce.fca.0.3.extract, i32 0)
+  tail call void @llvm.arm.mve.vst4q.p0i8.v16i8(i8* %addr, <16 x i8> %value.coerce.fca.0.0.extract, <16 x i8> %value.coerce.fca.0.1.extract, <16 x i8> %value.coerce.fca.0.2.extract, <16 x i8> %value.coerce.fca.0.3.extract, i32 1)
+  tail call void @llvm.arm.mve.vst4q.p0i8.v16i8(i8* %addr, <16 x i8> %value.coerce.fca.0.0.extract, <16 x i8> %value.coerce.fca.0.1.extract, <16 x i8> %value.coerce.fca.0.2.extract, <16 x i8> %value.coerce.fca.0.3.extract, i32 2)
+  tail call void @llvm.arm.mve.vst4q.p0i8.v16i8(i8* %addr, <16 x i8> %value.coerce.fca.0.0.extract, <16 x i8> %value.coerce.fca.0.1.extract, <16 x i8> %value.coerce.fca.0.2.extract, <16 x i8> %value.coerce.fca.0.3.extract, i32 3)
+  ret void
+}
+
+declare void @llvm.arm.mve.vst4q.p0i8.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)
-- 
2.7.4