From 1d26bbcf9bb3b9bf1d1efea9d0aeac667995db3f Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 11 Mar 2021 16:21:42 -0800
Subject: [PATCH] [RISCV] Return false from isShuffleMaskLegal except for splats.

We don't support any other shuffles currently.

This changes the bswap/bitreverse tests that check for this in their
expansion code. Previously we expanded a byte swapping shuffle through
memory. Now we're scalarizing and doing bit operations on scalars to
swap bytes.

In the future we can probably use vrgather.vx to do a byte swap
shuffle.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp        |    8 +
 llvm/lib/Target/RISCV/RISCVISelLowering.h          |    4 +
 .../CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll  | 3127 ++++++++++---------
 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll | 3154 +++++++++++++------
 4 files changed, 3810 insertions(+), 2483 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 025f5f6..5100047 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1068,6 +1068,14 @@ bool RISCVTargetLowering::shouldExpandBuildVectorWithShuffles(
   return false;
 }
 
+bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
+  // Only splats are currently supported.
+  if (ShuffleVectorSDNode::isSplatMask(M.data(), VT))
+    return true;
+
+  return false;
+}
+
 static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                                  const RISCVSubtarget &Subtarget) {
   MVT VT = Op.getSimpleValueType();
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 0d02aca..03a75c7 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -250,6 +250,10 @@ public:
   bool isFPImmLegal(const APFloat &Imm, EVT VT,
                     bool ForCodeSize) const override;
 
+  /// Return true if the given shuffle mask can be codegen'd directly, or if it
+  /// should be stack expanded.
+  bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
+
   bool hasBitPreservingFPLogic(EVT VT) const override;
 
   bool shouldExpandBuildVectorWithShuffles(EVT VT,
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
index 9198eba..c3ea321 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitreverse.ll
@@ -1,90 +1,157 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2-RV32
-; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2-RV64
-; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1
-; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV64
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV32
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV64
 
 define void @bitreverse_v8i16(<8 x i16>* %x, <8 x i16>* %y) {
-; CHECK-LABEL: bitreverse_v8i16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: vsetivli a1, 8, e16,m1,ta,mu
-; CHECK-NEXT: vle16.v v25, (a0)
-; CHECK-NEXT: vsetvli zero, zero, e8,m1,ta,mu
-; CHECK-NEXT: vmv.x.s a1, v25
-; CHECK-NEXT: sb a1, 1(sp)
-; CHECK-NEXT: vsetivli a1, 1, e8,m1,ta,mu
-; CHECK-NEXT: vslidedown.vi v26, v25, 14
-; CHECK-NEXT: vmv.x.s a1, v26
-; CHECK-NEXT: sb a1, 15(sp)
-; CHECK-NEXT: vslidedown.vi v26, v25, 15
-; CHECK-NEXT: vmv.x.s a1, v26
-; CHECK-NEXT: sb a1, 14(sp)
-; CHECK-NEXT: vslidedown.vi v26, v25, 12
-; CHECK-NEXT: vmv.x.s a1, v26
-; CHECK-NEXT: sb a1, 13(sp)
-; CHECK-NEXT: vslidedown.vi v26, v25, 13
-; CHECK-NEXT: vmv.x.s a1, v26
-; CHECK-NEXT: sb a1, 12(sp)
-; CHECK-NEXT: vslidedown.vi v26, v25, 10
-; CHECK-NEXT: vmv.x.s a1, v26
-; CHECK-NEXT: sb a1, 11(sp)
-; CHECK-NEXT: vslidedown.vi v26, v25, 11
-; CHECK-NEXT: vmv.x.s a1, v26
-; CHECK-NEXT: sb a1, 10(sp)
-; CHECK-NEXT: vslidedown.vi v26, v25, 8
-; CHECK-NEXT: vmv.x.s a1, v26
-; CHECK-NEXT: sb a1, 9(sp)
-; CHECK-NEXT: vslidedown.vi v26, v25, 9
-; CHECK-NEXT: vmv.x.s a1, v26
-; CHECK-NEXT: sb a1, 8(sp)
-; CHECK-NEXT: vslidedown.vi v26, v25, 6
-; 
CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 7(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 7 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 6(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 4 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 5(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 5 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 4(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 2 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 3(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 3 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 2(sp) -; CHECK-NEXT: vslidedown.vi v25, v25, 1 -; CHECK-NEXT: vmv.x.s a1, v25 -; CHECK-NEXT: sb a1, 0(sp) -; CHECK-NEXT: vsetivli a1, 16, e8,m1,ta,mu -; CHECK-NEXT: vle8.v v25, (sp) -; CHECK-NEXT: vand.vi v26, v25, 15 -; CHECK-NEXT: vsll.vi v26, v26, 4 -; CHECK-NEXT: vand.vi v25, v25, -16 -; CHECK-NEXT: vsrl.vi v25, v25, 4 -; CHECK-NEXT: vor.vv v25, v25, v26 -; CHECK-NEXT: addi a1, zero, 51 -; CHECK-NEXT: vand.vx v26, v25, a1 -; CHECK-NEXT: vsll.vi v26, v26, 2 -; CHECK-NEXT: addi a1, zero, 204 -; CHECK-NEXT: vand.vx v25, v25, a1 -; CHECK-NEXT: vsrl.vi v25, v25, 2 -; CHECK-NEXT: vor.vv v25, v25, v26 -; CHECK-NEXT: addi a1, zero, 85 -; CHECK-NEXT: vand.vx v26, v25, a1 -; CHECK-NEXT: vsll.vi v26, v26, 1 -; CHECK-NEXT: addi a1, zero, 170 -; CHECK-NEXT: vand.vx v25, v25, a1 -; CHECK-NEXT: vsrl.vi v25, v25, 1 -; CHECK-NEXT: vor.vv v25, v25, v26 -; CHECK-NEXT: vsetivli a1, 8, e16,m1,ta,mu -; CHECK-NEXT: vse16.v v25, (a0) -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret +; LMULMAX2-RV32-LABEL: bitreverse_v8i16: +; LMULMAX2-RV32: # %bb.0: +; LMULMAX2-RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 8 +; LMULMAX2-RV32-NEXT: vsll.vi v25, v25, 8 +; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: lui a1, 1 +; LMULMAX2-RV32-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32-NEXT: vand.vx v26, v25, a1 +; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 4 +; LMULMAX2-RV32-NEXT: lui a1, 15 +; LMULMAX2-RV32-NEXT: addi a1, a1, 240 +; LMULMAX2-RV32-NEXT: vand.vx v25, v25, a1 +; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 4 +; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: lui a1, 3 +; LMULMAX2-RV32-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32-NEXT: vand.vx v26, v25, a1 +; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 2 +; LMULMAX2-RV32-NEXT: lui a1, 13 +; LMULMAX2-RV32-NEXT: addi a1, a1, -820 +; LMULMAX2-RV32-NEXT: vand.vx v25, v25, a1 +; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 2 +; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: lui a1, 5 +; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32-NEXT: vand.vx v26, v25, a1 +; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 1 +; LMULMAX2-RV32-NEXT: lui a1, 11 +; LMULMAX2-RV32-NEXT: addi a1, a1, -1366 +; LMULMAX2-RV32-NEXT: vand.vx v25, v25, a1 +; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 1 +; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX2-RV32-NEXT: ret +; +; LMULMAX2-RV64-LABEL: bitreverse_v8i16: +; LMULMAX2-RV64: # %bb.0: +; LMULMAX2-RV64-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; LMULMAX2-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 8 +; LMULMAX2-RV64-NEXT: vsll.vi v25, v25, 8 +; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: lui a1, 1 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1 +; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 4 +; LMULMAX2-RV64-NEXT: lui a1, 15 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 240 +; LMULMAX2-RV64-NEXT: vand.vx 
v25, v25, a1 +; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 4 +; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: lui a1, 3 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1 +; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 2 +; LMULMAX2-RV64-NEXT: lui a1, 13 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -820 +; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 2 +; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: lui a1, 5 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1 +; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 1 +; LMULMAX2-RV64-NEXT: lui a1, 11 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -1366 +; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 1 +; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX2-RV64-NEXT: ret +; +; LMULMAX1-RV32-LABEL: bitreverse_v8i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 8 +; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 8 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: lui a1, 1 +; LMULMAX1-RV32-NEXT: addi a1, a1, -241 +; LMULMAX1-RV32-NEXT: vand.vx v26, v25, a1 +; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 4 +; LMULMAX1-RV32-NEXT: lui a1, 15 +; LMULMAX1-RV32-NEXT: addi a1, a1, 240 +; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 4 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: lui a1, 3 +; LMULMAX1-RV32-NEXT: addi a1, a1, 819 +; LMULMAX1-RV32-NEXT: vand.vx v26, v25, a1 +; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 2 +; LMULMAX1-RV32-NEXT: lui a1, 13 +; LMULMAX1-RV32-NEXT: addi a1, a1, -820 +; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: lui a1, 5 +; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 +; LMULMAX1-RV32-NEXT: vand.vx v26, v25, a1 +; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 1 +; LMULMAX1-RV32-NEXT: lui a1, 11 +; LMULMAX1-RV32-NEXT: addi a1, a1, -1366 +; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: bitreverse_v8i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 8 +; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 8 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: lui a1, 1 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -241 +; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1 +; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 4 +; LMULMAX1-RV64-NEXT: lui a1, 15 +; LMULMAX1-RV64-NEXT: addiw a1, a1, 240 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: lui a1, 3 +; LMULMAX1-RV64-NEXT: addiw a1, a1, 819 +; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1 +; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 2 +; LMULMAX1-RV64-NEXT: lui a1, 13 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -820 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: lui a1, 5 +; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1 +; LMULMAX1-RV64-NEXT: vsll.vi v26, 
v26, 1 +; LMULMAX1-RV64-NEXT: lui a1, 11 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -1366 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: ret %a = load <8 x i16>, <8 x i16>* %x %b = load <8 x i16>, <8 x i16>* %y %c = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) @@ -94,86 +161,201 @@ define void @bitreverse_v8i16(<8 x i16>* %x, <8 x i16>* %y) { declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) define void @bitreverse_v4i32(<4 x i32>* %x, <4 x i32>* %y) { -; CHECK-LABEL: bitreverse_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; CHECK-NEXT: vle32.v v25, (a0) -; CHECK-NEXT: vsetvli zero, zero, e8,m1,ta,mu -; CHECK-NEXT: vmv.x.s a1, v25 -; CHECK-NEXT: sb a1, 3(sp) -; CHECK-NEXT: vsetivli a1, 1, e8,m1,ta,mu -; CHECK-NEXT: vslidedown.vi v26, v25, 12 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 15(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 13 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 14(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 14 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 13(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 15 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 12(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 8 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 11(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 9 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 10(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 10 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 9(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 11 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 8(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 4 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 7(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 5 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 6(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 6 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 5(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 7 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 4(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 1 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 2(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 2 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 1(sp) -; CHECK-NEXT: vslidedown.vi v25, v25, 3 -; CHECK-NEXT: vmv.x.s a1, v25 -; CHECK-NEXT: sb a1, 0(sp) -; CHECK-NEXT: vsetivli a1, 16, e8,m1,ta,mu -; CHECK-NEXT: vle8.v v25, (sp) -; CHECK-NEXT: vand.vi v26, v25, 15 -; CHECK-NEXT: vsll.vi v26, v26, 4 -; CHECK-NEXT: vand.vi v25, v25, -16 -; CHECK-NEXT: vsrl.vi v25, v25, 4 -; CHECK-NEXT: vor.vv v25, v25, v26 -; CHECK-NEXT: addi a1, zero, 51 -; CHECK-NEXT: vand.vx v26, v25, a1 -; CHECK-NEXT: vsll.vi v26, v26, 2 -; CHECK-NEXT: addi a1, zero, 204 -; CHECK-NEXT: vand.vx v25, v25, a1 -; CHECK-NEXT: vsrl.vi v25, v25, 2 -; CHECK-NEXT: vor.vv v25, v25, v26 -; CHECK-NEXT: addi a1, zero, 85 -; CHECK-NEXT: vand.vx v26, v25, a1 -; CHECK-NEXT: vsll.vi v26, v26, 1 -; CHECK-NEXT: addi a1, zero, 170 -; CHECK-NEXT: vand.vx v25, v25, a1 -; CHECK-NEXT: vsrl.vi v25, v25, 1 -; CHECK-NEXT: vor.vv v25, v25, v26 -; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; CHECK-NEXT: vse32.v v25, (a0) -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret +; LMULMAX2-RV32-LABEL: bitreverse_v4i32: +; LMULMAX2-RV32: # %bb.0: +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX2-RV32-NEXT: vsrl.vi v26, v25, 8 +; 
LMULMAX2-RV32-NEXT: lui a1, 16 +; LMULMAX2-RV32-NEXT: addi a1, a1, -256 +; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 +; LMULMAX2-RV32-NEXT: vsrl.vi v27, v25, 24 +; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v27 +; LMULMAX2-RV32-NEXT: vsll.vi v27, v25, 8 +; LMULMAX2-RV32-NEXT: lui a1, 4080 +; LMULMAX2-RV32-NEXT: vand.vx v27, v27, a1 +; LMULMAX2-RV32-NEXT: vsll.vi v25, v25, 24 +; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v27 +; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: lui a1, 61681 +; LMULMAX2-RV32-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32-NEXT: vand.vx v26, v25, a1 +; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 4 +; LMULMAX2-RV32-NEXT: lui a1, 986895 +; LMULMAX2-RV32-NEXT: addi a1, a1, 240 +; LMULMAX2-RV32-NEXT: vand.vx v25, v25, a1 +; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 4 +; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: lui a1, 209715 +; LMULMAX2-RV32-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32-NEXT: vand.vx v26, v25, a1 +; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 2 +; LMULMAX2-RV32-NEXT: lui a1, 838861 +; LMULMAX2-RV32-NEXT: addi a1, a1, -820 +; LMULMAX2-RV32-NEXT: vand.vx v25, v25, a1 +; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 2 +; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: lui a1, 349525 +; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32-NEXT: vand.vx v26, v25, a1 +; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 1 +; LMULMAX2-RV32-NEXT: lui a1, 699051 +; LMULMAX2-RV32-NEXT: addi a1, a1, -1366 +; LMULMAX2-RV32-NEXT: vand.vx v25, v25, a1 +; LMULMAX2-RV32-NEXT: vsrl.vi v25, v25, 1 +; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX2-RV32-NEXT: ret +; +; LMULMAX2-RV64-LABEL: bitreverse_v4i32: +; LMULMAX2-RV64: # %bb.0: +; LMULMAX2-RV64-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX2-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX2-RV64-NEXT: vsrl.vi v26, v25, 8 +; LMULMAX2-RV64-NEXT: lui a1, 16 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -256 +; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 +; LMULMAX2-RV64-NEXT: vsrl.vi v27, v25, 24 +; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v27 +; LMULMAX2-RV64-NEXT: vsll.vi v27, v25, 8 +; LMULMAX2-RV64-NEXT: lui a1, 4080 +; LMULMAX2-RV64-NEXT: vand.vx v27, v27, a1 +; LMULMAX2-RV64-NEXT: vsll.vi v25, v25, 24 +; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: lui a1, 61681 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1 +; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 4 +; LMULMAX2-RV64-NEXT: lui a1, 241 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 240 +; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 4 +; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: lui a1, 209715 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1 +; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 2 +; LMULMAX2-RV64-NEXT: lui a1, 205 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -819 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -820 +; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 2 +; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: lui a1, 349525 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1 +; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 1 +; LMULMAX2-RV64-NEXT: lui a1, 171 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -1365 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 
-1366 +; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 1 +; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX2-RV64-NEXT: ret +; +; LMULMAX1-RV32-LABEL: bitreverse_v4i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 8 +; LMULMAX1-RV32-NEXT: lui a1, 16 +; LMULMAX1-RV32-NEXT: addi a1, a1, -256 +; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a1 +; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 24 +; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vsll.vi v27, v25, 8 +; LMULMAX1-RV32-NEXT: lui a1, 4080 +; LMULMAX1-RV32-NEXT: vand.vx v27, v27, a1 +; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 24 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: lui a1, 61681 +; LMULMAX1-RV32-NEXT: addi a1, a1, -241 +; LMULMAX1-RV32-NEXT: vand.vx v26, v25, a1 +; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 4 +; LMULMAX1-RV32-NEXT: lui a1, 986895 +; LMULMAX1-RV32-NEXT: addi a1, a1, 240 +; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 4 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: lui a1, 209715 +; LMULMAX1-RV32-NEXT: addi a1, a1, 819 +; LMULMAX1-RV32-NEXT: vand.vx v26, v25, a1 +; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 2 +; LMULMAX1-RV32-NEXT: lui a1, 838861 +; LMULMAX1-RV32-NEXT: addi a1, a1, -820 +; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: lui a1, 349525 +; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 +; LMULMAX1-RV32-NEXT: vand.vx v26, v25, a1 +; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 1 +; LMULMAX1-RV32-NEXT: lui a1, 699051 +; LMULMAX1-RV32-NEXT: addi a1, a1, -1366 +; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: bitreverse_v4i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 8 +; LMULMAX1-RV64-NEXT: lui a1, 16 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -256 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 24 +; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v25, 8 +; LMULMAX1-RV64-NEXT: lui a1, 4080 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a1 +; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 24 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: lui a1, 61681 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -241 +; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1 +; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 4 +; LMULMAX1-RV64-NEXT: lui a1, 241 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -241 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, 240 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: lui a1, 209715 +; LMULMAX1-RV64-NEXT: addiw a1, a1, 819 +; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1 +; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 2 +; LMULMAX1-RV64-NEXT: lui a1, 205 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -819 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -820 +; LMULMAX1-RV64-NEXT: vand.vx v25, 
v25, a1 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: lui a1, 349525 +; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1 +; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 1 +; LMULMAX1-RV64-NEXT: lui a1, 171 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -1366 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: ret %a = load <4 x i32>, <4 x i32>* %x %b = load <4 x i32>, <4 x i32>* %y %c = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) @@ -183,86 +365,483 @@ define void @bitreverse_v4i32(<4 x i32>* %x, <4 x i32>* %y) { declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) define void @bitreverse_v2i64(<2 x i64>* %x, <2 x i64>* %y) { -; CHECK-LABEL: bitreverse_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; CHECK-NEXT: vle64.v v25, (a0) -; CHECK-NEXT: vsetvli zero, zero, e8,m1,ta,mu -; CHECK-NEXT: vmv.x.s a1, v25 -; CHECK-NEXT: sb a1, 7(sp) -; CHECK-NEXT: vsetivli a1, 1, e8,m1,ta,mu -; CHECK-NEXT: vslidedown.vi v26, v25, 8 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 15(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 9 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 14(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 10 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 13(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 11 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 12(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 12 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 11(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 13 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 10(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 14 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 9(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 15 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 8(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 1 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 6(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 2 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 5(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 3 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 4(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 4 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 3(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 5 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 2(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 6 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 1(sp) -; CHECK-NEXT: vslidedown.vi v25, v25, 7 -; CHECK-NEXT: vmv.x.s a1, v25 -; CHECK-NEXT: sb a1, 0(sp) -; CHECK-NEXT: vsetivli a1, 16, e8,m1,ta,mu -; CHECK-NEXT: vle8.v v25, (sp) -; CHECK-NEXT: vand.vi v26, v25, 15 -; CHECK-NEXT: vsll.vi v26, v26, 4 -; CHECK-NEXT: vand.vi v25, v25, -16 -; CHECK-NEXT: vsrl.vi v25, v25, 4 -; CHECK-NEXT: vor.vv v25, v25, v26 -; CHECK-NEXT: addi a1, zero, 51 -; CHECK-NEXT: vand.vx v26, v25, a1 -; CHECK-NEXT: vsll.vi v26, v26, 2 -; CHECK-NEXT: addi a1, zero, 204 -; CHECK-NEXT: vand.vx v25, v25, a1 -; CHECK-NEXT: vsrl.vi v25, v25, 2 -; CHECK-NEXT: vor.vv v25, v25, v26 -; CHECK-NEXT: addi a1, zero, 85 -; CHECK-NEXT: vand.vx v26, v25, a1 -; CHECK-NEXT: vsll.vi v26, v26, 1 -; CHECK-NEXT: addi a1, zero, 170 -; CHECK-NEXT: vand.vx v25, v25, a1 -; CHECK-NEXT: vsrl.vi v25, v25, 1 -; CHECK-NEXT: vor.vv v25, v25, v26 -; 
CHECK-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; CHECK-NEXT: vse64.v v25, (a0) -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret +; LMULMAX2-RV32-LABEL: bitreverse_v2i64: +; LMULMAX2-RV32: # %bb.0: +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI2_0) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI2_0) +; LMULMAX2-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v26, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vsrl.vv v27, v25, v26 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI2_1) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI2_1) +; LMULMAX2-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vsrl.vv v29, v25, v28 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI2_2) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI2_2) +; LMULMAX2-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v30, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v29, v29, v30 +; LMULMAX2-RV32-NEXT: vor.vv v27, v29, v27 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI2_3) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI2_3) +; LMULMAX2-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v29, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vsrl.vv v30, v25, v29 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI2_4) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI2_4) +; LMULMAX2-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v31, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v30, v30, v31 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI2_5) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI2_5) +; LMULMAX2-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v31, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vsrl.vv v8, v25, v31 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI2_6) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI2_6) +; LMULMAX2-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v9, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v9 +; LMULMAX2-RV32-NEXT: vor.vv v30, v8, v30 +; LMULMAX2-RV32-NEXT: vor.vv v27, v30, v27 +; LMULMAX2-RV32-NEXT: vsll.vv v30, v25, v31 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI2_7) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI2_7) +; LMULMAX2-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v31, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v30, v30, v31 +; LMULMAX2-RV32-NEXT: vsll.vv v29, v25, v29 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI2_8) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI2_8) +; LMULMAX2-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v31, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v29, v29, v31 +; LMULMAX2-RV32-NEXT: vor.vv v29, v29, v30 +; LMULMAX2-RV32-NEXT: vsll.vv v26, v25, v26 +; LMULMAX2-RV32-NEXT: vsll.vv v25, v25, v28 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI2_9) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI2_9) +; LMULMAX2-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v25, v25, v28 +; LMULMAX2-RV32-NEXT: vor.vv v25, v26, v25 +; LMULMAX2-RV32-NEXT: 
vor.vv v25, v25, v29 +; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v27 +; LMULMAX2-RV32-NEXT: lui a1, 61681 +; LMULMAX2-RV32-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vmv.v.x v26, a1 +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v26, v25, v26 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI2_10) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI2_10) +; LMULMAX2-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v27, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vsll.vv v26, v26, v27 +; LMULMAX2-RV32-NEXT: lui a1, 986895 +; LMULMAX2-RV32-NEXT: addi a1, a1, 240 +; LMULMAX2-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vmv.v.x v28, a1 +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v25, v25, v28 +; LMULMAX2-RV32-NEXT: vsrl.vv v25, v25, v27 +; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: lui a1, 209715 +; LMULMAX2-RV32-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vmv.v.x v26, a1 +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v26, v25, v26 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI2_11) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI2_11) +; LMULMAX2-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v27, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vsll.vv v26, v26, v27 +; LMULMAX2-RV32-NEXT: lui a1, 838861 +; LMULMAX2-RV32-NEXT: addi a1, a1, -820 +; LMULMAX2-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vmv.v.x v28, a1 +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v25, v25, v28 +; LMULMAX2-RV32-NEXT: vsrl.vv v25, v25, v27 +; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: lui a1, 349525 +; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vmv.v.x v26, a1 +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v26, v25, v26 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI2_12) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI2_12) +; LMULMAX2-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v27, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vsll.vv v26, v26, v27 +; LMULMAX2-RV32-NEXT: lui a1, 699051 +; LMULMAX2-RV32-NEXT: addi a1, a1, -1366 +; LMULMAX2-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vmv.v.x v28, a1 +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v25, v25, v28 +; LMULMAX2-RV32-NEXT: vsrl.vv v25, v25, v27 +; LMULMAX2-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX2-RV32-NEXT: ret +; +; LMULMAX2-RV64-LABEL: bitreverse_v2i64: +; LMULMAX2-RV64: # %bb.0: +; LMULMAX2-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX2-RV64-NEXT: addi a1, zero, 56 +; LMULMAX2-RV64-NEXT: vsrl.vx v26, v25, a1 +; LMULMAX2-RV64-NEXT: addi a2, zero, 40 +; LMULMAX2-RV64-NEXT: vsrl.vx v27, v25, a2 +; LMULMAX2-RV64-NEXT: lui a3, 16 +; LMULMAX2-RV64-NEXT: addiw a3, a3, -256 +; LMULMAX2-RV64-NEXT: vand.vx v27, v27, a3 +; LMULMAX2-RV64-NEXT: vor.vv v26, v27, v26 +; LMULMAX2-RV64-NEXT: vsrl.vi v27, v25, 24 +; LMULMAX2-RV64-NEXT: lui a3, 4080 +; LMULMAX2-RV64-NEXT: vand.vx v27, v27, a3 +; LMULMAX2-RV64-NEXT: vsrl.vi v28, v25, 8 +; LMULMAX2-RV64-NEXT: addi 
a3, zero, 255 +; LMULMAX2-RV64-NEXT: slli a4, a3, 24 +; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a4 +; LMULMAX2-RV64-NEXT: vor.vv v27, v28, v27 +; LMULMAX2-RV64-NEXT: vor.vv v26, v27, v26 +; LMULMAX2-RV64-NEXT: vsll.vi v27, v25, 8 +; LMULMAX2-RV64-NEXT: slli a4, a3, 32 +; LMULMAX2-RV64-NEXT: vand.vx v27, v27, a4 +; LMULMAX2-RV64-NEXT: vsll.vi v28, v25, 24 +; LMULMAX2-RV64-NEXT: slli a4, a3, 40 +; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a4 +; LMULMAX2-RV64-NEXT: vor.vv v27, v28, v27 +; LMULMAX2-RV64-NEXT: vsll.vx v28, v25, a1 +; LMULMAX2-RV64-NEXT: vsll.vx v25, v25, a2 +; LMULMAX2-RV64-NEXT: slli a1, a3, 48 +; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX2-RV64-NEXT: vor.vv v25, v28, v25 +; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: lui a1, 3855 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 241 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -241 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 241 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -241 +; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1 +; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 4 +; LMULMAX2-RV64-NEXT: lui a1, 1044721 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 241 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -241 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 240 +; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 4 +; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: lui a1, 13107 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 819 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 819 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 819 +; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1 +; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 2 +; LMULMAX2-RV64-NEXT: lui a1, 1035469 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -819 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -819 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -819 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -820 +; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 2 +; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: lui a1, 21845 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV64-NEXT: vand.vx v26, v25, a1 +; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 1 +; LMULMAX2-RV64-NEXT: lui a1, 1026731 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -1365 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -1365 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -1365 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -1366 +; LMULMAX2-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX2-RV64-NEXT: vsrl.vi v25, v25, 1 +; LMULMAX2-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX2-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX2-RV64-NEXT: ret +; +; LMULMAX1-RV32-LABEL: bitreverse_v2i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; 
LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI2_0) +; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI2_0) +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsrl.vv v27, v25, v26 +; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI2_1) +; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI2_1) +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsrl.vv v29, v25, v28 +; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI2_2) +; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI2_2) +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v30, (a1) +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v29, v29, v30 +; LMULMAX1-RV32-NEXT: vor.vv v27, v29, v27 +; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI2_3) +; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI2_3) +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v29, (a1) +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsrl.vv v30, v25, v29 +; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI2_4) +; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI2_4) +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v31, (a1) +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v30, v30, v31 +; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI2_5) +; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI2_5) +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v31, (a1) +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsrl.vv v8, v25, v31 +; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI2_6) +; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI2_6) +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v9, (a1) +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v8, v8, v9 +; LMULMAX1-RV32-NEXT: vor.vv v30, v8, v30 +; LMULMAX1-RV32-NEXT: vor.vv v27, v30, v27 +; LMULMAX1-RV32-NEXT: vsll.vv v30, v25, v31 +; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI2_7) +; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI2_7) +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v31, (a1) +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v30, v30, v31 +; LMULMAX1-RV32-NEXT: vsll.vv v29, v25, v29 +; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI2_8) +; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI2_8) +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v31, (a1) +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v29, v29, v31 +; LMULMAX1-RV32-NEXT: vor.vv v29, v29, v30 +; LMULMAX1-RV32-NEXT: vsll.vv v26, v25, v26 +; LMULMAX1-RV32-NEXT: vsll.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI2_9) +; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI2_9) +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v29 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: lui a1, 61681 +; LMULMAX1-RV32-NEXT: addi a1, a1, -241 +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.v.x v26, a1 +; 
LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v26, v25, v26 +; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI2_10) +; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI2_10) +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v27, (a1) +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsll.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: lui a1, 986895 +; LMULMAX1-RV32-NEXT: addi a1, a1, 240 +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.v.x v28, a1 +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: lui a1, 209715 +; LMULMAX1-RV32-NEXT: addi a1, a1, 819 +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.v.x v26, a1 +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v26, v25, v26 +; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI2_11) +; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI2_11) +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v27, (a1) +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsll.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: lui a1, 838861 +; LMULMAX1-RV32-NEXT: addi a1, a1, -820 +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.v.x v28, a1 +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: lui a1, 349525 +; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.v.x v26, a1 +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v26, v25, v26 +; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI2_12) +; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI2_12) +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v27, (a1) +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsll.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: lui a1, 699051 +; LMULMAX1-RV32-NEXT: addi a1, a1, -1366 +; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.v.x v28, a1 +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: bitreverse_v2i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a1, zero, 56 +; LMULMAX1-RV64-NEXT: vsrl.vx v26, v25, a1 +; LMULMAX1-RV64-NEXT: addi a2, zero, 40 +; LMULMAX1-RV64-NEXT: vsrl.vx v27, v25, a2 +; LMULMAX1-RV64-NEXT: lui a3, 16 +; LMULMAX1-RV64-NEXT: addiw a3, a3, -256 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a3 +; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 24 +; LMULMAX1-RV64-NEXT: lui a3, 4080 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a3 +; LMULMAX1-RV64-NEXT: vsrl.vi v28, v25, 8 +; LMULMAX1-RV64-NEXT: addi a3, zero, 255 +; LMULMAX1-RV64-NEXT: slli a4, a3, 24 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a4 +; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27 +; LMULMAX1-RV64-NEXT: vor.vv v26, v27, v26 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v25, 8 +; 
LMULMAX1-RV64-NEXT: slli a4, a3, 32 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, a4 +; LMULMAX1-RV64-NEXT: vsll.vi v28, v25, 24 +; LMULMAX1-RV64-NEXT: slli a4, a3, 40 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a4 +; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27 +; LMULMAX1-RV64-NEXT: vsll.vx v28, v25, a1 +; LMULMAX1-RV64-NEXT: vsll.vx v25, v25, a2 +; LMULMAX1-RV64-NEXT: slli a1, a3, 48 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV64-NEXT: vor.vv v25, v28, v25 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: lui a1, 3855 +; LMULMAX1-RV64-NEXT: addiw a1, a1, 241 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -241 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, 241 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -241 +; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1 +; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 4 +; LMULMAX1-RV64-NEXT: lui a1, 1044721 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -241 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, 241 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -241 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, 240 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: lui a1, 13107 +; LMULMAX1-RV64-NEXT: addiw a1, a1, 819 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, 819 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, 819 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, 819 +; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1 +; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 2 +; LMULMAX1-RV64-NEXT: lui a1, 1035469 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -819 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -819 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -819 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -820 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: lui a1, 21845 +; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX1-RV64-NEXT: vand.vx v26, v25, a1 +; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 1 +; LMULMAX1-RV64-NEXT: lui a1, 1026731 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -1366 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = load <2 x i64>, <2 x i64>* %y %c = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) @@ -274,431 +853,193 @@ declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) define void @bitreverse_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; LMULMAX2-RV32-LABEL: bitreverse_v16i16: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi 
sp, sp, -64 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; LMULMAX2-RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill -; LMULMAX2-RV32-NEXT: .cfi_offset ra, -4 -; LMULMAX2-RV32-NEXT: .cfi_offset s0, -8 -; LMULMAX2-RV32-NEXT: addi s0, sp, 64 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa s0, 0 -; LMULMAX2-RV32-NEXT: andi sp, sp, -32 ; LMULMAX2-RV32-NEXT: vsetivli a1, 16, e16,m2,ta,mu ; LMULMAX2-RV32-NEXT: vle16.v v26, (a0) -; LMULMAX2-RV32-NEXT: vsetvli zero, zero, e8,m2,ta,mu -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: sb a1, 1(sp) -; LMULMAX2-RV32-NEXT: vsetivli a1, 1, e8,m2,ta,mu -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 30 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 31(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 31 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 30(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 28 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 29(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 29 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 28(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 26 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 27(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 27 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 26(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 24 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 25(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 25 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 24(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 22 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 23(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 23 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 22(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 20 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 21(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 21 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 20(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 18 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 19(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 19 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 18(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 16 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 17(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 17 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 16(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 14 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 15(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 15 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 14(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 12 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 13(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 13 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 12(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 10 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 11(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 11 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 10(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 8 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 9(sp) -; 
LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 9 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 8(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 6 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 7(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 7 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 6(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 4 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 5(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 5 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 4(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 2 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 3(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 2(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 1 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: sb a1, 0(sp) -; LMULMAX2-RV32-NEXT: addi a1, zero, 32 -; LMULMAX2-RV32-NEXT: vsetvli a1, a1, e8,m2,ta,mu -; LMULMAX2-RV32-NEXT: vle8.v v26, (sp) -; LMULMAX2-RV32-NEXT: vand.vi v28, v26, 15 +; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 8 +; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 8 +; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: lui a1, 1 +; LMULMAX2-RV32-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1 ; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 4 -; LMULMAX2-RV32-NEXT: vand.vi v26, v26, -16 +; LMULMAX2-RV32-NEXT: lui a1, 15 +; LMULMAX2-RV32-NEXT: addi a1, a1, 240 +; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 4 ; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 -; LMULMAX2-RV32-NEXT: addi a1, zero, 51 +; LMULMAX2-RV32-NEXT: lui a1, 3 +; LMULMAX2-RV32-NEXT: addi a1, a1, 819 ; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1 ; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 2 -; LMULMAX2-RV32-NEXT: addi a1, zero, 204 +; LMULMAX2-RV32-NEXT: lui a1, 13 +; LMULMAX2-RV32-NEXT: addi a1, a1, -820 ; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 2 ; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 -; LMULMAX2-RV32-NEXT: addi a1, zero, 85 +; LMULMAX2-RV32-NEXT: lui a1, 5 +; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 ; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1 ; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 1 -; LMULMAX2-RV32-NEXT: addi a1, zero, 170 +; LMULMAX2-RV32-NEXT: lui a1, 11 +; LMULMAX2-RV32-NEXT: addi a1, a1, -1366 ; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 1 ; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 -; LMULMAX2-RV32-NEXT: vsetivli a1, 16, e16,m2,ta,mu ; LMULMAX2-RV32-NEXT: vse16.v v26, (a0) -; LMULMAX2-RV32-NEXT: addi sp, s0, -64 -; LMULMAX2-RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: addi sp, sp, 64 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: bitreverse_v16i16: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -64 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; LMULMAX2-RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill -; LMULMAX2-RV64-NEXT: .cfi_offset ra, -8 -; LMULMAX2-RV64-NEXT: .cfi_offset s0, -16 -; LMULMAX2-RV64-NEXT: addi s0, sp, 64 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa s0, 0 -; LMULMAX2-RV64-NEXT: andi sp, sp, -32 ; LMULMAX2-RV64-NEXT: vsetivli a1, 16, e16,m2,ta,mu ; LMULMAX2-RV64-NEXT: vle16.v v26, (a0) -; LMULMAX2-RV64-NEXT: vsetvli zero, zero, e8,m2,ta,mu -; 
LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: sb a1, 1(sp) -; LMULMAX2-RV64-NEXT: vsetivli a1, 1, e8,m2,ta,mu -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 30 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 31(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 31 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 30(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 28 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 29(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 29 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 28(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 26 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 27(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 27 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 26(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 24 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 25(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 25 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 24(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 22 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 23(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 23 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 22(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 20 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 21(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 21 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 20(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 18 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 19(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 19 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 18(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 16 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 17(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 17 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 16(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 14 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 15(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 15 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 14(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 12 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 13(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 13 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 12(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 10 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 11(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 11 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 10(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 8 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 9(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 9 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 8(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 7(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 6(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 5(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi 
v28, v26, 5 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 4(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 3(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 2(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 1 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: sb a1, 0(sp) -; LMULMAX2-RV64-NEXT: addi a1, zero, 32 -; LMULMAX2-RV64-NEXT: vsetvli a1, a1, e8,m2,ta,mu -; LMULMAX2-RV64-NEXT: vle8.v v26, (sp) -; LMULMAX2-RV64-NEXT: vand.vi v28, v26, 15 +; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 8 +; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 8 +; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: lui a1, 1 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1 ; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 4 -; LMULMAX2-RV64-NEXT: vand.vi v26, v26, -16 +; LMULMAX2-RV64-NEXT: lui a1, 15 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 240 +; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 4 ; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 -; LMULMAX2-RV64-NEXT: addi a1, zero, 51 +; LMULMAX2-RV64-NEXT: lui a1, 3 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 ; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1 ; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 2 -; LMULMAX2-RV64-NEXT: addi a1, zero, 204 +; LMULMAX2-RV64-NEXT: lui a1, 13 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -820 ; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 2 ; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 -; LMULMAX2-RV64-NEXT: addi a1, zero, 85 +; LMULMAX2-RV64-NEXT: lui a1, 5 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 ; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1 ; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 1 -; LMULMAX2-RV64-NEXT: addi a1, zero, 170 +; LMULMAX2-RV64-NEXT: lui a1, 11 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -1366 ; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 1 ; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 -; LMULMAX2-RV64-NEXT: vsetivli a1, 16, e16,m2,ta,mu ; LMULMAX2-RV64-NEXT: vse16.v v26, (a0) -; LMULMAX2-RV64-NEXT: addi sp, s0, -64 -; LMULMAX2-RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: addi sp, sp, 64 ; LMULMAX2-RV64-NEXT: ret ; -; LMULMAX1-LABEL: bitreverse_v16i16: -; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi sp, sp, -32 -; LMULMAX1-NEXT: .cfi_def_cfa_offset 32 -; LMULMAX1-NEXT: vsetivli a1, 8, e16,m1,ta,mu -; LMULMAX1-NEXT: addi a1, a0, 16 -; LMULMAX1-NEXT: vle16.v v26, (a1) -; LMULMAX1-NEXT: vle16.v v25, (a0) -; LMULMAX1-NEXT: vsetvli zero, zero, e8,m1,ta,mu -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 1(sp) -; LMULMAX1-NEXT: vsetivli a2, 1, e8,m1,ta,mu -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 14 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 15(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 15 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 14(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 12 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 13(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 13 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 12(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 10 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 11(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 11 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 10(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 8 -; 
LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 9(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 9 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 8(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 6 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 7(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 7 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 6(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 4 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 5(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 5 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 4(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 2 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 3(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 3 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 2(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v26, 1 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 0(sp) -; LMULMAX1-NEXT: vmv.x.s a2, v25 -; LMULMAX1-NEXT: sb a2, 17(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 14 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 31(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 15 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 30(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 12 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 29(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 13 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 28(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 10 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 27(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 11 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 26(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 8 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 25(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 9 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 24(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 6 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 23(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 7 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 22(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 4 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 21(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 5 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 20(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 2 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 19(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 3 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 18(sp) -; LMULMAX1-NEXT: vslidedown.vi v25, v25, 1 -; LMULMAX1-NEXT: vmv.x.s a2, v25 -; LMULMAX1-NEXT: sb a2, 16(sp) -; LMULMAX1-NEXT: vsetivli a2, 16, e8,m1,ta,mu -; LMULMAX1-NEXT: vle8.v v25, (sp) -; LMULMAX1-NEXT: vand.vi v26, v25, 15 -; LMULMAX1-NEXT: vsll.vi v26, v26, 4 -; LMULMAX1-NEXT: vand.vi v25, v25, -16 -; LMULMAX1-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX1-NEXT: vor.vv v25, v25, v26 -; LMULMAX1-NEXT: addi a2, zero, 51 -; LMULMAX1-NEXT: vand.vx v26, v25, a2 -; LMULMAX1-NEXT: vsll.vi v26, v26, 2 -; LMULMAX1-NEXT: addi a3, zero, 204 -; LMULMAX1-NEXT: vand.vx v25, v25, a3 -; LMULMAX1-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX1-NEXT: vor.vv v25, v25, v26 -; LMULMAX1-NEXT: addi a4, zero, 85 -; LMULMAX1-NEXT: vand.vx v26, v25, a4 -; LMULMAX1-NEXT: vsll.vi v26, v26, 1 -; LMULMAX1-NEXT: addi a5, zero, 170 -; LMULMAX1-NEXT: addi a6, sp, 16 -; LMULMAX1-NEXT: vle8.v v27, (a6) -; LMULMAX1-NEXT: vand.vx v25, v25, a5 -; LMULMAX1-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX1-NEXT: vor.vv v25, v25, v26 -; LMULMAX1-NEXT: 
vand.vi v26, v27, 15 -; LMULMAX1-NEXT: vsll.vi v26, v26, 4 -; LMULMAX1-NEXT: vand.vi v27, v27, -16 -; LMULMAX1-NEXT: vsrl.vi v27, v27, 4 -; LMULMAX1-NEXT: vor.vv v26, v27, v26 -; LMULMAX1-NEXT: vand.vx v27, v26, a2 -; LMULMAX1-NEXT: vsll.vi v27, v27, 2 -; LMULMAX1-NEXT: vand.vx v26, v26, a3 -; LMULMAX1-NEXT: vsrl.vi v26, v26, 2 -; LMULMAX1-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-NEXT: vand.vx v27, v26, a4 -; LMULMAX1-NEXT: vsll.vi v27, v27, 1 -; LMULMAX1-NEXT: vand.vx v26, v26, a5 -; LMULMAX1-NEXT: vsrl.vi v26, v26, 1 -; LMULMAX1-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-NEXT: vsetivli a2, 8, e16,m1,ta,mu -; LMULMAX1-NEXT: vse16.v v26, (a0) -; LMULMAX1-NEXT: vse16.v v25, (a1) -; LMULMAX1-NEXT: addi sp, sp, 32 -; LMULMAX1-NEXT: ret +; LMULMAX1-RV32-LABEL: bitreverse_v16i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: addi a6, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v25, (a6) +; LMULMAX1-RV32-NEXT: vle16.v v26, (a0) +; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 8 +; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 8 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: lui a2, 1 +; LMULMAX1-RV32-NEXT: addi a7, a2, -241 +; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a7 +; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 4 +; LMULMAX1-RV32-NEXT: lui a3, 15 +; LMULMAX1-RV32-NEXT: addi a3, a3, 240 +; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a3 +; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 4 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: lui a4, 3 +; LMULMAX1-RV32-NEXT: addi a4, a4, 819 +; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a4 +; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 2 +; LMULMAX1-RV32-NEXT: lui a5, 13 +; LMULMAX1-RV32-NEXT: addi a5, a5, -820 +; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a5 +; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: lui a1, 5 +; LMULMAX1-RV32-NEXT: addi a1, a1, 1365 +; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a1 +; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 1 +; LMULMAX1-RV32-NEXT: lui a2, 11 +; LMULMAX1-RV32-NEXT: addi a2, a2, -1366 +; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a2 +; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 8 +; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 8 +; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a7 +; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 4 +; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a3 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 4 +; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a4 +; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 2 +; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a5 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 2 +; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a1 +; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 1 +; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a2 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 1 +; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vse16.v v26, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v25, (a6) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: bitreverse_v16i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: addi a6, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v25, (a6) +; LMULMAX1-RV64-NEXT: vle16.v v26, (a0) +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 8 +; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 8 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: lui a2, 1 +; LMULMAX1-RV64-NEXT: addiw a7, a2, -241 
+; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a7 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4 +; LMULMAX1-RV64-NEXT: lui a3, 15 +; LMULMAX1-RV64-NEXT: addiw a3, a3, 240 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a3 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: lui a4, 3 +; LMULMAX1-RV64-NEXT: addiw a4, a4, 819 +; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a4 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2 +; LMULMAX1-RV64-NEXT: lui a5, 13 +; LMULMAX1-RV64-NEXT: addiw a5, a5, -820 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a5 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: lui a1, 5 +; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a1 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 1 +; LMULMAX1-RV64-NEXT: lui a2, 11 +; LMULMAX1-RV64-NEXT: addiw a2, a2, -1366 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a2 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 8 +; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 8 +; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a7 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a3 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 4 +; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a4 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a5 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 2 +; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a1 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 1 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a2 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 1 +; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vse16.v v26, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v25, (a6) +; LMULMAX1-RV64-NEXT: ret %a = load <16 x i16>, <16 x i16>* %x %b = load <16 x i16>, <16 x i16>* %y %c = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) @@ -710,431 +1051,253 @@ declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) define void @bitreverse_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX2-RV32-LABEL: bitreverse_v8i32: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -64 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; LMULMAX2-RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill -; LMULMAX2-RV32-NEXT: .cfi_offset ra, -4 -; LMULMAX2-RV32-NEXT: .cfi_offset s0, -8 -; LMULMAX2-RV32-NEXT: addi s0, sp, 64 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa s0, 0 -; LMULMAX2-RV32-NEXT: andi sp, sp, -32 ; LMULMAX2-RV32-NEXT: vsetivli a1, 8, e32,m2,ta,mu ; LMULMAX2-RV32-NEXT: vle32.v v26, (a0) -; LMULMAX2-RV32-NEXT: vsetvli zero, zero, e8,m2,ta,mu -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: sb a1, 3(sp) -; LMULMAX2-RV32-NEXT: vsetivli a1, 1, e8,m2,ta,mu -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 28 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 31(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 29 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 30(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 30 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 29(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 31 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 28(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 24 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; 
LMULMAX2-RV32-NEXT: sb a1, 27(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 25 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 26(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 26 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 25(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 27 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 24(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 20 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 23(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 21 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 22(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 22 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 21(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 23 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 20(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 16 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 19(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 17 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 18(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 18 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 17(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 19 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 16(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 12 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 15(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 13 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 14(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 14 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 13(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 15 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 12(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 8 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 11(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 9 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 10(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 10 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 9(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 11 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 8(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 4 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 7(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 5 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 6(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 6 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 5(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 7 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 4(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 1 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 2(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 2 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 1(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 3 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: sb a1, 0(sp) -; LMULMAX2-RV32-NEXT: addi a1, zero, 32 -; LMULMAX2-RV32-NEXT: vsetvli a1, a1, e8,m2,ta,mu -; LMULMAX2-RV32-NEXT: vle8.v v26, (sp) -; LMULMAX2-RV32-NEXT: vand.vi v28, v26, 15 +; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 8 +; LMULMAX2-RV32-NEXT: lui a1, 16 +; 
LMULMAX2-RV32-NEXT: addi a1, a1, -256 +; LMULMAX2-RV32-NEXT: vand.vx v28, v28, a1 +; LMULMAX2-RV32-NEXT: vsrl.vi v30, v26, 24 +; LMULMAX2-RV32-NEXT: vor.vv v28, v28, v30 +; LMULMAX2-RV32-NEXT: vsll.vi v30, v26, 8 +; LMULMAX2-RV32-NEXT: lui a1, 4080 +; LMULMAX2-RV32-NEXT: vand.vx v30, v30, a1 +; LMULMAX2-RV32-NEXT: vsll.vi v26, v26, 24 +; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v30 +; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: lui a1, 61681 +; LMULMAX2-RV32-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1 ; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 4 -; LMULMAX2-RV32-NEXT: vand.vi v26, v26, -16 +; LMULMAX2-RV32-NEXT: lui a1, 986895 +; LMULMAX2-RV32-NEXT: addi a1, a1, 240 +; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 4 ; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 -; LMULMAX2-RV32-NEXT: addi a1, zero, 51 +; LMULMAX2-RV32-NEXT: lui a1, 209715 +; LMULMAX2-RV32-NEXT: addi a1, a1, 819 ; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1 ; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 2 -; LMULMAX2-RV32-NEXT: addi a1, zero, 204 +; LMULMAX2-RV32-NEXT: lui a1, 838861 +; LMULMAX2-RV32-NEXT: addi a1, a1, -820 ; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 2 ; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 -; LMULMAX2-RV32-NEXT: addi a1, zero, 85 +; LMULMAX2-RV32-NEXT: lui a1, 349525 +; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 ; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1 ; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 1 -; LMULMAX2-RV32-NEXT: addi a1, zero, 170 +; LMULMAX2-RV32-NEXT: lui a1, 699051 +; LMULMAX2-RV32-NEXT: addi a1, a1, -1366 ; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 1 ; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 -; LMULMAX2-RV32-NEXT: vsetivli a1, 8, e32,m2,ta,mu ; LMULMAX2-RV32-NEXT: vse32.v v26, (a0) -; LMULMAX2-RV32-NEXT: addi sp, s0, -64 -; LMULMAX2-RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: addi sp, sp, 64 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: bitreverse_v8i32: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -64 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; LMULMAX2-RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill -; LMULMAX2-RV64-NEXT: .cfi_offset ra, -8 -; LMULMAX2-RV64-NEXT: .cfi_offset s0, -16 -; LMULMAX2-RV64-NEXT: addi s0, sp, 64 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa s0, 0 -; LMULMAX2-RV64-NEXT: andi sp, sp, -32 ; LMULMAX2-RV64-NEXT: vsetivli a1, 8, e32,m2,ta,mu ; LMULMAX2-RV64-NEXT: vle32.v v26, (a0) -; LMULMAX2-RV64-NEXT: vsetvli zero, zero, e8,m2,ta,mu -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: sb a1, 3(sp) -; LMULMAX2-RV64-NEXT: vsetivli a1, 1, e8,m2,ta,mu -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 28 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 31(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 29 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 30(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 30 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 29(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 31 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 28(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 24 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 27(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 25 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; 
LMULMAX2-RV64-NEXT: sb a1, 26(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 26 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 25(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 27 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 24(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 20 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 23(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 21 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 22(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 22 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 21(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 23 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 20(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 16 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 19(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 17 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 18(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 18 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 17(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 19 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 16(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 12 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 15(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 13 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 14(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 14 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 13(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 15 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 12(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 8 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 11(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 9 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 10(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 10 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 9(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 11 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 8(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 7(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 6(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 5(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 4(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 1 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 2(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 1(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 3 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: sb a1, 0(sp) -; LMULMAX2-RV64-NEXT: addi a1, zero, 32 -; LMULMAX2-RV64-NEXT: vsetvli a1, a1, e8,m2,ta,mu -; LMULMAX2-RV64-NEXT: vle8.v v26, (sp) -; LMULMAX2-RV64-NEXT: vand.vi v28, v26, 15 +; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 8 +; LMULMAX2-RV64-NEXT: lui a1, 16 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -256 +; LMULMAX2-RV64-NEXT: vand.vx v28, v28, a1 +; LMULMAX2-RV64-NEXT: vsrl.vi v30, v26, 24 +; 
LMULMAX2-RV64-NEXT: vor.vv v28, v28, v30 +; LMULMAX2-RV64-NEXT: vsll.vi v30, v26, 8 +; LMULMAX2-RV64-NEXT: lui a1, 4080 +; LMULMAX2-RV64-NEXT: vand.vx v30, v30, a1 +; LMULMAX2-RV64-NEXT: vsll.vi v26, v26, 24 +; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v30 +; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: lui a1, 61681 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1 ; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 4 -; LMULMAX2-RV64-NEXT: vand.vi v26, v26, -16 +; LMULMAX2-RV64-NEXT: lui a1, 241 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 240 +; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 4 ; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 -; LMULMAX2-RV64-NEXT: addi a1, zero, 51 +; LMULMAX2-RV64-NEXT: lui a1, 209715 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 ; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1 ; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 2 -; LMULMAX2-RV64-NEXT: addi a1, zero, 204 +; LMULMAX2-RV64-NEXT: lui a1, 205 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -819 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -820 ; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 2 ; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 -; LMULMAX2-RV64-NEXT: addi a1, zero, 85 +; LMULMAX2-RV64-NEXT: lui a1, 349525 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 ; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1 ; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 1 -; LMULMAX2-RV64-NEXT: addi a1, zero, 170 +; LMULMAX2-RV64-NEXT: lui a1, 171 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -1365 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -1366 ; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 1 ; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 -; LMULMAX2-RV64-NEXT: vsetivli a1, 8, e32,m2,ta,mu ; LMULMAX2-RV64-NEXT: vse32.v v26, (a0) -; LMULMAX2-RV64-NEXT: addi sp, s0, -64 -; LMULMAX2-RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: addi sp, sp, 64 ; LMULMAX2-RV64-NEXT: ret ; -; LMULMAX1-LABEL: bitreverse_v8i32: -; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi sp, sp, -32 -; LMULMAX1-NEXT: .cfi_def_cfa_offset 32 -; LMULMAX1-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; LMULMAX1-NEXT: addi a1, a0, 16 -; LMULMAX1-NEXT: vle32.v v26, (a1) -; LMULMAX1-NEXT: vle32.v v25, (a0) -; LMULMAX1-NEXT: vsetvli zero, zero, e8,m1,ta,mu -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 3(sp) -; LMULMAX1-NEXT: vsetivli a2, 1, e8,m1,ta,mu -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 12 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 15(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 13 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 14(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 14 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 13(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 15 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 12(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 8 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 11(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 9 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 10(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 10 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 9(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 11 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 8(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 4 
-; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 7(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 5 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 6(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 6 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 5(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 7 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 4(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 1 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 2(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 2 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 1(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v26, 3 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 0(sp) -; LMULMAX1-NEXT: vmv.x.s a2, v25 -; LMULMAX1-NEXT: sb a2, 19(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 12 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 31(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 13 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 30(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 14 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 29(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 15 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 28(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 8 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 27(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 9 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 26(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 10 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 25(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 11 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 24(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 4 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 23(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 5 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 22(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 6 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 21(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 7 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 20(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 1 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 18(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 2 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 17(sp) -; LMULMAX1-NEXT: vslidedown.vi v25, v25, 3 -; LMULMAX1-NEXT: vmv.x.s a2, v25 -; LMULMAX1-NEXT: sb a2, 16(sp) -; LMULMAX1-NEXT: vsetivli a2, 16, e8,m1,ta,mu -; LMULMAX1-NEXT: vle8.v v25, (sp) -; LMULMAX1-NEXT: vand.vi v26, v25, 15 -; LMULMAX1-NEXT: vsll.vi v26, v26, 4 -; LMULMAX1-NEXT: vand.vi v25, v25, -16 -; LMULMAX1-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX1-NEXT: vor.vv v25, v25, v26 -; LMULMAX1-NEXT: addi a2, zero, 51 -; LMULMAX1-NEXT: vand.vx v26, v25, a2 -; LMULMAX1-NEXT: vsll.vi v26, v26, 2 -; LMULMAX1-NEXT: addi a3, zero, 204 -; LMULMAX1-NEXT: vand.vx v25, v25, a3 -; LMULMAX1-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX1-NEXT: vor.vv v25, v25, v26 -; LMULMAX1-NEXT: addi a4, zero, 85 -; LMULMAX1-NEXT: vand.vx v26, v25, a4 -; LMULMAX1-NEXT: vsll.vi v26, v26, 1 -; LMULMAX1-NEXT: addi a5, zero, 170 -; LMULMAX1-NEXT: addi a6, sp, 16 -; LMULMAX1-NEXT: vle8.v v27, (a6) -; LMULMAX1-NEXT: vand.vx v25, v25, a5 -; LMULMAX1-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX1-NEXT: vor.vv v25, v25, v26 -; LMULMAX1-NEXT: vand.vi v26, v27, 15 -; LMULMAX1-NEXT: vsll.vi v26, v26, 4 -; LMULMAX1-NEXT: vand.vi v27, v27, -16 -; LMULMAX1-NEXT: vsrl.vi v27, v27, 4 -; LMULMAX1-NEXT: vor.vv v26, v27, v26 -; LMULMAX1-NEXT: vand.vx v27, v26, a2 -; 
LMULMAX1-NEXT: vsll.vi v27, v27, 2 -; LMULMAX1-NEXT: vand.vx v26, v26, a3 -; LMULMAX1-NEXT: vsrl.vi v26, v26, 2 -; LMULMAX1-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-NEXT: vand.vx v27, v26, a4 -; LMULMAX1-NEXT: vsll.vi v27, v27, 1 -; LMULMAX1-NEXT: vand.vx v26, v26, a5 -; LMULMAX1-NEXT: vsrl.vi v26, v26, 1 -; LMULMAX1-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-NEXT: vse32.v v26, (a0) -; LMULMAX1-NEXT: vse32.v v25, (a1) -; LMULMAX1-NEXT: addi sp, sp, 32 -; LMULMAX1-NEXT: ret +; LMULMAX1-RV32-LABEL: bitreverse_v8i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: addi a7, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v25, (a7) +; LMULMAX1-RV32-NEXT: vle32.v v26, (a0) +; LMULMAX1-RV32-NEXT: vsrl.vi v27, v25, 8 +; LMULMAX1-RV32-NEXT: lui a2, 16 +; LMULMAX1-RV32-NEXT: addi t0, a2, -256 +; LMULMAX1-RV32-NEXT: vand.vx v27, v27, t0 +; LMULMAX1-RV32-NEXT: vsrl.vi v28, v25, 24 +; LMULMAX1-RV32-NEXT: vor.vv v27, v27, v28 +; LMULMAX1-RV32-NEXT: vsll.vi v28, v25, 8 +; LMULMAX1-RV32-NEXT: lui a6, 4080 +; LMULMAX1-RV32-NEXT: vand.vx v28, v28, a6 +; LMULMAX1-RV32-NEXT: vsll.vi v25, v25, 24 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: lui a4, 61681 +; LMULMAX1-RV32-NEXT: addi t1, a4, -241 +; LMULMAX1-RV32-NEXT: vand.vx v27, v25, t1 +; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 4 +; LMULMAX1-RV32-NEXT: lui a5, 986895 +; LMULMAX1-RV32-NEXT: addi a5, a5, 240 +; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a5 +; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 4 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: lui a3, 209715 +; LMULMAX1-RV32-NEXT: addi a3, a3, 819 +; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a3 +; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 2 +; LMULMAX1-RV32-NEXT: lui a1, 838861 +; LMULMAX1-RV32-NEXT: addi a1, a1, -820 +; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: lui a2, 349525 +; LMULMAX1-RV32-NEXT: addi a2, a2, 1365 +; LMULMAX1-RV32-NEXT: vand.vx v27, v25, a2 +; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 1 +; LMULMAX1-RV32-NEXT: lui a4, 699051 +; LMULMAX1-RV32-NEXT: addi a4, a4, -1366 +; LMULMAX1-RV32-NEXT: vand.vx v25, v25, a4 +; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: vsrl.vi v27, v26, 8 +; LMULMAX1-RV32-NEXT: vand.vx v27, v27, t0 +; LMULMAX1-RV32-NEXT: vsrl.vi v28, v26, 24 +; LMULMAX1-RV32-NEXT: vor.vv v27, v27, v28 +; LMULMAX1-RV32-NEXT: vsll.vi v28, v26, 8 +; LMULMAX1-RV32-NEXT: vand.vx v28, v28, a6 +; LMULMAX1-RV32-NEXT: vsll.vi v26, v26, 24 +; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v28 +; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vand.vx v27, v26, t1 +; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 4 +; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a5 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 4 +; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a3 +; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 2 +; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a1 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 2 +; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vand.vx v27, v26, a2 +; LMULMAX1-RV32-NEXT: vsll.vi v27, v27, 1 +; LMULMAX1-RV32-NEXT: vand.vx v26, v26, a4 +; LMULMAX1-RV32-NEXT: vsrl.vi v26, v26, 1 +; LMULMAX1-RV32-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV32-NEXT: vse32.v v26, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v25, (a7) +; LMULMAX1-RV32-NEXT: ret +; +; 
LMULMAX1-RV64-LABEL: bitreverse_v8i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: addi a7, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v25, (a7) +; LMULMAX1-RV64-NEXT: vle32.v v26, (a0) +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v25, 8 +; LMULMAX1-RV64-NEXT: lui a2, 16 +; LMULMAX1-RV64-NEXT: addiw t0, a2, -256 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, t0 +; LMULMAX1-RV64-NEXT: vsrl.vi v28, v25, 24 +; LMULMAX1-RV64-NEXT: vor.vv v27, v27, v28 +; LMULMAX1-RV64-NEXT: vsll.vi v28, v25, 8 +; LMULMAX1-RV64-NEXT: lui a6, 4080 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a6 +; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 24 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: lui a4, 61681 +; LMULMAX1-RV64-NEXT: addiw t1, a4, -241 +; LMULMAX1-RV64-NEXT: vand.vx v27, v25, t1 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4 +; LMULMAX1-RV64-NEXT: lui a5, 241 +; LMULMAX1-RV64-NEXT: addiw a5, a5, -241 +; LMULMAX1-RV64-NEXT: slli a5, a5, 12 +; LMULMAX1-RV64-NEXT: addi a5, a5, 240 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a5 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: lui a3, 209715 +; LMULMAX1-RV64-NEXT: addiw a3, a3, 819 +; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a3 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2 +; LMULMAX1-RV64-NEXT: lui a1, 205 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -819 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -820 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a1 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: lui a2, 349525 +; LMULMAX1-RV64-NEXT: addiw a2, a2, 1365 +; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a2 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 1 +; LMULMAX1-RV64-NEXT: lui a4, 171 +; LMULMAX1-RV64-NEXT: addiw a4, a4, -1365 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi a4, a4, -1366 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a4 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vsrl.vi v27, v26, 8 +; LMULMAX1-RV64-NEXT: vand.vx v27, v27, t0 +; LMULMAX1-RV64-NEXT: vsrl.vi v28, v26, 24 +; LMULMAX1-RV64-NEXT: vor.vv v27, v27, v28 +; LMULMAX1-RV64-NEXT: vsll.vi v28, v26, 8 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a6 +; LMULMAX1-RV64-NEXT: vsll.vi v26, v26, 24 +; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v28 +; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vand.vx v27, v26, t1 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a5 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 4 +; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a3 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a1 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 2 +; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a2 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 1 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a4 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 1 +; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vse32.v v26, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v25, (a7) +; LMULMAX1-RV64-NEXT: ret %a = load <8 x i32>, <8 x i32>* %x %b = load <8 x i32>, <8 x i32>* %y %c = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) @@ -1146,431 +1309,565 @@ declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>) define void @bitreverse_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; 
LMULMAX2-RV32-LABEL: bitreverse_v4i64: ; LMULMAX2-RV32: # %bb.0: -; LMULMAX2-RV32-NEXT: addi sp, sp, -64 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV32-NEXT: sw ra, 60(sp) # 4-byte Folded Spill -; LMULMAX2-RV32-NEXT: sw s0, 56(sp) # 4-byte Folded Spill -; LMULMAX2-RV32-NEXT: .cfi_offset ra, -4 -; LMULMAX2-RV32-NEXT: .cfi_offset s0, -8 -; LMULMAX2-RV32-NEXT: addi s0, sp, 64 -; LMULMAX2-RV32-NEXT: .cfi_def_cfa s0, 0 -; LMULMAX2-RV32-NEXT: andi sp, sp, -32 ; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu ; LMULMAX2-RV32-NEXT: vle64.v v26, (a0) -; LMULMAX2-RV32-NEXT: vsetvli zero, zero, e8,m2,ta,mu -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: sb a1, 7(sp) -; LMULMAX2-RV32-NEXT: vsetivli a1, 1, e8,m2,ta,mu -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 24 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 31(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 25 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 30(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 26 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 29(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 27 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 28(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 28 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 27(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 29 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 26(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 30 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 25(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 31 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 24(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 16 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 23(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 17 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 22(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 18 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 21(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 19 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 20(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 20 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 19(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 21 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 18(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 22 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 17(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 23 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 16(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 8 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 15(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 9 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 14(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 10 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 13(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 11 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 12(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 12 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 11(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 13 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 10(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi 
v28, v26, 14 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 9(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 15 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 8(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 1 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 6(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 2 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 5(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 4(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 4 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 3(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 5 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 2(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 6 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 1(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 7 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: sb a1, 0(sp) -; LMULMAX2-RV32-NEXT: addi a1, zero, 32 -; LMULMAX2-RV32-NEXT: vsetvli a1, a1, e8,m2,ta,mu -; LMULMAX2-RV32-NEXT: vle8.v v26, (sp) -; LMULMAX2-RV32-NEXT: vand.vi v28, v26, 15 -; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 4 -; LMULMAX2-RV32-NEXT: vand.vi v26, v26, -16 -; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 4 -; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 -; LMULMAX2-RV32-NEXT: addi a1, zero, 51 -; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1 -; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 2 -; LMULMAX2-RV32-NEXT: addi a1, zero, 204 -; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 2 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI5_0) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI5_0) +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vsrl.vv v8, v26, v28 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI5_1) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI5_1) +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v30, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vsrl.vv v10, v26, v30 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI5_2) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI5_2) +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v12, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v10, v10, v12 +; LMULMAX2-RV32-NEXT: vor.vv v10, v10, v8 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI5_3) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI5_3) +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v8, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vsrl.vv v12, v26, v8 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI5_4) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI5_4) +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v14, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v12, v12, v14 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI5_5) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI5_5) +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v14, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vsrl.vv v16, v26, v14 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI5_6) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI5_6) +; 
LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v18, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v16, v16, v18 +; LMULMAX2-RV32-NEXT: vor.vv v12, v16, v12 +; LMULMAX2-RV32-NEXT: vor.vv v10, v12, v10 +; LMULMAX2-RV32-NEXT: vsll.vv v12, v26, v14 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI5_7) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI5_7) +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v14, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v12, v12, v14 +; LMULMAX2-RV32-NEXT: vsll.vv v8, v26, v8 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI5_8) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI5_8) +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v14, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v8, v8, v14 +; LMULMAX2-RV32-NEXT: vor.vv v8, v8, v12 +; LMULMAX2-RV32-NEXT: vsll.vv v28, v26, v28 +; LMULMAX2-RV32-NEXT: vsll.vv v26, v26, v30 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI5_9) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI5_9) +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v30, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v26, v26, v30 +; LMULMAX2-RV32-NEXT: vor.vv v26, v28, v26 +; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v8 +; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v10 +; LMULMAX2-RV32-NEXT: lui a1, 61681 +; LMULMAX2-RV32-NEXT: addi a1, a1, -241 +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vmv.v.x v28, a1 +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v28, v26, v28 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI5_10) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI5_10) +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v30, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vsll.vv v28, v28, v30 +; LMULMAX2-RV32-NEXT: lui a1, 986895 +; LMULMAX2-RV32-NEXT: addi a1, a1, 240 +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vmv.v.x v8, a1 +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v26, v26, v8 +; LMULMAX2-RV32-NEXT: vsrl.vv v26, v26, v30 ; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 -; LMULMAX2-RV32-NEXT: addi a1, zero, 85 -; LMULMAX2-RV32-NEXT: vand.vx v28, v26, a1 -; LMULMAX2-RV32-NEXT: vsll.vi v28, v28, 1 -; LMULMAX2-RV32-NEXT: addi a1, zero, 170 -; LMULMAX2-RV32-NEXT: vand.vx v26, v26, a1 -; LMULMAX2-RV32-NEXT: vsrl.vi v26, v26, 1 +; LMULMAX2-RV32-NEXT: lui a1, 209715 +; LMULMAX2-RV32-NEXT: addi a1, a1, 819 +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vmv.v.x v28, a1 +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v28, v26, v28 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI5_11) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI5_11) +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v30, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vsll.vv v28, v28, v30 +; LMULMAX2-RV32-NEXT: lui a1, 838861 +; LMULMAX2-RV32-NEXT: addi a1, a1, -820 +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vmv.v.x v8, a1 +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v26, v26, v8 +; LMULMAX2-RV32-NEXT: vsrl.vv v26, v26, v30 ; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 +; 
LMULMAX2-RV32-NEXT: lui a1, 349525 +; LMULMAX2-RV32-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vmv.v.x v28, a1 ; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v28, v26, v28 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI5_12) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI5_12) +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v30, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vsll.vv v28, v28, v30 +; LMULMAX2-RV32-NEXT: lui a1, 699051 +; LMULMAX2-RV32-NEXT: addi a1, a1, -1366 +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vmv.v.x v8, a1 +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vand.vv v26, v26, v8 +; LMULMAX2-RV32-NEXT: vsrl.vv v26, v26, v30 +; LMULMAX2-RV32-NEXT: vor.vv v26, v26, v28 ; LMULMAX2-RV32-NEXT: vse64.v v26, (a0) -; LMULMAX2-RV32-NEXT: addi sp, s0, -64 -; LMULMAX2-RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: lw ra, 60(sp) # 4-byte Folded Reload -; LMULMAX2-RV32-NEXT: addi sp, sp, 64 ; LMULMAX2-RV32-NEXT: ret ; ; LMULMAX2-RV64-LABEL: bitreverse_v4i64: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: addi sp, sp, -64 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 64 -; LMULMAX2-RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill -; LMULMAX2-RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill -; LMULMAX2-RV64-NEXT: .cfi_offset ra, -8 -; LMULMAX2-RV64-NEXT: .cfi_offset s0, -16 -; LMULMAX2-RV64-NEXT: addi s0, sp, 64 -; LMULMAX2-RV64-NEXT: .cfi_def_cfa s0, 0 -; LMULMAX2-RV64-NEXT: andi sp, sp, -32 ; LMULMAX2-RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu ; LMULMAX2-RV64-NEXT: vle64.v v26, (a0) -; LMULMAX2-RV64-NEXT: vsetvli zero, zero, e8,m2,ta,mu -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: sb a1, 7(sp) -; LMULMAX2-RV64-NEXT: vsetivli a1, 1, e8,m2,ta,mu -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 24 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 31(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 25 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 30(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 26 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 29(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 27 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 28(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 28 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 27(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 29 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 26(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 30 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 25(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 31 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 24(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 16 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 23(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 17 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 22(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 18 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 21(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 19 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 20(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 20 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 
19(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 21 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 18(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 22 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 17(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 23 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 16(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 8 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 15(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 9 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 14(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 10 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 13(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 11 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 12(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 12 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 11(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 13 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 10(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 14 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 9(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 15 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 8(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 1 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 6(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 5(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 4(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 3(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 2(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 1(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 7 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: sb a1, 0(sp) -; LMULMAX2-RV64-NEXT: addi a1, zero, 32 -; LMULMAX2-RV64-NEXT: vsetvli a1, a1, e8,m2,ta,mu -; LMULMAX2-RV64-NEXT: vle8.v v26, (sp) -; LMULMAX2-RV64-NEXT: vand.vi v28, v26, 15 +; LMULMAX2-RV64-NEXT: addi a1, zero, 56 +; LMULMAX2-RV64-NEXT: vsrl.vx v28, v26, a1 +; LMULMAX2-RV64-NEXT: addi a2, zero, 40 +; LMULMAX2-RV64-NEXT: vsrl.vx v30, v26, a2 +; LMULMAX2-RV64-NEXT: lui a3, 16 +; LMULMAX2-RV64-NEXT: addiw a3, a3, -256 +; LMULMAX2-RV64-NEXT: vand.vx v30, v30, a3 +; LMULMAX2-RV64-NEXT: vor.vv v28, v30, v28 +; LMULMAX2-RV64-NEXT: vsrl.vi v30, v26, 24 +; LMULMAX2-RV64-NEXT: lui a3, 4080 +; LMULMAX2-RV64-NEXT: vand.vx v30, v30, a3 +; LMULMAX2-RV64-NEXT: vsrl.vi v8, v26, 8 +; LMULMAX2-RV64-NEXT: addi a3, zero, 255 +; LMULMAX2-RV64-NEXT: slli a4, a3, 24 +; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a4 +; LMULMAX2-RV64-NEXT: vor.vv v30, v8, v30 +; LMULMAX2-RV64-NEXT: vor.vv v28, v30, v28 +; LMULMAX2-RV64-NEXT: vsll.vi v30, v26, 8 +; LMULMAX2-RV64-NEXT: slli a4, a3, 32 +; LMULMAX2-RV64-NEXT: vand.vx v30, v30, a4 +; LMULMAX2-RV64-NEXT: vsll.vi v8, v26, 24 +; LMULMAX2-RV64-NEXT: slli a4, a3, 40 +; LMULMAX2-RV64-NEXT: vand.vx v8, v8, a4 +; LMULMAX2-RV64-NEXT: vor.vv v30, v8, v30 +; LMULMAX2-RV64-NEXT: vsll.vx v8, v26, a1 +; LMULMAX2-RV64-NEXT: vsll.vx v26, v26, a2 +; LMULMAX2-RV64-NEXT: slli a1, 
a3, 48 +; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 +; LMULMAX2-RV64-NEXT: vor.vv v26, v8, v26 +; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v30 +; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: lui a1, 3855 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 241 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -241 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 241 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -241 +; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1 ; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 4 -; LMULMAX2-RV64-NEXT: vand.vi v26, v26, -16 +; LMULMAX2-RV64-NEXT: lui a1, 1044721 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -241 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 241 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -241 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 240 +; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 4 ; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 -; LMULMAX2-RV64-NEXT: addi a1, zero, 51 +; LMULMAX2-RV64-NEXT: lui a1, 13107 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 819 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 819 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 819 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 819 ; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1 ; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 2 -; LMULMAX2-RV64-NEXT: addi a1, zero, 204 +; LMULMAX2-RV64-NEXT: lui a1, 1035469 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -819 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -819 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -819 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -820 ; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 2 ; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 -; LMULMAX2-RV64-NEXT: addi a1, zero, 85 +; LMULMAX2-RV64-NEXT: lui a1, 21845 +; LMULMAX2-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, 1365 ; LMULMAX2-RV64-NEXT: vand.vx v28, v26, a1 ; LMULMAX2-RV64-NEXT: vsll.vi v28, v28, 1 -; LMULMAX2-RV64-NEXT: addi a1, zero, 170 +; LMULMAX2-RV64-NEXT: lui a1, 1026731 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -1365 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -1365 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -1365 +; LMULMAX2-RV64-NEXT: slli a1, a1, 12 +; LMULMAX2-RV64-NEXT: addi a1, a1, -1366 ; LMULMAX2-RV64-NEXT: vand.vx v26, v26, a1 ; LMULMAX2-RV64-NEXT: vsrl.vi v26, v26, 1 ; LMULMAX2-RV64-NEXT: vor.vv v26, v26, v28 -; LMULMAX2-RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu ; LMULMAX2-RV64-NEXT: vse64.v v26, (a0) -; LMULMAX2-RV64-NEXT: addi sp, s0, -64 -; LMULMAX2-RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload -; LMULMAX2-RV64-NEXT: addi sp, sp, 64 ; LMULMAX2-RV64-NEXT: ret ; -; LMULMAX1-LABEL: bitreverse_v4i64: -; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi sp, sp, -32 -; LMULMAX1-NEXT: .cfi_def_cfa_offset 32 -; LMULMAX1-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-NEXT: addi a1, a0, 16 -; LMULMAX1-NEXT: vle64.v v26, (a1) -; LMULMAX1-NEXT: vle64.v v25, 
(a0) -; LMULMAX1-NEXT: vsetvli zero, zero, e8,m1,ta,mu -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 7(sp) -; LMULMAX1-NEXT: vsetivli a2, 1, e8,m1,ta,mu -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 8 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 15(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 9 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 14(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 10 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 13(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 11 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 12(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 12 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 11(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 13 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 10(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 14 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 9(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 15 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 8(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 1 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 6(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 2 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 5(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 3 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 4(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 4 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 3(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 5 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 2(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 6 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 1(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v26, 7 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 0(sp) -; LMULMAX1-NEXT: vmv.x.s a2, v25 -; LMULMAX1-NEXT: sb a2, 23(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 8 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 31(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 9 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 30(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 10 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 29(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 11 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 28(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 12 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 27(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 13 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 26(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 14 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 25(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 15 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 24(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 1 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 22(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 2 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 21(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 3 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 20(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 4 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 19(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 5 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 18(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 6 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 17(sp) -; LMULMAX1-NEXT: vslidedown.vi v25, v25, 7 -; LMULMAX1-NEXT: vmv.x.s a2, v25 -; LMULMAX1-NEXT: sb a2, 16(sp) -; LMULMAX1-NEXT: 
vsetivli a2, 16, e8,m1,ta,mu -; LMULMAX1-NEXT: vle8.v v25, (sp) -; LMULMAX1-NEXT: vand.vi v26, v25, 15 -; LMULMAX1-NEXT: vsll.vi v26, v26, 4 -; LMULMAX1-NEXT: vand.vi v25, v25, -16 -; LMULMAX1-NEXT: vsrl.vi v25, v25, 4 -; LMULMAX1-NEXT: vor.vv v25, v25, v26 -; LMULMAX1-NEXT: addi a2, zero, 51 -; LMULMAX1-NEXT: vand.vx v26, v25, a2 -; LMULMAX1-NEXT: vsll.vi v26, v26, 2 -; LMULMAX1-NEXT: addi a3, zero, 204 -; LMULMAX1-NEXT: vand.vx v25, v25, a3 -; LMULMAX1-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX1-NEXT: vor.vv v25, v25, v26 -; LMULMAX1-NEXT: addi a4, zero, 85 -; LMULMAX1-NEXT: vand.vx v26, v25, a4 -; LMULMAX1-NEXT: vsll.vi v26, v26, 1 -; LMULMAX1-NEXT: addi a5, zero, 170 -; LMULMAX1-NEXT: addi a6, sp, 16 -; LMULMAX1-NEXT: vle8.v v27, (a6) -; LMULMAX1-NEXT: vand.vx v25, v25, a5 -; LMULMAX1-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX1-NEXT: vor.vv v25, v25, v26 -; LMULMAX1-NEXT: vand.vi v26, v27, 15 -; LMULMAX1-NEXT: vsll.vi v26, v26, 4 -; LMULMAX1-NEXT: vand.vi v27, v27, -16 -; LMULMAX1-NEXT: vsrl.vi v27, v27, 4 -; LMULMAX1-NEXT: vor.vv v26, v27, v26 -; LMULMAX1-NEXT: vand.vx v27, v26, a2 -; LMULMAX1-NEXT: vsll.vi v27, v27, 2 -; LMULMAX1-NEXT: vand.vx v26, v26, a3 -; LMULMAX1-NEXT: vsrl.vi v26, v26, 2 -; LMULMAX1-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-NEXT: vand.vx v27, v26, a4 -; LMULMAX1-NEXT: vsll.vi v27, v27, 1 -; LMULMAX1-NEXT: vand.vx v26, v26, a5 -; LMULMAX1-NEXT: vsrl.vi v26, v26, 1 -; LMULMAX1-NEXT: vor.vv v26, v26, v27 -; LMULMAX1-NEXT: vsetivli a2, 2, e64,m1,ta,mu -; LMULMAX1-NEXT: vse64.v v26, (a0) -; LMULMAX1-NEXT: vse64.v v25, (a1) -; LMULMAX1-NEXT: addi sp, sp, 32 -; LMULMAX1-NEXT: ret +; LMULMAX1-RV32-LABEL: bitreverse_v4i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi a1, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v11, (a1) +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI5_0) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI5_0) +; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) +; LMULMAX1-RV32-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsrl.vv v28, v11, v26 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI5_1) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI5_1) +; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV32-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsrl.vv v30, v11, v27 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI5_2) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI5_2) +; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v29, (a2) +; LMULMAX1-RV32-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v30, v30, v29 +; LMULMAX1-RV32-NEXT: vor.vv v9, v30, v28 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI5_3) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI5_3) +; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v28, (a2) +; LMULMAX1-RV32-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsrl.vv v31, v11, v28 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI5_4) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI5_4) +; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v30, (a2) +; LMULMAX1-RV32-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v10, v31, v30 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI5_5) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI5_5) +; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v31, (a2) +; LMULMAX1-RV32-NEXT: vsetivli a2, 2, 
e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsrl.vv v12, v11, v31 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI5_6) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI5_6) +; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v8, (a2) +; LMULMAX1-RV32-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v12, v12, v8 +; LMULMAX1-RV32-NEXT: vor.vv v10, v12, v10 +; LMULMAX1-RV32-NEXT: vor.vv v12, v10, v9 +; LMULMAX1-RV32-NEXT: vsll.vv v10, v11, v31 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI5_7) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI5_7) +; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v9, (a2) +; LMULMAX1-RV32-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v13, v10, v9 +; LMULMAX1-RV32-NEXT: vsll.vv v14, v11, v28 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI5_8) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI5_8) +; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v10, (a2) +; LMULMAX1-RV32-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v14, v14, v10 +; LMULMAX1-RV32-NEXT: vor.vv v13, v14, v13 +; LMULMAX1-RV32-NEXT: vsll.vv v14, v11, v26 +; LMULMAX1-RV32-NEXT: vsll.vv v15, v11, v27 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI5_9) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI5_9) +; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v11, (a2) +; LMULMAX1-RV32-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v15, v15, v11 +; LMULMAX1-RV32-NEXT: vor.vv v14, v14, v15 +; LMULMAX1-RV32-NEXT: vor.vv v13, v14, v13 +; LMULMAX1-RV32-NEXT: vor.vv v15, v13, v12 +; LMULMAX1-RV32-NEXT: lui a2, 61681 +; LMULMAX1-RV32-NEXT: addi a2, a2, -241 +; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.v.x v12, a2 +; LMULMAX1-RV32-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v14, v15, v12 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI5_10) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI5_10) +; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v13, (a2) +; LMULMAX1-RV32-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsll.vv v16, v14, v13 +; LMULMAX1-RV32-NEXT: lui a2, 986895 +; LMULMAX1-RV32-NEXT: addi a2, a2, 240 +; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.v.x v14, a2 +; LMULMAX1-RV32-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v15, v15, v14 +; LMULMAX1-RV32-NEXT: vsrl.vv v15, v15, v13 +; LMULMAX1-RV32-NEXT: vor.vv v17, v15, v16 +; LMULMAX1-RV32-NEXT: lui a2, 209715 +; LMULMAX1-RV32-NEXT: addi a2, a2, 819 +; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.v.x v15, a2 +; LMULMAX1-RV32-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v18, v17, v15 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI5_11) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI5_11) +; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v16, (a2) +; LMULMAX1-RV32-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsll.vv v18, v18, v16 +; LMULMAX1-RV32-NEXT: lui a2, 838861 +; LMULMAX1-RV32-NEXT: addi a2, a2, -820 +; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.v.x v19, a2 +; LMULMAX1-RV32-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v17, v17, v19 +; LMULMAX1-RV32-NEXT: vsrl.vv v17, v17, v16 +; LMULMAX1-RV32-NEXT: vor.vv v17, v17, v18 +; LMULMAX1-RV32-NEXT: lui a2, 349525 +; LMULMAX1-RV32-NEXT: addi a2, a2, 1365 +; 
LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.v.x v18, a2 +; LMULMAX1-RV32-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v20, v17, v18 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI5_12) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI5_12) +; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v21, (a2) +; LMULMAX1-RV32-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vsll.vv v20, v20, v21 +; LMULMAX1-RV32-NEXT: lui a2, 699051 +; LMULMAX1-RV32-NEXT: addi a2, a2, -1366 +; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vmv.v.x v22, a2 +; LMULMAX1-RV32-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vand.vv v17, v17, v22 +; LMULMAX1-RV32-NEXT: vsrl.vv v17, v17, v21 +; LMULMAX1-RV32-NEXT: vor.vv v17, v17, v20 +; LMULMAX1-RV32-NEXT: vsrl.vv v20, v25, v26 +; LMULMAX1-RV32-NEXT: vsrl.vv v23, v25, v27 +; LMULMAX1-RV32-NEXT: vand.vv v29, v23, v29 +; LMULMAX1-RV32-NEXT: vor.vv v29, v29, v20 +; LMULMAX1-RV32-NEXT: vsrl.vv v20, v25, v28 +; LMULMAX1-RV32-NEXT: vand.vv v30, v20, v30 +; LMULMAX1-RV32-NEXT: vsrl.vv v20, v25, v31 +; LMULMAX1-RV32-NEXT: vand.vv v8, v20, v8 +; LMULMAX1-RV32-NEXT: vor.vv v30, v8, v30 +; LMULMAX1-RV32-NEXT: vor.vv v29, v30, v29 +; LMULMAX1-RV32-NEXT: vsll.vv v30, v25, v31 +; LMULMAX1-RV32-NEXT: vand.vv v30, v30, v9 +; LMULMAX1-RV32-NEXT: vsll.vv v28, v25, v28 +; LMULMAX1-RV32-NEXT: vand.vv v28, v28, v10 +; LMULMAX1-RV32-NEXT: vor.vv v28, v28, v30 +; LMULMAX1-RV32-NEXT: vsll.vv v26, v25, v26 +; LMULMAX1-RV32-NEXT: vsll.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v11 +; LMULMAX1-RV32-NEXT: vor.vv v25, v26, v25 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v29 +; LMULMAX1-RV32-NEXT: vand.vv v26, v25, v12 +; LMULMAX1-RV32-NEXT: vsll.vv v26, v26, v13 +; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v14 +; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v13 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vand.vv v26, v25, v15 +; LMULMAX1-RV32-NEXT: vsll.vv v26, v26, v16 +; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v19 +; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v16 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vand.vv v26, v25, v18 +; LMULMAX1-RV32-NEXT: vsll.vv v26, v26, v21 +; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v22 +; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v21 +; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v17, (a1) +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: bitreverse_v4i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi sp, sp, -16 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV64-NEXT: sd s0, 8(sp) # 8-byte Folded Spill +; LMULMAX1-RV64-NEXT: .cfi_offset s0, -8 +; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: addi t1, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (t1) +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi a7, zero, 56 +; LMULMAX1-RV64-NEXT: vsrl.vx v27, v26, a7 +; LMULMAX1-RV64-NEXT: addi t0, zero, 40 +; LMULMAX1-RV64-NEXT: vsrl.vx v28, v26, t0 +; LMULMAX1-RV64-NEXT: lui a1, 16 +; LMULMAX1-RV64-NEXT: addiw t2, a1, -256 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t2 +; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27 +; LMULMAX1-RV64-NEXT: vsrl.vi v28, v26, 24 +; LMULMAX1-RV64-NEXT: lui a6, 4080 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, a6 +; LMULMAX1-RV64-NEXT: vsrl.vi v29, v26, 8 +; LMULMAX1-RV64-NEXT: addi a1, zero, 255 +; LMULMAX1-RV64-NEXT: slli t3, a1, 
24 +; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t3 +; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28 +; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27 +; LMULMAX1-RV64-NEXT: vsll.vi v28, v26, 8 +; LMULMAX1-RV64-NEXT: slli t4, a1, 32 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t4 +; LMULMAX1-RV64-NEXT: vsll.vi v29, v26, 24 +; LMULMAX1-RV64-NEXT: slli t5, a1, 40 +; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t5 +; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28 +; LMULMAX1-RV64-NEXT: vsll.vx v29, v26, a7 +; LMULMAX1-RV64-NEXT: vsll.vx v26, v26, t0 +; LMULMAX1-RV64-NEXT: slli t6, a1, 48 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, t6 +; LMULMAX1-RV64-NEXT: vor.vv v26, v29, v26 +; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v28 +; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: lui a4, 3855 +; LMULMAX1-RV64-NEXT: addiw a4, a4, 241 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi a4, a4, -241 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi a4, a4, 241 +; LMULMAX1-RV64-NEXT: slli a4, a4, 12 +; LMULMAX1-RV64-NEXT: addi a4, a4, -241 +; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a4 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4 +; LMULMAX1-RV64-NEXT: lui a5, 1044721 +; LMULMAX1-RV64-NEXT: addiw a5, a5, -241 +; LMULMAX1-RV64-NEXT: slli a5, a5, 12 +; LMULMAX1-RV64-NEXT: addi a5, a5, 241 +; LMULMAX1-RV64-NEXT: slli a5, a5, 12 +; LMULMAX1-RV64-NEXT: addi a5, a5, -241 +; LMULMAX1-RV64-NEXT: slli a5, a5, 12 +; LMULMAX1-RV64-NEXT: addi a5, a5, 240 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a5 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 4 +; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: lui a2, 13107 +; LMULMAX1-RV64-NEXT: addiw a2, a2, 819 +; LMULMAX1-RV64-NEXT: slli a2, a2, 12 +; LMULMAX1-RV64-NEXT: addi a2, a2, 819 +; LMULMAX1-RV64-NEXT: slli a2, a2, 12 +; LMULMAX1-RV64-NEXT: addi a2, a2, 819 +; LMULMAX1-RV64-NEXT: slli a2, a2, 12 +; LMULMAX1-RV64-NEXT: addi a2, a2, 819 +; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a2 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2 +; LMULMAX1-RV64-NEXT: lui a3, 1035469 +; LMULMAX1-RV64-NEXT: addiw a3, a3, -819 +; LMULMAX1-RV64-NEXT: slli a3, a3, 12 +; LMULMAX1-RV64-NEXT: addi a3, a3, -819 +; LMULMAX1-RV64-NEXT: slli a3, a3, 12 +; LMULMAX1-RV64-NEXT: addi a3, a3, -819 +; LMULMAX1-RV64-NEXT: slli a3, a3, 12 +; LMULMAX1-RV64-NEXT: addi a3, a3, -820 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, a3 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 2 +; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: lui a1, 21845 +; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, 1365 +; LMULMAX1-RV64-NEXT: vand.vx v27, v26, a1 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 1 +; LMULMAX1-RV64-NEXT: lui s0, 1026731 +; LMULMAX1-RV64-NEXT: addiw s0, s0, -1365 +; LMULMAX1-RV64-NEXT: slli s0, s0, 12 +; LMULMAX1-RV64-NEXT: addi s0, s0, -1365 +; LMULMAX1-RV64-NEXT: slli s0, s0, 12 +; LMULMAX1-RV64-NEXT: addi s0, s0, -1365 +; LMULMAX1-RV64-NEXT: slli s0, s0, 12 +; LMULMAX1-RV64-NEXT: addi s0, s0, -1366 +; LMULMAX1-RV64-NEXT: vand.vx v26, v26, s0 +; LMULMAX1-RV64-NEXT: vsrl.vi v26, v26, 1 +; LMULMAX1-RV64-NEXT: vor.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vsrl.vx v27, v25, a7 +; LMULMAX1-RV64-NEXT: vsrl.vx v28, v25, t0 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t2 +; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27 +; LMULMAX1-RV64-NEXT: vsrl.vi v28, v25, 24 +; LMULMAX1-RV64-NEXT: vand.vx 
v28, v28, a6 +; LMULMAX1-RV64-NEXT: vsrl.vi v29, v25, 8 +; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t3 +; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28 +; LMULMAX1-RV64-NEXT: vor.vv v27, v28, v27 +; LMULMAX1-RV64-NEXT: vsll.vi v28, v25, 8 +; LMULMAX1-RV64-NEXT: vand.vx v28, v28, t4 +; LMULMAX1-RV64-NEXT: vsll.vi v29, v25, 24 +; LMULMAX1-RV64-NEXT: vand.vx v29, v29, t5 +; LMULMAX1-RV64-NEXT: vor.vv v28, v29, v28 +; LMULMAX1-RV64-NEXT: vsll.vx v29, v25, a7 +; LMULMAX1-RV64-NEXT: vsll.vx v25, v25, t0 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, t6 +; LMULMAX1-RV64-NEXT: vor.vv v25, v29, v25 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a4 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 4 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a5 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 4 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a2 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 2 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, a3 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vand.vx v27, v25, a1 +; LMULMAX1-RV64-NEXT: vsll.vi v27, v27, 1 +; LMULMAX1-RV64-NEXT: vand.vx v25, v25, s0 +; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vor.vv v25, v25, v27 +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (t1) +; LMULMAX1-RV64-NEXT: ld s0, 8(sp) # 8-byte Folded Reload +; LMULMAX1-RV64-NEXT: addi sp, sp, 16 +; LMULMAX1-RV64-NEXT: ret %a = load <4 x i64>, <4 x i64>* %x %b = load <4 x i64>, <4 x i64>* %y %c = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll index b9c999d..1694d68 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll @@ -1,71 +1,281 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2-RV32 -; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2-RV64 -; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1 -; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1 +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV32 +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX2-RV64 +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV32 +; RUN: llc 
-mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=LMULMAX1-RV64 define void @bswap_v8i16(<8 x i16>* %x, <8 x i16>* %y) { -; CHECK-LABEL: bswap_v8i16: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: vsetivli a1, 8, e16,m1,ta,mu -; CHECK-NEXT: vle16.v v25, (a0) -; CHECK-NEXT: vsetvli zero, zero, e8,m1,ta,mu -; CHECK-NEXT: vmv.x.s a1, v25 -; CHECK-NEXT: sb a1, 1(sp) -; CHECK-NEXT: vsetivli a1, 1, e8,m1,ta,mu -; CHECK-NEXT: vslidedown.vi v26, v25, 14 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 15(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 15 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 14(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 12 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 13(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 13 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 12(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 10 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 11(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 11 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 10(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 8 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 9(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 9 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 8(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 6 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 7(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 7 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 6(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 4 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 5(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 5 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 4(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 2 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 3(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 3 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 2(sp) -; CHECK-NEXT: vslidedown.vi v25, v25, 1 -; CHECK-NEXT: vmv.x.s a1, v25 -; CHECK-NEXT: sb a1, 0(sp) -; CHECK-NEXT: vsetivli a1, 16, e8,m1,ta,mu -; CHECK-NEXT: vle8.v v25, (sp) -; CHECK-NEXT: vsetivli a1, 8, e16,m1,ta,mu -; CHECK-NEXT: vse16.v v25, (a0) -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret +; LMULMAX2-RV32-LABEL: bswap_v8i16: +; LMULMAX2-RV32: # %bb.0: +; LMULMAX2-RV32-NEXT: addi sp, sp, -16 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 0(sp) +; LMULMAX2-RV32-NEXT: vsetivli a1, 1, e16,m1,ta,mu +; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 7 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 14(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 6 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 12(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 5 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli 
a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 10(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 4 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 8(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 3 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 6(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 2 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 4(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 2(sp) +; LMULMAX2-RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle16.v v25, (sp) +; LMULMAX2-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX2-RV32-NEXT: addi sp, sp, 16 +; LMULMAX2-RV32-NEXT: ret +; +; LMULMAX2-RV64-LABEL: bswap_v8i16: +; LMULMAX2-RV64: # %bb.0: +; LMULMAX2-RV64-NEXT: addi sp, sp, -16 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV64-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; LMULMAX2-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 0(sp) +; LMULMAX2-RV64-NEXT: vsetivli a1, 1, e16,m1,ta,mu +; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 7 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 14(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 6 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 12(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 5 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 10(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 4 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 8(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 3 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 6(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 2 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw 
a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 4(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 2(sp) +; LMULMAX2-RV64-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; LMULMAX2-RV64-NEXT: vle16.v v25, (sp) +; LMULMAX2-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX2-RV64-NEXT: addi sp, sp, 16 +; LMULMAX2-RV64-NEXT: ret +; +; LMULMAX1-RV32-LABEL: bswap_v8i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi sp, sp, -16 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV32-NEXT: slli a2, a1, 8 +; LMULMAX1-RV32-NEXT: slli a1, a1, 16 +; LMULMAX1-RV32-NEXT: srli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a2, a1 +; LMULMAX1-RV32-NEXT: sh a1, 0(sp) +; LMULMAX1-RV32-NEXT: vsetivli a1, 1, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 7 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: slli a2, a1, 8 +; LMULMAX1-RV32-NEXT: slli a1, a1, 16 +; LMULMAX1-RV32-NEXT: srli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a2, a1 +; LMULMAX1-RV32-NEXT: sh a1, 14(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 6 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: slli a2, a1, 8 +; LMULMAX1-RV32-NEXT: slli a1, a1, 16 +; LMULMAX1-RV32-NEXT: srli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a2, a1 +; LMULMAX1-RV32-NEXT: sh a1, 12(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 5 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: slli a2, a1, 8 +; LMULMAX1-RV32-NEXT: slli a1, a1, 16 +; LMULMAX1-RV32-NEXT: srli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a2, a1 +; LMULMAX1-RV32-NEXT: sh a1, 10(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 4 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: slli a2, a1, 8 +; LMULMAX1-RV32-NEXT: slli a1, a1, 16 +; LMULMAX1-RV32-NEXT: srli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a2, a1 +; LMULMAX1-RV32-NEXT: sh a1, 8(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: slli a2, a1, 8 +; LMULMAX1-RV32-NEXT: slli a1, a1, 16 +; LMULMAX1-RV32-NEXT: srli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a2, a1 +; LMULMAX1-RV32-NEXT: sh a1, 6(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: slli a2, a1, 8 +; LMULMAX1-RV32-NEXT: slli a1, a1, 16 +; LMULMAX1-RV32-NEXT: srli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a2, a1 +; LMULMAX1-RV32-NEXT: sh a1, 4(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV32-NEXT: slli a2, a1, 8 +; LMULMAX1-RV32-NEXT: slli a1, a1, 16 +; LMULMAX1-RV32-NEXT: srli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a2, a1 +; LMULMAX1-RV32-NEXT: sh a1, 2(sp) +; LMULMAX1-RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (sp) +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi sp, sp, 16 +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: bswap_v8i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi sp, sp, -16 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV64-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV64-NEXT: slli a2, a1, 8 
+; LMULMAX1-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX1-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX1-RV64-NEXT: or a1, a2, a1 +; LMULMAX1-RV64-NEXT: sh a1, 0(sp) +; LMULMAX1-RV64-NEXT: vsetivli a1, 1, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 7 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: slli a2, a1, 8 +; LMULMAX1-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX1-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX1-RV64-NEXT: or a1, a2, a1 +; LMULMAX1-RV64-NEXT: sh a1, 14(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: slli a2, a1, 8 +; LMULMAX1-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX1-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX1-RV64-NEXT: or a1, a2, a1 +; LMULMAX1-RV64-NEXT: sh a1, 12(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: slli a2, a1, 8 +; LMULMAX1-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX1-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX1-RV64-NEXT: or a1, a2, a1 +; LMULMAX1-RV64-NEXT: sh a1, 10(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: slli a2, a1, 8 +; LMULMAX1-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX1-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX1-RV64-NEXT: or a1, a2, a1 +; LMULMAX1-RV64-NEXT: sh a1, 8(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: slli a2, a1, 8 +; LMULMAX1-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX1-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX1-RV64-NEXT: or a1, a2, a1 +; LMULMAX1-RV64-NEXT: sh a1, 6(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: slli a2, a1, 8 +; LMULMAX1-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX1-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX1-RV64-NEXT: or a1, a2, a1 +; LMULMAX1-RV64-NEXT: sh a1, 4(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV64-NEXT: slli a2, a1, 8 +; LMULMAX1-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX1-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX1-RV64-NEXT: or a1, a2, a1 +; LMULMAX1-RV64-NEXT: sh a1, 2(sp) +; LMULMAX1-RV64-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (sp) +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi sp, sp, 16 +; LMULMAX1-RV64-NEXT: ret %a = load <8 x i16>, <8 x i16>* %x %b = load <8 x i16>, <8 x i16>* %y %c = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %a) @@ -75,67 +285,257 @@ define void @bswap_v8i16(<8 x i16>* %x, <8 x i16>* %y) { declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>) define void @bswap_v4i32(<4 x i32>* %x, <4 x i32>* %y) { -; CHECK-LABEL: bswap_v4i32: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; CHECK-NEXT: vle32.v v25, (a0) -; CHECK-NEXT: vsetvli zero, zero, e8,m1,ta,mu -; CHECK-NEXT: vmv.x.s a1, v25 -; CHECK-NEXT: sb a1, 3(sp) -; CHECK-NEXT: vsetivli a1, 1, e8,m1,ta,mu -; CHECK-NEXT: vslidedown.vi v26, v25, 12 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 15(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 13 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 14(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 14 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 13(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 15 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 12(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 8 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 11(sp) -; CHECK-NEXT: vslidedown.vi v26, 
v25, 9 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 10(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 10 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 9(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 11 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 8(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 4 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 7(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 5 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 6(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 6 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 5(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 7 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 4(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 1 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 2(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 2 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 1(sp) -; CHECK-NEXT: vslidedown.vi v25, v25, 3 -; CHECK-NEXT: vmv.x.s a1, v25 -; CHECK-NEXT: sb a1, 0(sp) -; CHECK-NEXT: vsetivli a1, 16, e8,m1,ta,mu -; CHECK-NEXT: vle8.v v25, (sp) -; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; CHECK-NEXT: vse32.v v25, (a0) -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret +; LMULMAX2-RV32-LABEL: bswap_v4i32: +; LMULMAX2-RV32: # %bb.0: +; LMULMAX2-RV32-NEXT: addi sp, sp, -16 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 +; LMULMAX2-RV32-NEXT: srli a2, a1, 8 +; LMULMAX2-RV32-NEXT: lui a3, 16 +; LMULMAX2-RV32-NEXT: addi a3, a3, -256 +; LMULMAX2-RV32-NEXT: and a2, a2, a3 +; LMULMAX2-RV32-NEXT: srli a4, a1, 24 +; LMULMAX2-RV32-NEXT: or a2, a2, a4 +; LMULMAX2-RV32-NEXT: slli a4, a1, 8 +; LMULMAX2-RV32-NEXT: lui a5, 4080 +; LMULMAX2-RV32-NEXT: and a4, a4, a5 +; LMULMAX2-RV32-NEXT: slli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a1, a4 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: sw a1, 0(sp) +; LMULMAX2-RV32-NEXT: vsetivli a1, 1, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 3 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV32-NEXT: srli a2, a1, 8 +; LMULMAX2-RV32-NEXT: and a2, a2, a3 +; LMULMAX2-RV32-NEXT: srli a4, a1, 24 +; LMULMAX2-RV32-NEXT: or a2, a2, a4 +; LMULMAX2-RV32-NEXT: slli a4, a1, 8 +; LMULMAX2-RV32-NEXT: and a4, a4, a5 +; LMULMAX2-RV32-NEXT: slli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a1, a4 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: sw a1, 12(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 2 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV32-NEXT: srli a2, a1, 8 +; LMULMAX2-RV32-NEXT: and a2, a2, a3 +; LMULMAX2-RV32-NEXT: srli a4, a1, 24 +; LMULMAX2-RV32-NEXT: or a2, a2, a4 +; LMULMAX2-RV32-NEXT: slli a4, a1, 8 +; LMULMAX2-RV32-NEXT: and a4, a4, a5 +; LMULMAX2-RV32-NEXT: slli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a1, a4 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: sw a1, 8(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 +; LMULMAX2-RV32-NEXT: srli a2, a1, 8 +; LMULMAX2-RV32-NEXT: and a2, a2, a3 +; LMULMAX2-RV32-NEXT: srli a3, a1, 24 +; LMULMAX2-RV32-NEXT: or a2, a2, a3 +; LMULMAX2-RV32-NEXT: slli a3, a1, 8 +; LMULMAX2-RV32-NEXT: and a3, a3, a5 +; LMULMAX2-RV32-NEXT: slli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a1, a3 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: sw a1, 4(sp) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v25, (sp) +; LMULMAX2-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX2-RV32-NEXT: addi 
sp, sp, 16 +; LMULMAX2-RV32-NEXT: ret +; +; LMULMAX2-RV64-LABEL: bswap_v4i32: +; LMULMAX2-RV64: # %bb.0: +; LMULMAX2-RV64-NEXT: addi sp, sp, -16 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV64-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX2-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX2-RV64-NEXT: srliw a2, a1, 8 +; LMULMAX2-RV64-NEXT: lui a3, 16 +; LMULMAX2-RV64-NEXT: addiw a3, a3, -256 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: srliw a4, a1, 24 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: slli a4, a1, 8 +; LMULMAX2-RV64-NEXT: lui a5, 4080 +; LMULMAX2-RV64-NEXT: and a4, a4, a5 +; LMULMAX2-RV64-NEXT: slli a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a1, a4 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: sw a1, 0(sp) +; LMULMAX2-RV64-NEXT: vsetivli a1, 1, e32,m1,ta,mu +; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 3 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV64-NEXT: srliw a2, a1, 8 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: srliw a4, a1, 24 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: slli a4, a1, 8 +; LMULMAX2-RV64-NEXT: and a4, a4, a5 +; LMULMAX2-RV64-NEXT: slli a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a1, a4 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: sw a1, 12(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v25, 2 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV64-NEXT: srliw a2, a1, 8 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: srliw a4, a1, 24 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: slli a4, a1, 8 +; LMULMAX2-RV64-NEXT: and a4, a4, a5 +; LMULMAX2-RV64-NEXT: slli a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a1, a4 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: sw a1, 8(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX2-RV64-NEXT: srliw a2, a1, 8 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: srliw a3, a1, 24 +; LMULMAX2-RV64-NEXT: or a2, a2, a3 +; LMULMAX2-RV64-NEXT: slli a3, a1, 8 +; LMULMAX2-RV64-NEXT: and a3, a3, a5 +; LMULMAX2-RV64-NEXT: slli a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a1, a3 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: sw a1, 4(sp) +; LMULMAX2-RV64-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX2-RV64-NEXT: vle32.v v25, (sp) +; LMULMAX2-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX2-RV64-NEXT: addi sp, sp, 16 +; LMULMAX2-RV64-NEXT: ret +; +; LMULMAX1-RV32-LABEL: bswap_v4i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi sp, sp, -16 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV32-NEXT: srli a2, a1, 8 +; LMULMAX1-RV32-NEXT: lui a3, 16 +; LMULMAX1-RV32-NEXT: addi a3, a3, -256 +; LMULMAX1-RV32-NEXT: and a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a4, a1, 24 +; LMULMAX1-RV32-NEXT: or a2, a2, a4 +; LMULMAX1-RV32-NEXT: slli a4, a1, 8 +; LMULMAX1-RV32-NEXT: lui a5, 4080 +; LMULMAX1-RV32-NEXT: and a4, a4, a5 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a4 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: sw a1, 0(sp) +; LMULMAX1-RV32-NEXT: vsetivli a1, 1, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: srli a2, a1, 8 +; LMULMAX1-RV32-NEXT: and a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a4, a1, 24 +; LMULMAX1-RV32-NEXT: or a2, a2, a4 +; 
LMULMAX1-RV32-NEXT: slli a4, a1, 8 +; LMULMAX1-RV32-NEXT: and a4, a4, a5 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a4 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: sw a1, 12(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: srli a2, a1, 8 +; LMULMAX1-RV32-NEXT: and a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a4, a1, 24 +; LMULMAX1-RV32-NEXT: or a2, a2, a4 +; LMULMAX1-RV32-NEXT: slli a4, a1, 8 +; LMULMAX1-RV32-NEXT: and a4, a4, a5 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a4 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: sw a1, 8(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV32-NEXT: srli a2, a1, 8 +; LMULMAX1-RV32-NEXT: and a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a1, 24 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: slli a3, a1, 8 +; LMULMAX1-RV32-NEXT: and a3, a3, a5 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a3 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: sw a1, 4(sp) +; LMULMAX1-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (sp) +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi sp, sp, 16 +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: bswap_v4i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi sp, sp, -16 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV64-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV64-NEXT: srliw a2, a1, 8 +; LMULMAX1-RV64-NEXT: lui a3, 16 +; LMULMAX1-RV64-NEXT: addiw a3, a3, -256 +; LMULMAX1-RV64-NEXT: and a2, a2, a3 +; LMULMAX1-RV64-NEXT: srliw a4, a1, 24 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: slli a4, a1, 8 +; LMULMAX1-RV64-NEXT: lui a5, 4080 +; LMULMAX1-RV64-NEXT: and a4, a4, a5 +; LMULMAX1-RV64-NEXT: slli a1, a1, 24 +; LMULMAX1-RV64-NEXT: or a1, a1, a4 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: sw a1, 0(sp) +; LMULMAX1-RV64-NEXT: vsetivli a1, 1, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: srliw a2, a1, 8 +; LMULMAX1-RV64-NEXT: and a2, a2, a3 +; LMULMAX1-RV64-NEXT: srliw a4, a1, 24 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: slli a4, a1, 8 +; LMULMAX1-RV64-NEXT: and a4, a4, a5 +; LMULMAX1-RV64-NEXT: slli a1, a1, 24 +; LMULMAX1-RV64-NEXT: or a1, a1, a4 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: sw a1, 12(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: srliw a2, a1, 8 +; LMULMAX1-RV64-NEXT: and a2, a2, a3 +; LMULMAX1-RV64-NEXT: srliw a4, a1, 24 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: slli a4, a1, 8 +; LMULMAX1-RV64-NEXT: and a4, a4, a5 +; LMULMAX1-RV64-NEXT: slli a1, a1, 24 +; LMULMAX1-RV64-NEXT: or a1, a1, a4 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: sw a1, 8(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV64-NEXT: srliw a2, a1, 8 +; LMULMAX1-RV64-NEXT: and a2, a2, a3 +; LMULMAX1-RV64-NEXT: srliw a3, a1, 24 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: slli a3, a1, 8 +; LMULMAX1-RV64-NEXT: and a3, a3, a5 +; LMULMAX1-RV64-NEXT: slli a1, a1, 24 +; LMULMAX1-RV64-NEXT: or a1, a1, a3 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; 
LMULMAX1-RV64-NEXT: sw a1, 4(sp) +; LMULMAX1-RV64-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (sp) +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi sp, sp, 16 +; LMULMAX1-RV64-NEXT: ret %a = load <4 x i32>, <4 x i32>* %x %b = load <4 x i32>, <4 x i32>* %y %c = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a) @@ -145,67 +545,271 @@ define void @bswap_v4i32(<4 x i32>* %x, <4 x i32>* %y) { declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) { -; CHECK-LABEL: bswap_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: addi sp, sp, -16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; CHECK-NEXT: vle64.v v25, (a0) -; CHECK-NEXT: vsetvli zero, zero, e8,m1,ta,mu -; CHECK-NEXT: vmv.x.s a1, v25 -; CHECK-NEXT: sb a1, 7(sp) -; CHECK-NEXT: vsetivli a1, 1, e8,m1,ta,mu -; CHECK-NEXT: vslidedown.vi v26, v25, 8 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 15(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 9 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 14(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 10 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 13(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 11 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 12(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 12 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 11(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 13 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 10(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 14 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 9(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 15 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 8(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 1 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 6(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 2 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 5(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 3 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 4(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 4 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 3(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 5 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 2(sp) -; CHECK-NEXT: vslidedown.vi v26, v25, 6 -; CHECK-NEXT: vmv.x.s a1, v26 -; CHECK-NEXT: sb a1, 1(sp) -; CHECK-NEXT: vslidedown.vi v25, v25, 7 -; CHECK-NEXT: vmv.x.s a1, v25 -; CHECK-NEXT: sb a1, 0(sp) -; CHECK-NEXT: vsetivli a1, 16, e8,m1,ta,mu -; CHECK-NEXT: vle8.v v25, (sp) -; CHECK-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; CHECK-NEXT: vse64.v v25, (a0) -; CHECK-NEXT: addi sp, sp, 16 -; CHECK-NEXT: ret +; LMULMAX2-RV32-LABEL: bswap_v2i64: +; LMULMAX2-RV32: # %bb.0: +; LMULMAX2-RV32-NEXT: addi sp, sp, -16 +; LMULMAX2-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 +; LMULMAX2-RV32-NEXT: srli a2, a1, 8 +; LMULMAX2-RV32-NEXT: lui a3, 16 +; LMULMAX2-RV32-NEXT: addi a3, a3, -256 +; LMULMAX2-RV32-NEXT: and a2, a2, a3 +; LMULMAX2-RV32-NEXT: srli a4, a1, 24 +; LMULMAX2-RV32-NEXT: or a2, a2, a4 +; LMULMAX2-RV32-NEXT: slli a4, a1, 8 +; LMULMAX2-RV32-NEXT: lui a6, 4080 +; LMULMAX2-RV32-NEXT: and a4, a4, a6 +; LMULMAX2-RV32-NEXT: slli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a1, a4 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: sw a1, 4(sp) +; LMULMAX2-RV32-NEXT: vsetivli a1, 1, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v25, 1 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX2-RV32-NEXT: srli a2, a1, 8 +; 
LMULMAX2-RV32-NEXT: and a2, a2, a3 +; LMULMAX2-RV32-NEXT: srli a4, a1, 24 +; LMULMAX2-RV32-NEXT: or a2, a2, a4 +; LMULMAX2-RV32-NEXT: slli a4, a1, 8 +; LMULMAX2-RV32-NEXT: and a4, a4, a6 +; LMULMAX2-RV32-NEXT: slli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a1, a4 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: sw a1, 12(sp) +; LMULMAX2-RV32-NEXT: addi a1, zero, 32 +; LMULMAX2-RV32-NEXT: vsrl.vx v25, v25, a1 +; LMULMAX2-RV32-NEXT: vmv.x.s a2, v25 +; LMULMAX2-RV32-NEXT: srli a4, a2, 8 +; LMULMAX2-RV32-NEXT: and a4, a4, a3 +; LMULMAX2-RV32-NEXT: srli a5, a2, 24 +; LMULMAX2-RV32-NEXT: or a4, a4, a5 +; LMULMAX2-RV32-NEXT: slli a5, a2, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a6 +; LMULMAX2-RV32-NEXT: slli a2, a2, 24 +; LMULMAX2-RV32-NEXT: or a2, a2, a5 +; LMULMAX2-RV32-NEXT: or a2, a2, a4 +; LMULMAX2-RV32-NEXT: sw a2, 0(sp) +; LMULMAX2-RV32-NEXT: vsrl.vx v25, v26, a1 +; LMULMAX2-RV32-NEXT: vmv.x.s a1, v25 +; LMULMAX2-RV32-NEXT: srli a2, a1, 8 +; LMULMAX2-RV32-NEXT: and a2, a2, a3 +; LMULMAX2-RV32-NEXT: srli a3, a1, 24 +; LMULMAX2-RV32-NEXT: or a2, a2, a3 +; LMULMAX2-RV32-NEXT: slli a3, a1, 8 +; LMULMAX2-RV32-NEXT: and a3, a3, a6 +; LMULMAX2-RV32-NEXT: slli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a1, a3 +; LMULMAX2-RV32-NEXT: or a1, a1, a2 +; LMULMAX2-RV32-NEXT: sw a1, 8(sp) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v25, (sp) +; LMULMAX2-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX2-RV32-NEXT: addi sp, sp, 16 +; LMULMAX2-RV32-NEXT: ret +; +; LMULMAX2-RV64-LABEL: bswap_v2i64: +; LMULMAX2-RV64: # %bb.0: +; LMULMAX2-RV64-NEXT: addi sp, sp, -16 +; LMULMAX2-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX2-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX2-RV64-NEXT: srli a2, a1, 40 +; LMULMAX2-RV64-NEXT: lui a3, 16 +; LMULMAX2-RV64-NEXT: addiw a7, a3, -256 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: srli a4, a1, 56 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a1, 24 +; LMULMAX2-RV64-NEXT: lui a6, 4080 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: srli a5, a1, 8 +; LMULMAX2-RV64-NEXT: addi a3, zero, 255 +; LMULMAX2-RV64-NEXT: slli t0, a3, 24 +; LMULMAX2-RV64-NEXT: and a5, a5, t0 +; LMULMAX2-RV64-NEXT: or a4, a5, a4 +; LMULMAX2-RV64-NEXT: or t1, a4, a2 +; LMULMAX2-RV64-NEXT: slli a4, a1, 8 +; LMULMAX2-RV64-NEXT: slli t2, a3, 32 +; LMULMAX2-RV64-NEXT: and a4, a4, t2 +; LMULMAX2-RV64-NEXT: slli a2, a1, 24 +; LMULMAX2-RV64-NEXT: slli t3, a3, 40 +; LMULMAX2-RV64-NEXT: and a2, a2, t3 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: slli a4, a1, 40 +; LMULMAX2-RV64-NEXT: slli a3, a3, 48 +; LMULMAX2-RV64-NEXT: and a4, a4, a3 +; LMULMAX2-RV64-NEXT: slli a1, a1, 56 +; LMULMAX2-RV64-NEXT: or a1, a1, a4 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: or a1, a1, t1 +; LMULMAX2-RV64-NEXT: sd a1, 0(sp) +; LMULMAX2-RV64-NEXT: vsetivli a1, 1, e64,m1,ta,mu +; LMULMAX2-RV64-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX2-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX2-RV64-NEXT: srli a2, a1, 40 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: srli a4, a1, 56 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a1, 24 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: srli a5, a1, 8 +; LMULMAX2-RV64-NEXT: and a5, a5, t0 +; LMULMAX2-RV64-NEXT: or a4, a5, a4 +; LMULMAX2-RV64-NEXT: or a2, a4, a2 +; LMULMAX2-RV64-NEXT: slli a4, a1, 8 +; 
LMULMAX2-RV64-NEXT: and a4, a4, t2 +; LMULMAX2-RV64-NEXT: slli a5, a1, 24 +; LMULMAX2-RV64-NEXT: and a5, a5, t3 +; LMULMAX2-RV64-NEXT: or a4, a5, a4 +; LMULMAX2-RV64-NEXT: slli a5, a1, 40 +; LMULMAX2-RV64-NEXT: and a3, a5, a3 +; LMULMAX2-RV64-NEXT: slli a1, a1, 56 +; LMULMAX2-RV64-NEXT: or a1, a1, a3 +; LMULMAX2-RV64-NEXT: or a1, a1, a4 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: sd a1, 8(sp) +; LMULMAX2-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX2-RV64-NEXT: vle64.v v25, (sp) +; LMULMAX2-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX2-RV64-NEXT: addi sp, sp, 16 +; LMULMAX2-RV64-NEXT: ret +; +; LMULMAX1-RV32-LABEL: bswap_v2i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi sp, sp, -16 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV32-NEXT: srli a2, a1, 8 +; LMULMAX1-RV32-NEXT: lui a3, 16 +; LMULMAX1-RV32-NEXT: addi a3, a3, -256 +; LMULMAX1-RV32-NEXT: and a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a4, a1, 24 +; LMULMAX1-RV32-NEXT: or a2, a2, a4 +; LMULMAX1-RV32-NEXT: slli a4, a1, 8 +; LMULMAX1-RV32-NEXT: lui a6, 4080 +; LMULMAX1-RV32-NEXT: and a4, a4, a6 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a4 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: sw a1, 4(sp) +; LMULMAX1-RV32-NEXT: vsetivli a1, 1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 1 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: srli a2, a1, 8 +; LMULMAX1-RV32-NEXT: and a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a4, a1, 24 +; LMULMAX1-RV32-NEXT: or a2, a2, a4 +; LMULMAX1-RV32-NEXT: slli a4, a1, 8 +; LMULMAX1-RV32-NEXT: and a4, a4, a6 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a4 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: sw a1, 12(sp) +; LMULMAX1-RV32-NEXT: addi a1, zero, 32 +; LMULMAX1-RV32-NEXT: vsrl.vx v25, v25, a1 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v25 +; LMULMAX1-RV32-NEXT: srli a4, a2, 8 +; LMULMAX1-RV32-NEXT: and a4, a4, a3 +; LMULMAX1-RV32-NEXT: srli a5, a2, 24 +; LMULMAX1-RV32-NEXT: or a4, a4, a5 +; LMULMAX1-RV32-NEXT: slli a5, a2, 8 +; LMULMAX1-RV32-NEXT: and a5, a5, a6 +; LMULMAX1-RV32-NEXT: slli a2, a2, 24 +; LMULMAX1-RV32-NEXT: or a2, a2, a5 +; LMULMAX1-RV32-NEXT: or a2, a2, a4 +; LMULMAX1-RV32-NEXT: sw a2, 0(sp) +; LMULMAX1-RV32-NEXT: vsrl.vx v25, v26, a1 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV32-NEXT: srli a2, a1, 8 +; LMULMAX1-RV32-NEXT: and a2, a2, a3 +; LMULMAX1-RV32-NEXT: srli a3, a1, 24 +; LMULMAX1-RV32-NEXT: or a2, a2, a3 +; LMULMAX1-RV32-NEXT: slli a3, a1, 8 +; LMULMAX1-RV32-NEXT: and a3, a3, a6 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a3 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: sw a1, 8(sp) +; LMULMAX1-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (sp) +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: addi sp, sp, 16 +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: bswap_v2i64: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi sp, sp, -16 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 16 +; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV64-NEXT: srli a2, a1, 40 +; LMULMAX1-RV64-NEXT: lui a3, 16 +; LMULMAX1-RV64-NEXT: addiw a7, a3, -256 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 
+; LMULMAX1-RV64-NEXT: srli a4, a1, 56 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a1, 24 +; LMULMAX1-RV64-NEXT: lui a6, 4080 +; LMULMAX1-RV64-NEXT: and a4, a4, a6 +; LMULMAX1-RV64-NEXT: srli a5, a1, 8 +; LMULMAX1-RV64-NEXT: addi a3, zero, 255 +; LMULMAX1-RV64-NEXT: slli t0, a3, 24 +; LMULMAX1-RV64-NEXT: and a5, a5, t0 +; LMULMAX1-RV64-NEXT: or a4, a5, a4 +; LMULMAX1-RV64-NEXT: or t1, a4, a2 +; LMULMAX1-RV64-NEXT: slli a4, a1, 8 +; LMULMAX1-RV64-NEXT: slli t2, a3, 32 +; LMULMAX1-RV64-NEXT: and a4, a4, t2 +; LMULMAX1-RV64-NEXT: slli a2, a1, 24 +; LMULMAX1-RV64-NEXT: slli t3, a3, 40 +; LMULMAX1-RV64-NEXT: and a2, a2, t3 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: slli a4, a1, 40 +; LMULMAX1-RV64-NEXT: slli a3, a3, 48 +; LMULMAX1-RV64-NEXT: and a4, a4, a3 +; LMULMAX1-RV64-NEXT: slli a1, a1, 56 +; LMULMAX1-RV64-NEXT: or a1, a1, a4 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: or a1, a1, t1 +; LMULMAX1-RV64-NEXT: sd a1, 0(sp) +; LMULMAX1-RV64-NEXT: vsetivli a1, 1, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV64-NEXT: srli a2, a1, 40 +; LMULMAX1-RV64-NEXT: and a2, a2, a7 +; LMULMAX1-RV64-NEXT: srli a4, a1, 56 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: srli a4, a1, 24 +; LMULMAX1-RV64-NEXT: and a4, a4, a6 +; LMULMAX1-RV64-NEXT: srli a5, a1, 8 +; LMULMAX1-RV64-NEXT: and a5, a5, t0 +; LMULMAX1-RV64-NEXT: or a4, a5, a4 +; LMULMAX1-RV64-NEXT: or a2, a4, a2 +; LMULMAX1-RV64-NEXT: slli a4, a1, 8 +; LMULMAX1-RV64-NEXT: and a4, a4, t2 +; LMULMAX1-RV64-NEXT: slli a5, a1, 24 +; LMULMAX1-RV64-NEXT: and a5, a5, t3 +; LMULMAX1-RV64-NEXT: or a4, a5, a4 +; LMULMAX1-RV64-NEXT: slli a5, a1, 40 +; LMULMAX1-RV64-NEXT: and a3, a5, a3 +; LMULMAX1-RV64-NEXT: slli a1, a1, 56 +; LMULMAX1-RV64-NEXT: or a1, a1, a3 +; LMULMAX1-RV64-NEXT: or a1, a1, a4 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: sd a1, 8(sp) +; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (sp) +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: addi sp, sp, 16 +; LMULMAX1-RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = load <2 x i64>, <2 x i64>* %y %c = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %a) @@ -228,107 +832,120 @@ define void @bswap_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; LMULMAX2-RV32-NEXT: andi sp, sp, -32 ; LMULMAX2-RV32-NEXT: vsetivli a1, 16, e16,m2,ta,mu ; LMULMAX2-RV32-NEXT: vle16.v v26, (a0) -; LMULMAX2-RV32-NEXT: vsetvli zero, zero, e8,m2,ta,mu ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: sb a1, 1(sp) -; LMULMAX2-RV32-NEXT: vsetivli a1, 1, e8,m2,ta,mu -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 30 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 31(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 31 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 30(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 28 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 29(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 29 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 28(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 26 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 27(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 27 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 26(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 24 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 
25(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 25 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 24(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 22 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 23(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 23 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 22(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 20 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 21(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 21 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 20(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 18 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 19(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 19 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 18(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 16 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 17(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 17 +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 0(sp) +; LMULMAX2-RV32-NEXT: vsetivli a1, 1, e16,m2,ta,mu +; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 15 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 16(sp) +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 30(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 14 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 15(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 15 +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 28(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 13 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 14(sp) +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 26(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 12 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 13(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 13 +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 24(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 11 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 12(sp) +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 22(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 10 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 11(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 11 +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 20(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 9 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 10(sp) +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; 
LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 18(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 8 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 9(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 9 +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 16(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 8(sp) +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 14(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 6 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 7(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 7 +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 12(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 5 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 6(sp) +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 10(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 4 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 5(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 5 +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 8(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 4(sp) +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 6(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 3(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 2(sp) +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 4(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: sb a1, 0(sp) -; LMULMAX2-RV32-NEXT: addi a1, zero, 32 -; LMULMAX2-RV32-NEXT: vsetvli a1, a1, e8,m2,ta,mu -; LMULMAX2-RV32-NEXT: vle8.v v26, (sp) +; LMULMAX2-RV32-NEXT: slli a2, a1, 8 +; LMULMAX2-RV32-NEXT: slli a1, a1, 16 +; LMULMAX2-RV32-NEXT: srli a1, a1, 24 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sh a1, 2(sp) ; LMULMAX2-RV32-NEXT: vsetivli a1, 16, e16,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle16.v v26, (sp) ; LMULMAX2-RV32-NEXT: vse16.v v26, (a0) ; LMULMAX2-RV32-NEXT: addi sp, s0, -64 ; LMULMAX2-RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload @@ -349,107 +966,120 @@ define void @bswap_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; LMULMAX2-RV64-NEXT: andi sp, sp, -32 ; LMULMAX2-RV64-NEXT: vsetivli a1, 16, e16,m2,ta,mu ; LMULMAX2-RV64-NEXT: vle16.v v26, (a0) -; LMULMAX2-RV64-NEXT: vsetvli zero, zero, e8,m2,ta,mu ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; 
LMULMAX2-RV64-NEXT: sb a1, 1(sp) -; LMULMAX2-RV64-NEXT: vsetivli a1, 1, e8,m2,ta,mu -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 30 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 31(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 31 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 30(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 28 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 29(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 29 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 28(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 26 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 27(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 27 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 26(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 24 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 25(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 25 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 24(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 22 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 23(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 23 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 22(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 20 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 21(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 21 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 20(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 18 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 19(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 19 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 18(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 16 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 17(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 17 +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 0(sp) +; LMULMAX2-RV64-NEXT: vsetivli a1, 1, e16,m2,ta,mu +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 15 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 16(sp) +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 30(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 14 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 15(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 15 +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 28(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 13 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 14(sp) +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 26(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 12 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 13(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 13 +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, 
a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 24(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 11 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 12(sp) +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 22(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 10 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 11(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 11 +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 20(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 9 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 10(sp) +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 18(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 8 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 9(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 9 +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 16(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 8(sp) +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 14(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 7(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7 +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 12(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 6(sp) +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 10(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 5(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 8(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 4(sp) +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 6(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 3(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 2(sp) +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 4(sp) ; 
LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: sb a1, 0(sp) -; LMULMAX2-RV64-NEXT: addi a1, zero, 32 -; LMULMAX2-RV64-NEXT: vsetvli a1, a1, e8,m2,ta,mu -; LMULMAX2-RV64-NEXT: vle8.v v26, (sp) +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slliw a1, a1, 16 +; LMULMAX2-RV64-NEXT: srliw a1, a1, 24 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sh a1, 2(sp) ; LMULMAX2-RV64-NEXT: vsetivli a1, 16, e16,m2,ta,mu +; LMULMAX2-RV64-NEXT: vle16.v v26, (sp) ; LMULMAX2-RV64-NEXT: vse16.v v26, (a0) ; LMULMAX2-RV64-NEXT: addi sp, s0, -64 ; LMULMAX2-RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -457,119 +1087,261 @@ define void @bswap_v16i16(<16 x i16>* %x, <16 x i16>* %y) { ; LMULMAX2-RV64-NEXT: addi sp, sp, 64 ; LMULMAX2-RV64-NEXT: ret ; -; LMULMAX1-LABEL: bswap_v16i16: -; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi sp, sp, -32 -; LMULMAX1-NEXT: .cfi_def_cfa_offset 32 -; LMULMAX1-NEXT: vsetivli a1, 8, e16,m1,ta,mu -; LMULMAX1-NEXT: addi a1, a0, 16 -; LMULMAX1-NEXT: vle16.v v26, (a1) -; LMULMAX1-NEXT: vle16.v v25, (a0) -; LMULMAX1-NEXT: vsetvli zero, zero, e8,m1,ta,mu -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 1(sp) -; LMULMAX1-NEXT: vsetivli a2, 1, e8,m1,ta,mu -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 14 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 15(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 15 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 14(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 12 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 13(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 13 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 12(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 10 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 11(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 11 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 10(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 8 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 9(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 9 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 8(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 6 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 7(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 7 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 6(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 4 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 5(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 5 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 4(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 2 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 3(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 3 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 2(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v26, 1 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 0(sp) -; LMULMAX1-NEXT: vmv.x.s a2, v25 -; LMULMAX1-NEXT: sb a2, 17(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 14 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 31(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 15 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 30(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 12 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 29(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 13 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 28(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 10 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 27(sp) -; 
LMULMAX1-NEXT: vslidedown.vi v26, v25, 11 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 26(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 8 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 25(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 9 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 24(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 6 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 23(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 7 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 22(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 4 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 21(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 5 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 20(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 2 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 19(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 3 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 18(sp) -; LMULMAX1-NEXT: vslidedown.vi v25, v25, 1 -; LMULMAX1-NEXT: vmv.x.s a2, v25 -; LMULMAX1-NEXT: sb a2, 16(sp) -; LMULMAX1-NEXT: vsetivli a2, 16, e8,m1,ta,mu -; LMULMAX1-NEXT: addi a2, sp, 16 -; LMULMAX1-NEXT: vle8.v v25, (a2) -; LMULMAX1-NEXT: vle8.v v26, (sp) -; LMULMAX1-NEXT: vsetivli a2, 8, e16,m1,ta,mu -; LMULMAX1-NEXT: vse16.v v25, (a0) -; LMULMAX1-NEXT: vse16.v v26, (a1) -; LMULMAX1-NEXT: addi sp, sp, 32 -; LMULMAX1-NEXT: ret +; LMULMAX1-RV32-LABEL: bswap_v16i16: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi sp, sp, -32 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: addi a1, a0, 16 +; LMULMAX1-RV32-NEXT: vle16.v v26, (a1) +; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV32-NEXT: slli a3, a2, 8 +; LMULMAX1-RV32-NEXT: slli a2, a2, 16 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: or a2, a3, a2 +; LMULMAX1-RV32-NEXT: sh a2, 16(sp) +; LMULMAX1-RV32-NEXT: vsetivli a2, 1, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 7 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 +; LMULMAX1-RV32-NEXT: slli a3, a2, 8 +; LMULMAX1-RV32-NEXT: slli a2, a2, 16 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: or a2, a3, a2 +; LMULMAX1-RV32-NEXT: sh a2, 30(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 6 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 +; LMULMAX1-RV32-NEXT: slli a3, a2, 8 +; LMULMAX1-RV32-NEXT: slli a2, a2, 16 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: or a2, a3, a2 +; LMULMAX1-RV32-NEXT: sh a2, 28(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 5 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 +; LMULMAX1-RV32-NEXT: slli a3, a2, 8 +; LMULMAX1-RV32-NEXT: slli a2, a2, 16 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: or a2, a3, a2 +; LMULMAX1-RV32-NEXT: sh a2, 26(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 4 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 +; LMULMAX1-RV32-NEXT: slli a3, a2, 8 +; LMULMAX1-RV32-NEXT: slli a2, a2, 16 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: or a2, a3, a2 +; LMULMAX1-RV32-NEXT: sh a2, 24(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 3 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 +; LMULMAX1-RV32-NEXT: slli a3, a2, 8 +; LMULMAX1-RV32-NEXT: slli a2, a2, 16 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: or a2, a3, a2 +; LMULMAX1-RV32-NEXT: sh a2, 22(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 2 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v27 +; LMULMAX1-RV32-NEXT: slli a3, a2, 8 +; 
LMULMAX1-RV32-NEXT: slli a2, a2, 16 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: or a2, a3, a2 +; LMULMAX1-RV32-NEXT: sh a2, 20(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v26, 1 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV32-NEXT: slli a3, a2, 8 +; LMULMAX1-RV32-NEXT: slli a2, a2, 16 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: or a2, a3, a2 +; LMULMAX1-RV32-NEXT: sh a2, 18(sp) +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v25 +; LMULMAX1-RV32-NEXT: slli a3, a2, 8 +; LMULMAX1-RV32-NEXT: slli a2, a2, 16 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: or a2, a3, a2 +; LMULMAX1-RV32-NEXT: sh a2, 0(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 7 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV32-NEXT: slli a3, a2, 8 +; LMULMAX1-RV32-NEXT: slli a2, a2, 16 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: or a2, a3, a2 +; LMULMAX1-RV32-NEXT: sh a2, 14(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 6 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV32-NEXT: slli a3, a2, 8 +; LMULMAX1-RV32-NEXT: slli a2, a2, 16 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: or a2, a3, a2 +; LMULMAX1-RV32-NEXT: sh a2, 12(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 5 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV32-NEXT: slli a3, a2, 8 +; LMULMAX1-RV32-NEXT: slli a2, a2, 16 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: or a2, a3, a2 +; LMULMAX1-RV32-NEXT: sh a2, 10(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 4 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV32-NEXT: slli a3, a2, 8 +; LMULMAX1-RV32-NEXT: slli a2, a2, 16 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: or a2, a3, a2 +; LMULMAX1-RV32-NEXT: sh a2, 8(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV32-NEXT: slli a3, a2, 8 +; LMULMAX1-RV32-NEXT: slli a2, a2, 16 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: or a2, a3, a2 +; LMULMAX1-RV32-NEXT: sh a2, 6(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV32-NEXT: slli a3, a2, 8 +; LMULMAX1-RV32-NEXT: slli a2, a2, 16 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: or a2, a3, a2 +; LMULMAX1-RV32-NEXT: sh a2, 4(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX1-RV32-NEXT: vmv.x.s a2, v25 +; LMULMAX1-RV32-NEXT: slli a3, a2, 8 +; LMULMAX1-RV32-NEXT: slli a2, a2, 16 +; LMULMAX1-RV32-NEXT: srli a2, a2, 24 +; LMULMAX1-RV32-NEXT: or a2, a3, a2 +; LMULMAX1-RV32-NEXT: sh a2, 2(sp) +; LMULMAX1-RV32-NEXT: vsetivli a2, 8, e16,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle16.v v25, (sp) +; LMULMAX1-RV32-NEXT: addi a2, sp, 16 +; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse16.v v26, (a1) +; LMULMAX1-RV32-NEXT: addi sp, sp, 32 +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: bswap_v16i16: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi sp, sp, -32 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV64-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: addi a1, a0, 16 +; LMULMAX1-RV64-NEXT: vle16.v v26, (a1) +; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: slli a3, a2, 8 +; LMULMAX1-RV64-NEXT: slliw a2, a2, 16 +; LMULMAX1-RV64-NEXT: srliw a2, a2, 24 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: sh a2, 16(sp) +; LMULMAX1-RV64-NEXT: vsetivli a2, 1, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: 
vslidedown.vi v27, v26, 7 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 +; LMULMAX1-RV64-NEXT: slli a3, a2, 8 +; LMULMAX1-RV64-NEXT: slliw a2, a2, 16 +; LMULMAX1-RV64-NEXT: srliw a2, a2, 24 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: sh a2, 30(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 6 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 +; LMULMAX1-RV64-NEXT: slli a3, a2, 8 +; LMULMAX1-RV64-NEXT: slliw a2, a2, 16 +; LMULMAX1-RV64-NEXT: srliw a2, a2, 24 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: sh a2, 28(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 5 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 +; LMULMAX1-RV64-NEXT: slli a3, a2, 8 +; LMULMAX1-RV64-NEXT: slliw a2, a2, 16 +; LMULMAX1-RV64-NEXT: srliw a2, a2, 24 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: sh a2, 26(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 4 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 +; LMULMAX1-RV64-NEXT: slli a3, a2, 8 +; LMULMAX1-RV64-NEXT: slliw a2, a2, 16 +; LMULMAX1-RV64-NEXT: srliw a2, a2, 24 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: sh a2, 24(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 3 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 +; LMULMAX1-RV64-NEXT: slli a3, a2, 8 +; LMULMAX1-RV64-NEXT: slliw a2, a2, 16 +; LMULMAX1-RV64-NEXT: srliw a2, a2, 24 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: sh a2, 22(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 2 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v27 +; LMULMAX1-RV64-NEXT: slli a3, a2, 8 +; LMULMAX1-RV64-NEXT: slliw a2, a2, 16 +; LMULMAX1-RV64-NEXT: srliw a2, a2, 24 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: sh a2, 20(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: slli a3, a2, 8 +; LMULMAX1-RV64-NEXT: slliw a2, a2, 16 +; LMULMAX1-RV64-NEXT: srliw a2, a2, 24 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: sh a2, 18(sp) +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 +; LMULMAX1-RV64-NEXT: slli a3, a2, 8 +; LMULMAX1-RV64-NEXT: slliw a2, a2, 16 +; LMULMAX1-RV64-NEXT: srliw a2, a2, 24 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: sh a2, 0(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 7 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: slli a3, a2, 8 +; LMULMAX1-RV64-NEXT: slliw a2, a2, 16 +; LMULMAX1-RV64-NEXT: srliw a2, a2, 24 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: sh a2, 14(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 6 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: slli a3, a2, 8 +; LMULMAX1-RV64-NEXT: slliw a2, a2, 16 +; LMULMAX1-RV64-NEXT: srliw a2, a2, 24 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: sh a2, 12(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 5 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: slli a3, a2, 8 +; LMULMAX1-RV64-NEXT: slliw a2, a2, 16 +; LMULMAX1-RV64-NEXT: srliw a2, a2, 24 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: sh a2, 10(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 4 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: slli a3, a2, 8 +; LMULMAX1-RV64-NEXT: slliw a2, a2, 16 +; LMULMAX1-RV64-NEXT: srliw a2, a2, 24 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: sh a2, 8(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: slli a3, a2, 8 +; LMULMAX1-RV64-NEXT: slliw a2, a2, 16 +; LMULMAX1-RV64-NEXT: srliw a2, a2, 24 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: sh 
a2, 6(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: slli a3, a2, 8 +; LMULMAX1-RV64-NEXT: slliw a2, a2, 16 +; LMULMAX1-RV64-NEXT: srliw a2, a2, 24 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: sh a2, 4(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v25 +; LMULMAX1-RV64-NEXT: slli a3, a2, 8 +; LMULMAX1-RV64-NEXT: slliw a2, a2, 16 +; LMULMAX1-RV64-NEXT: srliw a2, a2, 24 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: sh a2, 2(sp) +; LMULMAX1-RV64-NEXT: vsetivli a2, 8, e16,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle16.v v25, (sp) +; LMULMAX1-RV64-NEXT: addi a2, sp, 16 +; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) +; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse16.v v26, (a1) +; LMULMAX1-RV64-NEXT: addi sp, sp, 32 +; LMULMAX1-RV64-NEXT: ret %a = load <16 x i16>, <16 x i16>* %x %b = load <16 x i16>, <16 x i16>* %y %c = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %a) @@ -592,107 +1364,107 @@ define void @bswap_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX2-RV32-NEXT: andi sp, sp, -32 ; LMULMAX2-RV32-NEXT: vsetivli a1, 8, e32,m2,ta,mu ; LMULMAX2-RV32-NEXT: vle32.v v26, (a0) -; LMULMAX2-RV32-NEXT: vsetvli zero, zero, e8,m2,ta,mu -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: sb a1, 3(sp) -; LMULMAX2-RV32-NEXT: vsetivli a1, 1, e8,m2,ta,mu -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 28 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 31(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 29 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 30(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 30 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 29(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 31 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 28(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 24 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 27(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 25 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 26(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 26 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 25(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 27 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 24(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 20 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 23(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 21 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 22(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 22 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 21(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 23 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 20(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 16 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 19(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 17 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 18(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 18 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 17(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 19 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 16(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 12 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 15(sp) 
-; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 13 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 14(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 14 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 13(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 15 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 12(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 8 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 11(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 9 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 10(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 10 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 9(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 11 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 8(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 4 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 7(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 5 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 6(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 6 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 5(sp) +; LMULMAX2-RV32-NEXT: vmv.x.s a3, v26 +; LMULMAX2-RV32-NEXT: srli a2, a3, 8 +; LMULMAX2-RV32-NEXT: lui a1, 16 +; LMULMAX2-RV32-NEXT: addi a1, a1, -256 +; LMULMAX2-RV32-NEXT: and a2, a2, a1 +; LMULMAX2-RV32-NEXT: srli a4, a3, 24 +; LMULMAX2-RV32-NEXT: or a4, a2, a4 +; LMULMAX2-RV32-NEXT: slli a5, a3, 8 +; LMULMAX2-RV32-NEXT: lui a2, 4080 +; LMULMAX2-RV32-NEXT: and a5, a5, a2 +; LMULMAX2-RV32-NEXT: slli a3, a3, 24 +; LMULMAX2-RV32-NEXT: or a3, a3, a5 +; LMULMAX2-RV32-NEXT: or a3, a3, a4 +; LMULMAX2-RV32-NEXT: sw a3, 0(sp) +; LMULMAX2-RV32-NEXT: vsetivli a3, 1, e32,m2,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 7 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 4(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 1 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 2(sp) +; LMULMAX2-RV32-NEXT: vmv.x.s a3, v28 +; LMULMAX2-RV32-NEXT: srli a4, a3, 8 +; LMULMAX2-RV32-NEXT: and a4, a4, a1 +; LMULMAX2-RV32-NEXT: srli a5, a3, 24 +; LMULMAX2-RV32-NEXT: or a4, a4, a5 +; LMULMAX2-RV32-NEXT: slli a5, a3, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a2 +; LMULMAX2-RV32-NEXT: slli a3, a3, 24 +; LMULMAX2-RV32-NEXT: or a3, a3, a5 +; LMULMAX2-RV32-NEXT: or a3, a3, a4 +; LMULMAX2-RV32-NEXT: sw a3, 28(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 6 +; LMULMAX2-RV32-NEXT: vmv.x.s a3, v28 +; LMULMAX2-RV32-NEXT: srli a4, a3, 8 +; LMULMAX2-RV32-NEXT: and a4, a4, a1 +; LMULMAX2-RV32-NEXT: srli a5, a3, 24 +; LMULMAX2-RV32-NEXT: or a4, a4, a5 +; LMULMAX2-RV32-NEXT: slli a5, a3, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a2 +; LMULMAX2-RV32-NEXT: slli a3, a3, 24 +; LMULMAX2-RV32-NEXT: or a3, a3, a5 +; LMULMAX2-RV32-NEXT: or a3, a3, a4 +; LMULMAX2-RV32-NEXT: sw a3, 24(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 5 +; LMULMAX2-RV32-NEXT: vmv.x.s a3, v28 +; LMULMAX2-RV32-NEXT: srli a4, a3, 8 +; LMULMAX2-RV32-NEXT: and a4, a4, a1 +; LMULMAX2-RV32-NEXT: srli a5, a3, 24 +; LMULMAX2-RV32-NEXT: or a4, a4, a5 +; LMULMAX2-RV32-NEXT: slli a5, a3, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a2 +; LMULMAX2-RV32-NEXT: slli a3, a3, 24 +; LMULMAX2-RV32-NEXT: or a3, a3, a5 +; LMULMAX2-RV32-NEXT: or a3, a3, a4 +; LMULMAX2-RV32-NEXT: sw a3, 20(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 4 +; LMULMAX2-RV32-NEXT: vmv.x.s a3, v28 +; LMULMAX2-RV32-NEXT: srli a4, a3, 8 +; 
LMULMAX2-RV32-NEXT: and a4, a4, a1 +; LMULMAX2-RV32-NEXT: srli a5, a3, 24 +; LMULMAX2-RV32-NEXT: or a4, a4, a5 +; LMULMAX2-RV32-NEXT: slli a5, a3, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a2 +; LMULMAX2-RV32-NEXT: slli a3, a3, 24 +; LMULMAX2-RV32-NEXT: or a3, a3, a5 +; LMULMAX2-RV32-NEXT: or a3, a3, a4 +; LMULMAX2-RV32-NEXT: sw a3, 16(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 +; LMULMAX2-RV32-NEXT: vmv.x.s a3, v28 +; LMULMAX2-RV32-NEXT: srli a4, a3, 8 +; LMULMAX2-RV32-NEXT: and a4, a4, a1 +; LMULMAX2-RV32-NEXT: srli a5, a3, 24 +; LMULMAX2-RV32-NEXT: or a4, a4, a5 +; LMULMAX2-RV32-NEXT: slli a5, a3, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a2 +; LMULMAX2-RV32-NEXT: slli a3, a3, 24 +; LMULMAX2-RV32-NEXT: or a3, a3, a5 +; LMULMAX2-RV32-NEXT: or a3, a3, a4 +; LMULMAX2-RV32-NEXT: sw a3, 12(sp) ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 2 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 1(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 3 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: sb a1, 0(sp) -; LMULMAX2-RV32-NEXT: addi a1, zero, 32 -; LMULMAX2-RV32-NEXT: vsetvli a1, a1, e8,m2,ta,mu -; LMULMAX2-RV32-NEXT: vle8.v v26, (sp) +; LMULMAX2-RV32-NEXT: vmv.x.s a3, v28 +; LMULMAX2-RV32-NEXT: srli a4, a3, 8 +; LMULMAX2-RV32-NEXT: and a4, a4, a1 +; LMULMAX2-RV32-NEXT: srli a5, a3, 24 +; LMULMAX2-RV32-NEXT: or a4, a4, a5 +; LMULMAX2-RV32-NEXT: slli a5, a3, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a2 +; LMULMAX2-RV32-NEXT: slli a3, a3, 24 +; LMULMAX2-RV32-NEXT: or a3, a3, a5 +; LMULMAX2-RV32-NEXT: or a3, a3, a4 +; LMULMAX2-RV32-NEXT: sw a3, 8(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 1 +; LMULMAX2-RV32-NEXT: vmv.x.s a3, v26 +; LMULMAX2-RV32-NEXT: srli a4, a3, 8 +; LMULMAX2-RV32-NEXT: and a1, a4, a1 +; LMULMAX2-RV32-NEXT: srli a4, a3, 24 +; LMULMAX2-RV32-NEXT: or a1, a1, a4 +; LMULMAX2-RV32-NEXT: slli a4, a3, 8 +; LMULMAX2-RV32-NEXT: and a2, a4, a2 +; LMULMAX2-RV32-NEXT: slli a3, a3, 24 +; LMULMAX2-RV32-NEXT: or a2, a3, a2 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sw a1, 4(sp) ; LMULMAX2-RV32-NEXT: vsetivli a1, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v26, (sp) ; LMULMAX2-RV32-NEXT: vse32.v v26, (a0) ; LMULMAX2-RV32-NEXT: addi sp, s0, -64 ; LMULMAX2-RV32-NEXT: lw s0, 56(sp) # 4-byte Folded Reload @@ -713,107 +1485,107 @@ define void @bswap_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX2-RV64-NEXT: andi sp, sp, -32 ; LMULMAX2-RV64-NEXT: vsetivli a1, 8, e32,m2,ta,mu ; LMULMAX2-RV64-NEXT: vle32.v v26, (a0) -; LMULMAX2-RV64-NEXT: vsetvli zero, zero, e8,m2,ta,mu -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: sb a1, 3(sp) -; LMULMAX2-RV64-NEXT: vsetivli a1, 1, e8,m2,ta,mu -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 28 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 31(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 29 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 30(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 30 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 29(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 31 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 28(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 24 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 27(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 25 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 26(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 26 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: 
sb a1, 25(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 27 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 24(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 20 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 23(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 21 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 22(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 22 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 21(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 23 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 20(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 16 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 19(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 17 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 18(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 18 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 17(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 19 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 16(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 12 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 15(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 13 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 14(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 14 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 13(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 15 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 12(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 8 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 11(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 9 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 10(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 10 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 9(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 11 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 8(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 7(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 6(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 5(sp) +; LMULMAX2-RV64-NEXT: vmv.x.s a3, v26 +; LMULMAX2-RV64-NEXT: srliw a2, a3, 8 +; LMULMAX2-RV64-NEXT: lui a1, 16 +; LMULMAX2-RV64-NEXT: addiw a1, a1, -256 +; LMULMAX2-RV64-NEXT: and a2, a2, a1 +; LMULMAX2-RV64-NEXT: srliw a4, a3, 24 +; LMULMAX2-RV64-NEXT: or a4, a2, a4 +; LMULMAX2-RV64-NEXT: slli a5, a3, 8 +; LMULMAX2-RV64-NEXT: lui a2, 4080 +; LMULMAX2-RV64-NEXT: and a5, a5, a2 +; LMULMAX2-RV64-NEXT: slli a3, a3, 24 +; LMULMAX2-RV64-NEXT: or a3, a3, a5 +; LMULMAX2-RV64-NEXT: or a3, a3, a4 +; LMULMAX2-RV64-NEXT: sw a3, 0(sp) +; LMULMAX2-RV64-NEXT: vsetivli a3, 1, e32,m2,ta,mu ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 7 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 4(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 1 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 2(sp) +; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 +; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 +; LMULMAX2-RV64-NEXT: and a4, a4, a1 +; LMULMAX2-RV64-NEXT: srliw a5, a3, 24 +; LMULMAX2-RV64-NEXT: or a4, a4, a5 +; 
LMULMAX2-RV64-NEXT: slli a5, a3, 8 +; LMULMAX2-RV64-NEXT: and a5, a5, a2 +; LMULMAX2-RV64-NEXT: slli a3, a3, 24 +; LMULMAX2-RV64-NEXT: or a3, a3, a5 +; LMULMAX2-RV64-NEXT: or a3, a3, a4 +; LMULMAX2-RV64-NEXT: sw a3, 28(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 +; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 +; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 +; LMULMAX2-RV64-NEXT: and a4, a4, a1 +; LMULMAX2-RV64-NEXT: srliw a5, a3, 24 +; LMULMAX2-RV64-NEXT: or a4, a4, a5 +; LMULMAX2-RV64-NEXT: slli a5, a3, 8 +; LMULMAX2-RV64-NEXT: and a5, a5, a2 +; LMULMAX2-RV64-NEXT: slli a3, a3, 24 +; LMULMAX2-RV64-NEXT: or a3, a3, a5 +; LMULMAX2-RV64-NEXT: or a3, a3, a4 +; LMULMAX2-RV64-NEXT: sw a3, 24(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 +; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 +; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 +; LMULMAX2-RV64-NEXT: and a4, a4, a1 +; LMULMAX2-RV64-NEXT: srliw a5, a3, 24 +; LMULMAX2-RV64-NEXT: or a4, a4, a5 +; LMULMAX2-RV64-NEXT: slli a5, a3, 8 +; LMULMAX2-RV64-NEXT: and a5, a5, a2 +; LMULMAX2-RV64-NEXT: slli a3, a3, 24 +; LMULMAX2-RV64-NEXT: or a3, a3, a5 +; LMULMAX2-RV64-NEXT: or a3, a3, a4 +; LMULMAX2-RV64-NEXT: sw a3, 20(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 +; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 +; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 +; LMULMAX2-RV64-NEXT: and a4, a4, a1 +; LMULMAX2-RV64-NEXT: srliw a5, a3, 24 +; LMULMAX2-RV64-NEXT: or a4, a4, a5 +; LMULMAX2-RV64-NEXT: slli a5, a3, 8 +; LMULMAX2-RV64-NEXT: and a5, a5, a2 +; LMULMAX2-RV64-NEXT: slli a3, a3, 24 +; LMULMAX2-RV64-NEXT: or a3, a3, a5 +; LMULMAX2-RV64-NEXT: or a3, a3, a4 +; LMULMAX2-RV64-NEXT: sw a3, 16(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 +; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 +; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 +; LMULMAX2-RV64-NEXT: and a4, a4, a1 +; LMULMAX2-RV64-NEXT: srliw a5, a3, 24 +; LMULMAX2-RV64-NEXT: or a4, a4, a5 +; LMULMAX2-RV64-NEXT: slli a5, a3, 8 +; LMULMAX2-RV64-NEXT: and a5, a5, a2 +; LMULMAX2-RV64-NEXT: slli a3, a3, 24 +; LMULMAX2-RV64-NEXT: or a3, a3, a5 +; LMULMAX2-RV64-NEXT: or a3, a3, a4 +; LMULMAX2-RV64-NEXT: sw a3, 12(sp) ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 1(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 3 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: sb a1, 0(sp) -; LMULMAX2-RV64-NEXT: addi a1, zero, 32 -; LMULMAX2-RV64-NEXT: vsetvli a1, a1, e8,m2,ta,mu -; LMULMAX2-RV64-NEXT: vle8.v v26, (sp) +; LMULMAX2-RV64-NEXT: vmv.x.s a3, v28 +; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 +; LMULMAX2-RV64-NEXT: and a4, a4, a1 +; LMULMAX2-RV64-NEXT: srliw a5, a3, 24 +; LMULMAX2-RV64-NEXT: or a4, a4, a5 +; LMULMAX2-RV64-NEXT: slli a5, a3, 8 +; LMULMAX2-RV64-NEXT: and a5, a5, a2 +; LMULMAX2-RV64-NEXT: slli a3, a3, 24 +; LMULMAX2-RV64-NEXT: or a3, a3, a5 +; LMULMAX2-RV64-NEXT: or a3, a3, a4 +; LMULMAX2-RV64-NEXT: sw a3, 8(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 1 +; LMULMAX2-RV64-NEXT: vmv.x.s a3, v26 +; LMULMAX2-RV64-NEXT: srliw a4, a3, 8 +; LMULMAX2-RV64-NEXT: and a1, a4, a1 +; LMULMAX2-RV64-NEXT: srliw a4, a3, 24 +; LMULMAX2-RV64-NEXT: or a1, a1, a4 +; LMULMAX2-RV64-NEXT: slli a4, a3, 8 +; LMULMAX2-RV64-NEXT: and a2, a4, a2 +; LMULMAX2-RV64-NEXT: slli a3, a3, 24 +; LMULMAX2-RV64-NEXT: or a2, a3, a2 +; LMULMAX2-RV64-NEXT: or a1, a2, a1 +; LMULMAX2-RV64-NEXT: sw a1, 4(sp) ; LMULMAX2-RV64-NEXT: vsetivli a1, 8, e32,m2,ta,mu +; LMULMAX2-RV64-NEXT: vle32.v v26, (sp) ; LMULMAX2-RV64-NEXT: vse32.v v26, (a0) ; LMULMAX2-RV64-NEXT: addi sp, s0, -64 ; LMULMAX2-RV64-NEXT: ld 
s0, 48(sp) # 8-byte Folded Reload @@ -821,119 +1593,235 @@ define void @bswap_v8i32(<8 x i32>* %x, <8 x i32>* %y) { ; LMULMAX2-RV64-NEXT: addi sp, sp, 64 ; LMULMAX2-RV64-NEXT: ret ; -; LMULMAX1-LABEL: bswap_v8i32: -; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi sp, sp, -32 -; LMULMAX1-NEXT: .cfi_def_cfa_offset 32 -; LMULMAX1-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; LMULMAX1-NEXT: addi a1, a0, 16 -; LMULMAX1-NEXT: vle32.v v26, (a1) -; LMULMAX1-NEXT: vle32.v v25, (a0) -; LMULMAX1-NEXT: vsetvli zero, zero, e8,m1,ta,mu -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 3(sp) -; LMULMAX1-NEXT: vsetivli a2, 1, e8,m1,ta,mu -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 12 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 15(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 13 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 14(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 14 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 13(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 15 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 12(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 8 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 11(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 9 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 10(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 10 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 9(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 11 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 8(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 4 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 7(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 5 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 6(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 6 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 5(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 7 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 4(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 1 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 2(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 2 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 1(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v26, 3 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 0(sp) -; LMULMAX1-NEXT: vmv.x.s a2, v25 -; LMULMAX1-NEXT: sb a2, 19(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 12 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 31(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 13 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 30(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 14 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 29(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 15 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 28(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 8 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 27(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 9 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 26(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 10 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 25(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 11 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 24(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 4 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 23(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 5 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 22(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 6 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 21(sp) 
-; LMULMAX1-NEXT: vslidedown.vi v26, v25, 7 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 20(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 1 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 18(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 2 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 17(sp) -; LMULMAX1-NEXT: vslidedown.vi v25, v25, 3 -; LMULMAX1-NEXT: vmv.x.s a2, v25 -; LMULMAX1-NEXT: sb a2, 16(sp) -; LMULMAX1-NEXT: vsetivli a2, 16, e8,m1,ta,mu -; LMULMAX1-NEXT: addi a2, sp, 16 -; LMULMAX1-NEXT: vle8.v v25, (a2) -; LMULMAX1-NEXT: vle8.v v26, (sp) -; LMULMAX1-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-NEXT: vse32.v v25, (a0) -; LMULMAX1-NEXT: vse32.v v26, (a1) -; LMULMAX1-NEXT: addi sp, sp, 32 -; LMULMAX1-NEXT: ret +; LMULMAX1-RV32-LABEL: bswap_v8i32: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi sp, sp, -32 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: addi a6, a0, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a6) +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vmv.x.s a4, v26 +; LMULMAX1-RV32-NEXT: srli a3, a4, 8 +; LMULMAX1-RV32-NEXT: lui a2, 16 +; LMULMAX1-RV32-NEXT: addi a2, a2, -256 +; LMULMAX1-RV32-NEXT: and a3, a3, a2 +; LMULMAX1-RV32-NEXT: srli a5, a4, 24 +; LMULMAX1-RV32-NEXT: or a5, a3, a5 +; LMULMAX1-RV32-NEXT: slli a1, a4, 8 +; LMULMAX1-RV32-NEXT: lui a3, 4080 +; LMULMAX1-RV32-NEXT: and a1, a1, a3 +; LMULMAX1-RV32-NEXT: slli a4, a4, 24 +; LMULMAX1-RV32-NEXT: or a1, a4, a1 +; LMULMAX1-RV32-NEXT: or a1, a1, a5 +; LMULMAX1-RV32-NEXT: sw a1, 16(sp) +; LMULMAX1-RV32-NEXT: vsetivli a1, 1, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 3 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 +; LMULMAX1-RV32-NEXT: srli a4, a1, 8 +; LMULMAX1-RV32-NEXT: and a4, a4, a2 +; LMULMAX1-RV32-NEXT: srli a5, a1, 24 +; LMULMAX1-RV32-NEXT: or a4, a4, a5 +; LMULMAX1-RV32-NEXT: slli a5, a1, 8 +; LMULMAX1-RV32-NEXT: and a5, a5, a3 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a5 +; LMULMAX1-RV32-NEXT: or a1, a1, a4 +; LMULMAX1-RV32-NEXT: sw a1, 28(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 2 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 +; LMULMAX1-RV32-NEXT: srli a4, a1, 8 +; LMULMAX1-RV32-NEXT: and a4, a4, a2 +; LMULMAX1-RV32-NEXT: srli a5, a1, 24 +; LMULMAX1-RV32-NEXT: or a4, a4, a5 +; LMULMAX1-RV32-NEXT: slli a5, a1, 8 +; LMULMAX1-RV32-NEXT: and a5, a5, a3 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a5 +; LMULMAX1-RV32-NEXT: or a1, a1, a4 +; LMULMAX1-RV32-NEXT: sw a1, 24(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v26, 1 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: srli a4, a1, 8 +; LMULMAX1-RV32-NEXT: and a4, a4, a2 +; LMULMAX1-RV32-NEXT: srli a5, a1, 24 +; LMULMAX1-RV32-NEXT: or a4, a4, a5 +; LMULMAX1-RV32-NEXT: slli a5, a1, 8 +; LMULMAX1-RV32-NEXT: and a5, a5, a3 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a5 +; LMULMAX1-RV32-NEXT: or a1, a1, a4 +; LMULMAX1-RV32-NEXT: sw a1, 20(sp) +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV32-NEXT: srli a4, a1, 8 +; LMULMAX1-RV32-NEXT: and a4, a4, a2 +; LMULMAX1-RV32-NEXT: srli a5, a1, 24 +; LMULMAX1-RV32-NEXT: or a4, a4, a5 +; LMULMAX1-RV32-NEXT: slli a5, a1, 8 +; LMULMAX1-RV32-NEXT: and a5, a5, a3 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a5 +; LMULMAX1-RV32-NEXT: or a1, a1, a4 +; LMULMAX1-RV32-NEXT: sw a1, 0(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 3 +; LMULMAX1-RV32-NEXT: 
vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: srli a4, a1, 8 +; LMULMAX1-RV32-NEXT: and a4, a4, a2 +; LMULMAX1-RV32-NEXT: srli a5, a1, 24 +; LMULMAX1-RV32-NEXT: or a4, a4, a5 +; LMULMAX1-RV32-NEXT: slli a5, a1, 8 +; LMULMAX1-RV32-NEXT: and a5, a5, a3 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a5 +; LMULMAX1-RV32-NEXT: or a1, a1, a4 +; LMULMAX1-RV32-NEXT: sw a1, 12(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 2 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: srli a4, a1, 8 +; LMULMAX1-RV32-NEXT: and a4, a4, a2 +; LMULMAX1-RV32-NEXT: srli a5, a1, 24 +; LMULMAX1-RV32-NEXT: or a4, a4, a5 +; LMULMAX1-RV32-NEXT: slli a5, a1, 8 +; LMULMAX1-RV32-NEXT: and a5, a5, a3 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a5 +; LMULMAX1-RV32-NEXT: or a1, a1, a4 +; LMULMAX1-RV32-NEXT: sw a1, 8(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV32-NEXT: srli a4, a1, 8 +; LMULMAX1-RV32-NEXT: and a2, a4, a2 +; LMULMAX1-RV32-NEXT: srli a4, a1, 24 +; LMULMAX1-RV32-NEXT: or a2, a2, a4 +; LMULMAX1-RV32-NEXT: slli a4, a1, 8 +; LMULMAX1-RV32-NEXT: and a3, a4, a3 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a3 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: sw a1, 4(sp) +; LMULMAX1-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (sp) +; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v26, (a6) +; LMULMAX1-RV32-NEXT: addi sp, sp, 32 +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: bswap_v8i32: +; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi sp, sp, -32 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV64-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: addi a6, a0, 16 +; LMULMAX1-RV64-NEXT: vle32.v v26, (a6) +; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vmv.x.s a4, v26 +; LMULMAX1-RV64-NEXT: srliw a3, a4, 8 +; LMULMAX1-RV64-NEXT: lui a2, 16 +; LMULMAX1-RV64-NEXT: addiw a2, a2, -256 +; LMULMAX1-RV64-NEXT: and a3, a3, a2 +; LMULMAX1-RV64-NEXT: srliw a5, a4, 24 +; LMULMAX1-RV64-NEXT: or a5, a3, a5 +; LMULMAX1-RV64-NEXT: slli a1, a4, 8 +; LMULMAX1-RV64-NEXT: lui a3, 4080 +; LMULMAX1-RV64-NEXT: and a1, a1, a3 +; LMULMAX1-RV64-NEXT: slli a4, a4, 24 +; LMULMAX1-RV64-NEXT: or a1, a4, a1 +; LMULMAX1-RV64-NEXT: or a1, a1, a5 +; LMULMAX1-RV64-NEXT: sw a1, 16(sp) +; LMULMAX1-RV64-NEXT: vsetivli a1, 1, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 3 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 +; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 +; LMULMAX1-RV64-NEXT: and a4, a4, a2 +; LMULMAX1-RV64-NEXT: srliw a5, a1, 24 +; LMULMAX1-RV64-NEXT: or a4, a4, a5 +; LMULMAX1-RV64-NEXT: slli a5, a1, 8 +; LMULMAX1-RV64-NEXT: and a5, a5, a3 +; LMULMAX1-RV64-NEXT: slli a1, a1, 24 +; LMULMAX1-RV64-NEXT: or a1, a1, a5 +; LMULMAX1-RV64-NEXT: or a1, a1, a4 +; LMULMAX1-RV64-NEXT: sw a1, 28(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v26, 2 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v27 +; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 +; LMULMAX1-RV64-NEXT: and a4, a4, a2 +; LMULMAX1-RV64-NEXT: srliw a5, a1, 24 +; LMULMAX1-RV64-NEXT: or a4, a4, a5 +; LMULMAX1-RV64-NEXT: slli a5, a1, 8 +; LMULMAX1-RV64-NEXT: and a5, a5, a3 +; LMULMAX1-RV64-NEXT: slli a1, a1, 24 +; LMULMAX1-RV64-NEXT: or a1, a1, a5 +; LMULMAX1-RV64-NEXT: or a1, a1, a4 +; LMULMAX1-RV64-NEXT: sw a1, 24(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1 +; 
LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 +; LMULMAX1-RV64-NEXT: and a4, a4, a2 +; LMULMAX1-RV64-NEXT: srliw a5, a1, 24 +; LMULMAX1-RV64-NEXT: or a4, a4, a5 +; LMULMAX1-RV64-NEXT: slli a5, a1, 8 +; LMULMAX1-RV64-NEXT: and a5, a5, a3 +; LMULMAX1-RV64-NEXT: slli a1, a1, 24 +; LMULMAX1-RV64-NEXT: or a1, a1, a5 +; LMULMAX1-RV64-NEXT: or a1, a1, a4 +; LMULMAX1-RV64-NEXT: sw a1, 20(sp) +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 +; LMULMAX1-RV64-NEXT: and a4, a4, a2 +; LMULMAX1-RV64-NEXT: srliw a5, a1, 24 +; LMULMAX1-RV64-NEXT: or a4, a4, a5 +; LMULMAX1-RV64-NEXT: slli a5, a1, 8 +; LMULMAX1-RV64-NEXT: and a5, a5, a3 +; LMULMAX1-RV64-NEXT: slli a1, a1, 24 +; LMULMAX1-RV64-NEXT: or a1, a1, a5 +; LMULMAX1-RV64-NEXT: or a1, a1, a4 +; LMULMAX1-RV64-NEXT: sw a1, 0(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 3 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 +; LMULMAX1-RV64-NEXT: and a4, a4, a2 +; LMULMAX1-RV64-NEXT: srliw a5, a1, 24 +; LMULMAX1-RV64-NEXT: or a4, a4, a5 +; LMULMAX1-RV64-NEXT: slli a5, a1, 8 +; LMULMAX1-RV64-NEXT: and a5, a5, a3 +; LMULMAX1-RV64-NEXT: slli a1, a1, 24 +; LMULMAX1-RV64-NEXT: or a1, a1, a5 +; LMULMAX1-RV64-NEXT: or a1, a1, a4 +; LMULMAX1-RV64-NEXT: sw a1, 12(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 2 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 +; LMULMAX1-RV64-NEXT: and a4, a4, a2 +; LMULMAX1-RV64-NEXT: srliw a5, a1, 24 +; LMULMAX1-RV64-NEXT: or a4, a4, a5 +; LMULMAX1-RV64-NEXT: slli a5, a1, 8 +; LMULMAX1-RV64-NEXT: and a5, a5, a3 +; LMULMAX1-RV64-NEXT: slli a1, a1, 24 +; LMULMAX1-RV64-NEXT: or a1, a1, a5 +; LMULMAX1-RV64-NEXT: or a1, a1, a4 +; LMULMAX1-RV64-NEXT: sw a1, 8(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV64-NEXT: srliw a4, a1, 8 +; LMULMAX1-RV64-NEXT: and a2, a4, a2 +; LMULMAX1-RV64-NEXT: srliw a4, a1, 24 +; LMULMAX1-RV64-NEXT: or a2, a2, a4 +; LMULMAX1-RV64-NEXT: slli a4, a1, 8 +; LMULMAX1-RV64-NEXT: and a3, a4, a3 +; LMULMAX1-RV64-NEXT: slli a1, a1, 24 +; LMULMAX1-RV64-NEXT: or a1, a1, a3 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: sw a1, 4(sp) +; LMULMAX1-RV64-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle32.v v25, (sp) +; LMULMAX1-RV64-NEXT: addi a1, sp, 16 +; LMULMAX1-RV64-NEXT: vle32.v v26, (a1) +; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse32.v v26, (a6) +; LMULMAX1-RV64-NEXT: addi sp, sp, 32 +; LMULMAX1-RV64-NEXT: ret %a = load <8 x i32>, <8 x i32>* %x %b = load <8 x i32>, <8 x i32>* %y %c = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %a) @@ -956,106 +1844,108 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV32-NEXT: andi sp, sp, -32 ; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu ; LMULMAX2-RV32-NEXT: vle64.v v26, (a0) -; LMULMAX2-RV32-NEXT: vsetvli zero, zero, e8,m2,ta,mu -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: sb a1, 7(sp) -; LMULMAX2-RV32-NEXT: vsetivli a1, 1, e8,m2,ta,mu -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 24 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 31(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 25 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 30(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 26 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 29(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 27 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; 
LMULMAX2-RV32-NEXT: sb a1, 28(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 28 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 27(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 29 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 26(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 30 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 25(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 31 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 24(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 16 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 23(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 17 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 22(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 18 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 21(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 19 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 20(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 20 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 19(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 21 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 18(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 22 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 17(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 23 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 16(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 8 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 15(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 9 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 14(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 10 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 13(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 11 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 12(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 12 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 11(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 13 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 10(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 14 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 9(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 15 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 8(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 1 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 6(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 2 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 5(sp) +; LMULMAX2-RV32-NEXT: vmv.x.s a3, v26 +; LMULMAX2-RV32-NEXT: srli a2, a3, 8 +; LMULMAX2-RV32-NEXT: lui a1, 16 +; LMULMAX2-RV32-NEXT: addi a1, a1, -256 +; LMULMAX2-RV32-NEXT: and a2, a2, a1 +; LMULMAX2-RV32-NEXT: srli a4, a3, 24 +; LMULMAX2-RV32-NEXT: or a4, a2, a4 +; LMULMAX2-RV32-NEXT: slli a5, a3, 8 +; LMULMAX2-RV32-NEXT: lui a6, 4080 +; LMULMAX2-RV32-NEXT: and a5, a5, a6 +; LMULMAX2-RV32-NEXT: slli a3, a3, 24 +; LMULMAX2-RV32-NEXT: or a3, a3, a5 +; LMULMAX2-RV32-NEXT: or a3, a3, a4 +; LMULMAX2-RV32-NEXT: sw a3, 4(sp) +; LMULMAX2-RV32-NEXT: vsetivli a3, 1, e64,m2,ta,mu ; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 3 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 4(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 4 -; 
LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 3(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 5 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 2(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v28, v26, 6 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV32-NEXT: sb a1, 1(sp) -; LMULMAX2-RV32-NEXT: vslidedown.vi v26, v26, 7 -; LMULMAX2-RV32-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV32-NEXT: sb a1, 0(sp) -; LMULMAX2-RV32-NEXT: addi a1, zero, 32 -; LMULMAX2-RV32-NEXT: vsetvli a1, a1, e8,m2,ta,mu -; LMULMAX2-RV32-NEXT: vle8.v v26, (sp) +; LMULMAX2-RV32-NEXT: vmv.x.s a3, v28 +; LMULMAX2-RV32-NEXT: srli a4, a3, 8 +; LMULMAX2-RV32-NEXT: and a4, a4, a1 +; LMULMAX2-RV32-NEXT: srli a5, a3, 24 +; LMULMAX2-RV32-NEXT: or a4, a4, a5 +; LMULMAX2-RV32-NEXT: slli a5, a3, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a6 +; LMULMAX2-RV32-NEXT: slli a3, a3, 24 +; LMULMAX2-RV32-NEXT: or a3, a3, a5 +; LMULMAX2-RV32-NEXT: or a3, a3, a4 +; LMULMAX2-RV32-NEXT: sw a3, 28(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v30, v26, 2 +; LMULMAX2-RV32-NEXT: vmv.x.s a3, v30 +; LMULMAX2-RV32-NEXT: srli a4, a3, 8 +; LMULMAX2-RV32-NEXT: and a4, a4, a1 +; LMULMAX2-RV32-NEXT: srli a5, a3, 24 +; LMULMAX2-RV32-NEXT: or a4, a4, a5 +; LMULMAX2-RV32-NEXT: slli a5, a3, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a6 +; LMULMAX2-RV32-NEXT: slli a3, a3, 24 +; LMULMAX2-RV32-NEXT: or a3, a3, a5 +; LMULMAX2-RV32-NEXT: or a3, a3, a4 +; LMULMAX2-RV32-NEXT: sw a3, 20(sp) +; LMULMAX2-RV32-NEXT: vslidedown.vi v8, v26, 1 +; LMULMAX2-RV32-NEXT: vmv.x.s a3, v8 +; LMULMAX2-RV32-NEXT: srli a4, a3, 8 +; LMULMAX2-RV32-NEXT: and a4, a4, a1 +; LMULMAX2-RV32-NEXT: srli a5, a3, 24 +; LMULMAX2-RV32-NEXT: or a4, a4, a5 +; LMULMAX2-RV32-NEXT: slli a5, a3, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a6 +; LMULMAX2-RV32-NEXT: slli a3, a3, 24 +; LMULMAX2-RV32-NEXT: or a3, a3, a5 +; LMULMAX2-RV32-NEXT: or a3, a3, a4 +; LMULMAX2-RV32-NEXT: sw a3, 12(sp) +; LMULMAX2-RV32-NEXT: addi a3, zero, 32 +; LMULMAX2-RV32-NEXT: vsrl.vx v26, v26, a3 +; LMULMAX2-RV32-NEXT: vmv.x.s a4, v26 +; LMULMAX2-RV32-NEXT: srli a5, a4, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a1 +; LMULMAX2-RV32-NEXT: srli a2, a4, 24 +; LMULMAX2-RV32-NEXT: or a2, a5, a2 +; LMULMAX2-RV32-NEXT: slli a5, a4, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a6 +; LMULMAX2-RV32-NEXT: slli a4, a4, 24 +; LMULMAX2-RV32-NEXT: or a4, a4, a5 +; LMULMAX2-RV32-NEXT: or a2, a4, a2 +; LMULMAX2-RV32-NEXT: sw a2, 0(sp) +; LMULMAX2-RV32-NEXT: vsrl.vx v26, v28, a3 +; LMULMAX2-RV32-NEXT: vmv.x.s a2, v26 +; LMULMAX2-RV32-NEXT: srli a4, a2, 8 +; LMULMAX2-RV32-NEXT: and a4, a4, a1 +; LMULMAX2-RV32-NEXT: srli a5, a2, 24 +; LMULMAX2-RV32-NEXT: or a4, a4, a5 +; LMULMAX2-RV32-NEXT: slli a5, a2, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a6 +; LMULMAX2-RV32-NEXT: slli a2, a2, 24 +; LMULMAX2-RV32-NEXT: or a2, a2, a5 +; LMULMAX2-RV32-NEXT: or a2, a2, a4 +; LMULMAX2-RV32-NEXT: sw a2, 24(sp) +; LMULMAX2-RV32-NEXT: vsrl.vx v26, v30, a3 +; LMULMAX2-RV32-NEXT: vmv.x.s a2, v26 +; LMULMAX2-RV32-NEXT: srli a4, a2, 8 +; LMULMAX2-RV32-NEXT: and a4, a4, a1 +; LMULMAX2-RV32-NEXT: srli a5, a2, 24 +; LMULMAX2-RV32-NEXT: or a4, a4, a5 +; LMULMAX2-RV32-NEXT: slli a5, a2, 8 +; LMULMAX2-RV32-NEXT: and a5, a5, a6 +; LMULMAX2-RV32-NEXT: slli a2, a2, 24 +; LMULMAX2-RV32-NEXT: or a2, a2, a5 +; LMULMAX2-RV32-NEXT: or a2, a2, a4 +; LMULMAX2-RV32-NEXT: sw a2, 16(sp) +; LMULMAX2-RV32-NEXT: vsrl.vx v26, v8, a3 +; LMULMAX2-RV32-NEXT: vmv.x.s a2, v26 +; LMULMAX2-RV32-NEXT: srli a3, a2, 8 +; LMULMAX2-RV32-NEXT: and a1, a3, a1 +; LMULMAX2-RV32-NEXT: srli a3, a2, 24 +; 
LMULMAX2-RV32-NEXT: or a1, a1, a3 +; LMULMAX2-RV32-NEXT: slli a3, a2, 8 +; LMULMAX2-RV32-NEXT: and a3, a3, a6 +; LMULMAX2-RV32-NEXT: slli a2, a2, 24 +; LMULMAX2-RV32-NEXT: or a2, a2, a3 +; LMULMAX2-RV32-NEXT: or a1, a2, a1 +; LMULMAX2-RV32-NEXT: sw a1, 8(sp) +; LMULMAX2-RV32-NEXT: vsetivli a1, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v26, (sp) ; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu ; LMULMAX2-RV32-NEXT: vse64.v v26, (a0) ; LMULMAX2-RV32-NEXT: addi sp, s0, -64 @@ -1077,107 +1967,112 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV64-NEXT: andi sp, sp, -32 ; LMULMAX2-RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu ; LMULMAX2-RV64-NEXT: vle64.v v26, (a0) -; LMULMAX2-RV64-NEXT: vsetvli zero, zero, e8,m2,ta,mu ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: sb a1, 7(sp) -; LMULMAX2-RV64-NEXT: vsetivli a1, 1, e8,m2,ta,mu -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 24 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 31(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 25 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 30(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 26 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 29(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 27 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 28(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 28 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 27(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 29 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 26(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 30 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 25(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 31 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 24(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 16 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 23(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 17 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 22(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 18 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 21(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 19 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 20(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 20 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 19(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 21 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 18(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 22 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 17(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 23 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 16(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 8 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 15(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 9 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 14(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 10 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 13(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 11 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 12(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 12 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 11(sp) -; LMULMAX2-RV64-NEXT: 
vslidedown.vi v28, v26, 13 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 10(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 14 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 9(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 15 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 8(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 1 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 6(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 5(sp) +; LMULMAX2-RV64-NEXT: srli a2, a1, 40 +; LMULMAX2-RV64-NEXT: lui a3, 16 +; LMULMAX2-RV64-NEXT: addiw a7, a3, -256 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: srli a3, a1, 56 +; LMULMAX2-RV64-NEXT: or a2, a2, a3 +; LMULMAX2-RV64-NEXT: srli a3, a1, 24 +; LMULMAX2-RV64-NEXT: lui a6, 4080 +; LMULMAX2-RV64-NEXT: and a4, a3, a6 +; LMULMAX2-RV64-NEXT: srli a5, a1, 8 +; LMULMAX2-RV64-NEXT: addi a3, zero, 255 +; LMULMAX2-RV64-NEXT: slli t0, a3, 24 +; LMULMAX2-RV64-NEXT: and a5, a5, t0 +; LMULMAX2-RV64-NEXT: or a4, a5, a4 +; LMULMAX2-RV64-NEXT: or a4, a4, a2 +; LMULMAX2-RV64-NEXT: slli a2, a1, 8 +; LMULMAX2-RV64-NEXT: slli t1, a3, 32 +; LMULMAX2-RV64-NEXT: and a2, a2, t1 +; LMULMAX2-RV64-NEXT: slli a5, a1, 24 +; LMULMAX2-RV64-NEXT: slli t2, a3, 40 +; LMULMAX2-RV64-NEXT: and a5, a5, t2 +; LMULMAX2-RV64-NEXT: or a5, a5, a2 +; LMULMAX2-RV64-NEXT: slli a2, a1, 40 +; LMULMAX2-RV64-NEXT: slli a3, a3, 48 +; LMULMAX2-RV64-NEXT: and a2, a2, a3 +; LMULMAX2-RV64-NEXT: slli a1, a1, 56 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: or a1, a1, a5 +; LMULMAX2-RV64-NEXT: or a1, a1, a4 +; LMULMAX2-RV64-NEXT: sd a1, 0(sp) +; LMULMAX2-RV64-NEXT: vsetivli a1, 1, e64,m2,ta,mu ; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 3 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 4(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 4 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 3(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 5 -; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 2(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 6 +; LMULMAX2-RV64-NEXT: srli a2, a1, 40 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: srli a4, a1, 56 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a1, 24 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: srli a5, a1, 8 +; LMULMAX2-RV64-NEXT: and a5, a5, t0 +; LMULMAX2-RV64-NEXT: or a4, a5, a4 +; LMULMAX2-RV64-NEXT: or a2, a4, a2 +; LMULMAX2-RV64-NEXT: slli a4, a1, 8 +; LMULMAX2-RV64-NEXT: and a4, a4, t1 +; LMULMAX2-RV64-NEXT: slli a5, a1, 24 +; LMULMAX2-RV64-NEXT: and a5, a5, t2 +; LMULMAX2-RV64-NEXT: or a4, a5, a4 +; LMULMAX2-RV64-NEXT: slli a5, a1, 40 +; LMULMAX2-RV64-NEXT: and a5, a5, a3 +; LMULMAX2-RV64-NEXT: slli a1, a1, 56 +; LMULMAX2-RV64-NEXT: or a1, a1, a5 +; LMULMAX2-RV64-NEXT: or a1, a1, a4 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: sd a1, 24(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v28, v26, 2 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v28 -; LMULMAX2-RV64-NEXT: sb a1, 1(sp) -; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 7 +; LMULMAX2-RV64-NEXT: srli a2, a1, 40 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: srli a4, a1, 56 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a1, 24 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: srli a5, a1, 8 +; LMULMAX2-RV64-NEXT: and a5, a5, t0 +; LMULMAX2-RV64-NEXT: or 
a4, a5, a4 +; LMULMAX2-RV64-NEXT: or a2, a4, a2 +; LMULMAX2-RV64-NEXT: slli a4, a1, 8 +; LMULMAX2-RV64-NEXT: and a4, a4, t1 +; LMULMAX2-RV64-NEXT: slli a5, a1, 24 +; LMULMAX2-RV64-NEXT: and a5, a5, t2 +; LMULMAX2-RV64-NEXT: or a4, a5, a4 +; LMULMAX2-RV64-NEXT: slli a5, a1, 40 +; LMULMAX2-RV64-NEXT: and a5, a5, a3 +; LMULMAX2-RV64-NEXT: slli a1, a1, 56 +; LMULMAX2-RV64-NEXT: or a1, a1, a5 +; LMULMAX2-RV64-NEXT: or a1, a1, a4 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: sd a1, 16(sp) +; LMULMAX2-RV64-NEXT: vslidedown.vi v26, v26, 1 ; LMULMAX2-RV64-NEXT: vmv.x.s a1, v26 -; LMULMAX2-RV64-NEXT: sb a1, 0(sp) -; LMULMAX2-RV64-NEXT: addi a1, zero, 32 -; LMULMAX2-RV64-NEXT: vsetvli a1, a1, e8,m2,ta,mu -; LMULMAX2-RV64-NEXT: vle8.v v26, (sp) +; LMULMAX2-RV64-NEXT: srli a2, a1, 40 +; LMULMAX2-RV64-NEXT: and a2, a2, a7 +; LMULMAX2-RV64-NEXT: srli a4, a1, 56 +; LMULMAX2-RV64-NEXT: or a2, a2, a4 +; LMULMAX2-RV64-NEXT: srli a4, a1, 24 +; LMULMAX2-RV64-NEXT: and a4, a4, a6 +; LMULMAX2-RV64-NEXT: srli a5, a1, 8 +; LMULMAX2-RV64-NEXT: and a5, a5, t0 +; LMULMAX2-RV64-NEXT: or a4, a5, a4 +; LMULMAX2-RV64-NEXT: or a2, a4, a2 +; LMULMAX2-RV64-NEXT: slli a4, a1, 8 +; LMULMAX2-RV64-NEXT: and a4, a4, t1 +; LMULMAX2-RV64-NEXT: slli a5, a1, 24 +; LMULMAX2-RV64-NEXT: and a5, a5, t2 +; LMULMAX2-RV64-NEXT: or a4, a5, a4 +; LMULMAX2-RV64-NEXT: slli a5, a1, 40 +; LMULMAX2-RV64-NEXT: and a3, a5, a3 +; LMULMAX2-RV64-NEXT: slli a1, a1, 56 +; LMULMAX2-RV64-NEXT: or a1, a1, a3 +; LMULMAX2-RV64-NEXT: or a1, a1, a4 +; LMULMAX2-RV64-NEXT: or a1, a1, a2 +; LMULMAX2-RV64-NEXT: sd a1, 8(sp) ; LMULMAX2-RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV64-NEXT: vle64.v v26, (sp) ; LMULMAX2-RV64-NEXT: vse64.v v26, (a0) ; LMULMAX2-RV64-NEXT: addi sp, s0, -64 ; LMULMAX2-RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload @@ -1185,119 +2080,242 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-RV64-NEXT: addi sp, sp, 64 ; LMULMAX2-RV64-NEXT: ret ; -; LMULMAX1-LABEL: bswap_v4i64: -; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi sp, sp, -32 -; LMULMAX1-NEXT: .cfi_def_cfa_offset 32 -; LMULMAX1-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-NEXT: addi a1, a0, 16 -; LMULMAX1-NEXT: vle64.v v26, (a1) -; LMULMAX1-NEXT: vle64.v v25, (a0) -; LMULMAX1-NEXT: vsetvli zero, zero, e8,m1,ta,mu -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 7(sp) -; LMULMAX1-NEXT: vsetivli a2, 1, e8,m1,ta,mu -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 8 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 15(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 9 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 14(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 10 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 13(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 11 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 12(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 12 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 11(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 13 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 10(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 14 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 9(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 15 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 8(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 1 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 6(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 2 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 5(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, 
v26, 3 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 4(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 4 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 3(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 5 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 2(sp) -; LMULMAX1-NEXT: vslidedown.vi v27, v26, 6 -; LMULMAX1-NEXT: vmv.x.s a2, v27 -; LMULMAX1-NEXT: sb a2, 1(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v26, 7 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 0(sp) -; LMULMAX1-NEXT: vmv.x.s a2, v25 -; LMULMAX1-NEXT: sb a2, 23(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 8 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 31(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 9 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 30(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 10 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 29(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 11 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 28(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 12 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 27(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 13 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 26(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 14 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 25(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 15 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 24(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 1 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 22(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 2 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 21(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 3 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 20(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 4 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 19(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 5 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 18(sp) -; LMULMAX1-NEXT: vslidedown.vi v26, v25, 6 -; LMULMAX1-NEXT: vmv.x.s a2, v26 -; LMULMAX1-NEXT: sb a2, 17(sp) -; LMULMAX1-NEXT: vslidedown.vi v25, v25, 7 -; LMULMAX1-NEXT: vmv.x.s a2, v25 -; LMULMAX1-NEXT: sb a2, 16(sp) -; LMULMAX1-NEXT: vsetivli a2, 16, e8,m1,ta,mu -; LMULMAX1-NEXT: addi a2, sp, 16 -; LMULMAX1-NEXT: vle8.v v25, (a2) -; LMULMAX1-NEXT: vle8.v v26, (sp) -; LMULMAX1-NEXT: vsetivli a2, 2, e64,m1,ta,mu -; LMULMAX1-NEXT: vse64.v v25, (a0) -; LMULMAX1-NEXT: vse64.v v26, (a1) -; LMULMAX1-NEXT: addi sp, sp, 32 -; LMULMAX1-NEXT: ret +; LMULMAX1-RV32-LABEL: bswap_v4i64: +; LMULMAX1-RV32: # %bb.0: +; LMULMAX1-RV32-NEXT: addi sp, sp, -32 +; LMULMAX1-RV32-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: addi a6, a0, 16 +; LMULMAX1-RV32-NEXT: vle64.v v26, (a6) +; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vmv.x.s a4, v26 +; LMULMAX1-RV32-NEXT: srli a3, a4, 8 +; LMULMAX1-RV32-NEXT: lui a2, 16 +; LMULMAX1-RV32-NEXT: addi a2, a2, -256 +; LMULMAX1-RV32-NEXT: and a3, a3, a2 +; LMULMAX1-RV32-NEXT: srli a5, a4, 24 +; LMULMAX1-RV32-NEXT: or a5, a3, a5 +; LMULMAX1-RV32-NEXT: slli a1, a4, 8 +; LMULMAX1-RV32-NEXT: lui a3, 4080 +; LMULMAX1-RV32-NEXT: and a1, a1, a3 +; LMULMAX1-RV32-NEXT: slli a4, a4, 24 +; LMULMAX1-RV32-NEXT: or a1, a4, a1 +; LMULMAX1-RV32-NEXT: or a1, a1, a5 +; LMULMAX1-RV32-NEXT: sw a1, 20(sp) +; LMULMAX1-RV32-NEXT: vsetivli a1, 1, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vslidedown.vi v27, v26, 1 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v27 +; 
LMULMAX1-RV32-NEXT: srli a4, a1, 8 +; LMULMAX1-RV32-NEXT: and a4, a4, a2 +; LMULMAX1-RV32-NEXT: srli a5, a1, 24 +; LMULMAX1-RV32-NEXT: or a4, a4, a5 +; LMULMAX1-RV32-NEXT: slli a5, a1, 8 +; LMULMAX1-RV32-NEXT: and a5, a5, a3 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a5 +; LMULMAX1-RV32-NEXT: or a1, a1, a4 +; LMULMAX1-RV32-NEXT: sw a1, 28(sp) +; LMULMAX1-RV32-NEXT: addi a7, zero, 32 +; LMULMAX1-RV32-NEXT: vsrl.vx v26, v26, a7 +; LMULMAX1-RV32-NEXT: vmv.x.s a4, v26 +; LMULMAX1-RV32-NEXT: srli a5, a4, 8 +; LMULMAX1-RV32-NEXT: and a5, a5, a2 +; LMULMAX1-RV32-NEXT: srli a1, a4, 24 +; LMULMAX1-RV32-NEXT: or a1, a5, a1 +; LMULMAX1-RV32-NEXT: slli a5, a4, 8 +; LMULMAX1-RV32-NEXT: and a5, a5, a3 +; LMULMAX1-RV32-NEXT: slli a4, a4, 24 +; LMULMAX1-RV32-NEXT: or a4, a4, a5 +; LMULMAX1-RV32-NEXT: or a1, a4, a1 +; LMULMAX1-RV32-NEXT: sw a1, 16(sp) +; LMULMAX1-RV32-NEXT: vsrl.vx v26, v27, a7 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: srli a4, a1, 8 +; LMULMAX1-RV32-NEXT: and a4, a4, a2 +; LMULMAX1-RV32-NEXT: srli a5, a1, 24 +; LMULMAX1-RV32-NEXT: or a4, a4, a5 +; LMULMAX1-RV32-NEXT: slli a5, a1, 8 +; LMULMAX1-RV32-NEXT: and a5, a5, a3 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a5 +; LMULMAX1-RV32-NEXT: or a1, a1, a4 +; LMULMAX1-RV32-NEXT: sw a1, 24(sp) +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV32-NEXT: srli a4, a1, 8 +; LMULMAX1-RV32-NEXT: and a4, a4, a2 +; LMULMAX1-RV32-NEXT: srli a5, a1, 24 +; LMULMAX1-RV32-NEXT: or a4, a4, a5 +; LMULMAX1-RV32-NEXT: slli a5, a1, 8 +; LMULMAX1-RV32-NEXT: and a5, a5, a3 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a5 +; LMULMAX1-RV32-NEXT: or a1, a1, a4 +; LMULMAX1-RV32-NEXT: sw a1, 4(sp) +; LMULMAX1-RV32-NEXT: vslidedown.vi v26, v25, 1 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV32-NEXT: srli a4, a1, 8 +; LMULMAX1-RV32-NEXT: and a4, a4, a2 +; LMULMAX1-RV32-NEXT: srli a5, a1, 24 +; LMULMAX1-RV32-NEXT: or a4, a4, a5 +; LMULMAX1-RV32-NEXT: slli a5, a1, 8 +; LMULMAX1-RV32-NEXT: and a5, a5, a3 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a5 +; LMULMAX1-RV32-NEXT: or a1, a1, a4 +; LMULMAX1-RV32-NEXT: sw a1, 12(sp) +; LMULMAX1-RV32-NEXT: vsrl.vx v25, v25, a7 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV32-NEXT: srli a4, a1, 8 +; LMULMAX1-RV32-NEXT: and a4, a4, a2 +; LMULMAX1-RV32-NEXT: srli a5, a1, 24 +; LMULMAX1-RV32-NEXT: or a4, a4, a5 +; LMULMAX1-RV32-NEXT: slli a5, a1, 8 +; LMULMAX1-RV32-NEXT: and a5, a5, a3 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a5 +; LMULMAX1-RV32-NEXT: or a1, a1, a4 +; LMULMAX1-RV32-NEXT: sw a1, 0(sp) +; LMULMAX1-RV32-NEXT: vsrl.vx v25, v26, a7 +; LMULMAX1-RV32-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV32-NEXT: srli a4, a1, 8 +; LMULMAX1-RV32-NEXT: and a2, a4, a2 +; LMULMAX1-RV32-NEXT: srli a4, a1, 24 +; LMULMAX1-RV32-NEXT: or a2, a2, a4 +; LMULMAX1-RV32-NEXT: slli a4, a1, 8 +; LMULMAX1-RV32-NEXT: and a3, a4, a3 +; LMULMAX1-RV32-NEXT: slli a1, a1, 24 +; LMULMAX1-RV32-NEXT: or a1, a1, a3 +; LMULMAX1-RV32-NEXT: or a1, a1, a2 +; LMULMAX1-RV32-NEXT: sw a1, 8(sp) +; LMULMAX1-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (sp) +; LMULMAX1-RV32-NEXT: addi a1, sp, 16 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) +; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse64.v v26, (a6) +; LMULMAX1-RV32-NEXT: addi sp, sp, 32 +; LMULMAX1-RV32-NEXT: ret +; +; LMULMAX1-RV64-LABEL: bswap_v4i64: 
+; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: addi sp, sp, -32 +; LMULMAX1-RV64-NEXT: .cfi_def_cfa_offset 32 +; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: addi a6, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a6) +; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vmv.x.s a2, v26 +; LMULMAX1-RV64-NEXT: srli a1, a2, 40 +; LMULMAX1-RV64-NEXT: lui a3, 16 +; LMULMAX1-RV64-NEXT: addiw t0, a3, -256 +; LMULMAX1-RV64-NEXT: and a1, a1, t0 +; LMULMAX1-RV64-NEXT: srli a3, a2, 56 +; LMULMAX1-RV64-NEXT: or a1, a1, a3 +; LMULMAX1-RV64-NEXT: srli a3, a2, 24 +; LMULMAX1-RV64-NEXT: lui a7, 4080 +; LMULMAX1-RV64-NEXT: and a3, a3, a7 +; LMULMAX1-RV64-NEXT: srli a5, a2, 8 +; LMULMAX1-RV64-NEXT: addi a4, zero, 255 +; LMULMAX1-RV64-NEXT: slli t1, a4, 24 +; LMULMAX1-RV64-NEXT: and a5, a5, t1 +; LMULMAX1-RV64-NEXT: or a3, a5, a3 +; LMULMAX1-RV64-NEXT: or a5, a3, a1 +; LMULMAX1-RV64-NEXT: slli a1, a2, 8 +; LMULMAX1-RV64-NEXT: slli t2, a4, 32 +; LMULMAX1-RV64-NEXT: and a3, a1, t2 +; LMULMAX1-RV64-NEXT: slli a1, a2, 24 +; LMULMAX1-RV64-NEXT: slli t3, a4, 40 +; LMULMAX1-RV64-NEXT: and a1, a1, t3 +; LMULMAX1-RV64-NEXT: or a1, a1, a3 +; LMULMAX1-RV64-NEXT: slli a3, a2, 40 +; LMULMAX1-RV64-NEXT: slli a4, a4, 48 +; LMULMAX1-RV64-NEXT: and a3, a3, a4 +; LMULMAX1-RV64-NEXT: slli a2, a2, 56 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: or a1, a2, a1 +; LMULMAX1-RV64-NEXT: or a1, a1, a5 +; LMULMAX1-RV64-NEXT: sd a1, 16(sp) +; LMULMAX1-RV64-NEXT: vsetivli a1, 1, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v26, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v26 +; LMULMAX1-RV64-NEXT: srli a2, a1, 40 +; LMULMAX1-RV64-NEXT: and a2, a2, t0 +; LMULMAX1-RV64-NEXT: srli a3, a1, 56 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a1, 24 +; LMULMAX1-RV64-NEXT: and a3, a3, a7 +; LMULMAX1-RV64-NEXT: srli a5, a1, 8 +; LMULMAX1-RV64-NEXT: and a5, a5, t1 +; LMULMAX1-RV64-NEXT: or a3, a5, a3 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: slli a3, a1, 8 +; LMULMAX1-RV64-NEXT: and a3, a3, t2 +; LMULMAX1-RV64-NEXT: slli a5, a1, 24 +; LMULMAX1-RV64-NEXT: and a5, a5, t3 +; LMULMAX1-RV64-NEXT: or a3, a5, a3 +; LMULMAX1-RV64-NEXT: slli a5, a1, 40 +; LMULMAX1-RV64-NEXT: and a5, a5, a4 +; LMULMAX1-RV64-NEXT: slli a1, a1, 56 +; LMULMAX1-RV64-NEXT: or a1, a1, a5 +; LMULMAX1-RV64-NEXT: or a1, a1, a3 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: sd a1, 24(sp) +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV64-NEXT: srli a2, a1, 40 +; LMULMAX1-RV64-NEXT: and a2, a2, t0 +; LMULMAX1-RV64-NEXT: srli a3, a1, 56 +; LMULMAX1-RV64-NEXT: or a2, a2, a3 +; LMULMAX1-RV64-NEXT: srli a3, a1, 24 +; LMULMAX1-RV64-NEXT: and a3, a3, a7 +; LMULMAX1-RV64-NEXT: srli a5, a1, 8 +; LMULMAX1-RV64-NEXT: and a5, a5, t1 +; LMULMAX1-RV64-NEXT: or a3, a5, a3 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: slli a3, a1, 8 +; LMULMAX1-RV64-NEXT: and a3, a3, t2 +; LMULMAX1-RV64-NEXT: slli a5, a1, 24 +; LMULMAX1-RV64-NEXT: and a5, a5, t3 +; LMULMAX1-RV64-NEXT: or a3, a5, a3 +; LMULMAX1-RV64-NEXT: slli a5, a1, 40 +; LMULMAX1-RV64-NEXT: and a5, a5, a4 +; LMULMAX1-RV64-NEXT: slli a1, a1, 56 +; LMULMAX1-RV64-NEXT: or a1, a1, a5 +; LMULMAX1-RV64-NEXT: or a1, a1, a3 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: sd a1, 0(sp) +; LMULMAX1-RV64-NEXT: vslidedown.vi v25, v25, 1 +; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 +; LMULMAX1-RV64-NEXT: srli a2, a1, 40 +; LMULMAX1-RV64-NEXT: and a2, a2, t0 +; LMULMAX1-RV64-NEXT: srli a3, a1, 56 +; LMULMAX1-RV64-NEXT: or a2, a2, 
a3 +; LMULMAX1-RV64-NEXT: srli a3, a1, 24 +; LMULMAX1-RV64-NEXT: and a3, a3, a7 +; LMULMAX1-RV64-NEXT: srli a5, a1, 8 +; LMULMAX1-RV64-NEXT: and a5, a5, t1 +; LMULMAX1-RV64-NEXT: or a3, a5, a3 +; LMULMAX1-RV64-NEXT: or a2, a3, a2 +; LMULMAX1-RV64-NEXT: slli a3, a1, 8 +; LMULMAX1-RV64-NEXT: and a3, a3, t2 +; LMULMAX1-RV64-NEXT: slli a5, a1, 24 +; LMULMAX1-RV64-NEXT: and a5, a5, t3 +; LMULMAX1-RV64-NEXT: or a3, a5, a3 +; LMULMAX1-RV64-NEXT: slli a5, a1, 40 +; LMULMAX1-RV64-NEXT: and a4, a5, a4 +; LMULMAX1-RV64-NEXT: slli a1, a1, 56 +; LMULMAX1-RV64-NEXT: or a1, a1, a4 +; LMULMAX1-RV64-NEXT: or a1, a1, a3 +; LMULMAX1-RV64-NEXT: or a1, a1, a2 +; LMULMAX1-RV64-NEXT: sd a1, 8(sp) +; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vle64.v v25, (sp) +; LMULMAX1-RV64-NEXT: addi a1, sp, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a1) +; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a6) +; LMULMAX1-RV64-NEXT: addi sp, sp, 32 +; LMULMAX1-RV64-NEXT: ret %a = load <4 x i64>, <4 x i64>* %x %b = load <4 x i64>, <4 x i64>* %y %c = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %a) -- 2.7.4