From fed2889f075dc4d484c68fe6c3295e986535bcfd Mon Sep 17 00:00:00 2001 From: Chen Zheng Date: Fri, 5 Nov 2021 03:22:17 +0000 Subject: [PATCH] [PowerPC] use correct selection for v16i8/v8i16 splat load Reviewed By: nemanjai Differential Revision: https://reviews.llvm.org/D113236 --- llvm/lib/Target/PowerPC/PPCInstrVSX.td | 6 +- .../CodeGen/PowerPC/canonical-merge-shuffles.ll | 2 +- llvm/test/CodeGen/PowerPC/load-and-splat.ll | 463 ++++++++++++++++++++- 3 files changed, 464 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index c25e9f8..03ce031 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -3556,12 +3556,10 @@ def : Pat<(v4i32 (build_vector immSExt5NonZero:$A, immSExt5NonZero:$A, (v4i32 (VSPLTISW imm:$A))>; // Splat loads. -// Note that, we use MTVSRD without checking PPC64 because we only care the -// lowest 16/8 bits. def : Pat<(v8i16 (PPCldsplat ForceXForm:$A)), - (v8i16 (VSPLTHs 3, (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (LHZX ForceXForm:$A), sub_32))))>; + (v8i16 (VSPLTHs 3, (MTVSRWZ (LHZX ForceXForm:$A))))>; def : Pat<(v16i8 (PPCldsplat ForceXForm:$A)), - (v16i8 (VSPLTBs 7, (MTVSRD (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (LBZX ForceXForm:$A), sub_32))))>; + (v16i8 (VSPLTBs 7, (MTVSRWZ (LBZX ForceXForm:$A))))>; } // HasVSX, HasDirectMove // Big endian VSX subtarget with direct moves. diff --git a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll index 655ab5c..1c55f4e 100644 --- a/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll +++ b/llvm/test/CodeGen/PowerPC/canonical-merge-shuffles.ll @@ -825,7 +825,7 @@ define dso_local void @testByteSplat() #0 { ; CHECK-P8-LABEL: testByteSplat: ; CHECK-P8: # %bb.0: # %entry ; CHECK-P8-NEXT: lbzx r3, 0, r3 -; CHECK-P8-NEXT: mtvsrd v2, r3 +; CHECK-P8-NEXT: mtvsrwz v2, r3 ; CHECK-P8-NEXT: vspltb v2, v2, 7 ; CHECK-P8-NEXT: stvx v2, 0, r3 ; CHECK-P8-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/load-and-splat.ll b/llvm/test/CodeGen/PowerPC/load-and-splat.ll index 0f3f110..8196efa 100644 --- a/llvm/test/CodeGen/PowerPC/load-and-splat.ll +++ b/llvm/test/CodeGen/PowerPC/load-and-splat.ll @@ -8,6 +8,15 @@ ; RUN: llc -mcpu=pwr7 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ ; RUN: -mtriple=powerpc64-unknown-unknown < %s | FileCheck %s \ ; RUN: -check-prefix=P7 +; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mtriple=powerpc-ibm-aix-xcoff < %s | FileCheck %s \ +; RUN: -check-prefix=P9-AIX32 +; RUN: llc -mcpu=pwr8 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mtriple=powerpc-ibm-aix-xcoff < %s | FileCheck %s \ +; RUN: -check-prefix=P8-AIX32 +; RUN: llc -mcpu=pwr7 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mtriple=powerpc-ibm-aix-xcoff < %s | FileCheck %s \ +; RUN: -check-prefix=P7-AIX32 ; v2f64 define dso_local void @test(<2 x double>* nocapture %c, double* nocapture readonly %a) local_unnamed_addr { @@ -31,6 +40,27 @@ define dso_local void @test(<2 x double>* nocapture %c, double* nocapture readon ; P7-NEXT: lxvdsx vs0, 0, r4 ; P7-NEXT: stxvd2x vs0, 0, r3 ; P7-NEXT: blr +; +; P9-AIX32-LABEL: test: +; P9-AIX32: # %bb.0: # %entry +; P9-AIX32-NEXT: addi r4, r4, 24 +; P9-AIX32-NEXT: lxvdsx vs0, 0, r4 +; P9-AIX32-NEXT: stxv vs0, 0(r3) +; P9-AIX32-NEXT: blr +; +; P8-AIX32-LABEL: test: +; P8-AIX32: # %bb.0: # %entry +; P8-AIX32-NEXT: addi r4, r4, 24 +; P8-AIX32-NEXT: lxvdsx vs0, 0, r4 +; P8-AIX32-NEXT: stxvd2x vs0, 0, r3 +; 
P8-AIX32-NEXT: blr +; +; P7-AIX32-LABEL: test: +; P7-AIX32: # %bb.0: # %entry +; P7-AIX32-NEXT: addi r4, r4, 24 +; P7-AIX32-NEXT: lxvdsx vs0, 0, r4 +; P7-AIX32-NEXT: stxvd2x vs0, 0, r3 +; P7-AIX32-NEXT: blr entry: %arrayidx = getelementptr inbounds double, double* %a, i64 3 %0 = load double, double* %arrayidx, align 8 @@ -64,6 +94,29 @@ define dso_local void @test2(<4 x float>* nocapture %c, float* nocapture readonl ; P7-NEXT: xxspltw vs0, vs0, 1 ; P7-NEXT: stxvw4x vs0, 0, r3 ; P7-NEXT: blr +; +; P9-AIX32-LABEL: test2: +; P9-AIX32: # %bb.0: # %entry +; P9-AIX32-NEXT: addi r4, r4, 12 +; P9-AIX32-NEXT: lxvwsx vs0, 0, r4 +; P9-AIX32-NEXT: stxv vs0, 0(r3) +; P9-AIX32-NEXT: blr +; +; P8-AIX32-LABEL: test2: +; P8-AIX32: # %bb.0: # %entry +; P8-AIX32-NEXT: addi r4, r4, 12 +; P8-AIX32-NEXT: lfiwzx f0, 0, r4 +; P8-AIX32-NEXT: xxspltw vs0, vs0, 1 +; P8-AIX32-NEXT: stxvw4x vs0, 0, r3 +; P8-AIX32-NEXT: blr +; +; P7-AIX32-LABEL: test2: +; P7-AIX32: # %bb.0: # %entry +; P7-AIX32-NEXT: addi r4, r4, 12 +; P7-AIX32-NEXT: lfiwzx f0, 0, r4 +; P7-AIX32-NEXT: xxspltw vs0, vs0, 1 +; P7-AIX32-NEXT: stxvw4x vs0, 0, r3 +; P7-AIX32-NEXT: blr entry: %arrayidx = getelementptr inbounds float, float* %a, i64 3 %0 = load float, float* %arrayidx, align 4 @@ -97,6 +150,29 @@ define dso_local void @test3(<4 x i32>* nocapture %c, i32* nocapture readonly %a ; P7-NEXT: xxspltw vs0, vs0, 1 ; P7-NEXT: stxvw4x vs0, 0, r3 ; P7-NEXT: blr +; +; P9-AIX32-LABEL: test3: +; P9-AIX32: # %bb.0: # %entry +; P9-AIX32-NEXT: addi r4, r4, 12 +; P9-AIX32-NEXT: lxvwsx vs0, 0, r4 +; P9-AIX32-NEXT: stxv vs0, 0(r3) +; P9-AIX32-NEXT: blr +; +; P8-AIX32-LABEL: test3: +; P8-AIX32: # %bb.0: # %entry +; P8-AIX32-NEXT: addi r4, r4, 12 +; P8-AIX32-NEXT: lfiwzx f0, 0, r4 +; P8-AIX32-NEXT: xxspltw vs0, vs0, 1 +; P8-AIX32-NEXT: stxvw4x vs0, 0, r3 +; P8-AIX32-NEXT: blr +; +; P7-AIX32-LABEL: test3: +; P7-AIX32: # %bb.0: # %entry +; P7-AIX32-NEXT: addi r4, r4, 12 +; P7-AIX32-NEXT: lfiwzx f0, 0, r4 +; P7-AIX32-NEXT: xxspltw vs0, vs0, 1 +; P7-AIX32-NEXT: stxvw4x vs0, 0, r3 +; P7-AIX32-NEXT: blr entry: %arrayidx = getelementptr inbounds i32, i32* %a, i64 3 %0 = load i32, i32* %arrayidx, align 4 @@ -129,6 +205,52 @@ define dso_local void @test4(<2 x i64>* nocapture %c, i64* nocapture readonly %a ; P7-NEXT: lxvdsx vs0, 0, r4 ; P7-NEXT: stxvd2x vs0, 0, r3 ; P7-NEXT: blr +; +; P9-AIX32-LABEL: test4: +; P9-AIX32: # %bb.0: # %entry +; P9-AIX32-NEXT: lwz r5, 24(r4) +; P9-AIX32-NEXT: lwz r4, 28(r4) +; P9-AIX32-NEXT: stw r4, -16(r1) +; P9-AIX32-NEXT: stw r5, -32(r1) +; P9-AIX32-NEXT: lxv v2, -16(r1) +; P9-AIX32-NEXT: lxv v3, -32(r1) +; P9-AIX32-NEXT: vmrghw v2, v3, v2 +; P9-AIX32-NEXT: xxswapd vs0, v2 +; P9-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2 +; P9-AIX32-NEXT: stxv vs0, 0(r3) +; P9-AIX32-NEXT: blr +; +; P8-AIX32-LABEL: test4: +; P8-AIX32: # %bb.0: # %entry +; P8-AIX32-NEXT: lwz r5, 24(r4) +; P8-AIX32-NEXT: lwz r4, 28(r4) +; P8-AIX32-NEXT: stw r4, -16(r1) +; P8-AIX32-NEXT: stw r5, -32(r1) +; P8-AIX32-NEXT: addi r4, r1, -16 +; P8-AIX32-NEXT: addi r5, r1, -32 +; P8-AIX32-NEXT: lxvw4x v2, 0, r4 +; P8-AIX32-NEXT: lxvw4x v3, 0, r5 +; P8-AIX32-NEXT: vmrghw v2, v3, v2 +; P8-AIX32-NEXT: xxswapd vs0, v2 +; P8-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2 +; P8-AIX32-NEXT: stxvw4x vs0, 0, r3 +; P8-AIX32-NEXT: blr +; +; P7-AIX32-LABEL: test4: +; P7-AIX32: # %bb.0: # %entry +; P7-AIX32-NEXT: lwz r6, 28(r4) +; P7-AIX32-NEXT: lwz r4, 24(r4) +; P7-AIX32-NEXT: addi r5, r1, -16 +; P7-AIX32-NEXT: stw r6, -16(r1) +; P7-AIX32-NEXT: stw r4, -32(r1) +; P7-AIX32-NEXT: addi r4, r1, -32 +; P7-AIX32-NEXT: 
lxvw4x v2, 0, r5 +; P7-AIX32-NEXT: lxvw4x v3, 0, r4 +; P7-AIX32-NEXT: vmrghw v2, v3, v2 +; P7-AIX32-NEXT: xxswapd vs0, v2 +; P7-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2 +; P7-AIX32-NEXT: stxvw4x vs0, 0, r3 +; P7-AIX32-NEXT: blr entry: %arrayidx = getelementptr inbounds i64, i64* %a, i64 3 %0 = load i64, i64* %arrayidx, align 8 @@ -160,6 +282,52 @@ define void @test5(<2 x i64>* %a, i32* %in) { ; P7-NEXT: xxspltd vs0, f0, 0 ; P7-NEXT: stxvd2x vs0, 0, r3 ; P7-NEXT: blr +; +; P9-AIX32-LABEL: test5: +; P9-AIX32: # %bb.0: # %entry +; P9-AIX32-NEXT: lwz r4, 0(r4) +; P9-AIX32-NEXT: srawi r5, r4, 31 +; P9-AIX32-NEXT: stw r4, -16(r1) +; P9-AIX32-NEXT: lxv v2, -16(r1) +; P9-AIX32-NEXT: stw r5, -32(r1) +; P9-AIX32-NEXT: lxv v3, -32(r1) +; P9-AIX32-NEXT: vmrghw v2, v3, v2 +; P9-AIX32-NEXT: xxswapd vs0, v2 +; P9-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2 +; P9-AIX32-NEXT: stxv vs0, 0(r3) +; P9-AIX32-NEXT: blr +; +; P8-AIX32-LABEL: test5: +; P8-AIX32: # %bb.0: # %entry +; P8-AIX32-NEXT: lwz r4, 0(r4) +; P8-AIX32-NEXT: srawi r5, r4, 31 +; P8-AIX32-NEXT: stw r4, -16(r1) +; P8-AIX32-NEXT: addi r4, r1, -16 +; P8-AIX32-NEXT: stw r5, -32(r1) +; P8-AIX32-NEXT: addi r5, r1, -32 +; P8-AIX32-NEXT: lxvw4x v2, 0, r4 +; P8-AIX32-NEXT: lxvw4x v3, 0, r5 +; P8-AIX32-NEXT: vmrghw v2, v3, v2 +; P8-AIX32-NEXT: xxswapd vs0, v2 +; P8-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2 +; P8-AIX32-NEXT: stxvw4x vs0, 0, r3 +; P8-AIX32-NEXT: blr +; +; P7-AIX32-LABEL: test5: +; P7-AIX32: # %bb.0: # %entry +; P7-AIX32-NEXT: lwz r4, 0(r4) +; P7-AIX32-NEXT: addi r5, r1, -16 +; P7-AIX32-NEXT: stw r4, -16(r1) +; P7-AIX32-NEXT: srawi r4, r4, 31 +; P7-AIX32-NEXT: stw r4, -32(r1) +; P7-AIX32-NEXT: addi r4, r1, -32 +; P7-AIX32-NEXT: lxvw4x v2, 0, r5 +; P7-AIX32-NEXT: lxvw4x v3, 0, r4 +; P7-AIX32-NEXT: vmrghw v2, v3, v2 +; P7-AIX32-NEXT: xxswapd vs0, v2 +; P7-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2 +; P7-AIX32-NEXT: stxvw4x vs0, 0, r3 +; P7-AIX32-NEXT: blr entry: %0 = load i32, i32* %in, align 4 %conv = sext i32 %0 to i64 @@ -191,6 +359,52 @@ define void @test6(<2 x i64>* %a, i32* %in) { ; P7-NEXT: xxspltd vs0, f0, 0 ; P7-NEXT: stxvd2x vs0, 0, r3 ; P7-NEXT: blr +; +; P9-AIX32-LABEL: test6: +; P9-AIX32: # %bb.0: # %entry +; P9-AIX32-NEXT: lwz r4, 0(r4) +; P9-AIX32-NEXT: li r5, 0 +; P9-AIX32-NEXT: stw r5, -32(r1) +; P9-AIX32-NEXT: lxv v2, -32(r1) +; P9-AIX32-NEXT: stw r4, -16(r1) +; P9-AIX32-NEXT: lxv v3, -16(r1) +; P9-AIX32-NEXT: vmrghw v2, v2, v3 +; P9-AIX32-NEXT: xxswapd vs0, v2 +; P9-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2 +; P9-AIX32-NEXT: stxv vs0, 0(r3) +; P9-AIX32-NEXT: blr +; +; P8-AIX32-LABEL: test6: +; P8-AIX32: # %bb.0: # %entry +; P8-AIX32-NEXT: lwz r4, 0(r4) +; P8-AIX32-NEXT: li r5, 0 +; P8-AIX32-NEXT: stw r5, -32(r1) +; P8-AIX32-NEXT: addi r5, r1, -16 +; P8-AIX32-NEXT: stw r4, -16(r1) +; P8-AIX32-NEXT: addi r4, r1, -32 +; P8-AIX32-NEXT: lxvw4x v2, 0, r4 +; P8-AIX32-NEXT: lxvw4x v3, 0, r5 +; P8-AIX32-NEXT: vmrghw v2, v2, v3 +; P8-AIX32-NEXT: xxswapd vs0, v2 +; P8-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2 +; P8-AIX32-NEXT: stxvw4x vs0, 0, r3 +; P8-AIX32-NEXT: blr +; +; P7-AIX32-LABEL: test6: +; P7-AIX32: # %bb.0: # %entry +; P7-AIX32-NEXT: lwz r4, 0(r4) +; P7-AIX32-NEXT: li r5, 0 +; P7-AIX32-NEXT: stw r5, -32(r1) +; P7-AIX32-NEXT: addi r5, r1, -16 +; P7-AIX32-NEXT: stw r4, -16(r1) +; P7-AIX32-NEXT: addi r4, r1, -32 +; P7-AIX32-NEXT: lxvw4x v2, 0, r4 +; P7-AIX32-NEXT: lxvw4x v3, 0, r5 +; P7-AIX32-NEXT: vmrghw v2, v2, v3 +; P7-AIX32-NEXT: xxswapd vs0, v2 +; P7-AIX32-NEXT: xxsldwi vs0, vs0, v2, 2 +; P7-AIX32-NEXT: stxvw4x vs0, 0, r3 +; P7-AIX32-NEXT: blr entry: %0 = load 
i32, i32* %in, align 4 %conv = zext i32 %0 to i64 @@ -212,7 +426,7 @@ define void @test7(<8 x i16>* %a, i16* %in) { ; P8-LABEL: test7: ; P8: # %bb.0: # %entry ; P8-NEXT: lhzx r4, 0, r4 -; P8-NEXT: mtvsrd v2, r4 +; P8-NEXT: mtvsrwz v2, r4 ; P8-NEXT: vsplth v2, v2, 3 ; P8-NEXT: stvx v2, 0, r3 ; P8-NEXT: blr @@ -227,6 +441,32 @@ define void @test7(<8 x i16>* %a, i16* %in) { ; P7-NEXT: vsplth v2, v2, 0 ; P7-NEXT: stxvw4x v2, 0, r3 ; P7-NEXT: blr +; +; P9-AIX32-LABEL: test7: +; P9-AIX32: # %bb.0: # %entry +; P9-AIX32-NEXT: lxsihzx v2, 0, r4 +; P9-AIX32-NEXT: vsplth v2, v2, 3 +; P9-AIX32-NEXT: stxv v2, 0(r3) +; P9-AIX32-NEXT: blr +; +; P8-AIX32-LABEL: test7: +; P8-AIX32: # %bb.0: # %entry +; P8-AIX32-NEXT: lhzx r4, 0, r4 +; P8-AIX32-NEXT: mtvsrwz v2, r4 +; P8-AIX32-NEXT: vsplth v2, v2, 3 +; P8-AIX32-NEXT: stxvw4x v2, 0, r3 +; P8-AIX32-NEXT: blr +; +; P7-AIX32-LABEL: test7: +; P7-AIX32: # %bb.0: # %entry +; P7-AIX32-NEXT: li r5, 1 +; P7-AIX32-NEXT: lvx v2, 0, r4 +; P7-AIX32-NEXT: lvsl v4, 0, r4 +; P7-AIX32-NEXT: lvx v3, r5, r4 +; P7-AIX32-NEXT: vperm v2, v2, v3, v4 +; P7-AIX32-NEXT: vsplth v2, v2, 0 +; P7-AIX32-NEXT: stxvw4x v2, 0, r3 +; P7-AIX32-NEXT: blr entry: %0 = load i16, i16* %in, align 2 %splat.splatinsert.i = insertelement <8 x i16> poison, i16 %0, i32 0 @@ -247,7 +487,7 @@ define void @test8(<16 x i8>* %a, i8* %in) { ; P8-LABEL: test8: ; P8: # %bb.0: # %entry ; P8-NEXT: lbzx r4, 0, r4 -; P8-NEXT: mtvsrd v2, r4 +; P8-NEXT: mtvsrwz v2, r4 ; P8-NEXT: vspltb v2, v2, 7 ; P8-NEXT: stvx v2, 0, r3 ; P8-NEXT: blr @@ -260,6 +500,30 @@ define void @test8(<16 x i8>* %a, i8* %in) { ; P7-NEXT: vspltb v2, v2, 0 ; P7-NEXT: stxvw4x v2, 0, r3 ; P7-NEXT: blr +; +; P9-AIX32-LABEL: test8: +; P9-AIX32: # %bb.0: # %entry +; P9-AIX32-NEXT: lxsibzx v2, 0, r4 +; P9-AIX32-NEXT: vspltb v2, v2, 7 +; P9-AIX32-NEXT: stxv v2, 0(r3) +; P9-AIX32-NEXT: blr +; +; P8-AIX32-LABEL: test8: +; P8-AIX32: # %bb.0: # %entry +; P8-AIX32-NEXT: lbzx r4, 0, r4 +; P8-AIX32-NEXT: mtvsrwz v2, r4 +; P8-AIX32-NEXT: vspltb v2, v2, 7 +; P8-AIX32-NEXT: stxvw4x v2, 0, r3 +; P8-AIX32-NEXT: blr +; +; P7-AIX32-LABEL: test8: +; P7-AIX32: # %bb.0: # %entry +; P7-AIX32-NEXT: lvsl v2, 0, r4 +; P7-AIX32-NEXT: lvx v3, 0, r4 +; P7-AIX32-NEXT: vperm v2, v3, v3, v2 +; P7-AIX32-NEXT: vspltb v2, v2, 0 +; P7-AIX32-NEXT: stxvw4x v2, 0, r3 +; P7-AIX32-NEXT: blr entry: %0 = load i8, i8* %in, align 1 %splat.splatinsert.i = insertelement <16 x i8> poison, i8 %0, i32 0 @@ -288,6 +552,27 @@ define <16 x i8> @unadjusted_lxvwsx(i32* %s, i32* %t) { ; P7-NEXT: lxvw4x vs0, 0, r4 ; P7-NEXT: xxspltw v2, vs0, 0 ; P7-NEXT: blr +; +; P9-AIX32-LABEL: unadjusted_lxvwsx: +; P9-AIX32: # %bb.0: # %entry +; P9-AIX32-NEXT: lxvwsx v2, 0, r3 +; P9-AIX32-NEXT: blr +; +; P8-AIX32-LABEL: unadjusted_lxvwsx: +; P8-AIX32: # %bb.0: # %entry +; P8-AIX32-NEXT: lwz r3, 0(r3) +; P8-AIX32-NEXT: mtfprwz f0, r3 +; P8-AIX32-NEXT: xxspltw v2, vs0, 1 +; P8-AIX32-NEXT: blr +; +; P7-AIX32-LABEL: unadjusted_lxvwsx: +; P7-AIX32: # %bb.0: # %entry +; P7-AIX32-NEXT: lwz r3, 0(r3) +; P7-AIX32-NEXT: addi r4, r1, -16 +; P7-AIX32-NEXT: stw r3, -16(r1) +; P7-AIX32-NEXT: lxvw4x vs0, 0, r4 +; P7-AIX32-NEXT: xxspltw v2, vs0, 0 +; P7-AIX32-NEXT: blr entry: %0 = bitcast i32* %s to <4 x i8>* %1 = load <4 x i8>, <4 x i8>* %0, align 4 @@ -316,6 +601,35 @@ define <16 x i8> @adjusted_lxvwsx(i64* %s, i64* %t) { ; P7-NEXT: lxvw4x vs0, 0, r4 ; P7-NEXT: xxspltw v2, vs0, 1 ; P7-NEXT: blr +; +; P9-AIX32-LABEL: adjusted_lxvwsx: +; P9-AIX32: # %bb.0: # %entry +; P9-AIX32-NEXT: lwz r3, 4(r3) +; P9-AIX32-NEXT: stw r3, -16(r1) +; 
P9-AIX32-NEXT: lxv v2, -16(r1) +; P9-AIX32-NEXT: vmrghw v2, v2, v2 +; P9-AIX32-NEXT: xxspltw v2, v2, 1 +; P9-AIX32-NEXT: blr +; +; P8-AIX32-LABEL: adjusted_lxvwsx: +; P8-AIX32: # %bb.0: # %entry +; P8-AIX32-NEXT: lwz r3, 4(r3) +; P8-AIX32-NEXT: addi r4, r1, -16 +; P8-AIX32-NEXT: stw r3, -16(r1) +; P8-AIX32-NEXT: lxvw4x v2, 0, r4 +; P8-AIX32-NEXT: vmrghw v2, v2, v2 +; P8-AIX32-NEXT: xxspltw v2, v2, 1 +; P8-AIX32-NEXT: blr +; +; P7-AIX32-LABEL: adjusted_lxvwsx: +; P7-AIX32: # %bb.0: # %entry +; P7-AIX32-NEXT: lwz r3, 4(r3) +; P7-AIX32-NEXT: addi r4, r1, -16 +; P7-AIX32-NEXT: stw r3, -16(r1) +; P7-AIX32-NEXT: lxvw4x v2, 0, r4 +; P7-AIX32-NEXT: vmrghw v2, v2, v2 +; P7-AIX32-NEXT: xxspltw v2, v2, 1 +; P7-AIX32-NEXT: blr entry: %0 = bitcast i64* %s to <8 x i8>* %1 = load <8 x i8>, <8 x i8>* %0, align 8 @@ -340,6 +654,23 @@ define <16 x i8> @unadjusted_lxvwsx_v16i8(<16 x i8> *%s, <16 x i8> %t) { ; P7-NEXT: lxvw4x vs0, 0, r3 ; P7-NEXT: xxspltw v2, vs0, 0 ; P7-NEXT: blr +; +; P9-AIX32-LABEL: unadjusted_lxvwsx_v16i8: +; P9-AIX32: # %bb.0: # %entry +; P9-AIX32-NEXT: lxvwsx v2, 0, r3 +; P9-AIX32-NEXT: blr +; +; P8-AIX32-LABEL: unadjusted_lxvwsx_v16i8: +; P8-AIX32: # %bb.0: # %entry +; P8-AIX32-NEXT: lxvw4x vs0, 0, r3 +; P8-AIX32-NEXT: xxspltw v2, vs0, 0 +; P8-AIX32-NEXT: blr +; +; P7-AIX32-LABEL: unadjusted_lxvwsx_v16i8: +; P7-AIX32: # %bb.0: # %entry +; P7-AIX32-NEXT: lxvw4x vs0, 0, r3 +; P7-AIX32-NEXT: xxspltw v2, vs0, 0 +; P7-AIX32-NEXT: blr entry: %0 = load <16 x i8>, <16 x i8>* %s, align 16 %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> @@ -364,6 +695,24 @@ define <16 x i8> @adjusted_lxvwsx_v16i8(<16 x i8> *%s, <16 x i8> %t) { ; P7-NEXT: lxvw4x vs0, 0, r3 ; P7-NEXT: xxspltw v2, vs0, 1 ; P7-NEXT: blr +; +; P9-AIX32-LABEL: adjusted_lxvwsx_v16i8: +; P9-AIX32: # %bb.0: # %entry +; P9-AIX32-NEXT: addi r3, r3, 4 +; P9-AIX32-NEXT: lxvwsx v2, 0, r3 +; P9-AIX32-NEXT: blr +; +; P8-AIX32-LABEL: adjusted_lxvwsx_v16i8: +; P8-AIX32: # %bb.0: # %entry +; P8-AIX32-NEXT: lxvw4x vs0, 0, r3 +; P8-AIX32-NEXT: xxspltw v2, vs0, 1 +; P8-AIX32-NEXT: blr +; +; P7-AIX32-LABEL: adjusted_lxvwsx_v16i8: +; P7-AIX32: # %bb.0: # %entry +; P7-AIX32-NEXT: lxvw4x vs0, 0, r3 +; P7-AIX32-NEXT: xxspltw v2, vs0, 1 +; P7-AIX32-NEXT: blr entry: %0 = load <16 x i8>, <16 x i8>* %s, align 16 %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> @@ -388,6 +737,24 @@ define <16 x i8> @adjusted_lxvwsx_v16i8_2(<16 x i8> *%s, <16 x i8> %t) { ; P7-NEXT: lxvw4x vs0, 0, r3 ; P7-NEXT: xxspltw v2, vs0, 2 ; P7-NEXT: blr +; +; P9-AIX32-LABEL: adjusted_lxvwsx_v16i8_2: +; P9-AIX32: # %bb.0: # %entry +; P9-AIX32-NEXT: addi r3, r3, 8 +; P9-AIX32-NEXT: lxvwsx v2, 0, r3 +; P9-AIX32-NEXT: blr +; +; P8-AIX32-LABEL: adjusted_lxvwsx_v16i8_2: +; P8-AIX32: # %bb.0: # %entry +; P8-AIX32-NEXT: lxvw4x vs0, 0, r3 +; P8-AIX32-NEXT: xxspltw v2, vs0, 2 +; P8-AIX32-NEXT: blr +; +; P7-AIX32-LABEL: adjusted_lxvwsx_v16i8_2: +; P7-AIX32: # %bb.0: # %entry +; P7-AIX32-NEXT: lxvw4x vs0, 0, r3 +; P7-AIX32-NEXT: xxspltw v2, vs0, 2 +; P7-AIX32-NEXT: blr entry: %0 = load <16 x i8>, <16 x i8>* %s, align 16 %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> @@ -412,6 +779,24 @@ define <16 x i8> @adjusted_lxvwsx_v16i8_3(<16 x i8> *%s, <16 x i8> %t) { ; P7-NEXT: lxvw4x vs0, 0, r3 ; P7-NEXT: xxspltw v2, vs0, 3 ; P7-NEXT: blr +; +; P9-AIX32-LABEL: adjusted_lxvwsx_v16i8_3: +; P9-AIX32: # %bb.0: # %entry +; P9-AIX32-NEXT: addi r3, r3, 12 +; P9-AIX32-NEXT: lxvwsx v2, 0, r3 +; P9-AIX32-NEXT: blr +; +; P8-AIX32-LABEL: adjusted_lxvwsx_v16i8_3: +; P8-AIX32: # 
%bb.0: # %entry +; P8-AIX32-NEXT: lxvw4x vs0, 0, r3 +; P8-AIX32-NEXT: xxspltw v2, vs0, 3 +; P8-AIX32-NEXT: blr +; +; P7-AIX32-LABEL: adjusted_lxvwsx_v16i8_3: +; P7-AIX32: # %bb.0: # %entry +; P7-AIX32-NEXT: lxvw4x vs0, 0, r3 +; P7-AIX32-NEXT: xxspltw v2, vs0, 3 +; P7-AIX32-NEXT: blr entry: %0 = load <16 x i8>, <16 x i8>* %s, align 16 %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> @@ -433,6 +818,47 @@ define <16 x i8> @unadjusted_lxvdsx(i64* %s, i64* %t) { ; P7: # %bb.0: # %entry ; P7-NEXT: lxvdsx v2, 0, r3 ; P7-NEXT: blr +; +; P9-AIX32-LABEL: unadjusted_lxvdsx: +; P9-AIX32: # %bb.0: # %entry +; P9-AIX32-NEXT: lwz r4, 0(r3) +; P9-AIX32-NEXT: stw r4, -16(r1) +; P9-AIX32-NEXT: lwz r3, 4(r3) +; P9-AIX32-NEXT: lxv vs1, -16(r1) +; P9-AIX32-NEXT: mtfprwz f0, r3 +; P9-AIX32-NEXT: xxinsertw vs1, vs0, 4 +; P9-AIX32-NEXT: xxmrghd v2, vs1, vs1 +; P9-AIX32-NEXT: blr +; +; P8-AIX32-LABEL: unadjusted_lxvdsx: +; P8-AIX32: # %bb.0: # %entry +; P8-AIX32-NEXT: lwz r4, 4(r3) +; P8-AIX32-NEXT: stw r4, -32(r1) +; P8-AIX32-NEXT: addi r4, r1, -16 +; P8-AIX32-NEXT: lwz r3, 0(r3) +; P8-AIX32-NEXT: stw r3, -16(r1) +; P8-AIX32-NEXT: addi r3, r1, -32 +; P8-AIX32-NEXT: lxvw4x v2, 0, r3 +; P8-AIX32-NEXT: lxvw4x v3, 0, r4 +; P8-AIX32-NEXT: vmrghw v2, v3, v2 +; P8-AIX32-NEXT: xxsldwi vs0, v3, v2, 2 +; P8-AIX32-NEXT: xxmrgld v2, vs0, vs0 +; P8-AIX32-NEXT: blr +; +; P7-AIX32-LABEL: unadjusted_lxvdsx: +; P7-AIX32: # %bb.0: # %entry +; P7-AIX32-NEXT: lwz r5, 4(r3) +; P7-AIX32-NEXT: addi r4, r1, -32 +; P7-AIX32-NEXT: stw r5, -32(r1) +; P7-AIX32-NEXT: lwz r3, 0(r3) +; P7-AIX32-NEXT: stw r3, -16(r1) +; P7-AIX32-NEXT: addi r3, r1, -16 +; P7-AIX32-NEXT: lxvw4x v2, 0, r4 +; P7-AIX32-NEXT: lxvw4x v3, 0, r3 +; P7-AIX32-NEXT: vmrghw v2, v3, v2 +; P7-AIX32-NEXT: xxsldwi vs0, v3, v2, 2 +; P7-AIX32-NEXT: xxmrgld v2, vs0, vs0 +; P7-AIX32-NEXT: blr entry: %0 = bitcast i64* %s to <8 x i8>* %1 = load <8 x i8>, <8 x i8>* %0, align 8 @@ -455,6 +881,21 @@ define <16 x i8> @unadjusted_lxvdsx_v16i8(<16 x i8> *%s, <16 x i8> %t) { ; P7: # %bb.0: # %entry ; P7-NEXT: lxvdsx v2, 0, r3 ; P7-NEXT: blr +; +; P9-AIX32-LABEL: unadjusted_lxvdsx_v16i8: +; P9-AIX32: # %bb.0: # %entry +; P9-AIX32-NEXT: lxvdsx v2, 0, r3 +; P9-AIX32-NEXT: blr +; +; P8-AIX32-LABEL: unadjusted_lxvdsx_v16i8: +; P8-AIX32: # %bb.0: # %entry +; P8-AIX32-NEXT: lxvdsx v2, 0, r3 +; P8-AIX32-NEXT: blr +; +; P7-AIX32-LABEL: unadjusted_lxvdsx_v16i8: +; P7-AIX32: # %bb.0: # %entry +; P7-AIX32-NEXT: lxvdsx v2, 0, r3 +; P7-AIX32-NEXT: blr entry: %0 = load <16 x i8>, <16 x i8>* %s, align 16 %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> @@ -479,6 +920,24 @@ define <16 x i8> @adjusted_lxvdsx_v16i8(<16 x i8> *%s, <16 x i8> %t) { ; P7-NEXT: addi r3, r3, 8 ; P7-NEXT: lxvdsx v2, 0, r3 ; P7-NEXT: blr +; +; P9-AIX32-LABEL: adjusted_lxvdsx_v16i8: +; P9-AIX32: # %bb.0: # %entry +; P9-AIX32-NEXT: addi r3, r3, 8 +; P9-AIX32-NEXT: lxvdsx v2, 0, r3 +; P9-AIX32-NEXT: blr +; +; P8-AIX32-LABEL: adjusted_lxvdsx_v16i8: +; P8-AIX32: # %bb.0: # %entry +; P8-AIX32-NEXT: addi r3, r3, 8 +; P8-AIX32-NEXT: lxvdsx v2, 0, r3 +; P8-AIX32-NEXT: blr +; +; P7-AIX32-LABEL: adjusted_lxvdsx_v16i8: +; P7-AIX32: # %bb.0: # %entry +; P7-AIX32-NEXT: addi r3, r3, 8 +; P7-AIX32-NEXT: lxvdsx v2, 0, r3 +; P7-AIX32-NEXT: blr entry: %0 = load <16 x i8>, <16 x i8>* %s, align 16 %1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> -- 2.7.4
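
Background for the pattern change (a note on this review, not part of the patch itself): the old patterns built an i64 via INSERT_SUBREG and moved it to a VSR with MTVSRD, but MTVSRD takes a 64-bit GPR and so cannot be selected on 32-bit subtargets such as the powerpc-ibm-aix-xcoff RUN lines added above. MTVSRWZ takes a 32-bit GPR (zero-extending the word), which is sufficient here because the following VSPLTH/VSPLTB only read the low 16/8 bits. Below is a rough C-level reproducer for the affected splat loads, sketched under the assumption of clang with -mcpu=pwr8 and altivec.h on a 32-bit PowerPC target; the function names are illustrative, and the expected instruction sequences are taken from the P8/P8-AIX32 FileCheck lines above, not guaranteed for every configuration.

#include <altivec.h>

/* Byte splat from memory: with this patch the expected selection is
   lbzx + mtvsrwz + vspltb (compare test8 above), rather than the
   64-bit-only mtvsrd the old pattern required. */
vector unsigned char splat_byte(const unsigned char *p) {
  return vec_splats(*p);
}

/* Halfword splat from memory: expected lhzx + mtvsrwz + vsplth
   (compare test7 above). */
vector unsigned short splat_halfword(const unsigned short *p) {
  return vec_splats(*p);
}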