From 34033a84b8b9a07917ffc73af05e857c7b4cfa33 Mon Sep 17 00:00:00 2001 From: Amy Kwan Date: Wed, 15 Jun 2022 12:25:45 -0500 Subject: [PATCH] [PowerPC] Skip combine for vector_shuffles when two scalar_to_vector nodes are different vector types. Currently in `combineVectorShuffle()`, we update the shuffle mask if either input vector comes from a scalar_to_vector, and we keep the respective input vectors in their permuted form by producing PPCISD::SCALAR_TO_VECTOR_PERMUTED. However, it is possible that we end up in a situation where both input vectors to the vector_shuffle are scalar_to_vector, and are different vector types. In situations like this, the shuffle mask is updated incorrectly as the current code assumes both scalar_to_vector inputs are the same vector type. This patch skips the combines for vector_shuffle if both input vectors are scalar_to_vector, and if they are of different vector types. A follow-up patch will focus on fixing this issue afterwards, in order to correctly update the shuffle mask. Differential Revision: https://reviews.llvm.org/D127818 --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 11 ++ .../PowerPC/p8-scalar_vector_conversions.ll | 8 +- .../CodeGen/PowerPC/scalar_to_vector_shuffle.ll | 138 +++++++++++++++++++++ 3 files changed, 154 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/PowerPC/scalar_to_vector_shuffle.ll diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 6d451f8..f3d0c26 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -14885,6 +14885,17 @@ SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN, SDValue SToVLHS = isScalarToVec(LHS); SDValue SToVRHS = isScalarToVec(RHS); if (SToVLHS || SToVRHS) { + // FIXME: If both LHS and RHS are SCALAR_TO_VECTOR, but are not the + // same type and have differing element sizes, then do not perform + // the following transformation. 
The current transformation for + // SCALAR_TO_VECTOR assumes that both input vectors have the same + // element size. This will be updated in the future to account for + // differing sizes of the LHS and RHS. + if (SToVLHS && SToVRHS && + (SToVLHS.getValueType().getScalarSizeInBits() != + SToVRHS.getValueType().getScalarSizeInBits())) + return Res; + int NumEltsIn = SToVLHS ? SToVLHS.getValueType().getVectorNumElements() : SToVRHS.getValueType().getVectorNumElements(); int NumEltsOut = ShuffV.size(); diff --git a/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll b/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll index d8f9a08..c8d4b4b 100644 --- a/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll +++ b/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll @@ -170,7 +170,7 @@ define <2 x double> @buildd() { ; ; CHECK-AIX-LABEL: buildd: ; CHECK-AIX: # %bb.0: # %entry -; CHECK-AIX-NEXT: ld 3, L..C0(2) +; CHECK-AIX-NEXT: ld 3, L..C0(2) # @d ; CHECK-AIX-NEXT: lxvdsx 34, 0, 3 ; CHECK-AIX-NEXT: blr entry: @@ -2567,9 +2567,11 @@ define <2 x i64> @buildi2(i64 %arg, i32 %arg1) { ; ; CHECK-LE-LABEL: buildi2: ; CHECK-LE: # %bb.0: # %entry -; CHECK-LE-NEXT: mtfprwz f0, r4 +; CHECK-LE-NEXT: mtfprd f0, r4 ; CHECK-LE-NEXT: mtfprd f1, r3 -; CHECK-LE-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-NEXT: xxswapd vs0, vs0 +; CHECK-LE-NEXT: xxswapd v2, vs1 +; CHECK-LE-NEXT: xxmrgld v2, v2, vs0 ; CHECK-LE-NEXT: blr ; ; CHECK-AIX-LABEL: buildi2: diff --git a/llvm/test/CodeGen/PowerPC/scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/scalar_to_vector_shuffle.ll new file mode 100644 index 0000000..aa7484a --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/scalar_to_vector_shuffle.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-LE-P8 
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-LE-P9 +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-BE-P8 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-unknown-linux-gnu < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-BE-P9 + +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P8 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc64-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-64-P9 +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P8 +; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -mtriple=powerpc-ibm-aix < %s | \ +; RUN: FileCheck %s --check-prefix=CHECK-AIX-32-P9 + +define <16 x i8> @test_4_8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) local_unnamed_addr { +; CHECK-LE-P8-LABEL: test_4_8: +; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI0_0@toc@ha +; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P8-NEXT: lfdx f1, 0, r4 +; CHECK-LE-P8-NEXT: addi r3, r5, .LCPI0_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs2, 0, r3 +; CHECK-LE-P8-NEXT: xxswapd v2, f0 +; CHECK-LE-P8-NEXT: xxswapd v3, f1 +; CHECK-LE-P8-NEXT: xxswapd v4, vs2 +; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P8-NEXT: blr +; +; CHECK-LE-P9-LABEL: test_4_8: +; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: lfiwzx 
f0, 0, r3 +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l +; CHECK-LE-P9-NEXT: xxswapd v2, f0 +; CHECK-LE-P9-NEXT: lfd f0, 0(r4) +; CHECK-LE-P9-NEXT: lxv v4, 0(r3) +; CHECK-LE-P9-NEXT: xxswapd v3, f0 +; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P9-NEXT: blr +; +; CHECK-BE-P8-LABEL: test_4_8: +; CHECK-BE-P8: # %bb.0: # %entry +; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-P8-NEXT: addis r5, r2, .LCPI0_0@toc@ha +; CHECK-BE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-BE-P8-NEXT: addi r3, r5, .LCPI0_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-BE-P8-NEXT: blr +; +; CHECK-BE-P9-LABEL: test_4_8: +; CHECK-BE-P9: # %bb.0: # %entry +; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha +; CHECK-BE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI0_0@toc@l +; CHECK-BE-P9-NEXT: lxv v4, 0(r3) +; CHECK-BE-P9-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-BE-P9-NEXT: blr +; +; CHECK-AIX-64-P8-LABEL: test_4_8: +; CHECK-AIX-64-P8: # %bb.0: # %entry +; CHECK-AIX-64-P8-NEXT: ld r5, L..C0(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r5 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-64-P8-NEXT: blr +; +; CHECK-AIX-64-P9-LABEL: test_4_8: +; CHECK-AIX-64-P9: # %bb.0: # %entry +; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C0(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxsd v3, 0(r4) +; CHECK-AIX-64-P9-NEXT: xxsldwi v2, f0, f0, 1 +; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-64-P9-NEXT: blr +; +; CHECK-AIX-32-P8-LABEL: test_4_8: +; CHECK-AIX-32-P8: # %bb.0: # %entry +; CHECK-AIX-32-P8-NEXT: lwz r5, 4(r4) +; CHECK-AIX-32-P8-NEXT: lxsiwzx 
v2, 0, r3 +; CHECK-AIX-32-P8-NEXT: stw r5, -16(r1) +; CHECK-AIX-32-P8-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P8-NEXT: addi r4, r1, -32 +; CHECK-AIX-32-P8-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C0(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-32-P8-NEXT: xxmrghw v3, vs1, vs0 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: blr +; +; CHECK-AIX-32-P9-LABEL: test_4_8: +; CHECK-AIX-32-P9: # %bb.0: # %entry +; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, 4(r4) +; CHECK-AIX-32-P9-NEXT: stw r3, -16(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, 0(r4) +; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) +; CHECK-AIX-32-P9-NEXT: stw r3, -32(r1) +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C0(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxv vs1, -32(r1) +; CHECK-AIX-32-P9-NEXT: lxv v4, 0(r3) +; CHECK-AIX-32-P9-NEXT: xxmrghw v3, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P9-NEXT: blr +entry: + %0 = load <4 x i8>, ptr %a, align 4 + %bc1 = bitcast <4 x i8> %0 to i32 + %vecinit3 = insertelement <4 x i32> poison, i32 %bc1, i64 0 + %1 = load <8 x i8>, ptr %b, align 8 + %bc2 = bitcast <8 x i8> %1 to i64 + %vecinit6 = insertelement <2 x i64> undef, i64 %bc2, i64 0 + %2 = bitcast <4 x i32> %vecinit3 to <16 x i8> + %3 = bitcast <2 x i64> %vecinit6 to <16 x i8> + %shuffle = shufflevector <16 x i8> %2, <16 x i8> %3, <16 x i32> + ret <16 x i8> %shuffle +} -- 2.7.4