[P10] [Power PC] Exploiting new load rightmost vector element instructions.

author Albion Fung <albion.fung@ibm.com>

Tue, 9 Mar 2021 21:07:31 +0000 (16:07 -0500)

committer Albion Fung <conanap@lep82435v.canlab.ibm.com>

Tue, 9 Mar 2021 21:08:17 +0000 (16:08 -0500)
author Albion Fung <albion.fung@ibm.com>
Tue, 9 Mar 2021 21:07:31 +0000 (16:07 -0500)
committer Albion Fung <conanap@lep82435v.canlab.ibm.com>
Tue, 9 Mar 2021 21:08:17 +0000 (16:08 -0500)
diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td

index 0c6749c..14af94f 100644 (file)
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -2563,6 +2563,11 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, IsLittleEndian] in {
              (STXVRDX $src, xoaddr:$dst)>;
    def : Pat<(store (f64 (extractelt v2f64:$src, 0)), xoaddr:$dst),
              (STXVRDX $src, xoaddr:$dst)>;
+  // Load element 0 of a VSX register from memory
+  def : Pat<(v8i16 (scalar_to_vector (i32 (extloadi16 xoaddr:$src)))),
+            (v8i16 (COPY_TO_REGCLASS (LXVRHX xoaddr:$src), VSRC))>;
+  def : Pat<(v16i8 (scalar_to_vector (i32 (extloadi8 xoaddr:$src)))),
+            (v16i8 (COPY_TO_REGCLASS (LXVRBX xoaddr:$src), VSRC))>;
   }
  
  // FIXME: The swap is overkill when the shift amount is a constant.
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td

index e8babce..475098e 100644 (file)
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -152,6 +152,7 @@ def HasDirectMove : Predicate<"Subtarget->hasDirectMove()">;
  def NoP9Vector : Predicate<"!Subtarget->hasP9Vector()">;
  def HasP9Vector : Predicate<"Subtarget->hasP9Vector()">;
  def NoP9Altivec : Predicate<"!Subtarget->hasP9Altivec()">;
+def NoP10Vector: Predicate<"!Subtarget->hasP10Vector()">;
  
  //--------------------- VSX-specific instruction formats ---------------------//
  // By default, all VSX instructions are to be selected over their Altivec
@@ -2437,6 +2438,8 @@ def MrgWords {
  // [HasVSX, HasDirectMove, NoP9Altivec, IsLittleEndian]
  // [HasVSX, HasDirectMove, NoP9Vector, IsLittleEndian]
  // [HasVSX, HasP9Vector]
+// [HasVSX, HasP9Vector, NoP10Vector]
+// [HasVSX, HasP9Vector, IsBigEndian]
  // [HasVSX, HasP9Vector, IsBigEndian, IsPPC64]
  // [HasVSX, HasP9Vector, IsLittleEndian]
  // [HasVSX, HasP9Altivec]
@@ -3735,9 +3738,6 @@ def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
            (STXVX $rS, xoaddr:$dst)>;
  
  // Build vectors from i8 loads
-defm : ScalToVecWPermute<v16i8, ScalarLoads.Li8,
-                         (VSPLTBs 7, (LXSIBZX xoaddr:$src)),
-                         (VSPLTBs 7, (LXSIBZX xoaddr:$src))>;
  defm : ScalToVecWPermute<v8i16, ScalarLoads.ZELi8,
                           (VSPLTHs 3, (LXSIBZX xoaddr:$src)),
                           (VSPLTHs 3, (LXSIBZX xoaddr:$src))>;
@@ -3755,9 +3755,6 @@ defm : ScalToVecWPermute<v2i64, ScalarLoads.SELi8i64,
                           (XXPERMDIs (VEXTSB2Ds (LXSIBZX xoaddr:$src)), 0)>;
  
  // Build vectors from i16 loads
-defm : ScalToVecWPermute<v8i16, ScalarLoads.Li16,
-                         (VSPLTHs 3, (LXSIHZX xoaddr:$src)),
-                         (VSPLTHs 3, (LXSIHZX xoaddr:$src))>;
  defm : ScalToVecWPermute<v4i32, ScalarLoads.ZELi16,
                           (XXSPLTWs (LXSIHZX xoaddr:$src), 1),
                           (XXSPLTWs (LXSIHZX xoaddr:$src), 1)>;
@@ -3955,6 +3952,38 @@ def : Pat<(v4i32 (PPCldsplat xoaddr:$A)),
            (v4i32 (LXVWSX xoaddr:$A))>;
  } // HasVSX, HasP9Vector
  
+// Power9 VSX patterns for which Power10 VSX has an equal-length but better
+// alternative.
+// Two identical blocks are needed because their predicates differ slightly:
+// one excludes Power10 VSX, the other is big endian only (Power10 included).
+let Predicates = [HasVSX, HasP9Vector, NoP10Vector] in {
+// Build vectors from i8 loads.
+// Little endian Power10 subtargets have a shorter pattern that needs a
+// COPY_TO_REGCLASS, which looks like two instructions but emits only one.
+// The NoP10Vector predicate keeps these patterns off Power10 VSX subtargets.
+defm : ScalToVecWPermute<v16i8, ScalarLoads.Li8,
+                         (VSPLTBs 7, (LXSIBZX xoaddr:$src)),
+                         (VSPLTBs 7, (LXSIBZX xoaddr:$src))>;
+// Build vectors from i16 loads
+defm : ScalToVecWPermute<v8i16, ScalarLoads.Li16,
+                         (VSPLTHs 3, (LXSIHZX xoaddr:$src)),
+                         (VSPLTHs 3, (LXSIHZX xoaddr:$src))>;
+} // HasVSX, HasP9Vector, NoP10Vector
+
+// Any big endian Power9 VSX subtarget
+let Predicates = [HasVSX, HasP9Vector, IsBigEndian] in {
+// Power10 VSX subtargets produce a shorter pattern for little endian targets
+// but this is still the best pattern for Power9 and Power10 VSX big endian.
+// Build vectors from i8 loads
+defm : ScalToVecWPermute<v16i8, ScalarLoads.Li8,
+                         (VSPLTBs 7, (LXSIBZX xoaddr:$src)),
+                         (VSPLTBs 7, (LXSIBZX xoaddr:$src))>;
+// Build vectors from i16 loads
+defm : ScalToVecWPermute<v8i16, ScalarLoads.Li16,
+                         (VSPLTHs 3, (LXSIHZX xoaddr:$src)),
+                         (VSPLTHs 3, (LXSIHZX xoaddr:$src))>;
+} // HasVSX, HasP9Vector, IsBigEndian
+
  // Big endian 64Bit Power9 subtarget.
  let Predicates = [HasVSX, HasP9Vector, IsBigEndian, IsPPC64] in {
  def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
diff --git a/llvm/test/CodeGen/PowerPC/load-rightmost-vector-elt.ll b/llvm/test/CodeGen/PowerPC/load-rightmost-vector-elt.ll

new file mode 100644 (file)

index 0000000..a91ab8c
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/load-rightmost-vector-elt.ll
@@ -0,0 +1,63 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr10 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \
+; RUN:     < %s | FileCheck %s --check-prefix=CHECK-P10LE
+
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr10 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \
+; RUN:     < %s | FileCheck %s --check-prefix=CHECK-P10BE
+
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \
+; RUN:     < %s | FileCheck %s --check-prefix=CHECK-P9
+
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:     -mcpu=pwr9 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names \
+; RUN:     < %s | FileCheck %s --check-prefix=CHECK-P9
+
+define <8 x i16> @test1(i16*  %a) {
+; CHECK-P10LE-LABEL: test1:
+; CHECK-P10LE:       # %bb.0: # %entry
+; CHECK-P10LE-NEXT:    lxvrhx v2, 0, r3
+; CHECK-P10LE-NEXT:    blr
+;
+; CHECK-P10BE-LABEL: test1:
+; CHECK-P10BE:       # %bb.0: # %entry
+; CHECK-P10BE-NEXT:    lxsihzx v2, 0, r3
+; CHECK-P10BE-NEXT:    vsplth v2, v2, 3
+; CHECK-P10BE-NEXT:    blr
+;
+; CHECK-P9-LABEL: test1:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxsihzx v2, 0, r3
+; CHECK-P9-NEXT:    vsplth v2, v2, 3
+; CHECK-P9-NEXT:    blr
+entry:
+  %0 = load i16, i16* %a, align 2
+  %vecinit = insertelement <8 x i16> undef, i16 %0, i32 0
+  ret <8 x i16> %vecinit
+}
+
+define <16 x i8> @test2(i8*  %a) {
+; CHECK-P10LE-LABEL: test2:
+; CHECK-P10LE:       # %bb.0: # %entry
+; CHECK-P10LE-NEXT:    lxvrbx v2, 0, r3
+; CHECK-P10LE-NEXT:    blr
+;
+; CHECK-P10BE-LABEL: test2:
+; CHECK-P10BE:       # %bb.0: # %entry
+; CHECK-P10BE-NEXT:    lxsibzx v2, 0, r3
+; CHECK-P10BE-NEXT:    vspltb v2, v2, 7
+; CHECK-P10BE-NEXT:    blr
+;
+; CHECK-P9-LABEL: test2:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxsibzx v2, 0, r3
+; CHECK-P9-NEXT:    vspltb v2, v2, 7
+; CHECK-P9-NEXT:    blr
+entry:
+  %0 = load i8, i8* %a, align 1
+  %vecins = insertelement <16 x i8> undef, i8 %0, i32 0
+  ret <16 x i8> %vecins
+}
+
author	Albion Fung <albion.fung@ibm.com>
	Tue, 9 Mar 2021 21:07:31 +0000 (16:07 -0500)
committer	Albion Fung <conanap@lep82435v.canlab.ibm.com>
	Tue, 9 Mar 2021 21:08:17 +0000 (16:08 -0500)
llvm/lib/Target/PowerPC/PPCInstrPrefix.td		patch \| blob \| history
llvm/lib/Target/PowerPC/PPCInstrVSX.td		patch \| blob \| history
llvm/test/CodeGen/PowerPC/load-rightmost-vector-elt.ll	[new file with mode: 0644]	patch \| blob