[X86] Add support for "light" AVX

author Ilya Tokar <tokarip@google.com>

Thu, 15 Dec 2022 20:00:27 +0000 (15:00 -0500)

committer Ilya Tokar <tokarip@google.com>

Tue, 24 Jan 2023 22:02:46 +0000 (17:02 -0500)
author Ilya Tokar <tokarip@google.com>
Thu, 15 Dec 2022 20:00:27 +0000 (15:00 -0500)
committer Ilya Tokar <tokarip@google.com>
Tue, 24 Jan 2023 22:02:46 +0000 (17:02 -0500)
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td

index 370deab..83bd2ff 100644 (file)
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -615,6 +615,10 @@ def TuningPrefer256Bit
      : SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true",
                         "Prefer 256-bit AVX instructions">;
  
+def TuningAllowLight256Bit
+    : SubtargetFeature<"allow-light-256-bit", "AllowLight256Bit", "true",
+                       "Enable generation of 256-bit load/stores even if we prefer 128-bit">;
+
  def TuningPreferMaskRegisters
      : SubtargetFeature<"prefer-mask-registers", "PreferMaskRegisters", "true",
                         "Prefer AVX512 mask registers over PTEST/MOVMSK">;
@@ -777,7 +781,8 @@ def ProcessorFeatures {
                                        TuningFastVariablePerLaneShuffle,
                                        TuningPOPCNTFalseDeps,
                                        TuningLZCNTFalseDeps,
-                                      TuningInsertVZEROUPPER];
+                                      TuningInsertVZEROUPPER,
+                                      TuningAllowLight256Bit];
    list<SubtargetFeature> HSWFeatures =
      !listconcat(IVBFeatures, HSWAdditionalFeatures);
  
@@ -805,7 +810,8 @@ def ProcessorFeatures {
                                        TuningFastVariableCrossLaneShuffle,
                                        TuningFastVariablePerLaneShuffle,
                                        TuningPOPCNTFalseDeps,
-                                      TuningInsertVZEROUPPER];
+                                      TuningInsertVZEROUPPER,
+                                      TuningAllowLight256Bit];
    list<SubtargetFeature> SKLFeatures =
      !listconcat(BDWFeatures, SKLAdditionalFeatures);
  
@@ -833,7 +839,8 @@ def ProcessorFeatures {
                                        TuningFastVariablePerLaneShuffle,
                                        TuningPrefer256Bit,
                                        TuningPOPCNTFalseDeps,
-                                      TuningInsertVZEROUPPER];
+                                      TuningInsertVZEROUPPER,
+                                      TuningAllowLight256Bit];
    list<SubtargetFeature> SKXFeatures =
      !listconcat(BDWFeatures, SKXAdditionalFeatures);
  
@@ -870,7 +877,8 @@ def ProcessorFeatures {
                                        TuningFastVariableCrossLaneShuffle,
                                        TuningFastVariablePerLaneShuffle,
                                        TuningPrefer256Bit,
-                                      TuningInsertVZEROUPPER];
+                                      TuningInsertVZEROUPPER,
+                                      TuningAllowLight256Bit];
    list<SubtargetFeature> CNLFeatures =
      !listconcat(SKLFeatures, CNLAdditionalFeatures);
  
@@ -894,7 +902,8 @@ def ProcessorFeatures {
                                        TuningFastVariableCrossLaneShuffle,
                                        TuningFastVariablePerLaneShuffle,
                                        TuningPrefer256Bit,
-                                      TuningInsertVZEROUPPER];
+                                      TuningInsertVZEROUPPER,
+                                      TuningAllowLight256Bit];
    list<SubtargetFeature> ICLFeatures =
      !listconcat(CNLFeatures, ICLAdditionalFeatures);
  
@@ -1276,7 +1285,8 @@ def ProcessorFeatures {
                                       TuningFastMOVBE,
                                       TuningSlowSHLD,
                                       TuningSBBDepBreaking,
-                                     TuningInsertVZEROUPPER];
+                                     TuningInsertVZEROUPPER,
+                                     TuningAllowLight256Bit];
    list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
                                                    FeatureRDPID,
                                                    FeatureRDPRU,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp

index 10b4dcb..a33ee63 100644 (file)
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2702,7 +2702,7 @@ EVT X86TargetLowering::getOptimalMemOpType(
        }
        // FIXME: Check if unaligned 32-byte accesses are slow.
        if (Op.size() >= 32 && Subtarget.hasAVX() &&
-          (Subtarget.getPreferVectorWidth() >= 256)) {
+          Subtarget.useLight256BitInstructions()) {
          // Although this isn't a well-supported type for AVX1, we'll let
          // legalization and shuffle lowering produce the optimal codegen. If we
          // choose an optimal type with a vector element larger than a byte,
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h

index c6bf502..4c7123a 100644 (file)
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -255,6 +255,10 @@ public:
      return hasAVX512() && (canExtendTo512DQ() || RequiredVectorWidth > 256);
    }
  
+  bool useLight256BitInstructions() const {
+    return getPreferVectorWidth() >= 256 || AllowLight256Bit;
+  }
+
    bool useBWIRegs() const {
      return hasBWI() && useAVX512Regs();
    }
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h

index d0abfe2..ef8c4a1 100644 (file)
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -92,6 +92,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
        // Perf-tuning flags.
        X86::TuningFastGather,
        X86::TuningSlowUAMem32,
+      X86::TuningAllowLight256Bit,
  
        // Based on whether user set the -mprefer-vector-width command line.
        X86::TuningPrefer128Bit,
diff --git a/llvm/test/CodeGen/X86/memcpy-light-avx.ll b/llvm/test/CodeGen/X86/memcpy-light-avx.ll

new file mode 100644 (file)

index 0000000..248dc93
--- /dev/null
+++ b/llvm/test/CodeGen/X86/memcpy-light-avx.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell -mattr=prefer-128-bit | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=znver1 -mattr=prefer-128-bit | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2,+prefer-128-bit,+allow-light-256-bit | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2,+prefer-128-bit,-allow-light-256-bit | FileCheck %s --check-prefixes=NO256
+
+declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind
+
+define void @test1(ptr %a, ptr %b) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vmovups (%rsi), %ymm0
+; CHECK-NEXT:    vmovups %ymm0, (%rdi)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+;
+; NO256-LABEL: test1:
+; NO256:       # %bb.0:
+; NO256-NEXT:    vmovups (%rsi), %xmm0
+; NO256-NEXT:    vmovups 16(%rsi), %xmm1
+; NO256-NEXT:    vmovups %xmm1, 16(%rdi)
+; NO256-NEXT:    vmovups %xmm0, (%rdi)
+; NO256-NEXT:    retq
+  tail call void @llvm.memcpy.p0.p0.i64(ptr %a, ptr %b, i64 32, i1 0 )
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/vector-width-store-merge.ll b/llvm/test/CodeGen/X86/vector-width-store-merge.ll

index ea61fd3..50c7b01 100644 (file)
--- a/llvm/test/CodeGen/X86/vector-width-store-merge.ll
+++ b/llvm/test/CodeGen/X86/vector-width-store-merge.ll
@@ -1,18 +1,27 @@
  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=skylake| FileCheck %s --check-prefixes=CHECK,PREFER256
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=sandybridge| FileCheck %s --check-prefixes=CHECK,LIGHT256
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1| FileCheck %s --check-prefixes=CHECK,PREFER256
  
  ; This tests whether or not we generate vectors larger than the preferred vector width when
  ; lowering memmove.
  
  ; Function Attrs: nounwind uwtable
  define weak_odr dso_local void @A(ptr %src, ptr %dst) local_unnamed_addr #0 {
-; CHECK-LABEL: A:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovups (%rdi), %xmm0
-; CHECK-NEXT:    vmovups 16(%rdi), %xmm1
-; CHECK-NEXT:    vmovups %xmm1, 16(%rsi)
-; CHECK-NEXT:    vmovups %xmm0, (%rsi)
-; CHECK-NEXT:    retq
+; PREFER256-LABEL: A:
+; PREFER256:       # %bb.0: # %entry
+; PREFER256-NEXT:    vmovups (%rdi), %ymm0
+; PREFER256-NEXT:    vmovups %ymm0, (%rsi)
+; PREFER256-NEXT:    vzeroupper
+; PREFER256-NEXT:    retq
+;
+; LIGHT256-LABEL: A:
+; LIGHT256:       # %bb.0: # %entry
+; LIGHT256-NEXT:    vmovups (%rdi), %xmm0
+; LIGHT256-NEXT:    vmovups 16(%rdi), %xmm1
+; LIGHT256-NEXT:    vmovups %xmm1, 16(%rsi)
+; LIGHT256-NEXT:    vmovups %xmm0, (%rsi)
+; LIGHT256-NEXT:    retq
  entry:
    call void @llvm.memmove.p0.p0.i64(ptr align 1 %dst, ptr align 1 %src, i64 32, i1 false)
    ret void
@@ -20,17 +29,26 @@ entry:
  
  ; Function Attrs: nounwind uwtable
  define weak_odr dso_local void @B(ptr %src, ptr %dst) local_unnamed_addr #0 {
-; CHECK-LABEL: B:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vmovups (%rdi), %xmm0
-; CHECK-NEXT:    vmovups 16(%rdi), %xmm1
-; CHECK-NEXT:    vmovups 32(%rdi), %xmm2
-; CHECK-NEXT:    vmovups 48(%rdi), %xmm3
-; CHECK-NEXT:    vmovups %xmm3, 48(%rsi)
-; CHECK-NEXT:    vmovups %xmm2, 32(%rsi)
-; CHECK-NEXT:    vmovups %xmm1, 16(%rsi)
-; CHECK-NEXT:    vmovups %xmm0, (%rsi)
-; CHECK-NEXT:    retq
+; PREFER256-LABEL: B:
+; PREFER256:       # %bb.0: # %entry
+; PREFER256-NEXT:    vmovups (%rdi), %ymm0
+; PREFER256-NEXT:    vmovups 32(%rdi), %ymm1
+; PREFER256-NEXT:    vmovups %ymm1, 32(%rsi)
+; PREFER256-NEXT:    vmovups %ymm0, (%rsi)
+; PREFER256-NEXT:    vzeroupper
+; PREFER256-NEXT:    retq
+;
+; LIGHT256-LABEL: B:
+; LIGHT256:       # %bb.0: # %entry
+; LIGHT256-NEXT:    vmovups (%rdi), %xmm0
+; LIGHT256-NEXT:    vmovups 16(%rdi), %xmm1
+; LIGHT256-NEXT:    vmovups 32(%rdi), %xmm2
+; LIGHT256-NEXT:    vmovups 48(%rdi), %xmm3
+; LIGHT256-NEXT:    vmovups %xmm3, 48(%rsi)
+; LIGHT256-NEXT:    vmovups %xmm2, 32(%rsi)
+; LIGHT256-NEXT:    vmovups %xmm1, 16(%rsi)
+; LIGHT256-NEXT:    vmovups %xmm0, (%rsi)
+; LIGHT256-NEXT:    retq
  entry:
    call void @llvm.memmove.p0.p0.i64(ptr align 1 %dst, ptr align 1 %src, i64 64, i1 false)
    ret void
@@ -67,7 +85,7 @@ entry:
  ; Function Attrs: argmemonly nounwind
  declare void @llvm.memmove.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1 immarg) #1
  
-attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
  attributes #1 = { argmemonly nounwind }
  attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="256" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
author	Ilya Tokar <tokarip@google.com>
	Thu, 15 Dec 2022 20:00:27 +0000 (15:00 -0500)
committer	Ilya Tokar <tokarip@google.com>
	Tue, 24 Jan 2023 22:02:46 +0000 (17:02 -0500)
llvm/lib/Target/X86/X86.td		patch \| blob \| history
llvm/lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
llvm/lib/Target/X86/X86Subtarget.h		patch \| blob \| history
llvm/lib/Target/X86/X86TargetTransformInfo.h		patch \| blob \| history
llvm/test/CodeGen/X86/memcpy-light-avx.ll	[new file with mode: 0644]	patch \| blob
llvm/test/CodeGen/X86/vector-width-store-merge.ll		patch \| blob \| history