Add a feature flag for slow 32-byte unaligned memory accesses [x86].

author Sanjay Patel <spatel@rotateright.com>

Fri, 21 Nov 2014 17:40:04 +0000 (17:40 +0000)

committer Sanjay Patel <spatel@rotateright.com>

Fri, 21 Nov 2014 17:40:04 +0000 (17:40 +0000)
author Sanjay Patel <spatel@rotateright.com>
Fri, 21 Nov 2014 17:40:04 +0000 (17:40 +0000)
committer Sanjay Patel <spatel@rotateright.com>
Fri, 21 Nov 2014 17:40:04 +0000 (17:40 +0000)
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td

index 9729f46..f553a58 100644 (file)
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -82,6 +82,9 @@ def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
  def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
                                          "IsUAMemFast", "true",
                                          "Fast unaligned memory access">;
+def FeatureSlowUAMem32 : SubtargetFeature<"slow-unaligned-mem-32",
+                            "IsUAMem32Slow", "true",
+                            "Slow unaligned 32-byte memory access">;
  def FeatureSSE4A   : SubtargetFeature<"sse4a", "HasSSE4A", "true",
                                        "Support SSE 4a instructions",
                                        [FeatureSSE3]>;
@@ -271,12 +274,14 @@ def : ProcessorModel<"westmere", SandyBridgeModel,
  // rather than a superset.
  def : ProcessorModel<"corei7-avx", SandyBridgeModel,
                       [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
-                      FeaturePOPCNT, FeatureAES, FeaturePCLMUL]>;
+                      FeatureSlowUAMem32, FeaturePOPCNT, FeatureAES,
+                      FeaturePCLMUL]>;
  // Ivy Bridge
  def : ProcessorModel<"core-avx-i", SandyBridgeModel,
                       [FeatureAVX, FeatureCMPXCHG16B, FeatureFastUAMem,
-                      FeaturePOPCNT, FeatureAES, FeaturePCLMUL, FeatureRDRAND,
-                      FeatureF16C, FeatureFSGSBase]>;
+                      FeatureSlowUAMem32, FeaturePOPCNT, FeatureAES,
+                      FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
+                      FeatureFSGSBase]>;
  
  // Haswell
  def : ProcessorModel<"core-avx2", HaswellModel,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp

index 4ec7fa1..07e96ff 100644 (file)
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -24376,11 +24376,12 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
    SDLoc dl(Ld);
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  
-  // On Sandybridge unaligned 256bit loads are inefficient.
+  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
+  // into two 16-byte operations.
    ISD::LoadExtType Ext = Ld->getExtensionType();
    unsigned Alignment = Ld->getAlignment();
    bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8;
-  if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+  if (RegVT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
        !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
      unsigned NumElems = RegVT.getVectorNumElements();
      if (NumElems < 2)
@@ -24423,13 +24424,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
    SDValue StoredVal = St->getOperand(1);
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  
-  // If we are saving a concatenation of two XMM registers, perform two stores.
-  // On Sandy Bridge, 256-bit memory operations are executed by two
-  // 128-bit ports. However, on Haswell it is better to issue a single 256-bit
-  // memory  operation.
+  // If we are saving a concatenation of two XMM registers and 32-byte stores
+  // are slow, such as on Sandy Bridge, perform two 16-byte stores.
    unsigned Alignment = St->getAlignment();
    bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8;
-  if (VT.is256BitVector() && !Subtarget->hasInt256() &&
+  if (VT.is256BitVector() && Subtarget->isUnalignedMem32Slow() &&
        StVT == VT && !IsAligned) {
      unsigned NumElems = VT.getVectorNumElements();
      if (NumElems < 2)
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp

index afa0173..e59395c 100644 (file)
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -264,6 +264,7 @@ void X86Subtarget::initializeEnvironment() {
    IsBTMemSlow = false;
    IsSHLDSlow = false;
    IsUAMemFast = false;
+  IsUAMem32Slow = false;
    HasVectorUAMem = false;
    HasCmpxchg16b = false;
    UseLeaForSP = false;
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h

index cf76ac7..3f508c5 100644 (file)
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -159,6 +159,9 @@ protected:
    /// IsUAMemFast - True if unaligned memory access is fast.
    bool IsUAMemFast;
  
+  /// True if unaligned 32-byte memory accesses are slow.
+  bool IsUAMem32Slow;
+  
    /// HasVectorUAMem - True if SIMD operations can have unaligned memory
    /// operands. This may require setting a feature bit in the processor.
    bool HasVectorUAMem;
@@ -374,6 +377,7 @@ public:
    bool isBTMemSlow() const { return IsBTMemSlow; }
    bool isSHLDSlow() const { return IsSHLDSlow; }
    bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
+  bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
    bool hasVectorUAMem() const { return HasVectorUAMem; }
    bool hasCmpxchg16b() const { return HasCmpxchg16b; }
    bool useLeaForSP() const { return UseLeaForSP; }
diff --git a/llvm/test/CodeGen/X86/2012-05-19-avx2-store.ll b/llvm/test/CodeGen/X86/2012-05-19-avx2-store.ll

deleted file mode 100644 (file)

index 1c1e8e2..0000000
--- a/llvm/test/CodeGen/X86/2012-05-19-avx2-store.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx2 | FileCheck %s
-
-define void @double_save(<4 x i32>* %Ap, <4 x i32>* %Bp, <8 x i32>* %P) nounwind ssp {
-entry:
-  ; CHECK: vmovaps
-  ; CHECK: vinsertf128 $1, ([[A0:%rdi|%rsi]]),
-  ; CHECK: vmovups
-  %A = load <4 x i32>* %Ap
-  %B = load <4 x i32>* %Bp
-  %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  store <8 x i32> %Z, <8 x i32>* %P, align 16
-  ret void
-}
diff --git a/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll b/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll

new file mode 100644 (file)

index 0000000..01342ba
--- /dev/null
+++ b/llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefix=CHECK --check-prefix=SANDYB
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx-i | FileCheck %s --check-prefix=CHECK --check-prefix=SANDYB
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
+
+; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte load
+; because that is slower than two 16-byte loads. 
+; Other AVX-capable chips don't have that problem.
+
+define <8 x float> @load32bytes(<8 x float>* %Ap) {
+  ; CHECK-LABEL: load32bytes
+
+  ; SANDYB: vmovaps
+  ; SANDYB: vinsertf128
+  ; SANDYB: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL: retq
+
+  %A = load <8 x float>* %Ap, align 16
+  ret <8 x float> %A
+}
+
+; On Sandy Bridge or Ivy Bridge, we should not generate an unaligned 32-byte store
+; because that is slower than two 16-byte stores.
+; Other AVX-capable chips don't have that problem.
+
+define void @store32bytes(<8 x float> %A, <8 x float>* %P) {
+  ; CHECK-LABEL: store32bytes
+
+  ; SANDYB: vextractf128
+  ; SANDYB: vmovaps
+  ; SANDYB: retq
+
+  ; BTVER2: vmovups
+  ; BTVER2: retq
+
+  ; HASWELL: vmovups
+  ; HASWELL: retq
+
+  store <8 x float> %A, <8 x float>* %P, align 16
+  ret void
+}
author	Sanjay Patel <spatel@rotateright.com>
	Fri, 21 Nov 2014 17:40:04 +0000 (17:40 +0000)
committer	Sanjay Patel <spatel@rotateright.com>
	Fri, 21 Nov 2014 17:40:04 +0000 (17:40 +0000)
llvm/lib/Target/X86/X86.td		patch \| blob \| history
llvm/lib/Target/X86/X86ISelLowering.cpp		patch \| blob \| history
llvm/lib/Target/X86/X86Subtarget.cpp		patch \| blob \| history
llvm/lib/Target/X86/X86Subtarget.h		patch \| blob \| history
llvm/test/CodeGen/X86/2012-05-19-avx2-store.ll	[deleted file]	patch \| blob \| history
llvm/test/CodeGen/X86/unaligned-32-byte-memops.ll	[new file with mode: 0644]	patch \| blob