Remove AVX/SSE transition penalties

author Li Tian <litian2025@gmail.com>

Mon, 12 Dec 2016 03:13:28 +0000 (19:13 -0800)

committer Li Tian <litian2025@gmail.com>

Sun, 8 Jan 2017 20:44:12 +0000 (12:44 -0800)
author Li Tian <litian2025@gmail.com>
Mon, 12 Dec 2016 03:13:28 +0000 (19:13 -0800)
committer Li Tian <litian2025@gmail.com>
Sun, 8 Jan 2017 20:44:12 +0000 (12:44 -0800)
diff --git a/src/coreclr/src/jit/codegen.h b/src/coreclr/src/jit/codegen.h

index c6e38ab..15abbbf 100755 (executable)
--- a/src/coreclr/src/jit/codegen.h
+++ b/src/coreclr/src/jit/codegen.h
@@ -390,6 +390,8 @@ protected:
      // Save/Restore callee saved float regs to stack
      void genPreserveCalleeSavedFltRegs(unsigned lclFrameSize);
      void genRestoreCalleeSavedFltRegs(unsigned lclFrameSize);
+    // Generate VZeroupper instruction to avoid AVX/SSE transition penalty
+    bool genVzeroupperIfNeeded(bool check256bitOnly = true);
  
  #endif // _TARGET_XARCH_ && FEATURE_STACK_FP_X87
  
diff --git a/src/coreclr/src/jit/codegencommon.cpp b/src/coreclr/src/jit/codegencommon.cpp

index 2409115..000051c 100644 (file)
--- a/src/coreclr/src/jit/codegencommon.cpp
+++ b/src/coreclr/src/jit/codegencommon.cpp
@@ -10583,7 +10583,8 @@ GenTreePtr CodeGen::genMakeConst(const void* cnsAddr, var_types cnsType, GenTree
  //             funclet frames: this will be FuncletInfo.fiSpDelta.
  void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
  {
-    regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;
+    bool      bVzeroupperIssued = genVzeroupperIfNeeded(false);
+    regMaskTP regMask           = compiler->compCalleeFPRegsSavedMask;
  
      // Only callee saved floating point registers should be in regMask
      assert((regMask & RBM_FLT_CALLEE_SAVED) == regMask);
@@ -10611,6 +10612,17 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
          regMaskTP regBit = genRegMask(reg);
          if ((regBit & regMask) != 0)
          {
+#ifdef FEATURE_AVX_SUPPORT
+            // When we reach here, the function contains no AVX instruction so far. However, since copyIns can
+            // be an AVX instruction such as vmovupd, we check and issue vzeroupper before copyIns to avoid the
+            // legacy SSE (from native code, e.g. Reverse PInvoke) to AVX transition penalty.
+            if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX && !bVzeroupperIssued &&
+                getEmitter()->IsAVXInstruction(copyIns))
+            {
+                instGen(INS_vzeroupper);
+                bVzeroupperIssued = true;
+            }
+#endif
              // ABI requires us to preserve lower 128-bits of YMM register.
              getEmitter()->emitIns_AR_R(copyIns,
                                         EA_8BYTE, // TODO-XArch-Cleanup: size specified here doesn't matter but should be
@@ -10621,16 +10633,6 @@ void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
              offset -= XMM_REGSIZE_BYTES;
          }
      }
-
-#ifdef FEATURE_AVX_SUPPORT
-    // Just before restoring float registers issue a Vzeroupper to zero out upper 128-bits of all YMM regs.
-    // This is to avoid penalty if this routine is using AVX-256 and now returning to a routine that is
-    // using SSE2.
-    if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX)
-    {
-        instGen(INS_vzeroupper);
-    }
-#endif
  }
  
  // Save/Restore compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
@@ -10651,6 +10653,7 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
      // fast path return
      if (regMask == RBM_NONE)
      {
+        genVzeroupperIfNeeded();
          return;
      }
  
@@ -10682,16 +10685,6 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
      assert((offset % 16) == 0);
  #endif // _TARGET_AMD64_
  
-#ifdef FEATURE_AVX_SUPPORT
-    // Just before restoring float registers issue a Vzeroupper to zero out upper 128-bits of all YMM regs.
-    // This is to avoid penalty if this routine is using AVX-256 and now returning to a routine that is
-    // using SSE2.
-    if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX)
-    {
-        instGen(INS_vzeroupper);
-    }
-#endif
-
      for (regNumber reg = REG_FLT_CALLEE_SAVED_FIRST; regMask != RBM_NONE; reg = REG_NEXT(reg))
      {
          regMaskTP regBit = genRegMask(reg);
@@ -10706,7 +10699,46 @@ void CodeGen::genRestoreCalleeSavedFltRegs(unsigned lclFrameSize)
              offset -= XMM_REGSIZE_BYTES;
          }
      }
+    genVzeroupperIfNeeded();
+}
+
+// genVzeroupperIfNeeded: Emit a vzeroupper instruction when needed, zeroing the upper 128 bits of all
+// YMM registers so that the AVX/legacy SSE transition penalties can be avoided.
+//
+// Arguments:
+//    check256bitOnly - if true, emit only when the emitter reports 256-bit AVX instructions in the method;
+//                      if false, emit when it reports any AVX instruction (either 128-bit or 256-bit).
+//
+// Return Value:
+//    true if a vzeroupper instruction was issued, false otherwise (always false without FEATURE_AVX_SUPPORT).
+//
+bool CodeGen::genVzeroupperIfNeeded(bool check256bitOnly)
+{
+    bool bVzeroupperIssued = false;
+#ifdef FEATURE_AVX_SUPPORT
+    if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX)
+    {
+        if (check256bitOnly)
+        {
+            if (getEmitter()->Contains256bitAVX())
+            {
+                instGen(INS_vzeroupper);
+                bVzeroupperIssued = true;
+            }
+        }
+        else
+        {
+            if (getEmitter()->ContainsAVX())
+            {
+                instGen(INS_vzeroupper);
+                bVzeroupperIssued = true;
+            }
+        }
+    }
+#endif
+    return bVzeroupperIssued;
  }
+
  #endif // defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87
  
  //-----------------------------------------------------------------------------------
diff --git a/src/coreclr/src/jit/codegenxarch.cpp b/src/coreclr/src/jit/codegenxarch.cpp

index 8e0af48..495bc7e 100644 (file)
--- a/src/coreclr/src/jit/codegenxarch.cpp
+++ b/src/coreclr/src/jit/codegenxarch.cpp
@@ -5001,6 +5001,20 @@ void CodeGen::genCallInstruction(GenTreePtr node)
  
  #endif // defined(_TARGET_X86_)
  
+#ifdef FEATURE_AVX_SUPPORT
+    // When this is a PInvoke call and the call type is a USER function, issue VZEROUPPER here if the
+    // method contains 256-bit AVX instructions. This avoids the AVX-256 to legacy SSE transition
+    // penalty, assuming the user function contains legacy SSE instructions.
+    if (call->IsPInvoke() && call->gtCallType == CT_USER_FUNC &&
+        compiler->getFloatingPointInstructionSet() == InstructionSet_AVX)
+    {
+        if (getEmitter()->Contains256bitAVX())
+        {
+            instGen(INS_vzeroupper);
+        }
+    }
+#endif
+
      if (target != nullptr)
      {
  #ifdef _TARGET_X86_
diff --git a/src/coreclr/src/jit/compiler.cpp b/src/coreclr/src/jit/compiler.cpp

index 30eccc3..47d3c35 100644 (file)
--- a/src/coreclr/src/jit/compiler.cpp
+++ b/src/coreclr/src/jit/compiler.cpp
@@ -2310,6 +2310,9 @@ void Compiler::compSetProcessor()
          if (opts.compCanUseAVX)
          {
              codeGen->getEmitter()->SetUseAVX(true);
+            // Assume each JITted method does not contain AVX instruction at first
+            codeGen->getEmitter()->SetContainsAVX(false);
+            codeGen->getEmitter()->SetContains256bitAVX(false);
          }
          else
  #endif // FEATURE_AVX_SUPPORT
diff --git a/src/coreclr/src/jit/emitxarch.h b/src/coreclr/src/jit/emitxarch.h

index 98256cd..4753c14 100644 (file)
--- a/src/coreclr/src/jit/emitxarch.h
+++ b/src/coreclr/src/jit/emitxarch.h
@@ -150,6 +150,26 @@ void SetUseAVX(bool value)
      useAVXEncodings = value;
  }
  
+bool containsAVXInstruction; // whether the method being compiled has been flagged as containing AVX instructions
+bool ContainsAVX() // returns the current value of the flag
+{
+    return containsAVXInstruction;
+}
+void SetContainsAVX(bool value) // overwrites the flag with 'value'
+{
+    containsAVXInstruction = value;
+}
+
+bool contains256bitAVXInstruction; // whether the method has been flagged as containing 256-bit AVX instructions
+bool Contains256bitAVX() // returns the current value of the flag
+{
+    return contains256bitAVXInstruction;
+}
+void SetContains256bitAVX(bool value) // overwrites the flag with 'value'
+{
+    contains256bitAVXInstruction = value;
+}
+
  bool IsThreeOperandBinaryAVXInstruction(instruction ins);
  bool IsThreeOperandMoveAVXInstruction(instruction ins);
  bool IsThreeOperandAVXInstruction(instruction ins)
@@ -162,6 +182,14 @@ bool                     UseAVX()
  {
      return false;
  }
+bool ContainsAVX() // NOTE(review): presumably the stub for builds without AVX support (sits beside UseAVX()) - confirm
+{
+    return false;
+}
+bool Contains256bitAVX() // NOTE(review): presumably the stub for builds without AVX support - confirm
+{
+    return false;
+}
  bool hasVexPrefix(code_t code)
  {
      return false;
diff --git a/src/coreclr/src/jit/lower.h b/src/coreclr/src/jit/lower.h

index 555b9e2..da47cf9 100644 (file)
--- a/src/coreclr/src/jit/lower.h
+++ b/src/coreclr/src/jit/lower.h
@@ -235,6 +235,7 @@ private:
  
  #if defined(_TARGET_XARCH_)
      void SetMulOpCounts(GenTreePtr tree);
+    void SetContainsAVXFlags(bool isFloatingType = true, unsigned sizeOfSIMDVector = 0);
  #endif // defined(_TARGET_XARCH_)
  
  #if !CPU_LOAD_STORE_ARCH
diff --git a/src/coreclr/src/jit/lowerxarch.cpp b/src/coreclr/src/jit/lowerxarch.cpp

index bf5d29c..b9df5cd 100644 (file)
--- a/src/coreclr/src/jit/lowerxarch.cpp
+++ b/src/coreclr/src/jit/lowerxarch.cpp
@@ -166,7 +166,8 @@ void Lowering::TreeNodeInfoInit(GenTree* tree)
      Compiler*   compiler = comp;
  
      TreeNodeInfo* info = &(tree->gtLsraInfo);
-
+    // floating type generates AVX instruction (vmovss etc.), set the flag
+    SetContainsAVXFlags(varTypeIsFloating(tree->TypeGet()));
      switch (tree->OperGet())
      {
          GenTree* op1;
@@ -1773,6 +1774,8 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
                      {
                          MakeSrcContained(blkNode, source);
                      }
+                    // use XMM register to fill with constants, it's AVX instruction and set the flag
+                    SetContainsAVXFlags();
                  }
                  blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
  
@@ -1954,6 +1957,8 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode)
                      // series of 16-byte loads and stores.
                      blkNode->gtLsraInfo.internalFloatCount = 1;
                      blkNode->gtLsraInfo.addInternalCandidates(l, l->internalFloatRegCandidates());
+                    // use XMM register for load and store, need set the flag for AVX instruction
+                    SetContainsAVXFlags();
                  }
  
                  // If src or dst are on stack, we don't have to generate the address into a register
@@ -2732,6 +2737,7 @@ void Lowering::TreeNodeInfoInitSIMD(GenTree* tree)
      TreeNodeInfo* info     = &(tree->gtLsraInfo);
      LinearScan*   lsra     = m_lsra;
      info->dstCount         = 1;
+    SetContainsAVXFlags(true, simdTree->gtSIMDSize);
      switch (simdTree->gtSIMDIntrinsicID)
      {
          GenTree* op1;
@@ -4573,6 +4579,32 @@ void Lowering::SetMulOpCounts(GenTreePtr tree)
  }
  
  //------------------------------------------------------------------------------
+// SetContainsAVXFlags: when the SIMD instruction set is AVX and isFloatingType is true, mark the
+// emitter as containing AVX; if sizeOfSIMDVector is 32 (bytes), also mark it as containing 256-bit AVX.
+// NOTE(review): the two flags are set via comp->getEmitter() and comp->codeGen->getEmitter()
+// respectively - presumably the same emitter object; confirm and use one accessor consistently.
+// Arguments:
+//    isFloatingType    - if false, no flag is changed (declared default: true)
+//    sizeOfSIMDVector  - SIMD vector size in bytes (declared default: 0)
+//
+void Lowering::SetContainsAVXFlags(bool isFloatingType, unsigned sizeOfSIMDVector)
+{
+#ifdef FEATURE_AVX_SUPPORT
+    if (comp->getSIMDInstructionSet() == InstructionSet_AVX)
+    {
+        if (isFloatingType)
+        {
+            comp->getEmitter()->SetContainsAVX(true);
+            if (sizeOfSIMDVector == 32)
+            {
+                comp->codeGen->getEmitter()->SetContains256bitAVX(true);
+            }
+        }
+    }
+#endif
+}
+
+//------------------------------------------------------------------------------
  // isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format
  //
  // Arguments:
author	Li Tian <litian2025@gmail.com>
	Mon, 12 Dec 2016 03:13:28 +0000 (19:13 -0800)
committer	Li Tian <litian2025@gmail.com>
	Sun, 8 Jan 2017 20:44:12 +0000 (12:44 -0800)
src/coreclr/src/jit/codegen.h		patch \| blob \| history
src/coreclr/src/jit/codegencommon.cpp		patch \| blob \| history
src/coreclr/src/jit/codegenxarch.cpp		patch \| blob \| history
src/coreclr/src/jit/compiler.cpp		patch \| blob \| history
src/coreclr/src/jit/emitxarch.h		patch \| blob \| history
src/coreclr/src/jit/lower.h		patch \| blob \| history
src/coreclr/src/jit/lowerxarch.cpp		patch \| blob \| history