JIT: Improve block unrolling, enable AVX-512 (#85501)
authorEgor Bogatov <egorbo@gmail.com>
Wed, 3 May 2023 19:19:44 +0000 (21:19 +0200)
committerGitHub <noreply@github.com>
Wed, 3 May 2023 19:19:44 +0000 (21:19 +0200)
src/coreclr/jit/codegen.h
src/coreclr/jit/codegenxarch.cpp
src/coreclr/jit/lowerxarch.cpp

index 94d67a06ace08865f55201de8381fbdefc20ccda..070ce82e6ec22f4df6756afae6aca626bd97d14a 100644 (file)
@@ -780,6 +780,9 @@ protected:
 #endif
 
     void genSetRegToConst(regNumber targetReg, var_types targetType, GenTree* tree);
+#if defined(FEATURE_SIMD)
+    void genSetRegToConst(regNumber targetReg, var_types targetType, simd_t* val);
+#endif
     void genCodeForTreeNode(GenTree* treeNode);
     void genCodeForBinary(GenTreeOp* treeNode);
 
index c0b2404cefb58ac95c1d8dc083f33252d1f66358..1988aa632cf35b380a69c420cc93c3fa9fcfe424 100644 (file)
@@ -417,6 +417,123 @@ void CodeGen::instGen_Set_Reg_To_Imm(emitAttr  size,
     regSet.verifyRegUsed(reg);
 }
 
+#if defined(FEATURE_SIMD)
+//----------------------------------------------------------------------------------
+// genSetRegToConst: generate code to set target SIMD register to a given constant value
+//
+// Arguments:
+//    targetReg  - target SIMD register
+//    targetType - target's type
+//    simd_t     - constant data (its width depends on type)
+//
+void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t* val)
+{
+    emitter* emit = GetEmitter();
+    emitAttr attr = emitTypeSize(targetType);
+
+    switch (targetType)
+    {
+        case TYP_SIMD8:
+        {
+            simd8_t val8 = *(simd8_t*)val;
+            if (val8.IsAllBitsSet())
+            {
+                emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg);
+            }
+            else if (val8.IsZero())
+            {
+                emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg);
+            }
+            else
+            {
+                CORINFO_FIELD_HANDLE hnd = emit->emitSimd8Const(val8);
+                emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
+            }
+            break;
+        }
+        case TYP_SIMD12:
+        {
+            simd12_t val12 = *(simd12_t*)val;
+            if (val12.IsAllBitsSet())
+            {
+                emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg);
+            }
+            else if (val12.IsZero())
+            {
+                emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg);
+            }
+            else
+            {
+                simd16_t val16 = {};
+                memcpy(&val16, &val12, sizeof(val12));
+                CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(val16);
+                emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
+            }
+            break;
+        }
+        case TYP_SIMD16:
+        {
+            simd16_t val16 = *(simd16_t*)val;
+            if (val16.IsAllBitsSet())
+            {
+                emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg);
+            }
+            else if (val16.IsZero())
+            {
+                emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
+            }
+            else
+            {
+                CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(val16);
+                emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
+            }
+            break;
+        }
+        case TYP_SIMD32:
+        {
+            simd32_t val32 = *(simd32_t*)val;
+            if (val32.IsAllBitsSet() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
+            {
+                emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg);
+            }
+            else if (val32.IsZero())
+            {
+                emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
+            }
+            else
+            {
+                CORINFO_FIELD_HANDLE hnd = emit->emitSimd32Const(val32);
+                emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
+            }
+            break;
+        }
+        case TYP_SIMD64:
+        {
+            simd64_t val64 = *(simd64_t*)val;
+            if (val64.IsAllBitsSet() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F))
+            {
+                emit->emitIns_SIMD_R_R_R_I(INS_vpternlogd, attr, targetReg, targetReg, targetReg,
+                                           static_cast<int8_t>(0xFF));
+            }
+            else if (val64.IsZero())
+            {
+                emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
+            }
+            else
+            {
+                CORINFO_FIELD_HANDLE hnd = emit->emitSimd64Const(val64);
+                emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
+            }
+            break;
+        }
+        default:
+        {
+            unreached();
+        }
+    }
+}
+#endif // FEATURE_SIMD
+
 /***********************************************************************************
  *
  * Generate code to set a register 'targetReg' of type 'targetType' to the constant
@@ -484,147 +601,12 @@ void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, GenTre
 
         case GT_CNS_VEC:
         {
-            GenTreeVecCon* vecCon = tree->AsVecCon();
-
-            emitter* emit = GetEmitter();
-            emitAttr attr = emitTypeSize(targetType);
-
-            if (vecCon->IsAllBitsSet())
-            {
-                switch (attr)
-                {
-                    case EA_8BYTE:
-                    case EA_16BYTE:
-                    {
-                        emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg);
-                        return;
-                    }
-
 #if defined(FEATURE_SIMD)
-                    case EA_32BYTE:
-                    {
-                        if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
-                        {
-                            emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg);
-                            return;
-                        }
-                        break;
-                    }
-
-                    case EA_64BYTE:
-                    {
-                        assert(compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F));
-                        emit->emitIns_SIMD_R_R_R_I(INS_vpternlogd, attr, targetReg, targetReg, targetReg,
-                                                   static_cast<int8_t>(0xFF));
-                        return;
-                    }
-#endif // FEATURE_SIMD
-
-                    default:
-                    {
-                        unreached();
-                    }
-                }
-            }
-
-            if (vecCon->IsZero())
-            {
-                switch (attr)
-                {
-                    case EA_8BYTE:
-                    case EA_16BYTE:
-                    {
-                        emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg);
-                        return;
-                    }
-
-                    case EA_32BYTE:
-                    {
-                        if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX))
-                        {
-                            emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
-                            return;
-                        }
-                        break;
-                    }
-
-                    case EA_64BYTE:
-                    {
-                        if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F))
-                        {
-                            emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
-                            return;
-                        }
-                        break;
-                    }
-
-                    default:
-                    {
-                        unreached();
-                    }
-                }
-            }
-
-            switch (tree->TypeGet())
-            {
-#if defined(FEATURE_SIMD)
-                case TYP_SIMD8:
-                {
-                    simd8_t constValue;
-                    memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd8_t));
-
-                    CORINFO_FIELD_HANDLE hnd = emit->emitSimd8Const(constValue);
-                    emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
-                    break;
-                }
-
-                case TYP_SIMD12:
-                {
-                    simd16_t constValue = {};
-                    memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd12_t));
-
-                    CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(constValue);
-                    emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
-                    break;
-                }
-
-                case TYP_SIMD16:
-                {
-                    simd16_t constValue;
-                    memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd16_t));
-
-                    CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(constValue);
-                    emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
-                    break;
-                }
-
-                case TYP_SIMD32:
-                {
-                    simd32_t constValue;
-                    memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd32_t));
-
-                    CORINFO_FIELD_HANDLE hnd = emit->emitSimd32Const(constValue);
-                    emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
-                    break;
-                }
-
-                case TYP_SIMD64:
-                {
-                    simd64_t constValue;
-                    memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd64_t));
-
-                    CORINFO_FIELD_HANDLE hnd = emit->emitSimd64Const(constValue);
-                    emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
-                    break;
-                }
-#endif // FEATURE_SIMD
-
-                default:
-                {
-                    unreached();
-                }
-            }
-
+            GenTreeVecCon* vecCon = tree->AsVecCon();
+            genSetRegToConst(vecCon->GetRegNum(), targetType, &vecCon->gtSimdVal);
+#else
+            unreached();
+#endif
             break;
         }
 
@@ -3134,17 +3116,8 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
     // INITBLK zeroes a struct that contains GC pointers and can be observed by
     // other threads (i.e. when dstAddr is not an address of a local).
     // For example, this can happen when initializing a struct field of an object.
-    const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences();
-
-#ifdef TARGET_AMD64
-    // On Amd64 the JIT will not use SIMD stores for such structs and instead
-    // will always allocate a GP register for src node.
-    const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
-#else
-    // On X86 the JIT will use movq for structs that are larger than 16 bytes
-    // since it is more beneficial than using two mov-s from a GP register.
-    const bool willUseSimdMov = (size >= 16);
-#endif
+    const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences() && compiler->IsBaselineSimdIsaSupported();
+    const bool willUseSimdMov       = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
 
     if (!src->isContained())
     {
@@ -3152,14 +3125,8 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
     }
     else
     {
-        // If src is contained then it must be 0.
-        assert(src->IsIntegralConst(0));
         assert(willUseSimdMov);
-#ifdef TARGET_AMD64
         assert(size >= XMM_REGSIZE_BYTES);
-#else
-        assert(size % 8 == 0);
-#endif
     }
 
     emitter* emit = GetEmitter();
@@ -3167,57 +3134,20 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
     assert(size <= INT32_MAX);
     assert(dstOffset < (INT32_MAX - static_cast<int>(size)));
 
+#ifdef FEATURE_SIMD
     if (willUseSimdMov)
     {
         regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT);
-
-        unsigned regSize = compiler->roundDownSIMDSize(size);
+        unsigned  regSize   = compiler->roundDownSIMDSize(size);
         if (size < ZMM_RECOMMENDED_THRESHOLD)
         {
             // Involve ZMM only for large data due to possible downclocking.
             regSize = min(regSize, YMM_REGSIZE_BYTES);
         }
-
-        bool zeroing = false;
-        if (src->gtSkipReloadOrCopy()->IsIntegralConst(0))
-        {
-            // If the source is constant 0 then always use xorps, it's faster
-            // than copying the constant from a GPR to a XMM register.
-            emit->emitIns_SIMD_R_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg, srcXmmReg);
-            zeroing = true;
-        }
-        else
-        {
-            // TODO-AVX512-ARCH: Enable AVX-512 for non-zeroing initblk.
-            regSize = min(regSize, YMM_REGSIZE_BYTES);
-
-            if (compiler->compOpportunisticallyDependsOn(InstructionSet_Vector512))
-            {
-                emit->emitIns_R_R(INS_vpbroadcastd_gpr, EA_ATTR(regSize), srcXmmReg, srcIntReg);
-            }
-            else if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
-            {
-                emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false);
-                emit->emitIns_R_R(INS_vpbroadcastd, EA_ATTR(regSize), srcXmmReg, srcXmmReg);
-            }
-            else
-            {
-                emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false);
-
-                emit->emitIns_SIMD_R_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg, srcXmmReg);
-
-#ifdef TARGET_X86
-                // For x86, we need one more to convert it from 8 bytes to 16 bytes.
-                emit->emitIns_SIMD_R_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg, srcXmmReg);
-#endif
-
-                if (regSize == YMM_REGSIZE_BYTES)
-                {
-                    // Extend the bytes in the lower lanes to the upper lanes
-                    emit->emitIns_R_R_R_I(INS_vinsertf128, EA_32BYTE, srcXmmReg, srcXmmReg, srcXmmReg, 1);
-                }
-            }
-        }
+        var_types loadType = compiler->getSIMDTypeForSize(regSize);
+        simd_t    vecCon;
+        memset(&vecCon, (uint8_t)src->AsIntCon()->IconValue(), sizeof(simd_t));
+        genSetRegToConst(srcXmmReg, loadType, &vecCon);
 
         instruction simdMov      = simdUnalignedMovIns();
         unsigned    bytesWritten = 0;
@@ -3236,55 +3166,32 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 
         while (bytesWritten < size)
         {
-#ifdef TARGET_X86
-            if (!canUse16BytesSimdMov || (bytesWritten + regSize > size))
-            {
-                simdMov = INS_movq;
-                regSize = 8;
-            }
-#endif
             if (bytesWritten + regSize > size)
             {
+                // We have a remainder that is smaller than regSize.
                 break;
             }
 
             emitSimdMovs();
             dstOffset += regSize;
             bytesWritten += regSize;
-
-            if (!zeroing)
-            {
-                assert(regSize <= YMM_REGSIZE_BYTES);
-            }
-
-            if (!zeroing && regSize == YMM_REGSIZE_BYTES && size - bytesWritten < YMM_REGSIZE_BYTES)
-            {
-                regSize = XMM_REGSIZE_BYTES;
-            }
         }
 
         size -= bytesWritten;
 
-        // Handle the remainder by overlapping with previously processed data (only for zeroing)
-        if (zeroing && (size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES))
+        // Handle the remainder by overlapping with previously processed data
+        if ((size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES))
         {
-            if (isPow2(size) && (size <= REGSIZE_BYTES))
-            {
-                // For sizes like 1,2,4 and 8 we delegate handling to normal stores
-                // because that will be a single instruction that is smaller than SIMD mov
-            }
-            else
-            {
-                // Get optimal register size to cover the whole remainder (with overlapping)
-                regSize = compiler->roundUpSIMDSize(size);
+            // Get optimal register size to cover the whole remainder (with overlapping)
+            regSize = compiler->roundUpSIMDSize(size);
 
-                // Rewind dstOffset so we can fit a vector for the while remainder
-                dstOffset -= (regSize - size);
-                emitSimdMovs();
-                size = 0;
-            }
+            // Rewind dstOffset so we can fit a vector for the while remainder
+            dstOffset -= (regSize - size);
+            emitSimdMovs();
+            size = 0;
         }
     }
+#endif // FEATURE_SIMD
 
     assert((srcIntReg != REG_NA) || (size == 0));
 
index ca3c64dd2ff24ae8a5e34b19ffcbcc8354eb7dfa..d780773abba69a9994c6375b71ec782b98f8d0bc 100644 (file)
@@ -327,7 +327,6 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
             }
             else
             {
-
                 // The fill value of an initblk is interpreted to hold a
                 // value of (unsigned int8) however a constant of any size
                 // may practically reside on the evaluation stack. So extract
@@ -337,38 +336,22 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
 
                 ssize_t fill = src->AsIntCon()->IconValue() & 0xFF;
 
-                if (fill == 0)
+                const bool canUseSimd = !blkNode->IsOnHeapAndContainsReferences() && comp->IsBaselineSimdIsaSupported();
+                if (size > comp->getUnrollThreshold(Compiler::UnrollKind::Memset, canUseSimd))
                 {
-                    if (size >= XMM_REGSIZE_BYTES)
-                    {
-                        const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences();
-#ifdef TARGET_AMD64
-
-                        bool willUseOnlySimdMov = size % XMM_REGSIZE_BYTES == 0;
-                        if (!willUseOnlySimdMov)
-                        {
-                            // If we have a remainder we still might only use SIMD to process it (via overlapping)
-                            // unless it's more efficient to do that via scalar op (for sizes 1,2,4 and 8)
-                            const unsigned remainder = size % XMM_REGSIZE_BYTES;
-                            if (!isPow2(remainder) || (remainder > REGSIZE_BYTES))
-                            {
-                                willUseOnlySimdMov = true;
-                            }
-                        }
-#else
-                        const bool willUseOnlySimdMov = (size % 8 == 0);
-#endif
-                        if (willUseOnlySimdMov && canUse16BytesSimdMov)
-                        {
-                            src->SetContained();
-                        }
-                        else if (size > comp->getUnrollThreshold(Compiler::UnrollKind::Memset,
-                                                                 /*canUseSimd*/ canUse16BytesSimdMov))
-                        {
-                            // It turns out we can't use SIMD so the default threshold is too big
-                            goto TOO_BIG_TO_UNROLL;
-                        }
-                    }
+                    // It turns out we can't use SIMD so the default threshold is too big
+                    goto TOO_BIG_TO_UNROLL;
+                }
+                if (canUseSimd && (size >= XMM_REGSIZE_BYTES))
+                {
+                    // We're going to use SIMD (and only SIMD - we don't want to occupy a GPR register
+                    // with a fill value just to handle the remainder when we can do that with
+                    // an overlapped SIMD load).
+                    src->SetContained();
+                }
+                else if (fill == 0)
+                {
+                    // Leave as is - zero shouldn't be contained when we don't use SIMD.
                 }
 #ifdef TARGET_AMD64
                 else if (size >= REGSIZE_BYTES)