regSet.verifyRegUsed(reg);
}
+#if defined(FEATURE_SIMD)
+//----------------------------------------------------------------------------------
+// genSetRegToConst: generate code to set the target SIMD register to a given constant value
+//
+// Arguments:
+// targetReg - target SIMD register
+// targetType - type of the constant (TYP_SIMD8 through TYP_SIMD64)
+// val - constant data; its width depends on targetType
+//
+void CodeGen::genSetRegToConst(regNumber targetReg, var_types targetType, simd_t* val)
+{
+ emitter* emit = GetEmitter();
+ emitAttr attr = emitTypeSize(targetType);
+
+ switch (targetType)
+ {
+ case TYP_SIMD8:
+ {
+ simd8_t val8 = *(simd8_t*)val;
+ if (val8.IsAllBitsSet())
+ {
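+ // All-ones can be materialized without a memory load: pcmpeqd compares the
+ // register with itself, and all-equal lanes set every bit.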
+ emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg);
+ }
+ else if (val8.IsZero())
+ {
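+ // xorps reg, reg is the standard dependency-breaking zeroing idiom.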
+ emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg);
+ }
+ else
+ {
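+ // Otherwise emit the constant into the data section and load it from there.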
+ CORINFO_FIELD_HANDLE hnd = emit->emitSimd8Const(val8);
+ emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
+ }
+ break;
+ }
+ case TYP_SIMD12:
+ {
+ simd12_t val12 = *(simd12_t*)val;
+ if (val12.IsAllBitsSet())
+ {
+ emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg);
+ }
+ else if (val12.IsZero())
+ {
+ emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg);
+ }
+ else
+ {
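+ // There is no 12-byte load, so zero-extend the constant to 16 bytes; the
+ // zero padding guarantees the upper element reads as zero.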
+ simd16_t val16 = {};
+ memcpy(&val16, &val12, sizeof(val12));
+ CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(val16);
+ emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
+ }
+ break;
+ }
+ case TYP_SIMD16:
+ {
+ simd16_t val16 = *(simd16_t*)val;
+ if (val16.IsAllBitsSet())
+ {
+ emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg);
+ }
+ else if (val16.IsZero())
+ {
+ emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
+ }
+ else
+ {
+ CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(val16);
+ emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
+ }
+ break;
+ }
+ case TYP_SIMD32:
+ {
+ simd32_t val32 = *(simd32_t*)val;
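+ // 256-bit pcmpeqd requires AVX2; without it the all-bits-set constant is
+ // instead loaded from the data section below.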
+ if (val32.IsAllBitsSet() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
+ {
+ emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg);
+ }
+ else if (val32.IsZero())
+ {
+ emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
+ }
+ else
+ {
+ CORINFO_FIELD_HANDLE hnd = emit->emitSimd32Const(val32);
+ emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
+ }
+ break;
+ }
+ case TYP_SIMD64:
+ {
+ simd64_t val64 = *(simd64_t*)val;
+ if (val64.IsAllBitsSet() && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F))
+ {
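+ // vpternlogd with the truth table 0xFF (every input maps to 1) sets all
+ // bits of the 512-bit register without a memory load.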
+ emit->emitIns_SIMD_R_R_R_I(INS_vpternlogd, attr, targetReg, targetReg, targetReg,
+ static_cast<int8_t>(0xFF));
+ }
+ else if (val64.IsZero())
+ {
+ emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
+ }
+ else
+ {
+ CORINFO_FIELD_HANDLE hnd = emit->emitSimd64Const(val64);
+ emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
+ }
+ break;
+ }
+ default:
+ {
+ unreached();
+ }
+ }
+}
+#endif // FEATURE_SIMD
+
/***********************************************************************************
*
* Generate code to set a register 'targetReg' of type 'targetType' to the constant
case GT_CNS_VEC:
{
- GenTreeVecCon* vecCon = tree->AsVecCon();
-
- emitter* emit = GetEmitter();
- emitAttr attr = emitTypeSize(targetType);
-
- if (vecCon->IsAllBitsSet())
- {
- switch (attr)
- {
- case EA_8BYTE:
- case EA_16BYTE:
- {
- emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, EA_16BYTE, targetReg, targetReg, targetReg);
- return;
- }
-
#if defined(FEATURE_SIMD)
- case EA_32BYTE:
- {
- if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
- {
- emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, attr, targetReg, targetReg, targetReg);
- return;
- }
- break;
- }
-
- case EA_64BYTE:
- {
- assert(compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F));
- emit->emitIns_SIMD_R_R_R_I(INS_vpternlogd, attr, targetReg, targetReg, targetReg,
- static_cast<int8_t>(0xFF));
- return;
- }
-#endif // FEATURE_SIMD
-
- default:
- {
- unreached();
- }
- }
- }
-
- if (vecCon->IsZero())
- {
- switch (attr)
- {
- case EA_8BYTE:
- case EA_16BYTE:
- {
- emit->emitIns_SIMD_R_R_R(INS_xorps, EA_16BYTE, targetReg, targetReg, targetReg);
- return;
- }
-
- case EA_32BYTE:
- {
- if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX))
- {
- emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
- return;
- }
- break;
- }
-
- case EA_64BYTE:
- {
- if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX512F))
- {
- emit->emitIns_SIMD_R_R_R(INS_xorps, attr, targetReg, targetReg, targetReg);
- return;
- }
- break;
- }
-
- default:
- {
- unreached();
- }
- }
- }
-
- switch (tree->TypeGet())
- {
-#if defined(FEATURE_SIMD)
- case TYP_SIMD8:
- {
- simd8_t constValue;
- memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd8_t));
-
- CORINFO_FIELD_HANDLE hnd = emit->emitSimd8Const(constValue);
- emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
- break;
- }
-
- case TYP_SIMD12:
- {
- simd16_t constValue = {};
- memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd12_t));
-
- CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(constValue);
- emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
- break;
- }
-
- case TYP_SIMD16:
- {
- simd16_t constValue;
- memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd16_t));
-
- CORINFO_FIELD_HANDLE hnd = emit->emitSimd16Const(constValue);
- emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
- break;
- }
-
- case TYP_SIMD32:
- {
- simd32_t constValue;
- memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd32_t));
-
- CORINFO_FIELD_HANDLE hnd = emit->emitSimd32Const(constValue);
- emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
- break;
- }
-
- case TYP_SIMD64:
- {
- simd64_t constValue;
- memcpy(&constValue, &vecCon->gtSimdVal, sizeof(simd64_t));
-
- CORINFO_FIELD_HANDLE hnd = emit->emitSimd64Const(constValue);
- emit->emitIns_R_C(ins_Load(targetType), attr, targetReg, hnd, 0);
- break;
- }
-#endif // FEATURE_SIMD
-
- default:
- {
- unreached();
- }
- }
-
+ GenTreeVecCon* vecCon = tree->AsVecCon();
+ genSetRegToConst(vecCon->GetRegNum(), targetType, &vecCon->gtSimdVal);
+#else
+ unreached();
+#endif // FEATURE_SIMD
break;
}
// INITBLK zeroes a struct that contains GC pointers and can be observed by
// other threads (i.e. when dstAddr is not an address of a local).
// For example, this can happen when initializing a struct field of an object.
- const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences();
-
-#ifdef TARGET_AMD64
- // On Amd64 the JIT will not use SIMD stores for such structs and instead
- // will always allocate a GP register for src node.
- const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
-#else
- // On X86 the JIT will use movq for structs that are larger than 16 bytes
- // since it is more beneficial than using two mov-s from a GP register.
- const bool willUseSimdMov = (size >= 16);
-#endif
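+ // SIMD fills also require the baseline SIMD ISA to be available.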
+ const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences() && compiler->IsBaselineSimdIsaSupported();
+ const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
if (!src->isContained())
{
}
else
{
- // If src is contained then it must be 0.
- assert(src->IsIntegralConst(0));
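+ // A contained src is an integral constant; its low byte is the fill value.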
assert(willUseSimdMov);
-#ifdef TARGET_AMD64
assert(size >= XMM_REGSIZE_BYTES);
-#else
- assert(size % 8 == 0);
-#endif
}
emitter* emit = GetEmitter();
assert(size <= INT32_MAX);
assert(dstOffset < (INT32_MAX - static_cast<int>(size)));
+#ifdef FEATURE_SIMD
if (willUseSimdMov)
{
regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT);
-
- unsigned regSize = compiler->roundDownSIMDSize(size);
+ unsigned regSize = compiler->roundDownSIMDSize(size);
if (size < ZMM_RECOMMENDED_THRESHOLD)
{
// Involve ZMM only for large data due to possible downclocking.
regSize = min(regSize, YMM_REGSIZE_BYTES);
}
-
- bool zeroing = false;
- if (src->gtSkipReloadOrCopy()->IsIntegralConst(0))
- {
- // If the source is constant 0 then always use xorps, it's faster
- // than copying the constant from a GPR to a XMM register.
- emit->emitIns_SIMD_R_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg, srcXmmReg);
- zeroing = true;
- }
- else
- {
- // TODO-AVX512-ARCH: Enable AVX-512 for non-zeroing initblk.
- regSize = min(regSize, YMM_REGSIZE_BYTES);
-
- if (compiler->compOpportunisticallyDependsOn(InstructionSet_Vector512))
- {
- emit->emitIns_R_R(INS_vpbroadcastd_gpr, EA_ATTR(regSize), srcXmmReg, srcIntReg);
- }
- else if (compiler->compOpportunisticallyDependsOn(InstructionSet_AVX2))
- {
- emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false);
- emit->emitIns_R_R(INS_vpbroadcastd, EA_ATTR(regSize), srcXmmReg, srcXmmReg);
- }
- else
- {
- emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false);
-
- emit->emitIns_SIMD_R_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg, srcXmmReg);
-
-#ifdef TARGET_X86
- // For x86, we need one more to convert it from 8 bytes to 16 bytes.
- emit->emitIns_SIMD_R_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg, srcXmmReg);
-#endif
-
- if (regSize == YMM_REGSIZE_BYTES)
- {
- // Extend the bytes in the lower lanes to the upper lanes
- emit->emitIns_R_R_R_I(INS_vinsertf128, EA_32BYTE, srcXmmReg, srcXmmReg, srcXmmReg, 1);
- }
- }
- }
+ var_types loadType = compiler->getSIMDTypeForSize(regSize);
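+ // Replicate the fill byte across a vector constant, then materialize it
+ // the same way as a GT_CNS_VEC value.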
+ simd_t vecCon;
+ memset(&vecCon, (uint8_t)src->AsIntCon()->IconValue(), sizeof(simd_t));
+ genSetRegToConst(srcXmmReg, loadType, &vecCon);
instruction simdMov = simdUnalignedMovIns();
unsigned bytesWritten = 0;
while (bytesWritten < size)
{
-#ifdef TARGET_X86
- if (!canUse16BytesSimdMov || (bytesWritten + regSize > size))
- {
- simdMov = INS_movq;
- regSize = 8;
- }
-#endif
if (bytesWritten + regSize > size)
{
+ // We have a remainder that is smaller than regSize.
break;
}
emitSimdMovs();
dstOffset += regSize;
bytesWritten += regSize;
-
- if (!zeroing)
- {
- assert(regSize <= YMM_REGSIZE_BYTES);
- }
-
- if (!zeroing && regSize == YMM_REGSIZE_BYTES && size - bytesWritten < YMM_REGSIZE_BYTES)
- {
- regSize = XMM_REGSIZE_BYTES;
- }
}
size -= bytesWritten;
- // Handle the remainder by overlapping with previously processed data (only for zeroing)
- if (zeroing && (size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES))
+ // Handle the remainder by overlapping with previously processed data
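+ // Overlapping is safe for any fill value: the overlapped bytes are simply
+ // rewritten with the same pattern.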
+ if ((size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES))
{
- if (isPow2(size) && (size <= REGSIZE_BYTES))
- {
- // For sizes like 1,2,4 and 8 we delegate handling to normal stores
- // because that will be a single instruction that is smaller than SIMD mov
- }
- else
- {
- // Get optimal register size to cover the whole remainder (with overlapping)
- regSize = compiler->roundUpSIMDSize(size);
+ // Get optimal register size to cover the whole remainder (with overlapping)
+ regSize = compiler->roundUpSIMDSize(size);
- // Rewind dstOffset so we can fit a vector for the while remainder
- dstOffset -= (regSize - size);
- emitSimdMovs();
- size = 0;
- }
+ // Rewind dstOffset so we can fit a vector for the whole remainder
+ dstOffset -= (regSize - size);
+ emitSimdMovs();
+ size = 0;
}
}
+#endif // FEATURE_SIMD
assert((srcIntReg != REG_NA) || (size == 0));