From b066a16f3af015be3f457430aeddfd7f24514c0a Mon Sep 17 00:00:00 2001
From: Tanner Gooding <tagoo@outlook.com>
Date: Mon, 22 May 2023 18:53:29 -0700
Subject: [PATCH] Update GetElement and ToScalar to lower IND, LCL_FLD, and
 LCL_VAR specially (#86400)

* Update GetElement and ToScalar to lower IND, LCL_FLD, and LCL_VAR specially

* Ensure GetElement still correctly handles too large indices

* Only optimize to LCL_FLD for lvDoNotEnregister

* Account for an extreme edge case in offset overflow

* Use simdSize rather than maxSIMDStructBytes
---
 src/coreclr/jit/lower.h        |   4 +-
 src/coreclr/jit/lowerxarch.cpp | 323 ++++++++++++++++++++++++++++++++---------
 2 files changed, 256 insertions(+), 71 deletions(-)

diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h
index 2644052..63f5920 100644
--- a/src/coreclr/jit/lower.h
+++ b/src/coreclr/jit/lower.h
@@ -356,8 +356,8 @@ private:
     GenTree* LowerHWIntrinsicDot(GenTreeHWIntrinsic* node);
 #if defined(TARGET_XARCH)
     void LowerFusedMultiplyAdd(GenTreeHWIntrinsic* node);
-    void LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node);
-    void LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node);
+    GenTree* LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node);
+    GenTree* LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node);
     GenTree* LowerHWIntrinsicCndSel(GenTreeHWIntrinsic* node);
     GenTree* LowerHWIntrinsicWithElement(GenTreeHWIntrinsic* node);
     GenTree* TryLowerAndOpToResetLowestSetBit(GenTreeOp* andNode);
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 16120f1..4b516a5 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -1056,23 +1056,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
         case NI_Vector256_GetElement:
         case NI_Vector512_GetElement:
         {
-            LowerHWIntrinsicGetElement(node);
-
-            if ((node->GetHWIntrinsicId() == NI_Vector128_GetElement) ||
-                (node->GetHWIntrinsicId() == NI_Vector256_GetElement) ||
-                (node->GetHWIntrinsicId() == NI_Vector512_GetElement))
-            {
-                // Most NI_Vector*_GetElement intrinsics are lowered to
-                // alternative nodes, such as the Extract intrinsics,
-                // which are themselves lowered.
-                //
-                // However, certain types may not have a direct equivalent
-                // in which case we specially handle them directly as GetElement
-                // and want to do the relevant containment checks.
-                ContainCheckHWIntrinsic(node);
-            }
-
-            return node->gtNext;
+            return LowerHWIntrinsicGetElement(node);
         }

         case NI_Vector256_GetUpper:
@@ -1210,8 +1194,7 @@ GenTree* Lowering::LowerHWIntrinsic(GenTreeHWIntrinsic* node)
         case NI_Vector256_ToScalar:
         case NI_Vector512_ToScalar:
         {
-            LowerHWIntrinsicToScalar(node);
-            break;
+            return LowerHWIntrinsicToScalar(node);
         }

         case NI_SSE41_Extract:
@@ -3260,7 +3243,7 @@ GenTree* Lowering::LowerHWIntrinsicCreate(GenTreeHWIntrinsic* node)
 // Arguments:
 //    node - The hardware intrinsic node.
 //
-void Lowering::LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node)
+GenTree* Lowering::LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node)
 {
     NamedIntrinsic intrinsicId = node->GetHWIntrinsicId();
     var_types      simdType    = node->gtType;
@@ -3297,50 +3280,197 @@ void Lowering::LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node)
         }

         node->ResetHWIntrinsicId(intrinsicId, op1);
-        LowerNode(node);
-
-        return;
+        return LowerNode(node);
     }

+    uint32_t count    = simdSize / genTypeSize(simdBaseType);
+    uint32_t elemSize = genTypeSize(simdBaseType);
+
     if (op1->OperIs(GT_IND))
     {
-        // If the vector is already in memory, we force its
-        // addr to be evaluated into a reg.  This would allow
-        // us to generate [regBase] or [regBase + offset] or
-        // [regBase + sizeOf(simdBaseType) * regIndex] to access
-        // the required vector element directly from memory.
-        //
-        // TODO-CQ-XARCH: If addr of GT_IND is GT_LEA, we
-        // might be able update GT_LEA to fold the regIndex
-        // or offset in some cases. Instead with this
-        // approach we always evaluate GT_LEA into a reg.
-        // Ideally, we should be able to lower GetItem intrinsic
-        // into GT_IND(newAddr) where newAddr combines
-        // the addr of the vector with the given index.
-        op1->gtFlags |= GTF_IND_REQ_ADDR_IN_REG;
+        // We want to optimize GetElement down to an Indir where possible as
+        // this unlocks additional containment opportunities for various nodes
+
+        var_types newType;
+        GenTree*  newBase;
+        GenTree*  newIndex;
+        uint32_t  newScale;
+        int32_t   newOffset;
+
+        GenTreeIndir* indir = op1->AsIndir();
+        GenTree*      addr  = indir->Addr();
+
+        newType = simdBaseType;
+
+        if (addr->OperIsAddrMode())
+        {
+            // We have an existing addressing mode, so we want to try and
+            // combine with that where possible to keep things as a 1x LEA
+
+            GenTreeAddrMode* addrMode = addr->AsAddrMode();
+
+            newBase   = addrMode->Base();
+            newIndex  = addrMode->Index();
+            newScale  = addrMode->GetScale();
+            newOffset = addrMode->Offset();
+
+            if (op2->OperIsConst() && (newOffset < (INT32_MAX - static_cast<int32_t>(simdSize))))
+            {
+                // op2 is a constant, so add it to the existing offset
+
+                BlockRange().Remove(addrMode);
+                BlockRange().Remove(op2);
+
+                int32_t addOffset = (static_cast<int32_t>(op2->AsIntCon()->IconValue()) % count);
+                addOffset *= static_cast<int32_t>(elemSize);
+
+                newOffset += addOffset;
+            }
+            else if (newIndex == nullptr)
+            {
+                // op2 is not a constant and the addressing mode doesn't
+                // have its own existing index, so use our index and scale
+
+                BlockRange().Remove(addrMode);
+
+                newIndex = op2;
+                newScale = elemSize;
+            }
+            else if (addrMode->GetScale() == elemSize)
+            {
+                // op2 is not a constant but the addressing mode has its
+                // own already with a matching scale, so add ours to theirs
+
+                BlockRange().Remove(addrMode);
+
+                newIndex = comp->gtNewOperNode(GT_ADD, TYP_I_IMPL, newIndex, op2);
+                BlockRange().InsertBefore(node, newIndex);
+
+                LowerNode(newIndex);
+            }
+            else
+            {
+                // op2 is not a constant but the addressing mode is already
+                // complex, so build a new addressing mode with the prev as our base
+
+                newBase   = addrMode;
+                newIndex  = op2;
+                newScale  = elemSize;
+                newOffset = 0;
+            }
+        }
+        else if (op2->OperIsConst())
+        {
+            // We don't have an addressing mode, so build one with the old addr
+            // as the base and the offset using the op2 constant and scale
+
+            BlockRange().Remove(op2);
+
+            newBase   = addr;
+            newIndex  = nullptr;
+            newScale  = 0;
+            newOffset = (static_cast<int32_t>(op2->AsIntCon()->IconValue()) % count);
+            newOffset *= static_cast<int32_t>(elemSize);
+        }
+        else
+        {
+            // We don't have an addressing mode, so build one with the old addr
+            // as the base and the index set to op2
+
+            newBase   = addr;
+            newIndex  = op2;
+            newScale  = elemSize;
+            newOffset = 0;
+        }
+
+        if (newBase != nullptr)
+        {
+            newBase->ClearContained();
+        }
+
+        if (newIndex != nullptr)
+        {
+            newIndex->ClearContained();
+        }
+
+        GenTreeAddrMode* newAddr =
+            new (comp, GT_LEA) GenTreeAddrMode(addr->TypeGet(), newBase, newIndex, newScale, newOffset);
+        BlockRange().InsertBefore(node, newAddr);
+
+        GenTreeIndir* newIndir = comp->gtNewIndir(newType, newAddr, (indir->gtFlags & GTF_IND_FLAGS));
+        BlockRange().InsertBefore(node, newIndir);
+
+        LIR::Use use;
+        if (BlockRange().TryGetUse(node, &use))
+        {
+            use.ReplaceWith(newIndir);
+        }
+
+        BlockRange().Remove(op1);
+        BlockRange().Remove(node);
+
+        assert(newAddr->gtNext == newIndir);
+        return LowerNode(newAddr);
     }

     if (!op2->OperIsConst())
     {
         // We will specially handle GetElement in codegen when op2 isn't a constant
-        return;
+        ContainCheckHWIntrinsic(node);
+        return node->gtNext;
     }

     // We should have a bounds check inserted for any index outside the allowed range
     // but we need to generate some code anyways, and so we'll simply mask here for simplicity.

-    ssize_t count     = simdSize / genTypeSize(simdBaseType);
-    ssize_t imm8      = static_cast<ssize_t>(op2->AsIntCon()->IconValue()) % count;
-    ssize_t simd16Cnt = 16 / genTypeSize(simdBaseType);
-    ssize_t simd16Idx = imm8 / simd16Cnt;
+    uint32_t imm8      = static_cast<uint32_t>(op2->AsIntCon()->IconValue()) % count;
+    uint32_t simd16Cnt = 16 / elemSize;
+    uint32_t simd16Idx = imm8 / simd16Cnt;

-    assert(0 <= imm8 && imm8 < count);
+    assert((0 <= imm8) && (imm8 < count));

-    if (IsContainableMemoryOp(op1) && IsSafeToContainMem(node, op1))
+    if (IsContainableMemoryOp(op1))
     {
-        // We will specially handle GetElement in codegen when op1 is already in memory
-        op2->AsIntCon()->SetIconValue(imm8);
-        return;
+        // We will specially handle GetElement when op1 is already in memory
+
+        if (op1->OperIs(GT_LCL_VAR, GT_LCL_FLD))
+        {
+            // We want to optimize GetElement down to a LclFld where possible as
+            // this unlocks additional containment opportunities for various nodes
+
+            GenTreeLclVarCommon* lclVar  = op1->AsLclVarCommon();
+            uint32_t             lclOffs = lclVar->GetLclOffs() + (imm8 * elemSize);
+            LclVarDsc*           lclDsc  = comp->lvaGetDesc(lclVar);
+
+            if (lclDsc->lvDoNotEnregister && (lclOffs <= 0xFFFF) && ((lclOffs + elemSize) <= lclDsc->lvExactSize()))
+            {
+                GenTree* lclFld =
+                    comp->gtNewLclFldNode(lclVar->GetLclNum(), simdBaseType, static_cast<uint16_t>(lclOffs));
+                BlockRange().InsertBefore(node, lclFld);
+
+                LIR::Use use;
+                if (BlockRange().TryGetUse(node, &use))
+                {
+                    use.ReplaceWith(lclFld);
+                }
+
+                BlockRange().Remove(op1);
+                BlockRange().Remove(op2);
+                BlockRange().Remove(node);
+
+                return LowerNode(lclFld);
+            }
+        }
+
+        if (IsSafeToContainMem(node, op1))
+        {
+            // Handle other cases in codegen
+
+            op2->AsIntCon()->SetIconValue(imm8);
+            ContainCheckHWIntrinsic(node);
+
+            return node->gtNext;
+        }
     }

     switch (simdBaseType)
@@ -3489,8 +3619,7 @@ void Lowering::LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node)
             node->SetSimdSize(16);
             node->ResetHWIntrinsicId(NI_Vector128_ToScalar, op1);

-            LowerNode(node);
-            return;
+            return LowerNode(node);
         }
         else
         {
@@ -3538,9 +3667,15 @@ void Lowering::LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node)
         node->ResetHWIntrinsicId(resIntrinsic, op1, op2);
     }

+    GenTree* next = node->gtNext;
+
     if (node->GetHWIntrinsicId() != intrinsicId)
     {
-        LowerNode(node);
+        next = LowerNode(node);
+    }
+    else
+    {
+        ContainCheckHWIntrinsic(node);
     }

     if ((simdBaseType == TYP_BYTE) || (simdBaseType == TYP_SHORT))
@@ -3560,8 +3695,10 @@ void Lowering::LowerHWIntrinsicGetElement(GenTreeHWIntrinsic* node)
         {
             use.ReplaceWith(cast);
         }
-        LowerNode(cast);
+        next = LowerNode(cast);
     }
+
+    return next;
 }

 //----------------------------------------------------------------------------------------------
@@ -4695,7 +4832,7 @@ GenTree* Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
 // Arguments:
 //    node - The hardware intrinsic node.
 //
-void Lowering::LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node)
+GenTree* Lowering::LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node)
 {
     NamedIntrinsic intrinsicId     = node->GetHWIntrinsicId();
     CorInfoType    simdBaseJitType = node->GetSimdBaseJitType();
@@ -4712,10 +4849,67 @@ void Lowering::LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node)

     GenTree* op1 = node->Op(1);

-    if (IsContainableMemoryOp(op1) && IsSafeToContainMem(node, op1))
+    if (IsContainableMemoryOp(op1))
     {
-        // We will specially handle ToScalar in codegen when op1 is already in memory
-        return;
+        // We will specially handle ToScalar when op1 is already in memory
+
+        if (op1->OperIs(GT_IND))
+        {
+            // We want to optimize ToScalar down to an Indir where possible as
+            // this unlocks additional containment opportunities for various nodes
+
+            GenTreeIndir* indir = op1->AsIndir();
+
+            GenTreeIndir* newIndir = comp->gtNewIndir(simdBaseType, indir->Addr(), (indir->gtFlags & GTF_IND_FLAGS));
+            BlockRange().InsertBefore(node, newIndir);
+
+            LIR::Use use;
+            if (BlockRange().TryGetUse(node, &use))
+            {
+                use.ReplaceWith(newIndir);
+            }
+
+            BlockRange().Remove(op1);
+            BlockRange().Remove(node);
+
+            return LowerNode(newIndir);
+        }
+
+        if (op1->OperIs(GT_LCL_VAR, GT_LCL_FLD))
+        {
+            uint32_t elemSize = genTypeSize(simdBaseType);
+
+            // We want to optimize ToScalar down to a LclFld where possible as
+            // this unlocks additional containment opportunities for various nodes
+
+            GenTreeLclVarCommon* lclVar  = op1->AsLclVarCommon();
+            uint32_t             lclOffs = lclVar->GetLclOffs() + (0 * elemSize);
+            LclVarDsc*           lclDsc  = comp->lvaGetDesc(lclVar);
+
+            if (lclDsc->lvDoNotEnregister && (lclOffs <= 0xFFFF) && ((lclOffs + elemSize) <= lclDsc->lvExactSize()))
+            {
+                GenTree* lclFld = comp->gtNewLclFldNode(lclVar->GetLclNum(), simdBaseType, lclVar->GetLclOffs());
+                BlockRange().InsertBefore(node, lclFld);
+
+                LIR::Use use;
+                if (BlockRange().TryGetUse(node, &use))
+                {
+                    use.ReplaceWith(lclFld);
+                }
+
+                BlockRange().Remove(op1);
+                BlockRange().Remove(node);
+
+                return LowerNode(lclFld);
+            }
+        }
+
+        if (IsSafeToContainMem(node, op1))
+        {
+            // Handle other cases in codegen
+            ContainCheckHWIntrinsic(node);
+            return node->gtNext;
+        }
     }

     switch (simdBaseType)
@@ -4758,7 +4952,7 @@ void Lowering::LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node)
         case TYP_DOUBLE:
         {
             ContainCheckHWIntrinsic(node);
-            return;
+            return node->gtNext;
         }

         default:
@@ -4767,7 +4961,7 @@ void Lowering::LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node)
         }
     }

-    LowerNode(node);
+    GenTree* next = LowerNode(node);

     if (genTypeSize(simdBaseType) < 4)
     {
@@ -4786,8 +4980,10 @@ void Lowering::LowerHWIntrinsicToScalar(GenTreeHWIntrinsic* node)
         {
             use.ReplaceWith(cast);
         }
-        LowerNode(cast);
+        next = LowerNode(cast);
     }
+
+    return next;
 }

 //----------------------------------------------------------------------------------------------
@@ -8084,12 +8280,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                 case NI_Vector256_GetElement:
                 case NI_Vector512_GetElement:
                 {
-                    if (op1->OperIs(GT_IND))
-                    {
-                        assert((op1->gtFlags & GTF_IND_REQ_ADDR_IN_REG) != 0);
-                        op1->AsIndir()->Addr()->ClearContained();
-                    }
-
                     if (op2->OperIsConst())
                     {
                         MakeSrcContained(node, op2);
@@ -8098,11 +8288,6 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
                     if (IsContainableMemoryOp(op1) && IsSafeToContainMem(node, op1))
                     {
                         MakeSrcContained(node, op1);
-
-                        if (op1->OperIs(GT_IND))
-                        {
-                            op1->AsIndir()->Addr()->ClearContained();
-                        }
                     }
                     break;
                 }
-- 
2.7.4
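
Illustrative sketch (not part of the patch, hypothetical helper names): the diff above folds GetElement/ToScalar on a memory operand into a single scalar load from base + (index % lanes) * sizeof(element) + offset, rather than loading the whole vector and then extracting a lane. A minimal standalone C++ sketch of that equivalence, assuming a 4-lane float vector and the same index masking the lowering performs:

// Sketch only -- mirrors the effect of the new IND/LEA lowering, not JIT code.
#include <cassert>
#include <cstddef>
#include <cstring>

// Before: load the full 16-byte vector, then pick the lane.
static float GetElementViaVectorLoad(const float* base, std::size_t index)
{
    float vec[4];
    std::memcpy(vec, base, sizeof(vec)); // full vector load
    return vec[index % 4];               // lane index is masked, like the lowering does
}

// After: fold the lane index into the address and load just the scalar.
static float GetElementViaScalarLoad(const float* base, std::size_t index)
{
    const unsigned char* addr = reinterpret_cast<const unsigned char*>(base) + (index % 4) * sizeof(float);
    float result;
    std::memcpy(&result, addr, sizeof(result)); // single scalar load
    return result;
}

int main()
{
    float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    for (std::size_t i = 0; i < 8; i++)
    {
        assert(GetElementViaVectorLoad(data, i) == GetElementViaScalarLoad(data, i));
    }
    return 0;
}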