From cc9fdadb9aa9bec7eb45c9cf62563259b00bc55e Mon Sep 17 00:00:00 2001 From: Kunal Pathak Date: Mon, 14 Jun 2021 13:29:15 -0700 Subject: [PATCH] Do not mark op2 as delayRegFree if op1==op2 (#53964) * Do not mark op2 as delayRegFree if op1==op2 * Revert NodesAreEquivalentLeaves change * Pass rmwNode to `BuildDelayFreeUses()` which does the right thing * Make similar change in arm64 * remove TODO comment * review feedback --- src/coreclr/jit/lower.cpp | 17 +++++++----- src/coreclr/jit/lsraarm64.cpp | 60 +++++++++---------------------------------- src/coreclr/jit/lsrabuild.cpp | 4 +++ src/coreclr/jit/lsraxarch.cpp | 11 ++++---- 4 files changed, 32 insertions(+), 60 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 254d557..389c29f 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -6210,17 +6210,22 @@ bool Lowering::IndirsAreEquivalent(GenTree* candidate, GenTree* storeInd) } } -/** Test whether the two given nodes are the same leaves. - * Right now, only constant integers and local variables are supported - */ +//------------------------------------------------------------------------ +// NodesAreEquivalentLeaves: Check whether the two given nodes are the same leaves. +// +// Arguments: +// tree1 and tree2 are nodes to be checked. +// Return Value: +// Returns true if they are same leaves, false otherwise. 
+// +// static bool Lowering::NodesAreEquivalentLeaves(GenTree* tree1, GenTree* tree2) { - if (tree1 == nullptr && tree2 == nullptr) + if (tree1 == tree2) { return true; } - // both null, they are equivalent, otherwise if either is null not equivalent if (tree1 == nullptr || tree2 == nullptr) { return false; @@ -6247,7 +6252,7 @@ bool Lowering::NodesAreEquivalentLeaves(GenTree* tree1, GenTree* tree2) switch (tree1->OperGet()) { case GT_CNS_INT: - return tree1->AsIntCon()->gtIconVal == tree2->AsIntCon()->gtIconVal && + return tree1->AsIntCon()->IconValue() == tree2->AsIntCon()->IconValue() && tree1->IsIconHandle() == tree2->IsIconHandle(); case GT_LCL_VAR: case GT_LCL_VAR_ADDR: diff --git a/src/coreclr/jit/lsraarm64.cpp b/src/coreclr/jit/lsraarm64.cpp index 223dc90..a7a14fd 100644 --- a/src/coreclr/jit/lsraarm64.cpp +++ b/src/coreclr/jit/lsraarm64.cpp @@ -1112,54 +1112,11 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) // RMW intrinsic operands doesn't have to be delayFree when they can be assigned the same register as op1Reg // (i.e. a register that corresponds to read-modify-write operand) and one of them is the last use. 
- bool op2DelayFree = isRMW; - bool op3DelayFree = isRMW; - bool op4DelayFree = isRMW; - assert(intrin.op1 != nullptr); - if (isRMW && intrin.op1->OperIs(GT_LCL_VAR)) - { - unsigned int varNum1 = intrin.op1->AsLclVar()->GetLclNum(); - bool op1LastUse = false; - - unsigned int varNum2 = BAD_VAR_NUM; - unsigned int varNum3 = BAD_VAR_NUM; - unsigned int varNum4 = BAD_VAR_NUM; - - if (intrin.op2->OperIs(GT_LCL_VAR)) - { - varNum2 = intrin.op2->AsLclVar()->GetLclNum(); - op1LastUse |= ((varNum1 == varNum2) && intrin.op2->HasLastUse()); - } - - if (intrin.op3 != nullptr) - { - if (intrin.op3->OperIs(GT_LCL_VAR)) - { - varNum3 = intrin.op3->AsLclVar()->GetLclNum(); - op1LastUse |= ((varNum1 == varNum3) && intrin.op3->HasLastUse()); - } - - if ((intrin.op4 != nullptr) && intrin.op4->OperIs(GT_LCL_VAR)) - { - varNum4 = intrin.op4->AsLclVar()->GetLclNum(); - op1LastUse |= ((varNum1 == varNum4) && intrin.op4->HasLastUse()); - } - } - - if (op1LastUse) - { - op2DelayFree = (varNum1 != varNum2); - op3DelayFree = (varNum1 != varNum3); - op4DelayFree = (varNum1 != varNum4); - } - } - + bool forceOp2DelayFree = false; if ((intrin.id == NI_Vector64_GetElement) || (intrin.id == NI_Vector128_GetElement)) { - assert(!op2DelayFree); - if (!intrin.op2->IsCnsIntOrI() && (!intrin.op1->isContained() || intrin.op1->OperIsLocal())) { // If the index is not a constant and the object is not contained or is a local @@ -1168,7 +1125,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) // TODO-Cleanup: An internal register will never clobber a source; this code actually // ensures that the index (op2) doesn't interfere with the target. buildInternalIntRegisterDefForNode(intrinsicTree); - op2DelayFree = true; + forceOp2DelayFree = true; } if (!intrin.op2->IsCnsIntOrI() && !intrin.op1->isContained()) @@ -1179,15 +1136,22 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) } } - srcCount += op2DelayFree ? 
BuildDelayFreeUses(intrin.op2) : BuildOperandUses(intrin.op2); + if (forceOp2DelayFree) + { + srcCount += BuildDelayFreeUses(intrin.op2); + } + else + { + srcCount += isRMW ? BuildDelayFreeUses(intrin.op2, intrin.op1) : BuildOperandUses(intrin.op2); + } if (intrin.op3 != nullptr) { - srcCount += op3DelayFree ? BuildDelayFreeUses(intrin.op3) : BuildOperandUses(intrin.op3); + srcCount += isRMW ? BuildDelayFreeUses(intrin.op3, intrin.op1) : BuildOperandUses(intrin.op3); if (intrin.op4 != nullptr) { - srcCount += op4DelayFree ? BuildDelayFreeUses(intrin.op4) : BuildOperandUses(intrin.op4); + srcCount += isRMW ? BuildDelayFreeUses(intrin.op4, intrin.op1) : BuildOperandUses(intrin.op4); } } } diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 536cfa4..dcea881 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -3097,6 +3097,10 @@ int LinearScan::BuildDelayFreeUses(GenTree* node, GenTree* rmwNode, regMaskTP ca } if (use != nullptr) { + // If node != rmwNode, then definitely node should be marked as "delayFree". + // However, if node == rmwNode, then we can mark node as "delayFree" only + // if none of the node/rmwNode are the last uses. If either of them is the last use, + // we can safely reuse the rmwNode as destination. 
if ((use->getInterval() != rmwInterval) || (!rmwIsLastUse && !use->lastUse)) { setDelayFree(use); diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 1cd8112..811010f 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -2420,9 +2420,9 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) // Any pair of the index, mask, or destination registers should be different srcCount += BuildOperandUses(op1); - srcCount += BuildDelayFreeUses(op2); - srcCount += BuildDelayFreeUses(op3); - srcCount += BuildDelayFreeUses(op4); + srcCount += BuildDelayFreeUses(op2, op1); + srcCount += BuildDelayFreeUses(op3, op1); + srcCount += BuildDelayFreeUses(op4, op1); // op5 should always be contained assert(argList->Rest()->Current()->isContained()); @@ -2481,8 +2481,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) // When op2 is not contained or if we are producing a scalar value // we need to mark it as delay free because the operand and target // exist in the same register set. - - srcCount += BuildDelayFreeUses(op2); + srcCount += BuildDelayFreeUses(op2, op1); } else { @@ -2500,7 +2499,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree) if (op3 != nullptr) { - srcCount += isRMW ? BuildDelayFreeUses(op3) : BuildOperandUses(op3); + srcCount += isRMW ? BuildDelayFreeUses(op3, op1) : BuildOperandUses(op3); } } } -- 2.7.4