From 55b1acf78e86a255f46abf1202a434f566aa26cc Mon Sep 17 00:00:00 2001
From: Steve MacLean
Date: Thu, 24 May 2018 20:08:44 -0400
Subject: [PATCH] [Arm64] JIT generate LSE Atomics

When ARMv8.1 LSE Atomics are available, use them to implement Interlocked
operations.
---
 src/jit/codegenarm64.cpp | 308 +++++++++++++++++++++++++++--------------------
 src/jit/lowerarmarch.cpp |   3 +-
 src/jit/lsraarm64.cpp    |  60 +++++----
 3 files changed, 217 insertions(+), 154 deletions(-)

diff --git a/src/jit/codegenarm64.cpp b/src/jit/codegenarm64.cpp
index afa3039..8bb04c2 100644
--- a/src/jit/codegenarm64.cpp
+++ b/src/jit/codegenarm64.cpp
@@ -2664,92 +2664,118 @@ void CodeGen::genLockedInstructions(GenTreeOp* treeNode)
     regNumber dataReg = data->gtRegNum;
     regNumber addrReg = addr->gtRegNum;
 
-    regNumber exResultReg = treeNode->ExtractTempReg(RBM_ALLINT);
-    regNumber storeDataReg = (treeNode->OperGet() == GT_XCHG) ? dataReg : treeNode->ExtractTempReg(RBM_ALLINT);
-    regNumber loadReg = (targetReg != REG_NA) ? targetReg : storeDataReg;
+    genConsumeAddress(addr);
+    genConsumeRegs(data);
 
-    // Check allocator assumptions
-    //
-    // The register allocator should have extended the lifetimes of all input and internal registers so that
-    // none interfere with the target.
-    noway_assert(addrReg != targetReg);
+    emitAttr dataSize = emitActualTypeSize(data);
 
-    noway_assert(addrReg != loadReg);
-    noway_assert(dataReg != loadReg);
+    if (compiler->compSupports(InstructionSet_Atomics))
+    {
+        assert(!data->isContainedIntOrIImmed());
 
-    noway_assert(addrReg != storeDataReg);
-    noway_assert((treeNode->OperGet() == GT_XCHG) || (addrReg != dataReg));
+        switch (treeNode->gtOper)
+        {
+            case GT_XCHG:
+                getEmitter()->emitIns_R_R_R(INS_swpal, dataSize, dataReg, targetReg, addrReg);
+                break;
+            case GT_XADD:
+                if ((targetReg == REG_NA) || (targetReg == REG_ZR))
+                {
+                    getEmitter()->emitIns_R_R(INS_staddl, dataSize, dataReg, addrReg);
+                }
+                else
+                {
+                    getEmitter()->emitIns_R_R_R(INS_ldaddal, dataSize, dataReg, targetReg, addrReg);
+                }
+                break;
+            default:
+                assert(!"Unexpected treeNode->gtOper");
+        }
 
-    assert(addr->isUsedFromReg());
-    noway_assert(exResultReg != REG_NA);
-    noway_assert(exResultReg != targetReg);
-    noway_assert((targetReg != REG_NA) || (treeNode->OperGet() != GT_XCHG));
+        instGen_MemoryBarrier(INS_BARRIER_ISH);
+    }
+    else
+    {
+        regNumber exResultReg = treeNode->ExtractTempReg(RBM_ALLINT);
+        regNumber storeDataReg = (treeNode->OperGet() == GT_XCHG) ? dataReg : treeNode->ExtractTempReg(RBM_ALLINT);
+        regNumber loadReg = (targetReg != REG_NA) ? targetReg : storeDataReg;
 
-    // Store exclusive unpredictable cases must be avoided
-    noway_assert(exResultReg != storeDataReg);
-    noway_assert(exResultReg != addrReg);
+        // Check allocator assumptions
+        //
+        // The register allocator should have extended the lifetimes of all input and internal registers so that
+        // none interfere with the target.
+        noway_assert(addrReg != targetReg);
 
-    genConsumeAddress(addr);
-    genConsumeRegs(data);
+        noway_assert(addrReg != loadReg);
+        noway_assert(dataReg != loadReg);
 
-    // NOTE: `genConsumeAddress` marks the consumed register as not a GC pointer, as it assumes that the input registers
-    // die at the first instruction generated by the node. This is not the case for these atomics as the input
-    // registers are multiply-used. As such, we need to mark the addr register as containing a GC pointer until
-    // we are finished generating the code for this node.
+        noway_assert(addrReg != storeDataReg);
+        noway_assert((treeNode->OperGet() == GT_XCHG) || (addrReg != dataReg));
 
-    gcInfo.gcMarkRegPtrVal(addrReg, addr->TypeGet());
+        assert(addr->isUsedFromReg());
+        noway_assert(exResultReg != REG_NA);
+        noway_assert(exResultReg != targetReg);
+        noway_assert((targetReg != REG_NA) || (treeNode->OperGet() != GT_XCHG));
 
-    // TODO-ARM64-CQ Use ARMv8.1 atomics if available
-    // https://github.com/dotnet/coreclr/issues/11881
+        // Store exclusive unpredictable cases must be avoided
+        noway_assert(exResultReg != storeDataReg);
+        noway_assert(exResultReg != addrReg);
 
-    // Emit code like this:
-    // retry:
-    //     ldxr loadReg, [addrReg]
-    //     add storeDataReg, loadReg, dataReg         # Only for GT_XADD
-    //                                                # GT_XCHG storeDataReg === dataReg
-    //     stxr exResult, storeDataReg, [addrReg]
-    //     cbnz exResult, retry
-    //     dmb ish
+        // NOTE: `genConsumeAddress` marks the consumed register as not a GC pointer, as it assumes that the
+        // input registers die at the first instruction generated by the node. This is not the case for these
+        // atomics as the input registers are multiply-used. As such, we need to mark the addr register as
+        // containing a GC pointer until we are finished generating the code for this node.
 
-    BasicBlock* labelRetry = genCreateTempLabel();
-    genDefineTempLabel(labelRetry);
+        gcInfo.gcMarkRegPtrVal(addrReg, addr->TypeGet());
 
-    emitAttr dataSize = emitActualTypeSize(data);
+        // Emit code like this:
+        // retry:
+        //     ldxr loadReg, [addrReg]
+        //     add storeDataReg, loadReg, dataReg         # Only for GT_XADD
+        //                                                # GT_XCHG storeDataReg === dataReg
+        //     stxr exResult, storeDataReg, [addrReg]
+        //     cbnz exResult, retry
+        //     dmb ish
 
-    // The following instruction includes a acquire half barrier
-    getEmitter()->emitIns_R_R(INS_ldaxr, dataSize, loadReg, addrReg);
+        BasicBlock* labelRetry = genCreateTempLabel();
+        genDefineTempLabel(labelRetry);
 
-    switch (treeNode->OperGet())
-    {
-        case GT_XADD:
-            if (data->isContainedIntOrIImmed())
-            {
-                // Even though INS_add is specified here, the encoder will choose either
-                // an INS_add or an INS_sub and encode the immediate as a positive value
-                genInstrWithConstant(INS_add, dataSize, storeDataReg, loadReg, data->AsIntConCommon()->IconValue(),
-                                     REG_NA);
-            }
-            else
-            {
-                getEmitter()->emitIns_R_R_R(INS_add, dataSize, storeDataReg, loadReg, dataReg);
-            }
-            break;
-        case GT_XCHG:
-            assert(!data->isContained());
-            storeDataReg = dataReg;
-            break;
-        default:
-            unreached();
-    }
+        // The following instruction includes an acquire half barrier
+        getEmitter()->emitIns_R_R(INS_ldaxr, dataSize, loadReg, addrReg);
 
-    // The following instruction includes a release half barrier
-    getEmitter()->emitIns_R_R_R(INS_stlxr, dataSize, exResultReg, storeDataReg, addrReg);
+        switch (treeNode->OperGet())
+        {
+            case GT_XADD:
+                if (data->isContainedIntOrIImmed())
+                {
+                    // Even though INS_add is specified here, the encoder will choose either
+                    // an INS_add or an INS_sub and encode the immediate as a positive value
+                    genInstrWithConstant(INS_add, dataSize, storeDataReg, loadReg, data->AsIntConCommon()->IconValue(),
+                                         REG_NA);
+                }
+                else
+                {
+                    getEmitter()->emitIns_R_R_R(INS_add, dataSize, storeDataReg, loadReg, dataReg);
+                }
+                break;
+            case GT_XCHG:
+                assert(!data->isContained());
+                storeDataReg = dataReg;
+                break;
+            default:
+                unreached();
+        }
+
+        // The following instruction includes a release half barrier
+        getEmitter()->emitIns_R_R_R(INS_stlxr, dataSize, exResultReg, storeDataReg, addrReg);
 
-    getEmitter()->emitIns_J_R(INS_cbnz, EA_4BYTE, labelRetry, exResultReg);
+        getEmitter()->emitIns_J_R(INS_cbnz, EA_4BYTE, labelRetry, exResultReg);
 
-    instGen_MemoryBarrier(INS_BARRIER_ISH);
+        instGen_MemoryBarrier(INS_BARRIER_ISH);
 
-    gcInfo.gcMarkRegSetNpt(addr->gtGetRegMask());
+        gcInfo.gcMarkRegSetNpt(addr->gtGetRegMask());
+    }
 
     if (treeNode->gtRegNum != REG_NA)
     {
@@ -2775,88 +2801,110 @@ void CodeGen::genCodeForCmpXchg(GenTreeCmpXchg* treeNode)
     regNumber dataReg = data->gtRegNum;
     regNumber addrReg = addr->gtRegNum;
     regNumber comparandReg = comparand->gtRegNum;
-    regNumber exResultReg = treeNode->ExtractTempReg(RBM_ALLINT);
-
-    // Check allocator assumptions
-    //
-    // The register allocator should have extended the lifetimes of all input and internal registers so that
-    // none interfere with the target.
-    noway_assert(addrReg != targetReg);
-    noway_assert(dataReg != targetReg);
-    noway_assert(comparandReg != targetReg);
-    noway_assert(addrReg != dataReg);
-    noway_assert(targetReg != REG_NA);
-    noway_assert(exResultReg != REG_NA);
-    noway_assert(exResultReg != targetReg);
-
-    assert(addr->isUsedFromReg());
-    assert(data->isUsedFromReg());
-    assert(!comparand->isUsedFromMemory());
-
-    // Store exclusive unpredictable cases must be avoided
-    noway_assert(exResultReg != dataReg);
-    noway_assert(exResultReg != addrReg);
 
     genConsumeAddress(addr);
     genConsumeRegs(data);
     genConsumeRegs(comparand);
 
-    // NOTE: `genConsumeAddress` marks the consumed register as not a GC pointer, as it assumes that the input registers
-    // die at the first instruction generated by the node. This is not the case for these atomics as the input
-    // registers are multiply-used. As such, we need to mark the addr register as containing a GC pointer until
-    // we are finished generating the code for this node.
-
-    gcInfo.gcMarkRegPtrVal(addrReg, addr->TypeGet());
-
-    // TODO-ARM64-CQ Use ARMv8.1 atomics if available
-    // https://github.com/dotnet/coreclr/issues/11881
-
-    // Emit code like this:
-    // retry:
-    //     ldxr targetReg, [addrReg]
-    //     cmp targetReg, comparandReg
-    //     bne compareFail
-    //     stxr exResult, dataReg, [addrReg]
-    //     cbnz exResult, retry
-    // compareFail:
-    //     dmb ish
+    if (compiler->compSupports(InstructionSet_Atomics))
+    {
+        emitAttr dataSize = emitActualTypeSize(data);
 
-    BasicBlock* labelRetry = genCreateTempLabel();
-    BasicBlock* labelCompareFail = genCreateTempLabel();
-    genDefineTempLabel(labelRetry);
+        // casal uses the comparand as the target reg
+        if (targetReg != comparandReg)
+        {
+            getEmitter()->emitIns_R_R(INS_mov, dataSize, targetReg, comparandReg);
 
-    // The following instruction includes a acquire half barrier
-    getEmitter()->emitIns_R_R(INS_ldaxr, emitTypeSize(treeNode), targetReg, addrReg);
+            // Catch the case where we destroyed the data or address before use
+            noway_assert(addrReg != targetReg);
+            noway_assert(dataReg != targetReg);
+        }
+        getEmitter()->emitIns_R_R_R(INS_casal, dataSize, targetReg, dataReg, addrReg);
+
+        instGen_MemoryBarrier(INS_BARRIER_ISH);
+    }
+    else
+    {
+        regNumber exResultReg = treeNode->ExtractTempReg(RBM_ALLINT);
+
+        // Check allocator assumptions
+        //
+        // The register allocator should have extended the lifetimes of all input and internal registers so that
+        // none interfere with the target.
+        noway_assert(addrReg != targetReg);
+        noway_assert(dataReg != targetReg);
+        noway_assert(comparandReg != targetReg);
+        noway_assert(addrReg != dataReg);
+        noway_assert(targetReg != REG_NA);
+        noway_assert(exResultReg != REG_NA);
+        noway_assert(exResultReg != targetReg);
+
+        assert(addr->isUsedFromReg());
+        assert(data->isUsedFromReg());
+        assert(!comparand->isUsedFromMemory());
+
+        // Store exclusive unpredictable cases must be avoided
+        noway_assert(exResultReg != dataReg);
+        noway_assert(exResultReg != addrReg);
+
+        // NOTE: `genConsumeAddress` marks the consumed register as not a GC pointer, as it assumes that the
+        // input registers die at the first instruction generated by the node. This is not the case for these
+        // atomics as the input registers are multiply-used. As such, we need to mark the addr register as
+        // containing a GC pointer until we are finished generating the code for this node.
+
+        gcInfo.gcMarkRegPtrVal(addrReg, addr->TypeGet());
+
+        // TODO-ARM64-CQ Use ARMv8.1 atomics if available
+        // https://github.com/dotnet/coreclr/issues/11881
+
+        // Emit code like this:
+        // retry:
+        //     ldxr targetReg, [addrReg]
+        //     cmp targetReg, comparandReg
+        //     bne compareFail
+        //     stxr exResult, dataReg, [addrReg]
+        //     cbnz exResult, retry
+        // compareFail:
+        //     dmb ish
+
+        BasicBlock* labelRetry = genCreateTempLabel();
+        BasicBlock* labelCompareFail = genCreateTempLabel();
+        genDefineTempLabel(labelRetry);
+
+        // The following instruction includes an acquire half barrier
+        getEmitter()->emitIns_R_R(INS_ldaxr, emitTypeSize(treeNode), targetReg, addrReg);
+
-    if (comparand->isContainedIntOrIImmed())
-    {
-        if (comparand->IsIntegralConst(0))
-        {
-            getEmitter()->emitIns_J_R(INS_cbnz, emitActualTypeSize(treeNode), labelCompareFail, targetReg);
-        }
-        else
-        {
-            getEmitter()->emitIns_R_I(INS_cmp, emitActualTypeSize(treeNode), targetReg,
-                                      comparand->AsIntConCommon()->IconValue());
-            getEmitter()->emitIns_J(INS_bne, labelCompareFail);
-        }
-    }
-    else
-    {
-        getEmitter()->emitIns_R_R(INS_cmp, emitActualTypeSize(treeNode), targetReg, comparandReg);
-        getEmitter()->emitIns_J(INS_bne, labelCompareFail);
-    }
+        if (comparand->isContainedIntOrIImmed())
+        {
+            if (comparand->IsIntegralConst(0))
+            {
+                getEmitter()->emitIns_J_R(INS_cbnz, emitActualTypeSize(treeNode), labelCompareFail, targetReg);
+            }
+            else
+            {
+                getEmitter()->emitIns_R_I(INS_cmp, emitActualTypeSize(treeNode), targetReg,
+                                          comparand->AsIntConCommon()->IconValue());
+                getEmitter()->emitIns_J(INS_bne, labelCompareFail);
+            }
+        }
+        else
+        {
+            getEmitter()->emitIns_R_R(INS_cmp, emitActualTypeSize(treeNode), targetReg, comparandReg);
+            getEmitter()->emitIns_J(INS_bne, labelCompareFail);
+        }
 
-    // The following instruction includes a release half barrier
-    getEmitter()->emitIns_R_R_R(INS_stlxr, emitTypeSize(treeNode), exResultReg, dataReg, addrReg);
+        // The following instruction includes a release half barrier
+        getEmitter()->emitIns_R_R_R(INS_stlxr, emitTypeSize(treeNode), exResultReg, dataReg, addrReg);
 
-    getEmitter()->emitIns_J_R(INS_cbnz, EA_4BYTE, labelRetry, exResultReg);
+        getEmitter()->emitIns_J_R(INS_cbnz, EA_4BYTE, labelRetry, exResultReg);
 
-    genDefineTempLabel(labelCompareFail);
+        genDefineTempLabel(labelCompareFail);
 
-    instGen_MemoryBarrier(INS_BARRIER_ISH);
+        instGen_MemoryBarrier(INS_BARRIER_ISH);
 
-    gcInfo.gcMarkRegSetNpt(addr->gtGetRegMask());
+        gcInfo.gcMarkRegSetNpt(addr->gtGetRegMask());
+    }
 
     genProduceReg(treeNode);
 }
diff --git a/src/jit/lowerarmarch.cpp b/src/jit/lowerarmarch.cpp
index 47998fe..b156b28 100644
--- a/src/jit/lowerarmarch.cpp
+++ b/src/jit/lowerarmarch.cpp
@@ -90,7 +90,8 @@ bool Lowering::IsContainableImmed(GenTree* parentNode, GenTree* childNode)
         case GT_CMPXCHG:
         case GT_LOCKADD:
        case GT_XADD:
-            return emitter::emitIns_valid_imm_for_add(immVal, size);
+            return comp->compSupports(InstructionSet_Atomics) ? false
+                                                              : emitter::emitIns_valid_imm_for_add(immVal, size);
 #elif defined(_TARGET_ARM_)
             return emitter::emitIns_valid_imm_for_add(immVal, flags);
 #endif
diff --git a/src/jit/lsraarm64.cpp b/src/jit/lsraarm64.cpp
index 9cc17ca..b76dcf4 100644
--- a/src/jit/lsraarm64.cpp
+++ b/src/jit/lsraarm64.cpp
@@ -418,16 +418,27 @@ int LinearScan::BuildNode(GenTree* tree)
             srcCount = cmpXchgNode->gtOpComparand->isContained() ? 2 : 3;
             assert(dstCount == 1);
 
-            buildInternalIntRegisterDefForNode(tree);
+            if (!compiler->compSupports(InstructionSet_Atomics))
+            {
+                // ARMv8 exclusives require a single internal register
+                buildInternalIntRegisterDefForNode(tree);
+            }
 
             // For ARMv8 exclusives the lifetime of the addr and data must be extended because
             // it may be used used multiple during retries
+
+            // For the ARMv8.1 atomic cas the lifetime of the addr and data must be extended to prevent
+            // them from being reused as the target register, which is destroyed early
+
             RefPosition* locationUse = BuildUse(tree->gtCmpXchg.gtOpLocation);
             setDelayFree(locationUse);
             RefPosition* valueUse = BuildUse(tree->gtCmpXchg.gtOpValue);
             setDelayFree(valueUse);
-            if (!cmpXchgNode->gtOpComparand->isContained())
+            if (!cmpXchgNode->gtOpComparand->isContained() && !compiler->compSupports(InstructionSet_Atomics))
             {
+                // For ARMv8 exclusives the lifetime of the comparand must be extended because
+                // it may be used multiple times during retries
+
                 RefPosition* comparandUse = BuildUse(tree->gtCmpXchg.gtOpComparand);
                 setDelayFree(comparandUse);
             }
@@ -446,34 +457,37 @@ int LinearScan::BuildNode(GenTree* tree)
             assert(dstCount == (tree->TypeGet() == TYP_VOID) ? 0 : 1);
             srcCount = tree->gtGetOp2()->isContained() ? 1 : 2;
 
-            // GT_XCHG requires a single internal regiester; the others require two.
-            buildInternalIntRegisterDefForNode(tree);
-            if (tree->OperGet() != GT_XCHG)
+            if (!compiler->compSupports(InstructionSet_Atomics))
             {
+                // GT_XCHG requires a single internal register; the others require two.
                 buildInternalIntRegisterDefForNode(tree);
-            }
+                if (tree->OperGet() != GT_XCHG)
+                {
+                    buildInternalIntRegisterDefForNode(tree);
+                }
 
-            // For ARMv8 exclusives the lifetime of the addr and data must be extended because
-            // it may be used used multiple during retries
-            assert(!tree->gtGetOp1()->isContained());
-            RefPosition* op1Use = BuildUse(tree->gtGetOp1());
-            RefPosition* op2Use = nullptr;
-            if (!tree->gtGetOp2()->isContained())
-            {
-                op2Use = BuildUse(tree->gtGetOp2());
-            }
+                // For ARMv8 exclusives the lifetime of the addr and data must be extended because
+                // they may be used multiple times during retries
+                assert(!tree->gtGetOp1()->isContained());
+                RefPosition* op1Use = BuildUse(tree->gtGetOp1());
+                RefPosition* op2Use = nullptr;
+                if (!tree->gtGetOp2()->isContained())
+                {
+                    op2Use = BuildUse(tree->gtGetOp2());
+                }
 
-            // Internals may not collide with target
-            if (dstCount == 1)
-            {
-                setDelayFree(op1Use);
-                if (op2Use != nullptr)
+                // Internals may not collide with target
+                if (dstCount == 1)
                 {
-                    setDelayFree(op2Use);
+                    setDelayFree(op1Use);
+                    if (op2Use != nullptr)
+                    {
+                        setDelayFree(op2Use);
+                    }
+                    setInternalRegsDelayFree = true;
                 }
-                setInternalRegsDelayFree = true;
+                buildInternalRegisterUses();
             }
-            buildInternalRegisterUses();
 
             if (dstCount == 1)
             {
                 BuildDef(tree);
-- 
2.7.4
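For readers outside the JIT, here is a minimal standalone C++ sketch (not part of the patch; the function names are illustrative, not CoreCLR APIs) of the two code shapes this change selects between. Built for AArch64 with LSE available (for example -O2 -march=armv8.1-a), current GCC and Clang typically emit the same single-instruction forms used above (swpal, ldaddal/staddl, casal); built for plain -march=armv8-a they fall back to ldaxr/stlxr retry loops like the ones the else paths emit.

#include <atomic>
#include <cstdint>

// GT_XCHG: a single swpal with LSE, otherwise an ldaxr/stlxr loop.
int64_t interlocked_exchange(std::atomic<int64_t>& loc, int64_t value)
{
    return loc.exchange(value); // memory_order_seq_cst by default
}

// GT_XADD: ldaddal with LSE; when the old value is unused a staddl
// (no destination register) suffices, matching the
// (targetReg == REG_NA) || (targetReg == REG_ZR) case above.
int64_t interlocked_add(std::atomic<int64_t>& loc, int64_t value)
{
    return loc.fetch_add(value);
}

// GT_CMPXCHG: casal with LSE. casal reads the comparand from the same
// register that also receives the loaded value, which is why the JIT
// moves comparandReg into targetReg first when the two differ.
int64_t interlocked_compare_exchange(std::atomic<int64_t>& loc, int64_t value, int64_t comparand)
{
    loc.compare_exchange_strong(comparand, value);
    return comparand; // now holds the value that was observed at loc
}

Note that the patch still emits instGen_MemoryBarrier(INS_BARRIER_ISH) after the LSE instructions, so the Interlocked operations keep the same full-barrier semantics the exclusives path provided.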