[Arm64] JIT generate LSE Atomics
authorSteve MacLean <sdmaclea.qdt@qualcommdatacenter.com>
Fri, 25 May 2018 00:08:44 +0000 (20:08 -0400)
committerSteve MacLean <sdmaclea.qdt@qualcommdatacenter.com>
Thu, 7 Jun 2018 19:00:06 +0000 (15:00 -0400)
When ARMv8.1 LSE Atomics are available, use them to implement
Interlocked operations.

src/jit/codegenarm64.cpp
src/jit/lowerarmarch.cpp
src/jit/lsraarm64.cpp

index afa3039..8bb04c2 100644 (file)
@@ -2664,92 +2664,118 @@ void CodeGen::genLockedInstructions(GenTreeOp* treeNode)
     regNumber dataReg   = data->gtRegNum;
     regNumber addrReg   = addr->gtRegNum;
 
-    regNumber exResultReg  = treeNode->ExtractTempReg(RBM_ALLINT);
-    regNumber storeDataReg = (treeNode->OperGet() == GT_XCHG) ? dataReg : treeNode->ExtractTempReg(RBM_ALLINT);
-    regNumber loadReg      = (targetReg != REG_NA) ? targetReg : storeDataReg;
+    genConsumeAddress(addr);
+    genConsumeRegs(data);
 
-    // Check allocator assumptions
-    //
-    // The register allocator should have extended the lifetimes of all input and internal registers so that
-    // none interfere with the target.
-    noway_assert(addrReg != targetReg);
+    emitAttr dataSize = emitActualTypeSize(data);
 
-    noway_assert(addrReg != loadReg);
-    noway_assert(dataReg != loadReg);
+    if (compiler->compSupports(InstructionSet_Atomics))
+    {
+        assert(!data->isContainedIntOrIImmed());
 
-    noway_assert(addrReg != storeDataReg);
-    noway_assert((treeNode->OperGet() == GT_XCHG) || (addrReg != dataReg));
+        switch (treeNode->gtOper)
+        {
+            case GT_XCHG:
+                getEmitter()->emitIns_R_R_R(INS_swpal, dataSize, dataReg, targetReg, addrReg);
+                break;
+            case GT_XADD:
+                if ((targetReg == REG_NA) || (targetReg == REG_ZR))
+                {
+                    getEmitter()->emitIns_R_R(INS_staddl, dataSize, dataReg, addrReg);
+                }
+                else
+                {
+                    getEmitter()->emitIns_R_R_R(INS_ldaddal, dataSize, dataReg, targetReg, addrReg);
+                }
+                break;
+            default:
+                assert(!"Unexpected treeNode->gtOper");
+        }
 
-    assert(addr->isUsedFromReg());
-    noway_assert(exResultReg != REG_NA);
-    noway_assert(exResultReg != targetReg);
-    noway_assert((targetReg != REG_NA) || (treeNode->OperGet() != GT_XCHG));
+        instGen_MemoryBarrier(INS_BARRIER_ISH);
+    }
+    else
+    {
+        regNumber exResultReg  = treeNode->ExtractTempReg(RBM_ALLINT);
+        regNumber storeDataReg = (treeNode->OperGet() == GT_XCHG) ? dataReg : treeNode->ExtractTempReg(RBM_ALLINT);
+        regNumber loadReg      = (targetReg != REG_NA) ? targetReg : storeDataReg;
 
-    // Store exclusive unpredictable cases must be avoided
-    noway_assert(exResultReg != storeDataReg);
-    noway_assert(exResultReg != addrReg);
+        // Check allocator assumptions
+        //
+        // The register allocator should have extended the lifetimes of all input and internal registers so that
+        // none interfere with the target.
+        noway_assert(addrReg != targetReg);
 
-    genConsumeAddress(addr);
-    genConsumeRegs(data);
+        noway_assert(addrReg != loadReg);
+        noway_assert(dataReg != loadReg);
 
-    // NOTE: `genConsumeAddress` marks the consumed register as not a GC pointer, as it assumes that the input registers
-    // die at the first instruction generated by the node. This is not the case for these atomics as the  input
-    // registers are multiply-used. As such, we need to mark the addr register as containing a GC pointer until
-    // we are finished generating the code for this node.
+        noway_assert(addrReg != storeDataReg);
+        noway_assert((treeNode->OperGet() == GT_XCHG) || (addrReg != dataReg));
 
-    gcInfo.gcMarkRegPtrVal(addrReg, addr->TypeGet());
+        assert(addr->isUsedFromReg());
+        noway_assert(exResultReg != REG_NA);
+        noway_assert(exResultReg != targetReg);
+        noway_assert((targetReg != REG_NA) || (treeNode->OperGet() != GT_XCHG));
 
-    // TODO-ARM64-CQ Use ARMv8.1 atomics if available
-    // https://github.com/dotnet/coreclr/issues/11881
+        // Store exclusive unpredictable cases must be avoided
+        noway_assert(exResultReg != storeDataReg);
+        noway_assert(exResultReg != addrReg);
 
-    // Emit code like this:
-    //   retry:
-    //     ldxr loadReg, [addrReg]
-    //     add storeDataReg, loadReg, dataReg         # Only for GT_XADD
-    //                                                # GT_XCHG storeDataReg === dataReg
-    //     stxr exResult, storeDataReg, [addrReg]
-    //     cbnz exResult, retry
-    //     dmb ish
+        // NOTE: `genConsumeAddress` marks the consumed register as not a GC pointer, as it assumes
+        // that the input registers die at the first instruction generated by the node. This is not
+        // the case for these atomics as the input registers are multiply-used. As such, we need to
+        // mark the addr register as containing a GC pointer until we are finished generating the
+        // code for this node.
 
-    BasicBlock* labelRetry = genCreateTempLabel();
-    genDefineTempLabel(labelRetry);
+        gcInfo.gcMarkRegPtrVal(addrReg, addr->TypeGet());
 
-    emitAttr dataSize = emitActualTypeSize(data);
+        // Emit code like this:
+        //   retry:
+        //     ldxr loadReg, [addrReg]
+        //     add storeDataReg, loadReg, dataReg         # Only for GT_XADD
+        //                                                # GT_XCHG storeDataReg === dataReg
+        //     stxr exResult, storeDataReg, [addrReg]
+        //     cbnz exResult, retry
+        //     dmb ish
 
-    // The following instruction includes a acquire half barrier
-    getEmitter()->emitIns_R_R(INS_ldaxr, dataSize, loadReg, addrReg);
+        BasicBlock* labelRetry = genCreateTempLabel();
+        genDefineTempLabel(labelRetry);
 
-    switch (treeNode->OperGet())
-    {
-        case GT_XADD:
-            if (data->isContainedIntOrIImmed())
-            {
-                // Even though INS_add is specified here, the encoder will choose either
-                // an INS_add or an INS_sub and encode the immediate as a positive value
-                genInstrWithConstant(INS_add, dataSize, storeDataReg, loadReg, data->AsIntConCommon()->IconValue(),
-                                     REG_NA);
-            }
-            else
-            {
-                getEmitter()->emitIns_R_R_R(INS_add, dataSize, storeDataReg, loadReg, dataReg);
-            }
-            break;
-        case GT_XCHG:
-            assert(!data->isContained());
-            storeDataReg = dataReg;
-            break;
-        default:
-            unreached();
-    }
+        // The following instruction includes an acquire half barrier
+        getEmitter()->emitIns_R_R(INS_ldaxr, dataSize, loadReg, addrReg);
 
-    // The following instruction includes a release half barrier
-    getEmitter()->emitIns_R_R_R(INS_stlxr, dataSize, exResultReg, storeDataReg, addrReg);
+        switch (treeNode->OperGet())
+        {
+            case GT_XADD:
+                if (data->isContainedIntOrIImmed())
+                {
+                    // Even though INS_add is specified here, the encoder will choose either
+                    // an INS_add or an INS_sub and encode the immediate as a positive value
+                    genInstrWithConstant(INS_add, dataSize, storeDataReg, loadReg, data->AsIntConCommon()->IconValue(),
+                                         REG_NA);
+                }
+                else
+                {
+                    getEmitter()->emitIns_R_R_R(INS_add, dataSize, storeDataReg, loadReg, dataReg);
+                }
+                break;
+            case GT_XCHG:
+                assert(!data->isContained());
+                storeDataReg = dataReg;
+                break;
+            default:
+                unreached();
+        }
+
+        // The following instruction includes a release half barrier
+        getEmitter()->emitIns_R_R_R(INS_stlxr, dataSize, exResultReg, storeDataReg, addrReg);
 
-    getEmitter()->emitIns_J_R(INS_cbnz, EA_4BYTE, labelRetry, exResultReg);
+        getEmitter()->emitIns_J_R(INS_cbnz, EA_4BYTE, labelRetry, exResultReg);
 
-    instGen_MemoryBarrier(INS_BARRIER_ISH);
+        instGen_MemoryBarrier(INS_BARRIER_ISH);
 
-    gcInfo.gcMarkRegSetNpt(addr->gtGetRegMask());
+        gcInfo.gcMarkRegSetNpt(addr->gtGetRegMask());
+    }
 
     if (treeNode->gtRegNum != REG_NA)
     {
@@ -2775,88 +2801,110 @@ void CodeGen::genCodeForCmpXchg(GenTreeCmpXchg* treeNode)
     regNumber dataReg      = data->gtRegNum;
     regNumber addrReg      = addr->gtRegNum;
     regNumber comparandReg = comparand->gtRegNum;
-    regNumber exResultReg  = treeNode->ExtractTempReg(RBM_ALLINT);
-
-    // Check allocator assumptions
-    //
-    // The register allocator should have extended the lifetimes of all input and internal registers so that
-    // none interfere with the target.
-    noway_assert(addrReg != targetReg);
-    noway_assert(dataReg != targetReg);
-    noway_assert(comparandReg != targetReg);
-    noway_assert(addrReg != dataReg);
-    noway_assert(targetReg != REG_NA);
-    noway_assert(exResultReg != REG_NA);
-    noway_assert(exResultReg != targetReg);
-
-    assert(addr->isUsedFromReg());
-    assert(data->isUsedFromReg());
-    assert(!comparand->isUsedFromMemory());
-
-    // Store exclusive unpredictable cases must be avoided
-    noway_assert(exResultReg != dataReg);
-    noway_assert(exResultReg != addrReg);
 
     genConsumeAddress(addr);
     genConsumeRegs(data);
     genConsumeRegs(comparand);
 
-    // NOTE: `genConsumeAddress` marks the consumed register as not a GC pointer, as it assumes that the input registers
-    // die at the first instruction generated by the node. This is not the case for these atomics as the  input
-    // registers are multiply-used. As such, we need to mark the addr register as containing a GC pointer until
-    // we are finished generating the code for this node.
-
-    gcInfo.gcMarkRegPtrVal(addrReg, addr->TypeGet());
-
-    // TODO-ARM64-CQ Use ARMv8.1 atomics if available
-    // https://github.com/dotnet/coreclr/issues/11881
-
-    // Emit code like this:
-    //   retry:
-    //     ldxr targetReg, [addrReg]
-    //     cmp targetReg, comparandReg
-    //     bne compareFail
-    //     stxr exResult, dataReg, [addrReg]
-    //     cbnz exResult, retry
-    //   compareFail:
-    //     dmb ish
+    if (compiler->compSupports(InstructionSet_Atomics))
+    {
+        emitAttr dataSize = emitActualTypeSize(data);
 
-    BasicBlock* labelRetry       = genCreateTempLabel();
-    BasicBlock* labelCompareFail = genCreateTempLabel();
-    genDefineTempLabel(labelRetry);
+        // casal uses the comparand as the target register
+        if (targetReg != comparandReg)
+        {
+            getEmitter()->emitIns_R_R(INS_mov, dataSize, targetReg, comparandReg);
 
-    // The following instruction includes a acquire half barrier
-    getEmitter()->emitIns_R_R(INS_ldaxr, emitTypeSize(treeNode), targetReg, addrReg);
+            // Catch case we destroyed data or address before use
+            noway_assert(addrReg != targetReg);
+            noway_assert(dataReg != targetReg);
+        }
+        getEmitter()->emitIns_R_R_R(INS_casal, dataSize, targetReg, dataReg, addrReg);
 
-    if (comparand->isContainedIntOrIImmed())
+        instGen_MemoryBarrier(INS_BARRIER_ISH);
+    }
+    else
     {
-        if (comparand->IsIntegralConst(0))
+        regNumber exResultReg = treeNode->ExtractTempReg(RBM_ALLINT);
+
+        // Check allocator assumptions
+        //
+        // The register allocator should have extended the lifetimes of all input and internal registers so that
+        // none interfere with the target.
+        noway_assert(addrReg != targetReg);
+        noway_assert(dataReg != targetReg);
+        noway_assert(comparandReg != targetReg);
+        noway_assert(addrReg != dataReg);
+        noway_assert(targetReg != REG_NA);
+        noway_assert(exResultReg != REG_NA);
+        noway_assert(exResultReg != targetReg);
+
+        assert(addr->isUsedFromReg());
+        assert(data->isUsedFromReg());
+        assert(!comparand->isUsedFromMemory());
+
+        // Store exclusive unpredictable cases must be avoided
+        noway_assert(exResultReg != dataReg);
+        noway_assert(exResultReg != addrReg);
+
+        // NOTE: `genConsumeAddress` marks the consumed register as not a GC pointer, as it assumes
+        // that the input registers die at the first instruction generated by the node. This is not
+        // the case for these atomics as the input registers are multiply-used. As such, we need to
+        // mark the addr register as containing a GC pointer until we are finished generating the
+        // code for this node.
+
+        gcInfo.gcMarkRegPtrVal(addrReg, addr->TypeGet());
+
+        // ARMv8.1 LSE atomics are not available; fall back to a load/store-exclusive retry loop.
+        // See https://github.com/dotnet/coreclr/issues/11881
+
+        // Emit code like this:
+        //   retry:
+        //     ldxr targetReg, [addrReg]
+        //     cmp targetReg, comparandReg
+        //     bne compareFail
+        //     stxr exResult, dataReg, [addrReg]
+        //     cbnz exResult, retry
+        //   compareFail:
+        //     dmb ish
+
+        BasicBlock* labelRetry       = genCreateTempLabel();
+        BasicBlock* labelCompareFail = genCreateTempLabel();
+        genDefineTempLabel(labelRetry);
+
+        // The following instruction includes an acquire half barrier
+        getEmitter()->emitIns_R_R(INS_ldaxr, emitTypeSize(treeNode), targetReg, addrReg);
+
+        if (comparand->isContainedIntOrIImmed())
         {
-            getEmitter()->emitIns_J_R(INS_cbnz, emitActualTypeSize(treeNode), labelCompareFail, targetReg);
+            if (comparand->IsIntegralConst(0))
+            {
+                getEmitter()->emitIns_J_R(INS_cbnz, emitActualTypeSize(treeNode), labelCompareFail, targetReg);
+            }
+            else
+            {
+                getEmitter()->emitIns_R_I(INS_cmp, emitActualTypeSize(treeNode), targetReg,
+                                          comparand->AsIntConCommon()->IconValue());
+                getEmitter()->emitIns_J(INS_bne, labelCompareFail);
+            }
         }
         else
         {
-            getEmitter()->emitIns_R_I(INS_cmp, emitActualTypeSize(treeNode), targetReg,
-                                      comparand->AsIntConCommon()->IconValue());
+            getEmitter()->emitIns_R_R(INS_cmp, emitActualTypeSize(treeNode), targetReg, comparandReg);
             getEmitter()->emitIns_J(INS_bne, labelCompareFail);
         }
-    }
-    else
-    {
-        getEmitter()->emitIns_R_R(INS_cmp, emitActualTypeSize(treeNode), targetReg, comparandReg);
-        getEmitter()->emitIns_J(INS_bne, labelCompareFail);
-    }
 
-    // The following instruction includes a release half barrier
-    getEmitter()->emitIns_R_R_R(INS_stlxr, emitTypeSize(treeNode), exResultReg, dataReg, addrReg);
+        // The following instruction includes a release half barrier
+        getEmitter()->emitIns_R_R_R(INS_stlxr, emitTypeSize(treeNode), exResultReg, dataReg, addrReg);
 
-    getEmitter()->emitIns_J_R(INS_cbnz, EA_4BYTE, labelRetry, exResultReg);
+        getEmitter()->emitIns_J_R(INS_cbnz, EA_4BYTE, labelRetry, exResultReg);
 
-    genDefineTempLabel(labelCompareFail);
+        genDefineTempLabel(labelCompareFail);
 
-    instGen_MemoryBarrier(INS_BARRIER_ISH);
+        instGen_MemoryBarrier(INS_BARRIER_ISH);
 
-    gcInfo.gcMarkRegSetNpt(addr->gtGetRegMask());
+        gcInfo.gcMarkRegSetNpt(addr->gtGetRegMask());
+    }
 
     genProduceReg(treeNode);
 }
index 47998fe..b156b28 100644 (file)
@@ -90,7 +90,8 @@ bool Lowering::IsContainableImmed(GenTree* parentNode, GenTree* childNode)
             case GT_CMPXCHG:
             case GT_LOCKADD:
             case GT_XADD:
-                return emitter::emitIns_valid_imm_for_add(immVal, size);
+                return comp->compSupports(InstructionSet_Atomics) ? false
+                                                                  : emitter::emitIns_valid_imm_for_add(immVal, size);
 #elif defined(_TARGET_ARM_)
                 return emitter::emitIns_valid_imm_for_add(immVal, flags);
 #endif
index 9cc17ca..b76dcf4 100644 (file)
@@ -418,16 +418,27 @@ int LinearScan::BuildNode(GenTree* tree)
             srcCount                    = cmpXchgNode->gtOpComparand->isContained() ? 2 : 3;
             assert(dstCount == 1);
 
-            buildInternalIntRegisterDefForNode(tree);
+            if (!compiler->compSupports(InstructionSet_Atomics))
+            {
+                // ARMv8 exclusives require a single internal register
+                buildInternalIntRegisterDefForNode(tree);
+            }
 
             // For ARMv8 exclusives the lifetime of the addr and data must be extended because
             // it may be used used multiple during retries
+
+            // For ARMv8.1 atomic CAS the lifetime of the addr and data must be extended to prevent
+            // them being reused as the target register which must be destroyed early
+
             RefPosition* locationUse = BuildUse(tree->gtCmpXchg.gtOpLocation);
             setDelayFree(locationUse);
             RefPosition* valueUse = BuildUse(tree->gtCmpXchg.gtOpValue);
             setDelayFree(valueUse);
-            if (!cmpXchgNode->gtOpComparand->isContained())
+            if (!cmpXchgNode->gtOpComparand->isContained() && !compiler->compSupports(InstructionSet_Atomics))
             {
+                // For ARMv8 exclusives the lifetime of the comparand must be extended because
+                // it may be used multiple times during retries
+
                 RefPosition* comparandUse = BuildUse(tree->gtCmpXchg.gtOpComparand);
                 setDelayFree(comparandUse);
             }
@@ -446,34 +457,37 @@ int LinearScan::BuildNode(GenTree* tree)
             assert(dstCount == (tree->TypeGet() == TYP_VOID) ? 0 : 1);
             srcCount = tree->gtGetOp2()->isContained() ? 1 : 2;
 
-            // GT_XCHG requires a single internal regiester; the others require two.
-            buildInternalIntRegisterDefForNode(tree);
-            if (tree->OperGet() != GT_XCHG)
+            if (!compiler->compSupports(InstructionSet_Atomics))
             {
+                // GT_XCHG requires a single internal register; the others require two.
                 buildInternalIntRegisterDefForNode(tree);
-            }
+                if (tree->OperGet() != GT_XCHG)
+                {
+                    buildInternalIntRegisterDefForNode(tree);
+                }
 
-            // For ARMv8 exclusives the lifetime of the addr and data must be extended because
-            // it may be used used multiple during retries
-            assert(!tree->gtGetOp1()->isContained());
-            RefPosition* op1Use = BuildUse(tree->gtGetOp1());
-            RefPosition* op2Use = nullptr;
-            if (!tree->gtGetOp2()->isContained())
-            {
-                op2Use = BuildUse(tree->gtGetOp2());
-            }
+                // For ARMv8 exclusives the lifetime of the addr and data must be extended because
+                // it may be used multiple times during retries
+                assert(!tree->gtGetOp1()->isContained());
+                RefPosition* op1Use = BuildUse(tree->gtGetOp1());
+                RefPosition* op2Use = nullptr;
+                if (!tree->gtGetOp2()->isContained())
+                {
+                    op2Use = BuildUse(tree->gtGetOp2());
+                }
 
-            // Internals may not collide with target
-            if (dstCount == 1)
-            {
-                setDelayFree(op1Use);
-                if (op2Use != nullptr)
+                // Internals may not collide with target
+                if (dstCount == 1)
                 {
-                    setDelayFree(op2Use);
+                    setDelayFree(op1Use);
+                    if (op2Use != nullptr)
+                    {
+                        setDelayFree(op2Use);
+                    }
+                    setInternalRegsDelayFree = true;
                 }
-                setInternalRegsDelayFree = true;
+                buildInternalRegisterUses();
             }
-            buildInternalRegisterUses();
             if (dstCount == 1)
             {
                 BuildDef(tree);