[Arm64] Use SIMD register to zero init frame (#46609)
author Egor Chesakov <Egor.Chesakov@microsoft.com>
Tue, 26 Jan 2021 19:51:01 +0000 (11:51 -0800)
committer GitHub <noreply@github.com>
Tue, 26 Jan 2021 19:51:01 +0000 (11:51 -0800)
* Inline "stp q-reg, q-reg, addr-reg" for frames >= 32 bytes

* Use a pair of "stp q-reg, q-reg, addr-reg" in a loop that clears 64 bytes per iteration for frames >= 192 bytes

* For frames >= 256 bytes, use the dc zva instruction when its use is permitted and its block size is 64 bytes

14 files changed:
src/coreclr/inc/corinfoinstructionset.h
src/coreclr/inc/jiteeversionguid.h
src/coreclr/jit/codegenarm64.cpp
src/coreclr/jit/codegencommon.cpp
src/coreclr/jit/emitarm64.cpp
src/coreclr/jit/emitfmtsarm64.h
src/coreclr/jit/instrsarm64.h
src/coreclr/jit/target.h
src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSetHelper.cs
src/coreclr/tools/Common/JitInterface/CorInfoInstructionSet.cs
src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt
src/coreclr/vm/arm64/asmhelpers.S
src/coreclr/vm/arm64/asmhelpers.asm
src/coreclr/vm/codeman.cpp

index 75ab11d..5b7ec3f 100644 (file)
@@ -27,14 +27,15 @@ enum CORINFO_InstructionSet
     InstructionSet_Atomics=9,
     InstructionSet_Vector64=10,
     InstructionSet_Vector128=11,
-    InstructionSet_ArmBase_Arm64=12,
-    InstructionSet_AdvSimd_Arm64=13,
-    InstructionSet_Aes_Arm64=14,
-    InstructionSet_Crc32_Arm64=15,
-    InstructionSet_Dp_Arm64=16,
-    InstructionSet_Rdm_Arm64=17,
-    InstructionSet_Sha1_Arm64=18,
-    InstructionSet_Sha256_Arm64=19,
+    InstructionSet_Dczva=12,
+    InstructionSet_ArmBase_Arm64=13,
+    InstructionSet_AdvSimd_Arm64=14,
+    InstructionSet_Aes_Arm64=15,
+    InstructionSet_Crc32_Arm64=16,
+    InstructionSet_Dp_Arm64=17,
+    InstructionSet_Rdm_Arm64=18,
+    InstructionSet_Sha1_Arm64=19,
+    InstructionSet_Sha256_Arm64=20,
 #endif // TARGET_ARM64
 #ifdef TARGET_AMD64
     InstructionSet_X86Base=1,
@@ -457,6 +458,8 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet)
             return "Vector64";
         case InstructionSet_Vector128 :
             return "Vector128";
+        case InstructionSet_Dczva :
+            return "Dczva";
 #endif // TARGET_ARM64
 #ifdef TARGET_AMD64
         case InstructionSet_X86Base :
index 69d85ed..452e508 100644 (file)
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
 //
 
-constexpr GUID JITEEVersionIdentifier = { /* 000b3acb-92d2-4003-8760-e545241dd9a8 */
-    0x000b3acb,
-    0x92d2,
-    0x4003,
-    {0x87, 0x60, 0xe5, 0x45, 0x24, 0x1d, 0xd9, 0xa8}
+constexpr GUID JITEEVersionIdentifier = { /* 960894e2-ec41-4088-82bb-bdcbac4ac2d3 */
+    0x960894e2,
+    0xec41,
+    0x4088,
+    {0x82, 0xbb, 0xbd, 0xcb, 0xac, 0x4a, 0xc2, 0xd3}
 };
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
index dcd3e08..7bff146 100644 (file)
@@ -54,7 +54,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 //                          Default: false.
 //
 // Return Value:
-//    returns true if the immediate was too large and tmpReg was used and modified.
+//    returns true if the immediate was small enough to be encoded inside instruction. If not,
+//    returns false meaning the immediate was too large and tmpReg was used and modified.
 //
 bool CodeGen::genInstrWithConstant(instruction ins,
                                    emitAttr    attr,
index 3933ace..90c7500 100644 (file)
@@ -6148,37 +6148,33 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg,
     if (genUseBlockInit)
     {
         assert(untrLclHi > untrLclLo);
-#ifdef TARGET_ARMARCH
-        /*
-            Generate the following code:
-
-            For cnt less than 10
-
-                mov     rZero1, 0
-                mov     rZero2, 0
-                mov     rCnt,  <cnt>
-                stm     <rZero1,rZero2>,[rAddr!]
-    <optional>  stm     <rZero1,rZero2>,[rAddr!]
-    <optional>  stm     <rZero1,rZero2>,[rAddr!]
-    <optional>  stm     <rZero1,rZero2>,[rAddr!]
-    <optional>  str     rZero1,[rAddr]
-
-            For rCnt greater than or equal to 10
-
-                mov     rZero1, 0
-                mov     rZero2, 0
-                mov     rCnt,  <cnt/2>
-                sub     rAddr, sp, OFFS
-
-            loop:
-                stm     <rZero1,rZero2>,[rAddr!]
-                sub     rCnt,rCnt,1
-                jnz     loop
-
-    <optional>  str     rZero1,[rAddr]   // When cnt is odd
-
-            NOTE: for ARM64, the instruction is stp, not stm. And we can use ZR instead of allocating registers.
-         */
+#ifdef TARGET_ARM
+        // Generate the following code:
+        //
+        // For cnt less than 10
+        //
+        //            mov     rZero1, 0
+        //            mov     rZero2, 0
+        //            mov     rCnt,  <cnt>
+        //            stm     <rZero1,rZero2>,[rAddr!]
+        // <optional> stm     <rZero1,rZero2>,[rAddr!]
+        // <optional> stm     <rZero1,rZero2>,[rAddr!]
+        // <optional> stm     <rZero1,rZero2>,[rAddr!]
+        // <optional> str     rZero1,[rAddr]
+        //
+        // For rCnt greater than or equal to 10
+        //
+        //            mov     rZero1, 0
+        //            mov     rZero2, 0
+        //            mov     rCnt,  <cnt/2>
+        //            sub     rAddr, sp, OFFS
+        //
+        //        loop:
+        //            stm     <rZero1,rZero2>,[rAddr!]
+        //            sub     rCnt,rCnt,1
+        //            jnz     loop
+        //
+        // <optional> str     rZero1,[rAddr]   // When cnt is odd
 
         regNumber rAddr;
         regNumber rCnt = REG_NA; // Invalid
@@ -6190,8 +6186,6 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg,
         availMask &= ~genRegMask(initReg); // Remove the pre-calculated initReg as we will zero it and maybe use it for
                                            // a large constant.
 
-#if defined(TARGET_ARM)
-
         if (compiler->compLocallocUsed)
         {
             availMask &= ~RBM_SAVED_LOCALLOC_SP; // Remove the register reserved when we have a localloc frame
@@ -6214,13 +6208,6 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg,
         rAddr   = genRegNumFromMask(regMask);
         availMask &= ~regMask;
 
-#else // !define(TARGET_ARM)
-
-        rAddr           = initReg;
-        *pInitRegZeroed = false;
-
-#endif // !defined(TARGET_ARM)
-
         bool     useLoop   = false;
         unsigned uCntBytes = untrLclHi - untrLclLo;
         assert((uCntBytes % sizeof(int)) == 0);         // The smallest stack slot is always 4 bytes.
@@ -6245,11 +6232,7 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg,
         // rAddr is not a live incoming argument reg
         assert((genRegMask(rAddr) & intRegState.rsCalleeRegArgMaskLiveIn) == 0);
 
-#if defined(TARGET_ARM)
         if (arm_Valid_Imm_For_Add(untrLclLo, INS_FLAGS_DONT_CARE))
-#else  // !TARGET_ARM
-        if (emitter::emitIns_valid_imm_for_add(untrLclLo, EA_PTRSIZE))
-#endif // !TARGET_ARM
         {
             GetEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, rAddr, genFramePointerReg(), untrLclLo);
         }
@@ -6269,65 +6252,212 @@ void CodeGen::genZeroInitFrame(int untrLclHi, int untrLclLo, regNumber initReg,
             instGen_Set_Reg_To_Imm(EA_PTRSIZE, rCnt, (ssize_t)uCntSlots / 2);
         }
 
-#if defined(TARGET_ARM)
         rZero1 = genGetZeroReg(initReg, pInitRegZeroed);
         instGen_Set_Reg_To_Zero(EA_PTRSIZE, rZero2);
         target_ssize_t stmImm = (target_ssize_t)(genRegMask(rZero1) | genRegMask(rZero2));
-#endif // TARGET_ARM
 
         if (!useLoop)
         {
             while (uCntBytes >= REGSIZE_BYTES * 2)
             {
-#ifdef TARGET_ARM
                 GetEmitter()->emitIns_R_I(INS_stm, EA_PTRSIZE, rAddr, stmImm);
-#else  // !TARGET_ARM
-                GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, rAddr, 2 * REGSIZE_BYTES,
-                                              INS_OPTS_POST_INDEX);
-#endif // !TARGET_ARM
                 uCntBytes -= REGSIZE_BYTES * 2;
             }
         }
-        else // useLoop is true
+        else
         {
-#ifdef TARGET_ARM
             GetEmitter()->emitIns_R_I(INS_stm, EA_PTRSIZE, rAddr, stmImm); // zero stack slots
             GetEmitter()->emitIns_R_I(INS_sub, EA_PTRSIZE, rCnt, 1, INS_FLAGS_SET);
-#else  // !TARGET_ARM
-            GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, rAddr, 2 * REGSIZE_BYTES,
-                                          INS_OPTS_POST_INDEX); // zero stack slots
-            GetEmitter()->emitIns_R_R_I(INS_subs, EA_PTRSIZE, rCnt, rCnt, 1);
-#endif // !TARGET_ARM
             GetEmitter()->emitIns_J(INS_bhi, NULL, -3);
             uCntBytes %= REGSIZE_BYTES * 2;
         }
 
         if (uCntBytes >= REGSIZE_BYTES) // check and zero the last register-sized stack slot (odd number)
         {
-#ifdef TARGET_ARM
             GetEmitter()->emitIns_R_R_I(INS_str, EA_PTRSIZE, rZero1, rAddr, 0);
-#else  // TARGET_ARM
-            if ((uCntBytes - REGSIZE_BYTES) == 0)
+            uCntBytes -= REGSIZE_BYTES;
+        }
+
+        noway_assert(uCntBytes == 0);
+
+#elif defined(TARGET_ARM64)
+        int bytesToWrite = untrLclHi - untrLclLo;
+
+        const regNumber zeroSimdReg          = REG_ZERO_INIT_FRAME_SIMD;
+        bool            simdRegZeroed        = false;
+        const int       simdRegPairSizeBytes = 2 * FP_REGSIZE_BYTES;
+
+        regNumber addrReg = REG_ZERO_INIT_FRAME_REG1;
+
+        if (addrReg == initReg)
+        {
+            *pInitRegZeroed = false;
+        }
+
+        int addrOffset = 0;
+
+        // The following invariants are held below:
+        //
+        //   1) [addrReg, #addrOffset] points at a location where next chunk of zero bytes will be written;
+        //   2) bytesToWrite specifies the number of bytes on the frame to initialize;
+        //   3) if simdRegZeroed is true then 128-bit wide zeroSimdReg contains zeroes.
+
+        const int bytesUseZeroingLoop = 192;
+
+        if (bytesToWrite >= bytesUseZeroingLoop)
+        {
+            // Generates the following code:
+            //
+            // When the size of the region is greater than or equal to 256 bytes
+            // **and** DC ZVA instruction use is permitted
+            // **and** the instruction block size is configured to 64 bytes:
+            //
+            //    movi    v16.16b, #0
+            //    add     x9, fp, #(untrLclLo+64)
+            //    add     x10, fp, #(untrLclHi-64)
+            //    stp     q16, q16, [x9, #-64]
+            //    stp     q16, q16, [x9, #-32]
+            //    bfm     x9, xzr, #0, #5
+            //
+            // loop:
+            //    dc      zva, x9
+            //    add     x9, x9, #64
+            //    cmp     x9, x10
+            //    blo     loop
+            //
+            //    stp     q16, q16, [x10]
+            //    stp     q16, q16, [x10, #32]
+            //
+            // Otherwise:
+            //
+            //     movi    v16.16b, #0
+            //     add     x9, fp, #(untrLclLo-32)
+            //     mov     x10, #(bytesToWrite-64)
+            //
+            // loop:
+            //     stp     q16, q16, [x9, #32]
+            //     stp     q16, q16, [x9, #64]!
+            //     subs    x10, x10, #64
+            //     bge     loop
+
+            const int bytesUseDataCacheZeroInstruction = 256;
+
+            GetEmitter()->emitIns_R_I(INS_movi, EA_16BYTE, zeroSimdReg, 0, INS_OPTS_16B);
+            simdRegZeroed = true;
+
+            if ((bytesToWrite >= bytesUseDataCacheZeroInstruction) &&
+                compiler->compOpportunisticallyDependsOn(InstructionSet_Dczva))
             {
-                GetEmitter()->emitIns_R_R_I(INS_str, EA_PTRSIZE, REG_ZR, rAddr, 0);
+                // The first and the last 64 bytes should be written with two stp q-reg instructions.
+                // This is in order to avoid **unintended** zeroing of the data by dc zva
+                // outside of [fp+untrLclLo, fp+untrLclHi) memory region.
+
+                genInstrWithConstant(INS_add, EA_PTRSIZE, addrReg, genFramePointerReg(), untrLclLo + 64, addrReg);
+                addrOffset = -64;
+
+                const regNumber endAddrReg = REG_ZERO_INIT_FRAME_REG2;
+
+                if (endAddrReg == initReg)
+                {
+                    *pInitRegZeroed = false;
+                }
+
+                genInstrWithConstant(INS_add, EA_PTRSIZE, endAddrReg, genFramePointerReg(), untrLclHi - 64, endAddrReg);
+
+                GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_16BYTE, zeroSimdReg, zeroSimdReg, addrReg, addrOffset);
+                addrOffset += simdRegPairSizeBytes;
+
+                GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_16BYTE, zeroSimdReg, zeroSimdReg, addrReg, addrOffset);
+                addrOffset += simdRegPairSizeBytes;
+
+                assert(addrOffset == 0);
+
+                GetEmitter()->emitIns_R_R_I_I(INS_bfm, EA_PTRSIZE, addrReg, REG_ZR, 0, 5);
+                // addrReg points at the beginning of a cache line.
+
+                GetEmitter()->emitIns_R(INS_dczva, EA_PTRSIZE, addrReg);
+                GetEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, addrReg, addrReg, 64);
+                GetEmitter()->emitIns_R_R(INS_cmp, EA_PTRSIZE, addrReg, endAddrReg);
+                GetEmitter()->emitIns_J(INS_blo, NULL, -4);
+
+                addrReg      = endAddrReg;
+                bytesToWrite = 64;
             }
             else
             {
-                GetEmitter()->emitIns_R_R_I(INS_str, EA_PTRSIZE, REG_ZR, rAddr, REGSIZE_BYTES, INS_OPTS_POST_INDEX);
+                genInstrWithConstant(INS_add, EA_PTRSIZE, addrReg, genFramePointerReg(), untrLclLo - 32, addrReg);
+                addrOffset = 32;
+
+                const regNumber countReg = REG_ZERO_INIT_FRAME_REG2;
+
+                if (countReg == initReg)
+                {
+                    *pInitRegZeroed = false;
+                }
+
+                instGen_Set_Reg_To_Imm(EA_PTRSIZE, countReg, bytesToWrite - 64);
+
+                GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_16BYTE, zeroSimdReg, zeroSimdReg, addrReg, 32);
+                GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_16BYTE, zeroSimdReg, zeroSimdReg, addrReg, 64,
+                                              INS_OPTS_PRE_INDEX);
+
+                GetEmitter()->emitIns_R_R_I(INS_subs, EA_PTRSIZE, countReg, countReg, 64);
+                GetEmitter()->emitIns_J(INS_bge, NULL, -4);
+
+                bytesToWrite %= 64;
             }
-#endif // !TARGET_ARM
-            uCntBytes -= REGSIZE_BYTES;
         }
-#ifdef TARGET_ARM64
-        if (uCntBytes > 0)
+        else
         {
-            assert(uCntBytes == sizeof(int));
-            GetEmitter()->emitIns_R_R_I(INS_str, EA_4BYTE, REG_ZR, rAddr, 0);
-            uCntBytes -= sizeof(int);
+            genInstrWithConstant(INS_add, EA_PTRSIZE, addrReg, genFramePointerReg(), untrLclLo, addrReg);
+        }
+
+        if (bytesToWrite >= simdRegPairSizeBytes)
+        {
+            // Generates the following code:
+            //
+            //     movi    v16.16b, #0
+            //     stp     q16, q16, [x9, #addrOffset]
+            //     stp     q16, q16, [x9, #(addrOffset+32)]
+            // ...
+            //     stp     q16, q16, [x9, #(addrOffset+roundDown(bytesToWrite, 32))]
+
+            if (!simdRegZeroed)
+            {
+                GetEmitter()->emitIns_R_I(INS_movi, EA_16BYTE, zeroSimdReg, 0, INS_OPTS_16B);
+                simdRegZeroed = true;
+            }
+
+            for (; bytesToWrite >= simdRegPairSizeBytes; bytesToWrite -= simdRegPairSizeBytes)
+            {
+                GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_16BYTE, zeroSimdReg, zeroSimdReg, addrReg, addrOffset);
+                addrOffset += simdRegPairSizeBytes;
+            }
+        }
+
+        const int regPairSizeBytes = 2 * REGSIZE_BYTES;
+
+        if (bytesToWrite >= regPairSizeBytes)
+        {
+            GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, addrReg, addrOffset);
+            addrOffset += regPairSizeBytes;
+            bytesToWrite -= regPairSizeBytes;
+        }
+
+        if (bytesToWrite >= REGSIZE_BYTES)
+        {
+            GetEmitter()->emitIns_R_R_I(INS_str, EA_PTRSIZE, REG_ZR, addrReg, addrOffset);
+            addrOffset += REGSIZE_BYTES;
+            bytesToWrite -= REGSIZE_BYTES;
+        }
+
+        if (bytesToWrite == sizeof(int))
+        {
+            GetEmitter()->emitIns_R_R_I(INS_str, EA_4BYTE, REG_ZR, addrReg, addrOffset);
+            bytesToWrite = 0;
         }
-#endif // TARGET_ARM64
-        noway_assert(uCntBytes == 0);
 
+        assert(bytesToWrite == 0);
 #elif defined(TARGET_XARCH)
         assert(compiler->getSIMDSupportLevel() >= SIMD_SSE2_Supported);
         emitter*  emit        = GetEmitter();
index 2f78d20..eaac34f 100644 (file)
@@ -385,7 +385,7 @@ void emitter::emitInsSanityCheck(instrDesc* id)
         case IF_DI_2D: // DI_2D   X........Nrrrrrr ssssssnnnnnddddd      Rd Rn    imr, imms   (N,r,s)
             assert(isValidGeneralDatasize(id->idOpSize()));
             assert(isGeneralRegister(id->idReg1()));
-            assert(isGeneralRegister(id->idReg2()));
+            assert(isGeneralRegisterOrZR(id->idReg2()));
             assert(isValidImmNRS(emitGetInsSC(id), id->idOpSize()));
             break;
 
@@ -915,6 +915,12 @@ void emitter::emitInsSanityCheck(instrDesc* id)
         case IF_SI_0B: // SI_0B   ................ ....bbbb........               imm4 - barrier
             break;
 
+        case IF_SR_1A: // SR_1A   ................ ...........ttttt      Rt       (dc zva)
+            datasize = id->idOpSize();
+            assert(isGeneralRegister(id->idReg1()));
+            assert(datasize == EA_8BYTE);
+            break;
+
         default:
             printf("unexpected format %s\n", emitIfName(id->idInsFmt()));
             assert(!"Unexpected format");
@@ -3683,6 +3689,14 @@ void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg)
             fmt = IF_BR_1A;
             break;
 
+        case INS_dczva:
+            assert(isGeneralRegister(reg));
+            assert(attr == EA_8BYTE);
+            id = emitNewInstrSmall(attr);
+            id->idReg1(reg);
+            fmt = IF_SR_1A;
+            break;
+
         default:
             unreached();
     }
@@ -6941,7 +6955,7 @@ void emitter::emitIns_R_R_I_I(
         case INS_sbfm:
         case INS_ubfm:
             assert(isGeneralRegister(reg1));
-            assert(isGeneralRegister(reg2));
+            assert((ins == INS_bfm) ? isGeneralRegisterOrZR(reg2) : isGeneralRegister(reg2));
             assert(isValidImmShift(imm1, size));
             assert(isValidImmShift(imm2, size));
             assert(insOptsNone(opt));
@@ -11372,6 +11386,13 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             dst += emitOutput_Instr(dst, code);
             break;
 
+        case IF_SR_1A: // SR_1A   ................ ...........ttttt      Rt       (dc zva)
+            assert(insOptsNone(id->idInsOpt()));
+            code = emitInsCode(ins, fmt);
+            code |= insEncodeReg_Rt(id->idReg1()); // ttttt
+            dst += emitOutput_Instr(dst, code);
+            break;
+
         default:
             assert(!"Unexpected format");
             break;
@@ -13293,6 +13314,10 @@ void emitter::emitDispIns(
             emitDispBarrier((insBarrier)emitGetInsSC(id));
             break;
 
+        case IF_SR_1A: // SR_1A   ................ ...........ttttt      Rt       (dc zva)
+            emitDispReg(id->idReg1(), size, false);
+            break;
+
         default:
             printf("unexpected format %s", emitIfName(id->idInsFmt()));
             assert(!"unexpectedFormat");
@@ -15366,6 +15391,11 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
             result.insLatency    = PERFSCORE_LATENCY_1C;
             break;
 
+        case IF_SR_1A:
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            result.insLatency    = PERFSCORE_LATENCY_1C;
+            break;
+
         case IF_DV_2T: // addv, saddlv, smaxv, sminv, uaddlv, umaxv, uminv
             switch (ins)
             {
index 05d7d2c..81f4108 100644 (file)
@@ -111,6 +111,7 @@ IF_DEF(EN2Q, IS_NONE, NONE) // Instruction has 2 possible encoding types, type Q
 //   BR  :: Branches - Register
 //   SN  :: System - No Registers or Immediates
 //   SI  :: System - Immediate
+//   SR  :: System - Register
 //
 //   _   :: a separator char '_'
 //
@@ -226,6 +227,8 @@ IF_DEF(SN_0A, IS_NONE, NONE) // SN_0A   ................ ................
 IF_DEF(SI_0A, IS_NONE, NONE) // SI_0A   ...........iiiii iiiiiiiiiii.....               imm16
 IF_DEF(SI_0B, IS_NONE, NONE) // SI_0B   ................ ....bbbb........               imm4 - barrier
 
+IF_DEF(SR_1A, IS_NONE, NONE) // SR_1A   ................ ...........ttttt      Rt       (dc zva)
+
 IF_DEF(INVALID, IS_NONE, NONE) //
 
 //////////////////////////////////////////////////////////////////////////////
index cddd9b0..1bb1357 100644 (file)
@@ -1572,6 +1572,9 @@ INST1(dmb,         "dmb",          0,      IF_SI_0B,  0xD50330BF)
 INST1(isb,         "isb",          0,      IF_SI_0B,  0xD50330DF)
                                    //  isb     barrierKind          SI_0B  1101010100000011 0011bbbb11011111   D503 30DF   imm4 - barrier kind
 
+INST1(dczva,       "dczva",        0,      IF_SR_1A,  0xD50B7420)
+                                   //  dc      zva,Rt               SR_1A  1101010100001011 01110100001ttttt   D50B 7420   Rt
+
 INST1(umov,        "umov",         0,      IF_DV_2B,  0x0E003C00)
                                    //  umov    Rd,Vn[]              DV_2B  0Q001110000iiiii 001111nnnnnddddd   0E00 3C00   Rd,Vn[]
 
index d4d501e..559df53 100644 (file)
@@ -1556,6 +1556,10 @@ typedef unsigned char   regNumberSmall;
   // have encoding that restricts what registers that can be used for the indexed element when the element size is H (i.e. 2 bytes).
   #define RBM_ASIMD_INDEXED_H_ELEMENT_ALLOWED_REGS (RBM_V0|RBM_V1|RBM_V2|RBM_V3|RBM_V4|RBM_V5|RBM_V6|RBM_V7|RBM_V8|RBM_V9|RBM_V10|RBM_V11|RBM_V12|RBM_V13|RBM_V14|RBM_V15)
 
+  #define REG_ZERO_INIT_FRAME_REG1 REG_R9
+  #define REG_ZERO_INIT_FRAME_REG2 REG_R10
+  #define REG_ZERO_INIT_FRAME_SIMD REG_V16
+
 #else
   #error Unsupported or unset target architecture
 #endif
index e064a95..7b816cd 100644 (file)
@@ -42,6 +42,7 @@ namespace Internal.ReadyToRunConstants
                             case InstructionSet.ARM64_Atomics: return ReadyToRunInstructionSet.Atomics;
                             case InstructionSet.ARM64_Vector64: return null;
                             case InstructionSet.ARM64_Vector128: return null;
+                            case InstructionSet.ARM64_Dczva: return null;
 
                             default: throw new Exception("Unknown instruction set");
                         }
index 4270c1e..8a8111d 100644 (file)
@@ -28,14 +28,15 @@ namespace Internal.JitInterface
         ARM64_Atomics=9,
         ARM64_Vector64=10,
         ARM64_Vector128=11,
-        ARM64_ArmBase_Arm64=12,
-        ARM64_AdvSimd_Arm64=13,
-        ARM64_Aes_Arm64=14,
-        ARM64_Crc32_Arm64=15,
-        ARM64_Dp_Arm64=16,
-        ARM64_Rdm_Arm64=17,
-        ARM64_Sha1_Arm64=18,
-        ARM64_Sha256_Arm64=19,
+        ARM64_Dczva=12,
+        ARM64_ArmBase_Arm64=13,
+        ARM64_AdvSimd_Arm64=14,
+        ARM64_Aes_Arm64=15,
+        ARM64_Crc32_Arm64=16,
+        ARM64_Dp_Arm64=17,
+        ARM64_Rdm_Arm64=18,
+        ARM64_Sha1_Arm64=19,
+        ARM64_Sha256_Arm64=20,
         X64_X86Base=1,
         X64_SSE=2,
         X64_SSE2=3,
@@ -551,6 +552,7 @@ namespace Internal.JitInterface
                     yield return new InstructionSetInfo("lse", "", InstructionSet.ARM64_Atomics, true);
                     yield return new InstructionSetInfo("Vector64", "", InstructionSet.ARM64_Vector64, false);
                     yield return new InstructionSetInfo("Vector128", "", InstructionSet.ARM64_Vector128, false);
+                    yield return new InstructionSetInfo("Dczva", "", InstructionSet.ARM64_Dczva, false);
                     break;
 
                 case TargetArchitecture.X64:
index d8fe398..6e64e7e 100644 (file)
@@ -93,6 +93,7 @@ instructionset     ,ARM64 ,Sha256    ,        ,20 ,Sha256   ,sha2
 instructionset     ,ARM64 ,          ,Atomics ,21 ,Atomics  ,lse
 instructionset     ,ARM64 ,          ,        ,   ,Vector64 ,
 instructionset     ,ARM64 ,          ,        ,   ,Vector128,
+instructionset     ,ARM64 ,          ,        ,   ,Dczva    ,
 
 instructionset64bit,ARM64 ,ArmBase
 instructionset64bit,ARM64 ,AdvSimd
index d9c7fcd..78790be 100644 (file)
@@ -16,6 +16,13 @@ LEAF_ENTRY GetCurrentSP, _TEXT
     ret lr
 LEAF_END GetCurrentSP, _TEXT
 
+// DWORD64 __stdcall GetDataCacheZeroIDReg(void)
+LEAF_ENTRY GetDataCacheZeroIDReg, _TEXT
+    mrs     x0, dczid_el0
+    and     x0, x0, 31
+    ret     lr
+LEAF_END GetDataCacheZeroIDReg, _TEXT
+
 //-----------------------------------------------------------------------------
 // This routine captures the machine state. It is used by helper method frame
 //-----------------------------------------------------------------------------
index 2f9227b..304a659 100644 (file)
         ret     lr
     LEAF_END
 
+;; DWORD64 __stdcall GetDataCacheZeroIDReg(void);
+    LEAF_ENTRY GetDataCacheZeroIDReg
+        mrs     x0, dczid_el0
+        and     x0, x0, 31
+        ret     lr
+    LEAF_END
+
 ;;-----------------------------------------------------------------------------
 ;; This routine captures the machine state. It is used by helper method frame
 ;;-----------------------------------------------------------------------------
index c4f0073..f9338f7 100644 (file)
@@ -1263,6 +1263,10 @@ bool DoesOSSupportAVX()
 
 #endif // defined(TARGET_X86) || defined(TARGET_AMD64)
 
+#ifdef TARGET_ARM64
+extern "C" DWORD64 __stdcall GetDataCacheZeroIDReg();
+#endif
+
 void EEJitManager::SetCpuInfo()
 {
     LIMITED_METHOD_CONTRACT;
@@ -1514,6 +1518,16 @@ void EEJitManager::SetCpuInfo()
         CPUCompileFlags.Set(InstructionSet_Crc32);
     }
 #endif // HOST_64BIT
+#ifndef CROSSGEN_COMPILE
+    if (GetDataCacheZeroIDReg() == 4)
+    {
+        // DCZID_EL0<4> (DZP) indicates whether use of DC ZVA instructions is permitted (0) or prohibited (1).
+        // DCZID_EL0<3:0> (BS) specifies Log2 of the block size in words.
+        //
+        // We set the flag when the instruction is permitted and the block size is 64 bytes.
+        CPUCompileFlags.Set(InstructionSet_Dczva);
+    }
+#endif
 #endif // TARGET_ARM64
 
     CPUCompileFlags.Set64BitInstructionSetVariants();