Added PerfScore support for Arm64 (#751)
authorBrian Sullivan <briansul@microsoft.com>
Thu, 12 Dec 2019 05:14:18 +0000 (21:14 -0800)
committerGitHub <noreply@github.com>
Thu, 12 Dec 2019 05:14:18 +0000 (21:14 -0800)
Based upon arm_cortex_a55_software_optimization_guide_v2.pdf

src/coreclr/src/jit/emit.cpp
src/coreclr/src/jit/emit.h
src/coreclr/src/jit/emitarm.cpp
src/coreclr/src/jit/emitarm64.cpp
src/coreclr/src/jit/emitxarch.cpp

index 32b4a77..2e3c444 100644 (file)
@@ -1107,7 +1107,7 @@ float emitter::insEvaluateExecutionCost(instrDesc* id)
 
     if (memAccessKind == PERFSCORE_MEMORY_WRITE)
     {
-        // We assume that we won't read back from memory for the next WR_GENERAL (3) cycles
+        // We assume that we won't read back from memory for the next WR_GENERAL cycles
         // Thus we normally won't pay latency costs for writes.
         latency = max(0.0f, latency - PERFSCORE_LATENCY_WR_GENERAL);
     }
@@ -1121,6 +1121,34 @@ float emitter::insEvaluateExecutionCost(instrDesc* id)
     return max(throughput, latency);
 }
 
+//------------------------------------------------------------------------------------
+// perfScoreUnhandledInstruction:
+//    Helper method called when PerfScore has no specific entry for an instruction
+//
+// Arguments:
+//    id      - The current instruction descriptor to be evaluated
+//    pResult - pointer to struct holding the instruction characteristics;
+//              when we return, its throughput and latency hold default values
+//
+// Notes:
+//     When validating that the PerfScore handles every instruction,
+//     the #if 0 block below is changed into a #ifdef DEBUG.
+//     We will then print the instruction name and instruction format
+//     and, instead of returning, we will assert.
+//
+//     Otherwise we return a default throughput and latency of 1 cycle each.
+//
+void emitter::perfScoreUnhandledInstruction(instrDesc* id, insExecutionCharacteristics* pResult)
+{
+// Change this to #ifdef DEBUG to print and assert on any unhandled instruction
+#if 0
+    printf("PerfScore: unhandled instruction: %s, format %s", codeGen->genInsName(id->idIns()), emitIfName(id->idInsFmt()));
+    assert(!"PerfScore: unhandled instruction");
+#endif
+    pResult->insThroughput = PERFSCORE_THROUGHPUT_1C;
+    pResult->insLatency    = PERFSCORE_LATENCY_1C;
+}
+
 #endif // defined(DEBUG) || defined(LATE_DISASM)
 
 //----------------------------------------------------------------------------------------
index 1bd579b..78e3408 100644 (file)
@@ -1233,19 +1233,17 @@ protected:
 #define PERFSCORE_THROUGHPUT_4C 4.0f   // slower - 4 cycles
 #define PERFSCORE_THROUGHPUT_5C 5.0f   // slower - 5 cycles
 #define PERFSCORE_THROUGHPUT_6C 6.0f   // slower - 6 cycles
+#define PERFSCORE_THROUGHPUT_9C 9.0f   // slower - 9 cycles
 #define PERFSCORE_THROUGHPUT_10C 10.0f // slower - 10 cycles
 #define PERFSCORE_THROUGHPUT_13C 13.0f // slower - 13 cycles
+#define PERFSCORE_THROUGHPUT_19C 19.0f // slower - 19 cycles
 #define PERFSCORE_THROUGHPUT_25C 25.0f // slower - 25 cycles
 #define PERFSCORE_THROUGHPUT_33C 33.0f // slower - 33 cycles
 #define PERFSCORE_THROUGHPUT_52C 52.0f // slower - 52 cycles
 #define PERFSCORE_THROUGHPUT_57C 57.0f // slower - 57 cycles
 
-#define PERFSCORE_THROUGHPUT_DEFAULT PERFSCORE_THROUGHPUT_1C
-
 #define PERFSCORE_LATENCY_ILLEGAL -1024.0f
 
-#define PERFSCORE_LATENCY_DEFAULT 1.0f
-
 #define PERFSCORE_LATENCY_ZERO 0.0f
 #define PERFSCORE_LATENCY_1C 1.0f
 #define PERFSCORE_LATENCY_2C 2.0f
@@ -1260,7 +1258,11 @@ protected:
 #define PERFSCORE_LATENCY_11C 11.0f
 #define PERFSCORE_LATENCY_12C 12.0f
 #define PERFSCORE_LATENCY_13C 13.0f
+#define PERFSCORE_LATENCY_15C 15.0f
 #define PERFSCORE_LATENCY_16C 16.0f
+#define PERFSCORE_LATENCY_18C 18.0f
+#define PERFSCORE_LATENCY_20C 20.0f
+#define PERFSCORE_LATENCY_22C 22.0f
 #define PERFSCORE_LATENCY_23C 23.0f
 #define PERFSCORE_LATENCY_26C 26.0f
 #define PERFSCORE_LATENCY_62C 62.0f
@@ -1271,22 +1273,46 @@ protected:
 #define PERFSCORE_LATENCY_BRANCH_COND 2.0f     // includes cost of a possible misprediction
 #define PERFSCORE_LATENCY_BRANCH_INDIRECT 2.0f // includes cost of a possible misprediction
 
+#if defined(_TARGET_XARCH_)
+
+// a read,write or modify from stack location, possible def to use latency from L0 cache
+#define PERFSCORE_LATENCY_RD_STACK PERFSCORE_LATENCY_2C
+#define PERFSCORE_LATENCY_WR_STACK PERFSCORE_LATENCY_2C
+#define PERFSCORE_LATENCY_RD_WR_STACK PERFSCORE_LATENCY_5C
+
+// a read, write or modify from constant location, possible def to use latency from L0 cache
+#define PERFSCORE_LATENCY_RD_CONST_ADDR PERFSCORE_LATENCY_2C
+#define PERFSCORE_LATENCY_WR_CONST_ADDR PERFSCORE_LATENCY_2C
+#define PERFSCORE_LATENCY_RD_WR_CONST_ADDR PERFSCORE_LATENCY_5C
+
+// a read, write or modify from memory location, possible def to use latency from L0 or L1 cache
+// plus an extra cost  (of 1.0) for a increased chance  of a cache miss
+#define PERFSCORE_LATENCY_RD_GENERAL PERFSCORE_LATENCY_3C
+#define PERFSCORE_LATENCY_WR_GENERAL PERFSCORE_LATENCY_3C
+#define PERFSCORE_LATENCY_RD_WR_GENERAL PERFSCORE_LATENCY_6C
+
+#elif defined(_TARGET_ARM64_) || defined(_TARGET_ARM_)
+
 // a read,write or modify from stack location, possible def to use latency from L0 cache
-#define PERFSCORE_LATENCY_RD_STACK 2.0f
-#define PERFSCORE_LATENCY_WR_STACK 2.0f
-#define PERFSCORE_LATENCY_RD_WR_STACK 5.0f
+#define PERFSCORE_LATENCY_RD_STACK PERFSCORE_LATENCY_3C
+#define PERFSCORE_LATENCY_WR_STACK PERFSCORE_LATENCY_1C
+#define PERFSCORE_LATENCY_RD_WR_STACK PERFSCORE_LATENCY_3C
 
 // a read, write or modify from constant location, possible def to use latency from L0 cache
-#define PERFSCORE_LATENCY_RD_CONST_ADDR 2.0f
-#define PERFSCORE_LATENCY_WR_CONST_ADDR 2.0f
-#define PERFSCORE_LATENCY_RD_WR_CONST_ADDR 5.0f
+#define PERFSCORE_LATENCY_RD_CONST_ADDR PERFSCORE_LATENCY_3C
+#define PERFSCORE_LATENCY_WR_CONST_ADDR PERFSCORE_LATENCY_1C
+#define PERFSCORE_LATENCY_RD_WR_CONST_ADDR PERFSCORE_LATENCY_3C
 
 // a read, write or modify from memory location, possible def to use latency from L0 or L1 cache
 // plus an extra cost  (of 1.0) for a increased chance  of a cache miss
-#define PERFSCORE_LATENCY_RD_GENERAL 3.0f
-#define PERFSCORE_LATENCY_WR_GENERAL 3.0f
-#define PERFSCORE_LATENCY_RD_WR_GENERAL 6.0f
+#define PERFSCORE_LATENCY_RD_GENERAL PERFSCORE_LATENCY_4C
+#define PERFSCORE_LATENCY_WR_GENERAL PERFSCORE_LATENCY_1C
+#define PERFSCORE_LATENCY_RD_WR_GENERAL PERFSCORE_LATENCY_4C
 
+#endif // _TARGET_XXX
+
+// Make this an enum:
+//
 #define PERFSCORE_MEMORY_NONE 0
 #define PERFSCORE_MEMORY_READ 1
 #define PERFSCORE_MEMORY_WRITE 2
@@ -1295,8 +1321,7 @@ protected:
 #define PERFSCORE_CODESIZE_COST_HOT 0.10f
 #define PERFSCORE_CODESIZE_COST_COLD 0.01f
 
-#define PERFSCORE_CALLEE_SPILL_COST                                                                                    \
-    0.75f // heuristicly derived - actual cost is one push and one pop, in the prolog/epilog
+#define PERFSCORE_CALLEE_SPILL_COST 0.75f
 
     struct insExecutionCharacteristics
     {
@@ -1305,9 +1330,12 @@ protected:
         unsigned insMemoryAccessKind;
     };
 
-    insExecutionCharacteristics getInsExecutionCharacteristics(instrDesc* id);
     float insEvaluateExecutionCost(instrDesc* id);
 
+    insExecutionCharacteristics getInsExecutionCharacteristics(instrDesc* id);
+
+    void perfScoreUnhandledInstruction(instrDesc* id, insExecutionCharacteristics* pResult);
+
 #endif // defined(DEBUG) || defined(LATE_DISASM)
 
     BasicBlock::weight_t getCurrentBlockWeight();
index 69af7d4..0596469 100644 (file)
@@ -7991,8 +7991,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
 
     // ToDo: Calculate actual throughput and latency values
     //
-    result.insThroughput = PERFSCORE_THROUGHPUT_DEFAULT;
-    result.insLatency    = PERFSCORE_LATENCY_DEFAULT;
+    result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+    result.insLatency    = PERFSCORE_LATENCY_1C;
 
     return result;
 }
index 427a1c0..ad88954 100644 (file)
@@ -12195,6 +12195,11 @@ void emitter::getMemoryOperation(instrDesc* id, unsigned* pMemAccessKind, bool*
 //    A struct containing the current instruction execution characteristics
 //
 // Notes:
+//    The instruction latencies and throughput values returned by this function
+//    are from
+//
+//    The Arm Cortex-A55 Software Optimization Guide:
+//    https://static.docs.arm.com/epm128372/20/arm_cortex_a55_software_optimization_guide_v2.pdf
 //
 emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(instrDesc* id)
 {
@@ -12209,25 +12214,878 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
     result.insThroughput = PERFSCORE_THROUGHPUT_ILLEGAL;
     result.insLatency    = PERFSCORE_LATENCY_ILLEGAL;
 
+    // Initialize insLatency based upon the instruction's memAccessKind and local access values
+    //
     if (memAccessKind == PERFSCORE_MEMORY_READ)
     {
-        result.insLatency = PERFSCORE_LATENCY_4C;
+        result.insLatency = isLocalAccess ? PERFSCORE_LATENCY_RD_STACK : PERFSCORE_LATENCY_RD_GENERAL;
     }
     else if (memAccessKind == PERFSCORE_MEMORY_WRITE)
     {
-        result.insLatency = PERFSCORE_LATENCY_1C;
+        result.insLatency = isLocalAccess ? PERFSCORE_LATENCY_WR_STACK : PERFSCORE_LATENCY_WR_GENERAL;
     }
     else if (memAccessKind == PERFSCORE_MEMORY_READ_WRITE)
     {
-        result.insLatency = PERFSCORE_LATENCY_5C;
+        result.insLatency = isLocalAccess ? PERFSCORE_LATENCY_RD_WR_STACK : PERFSCORE_LATENCY_RD_WR_GENERAL;
     }
 
-    // ToDo: Determine individual instruction throughput as latency
-    //
-    result.insThroughput = PERFSCORE_THROUGHPUT_DEFAULT;
-    if (memAccessKind == PERFSCORE_MEMORY_NONE)
+    switch (insFmt)
     {
-        result.insLatency = PERFSCORE_LATENCY_DEFAULT;
+        //
+        //  Branch Instructions
+        //
+
+        case IF_BI_0A:                                      // b, bl_local
+        case IF_BI_0C:                                      // bl, b_tail
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C; // but is Dual Issue
+            result.insLatency    = PERFSCORE_LATENCY_1C;
+            break;
+
+        case IF_BI_0B: // beq, bne, bge, blt, bgt, ble, ...
+        case IF_BI_1A: // cbz, cbnz
+        case IF_BI_1B: // tbz, tbnz
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            result.insLatency    = PERFSCORE_LATENCY_1C;
+            break;
+
+        case IF_BR_1B: // blr, br_tail
+            if (ins == INS_blr)
+            {
+                result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+                result.insLatency    = PERFSCORE_LATENCY_1C;
+                break;
+            }
+            // otherwise we should have a br_tail instruction
+            assert(ins == INS_br_tail);
+            __fallthrough;
+        case IF_BR_1A: // ret, br
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            result.insLatency    = PERFSCORE_LATENCY_1C;
+            break;
+
+        //
+        //  Arithmetic and logical instructions
+        //
+
+        // ALU, basic (register operands) - the cost depends on the specific instruction
+        case IF_DR_3A: // add, adds, adc, adcs, and, ands, bic, bics,
+                       // eon, eor, orn, orr, sub, subs, sbc, sbcs
+                       // asr, asrv, lsl, lslv, lsr, lsrv, ror, rorv
+                       // sdiv, udiv, mul, smull, smulh, umull, umulh, mneg
+        case IF_DR_2A: // cmp, cmn, tst
+
+            switch (ins)
+            {
+                case INS_mul:
+                case INS_smull:
+                case INS_umull:
+                case INS_mneg:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+                    result.insLatency    = PERFSCORE_LATENCY_3C;
+                    break;
+
+                case INS_smulh:
+                case INS_umulh:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_3C;
+                    result.insLatency    = PERFSCORE_LATENCY_6C;
+                    break;
+
+                case INS_sdiv:
+                case INS_udiv:
+                    if (id->idOpSize() == EA_4BYTE)
+                    {
+                        result.insThroughput = PERFSCORE_THROUGHPUT_4C;
+                        result.insLatency    = PERFSCORE_LATENCY_12C;
+                        break;
+                    }
+                    else
+                    {
+                        assert(id->idOpSize() == EA_8BYTE);
+                        result.insThroughput = PERFSCORE_THROUGHPUT_4C;
+                        result.insLatency    = PERFSCORE_LATENCY_20C;
+                        break;
+                    }
+
+                case INS_add:
+                case INS_adds:
+                case INS_adc:
+                case INS_adcs:
+                case INS_and:
+                case INS_ands:
+                case INS_bic:
+                case INS_bics:
+                case INS_eon:
+                case INS_eor:
+                case INS_orn:
+                case INS_orr:
+                case INS_sub:
+                case INS_subs:
+                case INS_sbc:
+                case INS_sbcs:
+                case INS_asr:
+                case INS_lsl:
+                case INS_lsr:
+                case INS_ror:
+                case INS_cmp:
+                case INS_cmn:
+                case INS_tst:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_1C;
+                    break;
+
+                case INS_asrv:
+                case INS_lslv:
+                case INS_lsrv:
+                case INS_rorv:
+                    // variable shift by register
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+                    result.insLatency    = PERFSCORE_LATENCY_1C;
+                    break;
+
+                default:
+                    // all other instructions
+                    perfScoreUnhandledInstruction(id, &result);
+                    break;
+            }
+            break; // must not fall through: the following cases would overwrite the values set above
+        // ALU, basic immediate
+        case IF_DI_1A: // cmp, cmn
+        case IF_DI_1C: // tst
+        case IF_DI_1D: // mov reg, imm(N,r,s)
+        case IF_DI_1F: // ccmp, ccmn
+        case IF_DI_2A: // add, adds, sub, subs
+        case IF_DI_2C: // and, ands, eor, orr
+
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_1C;
+            break;
+
+        case IF_DR_2D: // cinc, cinv, cneg
+        case IF_DR_2E: // mov, neg, mvn, negs
+
+        case IF_DI_1E: // adr, adrp
+        case IF_DI_1B: // mov, movk, movn, movz
+
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_1C;
+            break;
+
+        // ALU, shift by immediate
+        case IF_DR_3B: // add, adds, and, ands, bic, bics,
+                       // eon, eor, orn, orr, sub, subs
+        case IF_DR_2B: // cmp, cmn, tst
+        case IF_DR_2F: // neg, negs, mvn
+        case IF_DI_2B: // ror
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_2C;
+            break;
+
+        // ALU, extend, scale
+        case IF_DR_3C: // add, adc, and, bic, eon, eor, orn, orr, sub, sbc
+        case IF_DR_2C: // cmp
+        case IF_DR_2J: // sha1h
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_2C;
+            break;
+        // ALU, Conditional select
+        case IF_DR_1D: // cset, csetm
+        case IF_DR_3D: // csel, csinc, csinv, csneg
+
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_1C;
+            break;
+
+        // ALU, Conditional compare
+        case IF_DR_2I: // ccmp , ccmn
+
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_1C;
+            break;
+
+        // Multiply accumulate
+        case IF_DR_4A: // madd, msub, smaddl, smsubl, umaddl, umsubl
+            if (id->idOpSize() == EA_4BYTE)
+            {
+                result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+                result.insLatency    = PERFSCORE_LATENCY_3C;
+                break;
+            }
+            else
+            {
+                assert(id->idOpSize() == EA_8BYTE);
+                result.insThroughput = PERFSCORE_THROUGHPUT_5C;
+                result.insLatency    = PERFSCORE_LATENCY_3C;
+                break;
+            }
+
+        // Miscellaneous Data Processing instructions
+        case IF_DR_3E: // extr
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_2C;
+            break;
+
+        case IF_DR_2H: // sxtb, sxth, sxtw, uxtb, uxth, sha1h
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_1C;
+            break;
+
+        case IF_DI_2D: // lsl, lsr, asr, sbfm, bfm, ubfm, sbfiz, bfi, ubfiz, sbfx, bfxil, ubfx
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_2C;
+            break;
+
+        case IF_DR_2G: // mov sp, cls, clz, rbit, rev16, rev32, rev
+            if (ins == INS_rbit)
+            {
+                result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                result.insLatency    = PERFSCORE_LATENCY_2C;
+                break;
+            }
+            else
+            {
+                result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                result.insLatency    = PERFSCORE_LATENCY_1C;
+                break;
+            }
+
+        //
+        //  Load/Store Instructions
+        //
+
+        case IF_LS_1A: // ldr, ldrsw (literal, pc relative immediate)
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            break;
+
+        case IF_LS_2A: // ldr, ldrsw, ldrb, ldrh, ldrsb, ldrsh, str, strb, strh (no immediate)
+                       // ldar, ldarb, ldarh, ldxr, ldxrb, ldxrh,
+                       // ldaxr, ldaxrb, ldaxrh, stlr, stlrb, stlrh
+
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            // ToDo: store release have 2/4 cycle latency
+            break;
+
+        case IF_LS_2B: // ldr, ldrsw, ldrb, ldrh, ldrsb, ldrsh, str, strb, strh (scaled immediate)
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            break;
+
+        case IF_LS_2C: // ldr, ldrsw, ldrb, ldrh, ldrsb, ldrsh, str, strb, strh
+                       // ldur, ldurb, ldurh, ldursb, ldursh, ldursw, stur, sturb, sturh
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            break;
+
+        case IF_LS_3A: // ldr, ldrsw, ldrb, ldrh, ldrsb, ldrsh, str, strb strh (register extend, scale 2,4,8)
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            break;
+
+        case IF_LS_3B: // ldp, ldpsw, ldnp, stp, stnp  (load/store pair zero offset)
+        case IF_LS_3C: // load/store pair with offset pre/post inc
+            if (memAccessKind == PERFSCORE_MEMORY_READ)
+            {
+                // ldp, ldpsw, ldnp
+                result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+                if (emitIGisInEpilog(emitCurIG) && (ins == INS_ldp))
+                {
+                    // Reduce latency for ldp instructions in the epilog
+                    //
+                    result.insLatency = PERFSCORE_LATENCY_2C;
+                }
+                else if (id->idOpSize() == EA_8BYTE) // X-form
+                {
+                    // the X-reg variant has an extra cycle of latency
+                    // and two cycle throughput
+                    result.insLatency += 1.0;
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2C;
+                }
+            }
+            else // store instructions
+            {
+                // stp, stnp
+                assert(memAccessKind == PERFSCORE_MEMORY_WRITE);
+                result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            }
+            break;
+
+        case IF_LS_3D: // stxr, stxrb, stxrh, stlxr, stlxrb, stlxrh
+            // Store exclusive register, returning status
+            assert(emitInsIsStore(ins));
+            // @ToDo - find out the actual latency
+            result.insThroughput = PERFSCORE_THROUGHPUT_2C;
+            result.insLatency    = max(PERFSCORE_LATENCY_4C, result.insLatency);
+            break;
+
+        case IF_LS_3E: //  ARMv8.1 LSE Atomics
+            if (memAccessKind == PERFSCORE_MEMORY_WRITE)
+            {
+                // staddb, staddlb, staddh, staddlh, stadd, staddl
+                result.insThroughput = PERFSCORE_THROUGHPUT_2C;
+                result.insLatency    = PERFSCORE_LATENCY_2C;
+            }
+            else
+            {
+                assert(memAccessKind == PERFSCORE_MEMORY_READ_WRITE);
+                result.insThroughput = PERFSCORE_THROUGHPUT_3C;
+                result.insLatency    = max(PERFSCORE_LATENCY_3C, result.insLatency);
+            }
+            break;
+
+        case IF_LS_2D: // ld1                         (vector - multiple structures)
+        case IF_LS_2E: // ld1                         (vector - single structure)
+        case IF_LS_3F: // ld1                         (vector - multiple structures)
+        case IF_LS_3G: // ld1                         (vector - single structure)
+            if (id->idOpSize() == EA_8BYTE)
+            {
+                // D-form
+                result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+                result.insLatency    = PERFSCORE_LATENCY_3C;
+            }
+            else
+            {
+                // Q-form
+                assert(id->idOpSize() == EA_16BYTE);
+                result.insThroughput = PERFSCORE_THROUGHPUT_2C;
+                result.insLatency    = PERFSCORE_LATENCY_4C;
+            }
+            break;
+
+        case IF_SN_0A: // bkpt, nop
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_ZERO;
+            break;
+
+        case IF_SI_0B: // dmb, dsb, isb
+            // @ToDo - find out the actual latency
+            result.insThroughput = PERFSCORE_THROUGHPUT_10C;
+            result.insLatency    = PERFSCORE_LATENCY_10C;
+            break;
+
+        case IF_DV_2J: // fcvt  Vd Vn
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_4C;
+            break;
+
+        case IF_DV_2K: // fcmp  Vd Vn
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            result.insLatency    = PERFSCORE_LATENCY_1C;
+            break;
+
+        case IF_DV_1A: // fmov - immediate (scalar)
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_1C;
+            break;
+
+        case IF_DV_1B: // fmov, orr, bic, movi, mvni  (immediate vector)
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_1C;
+            break;
+
+        case IF_DV_1C: // fcmp vn, #0.0
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            result.insLatency    = PERFSCORE_LATENCY_3C;
+            break;
+
+        case IF_DV_2A: // fabs, fneg, fsqrt, fcvtXX, frintX, scvtf, ucvtf, fcmXX (vector)
+            switch (ins)
+            {
+                case INS_fabs:
+                case INS_fneg:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency = (id->idOpSize() == EA_8BYTE) ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_3C / 2;
+                    break;
+
+                case INS_fsqrt:
+                    if ((id->idInsOpt() == INS_OPTS_2S) || (id->idInsOpt() == INS_OPTS_4S))
+                    {
+                        // S-form
+                        result.insThroughput = PERFSCORE_THROUGHPUT_3C;
+                        result.insLatency    = PERFSCORE_LATENCY_11C;
+                    }
+                    else
+                    {
+                        // D-form
+                        assert(id->idInsOpt() == INS_OPTS_2D);
+                        result.insThroughput = PERFSCORE_THROUGHPUT_6C;
+                        result.insLatency    = PERFSCORE_LATENCY_18C;
+                    }
+                    break;
+
+                case INS_fcvtas:
+                case INS_fcvtau:
+                case INS_fcvtms:
+                case INS_fcvtmu:
+                case INS_fcvtns:
+                case INS_fcvtnu:
+                case INS_fcvtps:
+                case INS_fcvtpu:
+                case INS_fcvtzs:
+                case INS_fcvtzu:
+                case INS_frinta:
+                case INS_frinti:
+                case INS_frintm:
+                case INS_frintn:
+                case INS_frintp:
+                case INS_frintx:
+                case INS_frintz:
+                case INS_scvtf:
+                case INS_ucvtf:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_4C;
+                    break;
+
+                case INS_fcmeq:
+                case INS_fcmge:
+                case INS_fcmgt:
+                case INS_fcmle:
+                case INS_fcmlt:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_2C;
+                    break;
+
+                default:
+                    // all other instructions
+                    perfScoreUnhandledInstruction(id, &result);
+                    break;
+            }
+            break;
+
+        case IF_DV_2G: // fmov, fabs, fneg, fsqrt, faddp, fcmXX, fcvtXX, frintX, scvtf, ucvtf (scalar)
+            switch (ins)
+            {
+                case INS_fmov:
+                    // FP move, vector register
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_1C;
+                    break;
+
+                case INS_fabs:
+                case INS_fneg:
+                case INS_faddp:
+
+                case INS_fcvtas:
+                case INS_fcvtau:
+                case INS_fcvtms:
+                case INS_fcvtmu:
+                case INS_fcvtns:
+                case INS_fcvtnu:
+                case INS_fcvtps:
+                case INS_fcvtpu:
+                case INS_fcvtzs:
+                case INS_fcvtzu:
+                case INS_scvtf:
+                case INS_ucvtf:
+
+                case INS_frinta:
+                case INS_frinti:
+                case INS_frintm:
+                case INS_frintn:
+                case INS_frintp:
+                case INS_frintx:
+                case INS_frintz:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_3C;
+                    break;
+
+                case INS_fcvtl:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+                    result.insLatency    = PERFSCORE_LATENCY_4C;
+                    break;
+
+                case INS_fsqrt:
+                    if (id->idOpSize() == EA_8BYTE)
+                    {
+                        // D-form
+                        result.insThroughput = PERFSCORE_THROUGHPUT_19C;
+                        result.insLatency    = PERFSCORE_LATENCY_22C;
+                    }
+                    else
+                    {
+                        // S-form
+                        assert(id->idOpSize() == EA_4BYTE);
+                        result.insThroughput = PERFSCORE_THROUGHPUT_9C;
+                        result.insLatency    = PERFSCORE_LATENCY_12C;
+                    }
+                    break;
+
+                default:
+                    // all other instructions
+                    perfScoreUnhandledInstruction(id, &result);
+                    break;
+            }
+            break;
+
+        case IF_DV_3B: // fadd, fsub, fdiv, fmul, fmulx, fmla, fmls, fmin, fmax, fabd, faddp, fcmXX (vector)
+            switch (ins)
+            {
+                case INS_fmin:
+                case INS_fmax:
+                case INS_fabd:
+                case INS_fadd:
+                case INS_fsub:
+                case INS_faddp:
+                case INS_fmul:
+                case INS_fmulx:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_4C;
+                    break;
+
+                case INS_fcmeq:
+                case INS_fcmge:
+                case INS_fcmgt:
+                case INS_fcmle:
+                case INS_fcmlt:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_2C;
+                    break;
+
+                case INS_fdiv:
+                    if ((id->idInsOpt() == INS_OPTS_2S) || (id->idInsOpt() == INS_OPTS_4S))
+                    {
+                        // S-form
+                        result.insThroughput = PERFSCORE_THROUGHPUT_10C;
+                        result.insLatency    = PERFSCORE_LATENCY_13C;
+                    }
+                    else
+                    {
+                        // D-form
+                        assert(id->idInsOpt() == INS_OPTS_2D);
+                        result.insThroughput = PERFSCORE_THROUGHPUT_10C;
+                        result.insLatency    = PERFSCORE_LATENCY_22C;
+                    }
+                    break;
+
+                default:
+                    // all other instructions
+                    perfScoreUnhandledInstruction(id, &result);
+                    break;
+            }
+            break;
+
+        case IF_DV_3BI: // fmul, fmulx, fmla, fmls (vector by elem)
+        case IF_DV_3AI: // mul, mla, mls (vector by elem)
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            result.insLatency    = PERFSCORE_LATENCY_4C;
+            break;
+
+        case IF_DV_4A: // fmadd, fmsub, fnmadd, fnmsub (scalar)
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_4C;
+            break;
+
+        case IF_DV_3D: // fadd, fsub, fdiv, fmul, fmulx, fmla, fmls, fmin, fmax, fabd, faddp, fcmXX (scalar)
+            switch (ins)
+            {
+                case INS_fadd:
+                case INS_fsub:
+                case INS_fmin:
+                case INS_fmax:
+                case INS_fmul:
+                case INS_fmulx:
+                case INS_fnmul:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_4C;
+                    break;
+
+                case INS_fdiv:
+                    if (id->idOpSize() == EA_8BYTE)
+                    {
+                        // D-form
+                        result.insThroughput = PERFSCORE_THROUGHPUT_6C;
+                        result.insLatency    = PERFSCORE_LATENCY_15C;
+                    }
+                    else
+                    {
+                        // S-form
+                        assert(id->idOpSize() == EA_4BYTE);
+                        result.insThroughput = PERFSCORE_THROUGHPUT_3C;
+                        result.insLatency    = PERFSCORE_LATENCY_10C;
+                    }
+                    break;
+
+                default:
+                    // all other instructions
+                    perfScoreUnhandledInstruction(id, &result);
+                    break;
+            }
+            break;
+
+        case IF_DV_2H: // fmov, fcvtXX - to general
+            // fmov : FP transfer to general register
+            // fcvtaXX : FP convert from vector to general
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_3C;
+            break;
+
+        case IF_DV_2I: // fmov, Xcvtf - from general
+            switch (ins)
+            {
+                case INS_fmov:
+                    // FP transfer from general register
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_2C;
+                    break;
+
+                case INS_scvtf:
+                case INS_ucvtf:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_5C;
+                    break;
+
+                default:
+                    // all other instructions
+                    perfScoreUnhandledInstruction(id, &result);
+                    break;
+            }
+            break;
+
+        case IF_DV_3C: // mov, and, bic, eor, mov, mvn, orn, bsl, bit, bif (vector)
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_1C;
+            break;
+
+        case IF_DV_2E: // mov, dup (scalar)
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_2C;
+            break;
+
+        case IF_DV_2F: // mov, ins (element)
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_2C;
+            break;
+
+        case IF_DV_2B: // smov, umov - to general
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_2C;
+            break;
+
+        case IF_DV_2C: // mov, dup, ins - from general
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            if (ins == INS_dup)
+            {
+                result.insLatency = PERFSCORE_LATENCY_3C;
+            }
+            else
+            {
+                assert((ins == INS_ins) || (ins == INS_mov));
+                result.insLatency = PERFSCORE_LATENCY_2C;
+            }
+            break;
+
+        case IF_DV_2D: // dup (dvector)
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_2C;
+            break;
+
+        case IF_DV_3A: // (vector)
+            // add, sub, mul, mla, mls, cmeq, cmge, cmgt, cmhi, cmhs, ctst,
+            // pmul, saba, uaba, sabd, uabd, umin, umax, smin, smax,
+            switch (ins)
+            {
+                case INS_add:
+                case INS_sub:
+                case INS_cmeq:
+                case INS_cmge:
+                case INS_cmgt:
+                case INS_cmhi:
+                case INS_cmhs:
+                case INS_umin:
+                case INS_umax:
+                case INS_smin:
+                case INS_smax:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_2C;
+                    break;
+
+                case INS_ctst:
+                case INS_pmul:
+                case INS_sabd:
+                case INS_uabd:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_3C;
+                    break;
+
+                case INS_mul:
+                case INS_mla:
+                case INS_mls:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_4C;
+                    break;
+
+                case INS_saba:
+                case INS_uaba:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2C;
+                    result.insLatency    = PERFSCORE_LATENCY_4C;
+                    break;
+
+                default:
+                    // all other instructions
+                    perfScoreUnhandledInstruction(id, &result);
+                    break;
+            }
+            break;
+
+        case IF_DV_3DI: // mul, mla, mls (scalar by elem)
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            result.insLatency    = PERFSCORE_LATENCY_4C;
+            break;
+
+        case IF_DV_3E: // add, sub, cmeq, cmge, cmgt, cmhi, cmhs, ctst, (scalar)
+            result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+            result.insLatency    = PERFSCORE_LATENCY_2C;
+            break;
+
+        case IF_DV_2L: // abs, neg, cmeq, cmge, cmgt, cmle, cmlt (scalar)
+        case IF_DV_2M: // (vector)
+            // abs, neg, mvn, not, cmeq, cmge, cmgt, cmle, cmlt,
+            // addv, saddlv,  uaddlv, smaxv, sminv, umaxv, uminv
+            // cls, clz, cnt, rbit, rev16, rev32, rev64,
+            // xtn, xtn2, shll, shll2
+            switch (ins)
+            {
+                case INS_abs:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_3C;
+                    break;
+
+                case INS_addv:
+                case INS_saddlv:
+                case INS_uaddlv:
+                case INS_cls:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+                    result.insLatency    = PERFSCORE_LATENCY_3C;
+                    break;
+
+                case INS_sminv:
+                case INS_smaxv:
+                case INS_uminv:
+                case INS_umaxv:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+                    result.insLatency    = PERFSCORE_LATENCY_4C;
+                    break;
+
+                case INS_cmeq:
+                case INS_cmge:
+                case INS_cmgt:
+                case INS_cmle:
+                case INS_cmlt:
+
+                case INS_clz:
+                case INS_cnt:
+                case INS_rbit:
+                case INS_rev16:
+                case INS_rev32:
+                case INS_rev64:
+                case INS_xtn:
+                case INS_xtn2:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_2C;
+                    break;
+
+                case INS_mvn:
+                case INS_not:
+                case INS_neg:
+                case INS_shll:
+                case INS_shll2:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_1C;
+                    break;
+
+                default:
+                    // all other instructions
+                    perfScoreUnhandledInstruction(id, &result);
+                    break;
+            }
+            break;
+
+        case IF_DV_2N: // sshr, ssra, srshr, srsra, shl, ushr, usra, urshr, ursra, sri, sli (shift by immediate -
+                       // scalar)
+        case IF_DV_2O: // sshr, ssra, srshr, srsra, shl, ushr, usra, urshr, ursra, sri, sli (shift by immediate -
+                       // vector)
+                       // sshll, sshll2, ushll, ushll2, shrn, shrn2, rshrn, rshrn2, sxtl, sxtl2, uxtl, uxtl2
+            switch (ins)
+            {
+                case INS_shl:
+                case INS_shrn:
+                case INS_shrn2:
+                case INS_sli:
+                case INS_sri:
+                case INS_sshr:
+                case INS_ushr:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_2C;
+                    break;
+
+                case INS_shll:
+                case INS_shll2:
+                case INS_sshll:
+                case INS_sshll2:
+                case INS_ushll:
+                case INS_ushll2:
+                case INS_sxtl:
+                case INS_sxtl2:
+                case INS_uxtl:
+                case INS_uxtl2:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+                    result.insLatency    = PERFSCORE_LATENCY_2C;
+                    break;
+
+                case INS_rshrn:
+                case INS_rshrn2:
+                case INS_srshr:
+                case INS_urshr:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_2X;
+                    result.insLatency    = PERFSCORE_LATENCY_3C;
+                    break;
+
+                default:
+                    // all other instructions
+                    perfScoreUnhandledInstruction(id, &result);
+                    break;
+            }
+            break;
+
+        case IF_DV_2P: // aese, aesd, aesmc, aesimc, sha1su1, sha256su0
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            result.insLatency    = PERFSCORE_LATENCY_2C;
+            break;
+
+        case IF_DV_3F: // sha1c, sha1m, sha1p, sha1su0, sha256h, sha256h2, sha256su1 (vector)
+            switch (ins)
+            {
+                case INS_sha1su0:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+                    result.insLatency    = PERFSCORE_LATENCY_2C;
+                    break;
+
+                case INS_sha256su0:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+                    result.insLatency    = PERFSCORE_LATENCY_3C;
+                    break;
+
+                case INS_sha1c:
+                case INS_sha1m:
+                case INS_sha1p:
+                case INS_sha256h:
+                case INS_sha256h2:
+                case INS_sha256su1:
+                    result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+                    result.insLatency    = PERFSCORE_LATENCY_4C;
+                    break;
+
+                default:
+                    // all other instructions
+                    perfScoreUnhandledInstruction(id, &result);
+                    break;
+            }
+            break;
+
+        case IF_SI_0A: // brk   imm16
+            result.insThroughput = PERFSCORE_THROUGHPUT_1C;
+            result.insLatency    = PERFSCORE_LATENCY_1C;
+            break;
+
+        default:
+            // all other instructions
+            perfScoreUnhandledInstruction(id, &result);
+            break;
     }
 
     return result;
index 33d8592..90b254f 100644 (file)
@@ -14024,10 +14024,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
                     break;
 
                 default:
-                    // all others
-                    assert(!"unreached");
-                    result.insThroughput = PERFSCORE_THROUGHPUT_DEFAULT;
-                    result.insLatency    = PERFSCORE_LATENCY_DEFAULT;
+                    // unhandled instruction insFmt combination
+                    perfScoreUnhandledInstruction(id, &result);
                     break;
             }
             break;
@@ -14051,9 +14049,9 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
                     break;
 
                 default:
-                    // all others
-                    assert(!"unreached");
-                    result.insThroughput = PERFSCORE_THROUGHPUT_DEFAULT;
+                    // unhandled instruction insFmt combination
+                    perfScoreUnhandledInstruction(id, &result);
+                    break;
             }
             break;
 
@@ -14084,9 +14082,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
                     break;
 
                 default:
-                    // all others
-                    assert(!"unreached");
-                    result.insThroughput = PERFSCORE_THROUGHPUT_DEFAULT;
+                    // unhandled instruction insFmt combination
+                    perfScoreUnhandledInstruction(id, &result);
                     break;
             }
             break;
@@ -14220,8 +14217,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
                     break;
 
                 default:
-                    assert(!"Unhandled insFmt for INS_call");
-                    result.insThroughput = PERFSCORE_THROUGHPUT_DEFAULT;
+                    // unhandled instruction, insFmt combination
+                    perfScoreUnhandledInstruction(id, &result);
                     break;
             }
             break;
@@ -15063,12 +15060,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins
             break;
 
         default:
-            // static instruction s_ins = ins;
-            // printf("Unsupported instruction: %s", codeGen->genInsName(ins));
-            // assert(!"Unhandled ins for getInsExecutionCharacteristics");
-            // all other ins
-            result.insThroughput = PERFSCORE_THROUGHPUT_DEFAULT;
-            result.insLatency    = PERFSCORE_LATENCY_DEFAULT;
+            // unhandled instruction insFmt combination
+            perfScoreUnhandledInstruction(id, &result);
             break;
     }