From cf7ce3b522722f14906e0ff05e4c4665fc44ca13 Mon Sep 17 00:00:00 2001 From: Egor Chesakov Date: Thu, 18 Jun 2020 16:38:06 -0700 Subject: [PATCH] [Arm64] Implement fcvtxn, fcvtxn2, sqabs, sqneg, suqadd, usqadd (#38010) --- src/coreclr/src/jit/codegenarm64.cpp | 301 +++++++++++++++++++++-------------- src/coreclr/src/jit/emitarm64.cpp | 85 +++++++++- src/coreclr/src/jit/instrsarm64.h | 23 +++ 3 files changed, 288 insertions(+), 121 deletions(-) diff --git a/src/coreclr/src/jit/codegenarm64.cpp b/src/coreclr/src/jit/codegenarm64.cpp index 7c91589..3b27de9 100644 --- a/src/coreclr/src/jit/codegenarm64.cpp +++ b/src/coreclr/src/jit/codegenarm64.cpp @@ -7604,6 +7604,70 @@ void CodeGen::genArm64EmitterUnitTests() theEmitter->emitIns_R_R(INS_fsqrt, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_4S); theEmitter->emitIns_R_R(INS_fsqrt, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_2D); + // faddp scalar + theEmitter->emitIns_R_R(INS_faddp, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_faddp, EA_16BYTE, REG_V2, REG_V3, INS_OPTS_2D); + + // fcmeq Vd, Vn, #0.0 + theEmitter->emitIns_R_R(INS_fcmeq, EA_4BYTE, REG_V0, REG_V1); // scalar 4BYTE + theEmitter->emitIns_R_R(INS_fcmeq, EA_8BYTE, REG_V2, REG_V3); // scalar 8BYTE + + // fcmge Vd, Vn, #0.0 + theEmitter->emitIns_R_R(INS_fcmge, EA_4BYTE, REG_V0, REG_V1); // scalar 4BYTE + theEmitter->emitIns_R_R(INS_fcmge, EA_8BYTE, REG_V2, REG_V3); // scalar 8BYTE + + // fcmgt Vd, Vn, #0.0 + theEmitter->emitIns_R_R(INS_fcmgt, EA_4BYTE, REG_V0, REG_V1); // scalar 4BYTE + theEmitter->emitIns_R_R(INS_fcmgt, EA_8BYTE, REG_V2, REG_V3); // scalar 8BYTE + + // fcmle Vd, Vn, #0.0 + theEmitter->emitIns_R_R(INS_fcmle, EA_4BYTE, REG_V0, REG_V1); // scalar 4BYTE + theEmitter->emitIns_R_R(INS_fcmle, EA_8BYTE, REG_V2, REG_V3); // scalar 8BYTE + + // fcmlt Vd, Vn, #0.0 + theEmitter->emitIns_R_R(INS_fcmlt, EA_4BYTE, REG_V0, REG_V1); // scalar 4BYTE + theEmitter->emitIns_R_R(INS_fcmlt, EA_8BYTE, REG_V2, REG_V3); // scalar 8BYTE + + // frecpe scalar + theEmitter->emitIns_R_R(INS_frecpe, EA_4BYTE, REG_V0, REG_V1); // scalar 4BYTE + theEmitter->emitIns_R_R(INS_frecpe, EA_8BYTE, REG_V2, REG_V3); // scalar 8BYTE + theEmitter->emitIns_R_R(INS_frecpe, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_frecpe, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_frecpe, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_2D); + + // frecpx scalar + theEmitter->emitIns_R_R(INS_frecpx, EA_4BYTE, REG_V0, REG_V1); + theEmitter->emitIns_R_R(INS_frecpx, EA_8BYTE, REG_V2, REG_V3); + + // frsqrte + theEmitter->emitIns_R_R(INS_frsqrte, EA_4BYTE, REG_V0, REG_V1); // scalar 4BYTE + theEmitter->emitIns_R_R(INS_frsqrte, EA_8BYTE, REG_V2, REG_V3); // scalar 8BYTE + theEmitter->emitIns_R_R(INS_frsqrte, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_frsqrte, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_frsqrte, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_2D); + + // fcvtl{2} vector + theEmitter->emitIns_R_R(INS_fcvtl, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_fcvtl2, EA_16BYTE, REG_V2, REG_V3, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_fcvtl, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_fcvtl2, EA_16BYTE, REG_V5, REG_V6, INS_OPTS_4S); + + // fcvtn{2} vector + theEmitter->emitIns_R_R(INS_fcvtn, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_fcvtn2, EA_16BYTE, REG_V2, REG_V3, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_fcvtn, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_fcvtn2, EA_16BYTE, REG_V5, REG_V6, INS_OPTS_4S); + + // fcvtxn scalar + theEmitter->emitIns_R_R(INS_fcvtxn, EA_4BYTE, REG_V0, REG_V1); + + // fcvtxn{2} vector + theEmitter->emitIns_R_R(INS_fcvtxn, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_fcvtxn2, EA_16BYTE, REG_V2, REG_V3, INS_OPTS_4S); + +#endif + +#ifdef ALL_ARM64_EMITTER_UNIT_TESTS genDefineTempLabel(genCreateTempLabel()); // abs scalar @@ -7618,34 +7682,17 @@ void CodeGen::genArm64EmitterUnitTests() theEmitter->emitIns_R_R(INS_abs, EA_16BYTE, REG_V14, REG_V15, INS_OPTS_4S); theEmitter->emitIns_R_R(INS_abs, EA_16BYTE, REG_V16, REG_V17, INS_OPTS_2D); - // neg scalar - theEmitter->emitIns_R_R(INS_neg, EA_8BYTE, REG_V2, REG_V3); - - // neg vector - theEmitter->emitIns_R_R(INS_neg, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_8B); - theEmitter->emitIns_R_R(INS_neg, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); - theEmitter->emitIns_R_R(INS_neg, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_4H); - theEmitter->emitIns_R_R(INS_neg, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_8H); - theEmitter->emitIns_R_R(INS_neg, EA_8BYTE, REG_V12, REG_V13, INS_OPTS_2S); - theEmitter->emitIns_R_R(INS_neg, EA_16BYTE, REG_V14, REG_V15, INS_OPTS_4S); - theEmitter->emitIns_R_R(INS_neg, EA_16BYTE, REG_V16, REG_V17, INS_OPTS_2D); - - // mvn vector - theEmitter->emitIns_R_R(INS_mvn, EA_8BYTE, REG_V4, REG_V5); - theEmitter->emitIns_R_R(INS_mvn, EA_8BYTE, REG_V6, REG_V7, INS_OPTS_8B); - theEmitter->emitIns_R_R(INS_mvn, EA_16BYTE, REG_V8, REG_V9); - theEmitter->emitIns_R_R(INS_mvn, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_16B); + // addv vector + theEmitter->emitIns_R_R(INS_addv, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_addv, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_addv, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_addv, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_addv, EA_16BYTE, REG_V14, REG_V15, INS_OPTS_4S); // cnt vector theEmitter->emitIns_R_R(INS_cnt, EA_8BYTE, REG_V22, REG_V23, INS_OPTS_8B); theEmitter->emitIns_R_R(INS_cnt, EA_16BYTE, REG_V24, REG_V25, INS_OPTS_16B); - // not vector (the same encoding as mvn) - theEmitter->emitIns_R_R(INS_not, EA_8BYTE, REG_V12, REG_V13); - theEmitter->emitIns_R_R(INS_not, EA_8BYTE, REG_V14, REG_V15, INS_OPTS_8B); - theEmitter->emitIns_R_R(INS_not, EA_16BYTE, REG_V16, REG_V17); - theEmitter->emitIns_R_R(INS_not, EA_16BYTE, REG_V18, REG_V19, INS_OPTS_16B); - // cls vector theEmitter->emitIns_R_R(INS_cls, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_8B); theEmitter->emitIns_R_R(INS_cls, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); @@ -7662,6 +7709,30 @@ void CodeGen::genArm64EmitterUnitTests() theEmitter->emitIns_R_R(INS_clz, EA_8BYTE, REG_V12, REG_V13, INS_OPTS_2S); theEmitter->emitIns_R_R(INS_clz, EA_16BYTE, REG_V14, REG_V15, INS_OPTS_4S); + // mvn vector + theEmitter->emitIns_R_R(INS_mvn, EA_8BYTE, REG_V4, REG_V5); + theEmitter->emitIns_R_R(INS_mvn, EA_8BYTE, REG_V6, REG_V7, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_mvn, EA_16BYTE, REG_V8, REG_V9); + theEmitter->emitIns_R_R(INS_mvn, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_16B); + + // neg scalar + theEmitter->emitIns_R_R(INS_neg, EA_8BYTE, REG_V2, REG_V3); + + // neg vector + theEmitter->emitIns_R_R(INS_neg, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_neg, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_neg, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_neg, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_neg, EA_8BYTE, REG_V12, REG_V13, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_neg, EA_16BYTE, REG_V14, REG_V15, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_neg, EA_16BYTE, REG_V16, REG_V17, INS_OPTS_2D); + + // not vector (the same encoding as mvn) + theEmitter->emitIns_R_R(INS_not, EA_8BYTE, REG_V12, REG_V13); + theEmitter->emitIns_R_R(INS_not, EA_8BYTE, REG_V14, REG_V15, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_not, EA_16BYTE, REG_V16, REG_V17); + theEmitter->emitIns_R_R(INS_not, EA_16BYTE, REG_V18, REG_V19, INS_OPTS_16B); + // rbit vector theEmitter->emitIns_R_R(INS_rbit, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); theEmitter->emitIns_R_R(INS_rbit, EA_16BYTE, REG_V2, REG_V3, INS_OPTS_16B); @@ -7684,12 +7755,21 @@ void CodeGen::genArm64EmitterUnitTests() theEmitter->emitIns_R_R(INS_rev64, EA_8BYTE, REG_V12, REG_V13, INS_OPTS_2S); theEmitter->emitIns_R_R(INS_rev64, EA_16BYTE, REG_V14, REG_V15, INS_OPTS_4S); - // addv vector - theEmitter->emitIns_R_R(INS_addv, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_8B); - theEmitter->emitIns_R_R(INS_addv, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); - theEmitter->emitIns_R_R(INS_addv, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_4H); - theEmitter->emitIns_R_R(INS_addv, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_8H); - theEmitter->emitIns_R_R(INS_addv, EA_16BYTE, REG_V14, REG_V15, INS_OPTS_4S); + // sadalp vector + theEmitter->emitIns_R_R(INS_sadalp, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_sadalp, EA_8BYTE, REG_V2, REG_V3, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_sadalp, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_sadalp, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_sadalp, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_sadalp, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + + // saddlp vector + theEmitter->emitIns_R_R(INS_saddlp, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_saddlp, EA_8BYTE, REG_V2, REG_V3, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_saddlp, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_saddlp, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_saddlp, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_saddlp, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); // saddlv vector theEmitter->emitIns_R_R(INS_saddlv, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_8B); @@ -7712,6 +7792,67 @@ void CodeGen::genArm64EmitterUnitTests() theEmitter->emitIns_R_R(INS_sminv, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_8H); theEmitter->emitIns_R_R(INS_sminv, EA_16BYTE, REG_V12, REG_V13, INS_OPTS_4S); + // sqabs scalar + theEmitter->emitIns_R_R(INS_sqabs, EA_1BYTE, REG_V0, REG_V1, INS_OPTS_NONE); + theEmitter->emitIns_R_R(INS_sqabs, EA_2BYTE, REG_V2, REG_V3, INS_OPTS_NONE); + theEmitter->emitIns_R_R(INS_sqabs, EA_4BYTE, REG_V4, REG_V5, INS_OPTS_NONE); + theEmitter->emitIns_R_R(INS_sqabs, EA_8BYTE, REG_V6, REG_V7, INS_OPTS_NONE); + + // sqabs vector + theEmitter->emitIns_R_R(INS_sqabs, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_sqabs, EA_16BYTE, REG_V2, REG_V3, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_sqabs, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_sqabs, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_sqabs, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_sqabs, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_sqabs, EA_16BYTE, REG_V12, REG_V13, INS_OPTS_2D); + + // sqneg scalar + theEmitter->emitIns_R_R(INS_sqneg, EA_1BYTE, REG_V0, REG_V1, INS_OPTS_NONE); + theEmitter->emitIns_R_R(INS_sqneg, EA_2BYTE, REG_V2, REG_V3, INS_OPTS_NONE); + theEmitter->emitIns_R_R(INS_sqneg, EA_4BYTE, REG_V4, REG_V5, INS_OPTS_NONE); + theEmitter->emitIns_R_R(INS_sqneg, EA_8BYTE, REG_V6, REG_V7, INS_OPTS_NONE); + + // sqneg vector + theEmitter->emitIns_R_R(INS_sqneg, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_sqneg, EA_16BYTE, REG_V2, REG_V3, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_sqneg, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_sqneg, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_sqneg, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_sqneg, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_sqneg, EA_16BYTE, REG_V12, REG_V13, INS_OPTS_2D); + + // suqadd scalar + theEmitter->emitIns_R_R(INS_suqadd, EA_1BYTE, REG_V0, REG_V1, INS_OPTS_NONE); + theEmitter->emitIns_R_R(INS_suqadd, EA_2BYTE, REG_V2, REG_V3, INS_OPTS_NONE); + theEmitter->emitIns_R_R(INS_suqadd, EA_4BYTE, REG_V4, REG_V5, INS_OPTS_NONE); + theEmitter->emitIns_R_R(INS_suqadd, EA_8BYTE, REG_V6, REG_V7, INS_OPTS_NONE); + + // suqadd vector + theEmitter->emitIns_R_R(INS_suqadd, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_suqadd, EA_16BYTE, REG_V2, REG_V3, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_suqadd, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_suqadd, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_suqadd, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_suqadd, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_suqadd, EA_16BYTE, REG_V12, REG_V13, INS_OPTS_2D); + + // uadalp vector + theEmitter->emitIns_R_R(INS_uadalp, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_uadalp, EA_8BYTE, REG_V2, REG_V3, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_uadalp, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_uadalp, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_uadalp, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_uadalp, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + + // uaddlp vector + theEmitter->emitIns_R_R(INS_uaddlp, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_uaddlp, EA_8BYTE, REG_V2, REG_V3, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_uaddlp, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_uaddlp, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_uaddlp, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_uaddlp, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + // uaddlv vector theEmitter->emitIns_R_R(INS_uaddlv, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_8B); theEmitter->emitIns_R_R(INS_uaddlv, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); @@ -7733,48 +7874,6 @@ void CodeGen::genArm64EmitterUnitTests() theEmitter->emitIns_R_R(INS_uminv, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_8H); theEmitter->emitIns_R_R(INS_uminv, EA_16BYTE, REG_V12, REG_V13, INS_OPTS_4S); - // faddp scalar - theEmitter->emitIns_R_R(INS_faddp, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_2S); - theEmitter->emitIns_R_R(INS_faddp, EA_16BYTE, REG_V2, REG_V3, INS_OPTS_2D); - - // fcmeq Vd, Vn, #0.0 - theEmitter->emitIns_R_R(INS_fcmeq, EA_4BYTE, REG_V0, REG_V1); // scalar 4BYTE - theEmitter->emitIns_R_R(INS_fcmeq, EA_8BYTE, REG_V2, REG_V3); // scalar 8BYTE - - // fcmge Vd, Vn, #0.0 - theEmitter->emitIns_R_R(INS_fcmge, EA_4BYTE, REG_V0, REG_V1); // scalar 4BYTE - theEmitter->emitIns_R_R(INS_fcmge, EA_8BYTE, REG_V2, REG_V3); // scalar 8BYTE - - // fcmgt Vd, Vn, #0.0 - theEmitter->emitIns_R_R(INS_fcmgt, EA_4BYTE, REG_V0, REG_V1); // scalar 4BYTE - theEmitter->emitIns_R_R(INS_fcmgt, EA_8BYTE, REG_V2, REG_V3); // scalar 8BYTE - - // fcmle Vd, Vn, #0.0 - theEmitter->emitIns_R_R(INS_fcmle, EA_4BYTE, REG_V0, REG_V1); // scalar 4BYTE - theEmitter->emitIns_R_R(INS_fcmle, EA_8BYTE, REG_V2, REG_V3); // scalar 8BYTE - - // fcmlt Vd, Vn, #0.0 - theEmitter->emitIns_R_R(INS_fcmlt, EA_4BYTE, REG_V0, REG_V1); // scalar 4BYTE - theEmitter->emitIns_R_R(INS_fcmlt, EA_8BYTE, REG_V2, REG_V3); // scalar 8BYTE - - // frecpe scalar - theEmitter->emitIns_R_R(INS_frecpe, EA_4BYTE, REG_V0, REG_V1); // scalar 4BYTE - theEmitter->emitIns_R_R(INS_frecpe, EA_8BYTE, REG_V2, REG_V3); // scalar 8BYTE - theEmitter->emitIns_R_R(INS_frecpe, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); - theEmitter->emitIns_R_R(INS_frecpe, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_4S); - theEmitter->emitIns_R_R(INS_frecpe, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_2D); - - // frecpx scalar - theEmitter->emitIns_R_R(INS_frecpx, EA_4BYTE, REG_V0, REG_V1); - theEmitter->emitIns_R_R(INS_frecpx, EA_8BYTE, REG_V2, REG_V3); - - // frsqrte - theEmitter->emitIns_R_R(INS_frsqrte, EA_4BYTE, REG_V0, REG_V1); // scalar 4BYTE - theEmitter->emitIns_R_R(INS_frsqrte, EA_8BYTE, REG_V2, REG_V3); // scalar 8BYTE - theEmitter->emitIns_R_R(INS_frsqrte, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); - theEmitter->emitIns_R_R(INS_frsqrte, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_4S); - theEmitter->emitIns_R_R(INS_frsqrte, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_2D); - // urecpe vector theEmitter->emitIns_R_R(INS_urecpe, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_2S); theEmitter->emitIns_R_R(INS_urecpe, EA_16BYTE, REG_V2, REG_V3, INS_OPTS_4S); @@ -7783,52 +7882,20 @@ void CodeGen::genArm64EmitterUnitTests() theEmitter->emitIns_R_R(INS_ursqrte, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_2S); theEmitter->emitIns_R_R(INS_ursqrte, EA_16BYTE, REG_V2, REG_V3, INS_OPTS_4S); - // fcvtl{2} vector - theEmitter->emitIns_R_R(INS_fcvtl, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_4H); - theEmitter->emitIns_R_R(INS_fcvtl2, EA_16BYTE, REG_V2, REG_V3, INS_OPTS_8H); - theEmitter->emitIns_R_R(INS_fcvtl, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); - theEmitter->emitIns_R_R(INS_fcvtl2, EA_16BYTE, REG_V5, REG_V6, INS_OPTS_4S); - - // fcvtn{2} vector - theEmitter->emitIns_R_R(INS_fcvtn, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_4H); - theEmitter->emitIns_R_R(INS_fcvtn2, EA_16BYTE, REG_V2, REG_V3, INS_OPTS_8H); - theEmitter->emitIns_R_R(INS_fcvtn, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); - theEmitter->emitIns_R_R(INS_fcvtn2, EA_16BYTE, REG_V5, REG_V6, INS_OPTS_4S); - -#endif - -#ifdef ALL_ARM64_EMITTER_UNIT_TESTS - // sadalp vector - theEmitter->emitIns_R_R(INS_sadalp, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); - theEmitter->emitIns_R_R(INS_sadalp, EA_8BYTE, REG_V2, REG_V3, INS_OPTS_4H); - theEmitter->emitIns_R_R(INS_sadalp, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); - theEmitter->emitIns_R_R(INS_sadalp, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); - theEmitter->emitIns_R_R(INS_sadalp, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_8H); - theEmitter->emitIns_R_R(INS_sadalp, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); - - // saddlp vector - theEmitter->emitIns_R_R(INS_saddlp, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); - theEmitter->emitIns_R_R(INS_saddlp, EA_8BYTE, REG_V2, REG_V3, INS_OPTS_4H); - theEmitter->emitIns_R_R(INS_saddlp, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); - theEmitter->emitIns_R_R(INS_saddlp, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); - theEmitter->emitIns_R_R(INS_saddlp, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_8H); - theEmitter->emitIns_R_R(INS_saddlp, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); - - // uadalp vector - theEmitter->emitIns_R_R(INS_uadalp, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); - theEmitter->emitIns_R_R(INS_uadalp, EA_8BYTE, REG_V2, REG_V3, INS_OPTS_4H); - theEmitter->emitIns_R_R(INS_uadalp, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); - theEmitter->emitIns_R_R(INS_uadalp, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); - theEmitter->emitIns_R_R(INS_uadalp, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_8H); - theEmitter->emitIns_R_R(INS_uadalp, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); - - // uaddlp vector - theEmitter->emitIns_R_R(INS_uaddlp, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); - theEmitter->emitIns_R_R(INS_uaddlp, EA_8BYTE, REG_V2, REG_V3, INS_OPTS_4H); - theEmitter->emitIns_R_R(INS_uaddlp, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_2S); - theEmitter->emitIns_R_R(INS_uaddlp, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_16B); - theEmitter->emitIns_R_R(INS_uaddlp, EA_16BYTE, REG_V8, REG_V9, INS_OPTS_8H); - theEmitter->emitIns_R_R(INS_uaddlp, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + // usqadd scalar + theEmitter->emitIns_R_R(INS_usqadd, EA_1BYTE, REG_V0, REG_V1, INS_OPTS_NONE); + theEmitter->emitIns_R_R(INS_usqadd, EA_2BYTE, REG_V2, REG_V3, INS_OPTS_NONE); + theEmitter->emitIns_R_R(INS_usqadd, EA_4BYTE, REG_V4, REG_V5, INS_OPTS_NONE); + theEmitter->emitIns_R_R(INS_usqadd, EA_8BYTE, REG_V6, REG_V7, INS_OPTS_NONE); + + // usqadd vector + theEmitter->emitIns_R_R(INS_usqadd, EA_8BYTE, REG_V0, REG_V1, INS_OPTS_8B); + theEmitter->emitIns_R_R(INS_usqadd, EA_16BYTE, REG_V2, REG_V3, INS_OPTS_16B); + theEmitter->emitIns_R_R(INS_usqadd, EA_8BYTE, REG_V4, REG_V5, INS_OPTS_4H); + theEmitter->emitIns_R_R(INS_usqadd, EA_16BYTE, REG_V6, REG_V7, INS_OPTS_8H); + theEmitter->emitIns_R_R(INS_usqadd, EA_8BYTE, REG_V8, REG_V9, INS_OPTS_2S); + theEmitter->emitIns_R_R(INS_usqadd, EA_16BYTE, REG_V10, REG_V11, INS_OPTS_4S); + theEmitter->emitIns_R_R(INS_usqadd, EA_16BYTE, REG_V12, REG_V13, INS_OPTS_2D); #endif // ALL_ARM64_EMITTER_UNIT_TESTS #ifdef ALL_ARM64_EMITTER_UNIT_TESTS diff --git a/src/coreclr/src/jit/emitarm64.cpp b/src/coreclr/src/jit/emitarm64.cpp index ced2297..53fa1e1 100644 --- a/src/coreclr/src/jit/emitarm64.cpp +++ b/src/coreclr/src/jit/emitarm64.cpp @@ -685,8 +685,11 @@ void emitter::emitInsSanityCheck(instrDesc* id) break; case IF_DV_2L: // DV_2L ........XX...... ......nnnnnddddd Vd Vn (abs, neg - scalar) - assert(id->idOpSize() == EA_8BYTE); // only type D is supported - __fallthrough; + assert(insOptsNone(id->idInsOpt())); + assert(isValidVectorElemsize(id->idOpSize())); + assert(isVectorRegister(id->idReg1())); + assert(isVectorRegister(id->idReg2())); + break; case IF_DV_2G: // DV_2G .........X...... ......nnnnnddddd Vd Vn (fmov, fcvtXX - register) case IF_DV_2K: // DV_2K .........X.mmmmm ......nnnnn..... Vn Vm (fcmp) @@ -4431,6 +4434,34 @@ void emitter::emitIns_R_R( fmt = IF_DV_2A; break; + case INS_fcvtxn: + assert(isVectorRegister(reg1)); + assert(isVectorRegister(reg2)); + + if (insOptsAnyArrangement(opt)) + { + // Vector operation + assert(size == EA_8BYTE); + assert(opt == INS_OPTS_2S); + fmt = IF_DV_2A; + } + else + { + // Scalar operation + assert(insOptsNone(opt)); + assert(size == EA_4BYTE); + fmt = IF_DV_2G; + } + break; + + case INS_fcvtxn2: + assert(isVectorRegister(reg1)); + assert(isVectorRegister(reg2)); + assert(size == EA_16BYTE); + assert(opt == INS_OPTS_4S); + fmt = IF_DV_2A; + break; + case INS_scvtf: case INS_ucvtf: if (insOptsAnyArrangement(opt)) @@ -4682,6 +4713,29 @@ void emitter::emitIns_R_R( fmt = IF_DV_2T; break; + case INS_sqabs: + case INS_sqneg: + case INS_suqadd: + case INS_usqadd: + assert(isVectorRegister(reg1)); + assert(isVectorRegister(reg2)); + + if (insOptsAnyArrangement(opt)) + { + // Vector operation + assert(isValidArrangement(size, opt)); + assert(opt != INS_OPTS_1D); // The encoding size = 11, Q = 0 is reserved + fmt = IF_DV_2M; + } + else + { + // Scalar operation + assert(insOptsNone(opt)); + assert(isValidVectorElemsize(size)); + fmt = IF_DV_2L; + } + break; + default: unreached(); break; @@ -14104,6 +14158,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_fcvtl2: case INS_fcvtn: case INS_fcvtn2: + case INS_fcvtxn: + case INS_fcvtxn2: result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency = PERFSCORE_LATENCY_4C; break; @@ -14151,6 +14207,11 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins result.insLatency = PERFSCORE_LATENCY_3C; break; + case INS_fcvtxn: + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + case INS_fcmeq: case INS_fcmge: case INS_fcmgt: @@ -14564,8 +14625,19 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins switch (ins) { case INS_abs: - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - result.insLatency = PERFSCORE_LATENCY_3C; + case INS_sqneg: + case INS_suqadd: + case INS_usqadd: + if (id->idOpSize() == EA_16BYTE) + { + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + } + else + { + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + } + + result.insLatency = PERFSCORE_LATENCY_3C; break; case INS_addv: @@ -14611,6 +14683,11 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins result.insLatency = PERFSCORE_LATENCY_1C; break; + case INS_sqabs: + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency = PERFSCORE_LATENCY_4C; + break; + default: // all other instructions perfScoreUnhandledInstruction(id, &result); diff --git a/src/coreclr/src/jit/instrsarm64.h b/src/coreclr/src/jit/instrsarm64.h index 41d3344..0711e44 100644 --- a/src/coreclr/src/jit/instrsarm64.h +++ b/src/coreclr/src/jit/instrsarm64.h @@ -672,6 +672,10 @@ INST2(fcmlt, "fcmlt", 0, IF_EN2J, 0x0EA0E800, 0x5EA0E800) // fcmlt Vd,Vn DV_2A 0Q0011101X100000 111110nnnnnddddd 0EA0 E800 Vd,Vn (vector) // fcmlt Vd,Vn DV_2G 010111101X100000 111010nnnnnddddd 5EA0 E800 Vd,Vn (scalar) +INST2(fcvtxn, "fcvtxn", 0, IF_EN2J, 0x2E616800, 0x7E616800) + // fcvtxn Vd,Vn DV_2A 0010111001100001 011010nnnnnddddd 2E61 6800 Vd,Vn (vector) + // fcvtxn Vd,Vn DV_2G 0111111001100001 011010nnnnnddddd 7E61 6800 Vd,Vn (scalar) + INST2(fneg, "fneg", 0, IF_EN2J, 0x2EA0F800, 0x1E214000) // fneg Vd,Vn DV_2A 0Q1011101X100000 111110nnnnnddddd 2EA0 F800 Vd,Vn (vector) // fneg Vd,Vn DV_2G 000111100X100001 010000nnnnnddddd 1E21 4000 Vd,Vn (scalar) @@ -729,6 +733,22 @@ INST2(cmlt, "cmlt", 0, IF_EN2K, 0x0E20A800, 0x5E20A800) // cmlt Vd,Vn DV_2M 0Q101110XX100000 101010nnnnnddddd 0E20 A800 Vd,Vn (vector) // cmlt Vd,Vn DV_2L 01011110XX100000 101010nnnnnddddd 5E20 A800 Vd,Vn (scalar) +INST2(sqabs, "sqabs", 0, IF_EN2K, 0x0E207800, 0x5E207800) + // sqabs Vd,Vn DV_2M 0Q001110XX100000 011110nnnnnddddd 0E20 7800 Vd,Vn (vector) + // sqabs Vd,Vn DV_2L 01011110XX100000 011110nnnnnddddd 5E20 7800 Vd,Vn (scalar) + +INST2(sqneg, "sqneg", 0, IF_EN2K, 0x2E207800, 0x7E207800) + // sqneg Vd,Vn DV_2M 0Q101110XX100000 011110nnnnnddddd 2E20 7800 Vd,Vn (vector) + // sqneg Vd,Vn DV_2L 01111110XX100000 011110nnnnnddddd 7E20 7800 Vd,Vn (scalar) + +INST2(suqadd, "suqadd", 0, IF_EN2K, 0x0E203800, 0x5E203800) + // suqadd Vd,Vn DV_2M 0Q001110XX100000 001110nnnnnddddd 0E20 3800 Vd,Vn (vector) + // suqadd Vd,Vn DV_2L 01011110XX100000 001110nnnnnddddd 5E20 3800 Vd,Vn (scalar) + +INST2(usqadd, "usqadd", 0, IF_EN2K, 0x2E203800, 0x7E203800) + // usqadd Vd,Vn DV_2M 0Q101110XX100000 001110nnnnnddddd 2E20 3800 Vd,Vn (vector) + // usqadd Vd,Vn DV_2L 01111110XX100000 001110nnnnnddddd 7E20 3800 Vd,Vn (scalar) + // enum name info DR_2G DV_2M INST2(cls, "cls", 0, IF_EN2L, 0x5AC01400, 0x0E204800) // cls Rd,Rm DR_2G X101101011000000 000101nnnnnddddd 5AC0 1400 Rd Rn (general) @@ -1637,6 +1657,9 @@ INST1(fcvtn, "fcvtn", 0, IF_DV_2A, 0x0E216800) INST1(fcvtn2, "fcvtn2", 0, IF_DV_2A, 0x4E216800) // fcvtn2 Vd,Vn DV_2A 040011100X100001 011010nnnnnddddd 4E21 6800 Vd,Vn (vector) +INST1(fcvtxn2, "fcvtxn2", 0, IF_DV_2A, 0x6E616800) + // fcvtxn2 Vd,Vn DV_2A 0110111001100001 011010nnnnnddddd 6E61 6800 Vd,Vn (vector) + INST1(frecpx, "frecpx", 0, IF_DV_2G, 0x5EA1F800) // frecpx Vd,Vn DV_2G 010111101X100001 111110nnnnnddddd 5EA1 F800 Vd,Vn (scalar) -- 2.7.4