[mono][jit] Adding Vector128.Narrow,Widen as intrinsics on arm64. (#84837)

author Jan Dupej <109523496+jandupej@users.noreply.github.com>

Mon, 17 Apr 2023 14:41:58 +0000 (16:41 +0200)

committer GitHub <noreply@github.com>

Mon, 17 Apr 2023 14:41:58 +0000 (16:41 +0200)
author Jan Dupej <109523496+jandupej@users.noreply.github.com>
Mon, 17 Apr 2023 14:41:58 +0000 (16:41 +0200)
committer GitHub <noreply@github.com>
Mon, 17 Apr 2023 14:41:58 +0000 (16:41 +0200)
diff --git a/src/mono/mono/arch/arm64/arm64-codegen.h b/src/mono/mono/arch/arm64/arm64-codegen.h

index 2687b9f..8593d3f 100644 (file)
--- a/src/mono/mono/arch/arm64/arm64-codegen.h
+++ b/src/mono/mono/arch/arm64/arm64-codegen.h
@@ -1208,6 +1208,8 @@ arm_encode_arith_imm (int imm, guint32 *shift)
  //   type  - data type of vector elements, one of {TYPE_I8, TYPE_I16, TYPE_I32, TYPE_I64}
  #define arm_neon_abs(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b0, (type), 0b01011, (rd), (rn))
  #define arm_neon_neg(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b1, (type), 0b01011, (rd), (rn))
+#define arm_neon_xtn(p, type, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, (type), 0b10010, (rd), (rn))
+#define arm_neon_xtn2(p, type, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, (type), 0b10010, (rd), (rn))
  
  // Parametrized variants of the float opcodes
  //   width - determines if full register or its lower half is used one of {VREG_LOW, VREG_FULL}
@@ -1215,6 +1217,11 @@ arm_encode_arith_imm (int imm, guint32 *shift)
  #define arm_neon_fabs(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b0, 0b10 | (type), 0b01111, (rd), (rn))
  #define arm_neon_fneg(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b1, 0b10 | (type), 0b01111, (rd), (rn))
  #define arm_neon_fsqrt(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b1, 0b10 | (type), 0b11111, (rd), (rn))
+#define arm_neon_fcvtn(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10110, (rd), (rn))
+#define arm_neon_fcvtn2(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10110, (rd), (rn))
+#define arm_neon_fcvtl(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10111, (rd), (rn))
+#define arm_neon_fcvtl2(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10111, (rd), (rn))
+
  
  // Parametrized variants of the bitwise opcodes
  //   width - determines if full register or its lower half is used, one of {VREG_LOW, VREG_FULL}
@@ -1304,13 +1311,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
  #define arm_neon_abs_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b01011, (rd), (rn))
  #define arm_neon_abs_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_8, 0b01011, (rd), (rn))
  
-#define arm_neon_xtn_8b(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b10010, (rd), (rn))
-#define arm_neon_xtn2_8b(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b10010, (rd), (rn))
-#define arm_neon_xtn_4h(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10010, (rd), (rn))
-#define arm_neon_xtn2_8h(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10010, (rd), (rn))
-#define arm_neon_xtn_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_4, 0b10010, (rd), (rn))
-#define arm_neon_xtn2_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b10010, (rd), (rn))
-
  #define arm_neon_sqxtn_8b(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b10100, (rd), (rn))
  #define arm_neon_sqxtn2_8b(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b10100, (rd), (rn))
  #define arm_neon_sqxtn_4h(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10100, (rd), (rn))
@@ -1318,16 +1318,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
  #define arm_neon_sqxtn_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_4, 0b10100, (rd), (rn))
  #define arm_neon_sqxtn2_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b10100, (rd), (rn))
  
-#define arm_neon_fcvtn_4h(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b10110, (rd), (rn))
-#define arm_neon_fcvtn2_4h(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b10110, (rd), (rn))
-#define arm_neon_fcvtn_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10110, (rd), (rn))
-#define arm_neon_fcvtn2_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10110, (rd), (rn))
-
-#define arm_neon_fcvtl_4h(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b10111, (rd), (rn))
-#define arm_neon_fcvtl2_4h(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b10111, (rd), (rn))
-#define arm_neon_fcvtl_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10111, (rd), (rn))
-#define arm_neon_fcvtl2_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10111, (rd), (rn))
-
  #define arm_neon_frintn_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b11000, (rd), (rn))
  #define arm_neon_frintn_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b11000, (rd), (rn))
  #define arm_neon_frintn_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11000, (rd), (rn))
@@ -2284,18 +2274,26 @@ arm_encode_arith_imm (int imm, guint32 *shift)
  #define arm_neon_shimm_opcode(p, q, u, immh, immb, opcode, rd, rn) arm_neon_opcode_2reg ((p), (q), 0b00001111000000000000010000000000 | (u) << 29 | (immh) << 19 | (immb) << 16 | (opcode) << 11, (rd), (rn))
  #define arm_neon_shimm_shr_immh_immb(size, shift) (((shift) - (16 << (size))) & 0b01111111)
  #define arm_neon_shimm_shr_opcode(p, q, u, size, opcode, rd, rn, shift) do { \
-       int32_t ___temp_emit0 = arm_neon_shimm_shr_immh_immb ((size), (shift)); \
-        arm_neon_shimm_opcode ((p), (q), (u), (__temp_emit0 >> 3) & 0b1111, __temp_emit0 & 0b111, (opcode), (rd), (rn)) \
+       int32_t __temp_emit0 = arm_neon_shimm_shr_immh_immb ((size), (shift)); \
+        arm_neon_shimm_opcode ((p), (q), (u), (__temp_emit0 >> 3) & 0b1111, __temp_emit0 & 0b111, (opcode), (rd), (rn)); \
  } while (0)
  
  #define arm_neon_shimm_shl_immh_immb(size, shift) (((shift) + (8 << (size))) & 0b01111111)
  #define arm_neon_shimm_shl_opcode(p, q, u, size, opcode, rd, rn, shift) do { \
-       int32_t ___temp_emit0 = arm_neon_shimm_shl_immh_immb ((size), (shift)); \
-        arm_neon_shimm_opcode ((p), (q), (u), (__temp_emit0 >> 3) & 0b1111, __temp_emit0 & 0b111, (opcode), (rd), (rn)) \
+       int32_t __temp_emit0 = arm_neon_shimm_shl_immh_immb ((size), (shift)); \
+        arm_neon_shimm_opcode ((p), (q), (u), (__temp_emit0 >> 3) & 0b1111, __temp_emit0 & 0b111, (opcode), (rd), (rn)); \
  } while (0)
  
  #define arm_neon_sli(p, width, type, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), (width), 0b1, (type), 0b01010, (rd), (rn), (shift))
  #define arm_neon_shrn(p, type, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_LOW, 0b0, (type), 0b10000, (rd), (rn), (shift))
+#define arm_neon_sshll(p, type, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b0, (type), 0b10100, (rd), (rn), (shift))
+#define arm_neon_sshll2(p, type, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b0, (type), 0b10100, (rd), (rn), (shift))
+#define arm_neon_sxtl(p, type, rd, rn) arm_neon_sshll ((p), (type), (rd), (rn), 0)
+#define arm_neon_sxtl2(p, type, rd, rn) arm_neon_sshll2 ((p), (type), (rd), (rn), 0)
+#define arm_neon_ushll(p, type, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b1, (type), 0b10100, (rd), (rn), (shift))
+#define arm_neon_ushll2(p, type, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b1, (type), 0b10100, (rd), (rn), (shift))
+#define arm_neon_uxtl(p, type, rd, rn) arm_neon_ushll ((p), (type), (rd), (rn), 0)
+#define arm_neon_uxtl2(p, type, rd, rn) arm_neon_ushll2 ((p), (type), (rd), (rn), 0)
  
  #define arm_neon_sshr_8b(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b00000, (rd), (rn), (shift))
  #define arm_neon_sshr_16b(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b00000, (rd), (rn), (shift))
@@ -2373,13 +2371,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
  #define arm_neon_sqrshrn_2s(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_LOW, 0b0, SIZE_4, 0b10011, (rd), (rn), (shift))
  #define arm_neon_sqrshrn2_2s(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b10011, (rd), (rn), (shift))
  
-#define arm_neon_shll_i_8b(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b10100, (rd), (rn), (shift))
-#define arm_neon_shll2_i_8b(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b10100, (rd), (rn), (shift))
-#define arm_neon_shll_i_4h(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10100, (rd), (rn), (shift))
-#define arm_neon_shll2_i_4h(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10100, (rd), (rn), (shift))
-#define arm_neon_shll_i_2s(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b0, SIZE_4, 0b10100, (rd), (rn), (shift))
-#define arm_neon_shll2_i_2s(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b0, SIZE_4, 0b10100, (rd), (rn), (shift))
-
  #define arm_neon_scvtf_i_4h(p, rd, rn, fbits) arm_neon_shimm_shr_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b11100, (rd), (rn), (fbits))
  #define arm_neon_scvtf_i_8h(p, rd, rn, fbits) arm_neon_shimm_shr_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11100, (rd), (rn), (fbits))
  #define arm_neon_scvtf_i_2s(p, rd, rn, fbits) arm_neon_shimm_shr_opcode ((p), VREG_LOW, 0b0, SIZE_4, 0b11100, (rd), (rn), (fbits))
@@ -2476,13 +2467,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
  #define arm_neon_uqrshrn_2s(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_LOW, 0b1, SIZE_4, 0b10011, (rd), (rn), (shift))
  #define arm_neon_uqrshrn2_2s(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_FULL, 0b1, SIZE_4, 0b10011, (rd), (rn), (shift))
  
-#define arm_neon_ushll_8b(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b10100, (rd), (rn), (shift))
-#define arm_neon_ushll2_8b(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b10100, (rd), (rn), (shift))
-#define arm_neon_ushll_4h(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b1, SIZE_2, 0b10100, (rd), (rn), (shift))
-#define arm_neon_ushll2_4h(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b10100, (rd), (rn), (shift))
-#define arm_neon_ushll_2s(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_LOW, 0b1, SIZE_4, 0b10100, (rd), (rn), (shift))
-#define arm_neon_ushll2_2s(p, rd, rn, shift) arm_neon_shimm_shl_opcode ((p), VREG_FULL, 0b1, SIZE_4, 0b10100, (rd), (rn), (shift))
-
  #define arm_neon_ucvtf_i_4h(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_LOW, 0b1, SIZE_2, 0b11100, (rd), (rn), (shift))
  #define arm_neon_ucvtf_i_8h(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11100, (rd), (rn), (shift))
  #define arm_neon_ucvtf_i_2s(p, rd, rn, shift) arm_neon_shimm_shr_opcode ((p), VREG_LOW, 0b1, SIZE_4, 0b11100, (rd), (rn), (shift))
diff --git a/src/mono/mono/mini/cpu-arm64.mdesc b/src/mono/mono/mini/cpu-arm64.mdesc

index 8640097..a2af821 100644 (file)
--- a/src/mono/mono/mini/cpu-arm64.mdesc
+++ b/src/mono/mono/mini/cpu-arm64.mdesc
@@ -533,6 +533,11 @@ create_scalar_unsafe_int: dest:x src1:i len:4
  create_scalar_unsafe_float: dest:x src1:f len:4
  arm64_bic: dest:x src1:x src2:x len:4
  bitwise_select: dest:x src1:x src2:x src3:x len:12
+arm64_xtn: dest:x src1:x len:4
+arm64_xtn2: dest:x src1:x src2:x len:4 clob:1
+arm64_fcvtn: dest:x src1:x len:4
+arm64_fcvtn2: dest:x src1:x src2:x len:4 clob:1
+xunop: dest:x src1:x len:4
  arm64_ushl: dest:x src1:x src2:x len:4
  arm64_ext_imm: dest:x src1:x src2:x len:4
  
diff --git a/src/mono/mono/mini/mini-arm64.c b/src/mono/mono/mini/mini-arm64.c

index b4dfb33..062eeb7 100644 (file)
--- a/src/mono/mono/mini/mini-arm64.c
+++ b/src/mono/mono/mini/mini-arm64.c
@@ -35,6 +35,8 @@
  #define EXPAND(x) x
  #define PARENTHESIZE(...) (__VA_ARGS__)
  #define EXPAND_FUN(m, ...) EXPAND(m PARENTHESIZE(__VA_ARGS__))
+#define OPFMT_DS dreg, sreg1
+#define OPFMT_TDS _t, dreg, sreg1
  #define OPFMT_WDSS _w, dreg, sreg1, sreg2
  #define OPFMT_WTDS _w, _t, dreg, sreg1
  #define OPFMT_WTDSS _w, _t, dreg, sreg1, sreg2
@@ -3774,7 +3776,8 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                         code = emit_ldrfpq (code, dreg, sreg1, ins->inst_offset);
                         break;
                 case OP_XMOVE:
-                       arm_neon_mov (code, dreg, sreg1);
+                       if(dreg != sreg1)
+                               arm_neon_mov (code, dreg, sreg1);
                         break;
                 case OP_XCONST: {
                         if (cfg->compile_aot && cfg->code_exec_only) {
@@ -3848,6 +3851,29 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                         arm_neon_ins_e(code, t, dreg, sreg1, ins->inst_c0, 0);
                         break;
                 }
+               case OP_ARM64_XTN:
+                       // arm_neon_xtn takes the element type of the destination, whereas inst_c1
+                       // holds the element type of the source data. XTN(2) narrows each element
+                       // to the next smaller type (e.g. I4 to I2), so the '-1' here and in XTN2
+                       // converts the source type size into the destination type size.
+                       arm_neon_xtn (code, get_type_size_macro (ins->inst_c1) - 1, dreg, sreg1);
+                       break;
+
+               case OP_ARM64_XTN2: 
+                       g_assert (dreg == sreg1);
+                       arm_neon_xtn2 (code, get_type_size_macro (ins->inst_c1) - 1, dreg, sreg2);
+                       break;
+
+               case OP_ARM64_FCVTN:
+                       // Only double->float is supported here, while arm64 can also do float->half.
+                       arm_neon_fcvtn (code, dreg, sreg1);
+                       break;
+
+               case OP_ARM64_FCVTN2:
+                       g_assert (dreg == sreg1);
+                       arm_neon_fcvtn2 (code, dreg, sreg2); 
+                       break;
+
                 case OP_ARM64_XADDV: {
                         switch (ins->inst_c0) {
                         case INTRINS_AARCH64_ADV_SIMD_FADDV:
diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h

index 9b70bdc..335532e 100644 (file)
--- a/src/mono/mono/mini/mini-ops.h
+++ b/src/mono/mono/mini/mini-ops.h
@@ -1498,6 +1498,7 @@ MINI_OP(OP_XEXTRACT, "xextract", IREG, XREG, NONE)
  /*
   * Generic SIMD operations, the rest of the JIT doesn't care about the exact operation.
   */
+MINI_OP(OP_XUNOP, "xunop", XREG, XREG, NONE)
  MINI_OP(OP_XBINOP, "xbinop", XREG, XREG, XREG)
  MINI_OP(OP_XBINOP_FORCEINT, "xbinop_forceint", XREG, XREG, XREG)
  MINI_OP(OP_XBINOP_SCALAR, "xbinop_scalar", XREG, XREG, XREG)
diff --git a/src/mono/mono/mini/simd-arm64.h b/src/mono/mono/mini/simd-arm64.h

index 1f10471..68ee77a 100644 (file)
--- a/src/mono/mono/mini/simd-arm64.h
+++ b/src/mono/mono/mini/simd-arm64.h
@@ -49,6 +49,12 @@ SIMD_OP  (128, OP_XCOMPARE_FP, CMP_GE,               WTDSS,              _UNDEF,
  SIMD_OP  (128, OP_XCOMPARE_FP, CMP_LT,               WTDSS_REV,          _UNDEF,           _UNDEF,           _UNDEF,           _UNDEF,           arm_neon_fcmgt,   arm_neon_fcmgt)
  SIMD_OP  (128, OP_XCOMPARE_FP, CMP_LE,               WTDSS_REV,          _UNDEF,           _UNDEF,           _UNDEF,           _UNDEF,           arm_neon_fcmge,   arm_neon_fcmge)
  
+SIMD_OP  (128, OP_XUNOP,       OP_SIMD_FCVTL,        DS,                 _UNDEF,           _UNDEF,           _UNDEF,          _UNDEF,            arm_neon_fcvtl,   _UNDEF)
+SIMD_OP  (128, OP_XUNOP,       OP_SIMD_FCVTL2,       DS,                 _UNDEF,           _UNDEF,           _UNDEF,          _UNDEF,            arm_neon_fcvtl2,  _UNDEF) 
+SIMD_OP  (128, OP_XUNOP,       OP_ARM64_SXTL,        TDS,                arm_neon_sxtl,    arm_neon_sxtl,    arm_neon_sxtl,   _UNDEF,            _UNDEF,           _UNDEF)
+SIMD_OP  (128, OP_XUNOP,       OP_ARM64_SXTL2,       TDS,                arm_neon_sxtl2,   arm_neon_sxtl2,   arm_neon_sxtl2,  _UNDEF,            _UNDEF,           _UNDEF)
+SIMD_OP  (128, OP_XUNOP,       OP_ARM64_UXTL,        TDS,                arm_neon_uxtl,    arm_neon_uxtl,    arm_neon_uxtl,   _UNDEF,            _UNDEF,           _UNDEF)
+SIMD_OP  (128, OP_XUNOP,       OP_ARM64_UXTL2,       TDS,                arm_neon_uxtl2,   arm_neon_uxtl2,   arm_neon_uxtl2,  _UNDEF,            _UNDEF,           _UNDEF)
  SIMD_OP  (128, OP_XBINOP,      OP_IADD,              WTDSS,              arm_neon_add,     arm_neon_add,     arm_neon_add,    arm_neon_add,      _UNDEF,           _UNDEF)
  SIMD_OP  (128, OP_XBINOP,      OP_FADD,              WTDSS,              _UNDEF,           _UNDEF,           _UNDEF,          _UNDEF,            arm_neon_fadd,    arm_neon_fadd)
  SIMD_OP  (128, OP_XBINOP,      OP_ISUB,              WTDSS,              arm_neon_sub,     arm_neon_sub,     arm_neon_sub,    arm_neon_sub,      _UNDEF,           _UNDEF)
diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c

index f88c861..0b4ada4 100644 (file)
--- a/src/mono/mono/mini/simd-intrinsics.c
+++ b/src/mono/mono/mini/simd-intrinsics.c
@@ -1328,12 +1328,9 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
                 case SN_GetElement:
                 case SN_GetLower:
                 case SN_GetUpper:
-               case SN_Narrow:
                 case SN_Shuffle:
                 case SN_ToVector128:
                 case SN_ToVector128Unsafe:
-               case SN_WidenLower:
-               case SN_WidenUpper:
                 case SN_WithElement:
                         return NULL;
                 default:
@@ -1782,8 +1779,11 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
                 if (size == 16) {
                         switch (arg0_type) {
                         case MONO_TYPE_R8: {
-                               MonoInst *ins = emit_simd_ins (cfg, arg_class, OP_ARM64_FCVTN, args [0]->dreg, -1);
-                               return emit_simd_ins (cfg, arg_class, OP_ARM64_FCVTN2, ins->dreg, args [1]->dreg);
+                               MonoInst* ins = emit_simd_ins (cfg, arg_class, OP_ARM64_FCVTN, args [0]->dreg, -1);
+                               ins->inst_c1 = arg0_type;
+                               MonoInst* ret = emit_simd_ins (cfg, arg_class, OP_ARM64_FCVTN2, ins->dreg, args [1]->dreg);
+                               ret->inst_c1 = arg0_type;
+                               return ret;
                         }
                         case MONO_TYPE_I2:
                         case MONO_TYPE_I4:
@@ -1791,13 +1791,19 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
                         case MONO_TYPE_U2:
                         case MONO_TYPE_U4:
                         case MONO_TYPE_U8: {
-                               MonoInst *ins = emit_simd_ins (cfg, arg_class, OP_ARM64_XTN, args [0]->dreg, -1);
-                               return emit_simd_ins (cfg, arg_class, OP_ARM64_XTN2, ins->dreg, args [1]->dreg);
+                               MonoInst* ins = emit_simd_ins (cfg, arg_class, OP_ARM64_XTN, args [0]->dreg, -1);
+                               ins->inst_c1 = arg0_type;
+                               MonoInst* ret = emit_simd_ins (cfg, arg_class, OP_ARM64_XTN2, ins->dreg, args [1]->dreg);
+                               ret->inst_c1 = arg0_type;
+                               return ret;
                         }
                         default:
                                 return NULL;
                         }
                 } else {
+                       if (!COMPILE_LLVM (cfg))
+                               return NULL;
+
                         switch (arg0_type) {
                         case MONO_TYPE_R8: {
                                 //Widen arg0
@@ -1971,20 +1977,36 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
         }
         case SN_WidenLower:
         case SN_WidenUpper: {
-#if defined(TARGET_ARM64) || defined(TARGET_WASM)
                 if (!is_element_type_primitive (fsig->params [0]))
                         return NULL;
-
-               int op = id == SN_WidenLower ? OP_XLOWER : OP_XUPPER;
-               MonoInst *lower_or_upper_half = emit_simd_ins_for_sig (cfg, klass, op, 0, arg0_type, fsig, args);
-               if (type_enum_is_float (arg0_type)) {
-                       return emit_simd_ins (cfg, klass, OP_SIMD_FCVTL, lower_or_upper_half->dreg, -1);
-               } else {
-                       int zero = alloc_ireg (cfg);
-                       MONO_EMIT_NEW_ICONST (cfg, zero, 0);
-                       op = type_enum_is_unsigned (arg0_type) ? OP_SIMD_USHLL : OP_SIMD_SSHLL;
-                       return emit_simd_ins (cfg, klass, op, lower_or_upper_half->dreg, zero);
+#if defined(TARGET_ARM64)
+               if (!COMPILE_LLVM (cfg)) {
+                       int subop = 0;
+                       gboolean is_upper = (id == SN_WidenUpper);
+                       if (type_enum_is_float (arg0_type))
+                               subop = is_upper ? OP_SIMD_FCVTL2 : OP_SIMD_FCVTL;
+                       else if (type_enum_is_unsigned (arg0_type))
+                               subop = is_upper ? OP_ARM64_UXTL2 : OP_ARM64_UXTL;
+                       else
+                               subop = is_upper ? OP_ARM64_SXTL2 : OP_ARM64_SXTL;
+                       
+                       MonoInst* ins = emit_simd_ins (cfg, klass, OP_XUNOP, args [0]->dreg, -1);
+                       ins->inst_c0 = subop;
+                       ins->inst_c1 = arg0_type;
+                       return ins;
                 }
+#endif
+#if defined(TARGET_ARM64) || defined(TARGET_WASM)
+                       int op = id == SN_WidenLower ? OP_XLOWER : OP_XUPPER;
+                       MonoInst *lower_or_upper_half = emit_simd_ins_for_sig (cfg, klass, op, 0, arg0_type, fsig, args);
+                       if (type_enum_is_float (arg0_type)) {
+                               return emit_simd_ins (cfg, klass, OP_SIMD_FCVTL, lower_or_upper_half->dreg, -1);
+                       } else {
+                               int zero = alloc_ireg (cfg);
+                               MONO_EMIT_NEW_ICONST (cfg, zero, 0);
+                               op = type_enum_is_unsigned (arg0_type) ? OP_SIMD_USHLL : OP_SIMD_SSHLL;
+                               return emit_simd_ins (cfg, klass, op, lower_or_upper_half->dreg, zero);
+                       }
  #else
                 return NULL;
  #endif
author	Jan Dupej <109523496+jandupej@users.noreply.github.com>
	Mon, 17 Apr 2023 14:41:58 +0000 (16:41 +0200)
committer	GitHub <noreply@github.com>
	Mon, 17 Apr 2023 14:41:58 +0000 (16:41 +0200)
src/mono/mono/arch/arm64/arm64-codegen.h		patch \| blob \| history
src/mono/mono/mini/cpu-arm64.mdesc		patch \| blob \| history
src/mono/mono/mini/mini-arm64.c		patch \| blob \| history
src/mono/mono/mini/mini-ops.h		patch \| blob \| history
src/mono/mono/mini/simd-arm64.h		patch \| blob \| history
src/mono/mono/mini/simd-intrinsics.c		patch \| blob \| history